From 7c685f789eec87dba44f673ae54362196ec1de55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=BA=90=E6=96=87=E9=9B=A8?= <41315874+fumiama@users.noreply.github.com> Date: Wed, 30 Nov 2022 15:27:46 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=20Segmenter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- analyse/textrank.go | 17 +- dictionary.go | 2 +- dictionary/dictionary.go | 3 +- posseg/char_state_tab.go | 6 +- posseg/dictionary.go | 4 +- posseg/example_test.go | 8 +- posseg/posseg.go | 380 ++++++++++++++++++--------------------- posseg/posseg_test.go | 32 ++-- 8 files changed, 203 insertions(+), 249 deletions(-) diff --git a/analyse/textrank.go b/analyse/textrank.go index 7e2cdab..4b2007e 100755 --- a/analyse/textrank.go +++ b/analyse/textrank.go @@ -133,10 +133,7 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin return h.Sum64() } span := 5 - var pairs []posseg.Segment - for pair := range (*posseg.Segmenter)(t).Cut(sentence, true) { - pairs = append(pairs, pair) - } + pairs := (*posseg.Segmenter)(t).Cut(sentence, true) for i := range pairs { if _, ok := posFilt[pairs[i].Pos()]; ok { for j := i + 1; j < i+span && j <= len(pairs); j++ { @@ -174,13 +171,13 @@ func (t *TextRanker) TextRank(sentence string, topK int) Segments { type TextRanker posseg.Segmenter // NewTextRanker reads a given file and create a new dictionary file for Textranker. -func NewTextRanker(file fs.File) (TextRanker, error) { - seg := posseg.Segmenter{} - return TextRanker(seg), seg.LoadDictionary(file) +func NewTextRanker(file fs.File) (*TextRanker, error) { + seg, err := posseg.LoadDictionary(file) + return (*TextRanker)(seg), err } // NewTextRankerAt reads a given file and create a new dictionary file for Textranker. -func NewTextRankerAt(fileName string) (TextRanker, error) { - seg := posseg.Segmenter{} - return TextRanker(seg), seg.LoadDictionaryAt(fileName) +func NewTextRankerAt(file string) (*TextRanker, error) { + seg, err := posseg.LoadDictionaryAt(file) + return (*TextRanker)(seg), err } diff --git a/dictionary.go b/dictionary.go index ff7e6ed..ba0e3e0 100755 --- a/dictionary.go +++ b/dictionary.go @@ -15,7 +15,7 @@ type Dictionary struct { sync.RWMutex } -// Load loads all tokens from given channel +// Load loads all tokens func (d *Dictionary) Load(tokens ...dictionary.Token) { d.Lock() for _, token := range tokens { diff --git a/dictionary/dictionary.go b/dictionary/dictionary.go index 4f96183..7514f34 100755 --- a/dictionary/dictionary.go +++ b/dictionary/dictionary.go @@ -10,8 +10,7 @@ import ( "strings" ) -// DictLoader is the interface that could add one token or load -// tokens from channel. +// DictLoader is the interface that could add one token or load tokens type DictLoader interface { Load(...Token) AddToken(Token) diff --git a/posseg/char_state_tab.go b/posseg/char_state_tab.go index e8ae043..4c5d508 100755 --- a/posseg/char_state_tab.go +++ b/posseg/char_state_tab.go @@ -39,7 +39,7 @@ func newTag(position, pos string) (tag, error) { } } if positionIndex < 0 || posIndex < 0 { - return 0, fmt.Errorf("Failed to convert %s %s to Tag", position, pos) + return 0, fmt.Errorf("failed to convert %s %s to Tag", position, pos) } return tag(positionIndex + posIndex), nil } @@ -6705,6 +6705,6 @@ var ( '\u9fa0': []uint16{413}, } - positions = []string{"B", "E", "M", "S"} - poss = []string{"a", "ad", "ag", "an", "b", "bg", "c", "d", "df", "dg", "e", "en", "f", "g", "h", "i", "in", "j", "jn", "k", "l", "ln", "m", "mg", "mq", "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "o", "p", "q", "qe", "qg", "r", "rg", "rr", "rz", "s", "t", "tg", "u", "ud", "ug", "uj", "ul", "uv", "uz", "v", "vd", "vg", "vi", "vn", "vq", "w", "x", "y", "yg", "z", "zg"} + positions = [...]string{"B", "E", "M", "S"} + poss = [...]string{"a", "ad", "ag", "an", "b", "bg", "c", "d", "df", "dg", "e", "en", "f", "g", "h", "i", "in", "j", "jn", "k", "l", "ln", "m", "mg", "mq", "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "o", "p", "q", "qe", "qg", "r", "rg", "rr", "rz", "s", "t", "tg", "u", "ud", "ug", "uj", "ul", "uv", "uz", "v", "vd", "vg", "vi", "vn", "vq", "w", "x", "y", "yg", "z", "zg"} ) diff --git a/posseg/dictionary.go b/posseg/dictionary.go index 8adcf3a..a2cb3a6 100755 --- a/posseg/dictionary.go +++ b/posseg/dictionary.go @@ -10,13 +10,13 @@ import ( // A Dictionary represents a thread-safe dictionary used for word segmentation. type Dictionary struct { + sync.RWMutex total, logTotal float64 freqMap map[string]float64 posMap map[string]string - sync.RWMutex } -// Load loads all tokens from given channel +// Load loads all tokens func (d *Dictionary) Load(tokens ...dictionary.Token) { d.Lock() for _, token := range tokens { diff --git a/posseg/example_test.go b/posseg/example_test.go index 8ac1d84..8d9e6ad 100755 --- a/posseg/example_test.go +++ b/posseg/example_test.go @@ -7,10 +7,12 @@ import ( ) func Example() { - var seg posseg.Segmenter - seg.LoadDictionaryAt("../dict.txt") + seg, err := posseg.LoadDictionaryAt("../dict.txt") + if err != nil { + panic(err) + } - for segment := range seg.Cut("我爱北京天安门", true) { + for _, segment := range seg.Cut("我爱北京天安门", true) { fmt.Printf("%s %s\n", segment.Text(), segment.Pos()) } // Output: diff --git a/posseg/posseg.go b/posseg/posseg.go index 6dad982..21efd1a 100755 --- a/posseg/posseg.go +++ b/posseg/posseg.go @@ -35,107 +35,102 @@ func (s Segment) Pos() string { } // Segmenter is a Chinese words segmentation struct. -type Segmenter struct { - dict *Dictionary -} +type Segmenter Dictionary // LoadDictionary loads dictionary from given file name. // Everytime LoadDictionaryAt is called, previously loaded dictionary will be cleard. -func (seg *Segmenter) LoadDictionary(file fs.File) error { - seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)} - return seg.dict.loadDictionary(file) +func LoadDictionary(file fs.File) (*Segmenter, error) { + dict := &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)} + err := dict.loadDictionary(file) + if err != nil { + return nil, err + } + return (*Segmenter)(dict), nil } // LoadDictionaryAt loads dictionary from given file name. // Everytime LoadDictionaryAt is called, previously loaded dictionary will be cleard. -func (seg *Segmenter) LoadDictionaryAt(fileName string) error { - seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)} - return seg.dict.loadDictionaryAt(fileName) +func LoadDictionaryAt(file string) (*Segmenter, error) { + dict := &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)} + err := dict.loadDictionaryAt(file) + if err != nil { + return nil, err + } + return (*Segmenter)(dict), nil } // LoadUserDictionary loads a user specified dictionary, it must be called // after LoadDictionary, and it will not clear any previous loaded dictionary, // instead it will override exist entries. func (seg *Segmenter) LoadUserDictionary(file fs.File) error { - return seg.dict.loadDictionary(file) + return (*Dictionary)(seg).loadDictionary(file) } // LoadUserDictionaryAt loads a user specified dictionary, it must be called // after LoadDictionary, and it will not clear any previous loaded dictionary, // instead it will override exist entries. func (seg *Segmenter) LoadUserDictionaryAt(fileName string) error { - return seg.dict.loadDictionaryAt(fileName) + return (*Dictionary)(seg).loadDictionaryAt(fileName) } -func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment { - result := make(chan Segment) - - go func() { - runes := []rune(sentence) - posList := viterbi(runes) - begin := 0 - next := 0 - for i, char := range runes { - pos := posList[i] - switch pos.position() { - case "B": - begin = i - case "E": - result <- Segment{string(runes[begin : i+1]), pos.pos()} - next = i + 1 - case "S": - result <- Segment{string(char), pos.pos()} - next = i + 1 - } +func (seg *Segmenter) cutDetailInternal(sentence string) (results []Segment) { + runes := []rune(sentence) + posList := viterbi(runes) + begin := 0 + next := 0 + for i, char := range runes { + pos := posList[i] + switch pos.position() { + case "B": + begin = i + case "E": + results = append(results, Segment{string(runes[begin : i+1]), pos.pos()}) + next = i + 1 + case "S": + results = append(results, Segment{string(char), pos.pos()}) + next = i + 1 } - if next < len(runes) { - result <- Segment{string(runes[next:]), posList[next].pos()} - } - close(result) - }() - return result + } + if next < len(runes) { + results = append(results, Segment{string(runes[next:]), posList[next].pos()}) + } + return } -func (seg *Segmenter) cutDetail(sentence string) <-chan Segment { - result := make(chan Segment) - go func() { - for _, blk := range util.RegexpSplit(reHanDetail, sentence, -1) { - if reHanDetail.MatchString(blk) { - for segment := range seg.cutDetailInternal(blk) { - result <- segment - } +func (seg *Segmenter) cutDetail(sentence string) (results []Segment) { + for _, blk := range util.RegexpSplit(reHanDetail, sentence, -1) { + if reHanDetail.MatchString(blk) { + results = append(results, seg.cutDetailInternal(blk)...) + continue + } + for _, x := range util.RegexpSplit(reSkipDetail, blk, -1) { + if len(x) == 0 { continue } - for _, x := range util.RegexpSplit(reSkipDetail, blk, -1) { - if len(x) == 0 { - continue - } - switch { - case reNum.MatchString(x): - result <- Segment{x, "m"} - case reEng.MatchString(x): - result <- Segment{x, "eng"} - default: - result <- Segment{x, "x"} - } + switch { + case reNum.MatchString(x): + results = append(results, Segment{x, "m"}) + case reEng.MatchString(x): + results = append(results, Segment{x, "eng"}) + default: + results = append(results, Segment{x, "x"}) } } - close(result) - }() - return result + } + return } -func (seg *Segmenter) dag(runes []rune) map[int][]int { - dag := make(map[int][]int) +func (seg *Segmenter) dag(runes []rune) [][]int { n := len(runes) + dag := make([][]int, n) var frag []rune var i int for k := 0; k < n; k++ { - dag[k] = make([]int, 0) + dag[k] = make([]int, 0, 64) i = k frag = runes[k : k+1] for { - freq, ok := seg.dict.Frequency(string(frag)) + freq, ok := (*Dictionary)(seg).Frequency(string(frag)) if !ok { break } @@ -160,20 +155,20 @@ type route struct { index int } -func (seg *Segmenter) calc(runes []rune) map[int]route { +func (seg *Segmenter) calc(runes []rune) []*route { dag := seg.dag(runes) n := len(runes) - rs := make(map[int]route) - rs[n] = route{frequency: 0.0, index: 0} - var r route + rs := make([]*route, n+1) + rs[n] = &route{frequency: 0.0, index: 0} + var r *route for idx := n - 1; idx >= 0; idx-- { for _, i := range dag[idx] { - if freq, ok := seg.dict.Frequency(string(runes[idx : i+1])); ok { - r = route{frequency: math.Log(freq) - seg.dict.logTotal + rs[i+1].frequency, index: i} + if freq, ok := (*Dictionary)(seg).Frequency(string(runes[idx : i+1])); ok { + r = &route{frequency: math.Log(freq) - (*Dictionary)(seg).logTotal + rs[i+1].frequency, index: i} } else { - r = route{frequency: math.Log(1.0) - seg.dict.logTotal + rs[i+1].frequency, index: i} + r = &route{frequency: math.Log(1.0) - (*Dictionary)(seg).logTotal + rs[i+1].frequency, index: i} } - if v, ok := rs[idx]; !ok { + if v := rs[idx]; v == nil { rs[idx] = r } else { if v.frequency < r.frequency || (v.frequency == r.frequency && v.index < r.index) { @@ -185,168 +180,139 @@ func (seg *Segmenter) calc(runes []rune) map[int]route { return rs } -type cutFunc func(sentence string) <-chan Segment - -func (seg *Segmenter) cutDAG(sentence string) <-chan Segment { - result := make(chan Segment) - - go func() { - runes := []rune(sentence) - routes := seg.calc(runes) - var y int - length := len(runes) - var buf []rune - for x := 0; x < length; { - y = routes[x].index + 1 - frag := runes[x:y] - if y-x == 1 { - buf = append(buf, frag...) - x = y - continue - } - if len(buf) > 0 { - bufString := string(buf) - if len(buf) == 1 { - if tag, ok := seg.dict.Pos(bufString); ok { - result <- Segment{bufString, tag} - } else { - result <- Segment{bufString, "x"} - } - buf = make([]rune, 0) - continue - } - if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 { - for t := range seg.cutDetail(bufString) { - result <- t - } - } else { - for _, elem := range buf { - selem := string(elem) - if tag, ok := seg.dict.Pos(selem); ok { - result <- Segment{selem, tag} - } else { - result <- Segment{selem, "x"} - } - - } - } - buf = make([]rune, 0) - } - word := string(frag) - if tag, ok := seg.dict.Pos(word); ok { - result <- Segment{word, tag} - } else { - result <- Segment{word, "x"} - } +func (seg *Segmenter) cutDAG(sentence string) (results []Segment) { + runes := []rune(sentence) + routes := seg.calc(runes) + buf := make([]rune, 0, 256) + for x := 0; x < len(runes); { + y := routes[x].index + 1 + frag := runes[x:y] + if y-x == 1 { + buf = append(buf, frag...) x = y + continue } - if len(buf) > 0 { bufString := string(buf) if len(buf) == 1 { - if tag, ok := seg.dict.Pos(bufString); ok { - result <- Segment{bufString, tag} + if tag, ok := (*Dictionary)(seg).Pos(bufString); ok { + results = append(results, Segment{bufString, tag}) } else { - result <- Segment{bufString, "x"} + results = append(results, Segment{bufString, "x"}) } - } else { - if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 { - for t := range seg.cutDetail(bufString) { - result <- t - } - } else { - for _, elem := range buf { - selem := string(elem) - if tag, ok := seg.dict.Pos(selem); ok { - result <- Segment{selem, tag} - } else { - result <- Segment{selem, "x"} - } - } - } - } - } - close(result) - }() - return result -} - -func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment { - result := make(chan Segment) - - go func() { - runes := []rune(sentence) - routes := seg.calc(runes) - var y int - length := len(runes) - var buf []rune - for x := 0; x < length; { - y = routes[x].index + 1 - frag := runes[x:y] - if reEng1.MatchString(string(frag)) && len(frag) == 1 { - buf = append(buf, frag...) - x = y + buf = buf[:0] continue } - if len(buf) > 0 { - result <- Segment{string(buf), "eng"} - buf = make([]rune, 0) - } - word := string(frag) - if tag, ok := seg.dict.Pos(word); ok { - result <- Segment{word, tag} + if v, ok := (*Dictionary)(seg).Frequency(bufString); !ok || v == 0.0 { + results = append(results, seg.cutDetail(bufString)...) } else { - result <- Segment{word, "x"} + for _, elem := range buf { + selem := string(elem) + if tag, ok := (*Dictionary)(seg).Pos(selem); ok { + results = append(results, Segment{selem, tag}) + } else { + results = append(results, Segment{selem, "x"}) + } + } } - x = y + buf = buf[:0] + } + word := string(frag) + if tag, ok := (*Dictionary)(seg).Pos(word); ok { + results = append(results, Segment{word, tag}) + } else { + results = append(results, Segment{word, "x"}) + } + x = y + } + if len(buf) > 0 { + bufString := string(buf) + if len(buf) == 1 { + if tag, ok := (*Dictionary)(seg).Pos(bufString); ok { + results = append(results, Segment{bufString, tag}) + } else { + results = append(results, Segment{bufString, "x"}) + } + return + } + if v, ok := (*Dictionary)(seg).Frequency(bufString); !ok || v == 0.0 { + results = append(results, seg.cutDetail(bufString)...) + return + } + for _, elem := range buf { + selem := string(elem) + if tag, ok := (*Dictionary)(seg).Pos(selem); ok { + results = append(results, Segment{selem, tag}) + } else { + results = append(results, Segment{selem, "x"}) + } + } + } + return +} + +func (seg *Segmenter) cutDAGNoHMM(sentence string) (results []Segment) { + runes := []rune(sentence) + routes := seg.calc(runes) + buf := make([]rune, 0, 256) + for x := 0; x < len(runes); { + y := routes[x].index + 1 + frag := runes[x:y] + if reEng1.MatchString(string(frag)) && len(frag) == 1 { + buf = append(buf, frag...) + x = y + continue } if len(buf) > 0 { - result <- Segment{string(buf), "eng"} - buf = make([]rune, 0) + results = append(results, Segment{string(buf), "eng"}) + buf = buf[:0] } - close(result) - }() - return result + word := string(frag) + if tag, ok := (*Dictionary)(seg).Pos(word); ok { + results = append(results, Segment{word, tag}) + } else { + results = append(results, Segment{word, "x"}) + } + x = y + } + if len(buf) > 0 { + results = append(results, Segment{string(buf), "eng"}) + } + return } // Cut cuts a sentence into words. // Parameter hmm controls whether to use the Hidden Markov Model. -func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan Segment { - result := make(chan Segment) - var cut cutFunc +func (seg *Segmenter) Cut(sentence string, hmm bool) (results []Segment) { + var cut func(sentence string) []Segment if hmm { cut = seg.cutDAG } else { cut = seg.cutDAGNoHMM } - go func() { - for _, blk := range util.RegexpSplit(reHanInternal, sentence, -1) { - if reHanInternal.MatchString(blk) { - for wordTag := range cut(blk) { - result <- wordTag - } + for _, blk := range util.RegexpSplit(reHanInternal, sentence, -1) { + if reHanInternal.MatchString(blk) { + results = append(results, cut(blk)...) + continue + } + for _, x := range util.RegexpSplit(reSkipInternal, blk, -1) { + if reSkipInternal.MatchString(x) { + results = append(results, Segment{x, "x"}) continue } - for _, x := range util.RegexpSplit(reSkipInternal, blk, -1) { - if reSkipInternal.MatchString(x) { - result <- Segment{x, "x"} - continue - } - for _, xx := range x { - s := string(xx) - switch { - case reNum.MatchString(s): - result <- Segment{s, "m"} - case reEng.MatchString(x): - result <- Segment{x, "eng"} - default: - result <- Segment{s, "x"} - } + for _, xx := range x { + s := string(xx) + switch { + case reNum.MatchString(s): + results = append(results, Segment{s, "m"}) + case reEng.MatchString(x): + results = append(results, Segment{x, "eng"}) + default: + results = append(results, Segment{s, "x"}) } } } - close(result) - }() - return result + } + return } diff --git a/posseg/posseg_test.go b/posseg/posseg_test.go index e307b68..e26633a 100755 --- a/posseg/posseg_test.go +++ b/posseg/posseg_test.go @@ -5,7 +5,7 @@ import ( ) var ( - seg Segmenter + seg, _ = LoadDictionaryAt("../dict.txt") testContents = []string{ "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", "我不喜欢日本和服。", @@ -268,21 +268,9 @@ var ( } ) -func init() { - seg.LoadDictionaryAt("../dict.txt") -} - -func chanToArray(ch <-chan Segment) []Segment { - var result []Segment - for word := range ch { - result = append(result, word) - } - return result -} - func TestCut(t *testing.T) { for index, content := range testContents { - result := chanToArray(seg.Cut(content, true)) + result := seg.Cut(content, true) if len(defaultCutResult[index]) != len(result) { t.Errorf("default cut for %s length should be %d not %d\n", content, len(defaultCutResult[index]), len(result)) @@ -294,7 +282,7 @@ func TestCut(t *testing.T) { t.Fatalf("expect %s, got %s", defaultCutResult[index][i], result[i]) } } - result = chanToArray(seg.Cut(content, false)) + result = seg.Cut(content, false) if len(noHMMCutResult[index]) != len(result) { t.Fatal(content) } @@ -316,7 +304,7 @@ func TestBug132(t *testing.T) { {"又", "d"}, {"啞", "v"}, } - result := chanToArray(seg.Cut(sentence, true)) + result := seg.Cut(sentence, true) if len(cutResult) != len(result) { t.Fatal(result) } @@ -345,7 +333,7 @@ func TestBug137(t *testing.T) { {"研究", "vn"}, {"組", "x"}, } - result := chanToArray(seg.Cut(sentence, true)) + result := seg.Cut(sentence, true) if len(cutResult) != len(result) { t.Fatal(result) } @@ -358,7 +346,9 @@ func TestBug137(t *testing.T) { func TestUserDict(t *testing.T) { seg.LoadUserDictionaryAt("../userdict.txt") - defer seg.LoadDictionaryAt("../dict.txt") + defer func() { + seg, _ = LoadDictionaryAt("../dict.txt") + }() sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型" cutResult := []Segment{ @@ -400,7 +390,7 @@ func TestUserDict(t *testing.T) { {"N", "eng"}, {"类型", "n"}} - result := chanToArray(seg.Cut(sentence, true)) + result := seg.Cut(sentence, true) if len(cutResult) != len(result) { t.Fatal(result) } @@ -415,7 +405,7 @@ func BenchmarkCutNoHMM(b *testing.B) { sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作" b.ResetTimer() for i := 0; i < b.N; i++ { - chanToArray(seg.Cut(sentence, false)) + seg.Cut(sentence, false) } } @@ -423,6 +413,6 @@ func BenchmarkCut(b *testing.B) { sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作" b.ResetTimer() for i := 0; i < b.N; i++ { - chanToArray(seg.Cut(sentence, true)) + seg.Cut(sentence, true) } }