diff --git a/dictionary.go b/dictionary.go new file mode 100644 index 0000000..53e0b78 --- /dev/null +++ b/dictionary.go @@ -0,0 +1,15 @@ +package jiebago + +type Pair struct { + Word string + Flag string +} + +type Token struct { + *Pair + Freq float64 +} + +type DictLoader interface { + Add(*Token) +} diff --git a/jieba.go b/jieba.go index cad16e8..fbc2e83 100644 --- a/jieba.go +++ b/jieba.go @@ -49,7 +49,7 @@ func (rs routes) Swap(i, j int) { } // Build a directed acyclic graph (DAG) for sentence. -func DAG(sentence string) map[int][]int { +func (j *Jieba) DAG(sentence string) map[int][]int { dag := make(map[int][]int) runes := []rune(sentence) n := len(runes) @@ -60,7 +60,7 @@ func DAG(sentence string) map[int][]int { i = k frag = string(runes[k]) for { - if freq, ok := Trie.Freq[frag]; !ok { + if freq, ok := j.Freq[frag]; !ok { break } else { if freq > 0.0 { @@ -81,19 +81,19 @@ func DAG(sentence string) map[int][]int { return dag } -func Calc(sentence string, dag map[int][]int) map[int]*route { +func (j *Jieba) Calc(sentence string, dag map[int][]int) map[int]*route { runes := []rune(sentence) number := len(runes) rs := make(map[int]*route) rs[number] = &route{Freq: 0.0, Index: 0} - logTotal := math.Log(Trie.Total) + logTotal := math.Log(j.Total) for idx := number - 1; idx >= 0; idx-- { candidates := make(routes, 0) for _, i := range dag[idx] { word := string(runes[idx : i+1]) var r *route - if _, ok := Trie.Freq[word]; ok { - r = &route{Freq: math.Log(Trie.Freq[word]) - logTotal + rs[i+1].Freq, Index: i} + if _, ok := j.Freq[word]; ok { + r = &route{Freq: math.Log(j.Freq[word]) - logTotal + rs[i+1].Freq, Index: i} } else { r = &route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i} } @@ -107,11 +107,11 @@ func Calc(sentence string, dag map[int][]int) map[int]*route { type cutFunc func(sentence string) chan string -func cutDAG(sentence string) chan string { +func (j *Jieba) cutDAG(sentence string) chan string { result := make(chan string) go func() { - dag := DAG(sentence) - routes := Calc(sentence, dag) + dag := j.DAG(sentence) + routes := j.Calc(sentence, dag) x := 0 var y int runes := []rune(sentence) @@ -132,7 +132,7 @@ func cutDAG(sentence string) chan string { buf = make([]rune, 0) } else { bufString := string(buf) - if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 { + if v, ok := j.Freq[bufString]; !ok || v == 0.0 { for x := range finalseg.Cut(bufString) { result <- x } @@ -154,7 +154,7 @@ func cutDAG(sentence string) chan string { result <- string(buf) } else { bufString := string(buf) - if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 { + if v, ok := j.Freq[bufString]; !ok || v == 0.0 { for t := range finalseg.Cut(bufString) { result <- t } @@ -170,12 +170,12 @@ func cutDAG(sentence string) chan string { return result } -func cutDAGNoHMM(sentence string) chan string { +func (j *Jieba) cutDAGNoHMM(sentence string) chan string { result := make(chan string) go func() { - dag := DAG(sentence) - routes := Calc(sentence, dag) + dag := j.DAG(sentence) + routes := j.Calc(sentence, dag) x := 0 var y int runes := []rune(sentence) @@ -208,12 +208,12 @@ func cutDAGNoHMM(sentence string) chan string { return result } -func cutAll(sentence string) chan string { +func (j *Jieba) cutAll(sentence string) chan string { result := make(chan string) go func() { runes := []rune(sentence) - dag := DAG(sentence) + dag := j.DAG(sentence) old_j := -1 ks := make([]int, 0) for k := range dag { @@ -251,7 +251,7 @@ which is suitable for text analysis. HMM contols whether to use the Hidden Markov Mode. */ -func Cut(sentence string, isCutAll bool, HMM bool) chan string { +func (j *Jieba) Cut(sentence string, isCutAll bool, HMM bool) chan string { result := make(chan string) go func() { var reHan, reSkip *regexp.Regexp @@ -264,12 +264,12 @@ func Cut(sentence string, isCutAll bool, HMM bool) chan string { } var cut cutFunc if HMM { - cut = cutDAG + cut = j.cutDAG } else { - cut = cutDAGNoHMM + cut = j.cutDAGNoHMM } if isCutAll { - cut = cutAll + cut = j.cutAll } for blk := range RegexpSplit(reHan, sentence) { if len(blk) == 0 { @@ -320,17 +320,17 @@ func Cut(sentence string, isCutAll bool, HMM bool) chan string { // Cut sentence using Search Engine Mode, based on the Accurate Mode, attempts // to cut long words into several short words, which can raise the recall rate. // Suitable for search engines. -func CutForSearch(sentence string, hmm bool) chan string { +func (j *Jieba) CutForSearch(sentence string, hmm bool) chan string { result := make(chan string) go func() { - for word := range Cut(sentence, false, hmm) { + for word := range j.Cut(sentence, false, hmm) { runes := []rune(word) for _, increment := range []int{2, 3} { if len(runes) > increment { var gram2 string for i := 0; i < len(runes)-increment+1; i++ { gram2 = string(runes[i : i+increment]) - if v, ok := Trie.Freq[gram2]; ok && v > 0.0 { + if v, ok := j.Freq[gram2]; ok && v > 0.0 { result <- gram2 } } diff --git a/jieba_test.go b/jieba_test.go index 9d161c1..f1786a8 100644 --- a/jieba_test.go +++ b/jieba_test.go @@ -617,10 +617,6 @@ var ( } ) -func init() { - SetDictionary("dict.txt") -} - func chanToArray(ch chan string) []string { result := make([]string, 0) for word := range ch { @@ -630,14 +626,18 @@ func chanToArray(ch chan string) []string { } func TestCutDAG(t *testing.T) { - result := chanToArray(cutDAG("BP神经网络如何训练才能在分类时增加区分度?")) + j, _ := NewJieba("dict.txt") + + result := chanToArray(j.cutDAG("BP神经网络如何训练才能在分类时增加区分度?")) if len(result) != 11 { t.Error(result) } } func TestCutDAGNoHmm(t *testing.T) { - result := chanToArray(cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度?")) + j, _ := NewJieba("dict.txt") + + result := chanToArray(j.cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度?")) if len(result) != 11 { t.Error(result) } @@ -657,9 +657,11 @@ func TestRegexpSplit(t *testing.T) { } func TestDefaultCut(t *testing.T) { + j, _ := NewJieba("dict.txt") + var result []string for index, content := range test_contents { - result = chanToArray(Cut(content, false, true)) + result = chanToArray(j.Cut(content, false, true)) if len(result) != len(defaultCutResult[index]) { t.Errorf("default cut for %s length should be %d not %d\n", content, len(defaultCutResult[index]), len(result)) @@ -673,9 +675,11 @@ func TestDefaultCut(t *testing.T) { } func TestCutAll(t *testing.T) { + j, _ := NewJieba("dict.txt") + var result []string for index, content := range test_contents { - result = chanToArray(Cut(content, true, true)) + result = chanToArray(j.Cut(content, true, true)) if len(result) != len(cutAllResult[index]) { t.Errorf("cut all for %s length should be %d not %d\n", content, len(cutAllResult[index]), len(result)) @@ -689,9 +693,11 @@ func TestCutAll(t *testing.T) { } func TestDefaultCutNoHMM(t *testing.T) { + j, _ := NewJieba("dict.txt") + var result []string for index, content := range test_contents { - result = chanToArray(Cut(content, false, false)) + result = chanToArray(j.Cut(content, false, false)) if len(result) != len(defaultCutNoHMMResult[index]) { t.Errorf("default cut no hmm for %s length should be %d not %d\n", content, len(defaultCutNoHMMResult[index]), len(result)) @@ -705,9 +711,11 @@ func TestDefaultCutNoHMM(t *testing.T) { } func TestCutForSearch(t *testing.T) { + j, _ := NewJieba("dict.txt") + var result []string for index, content := range test_contents { - result = chanToArray(CutForSearch(content, true)) + result = chanToArray(j.CutForSearch(content, true)) if len(result) != len(cutForSearchResult[index]) { t.Errorf("cut for search for %s length should be %d not %d\n", content, len(cutForSearchResult[index]), len(result)) @@ -719,7 +727,7 @@ func TestCutForSearch(t *testing.T) { } } for index, content := range test_contents { - result = chanToArray(CutForSearch(content, false)) + result = chanToArray(j.CutForSearch(content, false)) if len(result) != len(cutForSearchNoHMMResult[index]) { t.Errorf("cut for search no hmm for %s length should be %d not %d\n", content, len(cutForSearchNoHMMResult[index]), len(result)) @@ -734,9 +742,9 @@ func TestCutForSearch(t *testing.T) { func TestSetdictionary(t *testing.T) { var result []string - SetDictionary("foobar.txt") + j, _ := NewJieba("foobar.txt") for index, content := range test_contents { - result = chanToArray(Cut(content, false, true)) + result = chanToArray(j.Cut(content, false, true)) if len(result) != len(userDictCutResult[index]) { t.Errorf("default cut with user dictionary for %s length should be %d not %d\n", content, len(userDictCutResult[index]), len(result)) @@ -750,13 +758,13 @@ func TestSetdictionary(t *testing.T) { } func TestLoadUserDict(t *testing.T) { - SetDictionary("dict.txt") - LoadUserDict("userdict.txt") + j, _ := NewJieba("dict.txt") + j.LoadUserDict("userdict.txt") sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型" result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", ",", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"} - words := chanToArray(Cut(sentence, false, true)) + words := chanToArray(j.Cut(sentence, false, true)) if len(words) != len(result) { t.Error(len(words)) } @@ -768,7 +776,7 @@ func TestLoadUserDict(t *testing.T) { sentence = "easy_install is great" result = []string{"easy_install", " ", "is", " ", "great"} - words = chanToArray(Cut(sentence, false, true)) + words = chanToArray(j.Cut(sentence, false, true)) if len(words) != len(result) { t.Error(len(words)) } @@ -780,7 +788,7 @@ func TestLoadUserDict(t *testing.T) { sentence = "python 的正则表达式是好用的" result = []string{"python", " ", "的", "正则表达式", "是", "好用", "的"} - words = chanToArray(Cut(sentence, false, true)) + words = chanToArray(j.Cut(sentence, false, true)) if len(words) != len(result) { t.Error(words) t.Error(result) diff --git a/posseg/posseg.go b/posseg/posseg.go index cc0ddcb..1995b87 100644 --- a/posseg/posseg.go +++ b/posseg/posseg.go @@ -3,10 +3,10 @@ package posseg import ( "github.com/wangbin/jiebago" "regexp" + "strings" ) var ( - wordTagMap = make(map[string]string) reHanDetail = regexp.MustCompile(`\p{Han}+`) reSkipDetail = regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`) reEng = regexp.MustCompile(`[[:alnum:]]`) @@ -20,26 +20,48 @@ type WordTag struct { Word, Tag string } +type Posseg struct { + *jiebago.Jieba + Flag map[string]string +} + +func (p *Posseg) Add(wtf *jiebago.WordTagFreq) { + if len(wtf.Tag) > 0 { + p.Flag[wtf.Word] = strings.TrimSpace(wtf.Tag) + } + p.AddWord(wtf) +} + // Set dictionary, it could be absolute path of dictionary file, or dictionary // name in current diectory. -func SetDictionary(dictFileName string) error { - err := jiebago.SetDictionary(dictFileName) - if err != nil { - return err - } +func NewPosseg(dictFileName string) (*Posseg, error) { + j := &jiebago.Jieba{Total: 0.0, Freq: make(map[string]float64)} + p := &Posseg{j, make(map[string]string)} dictFilePath, err := jiebago.DictPath(dictFileName) if err != nil { - return err + return nil, err } wtfs, err := jiebago.ParseDictFile(dictFilePath) for _, wtf := range wtfs { - wordTagMap[wtf.Word] = wtf.Tag + p.Add(wtf) + } + return p, nil +} + +// Load user specified dictionary file. +func (p *Posseg) LoadUserDict(dictFilePath string) error { + wtfs, err := jiebago.ParseDictFile(dictFilePath) + if err != nil { + return err + } + for _, wtf := range wtfs { + p.Add(wtf) } return nil } -func cutDetailInternal(sentence string) chan WordTag { +func (p *Posseg) cutDetailInternal(sentence string) chan WordTag { result := make(chan WordTag) go func() { @@ -68,13 +90,13 @@ func cutDetailInternal(sentence string) chan WordTag { return result } -func cutDetail(sentence string) chan WordTag { +func (p *Posseg) cutDetail(sentence string) chan WordTag { result := make(chan WordTag) go func() { for blk := range jiebago.RegexpSplit(reHanDetail, sentence) { if reHanDetail.MatchString(blk) { - for wordTag := range cutDetailInternal(blk) { + for wordTag := range p.cutDetailInternal(blk) { result <- wordTag } } else { @@ -100,12 +122,12 @@ func cutDetail(sentence string) chan WordTag { type cutFunc func(sentence string) chan WordTag -func cutDAG(sentence string) chan WordTag { +func (p *Posseg) cutDAG(sentence string) chan WordTag { result := make(chan WordTag) go func() { - dag := jiebago.DAG(sentence) - routes := jiebago.Calc(sentence, dag) + dag := p.DAG(sentence) + routes := p.Calc(sentence, dag) x := 0 var y int runes := []rune(sentence) @@ -123,7 +145,7 @@ func cutDAG(sentence string) chan WordTag { if len(buf) > 0 { if len(buf) == 1 { sbuf := string(buf) - if tag, ok := wordTagMap[sbuf]; ok { + if tag, ok := p.Flag[sbuf]; ok { result <- WordTag{sbuf, tag} } else { result <- WordTag{sbuf, "x"} @@ -131,14 +153,14 @@ func cutDAG(sentence string) chan WordTag { buf = make([]rune, 0) } else { bufString := string(buf) - if v, ok := jiebago.Trie.Freq[bufString]; !ok || v == 0.0 { - for t := range cutDetail(bufString) { + if v, ok := p.Freq[bufString]; !ok || v == 0.0 { + for t := range p.cutDetail(bufString) { result <- t } } else { for _, elem := range buf { selem := string(elem) - if tag, ok := wordTagMap[selem]; ok { + if tag, ok := p.Flag[selem]; ok { result <- WordTag{string(elem), tag} } else { result <- WordTag{string(elem), "x"} @@ -150,7 +172,7 @@ func cutDAG(sentence string) chan WordTag { } } sl_word := string(l_word) - if tag, ok := wordTagMap[sl_word]; ok { + if tag, ok := p.Flag[sl_word]; ok { result <- WordTag{sl_word, tag} } else { result <- WordTag{sl_word, "x"} @@ -162,21 +184,21 @@ func cutDAG(sentence string) chan WordTag { if len(buf) > 0 { if len(buf) == 1 { sbuf := string(buf) - if tag, ok := wordTagMap[sbuf]; ok { + if tag, ok := p.Flag[sbuf]; ok { result <- WordTag{sbuf, tag} } else { result <- WordTag{sbuf, "x"} } } else { bufString := string(buf) - if v, ok := jiebago.Trie.Freq[bufString]; !ok || v == 0.0 { - for t := range cutDetail(bufString) { + if v, ok := p.Freq[bufString]; !ok || v == 0.0 { + for t := range p.cutDetail(bufString) { result <- t } } else { for _, elem := range buf { selem := string(elem) - if tag, ok := wordTagMap[selem]; ok { + if tag, ok := p.Flag[selem]; ok { result <- WordTag{selem, tag} } else { result <- WordTag{selem, "x"} @@ -190,12 +212,12 @@ func cutDAG(sentence string) chan WordTag { return result } -func cutDAGNoHMM(sentence string) chan WordTag { +func (p *Posseg) cutDAGNoHMM(sentence string) chan WordTag { result := make(chan WordTag) go func() { - dag := jiebago.DAG(sentence) - routes := jiebago.Calc(sentence, dag) + dag := p.DAG(sentence) + routes := p.Calc(sentence, dag) x := 0 var y int runes := []rune(sentence) @@ -216,7 +238,7 @@ func cutDAGNoHMM(sentence string) chan WordTag { buf = make([]rune, 0) } sl_word := string(l_word) - if tag, ok := wordTagMap[sl_word]; ok { + if tag, ok := p.Flag[sl_word]; ok { result <- WordTag{sl_word, tag} } else { result <- WordTag{sl_word, "x"} @@ -235,17 +257,13 @@ func cutDAGNoHMM(sentence string) chan WordTag { // Tags the POS of each word after segmentation, using labels compatible with // ictclas. -func Cut(sentence string, HMM bool) chan WordTag { - for key := range jiebago.UserWordTagTab { - wordTagMap[key] = jiebago.UserWordTagTab[key] - delete(jiebago.UserWordTagTab, key) - } +func (p *Posseg) Cut(sentence string, HMM bool) chan WordTag { result := make(chan WordTag) var cut cutFunc if HMM { - cut = cutDAG + cut = p.cutDAG } else { - cut = cutDAGNoHMM + cut = p.cutDAGNoHMM } go func() { for blk := range jiebago.RegexpSplit(reHanInternal, sentence) { diff --git a/posseg/posseg_test.go b/posseg/posseg_test.go index 5771d7e..a1884d8 100644 --- a/posseg/posseg_test.go +++ b/posseg/posseg_test.go @@ -1,7 +1,6 @@ package posseg import ( - "github.com/wangbin/jiebago" "testing" ) @@ -277,18 +276,21 @@ func chanToArray(ch chan WordTag) []WordTag { } func TestCut(t *testing.T) { - SetDictionary("../dict.txt") + p, err := NewPosseg("../dict.txt") + if err != nil { + t.Fatal(err) + } for index, content := range test_contents { - result := chanToArray(Cut(content, true)) + result := chanToArray(p.Cut(content, true)) if len(defaultCutResult[index]) != len(result) { t.Error(content) } for i, _ := range result { if result[i] != defaultCutResult[index][i] { - t.Error(content) + t.Errorf("expect %s, got %s", defaultCutResult[index][i], result[i]) } } - result = chanToArray(Cut(content, false)) + result = chanToArray(p.Cut(content, false)) if len(noHMMCutResult[index]) != len(result) { t.Error(content) } @@ -305,7 +307,7 @@ func TestBug132(t *testing.T) { /* https://github.com/fxsjy/jieba/issues/132 */ - SetDictionary("../dict.txt") + p, _ := NewPosseg("../dict.txt") sentence := "又跛又啞" cutResult := []WordTag{ WordTag{"又", "d"}, @@ -313,7 +315,7 @@ func TestBug132(t *testing.T) { WordTag{"又", "d"}, WordTag{"啞", "v"}, } - result := chanToArray(Cut(sentence, true)) + result := chanToArray(p.Cut(sentence, true)) if len(cutResult) != len(result) { t.Error(result) } @@ -328,7 +330,7 @@ func TestBug137(t *testing.T) { /* https://github.com/fxsjy/jieba/issues/137 */ - SetDictionary("../dict.txt") + p, _ := NewPosseg("../dict.txt") sentence := "前港督衛奕信在八八年十月宣布成立中央政策研究組" cutResult := []WordTag{ WordTag{"前", "f"}, @@ -345,7 +347,7 @@ func TestBug137(t *testing.T) { WordTag{"研究", "vn"}, WordTag{"組", "x"}, } - result := chanToArray(Cut(sentence, true)) + result := chanToArray(p.Cut(sentence, true)) if len(cutResult) != len(result) { t.Error(result) } @@ -357,8 +359,8 @@ func TestBug137(t *testing.T) { } func TestUserDict(t *testing.T) { - SetDictionary("../dict.txt") - jiebago.LoadUserDict("../userdict.txt") + p, _ := NewPosseg("../dict.txt") + p.LoadUserDict("../userdict.txt") sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型" cutResult := []WordTag{ @@ -400,7 +402,7 @@ func TestUserDict(t *testing.T) { WordTag{"N", "eng"}, WordTag{"类型", "n"}} - result := chanToArray(Cut(sentence, true)) + result := chanToArray(p.Cut(sentence, true)) if len(cutResult) != len(result) { t.Error(result) } diff --git a/trie.go b/trie.go index 679f1f4..bf8920f 100644 --- a/trie.go +++ b/trie.go @@ -7,18 +7,14 @@ import ( "log" "os" "path/filepath" - "strings" ) -// Trie store the total frequency and map of all words and their frequenciesb -var Trie *trie - -type trie struct { +type Jieba struct { Total float64 Freq map[string]float64 } -func (t *trie) load(dictFileName string) error { +func (j *Jieba) load(dictFileName string) error { dictFilePath, err := DictPath(dictFileName) if err != nil { return err @@ -55,7 +51,7 @@ func (t *trie) load(dictFileName string) error { if isDictCached { dec := gob.NewDecoder(cacheFile) - err = dec.Decode(&t) + err = dec.Decode(&j) if err != nil { isDictCached = false } else { @@ -70,7 +66,7 @@ func (t *trie) load(dictFileName string) error { } for _, wtf := range wtfs { - t.addWord(wtf) + j.AddWord(wtf) } // dump trie cacheFile, err = os.OpenFile(cacheFilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) @@ -79,7 +75,7 @@ func (t *trie) load(dictFileName string) error { } defer cacheFile.Close() enc := gob.NewEncoder(cacheFile) - err = enc.Encode(t) + err = enc.Encode(j) if err != nil { return err } else { @@ -89,30 +85,27 @@ func (t *trie) load(dictFileName string) error { return nil } -func (t *trie) addWord(wtf *WordTagFreq) { - t.Freq[wtf.Word] = wtf.Freq - t.Total += wtf.Freq +func (j *Jieba) AddWord(wtf *WordTagFreq) { + j.Freq[wtf.Word] = wtf.Freq + j.Total += wtf.Freq runes := []rune(wtf.Word) count := len(runes) for i := 0; i < count; i++ { wfrag := string(runes[0 : i+1]) - if _, ok := t.Freq[wfrag]; !ok { - t.Freq[wfrag] = 0.0 + if _, ok := j.Freq[wfrag]; !ok { + j.Freq[wfrag] = 0.0 } } } // Load user specified dictionary file. -func LoadUserDict(dictFilePath string) error { +func (j *Jieba) LoadUserDict(dictFilePath string) error { wtfs, err := ParseDictFile(dictFilePath) if err != nil { return err } for _, wtf := range wtfs { - if len(wtf.Tag) > 0 { - UserWordTagTab[wtf.Word] = strings.TrimSpace(wtf.Tag) - } - Trie.addWord(wtf) + j.AddWord(wtf) } return nil } @@ -120,7 +113,8 @@ func LoadUserDict(dictFilePath string) error { // Set the dictionary, could be absolute path of dictionary file, or dictionary // name in current directory. This function must be called before cut any // sentence. -func SetDictionary(dictFileName string) error { - Trie = &trie{Total: 0.0, Freq: make(map[string]float64)} - return Trie.load(dictFileName) +func NewJieba(dictFileName string) (*Jieba, error) { + j := &Jieba{Total: 0.0, Freq: make(map[string]float64)} + err := j.load(dictFileName) + return j, err }