diff --git a/example_test.go b/example_test.go
index d29c3fe..62c5e7a 100644
--- a/example_test.go
+++ b/example_test.go
@@ -35,6 +35,60 @@ func Example() {
 	// 【搜索引擎模式】: 小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / , / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 /
 }
 
+func Example_suggestFrequency() {
+	var seg jiebago.Segmenter
+	seg.LoadDictionary("dict.txt")
+
+	print := func(ch <-chan string) {
+		for word := range ch {
+			fmt.Printf(" %s /", word)
+		}
+		fmt.Println()
+	}
+	sentence := "超敏C反应蛋白是什么?"
+	fmt.Print("Before:")
+	print(seg.Cut(sentence, false))
+	word := "超敏C反应蛋白"
+	oldFrequency, _ := seg.Frequency(word)
+	frequency := seg.SuggestFrequency(word)
+	fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency)
+	seg.AddWord(word, frequency)
+	fmt.Print("After:")
+	print(seg.Cut(sentence, false))
+
+	sentence = "如果放到post中将出错"
+	fmt.Print("Before:")
+	print(seg.Cut(sentence, false))
+	word = "中将"
+	oldFrequency, _ = seg.Frequency(word)
+	frequency = seg.SuggestFrequency("中", "将")
+	fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency)
+	seg.AddWord(word, frequency)
+	fmt.Print("After:")
+	print(seg.Cut(sentence, false))
+
+	sentence = "今天天气不错"
+	fmt.Print("Before:")
+	print(seg.Cut(sentence, false))
+	word = "今天天气"
+	oldFrequency, _ = seg.Frequency(word)
+	frequency = seg.SuggestFrequency("今天", "天气")
+	fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency)
+	seg.AddWord(word, frequency)
+	fmt.Print("After:")
+	print(seg.Cut(sentence, false))
+	// Output:
+	// Before: 超敏 / C / 反应 / 蛋白 / 是 / 什么 / ? /
+	// 超敏C反应蛋白 current frequency: 0.000000, suggest: 1.000000.
+	// After: 超敏C反应蛋白 / 是 / 什么 / ? /
+	// Before: 如果 / 放到 / post / 中将 / 出错 /
+	// 中将 current frequency: 763.000000, suggest: 494.000000.
+	// After: 如果 / 放到 / post / 中 / 将 / 出错 /
+	// Before: 今天天气 / 不错 /
+	// 今天天气 current frequency: 3.000000, suggest: 0.000000.
+	// After: 今天 / 天气 / 不错 /
+}
+
 func Example_loadUserDictionary() {
 	var seg jiebago.Segmenter
 	seg.LoadDictionary("dict.txt")
diff --git a/jieba.go b/jieba.go
index f81f2ab..efa04c1 100644
--- a/jieba.go
+++ b/jieba.go
@@ -4,7 +4,9 @@ package jiebago
 import (
 	"math"
 	"regexp"
+	"strings"
 
+	"github.com/wangbin/jiebago/dictionary"
 	"github.com/wangbin/jiebago/finalseg"
 	"github.com/wangbin/jiebago/util"
 )
@@ -22,9 +24,83 @@ type Segmenter struct {
 	dict *Dictionary
 }
 
-// Dictionary returns segmenter's dictionary
-func (seg *Segmenter) Dictionary() *Dictionary {
-	return seg.dict
+// Frequency returns a word's frequency and existence
+func (seg *Segmenter) Frequency(word string) (float64, bool) {
+	return seg.dict.Frequency(word)
+}
+
+// AddWord adds a new word with frequency to dictionary
+func (seg *Segmenter) AddWord(word string, frequency float64) {
+	seg.dict.AddToken(dictionary.NewToken(word, frequency, ""))
+}
+
+// DeleteWord removes a word from dictionary by re-adding it with a frequency of 0
+func (seg *Segmenter) DeleteWord(word string) {
+	seg.dict.AddToken(dictionary.NewToken(word, 0.0, ""))
+}
+
+/*
+SuggestFrequency returns a suggested frequency for a word, or for a long word
+that should be cut into several short words.
+
+This method is useful when a word in the sentence is not cut out correctly.
+
+If a word should not be cut any further, for example the word "石墨烯" should
+not be cut into "石墨" and "烯", SuggestFrequency("石墨烯") returns a frequency
+one above the combined frequency of the pieces it is currently cut into, or the
+word's existing frequency if that is larger.
+
+If a word should be cut further, for example the word "今天天气" should be cut
+into the two words "今天" and "天气", SuggestFrequency("今天", "天气") returns
+the combined frequency of the pieces, or the existing frequency of "今天天气"
+(0 if it is not in the dictionary) if that is smaller.
+
+SuggestFrequency returns 0 when called without any words.
+*/
+func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
+	if len(words) == 0 {
+		// No word to evaluate; indexing words[0] below would panic.
+		return 0.0
+	}
+	frequency := 1.0
+	if len(words) > 1 {
+		// Pieces the dictionary does not report leave the product unchanged.
+		for _, word := range words {
+			if freq, ok := seg.dict.Frequency(word); ok {
+				frequency *= freq
+			}
+			frequency /= seg.dict.total
+		}
+		// Keep only the integer part of the scaled product.
+		frequency, _ = math.Modf(frequency * seg.dict.total)
+		wordFreq := 0.0
+		if freq, ok := seg.dict.Frequency(strings.Join(words, "")); ok {
+			wordFreq = freq
+		}
+		if wordFreq < frequency {
+			frequency = wordFreq
+		}
+	} else {
+		word := words[0]
+		// Pieces the dictionary does not report leave the product unchanged.
+		for segment := range seg.Cut(word, false) {
+			if freq, ok := seg.dict.Frequency(segment); ok {
+				frequency *= freq
+			}
+			frequency /= seg.dict.total
+		}
+		// Keep only the integer part of the scaled product, then go one above it.
+		frequency, _ = math.Modf(frequency * seg.dict.total)
+		frequency += 1.0
+		wordFreq := 1.0
+		if freq, ok := seg.dict.Frequency(word); ok {
+			wordFreq = freq
+		}
+		if wordFreq > frequency {
+			frequency = wordFreq
+		}
+	}
+	return frequency
 }
 
 // LoadDictionary loads dictionary from given file name. Everytime
@@ -175,14 +251,14 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string {
 			if reEng.MatchString(string(frag)) && len(frag) == 1 {
 				buf = append(buf, frag...)
 				x = y
-			} else {
-				if len(buf) > 0 {
-					result <- string(buf)
-					buf = make([]rune, 0)
-				}
-				result <- string(frag)
-				x = y
+				continue
 			}
+			if len(buf) > 0 {
+				result <- string(buf)
+				buf = make([]rune, 0)
+			}
+			result <- string(frag)
+			x = y
 		}
 		if len(buf) > 0 {
 			result <- string(buf)
diff --git a/tokenizers/tokenizer.go b/tokenizers/tokenizer.go
index f1bdcb6..6311b5d 100644
--- a/tokenizers/tokenizer.go
+++ b/tokenizers/tokenizer.go
@@ -60,28 +60,28 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
 	pos := 1
 	var width int
 	var gram string
-	dict := jt.seg.Dictionary()
 	for word := range jt.seg.Cut(string(input), jt.hmm) {
 		if jt.searchMode {
 			runes := []rune(word)
 			width = len(runes)
 			for _, step := range [2]int{2, 3} {
-				if width > step {
-					for i := 0; i < width-step+1; i++ {
-						gram = string(runes[i : i+step])
-						gramLen := len(gram)
-						if frequency, ok := dict.Frequency(gram); ok && frequency > 0 {
-							gramStart := start + len(string(runes[:i]))
-							token := analysis.Token{
-								Term:     []byte(gram),
-								Start:    gramStart,
-								End:      gramStart + gramLen,
-								Position: pos,
-								Type:     detectTokenType(gram),
-							}
-							rv = append(rv, &token)
-							pos++
+				if width <= step {
+					continue
+				}
+				for i := 0; i < width-step+1; i++ {
+					gram = string(runes[i : i+step])
+					gramLen := len(gram)
+					if frequency, ok := jt.seg.Frequency(gram); ok && frequency > 0 {
+						gramStart := start + len(string(runes[:i]))
+						token := analysis.Token{
+							Term:     []byte(gram),
+							Start:    gramStart,
+							End:      gramStart + gramLen,
+							Position: pos,
+							Type:     detectTokenType(gram),
 						}
+						rv = append(rv, &token)
+						pos++
 					}
 				}
 			}