Merge branch 'release/v0.3.2'

2026-07-17 02:40:23 +08:00 · 2015-05-08 16:35:21 +08:00
parent 3d91f615cf 6b75cef871
commit ab8b95ef87
3 changed files with 144 additions and 26 deletions
--- a/example_test.go
+++ b/example_test.go
@@ -35,6 +35,60 @@ func Example() {
 	// 【搜索引擎模式】： 小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / ， / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 /
 }

+func Example_suggestFrequency() {
+	var seg jiebago.Segmenter
+	seg.LoadDictionary("dict.txt")
+
+	print := func(ch <-chan string) {
+		for word := range ch {
+			fmt.Printf(" %s /", word)
+		}
+		fmt.Println()
+	}
+	sentence := "超敏C反应蛋白是什么？"
+	fmt.Print("Before:")
+	print(seg.Cut(sentence, false))
+	word := "超敏C反应蛋白"
+	oldFrequency, _ := seg.Frequency(word)
+	frequency := seg.SuggestFrequency(word)
+	fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency)
+	seg.AddWord(word, frequency)
+	fmt.Print("After:")
+	print(seg.Cut(sentence, false))
+
+	sentence = "如果放到post中将出错"
+	fmt.Print("Before:")
+	print(seg.Cut(sentence, false))
+	word = "中将"
+	oldFrequency, _ = seg.Frequency(word)
+	frequency = seg.SuggestFrequency("中", "将")
+	fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency)
+	seg.AddWord(word, frequency)
+	fmt.Print("After:")
+	print(seg.Cut(sentence, false))
+
+	sentence = "今天天气不错"
+	fmt.Print("Before:")
+	print(seg.Cut(sentence, false))
+	word = "今天天气"
+	oldFrequency, _ = seg.Frequency(word)
+	frequency = seg.SuggestFrequency("今天", "天气")
+	fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency)
+	seg.AddWord(word, frequency)
+	fmt.Print("After:")
+	print(seg.Cut(sentence, false))
+	// Output:
+	// Before: 超敏 / C / 反应 / 蛋白 / 是 / 什么 / ？ /
+	// 超敏C反应蛋白 current frequency: 0.000000, suggest: 1.000000.
+	// After: 超敏C反应蛋白 / 是 / 什么 / ？ /
+	// Before: 如果 / 放到 / post / 中将 / 出错 /
+	// 中将 current frequency: 763.000000, suggest: 494.000000.
+	// After: 如果 / 放到 / post / 中 / 将 / 出错 /
+	// Before: 今天天气 / 不错 /
+	// 今天天气 current frequency: 3.000000, suggest: 0.000000.
+	// After: 今天 / 天气 / 不错 /
+}
+
 func Example_loadUserDictionary() {
 	var seg jiebago.Segmenter
 	seg.LoadDictionary("dict.txt")
--- a/jieba.go
+++ b/jieba.go
@@ -4,7 +4,9 @@ package jiebago
 import (
 	"math"
 	"regexp"
+	"strings"

+	"github.com/wangbin/jiebago/dictionary"
 	"github.com/wangbin/jiebago/finalseg"
 	"github.com/wangbin/jiebago/util"
 )
@@ -22,9 +24,71 @@ type Segmenter struct {
 	dict *Dictionary
 }

-// Dictionary returns segmenter's dictionary
-func (seg *Segmenter) Dictionary() *Dictionary {
-	return seg.dict
+// Frequency returns a word's frequency and existence
+func (seg *Segmenter) Frequency(word string) (float64, bool) {
+	return seg.dict.Frequency(word)
+}
+
+// AddWord adds a new word with frequency to dictionary
+func (seg *Segmenter) AddWord(word string, frequency float64) {
+	seg.dict.AddToken(dictionary.NewToken(word, frequency, ""))
+}
+
+// DeleteWord removes a word from dictionary
+func (seg *Segmenter) DeleteWord(word string) {
+	seg.dict.AddToken(dictionary.NewToken(word, 0.0, ""))
+}
+
+/*
+SuggestFrequency returns a suggested frequency of a word or a long word
+cut into several short words.
+
+This method is useful when a word in the sentence is not cut out correctly.
+
+If a word should not be further cut, for example word "石墨烯" should not be
+cut into "石墨" and "烯", SuggestFrequency("石墨烯") will return the maximum
+frequency for this word.
+
+If a word should be further cut, for example word "今天天气" should be
+further cut into two words "今天" and "天气", SuggestFrequency("今天", "天气")
+should return the minimum frequency for word "今天天气".
+*/
+func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
+	frequency := 1.0
+	if len(words) > 1 {
+		for _, word := range words {
+			if freq, ok := seg.dict.Frequency(word); ok {
+				frequency *= freq
+			}
+			frequency /= seg.dict.total
+		}
+		frequency, _ = math.Modf(frequency * seg.dict.total)
+		wordFreq := 0.0
+		if freq, ok := seg.dict.Frequency(strings.Join(words, "")); ok {
+			wordFreq = freq
+		}
+		if wordFreq < frequency {
+			frequency = wordFreq
+		}
+	} else {
+		word := words[0]
+		for segment := range seg.Cut(word, false) {
+			if freq, ok := seg.dict.Frequency(segment); ok {
+				frequency *= freq
+			}
+			frequency /= seg.dict.total
+		}
+		frequency, _ = math.Modf(frequency * seg.dict.total)
+		frequency += 1.0
+		wordFreq := 1.0
+		if freq, ok := seg.dict.Frequency(word); ok {
+			wordFreq = freq
+		}
+		if wordFreq > frequency {
+			frequency = wordFreq
+		}
+	}
+	return frequency
 }

 // LoadDictionary loads dictionary from given file name. Everytime
@@ -175,14 +239,14 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string {
 			if reEng.MatchString(string(frag)) && len(frag) == 1 {
 				buf = append(buf, frag...)
 				x = y
-			} else {
-				if len(buf) > 0 {
-					result <- string(buf)
-					buf = make([]rune, 0)
-				}
-				result <- string(frag)
-				x = y
+				continue
 			}
+			if len(buf) > 0 {
+				result <- string(buf)
+				buf = make([]rune, 0)
+			}
+			result <- string(frag)
+			x = y
 		}
 		if len(buf) > 0 {
 			result <- string(buf)
--- a/tokenizers/tokenizer.go
+++ b/tokenizers/tokenizer.go
@@ -60,28 +60,28 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
 	pos := 1
 	var width int
 	var gram string
-	dict := jt.seg.Dictionary()
 	for word := range jt.seg.Cut(string(input), jt.hmm) {
 		if jt.searchMode {
 			runes := []rune(word)
 			width = len(runes)
 			for _, step := range [2]int{2, 3} {
-				if width > step {
-					for i := 0; i < width-step+1; i++ {
-						gram = string(runes[i : i+step])
-						gramLen := len(gram)
-						if frequency, ok := dict.Frequency(gram); ok && frequency > 0 {
-							gramStart := start + len(string(runes[:i]))
-							token := analysis.Token{
-								Term:     []byte(gram),
-								Start:    gramStart,
-								End:      gramStart + gramLen,
-								Position: pos,
-								Type:     detectTokenType(gram),
-							}
-							rv = append(rv, &token)
-							pos++
+				if width <= step {
+					continue
+				}
+				for i := 0; i < width-step+1; i++ {
+					gram = string(runes[i : i+step])
+					gramLen := len(gram)
+					if frequency, ok := jt.seg.Frequency(gram); ok && frequency > 0 {
+						gramStart := start + len(string(runes[:i]))
+						token := analysis.Token{
+							Term:     []byte(gram),
+							Start:    gramStart,
+							End:      gramStart + gramLen,
+							Position: pos,
+							Type:     detectTokenType(gram),
 						}
+						rv = append(rv, &token)
+						pos++
 					}
 				}
 			}