Merge branch 'release/v0.3.2'

2026-06-21 11:20:31 +08:00 · 2015-05-08 16:35:21 +08:00
parent 3d91f615cf 6b75cef871
commit ab8b95ef87
3 changed files with 144 additions and 26 deletions
--- a/example_test.go
+++ b/example_test.go
@@ -35,6 +35,60 @@ func Example() {
 	// 【搜索引擎模式】： 小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / ， / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 /
 }
 // Example_suggestFrequency cuts three sentences before and after feeding the
 // value returned by SuggestFrequency back into the dictionary with AddWord,
 // printing the word's current and suggested frequency in between.
 func Example_suggestFrequency() {
 	var seg jiebago.Segmenter
 	seg.LoadDictionary("dict.txt")
 	// show drains a channel of words and prints them on one line, each
 	// followed by " /".
 	show := func(words <-chan string) {
 		for w := range words {
 			fmt.Printf(" %s /", w)
 		}
 		fmt.Println()
 	}
 	// Each step names the sentence to cut, the word whose frequency is
 	// adjusted, and the arguments handed to SuggestFrequency: the word itself
 	// when it should stay whole, or its parts when it should be split.
 	steps := []struct {
 		sentence string
 		word     string
 		parts    []string
 	}{
 		{"超敏C反应蛋白是什么？", "超敏C反应蛋白", []string{"超敏C反应蛋白"}},
 		{"如果放到post中将出错", "中将", []string{"中", "将"}},
 		{"今天天气不错", "今天天气", []string{"今天", "天气"}},
 	}
 	for _, step := range steps {
 		fmt.Print("Before:")
 		show(seg.Cut(step.sentence, false))
 		current, _ := seg.Frequency(step.word)
 		suggested := seg.SuggestFrequency(step.parts...)
 		fmt.Printf("%s current frequency: %f, suggest: %f.\n", step.word, current, suggested)
 		seg.AddWord(step.word, suggested)
 		fmt.Print("After:")
 		show(seg.Cut(step.sentence, false))
 	}
 	// Output:
 	// Before: 超敏 / C / 反应 / 蛋白 / 是 / 什么 / ？ /
 	// 超敏C反应蛋白 current frequency: 0.000000, suggest: 1.000000.
 	// After: 超敏C反应蛋白 / 是 / 什么 / ？ /
 	// Before: 如果 / 放到 / post / 中将 / 出错 /
 	// 中将 current frequency: 763.000000, suggest: 494.000000.
 	// After: 如果 / 放到 / post / 中 / 将 / 出错 /
 	// Before: 今天天气 / 不错 /
 	// 今天天气 current frequency: 3.000000, suggest: 0.000000.
 	// After: 今天 / 天气 / 不错 /
 }
 func Example_loadUserDictionary() {
 	var seg jiebago.Segmenter
 	seg.LoadDictionary("dict.txt")
--- a/jieba.go
+++ b/jieba.go
@@ -4,7 +4,9 @@ package jiebago
 import (
 	"math"
 	"regexp"
 	"strings"
 	"github.com/wangbin/jiebago/dictionary"
 	"github.com/wangbin/jiebago/finalseg"
 	"github.com/wangbin/jiebago/util"
 )
@@ -22,9 +24,71 @@ type Segmenter struct {
 	dict *Dictionary
 }
-// Dictionary returns segmenter's dictionary
+// Frequency returns a word's frequency and existence
-func (seg *Segmenter) Dictionary() *Dictionary {
+func (seg *Segmenter) Frequency(word string) (float64, bool) {
-	return seg.dict
+	return seg.dict.Frequency(word)
 }
 // AddWord registers word in the segmenter's dictionary with the given
 // frequency.
 func (seg *Segmenter) AddWord(word string, frequency float64) {
 	// The third NewToken argument is left empty, as no extra data is supplied.
 	token := dictionary.NewToken(word, frequency, "")
 	seg.dict.AddToken(token)
 }
 // DeleteWord removes a word from the dictionary. It does so by adding a token
 // for the word whose frequency is zero.
 func (seg *Segmenter) DeleteWord(word string) {
 	zeroed := dictionary.NewToken(word, 0.0, "")
 	seg.dict.AddToken(zeroed)
 }
 /*
 SuggestFrequency returns a suggested frequency for a word, or for a long word
 that should be cut into several shorter words.
 This method is useful when a word in a sentence is not cut out correctly.
 If a word should not be cut any further, for example the word "石墨烯" should
 not be cut into "石墨" and "烯", call SuggestFrequency("石墨烯"). The word is
 cut with the current dictionary, an estimate is computed from the frequencies
 of the resulting segments, and the result is the larger of that truncated
 estimate plus one and the word's current frequency (1 if the word is unknown).
 If a word should be cut further, for example the word "今天天气" should be cut
 into the two words "今天" and "天气", call SuggestFrequency("今天", "天气").
 The estimate is computed from the frequencies of the given parts, and the
 result is the smaller of that truncated estimate and the current frequency of
 the joined word "今天天气" (0 if the joined word is unknown).
 Calling SuggestFrequency with no arguments returns 0.
 */
 func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
 	// Without this guard words[0] below panics when no argument is given.
 	if len(words) == 0 {
 		return 0.0
 	}
 	frequency := 1.0
 	if len(words) > 1 {
 		// Multiply the relative frequencies of the parts; a part missing from
 		// the dictionary contributes a factor of 1/total.
 		for _, word := range words {
 			if freq, ok := seg.dict.Frequency(word); ok {
 				frequency *= freq
 			}
 			frequency /= seg.dict.total
 		}
 		// Scale back to an absolute frequency and drop the fractional part.
 		frequency = math.Trunc(frequency * seg.dict.total)
 		wordFreq := 0.0
 		if freq, ok := seg.dict.Frequency(strings.Join(words, "")); ok {
 			wordFreq = freq
 		}
 		// Never suggest more than the joined word currently has.
 		if wordFreq < frequency {
 			frequency = wordFreq
 		}
 	} else {
 		word := words[0]
 		// Same product as above, but over the segments the word is currently
 		// cut into.
 		for segment := range seg.Cut(word, false) {
 			if freq, ok := seg.dict.Frequency(segment); ok {
 				frequency *= freq
 			}
 			frequency /= seg.dict.total
 		}
 		// One more than the truncated estimate of the current segmentation.
 		frequency = math.Trunc(frequency*seg.dict.total) + 1.0
 		wordFreq := 1.0
 		if freq, ok := seg.dict.Frequency(word); ok {
 			wordFreq = freq
 		}
 		// Never suggest less than the word currently has.
 		if wordFreq > frequency {
 			frequency = wordFreq
 		}
 	}
 	return frequency
 }
 // LoadDictionary loads dictionary from given file name. Every time
@@ -175,14 +239,14 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string {
 			if reEng.MatchString(string(frag)) && len(frag) == 1 {
 				buf = append(buf, frag...)
 				x = y
-			} else {
+				continue
 				if len(buf) > 0 {
 					result <- string(buf)
 					buf = make([]rune, 0)
 				}
 				result <- string(frag)
 				x = y
 			}
 			if len(buf) > 0 {
 				result <- string(buf)
 				buf = make([]rune, 0)
 			}
 			result <- string(frag)
 			x = y
 		}
 		if len(buf) > 0 {
 			result <- string(buf)
--- a/tokenizers/tokenizer.go
+++ b/tokenizers/tokenizer.go
@@ -60,28 +60,28 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
 	pos := 1
 	var width int
 	var gram string
 	dict := jt.seg.Dictionary()
 	for word := range jt.seg.Cut(string(input), jt.hmm) {
 		if jt.searchMode {
 			runes := []rune(word)
 			width = len(runes)
 			for _, step := range [2]int{2, 3} {
-				if width > step {
+				if width <= step {
-					for i := 0; i < width-step+1; i++ {
+					continue
-						gram = string(runes[i : i+step])
+				}
-						gramLen := len(gram)
+				for i := 0; i < width-step+1; i++ {
-						if frequency, ok := dict.Frequency(gram); ok && frequency > 0 {
+					gram = string(runes[i : i+step])
-							gramStart := start + len(string(runes[:i]))
+					gramLen := len(gram)
-							token := analysis.Token{
+					if frequency, ok := jt.seg.Frequency(gram); ok && frequency > 0 {
-								Term:     []byte(gram),
+						gramStart := start + len(string(runes[:i]))
-								Start:    gramStart,
+						token := analysis.Token{
-								End:      gramStart + gramLen,
+							Term:     []byte(gram),
-								Position: pos,
+							Start:    gramStart,
-								Type:     detectTokenType(gram),
+							End:      gramStart + gramLen,
-							}
+							Position: pos,
-							rv = append(rv, &token)
+							Type:     detectTokenType(gram),
 							pos++
 						}
 						rv = append(rv, &token)
 						pos++
 					}
 				}
 			}