Added AddWord/DeleteWord/SuggestFrequency functions; this corresponds to jieba commit #59aa8b69b1399569ea6b417280c993da703baba8

2026-06-18 00:50:26 +08:00 · 2015-05-08 11:57:46 +08:00
parent 3d91f615cf
commit c48eb5b4a7
2 changed files with 67 additions and 19 deletions
--- a/jieba.go
+++ b/jieba.go
@@ -4,7 +4,9 @@ package jiebago
 import (
 	"math"
 	"regexp"
 	"strings"
 	"github.com/wangbin/jiebago/dictionary"
 	"github.com/wangbin/jiebago/finalseg"
 	"github.com/wangbin/jiebago/util"
 )
@@ -22,9 +24,55 @@ type Segmenter struct {
 	dict *Dictionary
 }
-// Dictionary returns segmenter's dictionary
+// Frequency returns a word's frequency and existence
-func (seg *Segmenter) Dictionary() *Dictionary {
+func (seg *Segmenter) Frequency(word string) (float64, bool) {
-	return seg.dict
+	return seg.dict.Frequency(word)
 }
 // AddWord registers word in the segmenter's dictionary with the given
 // frequency. The token is created with an empty third argument.
 func (seg *Segmenter) AddWord(word string, frequency float64) {
 	token := dictionary.NewToken(word, frequency, "")
 	seg.dict.AddToken(token)
 }
 // DeleteWord removes word from the segmenter's dictionary. It does so by
 // adding a token for word whose frequency is 0.0 rather than by erasing
 // the entry.
 func (seg *Segmenter) DeleteWord(word string) {
 	zeroed := dictionary.NewToken(word, 0.0, "")
 	seg.dict.AddToken(zeroed)
 }
 // SuggestFrequency computes a suggested dictionary frequency from the
 // frequencies of the given words, following jieba's suggest_freq.
 //
 // With no arguments it returns 0.
 //
 // With exactly one argument, the word is cut with HMM disabled; the
 // frequency of each segment divided by the dictionary total is multiplied
 // together (a segment missing from the dictionary contributes 1/total).
 // The product is scaled back by the total, truncated to a whole count and
 // incremented by one. The result is the larger of that value and the
 // word's own frequency (1 if the word is not in the dictionary).
 //
 // With two or more arguments, the same product is computed over the
 // arguments themselves, scaled by the total and truncated. The result is
 // the smaller of that value and the frequency of the concatenated
 // arguments (0 if the concatenation is not in the dictionary).
 func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
 	// Nothing to estimate; this also guards the words[0] access below.
 	if len(words) == 0 {
 		return 0.0
 	}
 	frequency := 1.0
 	if len(words) > 1 {
 		for _, word := range words {
 			if freq, ok := seg.dict.Frequency(word); ok {
 				frequency *= freq
 			}
 			frequency /= seg.dict.total
 		}
 		// Convert the probability product back into an occurrence count so
 		// that it is comparable with the dictionary frequency below.
 		frequency = math.Trunc(frequency * seg.dict.total)
 		wordFreq := 0.0
 		if freq, ok := seg.dict.Frequency(strings.Join(words, "")); ok {
 			wordFreq = freq
 		}
 		if wordFreq < frequency {
 			frequency = wordFreq
 		}
 		return frequency
 	}
 	word := words[0]
 	for segment := range seg.Cut(word, false) {
 		if freq, ok := seg.dict.Frequency(segment); ok {
 			frequency *= freq
 		}
 		frequency /= seg.dict.total
 	}
 	// One more than the truncated count of the segmented reading.
 	frequency = math.Trunc(frequency*seg.dict.total) + 1
 	wordFreq := 1.0
 	if freq, ok := seg.dict.Frequency(word); ok {
 		wordFreq = freq
 	}
 	if wordFreq > frequency {
 		frequency = wordFreq
 	}
 	return frequency
 }
 // LoadDictionary loads dictionary from given file name. Everytime
--- a/tokenizers/tokenizer.go
+++ b/tokenizers/tokenizer.go
@@ -60,28 +60,28 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
 	pos := 1
 	var width int
 	var gram string
 	dict := jt.seg.Dictionary()
 	for word := range jt.seg.Cut(string(input), jt.hmm) {
 		if jt.searchMode {
 			runes := []rune(word)
 			width = len(runes)
 			for _, step := range [2]int{2, 3} {
-				if width > step {
+				if width <= step {
-					for i := 0; i < width-step+1; i++ {
+					continue
-						gram = string(runes[i : i+step])
+				}
-						gramLen := len(gram)
+				for i := 0; i < width-step+1; i++ {
-						if frequency, ok := dict.Frequency(gram); ok && frequency > 0 {
+					gram = string(runes[i : i+step])
-							gramStart := start + len(string(runes[:i]))
+					gramLen := len(gram)
-							token := analysis.Token{
+					if frequency, ok := jt.seg.Frequency(gram); ok && frequency > 0 {
-								Term:     []byte(gram),
+						gramStart := start + len(string(runes[:i]))
-								Start:    gramStart,
+						token := analysis.Token{
-								End:      gramStart + gramLen,
+							Term:     []byte(gram),
-								Position: pos,
+							Start:    gramStart,
-								Type:     detectTokenType(gram),
+							End:      gramStart + gramLen,
-							}
+							Position: pos,
-							rv = append(rv, &token)
+							Type:     detectTokenType(gram),
 							pos++
 						}
 						rv = append(rv, &token)
 						pos++
 					}
 				}
 			}