Added AddWord/DeleteWord/SuggestFrequency functions; this corresponds to jieba commit #59aa8b69b1399569ea6b417280c993da703baba8

2026-07-17 02:40:23 +08:00 · 2015-05-08 11:57:46 +08:00
parent 3d91f615cf
commit c48eb5b4a7
2 changed files with 67 additions and 19 deletions
--- a/jieba.go
+++ b/jieba.go
@@ -4,7 +4,9 @@ package jiebago
 import (
 	"math"
 	"regexp"
+	"strings"

+	"github.com/wangbin/jiebago/dictionary"
 	"github.com/wangbin/jiebago/finalseg"
 	"github.com/wangbin/jiebago/util"
 )
@@ -22,9 +24,55 @@ type Segmenter struct {
 	dict *Dictionary
 }

-// Dictionary returns segmenter's dictionary
-func (seg *Segmenter) Dictionary() *Dictionary {
-	return seg.dict
+// Frequency returns a word's frequency and existence
+func (seg *Segmenter) Frequency(word string) (float64, bool) {
+	return seg.dict.Frequency(word)
+}
+
+// AddWord adds a new word with frequency to dictionary
+func (seg *Segmenter) AddWord(word string, frequency float64) {
+	seg.dict.AddToken(dictionary.NewToken(word, frequency, ""))
+}
+
+// DeleteWord removes a word from dictionary by re-adding it with zero frequency
+func (seg *Segmenter) DeleteWord(word string) {
+	seg.dict.AddToken(dictionary.NewToken(word, 0.0, ""))
+}
+
+func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
+	frequency := 1.0
+	if len(words) > 1 {
+		for _, word := range words {
+			if freq, ok := seg.dict.Frequency(word); ok {
+				frequency *= freq
+			}
+			frequency /= seg.dict.total
+		}
+		wordFreq := 0.0
+		if freq, ok := seg.dict.Frequency(strings.Join(words, "")); ok {
+			wordFreq = freq
+		}
+		if wordFreq < frequency {
+			frequency = wordFreq
+		}
+	} else {
+		word := words[0]
+		for segment := range seg.Cut(word, false) {
+			if freq, ok := seg.dict.Frequency(segment); ok {
+				frequency *= freq
+			}
+			frequency /= seg.dict.total
+		}
+		frequency = frequency*seg.dict.total + 1
+		wordFreq := 1.0
+		if freq, ok := seg.dict.Frequency(word); ok {
+			wordFreq = freq
+		}
+		if wordFreq > frequency {
+			frequency = wordFreq
+		}
+	}
+	return frequency
 }

 // LoadDictionary loads dictionary from given file name. Everytime
--- a/tokenizers/tokenizer.go
+++ b/tokenizers/tokenizer.go
@@ -60,28 +60,28 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
 	pos := 1
 	var width int
 	var gram string
-	dict := jt.seg.Dictionary()
 	for word := range jt.seg.Cut(string(input), jt.hmm) {
 		if jt.searchMode {
 			runes := []rune(word)
 			width = len(runes)
 			for _, step := range [2]int{2, 3} {
-				if width > step {
-					for i := 0; i < width-step+1; i++ {
-						gram = string(runes[i : i+step])
-						gramLen := len(gram)
-						if frequency, ok := dict.Frequency(gram); ok && frequency > 0 {
-							gramStart := start + len(string(runes[:i]))
-							token := analysis.Token{
-								Term:     []byte(gram),
-								Start:    gramStart,
-								End:      gramStart + gramLen,
-								Position: pos,
-								Type:     detectTokenType(gram),
-							}
-							rv = append(rv, &token)
-							pos++
+				if width <= step {
+					continue
+				}
+				for i := 0; i < width-step+1; i++ {
+					gram = string(runes[i : i+step])
+					gramLen := len(gram)
+					if frequency, ok := jt.seg.Frequency(gram); ok && frequency > 0 {
+						gramStart := start + len(string(runes[:i]))
+						token := analysis.Token{
+							Term:     []byte(gram),
+							Start:    gramStart,
+							End:      gramStart + gramLen,
+							Position: pos,
+							Type:     detectTokenType(gram),
 						}
+						rv = append(rv, &token)
+						pos++
 					}
 				}
 			}