From c48eb5b4a7f4f68bec9b24b4f8a7c166b8ea844e Mon Sep 17 00:00:00 2001
From: Wang Bin
Date: Fri, 8 May 2015 11:57:46 +0800
Subject: [PATCH] added AddWord/DeleteWord/SuggestFrequency functions, this is corresponding to jieba commit #59aa8b69b1399569ea6b417280c993da703baba8

---
 jieba.go                | 75 +++++++++++++++++++++++++++++++++++++++--
 tokenizers/tokenizer.go | 32 +++++++++---------
 2 files changed, 88 insertions(+), 19 deletions(-)

diff --git a/jieba.go b/jieba.go
index f81f2ab..3011a00 100644
--- a/jieba.go
+++ b/jieba.go
@@ -4,7 +4,9 @@ package jiebago
 import (
 	"math"
 	"regexp"
+	"strings"
 
+	"github.com/wangbin/jiebago/dictionary"
 	"github.com/wangbin/jiebago/finalseg"
 	"github.com/wangbin/jiebago/util"
 )
@@ -22,9 +24,76 @@ type Segmenter struct {
 	dict *Dictionary
 }
 
-// Dictionary returns segmenter's dictionary
-func (seg *Segmenter) Dictionary() *Dictionary {
-	return seg.dict
+// Frequency returns a word's frequency and existence
+func (seg *Segmenter) Frequency(word string) (float64, bool) {
+	return seg.dict.Frequency(word)
+}
+
+// AddWord adds a new word with frequency to dictionary
+func (seg *Segmenter) AddWord(word string, frequency float64) {
+	seg.dict.AddToken(dictionary.NewToken(word, frequency, ""))
+}
+
+// DeleteWord removes a word from dictionary by setting its frequency to 0
+func (seg *Segmenter) DeleteWord(word string) {
+	seg.dict.AddToken(dictionary.NewToken(word, 0.0, ""))
+}
+
+// SuggestFrequency returns a suggested frequency for a word, or for a long
+// word that should be cut into the given shorter words.
+//
+// Given a single word, it multiplies the probabilities (frequency / total,
+// with 1 as the frequency of unknown segments) of the segments produced by
+// Cut(word, false), scales the product back by total, truncates it and adds
+// one. The result is the larger of that value and the word's own frequency
+// (1 if the word is unknown).
+//
+// Given several words, the same product is computed over the given words,
+// scaled and truncated, and the result is the smaller of that value and the
+// frequency of the joined word (0 if the joined word is unknown).
+//
+// Given no words, it returns 0.
+func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
+	// Guard the words[0] access below against an empty argument list.
+	if len(words) == 0 {
+		return 0.0
+	}
+	frequency := 1.0
+	if len(words) > 1 {
+		for _, word := range words {
+			if freq, ok := seg.dict.Frequency(word); ok {
+				frequency *= freq
+			}
+			frequency /= seg.dict.total
+		}
+		// Scale the probability product back to a frequency so that it is
+		// comparable with the joined word's dictionary frequency.
+		frequency = math.Trunc(frequency * seg.dict.total)
+		wordFreq := 0.0
+		if freq, ok := seg.dict.Frequency(strings.Join(words, "")); ok {
+			wordFreq = freq
+		}
+		if wordFreq < frequency {
+			frequency = wordFreq
+		}
+	} else {
+		word := words[0]
+		for segment := range seg.Cut(word, false) {
+			if freq, ok := seg.dict.Frequency(segment); ok {
+				frequency *= freq
+			}
+			frequency /= seg.dict.total
+		}
+		frequency = math.Trunc(frequency*seg.dict.total) + 1
+		wordFreq := 1.0
+		if freq, ok := seg.dict.Frequency(word); ok {
+			wordFreq = freq
+		}
+		if wordFreq > frequency {
+			frequency = wordFreq
+		}
+	}
+	return frequency
 }
 
 // LoadDictionary loads dictionary from given file name. Everytime
diff --git a/tokenizers/tokenizer.go b/tokenizers/tokenizer.go
index f1bdcb6..6311b5d 100644
--- a/tokenizers/tokenizer.go
+++ b/tokenizers/tokenizer.go
@@ -60,28 +60,28 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
 	pos := 1
 	var width int
 	var gram string
-	dict := jt.seg.Dictionary()
 	for word := range jt.seg.Cut(string(input), jt.hmm) {
 		if jt.searchMode {
 			runes := []rune(word)
 			width = len(runes)
 			for _, step := range [2]int{2, 3} {
-				if width > step {
-					for i := 0; i < width-step+1; i++ {
-						gram = string(runes[i : i+step])
-						gramLen := len(gram)
-						if frequency, ok := dict.Frequency(gram); ok && frequency > 0 {
-							gramStart := start + len(string(runes[:i]))
-							token := analysis.Token{
-								Term:     []byte(gram),
-								Start:    gramStart,
-								End:      gramStart + gramLen,
-								Position: pos,
-								Type:     detectTokenType(gram),
-							}
-							rv = append(rv, &token)
-							pos++
+				if width <= step {
+					continue
+				}
+				for i := 0; i < width-step+1; i++ {
+					gram = string(runes[i : i+step])
+					gramLen := len(gram)
+					if frequency, ok := jt.seg.Frequency(gram); ok && frequency > 0 {
+						gramStart := start + len(string(runes[:i]))
+						token := analysis.Token{
+							Term:     []byte(gram),
+							Start:    gramStart,
+							End:      gramStart + gramLen,
+							Position: pos,
+							Type:     detectTokenType(gram),
 						}
+						rv = append(rv, &token)
+						pos++
 					}
 				}
 			}