1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-18 00:50:26 +08:00

added AddWord/DeleteWord/SuggestFrequency functions; this corresponds to jieba commit #59aa8b69b1399569ea6b417280c993da703baba8

This commit is contained in:
Wang Bin
2015-05-08 11:57:46 +08:00
parent 3d91f615cf
commit c48eb5b4a7
2 changed files with 67 additions and 19 deletions

View File

@@ -4,7 +4,9 @@ package jiebago
import ( import (
"math" "math"
"regexp" "regexp"
"strings"
"github.com/wangbin/jiebago/dictionary"
"github.com/wangbin/jiebago/finalseg" "github.com/wangbin/jiebago/finalseg"
"github.com/wangbin/jiebago/util" "github.com/wangbin/jiebago/util"
) )
@@ -22,9 +24,55 @@ type Segmenter struct {
dict *Dictionary dict *Dictionary
} }
// Dictionary returns segmenter's dictionary // Frequency returns a word's frequency and existence
func (seg *Segmenter) Dictionary() *Dictionary { func (seg *Segmenter) Frequency(word string) (float64, bool) {
return seg.dict return seg.dict.Frequency(word)
}
// AddWord registers word in the segmenter's dictionary with the given
// frequency. If the dictionary already tracks the word, the new token is
// handed to the dictionary's AddToken, which decides how to merge it.
func (seg *Segmenter) AddWord(word string, frequency float64) {
	// The third NewToken argument is left empty, as in the original call.
	token := dictionary.NewToken(word, frequency, "")
	seg.dict.AddToken(token)
}
// DeleteWord removes word from the dictionary. It does so by storing a
// token for the word whose frequency is zero rather than by erasing the
// entry, so callers that test for a positive frequency treat it as absent.
func (seg *Segmenter) DeleteWord(word string) {
	const deletedFrequency = 0.0
	tombstone := dictionary.NewToken(word, deletedFrequency, "")
	seg.dict.AddToken(tombstone)
}
func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
frequency := 1.0
if len(words) > 1 {
for _, word := range words {
if freq, ok := seg.dict.Frequency(word); ok {
frequency *= freq
}
frequency /= seg.dict.total
}
wordFreq := 0.0
if freq, ok := seg.dict.Frequency(strings.Join(words, "")); ok {
wordFreq = freq
}
if wordFreq < frequency {
frequency = wordFreq
}
} else {
word := words[0]
for segment := range seg.Cut(word, false) {
if freq, ok := seg.dict.Frequency(segment); ok {
frequency *= freq
}
frequency /= seg.dict.total
}
frequency = frequency*seg.dict.total + 1
wordFreq := 1.0
if freq, ok := seg.dict.Frequency(word); ok {
wordFreq = freq
}
if wordFreq > frequency {
frequency = wordFreq
}
}
return frequency
} }
// LoadDictionary loads dictionary from given file name. Everytime // LoadDictionary loads dictionary from given file name. Everytime

View File

@@ -60,28 +60,28 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
pos := 1 pos := 1
var width int var width int
var gram string var gram string
dict := jt.seg.Dictionary()
for word := range jt.seg.Cut(string(input), jt.hmm) { for word := range jt.seg.Cut(string(input), jt.hmm) {
if jt.searchMode { if jt.searchMode {
runes := []rune(word) runes := []rune(word)
width = len(runes) width = len(runes)
for _, step := range [2]int{2, 3} { for _, step := range [2]int{2, 3} {
if width > step { if width <= step {
for i := 0; i < width-step+1; i++ { continue
gram = string(runes[i : i+step]) }
gramLen := len(gram) for i := 0; i < width-step+1; i++ {
if frequency, ok := dict.Frequency(gram); ok && frequency > 0 { gram = string(runes[i : i+step])
gramStart := start + len(string(runes[:i])) gramLen := len(gram)
token := analysis.Token{ if frequency, ok := jt.seg.Frequency(gram); ok && frequency > 0 {
Term: []byte(gram), gramStart := start + len(string(runes[:i]))
Start: gramStart, token := analysis.Token{
End: gramStart + gramLen, Term: []byte(gram),
Position: pos, Start: gramStart,
Type: detectTokenType(gram), End: gramStart + gramLen,
} Position: pos,
rv = append(rv, &token) Type: detectTokenType(gram),
pos++
} }
rv = append(rv, &token)
pos++
} }
} }
} }