1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00

tweak SuggestFrequency, added example

This commit is contained in:
Wang Bin
2015-05-08 16:34:28 +08:00
parent c48eb5b4a7
commit 6b75cef871
2 changed files with 79 additions and 9 deletions

View File

@@ -35,6 +35,60 @@ func Example() {
// 【搜索引擎模式】: 小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 /
}
// Example_suggestFrequency walks through three sentences. For each one it
// prints the segmentation before any adjustment, the current and suggested
// frequency of a word, and the segmentation again after the suggested
// frequency has been stored with AddWord.
func Example_suggestFrequency() {
	var seg jiebago.Segmenter
	seg.LoadDictionary("dict.txt")

	// dump drains the channel and prints every segment on a single line.
	dump := func(segments <-chan string) {
		for segment := range segments {
			fmt.Printf(" %s /", segment)
		}
		fmt.Println()
	}

	// round performs one before/after cycle. word is the dictionary entry
	// whose frequency is looked up and then overwritten; parts are the
	// arguments handed to SuggestFrequency (either the word itself, or the
	// shorter pieces it is passed as).
	round := func(sentence, word string, parts ...string) {
		fmt.Print("Before:")
		dump(seg.Cut(sentence, false))

		current, _ := seg.Frequency(word)
		suggested := seg.SuggestFrequency(parts...)
		fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, current, suggested)
		seg.AddWord(word, suggested)

		fmt.Print("After:")
		dump(seg.Cut(sentence, false))
	}

	round("超敏C反应蛋白是什么", "超敏C反应蛋白", "超敏C反应蛋白")
	round("如果放到post中将出错", "中将", "中", "将")
	round("今天天气不错", "今天天气", "今天", "天气")
	// Output:
	// Before: 超敏 / C / 反应 / 蛋白 / 是 / 什么 / /
	// 超敏C反应蛋白 current frequency: 0.000000, suggest: 1.000000.
	// After: 超敏C反应蛋白 / 是 / 什么 / /
	// Before: 如果 / 放到 / post / 中将 / 出错 /
	// 中将 current frequency: 763.000000, suggest: 494.000000.
	// After: 如果 / 放到 / post / 中 / 将 / 出错 /
	// Before: 今天天气 / 不错 /
	// 今天天气 current frequency: 3.000000, suggest: 0.000000.
	// After: 今天 / 天气 / 不错 /
}
func Example_loadUserDictionary() {
var seg jiebago.Segmenter
seg.LoadDictionary("dict.txt")

View File

@@ -34,11 +34,25 @@ func (seg *Segmenter) AddWord(word string, frequency float64) {
seg.dict.AddToken(dictionary.NewToken(word, frequency, ""))
}
// Delete removes a word from dictionary
// DeleteWord removes a word from the dictionary. It does this by adding a
// token for the word whose frequency is 0.0 and whose third argument is empty.
func (seg *Segmenter) DeleteWord(word string) {
	zeroed := dictionary.NewToken(word, 0.0, "")
	seg.dict.AddToken(zeroed)
}
/*
SuggestFrequency returns a suggested frequency of a word or a long word
cut into several short words.
This method is useful when a word in the sentence is not cut out correctly.
If a word should not be further cut, for example word "石墨烯" should not be
cut into "石墨" and "烯", SuggestFrequency("石墨烯") will return the maximum
frequency for this word.
If a word should be further cut, for example word "今天天气" should be
further cut into two words "今天" and "天气", SuggestFrequency("今天", "天气")
will return the minimum frequency for word "今天天气".
*/
func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
frequency := 1.0
if len(words) > 1 {
@@ -48,6 +62,7 @@ func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
}
frequency /= seg.dict.total
}
frequency, _ = math.Modf(frequency * seg.dict.total)
wordFreq := 0.0
if freq, ok := seg.dict.Frequency(strings.Join(words, "")); ok {
wordFreq = freq
@@ -63,7 +78,8 @@ func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
}
frequency /= seg.dict.total
}
frequency = frequency*seg.dict.total + 1
frequency, _ = math.Modf(frequency * seg.dict.total)
frequency += 1.0
wordFreq := 1.0
if freq, ok := seg.dict.Frequency(word); ok {
wordFreq = freq
@@ -223,14 +239,14 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string {
if reEng.MatchString(string(frag)) && len(frag) == 1 {
buf = append(buf, frag...)
x = y
} else {
if len(buf) > 0 {
result <- string(buf)
buf = make([]rune, 0)
}
result <- string(frag)
x = y
continue
}
if len(buf) > 0 {
result <- string(buf)
buf = make([]rune, 0)
}
result <- string(frag)
x = y
}
if len(buf) > 0 {
result <- string(buf)