mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-21 11:20:31 +08:00
Merge branch 'release/v0.3.2'
This commit is contained in:
@@ -35,6 +35,60 @@ func Example() {
|
|||||||
// 【搜索引擎模式】: 小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / , / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 /
|
// 【搜索引擎模式】: 小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / , / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 /
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func Example_suggestFrequency() {
|
||||||
|
var seg jiebago.Segmenter
|
||||||
|
seg.LoadDictionary("dict.txt")
|
||||||
|
|
||||||
|
print := func(ch <-chan string) {
|
||||||
|
for word := range ch {
|
||||||
|
fmt.Printf(" %s /", word)
|
||||||
|
}
|
||||||
|
fmt.Println()
|
||||||
|
}
|
||||||
|
sentence := "超敏C反应蛋白是什么?"
|
||||||
|
fmt.Print("Before:")
|
||||||
|
print(seg.Cut(sentence, false))
|
||||||
|
word := "超敏C反应蛋白"
|
||||||
|
oldFrequency, _ := seg.Frequency(word)
|
||||||
|
frequency := seg.SuggestFrequency(word)
|
||||||
|
fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency)
|
||||||
|
seg.AddWord(word, frequency)
|
||||||
|
fmt.Print("After:")
|
||||||
|
print(seg.Cut(sentence, false))
|
||||||
|
|
||||||
|
sentence = "如果放到post中将出错"
|
||||||
|
fmt.Print("Before:")
|
||||||
|
print(seg.Cut(sentence, false))
|
||||||
|
word = "中将"
|
||||||
|
oldFrequency, _ = seg.Frequency(word)
|
||||||
|
frequency = seg.SuggestFrequency("中", "将")
|
||||||
|
fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency)
|
||||||
|
seg.AddWord(word, frequency)
|
||||||
|
fmt.Print("After:")
|
||||||
|
print(seg.Cut(sentence, false))
|
||||||
|
|
||||||
|
sentence = "今天天气不错"
|
||||||
|
fmt.Print("Before:")
|
||||||
|
print(seg.Cut(sentence, false))
|
||||||
|
word = "今天天气"
|
||||||
|
oldFrequency, _ = seg.Frequency(word)
|
||||||
|
frequency = seg.SuggestFrequency("今天", "天气")
|
||||||
|
fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency)
|
||||||
|
seg.AddWord(word, frequency)
|
||||||
|
fmt.Print("After:")
|
||||||
|
print(seg.Cut(sentence, false))
|
||||||
|
// Output:
|
||||||
|
// Before: 超敏 / C / 反应 / 蛋白 / 是 / 什么 / ? /
|
||||||
|
// 超敏C反应蛋白 current frequency: 0.000000, suggest: 1.000000.
|
||||||
|
// After: 超敏C反应蛋白 / 是 / 什么 / ? /
|
||||||
|
// Before: 如果 / 放到 / post / 中将 / 出错 /
|
||||||
|
// 中将 current frequency: 763.000000, suggest: 494.000000.
|
||||||
|
// After: 如果 / 放到 / post / 中 / 将 / 出错 /
|
||||||
|
// Before: 今天天气 / 不错 /
|
||||||
|
// 今天天气 current frequency: 3.000000, suggest: 0.000000.
|
||||||
|
// After: 今天 / 天气 / 不错 /
|
||||||
|
}
|
||||||
|
|
||||||
func Example_loadUserDictionary() {
|
func Example_loadUserDictionary() {
|
||||||
var seg jiebago.Segmenter
|
var seg jiebago.Segmenter
|
||||||
seg.LoadDictionary("dict.txt")
|
seg.LoadDictionary("dict.txt")
|
||||||
|
|||||||
84
jieba.go
84
jieba.go
@@ -4,7 +4,9 @@ package jiebago
|
|||||||
import (
|
import (
|
||||||
"math"
|
"math"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/wangbin/jiebago/dictionary"
|
||||||
"github.com/wangbin/jiebago/finalseg"
|
"github.com/wangbin/jiebago/finalseg"
|
||||||
"github.com/wangbin/jiebago/util"
|
"github.com/wangbin/jiebago/util"
|
||||||
)
|
)
|
||||||
@@ -22,9 +24,71 @@ type Segmenter struct {
|
|||||||
dict *Dictionary
|
dict *Dictionary
|
||||||
}
|
}
|
||||||
|
|
||||||
// Dictionary returns segmenter's dictionary
|
// Frequency returns a word's frequency and existence
|
||||||
func (seg *Segmenter) Dictionary() *Dictionary {
|
func (seg *Segmenter) Frequency(word string) (float64, bool) {
|
||||||
return seg.dict
|
return seg.dict.Frequency(word)
|
||||||
|
}
|
||||||
|
|
||||||
|
// AddWord adds a new word with frequency to dictionary
|
||||||
|
func (seg *Segmenter) AddWord(word string, frequency float64) {
|
||||||
|
seg.dict.AddToken(dictionary.NewToken(word, frequency, ""))
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeleteWord removes a word from dictionary
|
||||||
|
func (seg *Segmenter) DeleteWord(word string) {
|
||||||
|
seg.dict.AddToken(dictionary.NewToken(word, 0.0, ""))
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
SuggestFrequency returns a suggested frequency of a word or a long word
|
||||||
|
cut into several short words.
|
||||||
|
|
||||||
|
This method is useful when a word in the sentence is not cut out correctly.
|
||||||
|
|
||||||
|
If a word should not be cut further, for example word "石墨烯" should not be
|
||||||
|
cut into "石墨" and "烯", SuggestFrequency("石墨烯") will return the maximum
|
||||||
|
frequency for this word.
|
||||||
|
|
||||||
|
If a word should be cut further, for example word "今天天气" should be
|
||||||
|
further cut into two words "今天" and "天气", SuggestFrequency("今天", "天气")
|
||||||
|
should return the minimum frequency for word "今天天气".
|
||||||
|
*/
|
||||||
|
func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
|
||||||
|
frequency := 1.0
|
||||||
|
if len(words) > 1 {
|
||||||
|
for _, word := range words {
|
||||||
|
if freq, ok := seg.dict.Frequency(word); ok {
|
||||||
|
frequency *= freq
|
||||||
|
}
|
||||||
|
frequency /= seg.dict.total
|
||||||
|
}
|
||||||
|
frequency, _ = math.Modf(frequency * seg.dict.total)
|
||||||
|
wordFreq := 0.0
|
||||||
|
if freq, ok := seg.dict.Frequency(strings.Join(words, "")); ok {
|
||||||
|
wordFreq = freq
|
||||||
|
}
|
||||||
|
if wordFreq < frequency {
|
||||||
|
frequency = wordFreq
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
word := words[0]
|
||||||
|
for segment := range seg.Cut(word, false) {
|
||||||
|
if freq, ok := seg.dict.Frequency(segment); ok {
|
||||||
|
frequency *= freq
|
||||||
|
}
|
||||||
|
frequency /= seg.dict.total
|
||||||
|
}
|
||||||
|
frequency, _ = math.Modf(frequency * seg.dict.total)
|
||||||
|
frequency += 1.0
|
||||||
|
wordFreq := 1.0
|
||||||
|
if freq, ok := seg.dict.Frequency(word); ok {
|
||||||
|
wordFreq = freq
|
||||||
|
}
|
||||||
|
if wordFreq > frequency {
|
||||||
|
frequency = wordFreq
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return frequency
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadDictionary loads dictionary from given file name. Every time
|
// LoadDictionary loads dictionary from given file name. Every time
|
||||||
@@ -175,14 +239,14 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string {
|
|||||||
if reEng.MatchString(string(frag)) && len(frag) == 1 {
|
if reEng.MatchString(string(frag)) && len(frag) == 1 {
|
||||||
buf = append(buf, frag...)
|
buf = append(buf, frag...)
|
||||||
x = y
|
x = y
|
||||||
} else {
|
continue
|
||||||
if len(buf) > 0 {
|
|
||||||
result <- string(buf)
|
|
||||||
buf = make([]rune, 0)
|
|
||||||
}
|
|
||||||
result <- string(frag)
|
|
||||||
x = y
|
|
||||||
}
|
}
|
||||||
|
if len(buf) > 0 {
|
||||||
|
result <- string(buf)
|
||||||
|
buf = make([]rune, 0)
|
||||||
|
}
|
||||||
|
result <- string(frag)
|
||||||
|
x = y
|
||||||
}
|
}
|
||||||
if len(buf) > 0 {
|
if len(buf) > 0 {
|
||||||
result <- string(buf)
|
result <- string(buf)
|
||||||
|
|||||||
@@ -60,28 +60,28 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
|||||||
pos := 1
|
pos := 1
|
||||||
var width int
|
var width int
|
||||||
var gram string
|
var gram string
|
||||||
dict := jt.seg.Dictionary()
|
|
||||||
for word := range jt.seg.Cut(string(input), jt.hmm) {
|
for word := range jt.seg.Cut(string(input), jt.hmm) {
|
||||||
if jt.searchMode {
|
if jt.searchMode {
|
||||||
runes := []rune(word)
|
runes := []rune(word)
|
||||||
width = len(runes)
|
width = len(runes)
|
||||||
for _, step := range [2]int{2, 3} {
|
for _, step := range [2]int{2, 3} {
|
||||||
if width > step {
|
if width <= step {
|
||||||
for i := 0; i < width-step+1; i++ {
|
continue
|
||||||
gram = string(runes[i : i+step])
|
}
|
||||||
gramLen := len(gram)
|
for i := 0; i < width-step+1; i++ {
|
||||||
if frequency, ok := dict.Frequency(gram); ok && frequency > 0 {
|
gram = string(runes[i : i+step])
|
||||||
gramStart := start + len(string(runes[:i]))
|
gramLen := len(gram)
|
||||||
token := analysis.Token{
|
if frequency, ok := jt.seg.Frequency(gram); ok && frequency > 0 {
|
||||||
Term: []byte(gram),
|
gramStart := start + len(string(runes[:i]))
|
||||||
Start: gramStart,
|
token := analysis.Token{
|
||||||
End: gramStart + gramLen,
|
Term: []byte(gram),
|
||||||
Position: pos,
|
Start: gramStart,
|
||||||
Type: detectTokenType(gram),
|
End: gramStart + gramLen,
|
||||||
}
|
Position: pos,
|
||||||
rv = append(rv, &token)
|
Type: detectTokenType(gram),
|
||||||
pos++
|
|
||||||
}
|
}
|
||||||
|
rv = append(rv, &token)
|
||||||
|
pos++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user