diff --git a/analyse/tag_extracker.go b/analyse/tag_extracker.go index 3a1e7bb..be71f1b 100755 --- a/analyse/tag_extracker.go +++ b/analyse/tag_extracker.go @@ -53,17 +53,17 @@ type TagExtracter struct { } // LoadDictionary reads the given filename and create a new dictionary. -func (t *TagExtracter) LoadDictionary(file fs.File) error { +func (t *TagExtracter) LoadDictionary(file fs.File) (err error) { t.stopWord = NewStopWord() - t.seg = new(jieba.Segmenter) - return t.seg.LoadDictionary(file) + t.seg, err = jieba.LoadDictionary(file) + return } // LoadDictionaryAt reads the given filename and create a new dictionary. -func (t *TagExtracter) LoadDictionaryAt(fileName string) error { +func (t *TagExtracter) LoadDictionaryAt(file string) (err error) { t.stopWord = NewStopWord() - t.seg = new(jieba.Segmenter) - return t.seg.LoadDictionaryAt(fileName) + t.seg, err = jieba.LoadDictionaryAt(file) + return } // LoadIdf reads the given file and create a new Idf dictionary. diff --git a/dictionary.go b/dictionary.go index ba0e3e0..d38eaaa 100755 --- a/dictionary.go +++ b/dictionary.go @@ -10,9 +10,9 @@ import ( // A Dictionary represents a thread-safe dictionary used for word segmentation. type Dictionary struct { + sync.RWMutex total, logTotal float64 freqMap map[string]float64 - sync.RWMutex } // Load loads all tokens diff --git a/example_parallel_cut_test.go b/example_parallel_cut_test.go index 56f4822..508ad35 100755 --- a/example_parallel_cut_test.go +++ b/example_parallel_cut_test.go @@ -16,7 +16,7 @@ type line struct { } var ( - segmenter = Segmenter{} + segmenter *Segmenter numThreads = runtime.NumCPU() task = make(chan line, numThreads) result = make(chan line, numThreads) @@ -35,9 +35,6 @@ func Example_parallelCut() { // Set the number of goroutines runtime.GOMAXPROCS(numThreads) - // Load dictionary - segmenter.LoadDictionaryAt("dict.txt") - // open file for segmentation file, err := os.Open("README.md") if err != nil { @@ -45,6 +42,12 @@ func Example_parallelCut() { } defer file.Close() + // Load dictionary + segmenter, err = LoadDictionaryAt("dict.txt") + if err != nil { + log.Fatal(err) + } + // start worker routines for i := 0; i < numThreads; i++ { go worker() diff --git a/example_test.go b/example_test.go index 569e58e..640a2a0 100755 --- a/example_test.go +++ b/example_test.go @@ -5,8 +5,10 @@ import ( ) func Example() { - var seg Segmenter - seg.LoadDictionaryAt("dict.txt") + seg, err := LoadDictionaryAt("dict.txt") + if err != nil { + panic(err) + } fmt.Print("【全模式】:") fmt.Println(seg.CutAll("我来到北京清华大学")) @@ -27,8 +29,10 @@ func Example() { } func Example_suggestFrequency() { - var seg Segmenter - seg.LoadDictionaryAt("dict.txt") + seg, err := LoadDictionaryAt("dict.txt") + if err != nil { + panic(err) + } sentence := "超敏C反应蛋白是什么?" fmt.Print("Before:") @@ -75,8 +79,10 @@ func Example_suggestFrequency() { } func Example_loadUserDictionary() { - var seg Segmenter - seg.LoadDictionaryAt("dict.txt") + seg, err := LoadDictionaryAt("dict.txt") + if err != nil { + panic(err) + } sentence := "李小福是创新办主任也是云计算方面的专家" fmt.Print("Before:") diff --git a/jieba.go b/jieba.go index 408ddf8..669f3c6 100755 --- a/jieba.go +++ b/jieba.go @@ -21,23 +21,21 @@ var ( ) // Segmenter is a Chinese words segmentation struct. -type Segmenter struct { - dict *Dictionary -} +type Segmenter Dictionary // Frequency returns a word's frequency and existence func (seg *Segmenter) Frequency(word string) (float64, bool) { - return seg.dict.Frequency(word) + return (*Dictionary)(seg).Frequency(word) } // AddWord adds a new word with frequency to dictionary func (seg *Segmenter) AddWord(word string, frequency float64) { - seg.dict.AddToken(dictionary.NewToken(word, frequency, "")) + (*Dictionary)(seg).AddToken(dictionary.NewToken(word, frequency, "")) } // DeleteWord removes a word from dictionary func (seg *Segmenter) DeleteWord(word string) { - seg.dict.AddToken(dictionary.NewToken(word, 0.0, "")) + (*Dictionary)(seg).AddToken(dictionary.NewToken(word, 0.0, "")) } /* @@ -58,79 +56,79 @@ func (seg *Segmenter) SuggestFrequency(words ...string) float64 { frequency := 1.0 if len(words) > 1 { for _, word := range words { - if freq, ok := seg.dict.Frequency(word); ok { + if freq, ok := (*Dictionary)(seg).Frequency(word); ok { frequency *= freq } - frequency /= seg.dict.total + frequency /= (*Dictionary)(seg).total } - frequency, _ = math.Modf(frequency * seg.dict.total) + frequency, _ = math.Modf(frequency * (*Dictionary)(seg).total) wordFreq := 0.0 - if freq, ok := seg.dict.Frequency(strings.Join(words, "")); ok { + if freq, ok := (*Dictionary)(seg).Frequency(strings.Join(words, "")); ok { wordFreq = freq } if wordFreq < frequency { frequency = wordFreq } - } else { - word := words[0] - for _, segment := range seg.Cut(word, false) { - if freq, ok := seg.dict.Frequency(segment); ok { - frequency *= freq - } - frequency /= seg.dict.total - } - frequency, _ = math.Modf(frequency * seg.dict.total) - frequency += 1.0 - wordFreq := 1.0 - if freq, ok := seg.dict.Frequency(word); ok { - wordFreq = freq - } - if wordFreq > frequency { - frequency = wordFreq + return frequency + } + word := words[0] + for _, segment := range seg.Cut(word, false) { + if freq, ok := (*Dictionary)(seg).Frequency(segment); ok { + frequency *= freq } + frequency /= (*Dictionary)(seg).total + } + frequency, _ = math.Modf(frequency * (*Dictionary)(seg).total) + frequency += 1.0 + wordFreq := 1.0 + if freq, ok := (*Dictionary)(seg).Frequency(word); ok { + wordFreq = freq + } + if wordFreq > frequency { + frequency = wordFreq } return frequency } // LoadDictionary loads dictionary from given file name. Everytime // LoadDictionary is called, previously loaded dictionary will be cleard. -func (seg *Segmenter) LoadDictionary(file fs.File) error { - seg.dict = &Dictionary{freqMap: make(map[string]float64)} - return seg.dict.loadDictionary(file) +func LoadDictionary(file fs.File) (*Segmenter, error) { + d := &Dictionary{freqMap: make(map[string]float64)} + err := d.loadDictionary(file) + return (*Segmenter)(d), err } // LoadDictionaryAt loads dictionary from given file name. Everytime // LoadDictionaryAt is called, previously loaded dictionary will be cleard. -func (seg *Segmenter) LoadDictionaryAt(file string) error { - seg.dict = &Dictionary{freqMap: make(map[string]float64)} - return seg.dict.loadDictionaryAt(file) +func LoadDictionaryAt(file string) (*Segmenter, error) { + d := &Dictionary{freqMap: make(map[string]float64)} + err := d.loadDictionaryAt(file) + return (*Segmenter)(d), err } // LoadUserDictionary loads a user specified dictionary, it must be called // after LoadDictionary, and it will not clear any previous loaded dictionary, // instead it will override exist entries. func (seg *Segmenter) LoadUserDictionary(file fs.File) error { - return seg.dict.loadDictionary(file) + return (*Dictionary)(seg).loadDictionary(file) } // LoadUserDictionaryAt loads a user specified dictionary, it must be called // after LoadDictionary, and it will not clear any previous loaded dictionary, // instead it will override exist entries. func (seg *Segmenter) LoadUserDictionaryAt(file string) error { - return seg.dict.loadDictionaryAt(file) + return (*Dictionary)(seg).loadDictionaryAt(file) } -func (seg *Segmenter) dag(runes []rune) map[int][]int { - dag := make(map[int][]int) +func (seg *Segmenter) dag(runes []rune) [][]int { n := len(runes) - var frag []rune - var i int + dag := make([][]int, n) for k := 0; k < n; k++ { - dag[k] = make([]int, 0) - i = k - frag = runes[k : k+1] + dag[k] = make([]int, 0, 64) + i := k + frag := runes[k : k+1] for { - freq, ok := seg.dict.Frequency(string(frag)) + freq, ok := (*Dictionary)(seg).Frequency(string(frag)) if !ok { break } @@ -155,20 +153,20 @@ type route struct { index int } -func (seg *Segmenter) calc(runes []rune) map[int]route { +func (seg *Segmenter) calc(runes []rune) []*route { dag := seg.dag(runes) n := len(runes) - rs := make(map[int]route) - rs[n] = route{frequency: 0.0, index: 0} - var r route + rs := make([]*route, n+1) + rs[n] = &route{frequency: 0.0, index: 0} for idx := n - 1; idx >= 0; idx-- { for _, i := range dag[idx] { - if freq, ok := seg.dict.Frequency(string(runes[idx : i+1])); ok { - r = route{frequency: math.Log(freq) - seg.dict.logTotal + rs[i+1].frequency, index: i} + var r *route + if freq, ok := (*Dictionary)(seg).Frequency(string(runes[idx : i+1])); ok { + r = &route{frequency: math.Log(freq) - (*Dictionary)(seg).logTotal + rs[i+1].frequency, index: i} } else { - r = route{frequency: math.Log(1.0) - seg.dict.logTotal + rs[i+1].frequency, index: i} + r = &route{frequency: math.Log(1.0) - (*Dictionary)(seg).logTotal + rs[i+1].frequency, index: i} } - if v, ok := rs[idx]; !ok { + if v := rs[idx]; v == nil { rs[idx] = r } else { if v.frequency < r.frequency || (v.frequency == r.frequency && v.index < r.index) { @@ -190,14 +188,11 @@ type cutFunc func(sentence string) []string func (seg *Segmenter) cutDAG(sentence string) []string { result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1) - runes := []rune(sentence) routes := seg.calc(runes) - var y int - length := len(runes) - var buf []rune - for x := 0; x < length; { - y = routes[x].index + 1 + buf := make([]rune, 0, 256) + for x := 0; x < len(runes); { + y := routes[x].index + 1 frag := runes[x:y] if y-x == 1 { buf = append(buf, frag...) @@ -207,7 +202,7 @@ func (seg *Segmenter) cutDAG(sentence string) []string { if len(buf) == 1 { result = append(result, bufString) } else { - if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 { + if v, ok := (*Dictionary)(seg).Frequency(bufString); !ok || v == 0.0 { result = append(result, finalseg.Cut(bufString)...) } else { for _, elem := range buf { @@ -215,7 +210,7 @@ func (seg *Segmenter) cutDAG(sentence string) []string { } } } - buf = make([]rune, 0) + buf = buf[:0] } result = append(result, string(frag)) } @@ -227,7 +222,7 @@ func (seg *Segmenter) cutDAG(sentence string) []string { if len(buf) == 1 { result = append(result, bufString) } else { - if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 { + if v, ok := (*Dictionary)(seg).Frequency(bufString); !ok || v == 0.0 { result = append(result, finalseg.Cut(bufString)...) } else { for _, elem := range buf { @@ -242,14 +237,11 @@ func (seg *Segmenter) cutDAG(sentence string) []string { func (seg *Segmenter) cutDAGNoHMM(sentence string) []string { result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1) - runes := []rune(sentence) routes := seg.calc(runes) - var y int - length := len(runes) - var buf []rune - for x := 0; x < length; { - y = routes[x].index + 1 + buf := make([]rune, 0, 256) + for x := 0; x < len(runes); { + y := routes[x].index + 1 frag := runes[x:y] if reEng.MatchString(string(frag)) && len(frag) == 1 { buf = append(buf, frag...) @@ -258,7 +250,7 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) []string { } if len(buf) > 0 { result = append(result, string(buf)) - buf = make([]rune, 0) + buf = buf[:0] } result = append(result, string(frag)) x = y @@ -307,17 +299,11 @@ func (seg *Segmenter) Cut(sentence string, hmm bool) []string { func (seg *Segmenter) cutAll(sentence string) []string { result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1) - runes := []rune(sentence) dag := seg.dag(runes) start := -1 - ks := make([]int, len(dag)) - for k := range dag { - ks[k] = k - } - var l []int - for k := range ks { - l = dag[k] + for k := 0; k < len(dag); k++ { + l := dag[k] if len(l) == 1 && k > start { result = append(result, string(runes[k:l[0]+1])) start = l[0] @@ -367,10 +353,9 @@ func (seg *Segmenter) CutForSearch(sentence string, hmm bool) []string { if len(runes) <= increment { continue } - var gram string for i := 0; i < len(runes)-increment+1; i++ { - gram = string(runes[i : i+increment]) - if v, ok := seg.dict.Frequency(gram); ok && v > 0.0 { + gram := string(runes[i : i+increment]) + if v, ok := (*Dictionary)(seg).Frequency(gram); ok && v > 0.0 { result = append(result, gram) } } diff --git a/jieba_test.go b/jieba_test.go index de1ebe0..ee8f025 100755 --- a/jieba_test.go +++ b/jieba_test.go @@ -3,7 +3,7 @@ package jieba import "testing" var ( - seg Segmenter + seg *Segmenter testContents = []string{ "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", "我不喜欢日本和服。", @@ -616,7 +616,11 @@ var ( ) func init() { - seg.LoadDictionaryAt("dict.txt") + var err error + seg, err = LoadDictionaryAt("dict.txt") + if err != nil { + panic(err) + } } func TestCutDAG(t *testing.T) { @@ -715,7 +719,11 @@ func TestCutForSearch(t *testing.T) { func TestLoadDictionary(t *testing.T) { var result []string - seg.LoadDictionaryAt("foobar.txt") + var err error + seg, err = LoadDictionaryAt("foobar.txt") + if err != nil { + t.Fatal(err) + } for index, content := range testContents { result = seg.Cut(content, true) if len(result) != len(userDictCutResult[index]) { @@ -728,7 +736,10 @@ func TestLoadDictionary(t *testing.T) { } } } - seg.LoadDictionaryAt("dict.txt") + seg, err = LoadDictionaryAt("dict.txt") + if err != nil { + t.Fatal(err) + } } func TestLoadUserDictionary(t *testing.T) { @@ -771,7 +782,11 @@ func TestLoadUserDictionary(t *testing.T) { t.Fatal(word) } } - seg.LoadDictionaryAt("dict.txt") + var err error + seg, err = LoadDictionaryAt("dict.txt") + if err != nil { + t.Fatal(err) + } } func BenchmarkCutNoHMM(b *testing.B) { diff --git a/posseg/posseg.go b/posseg/posseg.go index 21efd1a..67c3423 100755 --- a/posseg/posseg.go +++ b/posseg/posseg.go @@ -123,12 +123,10 @@ func (seg *Segmenter) cutDetail(sentence string) (results []Segment) { func (seg *Segmenter) dag(runes []rune) [][]int { n := len(runes) dag := make([][]int, n) - var frag []rune - var i int for k := 0; k < n; k++ { dag[k] = make([]int, 0, 64) - i = k - frag = runes[k : k+1] + i := k + frag := runes[k : k+1] for { freq, ok := (*Dictionary)(seg).Frequency(string(frag)) if !ok { @@ -160,9 +158,9 @@ func (seg *Segmenter) calc(runes []rune) []*route { n := len(runes) rs := make([]*route, n+1) rs[n] = &route{frequency: 0.0, index: 0} - var r *route for idx := n - 1; idx >= 0; idx-- { for _, i := range dag[idx] { + var r *route if freq, ok := (*Dictionary)(seg).Frequency(string(runes[idx : i+1])); ok { r = &route{frequency: math.Log(freq) - (*Dictionary)(seg).logTotal + rs[i+1].frequency, index: i} } else { diff --git a/tokenizers/tokenizer.go b/tokenizers/tokenizer.go index 412fca0..9ae5056 100755 --- a/tokenizers/tokenizer.go +++ b/tokenizers/tokenizer.go @@ -18,7 +18,7 @@ var ideographRegexp = regexp.MustCompile(`\p{Han}+`) // JiebaTokenizer is the beleve tokenizer for jieba. type JiebaTokenizer struct { - seg jieba.Segmenter + seg *jieba.Segmenter hmm, searchMode bool } @@ -43,8 +43,7 @@ Parameters: this word into "交换", "换机", which are valid Chinese words. */ func NewJiebaTokenizer(dictFile fs.File, hmm, searchMode bool) (analysis.Tokenizer, error) { - var seg jieba.Segmenter - err := seg.LoadDictionary(dictFile) + seg, err := jieba.LoadDictionary(dictFile) return &JiebaTokenizer{ seg: seg, hmm: hmm, @@ -73,8 +72,7 @@ Parameters: this word into "交换", "换机", which are valid Chinese words. */ func NewJiebaTokenizerAt(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) { - var seg jieba.Segmenter - err := seg.LoadDictionaryAt(dictFilePath) + seg, err := jieba.LoadDictionaryAt(dictFilePath) return &JiebaTokenizer{ seg: seg, hmm: hmm,