1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-23 04:30:44 +08:00

use only one dict to store words and prefixes; this corresponds to jieba commit #f808ea0ebba7056fa1b55081b474329e556933a8

This commit is contained in:
Wang Bin
2015-02-25 18:27:24 +08:00
parent 08ac49d10b
commit 67216a8a7d
2 changed files with 19 additions and 26 deletions

View File

@@ -82,11 +82,12 @@ func GetDAG(sentence string) map[int][]int {
i = k i = k
frag = string(runes[k]) frag = string(runes[k])
for { for {
if !T.Nodes.Contains(frag) { if freq, ok := T.Freq[frag]; !ok {
break break
} } else {
if _, ok := T.Freq[frag]; ok { if freq > 0.0 {
tmpList = append(tmpList, i) tmpList = append(tmpList, i)
}
} }
i += 1 i += 1
if i >= n { if i >= n {
@@ -152,7 +153,7 @@ func cut_DAG(sentence string) []string {
buf = make([]rune, 0) buf = make([]rune, 0)
} else { } else {
bufString := string(buf) bufString := string(buf)
if _, ok := T.Freq[bufString]; !ok { if v, ok := T.Freq[bufString]; !ok || v == 0.0 {
recognized := finalseg.Cut(bufString) recognized := finalseg.Cut(bufString)
for _, t := range recognized { for _, t := range recognized {
result = append(result, t) result = append(result, t)
@@ -175,7 +176,7 @@ func cut_DAG(sentence string) []string {
result = append(result, string(buf)) result = append(result, string(buf))
} else { } else {
bufString := string(buf) bufString := string(buf)
if _, ok := T.Freq[bufString]; !ok { if v, ok := T.Freq[bufString]; !ok || v == 0.0 {
recognized := finalseg.Cut(bufString) recognized := finalseg.Cut(bufString)
for _, t := range recognized { for _, t := range recognized {
result = append(result, t) result = append(result, t)
@@ -319,7 +320,7 @@ func CutForSearch(sentence string, hmm bool) []string {
var gram2 string var gram2 string
for i := 0; i < len(runes)-increment+1; i++ { for i := 0; i < len(runes)-increment+1; i++ {
gram2 = string(runes[i : i+increment]) gram2 = string(runes[i : i+increment])
if _, ok := T.Freq[gram2]; ok { if v, ok := T.Freq[gram2]; ok && v > 0.0 {
result = append(result, gram2) result = append(result, gram2)
} }
} }

View File

@@ -6,7 +6,6 @@ import (
"crypto/md5" "crypto/md5"
"encoding/gob" "encoding/gob"
"fmt" "fmt"
mapset "github.com/deckarep/golang-set"
"log" "log"
"os" "os"
"path/filepath" "path/filepath"
@@ -17,7 +16,6 @@ import (
var T *Trie var T *Trie
type Trie struct { type Trie struct {
Nodes mapset.Set
Total float64 Total float64
Freq map[string]float64 Freq map[string]float64
} }
@@ -25,11 +23,7 @@ type Trie struct {
func (t Trie) MarshalBinary() ([]byte, error) { func (t Trie) MarshalBinary() ([]byte, error) {
var b bytes.Buffer var b bytes.Buffer
enc := gob.NewEncoder(&b) enc := gob.NewEncoder(&b)
err := enc.Encode(t.Nodes.ToSlice()) err := enc.Encode(t.Total)
if err != nil {
return nil, err
}
err = enc.Encode(t.Total)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -43,13 +37,7 @@ func (t Trie) MarshalBinary() ([]byte, error) {
func (t *Trie) UnmarshalBinary(data []byte) error { func (t *Trie) UnmarshalBinary(data []byte) error {
b := bytes.NewBuffer(data) b := bytes.NewBuffer(data)
dec := gob.NewDecoder(b) dec := gob.NewDecoder(b)
var nodes []interface{} err := dec.Decode(&t.Total)
err := dec.Decode(&nodes)
if err != nil {
return err
}
t.Nodes = mapset.NewSetFromSlice(nodes)
err = dec.Decode(&t.Total)
if err != nil { if err != nil {
return err return err
} }
@@ -111,8 +99,7 @@ func newTrie(fileName string) (*Trie, error) {
} }
if !isDictCached { if !isDictCached {
trie = &Trie{Nodes: mapset.NewSet(), Total: 0.0, trie = &Trie{Total: 0.0, Freq: make(map[string]float64)}
Freq: make(map[string]float64)}
file, openError := os.Open(filePath) file, openError := os.Open(filePath)
if openError != nil { if openError != nil {
@@ -125,7 +112,10 @@ func newTrie(fileName string) (*Trie, error) {
line := scanner.Text() line := scanner.Text()
words := strings.Split(line, " ") words := strings.Split(line, " ")
word, freqStr := words[0], words[1] word, freqStr := words[0], words[1]
freq, _ := strconv.ParseFloat(freqStr, 64) freq, err := strconv.ParseFloat(freqStr, 64)
if err != nil {
return nil, err
}
trie.addWord(word, freq) trie.addWord(word, freq)
} }
if scanErr := scanner.Err(); scanErr != nil { if scanErr := scanner.Err(); scanErr != nil {
@@ -155,10 +145,12 @@ func (t *Trie) addWord(word string, freq float64) {
runes := []rune(word) runes := []rune(word)
count := len(runes) count := len(runes)
for i := 0; i < count; i++ { for i := 0; i < count; i++ {
t.Nodes.Add(string(runes[:i+1])) wfrag := string(runes[0 : i+1])
if _, ok := t.Freq[wfrag]; !ok {
t.Freq[wfrag] = 0.0
}
} }
} }
func addWord(word string, freq float64, tag string) { func addWord(word string, freq float64, tag string) {
if len(tag) > 0 { if len(tag) > 0 {
UserWordTagTab[word] = strings.TrimSpace(tag) UserWordTagTab[word] = strings.TrimSpace(tag)