From 68fed7e250bfb48c278d62287b92319bd0f44c13 Mon Sep 17 00:00:00 2001 From: Wang Bin Date: Mon, 30 Mar 2015 16:12:02 +0800 Subject: [PATCH] make struct Jieba's fields private --- analyse/analyse.go | 4 +-- jieba.go | 48 +++++++++++++++++++-------------- jieba_test.go | 16 +++++------ loader.go | 66 +++++++++++++++++++++++++++++++++++++++++++++ posseg/posseg.go | 6 ++--- tokenizers/jieba.go | 4 +-- 6 files changed, 109 insertions(+), 35 deletions(-) create mode 100644 loader.go diff --git a/analyse/analyse.go b/analyse/analyse.go index 16be60d..43bfa00 100644 --- a/analyse/analyse.go +++ b/analyse/analyse.go @@ -42,7 +42,7 @@ type TagExtracter struct { } func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) { - j, err := jiebago.NewJieba(dictFileName) + j, err := jiebago.Open(dictFileName) if err != nil { return nil, err } @@ -57,7 +57,7 @@ func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) { func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags wordWeights) { freq := make(map[string]float64) - for w := range t.Cut(sentence, false, true) { + for w := range t.Cut(sentence, true) { w = strings.TrimSpace(w) if utf8.RuneCountInString(w) < 2 { continue diff --git a/jieba.go b/jieba.go index 251e359..6d4d0c3 100644 --- a/jieba.go +++ b/jieba.go @@ -2,7 +2,6 @@ package jiebago import ( - "fmt" "github.com/wangbin/jiebago/finalseg" "math" "regexp" @@ -22,10 +21,6 @@ type route struct { Index int } -func (r route) String() string { - return fmt.Sprintf("(%f, %d)", r.Freq, r.Index) -} - type routes []*route func (rs routes) Len() int { @@ -47,8 +42,17 @@ func (rs routes) Swap(i, j int) { } type Jieba struct { - Total float64 - Freq map[string]float64 + total float64 + freqMap map[string]float64 +} + +func (j Jieba) Freq(key string) (float64, bool) { + freq, ok := j.freqMap[key] + return freq, ok +} + +func (j Jieba) Total() float64 { + return j.total } func (j *Jieba) AddEntry(entry Entry) { @@ -56,13 +60,13 @@ func (j *Jieba) AddEntry(entry Entry) { } func (j *Jieba) Add(word string, freq float64) { - j.Freq[word] = freq - j.Total += freq + j.freqMap[word] = freq + j.total += freq runes := []rune(word) for i := 0; i < len(runes); i++ { frag := string(runes[0 : i+1]) - if _, ok := j.Freq[frag]; !ok { - j.Freq[frag] = 0.0 + if _, ok := j.Freq(frag); !ok { + j.freqMap[frag] = 0.0 } } } @@ -72,11 +76,15 @@ func (j *Jieba) LoadUserDict(dictFilePath string) error { return LoadDict(j, dictFilePath, false) } +func New() *Jieba { + return &Jieba{total: 0.0, freqMap: make(map[string]float64)} +} + // Set the dictionary, could be absolute path of dictionary file, or dictionary // name in current directory. This function must be called before cut any // sentence. -func NewJieba(dictFileName string) (*Jieba, error) { - j := &Jieba{Total: 0.0, Freq: make(map[string]float64)} +func Open(dictFileName string) (*Jieba, error) { + j := &Jieba{total: 0.0, freqMap: make(map[string]float64)} err := LoadDict(j, dictFileName, false) return j, err } @@ -92,7 +100,7 @@ func (j *Jieba) DAG(sentence string) map[int][]int { i := k frag = string(runes[k]) for { - if freq, ok := j.Freq[frag]; !ok { + if freq, ok := j.Freq(frag); !ok { break } else { if freq > 0.0 { @@ -118,14 +126,14 @@ func (j *Jieba) Calc(sentence string, dag map[int][]int) map[int]*route { number := len(runes) rs := make(map[int]*route) rs[number] = &route{Freq: 0.0, Index: 0} - logTotal := math.Log(j.Total) + logTotal := math.Log(j.Total()) for idx := number - 1; idx >= 0; idx-- { candidates := make(routes, 0) for _, i := range dag[idx] { word := string(runes[idx : i+1]) var r *route - if _, ok := j.Freq[word]; ok { - r = &route{Freq: math.Log(j.Freq[word]) - logTotal + rs[i+1].Freq, Index: i} + if freq, ok := j.Freq(word); ok { + r = &route{Freq: math.Log(freq) - logTotal + rs[i+1].Freq, Index: i} } else { r = &route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i} } @@ -164,7 +172,7 @@ func (j *Jieba) cutDAG(sentence string) chan string { buf = make([]rune, 0) } else { bufString := string(buf) - if v, ok := j.Freq[bufString]; !ok || v == 0.0 { + if v, ok := j.Freq(bufString); !ok || v == 0.0 { for x := range finalseg.Cut(bufString) { result <- x } @@ -186,7 +194,7 @@ func (j *Jieba) cutDAG(sentence string) chan string { result <- string(buf) } else { bufString := string(buf) - if v, ok := j.Freq[bufString]; !ok || v == 0.0 { + if v, ok := j.Freq(bufString); !ok || v == 0.0 { for t := range finalseg.Cut(bufString) { result <- t } @@ -352,7 +360,7 @@ func (j *Jieba) CutForSearch(sentence string, hmm bool) chan string { var gram2 string for i := 0; i < len(runes)-increment+1; i++ { gram2 = string(runes[i : i+increment]) - if v, ok := j.Freq[gram2]; ok && v > 0.0 { + if v, ok := j.Freq(gram2); ok && v > 0.0 { result <- gram2 } } diff --git a/jieba_test.go b/jieba_test.go index 2158e13..8846dd4 100644 --- a/jieba_test.go +++ b/jieba_test.go @@ -626,7 +626,7 @@ func chanToArray(ch chan string) []string { } func TestCutDAG(t *testing.T) { - j, _ := NewJieba("dict.txt") + j, _ := Open("dict.txt") result := chanToArray(j.cutDAG("BP神经网络如何训练才能在分类时增加区分度?")) if len(result) != 11 { @@ -635,7 +635,7 @@ func TestCutDAG(t *testing.T) { } func TestCutDAGNoHmm(t *testing.T) { - j, _ := NewJieba("dict.txt") + j, _ := Open("dict.txt") result := chanToArray(j.cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度?")) if len(result) != 11 { @@ -657,7 +657,7 @@ func TestRegexpSplit(t *testing.T) { } func TestDefaultCut(t *testing.T) { - j, _ := NewJieba("dict.txt") + j, _ := Open("dict.txt") var result []string for index, content := range test_contents { @@ -675,7 +675,7 @@ func TestDefaultCut(t *testing.T) { } func TestCutAll(t *testing.T) { - j, _ := NewJieba("dict.txt") + j, _ := Open("dict.txt") var result []string for index, content := range test_contents { @@ -693,7 +693,7 @@ func TestCutAll(t *testing.T) { } func TestDefaultCutNoHMM(t *testing.T) { - j, _ := NewJieba("dict.txt") + j, _ := Open("dict.txt") var result []string for index, content := range test_contents { @@ -711,7 +711,7 @@ func TestDefaultCutNoHMM(t *testing.T) { } func TestCutForSearch(t *testing.T) { - j, _ := NewJieba("dict.txt") + j, _ := Open("dict.txt") var result []string for index, content := range test_contents { @@ -742,7 +742,7 @@ func TestCutForSearch(t *testing.T) { func TestSetdictionary(t *testing.T) { var result []string - j, _ := NewJieba("foobar.txt") + j, _ := Open("foobar.txt") for index, content := range test_contents { result = chanToArray(j.Cut(content, true)) if len(result) != len(userDictCutResult[index]) { @@ -758,7 +758,7 @@ func TestSetdictionary(t *testing.T) { } func TestLoadUserDict(t *testing.T) { - j, _ := NewJieba("dict.txt") + j, _ := Open("dict.txt") j.LoadUserDict("userdict.txt") sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型" diff --git a/loader.go b/loader.go new file mode 100644 index 0000000..fc56916 --- /dev/null +++ b/loader.go @@ -0,0 +1,66 @@ +package jiebago + +import ( + "bufio" + "os" + "path/filepath" + "strconv" + "strings" +) + +type Entry struct { + Word string + Flag string + Freq float64 +} + +type Loader interface { + AddEntry(Entry) +} + +func dictPath(dictFileName string) (string, error) { + if filepath.IsAbs(dictFileName) { + return dictFileName, nil + } + var dictFilePath string + cwd, err := os.Getwd() + if err != nil { + return dictFilePath, err + } + dictFilePath = filepath.Clean(filepath.Join(cwd, dictFileName)) + return dictFilePath, nil +} + +func LoadDict(l Loader, dictFileName string, usingFlag bool) error { + dictFilePath, err := dictPath(dictFileName) + if err != nil { + return err + } + + dictFile, err := os.Open(dictFilePath) + if err != nil { + return err + } + defer dictFile.Close() + + scanner := bufio.NewScanner(dictFile) + var entry Entry + var line string + var fields []string + for scanner.Scan() { + line = scanner.Text() + fields = strings.Split(line, " ") + entry.Word = strings.Replace(fields[0], "\ufeff", "", 1) + if length := len(fields); length > 1 { + entry.Freq, err = strconv.ParseFloat(fields[1], 64) + if err != nil { + return err + } + if usingFlag && length > 2 { + entry.Flag = fields[2] + } + } + l.AddEntry(entry) + } + return scanner.Err() +} diff --git a/posseg/posseg.go b/posseg/posseg.go index d05568d..8beac1a 100644 --- a/posseg/posseg.go +++ b/posseg/posseg.go @@ -35,7 +35,7 @@ func (p *Posseg) AddEntry(entry jiebago.Entry) { // Set dictionary, it could be absolute path of dictionary file, or dictionary // name in current diectory. func NewPosseg(dictFileName string) (*Posseg, error) { - j := &jiebago.Jieba{Total: 0.0, Freq: make(map[string]float64)} + j := jiebago.New() p := &Posseg{j, make(map[string]string)} err := jiebago.LoadDict(p, dictFileName, true) if err != nil { @@ -137,7 +137,7 @@ func (p *Posseg) cutDAG(sentence string) chan Pair { buf = make([]rune, 0) } else { bufString := string(buf) - if v, ok := p.Freq[bufString]; !ok || v == 0.0 { + if v, ok := p.Freq(bufString); !ok || v == 0.0 { for t := range p.cutDetail(bufString) { result <- t } @@ -175,7 +175,7 @@ func (p *Posseg) cutDAG(sentence string) chan Pair { } } else { bufString := string(buf) - if v, ok := p.Freq[bufString]; !ok || v == 0.0 { + if v, ok := p.Freq(bufString); !ok || v == 0.0 { for t := range p.cutDetail(bufString) { result <- t } diff --git a/tokenizers/jieba.go b/tokenizers/jieba.go index 017c87d..f8e9b4f 100644 --- a/tokenizers/jieba.go +++ b/tokenizers/jieba.go @@ -19,7 +19,7 @@ type JiebaTokenizer struct { } func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) { - j, err := jiebago.NewJieba(dictFileName) + j, err := jiebago.Open(dictFileName) return &JiebaTokenizer{ j: j, hmm: hmm, @@ -44,7 +44,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream { for i := 0; i < width-step+1; i++ { gram = string(runes[i : i+step]) gramLen := len(gram) - if value, ok := jt.j.Freq[gram]; ok && value > 0 { + if value, ok := jt.j.Freq(gram); ok && value > 0 { gramStart := start + len(string(runes[:i])) token := analysis.Token{ Term: []byte(gram),