From 83efde1e617b343bcdd4ba4b0d8e20d14f331b07 Mon Sep 17 00:00:00 2001
From: Wang Bin
Date: Sat, 4 Apr 2015 17:10:40 +0800
Subject: [PATCH] small refactors, removed sort in dag, save logTotal in segmenter

---
 dag.go   | 29 +++++++++++++++--------------
 jieba.go | 11 +++++++++--
 2 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/dag.go b/dag.go
index 1f32f43..6f2a559 100644
--- a/dag.go
+++ b/dag.go
@@ -2,7 +2,6 @@ package jiebago
 
 import (
 	"math"
-	"sort"
 )
 
 type route struct {
@@ -36,16 +35,17 @@ func DAG(s Segmenter, runes []rune) dag {
 	d := make(dag)
 	n := len(runes)
 	var frag string
+	var i int
 	for k := 0; k < n; k++ {
-		tmpList := make([]int, 0)
-		i := k
+		d[k] = make([]int, 0)
+		i = k
 		frag = string(runes[k])
 		for {
 			if freq, ok := s.Freq(frag); !ok {
 				break
 			} else {
 				if freq > 0.0 {
-					tmpList = append(tmpList, i)
+					d[k] = append(d[k], i)
 				}
 			}
 			i += 1
@@ -54,10 +54,9 @@ func DAG(s Segmenter, runes []rune) dag {
 			}
 			frag = string(runes[k : i+1])
 		}
-		if len(tmpList) == 0 {
-			tmpList = append(tmpList, k)
+		if len(d[k]) == 0 {
+			d[k] = append(d[k], k)
 		}
-		d[k] = tmpList
 	}
 	return d
 }
@@ -66,19 +65,21 @@ func Routes(s Segmenter, runes []rune, d dag) map[int]route {
 	n := len(runes)
 	rs := make(map[int]route)
 	rs[n] = route{Freq: 0.0, Index: 0}
-	logTotal := math.Log(s.Total())
+	logTotal := s.LogTotal()
+	var r route
 	for idx := n - 1; idx >= 0; idx-- {
-		candidates := make(routes, len(d[idx]))
-		for index, i := range d[idx] {
+		for _, i := range d[idx] {
 			word := string(runes[idx : i+1])
 			if freq, ok := s.Freq(word); ok {
-				candidates[index] = route{Freq: math.Log(freq) - logTotal + rs[i+1].Freq, Index: i}
+				r = route{Freq: math.Log(freq) - logTotal + rs[i+1].Freq, Index: i}
 			} else {
-				candidates[index] = route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i}
+				r = route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i}
+			}
+
+			if v, ok := rs[idx]; !ok || v.Freq < r.Freq || (v.Freq == r.Freq && v.Index < r.Index) {
+				rs[idx] = r
 			}
 		}
-		sort.Sort(sort.Reverse(candidates))
-		rs[idx] = candidates[0]
 	}
 	return rs
 }
diff --git a/jieba.go b/jieba.go
index c16a9e8..465ff3a 100644
--- a/jieba.go
+++ b/jieba.go
@@ -4,6 +4,7 @@ package jiebago
 import (
 	"errors"
 	"github.com/wangbin/jiebago/finalseg"
+	"math"
 	"regexp"
 	"sort"
 )
@@ -67,11 +68,12 @@ func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
 type Segmenter interface {
 	Freq(string) (float64, bool)
 	Total() float64
+	LogTotal() float64
 }
 
 type Jieba struct {
-	total   float64
-	freqMap map[string]float64
+	total, logTotal float64
+	freqMap         map[string]float64
 }
 
 func (j Jieba) Freq(key string) (float64, bool) {
@@ -83,6 +85,10 @@ func (j Jieba) Total() float64 {
 	return j.total
 }
 
+func (j Jieba) LogTotal() float64 {
+	return j.logTotal
+}
+
 func (j *Jieba) AddEntry(entry Entry) {
 	j.Add(entry.Word, entry.Freq)
 }
@@ -97,6 +103,7 @@ func (j *Jieba) Add(word string, freq float64) {
 			j.freqMap[frag] = 0.0
 		}
 	}
+	j.logTotal = math.Log(j.total)
 }
 
 // Load user specified dictionary file.