1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-12 21:20:26 +08:00

Small refactors: remove the sort in dag.go and cache logTotal in the segmenter

This commit is contained in:
Wang Bin
2015-04-04 17:10:40 +08:00
parent 5c6a2eff74
commit 83efde1e61
2 changed files with 24 additions and 16 deletions

29
dag.go
View File

@@ -2,7 +2,6 @@ package jiebago
import ( import (
"math" "math"
"sort"
) )
type route struct { type route struct {
@@ -36,16 +35,17 @@ func DAG(s Segmenter, runes []rune) dag {
d := make(dag) d := make(dag)
n := len(runes) n := len(runes)
var frag string var frag string
var i int
for k := 0; k < n; k++ { for k := 0; k < n; k++ {
tmpList := make([]int, 0) d[k] = make([]int, 0)
i := k i = k
frag = string(runes[k]) frag = string(runes[k])
for { for {
if freq, ok := s.Freq(frag); !ok { if freq, ok := s.Freq(frag); !ok {
break break
} else { } else {
if freq > 0.0 { if freq > 0.0 {
tmpList = append(tmpList, i) d[k] = append(d[k], i)
} }
} }
i += 1 i += 1
@@ -54,10 +54,9 @@ func DAG(s Segmenter, runes []rune) dag {
} }
frag = string(runes[k : i+1]) frag = string(runes[k : i+1])
} }
if len(tmpList) == 0 { if len(d[k]) == 0 {
tmpList = append(tmpList, k) d[k] = append(d[k], k)
} }
d[k] = tmpList
} }
return d return d
} }
@@ -66,19 +65,21 @@ func Routes(s Segmenter, runes []rune, d dag) map[int]route {
n := len(runes) n := len(runes)
rs := make(map[int]route) rs := make(map[int]route)
rs[n] = route{Freq: 0.0, Index: 0} rs[n] = route{Freq: 0.0, Index: 0}
logTotal := math.Log(s.Total()) logTotal := s.LogTotal()
var r route
for idx := n - 1; idx >= 0; idx-- { for idx := n - 1; idx >= 0; idx-- {
candidates := make(routes, len(d[idx])) for _, i := range d[idx] {
for index, i := range d[idx] {
word := string(runes[idx : i+1]) word := string(runes[idx : i+1])
if freq, ok := s.Freq(word); ok { if freq, ok := s.Freq(word); ok {
candidates[index] = route{Freq: math.Log(freq) - logTotal + rs[i+1].Freq, Index: i} r = route{Freq: math.Log(freq) - logTotal + rs[i+1].Freq, Index: i}
} else { } else {
candidates[index] = route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i} r = route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i}
}
if v, ok := rs[idx]; !ok || v.Freq < r.Freq || (v.Freq == r.Freq && v.Index < r.Index) {
rs[idx] = r
} }
} }
sort.Sort(sort.Reverse(candidates))
rs[idx] = candidates[0]
} }
return rs return rs
} }

View File

@@ -4,6 +4,7 @@ package jiebago
import ( import (
"errors" "errors"
"github.com/wangbin/jiebago/finalseg" "github.com/wangbin/jiebago/finalseg"
"math"
"regexp" "regexp"
"sort" "sort"
) )
@@ -67,11 +68,12 @@ func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
type Segmenter interface { type Segmenter interface {
Freq(string) (float64, bool) Freq(string) (float64, bool)
Total() float64 Total() float64
LogTotal() float64
} }
type Jieba struct { type Jieba struct {
total float64 total, logTotal float64
freqMap map[string]float64 freqMap map[string]float64
} }
func (j Jieba) Freq(key string) (float64, bool) { func (j Jieba) Freq(key string) (float64, bool) {
@@ -83,6 +85,10 @@ func (j Jieba) Total() float64 {
return j.total return j.total
} }
func (j Jieba) LogTotal() float64 {
return j.logTotal
}
func (j *Jieba) AddEntry(entry Entry) { func (j *Jieba) AddEntry(entry Entry) {
j.Add(entry.Word, entry.Freq) j.Add(entry.Word, entry.Freq)
} }
@@ -97,6 +103,7 @@ func (j *Jieba) Add(word string, freq float64) {
j.freqMap[frag] = 0.0 j.freqMap[frag] = 0.0
} }
} }
j.logTotal = math.Log(j.total)
} }
// Load user specified dictionary file. // Load user specified dictionary file.