mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-12 13:10:25 +08:00
small refactors, removed sort in dag, save logTotal in segmenter
This commit is contained in:
29
dag.go
29
dag.go
@@ -2,7 +2,6 @@ package jiebago
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"math"
|
"math"
|
||||||
"sort"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type route struct {
|
type route struct {
|
||||||
@@ -36,16 +35,17 @@ func DAG(s Segmenter, runes []rune) dag {
|
|||||||
d := make(dag)
|
d := make(dag)
|
||||||
n := len(runes)
|
n := len(runes)
|
||||||
var frag string
|
var frag string
|
||||||
|
var i int
|
||||||
for k := 0; k < n; k++ {
|
for k := 0; k < n; k++ {
|
||||||
tmpList := make([]int, 0)
|
d[k] = make([]int, 0)
|
||||||
i := k
|
i = k
|
||||||
frag = string(runes[k])
|
frag = string(runes[k])
|
||||||
for {
|
for {
|
||||||
if freq, ok := s.Freq(frag); !ok {
|
if freq, ok := s.Freq(frag); !ok {
|
||||||
break
|
break
|
||||||
} else {
|
} else {
|
||||||
if freq > 0.0 {
|
if freq > 0.0 {
|
||||||
tmpList = append(tmpList, i)
|
d[k] = append(d[k], i)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
i += 1
|
i += 1
|
||||||
@@ -54,10 +54,9 @@ func DAG(s Segmenter, runes []rune) dag {
|
|||||||
}
|
}
|
||||||
frag = string(runes[k : i+1])
|
frag = string(runes[k : i+1])
|
||||||
}
|
}
|
||||||
if len(tmpList) == 0 {
|
if len(d[k]) == 0 {
|
||||||
tmpList = append(tmpList, k)
|
d[k] = append(d[k], k)
|
||||||
}
|
}
|
||||||
d[k] = tmpList
|
|
||||||
}
|
}
|
||||||
return d
|
return d
|
||||||
}
|
}
|
||||||
@@ -66,19 +65,21 @@ func Routes(s Segmenter, runes []rune, d dag) map[int]route {
|
|||||||
n := len(runes)
|
n := len(runes)
|
||||||
rs := make(map[int]route)
|
rs := make(map[int]route)
|
||||||
rs[n] = route{Freq: 0.0, Index: 0}
|
rs[n] = route{Freq: 0.0, Index: 0}
|
||||||
logTotal := math.Log(s.Total())
|
logTotal := s.LogTotal()
|
||||||
|
var r route
|
||||||
for idx := n - 1; idx >= 0; idx-- {
|
for idx := n - 1; idx >= 0; idx-- {
|
||||||
candidates := make(routes, len(d[idx]))
|
for _, i := range d[idx] {
|
||||||
for index, i := range d[idx] {
|
|
||||||
word := string(runes[idx : i+1])
|
word := string(runes[idx : i+1])
|
||||||
if freq, ok := s.Freq(word); ok {
|
if freq, ok := s.Freq(word); ok {
|
||||||
candidates[index] = route{Freq: math.Log(freq) - logTotal + rs[i+1].Freq, Index: i}
|
r = route{Freq: math.Log(freq) - logTotal + rs[i+1].Freq, Index: i}
|
||||||
} else {
|
} else {
|
||||||
candidates[index] = route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i}
|
r = route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i}
|
||||||
|
}
|
||||||
|
|
||||||
|
if v, ok := rs[idx]; !ok || v.Freq < r.Freq || (v.Freq == r.Freq && v.Index < r.Index) {
|
||||||
|
rs[idx] = r
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
sort.Sort(sort.Reverse(candidates))
|
|
||||||
rs[idx] = candidates[0]
|
|
||||||
}
|
}
|
||||||
return rs
|
return rs
|
||||||
}
|
}
|
||||||
|
|||||||
11
jieba.go
11
jieba.go
@@ -4,6 +4,7 @@ package jiebago
|
|||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
"github.com/wangbin/jiebago/finalseg"
|
"github.com/wangbin/jiebago/finalseg"
|
||||||
|
"math"
|
||||||
"regexp"
|
"regexp"
|
||||||
"sort"
|
"sort"
|
||||||
)
|
)
|
||||||
@@ -67,11 +68,12 @@ func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
|
|||||||
type Segmenter interface {
|
type Segmenter interface {
|
||||||
Freq(string) (float64, bool)
|
Freq(string) (float64, bool)
|
||||||
Total() float64
|
Total() float64
|
||||||
|
LogTotal() float64
|
||||||
}
|
}
|
||||||
|
|
||||||
type Jieba struct {
|
type Jieba struct {
|
||||||
total float64
|
total, logTotal float64
|
||||||
freqMap map[string]float64
|
freqMap map[string]float64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (j Jieba) Freq(key string) (float64, bool) {
|
func (j Jieba) Freq(key string) (float64, bool) {
|
||||||
@@ -83,6 +85,10 @@ func (j Jieba) Total() float64 {
|
|||||||
return j.total
|
return j.total
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (j Jieba) LogTotal() float64 {
|
||||||
|
return j.logTotal
|
||||||
|
}
|
||||||
|
|
||||||
func (j *Jieba) AddEntry(entry Entry) {
|
func (j *Jieba) AddEntry(entry Entry) {
|
||||||
j.Add(entry.Word, entry.Freq)
|
j.Add(entry.Word, entry.Freq)
|
||||||
}
|
}
|
||||||
@@ -97,6 +103,7 @@ func (j *Jieba) Add(word string, freq float64) {
|
|||||||
j.freqMap[frag] = 0.0
|
j.freqMap[frag] = 0.0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
j.logTotal = math.Log(j.total)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Load user specified dictionary file.
|
// Load user specified dictionary file.
|
||||||
|
|||||||
Reference in New Issue
Block a user