1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00

Move DAG-related functions to a separate file; rename Calc to Routes

This commit is contained in:
Wang Bin
2015-03-30 17:10:48 +08:00
parent 68fed7e250
commit 7a7f8af517
3 changed files with 102 additions and 96 deletions

84
dag.go Normal file
View File

@@ -0,0 +1,84 @@
package jiebago
import (
"math"
"sort"
)
// route is one scored segmentation choice: Freq is the accumulated score
// of the choice and Index is the rune index at which the chosen word ends.
type route struct {
	Freq  float64
	Index int
}

// routes is a list of route values that implements sort.Interface,
// ordering ascending by Freq and then ascending by Index.
type routes []route

// Len returns the number of routes in the list.
func (rs routes) Len() int { return len(rs) }

// Less reports whether the route at i orders before the route at j:
// a smaller Freq comes first, and equal Freq values are ordered by Index.
func (rs routes) Less(i, j int) bool {
	a, b := rs[i], rs[j]
	if a.Freq != b.Freq {
		return a.Freq < b.Freq
	}
	return a.Index < b.Index
}

// Swap exchanges the routes at positions i and j.
func (rs routes) Swap(i, j int) {
	tmp := rs[i]
	rs[i] = rs[j]
	rs[j] = tmp
}
// dag maps a start rune index to the (inclusive) end rune indices of the
// candidate words that begin at that position.
type dag map[int][]int

// DAG builds the word graph for runes using the dictionary s.
//
// For every start position it extends the fragment one rune at a time for
// as long as s.Freq reports the fragment as known, recording each end index
// whose frequency is greater than zero. A position that yields no such end
// index is given itself as its only end index, so every position in
// [0, len(runes)) has at least one entry in the result.
func DAG(s Segmenter, runes []rune) dag {
	total := len(runes)
	graph := make(dag)
	for start := 0; start < total; start++ {
		var ends []int
		for end := start; end < total; end++ {
			freq, known := s.Freq(string(runes[start : end+1]))
			if !known {
				// Stop extending once the dictionary no longer knows the fragment.
				break
			}
			if freq > 0.0 {
				ends = append(ends, end)
			}
		}
		if len(ends) == 0 {
			// No word with a positive frequency starts here: use the single rune.
			ends = []int{start}
		}
		graph[start] = ends
	}
	return graph
}
// Routes computes, for every rune position in runes, the best-scoring word
// that starts at that position according to the word graph d and the
// dictionary s.
//
// The result maps a start index to a route whose Index is the (inclusive)
// end index of the chosen word and whose Freq is the accumulated score
// log(freq) - log(s.Total()) of that word plus the score of the route that
// begins right after it. The entry at len(runes) is the zero-score
// terminator. Positions are processed from right to left so that the score
// of every continuation is already known.
//
// A word that is missing from the dictionary, or that is present with a
// non-positive frequency, is scored with a frequency of 1 so that
// math.Log never yields -Inf or NaN. A position with no entry in d falls
// back to the single rune at that position instead of panicking. Among
// candidates with equal score the one with the larger end index wins.
func Routes(s Segmenter, runes []rune, d dag) map[int]route {
	n := len(runes)
	rs := make(map[int]route, n+1)
	rs[n] = route{Freq: 0.0, Index: 0}
	logTotal := math.Log(s.Total())
	for idx := n - 1; idx >= 0; idx-- {
		ends := d[idx]
		if len(ends) == 0 {
			// Mirror DAG's own fallback: treat the single rune as a word.
			ends = []int{idx}
		}
		var best route
		for pos, end := range ends {
			freq, ok := s.Freq(string(runes[idx : end+1]))
			if !ok || freq <= 0 {
				// Unknown or zero-frequency entries count as frequency 1.
				freq = 1.0
			}
			cand := route{Freq: math.Log(freq) - logTotal + rs[end+1].Freq, Index: end}
			// Keep the maximum under the (Freq, Index) ordering; a linear
			// scan replaces sorting the whole candidate list.
			if pos == 0 || cand.Freq > best.Freq ||
				(cand.Freq == best.Freq && cand.Index > best.Index) {
				best = cand
			}
		}
		rs[idx] = best
	}
	return rs
}

View File

@@ -3,7 +3,6 @@ package jiebago
import (
"github.com/wangbin/jiebago/finalseg"
"math"
"regexp"
"sort"
)
@@ -16,29 +15,9 @@ var (
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
)
type route struct {
Freq float64
Index int
}
type routes []*route
func (rs routes) Len() int {
return len(rs)
}
func (rs routes) Less(i, j int) bool {
if rs[i].Freq < rs[j].Freq {
return true
}
if rs[i].Freq == rs[j].Freq {
return rs[i].Index < rs[j].Index
}
return false
}
func (rs routes) Swap(i, j int) {
rs[i], rs[j] = rs[j], rs[i]
type Segmenter interface {
Freq(string) (float64, bool)
Total() float64
}
type Jieba struct {
@@ -84,77 +63,21 @@ func New() *Jieba {
// name in current directory. This function must be called before cut any
// sentence.
func Open(dictFileName string) (*Jieba, error) {
j := &Jieba{total: 0.0, freqMap: make(map[string]float64)}
j := New()
err := LoadDict(j, dictFileName, false)
return j, err
}
// Build a directed acyclic graph (DAG) for sentence.
func (j *Jieba) DAG(sentence string) map[int][]int {
dag := make(map[int][]int)
runes := []rune(sentence)
n := len(runes)
var frag string
for k := 0; k < n; k++ {
tmpList := make([]int, 0)
i := k
frag = string(runes[k])
for {
if freq, ok := j.Freq(frag); !ok {
break
} else {
if freq > 0.0 {
tmpList = append(tmpList, i)
}
}
i += 1
if i >= n {
break
}
frag = string(runes[k : i+1])
}
if len(tmpList) == 0 {
tmpList = append(tmpList, k)
}
dag[k] = tmpList
}
return dag
}
func (j *Jieba) Calc(sentence string, dag map[int][]int) map[int]*route {
runes := []rune(sentence)
number := len(runes)
rs := make(map[int]*route)
rs[number] = &route{Freq: 0.0, Index: 0}
logTotal := math.Log(j.Total())
for idx := number - 1; idx >= 0; idx-- {
candidates := make(routes, 0)
for _, i := range dag[idx] {
word := string(runes[idx : i+1])
var r *route
if freq, ok := j.Freq(word); ok {
r = &route{Freq: math.Log(freq) - logTotal + rs[i+1].Freq, Index: i}
} else {
r = &route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i}
}
candidates = append(candidates, r)
}
sort.Sort(sort.Reverse(candidates))
rs[idx] = candidates[0]
}
return rs
}
type cutFunc func(sentence string) chan string
func (j *Jieba) cutDAG(sentence string) chan string {
result := make(chan string)
go func() {
dag := j.DAG(sentence)
routes := j.Calc(sentence, dag)
runes := []rune(sentence)
dag := DAG(j, runes)
routes := Routes(j, runes, dag)
x := 0
var y int
runes := []rune(sentence)
length := len(runes)
buf := make([]rune, 0)
for {
@@ -214,11 +137,11 @@ func (j *Jieba) cutDAGNoHMM(sentence string) chan string {
result := make(chan string)
go func() {
dag := j.DAG(sentence)
routes := j.Calc(sentence, dag)
runes := []rune(sentence)
dag := DAG(j, runes)
routes := Routes(j, runes, dag)
x := 0
var y int
runes := []rune(sentence)
length := len(runes)
buf := make([]rune, 0)
for {
@@ -253,7 +176,7 @@ func (j *Jieba) cutAll(sentence string) chan string {
go func() {
runes := []rune(sentence)
dag := j.DAG(sentence)
dag := DAG(j, runes)
old_j := -1
ks := make([]int, 0)
for k := range dag {

View File

@@ -35,8 +35,7 @@ func (p *Posseg) AddEntry(entry jiebago.Entry) {
// Set dictionary, it could be absolute path of dictionary file, or dictionary
// name in current directory.
func NewPosseg(dictFileName string) (*Posseg, error) {
j := jiebago.New()
p := &Posseg{j, make(map[string]string)}
p := &Posseg{jiebago.New(), make(map[string]string)}
err := jiebago.LoadDict(p, dictFileName, true)
if err != nil {
return nil, err
@@ -114,10 +113,10 @@ func (p *Posseg) cutDAG(sentence string) chan Pair {
result := make(chan Pair)
go func() {
dag := p.DAG(sentence)
routes := p.Calc(sentence, dag)
var y int
runes := []rune(sentence)
dag := jiebago.DAG(p, runes)
routes := jiebago.Routes(p, runes, dag)
var y int
length := len(runes)
buf := make([]rune, 0)
for x := 0; x < length; {
@@ -200,11 +199,11 @@ func (p *Posseg) cutDAGNoHMM(sentence string) chan Pair {
result := make(chan Pair)
go func() {
dag := p.DAG(sentence)
routes := p.Calc(sentence, dag)
runes := []rune(sentence)
dag := jiebago.DAG(p, runes)
routes := jiebago.Routes(p, runes, dag)
x := 0
var y int
runes := []rune(sentence)
length := len(runes)
buf := make([]rune, 0)
for {