mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 00:32:51 +08:00
Move DAG-related functions to a separate file; rename Calc to Routes
This commit is contained in:
84
dag.go
Normal file
84
dag.go
Normal file
@@ -0,0 +1,84 @@
|
||||
package jiebago
|
||||
|
||||
import (
|
||||
"math"
|
||||
"sort"
|
||||
)
|
||||
|
||||
// route is one scored candidate for the word that starts at a given rune
// position: Index is the inclusive position of the word's last rune and Freq
// is the score accumulated by choosing that word.
type route struct {
	Freq  float64
	Index int
}

// routes is a list of candidates that implements sort.Interface. The ordering
// is ascending by Freq, with equal scores ordered by ascending Index.
type routes []route

// Len returns the number of candidates.
func (rs routes) Len() int { return len(rs) }

// Less reports whether candidate i orders before candidate j: a lower Freq
// comes first, and on an exact Freq tie the lower Index comes first.
func (rs routes) Less(i, j int) bool {
	a, b := rs[i], rs[j]
	if a.Freq != b.Freq {
		return a.Freq < b.Freq
	}
	return a.Index < b.Index
}

// Swap exchanges candidates i and j in place.
func (rs routes) Swap(i, j int) {
	tmp := rs[i]
	rs[i] = rs[j]
	rs[j] = tmp
}
|
||||
|
||||
type dag map[int][]int
|
||||
|
||||
func DAG(s Segmenter, runes []rune) dag {
|
||||
d := make(dag)
|
||||
n := len(runes)
|
||||
var frag string
|
||||
for k := 0; k < n; k++ {
|
||||
tmpList := make([]int, 0)
|
||||
i := k
|
||||
frag = string(runes[k])
|
||||
for {
|
||||
if freq, ok := s.Freq(frag); !ok {
|
||||
break
|
||||
} else {
|
||||
if freq > 0.0 {
|
||||
tmpList = append(tmpList, i)
|
||||
}
|
||||
}
|
||||
i += 1
|
||||
if i >= n {
|
||||
break
|
||||
}
|
||||
frag = string(runes[k : i+1])
|
||||
}
|
||||
if len(tmpList) == 0 {
|
||||
tmpList = append(tmpList, k)
|
||||
}
|
||||
d[k] = tmpList
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
func Routes(s Segmenter, runes []rune, d dag) map[int]route {
|
||||
n := len(runes)
|
||||
rs := make(map[int]route)
|
||||
rs[n] = route{Freq: 0.0, Index: 0}
|
||||
logTotal := math.Log(s.Total())
|
||||
for idx := n - 1; idx >= 0; idx-- {
|
||||
candidates := make(routes, len(d[idx]))
|
||||
for index, i := range d[idx] {
|
||||
word := string(runes[idx : i+1])
|
||||
if freq, ok := s.Freq(word); ok {
|
||||
candidates[index] = route{Freq: math.Log(freq) - logTotal + rs[i+1].Freq, Index: i}
|
||||
} else {
|
||||
candidates[index] = route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i}
|
||||
}
|
||||
}
|
||||
sort.Sort(sort.Reverse(candidates))
|
||||
rs[idx] = candidates[0]
|
||||
}
|
||||
return rs
|
||||
}
|
||||
99
jieba.go
99
jieba.go
@@ -3,7 +3,6 @@ package jiebago
|
||||
|
||||
import (
|
||||
"github.com/wangbin/jiebago/finalseg"
|
||||
"math"
|
||||
"regexp"
|
||||
"sort"
|
||||
)
|
||||
@@ -16,29 +15,9 @@ var (
|
||||
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
|
||||
)
|
||||
|
||||
type route struct {
|
||||
Freq float64
|
||||
Index int
|
||||
}
|
||||
|
||||
type routes []*route
|
||||
|
||||
func (rs routes) Len() int {
|
||||
return len(rs)
|
||||
}
|
||||
|
||||
func (rs routes) Less(i, j int) bool {
|
||||
if rs[i].Freq < rs[j].Freq {
|
||||
return true
|
||||
}
|
||||
if rs[i].Freq == rs[j].Freq {
|
||||
return rs[i].Index < rs[j].Index
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (rs routes) Swap(i, j int) {
|
||||
rs[i], rs[j] = rs[j], rs[i]
|
||||
// Segmenter is the dictionary view that DAG and Routes need.
type Segmenter interface {
	// Freq looks up a fragment. DAG stops extending a fragment as soon as
	// the boolean result is false, and treats only fragments with a
	// frequency above zero as words.
	Freq(string) (float64, bool)
	// Total returns the value whose logarithm Routes subtracts from each
	// word's log-frequency.
	Total() float64
}
|
||||
|
||||
type Jieba struct {
|
||||
@@ -84,77 +63,21 @@ func New() *Jieba {
|
||||
// name in current directory. This function must be called before cut any
|
||||
// sentence.
|
||||
func Open(dictFileName string) (*Jieba, error) {
|
||||
j := &Jieba{total: 0.0, freqMap: make(map[string]float64)}
|
||||
j := New()
|
||||
err := LoadDict(j, dictFileName, false)
|
||||
return j, err
|
||||
}
|
||||
|
||||
// Build a directed acyclic graph (DAG) for sentence.
|
||||
func (j *Jieba) DAG(sentence string) map[int][]int {
|
||||
dag := make(map[int][]int)
|
||||
runes := []rune(sentence)
|
||||
n := len(runes)
|
||||
var frag string
|
||||
for k := 0; k < n; k++ {
|
||||
tmpList := make([]int, 0)
|
||||
i := k
|
||||
frag = string(runes[k])
|
||||
for {
|
||||
if freq, ok := j.Freq(frag); !ok {
|
||||
break
|
||||
} else {
|
||||
if freq > 0.0 {
|
||||
tmpList = append(tmpList, i)
|
||||
}
|
||||
}
|
||||
i += 1
|
||||
if i >= n {
|
||||
break
|
||||
}
|
||||
frag = string(runes[k : i+1])
|
||||
}
|
||||
if len(tmpList) == 0 {
|
||||
tmpList = append(tmpList, k)
|
||||
}
|
||||
dag[k] = tmpList
|
||||
}
|
||||
return dag
|
||||
}
|
||||
|
||||
func (j *Jieba) Calc(sentence string, dag map[int][]int) map[int]*route {
|
||||
runes := []rune(sentence)
|
||||
number := len(runes)
|
||||
rs := make(map[int]*route)
|
||||
rs[number] = &route{Freq: 0.0, Index: 0}
|
||||
logTotal := math.Log(j.Total())
|
||||
for idx := number - 1; idx >= 0; idx-- {
|
||||
candidates := make(routes, 0)
|
||||
for _, i := range dag[idx] {
|
||||
word := string(runes[idx : i+1])
|
||||
var r *route
|
||||
if freq, ok := j.Freq(word); ok {
|
||||
r = &route{Freq: math.Log(freq) - logTotal + rs[i+1].Freq, Index: i}
|
||||
} else {
|
||||
r = &route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i}
|
||||
}
|
||||
candidates = append(candidates, r)
|
||||
}
|
||||
sort.Sort(sort.Reverse(candidates))
|
||||
rs[idx] = candidates[0]
|
||||
}
|
||||
return rs
|
||||
}
|
||||
|
||||
type cutFunc func(sentence string) chan string
|
||||
|
||||
func (j *Jieba) cutDAG(sentence string) chan string {
|
||||
result := make(chan string)
|
||||
go func() {
|
||||
dag := j.DAG(sentence)
|
||||
routes := j.Calc(sentence, dag)
|
||||
runes := []rune(sentence)
|
||||
dag := DAG(j, runes)
|
||||
routes := Routes(j, runes, dag)
|
||||
x := 0
|
||||
var y int
|
||||
runes := []rune(sentence)
|
||||
length := len(runes)
|
||||
buf := make([]rune, 0)
|
||||
for {
|
||||
@@ -214,11 +137,11 @@ func (j *Jieba) cutDAGNoHMM(sentence string) chan string {
|
||||
result := make(chan string)
|
||||
|
||||
go func() {
|
||||
dag := j.DAG(sentence)
|
||||
routes := j.Calc(sentence, dag)
|
||||
runes := []rune(sentence)
|
||||
dag := DAG(j, runes)
|
||||
routes := Routes(j, runes, dag)
|
||||
x := 0
|
||||
var y int
|
||||
runes := []rune(sentence)
|
||||
length := len(runes)
|
||||
buf := make([]rune, 0)
|
||||
for {
|
||||
@@ -253,7 +176,7 @@ func (j *Jieba) cutAll(sentence string) chan string {
|
||||
|
||||
go func() {
|
||||
runes := []rune(sentence)
|
||||
dag := j.DAG(sentence)
|
||||
dag := DAG(j, runes)
|
||||
old_j := -1
|
||||
ks := make([]int, 0)
|
||||
for k := range dag {
|
||||
|
||||
@@ -35,8 +35,7 @@ func (p *Posseg) AddEntry(entry jiebago.Entry) {
|
||||
// Set dictionary, it could be absolute path of dictionary file, or dictionary
|
||||
// name in current directory.
|
||||
func NewPosseg(dictFileName string) (*Posseg, error) {
|
||||
j := jiebago.New()
|
||||
p := &Posseg{j, make(map[string]string)}
|
||||
p := &Posseg{jiebago.New(), make(map[string]string)}
|
||||
err := jiebago.LoadDict(p, dictFileName, true)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -114,10 +113,10 @@ func (p *Posseg) cutDAG(sentence string) chan Pair {
|
||||
result := make(chan Pair)
|
||||
|
||||
go func() {
|
||||
dag := p.DAG(sentence)
|
||||
routes := p.Calc(sentence, dag)
|
||||
var y int
|
||||
runes := []rune(sentence)
|
||||
dag := jiebago.DAG(p, runes)
|
||||
routes := jiebago.Routes(p, runes, dag)
|
||||
var y int
|
||||
length := len(runes)
|
||||
buf := make([]rune, 0)
|
||||
for x := 0; x < length; {
|
||||
@@ -200,11 +199,11 @@ func (p *Posseg) cutDAGNoHMM(sentence string) chan Pair {
|
||||
result := make(chan Pair)
|
||||
|
||||
go func() {
|
||||
dag := p.DAG(sentence)
|
||||
routes := p.Calc(sentence, dag)
|
||||
runes := []rune(sentence)
|
||||
dag := jiebago.DAG(p, runes)
|
||||
routes := jiebago.Routes(p, runes, dag)
|
||||
x := 0
|
||||
var y int
|
||||
runes := []rune(sentence)
|
||||
length := len(runes)
|
||||
buf := make([]rune, 0)
|
||||
for {
|
||||
|
||||
Reference in New Issue
Block a user