From 7a7f8af517b27c9c204ef38e688bc628d64114e5 Mon Sep 17 00:00:00 2001 From: Wang Bin Date: Mon, 30 Mar 2015 17:10:48 +0800 Subject: [PATCH] move DAG related function to a seperated file, rename Calc to Routes --- dag.go | 84 ++++++++++++++++++++++++++++++++++++++++ jieba.go | 99 ++++++------------------------------------------ posseg/posseg.go | 15 ++++---- 3 files changed, 102 insertions(+), 96 deletions(-) create mode 100644 dag.go diff --git a/dag.go b/dag.go new file mode 100644 index 0000000..1f32f43 --- /dev/null +++ b/dag.go @@ -0,0 +1,84 @@ +package jiebago + +import ( + "math" + "sort" +) + +type route struct { + Freq float64 + Index int +} + +type routes []route + +func (rs routes) Len() int { + return len(rs) +} + +func (rs routes) Less(i, j int) bool { + if rs[i].Freq < rs[j].Freq { + return true + } + if rs[i].Freq == rs[j].Freq { + return rs[i].Index < rs[j].Index + } + return false +} + +func (rs routes) Swap(i, j int) { + rs[i], rs[j] = rs[j], rs[i] +} + +type dag map[int][]int + +func DAG(s Segmenter, runes []rune) dag { + d := make(dag) + n := len(runes) + var frag string + for k := 0; k < n; k++ { + tmpList := make([]int, 0) + i := k + frag = string(runes[k]) + for { + if freq, ok := s.Freq(frag); !ok { + break + } else { + if freq > 0.0 { + tmpList = append(tmpList, i) + } + } + i += 1 + if i >= n { + break + } + frag = string(runes[k : i+1]) + } + if len(tmpList) == 0 { + tmpList = append(tmpList, k) + } + d[k] = tmpList + } + return d +} + +func Routes(s Segmenter, runes []rune, d dag) map[int]route { + n := len(runes) + rs := make(map[int]route) + rs[n] = route{Freq: 0.0, Index: 0} + logTotal := math.Log(s.Total()) + for idx := n - 1; idx >= 0; idx-- { + candidates := make(routes, len(d[idx])) + for index, i := range d[idx] { + word := string(runes[idx : i+1]) + if freq, ok := s.Freq(word); ok { + candidates[index] = route{Freq: math.Log(freq) - logTotal + rs[i+1].Freq, Index: i} + } else { + candidates[index] = route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i} + } + } + sort.Sort(sort.Reverse(candidates)) + rs[idx] = candidates[0] + } + return rs +} diff --git a/jieba.go b/jieba.go index 6d4d0c3..c62e22e 100644 --- a/jieba.go +++ b/jieba.go @@ -3,7 +3,6 @@ package jiebago import ( "github.com/wangbin/jiebago/finalseg" - "math" "regexp" "sort" ) @@ -16,29 +15,9 @@ var ( reSkipDefault = regexp.MustCompile(`(\r\n|\s)`) ) -type route struct { - Freq float64 - Index int -} - -type routes []*route - -func (rs routes) Len() int { - return len(rs) -} - -func (rs routes) Less(i, j int) bool { - if rs[i].Freq < rs[j].Freq { - return true - } - if rs[i].Freq == rs[j].Freq { - return rs[i].Index < rs[j].Index - } - return false -} - -func (rs routes) Swap(i, j int) { - rs[i], rs[j] = rs[j], rs[i] +type Segmenter interface { + Freq(string) (float64, bool) + Total() float64 } type Jieba struct { @@ -84,77 +63,21 @@ func New() *Jieba { // name in current directory. This function must be called before cut any // sentence. func Open(dictFileName string) (*Jieba, error) { - j := &Jieba{total: 0.0, freqMap: make(map[string]float64)} + j := New() err := LoadDict(j, dictFileName, false) return j, err } -// Build a directed acyclic graph (DAG) for sentence. -func (j *Jieba) DAG(sentence string) map[int][]int { - dag := make(map[int][]int) - runes := []rune(sentence) - n := len(runes) - var frag string - for k := 0; k < n; k++ { - tmpList := make([]int, 0) - i := k - frag = string(runes[k]) - for { - if freq, ok := j.Freq(frag); !ok { - break - } else { - if freq > 0.0 { - tmpList = append(tmpList, i) - } - } - i += 1 - if i >= n { - break - } - frag = string(runes[k : i+1]) - } - if len(tmpList) == 0 { - tmpList = append(tmpList, k) - } - dag[k] = tmpList - } - return dag -} - -func (j *Jieba) Calc(sentence string, dag map[int][]int) map[int]*route { - runes := []rune(sentence) - number := len(runes) - rs := make(map[int]*route) - rs[number] = &route{Freq: 0.0, Index: 0} - logTotal := math.Log(j.Total()) - for idx := number - 1; idx >= 0; idx-- { - candidates := make(routes, 0) - for _, i := range dag[idx] { - word := string(runes[idx : i+1]) - var r *route - if freq, ok := j.Freq(word); ok { - r = &route{Freq: math.Log(freq) - logTotal + rs[i+1].Freq, Index: i} - } else { - r = &route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i} - } - candidates = append(candidates, r) - } - sort.Sort(sort.Reverse(candidates)) - rs[idx] = candidates[0] - } - return rs -} - type cutFunc func(sentence string) chan string func (j *Jieba) cutDAG(sentence string) chan string { result := make(chan string) go func() { - dag := j.DAG(sentence) - routes := j.Calc(sentence, dag) + runes := []rune(sentence) + dag := DAG(j, runes) + routes := Routes(j, runes, dag) x := 0 var y int - runes := []rune(sentence) length := len(runes) buf := make([]rune, 0) for { @@ -214,11 +137,11 @@ func (j *Jieba) cutDAGNoHMM(sentence string) chan string { result := make(chan string) go func() { - dag := j.DAG(sentence) - routes := j.Calc(sentence, dag) + runes := []rune(sentence) + dag := DAG(j, runes) + routes := Routes(j, runes, dag) x := 0 var y int - runes := []rune(sentence) length := len(runes) buf := make([]rune, 0) for { @@ -253,7 +176,7 @@ func (j *Jieba) cutAll(sentence string) chan string { go func() { runes := []rune(sentence) - dag := j.DAG(sentence) + dag := DAG(j, runes) old_j := -1 ks := make([]int, 0) for k := range dag { diff --git a/posseg/posseg.go b/posseg/posseg.go index 8beac1a..55c7033 100644 --- a/posseg/posseg.go +++ b/posseg/posseg.go @@ -35,8 +35,7 @@ func (p *Posseg) AddEntry(entry jiebago.Entry) { // Set dictionary, it could be absolute path of dictionary file, or dictionary // name in current diectory. func NewPosseg(dictFileName string) (*Posseg, error) { - j := jiebago.New() - p := &Posseg{j, make(map[string]string)} + p := &Posseg{jiebago.New(), make(map[string]string)} err := jiebago.LoadDict(p, dictFileName, true) if err != nil { return nil, err @@ -114,10 +113,10 @@ func (p *Posseg) cutDAG(sentence string) chan Pair { result := make(chan Pair) go func() { - dag := p.DAG(sentence) - routes := p.Calc(sentence, dag) - var y int runes := []rune(sentence) + dag := jiebago.DAG(p, runes) + routes := jiebago.Routes(p, runes, dag) + var y int length := len(runes) buf := make([]rune, 0) for x := 0; x < length; { @@ -200,11 +199,11 @@ func (p *Posseg) cutDAGNoHMM(sentence string) chan Pair { result := make(chan Pair) go func() { - dag := p.DAG(sentence) - routes := p.Calc(sentence, dag) + runes := []rune(sentence) + dag := jiebago.DAG(p, runes) + routes := jiebago.Routes(p, runes, dag) x := 0 var y int - runes := []rune(sentence) length := len(runes) buf := make([]rune, 0) for {