1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-07-02 10:00:27 +08:00

unify Cut method, return channel instead of array

This commit is contained in:
Wang Bin
2015-02-27 17:30:45 +08:00
parent c03b3eac1c
commit 43480db509
2 changed files with 125 additions and 115 deletions

View File

@@ -127,16 +127,17 @@ func Calc(sentence string, dag map[int][]int) map[int]*Route {
return routes return routes
} }
type cutFunc func(sentence string) []string type cutFunc func(sentence string) chan string
func cutDAG(sentence string) []string { func cutDAG(sentence string) chan string {
result := make(chan string)
go func() {
dag := DAG(sentence) dag := DAG(sentence)
routes := Calc(sentence, dag) routes := Calc(sentence, dag)
x := 0 x := 0
var y int var y int
runes := []rune(sentence) runes := []rune(sentence)
length := len(runes) length := len(runes)
result := make([]string, 0)
buf := make([]rune, 0) buf := make([]rune, 0)
for { for {
if x >= length { if x >= length {
@@ -149,49 +150,52 @@ func cutDAG(sentence string) []string {
} else { } else {
if len(buf) > 0 { if len(buf) > 0 {
if len(buf) == 1 { if len(buf) == 1 {
result = append(result, string(buf)) result <- string(buf)
buf = make([]rune, 0) buf = make([]rune, 0)
} else { } else {
bufString := string(buf) bufString := string(buf)
if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 { if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 {
for t := range finalseg.Cut(bufString) { for x := range finalseg.Cut(bufString) {
result = append(result, t) result <- x
} }
} else { } else {
for _, elem := range buf { for _, elem := range buf {
result = append(result, string(elem)) // TODO: I don't get this? result <- string(elem) // TODO: I don't get this?
} }
} }
buf = make([]rune, 0) buf = make([]rune, 0)
} }
} }
result = append(result, string(l_word)) result <- string(l_word)
} }
x = y x = y
} }
if len(buf) > 0 { if len(buf) > 0 {
if len(buf) == 1 { if len(buf) == 1 {
result = append(result, string(buf)) result <- string(buf)
} else { } else {
bufString := string(buf) bufString := string(buf)
if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 { if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 {
for t := range finalseg.Cut(bufString) { for t := range finalseg.Cut(bufString) {
result = append(result, t) result <- t
} }
} else { } else {
for _, elem := range buf { for _, elem := range buf {
result = append(result, string(elem)) // TODO: I don't get this? result <- string(elem) // TODO: I don't get this?
} }
} }
} }
} }
close(result)
}()
return result return result
} }
func cutDAGNoHMM(sentence string) []string { func cutDAGNoHMM(sentence string) chan string {
result := make([]string, 0) result := make(chan string)
go func() {
dag := DAG(sentence) dag := DAG(sentence)
routes := Calc(sentence, dag) routes := Calc(sentence, dag)
x := 0 x := 0
@@ -210,22 +214,26 @@ func cutDAGNoHMM(sentence string) []string {
x = y x = y
} else { } else {
if len(buf) > 0 { if len(buf) > 0 {
result = append(result, string(buf)) result <- string(buf)
buf = make([]rune, 0) buf = make([]rune, 0)
} }
result = append(result, string(l_word)) result <- string(l_word)
x = y x = y
} }
} }
if len(buf) > 0 { if len(buf) > 0 {
result = append(result, string(buf)) result <- string(buf)
buf = make([]rune, 0) buf = make([]rune, 0)
} }
close(result)
}()
return result return result
} }
func cutAll(sentence string) []string { func cutAll(sentence string) chan string {
result := make([]string, 0) result := make(chan string)
go func() {
runes := []rune(sentence) runes := []rune(sentence)
dag := DAG(sentence) dag := DAG(sentence)
old_j := -1 old_j := -1
@@ -237,17 +245,19 @@ func cutAll(sentence string) []string {
for k := range ks { for k := range ks {
l := dag[k] l := dag[k]
if len(l) == 1 && k > old_j { if len(l) == 1 && k > old_j {
result = append(result, string(runes[k:l[0]+1])) result <- string(runes[k : l[0]+1])
old_j = l[0] old_j = l[0]
} else { } else {
for _, j := range l { for _, j := range l {
if j > k { if j > k {
result = append(result, string(runes[k:j+1])) result <- string(runes[k : j+1])
old_j = j old_j = j
} }
} }
} }
} }
close(result)
}()
return result return result
} }
@@ -277,8 +287,8 @@ func Cut(sentence string, isCutAll bool, HMM bool) chan string {
continue continue
} }
if reHan.MatchString(blk) { if reHan.MatchString(blk) {
for _, word := range cut(blk) { for x := range cut(blk) {
result <- word result <- x
} }
} else { } else {
type skipSplitFunc func(sentence string) []string type skipSplitFunc func(sentence string) []string

View File

@@ -621,15 +621,23 @@ func init() {
SetDictionary("dict.txt") SetDictionary("dict.txt")
} }
// chanToArray drains ch until it is closed and returns every received word
// in arrival order. It blocks while the channel stays open, and yields an
// empty (non-nil) slice when the channel is closed without sending anything.
func chanToArray(ch chan string) []string {
	words := []string{}
	for {
		w, open := <-ch
		if !open {
			// Channel closed and fully drained: nothing more will arrive.
			return words
		}
		words = append(words, w)
	}
}
func TestCutDAG(t *testing.T) { func TestCutDAG(t *testing.T) {
result := cutDAG("BP神经网络如何训练才能在分类时增加区分度") result := chanToArray(cutDAG("BP神经网络如何训练才能在分类时增加区分度"))
if len(result) != 11 { if len(result) != 11 {
t.Error(result) t.Error(result)
} }
} }
func TestCutDAGNoHmm(t *testing.T) { func TestCutDAGNoHmm(t *testing.T) {
result := cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度") result := chanToArray(cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度"))
if len(result) != 11 { if len(result) != 11 {
t.Error(result) t.Error(result)
} }
@@ -648,14 +656,6 @@ func TestRegexpSplit(t *testing.T) {
} }
} }
// chanToArray collects everything sent on ch into a slice, preserving the
// order of receipt. It returns only after ch has been closed; a channel
// closed without any sends produces an empty, non-nil slice.
func chanToArray(ch chan string) []string {
	collected := []string{}
	for {
		item, more := <-ch
		if !more {
			break
		}
		collected = append(collected, item)
	}
	return collected
}
func TestDefaultCut(t *testing.T) { func TestDefaultCut(t *testing.T) {
var result []string var result []string
for index, content := range test_contents { for index, content := range test_contents {