diff --git a/finalseg/finalseg.go b/finalseg/finalseg.go index 8725610..0b19ecd 100644 --- a/finalseg/finalseg.go +++ b/finalseg/finalseg.go @@ -9,80 +9,86 @@ var ( reSkip = regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`) ) -func cutHan(sentence string) []string { - runes := []rune(sentence) - result := make([]string, 0) - _, pos_list := viterbi(runes, []byte{'B', 'M', 'E', 'S'}) - begin, next := 0, 0 - for i, char := range runes { - pos := pos_list[i] - switch pos { - case 'B': - begin = i - case 'E': - result = append(result, string(runes[begin:i+1])) - next = i + 1 - case 'S': - result = append(result, string(char)) - next = i + 1 +func cutHan(sentence string) chan string { + result := make(chan string) + go func() { + runes := []rune(sentence) + _, pos_list := viterbi(runes, []byte{'B', 'M', 'E', 'S'}) + begin, next := 0, 0 + for i, char := range runes { + pos := pos_list[i] + switch pos { + case 'B': + begin = i + case 'E': + result <- string(runes[begin : i+1]) + next = i + 1 + case 'S': + result <- string(char) + next = i + 1 + } } - } - if next < len(runes) { - result = append(result, string(runes[next:])) - } + if next < len(runes) { + result <- string(runes[next:]) + } + close(result) + }() return result } -func Cut(sentence string) []string { - result := make([]string, 0) +func Cut(sentence string) chan string { + result := make(chan string) s := sentence var hans string var hanLoc []int var nonhanLoc []int - for { - hanLoc = reHan.FindStringIndex(s) - if hanLoc == nil { - if len(s) == 0 { - break - } - } else if hanLoc[0] == 0 { - hans = s[hanLoc[0]:hanLoc[1]] - s = s[hanLoc[1]:] - for _, han := range cutHan(hans) { - result = append(result, han) - } - continue - } - nonhanLoc = reSkip.FindStringIndex(s) - if nonhanLoc == nil { - if len(s) == 0 { - break - } - } else if nonhanLoc[0] == 0 { - nonhans := s[nonhanLoc[0]:nonhanLoc[1]] - s = s[nonhanLoc[1]:] - if nonhans != "" { - result = append(result, nonhans) + go func() { + for { + hanLoc = reHan.FindStringIndex(s) + if hanLoc == nil { + if len(s) == 0 { + break + } + } else if hanLoc[0] == 0 { + hans = s[hanLoc[0]:hanLoc[1]] + s = s[hanLoc[1]:] + for han := range cutHan(hans) { + result <- han + } continue } - } - var loc []int - if hanLoc == nil && nonhanLoc == nil { - if len(s) > 0 { - result = append(result, s) - break + nonhanLoc = reSkip.FindStringIndex(s) + if nonhanLoc == nil { + if len(s) == 0 { + break + } + } else if nonhanLoc[0] == 0 { + nonhans := s[nonhanLoc[0]:nonhanLoc[1]] + s = s[nonhanLoc[1]:] + if nonhans != "" { + result <- nonhans + continue + } } - } else if hanLoc == nil { - loc = nonhanLoc - } else if nonhanLoc == nil { - loc = hanLoc - } else if hanLoc[0] < nonhanLoc[0] { - loc = hanLoc - } else { - loc = nonhanLoc + var loc []int + if hanLoc == nil && nonhanLoc == nil { + if len(s) > 0 { + result <- s + break + } + } else if hanLoc == nil { + loc = nonhanLoc + } else if nonhanLoc == nil { + loc = hanLoc + } else if hanLoc[0] < nonhanLoc[0] { + loc = hanLoc + } else { + loc = nonhanLoc + } + result <- s[:loc[0]] + s = s[loc[0]:] } - result = append(result, s[:loc[0]]) - s = s[loc[0]:] - } + close(result) + }() return result } diff --git a/finalseg/finalseg_test.go b/finalseg/finalseg_test.go index 7ae9bd9..1157ebd 100644 --- a/finalseg/finalseg_test.go +++ b/finalseg/finalseg_test.go @@ -5,6 +5,14 @@ import ( "testing" ) +func chanToArray(ch chan string) []string { + result := make([]string, 0) + for word := range ch { + result = append(result, word) + } + return result +} + func TestViterbi(t *testing.T) { obs := "我们是程序员" states := []byte{'B', 'M', 'E', 'S'} @@ -21,7 +29,7 @@ func TestViterbi(t *testing.T) { func TestCutHan(t *testing.T) { obs := "我们是程序员" - result := cutHan(obs) + result := chanToArray(cutHan(obs)) if len(result) != 3 { t.Error(result) } @@ -38,7 +46,7 @@ func TestCutHan(t *testing.T) { func TestCut(t *testing.T) { sentence := "我们是程序员" - result := Cut(sentence) + result := chanToArray(Cut(sentence)) if len(result) != 3 { t.Error(len(result)) } @@ -51,11 +59,11 @@ func TestCut(t *testing.T) { if result[2] != "程序员" { t.Error(result[2]) } - result2 := Cut("I'm a programmer!") + result2 := chanToArray(Cut("I'm a programmer!")) if len(result2) != 8 { t.Error(result2) } - result3 := Cut("程序员average年龄28.6岁。") + result3 := chanToArray(Cut("程序员average年龄28.6岁。")) if len(result3) != 6 { t.Error(result3) } diff --git a/jieba.go b/jieba.go index aea00ec..cf5d6c8 100644 --- a/jieba.go +++ b/jieba.go @@ -154,8 +154,7 @@ func cutDAG(sentence string) []string { } else { bufString := string(buf) if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 { - recognized := finalseg.Cut(bufString) - for _, t := range recognized { + for t := range finalseg.Cut(bufString) { result = append(result, t) } } else { @@ -177,8 +176,7 @@ func cutDAG(sentence string) []string { } else { bufString := string(buf) if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 { - recognized := finalseg.Cut(bufString) - for _, t := range recognized { + for t := range finalseg.Cut(bufString) { result = append(result, t) } } else {