1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-27 15:40:32 +08:00

unify Cut method, return channel instead of array

This commit is contained in:
Wang Bin
2015-02-27 17:15:23 +08:00
parent d76fbfb017
commit c03b3eac1c
3 changed files with 83 additions and 71 deletions

View File

@@ -9,80 +9,86 @@ var (
reSkip = regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`) reSkip = regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`)
) )
func cutHan(sentence string) []string { func cutHan(sentence string) chan string {
runes := []rune(sentence) result := make(chan string)
result := make([]string, 0) go func() {
_, pos_list := viterbi(runes, []byte{'B', 'M', 'E', 'S'}) runes := []rune(sentence)
begin, next := 0, 0 _, pos_list := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
for i, char := range runes { begin, next := 0, 0
pos := pos_list[i] for i, char := range runes {
switch pos { pos := pos_list[i]
case 'B': switch pos {
begin = i case 'B':
case 'E': begin = i
result = append(result, string(runes[begin:i+1])) case 'E':
next = i + 1 result <- string(runes[begin : i+1])
case 'S': next = i + 1
result = append(result, string(char)) case 'S':
next = i + 1 result <- string(char)
next = i + 1
}
} }
} if next < len(runes) {
if next < len(runes) { result <- string(runes[next:])
result = append(result, string(runes[next:])) }
} close(result)
}()
return result return result
} }
func Cut(sentence string) []string { func Cut(sentence string) chan string {
result := make([]string, 0) result := make(chan string)
s := sentence s := sentence
var hans string var hans string
var hanLoc []int var hanLoc []int
var nonhanLoc []int var nonhanLoc []int
for { go func() {
hanLoc = reHan.FindStringIndex(s) for {
if hanLoc == nil { hanLoc = reHan.FindStringIndex(s)
if len(s) == 0 { if hanLoc == nil {
break if len(s) == 0 {
} break
} else if hanLoc[0] == 0 { }
hans = s[hanLoc[0]:hanLoc[1]] } else if hanLoc[0] == 0 {
s = s[hanLoc[1]:] hans = s[hanLoc[0]:hanLoc[1]]
for _, han := range cutHan(hans) { s = s[hanLoc[1]:]
result = append(result, han) for han := range cutHan(hans) {
} result <- han
continue }
}
nonhanLoc = reSkip.FindStringIndex(s)
if nonhanLoc == nil {
if len(s) == 0 {
break
}
} else if nonhanLoc[0] == 0 {
nonhans := s[nonhanLoc[0]:nonhanLoc[1]]
s = s[nonhanLoc[1]:]
if nonhans != "" {
result = append(result, nonhans)
continue continue
} }
} nonhanLoc = reSkip.FindStringIndex(s)
var loc []int if nonhanLoc == nil {
if hanLoc == nil && nonhanLoc == nil { if len(s) == 0 {
if len(s) > 0 { break
result = append(result, s) }
break } else if nonhanLoc[0] == 0 {
nonhans := s[nonhanLoc[0]:nonhanLoc[1]]
s = s[nonhanLoc[1]:]
if nonhans != "" {
result <- nonhans
continue
}
} }
} else if hanLoc == nil { var loc []int
loc = nonhanLoc if hanLoc == nil && nonhanLoc == nil {
} else if nonhanLoc == nil { if len(s) > 0 {
loc = hanLoc result <- s
} else if hanLoc[0] < nonhanLoc[0] { break
loc = hanLoc }
} else { } else if hanLoc == nil {
loc = nonhanLoc loc = nonhanLoc
} else if nonhanLoc == nil {
loc = hanLoc
} else if hanLoc[0] < nonhanLoc[0] {
loc = hanLoc
} else {
loc = nonhanLoc
}
result <- s[:loc[0]]
s = s[loc[0]:]
} }
result = append(result, s[:loc[0]]) close(result)
s = s[loc[0]:] }()
}
return result return result
} }

View File

@@ -5,6 +5,14 @@ import (
"testing" "testing"
) )
// chanToArray drains ch until the sender closes it and returns every
// received word, in arrival order, as a slice. It blocks until ch is
// closed. The returned slice is always non-nil, even when nothing was
// received.
func chanToArray(ch chan string) []string {
	words := []string{}
	for {
		w, open := <-ch
		if !open {
			// Channel closed and fully drained: nothing more will arrive.
			return words
		}
		words = append(words, w)
	}
}
func TestViterbi(t *testing.T) { func TestViterbi(t *testing.T) {
obs := "我们是程序员" obs := "我们是程序员"
states := []byte{'B', 'M', 'E', 'S'} states := []byte{'B', 'M', 'E', 'S'}
@@ -21,7 +29,7 @@ func TestViterbi(t *testing.T) {
func TestCutHan(t *testing.T) { func TestCutHan(t *testing.T) {
obs := "我们是程序员" obs := "我们是程序员"
result := cutHan(obs) result := chanToArray(cutHan(obs))
if len(result) != 3 { if len(result) != 3 {
t.Error(result) t.Error(result)
} }
@@ -38,7 +46,7 @@ func TestCutHan(t *testing.T) {
func TestCut(t *testing.T) { func TestCut(t *testing.T) {
sentence := "我们是程序员" sentence := "我们是程序员"
result := Cut(sentence) result := chanToArray(Cut(sentence))
if len(result) != 3 { if len(result) != 3 {
t.Error(len(result)) t.Error(len(result))
} }
@@ -51,11 +59,11 @@ func TestCut(t *testing.T) {
if result[2] != "程序员" { if result[2] != "程序员" {
t.Error(result[2]) t.Error(result[2])
} }
result2 := Cut("I'm a programmer!") result2 := chanToArray(Cut("I'm a programmer!"))
if len(result2) != 8 { if len(result2) != 8 {
t.Error(result2) t.Error(result2)
} }
result3 := Cut("程序员average年龄28.6岁。") result3 := chanToArray(Cut("程序员average年龄28.6岁。"))
if len(result3) != 6 { if len(result3) != 6 {
t.Error(result3) t.Error(result3)
} }

View File

@@ -154,8 +154,7 @@ func cutDAG(sentence string) []string {
} else { } else {
bufString := string(buf) bufString := string(buf)
if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 { if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 {
recognized := finalseg.Cut(bufString) for t := range finalseg.Cut(bufString) {
for _, t := range recognized {
result = append(result, t) result = append(result, t)
} }
} else { } else {
@@ -177,8 +176,7 @@ func cutDAG(sentence string) []string {
} else { } else {
bufString := string(buf) bufString := string(buf)
if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 { if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 {
recognized := finalseg.Cut(bufString) for t := range finalseg.Cut(bufString) {
for _, t := range recognized {
result = append(result, t) result = append(result, t)
} }
} else { } else {