1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-11 04:10:25 +08:00

unify Cut method, return channel instead of array

This commit is contained in:
Wang Bin
2015-02-27 17:30:45 +08:00
parent c03b3eac1c
commit 43480db509
2 changed files with 125 additions and 115 deletions

220
jieba.go
View File

@@ -127,127 +127,137 @@ func Calc(sentence string, dag map[int][]int) map[int]*Route {
return routes
}
type cutFunc func(sentence string) []string
type cutFunc func(sentence string) chan string
func cutDAG(sentence string) []string {
dag := DAG(sentence)
routes := Calc(sentence, dag)
x := 0
var y int
runes := []rune(sentence)
length := len(runes)
result := make([]string, 0)
buf := make([]rune, 0)
for {
if x >= length {
break
}
y = routes[x].Index + 1
l_word := runes[x:y]
if y-x == 1 {
buf = append(buf, l_word...)
} else {
if len(buf) > 0 {
if len(buf) == 1 {
result = append(result, string(buf))
buf = make([]rune, 0)
} else {
bufString := string(buf)
if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 {
for t := range finalseg.Cut(bufString) {
result = append(result, t)
}
} else {
for _, elem := range buf {
result = append(result, string(elem)) // TODO: I don't get this?
}
}
buf = make([]rune, 0)
}
func cutDAG(sentence string) chan string {
result := make(chan string)
go func() {
dag := DAG(sentence)
routes := Calc(sentence, dag)
x := 0
var y int
runes := []rune(sentence)
length := len(runes)
buf := make([]rune, 0)
for {
if x >= length {
break
}
result = append(result, string(l_word))
}
x = y
}
if len(buf) > 0 {
if len(buf) == 1 {
result = append(result, string(buf))
} else {
bufString := string(buf)
if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 {
for t := range finalseg.Cut(bufString) {
result = append(result, t)
}
y = routes[x].Index + 1
l_word := runes[x:y]
if y-x == 1 {
buf = append(buf, l_word...)
} else {
for _, elem := range buf {
result = append(result, string(elem)) // TODO: I don't get this?
if len(buf) > 0 {
if len(buf) == 1 {
result <- string(buf)
buf = make([]rune, 0)
} else {
bufString := string(buf)
if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 {
for x := range finalseg.Cut(bufString) {
result <- x
}
} else {
for _, elem := range buf {
result <- string(elem) // TODO: I don't get this?
}
}
buf = make([]rune, 0)
}
}
result <- string(l_word)
}
x = y
}
if len(buf) > 0 {
if len(buf) == 1 {
result <- string(buf)
} else {
bufString := string(buf)
if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 {
for t := range finalseg.Cut(bufString) {
result <- t
}
} else {
for _, elem := range buf {
result <- string(elem) // TODO: I don't get this?
}
}
}
}
}
close(result)
}()
return result
}
func cutDAGNoHMM(sentence string) []string {
result := make([]string, 0)
func cutDAGNoHMM(sentence string) chan string {
result := make(chan string)
dag := DAG(sentence)
routes := Calc(sentence, dag)
x := 0
var y int
runes := []rune(sentence)
length := len(runes)
buf := make([]rune, 0)
for {
if x >= length {
break
}
y = routes[x].Index + 1
l_word := runes[x:y]
if reEng.MatchString(string(l_word)) && len(l_word) == 1 {
buf = append(buf, l_word...)
x = y
} else {
if len(buf) > 0 {
result = append(result, string(buf))
buf = make([]rune, 0)
go func() {
dag := DAG(sentence)
routes := Calc(sentence, dag)
x := 0
var y int
runes := []rune(sentence)
length := len(runes)
buf := make([]rune, 0)
for {
if x >= length {
break
}
y = routes[x].Index + 1
l_word := runes[x:y]
if reEng.MatchString(string(l_word)) && len(l_word) == 1 {
buf = append(buf, l_word...)
x = y
} else {
if len(buf) > 0 {
result <- string(buf)
buf = make([]rune, 0)
}
result <- string(l_word)
x = y
}
result = append(result, string(l_word))
x = y
}
}
if len(buf) > 0 {
result = append(result, string(buf))
buf = make([]rune, 0)
}
if len(buf) > 0 {
result <- string(buf)
buf = make([]rune, 0)
}
close(result)
}()
return result
}
func cutAll(sentence string) []string {
result := make([]string, 0)
runes := []rune(sentence)
dag := DAG(sentence)
old_j := -1
ks := make([]int, 0)
for k := range dag {
ks = append(ks, k)
}
sort.Ints(ks)
for k := range ks {
l := dag[k]
if len(l) == 1 && k > old_j {
result = append(result, string(runes[k:l[0]+1]))
old_j = l[0]
} else {
for _, j := range l {
if j > k {
result = append(result, string(runes[k:j+1]))
old_j = j
func cutAll(sentence string) chan string {
result := make(chan string)
go func() {
runes := []rune(sentence)
dag := DAG(sentence)
old_j := -1
ks := make([]int, 0)
for k := range dag {
ks = append(ks, k)
}
sort.Ints(ks)
for k := range ks {
l := dag[k]
if len(l) == 1 && k > old_j {
result <- string(runes[k : l[0]+1])
old_j = l[0]
} else {
for _, j := range l {
if j > k {
result <- string(runes[k : j+1])
old_j = j
}
}
}
}
}
close(result)
}()
return result
}
@@ -277,8 +287,8 @@ func Cut(sentence string, isCutAll bool, HMM bool) chan string {
continue
}
if reHan.MatchString(blk) {
for _, word := range cut(blk) {
result <- word
for x := range cut(blk) {
result <- x
}
} else {
type skipSplitFunc func(sentence string) []string

View File

@@ -621,15 +621,23 @@ func init() {
SetDictionary("dict.txt")
}
// chanToArray receives from ch until it is closed and returns every string
// it received, in arrival order. The returned slice is empty but non-nil when
// the channel is closed without delivering anything. The call blocks for as
// long as ch stays open.
func chanToArray(ch chan string) []string {
	words := []string{}
	for {
		word, open := <-ch
		if !open {
			// Channel closed and fully drained: nothing more will arrive.
			return words
		}
		words = append(words, word)
	}
}
func TestCutDAG(t *testing.T) {
result := cutDAG("BP神经网络如何训练才能在分类时增加区分度")
result := chanToArray(cutDAG("BP神经网络如何训练才能在分类时增加区分度"))
if len(result) != 11 {
t.Error(result)
}
}
func TestCutDAGNoHmm(t *testing.T) {
result := cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度")
result := chanToArray(cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度"))
if len(result) != 11 {
t.Error(result)
}
@@ -648,14 +656,6 @@ func TestRegexpSplit(t *testing.T) {
}
}
// chanToArray receives from ch until it is closed and returns every string
// it received, in arrival order. The returned slice is empty but non-nil when
// the channel is closed without delivering anything. The call blocks for as
// long as ch stays open.
func chanToArray(ch chan string) []string {
	words := []string{}
	for {
		word, open := <-ch
		if !open {
			// Channel closed and fully drained: nothing more will arrive.
			return words
		}
		words = append(words, word)
	}
}
func TestDefaultCut(t *testing.T) {
var result []string
for index, content := range test_contents {