Change the Cut method to return a channel of strings (chan string) instead of []string

2026-07-02 10:00:27 +08:00 · 2015-02-27 11:37:55 +08:00
parent 87fe3a19f8
commit 76b9df8511
4 changed files with 91 additions and 80 deletions
--- a/analyse/analyse.go
+++ b/analyse/analyse.go
@@ -36,10 +36,9 @@ func (tis TfIdfs) Swap(i, j int) {
 }
 func ExtractTags(sentence string, topK int) (tags TfIdfs) {
 	words := jiebago.Cut(sentence, false, true)
 	freq := make(map[string]float64)
-	for _, w := range words {
+	for w := range jiebago.Cut(sentence, false, true) {
 		w = strings.TrimSpace(w)
 		if utf8.RuneCountInString(w) < 2 {
 			continue
--- a/jieba.go
+++ b/jieba.go
@@ -253,80 +253,85 @@ func cutAll(sentence string) []string {
 	return result
 }
-func Cut(sentence string, isCutAll bool, HMM bool) []string {
+func Cut(sentence string, isCutAll bool, HMM bool) chan string {
-	result := make([]string, 0)
+	result := make(chan string)
-	var reHan, reSkip *regexp.Regexp
+	go func() {
-	if isCutAll {
+		var reHan, reSkip *regexp.Regexp
-		reHan = reHanCutAll
+		if isCutAll {
-		reSkip = reSkipCutAll
+			reHan = reHanCutAll
-	} else {
+			reSkip = reSkipCutAll
 		reHan = reHanDefault
 		reSkip = reSkipDefault
 	}
 	blocks := RegexpSplit(reHan, sentence)
 	var cut cutFunc
 	if HMM {
 		cut = cutDAG
 	} else {
 		cut = cutDAGNoHMM
 	}
 	if isCutAll {
 		cut = cutAll
 	}
 	for _, blk := range blocks {
 		if len(blk) == 0 {
 			continue
 		}
 		if reHan.MatchString(blk) {
 			for _, word := range cut(blk) {
 				result = append(result, word)
 			}
 		} else {
-			type skipSplitFunc func(sentence string) []string
+			reHan = reHanDefault
-			var ssf skipSplitFunc
+			reSkip = reSkipDefault
-			if isCutAll {
+		}
-				ssf = func(sentence string) []string {
+		blocks := RegexpSplit(reHan, sentence)
-					return reSkip.Split(sentence, -1)
+		var cut cutFunc
 		if HMM {
 			cut = cutDAG
 		} else {
 			cut = cutDAGNoHMM
 		}
 		if isCutAll {
 			cut = cutAll
 		}
 		for _, blk := range blocks {
 			if len(blk) == 0 {
 				continue
 			}
 			if reHan.MatchString(blk) {
 				for _, word := range cut(blk) {
 					result <- word
 				}
 			} else {
-				ssf = func(sentence string) []string {
+				type skipSplitFunc func(sentence string) []string
-					return RegexpSplit(reSkip, sentence)
+				var ssf skipSplitFunc
-				}
+				if isCutAll {
-			}
+					ssf = func(sentence string) []string {
-
+						return reSkip.Split(sentence, -1)
 			for _, x := range ssf(blk) {
 				if reSkip.MatchString(x) {
 					result = append(result, x)
 				} else if !isCutAll {
 					for _, xx := range x {
 						result = append(result, string(xx))
 					}
 				} else {
-					result = append(result, x)
+					ssf = func(sentence string) []string {
 						return RegexpSplit(reSkip, sentence)
 					}
 				}
 			}
 		}
 	}
 	return result
 }
-func CutForSearch(sentence string, hmm bool) []string {
+				for _, x := range ssf(blk) {
-	result := make([]string, 0)
+					if reSkip.MatchString(x) {
-	words := Cut(sentence, false, hmm)
+						result <- x
-	for _, word := range words {
+					} else if !isCutAll {
-		runes := []rune(word)
+						for _, xx := range x {
-		for _, increment := range []int{2, 3} {
+							result <- string(xx)
-			if len(runes) > increment {
+						}
-				var gram2 string
+					} else {
-				for i := 0; i < len(runes)-increment+1; i++ {
+						result <- x
 					gram2 = string(runes[i : i+increment])
 					if v, ok := Trie.Freq[gram2]; ok && v > 0.0 {
 						result = append(result, gram2)
 					}
 				}
 			}
 		}
-		result = append(result, word)
+		close(result)
-	}
+	}()
 	return result
 }
 // CutForSearch segments sentence for search-style use and streams the pieces
 // over an unbuffered channel that is closed once everything has been sent.
 //
 // Each word received from Cut(sentence, false, hmm) is preceded by its 2-rune
 // and then 3-rune substrings whose Trie.Freq entry exists and is greater than
 // zero; the word itself is sent last. Sends block until received, so the
 // caller has to drain the channel for the producing goroutine to finish.
 func CutForSearch(sentence string, hmm bool) chan string {
 	out := make(chan string)
 	emit := func() {
 		for word := range Cut(sentence, false, hmm) {
 			chars := []rune(word)
 			for _, size := range []int{2, 3} {
 				// Only words strictly longer than the window are split.
 				if len(chars) <= size {
 					continue
 				}
 				for start := 0; start+size <= len(chars); start++ {
 					gram := string(chars[start : start+size])
 					freq, known := Trie.Freq[gram]
 					if known && freq > 0.0 {
 						out <- gram
 					}
 				}
 			}
 			out <- word
 		}
 		close(out)
 	}
 	go emit()
 	return out
 }
--- a/jieba_test.go
+++ b/jieba_test.go
@@ -648,10 +648,18 @@ func TestRegexpSplit(t *testing.T) {
 	}
 }
 // chanToArray receives from ch until it is closed and returns everything it
 // received, in arrival order. The result is an empty, non-nil slice when the
 // channel yields nothing.
 func chanToArray(ch chan string) []string {
 	words := []string{}
 	for {
 		word, open := <-ch
 		if !open {
 			return words
 		}
 		words = append(words, word)
 	}
 }
 func TestDefaultCut(t *testing.T) {
 	var result []string
 	for index, content := range test_contents {
-		result = Cut(content, false, true)
+		result = chanToArray(Cut(content, false, true))
 		if len(result) != len(defaultCutResult[index]) {
 			t.Errorf("default cut for %s length should be %d not %d\n",
 				content, len(defaultCutResult[index]), len(result))
@@ -667,7 +675,7 @@ func TestDefaultCut(t *testing.T) {
 func TestCutAll(t *testing.T) {
 	var result []string
 	for index, content := range test_contents {
-		result = Cut(content, true, true)
+		result = chanToArray(Cut(content, true, true))
 		if len(result) != len(cutAllResult[index]) {
 			t.Errorf("cut all for %s length should be %d not %d\n",
 				content, len(cutAllResult[index]), len(result))
@@ -683,7 +691,7 @@ func TestCutAll(t *testing.T) {
 func TestDefaultCutNoHMM(t *testing.T) {
 	var result []string
 	for index, content := range test_contents {
-		result = Cut(content, false, false)
+		result = chanToArray(Cut(content, false, false))
 		if len(result) != len(defaultCutNoHMMResult[index]) {
 			t.Errorf("default cut no hmm for %s length should be %d not %d\n",
 				content, len(defaultCutNoHMMResult[index]), len(result))
@@ -699,7 +707,7 @@ func TestDefaultCutNoHMM(t *testing.T) {
 func TestCutForSearch(t *testing.T) {
 	var result []string
 	for index, content := range test_contents {
-		result = CutForSearch(content, true)
+		result = chanToArray(CutForSearch(content, true))
 		if len(result) != len(cutForSearchResult[index]) {
 			t.Errorf("cut for search for %s length should be %d not %d\n",
 				content, len(cutForSearchResult[index]), len(result))
@@ -711,7 +719,7 @@ func TestCutForSearch(t *testing.T) {
 		}
 	}
 	for index, content := range test_contents {
-		result = CutForSearch(content, false)
+		result = chanToArray(CutForSearch(content, false))
 		if len(result) != len(cutForSearchNoHMMResult[index]) {
 			t.Errorf("cut for search no hmm for %s length should be %d not %d\n",
 				content, len(cutForSearchNoHMMResult[index]), len(result))
@@ -728,7 +736,7 @@ func TestSetdictionary(t *testing.T) {
 	var result []string
 	SetDictionary("foobar.txt")
 	for index, content := range test_contents {
-		result = Cut(content, false, true)
+		result = chanToArray(Cut(content, false, true))
 		if len(result) != len(userDictCutResult[index]) {
 			t.Errorf("default cut with user dictionary for %s length should be %d not %d\n",
 				content, len(userDictCutResult[index]), len(result))
@@ -748,7 +756,7 @@ func TestLoadUserDict(t *testing.T) {
 	sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题，在自定义词库中也增加了此词为N类型"
 	result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", "，", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"}
-	words := Cut(sentence, false, true)
+	words := chanToArray(Cut(sentence, false, true))
 	if len(words) != len(result) {
 		t.Error(len(words))
 	}
@@ -760,7 +768,7 @@ func TestLoadUserDict(t *testing.T) {
 	sentence = "easy_install is great"
 	result = []string{"easy_install", " ", "is", " ", "great"}
-	words = Cut(sentence, false, true)
+	words = chanToArray(Cut(sentence, false, true))
 	if len(words) != len(result) {
 		t.Error(len(words))
 	}
@@ -772,7 +780,7 @@ func TestLoadUserDict(t *testing.T) {
 	sentence = "python 的正则表达式是好用的"
 	result = []string{"python", " ", "的", "正则表达式", "是", "好用", "的"}
-	words = Cut(sentence, false, true)
+	words = chanToArray(Cut(sentence, false, true))
 	if len(words) != len(result) {
 		t.Error(words)
 		t.Error(result)
--- a/tokenize.go
+++ b/tokenize.go
@@ -10,14 +10,13 @@ func Tokenize(sentence string, mode string, HMM bool) []Token {
 	tokens := make([]Token, 0)
 	start := 0
 	var width int
-	if mode == "default" {
+	for word := range Cut(sentence, false, HMM) {
-		for _, word := range Cut(sentence, false, HMM) {
+		if mode == "default" {
 			width = len([]rune(word))
 			tokens = append(tokens, Token{word, start, start + width})
 			start += width
-		}
+
-	} else {
+		} else {
 		for _, word := range Cut(sentence, false, HMM) {
 			runes := []rune(word)
 			width = len(runes)
 			for _, step := range []int{2, 3} {