Change Cut and CutForSearch to return a chan string instead of a []string

2026-06-08 18:40:24 +08:00 · 2015-02-27 11:37:55 +08:00
parent 87fe3a19f8
commit 76b9df8511
4 changed files with 91 additions and 80 deletions
--- a/analyse/analyse.go
+++ b/analyse/analyse.go
@@ -36,10 +36,9 @@ func (tis TfIdfs) Swap(i, j int) {
 }

 func ExtractTags(sentence string, topK int) (tags TfIdfs) {
-	words := jiebago.Cut(sentence, false, true)
 	freq := make(map[string]float64)

-	for _, w := range words {
+	for w := range jiebago.Cut(sentence, false, true) {
 		w = strings.TrimSpace(w)
 		if utf8.RuneCountInString(w) < 2 {
 			continue
--- a/jieba.go
+++ b/jieba.go
@@ -253,80 +253,85 @@ func cutAll(sentence string) []string {
 	return result
 }

-func Cut(sentence string, isCutAll bool, HMM bool) []string {
-	result := make([]string, 0)
-	var reHan, reSkip *regexp.Regexp
-	if isCutAll {
-		reHan = reHanCutAll
-		reSkip = reSkipCutAll
-	} else {
-		reHan = reHanDefault
-		reSkip = reSkipDefault
-	}
-	blocks := RegexpSplit(reHan, sentence)
-	var cut cutFunc
-	if HMM {
-		cut = cutDAG
-	} else {
-		cut = cutDAGNoHMM
-	}
-	if isCutAll {
-		cut = cutAll
-	}
-	for _, blk := range blocks {
-		if len(blk) == 0 {
-			continue
-		}
-		if reHan.MatchString(blk) {
-			for _, word := range cut(blk) {
-				result = append(result, word)
-			}
+func Cut(sentence string, isCutAll bool, HMM bool) chan string {
+	result := make(chan string)
+	go func() {
+		var reHan, reSkip *regexp.Regexp
+		if isCutAll {
+			reHan = reHanCutAll
+			reSkip = reSkipCutAll
 		} else {
-			type skipSplitFunc func(sentence string) []string
-			var ssf skipSplitFunc
-			if isCutAll {
-				ssf = func(sentence string) []string {
-					return reSkip.Split(sentence, -1)
+			reHan = reHanDefault
+			reSkip = reSkipDefault
+		}
+		blocks := RegexpSplit(reHan, sentence)
+		var cut cutFunc
+		if HMM {
+			cut = cutDAG
+		} else {
+			cut = cutDAGNoHMM
+		}
+		if isCutAll {
+			cut = cutAll
+		}
+		for _, blk := range blocks {
+			if len(blk) == 0 {
+				continue
+			}
+			if reHan.MatchString(blk) {
+				for _, word := range cut(blk) {
+					result <- word
 				}
 			} else {
-				ssf = func(sentence string) []string {
-					return RegexpSplit(reSkip, sentence)
-				}
-			}
-
-			for _, x := range ssf(blk) {
-				if reSkip.MatchString(x) {
-					result = append(result, x)
-				} else if !isCutAll {
-					for _, xx := range x {
-						result = append(result, string(xx))
+				type skipSplitFunc func(sentence string) []string
+				var ssf skipSplitFunc
+				if isCutAll {
+					ssf = func(sentence string) []string {
+						return reSkip.Split(sentence, -1)
 					}
 				} else {
-					result = append(result, x)
+					ssf = func(sentence string) []string {
+						return RegexpSplit(reSkip, sentence)
+					}
 				}
-			}
-		}
-	}
-	return result
-}

-func CutForSearch(sentence string, hmm bool) []string {
-	result := make([]string, 0)
-	words := Cut(sentence, false, hmm)
-	for _, word := range words {
-		runes := []rune(word)
-		for _, increment := range []int{2, 3} {
-			if len(runes) > increment {
-				var gram2 string
-				for i := 0; i < len(runes)-increment+1; i++ {
-					gram2 = string(runes[i : i+increment])
-					if v, ok := Trie.Freq[gram2]; ok && v > 0.0 {
-						result = append(result, gram2)
+				for _, x := range ssf(blk) {
+					if reSkip.MatchString(x) {
+						result <- x
+					} else if !isCutAll {
+						for _, xx := range x {
+							result <- string(xx)
+						}
+					} else {
+						result <- x
 					}
 				}
 			}
 		}
-		result = append(result, word)
-	}
+		close(result)
+	}()
+	return result
+}
+
+func CutForSearch(sentence string, hmm bool) chan string {
+	result := make(chan string)
+	go func() {
+		for word := range Cut(sentence, false, hmm) {
+			runes := []rune(word)
+			for _, increment := range []int{2, 3} {
+				if len(runes) > increment {
+					var gram2 string
+					for i := 0; i < len(runes)-increment+1; i++ {
+						gram2 = string(runes[i : i+increment])
+						if v, ok := Trie.Freq[gram2]; ok && v > 0.0 {
+							result <- gram2
+						}
+					}
+				}
+			}
+			result <- word
+		}
+		close(result)
+	}()
 	return result
 }
--- a/jieba_test.go
+++ b/jieba_test.go
@@ -648,10 +648,18 @@ func TestRegexpSplit(t *testing.T) {
 	}
 }

+func chanToArray(ch chan string) []string {
+	result := make([]string, 0)
+	for word := range ch {
+		result = append(result, word)
+	}
+	return result
+}
+
 func TestDefaultCut(t *testing.T) {
 	var result []string
 	for index, content := range test_contents {
-		result = Cut(content, false, true)
+		result = chanToArray(Cut(content, false, true))
 		if len(result) != len(defaultCutResult[index]) {
 			t.Errorf("default cut for %s length should be %d not %d\n",
 				content, len(defaultCutResult[index]), len(result))
@@ -667,7 +675,7 @@ func TestDefaultCut(t *testing.T) {
 func TestCutAll(t *testing.T) {
 	var result []string
 	for index, content := range test_contents {
-		result = Cut(content, true, true)
+		result = chanToArray(Cut(content, true, true))
 		if len(result) != len(cutAllResult[index]) {
 			t.Errorf("cut all for %s length should be %d not %d\n",
 				content, len(cutAllResult[index]), len(result))
@@ -683,7 +691,7 @@ func TestCutAll(t *testing.T) {
 func TestDefaultCutNoHMM(t *testing.T) {
 	var result []string
 	for index, content := range test_contents {
-		result = Cut(content, false, false)
+		result = chanToArray(Cut(content, false, false))
 		if len(result) != len(defaultCutNoHMMResult[index]) {
 			t.Errorf("default cut no hmm for %s length should be %d not %d\n",
 				content, len(defaultCutNoHMMResult[index]), len(result))
@@ -699,7 +707,7 @@ func TestDefaultCutNoHMM(t *testing.T) {
 func TestCutForSearch(t *testing.T) {
 	var result []string
 	for index, content := range test_contents {
-		result = CutForSearch(content, true)
+		result = chanToArray(CutForSearch(content, true))
 		if len(result) != len(cutForSearchResult[index]) {
 			t.Errorf("cut for search for %s length should be %d not %d\n",
 				content, len(cutForSearchResult[index]), len(result))
@@ -711,7 +719,7 @@ func TestCutForSearch(t *testing.T) {
 		}
 	}
 	for index, content := range test_contents {
-		result = CutForSearch(content, false)
+		result = chanToArray(CutForSearch(content, false))
 		if len(result) != len(cutForSearchNoHMMResult[index]) {
 			t.Errorf("cut for search no hmm for %s length should be %d not %d\n",
 				content, len(cutForSearchNoHMMResult[index]), len(result))
@@ -728,7 +736,7 @@ func TestSetdictionary(t *testing.T) {
 	var result []string
 	SetDictionary("foobar.txt")
 	for index, content := range test_contents {
-		result = Cut(content, false, true)
+		result = chanToArray(Cut(content, false, true))
 		if len(result) != len(userDictCutResult[index]) {
 			t.Errorf("default cut with user dictionary for %s length should be %d not %d\n",
 				content, len(userDictCutResult[index]), len(result))
@@ -748,7 +756,7 @@ func TestLoadUserDict(t *testing.T) {
 	sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题，在自定义词库中也增加了此词为N类型"
 	result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", "，", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"}

-	words := Cut(sentence, false, true)
+	words := chanToArray(Cut(sentence, false, true))
 	if len(words) != len(result) {
 		t.Error(len(words))
 	}
@@ -760,7 +768,7 @@ func TestLoadUserDict(t *testing.T) {

 	sentence = "easy_install is great"
 	result = []string{"easy_install", " ", "is", " ", "great"}
-	words = Cut(sentence, false, true)
+	words = chanToArray(Cut(sentence, false, true))
 	if len(words) != len(result) {
 		t.Error(len(words))
 	}
@@ -772,7 +780,7 @@ func TestLoadUserDict(t *testing.T) {

 	sentence = "python 的正则表达式是好用的"
 	result = []string{"python", " ", "的", "正则表达式", "是", "好用", "的"}
-	words = Cut(sentence, false, true)
+	words = chanToArray(Cut(sentence, false, true))
 	if len(words) != len(result) {
 		t.Error(words)
 		t.Error(result)
--- a/tokenize.go
+++ b/tokenize.go
@@ -10,14 +10,13 @@ func Tokenize(sentence string, mode string, HMM bool) []Token {
 	tokens := make([]Token, 0)
 	start := 0
 	var width int
-	if mode == "default" {
-		for _, word := range Cut(sentence, false, HMM) {
+	for word := range Cut(sentence, false, HMM) {
+		if mode == "default" {
 			width = len([]rune(word))
 			tokens = append(tokens, Token{word, start, start + width})
 			start += width
-		}
-	} else {
-		for _, word := range Cut(sentence, false, HMM) {
+
+		} else {
 			runes := []rune(word)
 			width = len(runes)
 			for _, step := range []int{2, 3} {