From 76b9df85115fb3b1ab319c1559bb342dd6704a87 Mon Sep 17 00:00:00 2001 From: Wang Bin Date: Fri, 27 Feb 2015 11:37:55 +0800 Subject: [PATCH] change cut method to return a channel string, not []string --- analyse/analyse.go | 3 +- jieba.go | 133 +++++++++++++++++++++++---------------------- jieba_test.go | 26 ++++++--- tokenize.go | 9 ++- 4 files changed, 91 insertions(+), 80 deletions(-) diff --git a/analyse/analyse.go b/analyse/analyse.go index e4502e9..ae53e17 100644 --- a/analyse/analyse.go +++ b/analyse/analyse.go @@ -36,10 +36,9 @@ func (tis TfIdfs) Swap(i, j int) { } func ExtractTags(sentence string, topK int) (tags TfIdfs) { - words := jiebago.Cut(sentence, false, true) freq := make(map[string]float64) - for _, w := range words { + for w := range jiebago.Cut(sentence, false, true) { w = strings.TrimSpace(w) if utf8.RuneCountInString(w) < 2 { continue diff --git a/jieba.go b/jieba.go index ada0b6e..aea00ec 100644 --- a/jieba.go +++ b/jieba.go @@ -253,80 +253,85 @@ func cutAll(sentence string) []string { return result } -func Cut(sentence string, isCutAll bool, HMM bool) []string { - result := make([]string, 0) - var reHan, reSkip *regexp.Regexp - if isCutAll { - reHan = reHanCutAll - reSkip = reSkipCutAll - } else { - reHan = reHanDefault - reSkip = reSkipDefault - } - blocks := RegexpSplit(reHan, sentence) - var cut cutFunc - if HMM { - cut = cutDAG - } else { - cut = cutDAGNoHMM - } - if isCutAll { - cut = cutAll - } - for _, blk := range blocks { - if len(blk) == 0 { - continue - } - if reHan.MatchString(blk) { - for _, word := range cut(blk) { - result = append(result, word) - } +func Cut(sentence string, isCutAll bool, HMM bool) chan string { + result := make(chan string) + go func() { + var reHan, reSkip *regexp.Regexp + if isCutAll { + reHan = reHanCutAll + reSkip = reSkipCutAll } else { - type skipSplitFunc func(sentence string) []string - var ssf skipSplitFunc - if isCutAll { - ssf = func(sentence string) []string { - return reSkip.Split(sentence, -1) + reHan = reHanDefault + reSkip = reSkipDefault + } + blocks := RegexpSplit(reHan, sentence) + var cut cutFunc + if HMM { + cut = cutDAG + } else { + cut = cutDAGNoHMM + } + if isCutAll { + cut = cutAll + } + for _, blk := range blocks { + if len(blk) == 0 { + continue + } + if reHan.MatchString(blk) { + for _, word := range cut(blk) { + result <- word } } else { - ssf = func(sentence string) []string { - return RegexpSplit(reSkip, sentence) - } - } - - for _, x := range ssf(blk) { - if reSkip.MatchString(x) { - result = append(result, x) - } else if !isCutAll { - for _, xx := range x { - result = append(result, string(xx)) + type skipSplitFunc func(sentence string) []string + var ssf skipSplitFunc + if isCutAll { + ssf = func(sentence string) []string { + return reSkip.Split(sentence, -1) } } else { - result = append(result, x) + ssf = func(sentence string) []string { + return RegexpSplit(reSkip, sentence) + } } - } - } - } - return result -} -func CutForSearch(sentence string, hmm bool) []string { - result := make([]string, 0) - words := Cut(sentence, false, hmm) - for _, word := range words { - runes := []rune(word) - for _, increment := range []int{2, 3} { - if len(runes) > increment { - var gram2 string - for i := 0; i < len(runes)-increment+1; i++ { - gram2 = string(runes[i : i+increment]) - if v, ok := Trie.Freq[gram2]; ok && v > 0.0 { - result = append(result, gram2) + for _, x := range ssf(blk) { + if reSkip.MatchString(x) { + result <- x + } else if !isCutAll { + for _, xx := range x { + result <- string(xx) + } + } else { + result <- x } } } } - result = append(result, word) - } + close(result) + }() + return result +} + +func CutForSearch(sentence string, hmm bool) chan string { + result := make(chan string) + go func() { + for word := range Cut(sentence, false, hmm) { + runes := []rune(word) + for _, increment := range []int{2, 3} { + if len(runes) > increment { + var gram2 string + for i := 0; i < len(runes)-increment+1; i++ { + gram2 = string(runes[i : i+increment]) + if v, ok := Trie.Freq[gram2]; ok && v > 0.0 { + result <- gram2 + } + } + } + } + result <- word + } + close(result) + }() return result } diff --git a/jieba_test.go b/jieba_test.go index d59a431..f119132 100644 --- a/jieba_test.go +++ b/jieba_test.go @@ -648,10 +648,18 @@ func TestRegexpSplit(t *testing.T) { } } +func chanToArray(ch chan string) []string { + result := make([]string, 0) + for word := range ch { + result = append(result, word) + } + return result +} + func TestDefaultCut(t *testing.T) { var result []string for index, content := range test_contents { - result = Cut(content, false, true) + result = chanToArray(Cut(content, false, true)) if len(result) != len(defaultCutResult[index]) { t.Errorf("default cut for %s length should be %d not %d\n", content, len(defaultCutResult[index]), len(result)) @@ -667,7 +675,7 @@ func TestDefaultCut(t *testing.T) { func TestCutAll(t *testing.T) { var result []string for index, content := range test_contents { - result = Cut(content, true, true) + result = chanToArray(Cut(content, true, true)) if len(result) != len(cutAllResult[index]) { t.Errorf("cut all for %s length should be %d not %d\n", content, len(cutAllResult[index]), len(result)) @@ -683,7 +691,7 @@ func TestCutAll(t *testing.T) { func TestDefaultCutNoHMM(t *testing.T) { var result []string for index, content := range test_contents { - result = Cut(content, false, false) + result = chanToArray(Cut(content, false, false)) if len(result) != len(defaultCutNoHMMResult[index]) { t.Errorf("default cut no hmm for %s length should be %d not %d\n", content, len(defaultCutNoHMMResult[index]), len(result)) @@ -699,7 +707,7 @@ func TestDefaultCutNoHMM(t *testing.T) { func TestCutForSearch(t *testing.T) { var result []string for index, content := range test_contents { - result = CutForSearch(content, true) + result = chanToArray(CutForSearch(content, true)) if len(result) != len(cutForSearchResult[index]) { t.Errorf("cut for search for %s length should be %d not %d\n", content, len(cutForSearchResult[index]), len(result)) @@ -711,7 +719,7 @@ func TestCutForSearch(t *testing.T) { } } for index, content := range test_contents { - result = CutForSearch(content, false) + result = chanToArray(CutForSearch(content, false)) if len(result) != len(cutForSearchNoHMMResult[index]) { t.Errorf("cut for search no hmm for %s length should be %d not %d\n", content, len(cutForSearchNoHMMResult[index]), len(result)) @@ -728,7 +736,7 @@ func TestSetdictionary(t *testing.T) { var result []string SetDictionary("foobar.txt") for index, content := range test_contents { - result = Cut(content, false, true) + result = chanToArray(Cut(content, false, true)) if len(result) != len(userDictCutResult[index]) { t.Errorf("default cut with user dictionary for %s length should be %d not %d\n", content, len(userDictCutResult[index]), len(result)) @@ -748,7 +756,7 @@ func TestLoadUserDict(t *testing.T) { sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型" result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", ",", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"} - words := Cut(sentence, false, true) + words := chanToArray(Cut(sentence, false, true)) if len(words) != len(result) { t.Error(len(words)) } @@ -760,7 +768,7 @@ func TestLoadUserDict(t *testing.T) { sentence = "easy_install is great" result = []string{"easy_install", " ", "is", " ", "great"} - words = Cut(sentence, false, true) + words = chanToArray(Cut(sentence, false, true)) if len(words) != len(result) { t.Error(len(words)) } @@ -772,7 +780,7 @@ func TestLoadUserDict(t *testing.T) { sentence = "python 的正则表达式是好用的" result = []string{"python", " ", "的", "正则表达式", "是", "好用", "的"} - words = Cut(sentence, false, true) + words = chanToArray(Cut(sentence, false, true)) if len(words) != len(result) { t.Error(words) t.Error(result) diff --git a/tokenize.go b/tokenize.go index 950c056..2ba4016 100644 --- a/tokenize.go +++ b/tokenize.go @@ -10,14 +10,13 @@ func Tokenize(sentence string, mode string, HMM bool) []Token { tokens := make([]Token, 0) start := 0 var width int - if mode == "default" { - for _, word := range Cut(sentence, false, HMM) { + for word := range Cut(sentence, false, HMM) { + if mode == "default" { width = len([]rune(word)) tokens = append(tokens, Token{word, start, start + width}) start += width - } - } else { - for _, word := range Cut(sentence, false, HMM) { + + } else { runes := []rune(word) width = len(runes) for _, step := range []int{2, 3} {