From c4c3a5f9ad9d5d2d6de50fac1befbcaabdb8d22e Mon Sep 17 00:00:00 2001 From: Wang Bin Date: Mon, 30 Mar 2015 15:18:36 +0800 Subject: [PATCH] refactor Cut function, make CutAll a separate function, to simplify the logic of Cut function --- dictionary.go | 11 ------ jieba.go | 83 ++++++++++++++++++++------------------------- jieba_test.go | 14 ++++---- tokenizers/jieba.go | 2 +- util.go | 52 ---------------------------- 5 files changed, 44 insertions(+), 118 deletions(-) delete mode 100644 dictionary.go diff --git a/dictionary.go b/dictionary.go deleted file mode 100644 index be71c64..0000000 --- a/dictionary.go +++ /dev/null @@ -1,11 +0,0 @@ -package jiebago - -type Entry struct { - Word string - Flag string - Freq float64 -} - -type DictLoader interface { - AddEntry(Entry) -} diff --git a/jieba.go b/jieba.go index d97f935..251e359 100644 --- a/jieba.go +++ b/jieba.go @@ -86,11 +86,10 @@ func (j *Jieba) DAG(sentence string) map[int][]int { dag := make(map[int][]int) runes := []rune(sentence) n := len(runes) - i := 0 var frag string for k := 0; k < n; k++ { tmpList := make([]int, 0) - i = k + i := k frag = string(runes[k]) for { if freq, ok := j.Freq[frag]; !ok { @@ -284,63 +283,31 @@ which is suitable for text analysis. HMM contols whether to use the Hidden Markov Mode. 
*/ -func (j *Jieba) Cut(sentence string, isCutAll bool, HMM bool) chan string { +func (j *Jieba) Cut(sentence string, hmm bool) chan string { result := make(chan string) + var cut cutFunc + if hmm { + cut = j.cutDAG + } else { + cut = j.cutDAGNoHMM + } go func() { - var reHan, reSkip *regexp.Regexp - if isCutAll { - reHan = reHanCutAll - reSkip = reSkipCutAll - } else { - reHan = reHanDefault - reSkip = reSkipDefault - } - var cut cutFunc - if HMM { - cut = j.cutDAG - } else { - cut = j.cutDAGNoHMM - } - if isCutAll { - cut = j.cutAll - } - for blk := range RegexpSplit(reHan, sentence) { + for blk := range RegexpSplit(reHanDefault, sentence) { if len(blk) == 0 { continue } - if reHan.MatchString(blk) { + if reHanDefault.MatchString(blk) { for x := range cut(blk) { result <- x } } else { - type skipSplitFunc func(sentence string) chan string - var ssf skipSplitFunc - if isCutAll { - ssf = func(sentence string) chan string { - ch := make(chan string) - go func() { - for _, s := range reSkip.Split(sentence, -1) { - ch <- s - } - close(ch) - }() - return ch - } - } else { - ssf = func(sentence string) chan string { - return RegexpSplit(reSkip, sentence) - } - } - - for x := range ssf(blk) { - if reSkip.MatchString(x) { + for x := range RegexpSplit(reSkipDefault, blk) { + if reSkipDefault.MatchString(x) { result <- x - } else if !isCutAll { + } else { for _, xx := range x { result <- string(xx) } - } else { - result <- x } } } @@ -350,13 +317,35 @@ func (j *Jieba) Cut(sentence string, isCutAll bool, HMM bool) chan string { return result } +func (j *Jieba) CutAll(sentence string) chan string { + result := make(chan string) + go func() { + for blk := range RegexpSplit(reHanCutAll, sentence) { + if len(blk) == 0 { + continue + } + if reHanCutAll.MatchString(blk) { + for x := range j.cutAll(blk) { + result <- x + } + } else { + for _, x := range reSkipCutAll.Split(blk, -1) { + result <- x + } + } + } + close(result) + }() + return result +} + // Cut sentence using Search 
Engine Mode, based on the Accurate Mode, attempts // to cut long words into several short words, which can raise the recall rate. // Suitable for search engines. func (j *Jieba) CutForSearch(sentence string, hmm bool) chan string { result := make(chan string) go func() { - for word := range j.Cut(sentence, false, hmm) { + for word := range j.Cut(sentence, hmm) { runes := []rune(word) for _, increment := range []int{2, 3} { if len(runes) > increment { diff --git a/jieba_test.go b/jieba_test.go index f1786a8..2158e13 100644 --- a/jieba_test.go +++ b/jieba_test.go @@ -661,7 +661,7 @@ func TestDefaultCut(t *testing.T) { var result []string for index, content := range test_contents { - result = chanToArray(j.Cut(content, false, true)) + result = chanToArray(j.Cut(content, true)) if len(result) != len(defaultCutResult[index]) { t.Errorf("default cut for %s length should be %d not %d\n", content, len(defaultCutResult[index]), len(result)) @@ -679,7 +679,7 @@ func TestCutAll(t *testing.T) { var result []string for index, content := range test_contents { - result = chanToArray(j.Cut(content, true, true)) + result = chanToArray(j.CutAll(content)) if len(result) != len(cutAllResult[index]) { t.Errorf("cut all for %s length should be %d not %d\n", content, len(cutAllResult[index]), len(result)) @@ -697,7 +697,7 @@ func TestDefaultCutNoHMM(t *testing.T) { var result []string for index, content := range test_contents { - result = chanToArray(j.Cut(content, false, false)) + result = chanToArray(j.Cut(content, false)) if len(result) != len(defaultCutNoHMMResult[index]) { t.Errorf("default cut no hmm for %s length should be %d not %d\n", content, len(defaultCutNoHMMResult[index]), len(result)) @@ -744,7 +744,7 @@ func TestSetdictionary(t *testing.T) { var result []string j, _ := NewJieba("foobar.txt") for index, content := range test_contents { - result = chanToArray(j.Cut(content, false, true)) + result = chanToArray(j.Cut(content, true)) if len(result) != 
len(userDictCutResult[index]) { t.Errorf("default cut with user dictionary for %s length should be %d not %d\n", content, len(userDictCutResult[index]), len(result)) @@ -764,7 +764,7 @@ func TestLoadUserDict(t *testing.T) { sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型" result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", ",", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"} - words := chanToArray(j.Cut(sentence, false, true)) + words := chanToArray(j.Cut(sentence, true)) if len(words) != len(result) { t.Error(len(words)) } @@ -776,7 +776,7 @@ func TestLoadUserDict(t *testing.T) { sentence = "easy_install is great" result = []string{"easy_install", " ", "is", " ", "great"} - words = chanToArray(j.Cut(sentence, false, true)) + words = chanToArray(j.Cut(sentence, true)) if len(words) != len(result) { t.Error(len(words)) } @@ -788,7 +788,7 @@ func TestLoadUserDict(t *testing.T) { sentence = "python 的正则表达式是好用的" result = []string{"python", " ", "的", "正则表达式", "是", "好用", "的"} - words = chanToArray(j.Cut(sentence, false, true)) + words = chanToArray(j.Cut(sentence, true)) if len(words) != len(result) { t.Error(words) t.Error(result) diff --git a/tokenizers/jieba.go b/tokenizers/jieba.go index 55e4292..017c87d 100644 --- a/tokenizers/jieba.go +++ b/tokenizers/jieba.go @@ -35,7 +35,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream { pos := 1 var width int var gram string - for word := range jt.j.Cut(string(input), false, jt.hmm) { + for word := range jt.j.Cut(string(input), jt.hmm) { if jt.searchMode { runes := []rune(word) width = len(runes) diff --git a/util.go b/util.go index 7ff9e3f..32f7f7c 100644 --- a/util.go +++ b/util.go @@ -1,61 +1,9 @@ package jiebago import ( - "bufio" - "os" - "path/filepath" "regexp" - "strconv" - "strings" ) -func dictPath(dictFileName string) (string, error) { - if 
filepath.IsAbs(dictFileName) { - return dictFileName, nil - } - var dictFilePath string - cwd, err := os.Getwd() - if err != nil { - return dictFilePath, err - } - dictFilePath = filepath.Clean(filepath.Join(cwd, dictFileName)) - return dictFilePath, nil -} - -func LoadDict(l DictLoader, dictFileName string, usingFlag bool) error { - dictFilePath, err := dictPath(dictFileName) - if err != nil { - return err - } - - dictFile, err := os.Open(dictFilePath) - if err != nil { - return err - } - defer dictFile.Close() - - scanner := bufio.NewScanner(dictFile) - var entry Entry - var line string - var fields []string - for scanner.Scan() { - line = scanner.Text() - fields = strings.Split(line, " ") - entry.Word = strings.Replace(fields[0], "\ufeff", "", 1) - if length := len(fields); length > 1 { - entry.Freq, err = strconv.ParseFloat(fields[1], 64) - if err != nil { - return err - } - if usingFlag && length > 2 { - entry.Flag = fields[2] - } - } - l.AddEntry(entry) - } - return scanner.Err() -} - // Split sentence using regular expression. func RegexpSplit(r *regexp.Regexp, sentence string) chan string { result := make(chan string)