refactor Cut function, make CutAll a separate function, to simplify the logic of Cut function

2026-07-17 02:40:23 +08:00 · 2015-03-30 15:18:36 +08:00
parent 556b96b137
commit c4c3a5f9ad
5 changed files with 44 additions and 118 deletions
--- a/dictionary.go
+++ b/dictionary.go
@@ -1,11 +0,0 @@
-package jiebago
-
-type Entry struct {
-	Word string
-	Flag string
-	Freq float64
-}
-
-type DictLoader interface {
-	AddEntry(Entry)
-}
--- a/jieba.go
+++ b/jieba.go
@@ -86,11 +86,10 @@ func (j *Jieba) DAG(sentence string) map[int][]int {
 	dag := make(map[int][]int)
 	runes := []rune(sentence)
 	n := len(runes)
-	i := 0
 	var frag string
 	for k := 0; k < n; k++ {
 		tmpList := make([]int, 0)
-		i = k
+		i := k
 		frag = string(runes[k])
 		for {
 			if freq, ok := j.Freq[frag]; !ok {
@@ -284,63 +283,31 @@ which is suitable for text analysis.

 HMM contols whether to use the Hidden Markov Mode.
 */
-func (j *Jieba) Cut(sentence string, isCutAll bool, HMM bool) chan string {
+func (j *Jieba) Cut(sentence string, hmm bool) chan string {
 	result := make(chan string)
+	var cut cutFunc
+	if hmm {
+		cut = j.cutDAG
+	} else {
+		cut = j.cutDAGNoHMM
+	}
 	go func() {
-		var reHan, reSkip *regexp.Regexp
-		if isCutAll {
-			reHan = reHanCutAll
-			reSkip = reSkipCutAll
-		} else {
-			reHan = reHanDefault
-			reSkip = reSkipDefault
-		}
-		var cut cutFunc
-		if HMM {
-			cut = j.cutDAG
-		} else {
-			cut = j.cutDAGNoHMM
-		}
-		if isCutAll {
-			cut = j.cutAll
-		}
-		for blk := range RegexpSplit(reHan, sentence) {
+		for blk := range RegexpSplit(reHanDefault, sentence) {
 			if len(blk) == 0 {
 				continue
 			}
-			if reHan.MatchString(blk) {
+			if reHanDefault.MatchString(blk) {
 				for x := range cut(blk) {
 					result <- x
 				}
 			} else {
-				type skipSplitFunc func(sentence string) chan string
-				var ssf skipSplitFunc
-				if isCutAll {
-					ssf = func(sentence string) chan string {
-						ch := make(chan string)
-						go func() {
-							for _, s := range reSkip.Split(sentence, -1) {
-								ch <- s
-							}
-							close(ch)
-						}()
-						return ch
-					}
-				} else {
-					ssf = func(sentence string) chan string {
-						return RegexpSplit(reSkip, sentence)
-					}
-				}
-
-				for x := range ssf(blk) {
-					if reSkip.MatchString(x) {
+				for x := range RegexpSplit(reSkipDefault, blk) {
+					if reSkipDefault.MatchString(x) {
 						result <- x
-					} else if !isCutAll {
+					} else {
 						for _, xx := range x {
 							result <- string(xx)
 						}
-					} else {
-						result <- x
 					}
 				}
 			}
@@ -350,13 +317,35 @@ func (j *Jieba) Cut(sentence string, isCutAll bool, HMM bool) chan string {
 	return result
 }

+func (j *Jieba) CutAll(sentence string) chan string {
+	result := make(chan string)
+	go func() {
+		for blk := range RegexpSplit(reHanCutAll, sentence) {
+			if len(blk) == 0 {
+				continue
+			}
+			if reHanCutAll.MatchString(blk) {
+				for x := range j.cutAll(blk) {
+					result <- x
+				}
+			} else {
+				for _, x := range reSkipCutAll.Split(blk, -1) {
+					result <- x
+				}
+			}
+		}
+		close(result)
+	}()
+	return result
+}
+
 // Cut sentence using Search Engine Mode, based on the Accurate Mode, attempts
 // to cut long words into several short words, which can raise the recall rate.
 // Suitable for search engines.
 func (j *Jieba) CutForSearch(sentence string, hmm bool) chan string {
 	result := make(chan string)
 	go func() {
-		for word := range j.Cut(sentence, false, hmm) {
+		for word := range j.Cut(sentence, hmm) {
 			runes := []rune(word)
 			for _, increment := range []int{2, 3} {
 				if len(runes) > increment {
--- a/jieba_test.go
+++ b/jieba_test.go
@@ -661,7 +661,7 @@ func TestDefaultCut(t *testing.T) {

 	var result []string
 	for index, content := range test_contents {
-		result = chanToArray(j.Cut(content, false, true))
+		result = chanToArray(j.Cut(content, true))
 		if len(result) != len(defaultCutResult[index]) {
 			t.Errorf("default cut for %s length should be %d not %d\n",
 				content, len(defaultCutResult[index]), len(result))
@@ -679,7 +679,7 @@ func TestCutAll(t *testing.T) {

 	var result []string
 	for index, content := range test_contents {
-		result = chanToArray(j.Cut(content, true, true))
+		result = chanToArray(j.CutAll(content))
 		if len(result) != len(cutAllResult[index]) {
 			t.Errorf("cut all for %s length should be %d not %d\n",
 				content, len(cutAllResult[index]), len(result))
@@ -697,7 +697,7 @@ func TestDefaultCutNoHMM(t *testing.T) {

 	var result []string
 	for index, content := range test_contents {
-		result = chanToArray(j.Cut(content, false, false))
+		result = chanToArray(j.Cut(content, false))
 		if len(result) != len(defaultCutNoHMMResult[index]) {
 			t.Errorf("default cut no hmm for %s length should be %d not %d\n",
 				content, len(defaultCutNoHMMResult[index]), len(result))
@@ -744,7 +744,7 @@ func TestSetdictionary(t *testing.T) {
 	var result []string
 	j, _ := NewJieba("foobar.txt")
 	for index, content := range test_contents {
-		result = chanToArray(j.Cut(content, false, true))
+		result = chanToArray(j.Cut(content, true))
 		if len(result) != len(userDictCutResult[index]) {
 			t.Errorf("default cut with user dictionary for %s length should be %d not %d\n",
 				content, len(userDictCutResult[index]), len(result))
@@ -764,7 +764,7 @@ func TestLoadUserDict(t *testing.T) {
 	sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题，在自定义词库中也增加了此词为N类型"
 	result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", "，", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"}

-	words := chanToArray(j.Cut(sentence, false, true))
+	words := chanToArray(j.Cut(sentence, true))
 	if len(words) != len(result) {
 		t.Error(len(words))
 	}
@@ -776,7 +776,7 @@ func TestLoadUserDict(t *testing.T) {

 	sentence = "easy_install is great"
 	result = []string{"easy_install", " ", "is", " ", "great"}
-	words = chanToArray(j.Cut(sentence, false, true))
+	words = chanToArray(j.Cut(sentence, true))
 	if len(words) != len(result) {
 		t.Error(len(words))
 	}
@@ -788,7 +788,7 @@ func TestLoadUserDict(t *testing.T) {

 	sentence = "python 的正则表达式是好用的"
 	result = []string{"python", " ", "的", "正则表达式", "是", "好用", "的"}
-	words = chanToArray(j.Cut(sentence, false, true))
+	words = chanToArray(j.Cut(sentence, true))
 	if len(words) != len(result) {
 		t.Error(words)
 		t.Error(result)
--- a/tokenizers/jieba.go
+++ b/tokenizers/jieba.go
@@ -35,7 +35,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
 	pos := 1
 	var width int
 	var gram string
-	for word := range jt.j.Cut(string(input), false, jt.hmm) {
+	for word := range jt.j.Cut(string(input), jt.hmm) {
 		if jt.searchMode {
 			runes := []rune(word)
 			width = len(runes)
--- a/util.go
+++ b/util.go
@@ -1,61 +1,9 @@
 package jiebago

 import (
-	"bufio"
-	"os"
-	"path/filepath"
 	"regexp"
-	"strconv"
-	"strings"
 )

-func dictPath(dictFileName string) (string, error) {
-	if filepath.IsAbs(dictFileName) {
-		return dictFileName, nil
-	}
-	var dictFilePath string
-	cwd, err := os.Getwd()
-	if err != nil {
-		return dictFilePath, err
-	}
-	dictFilePath = filepath.Clean(filepath.Join(cwd, dictFileName))
-	return dictFilePath, nil
-}
-
-func LoadDict(l DictLoader, dictFileName string, usingFlag bool) error {
-	dictFilePath, err := dictPath(dictFileName)
-	if err != nil {
-		return err
-	}
-
-	dictFile, err := os.Open(dictFilePath)
-	if err != nil {
-		return err
-	}
-	defer dictFile.Close()
-
-	scanner := bufio.NewScanner(dictFile)
-	var entry Entry
-	var line string
-	var fields []string
-	for scanner.Scan() {
-		line = scanner.Text()
-		fields = strings.Split(line, " ")
-		entry.Word = strings.Replace(fields[0], "\ufeff", "", 1)
-		if length := len(fields); length > 1 {
-			entry.Freq, err = strconv.ParseFloat(fields[1], 64)
-			if err != nil {
-				return err
-			}
-			if usingFlag && length > 2 {
-				entry.Flag = fields[2]
-			}
-		}
-		l.AddEntry(entry)
-	}
-	return scanner.Err()
-}
-
 // Split sentence using regular expression.
 func RegexpSplit(r *regexp.Regexp, sentence string) chan string {
 	result := make(chan string)