Small refactor: compile each regular expression once at package level instead of on every call; corresponds to jieba commit #32a0e92a09614cf5c72f87b1a59a5c4369200516

2026-06-23 20:50:27 +08:00 · 2015-02-25 16:32:28 +08:00
parent 5702495bf6
commit 08ac49d10b
3 changed files with 51 additions and 46 deletions
--- a/finalseg/finalseg.go
+++ b/finalseg/finalseg.go
@@ -4,6 +4,11 @@ import (
 	"regexp"
 )
 // Package-level regular expressions, compiled once when the package is
 // initialized so that Cut does not recompile them on every call.
 var (
 	// reHan matches a run of one or more Han-script characters.
 	reHan  = regexp.MustCompile(`\p{Han}+`)
 	// reSkip matches either a decimal number of the form digits '.' digits,
 	// or a run of one or more ASCII letters and digits.
 	reSkip = regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`)
 )
 func cutHan(sentence string) []string {
 	runes := []rune(sentence)
 	result := make([]string, 0)
@@ -30,14 +35,12 @@ func cutHan(sentence string) []string {
 func Cut(sentence string) []string {
 	result := make([]string, 0)
 	re_han := regexp.MustCompile(`\p{Han}+`)
 	re_skip := regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`)
 	s := sentence
 	var hans string
 	var hanLoc []int
 	var nonhanLoc []int
 	for {
-		hanLoc = re_han.FindStringIndex(s)
+		hanLoc = reHan.FindStringIndex(s)
 		if hanLoc == nil {
 			if len(s) == 0 {
 				break
@@ -50,7 +53,7 @@ func Cut(sentence string) []string {
 			}
 			continue
 		}
-		nonhanLoc = re_skip.FindStringIndex(s)
+		nonhanLoc = reSkip.FindStringIndex(s)
 		if nonhanLoc == nil {
 			if len(s) == 0 {
 				break
--- a/jieba.go
+++ b/jieba.go
@@ -11,6 +11,11 @@ import (
 // Package-level settings and regular expressions. The patterns are compiled
 // once at package initialization instead of inside each cutting function.
 var (
 	// Dictionary is set to "dict.txt" by default.
 	// NOTE(review): presumably the dictionary file name/path; confirm against the loader.
 	Dictionary     = "dict.txt"
 	// UserWordTagTab is a string-to-string map.
 	// NOTE(review): presumably user word -> tag; confirm against its writers.
 	UserWordTagTab = make(map[string]string)
 	// reEng matches a single ASCII alphanumeric character anywhere in the
 	// input; cut_DAG_NO_HMM uses it together with a length-1 check.
 	reEng          = regexp.MustCompile(`[[:alnum:]]`)
 	// reHanCutAll matches a run of one or more Han-script characters; Cut
 	// selects it when isCutAll is true.
 	reHanCutAll    = regexp.MustCompile(`\p{Han}+`)
 	// reSkipCutAll matches one character that is NOT an ASCII alphanumeric,
 	// '+', '#', or newline; Cut selects it when isCutAll is true.
 	reSkipCutAll   = regexp.MustCompile(`[^[:alnum:]+#\n]`)
 	// reHanDefault matches a run of one or more characters drawn from:
 	// Han script, ASCII alphanumerics, '+', '#', '&', '.', '_'. The '+'
 	// signs inside the brackets are literal members of the class, not
 	// quantifiers. Cut selects it when isCutAll is false.
 	reHanDefault   = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
 	// reSkipDefault matches a CRLF pair or a single whitespace character;
 	// Cut selects it when isCutAll is false.
 	reSkipDefault  = regexp.MustCompile(`(\r\n|\s)`)
 )
 type Route struct {
@@ -187,7 +192,7 @@ func cut_DAG(sentence string) []string {
 func cut_DAG_NO_HMM(sentence string) []string {
 	result := make([]string, 0)
-	re_eng := regexp.MustCompile(`[[:alnum:]]`)
+
 	dag := GetDAG(sentence)
 	routes := Calc(sentence, dag)
 	x := 0
@@ -201,7 +206,7 @@ func cut_DAG_NO_HMM(sentence string) []string {
 		}
 		y = routes[x].Index + 1
 		l_word := runes[x:y]
-		if re_eng.MatchString(string(l_word)) && len(l_word) == 1 {
+		if reEng.MatchString(string(l_word)) && len(l_word) == 1 {
 			buf = append(buf, l_word...)
 			x = y
 		} else {
@@ -220,7 +225,7 @@ func cut_DAG_NO_HMM(sentence string) []string {
 	return result
 }
-func cut_All(sentence string) []string {
+func cutAll(sentence string) []string {
 	result := make([]string, 0)
 	runes := []rune(sentence)
 	dag := GetDAG(sentence)
@@ -247,51 +252,51 @@ func cut_All(sentence string) []string {
 	return result
 }
-func Cut(sentence string, cut_all bool, HMM bool) []string {
+func Cut(sentence string, isCutAll bool, HMM bool) []string {
 	result := make([]string, 0)
-	var re_han, re_skip *regexp.Regexp
+	var reHan, reSkip *regexp.Regexp
-	if cut_all {
+	if isCutAll {
-		re_han = regexp.MustCompile(`\p{Han}+`)
+		reHan = reHanCutAll
-		re_skip = regexp.MustCompile(`[^[:alnum:]+#\n]`)
+		reSkip = reSkipCutAll
 	} else {
-		re_han = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
+		reHan = reHanDefault
-		re_skip = regexp.MustCompile(`(\r\n|\s)`)
+		reSkip = reSkipDefault
 	}
-	blocks := RegexpSplit(re_han, sentence)
+	blocks := RegexpSplit(reHan, sentence)
 	var cut_block cutAction
 	if HMM {
 		cut_block = cut_DAG
 	} else {
 		cut_block = cut_DAG_NO_HMM
 	}
-	if cut_all {
+	if isCutAll {
-		cut_block = cut_All
+		cut_block = cutAll
 	}
 	for _, blk := range blocks {
 		if len(blk) == 0 {
 			continue
 		}
-		if re_han.MatchString(blk) {
+		if reHan.MatchString(blk) {
 			for _, word := range cut_block(blk) {
 				result = append(result, word)
 			}
 		} else {
 			type skipSplitFunc func(sentence string) []string
 			var ssf skipSplitFunc
-			if cut_all {
+			if isCutAll {
 				ssf = func(sentence string) []string {
-					return re_skip.Split(sentence, -1)
+					return reSkip.Split(sentence, -1)
 				}
 			} else {
 				ssf = func(sentence string) []string {
-					return RegexpSplit(re_skip, sentence)
+					return RegexpSplit(reSkip, sentence)
 				}
 			}
 			for _, x := range ssf(blk) {
-				if re_skip.MatchString(x) {
+				if reSkip.MatchString(x) {
 					result = append(result, x)
-				} else if !cut_all {
+				} else if !isCutAll {
 					for _, xx := range x {
 						result = append(result, string(xx))
 					}
--- a/posseg/posseg.go
+++ b/posseg/posseg.go
@@ -12,7 +12,14 @@ import (
 )
 // Package-level state and regular expressions. The patterns are compiled
 // once at package initialization instead of inside each cutting function.
 var (
 	// WordTagTab is a string-to-string map.
 	// NOTE(review): presumably word -> part-of-speech tag; confirm against its writers.
-	WordTagTab = make(map[string]string)
+	WordTagTab     = make(map[string]string)
 	// reHanDetail matches a run of one or more Han-script characters; used
 	// by cutDetail to split the sentence and to classify each block.
 	reHanDetail    = regexp.MustCompile(`\p{Han}+`)
 	// reSkipDetail is used by cutDetail to split non-Han blocks.
 	// NOTE(review): the bracket nesting looks unintended. As written the
 	// first alternative is a run of '[', '.', or ASCII digits, and the
 	// second appears to parse as one character from the set ":alnum"
 	// followed by one or more literal ']'. Confirm against the upstream
 	// jieba pattern before relying on it.
 	reSkipDetail   = regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`)
 	// reEng matches a single ASCII alphanumeric character anywhere in the
 	// input; used to tag tokens as "eng".
 	reEng          = regexp.MustCompile(`[[:alnum:]]`)
 	// reNum matches a run of one or more '.' or ASCII digit characters
 	// anywhere in the input; used to tag tokens as "m".
 	reNum          = regexp.MustCompile(`[\.[:digit:]]+`)
 	// reEng1 matches an ASCII alphanumeric character at the end of the
 	// input; cut_DAG_NO_HMM uses it together with a length-1 check.
 	reEng1         = regexp.MustCompile(`[[:alnum:]]$`)
 	// reHanInternal matches a run of one or more characters drawn from:
 	// Han script, ASCII alphanumerics, '+', '#', '&', '.', '_'. The '+'
 	// signs inside the brackets are literal members of the class. Used by cut.
 	reHanInternal  = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
 	// reSkipInternal matches a CRLF pair or a single whitespace character;
 	// used by cut to split and classify non-matching blocks.
 	reSkipInternal = regexp.MustCompile(`(\r\n|\s)`)
 )
 type WordTag struct {
@@ -80,26 +87,21 @@ func __cut(sentence string) []WordTag {
 func cutDetail(sentence string) []WordTag {
 	result := make([]WordTag, 0)
-	re_han := regexp.MustCompile(`\p{Han}+`)
+	blocks := jiebago.RegexpSplit(reHanDetail, sentence)
 	re_skip := regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`)
 	re_eng := regexp.MustCompile(`[[:alnum:]]`)
 	re_num := regexp.MustCompile(`[\.[:digit:]]+`)
 	blocks := jiebago.RegexpSplit(re_han, sentence)
 	for _, blk := range blocks {
-		if re_han.MatchString(blk) {
+		if reHanDetail.MatchString(blk) {
 			for _, wordTag := range __cut(blk) {
 				result = append(result, wordTag)
 			}
 		} else {
-			for _, x := range jiebago.RegexpSplit(re_skip, blk) {
+			for _, x := range jiebago.RegexpSplit(reSkipDetail, blk) {
 				if len(x) == 0 {
 					continue
 				}
 				switch {
-				case re_num.MatchString(x):
+				case reNum.MatchString(x):
 					result = append(result, WordTag{x, "m"})
-				case re_eng.MatchString(x):
+				case reEng.MatchString(x):
 					result = append(result, WordTag{x, "eng"})
 				default:
 					result = append(result, WordTag{x, "x"})
@@ -203,7 +205,6 @@ func cut_DAG(sentence string) []WordTag {
 func cut_DAG_NO_HMM(sentence string) []WordTag {
 	result := make([]WordTag, 0)
 	re_eng := regexp.MustCompile(`[[:alnum:]]`)
 	dag := jiebago.GetDAG(sentence)
 	routes := jiebago.Calc(sentence, dag)
 	x := 0
@@ -217,7 +218,7 @@ func cut_DAG_NO_HMM(sentence string) []WordTag {
 		}
 		y = routes[x].Index + 1
 		l_word := runes[x:y]
-		if re_eng.MatchString(string(l_word)) && len(l_word) == 1 {
+		if reEng1.MatchString(string(l_word)) && len(l_word) == 1 {
 			buf = append(buf, l_word...)
 			x = y
 		} else {
@@ -243,11 +244,7 @@ func cut_DAG_NO_HMM(sentence string) []WordTag {
 func cut(sentence string, HMM bool) []WordTag {
 	result := make([]WordTag, 0)
-	re_han := regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
+	blocks := jiebago.RegexpSplit(reHanInternal, sentence)
 	re_skip := regexp.MustCompile(`(\r\n|\s)`)
 	re_eng := regexp.MustCompile(`[[:alnum:]]`)
 	re_num := regexp.MustCompile(`[\.[:digit:]]+`)
 	blocks := jiebago.RegexpSplit(re_han, sentence)
 	var cut_block cutAction
 	if HMM {
 		cut_block = cut_DAG
@@ -255,21 +252,21 @@ func cut(sentence string, HMM bool) []WordTag {
 		cut_block = cut_DAG_NO_HMM
 	}
 	for _, blk := range blocks {
-		if re_han.MatchString(blk) {
+		if reHanInternal.MatchString(blk) {
 			for _, wordTag := range cut_block(blk) {
 				result = append(result, wordTag)
 			}
 		} else {
-			for _, x := range jiebago.RegexpSplit(re_skip, blk) {
+			for _, x := range jiebago.RegexpSplit(reSkipInternal, blk) {
-				if re_skip.MatchString(x) {
+				if reSkipInternal.MatchString(x) {
 					result = append(result, WordTag{x, "x"})
 				} else {
 					for _, xx := range x {
 						s := string(xx)
 						switch {
-						case re_num.MatchString(s):
+						case reNum.MatchString(s):
 							result = append(result, WordTag{s, "m"})
-						case re_eng.MatchString(x):
+						case reEng.MatchString(x):
 							result = append(result, WordTag{x, "eng"})
 							break
 						default: