code refactor, updated RegexpSplit function to match Python's re.split function

2026-06-30 00:50:30 +08:00 · 2015-04-02 18:25:00 +08:00
parent 0ab9063f43
commit 84ad6fe25e
5 changed files with 94 additions and 58 deletions
--- a/jieba.go
+++ b/jieba.go
@@ -11,12 +11,59 @@ import (
 var (
 	ErrInitialized = errors.New("already initialized")
 	reEng          = regexp.MustCompile(`[[:alnum:]]`)
-	reHanCutAll    = regexp.MustCompile(`\p{Han}+`)
+	reHanCutAll    = regexp.MustCompile(`(\p{Han}+)`)
 	reSkipCutAll   = regexp.MustCompile(`[^[:alnum:]+#\n]`)
 	reHanDefault   = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
 	reSkipDefault  = regexp.MustCompile(`(\r\n|\s)`)
 )

+// RegexpSplit slices s into substrings separated by matches of re and
+// returns a slice of the substrings between those matches.
+// If re contains capturing parentheses, the full text of each match is
+// also included in the result. n == 0 returns nil; n < 0 means no limit.
+//
+// This function is modeled on Python's re.split function.
+func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
+	if n == 0 { // a zero limit yields no result at all
+		return nil
+	}
+
+	if len(re.String()) > 0 && len(s) == 0 { // non-empty pattern, empty input: one empty field
+		return []string{""}
+	}
+
+	var matches [][]int
+	if len(re.SubexpNames()) > 1 { // names[0] is the whole expression, so >1 means re has groups
+		matches = re.FindAllStringSubmatchIndex(s, n)
+	} else {
+		matches = re.FindAllStringIndex(s, n)
+	}
+	strings := make([]string, 0, len(matches)) // NOTE(review): shadows the "strings" package name inside this function
+
+	beg := 0 // start of the pending substring, i.e. the end of the previous match
+	end := 0 // start of the last processed match (stays 0 if none was processed)
+	for _, match := range matches {
+		if n > 0 && len(strings) >= n-1 { // positive limit: stop once n-1 entries are collected
+			break
+		}
+
+		end = match[0]
+		if match[1] != 0 { // skipped only for an empty match at offset 0
+			strings = append(strings, s[beg:end])
+		}
+		beg = match[1]
+		if len(re.SubexpNames()) > 1 {
+			strings = append(strings, s[match[0]:match[1]]) // NOTE(review): whole match, not per-group text
+		}
+	}
+
+	if end != len(s) { // NOTE(review): tests the last match's start, not its end; tail may be ""
+		strings = append(strings, s[beg:])
+	}
+
+	return strings
+}
+
 type Segmenter interface {
 	Freq(string) (float64, bool)
 	Total() float64
@@ -231,22 +278,23 @@ func (j *Jieba) Cut(sentence string, hmm bool) chan string {
 	} else {
 		cut = j.cutDAGNoHMM
 	}
+
 	go func() {
-		for blk := range RegexpSplit(reHanDefault, sentence) {
-			if len(blk) == 0 {
+		for _, block := range RegexpSplit(reHanDefault, sentence, -1) {
+			if len(block) == 0 {
 				continue
 			}
-			if reHanDefault.MatchString(blk) {
-				for x := range cut(blk) {
+			if reHanDefault.MatchString(block) {
+				for x := range cut(block) {
 					result <- x
 				}
 			} else {
-				for x := range RegexpSplit(reSkipDefault, blk) {
-					if reSkipDefault.MatchString(x) {
-						result <- x
+				for _, subBlock := range RegexpSplit(reSkipDefault, block, -1) {
+					if reSkipDefault.MatchString(subBlock) {
+						result <- subBlock
 					} else {
-						for _, xx := range x {
-							result <- string(xx)
+						for _, r := range subBlock {
+							result <- string(r)
 						}
 					}
 				}
@@ -260,17 +308,17 @@ func (j *Jieba) Cut(sentence string, hmm bool) chan string {
 func (j *Jieba) CutAll(sentence string) chan string {
 	result := make(chan string)
 	go func() {
-		for blk := range RegexpSplit(reHanCutAll, sentence) {
-			if len(blk) == 0 {
+		for _, block := range RegexpSplit(reHanCutAll, sentence, -1) {
+			if len(block) == 0 {
 				continue
 			}
-			if reHanCutAll.MatchString(blk) {
-				for x := range j.cutAll(blk) {
+			if reHanCutAll.MatchString(block) {
+				for x := range j.cutAll(block) {
 					result <- x
 				}
 			} else {
-				for _, x := range reSkipCutAll.Split(blk, -1) {
-					result <- x
+				for _, subBlock := range reSkipCutAll.Split(block, -1) {
+					result <- subBlock
 				}
 			}
 		}
--- a/jieba_test.go
+++ b/jieba_test.go
@@ -644,13 +644,18 @@ func TestCutDAGNoHmm(t *testing.T) {
 }

 func TestRegexpSplit(t *testing.T) {
-	result := chanToArray(RegexpSplit(regexp.MustCompile(`\p{Han}+`),
-		"BP神经网络如何训练才能在分类时增加区分度？"))
+	result := RegexpSplit(regexp.MustCompile(`\p{Han}+`),
+		"BP神经网络如何训练才能在分类时增加区分度？", -1)
+	if len(result) != 2 {
+		t.Fatal(result)
+	}
+	result = RegexpSplit(regexp.MustCompile(`(\p{Han})+`),
+		"BP神经网络如何训练才能在分类时增加区分度？", -1)
 	if len(result) != 3 {
 		t.Fatal(result)
 	}
-	result = chanToArray(RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`),
-		",BP神经网络如何训练才能在分类时#增加区分度？"))
+	result = RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`),
+		",BP神经网络如何训练才能在分类时#增加区分度？", -1)
 	if len(result) != 3 {
 		t.Fatal(result)
 	}
@@ -663,8 +668,10 @@ func TestDefaultCut(t *testing.T) {
 	for index, content := range test_contents {
 		result = chanToArray(j.Cut(content, true))
 		if len(result) != len(defaultCutResult[index]) {
-			t.Fatalf("default cut for %s length should be %d not %d\n",
+			t.Errorf("default cut for %s length should be %d not %d\n",
 				content, len(defaultCutResult[index]), len(result))
+			t.Errorf("expect: %v\n", defaultCutResult[index])
+			t.Fatalf("got: %v\n", result)
 		}
 		for i, r := range result {
 			if r != defaultCutResult[index][i] {
@@ -681,8 +688,10 @@ func TestCutAll(t *testing.T) {
 	for index, content := range test_contents {
 		result = chanToArray(j.CutAll(content))
 		if len(result) != len(cutAllResult[index]) {
-			t.Fatalf("cut all for %s length should be %d not %d\n",
+			t.Errorf("cut all for %s length should be %d not %d\n",
 				content, len(cutAllResult[index]), len(result))
+			t.Errorf("expect: %v\n", strings.Join(defaultCutResult[index], "/"))
+			t.Fatalf("got: %v\n", strings.Join(result, "/"))
 		}
 		for i, c := range result {
 			if c != cutAllResult[index][i] {
--- a/posseg/posseg.go
+++ b/posseg/posseg.go
@@ -1,14 +1,15 @@
 package posseg

 import (
+	"fmt"
 	"github.com/wangbin/jiebago"
 	"regexp"
 	"strings"
 )

 var (
-	reHanDetail    = regexp.MustCompile(`\p{Han}+`)
-	reSkipDetail   = regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`)
+	reHanDetail    = regexp.MustCompile(`(\p{Han}+)`)
+	reSkipDetail   = regexp.MustCompile(`([[\.[:digit:]]+|[:alnum:]]+)`)
 	reEng          = regexp.MustCompile(`[[:alnum:]]`)
 	reNum          = regexp.MustCompile(`[\.[:digit:]]+`)
 	reEng1         = regexp.MustCompile(`[[:alnum:]]$`)
@@ -20,6 +21,10 @@ type Pair struct {
 	Word, Flag string
 }

+func (p Pair) String() string {
+	return fmt.Sprintf("%s / %s", p.Word, p.Flag) // renders the pair as "Word / Flag"
+}
+
 type Posseg struct {
 	*jiebago.Jieba
 	flagMap map[string]string
@@ -92,15 +97,14 @@ func (p *Posseg) cutDetailInternal(sentence string) chan Pair {

 func (p *Posseg) cutDetail(sentence string) chan Pair {
 	result := make(chan Pair)
-
 	go func() {
-		for blk := range jiebago.RegexpSplit(reHanDetail, sentence) {
+		for _, blk := range jiebago.RegexpSplit(reHanDetail, sentence, -1) {
 			if reHanDetail.MatchString(blk) {
 				for wordTag := range p.cutDetailInternal(blk) {
 					result <- wordTag
 				}
 			} else {
-				for x := range jiebago.RegexpSplit(reSkipDetail, blk) {
+				for _, x := range jiebago.RegexpSplit(reSkipDetail, blk, -1) {
 					if len(x) == 0 {
 						continue
 					}
@@ -262,13 +266,13 @@ func (p *Posseg) Cut(sentence string, HMM bool) chan Pair {
 		cut = p.cutDAGNoHMM
 	}
 	go func() {
-		for blk := range jiebago.RegexpSplit(reHanInternal, sentence) {
+		for _, blk := range jiebago.RegexpSplit(reHanInternal, sentence, -1) {
 			if reHanInternal.MatchString(blk) {
 				for wordTag := range cut(blk) {
 					result <- wordTag
 				}
 			} else {
-				for x := range jiebago.RegexpSplit(reSkipInternal, blk) {
+				for _, x := range jiebago.RegexpSplit(reSkipInternal, blk, -1) {
 					if reSkipInternal.MatchString(x) {
 						result <- Pair{x, "x"}
 					} else {
--- a/posseg/posseg_test.go
+++ b/posseg/posseg_test.go
@@ -283,7 +283,10 @@ func TestCut(t *testing.T) {
 	for index, content := range test_contents {
 		result := chanToArray(p.Cut(content, true))
 		if len(defaultCutResult[index]) != len(result) {
-			t.Fatal(content)
+			t.Errorf("default cut for %s length should be %d not %d\n",
+				content, len(defaultCutResult[index]), len(result))
+			t.Errorf("expect: %v\n", defaultCutResult[index])
+			t.Fatalf("got: %v\n", result)
 		}
 		for i, _ := range result {
 			if result[i] != defaultCutResult[index][i] {
--- a/util.go
+++ b/util.go
@@ -1,28 +0,0 @@
-package jiebago
-
-import (
-	"regexp"
-)
-
-// Split sentence using regular expression.
-func RegexpSplit(r *regexp.Regexp, sentence string) chan string {
-	result := make(chan string)
-	go func() {
-		locs := r.FindAllStringIndex(sentence, -1)
-		lastLoc := 0
-		for _, loc := range locs {
-			if loc[0] == lastLoc {
-				result <- sentence[loc[0]:loc[1]]
-			} else {
-				result <- sentence[lastLoc:loc[0]]
-				result <- sentence[loc[0]:loc[1]]
-			}
-			lastLoc = loc[1]
-		}
-		if lastLoc < len(sentence) {
-			result <- sentence[lastLoc:]
-		}
-		close(result)
-	}()
-	return result
-}