added util module

2026-07-17 02:40:23 +08:00 · 2015-04-30 15:26:34 +08:00
parent 732196127b
commit d9f77563bf
4 changed files with 74 additions and 65 deletions
--- a/jieba.go
+++ b/jieba.go
@@ -18,53 +18,6 @@ var (
 	reSkipDefault  = regexp.MustCompile(`(\r\n|\s)`)
 )

-// RegexpSplit split slices s into substrings separated by the expression and
-// returns a slice of the substrings between those expression matches.
-// If capturing parentheses are used in expression, then the text of all groups
-// in the expression are also returned as part of the resulting slice.
-//
-// This function acts consistent with Python's re.split function.
-func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
-	if n == 0 {
-		return nil
-	}
-
-	if len(re.String()) > 0 && len(s) == 0 {
-		return []string{""}
-	}
-
-	var matches [][]int
-	if len(re.SubexpNames()) > 1 {
-		matches = re.FindAllStringSubmatchIndex(s, n)
-	} else {
-		matches = re.FindAllStringIndex(s, n)
-	}
-	strings := make([]string, 0, len(matches))
-
-	beg := 0
-	end := 0
-	for _, match := range matches {
-		if n > 0 && len(strings) >= n-1 {
-			break
-		}
-
-		end = match[0]
-		if match[1] != 0 {
-			strings = append(strings, s[beg:end])
-		}
-		beg = match[1]
-		if len(re.SubexpNames()) > 1 {
-			strings = append(strings, s[match[0]:match[1]])
-		}
-	}
-
-	if end != len(s) {
-		strings = append(strings, s[beg:])
-	}
-
-	return strings
-}
-
 type Segmenter interface {
 	Freq(string) (float64, bool)
 	Total() float64
--- a/jieba_test.go
+++ b/jieba_test.go
@@ -643,24 +643,6 @@ func TestCutDAGNoHmm(t *testing.T) {
 	}
 }

-func TestRegexpSplit(t *testing.T) {
-	result := RegexpSplit(regexp.MustCompile(`\p{Han}+`),
-		"BP神经网络如何训练才能在分类时增加区分度？", -1)
-	if len(result) != 2 {
-		t.Fatal(result)
-	}
-	result = RegexpSplit(regexp.MustCompile(`(\p{Han})+`),
-		"BP神经网络如何训练才能在分类时增加区分度？", -1)
-	if len(result) != 3 {
-		t.Fatal(result)
-	}
-	result = RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`),
-		",BP神经网络如何训练才能在分类时#增加区分度？", -1)
-	if len(result) != 3 {
-		t.Fatal(result)
-	}
-}
-
 func TestDefaultCut(t *testing.T) {
 	j, _ := Open("dict.txt")

--- a/util/util.go
+++ b/util/util.go
@@ -0,0 +1,50 @@
+package util
+
+import "regexp"
+
+// RegexpSplit slices s into the substrings that lie between matches of re and
+// returns them in order. n == 0 yields nil, n < 0 splits at every match, and
+// n > 0 limits how many matches are used. When re has capturing parentheses,
+// the whole text of each match is also inserted between the pieces it divides.
+//
+// NOTE(review): Python's re.split emits each group's text instead; confirm intent.
+func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
+	if n == 0 {
+		return nil
+	}
+
+	if len(re.String()) > 0 && len(s) == 0 { // empty input, non-empty pattern: one empty piece
+		return []string{""}
+	}
+
+	var matches [][]int
+	if len(re.SubexpNames()) > 1 { // entry 0 stands for the whole match, so > 1 means re has groups
+		matches = re.FindAllStringSubmatchIndex(s, n)
+	} else {
+		matches = re.FindAllStringIndex(s, n)
+	}
+	strings := make([]string, 0, len(matches)) // collected output, in order
+
+	beg := 0 // start of the piece awaiting the next match
+	end := 0 // start offset of the most recent match
+	for _, match := range matches {
+		if n > 0 && len(strings) >= n-1 { // stop once n-1 entries have been collected
+			break
+		}
+
+		end = match[0]
+		if match[1] != 0 { // skip the empty piece before a match that ends at offset 0
+			strings = append(strings, s[beg:end])
+		}
+		beg = match[1]
+		if len(re.SubexpNames()) > 1 {
+			strings = append(strings, s[match[0]:match[1]]) // the whole match, not individual groups
+		}
+	}
+
+	if end != len(s) { // append the remainder unless the last match started at len(s)
+		strings = append(strings, s[beg:])
+	}
+
+	return strings
+}
--- a/util/util_test.go
+++ b/util/util_test.go
@@ -0,0 +1,24 @@
+package util
+
+import (
+	"regexp"
+	"testing"
+)
+
+func TestRegexpSplit(t *testing.T) { // checks result lengths with and without capturing groups
+	result := RegexpSplit(regexp.MustCompile(`\p{Han}+`),
+		"BP神经网络如何训练才能在分类时增加区分度？", -1)
+	if len(result) != 2 { // no group: only the text before and after the Han run
+		t.Fatal(result)
+	}
+	result = RegexpSplit(regexp.MustCompile(`(\p{Han})+`),
+		"BP神经网络如何训练才能在分类时增加区分度？", -1)
+	if len(result) != 3 { // with a group the matched text is kept as a middle element
+		t.Fatal(result)
+	}
+	result = RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`),
+		",BP神经网络如何训练才能在分类时#增加区分度？", -1)
+	if len(result) != 3 { // '#' is inside the class, so the run is not broken at it
+		t.Fatal(result)
+	}
+}