added util module

2026-06-12 05:00:24 +08:00 · 2015-04-30 15:26:34 +08:00
parent 732196127b
commit d9f77563bf
4 changed files with 74 additions and 65 deletions
--- a/jieba.go
+++ b/jieba.go
@@ -18,53 +18,6 @@ var (
 	reSkipDefault  = regexp.MustCompile(`(\r\n|\s)`)
 )
 // RegexpSplit slices s into substrings separated by matches of re and returns
 // the substrings found between those matches.
 //
 // If re contains capturing groups, the text captured by every group is
 // inserted into the result right after the substring that precedes each
 // match, in group order. A group that did not take part in a match
 // contributes an empty string.
 //
 // The count n is interpreted as follows:
 //
 //	n == 0: the result is nil
 //	n < 0:  every match is used as a separator
 //	n > 0:  at most n matches are considered, and splitting stops as soon
 //	        as the result already holds n-1 or more entries (captured
 //	        groups count as entries); the unsplit remainder of s is then
 //	        appended
 //
 // This function is modelled on Python's re.split function.
 func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
 	if n == 0 {
 		return nil
 	}
 	// A non-empty expression applied to an empty input yields one empty field.
 	if len(re.String()) > 0 && len(s) == 0 {
 		return []string{""}
 	}
 	// Number of capturing groups; computed once instead of on every iteration.
 	groups := re.NumSubexp()
 	var matches [][]int
 	if groups > 0 {
 		matches = re.FindAllStringSubmatchIndex(s, n)
 	} else {
 		matches = re.FindAllStringIndex(s, n)
 	}
 	parts := make([]string, 0, len(matches)*(groups+1)+1)
 	beg, end := 0, 0
 	for _, match := range matches {
 		if n > 0 && len(parts) >= n-1 {
 			break
 		}
 		end = match[0]
 		// A match that ends at offset 0 is an empty match at the very start
 		// of s; it produces no leading field.
 		if match[1] != 0 {
 			parts = append(parts, s[beg:end])
 		}
 		beg = match[1]
 		// Emit the text of each capturing group rather than the whole match,
 		// so patterns with several groups, or groups that cover only part of
 		// the match, return what the groups actually captured.
 		for g := 1; g <= groups; g++ {
 			lo, hi := match[2*g], match[2*g+1]
 			if lo < 0 {
 				// Group did not participate in this match.
 				parts = append(parts, "")
 				continue
 			}
 			parts = append(parts, s[lo:hi])
 		}
 	}
 	// Append whatever follows the last separator, unless that separator
 	// started exactly at the end of s.
 	if end != len(s) {
 		parts = append(parts, s[beg:])
 	}
 	return parts
 }
 type Segmenter interface {
 	Freq(string) (float64, bool)
 	Total() float64
--- a/jieba_test.go
+++ b/jieba_test.go
@@ -643,24 +643,6 @@ func TestCutDAGNoHmm(t *testing.T) {
 	}
 }
 // TestRegexpSplit checks how many fields RegexpSplit yields for a pattern
 // without capturing groups, a pattern with a repeated group, and a pattern
 // wrapped in a single group.
 func TestRegexpSplit(t *testing.T) {
 	cases := []struct {
 		pattern string
 		input   string
 		fields  int
 	}{
 		{`\p{Han}+`, "BP神经网络如何训练才能在分类时增加区分度？", 2},
 		{`(\p{Han})+`, "BP神经网络如何训练才能在分类时增加区分度？", 3},
 		{`([\p{Han}#]+)`, ",BP神经网络如何训练才能在分类时#增加区分度？", 3},
 	}
 	for _, tc := range cases {
 		got := RegexpSplit(regexp.MustCompile(tc.pattern), tc.input, -1)
 		if len(got) != tc.fields {
 			t.Fatal(got)
 		}
 	}
 }
 func TestDefaultCut(t *testing.T) {
 	j, _ := Open("dict.txt")
--- a/util/util.go
+++ b/util/util.go
@@ -0,0 +1,50 @@
 package util
 import "regexp"
 // RegexpSplit slices s into substrings separated by matches of re and returns
 // the substrings found between those matches.
 //
 // If re contains capturing groups, the text captured by every group is
 // inserted into the result right after the substring that precedes each
 // match, in group order. A group that did not take part in a match
 // contributes an empty string.
 //
 // The count n is interpreted as follows:
 //
 //	n == 0: the result is nil
 //	n < 0:  every match is used as a separator
 //	n > 0:  at most n matches are considered, and splitting stops as soon
 //	        as the result already holds n-1 or more entries (captured
 //	        groups count as entries); the unsplit remainder of s is then
 //	        appended
 //
 // This function is modelled on Python's re.split function.
 func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
 	if n == 0 {
 		return nil
 	}
 	// A non-empty expression applied to an empty input yields one empty field.
 	if len(re.String()) > 0 && len(s) == 0 {
 		return []string{""}
 	}
 	// Number of capturing groups; computed once instead of on every iteration.
 	groups := re.NumSubexp()
 	var matches [][]int
 	if groups > 0 {
 		matches = re.FindAllStringSubmatchIndex(s, n)
 	} else {
 		matches = re.FindAllStringIndex(s, n)
 	}
 	parts := make([]string, 0, len(matches)*(groups+1)+1)
 	beg, end := 0, 0
 	for _, match := range matches {
 		if n > 0 && len(parts) >= n-1 {
 			break
 		}
 		end = match[0]
 		// A match that ends at offset 0 is an empty match at the very start
 		// of s; it produces no leading field.
 		if match[1] != 0 {
 			parts = append(parts, s[beg:end])
 		}
 		beg = match[1]
 		// Emit the text of each capturing group rather than the whole match,
 		// so patterns with several groups, or groups that cover only part of
 		// the match, return what the groups actually captured.
 		for g := 1; g <= groups; g++ {
 			lo, hi := match[2*g], match[2*g+1]
 			if lo < 0 {
 				// Group did not participate in this match.
 				parts = append(parts, "")
 				continue
 			}
 			parts = append(parts, s[lo:hi])
 		}
 	}
 	// Append whatever follows the last separator, unless that separator
 	// started exactly at the end of s.
 	if end != len(s) {
 		parts = append(parts, s[beg:])
 	}
 	return parts
 }
--- a/util/util_test.go
+++ b/util/util_test.go
@@ -0,0 +1,24 @@
 package util
 import (
 	"regexp"
 	"testing"
 )
 // TestRegexpSplit verifies the number of fields RegexpSplit produces for a
 // group-free pattern, a repeated-group pattern, and a pattern enclosed in a
 // single capturing group.
 func TestRegexpSplit(t *testing.T) {
 	type splitCase struct {
 		expr  string
 		text  string
 		count int
 	}
 	table := []splitCase{
 		{expr: `\p{Han}+`, text: "BP神经网络如何训练才能在分类时增加区分度？", count: 2},
 		{expr: `(\p{Han})+`, text: "BP神经网络如何训练才能在分类时增加区分度？", count: 3},
 		{expr: `([\p{Han}#]+)`, text: ",BP神经网络如何训练才能在分类时#增加区分度？", count: 3},
 	}
 	for i := range table {
 		fields := RegexpSplit(regexp.MustCompile(table[i].expr), table[i].text, -1)
 		if len(fields) == table[i].count {
 			continue
 		}
 		t.Fatal(fields)
 	}
 }