// RegexpSplit slices s into the substrings separated by matches of re and
// returns them in order.
//
// If re contains capturing groups, the text captured by every group is
// inserted into the result right after the substring that precedes the match,
// in group order. A group that did not take part in a match contributes an
// empty string. For a repeated group such as `(x)+` the captured text is that
// of the last repetition, as reported by the regexp package.
//
// The count n bounds the number of separated substrings (group texts are not
// counted):
//
//	n > 0: at most n substrings, i.e. at most n-1 splits; the last substring
//	       is the unsplit remainder
//	n == 0: the result is nil
//	n < 0: all substrings
//
// An empty s with a non-empty expression yields a single empty string. An
// empty match at the very beginning of s is not treated as a separator.
//
// For separators that match non-empty text this is intended to behave like
// Python's re.split.
func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
	if n == 0 {
		return nil
	}

	if len(re.String()) > 0 && len(s) == 0 {
		return []string{""}
	}

	numGroups := re.NumSubexp()

	// The submatch variant also reports the whole-match bounds in loc[0:2],
	// so it serves patterns with and without capturing groups alike.
	matches := re.FindAllStringSubmatchIndex(s, n)
	parts := make([]string, 0, (len(matches)+1)*(numGroups+1))

	beg, end, splits := 0, 0, 0
	for _, loc := range matches {
		// Count splits, not len(parts): group texts share the result slice
		// and must not consume the caller's limit.
		if n > 0 && splits == n-1 {
			break
		}

		end = loc[0]
		if loc[1] == 0 {
			// Empty match at offset 0: skip it completely so the result
			// keeps alternating between substrings and group texts.
			continue
		}

		parts = append(parts, s[beg:end])
		for g := 1; g <= numGroups; g++ {
			lo, hi := loc[2*g], loc[2*g+1]
			if lo < 0 {
				// Group did not participate in this match.
				parts = append(parts, "")
				continue
			}
			parts = append(parts, s[lo:hi])
		}
		beg = loc[1]
		splits++
	}

	// Emit the remainder unless the last match seen was an empty match at
	// the very end of s.
	if end != len(s) {
		parts = append(parts, s[beg:])
	}

	return parts
}

// TestRegexpSplit checks RegexpSplit on Han-script separators with and
// without capturing groups.
func TestRegexpSplit(t *testing.T) {
	result := RegexpSplit(regexp.MustCompile(`\p{Han}+`),
		"BP神经网络如何训练才能在分类时增加区分度?", -1)
	if len(result) != 2 {
		t.Fatal(result)
	}
	result = RegexpSplit(regexp.MustCompile(`(\p{Han})+`),
		"BP神经网络如何训练才能在分类时增加区分度?", -1)
	if len(result) != 3 {
		t.Fatal(result)
	}
	// The group captures only the last repetition, not the whole match.
	if result[1] != "度" {
		t.Fatal(result)
	}
	result = RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`),
		",BP神经网络如何训练才能在分类时#增加区分度?", -1)
	if len(result) != 3 {
		t.Fatal(result)
	}
}