mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 00:32:51 +08:00
added util module
This commit is contained in:
50
util/util.go
Normal file
50
util/util.go
Normal file
@@ -0,0 +1,50 @@
|
||||
package util
|
||||
|
||||
import "regexp"
|
||||
|
||||
// RegexpSplit slices s into the substrings separated by matches of re and
// returns those substrings in order. If re contains capturing groups, the
// text captured by every group is also inserted into the result, right after
// the substring that precedes the match, in the manner of Python's re.split.
// A group that did not take part in a match contributes an empty string.
//
// The count n determines how many substrings are produced:
//
//	n > 0:  at most n substrings; the last one is the unsplit remainder.
//	        Text captured by groups does not count toward n.
//	n == 0: the result is nil.
//	n < 0:  all substrings.
//
// Splitting an empty s by a non-empty expression yields a slice holding a
// single empty string.
func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
	if n == 0 {
		return nil
	}

	if len(re.String()) > 0 && len(s) == 0 {
		return []string{""}
	}

	// The submatch variant is only needed when there is something to capture.
	groups := re.NumSubexp()
	var matches [][]int
	if groups > 0 {
		matches = re.FindAllStringSubmatchIndex(s, n)
	} else {
		matches = re.FindAllStringIndex(s, n)
	}

	parts := make([]string, 0, len(matches)*(groups+1)+1)

	beg, end := 0, 0
	// pieces counts only the between-match substrings, so that captured
	// delimiter text does not use up the n budget.
	pieces := 0
	for _, m := range matches {
		if n > 0 && pieces >= n-1 {
			break
		}

		end = m[0]
		// An empty match at the very start of s produces no leading substring.
		if m[1] != 0 {
			parts = append(parts, s[beg:end])
			pieces++
		}
		beg = m[1]

		// m[2*g] and m[2*g+1] bound group g; a negative index marks a group
		// that did not participate in this match.
		for g := 1; g <= groups; g++ {
			lo, hi := m[2*g], m[2*g+1]
			if lo < 0 {
				parts = append(parts, "")
				continue
			}
			parts = append(parts, s[lo:hi])
		}
	}

	// Append the remainder unless the last processed match begins at the end
	// of s, in which case nothing is left over.
	if end != len(s) {
		parts = append(parts, s[beg:])
	}

	return parts
}
|
||||
24
util/util_test.go
Normal file
24
util/util_test.go
Normal file
@@ -0,0 +1,24 @@
|
||||
package util
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestRegexpSplit(t *testing.T) {
|
||||
result := RegexpSplit(regexp.MustCompile(`\p{Han}+`),
|
||||
"BP神经网络如何训练才能在分类时增加区分度?", -1)
|
||||
if len(result) != 2 {
|
||||
t.Fatal(result)
|
||||
}
|
||||
result = RegexpSplit(regexp.MustCompile(`(\p{Han})+`),
|
||||
"BP神经网络如何训练才能在分类时增加区分度?", -1)
|
||||
if len(result) != 3 {
|
||||
t.Fatal(result)
|
||||
}
|
||||
result = RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`),
|
||||
",BP神经网络如何训练才能在分类时#增加区分度?", -1)
|
||||
if len(result) != 3 {
|
||||
t.Fatal(result)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user