1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00

added util module

This commit is contained in:
Wang Bin
2015-04-30 15:26:34 +08:00
parent 732196127b
commit d9f77563bf
4 changed files with 74 additions and 65 deletions

View File

@@ -18,53 +18,6 @@ var (
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
)
// RegexpSplit slices s into substrings separated by matches of re and returns
// a slice of the substrings between those matches.
//
// If re contains capturing groups, the text captured by every group is also
// placed in the result, right after the substring that precedes the match and
// in group order. A group that did not take part in a match contributes an
// empty string.
//
// The count n determines the number of substrings to return:
//
//	n > 0: at most n substrings; the last substring is the unsplit remainder
//	n == 0: the result is nil
//	n < 0: all substrings
//
// Text contributed by capturing groups does not count towards n.
//
// As with regexp.Regexp.Split, an empty match at the very start of s does not
// produce a leading empty substring. Group handling is meant to be consistent
// with Python's re.split function.
func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
	if n == 0 {
		return nil
	}
	if len(re.String()) > 0 && len(s) == 0 {
		return []string{""}
	}

	// Each match is [start, end, g1start, g1end, ...]; with no groups this is
	// exactly what FindAllStringIndex would return.
	groups := re.NumSubexp()
	matches := re.FindAllStringSubmatchIndex(s, n)
	parts := make([]string, 0, len(matches)*(groups+1)+1)

	beg := 0
	end := 0
	// pieces counts only the substrings between matches, so that captured
	// group text never eats into the limit n.
	pieces := 0
	for _, match := range matches {
		if n > 0 && pieces >= n-1 {
			break
		}
		end = match[0]
		if match[1] != 0 {
			parts = append(parts, s[beg:end])
			pieces++
		}
		beg = match[1]
		for g := 1; g <= groups; g++ {
			lo, hi := match[2*g], match[2*g+1]
			if lo < 0 {
				// Group did not participate in this match.
				parts = append(parts, "")
				continue
			}
			parts = append(parts, s[lo:hi])
		}
	}
	// end is the start of the last consumed match; unless that match sits at
	// the very end of s, whatever follows it is the final substring.
	if end != len(s) {
		parts = append(parts, s[beg:])
	}
	return parts
}
type Segmenter interface {
Freq(string) (float64, bool)
Total() float64

View File

@@ -643,24 +643,6 @@ func TestCutDAGNoHmm(t *testing.T) {
}
}
// TestRegexpSplit checks how many elements RegexpSplit produces for patterns
// without a capturing group, with a repeated capturing group, and with a
// capturing group surrounded by non-matching text on both sides.
func TestRegexpSplit(t *testing.T) {
	cases := []struct {
		pattern string
		input   string
		want    int
	}{
		{`\p{Han}+`, "BP神经网络如何训练才能在分类时增加区分度", 2},
		{`(\p{Han})+`, "BP神经网络如何训练才能在分类时增加区分度", 3},
		{`([\p{Han}#]+)`, ",BP神经网络如何训练才能在分类时#增加区分度?", 3},
	}
	for _, c := range cases {
		got := RegexpSplit(regexp.MustCompile(c.pattern), c.input, -1)
		if len(got) != c.want {
			t.Fatal(got)
		}
	}
}
func TestDefaultCut(t *testing.T) {
j, _ := Open("dict.txt")

50
util/util.go Normal file
View File

@@ -0,0 +1,50 @@
package util
import "regexp"
// RegexpSplit slices s into substrings separated by matches of re and returns
// a slice of the substrings between those matches.
//
// If re contains capturing groups, the text captured by every group is also
// placed in the result, right after the substring that precedes the match and
// in group order. A group that did not take part in a match contributes an
// empty string.
//
// The count n determines the number of substrings to return:
//
//	n > 0: at most n substrings; the last substring is the unsplit remainder
//	n == 0: the result is nil
//	n < 0: all substrings
//
// Text contributed by capturing groups does not count towards n.
//
// As with regexp.Regexp.Split, an empty match at the very start of s does not
// produce a leading empty substring. Group handling is meant to be consistent
// with Python's re.split function.
func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
	if n == 0 {
		return nil
	}
	if len(re.String()) > 0 && len(s) == 0 {
		return []string{""}
	}

	// Each match is [start, end, g1start, g1end, ...]; with no groups this is
	// exactly what FindAllStringIndex would return.
	groups := re.NumSubexp()
	matches := re.FindAllStringSubmatchIndex(s, n)
	parts := make([]string, 0, len(matches)*(groups+1)+1)

	beg := 0
	end := 0
	// pieces counts only the substrings between matches, so that captured
	// group text never eats into the limit n.
	pieces := 0
	for _, match := range matches {
		if n > 0 && pieces >= n-1 {
			break
		}
		end = match[0]
		if match[1] != 0 {
			parts = append(parts, s[beg:end])
			pieces++
		}
		beg = match[1]
		for g := 1; g <= groups; g++ {
			lo, hi := match[2*g], match[2*g+1]
			if lo < 0 {
				// Group did not participate in this match.
				parts = append(parts, "")
				continue
			}
			parts = append(parts, s[lo:hi])
		}
	}
	// end is the start of the last consumed match; unless that match sits at
	// the very end of s, whatever follows it is the final substring.
	if end != len(s) {
		parts = append(parts, s[beg:])
	}
	return parts
}

24
util/util_test.go Normal file
View File

@@ -0,0 +1,24 @@
package util
import (
"regexp"
"testing"
)
// TestRegexpSplit checks how many elements RegexpSplit produces for patterns
// without a capturing group, with a repeated capturing group, and with a
// capturing group surrounded by non-matching text on both sides.
func TestRegexpSplit(t *testing.T) {
	cases := []struct {
		pattern string
		input   string
		want    int
	}{
		{`\p{Han}+`, "BP神经网络如何训练才能在分类时增加区分度", 2},
		{`(\p{Han})+`, "BP神经网络如何训练才能在分类时增加区分度", 3},
		{`([\p{Han}#]+)`, ",BP神经网络如何训练才能在分类时#增加区分度?", 3},
	}
	for _, c := range cases {
		got := RegexpSplit(regexp.MustCompile(c.pattern), c.input, -1)
		if len(got) != c.want {
			t.Fatal(got)
		}
	}
}