mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-12 05:00:24 +08:00
added util module
This commit is contained in:
47
jieba.go
47
jieba.go
@@ -18,53 +18,6 @@ var (
|
|||||||
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
|
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
|
||||||
)
|
)
|
||||||
|
|
||||||
// RegexpSplit slices s into the substrings separated by matches of re and
// returns them in order of appearance.
//
// If re contains at least one capturing group, the full text of every match
// (the separator) is inserted into the result right after the substring that
// precedes it. For an expression that is wrapped entirely in a single group,
// such as `(\s+)`, this yields the same elements as Python's re.split.
//
// The count n bounds the number of substrings, as in regexp.Regexp.Split:
//
//	n > 0: at most n substrings; the last one is the unsplit remainder.
//	n == 0: the result is nil.
//	n < 0: all substrings.
//
// Separators are returned in addition to those substrings and do not count
// toward the limit.
func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
	if n == 0 {
		return nil
	}

	// A non-empty expression applied to an empty input yields a single empty
	// substring instead of an empty result.
	if len(re.String()) > 0 && len(s) == 0 {
		return []string{""}
	}

	// The expression cannot change during the call, so decide once whether
	// separators are kept.
	keepSep := re.NumSubexp() > 0

	// Only the bounds of each whole match are used below, so the submatch
	// indices are not needed even when the expression has groups.
	matches := re.FindAllStringIndex(s, n)

	capacity := len(matches) + 1
	if keepSep {
		capacity += len(matches)
	}
	parts := make([]string, 0, capacity)

	beg, end := 0, 0
	// pieces counts only the substrings between separators. Counting every
	// appended element instead would let the separators consume the limit
	// whenever the expression has a capturing group.
	pieces := 0
	for _, m := range matches {
		if n > 0 && pieces >= n-1 {
			break
		}

		end = m[0]
		// An empty match at the very start of s does not produce a leading
		// empty substring.
		if m[1] != 0 {
			parts = append(parts, s[beg:end])
			pieces++
		}
		beg = m[1]
		if keepSep {
			parts = append(parts, s[m[0]:m[1]])
		}
	}

	// end is the start of the last match consumed; it equals len(s) only when
	// that match is an empty one at the end of s, which leaves no remainder.
	if end != len(s) {
		parts = append(parts, s[beg:])
	}

	return parts
}
|
|
||||||
|
|
||||||
type Segmenter interface {
|
type Segmenter interface {
|
||||||
Freq(string) (float64, bool)
|
Freq(string) (float64, bool)
|
||||||
Total() float64
|
Total() float64
|
||||||
|
|||||||
@@ -643,24 +643,6 @@ func TestCutDAGNoHmm(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestRegexpSplit(t *testing.T) {
|
|
||||||
result := RegexpSplit(regexp.MustCompile(`\p{Han}+`),
|
|
||||||
"BP神经网络如何训练才能在分类时增加区分度?", -1)
|
|
||||||
if len(result) != 2 {
|
|
||||||
t.Fatal(result)
|
|
||||||
}
|
|
||||||
result = RegexpSplit(regexp.MustCompile(`(\p{Han})+`),
|
|
||||||
"BP神经网络如何训练才能在分类时增加区分度?", -1)
|
|
||||||
if len(result) != 3 {
|
|
||||||
t.Fatal(result)
|
|
||||||
}
|
|
||||||
result = RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`),
|
|
||||||
",BP神经网络如何训练才能在分类时#增加区分度?", -1)
|
|
||||||
if len(result) != 3 {
|
|
||||||
t.Fatal(result)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestDefaultCut(t *testing.T) {
|
func TestDefaultCut(t *testing.T) {
|
||||||
j, _ := Open("dict.txt")
|
j, _ := Open("dict.txt")
|
||||||
|
|
||||||
|
|||||||
50
util/util.go
Normal file
50
util/util.go
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
package util
|
||||||
|
|
||||||
|
import "regexp"
|
||||||
|
|
||||||
|
// RegexpSplit slices s into the substrings separated by matches of re and
// returns them in order of appearance.
//
// If re contains at least one capturing group, the full text of every match
// (the separator) is inserted into the result right after the substring that
// precedes it. For an expression that is wrapped entirely in a single group,
// such as `(\s+)`, this yields the same elements as Python's re.split.
//
// The count n bounds the number of substrings, as in regexp.Regexp.Split:
//
//	n > 0: at most n substrings; the last one is the unsplit remainder.
//	n == 0: the result is nil.
//	n < 0: all substrings.
//
// Separators are returned in addition to those substrings and do not count
// toward the limit.
func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
	if n == 0 {
		return nil
	}

	// A non-empty expression applied to an empty input yields a single empty
	// substring instead of an empty result.
	if len(re.String()) > 0 && len(s) == 0 {
		return []string{""}
	}

	// The expression cannot change during the call, so decide once whether
	// separators are kept.
	keepSep := re.NumSubexp() > 0

	// Only the bounds of each whole match are used below, so the submatch
	// indices are not needed even when the expression has groups.
	matches := re.FindAllStringIndex(s, n)

	capacity := len(matches) + 1
	if keepSep {
		capacity += len(matches)
	}
	parts := make([]string, 0, capacity)

	beg, end := 0, 0
	// pieces counts only the substrings between separators. Counting every
	// appended element instead would let the separators consume the limit
	// whenever the expression has a capturing group.
	pieces := 0
	for _, m := range matches {
		if n > 0 && pieces >= n-1 {
			break
		}

		end = m[0]
		// An empty match at the very start of s does not produce a leading
		// empty substring.
		if m[1] != 0 {
			parts = append(parts, s[beg:end])
			pieces++
		}
		beg = m[1]
		if keepSep {
			parts = append(parts, s[m[0]:m[1]])
		}
	}

	// end is the start of the last match consumed; it equals len(s) only when
	// that match is an empty one at the end of s, which leaves no remainder.
	if end != len(s) {
		parts = append(parts, s[beg:])
	}

	return parts
}
|
||||||
24
util/util_test.go
Normal file
24
util/util_test.go
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
package util
|
||||||
|
|
||||||
|
import (
|
||||||
|
"regexp"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestRegexpSplit(t *testing.T) {
|
||||||
|
result := RegexpSplit(regexp.MustCompile(`\p{Han}+`),
|
||||||
|
"BP神经网络如何训练才能在分类时增加区分度?", -1)
|
||||||
|
if len(result) != 2 {
|
||||||
|
t.Fatal(result)
|
||||||
|
}
|
||||||
|
result = RegexpSplit(regexp.MustCompile(`(\p{Han})+`),
|
||||||
|
"BP神经网络如何训练才能在分类时增加区分度?", -1)
|
||||||
|
if len(result) != 3 {
|
||||||
|
t.Fatal(result)
|
||||||
|
}
|
||||||
|
result = RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`),
|
||||||
|
",BP神经网络如何训练才能在分类时#增加区分度?", -1)
|
||||||
|
if len(result) != 3 {
|
||||||
|
t.Fatal(result)
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user