1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-28 16:10:32 +08:00

code refactor, updated RegexpSplit function to match Python's re.split function

This commit is contained in:
Wang Bin
2015-04-02 18:25:00 +08:00
parent 0ab9063f43
commit 84ad6fe25e
5 changed files with 94 additions and 58 deletions

View File

@@ -11,12 +11,59 @@ import (
var ( var (
ErrInitialized = errors.New("already initialized") ErrInitialized = errors.New("already initialized")
reEng = regexp.MustCompile(`[[:alnum:]]`) reEng = regexp.MustCompile(`[[:alnum:]]`)
reHanCutAll = regexp.MustCompile(`\p{Han}+`) reHanCutAll = regexp.MustCompile(`(\p{Han}+)`)
reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`) reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`)
reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`) reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`) reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
) )
// RegexpSplit slices s into the substrings that lie between matches of re and
// returns them in order.
//
// If re contains at least one capturing group, the full text of every
// separator match (the whole match, not the text of each individual group) is
// inserted into the result right after the substring that precedes it. For an
// expression whose entire body is a single capturing group, such as
// `(\s)`, this is the same output Python's re.split produces.
//
// The count n controls how much splitting is done:
//
//	n == 0: the result is nil
//	n <  0: every match is used
//	n >  0: at most n matches are examined, and splitting stops as soon as
//	        the result already holds n-1 or more elements; whatever is
//	        left of s is then appended unsplit
//
// A non-empty expression applied to an empty s yields a slice holding one
// empty string. A match that ends at offset 0 does not contribute a leading
// empty string, and the trailing remainder is omitted when the last match
// processed starts at the very end of s.
func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
	if n == 0 {
		return nil
	}
	if len(re.String()) > 0 && len(s) == 0 {
		return []string{""}
	}

	// Separators are echoed back only for expressions with capturing groups.
	keepSeparators := re.NumSubexp() > 0

	// Only the bounds of the overall match are needed, so the plain index
	// variant is sufficient whether or not the expression captures.
	matches := re.FindAllStringIndex(s, n)

	// One slot per match for the preceding text, one more per match when the
	// separator is kept, plus the trailing remainder.
	sizeHint := len(matches) + 1
	if keepSeparators {
		sizeHint += len(matches)
	}
	parts := make([]string, 0, sizeHint)

	beg, end := 0, 0
	for _, m := range matches {
		if n > 0 && len(parts) >= n-1 {
			break
		}
		end = m[0]
		// A match ending at offset 0 is an empty match at the start of s;
		// it must not emit an empty leading element.
		if m[1] != 0 {
			parts = append(parts, s[beg:end])
		}
		beg = m[1]
		if keepSeparators {
			parts = append(parts, s[m[0]:m[1]])
		}
	}

	// end is the start of the last processed match, so this only skips the
	// remainder when that match began at the end of s.
	if end != len(s) {
		parts = append(parts, s[beg:])
	}
	return parts
}
type Segmenter interface { type Segmenter interface {
Freq(string) (float64, bool) Freq(string) (float64, bool)
Total() float64 Total() float64
@@ -231,22 +278,23 @@ func (j *Jieba) Cut(sentence string, hmm bool) chan string {
} else { } else {
cut = j.cutDAGNoHMM cut = j.cutDAGNoHMM
} }
go func() { go func() {
for blk := range RegexpSplit(reHanDefault, sentence) { for _, block := range RegexpSplit(reHanDefault, sentence, -1) {
if len(blk) == 0 { if len(block) == 0 {
continue continue
} }
if reHanDefault.MatchString(blk) { if reHanDefault.MatchString(block) {
for x := range cut(blk) { for x := range cut(block) {
result <- x result <- x
} }
} else { } else {
for x := range RegexpSplit(reSkipDefault, blk) { for _, subBlock := range RegexpSplit(reSkipDefault, block, -1) {
if reSkipDefault.MatchString(x) { if reSkipDefault.MatchString(subBlock) {
result <- x result <- subBlock
} else { } else {
for _, xx := range x { for _, r := range subBlock {
result <- string(xx) result <- string(r)
} }
} }
} }
@@ -260,17 +308,17 @@ func (j *Jieba) Cut(sentence string, hmm bool) chan string {
func (j *Jieba) CutAll(sentence string) chan string { func (j *Jieba) CutAll(sentence string) chan string {
result := make(chan string) result := make(chan string)
go func() { go func() {
for blk := range RegexpSplit(reHanCutAll, sentence) { for _, block := range RegexpSplit(reHanCutAll, sentence, -1) {
if len(blk) == 0 { if len(block) == 0 {
continue continue
} }
if reHanCutAll.MatchString(blk) { if reHanCutAll.MatchString(block) {
for x := range j.cutAll(blk) { for x := range j.cutAll(block) {
result <- x result <- x
} }
} else { } else {
for _, x := range reSkipCutAll.Split(blk, -1) { for _, subBlock := range reSkipCutAll.Split(block, -1) {
result <- x result <- subBlock
} }
} }
} }

View File

@@ -644,13 +644,18 @@ func TestCutDAGNoHmm(t *testing.T) {
} }
func TestRegexpSplit(t *testing.T) { func TestRegexpSplit(t *testing.T) {
result := chanToArray(RegexpSplit(regexp.MustCompile(`\p{Han}+`), result := RegexpSplit(regexp.MustCompile(`\p{Han}+`),
"BP神经网络如何训练才能在分类时增加区分度")) "BP神经网络如何训练才能在分类时增加区分度", -1)
if len(result) != 2 {
t.Fatal(result)
}
result = RegexpSplit(regexp.MustCompile(`(\p{Han})+`),
"BP神经网络如何训练才能在分类时增加区分度", -1)
if len(result) != 3 { if len(result) != 3 {
t.Fatal(result) t.Fatal(result)
} }
result = chanToArray(RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`), result = RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`),
",BP神经网络如何训练才能在分类时#增加区分度?")) ",BP神经网络如何训练才能在分类时#增加区分度?", -1)
if len(result) != 3 { if len(result) != 3 {
t.Fatal(result) t.Fatal(result)
} }
@@ -663,8 +668,10 @@ func TestDefaultCut(t *testing.T) {
for index, content := range test_contents { for index, content := range test_contents {
result = chanToArray(j.Cut(content, true)) result = chanToArray(j.Cut(content, true))
if len(result) != len(defaultCutResult[index]) { if len(result) != len(defaultCutResult[index]) {
t.Fatalf("default cut for %s length should be %d not %d\n", t.Errorf("default cut for %s length should be %d not %d\n",
content, len(defaultCutResult[index]), len(result)) content, len(defaultCutResult[index]), len(result))
t.Errorf("expect: %v\n", defaultCutResult[index])
t.Fatalf("got: %v\n", result)
} }
for i, r := range result { for i, r := range result {
if r != defaultCutResult[index][i] { if r != defaultCutResult[index][i] {
@@ -681,8 +688,10 @@ func TestCutAll(t *testing.T) {
for index, content := range test_contents { for index, content := range test_contents {
result = chanToArray(j.CutAll(content)) result = chanToArray(j.CutAll(content))
if len(result) != len(cutAllResult[index]) { if len(result) != len(cutAllResult[index]) {
t.Fatalf("cut all for %s length should be %d not %d\n", t.Errorf("cut all for %s length should be %d not %d\n",
content, len(cutAllResult[index]), len(result)) content, len(cutAllResult[index]), len(result))
t.Errorf("expect: %v\n", strings.Join(defaultCutResult[index], "/"))
t.Fatalf("got: %v\n", strings.Join(result, "/"))
} }
for i, c := range result { for i, c := range result {
if c != cutAllResult[index][i] { if c != cutAllResult[index][i] {

View File

@@ -1,14 +1,15 @@
package posseg package posseg
import ( import (
"fmt"
"github.com/wangbin/jiebago" "github.com/wangbin/jiebago"
"regexp" "regexp"
"strings" "strings"
) )
var ( var (
reHanDetail = regexp.MustCompile(`\p{Han}+`) reHanDetail = regexp.MustCompile(`(\p{Han}+)`)
reSkipDetail = regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`) reSkipDetail = regexp.MustCompile(`([[\.[:digit:]]+|[:alnum:]]+)`)
reEng = regexp.MustCompile(`[[:alnum:]]`) reEng = regexp.MustCompile(`[[:alnum:]]`)
reNum = regexp.MustCompile(`[\.[:digit:]]+`) reNum = regexp.MustCompile(`[\.[:digit:]]+`)
reEng1 = regexp.MustCompile(`[[:alnum:]]$`) reEng1 = regexp.MustCompile(`[[:alnum:]]$`)
@@ -20,6 +21,10 @@ type Pair struct {
Word, Flag string Word, Flag string
} }
// String renders the pair as its Word followed by its Flag, separated by
// " / ".
func (p Pair) String() string {
	word, flag := p.Word, p.Flag
	return fmt.Sprintf("%s / %s", word, flag)
}
type Posseg struct { type Posseg struct {
*jiebago.Jieba *jiebago.Jieba
flagMap map[string]string flagMap map[string]string
@@ -92,15 +97,14 @@ func (p *Posseg) cutDetailInternal(sentence string) chan Pair {
func (p *Posseg) cutDetail(sentence string) chan Pair { func (p *Posseg) cutDetail(sentence string) chan Pair {
result := make(chan Pair) result := make(chan Pair)
go func() { go func() {
for blk := range jiebago.RegexpSplit(reHanDetail, sentence) { for _, blk := range jiebago.RegexpSplit(reHanDetail, sentence, -1) {
if reHanDetail.MatchString(blk) { if reHanDetail.MatchString(blk) {
for wordTag := range p.cutDetailInternal(blk) { for wordTag := range p.cutDetailInternal(blk) {
result <- wordTag result <- wordTag
} }
} else { } else {
for x := range jiebago.RegexpSplit(reSkipDetail, blk) { for _, x := range jiebago.RegexpSplit(reSkipDetail, blk, -1) {
if len(x) == 0 { if len(x) == 0 {
continue continue
} }
@@ -262,13 +266,13 @@ func (p *Posseg) Cut(sentence string, HMM bool) chan Pair {
cut = p.cutDAGNoHMM cut = p.cutDAGNoHMM
} }
go func() { go func() {
for blk := range jiebago.RegexpSplit(reHanInternal, sentence) { for _, blk := range jiebago.RegexpSplit(reHanInternal, sentence, -1) {
if reHanInternal.MatchString(blk) { if reHanInternal.MatchString(blk) {
for wordTag := range cut(blk) { for wordTag := range cut(blk) {
result <- wordTag result <- wordTag
} }
} else { } else {
for x := range jiebago.RegexpSplit(reSkipInternal, blk) { for _, x := range jiebago.RegexpSplit(reSkipInternal, blk, -1) {
if reSkipInternal.MatchString(x) { if reSkipInternal.MatchString(x) {
result <- Pair{x, "x"} result <- Pair{x, "x"}
} else { } else {

View File

@@ -283,7 +283,10 @@ func TestCut(t *testing.T) {
for index, content := range test_contents { for index, content := range test_contents {
result := chanToArray(p.Cut(content, true)) result := chanToArray(p.Cut(content, true))
if len(defaultCutResult[index]) != len(result) { if len(defaultCutResult[index]) != len(result) {
t.Fatal(content) t.Errorf("default cut for %s length should be %d not %d\n",
content, len(defaultCutResult[index]), len(result))
t.Errorf("expect: %v\n", defaultCutResult[index])
t.Fatalf("got: %v\n", result)
} }
for i, _ := range result { for i, _ := range result {
if result[i] != defaultCutResult[index][i] { if result[i] != defaultCutResult[index][i] {

28
util.go
View File

@@ -1,28 +0,0 @@
package jiebago
import (
"regexp"
)
// RegexpSplit walks sentence from left to right and streams its pieces over
// the returned channel: every match of r is sent as its own piece, and any
// text found between two matches (or before the first one) is sent just ahead
// of the match that follows it. Text remaining after the last match is sent
// last. The channel is closed once all pieces have been delivered.
//
// The pieces are produced by a separate goroutine on an unbuffered channel,
// so the caller has to keep receiving until the channel is closed.
func RegexpSplit(r *regexp.Regexp, sentence string) chan string {
	pieces := make(chan string)
	go func() {
		defer close(pieces)

		cursor := 0
		for _, span := range r.FindAllStringIndex(sentence, -1) {
			start, stop := span[0], span[1]
			// Emit the gap only when the match does not begin exactly
			// where the previous one ended.
			if start != cursor {
				pieces <- sentence[cursor:start]
			}
			pieces <- sentence[start:stop]
			cursor = stop
		}
		if cursor < len(sentence) {
			pieces <- sentence[cursor:]
		}
	}()
	return pieces
}