From 84ad6fe25e57ebab6cf386066a9bcba1b5072775 Mon Sep 17 00:00:00 2001 From: Wang Bin Date: Thu, 2 Apr 2015 18:25:00 +0800 Subject: [PATCH] code refactor, updated RegexpSplit function to match Python's re.split function --- jieba.go | 80 ++++++++++++++++++++++++++++++++++--------- jieba_test.go | 21 ++++++++---- posseg/posseg.go | 18 ++++++---- posseg/posseg_test.go | 5 ++- util.go | 28 --------------- 5 files changed, 94 insertions(+), 58 deletions(-) delete mode 100644 util.go diff --git a/jieba.go b/jieba.go index 5a1822c..c16a9e8 100644 --- a/jieba.go +++ b/jieba.go @@ -11,12 +11,59 @@ import ( var ( ErrInitialized = errors.New("already initialized") reEng = regexp.MustCompile(`[[:alnum:]]`) - reHanCutAll = regexp.MustCompile(`\p{Han}+`) + reHanCutAll = regexp.MustCompile(`(\p{Han}+)`) reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`) reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`) reSkipDefault = regexp.MustCompile(`(\r\n|\s)`) ) +// RegexpSplit slices s into substrings separated by matches of the expression and +// returns a slice of the substrings between those matches. +// If capturing parentheses are used in the expression, then the text matched by the +// expression is also returned as part of the resulting slice. +// +// This function is modeled on Python's re.split function. 
+func RegexpSplit(re *regexp.Regexp, s string, n int) []string { + if n == 0 { + return nil + } + + if len(re.String()) > 0 && len(s) == 0 { + return []string{""} + } + + var matches [][]int + if len(re.SubexpNames()) > 1 { + matches = re.FindAllStringSubmatchIndex(s, n) + } else { + matches = re.FindAllStringIndex(s, n) + } + strings := make([]string, 0, len(matches)) + + beg := 0 + end := 0 + for _, match := range matches { + if n > 0 && len(strings) >= n-1 { + break + } + + end = match[0] + if match[1] != 0 { + strings = append(strings, s[beg:end]) + } + beg = match[1] + if len(re.SubexpNames()) > 1 { + strings = append(strings, s[match[0]:match[1]]) + } + } + + if end != len(s) { + strings = append(strings, s[beg:]) + } + + return strings +} + type Segmenter interface { Freq(string) (float64, bool) Total() float64 @@ -231,22 +278,23 @@ func (j *Jieba) Cut(sentence string, hmm bool) chan string { } else { cut = j.cutDAGNoHMM } + go func() { - for blk := range RegexpSplit(reHanDefault, sentence) { - if len(blk) == 0 { + for _, block := range RegexpSplit(reHanDefault, sentence, -1) { + if len(block) == 0 { continue } - if reHanDefault.MatchString(blk) { - for x := range cut(blk) { + if reHanDefault.MatchString(block) { + for x := range cut(block) { result <- x } } else { - for x := range RegexpSplit(reSkipDefault, blk) { - if reSkipDefault.MatchString(x) { - result <- x + for _, subBlock := range RegexpSplit(reSkipDefault, block, -1) { + if reSkipDefault.MatchString(subBlock) { + result <- subBlock } else { - for _, xx := range x { - result <- string(xx) + for _, r := range subBlock { + result <- string(r) } } } @@ -260,17 +308,17 @@ func (j *Jieba) Cut(sentence string, hmm bool) chan string { func (j *Jieba) CutAll(sentence string) chan string { result := make(chan string) go func() { - for blk := range RegexpSplit(reHanCutAll, sentence) { - if len(blk) == 0 { + for _, block := range RegexpSplit(reHanCutAll, sentence, -1) { + if len(block) == 0 { continue } - if 
reHanCutAll.MatchString(blk) { - for x := range j.cutAll(blk) { + if reHanCutAll.MatchString(block) { + for x := range j.cutAll(block) { result <- x } } else { - for _, x := range reSkipCutAll.Split(blk, -1) { - result <- x + for _, subBlock := range reSkipCutAll.Split(block, -1) { + result <- subBlock } } } diff --git a/jieba_test.go b/jieba_test.go index 7a9da9c..8e69c1c 100644 --- a/jieba_test.go +++ b/jieba_test.go @@ -644,13 +644,18 @@ func TestCutDAGNoHmm(t *testing.T) { } func TestRegexpSplit(t *testing.T) { - result := chanToArray(RegexpSplit(regexp.MustCompile(`\p{Han}+`), - "BP神经网络如何训练才能在分类时增加区分度?")) + result := RegexpSplit(regexp.MustCompile(`\p{Han}+`), + "BP神经网络如何训练才能在分类时增加区分度?", -1) + if len(result) != 2 { + t.Fatal(result) + } + result = RegexpSplit(regexp.MustCompile(`(\p{Han})+`), + "BP神经网络如何训练才能在分类时增加区分度?", -1) if len(result) != 3 { t.Fatal(result) } - result = chanToArray(RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`), - ",BP神经网络如何训练才能在分类时#增加区分度?")) + result = RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`), + ",BP神经网络如何训练才能在分类时#增加区分度?", -1) if len(result) != 3 { t.Fatal(result) } @@ -663,8 +668,10 @@ func TestDefaultCut(t *testing.T) { for index, content := range test_contents { result = chanToArray(j.Cut(content, true)) if len(result) != len(defaultCutResult[index]) { - t.Fatalf("default cut for %s length should be %d not %d\n", + t.Errorf("default cut for %s length should be %d not %d\n", content, len(defaultCutResult[index]), len(result)) + t.Errorf("expect: %v\n", defaultCutResult[index]) + t.Fatalf("got: %v\n", result) } for i, r := range result { if r != defaultCutResult[index][i] { @@ -681,8 +688,10 @@ func TestCutAll(t *testing.T) { for index, content := range test_contents { result = chanToArray(j.CutAll(content)) if len(result) != len(cutAllResult[index]) { - t.Fatalf("cut all for %s length should be %d not %d\n", + t.Errorf("cut all for %s length should be %d not %d\n", content, len(cutAllResult[index]), len(result)) + 
t.Errorf("expect: %v\n", strings.Join(defaultCutResult[index], "/")) + t.Fatalf("got: %v\n", strings.Join(result, "/")) } for i, c := range result { if c != cutAllResult[index][i] { diff --git a/posseg/posseg.go b/posseg/posseg.go index a9b9978..3103b72 100644 --- a/posseg/posseg.go +++ b/posseg/posseg.go @@ -1,14 +1,15 @@ package posseg import ( + "fmt" "github.com/wangbin/jiebago" "regexp" "strings" ) var ( - reHanDetail = regexp.MustCompile(`\p{Han}+`) - reSkipDetail = regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`) + reHanDetail = regexp.MustCompile(`(\p{Han}+)`) + reSkipDetail = regexp.MustCompile(`([[\.[:digit:]]+|[:alnum:]]+)`) reEng = regexp.MustCompile(`[[:alnum:]]`) reNum = regexp.MustCompile(`[\.[:digit:]]+`) reEng1 = regexp.MustCompile(`[[:alnum:]]$`) @@ -20,6 +21,10 @@ type Pair struct { Word, Flag string } +func (p Pair) String() string { + return fmt.Sprintf("%s / %s", p.Word, p.Flag) +} + type Posseg struct { *jiebago.Jieba flagMap map[string]string @@ -92,15 +97,14 @@ func (p *Posseg) cutDetailInternal(sentence string) chan Pair { func (p *Posseg) cutDetail(sentence string) chan Pair { result := make(chan Pair) - go func() { - for blk := range jiebago.RegexpSplit(reHanDetail, sentence) { + for _, blk := range jiebago.RegexpSplit(reHanDetail, sentence, -1) { if reHanDetail.MatchString(blk) { for wordTag := range p.cutDetailInternal(blk) { result <- wordTag } } else { - for x := range jiebago.RegexpSplit(reSkipDetail, blk) { + for _, x := range jiebago.RegexpSplit(reSkipDetail, blk, -1) { if len(x) == 0 { continue } @@ -262,13 +266,13 @@ func (p *Posseg) Cut(sentence string, HMM bool) chan Pair { cut = p.cutDAGNoHMM } go func() { - for blk := range jiebago.RegexpSplit(reHanInternal, sentence) { + for _, blk := range jiebago.RegexpSplit(reHanInternal, sentence, -1) { if reHanInternal.MatchString(blk) { for wordTag := range cut(blk) { result <- wordTag } } else { - for x := range jiebago.RegexpSplit(reSkipInternal, blk) { + for _, x := range 
jiebago.RegexpSplit(reSkipInternal, blk, -1) { if reSkipInternal.MatchString(x) { result <- Pair{x, "x"} } else { diff --git a/posseg/posseg_test.go b/posseg/posseg_test.go index 42e45a1..2d0206d 100644 --- a/posseg/posseg_test.go +++ b/posseg/posseg_test.go @@ -283,7 +283,10 @@ func TestCut(t *testing.T) { for index, content := range test_contents { result := chanToArray(p.Cut(content, true)) if len(defaultCutResult[index]) != len(result) { - t.Fatal(content) + t.Errorf("default cut for %s length should be %d not %d\n", + content, len(defaultCutResult[index]), len(result)) + t.Errorf("expect: %v\n", defaultCutResult[index]) + t.Fatalf("got: %v\n", result) } for i, _ := range result { if result[i] != defaultCutResult[index][i] { diff --git a/util.go b/util.go deleted file mode 100644 index 32f7f7c..0000000 --- a/util.go +++ /dev/null @@ -1,28 +0,0 @@ -package jiebago - -import ( - "regexp" -) - -// Split sentence using regular expression. -func RegexpSplit(r *regexp.Regexp, sentence string) chan string { - result := make(chan string) - go func() { - locs := r.FindAllStringIndex(sentence, -1) - lastLoc := 0 - for _, loc := range locs { - if loc[0] == lastLoc { - result <- sentence[loc[0]:loc[1]] - } else { - result <- sentence[lastLoc:loc[0]] - result <- sentence[loc[0]:loc[1]] - } - lastLoc = loc[1] - } - if lastLoc < len(sentence) { - result <- sentence[lastLoc:] - } - close(result) - }() - return result -}