mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-28 08:02:45 +08:00
code refactor, updated RegexpSplit function to match Python's re.split function
This commit is contained in:
80
jieba.go
80
jieba.go
@@ -11,12 +11,59 @@ import (
|
|||||||
var (
|
var (
|
||||||
ErrInitialized = errors.New("already initialized")
|
ErrInitialized = errors.New("already initialized")
|
||||||
reEng = regexp.MustCompile(`[[:alnum:]]`)
|
reEng = regexp.MustCompile(`[[:alnum:]]`)
|
||||||
reHanCutAll = regexp.MustCompile(`\p{Han}+`)
|
reHanCutAll = regexp.MustCompile(`(\p{Han}+)`)
|
||||||
reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`)
|
reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`)
|
||||||
reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
|
reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
|
||||||
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
|
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// RegexpSplit slices s into the substrings separated by matches of re and
// returns them in order. If re contains capturing groups, the text captured
// by every group of each match is inserted into the result right after the
// substring that precedes that match, mirroring Python's re.split. A group
// that did not participate in a match contributes an empty string (Python
// would yield None there).
//
// The count n follows the convention of (*regexp.Regexp).Split:
//
//	n > 0: at most n substrings; the last one is the unsplit remainder
//	n == 0: the result is nil
//	n < 0: all substrings
//
// Captured group texts are extra elements and do not count towards n.
func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
	if n == 0 {
		return nil
	}

	if len(re.String()) > 0 && len(s) == 0 {
		return []string{""}
	}

	groups := re.NumSubexp()
	// The submatch variant is used unconditionally: for an expression without
	// groups every match simply has the two whole-match indices.
	matches := re.FindAllStringSubmatchIndex(s, n)
	parts := make([]string, 0, (len(matches)+1)*(groups+1))

	beg, end := 0, 0
	pieces := 0 // substrings emitted so far, excluding captured group texts
	for _, match := range matches {
		if n > 0 && pieces >= n-1 {
			break
		}

		end = match[0]
		// An empty match at the very start of s produces no leading substring,
		// the same rule (*regexp.Regexp).Split applies.
		if match[1] != 0 {
			parts = append(parts, s[beg:end])
			pieces++
		}
		beg = match[1]

		for g := 1; g <= groups; g++ {
			lo, hi := match[2*g], match[2*g+1]
			if lo < 0 {
				// Group did not participate in this match.
				parts = append(parts, "")
				continue
			}
			parts = append(parts, s[lo:hi])
		}
	}

	// Emit the remainder unless the last processed match started at the very
	// end of s (an empty match there leaves nothing to add).
	if end != len(s) {
		parts = append(parts, s[beg:])
	}

	return parts
}
|
||||||
|
|
||||||
type Segmenter interface {
|
type Segmenter interface {
|
||||||
Freq(string) (float64, bool)
|
Freq(string) (float64, bool)
|
||||||
Total() float64
|
Total() float64
|
||||||
@@ -231,22 +278,23 @@ func (j *Jieba) Cut(sentence string, hmm bool) chan string {
|
|||||||
} else {
|
} else {
|
||||||
cut = j.cutDAGNoHMM
|
cut = j.cutDAGNoHMM
|
||||||
}
|
}
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
for blk := range RegexpSplit(reHanDefault, sentence) {
|
for _, block := range RegexpSplit(reHanDefault, sentence, -1) {
|
||||||
if len(blk) == 0 {
|
if len(block) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if reHanDefault.MatchString(blk) {
|
if reHanDefault.MatchString(block) {
|
||||||
for x := range cut(blk) {
|
for x := range cut(block) {
|
||||||
result <- x
|
result <- x
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for x := range RegexpSplit(reSkipDefault, blk) {
|
for _, subBlock := range RegexpSplit(reSkipDefault, block, -1) {
|
||||||
if reSkipDefault.MatchString(x) {
|
if reSkipDefault.MatchString(subBlock) {
|
||||||
result <- x
|
result <- subBlock
|
||||||
} else {
|
} else {
|
||||||
for _, xx := range x {
|
for _, r := range subBlock {
|
||||||
result <- string(xx)
|
result <- string(r)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -260,17 +308,17 @@ func (j *Jieba) Cut(sentence string, hmm bool) chan string {
|
|||||||
func (j *Jieba) CutAll(sentence string) chan string {
|
func (j *Jieba) CutAll(sentence string) chan string {
|
||||||
result := make(chan string)
|
result := make(chan string)
|
||||||
go func() {
|
go func() {
|
||||||
for blk := range RegexpSplit(reHanCutAll, sentence) {
|
for _, block := range RegexpSplit(reHanCutAll, sentence, -1) {
|
||||||
if len(blk) == 0 {
|
if len(block) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if reHanCutAll.MatchString(blk) {
|
if reHanCutAll.MatchString(block) {
|
||||||
for x := range j.cutAll(blk) {
|
for x := range j.cutAll(block) {
|
||||||
result <- x
|
result <- x
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for _, x := range reSkipCutAll.Split(blk, -1) {
|
for _, subBlock := range reSkipCutAll.Split(block, -1) {
|
||||||
result <- x
|
result <- subBlock
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -644,13 +644,18 @@ func TestCutDAGNoHmm(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestRegexpSplit(t *testing.T) {
|
func TestRegexpSplit(t *testing.T) {
|
||||||
result := chanToArray(RegexpSplit(regexp.MustCompile(`\p{Han}+`),
|
result := RegexpSplit(regexp.MustCompile(`\p{Han}+`),
|
||||||
"BP神经网络如何训练才能在分类时增加区分度?"))
|
"BP神经网络如何训练才能在分类时增加区分度?", -1)
|
||||||
|
if len(result) != 2 {
|
||||||
|
t.Fatal(result)
|
||||||
|
}
|
||||||
|
result = RegexpSplit(regexp.MustCompile(`(\p{Han})+`),
|
||||||
|
"BP神经网络如何训练才能在分类时增加区分度?", -1)
|
||||||
if len(result) != 3 {
|
if len(result) != 3 {
|
||||||
t.Fatal(result)
|
t.Fatal(result)
|
||||||
}
|
}
|
||||||
result = chanToArray(RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`),
|
result = RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`),
|
||||||
",BP神经网络如何训练才能在分类时#增加区分度?"))
|
",BP神经网络如何训练才能在分类时#增加区分度?", -1)
|
||||||
if len(result) != 3 {
|
if len(result) != 3 {
|
||||||
t.Fatal(result)
|
t.Fatal(result)
|
||||||
}
|
}
|
||||||
@@ -663,8 +668,10 @@ func TestDefaultCut(t *testing.T) {
|
|||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
result = chanToArray(j.Cut(content, true))
|
result = chanToArray(j.Cut(content, true))
|
||||||
if len(result) != len(defaultCutResult[index]) {
|
if len(result) != len(defaultCutResult[index]) {
|
||||||
t.Fatalf("default cut for %s length should be %d not %d\n",
|
t.Errorf("default cut for %s length should be %d not %d\n",
|
||||||
content, len(defaultCutResult[index]), len(result))
|
content, len(defaultCutResult[index]), len(result))
|
||||||
|
t.Errorf("expect: %v\n", defaultCutResult[index])
|
||||||
|
t.Fatalf("got: %v\n", result)
|
||||||
}
|
}
|
||||||
for i, r := range result {
|
for i, r := range result {
|
||||||
if r != defaultCutResult[index][i] {
|
if r != defaultCutResult[index][i] {
|
||||||
@@ -681,8 +688,10 @@ func TestCutAll(t *testing.T) {
|
|||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
result = chanToArray(j.CutAll(content))
|
result = chanToArray(j.CutAll(content))
|
||||||
if len(result) != len(cutAllResult[index]) {
|
if len(result) != len(cutAllResult[index]) {
|
||||||
t.Fatalf("cut all for %s length should be %d not %d\n",
|
t.Errorf("cut all for %s length should be %d not %d\n",
|
||||||
content, len(cutAllResult[index]), len(result))
|
content, len(cutAllResult[index]), len(result))
|
||||||
|
t.Errorf("expect: %v\n", strings.Join(defaultCutResult[index], "/"))
|
||||||
|
t.Fatalf("got: %v\n", strings.Join(result, "/"))
|
||||||
}
|
}
|
||||||
for i, c := range result {
|
for i, c := range result {
|
||||||
if c != cutAllResult[index][i] {
|
if c != cutAllResult[index][i] {
|
||||||
|
|||||||
@@ -1,14 +1,15 @@
|
|||||||
package posseg
|
package posseg
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
"github.com/wangbin/jiebago"
|
"github.com/wangbin/jiebago"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
reHanDetail = regexp.MustCompile(`\p{Han}+`)
|
reHanDetail = regexp.MustCompile(`(\p{Han}+)`)
|
||||||
reSkipDetail = regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`)
|
reSkipDetail = regexp.MustCompile(`([[\.[:digit:]]+|[:alnum:]]+)`)
|
||||||
reEng = regexp.MustCompile(`[[:alnum:]]`)
|
reEng = regexp.MustCompile(`[[:alnum:]]`)
|
||||||
reNum = regexp.MustCompile(`[\.[:digit:]]+`)
|
reNum = regexp.MustCompile(`[\.[:digit:]]+`)
|
||||||
reEng1 = regexp.MustCompile(`[[:alnum:]]$`)
|
reEng1 = regexp.MustCompile(`[[:alnum:]]$`)
|
||||||
@@ -20,6 +21,10 @@ type Pair struct {
|
|||||||
Word, Flag string
|
Word, Flag string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (p Pair) String() string {
|
||||||
|
return fmt.Sprintf("%s / %s", p.Word, p.Flag)
|
||||||
|
}
|
||||||
|
|
||||||
type Posseg struct {
|
type Posseg struct {
|
||||||
*jiebago.Jieba
|
*jiebago.Jieba
|
||||||
flagMap map[string]string
|
flagMap map[string]string
|
||||||
@@ -92,15 +97,14 @@ func (p *Posseg) cutDetailInternal(sentence string) chan Pair {
|
|||||||
|
|
||||||
func (p *Posseg) cutDetail(sentence string) chan Pair {
|
func (p *Posseg) cutDetail(sentence string) chan Pair {
|
||||||
result := make(chan Pair)
|
result := make(chan Pair)
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
for blk := range jiebago.RegexpSplit(reHanDetail, sentence) {
|
for _, blk := range jiebago.RegexpSplit(reHanDetail, sentence, -1) {
|
||||||
if reHanDetail.MatchString(blk) {
|
if reHanDetail.MatchString(blk) {
|
||||||
for wordTag := range p.cutDetailInternal(blk) {
|
for wordTag := range p.cutDetailInternal(blk) {
|
||||||
result <- wordTag
|
result <- wordTag
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for x := range jiebago.RegexpSplit(reSkipDetail, blk) {
|
for _, x := range jiebago.RegexpSplit(reSkipDetail, blk, -1) {
|
||||||
if len(x) == 0 {
|
if len(x) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -262,13 +266,13 @@ func (p *Posseg) Cut(sentence string, HMM bool) chan Pair {
|
|||||||
cut = p.cutDAGNoHMM
|
cut = p.cutDAGNoHMM
|
||||||
}
|
}
|
||||||
go func() {
|
go func() {
|
||||||
for blk := range jiebago.RegexpSplit(reHanInternal, sentence) {
|
for _, blk := range jiebago.RegexpSplit(reHanInternal, sentence, -1) {
|
||||||
if reHanInternal.MatchString(blk) {
|
if reHanInternal.MatchString(blk) {
|
||||||
for wordTag := range cut(blk) {
|
for wordTag := range cut(blk) {
|
||||||
result <- wordTag
|
result <- wordTag
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for x := range jiebago.RegexpSplit(reSkipInternal, blk) {
|
for _, x := range jiebago.RegexpSplit(reSkipInternal, blk, -1) {
|
||||||
if reSkipInternal.MatchString(x) {
|
if reSkipInternal.MatchString(x) {
|
||||||
result <- Pair{x, "x"}
|
result <- Pair{x, "x"}
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -283,7 +283,10 @@ func TestCut(t *testing.T) {
|
|||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
result := chanToArray(p.Cut(content, true))
|
result := chanToArray(p.Cut(content, true))
|
||||||
if len(defaultCutResult[index]) != len(result) {
|
if len(defaultCutResult[index]) != len(result) {
|
||||||
t.Fatal(content)
|
t.Errorf("default cut for %s length should be %d not %d\n",
|
||||||
|
content, len(defaultCutResult[index]), len(result))
|
||||||
|
t.Errorf("expect: %v\n", defaultCutResult[index])
|
||||||
|
t.Fatalf("got: %v\n", result)
|
||||||
}
|
}
|
||||||
for i, _ := range result {
|
for i, _ := range result {
|
||||||
if result[i] != defaultCutResult[index][i] {
|
if result[i] != defaultCutResult[index][i] {
|
||||||
|
|||||||
28
util.go
28
util.go
@@ -1,28 +0,0 @@
|
|||||||
package jiebago
|
|
||||||
|
|
||||||
import (
|
|
||||||
"regexp"
|
|
||||||
)
|
|
||||||
|
|
||||||
// RegexpSplit splits sentence around every match of r and delivers the pieces
// in order on the returned channel: each non-empty stretch of text preceding
// a match, then the match itself, and finally any non-empty text after the
// last match.
//
// All pieces are computed before returning and placed in a channel buffered
// to hold every one of them; the channel is already closed when it is
// returned. No goroutine is started, so a caller that stops receiving early
// cannot leave a blocked producer behind.
func RegexpSplit(r *regexp.Regexp, sentence string) chan string {
	locs := r.FindAllStringIndex(sentence, -1)
	// Worst case: one gap plus one match per location, plus a trailing rest.
	result := make(chan string, 2*len(locs)+1)

	lastLoc := 0
	for _, loc := range locs {
		if loc[0] != lastLoc {
			result <- sentence[lastLoc:loc[0]]
		}
		result <- sentence[loc[0]:loc[1]]
		lastLoc = loc[1]
	}
	if lastLoc < len(sentence) {
		result <- sentence[lastLoc:]
	}
	close(result)
	return result
}
|
|
||||||
Reference in New Issue
Block a user