1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-12 05:00:24 +08:00

Refactor the Cut function: make CutAll a separate function to simplify the logic of Cut

This commit is contained in:
Wang Bin
2015-03-30 15:18:36 +08:00
parent 556b96b137
commit c4c3a5f9ad
5 changed files with 44 additions and 118 deletions

View File

@@ -86,11 +86,10 @@ func (j *Jieba) DAG(sentence string) map[int][]int {
dag := make(map[int][]int)
runes := []rune(sentence)
n := len(runes)
i := 0
var frag string
for k := 0; k < n; k++ {
tmpList := make([]int, 0)
i = k
i := k
frag = string(runes[k])
for {
if freq, ok := j.Freq[frag]; !ok {
@@ -284,63 +283,31 @@ which is suitable for text analysis.
HMM controls whether to use the Hidden Markov Model.
*/
func (j *Jieba) Cut(sentence string, isCutAll bool, HMM bool) chan string {
func (j *Jieba) Cut(sentence string, hmm bool) chan string {
result := make(chan string)
var cut cutFunc
if hmm {
cut = j.cutDAG
} else {
cut = j.cutDAGNoHMM
}
go func() {
var reHan, reSkip *regexp.Regexp
if isCutAll {
reHan = reHanCutAll
reSkip = reSkipCutAll
} else {
reHan = reHanDefault
reSkip = reSkipDefault
}
var cut cutFunc
if HMM {
cut = j.cutDAG
} else {
cut = j.cutDAGNoHMM
}
if isCutAll {
cut = j.cutAll
}
for blk := range RegexpSplit(reHan, sentence) {
for blk := range RegexpSplit(reHanDefault, sentence) {
if len(blk) == 0 {
continue
}
if reHan.MatchString(blk) {
if reHanDefault.MatchString(blk) {
for x := range cut(blk) {
result <- x
}
} else {
type skipSplitFunc func(sentence string) chan string
var ssf skipSplitFunc
if isCutAll {
ssf = func(sentence string) chan string {
ch := make(chan string)
go func() {
for _, s := range reSkip.Split(sentence, -1) {
ch <- s
}
close(ch)
}()
return ch
}
} else {
ssf = func(sentence string) chan string {
return RegexpSplit(reSkip, sentence)
}
}
for x := range ssf(blk) {
if reSkip.MatchString(x) {
for x := range RegexpSplit(reSkipDefault, blk) {
if reSkipDefault.MatchString(x) {
result <- x
} else if !isCutAll {
} else {
for _, xx := range x {
result <- string(xx)
}
} else {
result <- x
}
}
}
@@ -350,13 +317,35 @@ func (j *Jieba) Cut(sentence string, isCutAll bool, HMM bool) chan string {
return result
}
// CutAll segments sentence and streams the resulting pieces over the
// returned channel.
//
// The sentence is first divided into blocks with RegexpSplit using
// reHanCutAll. Empty blocks are dropped. A block that matches reHanCutAll is
// handed to j.cutAll and every word it produces is forwarded; any other block
// is split on reSkipCutAll and each resulting piece is forwarded unchanged
// (pieces are not filtered, so whatever Split returns is emitted as-is).
//
// The channel is unbuffered and is closed once every piece has been sent, so
// the caller must keep receiving until it is closed; otherwise the producing
// goroutine stays blocked on its next send.
func (j *Jieba) CutAll(sentence string) chan string {
	out := make(chan string)
	go func() {
		for block := range RegexpSplit(reHanCutAll, sentence) {
			switch {
			case len(block) == 0:
				// Nothing to emit for an empty block.
			case reHanCutAll.MatchString(block):
				for word := range j.cutAll(block) {
					out <- word
				}
			default:
				for _, piece := range reSkipCutAll.Split(block, -1) {
					out <- piece
				}
			}
		}
		close(out)
	}()
	return out
}
// Cut sentence using Search Engine Mode, based on the Accurate Mode, attempts
// to cut long words into several short words, which can raise the recall rate.
// Suitable for search engines.
func (j *Jieba) CutForSearch(sentence string, hmm bool) chan string {
result := make(chan string)
go func() {
for word := range j.Cut(sentence, false, hmm) {
for word := range j.Cut(sentence, hmm) {
runes := []rune(word)
for _, increment := range []int{2, 3} {
if len(runes) > increment {