1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00

unify Cut method, return channel instead of array

This commit is contained in:
Wang Bin
2015-02-27 17:04:50 +08:00
parent 76b9df8511
commit d76fbfb017
3 changed files with 43 additions and 29 deletions

View File

@@ -123,7 +123,10 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) TfIdfs {
g := newUndirectWeightedGraph()
cm := make(map[[2]string]float64)
span := 5
wordTags := posseg.Cut(sentence, true)
wordTags := make([]posseg.WordTag, 0)
for wordTag := range posseg.Cut(sentence, true) {
wordTags = append(wordTags, wordTag)
}
for i, _ := range wordTags {
if _, ok := posFilt[wordTags[i].Tag]; ok {
for j := i + 1; j < i+span; j++ {

View File

@@ -219,12 +219,12 @@ func cutDAGNoHMM(sentence string) []WordTag {
return result
}
func Cut(sentence string, HMM bool) []WordTag {
func Cut(sentence string, HMM bool) chan WordTag {
for key := range jiebago.UserWordTagTab {
wordTagMap[key] = jiebago.UserWordTagTab[key]
delete(jiebago.UserWordTagTab, key)
}
result := make([]WordTag, 0)
result := make(chan WordTag)
blocks := jiebago.RegexpSplit(reHanInternal, sentence)
var cut cutFunc
if HMM {
@@ -232,31 +232,34 @@ func Cut(sentence string, HMM bool) []WordTag {
} else {
cut = cutDAGNoHMM
}
for _, blk := range blocks {
if reHanInternal.MatchString(blk) {
for _, wordTag := range cut(blk) {
result = append(result, wordTag)
}
} else {
for _, x := range jiebago.RegexpSplit(reSkipInternal, blk) {
if reSkipInternal.MatchString(x) {
result = append(result, WordTag{x, "x"})
} else {
for _, xx := range x {
s := string(xx)
switch {
case reNum.MatchString(s):
result = append(result, WordTag{s, "m"})
case reEng.MatchString(x):
result = append(result, WordTag{x, "eng"})
break
default:
result = append(result, WordTag{s, "x"})
go func() {
for _, blk := range blocks {
if reHanInternal.MatchString(blk) {
for _, wordTag := range cut(blk) {
result <- wordTag
}
} else {
for _, x := range jiebago.RegexpSplit(reSkipInternal, blk) {
if reSkipInternal.MatchString(x) {
result <- WordTag{x, "x"}
} else {
for _, xx := range x {
s := string(xx)
switch {
case reNum.MatchString(s):
result <- WordTag{s, "m"}
case reEng.MatchString(x):
result <- WordTag{x, "eng"}
break
default:
result <- WordTag{s, "x"}
}
}
}
}
}
}
}
close(result)
}()
return result
}

View File

@@ -268,10 +268,18 @@ var (
}
)
// chanToArray drains ch and returns every received WordTag in arrival order.
// It blocks until ch is closed; the returned slice is never nil, even when
// nothing was received.
func chanToArray(ch chan WordTag) []WordTag {
	collected := []WordTag{}
	for {
		item, open := <-ch
		if !open {
			// Channel closed and fully drained: hand back what was gathered.
			return collected
		}
		collected = append(collected, item)
	}
}
func TestCut(t *testing.T) {
SetDictionary("../dict.txt")
for index, content := range test_contents {
result := Cut(content, true)
result := chanToArray(Cut(content, true))
if len(defaultCutResult[index]) != len(result) {
t.Error(content)
}
@@ -280,7 +288,7 @@ func TestCut(t *testing.T) {
t.Error(content)
}
}
result = Cut(content, false)
result = chanToArray(Cut(content, false))
if len(noHMMCutResult[index]) != len(result) {
t.Error(content)
}
@@ -305,7 +313,7 @@ func TestBug132(t *testing.T) {
WordTag{"又", "d"},
WordTag{"啞", "v"},
}
result := Cut(sentence, true)
result := chanToArray(Cut(sentence, true))
if len(cutResult) != len(result) {
t.Error(result)
}
@@ -337,7 +345,7 @@ func TestBug137(t *testing.T) {
WordTag{"研究", "vn"},
WordTag{"組", "x"},
}
result := Cut(sentence, true)
result := chanToArray(Cut(sentence, true))
if len(cutResult) != len(result) {
t.Error(result)
}
@@ -392,7 +400,7 @@ func TestUserDict(t *testing.T) {
WordTag{"N", "eng"},
WordTag{"类型", "n"}}
result := Cut(sentence, true)
result := chanToArray(Cut(sentence, true))
if len(cutResult) != len(result) {
t.Error(result)
}