1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-22 12:10:30 +08:00

unify Cut method, return channel instead of array

This commit is contained in:
Wang Bin
2015-02-27 17:04:50 +08:00
parent 76b9df8511
commit d76fbfb017
3 changed files with 43 additions and 29 deletions

View File

@@ -123,7 +123,10 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) TfIdfs {
g := newUndirectWeightedGraph() g := newUndirectWeightedGraph()
cm := make(map[[2]string]float64) cm := make(map[[2]string]float64)
span := 5 span := 5
wordTags := posseg.Cut(sentence, true) wordTags := make([]posseg.WordTag, 0)
for wordTag := range posseg.Cut(sentence, true) {
wordTags = append(wordTags, wordTag)
}
for i, _ := range wordTags { for i, _ := range wordTags {
if _, ok := posFilt[wordTags[i].Tag]; ok { if _, ok := posFilt[wordTags[i].Tag]; ok {
for j := i + 1; j < i+span; j++ { for j := i + 1; j < i+span; j++ {

View File

@@ -219,12 +219,12 @@ func cutDAGNoHMM(sentence string) []WordTag {
return result return result
} }
func Cut(sentence string, HMM bool) []WordTag { func Cut(sentence string, HMM bool) chan WordTag {
for key := range jiebago.UserWordTagTab { for key := range jiebago.UserWordTagTab {
wordTagMap[key] = jiebago.UserWordTagTab[key] wordTagMap[key] = jiebago.UserWordTagTab[key]
delete(jiebago.UserWordTagTab, key) delete(jiebago.UserWordTagTab, key)
} }
result := make([]WordTag, 0) result := make(chan WordTag)
blocks := jiebago.RegexpSplit(reHanInternal, sentence) blocks := jiebago.RegexpSplit(reHanInternal, sentence)
var cut cutFunc var cut cutFunc
if HMM { if HMM {
@@ -232,31 +232,34 @@ func Cut(sentence string, HMM bool) []WordTag {
} else { } else {
cut = cutDAGNoHMM cut = cutDAGNoHMM
} }
for _, blk := range blocks { go func() {
if reHanInternal.MatchString(blk) { for _, blk := range blocks {
for _, wordTag := range cut(blk) { if reHanInternal.MatchString(blk) {
result = append(result, wordTag) for _, wordTag := range cut(blk) {
} result <- wordTag
} else { }
for _, x := range jiebago.RegexpSplit(reSkipInternal, blk) { } else {
if reSkipInternal.MatchString(x) { for _, x := range jiebago.RegexpSplit(reSkipInternal, blk) {
result = append(result, WordTag{x, "x"}) if reSkipInternal.MatchString(x) {
} else { result <- WordTag{x, "x"}
for _, xx := range x { } else {
s := string(xx) for _, xx := range x {
switch { s := string(xx)
case reNum.MatchString(s): switch {
result = append(result, WordTag{s, "m"}) case reNum.MatchString(s):
case reEng.MatchString(x): result <- WordTag{s, "m"}
result = append(result, WordTag{x, "eng"}) case reEng.MatchString(x):
break result <- WordTag{x, "eng"}
default: break
result = append(result, WordTag{s, "x"}) default:
result <- WordTag{s, "x"}
}
} }
} }
} }
} }
} }
} close(result)
}()
return result return result
} }

View File

@@ -268,10 +268,18 @@ var (
} }
) )
// chanToArray drains ch and returns every WordTag received from it, in
// arrival order. It keeps receiving until the channel is closed, so the call
// blocks until the sending side closes ch. The returned slice is non-nil
// even when nothing was received.
func chanToArray(ch chan WordTag) []WordTag {
	collected := []WordTag{}
	for {
		tag, open := <-ch
		if !open {
			// Channel closed and fully drained: hand back what was gathered.
			return collected
		}
		collected = append(collected, tag)
	}
}
func TestCut(t *testing.T) { func TestCut(t *testing.T) {
SetDictionary("../dict.txt") SetDictionary("../dict.txt")
for index, content := range test_contents { for index, content := range test_contents {
result := Cut(content, true) result := chanToArray(Cut(content, true))
if len(defaultCutResult[index]) != len(result) { if len(defaultCutResult[index]) != len(result) {
t.Error(content) t.Error(content)
} }
@@ -280,7 +288,7 @@ func TestCut(t *testing.T) {
t.Error(content) t.Error(content)
} }
} }
result = Cut(content, false) result = chanToArray(Cut(content, false))
if len(noHMMCutResult[index]) != len(result) { if len(noHMMCutResult[index]) != len(result) {
t.Error(content) t.Error(content)
} }
@@ -305,7 +313,7 @@ func TestBug132(t *testing.T) {
WordTag{"又", "d"}, WordTag{"又", "d"},
WordTag{"啞", "v"}, WordTag{"啞", "v"},
} }
result := Cut(sentence, true) result := chanToArray(Cut(sentence, true))
if len(cutResult) != len(result) { if len(cutResult) != len(result) {
t.Error(result) t.Error(result)
} }
@@ -337,7 +345,7 @@ func TestBug137(t *testing.T) {
WordTag{"研究", "vn"}, WordTag{"研究", "vn"},
WordTag{"組", "x"}, WordTag{"組", "x"},
} }
result := Cut(sentence, true) result := chanToArray(Cut(sentence, true))
if len(cutResult) != len(result) { if len(cutResult) != len(result) {
t.Error(result) t.Error(result)
} }
@@ -392,7 +400,7 @@ func TestUserDict(t *testing.T) {
WordTag{"N", "eng"}, WordTag{"N", "eng"},
WordTag{"类型", "n"}} WordTag{"类型", "n"}}
result := Cut(sentence, true) result := chanToArray(Cut(sentence, true))
if len(cutResult) != len(result) { if len(cutResult) != len(result) {
t.Error(result) t.Error(result)
} }