mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 00:32:51 +08:00
unify Cut method, return channel instead of array
This commit is contained in:
@@ -123,7 +123,10 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) TfIdfs {
|
||||
g := newUndirectWeightedGraph()
|
||||
cm := make(map[[2]string]float64)
|
||||
span := 5
|
||||
wordTags := posseg.Cut(sentence, true)
|
||||
wordTags := make([]posseg.WordTag, 0)
|
||||
for wordTag := range posseg.Cut(sentence, true) {
|
||||
wordTags = append(wordTags, wordTag)
|
||||
}
|
||||
for i, _ := range wordTags {
|
||||
if _, ok := posFilt[wordTags[i].Tag]; ok {
|
||||
for j := i + 1; j < i+span; j++ {
|
||||
|
||||
@@ -219,12 +219,12 @@ func cutDAGNoHMM(sentence string) []WordTag {
|
||||
return result
|
||||
}
|
||||
|
||||
func Cut(sentence string, HMM bool) []WordTag {
|
||||
func Cut(sentence string, HMM bool) chan WordTag {
|
||||
for key := range jiebago.UserWordTagTab {
|
||||
wordTagMap[key] = jiebago.UserWordTagTab[key]
|
||||
delete(jiebago.UserWordTagTab, key)
|
||||
}
|
||||
result := make([]WordTag, 0)
|
||||
result := make(chan WordTag)
|
||||
blocks := jiebago.RegexpSplit(reHanInternal, sentence)
|
||||
var cut cutFunc
|
||||
if HMM {
|
||||
@@ -232,31 +232,34 @@ func Cut(sentence string, HMM bool) []WordTag {
|
||||
} else {
|
||||
cut = cutDAGNoHMM
|
||||
}
|
||||
for _, blk := range blocks {
|
||||
if reHanInternal.MatchString(blk) {
|
||||
for _, wordTag := range cut(blk) {
|
||||
result = append(result, wordTag)
|
||||
}
|
||||
} else {
|
||||
for _, x := range jiebago.RegexpSplit(reSkipInternal, blk) {
|
||||
if reSkipInternal.MatchString(x) {
|
||||
result = append(result, WordTag{x, "x"})
|
||||
} else {
|
||||
for _, xx := range x {
|
||||
s := string(xx)
|
||||
switch {
|
||||
case reNum.MatchString(s):
|
||||
result = append(result, WordTag{s, "m"})
|
||||
case reEng.MatchString(x):
|
||||
result = append(result, WordTag{x, "eng"})
|
||||
break
|
||||
default:
|
||||
result = append(result, WordTag{s, "x"})
|
||||
go func() {
|
||||
for _, blk := range blocks {
|
||||
if reHanInternal.MatchString(blk) {
|
||||
for _, wordTag := range cut(blk) {
|
||||
result <- wordTag
|
||||
}
|
||||
} else {
|
||||
for _, x := range jiebago.RegexpSplit(reSkipInternal, blk) {
|
||||
if reSkipInternal.MatchString(x) {
|
||||
result <- WordTag{x, "x"}
|
||||
} else {
|
||||
for _, xx := range x {
|
||||
s := string(xx)
|
||||
switch {
|
||||
case reNum.MatchString(s):
|
||||
result <- WordTag{s, "m"}
|
||||
case reEng.MatchString(x):
|
||||
result <- WordTag{x, "eng"}
|
||||
break
|
||||
default:
|
||||
result <- WordTag{s, "x"}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
return result
|
||||
}
|
||||
|
||||
@@ -268,10 +268,18 @@ var (
|
||||
}
|
||||
)
|
||||
|
||||
func chanToArray(ch chan WordTag) []WordTag {
|
||||
result := make([]WordTag, 0)
|
||||
for word := range ch {
|
||||
result = append(result, word)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func TestCut(t *testing.T) {
|
||||
SetDictionary("../dict.txt")
|
||||
for index, content := range test_contents {
|
||||
result := Cut(content, true)
|
||||
result := chanToArray(Cut(content, true))
|
||||
if len(defaultCutResult[index]) != len(result) {
|
||||
t.Error(content)
|
||||
}
|
||||
@@ -280,7 +288,7 @@ func TestCut(t *testing.T) {
|
||||
t.Error(content)
|
||||
}
|
||||
}
|
||||
result = Cut(content, false)
|
||||
result = chanToArray(Cut(content, false))
|
||||
if len(noHMMCutResult[index]) != len(result) {
|
||||
t.Error(content)
|
||||
}
|
||||
@@ -305,7 +313,7 @@ func TestBug132(t *testing.T) {
|
||||
WordTag{"又", "d"},
|
||||
WordTag{"啞", "v"},
|
||||
}
|
||||
result := Cut(sentence, true)
|
||||
result := chanToArray(Cut(sentence, true))
|
||||
if len(cutResult) != len(result) {
|
||||
t.Error(result)
|
||||
}
|
||||
@@ -337,7 +345,7 @@ func TestBug137(t *testing.T) {
|
||||
WordTag{"研究", "vn"},
|
||||
WordTag{"組", "x"},
|
||||
}
|
||||
result := Cut(sentence, true)
|
||||
result := chanToArray(Cut(sentence, true))
|
||||
if len(cutResult) != len(result) {
|
||||
t.Error(result)
|
||||
}
|
||||
@@ -392,7 +400,7 @@ func TestUserDict(t *testing.T) {
|
||||
WordTag{"N", "eng"},
|
||||
WordTag{"类型", "n"}}
|
||||
|
||||
result := Cut(sentence, true)
|
||||
result := chanToArray(Cut(sentence, true))
|
||||
if len(cutResult) != len(result) {
|
||||
t.Error(result)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user