mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-22 12:10:30 +08:00
unify Cut method, return channel instead of array
This commit is contained in:
@@ -123,7 +123,10 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) TfIdfs {
|
|||||||
g := newUndirectWeightedGraph()
|
g := newUndirectWeightedGraph()
|
||||||
cm := make(map[[2]string]float64)
|
cm := make(map[[2]string]float64)
|
||||||
span := 5
|
span := 5
|
||||||
wordTags := posseg.Cut(sentence, true)
|
wordTags := make([]posseg.WordTag, 0)
|
||||||
|
for wordTag := range posseg.Cut(sentence, true) {
|
||||||
|
wordTags = append(wordTags, wordTag)
|
||||||
|
}
|
||||||
for i, _ := range wordTags {
|
for i, _ := range wordTags {
|
||||||
if _, ok := posFilt[wordTags[i].Tag]; ok {
|
if _, ok := posFilt[wordTags[i].Tag]; ok {
|
||||||
for j := i + 1; j < i+span; j++ {
|
for j := i + 1; j < i+span; j++ {
|
||||||
|
|||||||
@@ -219,12 +219,12 @@ func cutDAGNoHMM(sentence string) []WordTag {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
func Cut(sentence string, HMM bool) []WordTag {
|
func Cut(sentence string, HMM bool) chan WordTag {
|
||||||
for key := range jiebago.UserWordTagTab {
|
for key := range jiebago.UserWordTagTab {
|
||||||
wordTagMap[key] = jiebago.UserWordTagTab[key]
|
wordTagMap[key] = jiebago.UserWordTagTab[key]
|
||||||
delete(jiebago.UserWordTagTab, key)
|
delete(jiebago.UserWordTagTab, key)
|
||||||
}
|
}
|
||||||
result := make([]WordTag, 0)
|
result := make(chan WordTag)
|
||||||
blocks := jiebago.RegexpSplit(reHanInternal, sentence)
|
blocks := jiebago.RegexpSplit(reHanInternal, sentence)
|
||||||
var cut cutFunc
|
var cut cutFunc
|
||||||
if HMM {
|
if HMM {
|
||||||
@@ -232,31 +232,34 @@ func Cut(sentence string, HMM bool) []WordTag {
|
|||||||
} else {
|
} else {
|
||||||
cut = cutDAGNoHMM
|
cut = cutDAGNoHMM
|
||||||
}
|
}
|
||||||
for _, blk := range blocks {
|
go func() {
|
||||||
if reHanInternal.MatchString(blk) {
|
for _, blk := range blocks {
|
||||||
for _, wordTag := range cut(blk) {
|
if reHanInternal.MatchString(blk) {
|
||||||
result = append(result, wordTag)
|
for _, wordTag := range cut(blk) {
|
||||||
}
|
result <- wordTag
|
||||||
} else {
|
}
|
||||||
for _, x := range jiebago.RegexpSplit(reSkipInternal, blk) {
|
} else {
|
||||||
if reSkipInternal.MatchString(x) {
|
for _, x := range jiebago.RegexpSplit(reSkipInternal, blk) {
|
||||||
result = append(result, WordTag{x, "x"})
|
if reSkipInternal.MatchString(x) {
|
||||||
} else {
|
result <- WordTag{x, "x"}
|
||||||
for _, xx := range x {
|
} else {
|
||||||
s := string(xx)
|
for _, xx := range x {
|
||||||
switch {
|
s := string(xx)
|
||||||
case reNum.MatchString(s):
|
switch {
|
||||||
result = append(result, WordTag{s, "m"})
|
case reNum.MatchString(s):
|
||||||
case reEng.MatchString(x):
|
result <- WordTag{s, "m"}
|
||||||
result = append(result, WordTag{x, "eng"})
|
case reEng.MatchString(x):
|
||||||
break
|
result <- WordTag{x, "eng"}
|
||||||
default:
|
break
|
||||||
result = append(result, WordTag{s, "x"})
|
default:
|
||||||
|
result <- WordTag{s, "x"}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
close(result)
|
||||||
|
}()
|
||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -268,10 +268,18 @@ var (
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func chanToArray(ch chan WordTag) []WordTag {
|
||||||
|
result := make([]WordTag, 0)
|
||||||
|
for word := range ch {
|
||||||
|
result = append(result, word)
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
func TestCut(t *testing.T) {
|
func TestCut(t *testing.T) {
|
||||||
SetDictionary("../dict.txt")
|
SetDictionary("../dict.txt")
|
||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
result := Cut(content, true)
|
result := chanToArray(Cut(content, true))
|
||||||
if len(defaultCutResult[index]) != len(result) {
|
if len(defaultCutResult[index]) != len(result) {
|
||||||
t.Error(content)
|
t.Error(content)
|
||||||
}
|
}
|
||||||
@@ -280,7 +288,7 @@ func TestCut(t *testing.T) {
|
|||||||
t.Error(content)
|
t.Error(content)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
result = Cut(content, false)
|
result = chanToArray(Cut(content, false))
|
||||||
if len(noHMMCutResult[index]) != len(result) {
|
if len(noHMMCutResult[index]) != len(result) {
|
||||||
t.Error(content)
|
t.Error(content)
|
||||||
}
|
}
|
||||||
@@ -305,7 +313,7 @@ func TestBug132(t *testing.T) {
|
|||||||
WordTag{"又", "d"},
|
WordTag{"又", "d"},
|
||||||
WordTag{"啞", "v"},
|
WordTag{"啞", "v"},
|
||||||
}
|
}
|
||||||
result := Cut(sentence, true)
|
result := chanToArray(Cut(sentence, true))
|
||||||
if len(cutResult) != len(result) {
|
if len(cutResult) != len(result) {
|
||||||
t.Error(result)
|
t.Error(result)
|
||||||
}
|
}
|
||||||
@@ -337,7 +345,7 @@ func TestBug137(t *testing.T) {
|
|||||||
WordTag{"研究", "vn"},
|
WordTag{"研究", "vn"},
|
||||||
WordTag{"組", "x"},
|
WordTag{"組", "x"},
|
||||||
}
|
}
|
||||||
result := Cut(sentence, true)
|
result := chanToArray(Cut(sentence, true))
|
||||||
if len(cutResult) != len(result) {
|
if len(cutResult) != len(result) {
|
||||||
t.Error(result)
|
t.Error(result)
|
||||||
}
|
}
|
||||||
@@ -392,7 +400,7 @@ func TestUserDict(t *testing.T) {
|
|||||||
WordTag{"N", "eng"},
|
WordTag{"N", "eng"},
|
||||||
WordTag{"类型", "n"}}
|
WordTag{"类型", "n"}}
|
||||||
|
|
||||||
result := Cut(sentence, true)
|
result := chanToArray(Cut(sentence, true))
|
||||||
if len(cutResult) != len(result) {
|
if len(cutResult) != len(result) {
|
||||||
t.Error(result)
|
t.Error(result)
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user