mirror of
https://github.com/fumiama/jieba.git
synced 2026-07-02 10:00:27 +08:00
Change the Cut method to return a chan string instead of a []string
This commit is contained in:
@@ -36,10 +36,9 @@ func (tis TfIdfs) Swap(i, j int) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func ExtractTags(sentence string, topK int) (tags TfIdfs) {
|
func ExtractTags(sentence string, topK int) (tags TfIdfs) {
|
||||||
words := jiebago.Cut(sentence, false, true)
|
|
||||||
freq := make(map[string]float64)
|
freq := make(map[string]float64)
|
||||||
|
|
||||||
for _, w := range words {
|
for w := range jiebago.Cut(sentence, false, true) {
|
||||||
w = strings.TrimSpace(w)
|
w = strings.TrimSpace(w)
|
||||||
if utf8.RuneCountInString(w) < 2 {
|
if utf8.RuneCountInString(w) < 2 {
|
||||||
continue
|
continue
|
||||||
|
|||||||
133
jieba.go
133
jieba.go
@@ -253,80 +253,85 @@ func cutAll(sentence string) []string {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
func Cut(sentence string, isCutAll bool, HMM bool) []string {
|
func Cut(sentence string, isCutAll bool, HMM bool) chan string {
|
||||||
result := make([]string, 0)
|
result := make(chan string)
|
||||||
var reHan, reSkip *regexp.Regexp
|
go func() {
|
||||||
if isCutAll {
|
var reHan, reSkip *regexp.Regexp
|
||||||
reHan = reHanCutAll
|
if isCutAll {
|
||||||
reSkip = reSkipCutAll
|
reHan = reHanCutAll
|
||||||
} else {
|
reSkip = reSkipCutAll
|
||||||
reHan = reHanDefault
|
|
||||||
reSkip = reSkipDefault
|
|
||||||
}
|
|
||||||
blocks := RegexpSplit(reHan, sentence)
|
|
||||||
var cut cutFunc
|
|
||||||
if HMM {
|
|
||||||
cut = cutDAG
|
|
||||||
} else {
|
|
||||||
cut = cutDAGNoHMM
|
|
||||||
}
|
|
||||||
if isCutAll {
|
|
||||||
cut = cutAll
|
|
||||||
}
|
|
||||||
for _, blk := range blocks {
|
|
||||||
if len(blk) == 0 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if reHan.MatchString(blk) {
|
|
||||||
for _, word := range cut(blk) {
|
|
||||||
result = append(result, word)
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
type skipSplitFunc func(sentence string) []string
|
reHan = reHanDefault
|
||||||
var ssf skipSplitFunc
|
reSkip = reSkipDefault
|
||||||
if isCutAll {
|
}
|
||||||
ssf = func(sentence string) []string {
|
blocks := RegexpSplit(reHan, sentence)
|
||||||
return reSkip.Split(sentence, -1)
|
var cut cutFunc
|
||||||
|
if HMM {
|
||||||
|
cut = cutDAG
|
||||||
|
} else {
|
||||||
|
cut = cutDAGNoHMM
|
||||||
|
}
|
||||||
|
if isCutAll {
|
||||||
|
cut = cutAll
|
||||||
|
}
|
||||||
|
for _, blk := range blocks {
|
||||||
|
if len(blk) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if reHan.MatchString(blk) {
|
||||||
|
for _, word := range cut(blk) {
|
||||||
|
result <- word
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
ssf = func(sentence string) []string {
|
type skipSplitFunc func(sentence string) []string
|
||||||
return RegexpSplit(reSkip, sentence)
|
var ssf skipSplitFunc
|
||||||
}
|
if isCutAll {
|
||||||
}
|
ssf = func(sentence string) []string {
|
||||||
|
return reSkip.Split(sentence, -1)
|
||||||
for _, x := range ssf(blk) {
|
|
||||||
if reSkip.MatchString(x) {
|
|
||||||
result = append(result, x)
|
|
||||||
} else if !isCutAll {
|
|
||||||
for _, xx := range x {
|
|
||||||
result = append(result, string(xx))
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
result = append(result, x)
|
ssf = func(sentence string) []string {
|
||||||
|
return RegexpSplit(reSkip, sentence)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result
|
|
||||||
}
|
|
||||||
|
|
||||||
func CutForSearch(sentence string, hmm bool) []string {
|
for _, x := range ssf(blk) {
|
||||||
result := make([]string, 0)
|
if reSkip.MatchString(x) {
|
||||||
words := Cut(sentence, false, hmm)
|
result <- x
|
||||||
for _, word := range words {
|
} else if !isCutAll {
|
||||||
runes := []rune(word)
|
for _, xx := range x {
|
||||||
for _, increment := range []int{2, 3} {
|
result <- string(xx)
|
||||||
if len(runes) > increment {
|
}
|
||||||
var gram2 string
|
} else {
|
||||||
for i := 0; i < len(runes)-increment+1; i++ {
|
result <- x
|
||||||
gram2 = string(runes[i : i+increment])
|
|
||||||
if v, ok := Trie.Freq[gram2]; ok && v > 0.0 {
|
|
||||||
result = append(result, gram2)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
result = append(result, word)
|
close(result)
|
||||||
}
|
}()
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
func CutForSearch(sentence string, hmm bool) chan string {
|
||||||
|
result := make(chan string)
|
||||||
|
go func() {
|
||||||
|
for word := range Cut(sentence, false, hmm) {
|
||||||
|
runes := []rune(word)
|
||||||
|
for _, increment := range []int{2, 3} {
|
||||||
|
if len(runes) > increment {
|
||||||
|
var gram2 string
|
||||||
|
for i := 0; i < len(runes)-increment+1; i++ {
|
||||||
|
gram2 = string(runes[i : i+increment])
|
||||||
|
if v, ok := Trie.Freq[gram2]; ok && v > 0.0 {
|
||||||
|
result <- gram2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result <- word
|
||||||
|
}
|
||||||
|
close(result)
|
||||||
|
}()
|
||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -648,10 +648,18 @@ func TestRegexpSplit(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func chanToArray(ch chan string) []string {
|
||||||
|
result := make([]string, 0)
|
||||||
|
for word := range ch {
|
||||||
|
result = append(result, word)
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
func TestDefaultCut(t *testing.T) {
|
func TestDefaultCut(t *testing.T) {
|
||||||
var result []string
|
var result []string
|
||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
result = Cut(content, false, true)
|
result = chanToArray(Cut(content, false, true))
|
||||||
if len(result) != len(defaultCutResult[index]) {
|
if len(result) != len(defaultCutResult[index]) {
|
||||||
t.Errorf("default cut for %s length should be %d not %d\n",
|
t.Errorf("default cut for %s length should be %d not %d\n",
|
||||||
content, len(defaultCutResult[index]), len(result))
|
content, len(defaultCutResult[index]), len(result))
|
||||||
@@ -667,7 +675,7 @@ func TestDefaultCut(t *testing.T) {
|
|||||||
func TestCutAll(t *testing.T) {
|
func TestCutAll(t *testing.T) {
|
||||||
var result []string
|
var result []string
|
||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
result = Cut(content, true, true)
|
result = chanToArray(Cut(content, true, true))
|
||||||
if len(result) != len(cutAllResult[index]) {
|
if len(result) != len(cutAllResult[index]) {
|
||||||
t.Errorf("cut all for %s length should be %d not %d\n",
|
t.Errorf("cut all for %s length should be %d not %d\n",
|
||||||
content, len(cutAllResult[index]), len(result))
|
content, len(cutAllResult[index]), len(result))
|
||||||
@@ -683,7 +691,7 @@ func TestCutAll(t *testing.T) {
|
|||||||
func TestDefaultCutNoHMM(t *testing.T) {
|
func TestDefaultCutNoHMM(t *testing.T) {
|
||||||
var result []string
|
var result []string
|
||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
result = Cut(content, false, false)
|
result = chanToArray(Cut(content, false, false))
|
||||||
if len(result) != len(defaultCutNoHMMResult[index]) {
|
if len(result) != len(defaultCutNoHMMResult[index]) {
|
||||||
t.Errorf("default cut no hmm for %s length should be %d not %d\n",
|
t.Errorf("default cut no hmm for %s length should be %d not %d\n",
|
||||||
content, len(defaultCutNoHMMResult[index]), len(result))
|
content, len(defaultCutNoHMMResult[index]), len(result))
|
||||||
@@ -699,7 +707,7 @@ func TestDefaultCutNoHMM(t *testing.T) {
|
|||||||
func TestCutForSearch(t *testing.T) {
|
func TestCutForSearch(t *testing.T) {
|
||||||
var result []string
|
var result []string
|
||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
result = CutForSearch(content, true)
|
result = chanToArray(CutForSearch(content, true))
|
||||||
if len(result) != len(cutForSearchResult[index]) {
|
if len(result) != len(cutForSearchResult[index]) {
|
||||||
t.Errorf("cut for search for %s length should be %d not %d\n",
|
t.Errorf("cut for search for %s length should be %d not %d\n",
|
||||||
content, len(cutForSearchResult[index]), len(result))
|
content, len(cutForSearchResult[index]), len(result))
|
||||||
@@ -711,7 +719,7 @@ func TestCutForSearch(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
result = CutForSearch(content, false)
|
result = chanToArray(CutForSearch(content, false))
|
||||||
if len(result) != len(cutForSearchNoHMMResult[index]) {
|
if len(result) != len(cutForSearchNoHMMResult[index]) {
|
||||||
t.Errorf("cut for search no hmm for %s length should be %d not %d\n",
|
t.Errorf("cut for search no hmm for %s length should be %d not %d\n",
|
||||||
content, len(cutForSearchNoHMMResult[index]), len(result))
|
content, len(cutForSearchNoHMMResult[index]), len(result))
|
||||||
@@ -728,7 +736,7 @@ func TestSetdictionary(t *testing.T) {
|
|||||||
var result []string
|
var result []string
|
||||||
SetDictionary("foobar.txt")
|
SetDictionary("foobar.txt")
|
||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
result = Cut(content, false, true)
|
result = chanToArray(Cut(content, false, true))
|
||||||
if len(result) != len(userDictCutResult[index]) {
|
if len(result) != len(userDictCutResult[index]) {
|
||||||
t.Errorf("default cut with user dictionary for %s length should be %d not %d\n",
|
t.Errorf("default cut with user dictionary for %s length should be %d not %d\n",
|
||||||
content, len(userDictCutResult[index]), len(result))
|
content, len(userDictCutResult[index]), len(result))
|
||||||
@@ -748,7 +756,7 @@ func TestLoadUserDict(t *testing.T) {
|
|||||||
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
||||||
result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", ",", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"}
|
result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", ",", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"}
|
||||||
|
|
||||||
words := Cut(sentence, false, true)
|
words := chanToArray(Cut(sentence, false, true))
|
||||||
if len(words) != len(result) {
|
if len(words) != len(result) {
|
||||||
t.Error(len(words))
|
t.Error(len(words))
|
||||||
}
|
}
|
||||||
@@ -760,7 +768,7 @@ func TestLoadUserDict(t *testing.T) {
|
|||||||
|
|
||||||
sentence = "easy_install is great"
|
sentence = "easy_install is great"
|
||||||
result = []string{"easy_install", " ", "is", " ", "great"}
|
result = []string{"easy_install", " ", "is", " ", "great"}
|
||||||
words = Cut(sentence, false, true)
|
words = chanToArray(Cut(sentence, false, true))
|
||||||
if len(words) != len(result) {
|
if len(words) != len(result) {
|
||||||
t.Error(len(words))
|
t.Error(len(words))
|
||||||
}
|
}
|
||||||
@@ -772,7 +780,7 @@ func TestLoadUserDict(t *testing.T) {
|
|||||||
|
|
||||||
sentence = "python 的正则表达式是好用的"
|
sentence = "python 的正则表达式是好用的"
|
||||||
result = []string{"python", " ", "的", "正则表达式", "是", "好用", "的"}
|
result = []string{"python", " ", "的", "正则表达式", "是", "好用", "的"}
|
||||||
words = Cut(sentence, false, true)
|
words = chanToArray(Cut(sentence, false, true))
|
||||||
if len(words) != len(result) {
|
if len(words) != len(result) {
|
||||||
t.Error(words)
|
t.Error(words)
|
||||||
t.Error(result)
|
t.Error(result)
|
||||||
|
|||||||
@@ -10,14 +10,13 @@ func Tokenize(sentence string, mode string, HMM bool) []Token {
|
|||||||
tokens := make([]Token, 0)
|
tokens := make([]Token, 0)
|
||||||
start := 0
|
start := 0
|
||||||
var width int
|
var width int
|
||||||
if mode == "default" {
|
for word := range Cut(sentence, false, HMM) {
|
||||||
for _, word := range Cut(sentence, false, HMM) {
|
if mode == "default" {
|
||||||
width = len([]rune(word))
|
width = len([]rune(word))
|
||||||
tokens = append(tokens, Token{word, start, start + width})
|
tokens = append(tokens, Token{word, start, start + width})
|
||||||
start += width
|
start += width
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
for _, word := range Cut(sentence, false, HMM) {
|
|
||||||
runes := []rune(word)
|
runes := []rune(word)
|
||||||
width = len(runes)
|
width = len(runes)
|
||||||
for _, step := range []int{2, 3} {
|
for _, step := range []int{2, 3} {
|
||||||
|
|||||||
Reference in New Issue
Block a user