1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-25 22:47:08 +08:00

unify Cut method, return channel instead of array

This commit is contained in:
Wang Bin
2015-02-27 17:56:26 +08:00
parent 43480db509
commit 00fae2358d

View File

@@ -37,185 +37,198 @@ func SetDictionary(dictFileName string) error {
return nil return nil
} }
func cutDetailInternal(sentence string) []WordTag { func cutDetailInternal(sentence string) chan WordTag {
result := make([]WordTag, 0) result := make(chan WordTag)
runes := []rune(sentence)
_, posList := Viterbi(runes) go func() {
begin := 0 runes := []rune(sentence)
next := 0 _, posList := Viterbi(runes)
for i, char := range runes { begin := 0
pos := posList[i].State next := 0
switch pos { for i, char := range runes {
case 'B': pos := posList[i].State
begin = i switch pos {
case 'E': case 'B':
result = append(result, WordTag{string(runes[begin : i+1]), posList[i].Tag}) begin = i
next = i + 1 case 'E':
case 'S': result <- WordTag{string(runes[begin : i+1]), posList[i].Tag}
result = append(result, WordTag{string(char), posList[i].Tag}) next = i + 1
next = i + 1 case 'S':
result <- WordTag{string(char), posList[i].Tag}
next = i + 1
}
} }
} if next < len(runes) {
if next < len(runes) { result <- WordTag{string(runes[next:]), posList[next].Tag}
result = append(result, WordTag{string(runes[next:]), posList[next].Tag}) }
} close(result)
}()
return result return result
} }
func cutDetail(sentence string) []WordTag { func cutDetail(sentence string) chan WordTag {
result := make([]WordTag, 0) result := make(chan WordTag)
blocks := jiebago.RegexpSplit(reHanDetail, sentence)
for _, blk := range blocks { go func() {
if reHanDetail.MatchString(blk) { blocks := jiebago.RegexpSplit(reHanDetail, sentence)
for _, wordTag := range cutDetailInternal(blk) { for _, blk := range blocks {
result = append(result, wordTag) if reHanDetail.MatchString(blk) {
} for wordTag := range cutDetailInternal(blk) {
} else { result <- wordTag
for _, x := range jiebago.RegexpSplit(reSkipDetail, blk) {
if len(x) == 0 {
continue
} }
switch { } else {
case reNum.MatchString(x): for _, x := range jiebago.RegexpSplit(reSkipDetail, blk) {
result = append(result, WordTag{x, "m"}) if len(x) == 0 {
case reEng.MatchString(x): continue
result = append(result, WordTag{x, "eng"})
default:
result = append(result, WordTag{x, "x"})
}
}
}
}
return result
}
type cutFunc func(sentence string) []WordTag
func cutDAG(sentence string) []WordTag {
dag := jiebago.DAG(sentence)
routes := jiebago.Calc(sentence, dag)
x := 0
var y int
runes := []rune(sentence)
length := len(runes)
result := make([]WordTag, 0)
buf := make([]rune, 0)
for {
if x >= length {
break
}
y = routes[x].Index + 1
l_word := runes[x:y]
if y-x == 1 {
buf = append(buf, l_word...)
} else {
if len(buf) > 0 {
if len(buf) == 1 {
sbuf := string(buf)
if tag, ok := wordTagMap[sbuf]; ok {
result = append(result, WordTag{sbuf, tag})
} else {
result = append(result, WordTag{sbuf, "x"})
} }
buf = make([]rune, 0) switch {
} else { case reNum.MatchString(x):
bufString := string(buf) result <- WordTag{x, "m"}
if v, ok := jiebago.Trie.Freq[bufString]; !ok || v == 0.0 { case reEng.MatchString(x):
recognized := cutDetail(bufString) result <- WordTag{x, "eng"}
for _, t := range recognized { default:
result = append(result, t) result <- WordTag{x, "x"}
}
}
}
}
close(result)
}()
return result
}
type cutFunc func(sentence string) chan WordTag
func cutDAG(sentence string) chan WordTag {
result := make(chan WordTag)
go func() {
dag := jiebago.DAG(sentence)
routes := jiebago.Calc(sentence, dag)
x := 0
var y int
runes := []rune(sentence)
length := len(runes)
buf := make([]rune, 0)
for {
if x >= length {
break
}
y = routes[x].Index + 1
l_word := runes[x:y]
if y-x == 1 {
buf = append(buf, l_word...)
} else {
if len(buf) > 0 {
if len(buf) == 1 {
sbuf := string(buf)
if tag, ok := wordTagMap[sbuf]; ok {
result <- WordTag{sbuf, tag}
} else {
result <- WordTag{sbuf, "x"}
} }
buf = make([]rune, 0)
} else { } else {
for _, elem := range buf { bufString := string(buf)
selem := string(elem) if v, ok := jiebago.Trie.Freq[bufString]; !ok || v == 0.0 {
if tag, ok := wordTagMap[selem]; ok { for t := range cutDetail(bufString) {
result = append(result, WordTag{string(elem), tag}) result <- t
} else {
result = append(result, WordTag{string(elem), "x"})
} }
} else {
for _, elem := range buf {
selem := string(elem)
if tag, ok := wordTagMap[selem]; ok {
result <- WordTag{string(elem), tag}
} else {
result <- WordTag{string(elem), "x"}
}
}
}
buf = make([]rune, 0)
}
}
sl_word := string(l_word)
if tag, ok := wordTagMap[sl_word]; ok {
result <- WordTag{sl_word, tag}
} else {
result <- WordTag{sl_word, "x"}
}
}
x = y
}
if len(buf) > 0 {
if len(buf) == 1 {
sbuf := string(buf)
if tag, ok := wordTagMap[sbuf]; ok {
result <- WordTag{sbuf, tag}
} else {
result <- WordTag{sbuf, "x"}
}
} else {
bufString := string(buf)
if v, ok := jiebago.Trie.Freq[bufString]; !ok || v == 0.0 {
for t := range cutDetail(bufString) {
result <- t
}
} else {
for _, elem := range buf {
selem := string(elem)
if tag, ok := wordTagMap[selem]; ok {
result <- WordTag{selem, tag}
} else {
result <- WordTag{selem, "x"}
} }
} }
buf = make([]rune, 0)
}
}
sl_word := string(l_word)
if tag, ok := wordTagMap[sl_word]; ok {
result = append(result, WordTag{sl_word, tag})
} else {
result = append(result, WordTag{sl_word, "x"})
}
}
x = y
}
if len(buf) > 0 {
if len(buf) == 1 {
sbuf := string(buf)
if tag, ok := wordTagMap[sbuf]; ok {
result = append(result, WordTag{sbuf, tag})
} else {
result = append(result, WordTag{sbuf, "x"})
}
} else {
bufString := string(buf)
if v, ok := jiebago.Trie.Freq[bufString]; !ok || v == 0.0 {
recognized := cutDetail(bufString)
for _, t := range recognized {
result = append(result, t)
}
} else {
for _, elem := range buf {
selem := string(elem)
if tag, ok := wordTagMap[selem]; ok {
result = append(result, WordTag{selem, tag})
} else {
result = append(result, WordTag{selem, "x"})
}
} }
} }
} }
} close(result)
}()
return result return result
} }
func cutDAGNoHMM(sentence string) []WordTag { func cutDAGNoHMM(sentence string) chan WordTag {
result := make([]WordTag, 0) result := make(chan WordTag)
dag := jiebago.DAG(sentence)
routes := jiebago.Calc(sentence, dag) go func() {
x := 0 dag := jiebago.DAG(sentence)
var y int routes := jiebago.Calc(sentence, dag)
runes := []rune(sentence) x := 0
length := len(runes) var y int
buf := make([]rune, 0) runes := []rune(sentence)
for { length := len(runes)
if x >= length { buf := make([]rune, 0)
break for {
} if x >= length {
y = routes[x].Index + 1 break
l_word := runes[x:y]
if reEng1.MatchString(string(l_word)) && len(l_word) == 1 {
buf = append(buf, l_word...)
x = y
} else {
if len(buf) > 0 {
result = append(result, WordTag{string(buf), "eng"})
buf = make([]rune, 0)
} }
sl_word := string(l_word) y = routes[x].Index + 1
if tag, ok := wordTagMap[sl_word]; ok { l_word := runes[x:y]
result = append(result, WordTag{sl_word, tag}) if reEng1.MatchString(string(l_word)) && len(l_word) == 1 {
buf = append(buf, l_word...)
x = y
} else { } else {
result = append(result, WordTag{sl_word, "x"}) if len(buf) > 0 {
result <- WordTag{string(buf), "eng"}
buf = make([]rune, 0)
}
sl_word := string(l_word)
if tag, ok := wordTagMap[sl_word]; ok {
result <- WordTag{sl_word, tag}
} else {
result <- WordTag{sl_word, "x"}
}
x = y
} }
x = y
} }
} if len(buf) > 0 {
if len(buf) > 0 { result <- WordTag{string(buf), "eng"}
result = append(result, WordTag{string(buf), "eng"}) buf = make([]rune, 0)
buf = make([]rune, 0) }
} close(result)
}()
return result return result
} }
@@ -235,7 +248,7 @@ func Cut(sentence string, HMM bool) chan WordTag {
go func() { go func() {
for _, blk := range blocks { for _, blk := range blocks {
if reHanInternal.MatchString(blk) { if reHanInternal.MatchString(blk) {
for _, wordTag := range cut(blk) { for wordTag := range cut(blk) {
result <- wordTag result <- wordTag
} }
} else { } else {