mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 08:40:36 +08:00
refactor Cut function, make CutAll a separate function, to simplify the logic of Cut function
This commit is contained in:
@@ -1,11 +0,0 @@
|
||||
package jiebago
|
||||
|
||||
type Entry struct {
|
||||
Word string
|
||||
Flag string
|
||||
Freq float64
|
||||
}
|
||||
|
||||
type DictLoader interface {
|
||||
AddEntry(Entry)
|
||||
}
|
||||
83
jieba.go
83
jieba.go
@@ -86,11 +86,10 @@ func (j *Jieba) DAG(sentence string) map[int][]int {
|
||||
dag := make(map[int][]int)
|
||||
runes := []rune(sentence)
|
||||
n := len(runes)
|
||||
i := 0
|
||||
var frag string
|
||||
for k := 0; k < n; k++ {
|
||||
tmpList := make([]int, 0)
|
||||
i = k
|
||||
i := k
|
||||
frag = string(runes[k])
|
||||
for {
|
||||
if freq, ok := j.Freq[frag]; !ok {
|
||||
@@ -284,63 +283,31 @@ which is suitable for text analysis.
|
||||
|
||||
HMM controls whether to use the Hidden Markov Model.
|
||||
*/
|
||||
func (j *Jieba) Cut(sentence string, isCutAll bool, HMM bool) chan string {
|
||||
func (j *Jieba) Cut(sentence string, hmm bool) chan string {
|
||||
result := make(chan string)
|
||||
var cut cutFunc
|
||||
if hmm {
|
||||
cut = j.cutDAG
|
||||
} else {
|
||||
cut = j.cutDAGNoHMM
|
||||
}
|
||||
go func() {
|
||||
var reHan, reSkip *regexp.Regexp
|
||||
if isCutAll {
|
||||
reHan = reHanCutAll
|
||||
reSkip = reSkipCutAll
|
||||
} else {
|
||||
reHan = reHanDefault
|
||||
reSkip = reSkipDefault
|
||||
}
|
||||
var cut cutFunc
|
||||
if HMM {
|
||||
cut = j.cutDAG
|
||||
} else {
|
||||
cut = j.cutDAGNoHMM
|
||||
}
|
||||
if isCutAll {
|
||||
cut = j.cutAll
|
||||
}
|
||||
for blk := range RegexpSplit(reHan, sentence) {
|
||||
for blk := range RegexpSplit(reHanDefault, sentence) {
|
||||
if len(blk) == 0 {
|
||||
continue
|
||||
}
|
||||
if reHan.MatchString(blk) {
|
||||
if reHanDefault.MatchString(blk) {
|
||||
for x := range cut(blk) {
|
||||
result <- x
|
||||
}
|
||||
} else {
|
||||
type skipSplitFunc func(sentence string) chan string
|
||||
var ssf skipSplitFunc
|
||||
if isCutAll {
|
||||
ssf = func(sentence string) chan string {
|
||||
ch := make(chan string)
|
||||
go func() {
|
||||
for _, s := range reSkip.Split(sentence, -1) {
|
||||
ch <- s
|
||||
}
|
||||
close(ch)
|
||||
}()
|
||||
return ch
|
||||
}
|
||||
} else {
|
||||
ssf = func(sentence string) chan string {
|
||||
return RegexpSplit(reSkip, sentence)
|
||||
}
|
||||
}
|
||||
|
||||
for x := range ssf(blk) {
|
||||
if reSkip.MatchString(x) {
|
||||
for x := range RegexpSplit(reSkipDefault, blk) {
|
||||
if reSkipDefault.MatchString(x) {
|
||||
result <- x
|
||||
} else if !isCutAll {
|
||||
} else {
|
||||
for _, xx := range x {
|
||||
result <- string(xx)
|
||||
}
|
||||
} else {
|
||||
result <- x
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -350,13 +317,35 @@ func (j *Jieba) Cut(sentence string, isCutAll bool, HMM bool) chan string {
|
||||
return result
|
||||
}
|
||||
|
||||
func (j *Jieba) CutAll(sentence string) chan string {
|
||||
result := make(chan string)
|
||||
go func() {
|
||||
for blk := range RegexpSplit(reHanCutAll, sentence) {
|
||||
if len(blk) == 0 {
|
||||
continue
|
||||
}
|
||||
if reHanCutAll.MatchString(blk) {
|
||||
for x := range j.cutAll(blk) {
|
||||
result <- x
|
||||
}
|
||||
} else {
|
||||
for _, x := range reSkipCutAll.Split(blk, -1) {
|
||||
result <- x
|
||||
}
|
||||
}
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
return result
|
||||
}
|
||||
|
||||
// Cut sentence using Search Engine Mode, based on the Accurate Mode, attempts
|
||||
// to cut long words into several short words, which can raise the recall rate.
|
||||
// Suitable for search engines.
|
||||
func (j *Jieba) CutForSearch(sentence string, hmm bool) chan string {
|
||||
result := make(chan string)
|
||||
go func() {
|
||||
for word := range j.Cut(sentence, false, hmm) {
|
||||
for word := range j.Cut(sentence, hmm) {
|
||||
runes := []rune(word)
|
||||
for _, increment := range []int{2, 3} {
|
||||
if len(runes) > increment {
|
||||
|
||||
@@ -661,7 +661,7 @@ func TestDefaultCut(t *testing.T) {
|
||||
|
||||
var result []string
|
||||
for index, content := range test_contents {
|
||||
result = chanToArray(j.Cut(content, false, true))
|
||||
result = chanToArray(j.Cut(content, true))
|
||||
if len(result) != len(defaultCutResult[index]) {
|
||||
t.Errorf("default cut for %s length should be %d not %d\n",
|
||||
content, len(defaultCutResult[index]), len(result))
|
||||
@@ -679,7 +679,7 @@ func TestCutAll(t *testing.T) {
|
||||
|
||||
var result []string
|
||||
for index, content := range test_contents {
|
||||
result = chanToArray(j.Cut(content, true, true))
|
||||
result = chanToArray(j.CutAll(content))
|
||||
if len(result) != len(cutAllResult[index]) {
|
||||
t.Errorf("cut all for %s length should be %d not %d\n",
|
||||
content, len(cutAllResult[index]), len(result))
|
||||
@@ -697,7 +697,7 @@ func TestDefaultCutNoHMM(t *testing.T) {
|
||||
|
||||
var result []string
|
||||
for index, content := range test_contents {
|
||||
result = chanToArray(j.Cut(content, false, false))
|
||||
result = chanToArray(j.Cut(content, false))
|
||||
if len(result) != len(defaultCutNoHMMResult[index]) {
|
||||
t.Errorf("default cut no hmm for %s length should be %d not %d\n",
|
||||
content, len(defaultCutNoHMMResult[index]), len(result))
|
||||
@@ -744,7 +744,7 @@ func TestSetdictionary(t *testing.T) {
|
||||
var result []string
|
||||
j, _ := NewJieba("foobar.txt")
|
||||
for index, content := range test_contents {
|
||||
result = chanToArray(j.Cut(content, false, true))
|
||||
result = chanToArray(j.Cut(content, true))
|
||||
if len(result) != len(userDictCutResult[index]) {
|
||||
t.Errorf("default cut with user dictionary for %s length should be %d not %d\n",
|
||||
content, len(userDictCutResult[index]), len(result))
|
||||
@@ -764,7 +764,7 @@ func TestLoadUserDict(t *testing.T) {
|
||||
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
||||
result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", ",", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"}
|
||||
|
||||
words := chanToArray(j.Cut(sentence, false, true))
|
||||
words := chanToArray(j.Cut(sentence, true))
|
||||
if len(words) != len(result) {
|
||||
t.Error(len(words))
|
||||
}
|
||||
@@ -776,7 +776,7 @@ func TestLoadUserDict(t *testing.T) {
|
||||
|
||||
sentence = "easy_install is great"
|
||||
result = []string{"easy_install", " ", "is", " ", "great"}
|
||||
words = chanToArray(j.Cut(sentence, false, true))
|
||||
words = chanToArray(j.Cut(sentence, true))
|
||||
if len(words) != len(result) {
|
||||
t.Error(len(words))
|
||||
}
|
||||
@@ -788,7 +788,7 @@ func TestLoadUserDict(t *testing.T) {
|
||||
|
||||
sentence = "python 的正则表达式是好用的"
|
||||
result = []string{"python", " ", "的", "正则表达式", "是", "好用", "的"}
|
||||
words = chanToArray(j.Cut(sentence, false, true))
|
||||
words = chanToArray(j.Cut(sentence, true))
|
||||
if len(words) != len(result) {
|
||||
t.Error(words)
|
||||
t.Error(result)
|
||||
|
||||
@@ -35,7 +35,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||
pos := 1
|
||||
var width int
|
||||
var gram string
|
||||
for word := range jt.j.Cut(string(input), false, jt.hmm) {
|
||||
for word := range jt.j.Cut(string(input), jt.hmm) {
|
||||
if jt.searchMode {
|
||||
runes := []rune(word)
|
||||
width = len(runes)
|
||||
|
||||
52
util.go
52
util.go
@@ -1,61 +1,9 @@
|
||||
package jiebago
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func dictPath(dictFileName string) (string, error) {
|
||||
if filepath.IsAbs(dictFileName) {
|
||||
return dictFileName, nil
|
||||
}
|
||||
var dictFilePath string
|
||||
cwd, err := os.Getwd()
|
||||
if err != nil {
|
||||
return dictFilePath, err
|
||||
}
|
||||
dictFilePath = filepath.Clean(filepath.Join(cwd, dictFileName))
|
||||
return dictFilePath, nil
|
||||
}
|
||||
|
||||
func LoadDict(l DictLoader, dictFileName string, usingFlag bool) error {
|
||||
dictFilePath, err := dictPath(dictFileName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
dictFile, err := os.Open(dictFilePath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer dictFile.Close()
|
||||
|
||||
scanner := bufio.NewScanner(dictFile)
|
||||
var entry Entry
|
||||
var line string
|
||||
var fields []string
|
||||
for scanner.Scan() {
|
||||
line = scanner.Text()
|
||||
fields = strings.Split(line, " ")
|
||||
entry.Word = strings.Replace(fields[0], "\ufeff", "", 1)
|
||||
if length := len(fields); length > 1 {
|
||||
entry.Freq, err = strconv.ParseFloat(fields[1], 64)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if usingFlag && length > 2 {
|
||||
entry.Flag = fields[2]
|
||||
}
|
||||
}
|
||||
l.AddEntry(entry)
|
||||
}
|
||||
return scanner.Err()
|
||||
}
|
||||
|
||||
// Split sentence using regular expression.
|
||||
func RegexpSplit(r *regexp.Regexp, sentence string) chan string {
|
||||
result := make(chan string)
|
||||
|
||||
Reference in New Issue
Block a user