mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 00:32:51 +08:00
优化
This commit is contained in:
1
.gitignore
vendored
Executable file
1
.gitignore
vendored
Executable file
@@ -0,0 +1 @@
|
||||
tokenizers/jieba.beleve/
|
||||
@@ -1,3 +0,0 @@
|
||||
language: go
|
||||
go:
|
||||
- 1.4.2
|
||||
8
README.md
Normal file → Executable file
8
README.md
Normal file → Executable file
@@ -1,7 +1,7 @@
|
||||
# 结巴分词 Go 语言版:Jiebago
|
||||
|
||||
|
||||
[](https://travis-ci.org/wangbin/jiebago) [](https://godoc.org/github.com/wangbin/jiebago)
|
||||
[](https://travis-ci.org/wangbin/jiebago) [](https://godoc.org/github.com/fumiama/jieba)
|
||||
|
||||
[结巴分词](https://github.com/fxsjy/jieba) 是由 [@fxsjy](https://github.com/fxsjy) 使用 Python 编写的中文分词组件,Jiebago 是结巴分词的 Golang 语言实现。
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
## 安装
|
||||
|
||||
```
|
||||
go get github.com/wangbin/jiebago/...
|
||||
go get github.com/fumiama/jieba/...
|
||||
```
|
||||
|
||||
## 使用
|
||||
@@ -20,7 +20,7 @@ package main
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/wangbin/jiebago"
|
||||
"github.com/fumiama/jieba"
|
||||
)
|
||||
|
||||
var seg jiebago.Segmenter
|
||||
@@ -62,7 +62,7 @@ func Example() {
|
||||
【搜索引擎模式】: 小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / , / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 /
|
||||
```
|
||||
|
||||
更多信息请参考[文档](https://godoc.org/github.com/wangbin/jiebago)。
|
||||
更多信息请参考[文档](https://godoc.org/github.com/fumiama/jieba)。
|
||||
|
||||
## 分词速度
|
||||
|
||||
|
||||
2
analyse/example_test.go
Normal file → Executable file
2
analyse/example_test.go
Normal file → Executable file
@@ -3,7 +3,7 @@ package analyse_test
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/wangbin/jiebago/analyse"
|
||||
"github.com/fumiama/jieba/analyse"
|
||||
)
|
||||
|
||||
func Example_extractTags() {
|
||||
|
||||
2
analyse/idf.go
Normal file → Executable file
2
analyse/idf.go
Normal file → Executable file
@@ -4,7 +4,7 @@ import (
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
"github.com/wangbin/jiebago/dictionary"
|
||||
"github.com/fumiama/jieba/dictionary"
|
||||
)
|
||||
|
||||
// Idf represents a thread-safe dictionary for all words with their
|
||||
|
||||
0
analyse/idf.txt
Normal file → Executable file
0
analyse/idf.txt
Normal file → Executable file
0
analyse/stop_words.txt
Normal file → Executable file
0
analyse/stop_words.txt
Normal file → Executable file
2
analyse/stopwords.go
Normal file → Executable file
2
analyse/stopwords.go
Normal file → Executable file
@@ -3,7 +3,7 @@ package analyse
|
||||
import (
|
||||
"sync"
|
||||
|
||||
"github.com/wangbin/jiebago/dictionary"
|
||||
"github.com/fumiama/jieba/dictionary"
|
||||
)
|
||||
|
||||
// DefaultStopWordMap contains some stop words.
|
||||
|
||||
4
analyse/tag_extracker.go
Normal file → Executable file
4
analyse/tag_extracker.go
Normal file → Executable file
@@ -6,7 +6,7 @@ import (
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/wangbin/jiebago"
|
||||
jiebago "github.com/fumiama/jieba"
|
||||
)
|
||||
|
||||
// Segment represents a word with weight.
|
||||
@@ -74,7 +74,7 @@ func (t *TagExtracter) LoadStopWords(fileName string) error {
|
||||
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
|
||||
freqMap := make(map[string]float64)
|
||||
|
||||
for w := range t.seg.Cut(sentence, true) {
|
||||
for _, w := range t.seg.Cut(sentence, true) {
|
||||
w = strings.TrimSpace(w)
|
||||
if utf8.RuneCountInString(w) < 2 {
|
||||
continue
|
||||
|
||||
0
analyse/tag_extracker_test.go
Normal file → Executable file
0
analyse/tag_extracker_test.go
Normal file → Executable file
2
analyse/textrank.go
Normal file → Executable file
2
analyse/textrank.go
Normal file → Executable file
@@ -4,7 +4,7 @@ import (
|
||||
"math"
|
||||
"sort"
|
||||
|
||||
"github.com/wangbin/jiebago/posseg"
|
||||
"github.com/fumiama/jieba/posseg"
|
||||
)
|
||||
|
||||
const dampingFactor = 0.85
|
||||
|
||||
0
analyse/textrank_test.go
Normal file → Executable file
0
analyse/textrank_test.go
Normal file → Executable file
2
dictionary.go
Normal file → Executable file
2
dictionary.go
Normal file → Executable file
@@ -4,7 +4,7 @@ import (
|
||||
"math"
|
||||
"sync"
|
||||
|
||||
"github.com/wangbin/jiebago/dictionary"
|
||||
"github.com/fumiama/jieba/dictionary"
|
||||
)
|
||||
|
||||
// A Dictionary represents a thread-safe dictionary used for word segmentation.
|
||||
|
||||
0
dictionary/dictionary.go
Normal file → Executable file
0
dictionary/dictionary.go
Normal file → Executable file
0
dictionary/dictionary_test.go
Normal file → Executable file
0
dictionary/dictionary_test.go
Normal file → Executable file
0
dictionary/token.go
Normal file → Executable file
0
dictionary/token.go
Normal file → Executable file
11
example_parallel_cut_test.go
Normal file → Executable file
11
example_parallel_cut_test.go
Normal file → Executable file
@@ -1,4 +1,4 @@
|
||||
package jiebago_test
|
||||
package jiebago
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
@@ -8,8 +8,6 @@ import (
|
||||
"runtime"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/wangbin/jiebago"
|
||||
)
|
||||
|
||||
type line struct {
|
||||
@@ -18,7 +16,7 @@ type line struct {
|
||||
}
|
||||
|
||||
var (
|
||||
segmenter = jiebago.Segmenter{}
|
||||
segmenter = Segmenter{}
|
||||
numThreads = runtime.NumCPU()
|
||||
task = make(chan line, numThreads)
|
||||
result = make(chan line, numThreads)
|
||||
@@ -26,10 +24,7 @@ var (
|
||||
|
||||
func worker() {
|
||||
for l := range task {
|
||||
var segments []string
|
||||
for segment := range segmenter.Cut(l.text, true) {
|
||||
segments = append(segments, segment)
|
||||
}
|
||||
segments := segmenter.Cut(l.text, true)
|
||||
|
||||
l.text = fmt.Sprintf("%s\n", strings.Join(segments, " / "))
|
||||
result <- l
|
||||
|
||||
53
example_test.go
Normal file → Executable file
53
example_test.go
Normal file → Executable file
@@ -1,33 +1,24 @@
|
||||
package jiebago_test
|
||||
package jiebago
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/wangbin/jiebago"
|
||||
)
|
||||
|
||||
func Example() {
|
||||
var seg jiebago.Segmenter
|
||||
var seg Segmenter
|
||||
seg.LoadDictionary("dict.txt")
|
||||
|
||||
print := func(ch <-chan string) {
|
||||
for word := range ch {
|
||||
fmt.Printf(" %s /", word)
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
fmt.Print("【全模式】:")
|
||||
print(seg.CutAll("我来到北京清华大学"))
|
||||
fmt.Println(seg.CutAll("我来到北京清华大学"))
|
||||
|
||||
fmt.Print("【精确模式】:")
|
||||
print(seg.Cut("我来到北京清华大学", false))
|
||||
fmt.Println(seg.Cut("我来到北京清华大学", false))
|
||||
|
||||
fmt.Print("【新词识别】:")
|
||||
print(seg.Cut("他来到了网易杭研大厦", true))
|
||||
fmt.Println(seg.Cut("他来到了网易杭研大厦", true))
|
||||
|
||||
fmt.Print("【搜索引擎模式】:")
|
||||
print(seg.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", true))
|
||||
fmt.Println(seg.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", true))
|
||||
// Output:
|
||||
// 【全模式】: 我 / 来到 / 北京 / 清华 / 清华大学 / 华大 / 大学 /
|
||||
// 【精确模式】: 我 / 来到 / 北京 / 清华大学 /
|
||||
@@ -36,47 +27,41 @@ func Example() {
|
||||
}
|
||||
|
||||
func Example_suggestFrequency() {
|
||||
var seg jiebago.Segmenter
|
||||
var seg Segmenter
|
||||
seg.LoadDictionary("dict.txt")
|
||||
|
||||
print := func(ch <-chan string) {
|
||||
for word := range ch {
|
||||
fmt.Printf(" %s /", word)
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
sentence := "超敏C反应蛋白是什么?"
|
||||
fmt.Print("Before:")
|
||||
print(seg.Cut(sentence, false))
|
||||
fmt.Println(seg.Cut(sentence, false))
|
||||
word := "超敏C反应蛋白"
|
||||
oldFrequency, _ := seg.Frequency(word)
|
||||
frequency := seg.SuggestFrequency(word)
|
||||
fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency)
|
||||
seg.AddWord(word, frequency)
|
||||
fmt.Print("After:")
|
||||
print(seg.Cut(sentence, false))
|
||||
fmt.Println(seg.Cut(sentence, false))
|
||||
|
||||
sentence = "如果放到post中将出错"
|
||||
fmt.Print("Before:")
|
||||
print(seg.Cut(sentence, false))
|
||||
fmt.Println(seg.Cut(sentence, false))
|
||||
word = "中将"
|
||||
oldFrequency, _ = seg.Frequency(word)
|
||||
frequency = seg.SuggestFrequency("中", "将")
|
||||
fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency)
|
||||
seg.AddWord(word, frequency)
|
||||
fmt.Print("After:")
|
||||
print(seg.Cut(sentence, false))
|
||||
fmt.Println(seg.Cut(sentence, false))
|
||||
|
||||
sentence = "今天天气不错"
|
||||
fmt.Print("Before:")
|
||||
print(seg.Cut(sentence, false))
|
||||
fmt.Println(seg.Cut(sentence, false))
|
||||
word = "今天天气"
|
||||
oldFrequency, _ = seg.Frequency(word)
|
||||
frequency = seg.SuggestFrequency("今天", "天气")
|
||||
fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency)
|
||||
seg.AddWord(word, frequency)
|
||||
fmt.Print("After:")
|
||||
print(seg.Cut(sentence, false))
|
||||
fmt.Println(seg.Cut(sentence, false))
|
||||
// Output:
|
||||
// Before: 超敏 / C / 反应 / 蛋白 / 是 / 什么 / ? /
|
||||
// 超敏C反应蛋白 current frequency: 0.000000, suggest: 1.000000.
|
||||
@@ -90,23 +75,17 @@ func Example_suggestFrequency() {
|
||||
}
|
||||
|
||||
func Example_loadUserDictionary() {
|
||||
var seg jiebago.Segmenter
|
||||
var seg Segmenter
|
||||
seg.LoadDictionary("dict.txt")
|
||||
|
||||
print := func(ch <-chan string) {
|
||||
for word := range ch {
|
||||
fmt.Printf(" %s /", word)
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
sentence := "李小福是创新办主任也是云计算方面的专家"
|
||||
fmt.Print("Before:")
|
||||
print(seg.Cut(sentence, true))
|
||||
fmt.Println(seg.Cut(sentence, true))
|
||||
|
||||
seg.LoadUserDictionary("userdict.txt")
|
||||
|
||||
fmt.Print("After:")
|
||||
print(seg.Cut(sentence, true))
|
||||
fmt.Println(seg.Cut(sentence, true))
|
||||
// Output:
|
||||
// Before: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
|
||||
// After: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
|
||||
|
||||
138
finalseg/finalseg.go
Normal file → Executable file
138
finalseg/finalseg.go
Normal file → Executable file
@@ -10,88 +10,86 @@ var (
|
||||
reSkip = regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`)
|
||||
)
|
||||
|
||||
func cutHan(sentence string) chan string {
|
||||
result := make(chan string)
|
||||
go func() {
|
||||
runes := []rune(sentence)
|
||||
_, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
|
||||
begin, next := 0, 0
|
||||
for i, char := range runes {
|
||||
pos := posList[i]
|
||||
switch pos {
|
||||
case 'B':
|
||||
begin = i
|
||||
case 'E':
|
||||
result <- string(runes[begin : i+1])
|
||||
next = i + 1
|
||||
case 'S':
|
||||
result <- string(char)
|
||||
next = i + 1
|
||||
}
|
||||
func cutHan(sentence string) []string {
|
||||
result := make([]string, 0, 10)
|
||||
|
||||
runes := []rune(sentence)
|
||||
_, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
|
||||
begin, next := 0, 0
|
||||
for i, char := range runes {
|
||||
pos := posList[i]
|
||||
switch pos {
|
||||
case 'B':
|
||||
begin = i
|
||||
case 'E':
|
||||
result = append(result, string(runes[begin:i+1]))
|
||||
next = i + 1
|
||||
case 'S':
|
||||
result = append(result, string(char))
|
||||
next = i + 1
|
||||
}
|
||||
if next < len(runes) {
|
||||
result <- string(runes[next:])
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
}
|
||||
if next < len(runes) {
|
||||
result = append(result, string(runes[next:]))
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// Cut cuts sentence into words using Hidden Markov Model with Viterbi
|
||||
// algorithm. It is used by Jiebago for unknown words.
|
||||
func Cut(sentence string) chan string {
|
||||
result := make(chan string)
|
||||
func Cut(sentence string) []string {
|
||||
result := make([]string, 0, 10)
|
||||
s := sentence
|
||||
var hans string
|
||||
var hanLoc []int
|
||||
var nonhanLoc []int
|
||||
go func() {
|
||||
for {
|
||||
hanLoc = reHan.FindStringIndex(s)
|
||||
if hanLoc == nil {
|
||||
if len(s) == 0 {
|
||||
break
|
||||
}
|
||||
} else if hanLoc[0] == 0 {
|
||||
hans = s[hanLoc[0]:hanLoc[1]]
|
||||
s = s[hanLoc[1]:]
|
||||
for han := range cutHan(hans) {
|
||||
result <- han
|
||||
}
|
||||
|
||||
for {
|
||||
hanLoc = reHan.FindStringIndex(s)
|
||||
if hanLoc == nil {
|
||||
if len(s) == 0 {
|
||||
break
|
||||
}
|
||||
} else if hanLoc[0] == 0 {
|
||||
hans = s[hanLoc[0]:hanLoc[1]]
|
||||
s = s[hanLoc[1]:]
|
||||
for _, han := range cutHan(hans) {
|
||||
result = append(result, han)
|
||||
}
|
||||
continue
|
||||
}
|
||||
nonhanLoc = reSkip.FindStringIndex(s)
|
||||
if nonhanLoc == nil {
|
||||
if len(s) == 0 {
|
||||
break
|
||||
}
|
||||
} else if nonhanLoc[0] == 0 {
|
||||
nonhans := s[nonhanLoc[0]:nonhanLoc[1]]
|
||||
s = s[nonhanLoc[1]:]
|
||||
if nonhans != "" {
|
||||
result = append(result, nonhans)
|
||||
continue
|
||||
}
|
||||
nonhanLoc = reSkip.FindStringIndex(s)
|
||||
if nonhanLoc == nil {
|
||||
if len(s) == 0 {
|
||||
break
|
||||
}
|
||||
} else if nonhanLoc[0] == 0 {
|
||||
nonhans := s[nonhanLoc[0]:nonhanLoc[1]]
|
||||
s = s[nonhanLoc[1]:]
|
||||
if nonhans != "" {
|
||||
result <- nonhans
|
||||
continue
|
||||
}
|
||||
}
|
||||
var loc []int
|
||||
if hanLoc == nil && nonhanLoc == nil {
|
||||
if len(s) > 0 {
|
||||
result <- s
|
||||
break
|
||||
}
|
||||
} else if hanLoc == nil {
|
||||
loc = nonhanLoc
|
||||
} else if nonhanLoc == nil {
|
||||
loc = hanLoc
|
||||
} else if hanLoc[0] < nonhanLoc[0] {
|
||||
loc = hanLoc
|
||||
} else {
|
||||
loc = nonhanLoc
|
||||
}
|
||||
result <- s[:loc[0]]
|
||||
s = s[loc[0]:]
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
var loc []int
|
||||
if hanLoc == nil && nonhanLoc == nil {
|
||||
if len(s) > 0 {
|
||||
result = append(result, s)
|
||||
break
|
||||
}
|
||||
} else if hanLoc == nil {
|
||||
loc = nonhanLoc
|
||||
} else if nonhanLoc == nil {
|
||||
loc = hanLoc
|
||||
} else if hanLoc[0] < nonhanLoc[0] {
|
||||
loc = hanLoc
|
||||
} else {
|
||||
loc = nonhanLoc
|
||||
}
|
||||
result = append(result, s[:loc[0]])
|
||||
s = s[loc[0]:]
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
16
finalseg/finalseg_test.go
Normal file → Executable file
16
finalseg/finalseg_test.go
Normal file → Executable file
@@ -5,14 +5,6 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func chanToArray(ch chan string) []string {
|
||||
var result []string
|
||||
for word := range ch {
|
||||
result = append(result, word)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func TestViterbi(t *testing.T) {
|
||||
obs := "我们是程序员"
|
||||
states := []byte{'B', 'M', 'E', 'S'}
|
||||
@@ -29,7 +21,7 @@ func TestViterbi(t *testing.T) {
|
||||
|
||||
func TestCutHan(t *testing.T) {
|
||||
obs := "我们是程序员"
|
||||
result := chanToArray(cutHan(obs))
|
||||
result := cutHan(obs)
|
||||
if len(result) != 3 {
|
||||
t.Fatal(result)
|
||||
}
|
||||
@@ -46,7 +38,7 @@ func TestCutHan(t *testing.T) {
|
||||
|
||||
func TestCut(t *testing.T) {
|
||||
sentence := "我们是程序员"
|
||||
result := chanToArray(Cut(sentence))
|
||||
result := Cut(sentence)
|
||||
if len(result) != 3 {
|
||||
t.Fatal(len(result))
|
||||
}
|
||||
@@ -59,11 +51,11 @@ func TestCut(t *testing.T) {
|
||||
if result[2] != "程序员" {
|
||||
t.Fatal(result[2])
|
||||
}
|
||||
result2 := chanToArray(Cut("I'm a programmer!"))
|
||||
result2 := Cut("I'm a programmer!")
|
||||
if len(result2) != 8 {
|
||||
t.Fatal(result2)
|
||||
}
|
||||
result3 := chanToArray(Cut("程序员average年龄28.6岁。"))
|
||||
result3 := Cut("程序员average年龄28.6岁。")
|
||||
if len(result3) != 6 {
|
||||
t.Fatal(result3)
|
||||
}
|
||||
|
||||
0
finalseg/prob_emit.go
Normal file → Executable file
0
finalseg/prob_emit.go
Normal file → Executable file
0
finalseg/prob_trans.go
Normal file → Executable file
0
finalseg/prob_trans.go
Normal file → Executable file
0
finalseg/viterbi.go
Normal file → Executable file
0
finalseg/viterbi.go
Normal file → Executable file
0
foobar.txt
Normal file → Executable file
0
foobar.txt
Normal file → Executable file
29
go.mod
Normal file
29
go.mod
Normal file
@@ -0,0 +1,29 @@
|
||||
module github.com/fumiama/jieba
|
||||
|
||||
go 1.19
|
||||
|
||||
require github.com/blevesearch/bleve v1.0.14
|
||||
|
||||
require (
|
||||
github.com/RoaringBitmap/roaring v0.4.23 // indirect
|
||||
github.com/blevesearch/go-porterstemmer v1.0.3 // indirect
|
||||
github.com/blevesearch/mmap-go v1.0.2 // indirect
|
||||
github.com/blevesearch/segment v0.9.0 // indirect
|
||||
github.com/blevesearch/snowballstem v0.9.0 // indirect
|
||||
github.com/blevesearch/zap/v11 v11.0.14 // indirect
|
||||
github.com/blevesearch/zap/v12 v12.0.14 // indirect
|
||||
github.com/blevesearch/zap/v13 v13.0.6 // indirect
|
||||
github.com/blevesearch/zap/v14 v14.0.5 // indirect
|
||||
github.com/blevesearch/zap/v15 v15.0.3 // indirect
|
||||
github.com/couchbase/vellum v1.0.2 // indirect
|
||||
github.com/glycerine/go-unsnap-stream v0.0.0-20181221182339-f9677308dec2 // indirect
|
||||
github.com/golang/protobuf v1.3.2 // indirect
|
||||
github.com/golang/snappy v0.0.1 // indirect
|
||||
github.com/mschoch/smat v0.2.0 // indirect
|
||||
github.com/philhofer/fwd v1.0.0 // indirect
|
||||
github.com/steveyen/gtreap v0.1.0 // indirect
|
||||
github.com/tinylib/msgp v1.1.0 // indirect
|
||||
github.com/willf/bitset v1.1.10 // indirect
|
||||
go.etcd.io/bbolt v1.3.5 // indirect
|
||||
golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 // indirect
|
||||
)
|
||||
125
go.sum
Normal file
125
go.sum
Normal file
@@ -0,0 +1,125 @@
|
||||
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
|
||||
github.com/RoaringBitmap/roaring v0.4.23 h1:gpyfd12QohbqhFO4NVDUdoPOCXsyahYRQhINmlHxKeo=
|
||||
github.com/RoaringBitmap/roaring v0.4.23/go.mod h1:D0gp8kJQgE1A4LQ5wFLggQEyvDi06Mq5mKs52e1TwOo=
|
||||
github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8=
|
||||
github.com/blevesearch/bleve v1.0.14 h1:Q8r+fHTt35jtGXJUM0ULwM3Tzg+MRfyai4ZkWDy2xO4=
|
||||
github.com/blevesearch/bleve v1.0.14/go.mod h1:e/LJTr+E7EaoVdkQZTfoz7dt4KoDNvDbLb8MSKuNTLQ=
|
||||
github.com/blevesearch/blevex v1.0.0 h1:pnilj2Qi3YSEGdWgLj1Pn9Io7ukfXPoQcpAI1Bv8n/o=
|
||||
github.com/blevesearch/blevex v1.0.0/go.mod h1:2rNVqoG2BZI8t1/P1awgTKnGlx5MP9ZbtEciQaNhswc=
|
||||
github.com/blevesearch/cld2 v0.0.0-20200327141045-8b5f551d37f5/go.mod h1:PN0QNTLs9+j1bKy3d/GB/59wsNBFC4sWLWG3k69lWbc=
|
||||
github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo=
|
||||
github.com/blevesearch/go-porterstemmer v1.0.3/go.mod h1:angGc5Ht+k2xhJdZi511LtmxuEf0OVpvUUNrwmM1P7M=
|
||||
github.com/blevesearch/mmap-go v1.0.2 h1:JtMHb+FgQCTTYIhtMvimw15dJwu1Y5lrZDMOFXVWPk0=
|
||||
github.com/blevesearch/mmap-go v1.0.2/go.mod h1:ol2qBqYaOUsGdm7aRMRrYGgPvnwLe6Y+7LMvAB5IbSA=
|
||||
github.com/blevesearch/segment v0.9.0 h1:5lG7yBCx98or7gK2cHMKPukPZ/31Kag7nONpoBt22Ac=
|
||||
github.com/blevesearch/segment v0.9.0/go.mod h1:9PfHYUdQCgHktBgvtUOF4x+pc4/l8rdH0u5spnW85UQ=
|
||||
github.com/blevesearch/snowballstem v0.9.0 h1:lMQ189YspGP6sXvZQ4WZ+MLawfV8wOmPoD/iWeNXm8s=
|
||||
github.com/blevesearch/snowballstem v0.9.0/go.mod h1:PivSj3JMc8WuaFkTSRDW2SlrulNWPl4ABg1tC/hlgLs=
|
||||
github.com/blevesearch/zap/v11 v11.0.14 h1:IrDAvtlzDylh6H2QCmS0OGcN9Hpf6mISJlfKjcwJs7k=
|
||||
github.com/blevesearch/zap/v11 v11.0.14/go.mod h1:MUEZh6VHGXv1PKx3WnCbdP404LGG2IZVa/L66pyFwnY=
|
||||
github.com/blevesearch/zap/v12 v12.0.14 h1:2o9iRtl1xaRjsJ1xcqTyLX414qPAwykHNV7wNVmbp3w=
|
||||
github.com/blevesearch/zap/v12 v12.0.14/go.mod h1:rOnuZOiMKPQj18AEKEHJxuI14236tTQ1ZJz4PAnWlUg=
|
||||
github.com/blevesearch/zap/v13 v13.0.6 h1:r+VNSVImi9cBhTNNR+Kfl5uiGy8kIbb0JMz/h8r6+O4=
|
||||
github.com/blevesearch/zap/v13 v13.0.6/go.mod h1:L89gsjdRKGyGrRN6nCpIScCvvkyxvmeDCwZRcjjPCrw=
|
||||
github.com/blevesearch/zap/v14 v14.0.5 h1:NdcT+81Nvmp2zL+NhwSvGSLh7xNgGL8QRVZ67njR0NU=
|
||||
github.com/blevesearch/zap/v14 v14.0.5/go.mod h1:bWe8S7tRrSBTIaZ6cLRbgNH4TUDaC9LZSpRGs85AsGY=
|
||||
github.com/blevesearch/zap/v15 v15.0.3 h1:Ylj8Oe+mo0P25tr9iLPp33lN6d4qcztGjaIsP51UxaY=
|
||||
github.com/blevesearch/zap/v15 v15.0.3/go.mod h1:iuwQrImsh1WjWJ0Ue2kBqY83a0rFtJTqfa9fp1rbVVU=
|
||||
github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE=
|
||||
github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk=
|
||||
github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk=
|
||||
github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k=
|
||||
github.com/couchbase/moss v0.1.0/go.mod h1:9MaHIaRuy9pvLPUJxB8sh8OrLfyDczECVL37grCIubs=
|
||||
github.com/couchbase/vellum v1.0.2 h1:BrbP0NKiyDdndMPec8Jjhy0U47CZ0Lgx3xUC2r9rZqw=
|
||||
github.com/couchbase/vellum v1.0.2/go.mod h1:FcwrEivFpNi24R3jLOs3n+fs5RnuQnQqCLBJ1uAg1W4=
|
||||
github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE=
|
||||
github.com/cznic/b v0.0.0-20181122101859-a26611c4d92d h1:SwD98825d6bdB+pEuTxWOXiSjBrHdOl/UVp75eI7JT8=
|
||||
github.com/cznic/b v0.0.0-20181122101859-a26611c4d92d/go.mod h1:URriBxXwVq5ijiJ12C7iIZqlA69nTlI+LgI6/pwftG8=
|
||||
github.com/cznic/mathutil v0.0.0-20181122101859-297441e03548/go.mod h1:e6NPNENfs9mPDVNRekM7lKScauxd5kXTr1Mfyig6TDM=
|
||||
github.com/cznic/strutil v0.0.0-20181122101858-275e90344537/go.mod h1:AHHPPPXTw0h6pVabbcbyGRK1DckRn7r/STdZEeIDzZc=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/facebookgo/ensure v0.0.0-20200202191622-63f1cf65ac4c/go.mod h1:Yg+htXGokKKdzcwhuNDwVvN+uBxDGXJ7G/VN1d8fa64=
|
||||
github.com/facebookgo/stack v0.0.0-20160209184415-751773369052/go.mod h1:UbMTZqLaRiH3MsBH8va0n7s1pQYcu3uTb8G4tygF4Zg=
|
||||
github.com/facebookgo/subset v0.0.0-20200203212716-c811ad88dec4/go.mod h1:5tD+neXqOorC30/tWg0LCSkrqj/AR6gu8yY8/fpw1q0=
|
||||
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
|
||||
github.com/glycerine/go-unsnap-stream v0.0.0-20181221182339-f9677308dec2 h1:Ujru1hufTHVb++eG6OuNDKMxZnGIvF6o/u8q/8h2+I4=
|
||||
github.com/glycerine/go-unsnap-stream v0.0.0-20181221182339-f9677308dec2/go.mod h1:/20jfyN9Y5QPEAprSgKAUr+glWDY39ZiUEAYOEv5dsE=
|
||||
github.com/glycerine/goconvey v0.0.0-20190410193231-58a59202ab31 h1:gclg6gY70GLy3PbkQ1AERPfmLMMagS60DKF78eWwLn8=
|
||||
github.com/glycerine/goconvey v0.0.0-20190410193231-58a59202ab31/go.mod h1:Ogl1Tioa0aV7gstGFO7KhffUsb9M4ydbEbbxpcEDc24=
|
||||
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
|
||||
github.com/golang/protobuf v1.3.2 h1:6nsPYzhq5kReh6QImI3k5qWzO4PEbvbIW2cwSfR/6xs=
|
||||
github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
|
||||
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
|
||||
github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4=
|
||||
github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
|
||||
github.com/gopherjs/gopherjs v0.0.0-20190910122728-9d188e94fb99 h1:twflg0XRTjwKpxb/jFExr4HGq6on2dEOmnL6FV+fgPw=
|
||||
github.com/gopherjs/gopherjs v0.0.0-20190910122728-9d188e94fb99/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
|
||||
github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
|
||||
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
|
||||
github.com/ikawaha/kagome.ipadic v1.1.2/go.mod h1:DPSBbU0czaJhAb/5uKQZHMc9MTVRpDugJfX+HddPHHg=
|
||||
github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8=
|
||||
github.com/jmhodges/levigo v1.0.0 h1:q5EC36kV79HWeTBWsod3mG11EgStG3qArTKcvlksN1U=
|
||||
github.com/jmhodges/levigo v1.0.0/go.mod h1:Q6Qx+uH3RAqyK4rFQroq9RL7mdkABMcfhEI+nNuzMJQ=
|
||||
github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=
|
||||
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
|
||||
github.com/kljensen/snowball v0.6.0/go.mod h1:27N7E8fVU5H68RlUmnWwZCfxgt4POBJfENGMvNRhldw=
|
||||
github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
|
||||
github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
|
||||
github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
|
||||
github.com/mschoch/smat v0.0.0-20160514031455-90eadee771ae/go.mod h1:qAyveg+e4CE+eKJXWVjKXM4ck2QobLqTDytGJbLLhJg=
|
||||
github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM=
|
||||
github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw=
|
||||
github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
|
||||
github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
|
||||
github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY=
|
||||
github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic=
|
||||
github.com/philhofer/fwd v1.0.0 h1:UbZqGr5Y38ApvM/V/jEljVxwocdweyH+vmYvRPBnbqQ=
|
||||
github.com/philhofer/fwd v1.0.0/go.mod h1:gk3iGcWd9+svBvR0sR+KPcfE+RNWozjowpeBVG3ZVNU=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/rcrowley/go-metrics v0.0.0-20190826022208-cac0b30c2563/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||
github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g=
|
||||
github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ=
|
||||
github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE=
|
||||
github.com/spf13/cobra v0.0.5/go.mod h1:3K3wKZymM7VvHMDS9+Akkh4K60UwM26emMESw8tLCHU=
|
||||
github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo=
|
||||
github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4=
|
||||
github.com/spf13/viper v1.3.2/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s=
|
||||
github.com/steveyen/gtreap v0.1.0 h1:CjhzTa274PyJLJuMZwIzCO1PfC00oRa8d1Kc78bFXJM=
|
||||
github.com/steveyen/gtreap v0.1.0/go.mod h1:kl/5J7XbrOmlIbYIXdRHDDE5QxHqpk0cmkT7Z4dM9/Y=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
|
||||
github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
|
||||
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
|
||||
github.com/syndtr/goleveldb v1.0.0 h1:fBdIW9lB4Iz0n9khmH8w27SJ3QEJ7+IgjPEwGSZiFdE=
|
||||
github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ=
|
||||
github.com/tebeka/snowball v0.4.2/go.mod h1:4IfL14h1lvwZcp1sfXuuc7/7yCsvVffTWxWxCLfFpYg=
|
||||
github.com/tecbot/gorocksdb v0.0.0-20191217155057-f0fad39f321c h1:g+WoO5jjkqGAzHWCjJB1zZfXPIAaDpzXIEJ0eS6B5Ok=
|
||||
github.com/tecbot/gorocksdb v0.0.0-20191217155057-f0fad39f321c/go.mod h1:ahpPrc7HpcfEWDQRZEmnXMzHY03mLDYMCxeDzy46i+8=
|
||||
github.com/tinylib/msgp v1.1.0 h1:9fQd+ICuRIu/ue4vxJZu6/LzxN0HwMds2nq/0cFvxHU=
|
||||
github.com/tinylib/msgp v1.1.0/go.mod h1:+d+yLhGm8mzTaHzB+wgMYrodPfmZrzkirds8fDWklFE=
|
||||
github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0=
|
||||
github.com/willf/bitset v1.1.10 h1:NotGKqX0KwQ72NUzqrjZq5ipPNDQex9lo3WpaS8L2sc=
|
||||
github.com/willf/bitset v1.1.10/go.mod h1:RjeCKbqT1RxIR/KWY6phxZiaY1IyutSBfGjNPySAYV4=
|
||||
github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
|
||||
go.etcd.io/bbolt v1.3.5 h1:XAzx9gjCb0Rxj7EoqcClPD1d5ZBxZJk0jbuoPHenBt0=
|
||||
go.etcd.io/bbolt v1.3.5/go.mod h1:G5EMThwa9y8QZGBClrRx5EY+Yw9kAhnjy3bSjsnlVTQ=
|
||||
golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
|
||||
golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20181205085412-a5c9d58dba9a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20181221143128-b4a75ba826a6/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 h1:LfCXLvNmTYH9kEmVgqbnsWfruoXZIrh4YBgqVHtDvw0=
|
||||
golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
|
||||
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
|
||||
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
|
||||
gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
|
||||
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
|
||||
293
jieba.go
Normal file → Executable file
293
jieba.go
Normal file → Executable file
@@ -6,9 +6,9 @@ import (
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/wangbin/jiebago/dictionary"
|
||||
"github.com/wangbin/jiebago/finalseg"
|
||||
"github.com/wangbin/jiebago/util"
|
||||
"github.com/fumiama/jieba/dictionary"
|
||||
"github.com/fumiama/jieba/finalseg"
|
||||
"github.com/fumiama/jieba/util"
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -72,7 +72,7 @@ func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
|
||||
}
|
||||
} else {
|
||||
word := words[0]
|
||||
for segment := range seg.Cut(word, false) {
|
||||
for _, segment := range seg.Cut(word, false) {
|
||||
if freq, ok := seg.dict.Frequency(segment); ok {
|
||||
frequency *= freq
|
||||
}
|
||||
@@ -165,95 +165,93 @@ func (seg *Segmenter) calc(runes []rune) map[int]route {
|
||||
return rs
|
||||
}
|
||||
|
||||
type cutFunc func(sentence string) <-chan string
|
||||
// Typical ratios of letters to words in an article.
|
||||
const (
|
||||
RatioLetterWord float32 = 1.5
|
||||
RatioLetterWordFull float32 = 1
|
||||
)
|
||||
|
||||
func (seg *Segmenter) cutDAG(sentence string) <-chan string {
|
||||
result := make(chan string)
|
||||
go func() {
|
||||
runes := []rune(sentence)
|
||||
routes := seg.calc(runes)
|
||||
var y int
|
||||
length := len(runes)
|
||||
var buf []rune
|
||||
for x := 0; x < length; {
|
||||
y = routes[x].index + 1
|
||||
frag := runes[x:y]
|
||||
if y-x == 1 {
|
||||
buf = append(buf, frag...)
|
||||
} else {
|
||||
if len(buf) > 0 {
|
||||
bufString := string(buf)
|
||||
if len(buf) == 1 {
|
||||
result <- bufString
|
||||
type cutFunc func(sentence string) []string
|
||||
|
||||
func (seg *Segmenter) cutDAG(sentence string) []string {
|
||||
result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1)
|
||||
|
||||
runes := []rune(sentence)
|
||||
routes := seg.calc(runes)
|
||||
var y int
|
||||
length := len(runes)
|
||||
var buf []rune
|
||||
for x := 0; x < length; {
|
||||
y = routes[x].index + 1
|
||||
frag := runes[x:y]
|
||||
if y-x == 1 {
|
||||
buf = append(buf, frag...)
|
||||
} else {
|
||||
if len(buf) > 0 {
|
||||
bufString := string(buf)
|
||||
if len(buf) == 1 {
|
||||
result = append(result, bufString)
|
||||
} else {
|
||||
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
|
||||
result = append(result, finalseg.Cut(bufString)...)
|
||||
} else {
|
||||
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
|
||||
for x := range finalseg.Cut(bufString) {
|
||||
result <- x
|
||||
}
|
||||
} else {
|
||||
for _, elem := range buf {
|
||||
result <- string(elem)
|
||||
}
|
||||
for _, elem := range buf {
|
||||
result = append(result, string(elem))
|
||||
}
|
||||
}
|
||||
buf = make([]rune, 0)
|
||||
}
|
||||
result <- string(frag)
|
||||
buf = make([]rune, 0)
|
||||
}
|
||||
x = y
|
||||
result = append(result, string(frag))
|
||||
}
|
||||
x = y
|
||||
}
|
||||
|
||||
if len(buf) > 0 {
|
||||
bufString := string(buf)
|
||||
if len(buf) == 1 {
|
||||
result <- bufString
|
||||
if len(buf) > 0 {
|
||||
bufString := string(buf)
|
||||
if len(buf) == 1 {
|
||||
result = append(result, bufString)
|
||||
} else {
|
||||
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
|
||||
result = append(result, finalseg.Cut(bufString)...)
|
||||
} else {
|
||||
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
|
||||
for t := range finalseg.Cut(bufString) {
|
||||
result <- t
|
||||
}
|
||||
} else {
|
||||
for _, elem := range buf {
|
||||
result <- string(elem)
|
||||
}
|
||||
for _, elem := range buf {
|
||||
result = append(result, string(elem))
|
||||
}
|
||||
}
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string {
|
||||
result := make(chan string)
|
||||
func (seg *Segmenter) cutDAGNoHMM(sentence string) []string {
|
||||
result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1)
|
||||
|
||||
go func() {
|
||||
runes := []rune(sentence)
|
||||
routes := seg.calc(runes)
|
||||
var y int
|
||||
length := len(runes)
|
||||
var buf []rune
|
||||
for x := 0; x < length; {
|
||||
y = routes[x].index + 1
|
||||
frag := runes[x:y]
|
||||
if reEng.MatchString(string(frag)) && len(frag) == 1 {
|
||||
buf = append(buf, frag...)
|
||||
x = y
|
||||
continue
|
||||
}
|
||||
if len(buf) > 0 {
|
||||
result <- string(buf)
|
||||
buf = make([]rune, 0)
|
||||
}
|
||||
result <- string(frag)
|
||||
runes := []rune(sentence)
|
||||
routes := seg.calc(runes)
|
||||
var y int
|
||||
length := len(runes)
|
||||
var buf []rune
|
||||
for x := 0; x < length; {
|
||||
y = routes[x].index + 1
|
||||
frag := runes[x:y]
|
||||
if reEng.MatchString(string(frag)) && len(frag) == 1 {
|
||||
buf = append(buf, frag...)
|
||||
x = y
|
||||
continue
|
||||
}
|
||||
if len(buf) > 0 {
|
||||
result <- string(buf)
|
||||
result = append(result, string(buf))
|
||||
buf = make([]rune, 0)
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
result = append(result, string(frag))
|
||||
x = y
|
||||
}
|
||||
if len(buf) > 0 {
|
||||
result = append(result, string(buf))
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
@@ -261,8 +259,8 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string {
|
||||
// Parameter hmm controls whether to use the Hidden Markov Model.
|
||||
// Accurate mode attempts to cut the sentence into the most accurate
|
||||
// segmentations, which is suitable for text analysis.
|
||||
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan string {
|
||||
result := make(chan string)
|
||||
func (seg *Segmenter) Cut(sentence string, hmm bool) []string {
|
||||
result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1)
|
||||
var cut cutFunc
|
||||
if hmm {
|
||||
cut = seg.cutDAG
|
||||
@@ -270,84 +268,74 @@ func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan string {
|
||||
cut = seg.cutDAGNoHMM
|
||||
}
|
||||
|
||||
go func() {
|
||||
for _, block := range util.RegexpSplit(reHanDefault, sentence, -1) {
|
||||
if len(block) == 0 {
|
||||
for _, block := range util.RegexpSplit(reHanDefault, sentence, -1) {
|
||||
if len(block) == 0 {
|
||||
continue
|
||||
}
|
||||
if reHanDefault.MatchString(block) {
|
||||
result = append(result, cut(block)...)
|
||||
continue
|
||||
}
|
||||
for _, subBlock := range util.RegexpSplit(reSkipDefault, block, -1) {
|
||||
if reSkipDefault.MatchString(subBlock) {
|
||||
result = append(result, subBlock)
|
||||
continue
|
||||
}
|
||||
if reHanDefault.MatchString(block) {
|
||||
for x := range cut(block) {
|
||||
result <- x
|
||||
}
|
||||
continue
|
||||
}
|
||||
for _, subBlock := range util.RegexpSplit(reSkipDefault, block, -1) {
|
||||
if reSkipDefault.MatchString(subBlock) {
|
||||
result <- subBlock
|
||||
continue
|
||||
}
|
||||
for _, r := range subBlock {
|
||||
result <- string(r)
|
||||
}
|
||||
for _, r := range subBlock {
|
||||
result = append(result, string(r))
|
||||
}
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
func (seg *Segmenter) cutAll(sentence string) <-chan string {
|
||||
result := make(chan string)
|
||||
go func() {
|
||||
runes := []rune(sentence)
|
||||
dag := seg.dag(runes)
|
||||
start := -1
|
||||
ks := make([]int, len(dag))
|
||||
for k := range dag {
|
||||
ks[k] = k
|
||||
func (seg *Segmenter) cutAll(sentence string) []string {
|
||||
result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1)
|
||||
|
||||
runes := []rune(sentence)
|
||||
dag := seg.dag(runes)
|
||||
start := -1
|
||||
ks := make([]int, len(dag))
|
||||
for k := range dag {
|
||||
ks[k] = k
|
||||
}
|
||||
var l []int
|
||||
for k := range ks {
|
||||
l = dag[k]
|
||||
if len(l) == 1 && k > start {
|
||||
result = append(result, string(runes[k:l[0]+1]))
|
||||
start = l[0]
|
||||
continue
|
||||
}
|
||||
var l []int
|
||||
for k := range ks {
|
||||
l = dag[k]
|
||||
if len(l) == 1 && k > start {
|
||||
result <- string(runes[k : l[0]+1])
|
||||
start = l[0]
|
||||
continue
|
||||
}
|
||||
for _, j := range l {
|
||||
if j > k {
|
||||
result <- string(runes[k : j+1])
|
||||
start = j
|
||||
}
|
||||
for _, j := range l {
|
||||
if j > k {
|
||||
result = append(result, string(runes[k:j+1]))
|
||||
start = j
|
||||
}
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// CutAll cuts a sentence into words using full mode.
|
||||
// Full mode gets all the possible words from the sentence.
|
||||
// Fast but not accurate.
|
||||
func (seg *Segmenter) CutAll(sentence string) <-chan string {
|
||||
result := make(chan string)
|
||||
go func() {
|
||||
for _, block := range util.RegexpSplit(reHanCutAll, sentence, -1) {
|
||||
if len(block) == 0 {
|
||||
continue
|
||||
}
|
||||
if reHanCutAll.MatchString(block) {
|
||||
for x := range seg.cutAll(block) {
|
||||
result <- x
|
||||
}
|
||||
continue
|
||||
}
|
||||
for _, subBlock := range reSkipCutAll.Split(block, -1) {
|
||||
result <- subBlock
|
||||
}
|
||||
func (seg *Segmenter) CutAll(sentence string) []string {
|
||||
result := make([]string, 0, int(float32(len(sentence))/RatioLetterWordFull)+1)
|
||||
|
||||
for _, block := range util.RegexpSplit(reHanCutAll, sentence, -1) {
|
||||
if len(block) == 0 {
|
||||
continue
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
if reHanCutAll.MatchString(block) {
|
||||
result = append(result, seg.cutAll(block)...)
|
||||
continue
|
||||
}
|
||||
result = append(result, reSkipCutAll.Split(block, -1)...)
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
@@ -355,26 +343,25 @@ func (seg *Segmenter) CutAll(sentence string) <-chan string {
|
||||
// Search engine mode, based on the accurate mode, attempts to cut long words
|
||||
// into several short words, which can raise the recall rate.
|
||||
// Suitable for search engines.
|
||||
func (seg *Segmenter) CutForSearch(sentence string, hmm bool) <-chan string {
|
||||
result := make(chan string)
|
||||
go func() {
|
||||
for word := range seg.Cut(sentence, hmm) {
|
||||
runes := []rune(word)
|
||||
for _, increment := range []int{2, 3} {
|
||||
if len(runes) <= increment {
|
||||
continue
|
||||
}
|
||||
var gram string
|
||||
for i := 0; i < len(runes)-increment+1; i++ {
|
||||
gram = string(runes[i : i+increment])
|
||||
if v, ok := seg.dict.Frequency(gram); ok && v > 0.0 {
|
||||
result <- gram
|
||||
}
|
||||
func (seg *Segmenter) CutForSearch(sentence string, hmm bool) []string {
|
||||
result := make([]string, 0, int(float32(len(sentence))/RatioLetterWordFull)+1)
|
||||
|
||||
for _, word := range seg.Cut(sentence, hmm) {
|
||||
runes := []rune(word)
|
||||
for _, increment := range []int{2, 3} {
|
||||
if len(runes) <= increment {
|
||||
continue
|
||||
}
|
||||
var gram string
|
||||
for i := 0; i < len(runes)-increment+1; i++ {
|
||||
gram = string(runes[i : i+increment])
|
||||
if v, ok := seg.dict.Frequency(gram); ok && v > 0.0 {
|
||||
result = append(result, gram)
|
||||
}
|
||||
}
|
||||
result <- word
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
result = append(result, word)
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
1060
jieba_test.go
Normal file → Executable file
1060
jieba_test.go
Normal file → Executable file
File diff suppressed because it is too large
Load Diff
0
posseg/char_state_tab.go
Normal file → Executable file
0
posseg/char_state_tab.go
Normal file → Executable file
0
posseg/char_state_tab_test.go
Normal file → Executable file
0
posseg/char_state_tab_test.go
Normal file → Executable file
2
posseg/dictionary.go
Normal file → Executable file
2
posseg/dictionary.go
Normal file → Executable file
@@ -4,7 +4,7 @@ import (
|
||||
"math"
|
||||
"sync"
|
||||
|
||||
"github.com/wangbin/jiebago/dictionary"
|
||||
"github.com/fumiama/jieba/dictionary"
|
||||
)
|
||||
|
||||
// A Dictionary represents a thread-safe dictionary used for word segmentation.
|
||||
|
||||
2
posseg/example_test.go
Normal file → Executable file
2
posseg/example_test.go
Normal file → Executable file
@@ -3,7 +3,7 @@ package posseg_test
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/wangbin/jiebago/posseg"
|
||||
"github.com/fumiama/jieba/posseg"
|
||||
)
|
||||
|
||||
func Example() {
|
||||
|
||||
2
posseg/posseg.go
Normal file → Executable file
2
posseg/posseg.go
Normal file → Executable file
@@ -5,7 +5,7 @@ import (
|
||||
"math"
|
||||
"regexp"
|
||||
|
||||
"github.com/wangbin/jiebago/util"
|
||||
"github.com/fumiama/jieba/util"
|
||||
)
|
||||
|
||||
var (
|
||||
|
||||
0
posseg/posseg_test.go
Normal file → Executable file
0
posseg/posseg_test.go
Normal file → Executable file
0
posseg/prob_emit.go
Normal file → Executable file
0
posseg/prob_emit.go
Normal file → Executable file
0
posseg/prob_start.go
Normal file → Executable file
0
posseg/prob_start.go
Normal file → Executable file
0
posseg/prob_trans.go
Normal file → Executable file
0
posseg/prob_trans.go
Normal file → Executable file
0
posseg/viterbi.go
Normal file → Executable file
0
posseg/viterbi.go
Normal file → Executable file
0
posseg/viterbi_test.go
Normal file → Executable file
0
posseg/viterbi_test.go
Normal file → Executable file
30
tokenizers/analyzer.go
Executable file
30
tokenizers/analyzer.go
Executable file
@@ -0,0 +1,30 @@
|
||||
package tokenizers
|
||||
|
||||
import (
|
||||
"errors"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
type JiebaAnalyzer struct {
|
||||
}
|
||||
|
||||
func analyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
tokenizerName, ok := config["tokenizer"].(string)
|
||||
if !ok {
|
||||
return nil, errors.New("must specify tokenizer")
|
||||
}
|
||||
tokenizer, err := cache.TokenizerNamed(tokenizerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
alz := &analysis.Analyzer{
|
||||
Tokenizer: tokenizer,
|
||||
}
|
||||
return alz, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterAnalyzer("jieba", analyzerConstructor)
|
||||
}
|
||||
20
tokenizers/example_bleve_test.go
Normal file → Executable file
20
tokenizers/example_bleve_test.go
Normal file → Executable file
@@ -6,7 +6,7 @@ import (
|
||||
"os"
|
||||
|
||||
"github.com/blevesearch/bleve"
|
||||
_ "github.com/wangbin/jiebago/tokenizers"
|
||||
_ "github.com/fumiama/jieba/tokenizers"
|
||||
)
|
||||
|
||||
func Example_beleveSearch() {
|
||||
@@ -101,26 +101,26 @@ func Example_beleveSearch() {
|
||||
// Output:
|
||||
// Result of "水果世博园": 2 matches:
|
||||
// 1. Doc 3, (1.099550)
|
||||
// Name: 买<span class="highlight">水果</span>然后来<span class="highlight">世博</span>园。
|
||||
// Name: 买<mark>水果</mark>然后来<mark>世博</mark>园。
|
||||
// 2. Doc 2, (0.031941)
|
||||
// Name: The second one 你 中文测试中文 is even more interesting! 吃<span class="highlight">水果</span>
|
||||
// Name: The second one 你 中文测试中文 is even more interesting! 吃<mark>水果</mark>
|
||||
// Result of "你": 1 matches:
|
||||
// 1. Doc 2, (0.391161)
|
||||
// Name: The second one <span class="highlight">你</span> 中文测试中文 is even more interesting! 吃水果
|
||||
// Name: The second one <mark>你</mark> 中文测试中文 is even more interesting! 吃水果
|
||||
// Result of "first": 1 matches:
|
||||
// 1. Doc 1, (0.512150)
|
||||
// Name: This is the <span class="highlight">first</span> document we’ve added
|
||||
// Name: This is the <mark>first</mark> document we’ve added
|
||||
// Result of "中文": 1 matches:
|
||||
// 1. Doc 2, (0.553186)
|
||||
// Name: The second one 你 <span class="highlight">中文</span>测试<span class="highlight">中文</span> is even more interesting! 吃水果
|
||||
// Name: The second one 你 <mark>中文</mark>测试<mark>中文</mark> is even more interesting! 吃水果
|
||||
// Result of "交换机": 2 matches:
|
||||
// 1. Doc 4, (0.608495)
|
||||
// Name: 工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换机</span>等技术性器件的安装工作
|
||||
// Name: 工信处女干事每月经过下属科室都要亲口交代24口<mark>交换机</mark>等技术性器件的安装工作
|
||||
// 2. Doc 5, (0.086700)
|
||||
// Name: 咱俩<span class="highlight">交换</span>一下吧。
|
||||
// Name: 咱俩<mark>交换</mark>一下吧。
|
||||
// Result of "交换": 2 matches:
|
||||
// 1. Doc 5, (0.534158)
|
||||
// Name: 咱俩<span class="highlight">交换</span>一下吧。
|
||||
// Name: 咱俩<mark>交换</mark>一下吧。
|
||||
// 2. Doc 4, (0.296297)
|
||||
// Name: 工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换</span>机等技术性器件的安装工作
|
||||
// Name: 工信处女干事每月经过下属科室都要亲口交代24口<mark>交换</mark>机等技术性器件的安装工作
|
||||
}
|
||||
|
||||
2
tokenizers/example_test.go
Normal file → Executable file
2
tokenizers/example_test.go
Normal file → Executable file
@@ -3,7 +3,7 @@ package tokenizers_test
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/wangbin/jiebago/tokenizers"
|
||||
"github.com/fumiama/jieba/tokenizers"
|
||||
)
|
||||
|
||||
func Example() {
|
||||
|
||||
34
tokenizers/tokenizer.go
Normal file → Executable file
34
tokenizers/tokenizer.go
Normal file → Executable file
@@ -7,7 +7,7 @@ import (
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
"github.com/wangbin/jiebago"
|
||||
jiebago "github.com/fumiama/jieba"
|
||||
)
|
||||
|
||||
// Name is the jieba tokenizer name.
|
||||
@@ -26,20 +26,20 @@ NewJiebaTokenizer creates a new JiebaTokenizer.
|
||||
|
||||
Parameters:
|
||||
|
||||
dictFilePath: path of the dictionary file.
|
||||
dictFilePath: path of the dictionary file.
|
||||
|
||||
hmm: whether to use Hidden Markov Model to cut unknown words,
|
||||
i.e. not found in dictionary. For example word "安卓" (means "Android" in
|
||||
English) not in the dictionary file. If hmm is set to false, it will be
|
||||
cut into two single words "安" and "卓"; if hmm is set to true, it will
|
||||
be treated as one single word because Jieba uses the Hidden Markov Model with
|
||||
Viterbi algorithm to guess the best possibility.
|
||||
hmm: whether to use Hidden Markov Model to cut unknown words,
|
||||
i.e. not found in dictionary. For example word "安卓" (means "Android" in
|
||||
English) not in the dictionary file. If hmm is set to false, it will be
|
||||
cut into two single words "安" and "卓"; if hmm is set to true, it will
|
||||
be treated as one single word because Jieba uses the Hidden Markov Model with
|
||||
Viterbi algorithm to guess the best possibility.
|
||||
|
||||
searchMode: whether to further cut long words into several short words.
|
||||
In Chinese, some long words may contain other words, for example "交换机"
|
||||
is a Chinese word for "Switcher"; if searchMode is false, it will treat
|
||||
"交换机" as a single word. If searchMode is true, it will further split
|
||||
this word into "交换", "换机", which are valid Chinese words.
|
||||
searchMode: whether to further cut long words into several short words.
|
||||
In Chinese, some long words may contain other words, for example "交换机"
|
||||
is a Chinese word for "Switcher"; if searchMode is false, it will treat
|
||||
"交换机" as a single word. If searchMode is true, it will further split
|
||||
this word into "交换", "换机", which are valid Chinese words.
|
||||
*/
|
||||
func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||
var seg jiebago.Segmenter
|
||||
@@ -60,7 +60,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||
pos := 1
|
||||
var width int
|
||||
var gram string
|
||||
for word := range jt.seg.Cut(string(input), jt.hmm) {
|
||||
for _, word := range jt.seg.Cut(string(input), jt.hmm) {
|
||||
if jt.searchMode {
|
||||
runes := []rune(word)
|
||||
width = len(runes)
|
||||
@@ -107,11 +107,11 @@ JiebaTokenizerConstructor creates a JiebaTokenizer.
|
||||
|
||||
Parameter config should contain at least one parameter:
|
||||
|
||||
file: the path of the dictionary file.
|
||||
file: the path of the dictionary file.
|
||||
|
||||
hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
|
||||
hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
|
||||
|
||||
search: optional, specify whether to use search mode, see NewJiebaTokenizer for details.
|
||||
search: optional, specify whether to use search mode, see NewJiebaTokenizer for details.
|
||||
*/
|
||||
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
|
||||
analysis.Tokenizer, error) {
|
||||
|
||||
0
tokenizers/tokenizer_test.go
Normal file → Executable file
0
tokenizers/tokenizer_test.go
Normal file → Executable file
0
userdict.txt
Normal file → Executable file
0
userdict.txt
Normal file → Executable file
0
util/util.go
Normal file → Executable file
0
util/util.go
Normal file → Executable file
0
util/util_test.go
Normal file → Executable file
0
util/util_test.go
Normal file → Executable file
Reference in New Issue
Block a user