1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00
This commit is contained in:
源文雨
2022-11-30 12:18:15 +08:00
parent ab8b95ef87
commit 8bbc755ed4
48 changed files with 984 additions and 859 deletions

1
.gitignore vendored Executable file
View File

@@ -0,0 +1 @@
tokenizers/jieba.beleve/

View File

@@ -1,3 +0,0 @@
language: go
go:
- 1.4.2

8
README.md Normal file → Executable file
View File

@@ -1,7 +1,7 @@
#结巴分词 Go 语言版Jiebago #结巴分词 Go 语言版Jiebago
[![Build Status](https://travis-ci.org/wangbin/jiebago.png?branch=master)](https://travis-ci.org/wangbin/jiebago) [![GoDoc](https://godoc.org/github.com/wangbin/jiebago?status.svg)](https://godoc.org/github.com/wangbin/jiebago) [![Build Status](https://travis-ci.org/wangbin/jiebago.png?branch=master)](https://travis-ci.org/wangbin/jiebago) [![GoDoc](https://godoc.org/github.com/fumiama/jieba?status.svg)](https://godoc.org/github.com/fumiama/jieba)
[结巴分词](https://github.com/fxsjy/jieba) 是由 [@fxsjy](https://github.com/fxsjy) 使用 Python 编写的中文分词组件,Jiebago 是结巴分词的 Golang 语言实现。 [结巴分词](https://github.com/fxsjy/jieba) 是由 [@fxsjy](https://github.com/fxsjy) 使用 Python 编写的中文分词组件,Jiebago 是结巴分词的 Golang 语言实现。
@@ -9,7 +9,7 @@
## 安装 ## 安装
``` ```
go get github.com/wangbin/jiebago/... go get github.com/fumiama/jieba/...
``` ```
## 使用 ## 使用
@@ -20,7 +20,7 @@ package main
import ( import (
"fmt" "fmt"
"github.com/wangbin/jiebago" "github.com/fumiama/jieba"
) )
var seg jiebago.Segmenter var seg jiebago.Segmenter
@@ -62,7 +62,7 @@ func Example() {
【搜索引擎模式】: 小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 / 【搜索引擎模式】: 小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 /
``` ```
更多信息请参考[文档](https://godoc.org/github.com/wangbin/jiebago)。 更多信息请参考[文档](https://godoc.org/github.com/fumiama/jieba)。
## 分词速度 ## 分词速度

2
analyse/example_test.go Normal file → Executable file
View File

@@ -3,7 +3,7 @@ package analyse_test
import ( import (
"fmt" "fmt"
"github.com/wangbin/jiebago/analyse" "github.com/fumiama/jieba/analyse"
) )
func Example_extractTags() { func Example_extractTags() {

2
analyse/idf.go Normal file → Executable file
View File

@@ -4,7 +4,7 @@ import (
"sort" "sort"
"sync" "sync"
"github.com/wangbin/jiebago/dictionary" "github.com/fumiama/jieba/dictionary"
) )
// Idf represents a thread-safe dictionary for all words with their // Idf represents a thread-safe dictionary for all words with their

0
analyse/idf.txt Normal file → Executable file
View File

0
analyse/stop_words.txt Normal file → Executable file
View File

2
analyse/stopwords.go Normal file → Executable file
View File

@@ -3,7 +3,7 @@ package analyse
import ( import (
"sync" "sync"
"github.com/wangbin/jiebago/dictionary" "github.com/fumiama/jieba/dictionary"
) )
// DefaultStopWordMap contains some stop words. // DefaultStopWordMap contains some stop words.

4
analyse/tag_extracker.go Normal file → Executable file
View File

@@ -6,7 +6,7 @@ import (
"strings" "strings"
"unicode/utf8" "unicode/utf8"
"github.com/wangbin/jiebago" jiebago "github.com/fumiama/jieba"
) )
// Segment represents a word with weight. // Segment represents a word with weight.
@@ -74,7 +74,7 @@ func (t *TagExtracter) LoadStopWords(fileName string) error {
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) { func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
freqMap := make(map[string]float64) freqMap := make(map[string]float64)
for w := range t.seg.Cut(sentence, true) { for _, w := range t.seg.Cut(sentence, true) {
w = strings.TrimSpace(w) w = strings.TrimSpace(w)
if utf8.RuneCountInString(w) < 2 { if utf8.RuneCountInString(w) < 2 {
continue continue

0
analyse/tag_extracker_test.go Normal file → Executable file
View File

2
analyse/textrank.go Normal file → Executable file
View File

@@ -4,7 +4,7 @@ import (
"math" "math"
"sort" "sort"
"github.com/wangbin/jiebago/posseg" "github.com/fumiama/jieba/posseg"
) )
const dampingFactor = 0.85 const dampingFactor = 0.85

0
analyse/textrank_test.go Normal file → Executable file
View File

0
dict.txt Normal file → Executable file
View File

2
dictionary.go Normal file → Executable file
View File

@@ -4,7 +4,7 @@ import (
"math" "math"
"sync" "sync"
"github.com/wangbin/jiebago/dictionary" "github.com/fumiama/jieba/dictionary"
) )
// A Dictionary represents a thread-safe dictionary used for word segmentation. // A Dictionary represents a thread-safe dictionary used for word segmentation.

0
dictionary/dictionary.go Normal file → Executable file
View File

0
dictionary/dictionary_test.go Normal file → Executable file
View File

0
dictionary/token.go Normal file → Executable file
View File

11
example_parallel_cut_test.go Normal file → Executable file
View File

@@ -1,4 +1,4 @@
package jiebago_test package jiebago
import ( import (
"bufio" "bufio"
@@ -8,8 +8,6 @@ import (
"runtime" "runtime"
"strings" "strings"
"time" "time"
"github.com/wangbin/jiebago"
) )
type line struct { type line struct {
@@ -18,7 +16,7 @@ type line struct {
} }
var ( var (
segmenter = jiebago.Segmenter{} segmenter = Segmenter{}
numThreads = runtime.NumCPU() numThreads = runtime.NumCPU()
task = make(chan line, numThreads) task = make(chan line, numThreads)
result = make(chan line, numThreads) result = make(chan line, numThreads)
@@ -26,10 +24,7 @@ var (
func worker() { func worker() {
for l := range task { for l := range task {
var segments []string segments := segmenter.Cut(l.text, true)
for segment := range segmenter.Cut(l.text, true) {
segments = append(segments, segment)
}
l.text = fmt.Sprintf("%s\n", strings.Join(segments, " / ")) l.text = fmt.Sprintf("%s\n", strings.Join(segments, " / "))
result <- l result <- l

53
example_test.go Normal file → Executable file
View File

@@ -1,33 +1,24 @@
package jiebago_test package jiebago
import ( import (
"fmt" "fmt"
"github.com/wangbin/jiebago"
) )
func Example() { func Example() {
var seg jiebago.Segmenter var seg Segmenter
seg.LoadDictionary("dict.txt") seg.LoadDictionary("dict.txt")
print := func(ch <-chan string) {
for word := range ch {
fmt.Printf(" %s /", word)
}
fmt.Println()
}
fmt.Print("【全模式】:") fmt.Print("【全模式】:")
print(seg.CutAll("我来到北京清华大学")) fmt.Println(seg.CutAll("我来到北京清华大学"))
fmt.Print("【精确模式】:") fmt.Print("【精确模式】:")
print(seg.Cut("我来到北京清华大学", false)) fmt.Println(seg.Cut("我来到北京清华大学", false))
fmt.Print("【新词识别】:") fmt.Print("【新词识别】:")
print(seg.Cut("他来到了网易杭研大厦", true)) fmt.Println(seg.Cut("他来到了网易杭研大厦", true))
fmt.Print("【搜索引擎模式】:") fmt.Print("【搜索引擎模式】:")
print(seg.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", true)) fmt.Println(seg.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", true))
// Output: // Output:
// 【全模式】: 我 / 来到 / 北京 / 清华 / 清华大学 / 华大 / 大学 / // 【全模式】: 我 / 来到 / 北京 / 清华 / 清华大学 / 华大 / 大学 /
// 【精确模式】: 我 / 来到 / 北京 / 清华大学 / // 【精确模式】: 我 / 来到 / 北京 / 清华大学 /
@@ -36,47 +27,41 @@ func Example() {
} }
func Example_suggestFrequency() { func Example_suggestFrequency() {
var seg jiebago.Segmenter var seg Segmenter
seg.LoadDictionary("dict.txt") seg.LoadDictionary("dict.txt")
print := func(ch <-chan string) {
for word := range ch {
fmt.Printf(" %s /", word)
}
fmt.Println()
}
sentence := "超敏C反应蛋白是什么" sentence := "超敏C反应蛋白是什么"
fmt.Print("Before:") fmt.Print("Before:")
print(seg.Cut(sentence, false)) fmt.Println(seg.Cut(sentence, false))
word := "超敏C反应蛋白" word := "超敏C反应蛋白"
oldFrequency, _ := seg.Frequency(word) oldFrequency, _ := seg.Frequency(word)
frequency := seg.SuggestFrequency(word) frequency := seg.SuggestFrequency(word)
fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency) fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency)
seg.AddWord(word, frequency) seg.AddWord(word, frequency)
fmt.Print("After:") fmt.Print("After:")
print(seg.Cut(sentence, false)) fmt.Println(seg.Cut(sentence, false))
sentence = "如果放到post中将出错" sentence = "如果放到post中将出错"
fmt.Print("Before:") fmt.Print("Before:")
print(seg.Cut(sentence, false)) fmt.Println(seg.Cut(sentence, false))
word = "中将" word = "中将"
oldFrequency, _ = seg.Frequency(word) oldFrequency, _ = seg.Frequency(word)
frequency = seg.SuggestFrequency("中", "将") frequency = seg.SuggestFrequency("中", "将")
fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency) fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency)
seg.AddWord(word, frequency) seg.AddWord(word, frequency)
fmt.Print("After:") fmt.Print("After:")
print(seg.Cut(sentence, false)) fmt.Println(seg.Cut(sentence, false))
sentence = "今天天气不错" sentence = "今天天气不错"
fmt.Print("Before:") fmt.Print("Before:")
print(seg.Cut(sentence, false)) fmt.Println(seg.Cut(sentence, false))
word = "今天天气" word = "今天天气"
oldFrequency, _ = seg.Frequency(word) oldFrequency, _ = seg.Frequency(word)
frequency = seg.SuggestFrequency("今天", "天气") frequency = seg.SuggestFrequency("今天", "天气")
fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency) fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency)
seg.AddWord(word, frequency) seg.AddWord(word, frequency)
fmt.Print("After:") fmt.Print("After:")
print(seg.Cut(sentence, false)) fmt.Println(seg.Cut(sentence, false))
// Output: // Output:
// Before: 超敏 / C / 反应 / 蛋白 / 是 / 什么 / / // Before: 超敏 / C / 反应 / 蛋白 / 是 / 什么 / /
// 超敏C反应蛋白 current frequency: 0.000000, suggest: 1.000000. // 超敏C反应蛋白 current frequency: 0.000000, suggest: 1.000000.
@@ -90,23 +75,17 @@ func Example_suggestFrequency() {
} }
func Example_loadUserDictionary() { func Example_loadUserDictionary() {
var seg jiebago.Segmenter var seg Segmenter
seg.LoadDictionary("dict.txt") seg.LoadDictionary("dict.txt")
print := func(ch <-chan string) {
for word := range ch {
fmt.Printf(" %s /", word)
}
fmt.Println()
}
sentence := "李小福是创新办主任也是云计算方面的专家" sentence := "李小福是创新办主任也是云计算方面的专家"
fmt.Print("Before:") fmt.Print("Before:")
print(seg.Cut(sentence, true)) fmt.Println(seg.Cut(sentence, true))
seg.LoadUserDictionary("userdict.txt") seg.LoadUserDictionary("userdict.txt")
fmt.Print("After:") fmt.Print("After:")
print(seg.Cut(sentence, true)) fmt.Println(seg.Cut(sentence, true))
// Output: // Output:
// Before: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 / // Before: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
// After: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 / // After: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /

138
finalseg/finalseg.go Normal file → Executable file
View File

@@ -10,88 +10,86 @@ var (
reSkip = regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`) reSkip = regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`)
) )
func cutHan(sentence string) chan string { func cutHan(sentence string) []string {
result := make(chan string) result := make([]string, 0, 10)
go func() {
runes := []rune(sentence) runes := []rune(sentence)
_, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'}) _, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
begin, next := 0, 0 begin, next := 0, 0
for i, char := range runes { for i, char := range runes {
pos := posList[i] pos := posList[i]
switch pos { switch pos {
case 'B': case 'B':
begin = i begin = i
case 'E': case 'E':
result <- string(runes[begin : i+1]) result = append(result, string(runes[begin:i+1]))
next = i + 1 next = i + 1
case 'S': case 'S':
result <- string(char) result = append(result, string(char))
next = i + 1 next = i + 1
}
} }
if next < len(runes) { }
result <- string(runes[next:]) if next < len(runes) {
} result = append(result, string(runes[next:]))
close(result) }
}()
return result return result
} }
// Cut cuts sentence into words using Hidden Markov Model with Viterbi // Cut cuts sentence into words using Hidden Markov Model with Viterbi
// algorithm. It is used by Jiebago for unknown words. // algorithm. It is used by Jiebago for unknown words.
func Cut(sentence string) chan string { func Cut(sentence string) []string {
result := make(chan string) result := make([]string, 0, 10)
s := sentence s := sentence
var hans string var hans string
var hanLoc []int var hanLoc []int
var nonhanLoc []int var nonhanLoc []int
go func() {
for { for {
hanLoc = reHan.FindStringIndex(s) hanLoc = reHan.FindStringIndex(s)
if hanLoc == nil { if hanLoc == nil {
if len(s) == 0 { if len(s) == 0 {
break break
} }
} else if hanLoc[0] == 0 { } else if hanLoc[0] == 0 {
hans = s[hanLoc[0]:hanLoc[1]] hans = s[hanLoc[0]:hanLoc[1]]
s = s[hanLoc[1]:] s = s[hanLoc[1]:]
for han := range cutHan(hans) { for _, han := range cutHan(hans) {
result <- han result = append(result, han)
} }
continue
}
nonhanLoc = reSkip.FindStringIndex(s)
if nonhanLoc == nil {
if len(s) == 0 {
break
}
} else if nonhanLoc[0] == 0 {
nonhans := s[nonhanLoc[0]:nonhanLoc[1]]
s = s[nonhanLoc[1]:]
if nonhans != "" {
result = append(result, nonhans)
continue continue
} }
nonhanLoc = reSkip.FindStringIndex(s)
if nonhanLoc == nil {
if len(s) == 0 {
break
}
} else if nonhanLoc[0] == 0 {
nonhans := s[nonhanLoc[0]:nonhanLoc[1]]
s = s[nonhanLoc[1]:]
if nonhans != "" {
result <- nonhans
continue
}
}
var loc []int
if hanLoc == nil && nonhanLoc == nil {
if len(s) > 0 {
result <- s
break
}
} else if hanLoc == nil {
loc = nonhanLoc
} else if nonhanLoc == nil {
loc = hanLoc
} else if hanLoc[0] < nonhanLoc[0] {
loc = hanLoc
} else {
loc = nonhanLoc
}
result <- s[:loc[0]]
s = s[loc[0]:]
} }
close(result) var loc []int
}() if hanLoc == nil && nonhanLoc == nil {
if len(s) > 0 {
result = append(result, s)
break
}
} else if hanLoc == nil {
loc = nonhanLoc
} else if nonhanLoc == nil {
loc = hanLoc
} else if hanLoc[0] < nonhanLoc[0] {
loc = hanLoc
} else {
loc = nonhanLoc
}
result = append(result, s[:loc[0]])
s = s[loc[0]:]
}
return result return result
} }

16
finalseg/finalseg_test.go Normal file → Executable file
View File

@@ -5,14 +5,6 @@ import (
"testing" "testing"
) )
func chanToArray(ch chan string) []string {
var result []string
for word := range ch {
result = append(result, word)
}
return result
}
func TestViterbi(t *testing.T) { func TestViterbi(t *testing.T) {
obs := "我们是程序员" obs := "我们是程序员"
states := []byte{'B', 'M', 'E', 'S'} states := []byte{'B', 'M', 'E', 'S'}
@@ -29,7 +21,7 @@ func TestViterbi(t *testing.T) {
func TestCutHan(t *testing.T) { func TestCutHan(t *testing.T) {
obs := "我们是程序员" obs := "我们是程序员"
result := chanToArray(cutHan(obs)) result := cutHan(obs)
if len(result) != 3 { if len(result) != 3 {
t.Fatal(result) t.Fatal(result)
} }
@@ -46,7 +38,7 @@ func TestCutHan(t *testing.T) {
func TestCut(t *testing.T) { func TestCut(t *testing.T) {
sentence := "我们是程序员" sentence := "我们是程序员"
result := chanToArray(Cut(sentence)) result := Cut(sentence)
if len(result) != 3 { if len(result) != 3 {
t.Fatal(len(result)) t.Fatal(len(result))
} }
@@ -59,11 +51,11 @@ func TestCut(t *testing.T) {
if result[2] != "程序员" { if result[2] != "程序员" {
t.Fatal(result[2]) t.Fatal(result[2])
} }
result2 := chanToArray(Cut("I'm a programmer!")) result2 := Cut("I'm a programmer!")
if len(result2) != 8 { if len(result2) != 8 {
t.Fatal(result2) t.Fatal(result2)
} }
result3 := chanToArray(Cut("程序员average年龄28.6岁。")) result3 := Cut("程序员average年龄28.6岁。")
if len(result3) != 6 { if len(result3) != 6 {
t.Fatal(result3) t.Fatal(result3)
} }

0
finalseg/prob_emit.go Normal file → Executable file
View File

0
finalseg/prob_trans.go Normal file → Executable file
View File

0
finalseg/viterbi.go Normal file → Executable file
View File

0
foobar.txt Normal file → Executable file
View File

29
go.mod Normal file
View File

@@ -0,0 +1,29 @@
module github.com/fumiama/jieba
go 1.19
require github.com/blevesearch/bleve v1.0.14
require (
github.com/RoaringBitmap/roaring v0.4.23 // indirect
github.com/blevesearch/go-porterstemmer v1.0.3 // indirect
github.com/blevesearch/mmap-go v1.0.2 // indirect
github.com/blevesearch/segment v0.9.0 // indirect
github.com/blevesearch/snowballstem v0.9.0 // indirect
github.com/blevesearch/zap/v11 v11.0.14 // indirect
github.com/blevesearch/zap/v12 v12.0.14 // indirect
github.com/blevesearch/zap/v13 v13.0.6 // indirect
github.com/blevesearch/zap/v14 v14.0.5 // indirect
github.com/blevesearch/zap/v15 v15.0.3 // indirect
github.com/couchbase/vellum v1.0.2 // indirect
github.com/glycerine/go-unsnap-stream v0.0.0-20181221182339-f9677308dec2 // indirect
github.com/golang/protobuf v1.3.2 // indirect
github.com/golang/snappy v0.0.1 // indirect
github.com/mschoch/smat v0.2.0 // indirect
github.com/philhofer/fwd v1.0.0 // indirect
github.com/steveyen/gtreap v0.1.0 // indirect
github.com/tinylib/msgp v1.1.0 // indirect
github.com/willf/bitset v1.1.10 // indirect
go.etcd.io/bbolt v1.3.5 // indirect
golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 // indirect
)

125
go.sum Normal file
View File

@@ -0,0 +1,125 @@
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/RoaringBitmap/roaring v0.4.23 h1:gpyfd12QohbqhFO4NVDUdoPOCXsyahYRQhINmlHxKeo=
github.com/RoaringBitmap/roaring v0.4.23/go.mod h1:D0gp8kJQgE1A4LQ5wFLggQEyvDi06Mq5mKs52e1TwOo=
github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8=
github.com/blevesearch/bleve v1.0.14 h1:Q8r+fHTt35jtGXJUM0ULwM3Tzg+MRfyai4ZkWDy2xO4=
github.com/blevesearch/bleve v1.0.14/go.mod h1:e/LJTr+E7EaoVdkQZTfoz7dt4KoDNvDbLb8MSKuNTLQ=
github.com/blevesearch/blevex v1.0.0 h1:pnilj2Qi3YSEGdWgLj1Pn9Io7ukfXPoQcpAI1Bv8n/o=
github.com/blevesearch/blevex v1.0.0/go.mod h1:2rNVqoG2BZI8t1/P1awgTKnGlx5MP9ZbtEciQaNhswc=
github.com/blevesearch/cld2 v0.0.0-20200327141045-8b5f551d37f5/go.mod h1:PN0QNTLs9+j1bKy3d/GB/59wsNBFC4sWLWG3k69lWbc=
github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo=
github.com/blevesearch/go-porterstemmer v1.0.3/go.mod h1:angGc5Ht+k2xhJdZi511LtmxuEf0OVpvUUNrwmM1P7M=
github.com/blevesearch/mmap-go v1.0.2 h1:JtMHb+FgQCTTYIhtMvimw15dJwu1Y5lrZDMOFXVWPk0=
github.com/blevesearch/mmap-go v1.0.2/go.mod h1:ol2qBqYaOUsGdm7aRMRrYGgPvnwLe6Y+7LMvAB5IbSA=
github.com/blevesearch/segment v0.9.0 h1:5lG7yBCx98or7gK2cHMKPukPZ/31Kag7nONpoBt22Ac=
github.com/blevesearch/segment v0.9.0/go.mod h1:9PfHYUdQCgHktBgvtUOF4x+pc4/l8rdH0u5spnW85UQ=
github.com/blevesearch/snowballstem v0.9.0 h1:lMQ189YspGP6sXvZQ4WZ+MLawfV8wOmPoD/iWeNXm8s=
github.com/blevesearch/snowballstem v0.9.0/go.mod h1:PivSj3JMc8WuaFkTSRDW2SlrulNWPl4ABg1tC/hlgLs=
github.com/blevesearch/zap/v11 v11.0.14 h1:IrDAvtlzDylh6H2QCmS0OGcN9Hpf6mISJlfKjcwJs7k=
github.com/blevesearch/zap/v11 v11.0.14/go.mod h1:MUEZh6VHGXv1PKx3WnCbdP404LGG2IZVa/L66pyFwnY=
github.com/blevesearch/zap/v12 v12.0.14 h1:2o9iRtl1xaRjsJ1xcqTyLX414qPAwykHNV7wNVmbp3w=
github.com/blevesearch/zap/v12 v12.0.14/go.mod h1:rOnuZOiMKPQj18AEKEHJxuI14236tTQ1ZJz4PAnWlUg=
github.com/blevesearch/zap/v13 v13.0.6 h1:r+VNSVImi9cBhTNNR+Kfl5uiGy8kIbb0JMz/h8r6+O4=
github.com/blevesearch/zap/v13 v13.0.6/go.mod h1:L89gsjdRKGyGrRN6nCpIScCvvkyxvmeDCwZRcjjPCrw=
github.com/blevesearch/zap/v14 v14.0.5 h1:NdcT+81Nvmp2zL+NhwSvGSLh7xNgGL8QRVZ67njR0NU=
github.com/blevesearch/zap/v14 v14.0.5/go.mod h1:bWe8S7tRrSBTIaZ6cLRbgNH4TUDaC9LZSpRGs85AsGY=
github.com/blevesearch/zap/v15 v15.0.3 h1:Ylj8Oe+mo0P25tr9iLPp33lN6d4qcztGjaIsP51UxaY=
github.com/blevesearch/zap/v15 v15.0.3/go.mod h1:iuwQrImsh1WjWJ0Ue2kBqY83a0rFtJTqfa9fp1rbVVU=
github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE=
github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk=
github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk=
github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k=
github.com/couchbase/moss v0.1.0/go.mod h1:9MaHIaRuy9pvLPUJxB8sh8OrLfyDczECVL37grCIubs=
github.com/couchbase/vellum v1.0.2 h1:BrbP0NKiyDdndMPec8Jjhy0U47CZ0Lgx3xUC2r9rZqw=
github.com/couchbase/vellum v1.0.2/go.mod h1:FcwrEivFpNi24R3jLOs3n+fs5RnuQnQqCLBJ1uAg1W4=
github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE=
github.com/cznic/b v0.0.0-20181122101859-a26611c4d92d h1:SwD98825d6bdB+pEuTxWOXiSjBrHdOl/UVp75eI7JT8=
github.com/cznic/b v0.0.0-20181122101859-a26611c4d92d/go.mod h1:URriBxXwVq5ijiJ12C7iIZqlA69nTlI+LgI6/pwftG8=
github.com/cznic/mathutil v0.0.0-20181122101859-297441e03548/go.mod h1:e6NPNENfs9mPDVNRekM7lKScauxd5kXTr1Mfyig6TDM=
github.com/cznic/strutil v0.0.0-20181122101858-275e90344537/go.mod h1:AHHPPPXTw0h6pVabbcbyGRK1DckRn7r/STdZEeIDzZc=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/facebookgo/ensure v0.0.0-20200202191622-63f1cf65ac4c/go.mod h1:Yg+htXGokKKdzcwhuNDwVvN+uBxDGXJ7G/VN1d8fa64=
github.com/facebookgo/stack v0.0.0-20160209184415-751773369052/go.mod h1:UbMTZqLaRiH3MsBH8va0n7s1pQYcu3uTb8G4tygF4Zg=
github.com/facebookgo/subset v0.0.0-20200203212716-c811ad88dec4/go.mod h1:5tD+neXqOorC30/tWg0LCSkrqj/AR6gu8yY8/fpw1q0=
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
github.com/glycerine/go-unsnap-stream v0.0.0-20181221182339-f9677308dec2 h1:Ujru1hufTHVb++eG6OuNDKMxZnGIvF6o/u8q/8h2+I4=
github.com/glycerine/go-unsnap-stream v0.0.0-20181221182339-f9677308dec2/go.mod h1:/20jfyN9Y5QPEAprSgKAUr+glWDY39ZiUEAYOEv5dsE=
github.com/glycerine/goconvey v0.0.0-20190410193231-58a59202ab31 h1:gclg6gY70GLy3PbkQ1AERPfmLMMagS60DKF78eWwLn8=
github.com/glycerine/goconvey v0.0.0-20190410193231-58a59202ab31/go.mod h1:Ogl1Tioa0aV7gstGFO7KhffUsb9M4ydbEbbxpcEDc24=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.2 h1:6nsPYzhq5kReh6QImI3k5qWzO4PEbvbIW2cwSfR/6xs=
github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4=
github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/gopherjs/gopherjs v0.0.0-20190910122728-9d188e94fb99 h1:twflg0XRTjwKpxb/jFExr4HGq6on2dEOmnL6FV+fgPw=
github.com/gopherjs/gopherjs v0.0.0-20190910122728-9d188e94fb99/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
github.com/ikawaha/kagome.ipadic v1.1.2/go.mod h1:DPSBbU0czaJhAb/5uKQZHMc9MTVRpDugJfX+HddPHHg=
github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8=
github.com/jmhodges/levigo v1.0.0 h1:q5EC36kV79HWeTBWsod3mG11EgStG3qArTKcvlksN1U=
github.com/jmhodges/levigo v1.0.0/go.mod h1:Q6Qx+uH3RAqyK4rFQroq9RL7mdkABMcfhEI+nNuzMJQ=
github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
github.com/kljensen/snowball v0.6.0/go.mod h1:27N7E8fVU5H68RlUmnWwZCfxgt4POBJfENGMvNRhldw=
github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
github.com/mschoch/smat v0.0.0-20160514031455-90eadee771ae/go.mod h1:qAyveg+e4CE+eKJXWVjKXM4ck2QobLqTDytGJbLLhJg=
github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM=
github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw=
github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY=
github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic=
github.com/philhofer/fwd v1.0.0 h1:UbZqGr5Y38ApvM/V/jEljVxwocdweyH+vmYvRPBnbqQ=
github.com/philhofer/fwd v1.0.0/go.mod h1:gk3iGcWd9+svBvR0sR+KPcfE+RNWozjowpeBVG3ZVNU=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rcrowley/go-metrics v0.0.0-20190826022208-cac0b30c2563/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4=
github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g=
github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ=
github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE=
github.com/spf13/cobra v0.0.5/go.mod h1:3K3wKZymM7VvHMDS9+Akkh4K60UwM26emMESw8tLCHU=
github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo=
github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4=
github.com/spf13/viper v1.3.2/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s=
github.com/steveyen/gtreap v0.1.0 h1:CjhzTa274PyJLJuMZwIzCO1PfC00oRa8d1Kc78bFXJM=
github.com/steveyen/gtreap v0.1.0/go.mod h1:kl/5J7XbrOmlIbYIXdRHDDE5QxHqpk0cmkT7Z4dM9/Y=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/syndtr/goleveldb v1.0.0 h1:fBdIW9lB4Iz0n9khmH8w27SJ3QEJ7+IgjPEwGSZiFdE=
github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ=
github.com/tebeka/snowball v0.4.2/go.mod h1:4IfL14h1lvwZcp1sfXuuc7/7yCsvVffTWxWxCLfFpYg=
github.com/tecbot/gorocksdb v0.0.0-20191217155057-f0fad39f321c h1:g+WoO5jjkqGAzHWCjJB1zZfXPIAaDpzXIEJ0eS6B5Ok=
github.com/tecbot/gorocksdb v0.0.0-20191217155057-f0fad39f321c/go.mod h1:ahpPrc7HpcfEWDQRZEmnXMzHY03mLDYMCxeDzy46i+8=
github.com/tinylib/msgp v1.1.0 h1:9fQd+ICuRIu/ue4vxJZu6/LzxN0HwMds2nq/0cFvxHU=
github.com/tinylib/msgp v1.1.0/go.mod h1:+d+yLhGm8mzTaHzB+wgMYrodPfmZrzkirds8fDWklFE=
github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0=
github.com/willf/bitset v1.1.10 h1:NotGKqX0KwQ72NUzqrjZq5ipPNDQex9lo3WpaS8L2sc=
github.com/willf/bitset v1.1.10/go.mod h1:RjeCKbqT1RxIR/KWY6phxZiaY1IyutSBfGjNPySAYV4=
github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
go.etcd.io/bbolt v1.3.5 h1:XAzx9gjCb0Rxj7EoqcClPD1d5ZBxZJk0jbuoPHenBt0=
go.etcd.io/bbolt v1.3.5/go.mod h1:G5EMThwa9y8QZGBClrRx5EY+Yw9kAhnjy3bSjsnlVTQ=
golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20181205085412-a5c9d58dba9a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20181221143128-b4a75ba826a6/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 h1:LfCXLvNmTYH9kEmVgqbnsWfruoXZIrh4YBgqVHtDvw0=
golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=

293
jieba.go Normal file → Executable file
View File

@@ -6,9 +6,9 @@ import (
"regexp" "regexp"
"strings" "strings"
"github.com/wangbin/jiebago/dictionary" "github.com/fumiama/jieba/dictionary"
"github.com/wangbin/jiebago/finalseg" "github.com/fumiama/jieba/finalseg"
"github.com/wangbin/jiebago/util" "github.com/fumiama/jieba/util"
) )
var ( var (
@@ -72,7 +72,7 @@ func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
} }
} else { } else {
word := words[0] word := words[0]
for segment := range seg.Cut(word, false) { for _, segment := range seg.Cut(word, false) {
if freq, ok := seg.dict.Frequency(segment); ok { if freq, ok := seg.dict.Frequency(segment); ok {
frequency *= freq frequency *= freq
} }
@@ -165,95 +165,93 @@ func (seg *Segmenter) calc(runes []rune) map[int]route {
return rs return rs
} }
type cutFunc func(sentence string) <-chan string // ratio words and letters in an article commonly
const (
RatioLetterWord float32 = 1.5
RatioLetterWordFull float32 = 1
)
func (seg *Segmenter) cutDAG(sentence string) <-chan string { type cutFunc func(sentence string) []string
result := make(chan string)
go func() { func (seg *Segmenter) cutDAG(sentence string) []string {
runes := []rune(sentence) result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1)
routes := seg.calc(runes)
var y int runes := []rune(sentence)
length := len(runes) routes := seg.calc(runes)
var buf []rune var y int
for x := 0; x < length; { length := len(runes)
y = routes[x].index + 1 var buf []rune
frag := runes[x:y] for x := 0; x < length; {
if y-x == 1 { y = routes[x].index + 1
buf = append(buf, frag...) frag := runes[x:y]
} else { if y-x == 1 {
if len(buf) > 0 { buf = append(buf, frag...)
bufString := string(buf) } else {
if len(buf) == 1 { if len(buf) > 0 {
result <- bufString bufString := string(buf)
if len(buf) == 1 {
result = append(result, bufString)
} else {
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
result = append(result, finalseg.Cut(bufString)...)
} else { } else {
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 { for _, elem := range buf {
for x := range finalseg.Cut(bufString) { result = append(result, string(elem))
result <- x
}
} else {
for _, elem := range buf {
result <- string(elem)
}
} }
} }
buf = make([]rune, 0)
} }
result <- string(frag) buf = make([]rune, 0)
} }
x = y result = append(result, string(frag))
} }
x = y
}
if len(buf) > 0 { if len(buf) > 0 {
bufString := string(buf) bufString := string(buf)
if len(buf) == 1 { if len(buf) == 1 {
result <- bufString result = append(result, bufString)
} else {
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
result = append(result, finalseg.Cut(bufString)...)
} else { } else {
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 { for _, elem := range buf {
for t := range finalseg.Cut(bufString) { result = append(result, string(elem))
result <- t
}
} else {
for _, elem := range buf {
result <- string(elem)
}
} }
} }
} }
close(result) }
}()
return result return result
} }
func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string { func (seg *Segmenter) cutDAGNoHMM(sentence string) []string {
result := make(chan string) result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1)
go func() { runes := []rune(sentence)
runes := []rune(sentence) routes := seg.calc(runes)
routes := seg.calc(runes) var y int
var y int length := len(runes)
length := len(runes) var buf []rune
var buf []rune for x := 0; x < length; {
for x := 0; x < length; { y = routes[x].index + 1
y = routes[x].index + 1 frag := runes[x:y]
frag := runes[x:y] if reEng.MatchString(string(frag)) && len(frag) == 1 {
if reEng.MatchString(string(frag)) && len(frag) == 1 { buf = append(buf, frag...)
buf = append(buf, frag...)
x = y
continue
}
if len(buf) > 0 {
result <- string(buf)
buf = make([]rune, 0)
}
result <- string(frag)
x = y x = y
continue
} }
if len(buf) > 0 { if len(buf) > 0 {
result <- string(buf) result = append(result, string(buf))
buf = make([]rune, 0) buf = make([]rune, 0)
} }
close(result) result = append(result, string(frag))
}() x = y
}
if len(buf) > 0 {
result = append(result, string(buf))
}
return result return result
} }
@@ -261,8 +259,8 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string {
// Parameter hmm controls whether to use the Hidden Markov Model. // Parameter hmm controls whether to use the Hidden Markov Model.
// Accurate mode attempts to cut the sentence into the most accurate // Accurate mode attempts to cut the sentence into the most accurate
// segmentations, which is suitable for text analysis. // segmentations, which is suitable for text analysis.
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan string { func (seg *Segmenter) Cut(sentence string, hmm bool) []string {
result := make(chan string) result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1)
var cut cutFunc var cut cutFunc
if hmm { if hmm {
cut = seg.cutDAG cut = seg.cutDAG
@@ -270,84 +268,74 @@ func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan string {
cut = seg.cutDAGNoHMM cut = seg.cutDAGNoHMM
} }
go func() { for _, block := range util.RegexpSplit(reHanDefault, sentence, -1) {
for _, block := range util.RegexpSplit(reHanDefault, sentence, -1) { if len(block) == 0 {
if len(block) == 0 { continue
}
if reHanDefault.MatchString(block) {
result = append(result, cut(block)...)
continue
}
for _, subBlock := range util.RegexpSplit(reSkipDefault, block, -1) {
if reSkipDefault.MatchString(subBlock) {
result = append(result, subBlock)
continue continue
} }
if reHanDefault.MatchString(block) { for _, r := range subBlock {
for x := range cut(block) { result = append(result, string(r))
result <- x
}
continue
}
for _, subBlock := range util.RegexpSplit(reSkipDefault, block, -1) {
if reSkipDefault.MatchString(subBlock) {
result <- subBlock
continue
}
for _, r := range subBlock {
result <- string(r)
}
} }
} }
close(result) }
}()
return result return result
} }
func (seg *Segmenter) cutAll(sentence string) <-chan string { func (seg *Segmenter) cutAll(sentence string) []string {
result := make(chan string) result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1)
go func() {
runes := []rune(sentence) runes := []rune(sentence)
dag := seg.dag(runes) dag := seg.dag(runes)
start := -1 start := -1
ks := make([]int, len(dag)) ks := make([]int, len(dag))
for k := range dag { for k := range dag {
ks[k] = k ks[k] = k
}
var l []int
for k := range ks {
l = dag[k]
if len(l) == 1 && k > start {
result = append(result, string(runes[k:l[0]+1]))
start = l[0]
continue
} }
var l []int for _, j := range l {
for k := range ks { if j > k {
l = dag[k] result = append(result, string(runes[k:j+1]))
if len(l) == 1 && k > start { start = j
result <- string(runes[k : l[0]+1])
start = l[0]
continue
}
for _, j := range l {
if j > k {
result <- string(runes[k : j+1])
start = j
}
} }
} }
close(result) }
}()
return result return result
} }
// CutAll cuts a sentence into words using full mode. // CutAll cuts a sentence into words using full mode.
// Full mode gets all the possible words from the sentence. // Full mode gets all the possible words from the sentence.
// Fast but not accurate. // Fast but not accurate.
func (seg *Segmenter) CutAll(sentence string) <-chan string { func (seg *Segmenter) CutAll(sentence string) []string {
result := make(chan string) result := make([]string, 0, int(float32(len(sentence))/RatioLetterWordFull)+1)
go func() {
for _, block := range util.RegexpSplit(reHanCutAll, sentence, -1) { for _, block := range util.RegexpSplit(reHanCutAll, sentence, -1) {
if len(block) == 0 { if len(block) == 0 {
continue continue
}
if reHanCutAll.MatchString(block) {
for x := range seg.cutAll(block) {
result <- x
}
continue
}
for _, subBlock := range reSkipCutAll.Split(block, -1) {
result <- subBlock
}
} }
close(result) if reHanCutAll.MatchString(block) {
}() result = append(result, seg.cutAll(block)...)
continue
}
result = append(result, reSkipCutAll.Split(block, -1)...)
}
return result return result
} }
@@ -355,26 +343,25 @@ func (seg *Segmenter) CutAll(sentence string) <-chan string {
// Search engine mode, based on the accurate mode, attempts to cut long words // Search engine mode, based on the accurate mode, attempts to cut long words
// into several short words, which can raise the recall rate. // into several short words, which can raise the recall rate.
// Suitable for search engines. // Suitable for search engines.
func (seg *Segmenter) CutForSearch(sentence string, hmm bool) <-chan string { func (seg *Segmenter) CutForSearch(sentence string, hmm bool) []string {
result := make(chan string) result := make([]string, 0, int(float32(len(sentence))/RatioLetterWordFull)+1)
go func() {
for word := range seg.Cut(sentence, hmm) { for _, word := range seg.Cut(sentence, hmm) {
runes := []rune(word) runes := []rune(word)
for _, increment := range []int{2, 3} { for _, increment := range []int{2, 3} {
if len(runes) <= increment { if len(runes) <= increment {
continue continue
} }
var gram string var gram string
for i := 0; i < len(runes)-increment+1; i++ { for i := 0; i < len(runes)-increment+1; i++ {
gram = string(runes[i : i+increment]) gram = string(runes[i : i+increment])
if v, ok := seg.dict.Frequency(gram); ok && v > 0.0 { if v, ok := seg.dict.Frequency(gram); ok && v > 0.0 {
result <- gram result = append(result, gram)
}
} }
} }
result <- word
} }
close(result) result = append(result, word)
}() }
return result return result
} }

1060
jieba_test.go Normal file → Executable file

File diff suppressed because it is too large Load Diff

0
posseg/char_state_tab.go Normal file → Executable file
View File

0
posseg/char_state_tab_test.go Normal file → Executable file
View File

2
posseg/dictionary.go Normal file → Executable file
View File

@@ -4,7 +4,7 @@ import (
"math" "math"
"sync" "sync"
"github.com/wangbin/jiebago/dictionary" "github.com/fumiama/jieba/dictionary"
) )
// A Dictionary represents a thread-safe dictionary used for word segmentation. // A Dictionary represents a thread-safe dictionary used for word segmentation.

2
posseg/example_test.go Normal file → Executable file
View File

@@ -3,7 +3,7 @@ package posseg_test
import ( import (
"fmt" "fmt"
"github.com/wangbin/jiebago/posseg" "github.com/fumiama/jieba/posseg"
) )
func Example() { func Example() {

2
posseg/posseg.go Normal file → Executable file
View File

@@ -5,7 +5,7 @@ import (
"math" "math"
"regexp" "regexp"
"github.com/wangbin/jiebago/util" "github.com/fumiama/jieba/util"
) )
var ( var (

0
posseg/posseg_test.go Normal file → Executable file
View File

0
posseg/prob_emit.go Normal file → Executable file
View File

0
posseg/prob_start.go Normal file → Executable file
View File

0
posseg/prob_trans.go Normal file → Executable file
View File

0
posseg/viterbi.go Normal file → Executable file
View File

0
posseg/viterbi_test.go Normal file → Executable file
View File

30
tokenizers/analyzer.go Executable file
View File

@@ -0,0 +1,30 @@
package tokenizers
import (
"errors"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
// JiebaAnalyzer is a field-less type declared alongside the "jieba" analyzer
// constructor.
// NOTE(review): nothing in this file references it — confirm whether it is
// still needed.
type JiebaAnalyzer struct{}
// analyzerConstructor builds an analysis.Analyzer from a configuration map.
//
// config must hold a string under the key "tokenizer"; that name is resolved
// through cache.TokenizerNamed. If the key is missing or its value is not a
// string, the error "must specify tokenizer" is returned. Any error from the
// tokenizer lookup is returned unchanged. On success the analyzer has only its
// Tokenizer field set.
func analyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	name, isString := config["tokenizer"].(string)
	if !isString {
		return nil, errors.New("must specify tokenizer")
	}

	tok, lookupErr := cache.TokenizerNamed(name)
	if lookupErr != nil {
		return nil, lookupErr
	}

	return &analysis.Analyzer{Tokenizer: tok}, nil
}
// init registers analyzerConstructor with the bleve registry under the
// analyzer name "jieba" when the package is loaded.
func init() {
registry.RegisterAnalyzer("jieba", analyzerConstructor)
}

20
tokenizers/example_bleve_test.go Normal file → Executable file
View File

@@ -6,7 +6,7 @@ import (
"os" "os"
"github.com/blevesearch/bleve" "github.com/blevesearch/bleve"
_ "github.com/wangbin/jiebago/tokenizers" _ "github.com/fumiama/jieba/tokenizers"
) )
func Example_beleveSearch() { func Example_beleveSearch() {
@@ -101,26 +101,26 @@ func Example_beleveSearch() {
// Output: // Output:
// Result of "水果世博园": 2 matches: // Result of "水果世博园": 2 matches:
// 1. Doc 3, (1.099550) // 1. Doc 3, (1.099550)
// Name: 买<span class="highlight">水果</span>然后来<span class="highlight">世博</span>园。 // Name: 买<mark>水果</mark>然后来<mark>世博</mark>园。
// 2. Doc 2, (0.031941) // 2. Doc 2, (0.031941)
// Name: The second one 你 中文测试中文 is even more interesting! 吃<span class="highlight">水果</span> // Name: The second one 你 中文测试中文 is even more interesting! 吃<mark>水果</mark>
// Result of "你": 1 matches: // Result of "你": 1 matches:
// 1. Doc 2, (0.391161) // 1. Doc 2, (0.391161)
// Name: The second one <span class="highlight">你</span> 中文测试中文 is even more interesting! 吃水果 // Name: The second one <mark>你</mark> 中文测试中文 is even more interesting! 吃水果
// Result of "first": 1 matches: // Result of "first": 1 matches:
// 1. Doc 1, (0.512150) // 1. Doc 1, (0.512150)
// Name: This is the <span class="highlight">first</span> document weve added // Name: This is the <mark>first</mark> document weve added
// Result of "中文": 1 matches: // Result of "中文": 1 matches:
// 1. Doc 2, (0.553186) // 1. Doc 2, (0.553186)
// Name: The second one 你 <span class="highlight">中文</span>测试<span class="highlight">中文</span> is even more interesting! 吃水果 // Name: The second one 你 <mark>中文</mark>测试<mark>中文</mark> is even more interesting! 吃水果
// Result of "交换机": 2 matches: // Result of "交换机": 2 matches:
// 1. Doc 4, (0.608495) // 1. Doc 4, (0.608495)
// Name: 工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换机</span>等技术性器件的安装工作 // Name: 工信处女干事每月经过下属科室都要亲口交代24口<mark>交换机</mark>等技术性器件的安装工作
// 2. Doc 5, (0.086700) // 2. Doc 5, (0.086700)
// Name: 咱俩<span class="highlight">交换</span>一下吧。 // Name: 咱俩<mark>交换</mark>一下吧。
// Result of "交换": 2 matches: // Result of "交换": 2 matches:
// 1. Doc 5, (0.534158) // 1. Doc 5, (0.534158)
// Name: 咱俩<span class="highlight">交换</span>一下吧。 // Name: 咱俩<mark>交换</mark>一下吧。
// 2. Doc 4, (0.296297) // 2. Doc 4, (0.296297)
// Name: 工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换</span>机等技术性器件的安装工作 // Name: 工信处女干事每月经过下属科室都要亲口交代24口<mark>交换</mark>机等技术性器件的安装工作
} }

2
tokenizers/example_test.go Normal file → Executable file
View File

@@ -3,7 +3,7 @@ package tokenizers_test
import ( import (
"fmt" "fmt"
"github.com/wangbin/jiebago/tokenizers" "github.com/fumiama/jieba/tokenizers"
) )
func Example() { func Example() {

34
tokenizers/tokenizer.go Normal file → Executable file
View File

@@ -7,7 +7,7 @@ import (
"github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/registry"
"github.com/wangbin/jiebago" jiebago "github.com/fumiama/jieba"
) )
// Name is the jieba tokenizer name. // Name is the jieba tokenizer name.
@@ -26,20 +26,20 @@ NewJiebaTokenizer creates a new JiebaTokenizer.
Parameters: Parameters:
dictFilePath: path of the dictionary file. dictFilePath: path of the dictionary file.
hmm: whether to use Hidden Markov Model to cut unknown words, hmm: whether to use Hidden Markov Model to cut unknown words,
i.e. not found in dictionary. For example word "安卓" (means "Android" in i.e. not found in dictionary. For example word "安卓" (means "Android" in
English) not in the dictionary file. If hmm is set to false, it will be English) not in the dictionary file. If hmm is set to false, it will be
cut into two single words "安" and "卓", if hmm is set to true, it will cut into two single words "安" and "卓", if hmm is set to true, it will
be treated as one single word because Jieba uses Hidden Markov Model with be treated as one single word because Jieba uses Hidden Markov Model with
Viterbi algorithm to guess the best possibility. Viterbi algorithm to guess the best possibility.
searchMode: whether to further cut long words into several short words. searchMode: whether to further cut long words into several short words.
In Chinese, some long words may contain other words, for example "交换机" In Chinese, some long words may contain other words, for example "交换机"
is a Chinese word for "Switcher", if searchMode is false, it will treat is a Chinese word for "Switcher", if searchMode is false, it will treat
"交换机" as a single word. If searchMode is true, it will further split "交换机" as a single word. If searchMode is true, it will further split
this word into "交换", "换机", which are valid Chinese words. this word into "交换", "换机", which are valid Chinese words.
*/ */
func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) { func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
var seg jiebago.Segmenter var seg jiebago.Segmenter
@@ -60,7 +60,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
pos := 1 pos := 1
var width int var width int
var gram string var gram string
for word := range jt.seg.Cut(string(input), jt.hmm) { for _, word := range jt.seg.Cut(string(input), jt.hmm) {
if jt.searchMode { if jt.searchMode {
runes := []rune(word) runes := []rune(word)
width = len(runes) width = len(runes)
@@ -107,11 +107,11 @@ JiebaTokenizerConstructor creates a JiebaTokenizer.
Parameter config should contain at least one parameter: Parameter config should contain at least one parameter:
file: the path of the dictionary file. file: the path of the dictionary file.
hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details. hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
search: optional, specify whether to use search mode, see NewJiebaTokenizer for details. search: optional, specify whether to use search mode, see NewJiebaTokenizer for details.
*/ */
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) ( func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
analysis.Tokenizer, error) { analysis.Tokenizer, error) {

0
tokenizers/tokenizer_test.go Normal file → Executable file
View File

0
userdict.txt Normal file → Executable file
View File

0
util/util.go Normal file → Executable file
View File

0
util/util_test.go Normal file → Executable file
View File