mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-08 18:40:24 +08:00
Merge branch 'release/v0.3'
This commit is contained in:
408
README.md
408
README.md
@@ -1,403 +1,69 @@
|
||||
#结巴分词 Go 语言版:jiebago
|
||||
# 结巴分词 Go 语言版:Jiebago
|
||||
|
||||
|
||||
[](https://travis-ci.org/wangbin/jiebago)
|
||||
[](https://travis-ci.org/wangbin/jiebago) [](https://godoc.org/github.com/wangbin/jiebago)
|
||||
|
||||
[结巴分词](https://github.com/fxsjy/jieba)是[@fxsjy](https://github.com/fxsjy)用Python编写的中文分词组件,jiebago是结巴分词的Go语言实现,目前已经实现的功能包括:三种模式分词、自定义词典、关键词提取和词性标注。
|
||||
[结巴分词](https://github.com/fxsjy/jieba) 是由 [@fxsjy](https://github.com/fxsjy) 使用 Python 编写的中文分词组件,Jiebago 是结巴分词的 Go 语言实现。
|
||||
|
||||
|
||||
## 安装
|
||||
|
||||
```
|
||||
go get github.com/wangbin/jiebago/...
|
||||
```
|
||||
|
||||
go get github.com/wangbin/jiebago/...
|
||||
|
||||
## 分词
|
||||
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/wangbin/jiebago"
|
||||
)
|
||||
|
||||
var sentence = "我来到北京清华大学"
|
||||
|
||||
func print(ch chan string) {
|
||||
for word := range ch {
|
||||
fmt.Printf("%s / ", word)
|
||||
}
|
||||
fmt.Println()
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
func main() {
|
||||
jiebago.SetDictionary("/Path/to/dictionary/file") // 设定字典
|
||||
fmt.Print("【全模式】: ")
|
||||
print(jiebago.Cut(sentence, true, true))
|
||||
fmt.Print("【精确模式】: ")
|
||||
print(jiebago.Cut(sentence, false, true))
|
||||
fmt.Print("【新词识别】:")
|
||||
print(jiebago.Cut("他来到了网易杭研大厦", false, true))
|
||||
fmt.Print("【搜索引擎模式】:")
|
||||
print(jiebago.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", true))
|
||||
}
|
||||
|
||||
使用结巴分词自带的[词典文件](https://github.com/fxsjy/jieba/blob/master/jieba/dict.txt),输出结果如下:
|
||||
|
||||
【全模式】: 我 / 来到 / 北京 / 清华 / 清华大学 / 华大 / 大学 /
|
||||
|
||||
【精确模式】: 我 / 来到 / 北京 / 清华大学 /
|
||||
|
||||
【新词识别】:他 / 来到 / 了 / 网易 / 杭研 / 大厦 /
|
||||
|
||||
【搜索引擎模式】:小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / , / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 /
|
||||
|
||||
## 添加自定义词典
|
||||
|
||||
|
||||
var sentence = "李小福是创新办主任也是云计算方面的专家"
|
||||
fmt.Print("Before: ")
|
||||
print(jiebago.Cut(sentence, false, true))
|
||||
jiebago.LoadUserDict("/Path/to/user/dictionary/file")
|
||||
fmt.Print("After: ")
|
||||
print(jiebago.Cut(sentence, false, true))
|
||||
|
||||
使用结巴分词自带的[词典文件](https://github.com/fxsjy/jieba/blob/master/jieba/dict.txt)和[用户自定义词典文件](https://github.com/fxsjy/jieba/blob/master/test/userdict.txt),结果输出如下:
|
||||
|
||||
Before: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
|
||||
|
||||
After: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
|
||||
|
||||
## 关键词提取
|
||||
|
||||
示例代码:
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/wangbin/jiebago/analyse"
|
||||
)
|
||||
|
||||
var sentence = "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。"
|
||||
|
||||
func main() {
|
||||
analyse.SetDictionary("/Path/to/dictionary/file")
|
||||
analyse.SetIdf("/Path/to/idf/file")
|
||||
for _, ww := range analyse.ExtractTags(sentence, 20) {
|
||||
fmt.Printf("%s / ", ww.Word)
|
||||
}
|
||||
}
|
||||
|
||||
输出:
|
||||
|
||||
Python / C++ / 伸手不见五指 / 孙悟空 / 黑夜 / 北京 / 这是 / 一个 /
|
||||
|
||||
## 基于TextRank算法的关键词抽取实现
|
||||
|
||||
示例代码:
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/wangbin/jiebago/analyse"
|
||||
)
|
||||
|
||||
func main() {
|
||||
sentence := "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚 置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
|
||||
|
||||
analyse.SetDictionary("/Path/to/dictionary/file")
|
||||
result := analyse.TextRank(sentence, 10)
|
||||
for _, wt := range result {
|
||||
fmt.Printf("%s %f\n", wt.Word, wt.Freq)
|
||||
}
|
||||
}
|
||||
|
||||
输出:
|
||||
|
||||
吉林 1.000000
|
||||
欧亚 0.878078
|
||||
置业 0.562048
|
||||
实现 0.520906
|
||||
收入 0.384284
|
||||
增资 0.360591
|
||||
子公司 0.353132
|
||||
城市 0.307509
|
||||
全资 0.306324
|
||||
商业 0.306138
|
||||
|
||||
## 词性标注
|
||||
|
||||
示例代码:
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/wangbin/jiebago"
|
||||
"github.com/wangbin/jiebago/posseg"
|
||||
)
|
||||
|
||||
var sentence = "我爱北京天安门"
|
||||
|
||||
func main() {
|
||||
posseg.SetDictionary("/Path/to/dictionary/file")
|
||||
for wt := range posseg.Cut(sentence, true) {
|
||||
fmt.Printf("%s %s\n", wt.Word, wt.Tag)
|
||||
}
|
||||
}
|
||||
|
||||
输出:
|
||||
|
||||
我 r
|
||||
爱 v
|
||||
北京 ns
|
||||
天安门 ns
|
||||
|
||||
|
||||
## 并行分词
|
||||
|
||||
因为Go有强大的goroutine特性,并行分词实现起来非常简单,所以并没有内置到jiebago中,而是由使用者自己实现,下面是一个简单的例子:
|
||||
|
||||
lineCount := 0
|
||||
inputFile, _ := os.Open(FileName)
|
||||
defer inputFile.Close()
|
||||
scanner := bufio.NewScanner(inputFile)
|
||||
ch := make(chan []string, 1)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
fileLength += len([]rune(line))
|
||||
lineCount += 1
|
||||
go func() {
|
||||
for word := range jiebago.Cut(line, false, true) {
|
||||
ch <- word
|
||||
}
|
||||
}()
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
outputFile, _ := os.OpenFile("parallelCut.log", os.O_CREATE|os.O_WRONLY, 0600)
|
||||
defer outputFile.Close()
|
||||
writer := bufio.NewWriter(outputFile)
|
||||
results := make([]string, 0)
|
||||
for {
|
||||
if lineCount <= 0 {
|
||||
break
|
||||
}
|
||||
result, ok := <-ch
|
||||
if ok {
|
||||
results = append(results, result...)
|
||||
lineCount -= 1
|
||||
}
|
||||
}
|
||||
writer.WriteString(strings.Join(results, "/ "))
|
||||
writer.Flush()
|
||||
|
||||
|
||||
## Tokenize:返回词语在原文的起始位置
|
||||
|
||||
|
||||
注意新版的 Jiebago Tokenizer 实现了 Bleve 的 Tokenizer 接口,跟之前的实现有很大的变化:
|
||||
|
||||
1. 接受的参数必须是 []byte。
|
||||
2. 输出的 Token 的起始和终止位置是 byte 的位置,不是之前的 rune 的位置,所以和 Python 版的 Jieba.tokenize 输出不一致。
|
||||
## 使用
|
||||
|
||||
```
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/wangbin/jiebago/tokenizers"
|
||||
"fmt"
|
||||
|
||||
"github.com/wangbin/jiebago"
|
||||
)
|
||||
|
||||
const DictPath = "/path/to/dict.txt"
|
||||
|
||||
var sentence = []byte("永和服装饰品有限公司")
|
||||
|
||||
func main() {
|
||||
// default mode
|
||||
tokenizer, _ := tokenizers.NewJiebaTokenizer(DictPath, true, false) for _, token := range tokenizer.Tokenize(sentence) {
|
||||
fmt.Printf(
|
||||
"Term: %s\t Start: %d \t End: %d\t Position: %d\t Type: %d\n",
|
||||
token.Term, token.Start, token.End, token.Position, token.Type)
|
||||
}
|
||||
|
||||
//search mode
|
||||
tokenizer, _ = tokenizers.NewJiebaTokenizer(DictPath, true, true)
|
||||
for _, token := range tokenizer.Tokenize(sentence) {
|
||||
fmt.Printf(
|
||||
"Term: %s\t Start: %d \t End: %d\t Position: %d\t Type: %d\n",
|
||||
token.Term, token.Start, token.End, token.Position, token.Type)
|
||||
}
|
||||
var seg jiebago.Segmenter
|
||||
|
||||
func init() {
|
||||
seg.LoadDictionary("dict.txt")
|
||||
}
|
||||
|
||||
```
|
||||
默认模式输出:
|
||||
|
||||
```
|
||||
Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
|
||||
Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
|
||||
Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
|
||||
Term: 有限公司 Start: 18 End: 30 Position: 4 Type: 1
|
||||
```
|
||||
搜索模式输出:
|
||||
|
||||
```
|
||||
Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
|
||||
Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
|
||||
Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
|
||||
Term: 有限 Start: 18 End: 24 Position: 4 Type: 1
|
||||
Term: 公司 Start: 24 End: 30 Position: 5 Type: 1
|
||||
Term: 有限公司 Start: 18 End: 30 Position: 6 Type: 1
|
||||
```
|
||||
### 配合 bleve 进行中文全文检索
|
||||
|
||||
[bleve](http://www.blevesearch.com/) 是一个 Go 语言实现的全文索引系统,jiebago 可以配合 bleve 使用实现中文的全文检索。一个简单的用法示例:
|
||||
|
||||
```
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/blevesearch/bleve"
|
||||
_ "github.com/wangbin/jiebago/analyse/tokenizers"
|
||||
"log"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// open a new index
|
||||
indexMapping := bleve.NewIndexMapping()
|
||||
|
||||
err := indexMapping.AddCustomTokenizer("jieba",
|
||||
map[string]interface{}{
|
||||
"file": "/Users/wangbin/mygo/src/github.com/wangbin/jiebago/dict.txt",
|
||||
"type": "jieba",
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
err = indexMapping.AddCustomAnalyzer("jieba",
|
||||
map[string]interface{}{
|
||||
"type": "custom",
|
||||
"tokenizer": "jieba",
|
||||
"token_filters": []string{
|
||||
"possessive_en",
|
||||
"to_lower",
|
||||
"stop_en",
|
||||
},
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
indexMapping.DefaultAnalyzer = "jieba"
|
||||
|
||||
index, err := bleve.New("example.bleve", indexMapping)
|
||||
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
indexMapping.DefaultAnalyzer = "jieba"
|
||||
|
||||
index, err := bleve.New("example.bleve", indexMapping)
|
||||
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
docs := []struct {
|
||||
Title string
|
||||
Name string
|
||||
}{
|
||||
{
|
||||
Title: "Doc 1",
|
||||
Name: "This is the first document we’ve added",
|
||||
},
|
||||
{
|
||||
Title: "Doc 2",
|
||||
Name: "The second one 你 中文测试中文 is even more interesting! 吃水果",
|
||||
},
|
||||
{
|
||||
Title: "Doc 3",
|
||||
Name: "买水果然后来世博园。",
|
||||
},
|
||||
{
|
||||
Title: "Doc 4",
|
||||
Name: "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
|
||||
},
|
||||
{
|
||||
Title: "Doc 5",
|
||||
Name: "咱俩交换一下吧。",
|
||||
},
|
||||
}
|
||||
// index docs
|
||||
for _, doc := range docs {
|
||||
index.Index(doc.Title, doc)
|
||||
}
|
||||
|
||||
// search for some text
|
||||
for _, keyword := range []string{"水果世博园", "你", "first", "中文", "交换机", "交换"} {
|
||||
query := bleve.NewMatchQuery(keyword)
|
||||
search := bleve.NewSearchRequest(query)
|
||||
search.Highlight = bleve.NewHighlight()
|
||||
searchResults, err := index.Search(search)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
func print(ch <-chan string) {
|
||||
for word := range ch {
|
||||
fmt.Printf(" %s /", word)
|
||||
}
|
||||
fmt.Printf("Result of %s: %s\n", keyword, searchResults)
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
func Example() {
|
||||
fmt.Print("【全模式】:")
|
||||
print(seg.CutAll("我来到北京清华大学"))
|
||||
|
||||
fmt.Print("【精确模式】:")
|
||||
print(seg.Cut("我来到北京清华大学", false))
|
||||
|
||||
fmt.Print("【新词识别】:")
|
||||
print(seg.Cut("他来到了网易杭研大厦", true))
|
||||
|
||||
fmt.Print("【搜索引擎模式】:")
|
||||
print(seg.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", true))
|
||||
}
|
||||
```
|
||||
输出结果:
|
||||
|
||||
```
|
||||
Result of 水果世博园: 2 matches, showing 1 through 2, took 377.988µs
|
||||
1. Doc 3 (1.099550)
|
||||
Name
|
||||
买<span class="highlight">水果</span>然后来<span class="highlight">世博</span>园。
|
||||
2. Doc 2 (0.031941)
|
||||
Name
|
||||
The second one 你 中文测试中文 is even more interesting! 吃<span class="highlight">水果</span>
|
||||
【全模式】: 我 / 来到 / 北京 / 清华 / 清华大学 / 华大 / 大学 /
|
||||
|
||||
Result of 你: 1 matches, showing 1 through 1, took 103.367µs
|
||||
1. Doc 2 (0.391161)
|
||||
Name
|
||||
The second one <span class="highlight">你</span> 中文测试中文 is even more interesting! 吃水果
|
||||
【精确模式】: 我 / 来到 / 北京 / 清华大学 /
|
||||
|
||||
Result of first: 1 matches, showing 1 through 1, took 373.317µs
|
||||
1. Doc 1 (0.512150)
|
||||
Name
|
||||
This is the <span class="highlight">first</span> document we’ve added
|
||||
【新词识别】: 他 / 来到 / 了 / 网易 / 杭研 / 大厦 /
|
||||
|
||||
Result of 中文: 1 matches, showing 1 through 1, took 106.433µs
|
||||
1. Doc 2 (0.553186)
|
||||
Name
|
||||
The second one 你 <span class="highlight">中文</span>测试<span class="highlight">中文</span> is even more interesting! 吃水果
|
||||
|
||||
Result of 交换机: 2 matches, showing 1 through 2, took 188.235µs
|
||||
1. Doc 4 (0.608495)
|
||||
Name
|
||||
工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换</span>机等技术性器件的安装工作
|
||||
2. Doc 5 (0.086700)
|
||||
Name
|
||||
咱俩<span class="highlight">交换</span>一下吧。
|
||||
|
||||
Result of 交换: 2 matches, showing 1 through 2, took 148.822µs
|
||||
1. Doc 5 (0.534158)
|
||||
Name
|
||||
咱俩<span class="highlight">交换</span>一下吧。
|
||||
2. Doc 4 (0.296297)
|
||||
Name
|
||||
工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换</span>机等技术性器件的安装工作
|
||||
【搜索引擎模式】: 小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / , / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 /
|
||||
```
|
||||
|
||||
更多信息请参考[文档](https://godoc.org/github.com/wangbin/jiebago)。
|
||||
|
||||
## 分词速度
|
||||
|
||||
- 2MB / Second in Full Mode
|
||||
|
||||
@@ -1,80 +0,0 @@
|
||||
package analyse
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/wangbin/jiebago"
|
||||
"sort"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
type wordWeight struct {
|
||||
Word string
|
||||
Weight float64
|
||||
}
|
||||
|
||||
func (w wordWeight) String() string {
|
||||
return fmt.Sprintf("{%s: %f}", w.Word, w.Weight)
|
||||
}
|
||||
|
||||
type wordWeights []wordWeight
|
||||
|
||||
func (ws wordWeights) Len() int {
|
||||
return len(ws)
|
||||
}
|
||||
|
||||
func (ws wordWeights) Less(i, j int) bool {
|
||||
if ws[i].Weight == ws[j].Weight {
|
||||
return ws[i].Word < ws[j].Word
|
||||
}
|
||||
|
||||
return ws[i].Weight < ws[j].Weight
|
||||
}
|
||||
|
||||
func (ws wordWeights) Swap(i, j int) {
|
||||
ws[i], ws[j] = ws[j], ws[i]
|
||||
}
|
||||
|
||||
// Keyword extraction.
|
||||
func ExtractTags(sentence string, topK int) (tags wordWeights) {
|
||||
freq := make(map[string]float64)
|
||||
|
||||
for w := range jiebago.Cut(sentence, false, true) {
|
||||
w = strings.TrimSpace(w)
|
||||
if utf8.RuneCountInString(w) < 2 {
|
||||
continue
|
||||
}
|
||||
if _, ok := stopWords[w]; ok {
|
||||
continue
|
||||
}
|
||||
if f, ok := freq[w]; ok {
|
||||
freq[w] = f + 1.0
|
||||
} else {
|
||||
freq[w] = 1.0
|
||||
}
|
||||
}
|
||||
total := 0.0
|
||||
for _, f := range freq {
|
||||
total += f
|
||||
}
|
||||
for k, v := range freq {
|
||||
freq[k] = v / total
|
||||
}
|
||||
ws := make(wordWeights, 0)
|
||||
for k, v := range freq {
|
||||
var ti wordWeight
|
||||
if freq_, ok := loader.Freq[k]; ok {
|
||||
ti = wordWeight{Word: k, Weight: freq_ * v}
|
||||
} else {
|
||||
ti = wordWeight{Word: k, Weight: loader.Median * v}
|
||||
}
|
||||
ws = append(ws, ti)
|
||||
}
|
||||
sort.Sort(sort.Reverse(ws))
|
||||
if len(ws) > topK {
|
||||
tags = ws[:topK]
|
||||
} else {
|
||||
tags = ws
|
||||
}
|
||||
return tags
|
||||
}
|
||||
44
analyse/example_test.go
Normal file
44
analyse/example_test.go
Normal file
@@ -0,0 +1,44 @@
|
||||
package analyse_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/wangbin/jiebago/analyse"
|
||||
)
|
||||
|
||||
func ExampleExtractTags() {
|
||||
var t analyse.TagExtracter
|
||||
t.LoadDictionary("../dict.txt")
|
||||
t.LoadIdf("idf.txt")
|
||||
|
||||
sentence := "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。"
|
||||
segments := t.ExtractTags(sentence, 5)
|
||||
fmt.Printf("Top %d tags:", len(segments))
|
||||
for _, segment := range segments {
|
||||
fmt.Printf(" %s /", segment.Text())
|
||||
}
|
||||
// Output:
|
||||
// Top 5 tags: Python / C++ / 伸手不见五指 / 孙悟空 / 黑夜 /
|
||||
}
|
||||
|
||||
func ExampleTextRank() {
|
||||
var t analyse.TextRanker
|
||||
t.LoadDictionary("../dict.txt")
|
||||
sentence := "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
|
||||
|
||||
result := t.TextRank(sentence, 10)
|
||||
for _, segment := range result {
|
||||
fmt.Printf("%s %f\n", segment.Text(), segment.Weight())
|
||||
}
|
||||
// Output:
|
||||
// 吉林 1.000000
|
||||
// 欧亚 0.878078
|
||||
// 置业 0.562048
|
||||
// 实现 0.520906
|
||||
// 收入 0.384284
|
||||
// 增资 0.360591
|
||||
// 子公司 0.353132
|
||||
// 城市 0.307509
|
||||
// 全资 0.306324
|
||||
// 商业 0.306138
|
||||
}
|
||||
@@ -1,57 +1,56 @@
|
||||
package analyse
|
||||
|
||||
import (
|
||||
"github.com/wangbin/jiebago"
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
"github.com/wangbin/jiebago/dictionary"
|
||||
)
|
||||
|
||||
var (
|
||||
loader *idfLoader
|
||||
)
|
||||
|
||||
func init() {
|
||||
loader = newIDFLoader()
|
||||
// Idf represents a thread-safe dictionary for all words with their
|
||||
// IDFs (Inverse Document Frequency).
|
||||
type Idf struct {
|
||||
freqMap map[string]float64
|
||||
median float64
|
||||
freqs []float64
|
||||
sync.RWMutex
|
||||
}
|
||||
|
||||
type idfLoader struct {
|
||||
Path string
|
||||
Freq map[string]float64
|
||||
Median float64
|
||||
// AddToken adds a new word with IDF into its dictionary.
|
||||
func (i *Idf) AddToken(token dictionary.Token) {
|
||||
i.Lock()
|
||||
i.freqMap[token.Text()] = token.Frequency()
|
||||
i.freqs = append(i.freqs, token.Frequency())
|
||||
sort.Float64s(i.freqs)
|
||||
i.median = i.freqs[len(i.freqs)/2]
|
||||
i.Unlock()
|
||||
}
|
||||
|
||||
func newIDFLoader() *idfLoader {
|
||||
loader := new(idfLoader)
|
||||
loader.Freq = make(map[string]float64)
|
||||
return loader
|
||||
}
|
||||
|
||||
func (loader *idfLoader) newPath(idfFilePath string) error {
|
||||
if loader.Path == idfFilePath {
|
||||
return nil
|
||||
// Load loads all tokens from the channel into its dictionary.
|
||||
func (i *Idf) Load(ch <-chan dictionary.Token) {
|
||||
i.Lock()
|
||||
for token := range ch {
|
||||
i.freqMap[token.Text()] = token.Frequency()
|
||||
i.freqs = append(i.freqs, token.Frequency())
|
||||
}
|
||||
wtfs, err := jiebago.ParseDictFile(idfFilePath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
freqs := make([]float64, 0)
|
||||
|
||||
for _, wtf := range wtfs {
|
||||
loader.Freq[wtf.Word] = wtf.Freq
|
||||
freqs = append(freqs, wtf.Freq)
|
||||
}
|
||||
|
||||
sort.Float64s(freqs)
|
||||
loader.Median = freqs[len(freqs)/2]
|
||||
return nil
|
||||
sort.Float64s(i.freqs)
|
||||
i.median = i.freqs[len(i.freqs)/2]
|
||||
i.Unlock()
|
||||
}
|
||||
|
||||
// Set the IDF file path, could be absolute path of IDF file, or IDF file
|
||||
// name in current directory.
|
||||
func SetIdf(idfFileName string) error {
|
||||
idfFilePath, err := jiebago.DictPath(idfFileName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return loader.newPath(idfFilePath)
|
||||
func (i *Idf) loadDictionary(fileName string) error {
|
||||
return dictionary.LoadDictionary(i, fileName)
|
||||
}
|
||||
|
||||
// Frequency returns the IDF of given word.
|
||||
func (i *Idf) Frequency(key string) (float64, bool) {
|
||||
i.RLock()
|
||||
freq, ok := i.freqMap[key]
|
||||
i.RUnlock()
|
||||
return freq, ok
|
||||
}
|
||||
|
||||
// NewIdf creates a new Idf instance.
|
||||
func NewIdf() *Idf {
|
||||
return &Idf{freqMap: make(map[string]float64), freqs: make([]float64, 0)}
|
||||
}
|
||||
|
||||
@@ -1,58 +1,83 @@
|
||||
package analyse
|
||||
|
||||
import (
|
||||
"github.com/wangbin/jiebago"
|
||||
"sync"
|
||||
|
||||
"github.com/wangbin/jiebago/dictionary"
|
||||
)
|
||||
|
||||
var stopWords map[string]int
|
||||
|
||||
func init() {
|
||||
stopWords = map[string]int{
|
||||
"the": 1,
|
||||
"of": 1,
|
||||
"is": 1,
|
||||
"and": 1,
|
||||
"to": 1,
|
||||
"in": 1,
|
||||
"that": 1,
|
||||
"we": 1,
|
||||
"for": 1,
|
||||
"an": 1,
|
||||
"are": 1,
|
||||
"by": 1,
|
||||
"be": 1,
|
||||
"as": 1,
|
||||
"on": 1,
|
||||
"with": 1,
|
||||
"can": 1,
|
||||
"if": 1,
|
||||
"from": 1,
|
||||
"which": 1,
|
||||
"you": 1,
|
||||
"it": 1,
|
||||
"this": 1,
|
||||
"then": 1,
|
||||
"at": 1,
|
||||
"have": 1,
|
||||
"all": 1,
|
||||
"not": 1,
|
||||
"one": 1,
|
||||
"has": 1,
|
||||
"or": 1,
|
||||
}
|
||||
// DefaultStopWordMap contains some stop words.
|
||||
var DefaultStopWordMap = map[string]int{
|
||||
"the": 1,
|
||||
"of": 1,
|
||||
"is": 1,
|
||||
"and": 1,
|
||||
"to": 1,
|
||||
"in": 1,
|
||||
"that": 1,
|
||||
"we": 1,
|
||||
"for": 1,
|
||||
"an": 1,
|
||||
"are": 1,
|
||||
"by": 1,
|
||||
"be": 1,
|
||||
"as": 1,
|
||||
"on": 1,
|
||||
"with": 1,
|
||||
"can": 1,
|
||||
"if": 1,
|
||||
"from": 1,
|
||||
"which": 1,
|
||||
"you": 1,
|
||||
"it": 1,
|
||||
"this": 1,
|
||||
"then": 1,
|
||||
"at": 1,
|
||||
"have": 1,
|
||||
"all": 1,
|
||||
"not": 1,
|
||||
"one": 1,
|
||||
"has": 1,
|
||||
"or": 1,
|
||||
}
|
||||
|
||||
// Set the stop words file path, could be absolute path of stop words file, or
|
||||
// file name in current directory.
|
||||
func SetStopWords(stopWordsFileName string) error {
|
||||
stopWordsFilePath, err := jiebago.DictPath(stopWordsFileName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
wtfs, err := jiebago.ParseDictFile(stopWordsFilePath)
|
||||
for _, wtf := range wtfs {
|
||||
stopWords[wtf.Word] = 1
|
||||
}
|
||||
return nil
|
||||
// StopWord is a thread-safe dictionary for all stop words.
|
||||
type StopWord struct {
|
||||
stopWordMap map[string]int
|
||||
sync.RWMutex
|
||||
}
|
||||
|
||||
// AddToken adds a token into StopWord dictionary.
|
||||
func (s *StopWord) AddToken(token dictionary.Token) {
|
||||
s.Lock()
|
||||
s.stopWordMap[token.Text()] = 1
|
||||
s.Unlock()
|
||||
}
|
||||
|
||||
// NewStopWord creates a new StopWord with default stop words.
|
||||
func NewStopWord() *StopWord {
|
||||
s := new(StopWord)
|
||||
s.stopWordMap = DefaultStopWordMap
|
||||
return s
|
||||
}
|
||||
|
||||
// IsStopWord checks if a given word is stop word.
|
||||
func (s *StopWord) IsStopWord(word string) bool {
|
||||
s.RLock()
|
||||
_, ok := s.stopWordMap[word]
|
||||
s.RUnlock()
|
||||
return ok
|
||||
}
|
||||
|
||||
// Load loads all tokens from given channel into StopWord dictionary.
|
||||
func (s *StopWord) Load(ch <-chan dictionary.Token) {
|
||||
s.Lock()
|
||||
for token := range ch {
|
||||
s.stopWordMap[token.Text()] = 1
|
||||
}
|
||||
s.Unlock()
|
||||
}
|
||||
|
||||
func (s *StopWord) loadDictionary(fileName string) error {
|
||||
return dictionary.LoadDictionary(s, fileName)
|
||||
}
|
||||
|
||||
115
analyse/tag_extracker.go
Normal file
115
analyse/tag_extracker.go
Normal file
@@ -0,0 +1,115 @@
|
||||
// Package analyse is the Golang implementation of Jieba's analyse module.
|
||||
package analyse
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/wangbin/jiebago"
|
||||
)
|
||||
|
||||
// Segment represents a word with weight.
|
||||
type Segment struct {
|
||||
text string
|
||||
weight float64
|
||||
}
|
||||
|
||||
// Text returns the segment's text.
|
||||
func (s Segment) Text() string {
|
||||
return s.text
|
||||
}
|
||||
|
||||
// Weight returns the segment's weight.
|
||||
func (s Segment) Weight() float64 {
|
||||
return s.weight
|
||||
}
|
||||
|
||||
// Segments represents a slice of Segment.
|
||||
type Segments []Segment
|
||||
|
||||
func (ss Segments) Len() int {
|
||||
return len(ss)
|
||||
}
|
||||
|
||||
func (ss Segments) Less(i, j int) bool {
|
||||
if ss[i].weight == ss[j].weight {
|
||||
return ss[i].text < ss[j].text
|
||||
}
|
||||
|
||||
return ss[i].weight < ss[j].weight
|
||||
}
|
||||
|
||||
func (ss Segments) Swap(i, j int) {
|
||||
ss[i], ss[j] = ss[j], ss[i]
|
||||
}
|
||||
|
||||
// TagExtracter is used to extract tags from sentence.
|
||||
type TagExtracter struct {
|
||||
seg *jiebago.Segmenter
|
||||
idf *Idf
|
||||
stopWord *StopWord
|
||||
}
|
||||
|
||||
// LoadDictionary reads the given file and creates a new dictionary.
|
||||
func (t *TagExtracter) LoadDictionary(fileName string) error {
|
||||
t.stopWord = NewStopWord()
|
||||
t.seg = new(jiebago.Segmenter)
|
||||
return t.seg.LoadDictionary(fileName)
|
||||
}
|
||||
|
||||
// LoadIdf reads the given file and creates a new Idf dictionary.
|
||||
func (t *TagExtracter) LoadIdf(fileName string) error {
|
||||
t.idf = NewIdf()
|
||||
return t.idf.loadDictionary(fileName)
|
||||
}
|
||||
|
||||
// LoadStopWords reads the given file and creates a new StopWord dictionary.
|
||||
func (t *TagExtracter) LoadStopWords(fileName string) error {
|
||||
t.stopWord = NewStopWord()
|
||||
return t.stopWord.loadDictionary(fileName)
|
||||
}
|
||||
|
||||
// ExtractTags extracts the topK key words from sentence.
|
||||
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
|
||||
freqMap := make(map[string]float64)
|
||||
|
||||
for w := range t.seg.Cut(sentence, true) {
|
||||
w = strings.TrimSpace(w)
|
||||
if utf8.RuneCountInString(w) < 2 {
|
||||
continue
|
||||
}
|
||||
if t.stopWord.IsStopWord(w) {
|
||||
continue
|
||||
}
|
||||
if f, ok := freqMap[w]; ok {
|
||||
freqMap[w] = f + 1.0
|
||||
} else {
|
||||
freqMap[w] = 1.0
|
||||
}
|
||||
}
|
||||
total := 0.0
|
||||
for _, freq := range freqMap {
|
||||
total += freq
|
||||
}
|
||||
for k, v := range freqMap {
|
||||
freqMap[k] = v / total
|
||||
}
|
||||
ws := make(Segments, 0)
|
||||
var s Segment
|
||||
for k, v := range freqMap {
|
||||
if freq, ok := t.idf.Frequency(k); ok {
|
||||
s = Segment{text: k, weight: freq * v}
|
||||
} else {
|
||||
s = Segment{text: k, weight: t.idf.median * v}
|
||||
}
|
||||
ws = append(ws, s)
|
||||
}
|
||||
sort.Sort(sort.Reverse(ws))
|
||||
if len(ws) > topK {
|
||||
tags = ws[:topK]
|
||||
} else {
|
||||
tags = ws
|
||||
}
|
||||
return tags
|
||||
}
|
||||
@@ -1,13 +1,12 @@
|
||||
package analyse
|
||||
|
||||
import (
|
||||
"github.com/wangbin/jiebago"
|
||||
"math"
|
||||
"testing"
|
||||
)
|
||||
|
||||
var (
|
||||
test_contents = []string{
|
||||
testContents = []string{
|
||||
"这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
|
||||
"我不喜欢日本和服。",
|
||||
"雷猴回归人间。",
|
||||
@@ -228,71 +227,74 @@ var (
|
||||
只是逼不得已
|
||||
雖然沒有藉口
|
||||
`
|
||||
LyciWeight = []wordWeight{
|
||||
wordWeight{Word: "所謂", Weight: 1.010262},
|
||||
wordWeight{Word: "是否", Weight: 0.738650},
|
||||
wordWeight{Word: "一般", Weight: 0.607600},
|
||||
wordWeight{Word: "雖然", Weight: 0.336754},
|
||||
wordWeight{Word: "退縮", Weight: 0.336754},
|
||||
wordWeight{Word: "肌迫", Weight: 0.336754},
|
||||
wordWeight{Word: "矯作", Weight: 0.336754},
|
||||
wordWeight{Word: "沒有", Weight: 0.336754},
|
||||
wordWeight{Word: "怯懦", Weight: 0.271099},
|
||||
wordWeight{Word: "隨便", Weight: 0.168377},
|
||||
LyciWeight = Segments{
|
||||
Segment{text: "所謂", weight: 1.010262},
|
||||
Segment{text: "是否", weight: 0.738650},
|
||||
Segment{text: "一般", weight: 0.607600},
|
||||
Segment{text: "雖然", weight: 0.336754},
|
||||
Segment{text: "退縮", weight: 0.336754},
|
||||
Segment{text: "肌迫", weight: 0.336754},
|
||||
Segment{text: "矯作", weight: 0.336754},
|
||||
Segment{text: "沒有", weight: 0.336754},
|
||||
Segment{text: "怯懦", weight: 0.271099},
|
||||
Segment{text: "隨便", weight: 0.168377},
|
||||
}
|
||||
|
||||
LyciWeight2 = []wordWeight{
|
||||
wordWeight{Word: "所謂", Weight: 1.215739},
|
||||
wordWeight{Word: "一般", Weight: 0.731179},
|
||||
wordWeight{Word: "雖然", Weight: 0.405246},
|
||||
wordWeight{Word: "退縮", Weight: 0.405246},
|
||||
wordWeight{Word: "肌迫", Weight: 0.405246},
|
||||
wordWeight{Word: "矯作", Weight: 0.405246},
|
||||
wordWeight{Word: "怯懦", Weight: 0.326238},
|
||||
wordWeight{Word: "逼不得已", Weight: 0.202623},
|
||||
wordWeight{Word: "右銘", Weight: 0.202623},
|
||||
wordWeight{Word: "寬闊", Weight: 0.202623},
|
||||
LyciWeight2 = Segments{
|
||||
Segment{text: "所謂", weight: 1.215739},
|
||||
Segment{text: "一般", weight: 0.731179},
|
||||
Segment{text: "雖然", weight: 0.405246},
|
||||
Segment{text: "退縮", weight: 0.405246},
|
||||
Segment{text: "肌迫", weight: 0.405246},
|
||||
Segment{text: "矯作", weight: 0.405246},
|
||||
Segment{text: "怯懦", weight: 0.326238},
|
||||
Segment{text: "逼不得已", weight: 0.202623},
|
||||
Segment{text: "右銘", weight: 0.202623},
|
||||
Segment{text: "寬闊", weight: 0.202623},
|
||||
}
|
||||
)
|
||||
|
||||
func TestExtractTags(t *testing.T) {
|
||||
jiebago.SetDictionary("../dict.txt")
|
||||
SetIdf("idf.txt")
|
||||
var te TagExtracter
|
||||
te.LoadDictionary("../dict.txt")
|
||||
te.LoadIdf("idf.txt")
|
||||
|
||||
for index, sentence := range test_contents {
|
||||
result := ExtractTags(sentence, 20)
|
||||
for index, sentence := range testContents {
|
||||
result := te.ExtractTags(sentence, 20)
|
||||
if len(result) != len(Tags[index]) {
|
||||
t.Errorf("%s = %v", sentence, result)
|
||||
t.Fatalf("%s = %v", sentence, result)
|
||||
}
|
||||
for i, tag := range result {
|
||||
if tag.Word != Tags[index][i] {
|
||||
t.Errorf("%s != %s", tag, Tags[index][i])
|
||||
if tag.text != Tags[index][i] {
|
||||
t.Fatalf("%s != %s", tag, Tags[index][i])
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtratTagsWithWeight(t *testing.T) {
|
||||
jiebago.SetDictionary("../dict.txt")
|
||||
SetIdf("idf.txt")
|
||||
result := ExtractTags(Lyric, 10)
|
||||
var te TagExtracter
|
||||
te.LoadDictionary("../dict.txt")
|
||||
te.LoadIdf("idf.txt")
|
||||
result := te.ExtractTags(Lyric, 10)
|
||||
for index, tag := range result {
|
||||
if LyciWeight[index].Word != tag.Word ||
|
||||
math.Abs(LyciWeight[index].Weight-tag.Weight) > 1e-6 {
|
||||
t.Errorf("%v != %v", tag, LyciWeight[index])
|
||||
if LyciWeight[index].text != tag.text ||
|
||||
math.Abs(LyciWeight[index].weight-tag.weight) > 1e-6 {
|
||||
t.Fatalf("%v != %v", tag, LyciWeight[index])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractTagsWithStopWordsFile(t *testing.T) {
|
||||
jiebago.SetDictionary("../dict.txt")
|
||||
SetIdf("idf.txt")
|
||||
SetStopWords("stop_words.txt")
|
||||
result := ExtractTags(Lyric, 7)
|
||||
var te TagExtracter
|
||||
te.LoadDictionary("../dict.txt")
|
||||
te.LoadIdf("idf.txt")
|
||||
te.LoadStopWords("stop_words.txt")
|
||||
result := te.ExtractTags(Lyric, 7)
|
||||
for index, tag := range result {
|
||||
if LyciWeight2[index].Word != tag.Word ||
|
||||
math.Abs(LyciWeight2[index].Weight-tag.Weight) > 1e-6 {
|
||||
t.Errorf("%v != %v", tag, LyciWeight2[index])
|
||||
if LyciWeight2[index].text != tag.text ||
|
||||
math.Abs(LyciWeight2[index].weight-tag.weight) > 1e-6 {
|
||||
t.Fatalf("%v != %v", tag, LyciWeight2[index])
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,10 +1,10 @@
|
||||
package analyse
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/wangbin/jiebago/posseg"
|
||||
"math"
|
||||
"sort"
|
||||
|
||||
"github.com/wangbin/jiebago/posseg"
|
||||
)
|
||||
|
||||
const dampingFactor = 0.85
|
||||
@@ -19,10 +19,6 @@ type edge struct {
|
||||
weight float64
|
||||
}
|
||||
|
||||
func (e edge) String() string {
|
||||
return fmt.Sprintf("(%s %s): %f", e.start, e.end, e.weight)
|
||||
}
|
||||
|
||||
type edges []edge
|
||||
|
||||
func (es edges) Len() int {
|
||||
@@ -65,7 +61,7 @@ func (u *undirectWeightedGraph) addEdge(start, end string, weight float64) {
|
||||
}
|
||||
}
|
||||
|
||||
func (u *undirectWeightedGraph) rank() wordWeights {
|
||||
func (u *undirectWeightedGraph) rank() Segments {
|
||||
if !sort.IsSorted(u.keys) {
|
||||
sort.Sort(u.keys)
|
||||
}
|
||||
@@ -105,17 +101,17 @@ func (u *undirectWeightedGraph) rank() wordWeights {
|
||||
maxRank = w
|
||||
}
|
||||
}
|
||||
result := make(wordWeights, 0)
|
||||
result := make(Segments, 0)
|
||||
for n, w := range ws {
|
||||
result = append(result, wordWeight{Word: n, Weight: (w - minRank/10.0) / (maxRank - minRank/10.0)})
|
||||
result = append(result, Segment{text: n, weight: (w - minRank/10.0) / (maxRank - minRank/10.0)})
|
||||
}
|
||||
sort.Sort(sort.Reverse(result))
|
||||
return result
|
||||
}
|
||||
|
||||
// Extract keywords from sentence using the TextRank algorithm. The allowed POS list
|
||||
// could be manually specified.
|
||||
func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
|
||||
// TextRankWithPOS extracts keywords from sentence using TextRank algorithm.
|
||||
// Parameter allowPOS allows a customized pos list.
|
||||
func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) Segments {
|
||||
posFilt := make(map[string]int)
|
||||
for _, pos := range allowPOS {
|
||||
posFilt[pos] = 1
|
||||
@@ -123,23 +119,20 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
|
||||
g := newUndirectWeightedGraph()
|
||||
cm := make(map[[2]string]float64)
|
||||
span := 5
|
||||
wordTags := make([]posseg.WordTag, 0)
|
||||
for wordTag := range posseg.Cut(sentence, true) {
|
||||
wordTags = append(wordTags, wordTag)
|
||||
var pairs []posseg.Segment
|
||||
for pair := range t.seg.Cut(sentence, true) {
|
||||
pairs = append(pairs, pair)
|
||||
}
|
||||
for i, _ := range wordTags {
|
||||
if _, ok := posFilt[wordTags[i].Tag]; ok {
|
||||
for j := i + 1; j < i+span; j++ {
|
||||
if j > len(wordTags) {
|
||||
break
|
||||
}
|
||||
if _, ok := posFilt[wordTags[j].Tag]; !ok {
|
||||
for i := range pairs {
|
||||
if _, ok := posFilt[pairs[i].Pos()]; ok {
|
||||
for j := i + 1; j < i+span && j <= len(pairs); j++ {
|
||||
if _, ok := posFilt[pairs[j].Pos()]; !ok {
|
||||
continue
|
||||
}
|
||||
if _, ok := cm[[2]string{wordTags[i].Word, wordTags[j].Word}]; !ok {
|
||||
cm[[2]string{wordTags[i].Word, wordTags[j].Word}] = 1.0
|
||||
if _, ok := cm[[2]string{pairs[i].Text(), pairs[j].Text()}]; !ok {
|
||||
cm[[2]string{pairs[i].Text(), pairs[j].Text()}] = 1.0
|
||||
} else {
|
||||
cm[[2]string{wordTags[i].Word, wordTags[j].Word}] += 1.0
|
||||
cm[[2]string{pairs[i].Text(), pairs[j].Text()}] += 1.0
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -154,15 +147,19 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
|
||||
return tags
|
||||
}
|
||||
|
||||
// Extract keywords from sentence using TextRank algorithm.
|
||||
// topK specify how many top keywords to be returned at most.
|
||||
func TextRank(sentence string, topK int) wordWeights {
|
||||
return TextRankWithPOS(sentence, topK, defaultAllowPOS)
|
||||
// TextRank extracts keywords from sentence using TextRank algorithm.
|
||||
// Parameter topK specifies how many top keywords are returned at most.
|
||||
func (t *TextRanker) TextRank(sentence string, topK int) Segments {
|
||||
return t.TextRankWithPOS(sentence, topK, defaultAllowPOS)
|
||||
}
|
||||
|
||||
// Set the dictionary, could be absolute path of dictionary file, or dictionary
|
||||
// name in current directory. This function must be called before cut any
|
||||
// sentence.
|
||||
func SetDictionary(dictFileName string) error {
|
||||
return posseg.SetDictionary(dictFileName)
|
||||
// TextRanker is used to extract tags from sentence.
|
||||
type TextRanker struct {
|
||||
seg *posseg.Segmenter
|
||||
}
|
||||
|
||||
// LoadDictionary creates a new segmenter for the TextRanker and loads its dictionary from the given file.
|
||||
func (t *TextRanker) LoadDictionary(fileName string) error {
|
||||
t.seg = new(posseg.Segmenter)
|
||||
return t.seg.LoadDictionary(fileName)
|
||||
}
|
||||
|
||||
@@ -8,26 +8,27 @@ import (
|
||||
var (
|
||||
sentence = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
|
||||
|
||||
tagRanks = wordWeights{
|
||||
wordWeight{Word: "吉林", Weight: 1.0},
|
||||
wordWeight{Word: "欧亚", Weight: 0.87807810644},
|
||||
wordWeight{Word: "置业", Weight: 0.562048250306},
|
||||
wordWeight{Word: "实现", Weight: 0.520905743929},
|
||||
wordWeight{Word: "收入", Weight: 0.384283870648},
|
||||
wordWeight{Word: "增资", Weight: 0.360590945312},
|
||||
wordWeight{Word: "子公司", Weight: 0.353131980904},
|
||||
wordWeight{Word: "城市", Weight: 0.307509449283},
|
||||
wordWeight{Word: "全资", Weight: 0.306324426665},
|
||||
wordWeight{Word: "商业", Weight: 0.306138241063},
|
||||
tagRanks = Segments{
|
||||
Segment{text: "吉林", weight: 1.0},
|
||||
Segment{text: "欧亚", weight: 0.87807810644},
|
||||
Segment{text: "置业", weight: 0.562048250306},
|
||||
Segment{text: "实现", weight: 0.520905743929},
|
||||
Segment{text: "收入", weight: 0.384283870648},
|
||||
Segment{text: "增资", weight: 0.360590945312},
|
||||
Segment{text: "子公司", weight: 0.353131980904},
|
||||
Segment{text: "城市", weight: 0.307509449283},
|
||||
Segment{text: "全资", weight: 0.306324426665},
|
||||
Segment{text: "商业", weight: 0.306138241063},
|
||||
}
|
||||
)
|
||||
|
||||
func TestTextRank(t *testing.T) {
|
||||
SetDictionary("../dict.txt")
|
||||
results := TextRank(sentence, 10)
|
||||
var tr TextRanker
|
||||
tr.LoadDictionary("../dict.txt")
|
||||
results := tr.TextRank(sentence, 10)
|
||||
for index, tw := range results {
|
||||
if tw.Word != tagRanks[index].Word || math.Abs(tw.Weight-tagRanks[index].Weight) > 1e-6 {
|
||||
t.Errorf("%v != %v", tw, tagRanks[index])
|
||||
if tw.text != tagRanks[index].text || math.Abs(tw.weight-tagRanks[index].weight) > 1e-6 {
|
||||
t.Fatalf("%v != %v", tw, tagRanks[index])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
57
dict.go
57
dict.go
@@ -1,57 +0,0 @@
|
||||
package jiebago
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type WordTagFreq struct {
|
||||
Word, Tag string
|
||||
Freq float64
|
||||
}
|
||||
|
||||
func DictPath(dictFileName string) (string, error) {
|
||||
if filepath.IsAbs(dictFileName) {
|
||||
return dictFileName, nil
|
||||
}
|
||||
var dictFilePath string
|
||||
pwd, err := os.Getwd()
|
||||
if err != nil {
|
||||
return dictFilePath, err
|
||||
}
|
||||
dictFilePath = filepath.Clean(filepath.Join(pwd, dictFileName))
|
||||
return dictFilePath, nil
|
||||
}
|
||||
|
||||
func ParseDictFile(dictFilePath string) (wtfs []*WordTagFreq, err error) {
|
||||
var dictFile *os.File
|
||||
dictFile, err = os.Open(dictFilePath)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
defer dictFile.Close()
|
||||
scanner := bufio.NewScanner(dictFile)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
fields := strings.Split(line, " ")
|
||||
length := len(fields)
|
||||
word := fields[0]
|
||||
word = strings.Replace(word, "\ufeff", "", 1)
|
||||
wtf := &WordTagFreq{Word: word}
|
||||
if length > 1 {
|
||||
wtf.Freq, err = strconv.ParseFloat(fields[1], 64)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
if length > 2 {
|
||||
wtf.Tag = fields[2]
|
||||
}
|
||||
wtfs = append(wtfs, wtf)
|
||||
}
|
||||
err = scanner.Err()
|
||||
return
|
||||
}
|
||||
62
dictionary.go
Normal file
62
dictionary.go
Normal file
@@ -0,0 +1,62 @@
|
||||
package jiebago
|
||||
|
||||
import (
|
||||
"math"
|
||||
"sync"
|
||||
|
||||
"github.com/wangbin/jiebago/dictionary"
|
||||
)
|
||||
|
||||
// A Dictionary represents a thread-safe dictionary used for word segmentation.
|
||||
type Dictionary struct {
|
||||
total, logTotal float64
|
||||
freqMap map[string]float64
|
||||
sync.RWMutex
|
||||
}
|
||||
|
||||
// Load loads all tokens from given channel
|
||||
func (d *Dictionary) Load(ch <-chan dictionary.Token) {
|
||||
d.Lock()
|
||||
for token := range ch {
|
||||
d.addToken(token)
|
||||
}
|
||||
d.Unlock()
|
||||
d.updateLogTotal()
|
||||
}
|
||||
|
||||
// AddToken adds one token
|
||||
func (d *Dictionary) AddToken(token dictionary.Token) {
|
||||
d.Lock()
|
||||
d.addToken(token)
|
||||
d.Unlock()
|
||||
d.updateLogTotal()
|
||||
}
|
||||
|
||||
func (d *Dictionary) addToken(token dictionary.Token) {
|
||||
d.freqMap[token.Text()] = token.Frequency()
|
||||
d.total += token.Frequency()
|
||||
runes := []rune(token.Text())
|
||||
n := len(runes)
|
||||
for i := 0; i < n; i++ { //TODO: n-1?
|
||||
frag := string(runes[:i+1])
|
||||
if _, ok := d.freqMap[frag]; !ok {
|
||||
d.freqMap[frag] = 0.0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (d *Dictionary) updateLogTotal() {
|
||||
d.logTotal = math.Log(d.total)
|
||||
}
|
||||
|
||||
// Frequency returns the frequency and existence of the given word.
|
||||
func (d *Dictionary) Frequency(key string) (float64, bool) {
|
||||
d.RLock()
|
||||
freq, ok := d.freqMap[key]
|
||||
d.RUnlock()
|
||||
return freq, ok
|
||||
}
|
||||
|
||||
func (d *Dictionary) loadDictionary(fileName string) error {
|
||||
return dictionary.LoadDictionary(d, fileName)
|
||||
}
|
||||
85
dictionary/dictionary.go
Normal file
85
dictionary/dictionary.go
Normal file
@@ -0,0 +1,85 @@
|
||||
// Package dictionary contains an interface and wraps all I/O-related work.
|
||||
// It is used by jiebago module to read/write files.
|
||||
package dictionary
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// DictLoader is the interface that could add one token or load
|
||||
// tokens from channel.
|
||||
type DictLoader interface {
|
||||
Load(<-chan Token)
|
||||
AddToken(Token)
|
||||
}
|
||||
|
||||
func loadDictionary(file *os.File) (<-chan Token, <-chan error) {
|
||||
tokenCh, errCh := make(chan Token), make(chan error)
|
||||
|
||||
go func() {
|
||||
defer close(tokenCh)
|
||||
defer close(errCh)
|
||||
scanner := bufio.NewScanner(file)
|
||||
var token Token
|
||||
var line string
|
||||
var fields []string
|
||||
var err error
|
||||
for scanner.Scan() {
|
||||
line = scanner.Text()
|
||||
fields = strings.Split(line, " ")
|
||||
token.text = strings.TrimSpace(strings.Replace(fields[0], "\ufeff", "", 1))
|
||||
if length := len(fields); length > 1 {
|
||||
token.frequency, err = strconv.ParseFloat(fields[1], 64)
|
||||
if err != nil {
|
||||
errCh <- err
|
||||
return
|
||||
}
|
||||
if length > 2 {
|
||||
token.pos = strings.TrimSpace(fields[2])
|
||||
}
|
||||
}
|
||||
tokenCh <- token
|
||||
}
|
||||
|
||||
if err = scanner.Err(); err != nil {
|
||||
errCh <- err
|
||||
}
|
||||
}()
|
||||
return tokenCh, errCh
|
||||
|
||||
}
|
||||
|
||||
// LoadDictionary reads the given file and passes all tokens to a DictLoader.
|
||||
func LoadDictionary(dl DictLoader, fileName string) error {
|
||||
filePath, err := dictPath(fileName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
dictFile, err := os.Open(filePath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer dictFile.Close()
|
||||
tokenCh, errCh := loadDictionary(dictFile)
|
||||
dl.Load(tokenCh)
|
||||
|
||||
return <-errCh
|
||||
|
||||
}
|
||||
|
||||
func dictPath(dictFileName string) (string, error) {
|
||||
if filepath.IsAbs(dictFileName) {
|
||||
return dictFileName, nil
|
||||
}
|
||||
var dictFilePath string
|
||||
cwd, err := os.Getwd()
|
||||
if err != nil {
|
||||
return dictFilePath, err
|
||||
}
|
||||
dictFilePath = filepath.Clean(filepath.Join(cwd, dictFileName))
|
||||
return dictFilePath, nil
|
||||
}
|
||||
59
dictionary/dictionary_test.go
Normal file
59
dictionary/dictionary_test.go
Normal file
@@ -0,0 +1,59 @@
|
||||
package dictionary
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"testing"
|
||||
)
|
||||
|
||||
type Dict struct {
|
||||
freqMap map[string]float64
|
||||
posMap map[string]string
|
||||
sync.RWMutex
|
||||
}
|
||||
|
||||
func (d *Dict) Load(ch <-chan Token) {
|
||||
d.Lock()
|
||||
for token := range ch {
|
||||
d.freqMap[token.Text()] = token.Frequency()
|
||||
if len(token.Pos()) > 0 {
|
||||
d.posMap[token.Text()] = token.Pos()
|
||||
}
|
||||
}
|
||||
d.Unlock()
|
||||
}
|
||||
|
||||
func (d *Dict) AddToken(token Token) {
|
||||
d.Lock()
|
||||
d.freqMap[token.Text()] = token.Frequency()
|
||||
if len(token.Pos()) > 0 {
|
||||
d.posMap[token.Text()] = token.Pos()
|
||||
}
|
||||
d.Unlock()
|
||||
}
|
||||
|
||||
func TestLoadDictionary(t *testing.T) {
|
||||
d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
||||
err := LoadDictionary(d, "../userdict.txt")
|
||||
if err != nil {
|
||||
t.Fatalf(err.Error())
|
||||
}
|
||||
if len(d.freqMap) != 7 {
|
||||
t.Fatalf("Failed to load userdict.txt, got %d tokens with frequency, expected 7",
|
||||
len(d.freqMap))
|
||||
}
|
||||
if len(d.posMap) != 6 {
|
||||
t.Fatalf("Failed to load userdict.txt, got %d tokens with pos, expected 6", len(d.posMap))
|
||||
}
|
||||
}
|
||||
|
||||
func TestAddToken(t *testing.T) {
|
||||
d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
||||
LoadDictionary(d, "../userdict.txt")
|
||||
d.AddToken(Token{"好用", 99, "a"})
|
||||
if d.freqMap["好用"] != 99 {
|
||||
t.Fatalf("Failed to add token, got frequency %f, expected 99", d.freqMap["好用"])
|
||||
}
|
||||
if d.posMap["好用"] != "a" {
|
||||
t.Fatalf("Failed to add token, got pos %s, expected \"a\"", d.posMap["好用"])
|
||||
}
|
||||
}
|
||||
28
dictionary/token.go
Normal file
28
dictionary/token.go
Normal file
@@ -0,0 +1,28 @@
|
||||
package dictionary
|
||||
|
||||
// Token represents a Chinese word with (optional) frequency and POS.
|
||||
type Token struct {
|
||||
text string
|
||||
frequency float64
|
||||
pos string
|
||||
}
|
||||
|
||||
// Text returns token's text.
|
||||
func (t Token) Text() string {
|
||||
return t.text
|
||||
}
|
||||
|
||||
// Frequency returns token's frequency.
|
||||
func (t Token) Frequency() float64 {
|
||||
return t.frequency
|
||||
}
|
||||
|
||||
// Pos returns token's POS.
|
||||
func (t Token) Pos() string {
|
||||
return t.pos
|
||||
}
|
||||
|
||||
// NewToken creates a new token.
|
||||
func NewToken(text string, frequency float64, pos string) Token {
|
||||
return Token{text: text, frequency: frequency, pos: pos}
|
||||
}
|
||||
126
example_bleve_test.go
Normal file
126
example_bleve_test.go
Normal file
@@ -0,0 +1,126 @@
|
||||
package jiebago_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
|
||||
"github.com/blevesearch/bleve"
|
||||
_ "github.com/wangbin/jiebago"
|
||||
)
|
||||
|
||||
func ExampleBeleveSearch() {
|
||||
// open a new index
|
||||
indexMapping := bleve.NewIndexMapping()
|
||||
|
||||
err := indexMapping.AddCustomTokenizer("jieba",
|
||||
map[string]interface{}{
|
||||
"file": "dict.txt",
|
||||
"type": "jieba",
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
// create a custom analyzer
|
||||
err = indexMapping.AddCustomAnalyzer("jieba",
|
||||
map[string]interface{}{
|
||||
"type": "custom",
|
||||
"tokenizer": "jieba",
|
||||
"token_filters": []string{
|
||||
"possessive_en",
|
||||
"to_lower",
|
||||
"stop_en",
|
||||
},
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
indexMapping.DefaultAnalyzer = "jieba"
|
||||
cacheDir := "jieba.beleve"
|
||||
os.RemoveAll(cacheDir)
|
||||
index, err := bleve.New(cacheDir, indexMapping)
|
||||
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
docs := []struct {
|
||||
Title string
|
||||
Name string
|
||||
}{
|
||||
{
|
||||
Title: "Doc 1",
|
||||
Name: "This is the first document we’ve added",
|
||||
},
|
||||
{
|
||||
Title: "Doc 2",
|
||||
Name: "The second one 你 中文测试中文 is even more interesting! 吃水果",
|
||||
},
|
||||
{
|
||||
Title: "Doc 3",
|
||||
Name: "买水果然后来世博园。",
|
||||
},
|
||||
{
|
||||
Title: "Doc 4",
|
||||
Name: "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
|
||||
},
|
||||
{
|
||||
Title: "Doc 5",
|
||||
Name: "咱俩交换一下吧。",
|
||||
},
|
||||
}
|
||||
// index docs
|
||||
for _, doc := range docs {
|
||||
index.Index(doc.Title, doc)
|
||||
}
|
||||
|
||||
// search for some text
|
||||
for _, keyword := range []string{"水果世博园", "你", "first", "中文", "交换机", "交换"} {
|
||||
query := bleve.NewMatchQuery(keyword)
|
||||
search := bleve.NewSearchRequest(query)
|
||||
search.Highlight = bleve.NewHighlight()
|
||||
searchResults, err := index.Search(search)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
fmt.Printf("Result of \"%s\": %d matches:\n", keyword, searchResults.Total)
|
||||
for i, hit := range searchResults.Hits {
|
||||
rv := fmt.Sprintf("%d. %s, (%f)\n", i+searchResults.Request.From+1, hit.ID, hit.Score)
|
||||
for fragmentField, fragments := range hit.Fragments {
|
||||
rv += fmt.Sprintf("%s: ", fragmentField)
|
||||
for _, fragment := range fragments {
|
||||
rv += fmt.Sprintf("%s", fragment)
|
||||
}
|
||||
}
|
||||
fmt.Printf("%s\n", rv)
|
||||
}
|
||||
}
|
||||
// Output:
|
||||
// Result of "水果世博园": 2 matches:
|
||||
// 1. Doc 3, (1.099550)
|
||||
// Name: 买<span class="highlight">水果</span>然后来<span class="highlight">世博</span>园。
|
||||
// 2. Doc 2, (0.031941)
|
||||
// Name: The second one 你 中文测试中文 is even more interesting! 吃<span class="highlight">水果</span>
|
||||
// Result of "你": 1 matches:
|
||||
// 1. Doc 2, (0.391161)
|
||||
// Name: The second one <span class="highlight">你</span> 中文测试中文 is even more interesting! 吃水果
|
||||
// Result of "first": 1 matches:
|
||||
// 1. Doc 1, (0.512150)
|
||||
// Name: This is the <span class="highlight">first</span> document we’ve added
|
||||
// Result of "中文": 1 matches:
|
||||
// 1. Doc 2, (0.553186)
|
||||
// Name: The second one 你 <span class="highlight">中文</span>测试<span class="highlight">中文</span> is even more interesting! 吃水果
|
||||
// Result of "交换机": 2 matches:
|
||||
// 1. Doc 4, (0.608495)
|
||||
// Name: 工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换机</span>等技术性器件的安装工作
|
||||
// 2. Doc 5, (0.086700)
|
||||
// Name: 咱俩<span class="highlight">交换</span>一下吧。
|
||||
// Result of "交换": 2 matches:
|
||||
// 1. Doc 5, (0.534158)
|
||||
// Name: 咱俩<span class="highlight">交换</span>一下吧。
|
||||
// 2. Doc 4, (0.296297)
|
||||
// Name: 工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换</span>机等技术性器件的安装工作
|
||||
}
|
||||
100
example_parallel_cut_test.go
Normal file
100
example_parallel_cut_test.go
Normal file
@@ -0,0 +1,100 @@
|
||||
package jiebago_test
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"runtime"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/wangbin/jiebago"
|
||||
)
|
||||
|
||||
type line struct {
|
||||
number int
|
||||
text string
|
||||
}
|
||||
|
||||
var (
|
||||
segmenter = jiebago.Segmenter{}
|
||||
numThreads = runtime.NumCPU()
|
||||
task = make(chan line, numThreads)
|
||||
result = make(chan line, numThreads)
|
||||
)
|
||||
|
||||
func worker() {
|
||||
for l := range task {
|
||||
var segments []string
|
||||
for segment := range segmenter.Cut(l.text, true) {
|
||||
segments = append(segments, segment)
|
||||
}
|
||||
|
||||
l.text = fmt.Sprintf("%s\n", strings.Join(segments, " / "))
|
||||
result <- l
|
||||
}
|
||||
}
|
||||
|
||||
func Example_parallelCut() {
|
||||
// Set the number of goroutines
|
||||
runtime.GOMAXPROCS(numThreads)
|
||||
|
||||
// Load dictionary
|
||||
segmenter.LoadDictionary("dict.txt")
|
||||
|
||||
// open file for segmentation
|
||||
file, err := os.Open("README.md")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
// start worker routines
|
||||
for i := 0; i < numThreads; i++ {
|
||||
go worker()
|
||||
}
|
||||
|
||||
var length, size int
|
||||
scanner := bufio.NewScanner(file)
|
||||
|
||||
t0 := time.Now()
|
||||
|
||||
lines := make([]string, 0)
|
||||
|
||||
// Read lines
|
||||
for scanner.Scan() {
|
||||
t := scanner.Text()
|
||||
size += len(t)
|
||||
lines = append(lines, t)
|
||||
}
|
||||
length = len(lines)
|
||||
|
||||
// Segmentation
|
||||
go func() {
|
||||
for i := 0; i < length; i++ {
|
||||
task <- line{number: i, text: lines[i]}
|
||||
}
|
||||
close(task)
|
||||
}()
|
||||
|
||||
// Make sure the segmentation result contains same line as original file
|
||||
for i := 0; i < length; i++ {
|
||||
l := <-result
|
||||
lines[l.number] = l.text
|
||||
}
|
||||
|
||||
t1 := time.Now()
|
||||
|
||||
// Write the segments into a file for verification
|
||||
outputFile, _ := os.OpenFile("parallelCut.log", os.O_CREATE|os.O_WRONLY, 0600)
|
||||
defer outputFile.Close()
|
||||
writer := bufio.NewWriter(outputFile)
|
||||
for _, l := range lines {
|
||||
writer.WriteString(l)
|
||||
}
|
||||
writer.Flush()
|
||||
|
||||
log.Printf("Time cousumed: %v", t1.Sub(t0))
|
||||
log.Printf("Segmentation speed: %f MB/s", float64(size)/t1.Sub(t0).Seconds()/(1024*1024))
|
||||
}
|
||||
88
example_test.go
Normal file
88
example_test.go
Normal file
@@ -0,0 +1,88 @@
|
||||
package jiebago_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/wangbin/jiebago"
|
||||
)
|
||||
|
||||
var seg jiebago.Segmenter
|
||||
|
||||
func init() {
|
||||
seg.LoadDictionary("dict.txt")
|
||||
}
|
||||
|
||||
func print(ch <-chan string) {
|
||||
for word := range ch {
|
||||
fmt.Printf(" %s /", word)
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
func Example() {
|
||||
fmt.Print("【全模式】:")
|
||||
print(seg.CutAll("我来到北京清华大学"))
|
||||
|
||||
fmt.Print("【精确模式】:")
|
||||
print(seg.Cut("我来到北京清华大学", false))
|
||||
|
||||
fmt.Print("【新词识别】:")
|
||||
print(seg.Cut("他来到了网易杭研大厦", true))
|
||||
|
||||
fmt.Print("【搜索引擎模式】:")
|
||||
print(seg.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", true))
|
||||
// Output:
|
||||
// 【全模式】: 我 / 来到 / 北京 / 清华 / 清华大学 / 华大 / 大学 /
|
||||
// 【精确模式】: 我 / 来到 / 北京 / 清华大学 /
|
||||
// 【新词识别】: 他 / 来到 / 了 / 网易 / 杭研 / 大厦 /
|
||||
// 【搜索引擎模式】: 小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / , / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 /
|
||||
}
|
||||
|
||||
func ExampleLoadUserDictionary() {
|
||||
var sentence = "李小福是创新办主任也是云计算方面的专家"
|
||||
fmt.Print("Before:")
|
||||
print(seg.Cut(sentence, true))
|
||||
|
||||
seg.LoadUserDictionary("userdict.txt")
|
||||
|
||||
fmt.Print("After:")
|
||||
print(seg.Cut(sentence, true))
|
||||
// Output:
|
||||
// Before: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
|
||||
// After: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
|
||||
}
|
||||
|
||||
func ExampleTokenize() {
|
||||
var sentence = []byte("永和服装饰品有限公司")
|
||||
|
||||
// default mode
|
||||
tokenizer, _ := jiebago.NewJiebaTokenizer("dict.txt", true, false)
|
||||
fmt.Println("Default Mode:")
|
||||
for _, token := range tokenizer.Tokenize(sentence) {
|
||||
fmt.Printf(
|
||||
"Term: %s Start: %d End: %d Position: %d Type: %d\n",
|
||||
token.Term, token.Start, token.End, token.Position, token.Type)
|
||||
}
|
||||
|
||||
//search mode
|
||||
tokenizer, _ = jiebago.NewJiebaTokenizer("dict.txt", true, true)
|
||||
fmt.Println("Search Mode:")
|
||||
for _, token := range tokenizer.Tokenize(sentence) {
|
||||
fmt.Printf(
|
||||
"Term: %s Start: %d End: %d Position: %d Type: %d\n",
|
||||
token.Term, token.Start, token.End, token.Position, token.Type)
|
||||
}
|
||||
// Output:
|
||||
// Default Mode:
|
||||
// Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
|
||||
// Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
|
||||
// Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
|
||||
// Term: 有限公司 Start: 18 End: 30 Position: 4 Type: 1
|
||||
// Search Mode:
|
||||
// Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
|
||||
// Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
|
||||
// Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
|
||||
// Term: 有限 Start: 18 End: 24 Position: 4 Type: 1
|
||||
// Term: 公司 Start: 24 End: 30 Position: 5 Type: 1
|
||||
// Term: 有限公司 Start: 18 End: 30 Position: 6 Type: 1
|
||||
}
|
||||
@@ -1,3 +1,4 @@
|
||||
// Package finalseg is the Golang implementation of Jieba's finalseg module.
|
||||
package finalseg
|
||||
|
||||
import (
|
||||
@@ -13,10 +14,10 @@ func cutHan(sentence string) chan string {
|
||||
result := make(chan string)
|
||||
go func() {
|
||||
runes := []rune(sentence)
|
||||
_, pos_list := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
|
||||
_, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
|
||||
begin, next := 0, 0
|
||||
for i, char := range runes {
|
||||
pos := pos_list[i]
|
||||
pos := posList[i]
|
||||
switch pos {
|
||||
case 'B':
|
||||
begin = i
|
||||
@@ -36,6 +37,8 @@ func cutHan(sentence string) chan string {
|
||||
return result
|
||||
}
|
||||
|
||||
// Cut cuts sentence into words using Hidden Markov Model with Viterbi
|
||||
// algorithm. It is used by Jiebago for unknown words.
|
||||
func Cut(sentence string) chan string {
|
||||
result := make(chan string)
|
||||
s := sentence
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
)
|
||||
|
||||
func chanToArray(ch chan string) []string {
|
||||
result := make([]string, 0)
|
||||
var result []string
|
||||
for word := range ch {
|
||||
result = append(result, word)
|
||||
}
|
||||
@@ -18,11 +18,11 @@ func TestViterbi(t *testing.T) {
|
||||
states := []byte{'B', 'M', 'E', 'S'}
|
||||
prob, path := viterbi([]rune(obs), states)
|
||||
if math.Abs(prob+39.68824128493802) > 1e-10 {
|
||||
t.Error(prob)
|
||||
t.Fatal(prob)
|
||||
}
|
||||
for index, state := range []byte{'B', 'E', 'S', 'B', 'M', 'E'} {
|
||||
if path[index] != state {
|
||||
t.Error(path)
|
||||
t.Fatal(path)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -31,16 +31,16 @@ func TestCutHan(t *testing.T) {
|
||||
obs := "我们是程序员"
|
||||
result := chanToArray(cutHan(obs))
|
||||
if len(result) != 3 {
|
||||
t.Error(result)
|
||||
t.Fatal(result)
|
||||
}
|
||||
if result[0] != "我们" {
|
||||
t.Error(result[0])
|
||||
t.Fatal(result[0])
|
||||
}
|
||||
if result[1] != "是" {
|
||||
t.Error(result[1])
|
||||
t.Fatal(result[1])
|
||||
}
|
||||
if result[2] != "程序员" {
|
||||
t.Error(result[2])
|
||||
t.Fatal(result[2])
|
||||
}
|
||||
}
|
||||
|
||||
@@ -48,24 +48,24 @@ func TestCut(t *testing.T) {
|
||||
sentence := "我们是程序员"
|
||||
result := chanToArray(Cut(sentence))
|
||||
if len(result) != 3 {
|
||||
t.Error(len(result))
|
||||
t.Fatal(len(result))
|
||||
}
|
||||
if result[0] != "我们" {
|
||||
t.Error(result[0])
|
||||
t.Fatal(result[0])
|
||||
}
|
||||
if result[1] != "是" {
|
||||
t.Error(result[1])
|
||||
t.Fatal(result[1])
|
||||
}
|
||||
if result[2] != "程序员" {
|
||||
t.Error(result[2])
|
||||
t.Fatal(result[2])
|
||||
}
|
||||
result2 := chanToArray(Cut("I'm a programmer!"))
|
||||
if len(result2) != 8 {
|
||||
t.Error(result2)
|
||||
t.Fatal(result2)
|
||||
}
|
||||
result3 := chanToArray(Cut("程序员average年龄28.6岁。"))
|
||||
if len(result3) != 6 {
|
||||
t.Error(result3)
|
||||
t.Fatal(result3)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -67,11 +67,11 @@ func viterbi(obs []rune, states []byte) (float64, []byte) {
|
||||
V[t] = make(map[byte]float64)
|
||||
for _, y := range states {
|
||||
ps0 := make(probStates, 0)
|
||||
var em_p float64
|
||||
var emP float64
|
||||
if val, ok := probEmit[y][obs[t]]; ok {
|
||||
em_p = val
|
||||
emP = val
|
||||
} else {
|
||||
em_p = minFloat
|
||||
emP = minFloat
|
||||
}
|
||||
for _, y0 := range prevStatus[y] {
|
||||
var transP float64
|
||||
@@ -80,7 +80,7 @@ func viterbi(obs []rune, states []byte) (float64, []byte) {
|
||||
} else {
|
||||
transP = minFloat
|
||||
}
|
||||
prob0 := V[t-1][y0] + transP + em_p
|
||||
prob0 := V[t-1][y0] + transP + emP
|
||||
ps0 = append(ps0, &probState{prob: prob0, state: y0})
|
||||
}
|
||||
sort.Sort(sort.Reverse(ps0))
|
||||
|
||||
409
jieba.go
409
jieba.go
@@ -1,190 +1,151 @@
|
||||
// Golang implemention of jieba (Python Chinese word segmentation module).
|
||||
// Package jiebago is the Golang implementation of [Jieba](https://github.com/fxsjy/jieba), Python Chinese text segmentation module.
|
||||
package jiebago
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/wangbin/jiebago/finalseg"
|
||||
"math"
|
||||
"regexp"
|
||||
"sort"
|
||||
|
||||
"github.com/wangbin/jiebago/finalseg"
|
||||
"github.com/wangbin/jiebago/util"
|
||||
)
|
||||
|
||||
var (
|
||||
// Word/Tag Map load from user dictionary
|
||||
UserWordTagTab = make(map[string]string)
|
||||
reEng = regexp.MustCompile(`[[:alnum:]]`)
|
||||
reHanCutAll = regexp.MustCompile(`\p{Han}+`)
|
||||
reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`)
|
||||
reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
|
||||
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
|
||||
reEng = regexp.MustCompile(`[[:alnum:]]`)
|
||||
reHanCutAll = regexp.MustCompile(`(\p{Han}+)`)
|
||||
reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`)
|
||||
reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
|
||||
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
|
||||
)
|
||||
|
||||
type route struct {
|
||||
Freq float64
|
||||
Index int
|
||||
// Segmenter is a Chinese words segmentation struct.
|
||||
type Segmenter struct {
|
||||
dict *Dictionary
|
||||
}
|
||||
|
||||
func (r route) String() string {
|
||||
return fmt.Sprintf("(%f, %d)", r.Freq, r.Index)
|
||||
// LoadDictionary loads dictionary from given file name. Every time
|
||||
// LoadDictionary is called, previously loaded dictionary will be cleared.
|
||||
func (seg *Segmenter) LoadDictionary(fileName string) error {
|
||||
seg.dict = &Dictionary{freqMap: make(map[string]float64)}
|
||||
return seg.dict.loadDictionary(fileName)
|
||||
}
|
||||
|
||||
type routes []*route
|
||||
|
||||
func (rs routes) Len() int {
|
||||
return len(rs)
|
||||
// LoadUserDictionary loads a user specified dictionary, it must be called
|
||||
// after LoadDictionary, and it will not clear any previously loaded dictionary,
|
||||
// instead it will override existing entries.
|
||||
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
|
||||
return seg.dict.loadDictionary(fileName)
|
||||
}
|
||||
|
||||
func (rs routes) Less(i, j int) bool {
|
||||
if rs[i].Freq < rs[j].Freq {
|
||||
return true
|
||||
}
|
||||
if rs[i].Freq == rs[j].Freq {
|
||||
return rs[i].Index < rs[j].Index
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (rs routes) Swap(i, j int) {
|
||||
rs[i], rs[j] = rs[j], rs[i]
|
||||
}
|
||||
|
||||
// Split sentence using regular expression.
|
||||
func RegexpSplit(r *regexp.Regexp, sentence string) []string {
|
||||
result := make([]string, 0)
|
||||
locs := r.FindAllStringIndex(sentence, -1)
|
||||
lastLoc := 0
|
||||
if len(locs) == 0 {
|
||||
return []string{sentence}
|
||||
}
|
||||
for _, loc := range locs {
|
||||
if loc[0] == lastLoc {
|
||||
result = append(result, sentence[loc[0]:loc[1]])
|
||||
} else {
|
||||
result = append(result, sentence[lastLoc:loc[0]])
|
||||
result = append(result, sentence[loc[0]:loc[1]])
|
||||
}
|
||||
lastLoc = loc[1]
|
||||
}
|
||||
if lastLoc < len(sentence) {
|
||||
result = append(result, sentence[lastLoc:])
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// Build a directed acyclic graph (DAG) for sentence.
|
||||
func DAG(sentence string) map[int][]int {
|
||||
func (seg *Segmenter) dag(runes []rune) map[int][]int {
|
||||
dag := make(map[int][]int)
|
||||
runes := []rune(sentence)
|
||||
n := len(runes)
|
||||
i := 0
|
||||
var frag string
|
||||
var frag []rune
|
||||
var i int
|
||||
for k := 0; k < n; k++ {
|
||||
tmpList := make([]int, 0)
|
||||
dag[k] = make([]int, 0)
|
||||
i = k
|
||||
frag = string(runes[k])
|
||||
frag = runes[k : k+1]
|
||||
for {
|
||||
if freq, ok := Trie.Freq[frag]; !ok {
|
||||
freq, ok := seg.dict.Frequency(string(frag))
|
||||
if !ok {
|
||||
break
|
||||
} else {
|
||||
if freq > 0.0 {
|
||||
tmpList = append(tmpList, i)
|
||||
}
|
||||
}
|
||||
i += 1
|
||||
if freq > 0.0 {
|
||||
dag[k] = append(dag[k], i)
|
||||
}
|
||||
i++
|
||||
if i >= n {
|
||||
break
|
||||
}
|
||||
frag = string(runes[k : i+1])
|
||||
frag = runes[k : i+1]
|
||||
}
|
||||
if len(tmpList) == 0 {
|
||||
tmpList = append(tmpList, k)
|
||||
if len(dag[k]) == 0 {
|
||||
dag[k] = append(dag[k], k)
|
||||
}
|
||||
dag[k] = tmpList
|
||||
}
|
||||
return dag
|
||||
}
|
||||
|
||||
func Calc(sentence string, dag map[int][]int) map[int]*route {
|
||||
runes := []rune(sentence)
|
||||
number := len(runes)
|
||||
rs := make(map[int]*route)
|
||||
rs[number] = &route{Freq: 0.0, Index: 0}
|
||||
logTotal := math.Log(Trie.Total)
|
||||
for idx := number - 1; idx >= 0; idx-- {
|
||||
candidates := make(routes, 0)
|
||||
type route struct {
|
||||
frequency float64
|
||||
index int
|
||||
}
|
||||
|
||||
func (seg *Segmenter) calc(runes []rune) map[int]route {
|
||||
dag := seg.dag(runes)
|
||||
n := len(runes)
|
||||
rs := make(map[int]route)
|
||||
rs[n] = route{frequency: 0.0, index: 0}
|
||||
var r route
|
||||
for idx := n - 1; idx >= 0; idx-- {
|
||||
for _, i := range dag[idx] {
|
||||
word := string(runes[idx : i+1])
|
||||
var r *route
|
||||
if _, ok := Trie.Freq[word]; ok {
|
||||
r = &route{Freq: math.Log(Trie.Freq[word]) - logTotal + rs[i+1].Freq, Index: i}
|
||||
if freq, ok := seg.dict.Frequency(string(runes[idx : i+1])); ok {
|
||||
r = route{frequency: math.Log(freq) - seg.dict.logTotal + rs[i+1].frequency, index: i}
|
||||
} else {
|
||||
r = &route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i}
|
||||
r = route{frequency: math.Log(1.0) - seg.dict.logTotal + rs[i+1].frequency, index: i}
|
||||
}
|
||||
if v, ok := rs[idx]; !ok {
|
||||
rs[idx] = r
|
||||
} else {
|
||||
if v.frequency < r.frequency || (v.frequency == r.frequency && v.index < r.index) {
|
||||
rs[idx] = r
|
||||
}
|
||||
}
|
||||
candidates = append(candidates, r)
|
||||
}
|
||||
sort.Sort(sort.Reverse(candidates))
|
||||
rs[idx] = candidates[0]
|
||||
}
|
||||
return rs
|
||||
}
|
||||
|
||||
type cutFunc func(sentence string) chan string
|
||||
type cutFunc func(sentence string) <-chan string
|
||||
|
||||
func cutDAG(sentence string) chan string {
|
||||
func (seg *Segmenter) cutDAG(sentence string) <-chan string {
|
||||
result := make(chan string)
|
||||
go func() {
|
||||
dag := DAG(sentence)
|
||||
routes := Calc(sentence, dag)
|
||||
x := 0
|
||||
var y int
|
||||
runes := []rune(sentence)
|
||||
routes := seg.calc(runes)
|
||||
var y int
|
||||
length := len(runes)
|
||||
buf := make([]rune, 0)
|
||||
for {
|
||||
if x >= length {
|
||||
break
|
||||
}
|
||||
y = routes[x].Index + 1
|
||||
l_word := runes[x:y]
|
||||
var buf []rune
|
||||
for x := 0; x < length; {
|
||||
y = routes[x].index + 1
|
||||
frag := runes[x:y]
|
||||
if y-x == 1 {
|
||||
buf = append(buf, l_word...)
|
||||
buf = append(buf, frag...)
|
||||
} else {
|
||||
if len(buf) > 0 {
|
||||
bufString := string(buf)
|
||||
if len(buf) == 1 {
|
||||
result <- string(buf)
|
||||
buf = make([]rune, 0)
|
||||
result <- bufString
|
||||
} else {
|
||||
bufString := string(buf)
|
||||
if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 {
|
||||
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
|
||||
for x := range finalseg.Cut(bufString) {
|
||||
result <- x
|
||||
}
|
||||
} else {
|
||||
for _, elem := range buf {
|
||||
result <- string(elem) // TODO: I don't get this?
|
||||
result <- string(elem)
|
||||
}
|
||||
}
|
||||
buf = make([]rune, 0)
|
||||
}
|
||||
buf = make([]rune, 0)
|
||||
}
|
||||
result <- string(l_word)
|
||||
result <- string(frag)
|
||||
}
|
||||
x = y
|
||||
}
|
||||
|
||||
if len(buf) > 0 {
|
||||
bufString := string(buf)
|
||||
if len(buf) == 1 {
|
||||
result <- string(buf)
|
||||
result <- bufString
|
||||
} else {
|
||||
bufString := string(buf)
|
||||
if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 {
|
||||
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
|
||||
for t := range finalseg.Cut(bufString) {
|
||||
result <- t
|
||||
}
|
||||
} else {
|
||||
for _, elem := range buf {
|
||||
result <- string(elem) // TODO: I don't get this?
|
||||
result <- string(elem)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -194,32 +155,27 @@ func cutDAG(sentence string) chan string {
|
||||
return result
|
||||
}
|
||||
|
||||
func cutDAGNoHMM(sentence string) chan string {
|
||||
func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string {
|
||||
result := make(chan string)
|
||||
|
||||
go func() {
|
||||
dag := DAG(sentence)
|
||||
routes := Calc(sentence, dag)
|
||||
x := 0
|
||||
var y int
|
||||
runes := []rune(sentence)
|
||||
routes := seg.calc(runes)
|
||||
var y int
|
||||
length := len(runes)
|
||||
buf := make([]rune, 0)
|
||||
for {
|
||||
if x >= length {
|
||||
break
|
||||
}
|
||||
y = routes[x].Index + 1
|
||||
l_word := runes[x:y]
|
||||
if reEng.MatchString(string(l_word)) && len(l_word) == 1 {
|
||||
buf = append(buf, l_word...)
|
||||
var buf []rune
|
||||
for x := 0; x < length; {
|
||||
y = routes[x].index + 1
|
||||
frag := runes[x:y]
|
||||
if reEng.MatchString(string(frag)) && len(frag) == 1 {
|
||||
buf = append(buf, frag...)
|
||||
x = y
|
||||
} else {
|
||||
if len(buf) > 0 {
|
||||
result <- string(buf)
|
||||
buf = make([]rune, 0)
|
||||
}
|
||||
result <- string(l_word)
|
||||
result <- string(frag)
|
||||
x = y
|
||||
}
|
||||
}
|
||||
@@ -232,101 +188,37 @@ func cutDAGNoHMM(sentence string) chan string {
|
||||
return result
|
||||
}
|
||||
|
||||
func cutAll(sentence string) chan string {
|
||||
// Cut cuts a sentence into words using accurate mode.
|
||||
// Parameter hmm controls whether to use the Hidden Markov Model.
|
||||
// Accurate mode attempts to cut the sentence into the most accurate
|
||||
// segmentations, which is suitable for text analysis.
|
||||
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan string {
|
||||
result := make(chan string)
|
||||
var cut cutFunc
|
||||
if hmm {
|
||||
cut = seg.cutDAG
|
||||
} else {
|
||||
cut = seg.cutDAGNoHMM
|
||||
}
|
||||
|
||||
go func() {
|
||||
runes := []rune(sentence)
|
||||
dag := DAG(sentence)
|
||||
old_j := -1
|
||||
ks := make([]int, 0)
|
||||
for k := range dag {
|
||||
ks = append(ks, k)
|
||||
}
|
||||
sort.Ints(ks)
|
||||
for k := range ks {
|
||||
l := dag[k]
|
||||
if len(l) == 1 && k > old_j {
|
||||
result <- string(runes[k : l[0]+1])
|
||||
old_j = l[0]
|
||||
} else {
|
||||
for _, j := range l {
|
||||
if j > k {
|
||||
result <- string(runes[k : j+1])
|
||||
old_j = j
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
return result
|
||||
}
|
||||
|
||||
/*
|
||||
Cut sentence.
|
||||
|
||||
isCutAll controls whether to use full mode or accurate mode.
|
||||
|
||||
Full Mode gets all the possible words from the sentence. Fast but not accurate.
|
||||
|
||||
Accurate Mode attempts to cut the sentence into the most accurate segmentations,
|
||||
which is suitable for text analysis.
|
||||
|
||||
HMM controls whether to use the Hidden Markov Model.
|
||||
*/
|
||||
func Cut(sentence string, isCutAll bool, HMM bool) chan string {
|
||||
result := make(chan string)
|
||||
go func() {
|
||||
var reHan, reSkip *regexp.Regexp
|
||||
if isCutAll {
|
||||
reHan = reHanCutAll
|
||||
reSkip = reSkipCutAll
|
||||
} else {
|
||||
reHan = reHanDefault
|
||||
reSkip = reSkipDefault
|
||||
}
|
||||
blocks := RegexpSplit(reHan, sentence)
|
||||
var cut cutFunc
|
||||
if HMM {
|
||||
cut = cutDAG
|
||||
} else {
|
||||
cut = cutDAGNoHMM
|
||||
}
|
||||
if isCutAll {
|
||||
cut = cutAll
|
||||
}
|
||||
for _, blk := range blocks {
|
||||
if len(blk) == 0 {
|
||||
for _, block := range util.RegexpSplit(reHanDefault, sentence, -1) {
|
||||
if len(block) == 0 {
|
||||
continue
|
||||
}
|
||||
if reHan.MatchString(blk) {
|
||||
for x := range cut(blk) {
|
||||
if reHanDefault.MatchString(block) {
|
||||
for x := range cut(block) {
|
||||
result <- x
|
||||
}
|
||||
} else {
|
||||
type skipSplitFunc func(sentence string) []string
|
||||
var ssf skipSplitFunc
|
||||
if isCutAll {
|
||||
ssf = func(sentence string) []string {
|
||||
return reSkip.Split(sentence, -1)
|
||||
}
|
||||
} else {
|
||||
ssf = func(sentence string) []string {
|
||||
return RegexpSplit(reSkip, sentence)
|
||||
}
|
||||
continue
|
||||
}
|
||||
for _, subBlock := range util.RegexpSplit(reSkipDefault, block, -1) {
|
||||
if reSkipDefault.MatchString(subBlock) {
|
||||
result <- subBlock
|
||||
continue
|
||||
}
|
||||
|
||||
for _, x := range ssf(blk) {
|
||||
if reSkip.MatchString(x) {
|
||||
result <- x
|
||||
} else if !isCutAll {
|
||||
for _, xx := range x {
|
||||
result <- string(xx)
|
||||
}
|
||||
} else {
|
||||
result <- x
|
||||
}
|
||||
for _, r := range subBlock {
|
||||
result <- string(r)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -335,22 +227,79 @@ func Cut(sentence string, isCutAll bool, HMM bool) chan string {
|
||||
return result
|
||||
}
|
||||
|
||||
// Cut sentence using Search Engine Mode, based on the Accurate Mode, attempts
|
||||
// to cut long words into several short words, which can raise the recall rate.
|
||||
// Suitable for search engines.
|
||||
func CutForSearch(sentence string, hmm bool) chan string {
|
||||
func (seg *Segmenter) cutAll(sentence string) <-chan string {
|
||||
result := make(chan string)
|
||||
go func() {
|
||||
for word := range Cut(sentence, false, hmm) {
|
||||
runes := []rune(sentence)
|
||||
dag := seg.dag(runes)
|
||||
start := -1
|
||||
ks := make([]int, len(dag))
|
||||
for k := range dag {
|
||||
ks[k] = k
|
||||
}
|
||||
var l []int
|
||||
for k := range ks {
|
||||
l = dag[k]
|
||||
if len(l) == 1 && k > start {
|
||||
result <- string(runes[k : l[0]+1])
|
||||
start = l[0]
|
||||
continue
|
||||
}
|
||||
for _, j := range l {
|
||||
if j > k {
|
||||
result <- string(runes[k : j+1])
|
||||
start = j
|
||||
}
|
||||
}
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
return result
|
||||
}
|
||||
|
||||
// CutAll cuts a sentence into words using full mode.
|
||||
// Full mode gets all the possible words from the sentence.
|
||||
// Fast but not accurate.
|
||||
func (seg *Segmenter) CutAll(sentence string) <-chan string {
|
||||
result := make(chan string)
|
||||
go func() {
|
||||
for _, block := range util.RegexpSplit(reHanCutAll, sentence, -1) {
|
||||
if len(block) == 0 {
|
||||
continue
|
||||
}
|
||||
if reHanCutAll.MatchString(block) {
|
||||
for x := range seg.cutAll(block) {
|
||||
result <- x
|
||||
}
|
||||
continue
|
||||
}
|
||||
for _, subBlock := range reSkipCutAll.Split(block, -1) {
|
||||
result <- subBlock
|
||||
}
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
return result
|
||||
}
|
||||
|
||||
// CutForSearch cuts sentence into words using search engine mode.
|
||||
// Search engine mode, based on the accurate mode, attempts to cut long words
|
||||
// into several short words, which can raise the recall rate.
|
||||
// Suitable for search engines.
|
||||
func (seg *Segmenter) CutForSearch(sentence string, hmm bool) <-chan string {
|
||||
result := make(chan string)
|
||||
go func() {
|
||||
for word := range seg.Cut(sentence, hmm) {
|
||||
runes := []rune(word)
|
||||
for _, increment := range []int{2, 3} {
|
||||
if len(runes) > increment {
|
||||
var gram2 string
|
||||
for i := 0; i < len(runes)-increment+1; i++ {
|
||||
gram2 = string(runes[i : i+increment])
|
||||
if v, ok := Trie.Freq[gram2]; ok && v > 0.0 {
|
||||
result <- gram2
|
||||
}
|
||||
if len(runes) <= increment {
|
||||
continue
|
||||
}
|
||||
var gram string
|
||||
for i := 0; i < len(runes)-increment+1; i++ {
|
||||
gram = string(runes[i : i+increment])
|
||||
if v, ok := seg.dict.Frequency(gram); ok && v > 0.0 {
|
||||
result <- gram
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
154
jieba_test.go
154
jieba_test.go
@@ -1,12 +1,10 @@
|
||||
package jiebago
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"testing"
|
||||
)
|
||||
import "testing"
|
||||
|
||||
var (
|
||||
test_contents = []string{
|
||||
seg Segmenter
|
||||
testContents = []string{
|
||||
"这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
|
||||
"我不喜欢日本和服。",
|
||||
"雷猴回归人间。",
|
||||
@@ -618,11 +616,11 @@ var (
|
||||
)
|
||||
|
||||
func init() {
|
||||
SetDictionary("dict.txt")
|
||||
seg.LoadDictionary("dict.txt")
|
||||
}
|
||||
|
||||
func chanToArray(ch chan string) []string {
|
||||
result := make([]string, 0)
|
||||
func chanToArray(ch <-chan string) []string {
|
||||
var result []string
|
||||
for word := range ch {
|
||||
result = append(result, word)
|
||||
}
|
||||
@@ -630,43 +628,32 @@ func chanToArray(ch chan string) []string {
|
||||
}
|
||||
|
||||
func TestCutDAG(t *testing.T) {
|
||||
result := chanToArray(cutDAG("BP神经网络如何训练才能在分类时增加区分度?"))
|
||||
result := chanToArray(seg.cutDAG("BP神经网络如何训练才能在分类时增加区分度?"))
|
||||
if len(result) != 11 {
|
||||
t.Error(result)
|
||||
t.Fatal(result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCutDAGNoHmm(t *testing.T) {
|
||||
result := chanToArray(cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度?"))
|
||||
result := chanToArray(seg.cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度?"))
|
||||
if len(result) != 11 {
|
||||
t.Error(result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRegexpSplit(t *testing.T) {
|
||||
result := RegexpSplit(regexp.MustCompile(`\p{Han}+`),
|
||||
"BP神经网络如何训练才能在分类时增加区分度?")
|
||||
if len(result) != 3 {
|
||||
t.Error(result)
|
||||
}
|
||||
result = RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`),
|
||||
",BP神经网络如何训练才能在分类时#增加区分度?")
|
||||
if len(result) != 3 {
|
||||
t.Error(result)
|
||||
t.Fatal(result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDefaultCut(t *testing.T) {
|
||||
var result []string
|
||||
for index, content := range test_contents {
|
||||
result = chanToArray(Cut(content, false, true))
|
||||
for index, content := range testContents {
|
||||
result = chanToArray(seg.Cut(content, true))
|
||||
if len(result) != len(defaultCutResult[index]) {
|
||||
t.Errorf("default cut for %s length should be %d not %d\n",
|
||||
content, len(defaultCutResult[index]), len(result))
|
||||
t.Errorf("expect: %v\n", defaultCutResult[index])
|
||||
t.Fatalf("got: %v\n", result)
|
||||
}
|
||||
for i, r := range result {
|
||||
if r != defaultCutResult[index][i] {
|
||||
t.Error(r)
|
||||
t.Fatal(r)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -674,15 +661,17 @@ func TestDefaultCut(t *testing.T) {
|
||||
|
||||
func TestCutAll(t *testing.T) {
|
||||
var result []string
|
||||
for index, content := range test_contents {
|
||||
result = chanToArray(Cut(content, true, true))
|
||||
for index, content := range testContents {
|
||||
result = chanToArray(seg.CutAll(content))
|
||||
if len(result) != len(cutAllResult[index]) {
|
||||
t.Errorf("cut all for %s length should be %d not %d\n",
|
||||
content, len(cutAllResult[index]), len(result))
|
||||
t.Errorf("expect: %v\n", defaultCutResult[index])
|
||||
t.Fatalf("got: %v\n", result)
|
||||
}
|
||||
for i, c := range result {
|
||||
if c != cutAllResult[index][i] {
|
||||
t.Error(c)
|
||||
t.Fatal(c)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -690,15 +679,15 @@ func TestCutAll(t *testing.T) {
|
||||
|
||||
func TestDefaultCutNoHMM(t *testing.T) {
|
||||
var result []string
|
||||
for index, content := range test_contents {
|
||||
result = chanToArray(Cut(content, false, false))
|
||||
for index, content := range testContents {
|
||||
result = chanToArray(seg.Cut(content, false))
|
||||
if len(result) != len(defaultCutNoHMMResult[index]) {
|
||||
t.Errorf("default cut no hmm for %s length should be %d not %d\n",
|
||||
t.Fatalf("default cut no hmm for %s length should be %d not %d\n",
|
||||
content, len(defaultCutNoHMMResult[index]), len(result))
|
||||
}
|
||||
for i, c := range result {
|
||||
if c != defaultCutNoHMMResult[index][i] {
|
||||
t.Error(c)
|
||||
t.Fatal(c)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -706,88 +695,129 @@ func TestDefaultCutNoHMM(t *testing.T) {
|
||||
|
||||
func TestCutForSearch(t *testing.T) {
|
||||
var result []string
|
||||
for index, content := range test_contents {
|
||||
result = chanToArray(CutForSearch(content, true))
|
||||
for index, content := range testContents {
|
||||
result = chanToArray(seg.CutForSearch(content, true))
|
||||
if len(result) != len(cutForSearchResult[index]) {
|
||||
t.Errorf("cut for search for %s length should be %d not %d\n",
|
||||
t.Fatalf("cut for search for %s length should be %d not %d\n",
|
||||
content, len(cutForSearchResult[index]), len(result))
|
||||
}
|
||||
for i, c := range result {
|
||||
if c != cutForSearchResult[index][i] {
|
||||
t.Error(c)
|
||||
t.Fatal(c)
|
||||
}
|
||||
}
|
||||
}
|
||||
for index, content := range test_contents {
|
||||
result = chanToArray(CutForSearch(content, false))
|
||||
for index, content := range testContents {
|
||||
result = chanToArray(seg.CutForSearch(content, false))
|
||||
if len(result) != len(cutForSearchNoHMMResult[index]) {
|
||||
t.Errorf("cut for search no hmm for %s length should be %d not %d\n",
|
||||
t.Fatalf("cut for search no hmm for %s length should be %d not %d\n",
|
||||
content, len(cutForSearchNoHMMResult[index]), len(result))
|
||||
}
|
||||
for i, c := range result {
|
||||
if c != cutForSearchNoHMMResult[index][i] {
|
||||
t.Error(c)
|
||||
t.Fatal(c)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestSetdictionary(t *testing.T) {
|
||||
func TestLoadDictionary(t *testing.T) {
|
||||
var result []string
|
||||
SetDictionary("foobar.txt")
|
||||
for index, content := range test_contents {
|
||||
result = chanToArray(Cut(content, false, true))
|
||||
seg.LoadDictionary("foobar.txt")
|
||||
for index, content := range testContents {
|
||||
result = chanToArray(seg.Cut(content, true))
|
||||
if len(result) != len(userDictCutResult[index]) {
|
||||
t.Errorf("default cut with user dictionary for %s length should be %d not %d\n",
|
||||
t.Fatalf("default cut with user dictionary for %s length should be %d not %d\n",
|
||||
content, len(userDictCutResult[index]), len(result))
|
||||
}
|
||||
for i, c := range result {
|
||||
if c != userDictCutResult[index][i] {
|
||||
t.Error(c)
|
||||
t.Fatal(c)
|
||||
}
|
||||
}
|
||||
}
|
||||
seg.LoadDictionary("dict.txt")
|
||||
}
|
||||
|
||||
func TestLoadUserDict(t *testing.T) {
|
||||
SetDictionary("dict.txt")
|
||||
LoadUserDict("userdict.txt")
|
||||
func TestLoadUserDictionary(t *testing.T) {
|
||||
seg.LoadUserDictionary("userdict.txt")
|
||||
|
||||
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
||||
result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", ",", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"}
|
||||
|
||||
words := chanToArray(Cut(sentence, false, true))
|
||||
words := chanToArray(seg.Cut(sentence, true))
|
||||
if len(words) != len(result) {
|
||||
t.Error(len(words))
|
||||
t.Fatal(len(words))
|
||||
}
|
||||
for index, word := range words {
|
||||
if word != result[index] {
|
||||
t.Error(word)
|
||||
t.Fatal(word)
|
||||
}
|
||||
}
|
||||
|
||||
sentence = "easy_install is great"
|
||||
result = []string{"easy_install", " ", "is", " ", "great"}
|
||||
words = chanToArray(Cut(sentence, false, true))
|
||||
words = chanToArray(seg.Cut(sentence, true))
|
||||
if len(words) != len(result) {
|
||||
t.Error(len(words))
|
||||
t.Fatal(len(words))
|
||||
}
|
||||
for index, word := range words {
|
||||
if word != result[index] {
|
||||
t.Error(word)
|
||||
t.Fatal(word)
|
||||
}
|
||||
}
|
||||
|
||||
sentence = "python 的正则表达式是好用的"
|
||||
result = []string{"python", " ", "的", "正则表达式", "是", "好用", "的"}
|
||||
words = chanToArray(Cut(sentence, false, true))
|
||||
words = chanToArray(seg.Cut(sentence, true))
|
||||
if len(words) != len(result) {
|
||||
t.Error(words)
|
||||
t.Error(result)
|
||||
t.Fatal(words)
|
||||
t.Fatal(result)
|
||||
}
|
||||
for index, word := range words {
|
||||
if word != result[index] {
|
||||
t.Error(word)
|
||||
t.Fatal(word)
|
||||
}
|
||||
}
|
||||
seg.LoadDictionary("dict.txt")
|
||||
}
|
||||
|
||||
func BenchmarkCutNoHMM(b *testing.B) {
|
||||
sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
chanToArray(seg.Cut(sentence, false))
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkCut(b *testing.B) {
|
||||
sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
chanToArray(seg.Cut(sentence, true))
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkCutAll(b *testing.B) {
|
||||
sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
chanToArray(seg.CutAll(sentence))
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkCutForSearchNoHMM(b *testing.B) {
|
||||
sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
chanToArray(seg.CutForSearch(sentence, false))
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkCutForSearch(b *testing.B) {
|
||||
sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
chanToArray(seg.CutForSearch(sentence, true))
|
||||
}
|
||||
}
|
||||
|
||||
86170
posseg/char_state_tab.go
86170
posseg/char_state_tab.go
File diff suppressed because it is too large
Load Diff
74
posseg/dictionary.go
Normal file
74
posseg/dictionary.go
Normal file
@@ -0,0 +1,74 @@
|
||||
package posseg
|
||||
|
||||
import (
|
||||
"math"
|
||||
"sync"
|
||||
|
||||
"github.com/wangbin/jiebago/dictionary"
|
||||
)
|
||||
|
||||
// A Dictionary represents a thread-safe dictionary used for word segmentation.
|
||||
type Dictionary struct {
|
||||
total, logTotal float64
|
||||
freqMap map[string]float64
|
||||
posMap map[string]string
|
||||
sync.RWMutex
|
||||
}
|
||||
|
||||
// Load loads all tokens from given channel
|
||||
func (d *Dictionary) Load(ch <-chan dictionary.Token) {
|
||||
d.Lock()
|
||||
for token := range ch {
|
||||
d.addToken(token)
|
||||
}
|
||||
d.Unlock()
|
||||
d.updateLogTotal()
|
||||
}
|
||||
|
||||
// AddToken adds one token
|
||||
func (d *Dictionary) AddToken(token dictionary.Token) {
|
||||
d.Lock()
|
||||
d.addToken(token)
|
||||
d.Unlock()
|
||||
d.updateLogTotal()
|
||||
}
|
||||
|
||||
func (d *Dictionary) addToken(token dictionary.Token) {
|
||||
d.freqMap[token.Text()] = token.Frequency()
|
||||
d.total += token.Frequency()
|
||||
runes := []rune(token.Text())
|
||||
n := len(runes)
|
||||
for i := 0; i < n; i++ {
|
||||
frag := string(runes[:i+1])
|
||||
if _, ok := d.freqMap[frag]; !ok {
|
||||
d.freqMap[frag] = 0.0
|
||||
}
|
||||
}
|
||||
if len(token.Pos()) > 0 {
|
||||
d.posMap[token.Text()] = token.Pos()
|
||||
}
|
||||
}
|
||||
|
||||
func (d *Dictionary) updateLogTotal() {
|
||||
d.logTotal = math.Log(d.total)
|
||||
}
|
||||
|
||||
// Frequency returns the frequency and existence of give word
|
||||
func (d *Dictionary) Frequency(key string) (float64, bool) {
|
||||
d.RLock()
|
||||
freq, ok := d.freqMap[key]
|
||||
d.RUnlock()
|
||||
return freq, ok
|
||||
}
|
||||
|
||||
// Pos returns the POS and existence of give word
|
||||
func (d *Dictionary) Pos(key string) (string, bool) {
|
||||
d.RLock()
|
||||
pos, ok := d.posMap[key]
|
||||
d.RUnlock()
|
||||
return pos, ok
|
||||
}
|
||||
|
||||
func (d *Dictionary) loadDictionary(fileName string) error {
|
||||
return dictionary.LoadDictionary(d, fileName)
|
||||
}
|
||||
21
posseg/example_test.go
Normal file
21
posseg/example_test.go
Normal file
@@ -0,0 +1,21 @@
|
||||
package posseg_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/wangbin/jiebago/posseg"
|
||||
)
|
||||
|
||||
func Example() {
|
||||
var seg posseg.Segmenter
|
||||
seg.LoadDictionary("../dict.txt")
|
||||
|
||||
for segment := range seg.Cut("我爱北京天安门", true) {
|
||||
fmt.Printf("%s %s\n", segment.Text(), segment.Pos())
|
||||
}
|
||||
// Output:
|
||||
// 我 r
|
||||
// 爱 v
|
||||
// 北京 ns
|
||||
// 天安门 ns
|
||||
}
|
||||
411
posseg/posseg.go
411
posseg/posseg.go
@@ -1,14 +1,16 @@
|
||||
// Package posseg is the Golang implementation of Jieba's posseg module.
|
||||
package posseg
|
||||
|
||||
import (
|
||||
"github.com/wangbin/jiebago"
|
||||
"math"
|
||||
"regexp"
|
||||
|
||||
"github.com/wangbin/jiebago/util"
|
||||
)
|
||||
|
||||
var (
|
||||
wordTagMap = make(map[string]string)
|
||||
reHanDetail = regexp.MustCompile(`\p{Han}+`)
|
||||
reSkipDetail = regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`)
|
||||
reHanDetail = regexp.MustCompile(`(\p{Han}+)`)
|
||||
reSkipDetail = regexp.MustCompile(`([[\.[:digit:]]+|[:alnum:]]+)`)
|
||||
reEng = regexp.MustCompile(`[[:alnum:]]`)
|
||||
reNum = regexp.MustCompile(`[\.[:digit:]]+`)
|
||||
reEng1 = regexp.MustCompile(`[[:alnum:]]$`)
|
||||
@@ -16,81 +18,90 @@ var (
|
||||
reSkipInternal = regexp.MustCompile(`(\r\n|\s)`)
|
||||
)
|
||||
|
||||
type WordTag struct {
|
||||
Word, Tag string
|
||||
// Segment represents a word with its POS.
|
||||
type Segment struct {
|
||||
text, pos string
|
||||
}
|
||||
|
||||
// Set dictionary, it could be absolute path of dictionary file, or dictionary
|
||||
// name in current directory.
|
||||
func SetDictionary(dictFileName string) error {
|
||||
err := jiebago.SetDictionary(dictFileName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
dictFilePath, err := jiebago.DictPath(dictFileName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
wtfs, err := jiebago.ParseDictFile(dictFilePath)
|
||||
|
||||
for _, wtf := range wtfs {
|
||||
wordTagMap[wtf.Word] = wtf.Tag
|
||||
}
|
||||
return nil
|
||||
// Text returns the Segment's text.
|
||||
func (s Segment) Text() string {
|
||||
return s.text
|
||||
}
|
||||
|
||||
func cutDetailInternal(sentence string) chan WordTag {
|
||||
result := make(chan WordTag)
|
||||
// Pos returns the Segment's POS.
|
||||
func (s Segment) Pos() string {
|
||||
return s.pos
|
||||
}
|
||||
|
||||
// Segmenter is a Chinese words segmentation struct.
|
||||
type Segmenter struct {
|
||||
dict *Dictionary
|
||||
}
|
||||
|
||||
// LoadDictionary loads dictionary from given file name.
|
||||
// Every time LoadDictionary is called, the previously loaded dictionary will be cleared.
|
||||
func (seg *Segmenter) LoadDictionary(fileName string) error {
|
||||
seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
||||
return seg.dict.loadDictionary(fileName)
|
||||
}
|
||||
|
||||
// LoadUserDictionary loads a user specified dictionary, it must be called
|
||||
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
||||
// instead it will override exist entries.
|
||||
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
|
||||
return seg.dict.loadDictionary(fileName)
|
||||
}
|
||||
|
||||
func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment {
|
||||
result := make(chan Segment)
|
||||
|
||||
go func() {
|
||||
runes := []rune(sentence)
|
||||
_, posList := viterbi(runes)
|
||||
posList := viterbi(runes)
|
||||
begin := 0
|
||||
next := 0
|
||||
for i, char := range runes {
|
||||
pos := posList[i].State
|
||||
switch pos {
|
||||
case 'B':
|
||||
pos := posList[i]
|
||||
switch pos.position() {
|
||||
case "B":
|
||||
begin = i
|
||||
case 'E':
|
||||
result <- WordTag{string(runes[begin : i+1]), posList[i].Tag}
|
||||
case "E":
|
||||
result <- Segment{string(runes[begin : i+1]), pos.pos()}
|
||||
next = i + 1
|
||||
case 'S':
|
||||
result <- WordTag{string(char), posList[i].Tag}
|
||||
case "S":
|
||||
result <- Segment{string(char), pos.pos()}
|
||||
next = i + 1
|
||||
}
|
||||
}
|
||||
if next < len(runes) {
|
||||
result <- WordTag{string(runes[next:]), posList[next].Tag}
|
||||
result <- Segment{string(runes[next:]), posList[next].pos()}
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
return result
|
||||
}
|
||||
|
||||
func cutDetail(sentence string) chan WordTag {
|
||||
result := make(chan WordTag)
|
||||
|
||||
func (seg *Segmenter) cutDetail(sentence string) <-chan Segment {
|
||||
result := make(chan Segment)
|
||||
go func() {
|
||||
blocks := jiebago.RegexpSplit(reHanDetail, sentence)
|
||||
for _, blk := range blocks {
|
||||
for _, blk := range util.RegexpSplit(reHanDetail, sentence, -1) {
|
||||
if reHanDetail.MatchString(blk) {
|
||||
for wordTag := range cutDetailInternal(blk) {
|
||||
result <- wordTag
|
||||
for segment := range seg.cutDetailInternal(blk) {
|
||||
result <- segment
|
||||
}
|
||||
} else {
|
||||
for _, x := range jiebago.RegexpSplit(reSkipDetail, blk) {
|
||||
if len(x) == 0 {
|
||||
continue
|
||||
}
|
||||
switch {
|
||||
case reNum.MatchString(x):
|
||||
result <- WordTag{x, "m"}
|
||||
case reEng.MatchString(x):
|
||||
result <- WordTag{x, "eng"}
|
||||
default:
|
||||
result <- WordTag{x, "x"}
|
||||
}
|
||||
continue
|
||||
}
|
||||
for _, x := range util.RegexpSplit(reSkipDetail, blk, -1) {
|
||||
if len(x) == 0 {
|
||||
continue
|
||||
}
|
||||
switch {
|
||||
case reNum.MatchString(x):
|
||||
result <- Segment{x, "m"}
|
||||
case reEng.MatchString(x):
|
||||
result <- Segment{x, "eng"}
|
||||
default:
|
||||
result <- Segment{x, "x"}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -99,88 +110,142 @@ func cutDetail(sentence string) chan WordTag {
|
||||
return result
|
||||
}
|
||||
|
||||
type cutFunc func(sentence string) chan WordTag
|
||||
|
||||
func cutDAG(sentence string) chan WordTag {
|
||||
result := make(chan WordTag)
|
||||
|
||||
go func() {
|
||||
dag := jiebago.DAG(sentence)
|
||||
routes := jiebago.Calc(sentence, dag)
|
||||
x := 0
|
||||
var y int
|
||||
runes := []rune(sentence)
|
||||
length := len(runes)
|
||||
buf := make([]rune, 0)
|
||||
func (seg *Segmenter) dag(runes []rune) map[int][]int {
|
||||
dag := make(map[int][]int)
|
||||
n := len(runes)
|
||||
var frag []rune
|
||||
var i int
|
||||
for k := 0; k < n; k++ {
|
||||
dag[k] = make([]int, 0)
|
||||
i = k
|
||||
frag = runes[k : k+1]
|
||||
for {
|
||||
if x >= length {
|
||||
freq, ok := seg.dict.Frequency(string(frag))
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
y = routes[x].Index + 1
|
||||
l_word := runes[x:y]
|
||||
if y-x == 1 {
|
||||
buf = append(buf, l_word...)
|
||||
} else {
|
||||
if len(buf) > 0 {
|
||||
if len(buf) == 1 {
|
||||
sbuf := string(buf)
|
||||
if tag, ok := wordTagMap[sbuf]; ok {
|
||||
result <- WordTag{sbuf, tag}
|
||||
} else {
|
||||
result <- WordTag{sbuf, "x"}
|
||||
}
|
||||
buf = make([]rune, 0)
|
||||
} else {
|
||||
bufString := string(buf)
|
||||
if v, ok := jiebago.Trie.Freq[bufString]; !ok || v == 0.0 {
|
||||
for t := range cutDetail(bufString) {
|
||||
result <- t
|
||||
}
|
||||
} else {
|
||||
for _, elem := range buf {
|
||||
selem := string(elem)
|
||||
if tag, ok := wordTagMap[selem]; ok {
|
||||
result <- WordTag{string(elem), tag}
|
||||
} else {
|
||||
result <- WordTag{string(elem), "x"}
|
||||
}
|
||||
if freq > 0.0 {
|
||||
dag[k] = append(dag[k], i)
|
||||
}
|
||||
i++
|
||||
if i >= n {
|
||||
break
|
||||
}
|
||||
frag = runes[k : i+1]
|
||||
}
|
||||
if len(dag[k]) == 0 {
|
||||
dag[k] = append(dag[k], k)
|
||||
}
|
||||
}
|
||||
return dag
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
buf = make([]rune, 0)
|
||||
}
|
||||
}
|
||||
sl_word := string(l_word)
|
||||
if tag, ok := wordTagMap[sl_word]; ok {
|
||||
result <- WordTag{sl_word, tag}
|
||||
} else {
|
||||
result <- WordTag{sl_word, "x"}
|
||||
type route struct {
|
||||
frequency float64
|
||||
index int
|
||||
}
|
||||
|
||||
func (seg *Segmenter) calc(runes []rune) map[int]route {
|
||||
dag := seg.dag(runes)
|
||||
n := len(runes)
|
||||
rs := make(map[int]route)
|
||||
rs[n] = route{frequency: 0.0, index: 0}
|
||||
var r route
|
||||
for idx := n - 1; idx >= 0; idx-- {
|
||||
for _, i := range dag[idx] {
|
||||
if freq, ok := seg.dict.Frequency(string(runes[idx : i+1])); ok {
|
||||
r = route{frequency: math.Log(freq) - seg.dict.logTotal + rs[i+1].frequency, index: i}
|
||||
} else {
|
||||
r = route{frequency: math.Log(1.0) - seg.dict.logTotal + rs[i+1].frequency, index: i}
|
||||
}
|
||||
if v, ok := rs[idx]; !ok {
|
||||
rs[idx] = r
|
||||
} else {
|
||||
if v.frequency < r.frequency || (v.frequency == r.frequency && v.index < r.index) {
|
||||
rs[idx] = r
|
||||
}
|
||||
}
|
||||
x = y
|
||||
}
|
||||
}
|
||||
return rs
|
||||
}
|
||||
|
||||
if len(buf) > 0 {
|
||||
if len(buf) == 1 {
|
||||
sbuf := string(buf)
|
||||
if tag, ok := wordTagMap[sbuf]; ok {
|
||||
result <- WordTag{sbuf, tag}
|
||||
} else {
|
||||
result <- WordTag{sbuf, "x"}
|
||||
}
|
||||
} else {
|
||||
type cutFunc func(sentence string) <-chan Segment
|
||||
|
||||
func (seg *Segmenter) cutDAG(sentence string) <-chan Segment {
|
||||
result := make(chan Segment)
|
||||
|
||||
go func() {
|
||||
runes := []rune(sentence)
|
||||
routes := seg.calc(runes)
|
||||
var y int
|
||||
length := len(runes)
|
||||
var buf []rune
|
||||
for x := 0; x < length; {
|
||||
y = routes[x].index + 1
|
||||
frag := runes[x:y]
|
||||
if y-x == 1 {
|
||||
buf = append(buf, frag...)
|
||||
x = y
|
||||
continue
|
||||
}
|
||||
if len(buf) > 0 {
|
||||
bufString := string(buf)
|
||||
if v, ok := jiebago.Trie.Freq[bufString]; !ok || v == 0.0 {
|
||||
for t := range cutDetail(bufString) {
|
||||
if len(buf) == 1 {
|
||||
if tag, ok := seg.dict.Pos(bufString); ok {
|
||||
result <- Segment{bufString, tag}
|
||||
} else {
|
||||
result <- Segment{bufString, "x"}
|
||||
}
|
||||
buf = make([]rune, 0)
|
||||
continue
|
||||
}
|
||||
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
|
||||
for t := range seg.cutDetail(bufString) {
|
||||
result <- t
|
||||
}
|
||||
} else {
|
||||
for _, elem := range buf {
|
||||
selem := string(elem)
|
||||
if tag, ok := wordTagMap[selem]; ok {
|
||||
result <- WordTag{selem, tag}
|
||||
if tag, ok := seg.dict.Pos(selem); ok {
|
||||
result <- Segment{selem, tag}
|
||||
} else {
|
||||
result <- WordTag{selem, "x"}
|
||||
result <- Segment{selem, "x"}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
buf = make([]rune, 0)
|
||||
}
|
||||
word := string(frag)
|
||||
if tag, ok := seg.dict.Pos(word); ok {
|
||||
result <- Segment{word, tag}
|
||||
} else {
|
||||
result <- Segment{word, "x"}
|
||||
}
|
||||
x = y
|
||||
}
|
||||
|
||||
if len(buf) > 0 {
|
||||
bufString := string(buf)
|
||||
if len(buf) == 1 {
|
||||
if tag, ok := seg.dict.Pos(bufString); ok {
|
||||
result <- Segment{bufString, tag}
|
||||
} else {
|
||||
result <- Segment{bufString, "x"}
|
||||
}
|
||||
} else {
|
||||
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
|
||||
for t := range seg.cutDetail(bufString) {
|
||||
result <- t
|
||||
}
|
||||
} else {
|
||||
for _, elem := range buf {
|
||||
selem := string(elem)
|
||||
if tag, ok := seg.dict.Pos(selem); ok {
|
||||
result <- Segment{selem, tag}
|
||||
} else {
|
||||
result <- Segment{selem, "x"}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -191,42 +256,38 @@ func cutDAG(sentence string) chan WordTag {
|
||||
return result
|
||||
}
|
||||
|
||||
func cutDAGNoHMM(sentence string) chan WordTag {
|
||||
result := make(chan WordTag)
|
||||
func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment {
|
||||
result := make(chan Segment)
|
||||
|
||||
go func() {
|
||||
dag := jiebago.DAG(sentence)
|
||||
routes := jiebago.Calc(sentence, dag)
|
||||
x := 0
|
||||
var y int
|
||||
runes := []rune(sentence)
|
||||
routes := seg.calc(runes)
|
||||
var y int
|
||||
length := len(runes)
|
||||
buf := make([]rune, 0)
|
||||
for {
|
||||
if x >= length {
|
||||
break
|
||||
}
|
||||
y = routes[x].Index + 1
|
||||
l_word := runes[x:y]
|
||||
if reEng1.MatchString(string(l_word)) && len(l_word) == 1 {
|
||||
buf = append(buf, l_word...)
|
||||
var buf []rune
|
||||
for x := 0; x < length; {
|
||||
y = routes[x].index + 1
|
||||
frag := runes[x:y]
|
||||
if reEng1.MatchString(string(frag)) && len(frag) == 1 {
|
||||
buf = append(buf, frag...)
|
||||
x = y
|
||||
continue
|
||||
}
|
||||
if len(buf) > 0 {
|
||||
result <- Segment{string(buf), "eng"}
|
||||
buf = make([]rune, 0)
|
||||
}
|
||||
word := string(frag)
|
||||
if tag, ok := seg.dict.Pos(word); ok {
|
||||
result <- Segment{word, tag}
|
||||
} else {
|
||||
if len(buf) > 0 {
|
||||
result <- WordTag{string(buf), "eng"}
|
||||
buf = make([]rune, 0)
|
||||
}
|
||||
sl_word := string(l_word)
|
||||
if tag, ok := wordTagMap[sl_word]; ok {
|
||||
result <- WordTag{sl_word, tag}
|
||||
} else {
|
||||
result <- WordTag{sl_word, "x"}
|
||||
}
|
||||
x = y
|
||||
result <- Segment{word, "x"}
|
||||
}
|
||||
x = y
|
||||
|
||||
}
|
||||
if len(buf) > 0 {
|
||||
result <- WordTag{string(buf), "eng"}
|
||||
result <- Segment{string(buf), "eng"}
|
||||
buf = make([]rune, 0)
|
||||
}
|
||||
close(result)
|
||||
@@ -234,44 +295,38 @@ func cutDAGNoHMM(sentence string) chan WordTag {
|
||||
return result
|
||||
}
|
||||
|
||||
// Tags the POS of each word after segmentation, using labels compatible with
|
||||
// ictclas.
|
||||
func Cut(sentence string, HMM bool) chan WordTag {
|
||||
for key := range jiebago.UserWordTagTab {
|
||||
wordTagMap[key] = jiebago.UserWordTagTab[key]
|
||||
delete(jiebago.UserWordTagTab, key)
|
||||
}
|
||||
result := make(chan WordTag)
|
||||
blocks := jiebago.RegexpSplit(reHanInternal, sentence)
|
||||
// Cut cuts a sentence into words.
|
||||
// Parameter hmm controls whether to use the Hidden Markov Model.
|
||||
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan Segment {
|
||||
result := make(chan Segment)
|
||||
var cut cutFunc
|
||||
if HMM {
|
||||
cut = cutDAG
|
||||
if hmm {
|
||||
cut = seg.cutDAG
|
||||
} else {
|
||||
cut = cutDAGNoHMM
|
||||
cut = seg.cutDAGNoHMM
|
||||
}
|
||||
go func() {
|
||||
for _, blk := range blocks {
|
||||
for _, blk := range util.RegexpSplit(reHanInternal, sentence, -1) {
|
||||
if reHanInternal.MatchString(blk) {
|
||||
for wordTag := range cut(blk) {
|
||||
result <- wordTag
|
||||
}
|
||||
} else {
|
||||
for _, x := range jiebago.RegexpSplit(reSkipInternal, blk) {
|
||||
if reSkipInternal.MatchString(x) {
|
||||
result <- WordTag{x, "x"}
|
||||
} else {
|
||||
for _, xx := range x {
|
||||
s := string(xx)
|
||||
switch {
|
||||
case reNum.MatchString(s):
|
||||
result <- WordTag{s, "m"}
|
||||
case reEng.MatchString(x):
|
||||
result <- WordTag{x, "eng"}
|
||||
break
|
||||
default:
|
||||
result <- WordTag{s, "x"}
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
for _, x := range util.RegexpSplit(reSkipInternal, blk, -1) {
|
||||
if reSkipInternal.MatchString(x) {
|
||||
result <- Segment{x, "x"}
|
||||
continue
|
||||
}
|
||||
for _, xx := range x {
|
||||
s := string(xx)
|
||||
switch {
|
||||
case reNum.MatchString(s):
|
||||
result <- Segment{s, "m"}
|
||||
case reEng.MatchString(x):
|
||||
result <- Segment{x, "eng"}
|
||||
default:
|
||||
result <- Segment{s, "x"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
package posseg
|
||||
|
||||
import (
|
||||
"github.com/wangbin/jiebago"
|
||||
"testing"
|
||||
)
|
||||
|
||||
var (
|
||||
test_contents = []string{
|
||||
seg Segmenter
|
||||
testContents = []string{
|
||||
"这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
|
||||
"我不喜欢日本和服。",
|
||||
"雷猴回归人间。",
|
||||
@@ -93,183 +93,187 @@ var (
|
||||
"你认识那个和主席握手的的哥吗?他开一辆黑色的士。",
|
||||
"枪杆子中出政权"}
|
||||
|
||||
defaultCutResult = [][]WordTag{[]WordTag{WordTag{"这", "r"}, WordTag{"是", "v"}, WordTag{"一个", "m"}, WordTag{"伸手不见五指", "i"}, WordTag{"的", "uj"}, WordTag{"黑夜", "n"}, WordTag{"。", "x"}, WordTag{"我", "r"}, WordTag{"叫", "v"}, WordTag{"孙悟空", "nr"}, WordTag{",", "x"}, WordTag{"我", "r"}, WordTag{"爱", "v"}, WordTag{"北京", "ns"}, WordTag{",", "x"}, WordTag{"我", "r"}, WordTag{"爱", "v"}, WordTag{"Python", "eng"}, WordTag{"和", "c"}, WordTag{"C++", "nz"}, WordTag{"。", "x"}},
|
||||
[]WordTag{WordTag{"我", "r"}, WordTag{"不", "d"}, WordTag{"喜欢", "v"}, WordTag{"日本", "ns"}, WordTag{"和服", "nz"}, WordTag{"。", "x"}},
|
||||
[]WordTag{WordTag{"雷猴", "n"}, WordTag{"回归", "v"}, WordTag{"人间", "n"}, WordTag{"。", "x"}},
|
||||
[]WordTag{WordTag{"工信处", "n"}, WordTag{"女干事", "n"}, WordTag{"每月", "r"}, WordTag{"经过", "p"}, WordTag{"下属", "v"}, WordTag{"科室", "n"}, WordTag{"都", "d"}, WordTag{"要", "v"}, WordTag{"亲口", "n"}, WordTag{"交代", "n"}, WordTag{"24", "m"}, WordTag{"口", "n"}, WordTag{"交换机", "n"}, WordTag{"等", "u"}, WordTag{"技术性", "n"}, WordTag{"器件", "n"}, WordTag{"的", "uj"}, WordTag{"安装", "v"}, WordTag{"工作", "vn"}},
|
||||
[]WordTag{WordTag{"我", "r"}, WordTag{"需要", "v"}, WordTag{"廉租房", "n"}},
|
||||
[]WordTag{WordTag{"永和", "nz"}, WordTag{"服装", "vn"}, WordTag{"饰品", "n"}, WordTag{"有限公司", "n"}},
|
||||
[]WordTag{WordTag{"我", "r"}, WordTag{"爱", "v"}, WordTag{"北京", "ns"}, WordTag{"天安门", "ns"}},
|
||||
[]WordTag{WordTag{"abc", "eng"}},
|
||||
[]WordTag{WordTag{"隐", "n"}, WordTag{"马尔可夫", "nr"}},
|
||||
[]WordTag{WordTag{"雷猴", "n"}, WordTag{"是", "v"}, WordTag{"个", "q"}, WordTag{"好", "a"}, WordTag{"网站", "n"}},
|
||||
[]WordTag{WordTag{"“", "x"}, WordTag{"Microsoft", "eng"}, WordTag{"”", "x"}, WordTag{"一", "m"}, WordTag{"词", "n"}, WordTag{"由", "p"}, WordTag{"“", "x"}, WordTag{"MICROcomputer", "eng"}, WordTag{"(", "x"}, WordTag{"微型", "b"}, WordTag{"计算机", "n"}, WordTag{")", "x"}, WordTag{"”", "x"}, WordTag{"和", "c"}, WordTag{"“", "x"}, WordTag{"SOFTware", "eng"}, WordTag{"(", "x"}, WordTag{"软件", "n"}, WordTag{")", "x"}, WordTag{"”", "x"}, WordTag{"两", "m"}, WordTag{"部分", "n"}, WordTag{"组成", "v"}},
|
||||
[]WordTag{WordTag{"草泥马", "n"}, WordTag{"和", "c"}, WordTag{"欺实", "v"}, WordTag{"马", "n"}, WordTag{"是", "v"}, WordTag{"今年", "t"}, WordTag{"的", "uj"}, WordTag{"流行", "v"}, WordTag{"词汇", "n"}},
|
||||
[]WordTag{WordTag{"伊藤", "nr"}, WordTag{"洋华堂", "n"}, WordTag{"总府", "n"}, WordTag{"店", "n"}},
|
||||
[]WordTag{WordTag{"中国科学院计算技术研究所", "nt"}},
|
||||
[]WordTag{WordTag{"罗密欧", "nr"}, WordTag{"与", "p"}, WordTag{"朱丽叶", "nr"}},
|
||||
[]WordTag{WordTag{"我", "r"}, WordTag{"购买", "v"}, WordTag{"了", "ul"}, WordTag{"道具", "n"}, WordTag{"和", "c"}, WordTag{"服装", "vn"}},
|
||||
[]WordTag{WordTag{"PS", "eng"}, WordTag{":", "x"}, WordTag{" ", "x"}, WordTag{"我", "r"}, WordTag{"觉得", "v"}, WordTag{"开源", "n"}, WordTag{"有", "v"}, WordTag{"一个", "m"}, WordTag{"好处", "d"}, WordTag{",", "x"}, WordTag{"就是", "d"}, WordTag{"能够", "v"}, WordTag{"敦促", "v"}, WordTag{"自己", "r"}, WordTag{"不断改进", "l"}, WordTag{",", "x"}, WordTag{"避免", "v"}, WordTag{"敞", "v"}, WordTag{"帚", "ng"}, WordTag{"自珍", "b"}},
|
||||
[]WordTag{WordTag{"湖北省", "ns"}, WordTag{"石首市", "ns"}},
|
||||
[]WordTag{WordTag{"湖北省", "ns"}, WordTag{"十堰市", "ns"}},
|
||||
[]WordTag{WordTag{"总经理", "n"}, WordTag{"完成", "v"}, WordTag{"了", "ul"}, WordTag{"这件", "mq"}, WordTag{"事情", "n"}},
|
||||
[]WordTag{WordTag{"电脑", "n"}, WordTag{"修好", "v"}, WordTag{"了", "ul"}},
|
||||
[]WordTag{WordTag{"做好", "v"}, WordTag{"了", "ul"}, WordTag{"这件", "mq"}, WordTag{"事情", "n"}, WordTag{"就", "d"}, WordTag{"一了百了", "l"}, WordTag{"了", "ul"}},
|
||||
[]WordTag{WordTag{"人们", "n"}, WordTag{"审美", "vn"}, WordTag{"的", "uj"}, WordTag{"观点", "n"}, WordTag{"是", "v"}, WordTag{"不同", "a"}, WordTag{"的", "uj"}},
|
||||
[]WordTag{WordTag{"我们", "r"}, WordTag{"买", "v"}, WordTag{"了", "ul"}, WordTag{"一个", "m"}, WordTag{"美的", "nr"}, WordTag{"空调", "n"}},
|
||||
[]WordTag{WordTag{"线程", "n"}, WordTag{"初始化", "l"}, WordTag{"时", "n"}, WordTag{"我们", "r"}, WordTag{"要", "v"}, WordTag{"注意", "v"}},
|
||||
[]WordTag{WordTag{"一个", "m"}, WordTag{"分子", "n"}, WordTag{"是", "v"}, WordTag{"由", "p"}, WordTag{"好多", "m"}, WordTag{"原子", "n"}, WordTag{"组织", "v"}, WordTag{"成", "v"}, WordTag{"的", "uj"}},
|
||||
[]WordTag{WordTag{"祝", "v"}, WordTag{"你", "r"}, WordTag{"马到功成", "i"}},
|
||||
[]WordTag{WordTag{"他", "r"}, WordTag{"掉", "v"}, WordTag{"进", "v"}, WordTag{"了", "ul"}, WordTag{"无底洞", "ns"}, WordTag{"里", "f"}},
|
||||
[]WordTag{WordTag{"中国", "ns"}, WordTag{"的", "uj"}, WordTag{"首都", "d"}, WordTag{"是", "v"}, WordTag{"北京", "ns"}},
|
||||
[]WordTag{WordTag{"孙君意", "nr"}},
|
||||
[]WordTag{WordTag{"外交部", "nt"}, WordTag{"发言人", "l"}, WordTag{"马朝旭", "nr"}},
|
||||
[]WordTag{WordTag{"领导人", "n"}, WordTag{"会议", "n"}, WordTag{"和", "c"}, WordTag{"第四届", "m"}, WordTag{"东亚", "ns"}, WordTag{"峰会", "n"}},
|
||||
[]WordTag{WordTag{"在", "p"}, WordTag{"过去", "t"}, WordTag{"的", "uj"}, WordTag{"这", "r"}, WordTag{"五年", "t"}},
|
||||
[]WordTag{WordTag{"还", "d"}, WordTag{"需要", "v"}, WordTag{"很", "d"}, WordTag{"长", "a"}, WordTag{"的", "uj"}, WordTag{"路", "n"}, WordTag{"要", "v"}, WordTag{"走", "v"}},
|
||||
[]WordTag{WordTag{"60", "m"}, WordTag{"周年", "t"}, WordTag{"首都", "d"}, WordTag{"阅兵", "v"}},
|
||||
[]WordTag{WordTag{"你好", "l"}, WordTag{"人们", "n"}, WordTag{"审美", "vn"}, WordTag{"的", "uj"}, WordTag{"观点", "n"}, WordTag{"是", "v"}, WordTag{"不同", "a"}, WordTag{"的", "uj"}},
|
||||
[]WordTag{WordTag{"买", "v"}, WordTag{"水果", "n"}, WordTag{"然后", "c"}, WordTag{"来", "v"}, WordTag{"世博园", "nr"}},
|
||||
[]WordTag{WordTag{"买", "v"}, WordTag{"水果", "n"}, WordTag{"然后", "c"}, WordTag{"去", "v"}, WordTag{"世博园", "nr"}},
|
||||
[]WordTag{WordTag{"但是", "c"}, WordTag{"后来", "t"}, WordTag{"我", "r"}, WordTag{"才", "d"}, WordTag{"知道", "v"}, WordTag{"你", "r"}, WordTag{"是", "v"}, WordTag{"对", "p"}, WordTag{"的", "uj"}},
|
||||
[]WordTag{WordTag{"存在", "v"}, WordTag{"即", "v"}, WordTag{"合理", "vn"}},
|
||||
[]WordTag{WordTag{"的的", "u"}, WordTag{"的的", "u"}, WordTag{"的", "uj"}, WordTag{"在的", "u"}, WordTag{"的的", "u"}, WordTag{"的", "uj"}, WordTag{"就", "d"}, WordTag{"以", "p"}, WordTag{"和和", "nz"}, WordTag{"和", "c"}},
|
||||
[]WordTag{WordTag{"I", "x"}, WordTag{" ", "x"}, WordTag{"love", "eng"}, WordTag{"你", "r"}, WordTag{",", "x"}, WordTag{"不以为耻", "i"}, WordTag{",", "x"}, WordTag{"反", "zg"}, WordTag{"以为", "c"}, WordTag{"rong", "eng"}},
|
||||
[]WordTag{WordTag{"因", "p"}},
|
||||
[]WordTag{},
|
||||
[]WordTag{WordTag{"hello", "eng"}, WordTag{"你好", "l"}, WordTag{"人们", "n"}, WordTag{"审美", "vn"}, WordTag{"的", "uj"}, WordTag{"观点", "n"}, WordTag{"是", "v"}, WordTag{"不同", "a"}, WordTag{"的", "uj"}},
|
||||
[]WordTag{WordTag{"很好", "a"}, WordTag{"但", "c"}, WordTag{"主要", "b"}, WordTag{"是", "v"}, WordTag{"基于", "p"}, WordTag{"网页", "n"}, WordTag{"形式", "n"}},
|
||||
[]WordTag{WordTag{"hello", "eng"}, WordTag{"你好", "l"}, WordTag{"人们", "n"}, WordTag{"审美", "vn"}, WordTag{"的", "uj"}, WordTag{"观点", "n"}, WordTag{"是", "v"}, WordTag{"不同", "a"}, WordTag{"的", "uj"}},
|
||||
[]WordTag{WordTag{"为什么", "r"}, WordTag{"我", "r"}, WordTag{"不能", "v"}, WordTag{"拥有", "v"}, WordTag{"想要", "v"}, WordTag{"的", "uj"}, WordTag{"生活", "vn"}},
|
||||
[]WordTag{WordTag{"后来", "t"}, WordTag{"我", "r"}, WordTag{"才", "d"}},
|
||||
[]WordTag{WordTag{"此次", "r"}, WordTag{"来", "v"}, WordTag{"中国", "ns"}, WordTag{"是", "v"}, WordTag{"为了", "p"}},
|
||||
[]WordTag{WordTag{"使用", "v"}, WordTag{"了", "ul"}, WordTag{"它", "r"}, WordTag{"就", "d"}, WordTag{"可以", "c"}, WordTag{"解决", "v"}, WordTag{"一些", "m"}, WordTag{"问题", "n"}},
|
||||
[]WordTag{WordTag{",", "x"}, WordTag{"使用", "v"}, WordTag{"了", "ul"}, WordTag{"它", "r"}, WordTag{"就", "d"}, WordTag{"可以", "c"}, WordTag{"解决", "v"}, WordTag{"一些", "m"}, WordTag{"问题", "n"}},
|
||||
[]WordTag{WordTag{"其实", "d"}, WordTag{"使用", "v"}, WordTag{"了", "ul"}, WordTag{"它", "r"}, WordTag{"就", "d"}, WordTag{"可以", "c"}, WordTag{"解决", "v"}, WordTag{"一些", "m"}, WordTag{"问题", "n"}},
|
||||
[]WordTag{WordTag{"好人", "n"}, WordTag{"使用", "v"}, WordTag{"了", "ul"}, WordTag{"它", "r"}, WordTag{"就", "d"}, WordTag{"可以", "c"}, WordTag{"解决", "v"}, WordTag{"一些", "m"}, WordTag{"问题", "n"}},
|
||||
[]WordTag{WordTag{"是因为", "c"}, WordTag{"和", "c"}, WordTag{"国家", "n"}},
|
||||
[]WordTag{WordTag{"老年", "t"}, WordTag{"搜索", "v"}, WordTag{"还", "d"}, WordTag{"支持", "v"}},
|
||||
[]WordTag{WordTag{"干脆", "d"}, WordTag{"就", "d"}, WordTag{"把", "p"}, WordTag{"那部", "r"}, WordTag{"蒙人", "n"}, WordTag{"的", "uj"}, WordTag{"闲法", "n"}, WordTag{"给", "p"}, WordTag{"废", "v"}, WordTag{"了", "ul"}, WordTag{"拉倒", "v"}, WordTag{"!", "x"}, WordTag{"RT", "eng"}, WordTag{" ", "x"}, WordTag{"@", "x"}, WordTag{"laoshipukong", "eng"}, WordTag{" ", "x"}, WordTag{":", "x"}, WordTag{" ", "x"}, WordTag{"27", "m"}, WordTag{"日", "m"}, WordTag{",", "x"}, WordTag{"全国人大常委会", "nt"}, WordTag{"第三次", "m"}, WordTag{"审议", "v"}, WordTag{"侵权", "v"}, WordTag{"责任法", "n"}, WordTag{"草案", "n"}, WordTag{",", "x"}, WordTag{"删除", "v"}, WordTag{"了", "ul"}, WordTag{"有关", "vn"}, WordTag{"医疗", "n"}, WordTag{"损害", "v"}, WordTag{"责任", "n"}, WordTag{"“", "x"}, WordTag{"举证", "v"}, WordTag{"倒置", "v"}, WordTag{"”", "x"}, WordTag{"的", "uj"}, WordTag{"规定", "n"}, WordTag{"。", "x"}, WordTag{"在", "p"}, WordTag{"医患", "n"}, WordTag{"纠纷", "n"}, WordTag{"中本", "ns"}, WordTag{"已", "d"}, WordTag{"处于", "v"}, WordTag{"弱势", "n"}, WordTag{"地位", "n"}, WordTag{"的", "uj"}, WordTag{"消费者", "n"}, WordTag{"由此", "c"}, WordTag{"将", "d"}, WordTag{"陷入", "v"}, WordTag{"万劫不复", "i"}, WordTag{"的", "uj"}, WordTag{"境地", "s"}, WordTag{"。", "x"}, WordTag{" ", "x"}},
|
||||
[]WordTag{WordTag{"大", "a"}},
|
||||
[]WordTag{},
|
||||
[]WordTag{WordTag{"他", "r"}, WordTag{"说", "v"}, WordTag{"的", "uj"}, WordTag{"确实", "ad"}, WordTag{"在", "p"}, WordTag{"理", "n"}},
|
||||
[]WordTag{WordTag{"长春", "ns"}, WordTag{"市长", "n"}, WordTag{"春节", "t"}, WordTag{"讲话", "n"}},
|
||||
[]WordTag{WordTag{"结婚", "v"}, WordTag{"的", "uj"}, WordTag{"和", "c"}, WordTag{"尚未", "d"}, WordTag{"结婚", "v"}, WordTag{"的", "uj"}},
|
||||
[]WordTag{WordTag{"结合", "v"}, WordTag{"成", "n"}, WordTag{"分子", "n"}, WordTag{"时", "n"}},
|
||||
[]WordTag{WordTag{"旅游", "vn"}, WordTag{"和", "c"}, WordTag{"服务", "vn"}, WordTag{"是", "v"}, WordTag{"最好", "a"}, WordTag{"的", "uj"}},
|
||||
[]WordTag{WordTag{"这件", "mq"}, WordTag{"事情", "n"}, WordTag{"的确", "d"}, WordTag{"是", "v"}, WordTag{"我", "r"}, WordTag{"的", "uj"}, WordTag{"错", "n"}},
|
||||
[]WordTag{WordTag{"供", "v"}, WordTag{"大家", "n"}, WordTag{"参考", "v"}, WordTag{"指正", "v"}},
|
||||
[]WordTag{WordTag{"哈尔滨", "ns"}, WordTag{"政府", "n"}, WordTag{"公布", "v"}, WordTag{"塌", "v"}, WordTag{"桥", "n"}, WordTag{"原因", "n"}},
|
||||
[]WordTag{WordTag{"我", "r"}, WordTag{"在", "p"}, WordTag{"机场", "n"}, WordTag{"入口处", "i"}},
|
||||
[]WordTag{WordTag{"邢永臣", "nr"}, WordTag{"摄影", "n"}, WordTag{"报道", "v"}},
|
||||
[]WordTag{WordTag{"BP", "eng"}, WordTag{"神经网络", "n"}, WordTag{"如何", "r"}, WordTag{"训练", "vn"}, WordTag{"才能", "v"}, WordTag{"在", "p"}, WordTag{"分类", "n"}, WordTag{"时", "n"}, WordTag{"增加", "v"}, WordTag{"区分度", "n"}, WordTag{"?", "x"}},
|
||||
[]WordTag{WordTag{"南京市", "ns"}, WordTag{"长江大桥", "ns"}},
|
||||
[]WordTag{WordTag{"应", "v"}, WordTag{"一些", "m"}, WordTag{"使用者", "n"}, WordTag{"的", "uj"}, WordTag{"建议", "n"}, WordTag{",", "x"}, WordTag{"也", "d"}, WordTag{"为了", "p"}, WordTag{"便于", "v"}, WordTag{"利用", "n"}, WordTag{"NiuTrans", "eng"}, WordTag{"用于", "v"}, WordTag{"SMT", "eng"}, WordTag{"研究", "vn"}},
|
||||
[]WordTag{WordTag{"长春市", "ns"}, WordTag{"长春", "ns"}, WordTag{"药店", "n"}},
|
||||
[]WordTag{WordTag{"邓颖超", "nr"}, WordTag{"生前", "t"}, WordTag{"最", "d"}, WordTag{"喜欢", "v"}, WordTag{"的", "uj"}, WordTag{"衣服", "n"}},
|
||||
[]WordTag{WordTag{"胡锦涛", "nr"}, WordTag{"是", "v"}, WordTag{"热爱", "a"}, WordTag{"世界", "n"}, WordTag{"和平", "nz"}, WordTag{"的", "uj"}, WordTag{"政治局", "n"}, WordTag{"常委", "j"}},
|
||||
[]WordTag{WordTag{"程序员", "n"}, WordTag{"祝", "v"}, WordTag{"海林", "nz"}, WordTag{"和", "c"}, WordTag{"朱会震", "nr"}, WordTag{"是", "v"}, WordTag{"在", "p"}, WordTag{"孙健", "nr"}, WordTag{"的", "uj"}, WordTag{"左面", "f"}, WordTag{"和", "c"}, WordTag{"右面", "f"}, WordTag{",", "x"}, WordTag{" ", "x"}, WordTag{"范凯", "nr"}, WordTag{"在", "p"}, WordTag{"最", "a"}, WordTag{"右面", "f"}, WordTag{".", "m"}, WordTag{"再往", "d"}, WordTag{"左", "f"}, WordTag{"是", "v"}, WordTag{"李松洪", "nr"}},
|
||||
[]WordTag{WordTag{"一次性", "d"}, WordTag{"交", "v"}, WordTag{"多少", "m"}, WordTag{"钱", "n"}},
|
||||
[]WordTag{WordTag{"两块", "m"}, WordTag{"五", "m"}, WordTag{"一套", "m"}, WordTag{",", "x"}, WordTag{"三块", "m"}, WordTag{"八", "m"}, WordTag{"一斤", "m"}, WordTag{",", "x"}, WordTag{"四块", "m"}, WordTag{"七", "m"}, WordTag{"一本", "m"}, WordTag{",", "x"}, WordTag{"五块", "m"}, WordTag{"六", "m"}, WordTag{"一条", "m"}},
|
||||
[]WordTag{WordTag{"小", "a"}, WordTag{"和尚", "nr"}, WordTag{"留", "v"}, WordTag{"了", "ul"}, WordTag{"一个", "m"}, WordTag{"像", "v"}, WordTag{"大", "a"}, WordTag{"和尚", "nr"}, WordTag{"一样", "r"}, WordTag{"的", "uj"}, WordTag{"和尚头", "nr"}},
|
||||
[]WordTag{WordTag{"我", "r"}, WordTag{"是", "v"}, WordTag{"中华人民共和国", "ns"}, WordTag{"公民", "n"}, WordTag{";", "x"}, WordTag{"我", "r"}, WordTag{"爸爸", "n"}, WordTag{"是", "v"}, WordTag{"共和党", "nt"}, WordTag{"党员", "n"}, WordTag{";", "x"}, WordTag{" ", "x"}, WordTag{"地铁", "n"}, WordTag{"和平门", "ns"}, WordTag{"站", "v"}},
|
||||
[]WordTag{WordTag{"张晓梅", "nr"}, WordTag{"去", "v"}, WordTag{"人民", "n"}, WordTag{"医院", "n"}, WordTag{"做", "v"}, WordTag{"了", "ul"}, WordTag{"个", "q"}, WordTag{"B超", "n"}, WordTag{"然后", "c"}, WordTag{"去", "v"}, WordTag{"买", "v"}, WordTag{"了", "ul"}, WordTag{"件", "q"}, WordTag{"T恤", "n"}},
|
||||
[]WordTag{WordTag{"AT&T", "nz"}, WordTag{"是", "v"}, WordTag{"一件", "m"}, WordTag{"不错", "a"}, WordTag{"的", "uj"}, WordTag{"公司", "n"}, WordTag{",", "x"}, WordTag{"给", "p"}, WordTag{"你", "r"}, WordTag{"发", "v"}, WordTag{"offer", "eng"}, WordTag{"了", "ul"}, WordTag{"吗", "y"}, WordTag{"?", "x"}},
|
||||
[]WordTag{WordTag{"C++", "nz"}, WordTag{"和", "c"}, WordTag{"c#", "nz"}, WordTag{"是", "v"}, WordTag{"什么", "r"}, WordTag{"关系", "n"}, WordTag{"?", "x"}, WordTag{"11", "m"}, WordTag{"+", "x"}, WordTag{"122", "m"}, WordTag{"=", "x"}, WordTag{"133", "m"}, WordTag{",", "x"}, WordTag{"是", "v"}, WordTag{"吗", "y"}, WordTag{"?", "x"}, WordTag{"PI", "eng"}, WordTag{"=", "x"}, WordTag{"3.14159", "m"}},
|
||||
[]WordTag{WordTag{"你", "r"}, WordTag{"认识", "v"}, WordTag{"那个", "r"}, WordTag{"和", "c"}, WordTag{"主席", "n"}, WordTag{"握手", "v"}, WordTag{"的", "uj"}, WordTag{"的哥", "n"}, WordTag{"吗", "y"}, WordTag{"?", "x"}, WordTag{"他", "r"}, WordTag{"开", "v"}, WordTag{"一辆", "m"}, WordTag{"黑色", "n"}, WordTag{"的士", "n"}, WordTag{"。", "x"}},
|
||||
[]WordTag{WordTag{"枪杆子", "n"}, WordTag{"中", "f"}, WordTag{"出", "v"}, WordTag{"政权", "n"}},
|
||||
defaultCutResult = [][]Segment{[]Segment{Segment{"这", "r"}, Segment{"是", "v"}, Segment{"一个", "m"}, Segment{"伸手不见五指", "i"}, Segment{"的", "uj"}, Segment{"黑夜", "n"}, Segment{"。", "x"}, Segment{"我", "r"}, Segment{"叫", "v"}, Segment{"孙悟空", "nr"}, Segment{",", "x"}, Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"北京", "ns"}, Segment{",", "x"}, Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"Python", "eng"}, Segment{"和", "c"}, Segment{"C++", "nz"}, Segment{"。", "x"}},
|
||||
[]Segment{Segment{"我", "r"}, Segment{"不", "d"}, Segment{"喜欢", "v"}, Segment{"日本", "ns"}, Segment{"和服", "nz"}, Segment{"。", "x"}},
|
||||
[]Segment{Segment{"雷猴", "n"}, Segment{"回归", "v"}, Segment{"人间", "n"}, Segment{"。", "x"}},
|
||||
[]Segment{Segment{"工信处", "n"}, Segment{"女干事", "n"}, Segment{"每月", "r"}, Segment{"经过", "p"}, Segment{"下属", "v"}, Segment{"科室", "n"}, Segment{"都", "d"}, Segment{"要", "v"}, Segment{"亲口", "n"}, Segment{"交代", "n"}, Segment{"24", "m"}, Segment{"口", "n"}, Segment{"交换机", "n"}, Segment{"等", "u"}, Segment{"技术性", "n"}, Segment{"器件", "n"}, Segment{"的", "uj"}, Segment{"安装", "v"}, Segment{"工作", "vn"}},
|
||||
[]Segment{Segment{"我", "r"}, Segment{"需要", "v"}, Segment{"廉租房", "n"}},
|
||||
[]Segment{Segment{"永和", "nz"}, Segment{"服装", "vn"}, Segment{"饰品", "n"}, Segment{"有限公司", "n"}},
|
||||
[]Segment{Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"北京", "ns"}, Segment{"天安门", "ns"}},
|
||||
[]Segment{Segment{"abc", "eng"}},
|
||||
[]Segment{Segment{"隐", "n"}, Segment{"马尔可夫", "nr"}},
|
||||
[]Segment{Segment{"雷猴", "n"}, Segment{"是", "v"}, Segment{"个", "q"}, Segment{"好", "a"}, Segment{"网站", "n"}},
|
||||
[]Segment{Segment{"“", "x"}, Segment{"Microsoft", "eng"}, Segment{"”", "x"}, Segment{"一", "m"}, Segment{"词", "n"}, Segment{"由", "p"}, Segment{"“", "x"}, Segment{"MICROcomputer", "eng"}, Segment{"(", "x"}, Segment{"微型", "b"}, Segment{"计算机", "n"}, Segment{")", "x"}, Segment{"”", "x"}, Segment{"和", "c"}, Segment{"“", "x"}, Segment{"SOFTware", "eng"}, Segment{"(", "x"}, Segment{"软件", "n"}, Segment{")", "x"}, Segment{"”", "x"}, Segment{"两", "m"}, Segment{"部分", "n"}, Segment{"组成", "v"}},
|
||||
[]Segment{Segment{"草泥马", "n"}, Segment{"和", "c"}, Segment{"欺实", "v"}, Segment{"马", "n"}, Segment{"是", "v"}, Segment{"今年", "t"}, Segment{"的", "uj"}, Segment{"流行", "v"}, Segment{"词汇", "n"}},
|
||||
[]Segment{Segment{"伊藤", "nr"}, Segment{"洋华堂", "n"}, Segment{"总府", "n"}, Segment{"店", "n"}},
|
||||
[]Segment{Segment{"中国科学院计算技术研究所", "nt"}},
|
||||
[]Segment{Segment{"罗密欧", "nr"}, Segment{"与", "p"}, Segment{"朱丽叶", "nr"}},
|
||||
[]Segment{Segment{"我", "r"}, Segment{"购买", "v"}, Segment{"了", "ul"}, Segment{"道具", "n"}, Segment{"和", "c"}, Segment{"服装", "vn"}},
|
||||
[]Segment{Segment{"PS", "eng"}, Segment{":", "x"}, Segment{" ", "x"}, Segment{"我", "r"}, Segment{"觉得", "v"}, Segment{"开源", "n"}, Segment{"有", "v"}, Segment{"一个", "m"}, Segment{"好处", "d"}, Segment{",", "x"}, Segment{"就是", "d"}, Segment{"能够", "v"}, Segment{"敦促", "v"}, Segment{"自己", "r"}, Segment{"不断改进", "l"}, Segment{",", "x"}, Segment{"避免", "v"}, Segment{"敞", "v"}, Segment{"帚", "ng"}, Segment{"自珍", "b"}},
|
||||
[]Segment{Segment{"湖北省", "ns"}, Segment{"石首市", "ns"}},
|
||||
[]Segment{Segment{"湖北省", "ns"}, Segment{"十堰市", "ns"}},
|
||||
[]Segment{Segment{"总经理", "n"}, Segment{"完成", "v"}, Segment{"了", "ul"}, Segment{"这件", "mq"}, Segment{"事情", "n"}},
|
||||
[]Segment{Segment{"电脑", "n"}, Segment{"修好", "v"}, Segment{"了", "ul"}},
|
||||
[]Segment{Segment{"做好", "v"}, Segment{"了", "ul"}, Segment{"这件", "mq"}, Segment{"事情", "n"}, Segment{"就", "d"}, Segment{"一了百了", "l"}, Segment{"了", "ul"}},
|
||||
[]Segment{Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
|
||||
[]Segment{Segment{"我们", "r"}, Segment{"买", "v"}, Segment{"了", "ul"}, Segment{"一个", "m"}, Segment{"美的", "nr"}, Segment{"空调", "n"}},
|
||||
[]Segment{Segment{"线程", "n"}, Segment{"初始化", "l"}, Segment{"时", "n"}, Segment{"我们", "r"}, Segment{"要", "v"}, Segment{"注意", "v"}},
|
||||
[]Segment{Segment{"一个", "m"}, Segment{"分子", "n"}, Segment{"是", "v"}, Segment{"由", "p"}, Segment{"好多", "m"}, Segment{"原子", "n"}, Segment{"组织", "v"}, Segment{"成", "v"}, Segment{"的", "uj"}},
|
||||
[]Segment{Segment{"祝", "v"}, Segment{"你", "r"}, Segment{"马到功成", "i"}},
|
||||
[]Segment{Segment{"他", "r"}, Segment{"掉", "v"}, Segment{"进", "v"}, Segment{"了", "ul"}, Segment{"无底洞", "ns"}, Segment{"里", "f"}},
|
||||
[]Segment{Segment{"中国", "ns"}, Segment{"的", "uj"}, Segment{"首都", "d"}, Segment{"是", "v"}, Segment{"北京", "ns"}},
|
||||
[]Segment{Segment{"孙君意", "nr"}},
|
||||
[]Segment{Segment{"外交部", "nt"}, Segment{"发言人", "l"}, Segment{"马朝旭", "nr"}},
|
||||
[]Segment{Segment{"领导人", "n"}, Segment{"会议", "n"}, Segment{"和", "c"}, Segment{"第四届", "m"}, Segment{"东亚", "ns"}, Segment{"峰会", "n"}},
|
||||
[]Segment{Segment{"在", "p"}, Segment{"过去", "t"}, Segment{"的", "uj"}, Segment{"这", "r"}, Segment{"五年", "t"}},
|
||||
[]Segment{Segment{"还", "d"}, Segment{"需要", "v"}, Segment{"很", "d"}, Segment{"长", "a"}, Segment{"的", "uj"}, Segment{"路", "n"}, Segment{"要", "v"}, Segment{"走", "v"}},
|
||||
[]Segment{Segment{"60", "m"}, Segment{"周年", "t"}, Segment{"首都", "d"}, Segment{"阅兵", "v"}},
|
||||
[]Segment{Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
|
||||
[]Segment{Segment{"买", "v"}, Segment{"水果", "n"}, Segment{"然后", "c"}, Segment{"来", "v"}, Segment{"世博园", "nr"}},
|
||||
[]Segment{Segment{"买", "v"}, Segment{"水果", "n"}, Segment{"然后", "c"}, Segment{"去", "v"}, Segment{"世博园", "nr"}},
|
||||
[]Segment{Segment{"但是", "c"}, Segment{"后来", "t"}, Segment{"我", "r"}, Segment{"才", "d"}, Segment{"知道", "v"}, Segment{"你", "r"}, Segment{"是", "v"}, Segment{"对", "p"}, Segment{"的", "uj"}},
|
||||
[]Segment{Segment{"存在", "v"}, Segment{"即", "v"}, Segment{"合理", "vn"}},
|
||||
[]Segment{Segment{"的的", "u"}, Segment{"的的", "u"}, Segment{"的", "uj"}, Segment{"在的", "u"}, Segment{"的的", "u"}, Segment{"的", "uj"}, Segment{"就", "d"}, Segment{"以", "p"}, Segment{"和和", "nz"}, Segment{"和", "c"}},
|
||||
[]Segment{Segment{"I", "x"}, Segment{" ", "x"}, Segment{"love", "eng"}, Segment{"你", "r"}, Segment{",", "x"}, Segment{"不以为耻", "i"}, Segment{",", "x"}, Segment{"反", "zg"}, Segment{"以为", "c"}, Segment{"rong", "eng"}},
|
||||
[]Segment{Segment{"因", "p"}},
|
||||
[]Segment{},
|
||||
[]Segment{Segment{"hello", "eng"}, Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
|
||||
[]Segment{Segment{"很好", "a"}, Segment{"但", "c"}, Segment{"主要", "b"}, Segment{"是", "v"}, Segment{"基于", "p"}, Segment{"网页", "n"}, Segment{"形式", "n"}},
|
||||
[]Segment{Segment{"hello", "eng"}, Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
|
||||
[]Segment{Segment{"为什么", "r"}, Segment{"我", "r"}, Segment{"不能", "v"}, Segment{"拥有", "v"}, Segment{"想要", "v"}, Segment{"的", "uj"}, Segment{"生活", "vn"}},
|
||||
[]Segment{Segment{"后来", "t"}, Segment{"我", "r"}, Segment{"才", "d"}},
|
||||
[]Segment{Segment{"此次", "r"}, Segment{"来", "v"}, Segment{"中国", "ns"}, Segment{"是", "v"}, Segment{"为了", "p"}},
|
||||
[]Segment{Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
|
||||
[]Segment{Segment{",", "x"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
|
||||
[]Segment{Segment{"其实", "d"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
|
||||
[]Segment{Segment{"好人", "n"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
|
||||
[]Segment{Segment{"是因为", "c"}, Segment{"和", "c"}, Segment{"国家", "n"}},
|
||||
[]Segment{Segment{"老年", "t"}, Segment{"搜索", "v"}, Segment{"还", "d"}, Segment{"支持", "v"}},
|
||||
[]Segment{Segment{"干脆", "d"}, Segment{"就", "d"}, Segment{"把", "p"}, Segment{"那部", "r"}, Segment{"蒙人", "n"}, Segment{"的", "uj"}, Segment{"闲法", "n"}, Segment{"给", "p"}, Segment{"废", "v"}, Segment{"了", "ul"}, Segment{"拉倒", "v"}, Segment{"!", "x"}, Segment{"RT", "eng"}, Segment{" ", "x"}, Segment{"@", "x"}, Segment{"laoshipukong", "eng"}, Segment{" ", "x"}, Segment{":", "x"}, Segment{" ", "x"}, Segment{"27", "m"}, Segment{"日", "m"}, Segment{",", "x"}, Segment{"全国人大常委会", "nt"}, Segment{"第三次", "m"}, Segment{"审议", "v"}, Segment{"侵权", "v"}, Segment{"责任法", "n"}, Segment{"草案", "n"}, Segment{",", "x"}, Segment{"删除", "v"}, Segment{"了", "ul"}, Segment{"有关", "vn"}, Segment{"医疗", "n"}, Segment{"损害", "v"}, Segment{"责任", "n"}, Segment{"“", "x"}, Segment{"举证", "v"}, Segment{"倒置", "v"}, Segment{"”", "x"}, Segment{"的", "uj"}, Segment{"规定", "n"}, Segment{"。", "x"}, Segment{"在", "p"}, Segment{"医患", "n"}, Segment{"纠纷", "n"}, Segment{"中本", "ns"}, Segment{"已", "d"}, Segment{"处于", "v"}, Segment{"弱势", "n"}, Segment{"地位", "n"}, Segment{"的", "uj"}, Segment{"消费者", "n"}, Segment{"由此", "c"}, Segment{"将", "d"}, Segment{"陷入", "v"}, Segment{"万劫不复", "i"}, Segment{"的", "uj"}, Segment{"境地", "s"}, Segment{"。", "x"}, Segment{" ", "x"}},
|
||||
[]Segment{Segment{"大", "a"}},
|
||||
[]Segment{},
|
||||
[]Segment{Segment{"他", "r"}, Segment{"说", "v"}, Segment{"的", "uj"}, Segment{"确实", "ad"}, Segment{"在", "p"}, Segment{"理", "n"}},
|
||||
[]Segment{Segment{"长春", "ns"}, Segment{"市长", "n"}, Segment{"春节", "t"}, Segment{"讲话", "n"}},
|
||||
[]Segment{Segment{"结婚", "v"}, Segment{"的", "uj"}, Segment{"和", "c"}, Segment{"尚未", "d"}, Segment{"结婚", "v"}, Segment{"的", "uj"}},
|
||||
[]Segment{Segment{"结合", "v"}, Segment{"成", "n"}, Segment{"分子", "n"}, Segment{"时", "n"}},
|
||||
[]Segment{Segment{"旅游", "vn"}, Segment{"和", "c"}, Segment{"服务", "vn"}, Segment{"是", "v"}, Segment{"最好", "a"}, Segment{"的", "uj"}},
|
||||
[]Segment{Segment{"这件", "mq"}, Segment{"事情", "n"}, Segment{"的确", "d"}, Segment{"是", "v"}, Segment{"我", "r"}, Segment{"的", "uj"}, Segment{"错", "n"}},
|
||||
[]Segment{Segment{"供", "v"}, Segment{"大家", "n"}, Segment{"参考", "v"}, Segment{"指正", "v"}},
|
||||
[]Segment{Segment{"哈尔滨", "ns"}, Segment{"政府", "n"}, Segment{"公布", "v"}, Segment{"塌", "v"}, Segment{"桥", "n"}, Segment{"原因", "n"}},
|
||||
[]Segment{Segment{"我", "r"}, Segment{"在", "p"}, Segment{"机场", "n"}, Segment{"入口处", "i"}},
|
||||
[]Segment{Segment{"邢永臣", "nr"}, Segment{"摄影", "n"}, Segment{"报道", "v"}},
|
||||
[]Segment{Segment{"BP", "eng"}, Segment{"神经网络", "n"}, Segment{"如何", "r"}, Segment{"训练", "vn"}, Segment{"才能", "v"}, Segment{"在", "p"}, Segment{"分类", "n"}, Segment{"时", "n"}, Segment{"增加", "v"}, Segment{"区分度", "n"}, Segment{"?", "x"}},
|
||||
[]Segment{Segment{"南京市", "ns"}, Segment{"长江大桥", "ns"}},
|
||||
[]Segment{Segment{"应", "v"}, Segment{"一些", "m"}, Segment{"使用者", "n"}, Segment{"的", "uj"}, Segment{"建议", "n"}, Segment{",", "x"}, Segment{"也", "d"}, Segment{"为了", "p"}, Segment{"便于", "v"}, Segment{"利用", "n"}, Segment{"NiuTrans", "eng"}, Segment{"用于", "v"}, Segment{"SMT", "eng"}, Segment{"研究", "vn"}},
|
||||
[]Segment{Segment{"长春市", "ns"}, Segment{"长春", "ns"}, Segment{"药店", "n"}},
|
||||
[]Segment{Segment{"邓颖超", "nr"}, Segment{"生前", "t"}, Segment{"最", "d"}, Segment{"喜欢", "v"}, Segment{"的", "uj"}, Segment{"衣服", "n"}},
|
||||
[]Segment{Segment{"胡锦涛", "nr"}, Segment{"是", "v"}, Segment{"热爱", "a"}, Segment{"世界", "n"}, Segment{"和平", "nz"}, Segment{"的", "uj"}, Segment{"政治局", "n"}, Segment{"常委", "j"}},
|
||||
[]Segment{Segment{"程序员", "n"}, Segment{"祝", "v"}, Segment{"海林", "nz"}, Segment{"和", "c"}, Segment{"朱会震", "nr"}, Segment{"是", "v"}, Segment{"在", "p"}, Segment{"孙健", "nr"}, Segment{"的", "uj"}, Segment{"左面", "f"}, Segment{"和", "c"}, Segment{"右面", "f"}, Segment{",", "x"}, Segment{" ", "x"}, Segment{"范凯", "nr"}, Segment{"在", "p"}, Segment{"最", "a"}, Segment{"右面", "f"}, Segment{".", "m"}, Segment{"再往", "d"}, Segment{"左", "f"}, Segment{"是", "v"}, Segment{"李松洪", "nr"}},
|
||||
[]Segment{Segment{"一次性", "d"}, Segment{"交", "v"}, Segment{"多少", "m"}, Segment{"钱", "n"}},
|
||||
[]Segment{Segment{"两块", "m"}, Segment{"五", "m"}, Segment{"一套", "m"}, Segment{",", "x"}, Segment{"三块", "m"}, Segment{"八", "m"}, Segment{"一斤", "m"}, Segment{",", "x"}, Segment{"四块", "m"}, Segment{"七", "m"}, Segment{"一本", "m"}, Segment{",", "x"}, Segment{"五块", "m"}, Segment{"六", "m"}, Segment{"一条", "m"}},
|
||||
[]Segment{Segment{"小", "a"}, Segment{"和尚", "nr"}, Segment{"留", "v"}, Segment{"了", "ul"}, Segment{"一个", "m"}, Segment{"像", "v"}, Segment{"大", "a"}, Segment{"和尚", "nr"}, Segment{"一样", "r"}, Segment{"的", "uj"}, Segment{"和尚头", "nr"}},
|
||||
[]Segment{Segment{"我", "r"}, Segment{"是", "v"}, Segment{"中华人民共和国", "ns"}, Segment{"公民", "n"}, Segment{";", "x"}, Segment{"我", "r"}, Segment{"爸爸", "n"}, Segment{"是", "v"}, Segment{"共和党", "nt"}, Segment{"党员", "n"}, Segment{";", "x"}, Segment{" ", "x"}, Segment{"地铁", "n"}, Segment{"和平门", "ns"}, Segment{"站", "v"}},
|
||||
[]Segment{Segment{"张晓梅", "nr"}, Segment{"去", "v"}, Segment{"人民", "n"}, Segment{"医院", "n"}, Segment{"做", "v"}, Segment{"了", "ul"}, Segment{"个", "q"}, Segment{"B超", "n"}, Segment{"然后", "c"}, Segment{"去", "v"}, Segment{"买", "v"}, Segment{"了", "ul"}, Segment{"件", "q"}, Segment{"T恤", "n"}},
|
||||
[]Segment{Segment{"AT&T", "nz"}, Segment{"是", "v"}, Segment{"一件", "m"}, Segment{"不错", "a"}, Segment{"的", "uj"}, Segment{"公司", "n"}, Segment{",", "x"}, Segment{"给", "p"}, Segment{"你", "r"}, Segment{"发", "v"}, Segment{"offer", "eng"}, Segment{"了", "ul"}, Segment{"吗", "y"}, Segment{"?", "x"}},
|
||||
[]Segment{Segment{"C++", "nz"}, Segment{"和", "c"}, Segment{"c#", "nz"}, Segment{"是", "v"}, Segment{"什么", "r"}, Segment{"关系", "n"}, Segment{"?", "x"}, Segment{"11", "m"}, Segment{"+", "x"}, Segment{"122", "m"}, Segment{"=", "x"}, Segment{"133", "m"}, Segment{",", "x"}, Segment{"是", "v"}, Segment{"吗", "y"}, Segment{"?", "x"}, Segment{"PI", "eng"}, Segment{"=", "x"}, Segment{"3.14159", "m"}},
|
||||
[]Segment{Segment{"你", "r"}, Segment{"认识", "v"}, Segment{"那个", "r"}, Segment{"和", "c"}, Segment{"主席", "n"}, Segment{"握手", "v"}, Segment{"的", "uj"}, Segment{"的哥", "n"}, Segment{"吗", "y"}, Segment{"?", "x"}, Segment{"他", "r"}, Segment{"开", "v"}, Segment{"一辆", "m"}, Segment{"黑色", "n"}, Segment{"的士", "n"}, Segment{"。", "x"}},
|
||||
[]Segment{Segment{"枪杆子", "n"}, Segment{"中", "f"}, Segment{"出", "v"}, Segment{"政权", "n"}},
|
||||
}
|
||||
noHMMCutResult = [][]WordTag{
|
||||
[]WordTag{WordTag{"这", "r"}, WordTag{"是", "v"}, WordTag{"一个", "m"}, WordTag{"伸手不见五指", "i"}, WordTag{"的", "uj"}, WordTag{"黑夜", "n"}, WordTag{"。", "x"}, WordTag{"我", "r"}, WordTag{"叫", "v"}, WordTag{"孙悟空", "nr"}, WordTag{",", "x"}, WordTag{"我", "r"}, WordTag{"爱", "v"}, WordTag{"北京", "ns"}, WordTag{",", "x"}, WordTag{"我", "r"}, WordTag{"爱", "v"}, WordTag{"Python", "eng"}, WordTag{"和", "c"}, WordTag{"C++", "nz"}, WordTag{"。", "x"}},
|
||||
[]WordTag{WordTag{"我", "r"}, WordTag{"不", "d"}, WordTag{"喜欢", "v"}, WordTag{"日本", "ns"}, WordTag{"和服", "nz"}, WordTag{"。", "x"}},
|
||||
[]WordTag{WordTag{"雷猴", "n"}, WordTag{"回归", "v"}, WordTag{"人间", "n"}, WordTag{"。", "x"}},
|
||||
[]WordTag{WordTag{"工信处", "n"}, WordTag{"女干事", "n"}, WordTag{"每月", "r"}, WordTag{"经过", "p"}, WordTag{"下属", "v"}, WordTag{"科室", "n"}, WordTag{"都", "d"}, WordTag{"要", "v"}, WordTag{"亲口", "n"}, WordTag{"交代", "n"}, WordTag{"24", "eng"}, WordTag{"口", "q"}, WordTag{"交换机", "n"}, WordTag{"等", "u"}, WordTag{"技术性", "n"}, WordTag{"器件", "n"}, WordTag{"的", "uj"}, WordTag{"安装", "v"}, WordTag{"工作", "vn"}},
|
||||
[]WordTag{WordTag{"我", "r"}, WordTag{"需要", "v"}, WordTag{"廉租房", "n"}},
|
||||
[]WordTag{WordTag{"永和", "nz"}, WordTag{"服装", "vn"}, WordTag{"饰品", "n"}, WordTag{"有限公司", "n"}},
|
||||
[]WordTag{WordTag{"我", "r"}, WordTag{"爱", "v"}, WordTag{"北京", "ns"}, WordTag{"天安门", "ns"}},
|
||||
[]WordTag{WordTag{"abc", "eng"}},
|
||||
[]WordTag{WordTag{"隐", "n"}, WordTag{"马尔可夫", "nr"}},
|
||||
[]WordTag{WordTag{"雷猴", "n"}, WordTag{"是", "v"}, WordTag{"个", "q"}, WordTag{"好", "a"}, WordTag{"网站", "n"}},
|
||||
[]WordTag{WordTag{"“", "x"}, WordTag{"Microsoft", "eng"}, WordTag{"”", "x"}, WordTag{"一", "m"}, WordTag{"词", "n"}, WordTag{"由", "p"}, WordTag{"“", "x"}, WordTag{"MICROcomputer", "eng"}, WordTag{"(", "x"}, WordTag{"微型", "b"}, WordTag{"计算机", "n"}, WordTag{")", "x"}, WordTag{"”", "x"}, WordTag{"和", "c"}, WordTag{"“", "x"}, WordTag{"SOFTware", "eng"}, WordTag{"(", "x"}, WordTag{"软件", "n"}, WordTag{")", "x"}, WordTag{"”", "x"}, WordTag{"两", "m"}, WordTag{"部分", "n"}, WordTag{"组成", "v"}},
|
||||
[]WordTag{WordTag{"草泥马", "n"}, WordTag{"和", "c"}, WordTag{"欺", "vn"}, WordTag{"实", "n"}, WordTag{"马", "n"}, WordTag{"是", "v"}, WordTag{"今年", "t"}, WordTag{"的", "uj"}, WordTag{"流行", "v"}, WordTag{"词汇", "n"}},
|
||||
[]WordTag{WordTag{"伊", "ns"}, WordTag{"藤", "nr"}, WordTag{"洋华堂", "n"}, WordTag{"总府", "n"}, WordTag{"店", "n"}},
|
||||
[]WordTag{WordTag{"中国科学院计算技术研究所", "nt"}},
|
||||
[]WordTag{WordTag{"罗密欧", "nr"}, WordTag{"与", "p"}, WordTag{"朱丽叶", "nr"}},
|
||||
[]WordTag{WordTag{"我", "r"}, WordTag{"购买", "v"}, WordTag{"了", "ul"}, WordTag{"道具", "n"}, WordTag{"和", "c"}, WordTag{"服装", "vn"}},
|
||||
[]WordTag{WordTag{"PS", "eng"}, WordTag{":", "x"}, WordTag{" ", "x"}, WordTag{"我", "r"}, WordTag{"觉得", "v"}, WordTag{"开源", "n"}, WordTag{"有", "v"}, WordTag{"一个", "m"}, WordTag{"好处", "d"}, WordTag{",", "x"}, WordTag{"就是", "d"}, WordTag{"能够", "v"}, WordTag{"敦促", "v"}, WordTag{"自己", "r"}, WordTag{"不断改进", "l"}, WordTag{",", "x"}, WordTag{"避免", "v"}, WordTag{"敞", "v"}, WordTag{"帚", "ng"}, WordTag{"自珍", "b"}},
|
||||
[]WordTag{WordTag{"湖北省", "ns"}, WordTag{"石首市", "ns"}},
|
||||
[]WordTag{WordTag{"湖北省", "ns"}, WordTag{"十堰市", "ns"}},
|
||||
[]WordTag{WordTag{"总经理", "n"}, WordTag{"完成", "v"}, WordTag{"了", "ul"}, WordTag{"这件", "mq"}, WordTag{"事情", "n"}},
|
||||
[]WordTag{WordTag{"电脑", "n"}, WordTag{"修好", "v"}, WordTag{"了", "ul"}},
|
||||
[]WordTag{WordTag{"做好", "v"}, WordTag{"了", "ul"}, WordTag{"这件", "mq"}, WordTag{"事情", "n"}, WordTag{"就", "d"}, WordTag{"一了百了", "l"}, WordTag{"了", "ul"}},
|
||||
[]WordTag{WordTag{"人们", "n"}, WordTag{"审美", "vn"}, WordTag{"的", "uj"}, WordTag{"观点", "n"}, WordTag{"是", "v"}, WordTag{"不同", "a"}, WordTag{"的", "uj"}},
|
||||
[]WordTag{WordTag{"我们", "r"}, WordTag{"买", "v"}, WordTag{"了", "ul"}, WordTag{"一个", "m"}, WordTag{"美的", "nr"}, WordTag{"空调", "n"}},
|
||||
[]WordTag{WordTag{"线程", "n"}, WordTag{"初始化", "l"}, WordTag{"时", "n"}, WordTag{"我们", "r"}, WordTag{"要", "v"}, WordTag{"注意", "v"}},
|
||||
[]WordTag{WordTag{"一个", "m"}, WordTag{"分子", "n"}, WordTag{"是", "v"}, WordTag{"由", "p"}, WordTag{"好多", "m"}, WordTag{"原子", "n"}, WordTag{"组织", "v"}, WordTag{"成", "n"}, WordTag{"的", "uj"}},
|
||||
[]WordTag{WordTag{"祝", "v"}, WordTag{"你", "r"}, WordTag{"马到功成", "i"}},
|
||||
[]WordTag{WordTag{"他", "r"}, WordTag{"掉", "zg"}, WordTag{"进", "v"}, WordTag{"了", "ul"}, WordTag{"无底洞", "ns"}, WordTag{"里", "f"}},
|
||||
[]WordTag{WordTag{"中国", "ns"}, WordTag{"的", "uj"}, WordTag{"首都", "d"}, WordTag{"是", "v"}, WordTag{"北京", "ns"}},
|
||||
[]WordTag{WordTag{"孙", "zg"}, WordTag{"君", "nz"}, WordTag{"意", "n"}},
|
||||
[]WordTag{WordTag{"外交部", "nt"}, WordTag{"发言人", "l"}, WordTag{"马朝旭", "nr"}},
|
||||
[]WordTag{WordTag{"领导人", "n"}, WordTag{"会议", "n"}, WordTag{"和", "c"}, WordTag{"第四届", "m"}, WordTag{"东亚", "ns"}, WordTag{"峰会", "n"}},
|
||||
[]WordTag{WordTag{"在", "p"}, WordTag{"过去", "t"}, WordTag{"的", "uj"}, WordTag{"这", "r"}, WordTag{"五年", "t"}},
|
||||
[]WordTag{WordTag{"还", "d"}, WordTag{"需要", "v"}, WordTag{"很", "zg"}, WordTag{"长", "a"}, WordTag{"的", "uj"}, WordTag{"路", "n"}, WordTag{"要", "v"}, WordTag{"走", "v"}},
|
||||
[]WordTag{WordTag{"60", "eng"}, WordTag{"周年", "t"}, WordTag{"首都", "d"}, WordTag{"阅兵", "v"}},
|
||||
[]WordTag{WordTag{"你好", "l"}, WordTag{"人们", "n"}, WordTag{"审美", "vn"}, WordTag{"的", "uj"}, WordTag{"观点", "n"}, WordTag{"是", "v"}, WordTag{"不同", "a"}, WordTag{"的", "uj"}},
|
||||
[]WordTag{WordTag{"买", "v"}, WordTag{"水果", "n"}, WordTag{"然后", "c"}, WordTag{"来", "v"}, WordTag{"世博园", "nr"}},
|
||||
[]WordTag{WordTag{"买", "v"}, WordTag{"水果", "n"}, WordTag{"然后", "c"}, WordTag{"去", "v"}, WordTag{"世博园", "nr"}},
|
||||
[]WordTag{WordTag{"但是", "c"}, WordTag{"后来", "t"}, WordTag{"我", "r"}, WordTag{"才", "d"}, WordTag{"知道", "v"}, WordTag{"你", "r"}, WordTag{"是", "v"}, WordTag{"对", "p"}, WordTag{"的", "uj"}},
|
||||
[]WordTag{WordTag{"存在", "v"}, WordTag{"即", "v"}, WordTag{"合理", "vn"}},
|
||||
[]WordTag{WordTag{"的", "uj"}, WordTag{"的", "uj"}, WordTag{"的", "uj"}, WordTag{"的", "uj"}, WordTag{"的", "uj"}, WordTag{"在", "p"}, WordTag{"的", "uj"}, WordTag{"的", "uj"}, WordTag{"的", "uj"}, WordTag{"的", "uj"}, WordTag{"就", "d"}, WordTag{"以", "p"}, WordTag{"和", "c"}, WordTag{"和", "c"}, WordTag{"和", "c"}},
|
||||
[]WordTag{WordTag{"I", "eng"}, WordTag{" ", "x"}, WordTag{"love", "eng"}, WordTag{"你", "r"}, WordTag{",", "x"}, WordTag{"不以为耻", "i"}, WordTag{",", "x"}, WordTag{"反", "zg"}, WordTag{"以为", "c"}, WordTag{"rong", "eng"}},
|
||||
[]WordTag{WordTag{"因", "p"}},
|
||||
[]WordTag{},
|
||||
[]WordTag{WordTag{"hello", "eng"}, WordTag{"你好", "l"}, WordTag{"人们", "n"}, WordTag{"审美", "vn"}, WordTag{"的", "uj"}, WordTag{"观点", "n"}, WordTag{"是", "v"}, WordTag{"不同", "a"}, WordTag{"的", "uj"}},
|
||||
[]WordTag{WordTag{"很", "zg"}, WordTag{"好", "a"}, WordTag{"但", "c"}, WordTag{"主要", "b"}, WordTag{"是", "v"}, WordTag{"基于", "p"}, WordTag{"网页", "n"}, WordTag{"形式", "n"}},
|
||||
[]WordTag{WordTag{"hello", "eng"}, WordTag{"你好", "l"}, WordTag{"人们", "n"}, WordTag{"审美", "vn"}, WordTag{"的", "uj"}, WordTag{"观点", "n"}, WordTag{"是", "v"}, WordTag{"不同", "a"}, WordTag{"的", "uj"}},
|
||||
[]WordTag{WordTag{"为什么", "r"}, WordTag{"我", "r"}, WordTag{"不能", "v"}, WordTag{"拥有", "v"}, WordTag{"想要", "v"}, WordTag{"的", "uj"}, WordTag{"生活", "vn"}},
|
||||
[]WordTag{WordTag{"后来", "t"}, WordTag{"我", "r"}, WordTag{"才", "d"}},
|
||||
[]WordTag{WordTag{"此次", "r"}, WordTag{"来", "v"}, WordTag{"中国", "ns"}, WordTag{"是", "v"}, WordTag{"为了", "p"}},
|
||||
[]WordTag{WordTag{"使用", "v"}, WordTag{"了", "ul"}, WordTag{"它", "r"}, WordTag{"就", "d"}, WordTag{"可以", "c"}, WordTag{"解决", "v"}, WordTag{"一些", "m"}, WordTag{"问题", "n"}},
|
||||
[]WordTag{WordTag{",", "x"}, WordTag{"使用", "v"}, WordTag{"了", "ul"}, WordTag{"它", "r"}, WordTag{"就", "d"}, WordTag{"可以", "c"}, WordTag{"解决", "v"}, WordTag{"一些", "m"}, WordTag{"问题", "n"}},
|
||||
[]WordTag{WordTag{"其实", "d"}, WordTag{"使用", "v"}, WordTag{"了", "ul"}, WordTag{"它", "r"}, WordTag{"就", "d"}, WordTag{"可以", "c"}, WordTag{"解决", "v"}, WordTag{"一些", "m"}, WordTag{"问题", "n"}},
|
||||
[]WordTag{WordTag{"好人", "n"}, WordTag{"使用", "v"}, WordTag{"了", "ul"}, WordTag{"它", "r"}, WordTag{"就", "d"}, WordTag{"可以", "c"}, WordTag{"解决", "v"}, WordTag{"一些", "m"}, WordTag{"问题", "n"}},
|
||||
[]WordTag{WordTag{"是因为", "c"}, WordTag{"和", "c"}, WordTag{"国家", "n"}},
|
||||
[]WordTag{WordTag{"老年", "t"}, WordTag{"搜索", "v"}, WordTag{"还", "d"}, WordTag{"支持", "v"}},
|
||||
[]WordTag{WordTag{"干脆", "d"}, WordTag{"就", "d"}, WordTag{"把", "p"}, WordTag{"那", "r"}, WordTag{"部", "n"}, WordTag{"蒙", "v"}, WordTag{"人", "n"}, WordTag{"的", "uj"}, WordTag{"闲", "n"}, WordTag{"法", "j"}, WordTag{"给", "p"}, WordTag{"废", "v"}, WordTag{"了", "ul"}, WordTag{"拉倒", "v"}, WordTag{"!", "x"}, WordTag{"RT", "eng"}, WordTag{" ", "x"}, WordTag{"@", "x"}, WordTag{"laoshipukong", "eng"}, WordTag{" ", "x"}, WordTag{":", "x"}, WordTag{" ", "x"}, WordTag{"27", "eng"}, WordTag{"日", "m"}, WordTag{",", "x"}, WordTag{"全国人大常委会", "nt"}, WordTag{"第三次", "m"}, WordTag{"审议", "v"}, WordTag{"侵权", "v"}, WordTag{"责任法", "n"}, WordTag{"草案", "n"}, WordTag{",", "x"}, WordTag{"删除", "v"}, WordTag{"了", "ul"}, WordTag{"有关", "vn"}, WordTag{"医疗", "n"}, WordTag{"损害", "v"}, WordTag{"责任", "n"}, WordTag{"“", "x"}, WordTag{"举证", "v"}, WordTag{"倒置", "v"}, WordTag{"”", "x"}, WordTag{"的", "uj"}, WordTag{"规定", "n"}, WordTag{"。", "x"}, WordTag{"在", "p"}, WordTag{"医患", "n"}, WordTag{"纠纷", "n"}, WordTag{"中", "f"}, WordTag{"本", "r"}, WordTag{"已", "d"}, WordTag{"处于", "v"}, WordTag{"弱势", "n"}, WordTag{"地位", "n"}, WordTag{"的", "uj"}, WordTag{"消费者", "n"}, WordTag{"由此", "c"}, WordTag{"将", "d"}, WordTag{"陷入", "v"}, WordTag{"万劫不复", "i"}, WordTag{"的", "uj"}, WordTag{"境地", "s"}, WordTag{"。", "x"}, WordTag{" ", "x"}},
|
||||
[]WordTag{WordTag{"大", "a"}},
|
||||
[]WordTag{},
|
||||
[]WordTag{WordTag{"他", "r"}, WordTag{"说", "v"}, WordTag{"的", "uj"}, WordTag{"确实", "ad"}, WordTag{"在", "p"}, WordTag{"理", "n"}},
|
||||
[]WordTag{WordTag{"长春", "ns"}, WordTag{"市长", "n"}, WordTag{"春节", "t"}, WordTag{"讲话", "n"}},
|
||||
[]WordTag{WordTag{"结婚", "v"}, WordTag{"的", "uj"}, WordTag{"和", "c"}, WordTag{"尚未", "d"}, WordTag{"结婚", "v"}, WordTag{"的", "uj"}},
|
||||
[]WordTag{WordTag{"结合", "v"}, WordTag{"成", "n"}, WordTag{"分子", "n"}, WordTag{"时", "n"}},
|
||||
[]WordTag{WordTag{"旅游", "vn"}, WordTag{"和", "c"}, WordTag{"服务", "vn"}, WordTag{"是", "v"}, WordTag{"最好", "a"}, WordTag{"的", "uj"}},
|
||||
[]WordTag{WordTag{"这件", "mq"}, WordTag{"事情", "n"}, WordTag{"的确", "d"}, WordTag{"是", "v"}, WordTag{"我", "r"}, WordTag{"的", "uj"}, WordTag{"错", "v"}},
|
||||
[]WordTag{WordTag{"供", "v"}, WordTag{"大家", "n"}, WordTag{"参考", "v"}, WordTag{"指正", "v"}},
|
||||
[]WordTag{WordTag{"哈尔滨", "ns"}, WordTag{"政府", "n"}, WordTag{"公布", "v"}, WordTag{"塌", "v"}, WordTag{"桥", "n"}, WordTag{"原因", "n"}},
|
||||
[]WordTag{WordTag{"我", "r"}, WordTag{"在", "p"}, WordTag{"机场", "n"}, WordTag{"入口处", "i"}},
|
||||
[]WordTag{WordTag{"邢", "nr"}, WordTag{"永", "ns"}, WordTag{"臣", "n"}, WordTag{"摄影", "n"}, WordTag{"报道", "v"}},
|
||||
[]WordTag{WordTag{"BP", "eng"}, WordTag{"神经网络", "n"}, WordTag{"如何", "r"}, WordTag{"训练", "vn"}, WordTag{"才能", "v"}, WordTag{"在", "p"}, WordTag{"分类", "n"}, WordTag{"时", "n"}, WordTag{"增加", "v"}, WordTag{"区分度", "n"}, WordTag{"?", "x"}},
|
||||
[]WordTag{WordTag{"南京市", "ns"}, WordTag{"长江大桥", "ns"}},
|
||||
[]WordTag{WordTag{"应", "v"}, WordTag{"一些", "m"}, WordTag{"使用者", "n"}, WordTag{"的", "uj"}, WordTag{"建议", "n"}, WordTag{",", "x"}, WordTag{"也", "d"}, WordTag{"为了", "p"}, WordTag{"便于", "v"}, WordTag{"利用", "n"}, WordTag{"NiuTrans", "eng"}, WordTag{"用于", "v"}, WordTag{"SMT", "eng"}, WordTag{"研究", "vn"}},
|
||||
[]WordTag{WordTag{"长春市", "ns"}, WordTag{"长春", "ns"}, WordTag{"药店", "n"}},
|
||||
[]WordTag{WordTag{"邓颖超", "nr"}, WordTag{"生前", "t"}, WordTag{"最", "d"}, WordTag{"喜欢", "v"}, WordTag{"的", "uj"}, WordTag{"衣服", "n"}},
|
||||
[]WordTag{WordTag{"胡锦涛", "nr"}, WordTag{"是", "v"}, WordTag{"热爱", "a"}, WordTag{"世界", "n"}, WordTag{"和平", "nz"}, WordTag{"的", "uj"}, WordTag{"政治局", "n"}, WordTag{"常委", "j"}},
|
||||
[]WordTag{WordTag{"程序员", "n"}, WordTag{"祝", "v"}, WordTag{"海林", "nz"}, WordTag{"和", "c"}, WordTag{"朱", "nr"}, WordTag{"会", "v"}, WordTag{"震", "v"}, WordTag{"是", "v"}, WordTag{"在", "p"}, WordTag{"孙", "zg"}, WordTag{"健", "a"}, WordTag{"的", "uj"}, WordTag{"左面", "f"}, WordTag{"和", "c"}, WordTag{"右面", "f"}, WordTag{",", "x"}, WordTag{" ", "x"}, WordTag{"范", "nr"}, WordTag{"凯", "nr"}, WordTag{"在", "p"}, WordTag{"最", "d"}, WordTag{"右面", "f"}, WordTag{".", "x"}, WordTag{"再", "d"}, WordTag{"往", "zg"}, WordTag{"左", "m"}, WordTag{"是", "v"}, WordTag{"李", "nr"}, WordTag{"松", "v"}, WordTag{"洪", "nr"}},
|
||||
[]WordTag{WordTag{"一次性", "d"}, WordTag{"交", "v"}, WordTag{"多少", "m"}, WordTag{"钱", "n"}},
|
||||
[]WordTag{WordTag{"两块", "m"}, WordTag{"五", "m"}, WordTag{"一套", "m"}, WordTag{",", "x"}, WordTag{"三块", "m"}, WordTag{"八", "m"}, WordTag{"一斤", "m"}, WordTag{",", "x"}, WordTag{"四块", "m"}, WordTag{"七", "m"}, WordTag{"一本", "m"}, WordTag{",", "x"}, WordTag{"五块", "m"}, WordTag{"六", "m"}, WordTag{"一条", "m"}},
|
||||
[]WordTag{WordTag{"小", "a"}, WordTag{"和尚", "nr"}, WordTag{"留", "v"}, WordTag{"了", "ul"}, WordTag{"一个", "m"}, WordTag{"像", "v"}, WordTag{"大", "a"}, WordTag{"和尚", "nr"}, WordTag{"一样", "r"}, WordTag{"的", "uj"}, WordTag{"和尚头", "nr"}},
|
||||
[]WordTag{WordTag{"我", "r"}, WordTag{"是", "v"}, WordTag{"中华人民共和国", "ns"}, WordTag{"公民", "n"}, WordTag{";", "x"}, WordTag{"我", "r"}, WordTag{"爸爸", "n"}, WordTag{"是", "v"}, WordTag{"共和党", "nt"}, WordTag{"党员", "n"}, WordTag{";", "x"}, WordTag{" ", "x"}, WordTag{"地铁", "n"}, WordTag{"和平门", "ns"}, WordTag{"站", "v"}},
|
||||
[]WordTag{WordTag{"张晓梅", "nr"}, WordTag{"去", "v"}, WordTag{"人民", "n"}, WordTag{"医院", "n"}, WordTag{"做", "v"}, WordTag{"了", "ul"}, WordTag{"个", "q"}, WordTag{"B超", "n"}, WordTag{"然后", "c"}, WordTag{"去", "v"}, WordTag{"买", "v"}, WordTag{"了", "ul"}, WordTag{"件", "zg"}, WordTag{"T恤", "n"}},
|
||||
[]WordTag{WordTag{"AT&T", "nz"}, WordTag{"是", "v"}, WordTag{"一件", "m"}, WordTag{"不错", "a"}, WordTag{"的", "uj"}, WordTag{"公司", "n"}, WordTag{",", "x"}, WordTag{"给", "p"}, WordTag{"你", "r"}, WordTag{"发", "v"}, WordTag{"offer", "eng"}, WordTag{"了", "ul"}, WordTag{"吗", "y"}, WordTag{"?", "x"}},
|
||||
[]WordTag{WordTag{"C++", "nz"}, WordTag{"和", "c"}, WordTag{"c#", "nz"}, WordTag{"是", "v"}, WordTag{"什么", "r"}, WordTag{"关系", "n"}, WordTag{"?", "x"}, WordTag{"11", "eng"}, WordTag{"+", "x"}, WordTag{"122", "eng"}, WordTag{"=", "x"}, WordTag{"133", "eng"}, WordTag{",", "x"}, WordTag{"是", "v"}, WordTag{"吗", "y"}, WordTag{"?", "x"}, WordTag{"PI", "eng"}, WordTag{"=", "x"}, WordTag{"3", "eng"}, WordTag{".", "x"}, WordTag{"14159", "eng"}},
|
||||
[]WordTag{WordTag{"你", "r"}, WordTag{"认识", "v"}, WordTag{"那个", "r"}, WordTag{"和", "c"}, WordTag{"主席", "n"}, WordTag{"握手", "v"}, WordTag{"的", "uj"}, WordTag{"的哥", "n"}, WordTag{"吗", "y"}, WordTag{"?", "x"}, WordTag{"他", "r"}, WordTag{"开", "v"}, WordTag{"一辆", "m"}, WordTag{"黑色", "n"}, WordTag{"的士", "n"}, WordTag{"。", "x"}},
|
||||
[]WordTag{WordTag{"枪杆子", "n"}, WordTag{"中", "f"}, WordTag{"出", "v"}, WordTag{"政权", "n"}},
|
||||
noHMMCutResult = [][]Segment{
|
||||
[]Segment{Segment{"这", "r"}, Segment{"是", "v"}, Segment{"一个", "m"}, Segment{"伸手不见五指", "i"}, Segment{"的", "uj"}, Segment{"黑夜", "n"}, Segment{"。", "x"}, Segment{"我", "r"}, Segment{"叫", "v"}, Segment{"孙悟空", "nr"}, Segment{",", "x"}, Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"北京", "ns"}, Segment{",", "x"}, Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"Python", "eng"}, Segment{"和", "c"}, Segment{"C++", "nz"}, Segment{"。", "x"}},
|
||||
[]Segment{Segment{"我", "r"}, Segment{"不", "d"}, Segment{"喜欢", "v"}, Segment{"日本", "ns"}, Segment{"和服", "nz"}, Segment{"。", "x"}},
|
||||
[]Segment{Segment{"雷猴", "n"}, Segment{"回归", "v"}, Segment{"人间", "n"}, Segment{"。", "x"}},
|
||||
[]Segment{Segment{"工信处", "n"}, Segment{"女干事", "n"}, Segment{"每月", "r"}, Segment{"经过", "p"}, Segment{"下属", "v"}, Segment{"科室", "n"}, Segment{"都", "d"}, Segment{"要", "v"}, Segment{"亲口", "n"}, Segment{"交代", "n"}, Segment{"24", "eng"}, Segment{"口", "q"}, Segment{"交换机", "n"}, Segment{"等", "u"}, Segment{"技术性", "n"}, Segment{"器件", "n"}, Segment{"的", "uj"}, Segment{"安装", "v"}, Segment{"工作", "vn"}},
|
||||
[]Segment{Segment{"我", "r"}, Segment{"需要", "v"}, Segment{"廉租房", "n"}},
|
||||
[]Segment{Segment{"永和", "nz"}, Segment{"服装", "vn"}, Segment{"饰品", "n"}, Segment{"有限公司", "n"}},
|
||||
[]Segment{Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"北京", "ns"}, Segment{"天安门", "ns"}},
|
||||
[]Segment{Segment{"abc", "eng"}},
|
||||
[]Segment{Segment{"隐", "n"}, Segment{"马尔可夫", "nr"}},
|
||||
[]Segment{Segment{"雷猴", "n"}, Segment{"是", "v"}, Segment{"个", "q"}, Segment{"好", "a"}, Segment{"网站", "n"}},
|
||||
[]Segment{Segment{"“", "x"}, Segment{"Microsoft", "eng"}, Segment{"”", "x"}, Segment{"一", "m"}, Segment{"词", "n"}, Segment{"由", "p"}, Segment{"“", "x"}, Segment{"MICROcomputer", "eng"}, Segment{"(", "x"}, Segment{"微型", "b"}, Segment{"计算机", "n"}, Segment{")", "x"}, Segment{"”", "x"}, Segment{"和", "c"}, Segment{"“", "x"}, Segment{"SOFTware", "eng"}, Segment{"(", "x"}, Segment{"软件", "n"}, Segment{")", "x"}, Segment{"”", "x"}, Segment{"两", "m"}, Segment{"部分", "n"}, Segment{"组成", "v"}},
|
||||
[]Segment{Segment{"草泥马", "n"}, Segment{"和", "c"}, Segment{"欺", "vn"}, Segment{"实", "n"}, Segment{"马", "n"}, Segment{"是", "v"}, Segment{"今年", "t"}, Segment{"的", "uj"}, Segment{"流行", "v"}, Segment{"词汇", "n"}},
|
||||
[]Segment{Segment{"伊", "ns"}, Segment{"藤", "nr"}, Segment{"洋华堂", "n"}, Segment{"总府", "n"}, Segment{"店", "n"}},
|
||||
[]Segment{Segment{"中国科学院计算技术研究所", "nt"}},
|
||||
[]Segment{Segment{"罗密欧", "nr"}, Segment{"与", "p"}, Segment{"朱丽叶", "nr"}},
|
||||
[]Segment{Segment{"我", "r"}, Segment{"购买", "v"}, Segment{"了", "ul"}, Segment{"道具", "n"}, Segment{"和", "c"}, Segment{"服装", "vn"}},
|
||||
[]Segment{Segment{"PS", "eng"}, Segment{":", "x"}, Segment{" ", "x"}, Segment{"我", "r"}, Segment{"觉得", "v"}, Segment{"开源", "n"}, Segment{"有", "v"}, Segment{"一个", "m"}, Segment{"好处", "d"}, Segment{",", "x"}, Segment{"就是", "d"}, Segment{"能够", "v"}, Segment{"敦促", "v"}, Segment{"自己", "r"}, Segment{"不断改进", "l"}, Segment{",", "x"}, Segment{"避免", "v"}, Segment{"敞", "v"}, Segment{"帚", "ng"}, Segment{"自珍", "b"}},
|
||||
[]Segment{Segment{"湖北省", "ns"}, Segment{"石首市", "ns"}},
|
||||
[]Segment{Segment{"湖北省", "ns"}, Segment{"十堰市", "ns"}},
|
||||
[]Segment{Segment{"总经理", "n"}, Segment{"完成", "v"}, Segment{"了", "ul"}, Segment{"这件", "mq"}, Segment{"事情", "n"}},
|
||||
[]Segment{Segment{"电脑", "n"}, Segment{"修好", "v"}, Segment{"了", "ul"}},
|
||||
[]Segment{Segment{"做好", "v"}, Segment{"了", "ul"}, Segment{"这件", "mq"}, Segment{"事情", "n"}, Segment{"就", "d"}, Segment{"一了百了", "l"}, Segment{"了", "ul"}},
|
||||
[]Segment{Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
|
||||
[]Segment{Segment{"我们", "r"}, Segment{"买", "v"}, Segment{"了", "ul"}, Segment{"一个", "m"}, Segment{"美的", "nr"}, Segment{"空调", "n"}},
|
||||
[]Segment{Segment{"线程", "n"}, Segment{"初始化", "l"}, Segment{"时", "n"}, Segment{"我们", "r"}, Segment{"要", "v"}, Segment{"注意", "v"}},
|
||||
[]Segment{Segment{"一个", "m"}, Segment{"分子", "n"}, Segment{"是", "v"}, Segment{"由", "p"}, Segment{"好多", "m"}, Segment{"原子", "n"}, Segment{"组织", "v"}, Segment{"成", "n"}, Segment{"的", "uj"}},
|
||||
[]Segment{Segment{"祝", "v"}, Segment{"你", "r"}, Segment{"马到功成", "i"}},
|
||||
[]Segment{Segment{"他", "r"}, Segment{"掉", "zg"}, Segment{"进", "v"}, Segment{"了", "ul"}, Segment{"无底洞", "ns"}, Segment{"里", "f"}},
|
||||
[]Segment{Segment{"中国", "ns"}, Segment{"的", "uj"}, Segment{"首都", "d"}, Segment{"是", "v"}, Segment{"北京", "ns"}},
|
||||
[]Segment{Segment{"孙", "zg"}, Segment{"君", "nz"}, Segment{"意", "n"}},
|
||||
[]Segment{Segment{"外交部", "nt"}, Segment{"发言人", "l"}, Segment{"马朝旭", "nr"}},
|
||||
[]Segment{Segment{"领导人", "n"}, Segment{"会议", "n"}, Segment{"和", "c"}, Segment{"第四届", "m"}, Segment{"东亚", "ns"}, Segment{"峰会", "n"}},
|
||||
[]Segment{Segment{"在", "p"}, Segment{"过去", "t"}, Segment{"的", "uj"}, Segment{"这", "r"}, Segment{"五年", "t"}},
|
||||
[]Segment{Segment{"还", "d"}, Segment{"需要", "v"}, Segment{"很", "zg"}, Segment{"长", "a"}, Segment{"的", "uj"}, Segment{"路", "n"}, Segment{"要", "v"}, Segment{"走", "v"}},
|
||||
[]Segment{Segment{"60", "eng"}, Segment{"周年", "t"}, Segment{"首都", "d"}, Segment{"阅兵", "v"}},
|
||||
[]Segment{Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
|
||||
[]Segment{Segment{"买", "v"}, Segment{"水果", "n"}, Segment{"然后", "c"}, Segment{"来", "v"}, Segment{"世博园", "nr"}},
|
||||
[]Segment{Segment{"买", "v"}, Segment{"水果", "n"}, Segment{"然后", "c"}, Segment{"去", "v"}, Segment{"世博园", "nr"}},
|
||||
[]Segment{Segment{"但是", "c"}, Segment{"后来", "t"}, Segment{"我", "r"}, Segment{"才", "d"}, Segment{"知道", "v"}, Segment{"你", "r"}, Segment{"是", "v"}, Segment{"对", "p"}, Segment{"的", "uj"}},
|
||||
[]Segment{Segment{"存在", "v"}, Segment{"即", "v"}, Segment{"合理", "vn"}},
|
||||
[]Segment{Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"在", "p"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"就", "d"}, Segment{"以", "p"}, Segment{"和", "c"}, Segment{"和", "c"}, Segment{"和", "c"}},
|
||||
[]Segment{Segment{"I", "eng"}, Segment{" ", "x"}, Segment{"love", "eng"}, Segment{"你", "r"}, Segment{",", "x"}, Segment{"不以为耻", "i"}, Segment{",", "x"}, Segment{"反", "zg"}, Segment{"以为", "c"}, Segment{"rong", "eng"}},
|
||||
[]Segment{Segment{"因", "p"}},
|
||||
[]Segment{},
|
||||
[]Segment{Segment{"hello", "eng"}, Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
|
||||
[]Segment{Segment{"很", "zg"}, Segment{"好", "a"}, Segment{"但", "c"}, Segment{"主要", "b"}, Segment{"是", "v"}, Segment{"基于", "p"}, Segment{"网页", "n"}, Segment{"形式", "n"}},
|
||||
[]Segment{Segment{"hello", "eng"}, Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
|
||||
[]Segment{Segment{"为什么", "r"}, Segment{"我", "r"}, Segment{"不能", "v"}, Segment{"拥有", "v"}, Segment{"想要", "v"}, Segment{"的", "uj"}, Segment{"生活", "vn"}},
|
||||
[]Segment{Segment{"后来", "t"}, Segment{"我", "r"}, Segment{"才", "d"}},
|
||||
[]Segment{Segment{"此次", "r"}, Segment{"来", "v"}, Segment{"中国", "ns"}, Segment{"是", "v"}, Segment{"为了", "p"}},
|
||||
[]Segment{Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
|
||||
[]Segment{Segment{",", "x"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
|
||||
[]Segment{Segment{"其实", "d"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
|
||||
[]Segment{Segment{"好人", "n"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
|
||||
[]Segment{Segment{"是因为", "c"}, Segment{"和", "c"}, Segment{"国家", "n"}},
|
||||
[]Segment{Segment{"老年", "t"}, Segment{"搜索", "v"}, Segment{"还", "d"}, Segment{"支持", "v"}},
|
||||
[]Segment{Segment{"干脆", "d"}, Segment{"就", "d"}, Segment{"把", "p"}, Segment{"那", "r"}, Segment{"部", "n"}, Segment{"蒙", "v"}, Segment{"人", "n"}, Segment{"的", "uj"}, Segment{"闲", "n"}, Segment{"法", "j"}, Segment{"给", "p"}, Segment{"废", "v"}, Segment{"了", "ul"}, Segment{"拉倒", "v"}, Segment{"!", "x"}, Segment{"RT", "eng"}, Segment{" ", "x"}, Segment{"@", "x"}, Segment{"laoshipukong", "eng"}, Segment{" ", "x"}, Segment{":", "x"}, Segment{" ", "x"}, Segment{"27", "eng"}, Segment{"日", "m"}, Segment{",", "x"}, Segment{"全国人大常委会", "nt"}, Segment{"第三次", "m"}, Segment{"审议", "v"}, Segment{"侵权", "v"}, Segment{"责任法", "n"}, Segment{"草案", "n"}, Segment{",", "x"}, Segment{"删除", "v"}, Segment{"了", "ul"}, Segment{"有关", "vn"}, Segment{"医疗", "n"}, Segment{"损害", "v"}, Segment{"责任", "n"}, Segment{"“", "x"}, Segment{"举证", "v"}, Segment{"倒置", "v"}, Segment{"”", "x"}, Segment{"的", "uj"}, Segment{"规定", "n"}, Segment{"。", "x"}, Segment{"在", "p"}, Segment{"医患", "n"}, Segment{"纠纷", "n"}, Segment{"中", "f"}, Segment{"本", "r"}, Segment{"已", "d"}, Segment{"处于", "v"}, Segment{"弱势", "n"}, Segment{"地位", "n"}, Segment{"的", "uj"}, Segment{"消费者", "n"}, Segment{"由此", "c"}, Segment{"将", "d"}, Segment{"陷入", "v"}, Segment{"万劫不复", "i"}, Segment{"的", "uj"}, Segment{"境地", "s"}, Segment{"。", "x"}, Segment{" ", "x"}},
|
||||
[]Segment{Segment{"大", "a"}},
|
||||
[]Segment{},
|
||||
[]Segment{Segment{"他", "r"}, Segment{"说", "v"}, Segment{"的", "uj"}, Segment{"确实", "ad"}, Segment{"在", "p"}, Segment{"理", "n"}},
|
||||
[]Segment{Segment{"长春", "ns"}, Segment{"市长", "n"}, Segment{"春节", "t"}, Segment{"讲话", "n"}},
|
||||
[]Segment{Segment{"结婚", "v"}, Segment{"的", "uj"}, Segment{"和", "c"}, Segment{"尚未", "d"}, Segment{"结婚", "v"}, Segment{"的", "uj"}},
|
||||
[]Segment{Segment{"结合", "v"}, Segment{"成", "n"}, Segment{"分子", "n"}, Segment{"时", "n"}},
|
||||
[]Segment{Segment{"旅游", "vn"}, Segment{"和", "c"}, Segment{"服务", "vn"}, Segment{"是", "v"}, Segment{"最好", "a"}, Segment{"的", "uj"}},
|
||||
[]Segment{Segment{"这件", "mq"}, Segment{"事情", "n"}, Segment{"的确", "d"}, Segment{"是", "v"}, Segment{"我", "r"}, Segment{"的", "uj"}, Segment{"错", "v"}},
|
||||
[]Segment{Segment{"供", "v"}, Segment{"大家", "n"}, Segment{"参考", "v"}, Segment{"指正", "v"}},
|
||||
[]Segment{Segment{"哈尔滨", "ns"}, Segment{"政府", "n"}, Segment{"公布", "v"}, Segment{"塌", "v"}, Segment{"桥", "n"}, Segment{"原因", "n"}},
|
||||
[]Segment{Segment{"我", "r"}, Segment{"在", "p"}, Segment{"机场", "n"}, Segment{"入口处", "i"}},
|
||||
[]Segment{Segment{"邢", "nr"}, Segment{"永", "ns"}, Segment{"臣", "n"}, Segment{"摄影", "n"}, Segment{"报道", "v"}},
|
||||
[]Segment{Segment{"BP", "eng"}, Segment{"神经网络", "n"}, Segment{"如何", "r"}, Segment{"训练", "vn"}, Segment{"才能", "v"}, Segment{"在", "p"}, Segment{"分类", "n"}, Segment{"时", "n"}, Segment{"增加", "v"}, Segment{"区分度", "n"}, Segment{"?", "x"}},
|
||||
[]Segment{Segment{"南京市", "ns"}, Segment{"长江大桥", "ns"}},
|
||||
[]Segment{Segment{"应", "v"}, Segment{"一些", "m"}, Segment{"使用者", "n"}, Segment{"的", "uj"}, Segment{"建议", "n"}, Segment{",", "x"}, Segment{"也", "d"}, Segment{"为了", "p"}, Segment{"便于", "v"}, Segment{"利用", "n"}, Segment{"NiuTrans", "eng"}, Segment{"用于", "v"}, Segment{"SMT", "eng"}, Segment{"研究", "vn"}},
|
||||
[]Segment{Segment{"长春市", "ns"}, Segment{"长春", "ns"}, Segment{"药店", "n"}},
|
||||
[]Segment{Segment{"邓颖超", "nr"}, Segment{"生前", "t"}, Segment{"最", "d"}, Segment{"喜欢", "v"}, Segment{"的", "uj"}, Segment{"衣服", "n"}},
|
||||
[]Segment{Segment{"胡锦涛", "nr"}, Segment{"是", "v"}, Segment{"热爱", "a"}, Segment{"世界", "n"}, Segment{"和平", "nz"}, Segment{"的", "uj"}, Segment{"政治局", "n"}, Segment{"常委", "j"}},
|
||||
[]Segment{Segment{"程序员", "n"}, Segment{"祝", "v"}, Segment{"海林", "nz"}, Segment{"和", "c"}, Segment{"朱", "nr"}, Segment{"会", "v"}, Segment{"震", "v"}, Segment{"是", "v"}, Segment{"在", "p"}, Segment{"孙", "zg"}, Segment{"健", "a"}, Segment{"的", "uj"}, Segment{"左面", "f"}, Segment{"和", "c"}, Segment{"右面", "f"}, Segment{",", "x"}, Segment{" ", "x"}, Segment{"范", "nr"}, Segment{"凯", "nr"}, Segment{"在", "p"}, Segment{"最", "d"}, Segment{"右面", "f"}, Segment{".", "x"}, Segment{"再", "d"}, Segment{"往", "zg"}, Segment{"左", "m"}, Segment{"是", "v"}, Segment{"李", "nr"}, Segment{"松", "v"}, Segment{"洪", "nr"}},
|
||||
[]Segment{Segment{"一次性", "d"}, Segment{"交", "v"}, Segment{"多少", "m"}, Segment{"钱", "n"}},
|
||||
[]Segment{Segment{"两块", "m"}, Segment{"五", "m"}, Segment{"一套", "m"}, Segment{",", "x"}, Segment{"三块", "m"}, Segment{"八", "m"}, Segment{"一斤", "m"}, Segment{",", "x"}, Segment{"四块", "m"}, Segment{"七", "m"}, Segment{"一本", "m"}, Segment{",", "x"}, Segment{"五块", "m"}, Segment{"六", "m"}, Segment{"一条", "m"}},
|
||||
[]Segment{Segment{"小", "a"}, Segment{"和尚", "nr"}, Segment{"留", "v"}, Segment{"了", "ul"}, Segment{"一个", "m"}, Segment{"像", "v"}, Segment{"大", "a"}, Segment{"和尚", "nr"}, Segment{"一样", "r"}, Segment{"的", "uj"}, Segment{"和尚头", "nr"}},
|
||||
[]Segment{Segment{"我", "r"}, Segment{"是", "v"}, Segment{"中华人民共和国", "ns"}, Segment{"公民", "n"}, Segment{";", "x"}, Segment{"我", "r"}, Segment{"爸爸", "n"}, Segment{"是", "v"}, Segment{"共和党", "nt"}, Segment{"党员", "n"}, Segment{";", "x"}, Segment{" ", "x"}, Segment{"地铁", "n"}, Segment{"和平门", "ns"}, Segment{"站", "v"}},
|
||||
[]Segment{Segment{"张晓梅", "nr"}, Segment{"去", "v"}, Segment{"人民", "n"}, Segment{"医院", "n"}, Segment{"做", "v"}, Segment{"了", "ul"}, Segment{"个", "q"}, Segment{"B超", "n"}, Segment{"然后", "c"}, Segment{"去", "v"}, Segment{"买", "v"}, Segment{"了", "ul"}, Segment{"件", "zg"}, Segment{"T恤", "n"}},
|
||||
[]Segment{Segment{"AT&T", "nz"}, Segment{"是", "v"}, Segment{"一件", "m"}, Segment{"不错", "a"}, Segment{"的", "uj"}, Segment{"公司", "n"}, Segment{",", "x"}, Segment{"给", "p"}, Segment{"你", "r"}, Segment{"发", "v"}, Segment{"offer", "eng"}, Segment{"了", "ul"}, Segment{"吗", "y"}, Segment{"?", "x"}},
|
||||
[]Segment{Segment{"C++", "nz"}, Segment{"和", "c"}, Segment{"c#", "nz"}, Segment{"是", "v"}, Segment{"什么", "r"}, Segment{"关系", "n"}, Segment{"?", "x"}, Segment{"11", "eng"}, Segment{"+", "x"}, Segment{"122", "eng"}, Segment{"=", "x"}, Segment{"133", "eng"}, Segment{",", "x"}, Segment{"是", "v"}, Segment{"吗", "y"}, Segment{"?", "x"}, Segment{"PI", "eng"}, Segment{"=", "x"}, Segment{"3", "eng"}, Segment{".", "x"}, Segment{"14159", "eng"}},
|
||||
[]Segment{Segment{"你", "r"}, Segment{"认识", "v"}, Segment{"那个", "r"}, Segment{"和", "c"}, Segment{"主席", "n"}, Segment{"握手", "v"}, Segment{"的", "uj"}, Segment{"的哥", "n"}, Segment{"吗", "y"}, Segment{"?", "x"}, Segment{"他", "r"}, Segment{"开", "v"}, Segment{"一辆", "m"}, Segment{"黑色", "n"}, Segment{"的士", "n"}, Segment{"。", "x"}},
|
||||
[]Segment{Segment{"枪杆子", "n"}, Segment{"中", "f"}, Segment{"出", "v"}, Segment{"政权", "n"}},
|
||||
}
|
||||
)
|
||||
|
||||
func chanToArray(ch chan WordTag) []WordTag {
|
||||
result := make([]WordTag, 0)
|
||||
func init() {
|
||||
seg.LoadDictionary("../dict.txt")
|
||||
}
|
||||
|
||||
func chanToArray(ch <-chan Segment) []Segment {
|
||||
var result []Segment
|
||||
for word := range ch {
|
||||
result = append(result, word)
|
||||
}
|
||||
@@ -277,136 +281,148 @@ func chanToArray(ch chan WordTag) []WordTag {
|
||||
}
|
||||
|
||||
func TestCut(t *testing.T) {
|
||||
SetDictionary("../dict.txt")
|
||||
for index, content := range test_contents {
|
||||
result := chanToArray(Cut(content, true))
|
||||
for index, content := range testContents {
|
||||
result := chanToArray(seg.Cut(content, true))
|
||||
if len(defaultCutResult[index]) != len(result) {
|
||||
t.Error(content)
|
||||
t.Errorf("default cut for %s length should be %d not %d\n",
|
||||
content, len(defaultCutResult[index]), len(result))
|
||||
t.Errorf("expect: %v\n", defaultCutResult[index])
|
||||
t.Fatalf("got: %v\n", result)
|
||||
}
|
||||
for i, _ := range result {
|
||||
for i := range result {
|
||||
if result[i] != defaultCutResult[index][i] {
|
||||
t.Error(content)
|
||||
t.Fatalf("expect %s, got %s", defaultCutResult[index][i], result[i])
|
||||
}
|
||||
}
|
||||
result = chanToArray(Cut(content, false))
|
||||
result = chanToArray(seg.Cut(content, false))
|
||||
if len(noHMMCutResult[index]) != len(result) {
|
||||
t.Error(content)
|
||||
t.Fatal(content)
|
||||
}
|
||||
for i, _ := range result {
|
||||
for i := range result {
|
||||
if result[i] != noHMMCutResult[index][i] {
|
||||
t.Error(content)
|
||||
t.Fatal(content)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// https://github.com/fxsjy/jieba/issues/132
|
||||
func TestBug132(t *testing.T) {
|
||||
/*
|
||||
https://github.com/fxsjy/jieba/issues/132
|
||||
*/
|
||||
SetDictionary("../dict.txt")
|
||||
sentence := "又跛又啞"
|
||||
cutResult := []WordTag{
|
||||
WordTag{"又", "d"},
|
||||
WordTag{"跛", "a"},
|
||||
WordTag{"又", "d"},
|
||||
WordTag{"啞", "v"},
|
||||
cutResult := []Segment{
|
||||
Segment{"又", "d"},
|
||||
Segment{"跛", "a"},
|
||||
Segment{"又", "d"},
|
||||
Segment{"啞", "v"},
|
||||
}
|
||||
result := chanToArray(Cut(sentence, true))
|
||||
result := chanToArray(seg.Cut(sentence, true))
|
||||
if len(cutResult) != len(result) {
|
||||
t.Error(result)
|
||||
t.Fatal(result)
|
||||
}
|
||||
for i, _ := range result {
|
||||
for i := range result {
|
||||
if result[i] != cutResult[i] {
|
||||
t.Error(result[i])
|
||||
t.Fatal(result[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// https://github.com/fxsjy/jieba/issues/137
|
||||
func TestBug137(t *testing.T) {
|
||||
/*
|
||||
https://github.com/fxsjy/jieba/issues/137
|
||||
*/
|
||||
SetDictionary("../dict.txt")
|
||||
sentence := "前港督衛奕信在八八年十月宣布成立中央政策研究組"
|
||||
cutResult := []WordTag{
|
||||
WordTag{"前", "f"},
|
||||
WordTag{"港督", "n"},
|
||||
WordTag{"衛奕", "z"},
|
||||
WordTag{"信", "n"},
|
||||
WordTag{"在", "p"},
|
||||
WordTag{"八八年", "m"},
|
||||
WordTag{"十月", "t"},
|
||||
WordTag{"宣布", "v"},
|
||||
WordTag{"成立", "v"},
|
||||
WordTag{"中央", "n"},
|
||||
WordTag{"政策", "n"},
|
||||
WordTag{"研究", "vn"},
|
||||
WordTag{"組", "x"},
|
||||
cutResult := []Segment{
|
||||
Segment{"前", "f"},
|
||||
Segment{"港督", "n"},
|
||||
Segment{"衛奕", "z"},
|
||||
Segment{"信", "n"},
|
||||
Segment{"在", "p"},
|
||||
Segment{"八八年", "m"},
|
||||
Segment{"十月", "t"},
|
||||
Segment{"宣布", "v"},
|
||||
Segment{"成立", "v"},
|
||||
Segment{"中央", "n"},
|
||||
Segment{"政策", "n"},
|
||||
Segment{"研究", "vn"},
|
||||
Segment{"組", "x"},
|
||||
}
|
||||
result := chanToArray(Cut(sentence, true))
|
||||
result := chanToArray(seg.Cut(sentence, true))
|
||||
if len(cutResult) != len(result) {
|
||||
t.Error(result)
|
||||
t.Fatal(result)
|
||||
}
|
||||
for i, _ := range result {
|
||||
for i := range result {
|
||||
if result[i] != cutResult[i] {
|
||||
t.Error(result[i])
|
||||
t.Fatal(result[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestUserDict(t *testing.T) {
|
||||
SetDictionary("../dict.txt")
|
||||
jiebago.LoadUserDict("../userdict.txt")
|
||||
seg.LoadUserDictionary("../userdict.txt")
|
||||
defer seg.LoadDictionary("../dict.txt")
|
||||
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
||||
|
||||
cutResult := []WordTag{
|
||||
WordTag{"李小福", "nr"},
|
||||
WordTag{"是", "v"},
|
||||
WordTag{"创新办", "i"},
|
||||
WordTag{"主任", "b"},
|
||||
WordTag{"也", "d"},
|
||||
WordTag{"是", "v"},
|
||||
WordTag{"云计算", "x"},
|
||||
WordTag{"方面", "n"},
|
||||
WordTag{"的", "uj"},
|
||||
WordTag{"专家", "n"},
|
||||
WordTag{";", "x"},
|
||||
WordTag{" ", "x"},
|
||||
WordTag{"什么", "r"},
|
||||
WordTag{"是", "v"},
|
||||
WordTag{"八一双鹿", "nz"},
|
||||
WordTag{"例如", "v"},
|
||||
WordTag{"我", "r"},
|
||||
WordTag{"输入", "v"},
|
||||
WordTag{"一个", "m"},
|
||||
WordTag{"带", "v"},
|
||||
WordTag{"“", "x"},
|
||||
WordTag{"韩玉赏鉴", "nz"},
|
||||
WordTag{"”", "x"},
|
||||
WordTag{"的", "uj"},
|
||||
WordTag{"标题", "n"},
|
||||
WordTag{",", "x"},
|
||||
WordTag{"在", "p"},
|
||||
WordTag{"自定义词", "n"},
|
||||
WordTag{"库中", "nrt"},
|
||||
WordTag{"也", "d"},
|
||||
WordTag{"增加", "v"},
|
||||
WordTag{"了", "ul"},
|
||||
WordTag{"此", "r"},
|
||||
WordTag{"词", "n"},
|
||||
WordTag{"为", "p"},
|
||||
WordTag{"N", "eng"},
|
||||
WordTag{"类型", "n"}}
|
||||
cutResult := []Segment{
|
||||
Segment{"李小福", "nr"},
|
||||
Segment{"是", "v"},
|
||||
Segment{"创新办", "i"},
|
||||
Segment{"主任", "b"},
|
||||
Segment{"也", "d"},
|
||||
Segment{"是", "v"},
|
||||
Segment{"云计算", "x"},
|
||||
Segment{"方面", "n"},
|
||||
Segment{"的", "uj"},
|
||||
Segment{"专家", "n"},
|
||||
Segment{";", "x"},
|
||||
Segment{" ", "x"},
|
||||
Segment{"什么", "r"},
|
||||
Segment{"是", "v"},
|
||||
Segment{"八一双鹿", "nz"},
|
||||
Segment{"例如", "v"},
|
||||
Segment{"我", "r"},
|
||||
Segment{"输入", "v"},
|
||||
Segment{"一个", "m"},
|
||||
Segment{"带", "v"},
|
||||
Segment{"“", "x"},
|
||||
Segment{"韩玉赏鉴", "nz"},
|
||||
Segment{"”", "x"},
|
||||
Segment{"的", "uj"},
|
||||
Segment{"标题", "n"},
|
||||
Segment{",", "x"},
|
||||
Segment{"在", "p"},
|
||||
Segment{"自定义词", "n"},
|
||||
Segment{"库中", "nrt"},
|
||||
Segment{"也", "d"},
|
||||
Segment{"增加", "v"},
|
||||
Segment{"了", "ul"},
|
||||
Segment{"此", "r"},
|
||||
Segment{"词", "n"},
|
||||
Segment{"为", "p"},
|
||||
Segment{"N", "eng"},
|
||||
Segment{"类型", "n"}}
|
||||
|
||||
result := chanToArray(Cut(sentence, true))
|
||||
result := chanToArray(seg.Cut(sentence, true))
|
||||
if len(cutResult) != len(result) {
|
||||
t.Error(result)
|
||||
t.Fatal(result)
|
||||
}
|
||||
for i, _ := range result {
|
||||
for i := range result {
|
||||
if result[i] != cutResult[i] {
|
||||
t.Error(result[i])
|
||||
t.Fatal(result[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkCutNoHMM(b *testing.B) {
|
||||
sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
chanToArray(seg.Cut(sentence, false))
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkCut(b *testing.B) {
|
||||
sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
chanToArray(seg.Cut(sentence, true))
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,264 +1,260 @@
|
||||
package posseg
|
||||
|
||||
var (
|
||||
probStart = make(map[stateTag]float64)
|
||||
)
|
||||
|
||||
func init() {
|
||||
probStart[stateTag{'B', "a"}] = -4.762305214596967
|
||||
probStart[stateTag{'B', "ad"}] = -6.680066036784177
|
||||
probStart[stateTag{'B', "ag"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "an"}] = -8.697083223018778
|
||||
probStart[stateTag{'B', "b"}] = -5.018374362109218
|
||||
probStart[stateTag{'B', "bg"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "c"}] = -3.423880184954888
|
||||
probStart[stateTag{'B', "d"}] = -3.9750475297585357
|
||||
probStart[stateTag{'B', "df"}] = -8.888974230828882
|
||||
probStart[stateTag{'B', "dg"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "e"}] = -8.563551830394255
|
||||
probStart[stateTag{'B', "en"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "f"}] = -5.491630418482717
|
||||
probStart[stateTag{'B', "g"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "h"}] = -13.533365129970255
|
||||
probStart[stateTag{'B', "i"}] = -6.1157847275557105
|
||||
probStart[stateTag{'B', "in"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "j"}] = -5.0576191284681915
|
||||
probStart[stateTag{'B', "jn"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "k"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "l"}] = -4.905883584659895
|
||||
probStart[stateTag{'B', "ln"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "m"}] = -3.6524299819046386
|
||||
probStart[stateTag{'B', "mg"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "mq"}] = -6.78695300139688
|
||||
probStart[stateTag{'B', "n"}] = -1.6966257797548328
|
||||
probStart[stateTag{'B', "ng"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "nr"}] = -2.2310495913769506
|
||||
probStart[stateTag{'B', "nrfg"}] = -5.873722175405573
|
||||
probStart[stateTag{'B', "nrt"}] = -4.985642733519195
|
||||
probStart[stateTag{'B', "ns"}] = -2.8228438314969213
|
||||
probStart[stateTag{'B', "nt"}] = -4.846091668182416
|
||||
probStart[stateTag{'B', "nz"}] = -3.94698846057672
|
||||
probStart[stateTag{'B', "o"}] = -8.433498702146057
|
||||
probStart[stateTag{'B', "p"}] = -4.200984132085048
|
||||
probStart[stateTag{'B', "q"}] = -6.998123858956596
|
||||
probStart[stateTag{'B', "qe"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "qg"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "r"}] = -3.4098187790818413
|
||||
probStart[stateTag{'B', "rg"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "rr"}] = -12.434752841302146
|
||||
probStart[stateTag{'B', "rz"}] = -7.946116471570005
|
||||
probStart[stateTag{'B', "s"}] = -5.522673590839954
|
||||
probStart[stateTag{'B', "t"}] = -3.3647479094528574
|
||||
probStart[stateTag{'B', "tg"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "u"}] = -9.163917277503234
|
||||
probStart[stateTag{'B', "ud"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "ug"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "uj"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "ul"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "uv"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "uz"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "v"}] = -2.6740584874265685
|
||||
probStart[stateTag{'B', "vd"}] = -9.044728760238115
|
||||
probStart[stateTag{'B', "vg"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "vi"}] = -12.434752841302146
|
||||
probStart[stateTag{'B', "vn"}] = -4.3315610890163585
|
||||
probStart[stateTag{'B', "vq"}] = -12.147070768850364
|
||||
probStart[stateTag{'B', "w"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "x"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "y"}] = -9.844485675856319
|
||||
probStart[stateTag{'B', "yg"}] = -3.14e+100
|
||||
probStart[stateTag{'B', "z"}] = -7.045681111485645
|
||||
probStart[stateTag{'B', "zg"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "a"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "ad"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "ag"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "an"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "b"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "bg"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "c"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "d"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "df"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "dg"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "e"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "en"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "f"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "g"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "h"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "i"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "in"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "j"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "jn"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "k"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "l"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "ln"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "m"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "mg"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "mq"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "n"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "ng"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "nr"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "nrfg"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "nrt"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "ns"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "nt"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "nz"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "o"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "p"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "q"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "qe"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "qg"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "r"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "rg"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "rr"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "rz"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "s"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "t"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "tg"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "u"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "ud"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "ug"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "uj"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "ul"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "uv"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "uz"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "v"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "vd"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "vg"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "vi"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "vn"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "vq"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "w"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "x"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "y"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "yg"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "z"}] = -3.14e+100
|
||||
probStart[stateTag{'E', "zg"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "a"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "ad"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "ag"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "an"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "b"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "bg"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "c"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "d"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "df"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "dg"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "e"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "en"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "f"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "g"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "h"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "i"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "in"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "j"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "jn"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "k"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "l"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "ln"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "m"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "mg"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "mq"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "n"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "ng"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "nr"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "nrfg"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "nrt"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "ns"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "nt"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "nz"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "o"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "p"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "q"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "qe"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "qg"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "r"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "rg"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "rr"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "rz"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "s"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "t"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "tg"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "u"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "ud"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "ug"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "uj"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "ul"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "uv"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "uz"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "v"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "vd"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "vg"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "vi"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "vn"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "vq"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "w"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "x"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "y"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "yg"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "z"}] = -3.14e+100
|
||||
probStart[stateTag{'M', "zg"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "a"}] = -3.9025396831295227
|
||||
probStart[stateTag{'S', "ad"}] = -11.048458480182255
|
||||
probStart[stateTag{'S', "ag"}] = -6.954113917960154
|
||||
probStart[stateTag{'S', "an"}] = -12.84021794941031
|
||||
probStart[stateTag{'S', "b"}] = -6.472888763970454
|
||||
probStart[stateTag{'S', "bg"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "c"}] = -4.786966795861212
|
||||
probStart[stateTag{'S', "d"}] = -3.903919764181873
|
||||
probStart[stateTag{'S', "df"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "dg"}] = -8.948397651299683
|
||||
probStart[stateTag{'S', "e"}] = -5.942513006281674
|
||||
probStart[stateTag{'S', "en"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "f"}] = -5.194820249981676
|
||||
probStart[stateTag{'S', "g"}] = -6.507826815331734
|
||||
probStart[stateTag{'S', "h"}] = -8.650563207383884
|
||||
probStart[stateTag{'S', "i"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "in"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "j"}] = -4.911992119644354
|
||||
probStart[stateTag{'S', "jn"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "k"}] = -6.940320595827818
|
||||
probStart[stateTag{'S', "l"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "ln"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "m"}] = -3.269200652116097
|
||||
probStart[stateTag{'S', "mg"}] = -10.825314928868044
|
||||
probStart[stateTag{'S', "mq"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "n"}] = -3.8551483897645107
|
||||
probStart[stateTag{'S', "ng"}] = -4.913434861102905
|
||||
probStart[stateTag{'S', "nr"}] = -4.483663103956885
|
||||
probStart[stateTag{'S', "nrfg"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "nrt"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "ns"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "nt"}] = -12.147070768850364
|
||||
probStart[stateTag{'S', "nz"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "o"}] = -8.464460927750023
|
||||
probStart[stateTag{'S', "p"}] = -2.9868401813596317
|
||||
probStart[stateTag{'S', "q"}] = -4.888658618255058
|
||||
probStart[stateTag{'S', "qe"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "qg"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "r"}] = -2.7635336784127853
|
||||
probStart[stateTag{'S', "rg"}] = -10.275268591948773
|
||||
probStart[stateTag{'S', "rr"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "rz"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "s"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "t"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "tg"}] = -6.272842531880403
|
||||
probStart[stateTag{'S', "u"}] = -6.940320595827818
|
||||
probStart[stateTag{'S', "ud"}] = -7.728230161053767
|
||||
probStart[stateTag{'S', "ug"}] = -7.5394037026636855
|
||||
probStart[stateTag{'S', "uj"}] = -6.85251045118004
|
||||
probStart[stateTag{'S', "ul"}] = -8.4153713175535
|
||||
probStart[stateTag{'S', "uv"}] = -8.15808672228609
|
||||
probStart[stateTag{'S', "uz"}] = -9.299258625372996
|
||||
probStart[stateTag{'S', "v"}] = -3.053292303412302
|
||||
probStart[stateTag{'S', "vd"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "vg"}] = -5.9430181843676895
|
||||
probStart[stateTag{'S', "vi"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "vn"}] = -11.453923588290419
|
||||
probStart[stateTag{'S', "vq"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "w"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "x"}] = -8.427419656069674
|
||||
probStart[stateTag{'S', "y"}] = -6.1970794699489575
|
||||
probStart[stateTag{'S', "yg"}] = -13.533365129970255
|
||||
probStart[stateTag{'S', "z"}] = -3.14e+100
|
||||
probStart[stateTag{'S', "zg"}] = -3.14e+100
|
||||
var probStart = map[uint16]float64{
|
||||
100: -4.762305214596967,
|
||||
101: -6.680066036784177,
|
||||
102: -3.14e+100,
|
||||
103: -8.697083223018778,
|
||||
104: -5.018374362109218,
|
||||
105: -3.14e+100,
|
||||
106: -3.423880184954888,
|
||||
107: -3.9750475297585357,
|
||||
108: -8.888974230828882,
|
||||
109: -3.14e+100,
|
||||
110: -8.563551830394255,
|
||||
111: -3.14e+100,
|
||||
112: -5.491630418482717,
|
||||
113: -3.14e+100,
|
||||
114: -13.533365129970255,
|
||||
115: -6.1157847275557105,
|
||||
116: -3.14e+100,
|
||||
117: -5.0576191284681915,
|
||||
118: -3.14e+100,
|
||||
119: -3.14e+100,
|
||||
120: -4.905883584659895,
|
||||
121: -3.14e+100,
|
||||
122: -3.6524299819046386,
|
||||
123: -3.14e+100,
|
||||
124: -6.78695300139688,
|
||||
125: -1.6966257797548328,
|
||||
126: -3.14e+100,
|
||||
127: -2.2310495913769506,
|
||||
128: -5.873722175405573,
|
||||
129: -4.985642733519195,
|
||||
130: -2.8228438314969213,
|
||||
131: -4.846091668182416,
|
||||
132: -3.94698846057672,
|
||||
133: -8.433498702146057,
|
||||
134: -4.200984132085048,
|
||||
135: -6.998123858956596,
|
||||
136: -3.14e+100,
|
||||
137: -3.14e+100,
|
||||
138: -3.4098187790818413,
|
||||
139: -3.14e+100,
|
||||
140: -12.434752841302146,
|
||||
141: -7.946116471570005,
|
||||
142: -5.522673590839954,
|
||||
143: -3.3647479094528574,
|
||||
144: -3.14e+100,
|
||||
145: -9.163917277503234,
|
||||
146: -3.14e+100,
|
||||
147: -3.14e+100,
|
||||
148: -3.14e+100,
|
||||
149: -3.14e+100,
|
||||
150: -3.14e+100,
|
||||
151: -3.14e+100,
|
||||
152: -2.6740584874265685,
|
||||
153: -9.044728760238115,
|
||||
154: -3.14e+100,
|
||||
155: -12.434752841302146,
|
||||
156: -4.3315610890163585,
|
||||
157: -12.147070768850364,
|
||||
158: -3.14e+100,
|
||||
159: -3.14e+100,
|
||||
160: -9.844485675856319,
|
||||
161: -3.14e+100,
|
||||
162: -7.045681111485645,
|
||||
163: -3.14e+100,
|
||||
200: -3.14e+100,
|
||||
201: -3.14e+100,
|
||||
202: -3.14e+100,
|
||||
203: -3.14e+100,
|
||||
204: -3.14e+100,
|
||||
205: -3.14e+100,
|
||||
206: -3.14e+100,
|
||||
207: -3.14e+100,
|
||||
208: -3.14e+100,
|
||||
209: -3.14e+100,
|
||||
210: -3.14e+100,
|
||||
211: -3.14e+100,
|
||||
212: -3.14e+100,
|
||||
213: -3.14e+100,
|
||||
214: -3.14e+100,
|
||||
215: -3.14e+100,
|
||||
216: -3.14e+100,
|
||||
217: -3.14e+100,
|
||||
218: -3.14e+100,
|
||||
219: -3.14e+100,
|
||||
220: -3.14e+100,
|
||||
221: -3.14e+100,
|
||||
222: -3.14e+100,
|
||||
223: -3.14e+100,
|
||||
224: -3.14e+100,
|
||||
225: -3.14e+100,
|
||||
226: -3.14e+100,
|
||||
227: -3.14e+100,
|
||||
228: -3.14e+100,
|
||||
229: -3.14e+100,
|
||||
230: -3.14e+100,
|
||||
231: -3.14e+100,
|
||||
232: -3.14e+100,
|
||||
233: -3.14e+100,
|
||||
234: -3.14e+100,
|
||||
235: -3.14e+100,
|
||||
236: -3.14e+100,
|
||||
237: -3.14e+100,
|
||||
238: -3.14e+100,
|
||||
239: -3.14e+100,
|
||||
240: -3.14e+100,
|
||||
241: -3.14e+100,
|
||||
242: -3.14e+100,
|
||||
243: -3.14e+100,
|
||||
244: -3.14e+100,
|
||||
245: -3.14e+100,
|
||||
246: -3.14e+100,
|
||||
247: -3.14e+100,
|
||||
248: -3.14e+100,
|
||||
249: -3.14e+100,
|
||||
250: -3.14e+100,
|
||||
251: -3.14e+100,
|
||||
252: -3.14e+100,
|
||||
253: -3.14e+100,
|
||||
254: -3.14e+100,
|
||||
255: -3.14e+100,
|
||||
256: -3.14e+100,
|
||||
257: -3.14e+100,
|
||||
258: -3.14e+100,
|
||||
259: -3.14e+100,
|
||||
260: -3.14e+100,
|
||||
261: -3.14e+100,
|
||||
262: -3.14e+100,
|
||||
263: -3.14e+100,
|
||||
300: -3.14e+100,
|
||||
301: -3.14e+100,
|
||||
302: -3.14e+100,
|
||||
303: -3.14e+100,
|
||||
304: -3.14e+100,
|
||||
305: -3.14e+100,
|
||||
306: -3.14e+100,
|
||||
307: -3.14e+100,
|
||||
308: -3.14e+100,
|
||||
309: -3.14e+100,
|
||||
310: -3.14e+100,
|
||||
311: -3.14e+100,
|
||||
312: -3.14e+100,
|
||||
313: -3.14e+100,
|
||||
314: -3.14e+100,
|
||||
315: -3.14e+100,
|
||||
316: -3.14e+100,
|
||||
317: -3.14e+100,
|
||||
318: -3.14e+100,
|
||||
319: -3.14e+100,
|
||||
320: -3.14e+100,
|
||||
321: -3.14e+100,
|
||||
322: -3.14e+100,
|
||||
323: -3.14e+100,
|
||||
324: -3.14e+100,
|
||||
325: -3.14e+100,
|
||||
326: -3.14e+100,
|
||||
327: -3.14e+100,
|
||||
328: -3.14e+100,
|
||||
329: -3.14e+100,
|
||||
330: -3.14e+100,
|
||||
331: -3.14e+100,
|
||||
332: -3.14e+100,
|
||||
333: -3.14e+100,
|
||||
334: -3.14e+100,
|
||||
335: -3.14e+100,
|
||||
336: -3.14e+100,
|
||||
337: -3.14e+100,
|
||||
338: -3.14e+100,
|
||||
339: -3.14e+100,
|
||||
340: -3.14e+100,
|
||||
341: -3.14e+100,
|
||||
342: -3.14e+100,
|
||||
343: -3.14e+100,
|
||||
344: -3.14e+100,
|
||||
345: -3.14e+100,
|
||||
346: -3.14e+100,
|
||||
347: -3.14e+100,
|
||||
348: -3.14e+100,
|
||||
349: -3.14e+100,
|
||||
350: -3.14e+100,
|
||||
351: -3.14e+100,
|
||||
352: -3.14e+100,
|
||||
353: -3.14e+100,
|
||||
354: -3.14e+100,
|
||||
355: -3.14e+100,
|
||||
356: -3.14e+100,
|
||||
357: -3.14e+100,
|
||||
358: -3.14e+100,
|
||||
359: -3.14e+100,
|
||||
360: -3.14e+100,
|
||||
361: -3.14e+100,
|
||||
362: -3.14e+100,
|
||||
363: -3.14e+100,
|
||||
400: -3.9025396831295227,
|
||||
401: -11.048458480182255,
|
||||
402: -6.954113917960154,
|
||||
403: -12.84021794941031,
|
||||
404: -6.472888763970454,
|
||||
405: -3.14e+100,
|
||||
406: -4.786966795861212,
|
||||
407: -3.903919764181873,
|
||||
408: -3.14e+100,
|
||||
409: -8.948397651299683,
|
||||
410: -5.942513006281674,
|
||||
411: -3.14e+100,
|
||||
412: -5.194820249981676,
|
||||
413: -6.507826815331734,
|
||||
414: -8.650563207383884,
|
||||
415: -3.14e+100,
|
||||
416: -3.14e+100,
|
||||
417: -4.911992119644354,
|
||||
418: -3.14e+100,
|
||||
419: -6.940320595827818,
|
||||
420: -3.14e+100,
|
||||
421: -3.14e+100,
|
||||
422: -3.269200652116097,
|
||||
423: -10.825314928868044,
|
||||
424: -3.14e+100,
|
||||
425: -3.8551483897645107,
|
||||
426: -4.913434861102905,
|
||||
427: -4.483663103956885,
|
||||
428: -3.14e+100,
|
||||
429: -3.14e+100,
|
||||
430: -3.14e+100,
|
||||
431: -12.147070768850364,
|
||||
432: -3.14e+100,
|
||||
433: -8.464460927750023,
|
||||
434: -2.9868401813596317,
|
||||
435: -4.888658618255058,
|
||||
436: -3.14e+100,
|
||||
437: -3.14e+100,
|
||||
438: -2.7635336784127853,
|
||||
439: -10.275268591948773,
|
||||
440: -3.14e+100,
|
||||
441: -3.14e+100,
|
||||
442: -3.14e+100,
|
||||
443: -3.14e+100,
|
||||
444: -6.272842531880403,
|
||||
445: -6.940320595827818,
|
||||
446: -7.728230161053767,
|
||||
447: -7.5394037026636855,
|
||||
448: -6.85251045118004,
|
||||
449: -8.4153713175535,
|
||||
450: -8.15808672228609,
|
||||
451: -9.299258625372996,
|
||||
452: -3.053292303412302,
|
||||
453: -3.14e+100,
|
||||
454: -5.9430181843676895,
|
||||
455: -3.14e+100,
|
||||
456: -11.453923588290419,
|
||||
457: -3.14e+100,
|
||||
458: -3.14e+100,
|
||||
459: -8.427419656069674,
|
||||
460: -6.1970794699489575,
|
||||
461: -13.533365129970255,
|
||||
462: -3.14e+100,
|
||||
463: -3.14e+100,
|
||||
}
|
||||
|
||||
5750
posseg/prob_trans.go
5750
posseg/prob_trans.go
File diff suppressed because it is too large
Load Diff
@@ -5,26 +5,13 @@ import (
|
||||
"sort"
|
||||
)
|
||||
|
||||
type stateTag struct {
|
||||
State byte
|
||||
Tag string
|
||||
}
|
||||
|
||||
func (st stateTag) String() string {
|
||||
return fmt.Sprintf("(%q, %s)", st.State, st.Tag)
|
||||
}
|
||||
|
||||
func emptyStateTag() stateTag {
|
||||
return stateTag{' ', ""}
|
||||
}
|
||||
|
||||
type probState struct {
|
||||
Prob float64
|
||||
ST stateTag
|
||||
prob float64
|
||||
state uint16
|
||||
}
|
||||
|
||||
func (ps probState) String() string {
|
||||
return fmt.Sprintf("(%v: %f)", ps.ST, ps.Prob)
|
||||
return fmt.Sprintf("(%v: %f)", ps.state, ps.prob)
|
||||
}
|
||||
|
||||
type probStates []probState
|
||||
@@ -34,94 +21,87 @@ func (pss probStates) Len() int {
|
||||
}
|
||||
|
||||
func (pss probStates) Less(i, j int) bool {
|
||||
if pss[i].Prob == pss[j].Prob {
|
||||
if pss[i].ST.State == pss[j].ST.State {
|
||||
return pss[i].ST.Tag < pss[j].ST.Tag
|
||||
}
|
||||
return pss[i].ST.State < pss[j].ST.State
|
||||
if pss[i].prob == pss[j].prob {
|
||||
return pss[i].state < pss[j].state
|
||||
}
|
||||
return pss[i].Prob < pss[j].Prob
|
||||
return pss[i].prob < pss[j].prob
|
||||
}
|
||||
|
||||
func (pss probStates) Swap(i, j int) {
|
||||
pss[i], pss[j] = pss[j], pss[i]
|
||||
}
|
||||
|
||||
func viterbi(obs []rune) (float64, []stateTag) {
|
||||
func viterbi(obs []rune) []tag {
|
||||
obsLength := len(obs)
|
||||
V := make([]map[stateTag]float64, obsLength)
|
||||
V[0] = make(map[stateTag]float64)
|
||||
mem_path := make([]map[stateTag]stateTag, obsLength)
|
||||
mem_path[0] = make(map[stateTag]stateTag)
|
||||
V := make([]map[uint16]float64, obsLength)
|
||||
V[0] = make(map[uint16]float64)
|
||||
memPath := make([]map[uint16]uint16, obsLength)
|
||||
memPath[0] = make(map[uint16]uint16)
|
||||
ys := charStateTab.get(obs[0]) // default is all_states
|
||||
for _, y := range ys {
|
||||
V[0][y] = probEmit[y].get(obs[0]) + probStart[y]
|
||||
mem_path[0][y] = emptyStateTag()
|
||||
memPath[0][y] = 0
|
||||
}
|
||||
for t := 1; t < obsLength; t++ {
|
||||
prev_states := make([]stateTag, 0)
|
||||
for x, _ := range mem_path[t-1] {
|
||||
var prevStates []uint16
|
||||
for x := range memPath[t-1] {
|
||||
if len(probTrans[x]) > 0 {
|
||||
prev_states = append(prev_states, x)
|
||||
prevStates = append(prevStates, x)
|
||||
}
|
||||
}
|
||||
//use Go's map to implement Python's Set()
|
||||
prev_states_expect_next := make(map[stateTag]stateTag)
|
||||
for _, x := range prev_states {
|
||||
for y, _ := range probTrans[x] {
|
||||
prev_states_expect_next[y] = y
|
||||
prevStatesExpectNext := make(map[uint16]int)
|
||||
for _, x := range prevStates {
|
||||
for y := range probTrans[x] {
|
||||
prevStatesExpectNext[y] = 1
|
||||
}
|
||||
}
|
||||
tmp_obs_states := charStateTab.get(obs[t])
|
||||
tmpObsStates := charStateTab.get(obs[t])
|
||||
|
||||
obs_states := make([]stateTag, 0)
|
||||
for index, _ := range tmp_obs_states {
|
||||
if _, ok := prev_states_expect_next[tmp_obs_states[index]]; ok {
|
||||
obs_states = append(obs_states, tmp_obs_states[index])
|
||||
var obsStates []uint16
|
||||
for index := range tmpObsStates {
|
||||
if _, ok := prevStatesExpectNext[tmpObsStates[index]]; ok {
|
||||
obsStates = append(obsStates, tmpObsStates[index])
|
||||
}
|
||||
}
|
||||
if len(obs_states) == 0 {
|
||||
for key := range prev_states_expect_next {
|
||||
obs_states = append(obs_states, key)
|
||||
if len(obsStates) == 0 {
|
||||
for key := range prevStatesExpectNext {
|
||||
obsStates = append(obsStates, key)
|
||||
}
|
||||
}
|
||||
if len(obs_states) == 0 {
|
||||
obs_states = probTransKeys
|
||||
if len(obsStates) == 0 {
|
||||
obsStates = probTransKeys
|
||||
}
|
||||
mem_path[t] = make(map[stateTag]stateTag) // TODO: value needed or not?
|
||||
V[t] = make(map[stateTag]float64)
|
||||
for _, y := range obs_states {
|
||||
pss := make(probStates, 0)
|
||||
for _, y0 := range prev_states {
|
||||
ps := probState{
|
||||
Prob: V[t-1][y0] + probTrans[y0].Get(y) + probEmit[y].get(obs[t]),
|
||||
ST: y0}
|
||||
pss = append(pss, ps)
|
||||
memPath[t] = make(map[uint16]uint16)
|
||||
V[t] = make(map[uint16]float64)
|
||||
for _, y := range obsStates {
|
||||
var max, ps probState
|
||||
for i, y0 := range prevStates {
|
||||
ps = probState{
|
||||
prob: V[t-1][y0] + probTrans[y0].Get(y) + probEmit[y].get(obs[t]),
|
||||
state: y0}
|
||||
if i == 0 || ps.prob > max.prob || (ps.prob == max.prob && ps.state > max.state) {
|
||||
max = ps
|
||||
}
|
||||
}
|
||||
sort.Sort(sort.Reverse(pss))
|
||||
V[t][y] = pss[0].Prob
|
||||
mem_path[t][y] = pss[0].ST
|
||||
V[t][y] = max.prob
|
||||
memPath[t][y] = max.state
|
||||
}
|
||||
}
|
||||
last := make(probStates, 0)
|
||||
length := len(mem_path)
|
||||
length := len(memPath)
|
||||
vlength := len(V)
|
||||
for y, _ := range mem_path[length-1] {
|
||||
ps := probState{Prob: V[vlength-1][y], ST: y}
|
||||
for y := range memPath[length-1] {
|
||||
ps := probState{prob: V[vlength-1][y], state: y}
|
||||
last = append(last, ps)
|
||||
}
|
||||
sort.Sort(sort.Reverse(last))
|
||||
prob := last[0].Prob
|
||||
state := last[0].ST
|
||||
route := make([]stateTag, len(obs))
|
||||
i := obsLength - 1
|
||||
for {
|
||||
if i < 0 {
|
||||
break
|
||||
}
|
||||
route[i] = state
|
||||
state = mem_path[i][state]
|
||||
i -= 1
|
||||
state := last[0].state
|
||||
route := make([]tag, len(obs))
|
||||
|
||||
for i := obsLength - 1; i >= 0; i-- {
|
||||
route[i] = tag(state)
|
||||
state = memPath[i][state]
|
||||
}
|
||||
return prob, route
|
||||
return route
|
||||
}
|
||||
|
||||
@@ -4,42 +4,68 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
var (
|
||||
route1 = []stateTag{
|
||||
stateTag{'B', "nr"},
|
||||
stateTag{'M', "nr"},
|
||||
stateTag{'E', "nr"},
|
||||
stateTag{'S', "v"},
|
||||
stateTag{'B', "v"},
|
||||
stateTag{'E', "v"},
|
||||
stateTag{'B', "n"},
|
||||
stateTag{'M', "n"},
|
||||
stateTag{'E', "n"},
|
||||
stateTag{'S', "d"},
|
||||
stateTag{'S', "v"},
|
||||
stateTag{'S', "n"},
|
||||
stateTag{'B', "v"},
|
||||
stateTag{'E', "v"},
|
||||
stateTag{'B', "nr"},
|
||||
stateTag{'M', "nr"},
|
||||
stateTag{'M', "nr"},
|
||||
stateTag{'M', "nr"},
|
||||
stateTag{'E', "nr"},
|
||||
stateTag{'S', "zg"}}
|
||||
)
|
||||
var defaultRoute []tag
|
||||
|
||||
func init() {
|
||||
var t tag
|
||||
t, _ = newTag("B", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = newTag("M", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = newTag("E", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = newTag("S", "v")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = newTag("B", "v")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = newTag("E", "v")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = newTag("B", "n")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = newTag("M", "n")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = newTag("E", "n")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = newTag("S", "d")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = newTag("S", "v")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = newTag("S", "n")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = newTag("B", "v")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = newTag("E", "v")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = newTag("B", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = newTag("M", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = newTag("M", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = newTag("M", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = newTag("E", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = newTag("S", "zg")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
}
|
||||
|
||||
func TestViterbi(t *testing.T) {
|
||||
ss := "李小福是创新办主任也是云计算方面的专家;"
|
||||
prob, route := viterbi([]rune(ss))
|
||||
if prob != MinFloat {
|
||||
t.Error(prob)
|
||||
route := viterbi([]rune(ss))
|
||||
if len(route) != len(defaultRoute) {
|
||||
t.Fatal(len(route))
|
||||
}
|
||||
if len(route) != len(route1) {
|
||||
t.Error(len(route))
|
||||
}
|
||||
for index, _ := range route {
|
||||
if route[index] != route1[index] {
|
||||
t.Error(route[index])
|
||||
for index := range route {
|
||||
if route[index] != defaultRoute[index] {
|
||||
t.Fatal(route[index])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkViterbi(b *testing.B) {
|
||||
ss := "李小福是创新办主任也是云计算方面的专家;"
|
||||
for i := 0; i < b.N; i++ {
|
||||
viterbi([]rune(ss))
|
||||
}
|
||||
}
|
||||
|
||||
145
tokenizer.go
Normal file
145
tokenizer.go
Normal file
@@ -0,0 +1,145 @@
|
||||
package jiebago
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strconv"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
// Name is the jieba tokenizer name.
|
||||
const Name = "jieba"
|
||||
|
||||
var ideographRegexp = regexp.MustCompile(`\p{Han}+`)
|
||||
|
||||
// JiebaTokenizer is the bleve tokenizer for jiebago.
|
||||
type JiebaTokenizer struct {
|
||||
seg Segmenter
|
||||
hmm, searchMode bool
|
||||
}
|
||||
|
||||
/*
|
||||
NewJiebaTokenizer creates a new JiebaTokenizer.
|
||||
|
||||
Parameters:
|
||||
|
||||
dictFilePath: path of the dictioanry file.
|
||||
|
||||
hmm: whether to use Hidden Markov Model to cut unknown words,
|
||||
i.e. not found in dictionary. For example word "安卓" (means "Android" in
|
||||
English) not in the dictionary file. If hmm is set to false, it will be
|
||||
cutted into two single words "安" and "卓", if hmm is set to true, it will
|
||||
be traded as one single word because Jieba using Hidden Markov Model with
|
||||
Viterbi algorithm to guess the best possibility.
|
||||
|
||||
searchMode: whether to further cut long words into serveral short words.
|
||||
In Chinese, some long words may contains other words, for example "交换机"
|
||||
is a Chinese word for "Switcher", if sechMode is false, it will trade
|
||||
"交换机" as a single word. If searchMode is true, it will further split
|
||||
this word into "交换", "换机", which are valid Chinese words.
|
||||
*/
|
||||
func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||
var seg Segmenter
|
||||
err := seg.LoadDictionary(dictFilePath)
|
||||
return &JiebaTokenizer{
|
||||
seg: seg,
|
||||
hmm: hmm,
|
||||
searchMode: searchMode,
|
||||
}, err
|
||||
}
|
||||
|
||||
// Tokenize cuts input into bleve token stream.
|
||||
func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||
rv := make(analysis.TokenStream, 0)
|
||||
runeStart := 0
|
||||
start := 0
|
||||
end := 0
|
||||
pos := 1
|
||||
var width int
|
||||
var gram string
|
||||
for word := range jt.seg.Cut(string(input), jt.hmm) {
|
||||
if jt.searchMode {
|
||||
runes := []rune(word)
|
||||
width = len(runes)
|
||||
for _, step := range [2]int{2, 3} {
|
||||
if width > step {
|
||||
for i := 0; i < width-step+1; i++ {
|
||||
gram = string(runes[i : i+step])
|
||||
gramLen := len(gram)
|
||||
if value, ok := jt.seg.dict.Frequency(gram); ok && value > 0 {
|
||||
gramStart := start + len(string(runes[:i]))
|
||||
token := analysis.Token{
|
||||
Term: []byte(gram),
|
||||
Start: gramStart,
|
||||
End: gramStart + gramLen,
|
||||
Position: pos,
|
||||
Type: detectTokenType(gram),
|
||||
}
|
||||
rv = append(rv, &token)
|
||||
pos++
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
end = start + len(word)
|
||||
token := analysis.Token{
|
||||
Term: []byte(word),
|
||||
Start: start,
|
||||
End: end,
|
||||
Position: pos,
|
||||
Type: detectTokenType(word),
|
||||
}
|
||||
rv = append(rv, &token)
|
||||
pos++
|
||||
runeStart += width
|
||||
start = end
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
/*
|
||||
JiebaTokenizerConstructor creates a JiebaTokenizer.
|
||||
|
||||
Parameter config should contains at least one parameter:
|
||||
|
||||
file: the path of the dictionary file.
|
||||
|
||||
hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
|
||||
|
||||
search: optional, speficy whether to use search mode, see NewJiebaTokenizer for details.
|
||||
*/
|
||||
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
|
||||
analysis.Tokenizer, error) {
|
||||
dictFilePath, ok := config["file"].(string)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify dictionary file path")
|
||||
}
|
||||
hmm, ok := config["hmm"].(bool)
|
||||
if !ok {
|
||||
hmm = true
|
||||
}
|
||||
searchMode, ok := config["search"].(bool)
|
||||
if !ok {
|
||||
searchMode = true
|
||||
}
|
||||
|
||||
return NewJiebaTokenizer(dictFilePath, hmm, searchMode)
|
||||
}
|
||||
|
||||
func detectTokenType(term string) analysis.TokenType {
|
||||
if ideographRegexp.MatchString(term) {
|
||||
return analysis.Ideographic
|
||||
}
|
||||
_, err := strconv.ParseFloat(term, 64)
|
||||
if err == nil {
|
||||
return analysis.Numeric
|
||||
}
|
||||
return analysis.AlphaNumeric
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenizer(Name, JiebaTokenizerConstructor)
|
||||
}
|
||||
@@ -1,9 +1,10 @@
|
||||
package tokenizers
|
||||
package jiebago
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
)
|
||||
|
||||
func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) {
|
||||
@@ -5218,7 +5219,7 @@ func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, false)
|
||||
tokenizer, _ := NewJiebaTokenizer("dict.txt", true, false)
|
||||
for _, test := range tests {
|
||||
actual := tokenizer.Tokenize(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
@@ -11056,7 +11057,7 @@ func TestJiebaTokenizerSearchModeWithHMM(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, true)
|
||||
tokenizer, _ := NewJiebaTokenizer("dict.txt", true, true)
|
||||
for _, test := range tests {
|
||||
actual := tokenizer.Tokenize(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
@@ -16473,7 +16474,7 @@ func TestJiebaTokenizerDefaultModeWithoutHMM(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, false)
|
||||
tokenizer, _ := NewJiebaTokenizer("dict.txt", false, false)
|
||||
for _, test := range tests {
|
||||
actual := tokenizer.Tokenize(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
@@ -22505,11 +22506,11 @@ func TestJiebaTokenizerSearchModeWithoutHMM(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, true)
|
||||
tokenizer, _ := NewJiebaTokenizer("dict.txt", false, true)
|
||||
for _, test := range tests {
|
||||
actual := tokenizer.Tokenize(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
|
||||
t.Fatalf("Expected %v, got %v for %s", test.output, actual, string(test.input))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,110 +0,0 @@
|
||||
package tokenizers
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
"github.com/wangbin/jiebago"
|
||||
"regexp"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
const Name = "jieba"
|
||||
|
||||
var IdeographRegexp = regexp.MustCompile(`\p{Han}+`)
|
||||
|
||||
type JiebaTokenizer struct {
|
||||
dictFileName string
|
||||
hmm, searchMode bool
|
||||
}
|
||||
|
||||
func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||
err := jiebago.SetDictionary(dictFileName)
|
||||
return &JiebaTokenizer{
|
||||
dictFileName: dictFileName,
|
||||
hmm: hmm,
|
||||
searchMode: searchMode,
|
||||
}, err
|
||||
}
|
||||
|
||||
func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||
rv := make(analysis.TokenStream, 0)
|
||||
runeStart := 0
|
||||
start := 0
|
||||
end := 0
|
||||
pos := 1
|
||||
var width int
|
||||
var gram string
|
||||
for word := range jiebago.Cut(string(input), false, jt.hmm) {
|
||||
if jt.searchMode {
|
||||
runes := []rune(word)
|
||||
width = len(runes)
|
||||
for _, step := range [2]int{2, 3} {
|
||||
if width > step {
|
||||
for i := 0; i < width-step+1; i++ {
|
||||
gram = string(runes[i : i+step])
|
||||
gramLen := len(gram)
|
||||
if value, ok := jiebago.Trie.Freq[gram]; ok && value > 0 {
|
||||
gramStart := start + len(string(runes[:i]))
|
||||
token := analysis.Token{
|
||||
Term: []byte(gram),
|
||||
Start: gramStart,
|
||||
End: gramStart + gramLen,
|
||||
Position: pos,
|
||||
Type: detectTokenType(gram),
|
||||
}
|
||||
rv = append(rv, &token)
|
||||
pos++
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
end = start + len(word)
|
||||
token := analysis.Token{
|
||||
Term: []byte(word),
|
||||
Start: start,
|
||||
End: end,
|
||||
Position: pos,
|
||||
Type: detectTokenType(word),
|
||||
}
|
||||
rv = append(rv, &token)
|
||||
pos++
|
||||
runeStart += width
|
||||
start = end
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
|
||||
analysis.Tokenizer, error) {
|
||||
dictFileName, ok := config["file"].(string)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify dictionary file path")
|
||||
}
|
||||
hmm, ok := config["hmm"].(bool)
|
||||
if !ok {
|
||||
hmm = true
|
||||
}
|
||||
searchMode, ok := config["search"].(bool)
|
||||
if !ok {
|
||||
searchMode = true
|
||||
}
|
||||
|
||||
return NewJiebaTokenizer(dictFileName, hmm, searchMode)
|
||||
}
|
||||
|
||||
func detectTokenType(term string) analysis.TokenType {
|
||||
if IdeographRegexp.MatchString(term) {
|
||||
return analysis.Ideographic
|
||||
}
|
||||
_, err := strconv.ParseFloat(term, 64)
|
||||
if err == nil {
|
||||
return analysis.Numeric
|
||||
}
|
||||
return analysis.AlphaNumeric
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenizer(Name, JiebaTokenizerConstructor)
|
||||
}
|
||||
126
trie.go
126
trie.go
@@ -1,126 +0,0 @@
|
||||
package jiebago
|
||||
|
||||
import (
|
||||
"crypto/md5"
|
||||
"encoding/gob"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Trie stores the total frequency and a map of all words and their frequencies.
|
||||
var Trie *trie
|
||||
|
||||
type trie struct {
|
||||
Total float64
|
||||
Freq map[string]float64
|
||||
}
|
||||
|
||||
func (t *trie) load(dictFileName string) error {
|
||||
dictFilePath, err := DictPath(dictFileName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
dictFileInfo, err := os.Stat(dictFilePath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
log.Printf("Building Trie..., from %s\n", dictFilePath)
|
||||
h := fmt.Sprintf("%x", md5.Sum([]byte(dictFilePath)))
|
||||
cacheFileName := fmt.Sprintf("jieba.%s.cache", h)
|
||||
cacheFilePath := filepath.Join(os.TempDir(), cacheFileName)
|
||||
isDictCached := true
|
||||
|
||||
cacheFileInfo, err := os.Stat(cacheFilePath)
|
||||
if err != nil {
|
||||
isDictCached = false
|
||||
}
|
||||
|
||||
if isDictCached {
|
||||
isDictCached = cacheFileInfo.ModTime().After(dictFileInfo.ModTime())
|
||||
}
|
||||
|
||||
var cacheFile *os.File
|
||||
if isDictCached {
|
||||
cacheFile, err = os.Open(cacheFilePath)
|
||||
if err != nil {
|
||||
isDictCached = false
|
||||
}
|
||||
defer cacheFile.Close()
|
||||
}
|
||||
|
||||
if isDictCached {
|
||||
dec := gob.NewDecoder(cacheFile)
|
||||
err = dec.Decode(&t)
|
||||
if err != nil {
|
||||
isDictCached = false
|
||||
} else {
|
||||
log.Printf("loaded model from cache %s\n", cacheFilePath)
|
||||
}
|
||||
}
|
||||
|
||||
if !isDictCached {
|
||||
wtfs, err := ParseDictFile(dictFilePath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, wtf := range wtfs {
|
||||
t.addWord(wtf)
|
||||
}
|
||||
// dump trie
|
||||
cacheFile, err = os.OpenFile(cacheFilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer cacheFile.Close()
|
||||
enc := gob.NewEncoder(cacheFile)
|
||||
err = enc.Encode(t)
|
||||
if err != nil {
|
||||
return err
|
||||
} else {
|
||||
log.Printf("dumped model from cache %s\n", cacheFilePath)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (t *trie) addWord(wtf *WordTagFreq) {
|
||||
t.Freq[wtf.Word] = wtf.Freq
|
||||
t.Total += wtf.Freq
|
||||
runes := []rune(wtf.Word)
|
||||
count := len(runes)
|
||||
for i := 0; i < count; i++ {
|
||||
wfrag := string(runes[0 : i+1])
|
||||
if _, ok := t.Freq[wfrag]; !ok {
|
||||
t.Freq[wfrag] = 0.0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Load user specified dictionary file.
|
||||
func LoadUserDict(dictFilePath string) error {
|
||||
wtfs, err := ParseDictFile(dictFilePath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, wtf := range wtfs {
|
||||
if len(wtf.Tag) > 0 {
|
||||
UserWordTagTab[wtf.Word] = strings.TrimSpace(wtf.Tag)
|
||||
}
|
||||
Trie.addWord(wtf)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Set the dictionary, could be absolute path of dictionary file, or dictionary
|
||||
// name in current directory. This function must be called before cut any
|
||||
// sentence.
|
||||
func SetDictionary(dictFileName string) error {
|
||||
Trie = &trie{Total: 0.0, Freq: make(map[string]float64)}
|
||||
return Trie.load(dictFileName)
|
||||
}
|
||||
53
util/util.go
Normal file
53
util/util.go
Normal file
@@ -0,0 +1,53 @@
|
||||
// Package util contains some util functions used by jiebago.
|
||||
package util
|
||||
|
||||
import "regexp"
|
||||
|
||||
/*
RegexpSplit slices s into substrings separated by matches of re and returns
the substrings found between those matches.

If re contains capturing parentheses, the full text of each match is also
inserted into the result, right after the substring that precedes it.

n limits the work done: n == 0 returns nil, n < 0 means no limit, and n > 0
stops splitting once the result holds n-1 items, after which the remainder of
s is appended as the final item.

This function is meant to act like Python's re.split function.
*/
func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
	if n == 0 {
		return nil
	}
	if s == "" && re.String() != "" {
		return []string{""}
	}

	// SubexpNames always has an entry for the whole match, so more than one
	// entry means the expression has at least one capturing group.
	keepMatch := len(re.SubexpNames()) > 1

	var locs [][]int
	if keepMatch {
		locs = re.FindAllStringSubmatchIndex(s, n)
	} else {
		locs = re.FindAllStringIndex(s, n)
	}

	parts := make([]string, 0, len(locs))
	tail := 0      // where the next unmatched substring begins
	lastMatch := 0 // start offset of the last match that was processed
	for _, loc := range locs {
		if n > 0 && len(parts) >= n-1 {
			break
		}
		from, to := loc[0], loc[1]
		lastMatch = from
		// An empty match at the very start of s contributes no substring.
		if to != 0 {
			parts = append(parts, s[tail:from])
		}
		tail = to
		if keepMatch {
			parts = append(parts, s[from:to])
		}
	}

	// Append the remainder unless the last processed match began exactly at
	// the end of s.
	if lastMatch != len(s) {
		parts = append(parts, s[tail:])
	}
	return parts
}
|
||||
24
util/util_test.go
Normal file
24
util/util_test.go
Normal file
@@ -0,0 +1,24 @@
|
||||
package util
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestRegexpSplit(t *testing.T) {
|
||||
result := RegexpSplit(regexp.MustCompile(`\p{Han}+`),
|
||||
"BP神经网络如何训练才能在分类时增加区分度?", -1)
|
||||
if len(result) != 2 {
|
||||
t.Fatal(result)
|
||||
}
|
||||
result = RegexpSplit(regexp.MustCompile(`(\p{Han})+`),
|
||||
"BP神经网络如何训练才能在分类时增加区分度?", -1)
|
||||
if len(result) != 3 {
|
||||
t.Fatal(result)
|
||||
}
|
||||
result = RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`),
|
||||
",BP神经网络如何训练才能在分类时#增加区分度?", -1)
|
||||
if len(result) != 3 {
|
||||
t.Fatal(result)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user