1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-08 18:40:24 +08:00

Merge branch 'release/v0.3'

This commit is contained in:
Wang Bin
2015-05-07 15:26:03 +08:00
38 changed files with 9970 additions and 87398 deletions

408
README.md
View File

@@ -1,403 +1,69 @@
#结巴分词 Go 语言版:jiebago
#结巴分词 Go 语言版:Jiebago
[![Build Status](https://travis-ci.org/wangbin/jiebago.png?branch=master)](https://travis-ci.org/wangbin/jiebago)
[![Build Status](https://travis-ci.org/wangbin/jiebago.png?branch=master)](https://travis-ci.org/wangbin/jiebago) [![GoDoc](https://godoc.org/github.com/wangbin/jiebago?status.svg)](https://godoc.org/github.com/wangbin/jiebago)
[结巴分词](https://github.com/fxsjy/jieba)[@fxsjy](https://github.com/fxsjy)Python编写的中文分词组件jiebago是结巴分词的Go语言实现,目前已经实现的功能包括:三种模式分词、自定义词典、关键词提取和词性标注
[结巴分词](https://github.com/fxsjy/jieba) 是由 [@fxsjy](https://github.com/fxsjy) 使用 Python 编写的中文分词组件,Jiebago 是结巴分词的 Golang 语言实现。
## 安装
```
go get github.com/wangbin/jiebago/...
```
go get github.com/wangbin/jiebago/...
## 分词
package main
import (
"fmt"
"github.com/wangbin/jiebago"
)
var sentence = "我来到北京清华大学"
func print(ch chan string) {
for word := range ch {
fmt.Printf("%s / ", word)
}
fmt.Println()
fmt.Println()
}
func main() {
jiebago.SetDictionary("/Path/to/dictionary/file") // 设定字典
fmt.Print("【全模式】: ")
print(jiebago.Cut(sentence, true, true))
fmt.Print("【精确模式】: ")
print(jiebago.Cut(sentence, false, true))
fmt.Print("【新词识别】:")
print(jiebago.Cut("他来到了网易杭研大厦", false, true))
fmt.Print("【搜索引擎模式】:")
print(jiebago.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", true))
}
使用结巴分词自带的[词典文件](https://github.com/fxsjy/jieba/blob/master/jieba/dict.txt),输出结果如下:
【全模式】: 我 / 来到 / 北京 / 清华 / 清华大学 / 华大 / 大学 /
【精确模式】: 我 / 来到 / 北京 / 清华大学 /
【新词识别】:他 / 来到 / 了 / 网易 / 杭研 / 大厦 /
【搜索引擎模式】:小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 /
## 添加自定义词典
var sentence = "李小福是创新办主任也是云计算方面的专家"
fmt.Print("Before: ")
print(jiebago.Cut(sentence, false, true))
jiebago.LoadUserDict("/Path/to/user/dictionary/file")
fmt.Print("After: ")
print(jiebago.Cut(sentence, false, true))
使用结巴分词自带的[词典文件](https://github.com/fxsjy/jieba/blob/master/jieba/dict.txt)和[用户自定义词典文件](https://github.com/fxsjy/jieba/blob/master/test/userdict.txt),结果输出如下:
Before: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
After: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
## 关键词提取
示例代码:
package main
import (
"fmt"
"github.com/wangbin/jiebago/analyse"
)
var sentence = "这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。"
func main() {
analyse.SetDictionary("/Path/to/dictionary/file")
analyse.SetIdf("/Path/to/idf/file")
for _, ww := range analyse.ExtractTags(sentence, 20) {
fmt.Printf("%s / ", ww.Word)
}
}
输出:
Python / C++ / 伸手不见五指 / 孙悟空 / 黑夜 / 北京 / 这是 / 一个 /
## 基于TextRank算法的关键词抽取实现
示例代码:
package main
import (
"fmt"
"github.com/wangbin/jiebago/analyse"
)
func main() {
sentence := "此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚 置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年实现营业收入0万元实现净利润-139.13万元。"
analyse.SetDictionary("/Path/to/dictionary/file")
result := analyse.TextRank(sentence, 10)
for _, wt := range result {
fmt.Printf("%s %f\n", wt.Word, wt.Freq)
}
}
输出:
吉林 1.000000
欧亚 0.878078
置业 0.562048
实现 0.520906
收入 0.384284
增资 0.360591
子公司 0.353132
城市 0.307509
全资 0.306324
商业 0.306138
## 词性标注
示例代码:
package main
import (
"fmt"
"github.com/wangbin/jiebago"
"github.com/wangbin/jiebago/posseg"
)
var sentence = "我爱北京天安门"
func main() {
posseg.SetDictionary("/Path/to/dictionary/file")
for wt := range posseg.Cut(sentence, true) {
fmt.Printf("%s %s\n", wt.Word, wt.Tag)
}
}
输出:
我 r
爱 v
北京 ns
天安门 ns
## 并行分词
因为 Go 有强大的 goroutine 特性,并行分词实现起来非常简单,所以并没有内置到 jiebago 中,而是由使用者自己实现。下面是一个简单的例子:
lineCount := 0
inputFile, _ := os.Open(FileName)
defer inputFile.Close()
scanner := bufio.NewScanner(inputFile)
ch := make(chan []string, 1)
for scanner.Scan() {
line := scanner.Text()
fileLength += len([]rune(line))
lineCount += 1
go func() {
for word := range jiebago.Cut(line, false, true) {
ch <- word
}
}()
}
if err := scanner.Err(); err != nil {
panic(err)
}
outputFile, _ := os.OpenFile("parallelCut.log", os.O_CREATE|os.O_WRONLY, 0600)
defer outputFile.Close()
writer := bufio.NewWriter(outputFile)
results := make([]string, 0)
for {
if lineCount <= 0 {
break
}
result, ok := <-ch
if ok {
results = append(results, result...)
lineCount -= 1
}
}
writer.WriteString(strings.Join(results, "/ "))
writer.Flush()
## Tokenize返回词语在原文的起始位置
注意新版的 Jiebago Tokenizer 实现了 Bleve Tokenizer 接口跟之前的实现有很大的变化
1. 接受的参数必须是 []byte
2. 输出的 Token 的起始和终止位置是 byte 的位置不是之前的 rune 的位置所以和 Python 版的 Jieba.tokenize 输出不一致
## 使用
```
package main
import (
"fmt"
"github.com/wangbin/jiebago/tokenizers"
"fmt"
"github.com/wangbin/jiebago"
)
const DictPath = "/path/to/dict.txt"
var sentence = []byte("永和服装饰品有限公司")
func main() {
// default mode
tokenizer, _ := tokenizers.NewJiebaTokenizer(DictPath, true, false)
for _, token := range tokenizer.Tokenize(sentence) {
fmt.Printf(
"Term: %s\t Start: %d \t End: %d\t Position: %d\t Type: %d\n",
token.Term, token.Start, token.End, token.Position, token.Type)
}
//search mode
tokenizer, _ = tokenizers.NewJiebaTokenizer(DictPath, true, true)
for _, token := range tokenizer.Tokenize(sentence) {
fmt.Printf(
"Term: %s\t Start: %d \t End: %d\t Position: %d\t Type: %d\n",
token.Term, token.Start, token.End, token.Position, token.Type)
}
var seg jiebago.Segmenter
func init() {
seg.LoadDictionary("dict.txt")
}
```
默认模式输出
```
Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
Term: 有限公司 Start: 18 End: 30 Position: 4 Type: 1
```
搜索模式输出
```
Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
Term: 有限 Start: 18 End: 24 Position: 4 Type: 1
Term: 公司 Start: 24 End: 30 Position: 5 Type: 1
Term: 有限公司 Start: 18 End: 30 Position: 6 Type: 1
```
### 配合 bleve 进行中文全文检索
[bleve](http://www.blevesearch.com/) 是一个 Go 语言实现的全文索引系统jiebago 可以配合 bleve 使用实现中文的全文检索。一个简单的用法示例:
```
package main
import (
"fmt"
"github.com/blevesearch/bleve"
_ "github.com/wangbin/jiebago/analyse/tokenizers"
"log"
)
func main() {
// open a new index
indexMapping := bleve.NewIndexMapping()
err := indexMapping.AddCustomTokenizer("jieba",
map[string]interface{}{
"file": "/Users/wangbin/mygo/src/github.com/wangbin/jiebago/dict.txt",
"type": "jieba",
})
if err != nil {
log.Fatal(err)
}
err = indexMapping.AddCustomAnalyzer("jieba",
map[string]interface{}{
"type": "custom",
"tokenizer": "jieba",
"token_filters": []string{
"possessive_en",
"to_lower",
"stop_en",
},
})
if err != nil {
log.Fatal(err)
}
indexMapping.DefaultAnalyzer = "jieba"
index, err := bleve.New("example.bleve", indexMapping)
if err != nil {
log.Fatal(err)
}
indexMapping.DefaultAnalyzer = "jieba"
index, err := bleve.New("example.bleve", indexMapping)
if err != nil {
log.Fatal(err)
}
docs := []struct {
Title string
Name string
}{
{
Title: "Doc 1",
Name: "This is the first document weve added",
},
{
Title: "Doc 2",
Name: "The second one 中文测试中文 is even more interesting! 吃水果",
},
{
Title: "Doc 3",
Name: "买水果然后来世博园。",
},
{
Title: "Doc 4",
Name: "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
},
{
Title: "Doc 5",
Name: "咱俩交换一下吧。",
},
}
// index docs
for _, doc := range docs {
index.Index(doc.Title, doc)
}
// search for some text
for _, keyword := range []string{"水果世博园", "你", "first", "中文", "交换机", "交换"} {
query := bleve.NewMatchQuery(keyword)
search := bleve.NewSearchRequest(query)
search.Highlight = bleve.NewHighlight()
searchResults, err := index.Search(search)
if err != nil {
log.Fatal(err)
func print(ch <-chan string) {
for word := range ch {
fmt.Printf(" %s /", word)
}
fmt.Printf("Result of %s: %s\n", keyword, searchResults)
}
fmt.Println()
}
func Example() {
fmt.Print("【全模式】:")
print(seg.CutAll("我来到北京清华大学"))
fmt.Print("【精确模式】:")
print(seg.Cut("我来到北京清华大学", false))
fmt.Print("【新词识别】:")
print(seg.Cut("他来到了网易杭研大厦", true))
fmt.Print("【搜索引擎模式】:")
print(seg.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", true))
}
```
输出结果:
```
Result of 水果世博园: 2 matches, showing 1 through 2, took 377.988µs
1. Doc 3 (1.099550)
Name
<span class="highlight">水果</span>然后来<span class="highlight">世博</span>园。
2. Doc 2 (0.031941)
Name
The second one 你 中文测试中文 is even more interesting! 吃<span class="highlight">水果</span>
【全模式】: 我 / 来到 / 北京 / 清华 / 清华大学 / 华大 / 大学 /
Result of 你: 1 matches, showing 1 through 1, took 103.367µs
1. Doc 2 (0.391161)
Name
The second one <span class="highlight"></span> 中文测试中文 is even more interesting! 吃水果
【精确模式】: 我 / 来到 / 北京 / 清华大学 /
Result of first: 1 matches, showing 1 through 1, took 373.317µs
1. Doc 1 (0.512150)
Name
This is the <span class="highlight">first</span> document weve added
【新词识别】: 他 / 来到 / 了 / 网易 / 杭研 / 大厦 /
Result of 中文: 1 matches, showing 1 through 1, took 106.433µs
1. Doc 2 (0.553186)
Name
The second one 你 <span class="highlight">中文</span>测试<span class="highlight">中文</span> is even more interesting! 吃水果
Result of 交换机: 2 matches, showing 1 through 2, took 188.235µs
1. Doc 4 (0.608495)
Name
工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换</span>机等技术性器件的安装工作
2. Doc 5 (0.086700)
Name
咱俩<span class="highlight">交换</span>一下吧。
Result of 交换: 2 matches, showing 1 through 2, took 148.822µs
1. Doc 5 (0.534158)
Name
咱俩<span class="highlight">交换</span>一下吧。
2. Doc 4 (0.296297)
Name
工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换</span>机等技术性器件的安装工作
【搜索引擎模式】: 小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 /
```
更多信息请参考[文档](https://godoc.org/github.com/wangbin/jiebago)。
## 分词速度
- 2MB / Second in Full Mode

View File

@@ -1,80 +0,0 @@
package analyse
import (
"fmt"
"github.com/wangbin/jiebago"
"sort"
"strings"
"unicode/utf8"
)
// wordWeight pairs a word with the weight computed for it.
type wordWeight struct {
	Word   string
	Weight float64
}

// String renders the pair as "{word: weight}".
func (w wordWeight) String() string {
	word, weight := w.Word, w.Weight
	return fmt.Sprintf("{%s: %f}", word, weight)
}

// wordWeights is a slice of wordWeight that implements sort.Interface.
type wordWeights []wordWeight

// Len reports the number of entries.
func (ws wordWeights) Len() int { return len(ws) }

// Less orders entries by ascending weight; entries with equal weight are
// ordered by ascending word.
func (ws wordWeights) Less(i, j int) bool {
	a, b := ws[i], ws[j]
	if a.Weight != b.Weight {
		return a.Weight < b.Weight
	}
	return a.Word < b.Word
}

// Swap exchanges the entries at positions i and j.
func (ws wordWeights) Swap(i, j int) {
	tmp := ws[i]
	ws[i] = ws[j]
	ws[j] = tmp
}
// ExtractTags cuts sentence into words, weights every remaining word by its
// relative term frequency multiplied by its IDF (or the median IDF when the
// word has no entry), and returns at most topK entries ordered from the
// highest weight to the lowest.
func ExtractTags(sentence string, topK int) (tags wordWeights) {
	counts := make(map[string]float64)
	for word := range jiebago.Cut(sentence, false, true) {
		word = strings.TrimSpace(word)
		// Words shorter than two runes and stop words are ignored.
		if utf8.RuneCountInString(word) < 2 {
			continue
		}
		if _, isStop := stopWords[word]; isStop {
			continue
		}
		counts[word] += 1.0
	}

	total := 0.0
	for _, count := range counts {
		total += count
	}

	ranked := make(wordWeights, 0)
	for word, count := range counts {
		tf := count / total
		idf, known := loader.Freq[word]
		if !known {
			idf = loader.Median
		}
		ranked = append(ranked, wordWeight{Word: word, Weight: idf * tf})
	}

	sort.Sort(sort.Reverse(ranked))
	if len(ranked) > topK {
		return ranked[:topK]
	}
	return ranked
}

44
analyse/example_test.go Normal file
View File

@@ -0,0 +1,44 @@
package analyse_test
import (
"fmt"
"github.com/wangbin/jiebago/analyse"
)
// ExampleExtractTags loads a dictionary and an IDF file into a TagExtracter
// and prints the top five tags extracted from a sample sentence.
func ExampleExtractTags() {
var t analyse.TagExtracter
// The errors returned by the two loaders are not checked here.
t.LoadDictionary("../dict.txt")
t.LoadIdf("idf.txt")
sentence := "这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。"
segments := t.ExtractTags(sentence, 5)
fmt.Printf("Top %d tags:", len(segments))
for _, segment := range segments {
fmt.Printf(" %s /", segment.Text())
}
// Output:
// Top 5 tags: Python / C++ / 伸手不见五指 / 孙悟空 / 黑夜 /
}
// ExampleTextRank loads a dictionary into a TextRanker and prints up to ten
// keywords of a sample sentence together with their weights.
func ExampleTextRank() {
var t analyse.TextRanker
// The error returned by LoadDictionary is not checked here.
t.LoadDictionary("../dict.txt")
sentence := "此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元增资后吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年实现营业收入0万元实现净利润-139.13万元。"
result := t.TextRank(sentence, 10)
for _, segment := range result {
fmt.Printf("%s %f\n", segment.Text(), segment.Weight())
}
// Output:
// 吉林 1.000000
// 欧亚 0.878078
// 置业 0.562048
// 实现 0.520906
// 收入 0.384284
// 增资 0.360591
// 子公司 0.353132
// 城市 0.307509
// 全资 0.306324
// 商业 0.306138
}

View File

@@ -1,57 +1,56 @@
package analyse
import (
"github.com/wangbin/jiebago"
"sort"
"sync"
"github.com/wangbin/jiebago/dictionary"
)
var (
loader *idfLoader
)
func init() {
loader = newIDFLoader()
// Idf represents a thread-safe dictionary for all words with their
// IDFs(Inverse Document Frequency).
type Idf struct {
freqMap map[string]float64
median float64
freqs []float64
sync.RWMutex
}
type idfLoader struct {
Path string
Freq map[string]float64
Median float64
// AddToken adds a new word with its IDF to the dictionary and refreshes the
// stored median while holding the write lock.
func (i *Idf) AddToken(token dictionary.Token) {
i.Lock()
i.freqMap[token.Text()] = token.Frequency()
// The frequency is appended even when the word was already in freqMap.
i.freqs = append(i.freqs, token.Frequency())
// Re-sort after every insertion so the middle element can serve as the median.
sort.Float64s(i.freqs)
i.median = i.freqs[len(i.freqs)/2]
i.Unlock()
}
func newIDFLoader() *idfLoader {
loader := new(idfLoader)
loader.Freq = make(map[string]float64)
return loader
}
func (loader *idfLoader) newPath(idfFilePath string) error {
if loader.Path == idfFilePath {
return nil
// Load loads all tokens from the channel into its dictionary.
func (i *Idf) Load(ch <-chan dictionary.Token) {
i.Lock()
for token := range ch {
i.freqMap[token.Text()] = token.Frequency()
i.freqs = append(i.freqs, token.Frequency())
}
wtfs, err := jiebago.ParseDictFile(idfFilePath)
if err != nil {
return err
}
freqs := make([]float64, 0)
for _, wtf := range wtfs {
loader.Freq[wtf.Word] = wtf.Freq
freqs = append(freqs, wtf.Freq)
}
sort.Float64s(freqs)
loader.Median = freqs[len(freqs)/2]
return nil
sort.Float64s(i.freqs)
i.median = i.freqs[len(i.freqs)/2]
i.Unlock()
}
// Set the IDF file path, could be absolute path of IDF file, or IDF file
// name in current directory.
func SetIdf(idfFileName string) error {
idfFilePath, err := jiebago.DictPath(idfFileName)
if err != nil {
return err
}
return loader.newPath(idfFilePath)
// loadDictionary passes i and fileName to dictionary.LoadDictionary and
// returns its error.
func (i *Idf) loadDictionary(fileName string) error {
return dictionary.LoadDictionary(i, fileName)
}
// Frequency looks up the IDF stored for key under the read lock. The boolean
// result reports whether key is present.
func (i *Idf) Frequency(key string) (float64, bool) {
	i.RLock()
	defer i.RUnlock()
	value, found := i.freqMap[key]
	return value, found
}
// NewIdf returns an Idf whose frequency map and frequency slice are
// initialised and empty.
func NewIdf() *Idf {
	idf := new(Idf)
	idf.freqMap = make(map[string]float64)
	idf.freqs = make([]float64, 0)
	return idf
}

View File

@@ -1,58 +1,83 @@
package analyse
import (
"github.com/wangbin/jiebago"
"sync"
"github.com/wangbin/jiebago/dictionary"
)
var stopWords map[string]int
func init() {
stopWords = map[string]int{
"the": 1,
"of": 1,
"is": 1,
"and": 1,
"to": 1,
"in": 1,
"that": 1,
"we": 1,
"for": 1,
"an": 1,
"are": 1,
"by": 1,
"be": 1,
"as": 1,
"on": 1,
"with": 1,
"can": 1,
"if": 1,
"from": 1,
"which": 1,
"you": 1,
"it": 1,
"this": 1,
"then": 1,
"at": 1,
"have": 1,
"all": 1,
"not": 1,
"one": 1,
"has": 1,
"or": 1,
}
// DefaultStopWordMap is the built-in set of English stop words. Only the keys
// are significant; every value is 1.
var DefaultStopWordMap = map[string]int{
"the": 1,
"of": 1,
"is": 1,
"and": 1,
"to": 1,
"in": 1,
"that": 1,
"we": 1,
"for": 1,
"an": 1,
"are": 1,
"by": 1,
"be": 1,
"as": 1,
"on": 1,
"with": 1,
"can": 1,
"if": 1,
"from": 1,
"which": 1,
"you": 1,
"it": 1,
"this": 1,
"then": 1,
"at": 1,
"have": 1,
"all": 1,
"not": 1,
"one": 1,
"has": 1,
"or": 1,
}
// Set the stop words file path, could be absolute path of stop words file, or
// file name in current directory.
func SetStopWords(stopWordsFileName string) error {
stopWordsFilePath, err := jiebago.DictPath(stopWordsFileName)
if err != nil {
return err
}
wtfs, err := jiebago.ParseDictFile(stopWordsFilePath)
for _, wtf := range wtfs {
stopWords[wtf.Word] = 1
}
return nil
// StopWord is a thread-safe dictionary for all stop words.
type StopWord struct {
// stopWordMap holds the stop words as keys; IsStopWord only tests key presence.
stopWordMap map[string]int
// The embedded RWMutex is locked by the methods that access stopWordMap.
sync.RWMutex
}
// AddToken registers the text of token as a stop word while holding the
// write lock.
func (s *StopWord) AddToken(token dictionary.Token) {
	word := token.Text()
	s.Lock()
	s.stopWordMap[word] = 1
	s.Unlock()
}
// NewStopWord creates a new StopWord pre-populated with the default stop
// words.
//
// The defaults are copied into a map owned by the returned instance. Assigning
// DefaultStopWordMap directly would make every StopWord share (and mutate) the
// same package-level map, each under its own separate lock.
func NewStopWord() *StopWord {
	words := make(map[string]int, len(DefaultStopWordMap))
	for word, flag := range DefaultStopWordMap {
		words[word] = flag
	}
	return &StopWord{stopWordMap: words}
}
// IsStopWord reports whether word is present in the stop word dictionary.
// The lookup is done under the read lock.
func (s *StopWord) IsStopWord(word string) bool {
	s.RLock()
	defer s.RUnlock()
	_, found := s.stopWordMap[word]
	return found
}
// Load drains ch and registers the text of every received token as a stop
// word. The write lock is held until ch is closed.
func (s *StopWord) Load(ch <-chan dictionary.Token) {
	s.Lock()
	for token := range ch {
		word := token.Text()
		s.stopWordMap[word] = 1
	}
	s.Unlock()
}
// loadDictionary passes s and fileName to dictionary.LoadDictionary and
// returns its error.
func (s *StopWord) loadDictionary(fileName string) error {
return dictionary.LoadDictionary(s, fileName)
}

115
analyse/tag_extracker.go Normal file
View File

@@ -0,0 +1,115 @@
// Package analyse is the Golang implementation of Jieba's analyse module.
package analyse
import (
"sort"
"strings"
"unicode/utf8"
"github.com/wangbin/jiebago"
)
// Segment is a word together with the weight assigned to it.
type Segment struct {
	text   string
	weight float64
}

// Text returns the word held by the segment.
func (s Segment) Text() string { return s.text }

// Weight returns the weight held by the segment.
func (s Segment) Weight() float64 { return s.weight }

// Segments is a slice of Segment that implements sort.Interface.
type Segments []Segment

// Len reports the number of segments.
func (ss Segments) Len() int { return len(ss) }

// Less orders segments by ascending weight; segments with equal weight are
// ordered by ascending text.
func (ss Segments) Less(i, j int) bool {
	a, b := ss[i], ss[j]
	if a.weight != b.weight {
		return a.weight < b.weight
	}
	return a.text < b.text
}

// Swap exchanges the segments at positions i and j.
func (ss Segments) Swap(i, j int) {
	tmp := ss[i]
	ss[i] = ss[j]
	ss[j] = tmp
}
// TagExtracter is used to extract tags from sentence.
type TagExtracter struct {
// seg cuts sentences into words; it is set by LoadDictionary.
seg *jiebago.Segmenter
// idf supplies per-word IDF values and the median; it is set by LoadIdf.
idf *Idf
// stopWord filters words out of the result; it is set by LoadDictionary
// and LoadStopWords.
stopWord *StopWord
}
// LoadDictionary creates a new segmenter and loads the given dictionary file
// into it.
//
// The stop word dictionary is initialised with the defaults only when none
// has been set yet, so stop words loaded earlier through LoadStopWords are
// not discarded by a later call to LoadDictionary.
func (t *TagExtracter) LoadDictionary(fileName string) error {
	if t.stopWord == nil {
		t.stopWord = NewStopWord()
	}
	t.seg = new(jiebago.Segmenter)
	return t.seg.LoadDictionary(fileName)
}
// LoadIdf replaces the extracter's Idf dictionary with a new one and loads
// the given file into it.
func (t *TagExtracter) LoadIdf(fileName string) error {
	idf := NewIdf()
	t.idf = idf
	return idf.loadDictionary(fileName)
}
// LoadStopWords replaces the extracter's StopWord dictionary with a new one
// and loads the given file into it.
func (t *TagExtracter) LoadStopWords(fileName string) error {
	stopWord := NewStopWord()
	t.stopWord = stopWord
	return stopWord.loadDictionary(fileName)
}
// ExtractTags cuts sentence into words, weights every remaining word by its
// relative term frequency multiplied by its IDF (or the median IDF when the
// word has no IDF entry), and returns at most topK segments ordered from the
// highest weight to the lowest.
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
	counts := make(map[string]float64)
	for word := range t.seg.Cut(sentence, true) {
		word = strings.TrimSpace(word)
		// Words shorter than two runes and stop words are ignored.
		if utf8.RuneCountInString(word) < 2 || t.stopWord.IsStopWord(word) {
			continue
		}
		counts[word] += 1.0
	}

	total := 0.0
	for _, count := range counts {
		total += count
	}

	ranked := make(Segments, 0)
	for word, count := range counts {
		tf := count / total
		idf, known := t.idf.Frequency(word)
		if !known {
			idf = t.idf.median
		}
		ranked = append(ranked, Segment{text: word, weight: idf * tf})
	}

	sort.Sort(sort.Reverse(ranked))
	if len(ranked) > topK {
		return ranked[:topK]
	}
	return ranked
}

View File

@@ -1,13 +1,12 @@
package analyse
import (
"github.com/wangbin/jiebago"
"math"
"testing"
)
var (
test_contents = []string{
testContents = []string{
"这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。",
"我不喜欢日本和服。",
"雷猴回归人间。",
@@ -228,71 +227,74 @@ var (
只是逼不得已
雖然沒有藉口
`
LyciWeight = []wordWeight{
wordWeight{Word: "所謂", Weight: 1.010262},
wordWeight{Word: "是否", Weight: 0.738650},
wordWeight{Word: "一般", Weight: 0.607600},
wordWeight{Word: "雖然", Weight: 0.336754},
wordWeight{Word: "退縮", Weight: 0.336754},
wordWeight{Word: "肌迫", Weight: 0.336754},
wordWeight{Word: "矯作", Weight: 0.336754},
wordWeight{Word: "沒有", Weight: 0.336754},
wordWeight{Word: "怯懦", Weight: 0.271099},
wordWeight{Word: "隨便", Weight: 0.168377},
LyciWeight = Segments{
Segment{text: "所謂", weight: 1.010262},
Segment{text: "是否", weight: 0.738650},
Segment{text: "一般", weight: 0.607600},
Segment{text: "雖然", weight: 0.336754},
Segment{text: "退縮", weight: 0.336754},
Segment{text: "肌迫", weight: 0.336754},
Segment{text: "矯作", weight: 0.336754},
Segment{text: "沒有", weight: 0.336754},
Segment{text: "怯懦", weight: 0.271099},
Segment{text: "隨便", weight: 0.168377},
}
LyciWeight2 = []wordWeight{
wordWeight{Word: "所謂", Weight: 1.215739},
wordWeight{Word: "一般", Weight: 0.731179},
wordWeight{Word: "雖然", Weight: 0.405246},
wordWeight{Word: "退縮", Weight: 0.405246},
wordWeight{Word: "肌迫", Weight: 0.405246},
wordWeight{Word: "矯作", Weight: 0.405246},
wordWeight{Word: "怯懦", Weight: 0.326238},
wordWeight{Word: "逼不得已", Weight: 0.202623},
wordWeight{Word: "右銘", Weight: 0.202623},
wordWeight{Word: "寬闊", Weight: 0.202623},
LyciWeight2 = Segments{
Segment{text: "所謂", weight: 1.215739},
Segment{text: "一般", weight: 0.731179},
Segment{text: "雖然", weight: 0.405246},
Segment{text: "退縮", weight: 0.405246},
Segment{text: "肌迫", weight: 0.405246},
Segment{text: "矯作", weight: 0.405246},
Segment{text: "怯懦", weight: 0.326238},
Segment{text: "逼不得已", weight: 0.202623},
Segment{text: "右銘", weight: 0.202623},
Segment{text: "寬闊", weight: 0.202623},
}
)
func TestExtractTags(t *testing.T) {
jiebago.SetDictionary("../dict.txt")
SetIdf("idf.txt")
var te TagExtracter
te.LoadDictionary("../dict.txt")
te.LoadIdf("idf.txt")
for index, sentence := range test_contents {
result := ExtractTags(sentence, 20)
for index, sentence := range testContents {
result := te.ExtractTags(sentence, 20)
if len(result) != len(Tags[index]) {
t.Errorf("%s = %v", sentence, result)
t.Fatalf("%s = %v", sentence, result)
}
for i, tag := range result {
if tag.Word != Tags[index][i] {
t.Errorf("%s != %s", tag, Tags[index][i])
if tag.text != Tags[index][i] {
t.Fatalf("%s != %s", tag, Tags[index][i])
}
}
}
}
func TestExtratTagsWithWeight(t *testing.T) {
jiebago.SetDictionary("../dict.txt")
SetIdf("idf.txt")
result := ExtractTags(Lyric, 10)
var te TagExtracter
te.LoadDictionary("../dict.txt")
te.LoadIdf("idf.txt")
result := te.ExtractTags(Lyric, 10)
for index, tag := range result {
if LyciWeight[index].Word != tag.Word ||
math.Abs(LyciWeight[index].Weight-tag.Weight) > 1e-6 {
t.Errorf("%v != %v", tag, LyciWeight[index])
if LyciWeight[index].text != tag.text ||
math.Abs(LyciWeight[index].weight-tag.weight) > 1e-6 {
t.Fatalf("%v != %v", tag, LyciWeight[index])
}
}
}
func TestExtractTagsWithStopWordsFile(t *testing.T) {
jiebago.SetDictionary("../dict.txt")
SetIdf("idf.txt")
SetStopWords("stop_words.txt")
result := ExtractTags(Lyric, 7)
var te TagExtracter
te.LoadDictionary("../dict.txt")
te.LoadIdf("idf.txt")
te.LoadStopWords("stop_words.txt")
result := te.ExtractTags(Lyric, 7)
for index, tag := range result {
if LyciWeight2[index].Word != tag.Word ||
math.Abs(LyciWeight2[index].Weight-tag.Weight) > 1e-6 {
t.Errorf("%v != %v", tag, LyciWeight2[index])
if LyciWeight2[index].text != tag.text ||
math.Abs(LyciWeight2[index].weight-tag.weight) > 1e-6 {
t.Fatalf("%v != %v", tag, LyciWeight2[index])
}
}
}

View File

@@ -1,10 +1,10 @@
package analyse
import (
"fmt"
"github.com/wangbin/jiebago/posseg"
"math"
"sort"
"github.com/wangbin/jiebago/posseg"
)
const dampingFactor = 0.85
@@ -19,10 +19,6 @@ type edge struct {
weight float64
}
func (e edge) String() string {
return fmt.Sprintf("(%s %s): %f", e.start, e.end, e.weight)
}
type edges []edge
func (es edges) Len() int {
@@ -65,7 +61,7 @@ func (u *undirectWeightedGraph) addEdge(start, end string, weight float64) {
}
}
func (u *undirectWeightedGraph) rank() wordWeights {
func (u *undirectWeightedGraph) rank() Segments {
if !sort.IsSorted(u.keys) {
sort.Sort(u.keys)
}
@@ -105,17 +101,17 @@ func (u *undirectWeightedGraph) rank() wordWeights {
maxRank = w
}
}
result := make(wordWeights, 0)
result := make(Segments, 0)
for n, w := range ws {
result = append(result, wordWeight{Word: n, Weight: (w - minRank/10.0) / (maxRank - minRank/10.0)})
result = append(result, Segment{text: n, weight: (w - minRank/10.0) / (maxRank - minRank/10.0)})
}
sort.Sort(sort.Reverse(result))
return result
}
// Extract keywords from sentence using TextRank algorithm. the allowed POS list
// could be manually speificed.
func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
// TextRankWithPOS extracts keywords from sentence using TextRank algorithm.
// Parameter allowPOS allows a customized pos list.
func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) Segments {
posFilt := make(map[string]int)
for _, pos := range allowPOS {
posFilt[pos] = 1
@@ -123,23 +119,20 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
g := newUndirectWeightedGraph()
cm := make(map[[2]string]float64)
span := 5
wordTags := make([]posseg.WordTag, 0)
for wordTag := range posseg.Cut(sentence, true) {
wordTags = append(wordTags, wordTag)
var pairs []posseg.Segment
for pair := range t.seg.Cut(sentence, true) {
pairs = append(pairs, pair)
}
for i, _ := range wordTags {
if _, ok := posFilt[wordTags[i].Tag]; ok {
for j := i + 1; j < i+span; j++ {
if j > len(wordTags) {
break
}
if _, ok := posFilt[wordTags[j].Tag]; !ok {
for i := range pairs {
if _, ok := posFilt[pairs[i].Pos()]; ok {
for j := i + 1; j < i+span && j <= len(pairs); j++ {
if _, ok := posFilt[pairs[j].Pos()]; !ok {
continue
}
if _, ok := cm[[2]string{wordTags[i].Word, wordTags[j].Word}]; !ok {
cm[[2]string{wordTags[i].Word, wordTags[j].Word}] = 1.0
if _, ok := cm[[2]string{pairs[i].Text(), pairs[j].Text()}]; !ok {
cm[[2]string{pairs[i].Text(), pairs[j].Text()}] = 1.0
} else {
cm[[2]string{wordTags[i].Word, wordTags[j].Word}] += 1.0
cm[[2]string{pairs[i].Text(), pairs[j].Text()}] += 1.0
}
}
}
@@ -154,15 +147,19 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
return tags
}
// Extract keywords from sentence using TextRank algorithm.
// topK specify how many top keywords to be returned at most.
func TextRank(sentence string, topK int) wordWeights {
return TextRankWithPOS(sentence, topK, defaultAllowPOS)
// TextRank extracts keywords from sentence using the TextRank algorithm.
// Parameter topK specifies how many top keywords are returned at most.
// It delegates to TextRankWithPOS with defaultAllowPOS as the allowed POS list.
func (t *TextRanker) TextRank(sentence string, topK int) Segments {
return t.TextRankWithPOS(sentence, topK, defaultAllowPOS)
}
// Set the dictionary, could be absolute path of dictionary file, or dictionary
// name in current directory. This function must be called before cut any
// sentence.
func SetDictionary(dictFileName string) error {
return posseg.SetDictionary(dictFileName)
// TextRanker is used to extract tags from sentence.
type TextRanker struct {
// seg is the POS-tagging segmenter; it is set by LoadDictionary.
seg *posseg.Segmenter
}
// LoadDictionary creates a new posseg.Segmenter for the TextRanker and loads
// the given dictionary file into it.
func (t *TextRanker) LoadDictionary(fileName string) error {
t.seg = new(posseg.Segmenter)
return t.seg.LoadDictionary(fileName)
}

View File

@@ -8,26 +8,27 @@ import (
var (
sentence = "此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元增资后吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年实现营业收入0万元实现净利润-139.13万元。"
tagRanks = wordWeights{
wordWeight{Word: "吉林", Weight: 1.0},
wordWeight{Word: "欧亚", Weight: 0.87807810644},
wordWeight{Word: "置业", Weight: 0.562048250306},
wordWeight{Word: "实现", Weight: 0.520905743929},
wordWeight{Word: "收入", Weight: 0.384283870648},
wordWeight{Word: "增资", Weight: 0.360590945312},
wordWeight{Word: "子公司", Weight: 0.353131980904},
wordWeight{Word: "城市", Weight: 0.307509449283},
wordWeight{Word: "全资", Weight: 0.306324426665},
wordWeight{Word: "商业", Weight: 0.306138241063},
tagRanks = Segments{
Segment{text: "吉林", weight: 1.0},
Segment{text: "欧亚", weight: 0.87807810644},
Segment{text: "置业", weight: 0.562048250306},
Segment{text: "实现", weight: 0.520905743929},
Segment{text: "收入", weight: 0.384283870648},
Segment{text: "增资", weight: 0.360590945312},
Segment{text: "子公司", weight: 0.353131980904},
Segment{text: "城市", weight: 0.307509449283},
Segment{text: "全资", weight: 0.306324426665},
Segment{text: "商业", weight: 0.306138241063},
}
)
func TestTextRank(t *testing.T) {
SetDictionary("../dict.txt")
results := TextRank(sentence, 10)
var tr TextRanker
tr.LoadDictionary("../dict.txt")
results := tr.TextRank(sentence, 10)
for index, tw := range results {
if tw.Word != tagRanks[index].Word || math.Abs(tw.Weight-tagRanks[index].Weight) > 1e-6 {
t.Errorf("%v != %v", tw, tagRanks[index])
if tw.text != tagRanks[index].text || math.Abs(tw.weight-tagRanks[index].weight) > 1e-6 {
t.Fatalf("%v != %v", tw, tagRanks[index])
}
}
}

57
dict.go
View File

@@ -1,57 +0,0 @@
package jiebago
import (
"bufio"
"os"
"path/filepath"
"strconv"
"strings"
)
// WordTagFreq holds one parsed dictionary line: the word, its optional tag
// and its optional frequency.
type WordTagFreq struct {
Word, Tag string
Freq float64
}
// DictPath resolves dictFileName to an absolute path. An absolute input is
// returned unchanged; a relative input is joined onto the current working
// directory and cleaned. An error is returned only when the working directory
// cannot be determined.
func DictPath(dictFileName string) (string, error) {
	if !filepath.IsAbs(dictFileName) {
		pwd, err := os.Getwd()
		if err != nil {
			return "", err
		}
		return filepath.Clean(filepath.Join(pwd, dictFileName)), nil
	}
	return dictFileName, nil
}
// ParseDictFile reads dictFilePath line by line. Each line is split on single
// spaces into a word, an optional frequency and an optional tag; a byte order
// mark is removed from the word. Parsing stops at the first frequency that is
// not a valid float, returning the entries collected so far together with the
// error.
func ParseDictFile(dictFilePath string) (wtfs []*WordTagFreq, err error) {
	dictFile, openErr := os.Open(dictFilePath)
	if openErr != nil {
		return nil, openErr
	}
	defer dictFile.Close()

	lines := bufio.NewScanner(dictFile)
	for lines.Scan() {
		fields := strings.Split(lines.Text(), " ")
		entry := &WordTagFreq{Word: strings.Replace(fields[0], "\ufeff", "", 1)}
		if len(fields) > 1 {
			freq, parseErr := strconv.ParseFloat(fields[1], 64)
			if parseErr != nil {
				return wtfs, parseErr
			}
			entry.Freq = freq
		}
		if len(fields) > 2 {
			entry.Tag = fields[2]
		}
		wtfs = append(wtfs, entry)
	}
	return wtfs, lines.Err()
}

62
dictionary.go Normal file
View File

@@ -0,0 +1,62 @@
package jiebago
import (
"math"
"sync"
"github.com/wangbin/jiebago/dictionary"
)
// A Dictionary represents a thread-safe dictionary used for word segmentation.
type Dictionary struct {
// total accumulates the frequency of every added token; logTotal holds
// math.Log(total) as computed by updateLogTotal.
total, logTotal float64
// freqMap maps each added word to its frequency. Prefixes of added words
// that were not present before are stored with frequency 0.
freqMap map[string]float64
sync.RWMutex
}
// Load adds every token received from ch to the dictionary and then
// recomputes logTotal.
//
// The write lock is held until ch is closed. logTotal is refreshed before the
// lock is released, so the write to it is not racing with other goroutines
// that lock the dictionary.
func (d *Dictionary) Load(ch <-chan dictionary.Token) {
	d.Lock()
	for token := range ch {
		d.addToken(token)
	}
	d.updateLogTotal()
	d.Unlock()
}
// AddToken adds one token to the dictionary and recomputes logTotal.
//
// logTotal is refreshed before the write lock is released, so the write to it
// is not racing with other goroutines that lock the dictionary.
func (d *Dictionary) AddToken(token dictionary.Token) {
	d.Lock()
	d.addToken(token)
	d.updateLogTotal()
	d.Unlock()
}
// addToken stores the token's frequency, adds it to the running total and
// registers every rune prefix of the token's text that is not yet in freqMap
// with frequency 0. It does no locking of its own.
func (d *Dictionary) addToken(token dictionary.Token) {
	text, freq := token.Text(), token.Frequency()
	d.freqMap[text] = freq
	d.total += freq

	runes := []rune(text)
	for end := 1; end <= len(runes); end++ { //TODO: n-1?
		prefix := string(runes[:end])
		if _, seen := d.freqMap[prefix]; !seen {
			d.freqMap[prefix] = 0.0
		}
	}
}
// updateLogTotal stores the natural logarithm of total in logTotal. It does
// no locking of its own.
func (d *Dictionary) updateLogTotal() {
d.logTotal = math.Log(d.total)
}
// Frequency returns the frequency stored for the given word and whether the
// word exists in the dictionary. The lookup is done under the read lock.
func (d *Dictionary) Frequency(key string) (float64, bool) {
	d.RLock()
	defer d.RUnlock()
	value, found := d.freqMap[key]
	return value, found
}
// loadDictionary passes d and fileName to dictionary.LoadDictionary and
// returns its error.
func (d *Dictionary) loadDictionary(fileName string) error {
return dictionary.LoadDictionary(d, fileName)
}

85
dictionary/dictionary.go Normal file
View File

@@ -0,0 +1,85 @@
// Package dictionary contains an interface and wraps all I/O-related work.
// It is used by jiebago module to read/write files.
package dictionary
import (
"bufio"
"os"
"path/filepath"
"strconv"
"strings"
)
// DictLoader is the interface implemented by types that can add one token or
// load tokens from a channel.
type DictLoader interface {
// Load receives tokens from the channel; LoadDictionary calls it with the
// channel fed by the file reader.
Load(<-chan Token)
// AddToken adds a single token.
AddToken(Token)
}
// loadDictionary starts a goroutine that scans file line by line and sends
// one Token per line on the returned token channel. Each line is split on
// single spaces into text, an optional frequency and an optional POS.
//
// At most one error (a frequency that fails to parse, or a scanner error) is
// sent on the returned error channel. Both channels are closed when the
// goroutine finishes, so receiving from the error channel after the token
// channel has been drained yields nil on success.
func loadDictionary(file *os.File) (<-chan Token, <-chan error) {
	tokenCh := make(chan Token)
	// The error channel is buffered so that reporting an error never blocks.
	// The consumer only reads it after the token channel is closed, and the
	// token channel is only closed once this goroutine returns; an unbuffered
	// send here would therefore deadlock on the first error.
	errCh := make(chan error, 1)

	go func() {
		defer close(tokenCh)
		defer close(errCh)
		scanner := bufio.NewScanner(file)
		// NOTE(review): token is reused across lines, so a line without a
		// frequency or POS keeps the value from the previous line. Confirm
		// whether that carry-over is intended before changing it.
		var token Token
		var line string
		var fields []string
		var err error
		for scanner.Scan() {
			line = scanner.Text()
			fields = strings.Split(line, " ")
			// Drop a byte order mark and surrounding whitespace from the text.
			token.text = strings.TrimSpace(strings.Replace(fields[0], "\ufeff", "", 1))
			if length := len(fields); length > 1 {
				token.frequency, err = strconv.ParseFloat(fields[1], 64)
				if err != nil {
					errCh <- err
					return
				}
				if length > 2 {
					token.pos = strings.TrimSpace(fields[2])
				}
			}
			tokenCh <- token
		}
		if err = scanner.Err(); err != nil {
			errCh <- err
		}
	}()
	return tokenCh, errCh
}
// LoadDictionary resolves fileName, opens the file and hands the stream of
// parsed tokens to dl.Load. After dl.Load returns it reports the error, if
// any, that was produced while reading the file.
func LoadDictionary(dl DictLoader, fileName string) error {
	resolved, err := dictPath(fileName)
	if err != nil {
		return err
	}

	f, err := os.Open(resolved)
	if err != nil {
		return err
	}
	defer f.Close()

	tokens, errs := loadDictionary(f)
	dl.Load(tokens)
	readErr := <-errs
	return readErr
}
// dictPath turns dictFileName into an absolute path. An absolute name is
// returned untouched; a relative one is joined onto the current working
// directory. The error from os.Getwd, if any, is returned with an empty path.
func dictPath(dictFileName string) (string, error) {
	if !filepath.IsAbs(dictFileName) {
		cwd, err := os.Getwd()
		if err != nil {
			return "", err
		}
		// filepath.Join already returns a cleaned path.
		return filepath.Join(cwd, dictFileName), nil
	}
	return dictFileName, nil
}

View File

@@ -0,0 +1,59 @@
package dictionary
import (
"sync"
"testing"
)
// Dict is a minimal DictLoader used by the tests in this file. It records
// token frequencies and POS tags in two maps guarded by the embedded RWMutex.
type Dict struct {
// freqMap maps a token's text to its frequency.
freqMap map[string]float64
// posMap maps a token's text to its POS tag; tokens with an empty tag are
// not recorded.
posMap map[string]string
sync.RWMutex
}
// Load holds the write lock while it receives every token from ch, storing
// each frequency and, for tokens with a non-empty POS, the POS tag. It
// returns once ch has been closed.
func (d *Dict) Load(ch <-chan Token) {
	d.Lock()
	defer d.Unlock()
	for tok := range ch {
		text := tok.Text()
		d.freqMap[text] = tok.Frequency()
		if pos := tok.Pos(); pos != "" {
			d.posMap[text] = pos
		}
	}
}
// AddToken stores one token under the write lock: its frequency always, and
// its POS tag only when that tag is non-empty.
func (d *Dict) AddToken(token Token) {
	d.Lock()
	defer d.Unlock()
	text := token.Text()
	d.freqMap[text] = token.Frequency()
	if pos := token.Pos(); pos != "" {
		d.posMap[text] = pos
	}
}
// TestLoadDictionary loads ../userdict.txt into a Dict and checks how many
// entries end up in the frequency map and in the POS map.
func TestLoadDictionary(t *testing.T) {
	d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)}
	// Report the error as a value. Using its text as a format string would
	// misrender any '%' it contains and is rejected by go vet.
	if err := LoadDictionary(d, "../userdict.txt"); err != nil {
		t.Fatal(err)
	}
	if len(d.freqMap) != 7 {
		t.Fatalf("Failed to load userdict.txt, got %d tokens with frequency, expected 7",
			len(d.freqMap))
	}
	if len(d.posMap) != 6 {
		t.Fatalf("Failed to load userdict.txt, got %d tokens with pos, expected 6", len(d.posMap))
	}
}
// TestAddToken loads ../userdict.txt and then adds a single token, checking
// that the added frequency and POS tag are the ones stored afterwards.
func TestAddToken(t *testing.T) {
	d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)}
	// Fail early when the fixture cannot be loaded instead of silently
	// testing against an empty dictionary.
	if err := LoadDictionary(d, "../userdict.txt"); err != nil {
		t.Fatal(err)
	}
	d.AddToken(Token{"好用", 99, "a"})
	if d.freqMap["好用"] != 99 {
		t.Fatalf("Failed to add token, got frequency %f, expected 99", d.freqMap["好用"])
	}
	if d.posMap["好用"] != "a" {
		t.Fatalf("Failed to add token, got pos %s, expected \"a\"", d.posMap["好用"])
	}
}

28
dictionary/token.go Normal file
View File

@@ -0,0 +1,28 @@
package dictionary
// Token represents a Chinese word with (optional) frequency and POS.
type Token struct {
// text is the word itself.
text string
// frequency is the word's frequency; it stays zero when none was given.
frequency float64
// pos is the part-of-speech tag; it stays empty when none was given.
pos string
}
// Text returns the token's text.
func (t Token) Text() string {
return t.text
}
// Frequency returns the token's frequency.
func (t Token) Frequency() float64 {
return t.frequency
}
// Pos returns the token's POS (part-of-speech) tag.
func (t Token) Pos() string {
return t.pos
}
// NewToken builds a Token from the given text, frequency and POS tag.
func NewToken(text string, frequency float64, pos string) Token {
	var tok Token
	tok.text, tok.frequency, tok.pos = text, frequency, pos
	return tok
}

126
example_bleve_test.go Normal file
View File

@@ -0,0 +1,126 @@
package jiebago_test
import (
"fmt"
"log"
"os"
"github.com/blevesearch/bleve"
_ "github.com/wangbin/jiebago"
)
// ExampleBeleveSearch builds a bleve index whose default analyzer uses the
// "jieba" tokenizer configured with dict.txt, indexes five short documents
// mixing English and Chinese, and prints the highlighted hits of several
// match queries.
func ExampleBeleveSearch() {
	// open a new index
	indexMapping := bleve.NewIndexMapping()
	err := indexMapping.AddCustomTokenizer("jieba",
		map[string]interface{}{
			"file": "dict.txt",
			"type": "jieba",
		})
	if err != nil {
		log.Fatal(err)
	}

	// create a custom analyzer
	err = indexMapping.AddCustomAnalyzer("jieba",
		map[string]interface{}{
			"type":      "custom",
			"tokenizer": "jieba",
			"token_filters": []string{
				"possessive_en",
				"to_lower",
				"stop_en",
			},
		})
	if err != nil {
		log.Fatal(err)
	}

	indexMapping.DefaultAnalyzer = "jieba"
	cacheDir := "jieba.beleve"
	// Start from a clean directory so earlier runs cannot affect the results.
	os.RemoveAll(cacheDir)
	index, err := bleve.New(cacheDir, indexMapping)
	if err != nil {
		log.Fatal(err)
	}
	// Release the on-disk index when the example returns.
	defer index.Close()

	docs := []struct {
		Title string
		Name  string
	}{
		{
			Title: "Doc 1",
			Name:  "This is the first document weve added",
		},
		{
			Title: "Doc 2",
			Name:  "The second one 你 中文测试中文 is even more interesting! 吃水果",
		},
		{
			Title: "Doc 3",
			Name:  "买水果然后来世博园。",
		},
		{
			Title: "Doc 4",
			Name:  "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
		},
		{
			Title: "Doc 5",
			Name:  "咱俩交换一下吧。",
		},
	}

	// index docs; a document that fails to index would silently distort the
	// search results below, so stop at the first failure.
	for _, doc := range docs {
		if err := index.Index(doc.Title, doc); err != nil {
			log.Fatal(err)
		}
	}

	// search for some text
	for _, keyword := range []string{"水果世博园", "你", "first", "中文", "交换机", "交换"} {
		query := bleve.NewMatchQuery(keyword)
		search := bleve.NewSearchRequest(query)
		search.Highlight = bleve.NewHighlight()
		searchResults, err := index.Search(search)
		if err != nil {
			log.Fatal(err)
		}
		fmt.Printf("Result of \"%s\": %d matches:\n", keyword, searchResults.Total)
		for i, hit := range searchResults.Hits {
			rv := fmt.Sprintf("%d. %s, (%f)\n", i+searchResults.Request.From+1, hit.ID, hit.Score)
			for fragmentField, fragments := range hit.Fragments {
				rv += fmt.Sprintf("%s: ", fragmentField)
				for _, fragment := range fragments {
					rv += fmt.Sprintf("%s", fragment)
				}
			}
			fmt.Printf("%s\n", rv)
		}
	}
	// Output:
	// Result of "水果世博园": 2 matches:
	// 1. Doc 3, (1.099550)
	// Name: 买<span class="highlight">水果</span>然后来<span class="highlight">世博</span>园。
	// 2. Doc 2, (0.031941)
	// Name: The second one 你 中文测试中文 is even more interesting! 吃<span class="highlight">水果</span>
	// Result of "你": 1 matches:
	// 1. Doc 2, (0.391161)
	// Name: The second one <span class="highlight">你</span> 中文测试中文 is even more interesting! 吃水果
	// Result of "first": 1 matches:
	// 1. Doc 1, (0.512150)
	// Name: This is the <span class="highlight">first</span> document weve added
	// Result of "中文": 1 matches:
	// 1. Doc 2, (0.553186)
	// Name: The second one 你 <span class="highlight">中文</span>测试<span class="highlight">中文</span> is even more interesting! 吃水果
	// Result of "交换机": 2 matches:
	// 1. Doc 4, (0.608495)
	// Name: 工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换机</span>等技术性器件的安装工作
	// 2. Doc 5, (0.086700)
	// Name: 咱俩<span class="highlight">交换</span>一下吧。
	// Result of "交换": 2 matches:
	// 1. Doc 5, (0.534158)
	// Name: 咱俩<span class="highlight">交换</span>一下吧。
	// 2. Doc 4, (0.296297)
	// Name: 工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换</span>机等技术性器件的安装工作
}

View File

@@ -0,0 +1,100 @@
package jiebago_test
import (
"bufio"
"fmt"
"log"
"os"
"runtime"
"strings"
"time"
"github.com/wangbin/jiebago"
)
// line is one unit of work: a line of the input file together with its
// position, so that results can be put back in the original order.
type line struct {
// number is the zero-based index of the line in the input.
number int
// text is the raw line on the way in and the joined segments on the way out.
text string
}
var (
// segmenter is shared by all workers; its dictionary is loaded in
// Example_parallelCut.
segmenter = jiebago.Segmenter{}
// numThreads is the number of worker goroutines and the buffer size of
// both channels.
numThreads = runtime.NumCPU()
// task carries lines waiting to be segmented.
task = make(chan line, numThreads)
// result carries segmented lines back from the workers.
result = make(chan line, numThreads)
)
// worker receives lines from task until that channel is closed. Each line is
// cut with the HMM enabled, its segments are joined with " / " and terminated
// with a newline, and the rewritten line is sent on result.
func worker() {
	for job := range task {
		var words []string
		for word := range segmenter.Cut(job.text, true) {
			words = append(words, word)
		}
		joined := strings.Join(words, " / ")
		job.text = fmt.Sprintf("%s\n", joined)
		result <- job
	}
}
// Example_parallelCut segments README.md with one worker goroutine per CPU,
// writes the segmented lines in their original order to parallelCut.log, and
// logs the elapsed time and throughput. Any I/O or dictionary failure aborts
// the example through log.Fatal.
func Example_parallelCut() {
	// Set the number of goroutines
	runtime.GOMAXPROCS(numThreads)

	// Load dictionary
	if err := segmenter.LoadDictionary("dict.txt"); err != nil {
		log.Fatal(err)
	}

	// open file for segmentation
	file, err := os.Open("README.md")
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	// start worker routines
	for i := 0; i < numThreads; i++ {
		go worker()
	}

	var length, size int
	scanner := bufio.NewScanner(file)

	t0 := time.Now()

	var lines []string

	// Read lines
	for scanner.Scan() {
		t := scanner.Text()
		size += len(t)
		lines = append(lines, t)
	}
	// Scan returns false on read errors as well as at end of file.
	if err := scanner.Err(); err != nil {
		log.Fatal(err)
	}
	length = len(lines)

	// Segmentation
	go func() {
		for i := 0; i < length; i++ {
			task <- line{number: i, text: lines[i]}
		}
		close(task)
	}()

	// Make sure the segmentation result contains same line as original file
	for i := 0; i < length; i++ {
		l := <-result
		lines[l.number] = l.text
	}

	t1 := time.Now()

	// Write the segments into a file for verify. O_TRUNC drops the content
	// of an earlier run, which would otherwise survive past the end of a
	// shorter output.
	outputFile, err := os.OpenFile("parallelCut.log", os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0600)
	if err != nil {
		log.Fatal(err)
	}
	defer outputFile.Close()
	writer := bufio.NewWriter(outputFile)
	for _, l := range lines {
		writer.WriteString(l)
	}
	// bufio.Writer keeps the first write error and reports it from Flush.
	if err := writer.Flush(); err != nil {
		log.Fatal(err)
	}

	log.Printf("Time cousumed: %v", t1.Sub(t0))
	log.Printf("Segmentation speed: %f MB/s", float64(size)/t1.Sub(t0).Seconds()/(1024*1024))
}

88
example_test.go Normal file
View File

@@ -0,0 +1,88 @@
package jiebago_test
import (
"fmt"
"github.com/wangbin/jiebago"
)
// seg is the segmenter shared by the examples in this file.
var seg jiebago.Segmenter
// init loads dict.txt into seg before any example runs.
// NOTE(review): the error returned by LoadDictionary is discarded, so a
// missing dict.txt only shows up as wrong example output.
func init() {
seg.LoadDictionary("dict.txt")
}
// print writes every word received from ch to standard output as " word /"
// and finishes the line with a newline once ch is closed.
func print(ch <-chan string) {
	for {
		word, open := <-ch
		if !open {
			break
		}
		fmt.Print(" " + word + " /")
	}
	fmt.Print("\n")
}
// Example prints the segmentation of sample sentences produced by CutAll,
// by Cut with and without the HMM, and by CutForSearch.
func Example() {
fmt.Print("【全模式】:")
print(seg.CutAll("我来到北京清华大学"))
fmt.Print("【精确模式】:")
// hmm=false: cut without the Hidden Markov Model
print(seg.Cut("我来到北京清华大学", false))
fmt.Print("【新词识别】:")
// hmm=true: cut with the Hidden Markov Model
print(seg.Cut("他来到了网易杭研大厦", true))
fmt.Print("【搜索引擎模式】:")
print(seg.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", true))
// Output:
// 【全模式】: 我 / 来到 / 北京 / 清华 / 清华大学 / 华大 / 大学 /
// 【精确模式】: 我 / 来到 / 北京 / 清华大学 /
// 【新词识别】: 他 / 来到 / 了 / 网易 / 杭研 / 大厦 /
// 【搜索引擎模式】: 小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 /
}
// ExampleLoadUserDictionary cuts the same sentence before and after loading
// userdict.txt into the shared segmenter and prints both results.
// NOTE(review): the error returned by LoadUserDictionary is discarded.
func ExampleLoadUserDictionary() {
var sentence = "李小福是创新办主任也是云计算方面的专家"
fmt.Print("Before:")
print(seg.Cut(sentence, true))
seg.LoadUserDictionary("userdict.txt")
fmt.Print("After:")
print(seg.Cut(sentence, true))
// Output:
// Before: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
// After: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
}
// ExampleTokenize tokenizes one sentence twice with tokenizers built by
// NewJiebaTokenizer, first with the last argument false ("Default Mode") and
// then with it true ("Search Mode"), printing every token's term, byte
// offsets, position and type.
func ExampleTokenize() {
	var sentence = []byte("永和服装饰品有限公司")

	// One entry per run: the heading to print and the value passed as the
	// last argument of NewJiebaTokenizer.
	modes := []struct {
		heading string
		search  bool
	}{
		{heading: "Default Mode:", search: false},
		{heading: "Search Mode:", search: true},
	}

	for _, mode := range modes {
		tokenizer, _ := jiebago.NewJiebaTokenizer("dict.txt", true, mode.search)
		fmt.Println(mode.heading)
		for _, token := range tokenizer.Tokenize(sentence) {
			fmt.Printf(
				"Term: %s Start: %d End: %d Position: %d Type: %d\n",
				token.Term, token.Start, token.End, token.Position, token.Type)
		}
	}
	// Output:
	// Default Mode:
	// Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
	// Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
	// Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
	// Term: 有限公司 Start: 18 End: 30 Position: 4 Type: 1
	// Search Mode:
	// Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
	// Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
	// Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
	// Term: 有限 Start: 18 End: 24 Position: 4 Type: 1
	// Term: 公司 Start: 24 End: 30 Position: 5 Type: 1
	// Term: 有限公司 Start: 18 End: 30 Position: 6 Type: 1
}

View File

@@ -1,3 +1,4 @@
// Package finalseg is the Golang implementation of Jieba's finalseg module.
package finalseg
import (
@@ -13,10 +14,10 @@ func cutHan(sentence string) chan string {
result := make(chan string)
go func() {
runes := []rune(sentence)
_, pos_list := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
_, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
begin, next := 0, 0
for i, char := range runes {
pos := pos_list[i]
pos := posList[i]
switch pos {
case 'B':
begin = i
@@ -36,6 +37,8 @@ func cutHan(sentence string) chan string {
return result
}
// Cut cuts sentence into words using Hidden Markov Model with Viterbi
// algorithm. It is used by Jiebago for unknown words.
func Cut(sentence string) chan string {
result := make(chan string)
s := sentence

View File

@@ -6,7 +6,7 @@ import (
)
func chanToArray(ch chan string) []string {
result := make([]string, 0)
var result []string
for word := range ch {
result = append(result, word)
}
@@ -18,11 +18,11 @@ func TestViterbi(t *testing.T) {
states := []byte{'B', 'M', 'E', 'S'}
prob, path := viterbi([]rune(obs), states)
if math.Abs(prob+39.68824128493802) > 1e-10 {
t.Error(prob)
t.Fatal(prob)
}
for index, state := range []byte{'B', 'E', 'S', 'B', 'M', 'E'} {
if path[index] != state {
t.Error(path)
t.Fatal(path)
}
}
}
@@ -31,16 +31,16 @@ func TestCutHan(t *testing.T) {
obs := "我们是程序员"
result := chanToArray(cutHan(obs))
if len(result) != 3 {
t.Error(result)
t.Fatal(result)
}
if result[0] != "我们" {
t.Error(result[0])
t.Fatal(result[0])
}
if result[1] != "是" {
t.Error(result[1])
t.Fatal(result[1])
}
if result[2] != "程序员" {
t.Error(result[2])
t.Fatal(result[2])
}
}
@@ -48,24 +48,24 @@ func TestCut(t *testing.T) {
sentence := "我们是程序员"
result := chanToArray(Cut(sentence))
if len(result) != 3 {
t.Error(len(result))
t.Fatal(len(result))
}
if result[0] != "我们" {
t.Error(result[0])
t.Fatal(result[0])
}
if result[1] != "是" {
t.Error(result[1])
t.Fatal(result[1])
}
if result[2] != "程序员" {
t.Error(result[2])
t.Fatal(result[2])
}
result2 := chanToArray(Cut("I'm a programmer!"))
if len(result2) != 8 {
t.Error(result2)
t.Fatal(result2)
}
result3 := chanToArray(Cut("程序员average年龄28.6岁。"))
if len(result3) != 6 {
t.Error(result3)
t.Fatal(result3)
}
}

View File

@@ -67,11 +67,11 @@ func viterbi(obs []rune, states []byte) (float64, []byte) {
V[t] = make(map[byte]float64)
for _, y := range states {
ps0 := make(probStates, 0)
var em_p float64
var emP float64
if val, ok := probEmit[y][obs[t]]; ok {
em_p = val
emP = val
} else {
em_p = minFloat
emP = minFloat
}
for _, y0 := range prevStatus[y] {
var transP float64
@@ -80,7 +80,7 @@ func viterbi(obs []rune, states []byte) (float64, []byte) {
} else {
transP = minFloat
}
prob0 := V[t-1][y0] + transP + em_p
prob0 := V[t-1][y0] + transP + emP
ps0 = append(ps0, &probState{prob: prob0, state: y0})
}
sort.Sort(sort.Reverse(ps0))

409
jieba.go
View File

@@ -1,190 +1,151 @@
// Golang implemention of jieba (Python Chinese word segmentation module).
// Package jiebago is the Golang implementation of [Jieba](https://github.com/fxsjy/jieba), Python Chinese text segmentation module.
package jiebago
import (
"fmt"
"github.com/wangbin/jiebago/finalseg"
"math"
"regexp"
"sort"
"github.com/wangbin/jiebago/finalseg"
"github.com/wangbin/jiebago/util"
)
var (
// Word/Tag Map load from user dictionary
UserWordTagTab = make(map[string]string)
reEng = regexp.MustCompile(`[[:alnum:]]`)
reHanCutAll = regexp.MustCompile(`\p{Han}+`)
reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`)
reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
reEng = regexp.MustCompile(`[[:alnum:]]`)
reHanCutAll = regexp.MustCompile(`(\p{Han}+)`)
reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`)
reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
)
type route struct {
Freq float64
Index int
// Segmenter is a Chinese words segmentation struct.
type Segmenter struct {
dict *Dictionary
}
func (r route) String() string {
return fmt.Sprintf("(%f, %d)", r.Freq, r.Index)
// LoadDictionary loads dictionary from given file name. Every time
// LoadDictionary is called, previously loaded dictionary will be cleared.
func (seg *Segmenter) LoadDictionary(fileName string) error {
seg.dict = &Dictionary{freqMap: make(map[string]float64)}
return seg.dict.loadDictionary(fileName)
}
type routes []*route
func (rs routes) Len() int {
return len(rs)
// LoadUserDictionary loads a user specified dictionary, it must be called
// after LoadDictionary, and it will not clear any previous loaded dictionary,
// instead it will override existing entries.
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
return seg.dict.loadDictionary(fileName)
}
func (rs routes) Less(i, j int) bool {
if rs[i].Freq < rs[j].Freq {
return true
}
if rs[i].Freq == rs[j].Freq {
return rs[i].Index < rs[j].Index
}
return false
}
func (rs routes) Swap(i, j int) {
rs[i], rs[j] = rs[j], rs[i]
}
// Split sentence using regular expression.
func RegexpSplit(r *regexp.Regexp, sentence string) []string {
result := make([]string, 0)
locs := r.FindAllStringIndex(sentence, -1)
lastLoc := 0
if len(locs) == 0 {
return []string{sentence}
}
for _, loc := range locs {
if loc[0] == lastLoc {
result = append(result, sentence[loc[0]:loc[1]])
} else {
result = append(result, sentence[lastLoc:loc[0]])
result = append(result, sentence[loc[0]:loc[1]])
}
lastLoc = loc[1]
}
if lastLoc < len(sentence) {
result = append(result, sentence[lastLoc:])
}
return result
}
// Build a directed acyclic graph (DAG) for sentence.
func DAG(sentence string) map[int][]int {
func (seg *Segmenter) dag(runes []rune) map[int][]int {
dag := make(map[int][]int)
runes := []rune(sentence)
n := len(runes)
i := 0
var frag string
var frag []rune
var i int
for k := 0; k < n; k++ {
tmpList := make([]int, 0)
dag[k] = make([]int, 0)
i = k
frag = string(runes[k])
frag = runes[k : k+1]
for {
if freq, ok := Trie.Freq[frag]; !ok {
freq, ok := seg.dict.Frequency(string(frag))
if !ok {
break
} else {
if freq > 0.0 {
tmpList = append(tmpList, i)
}
}
i += 1
if freq > 0.0 {
dag[k] = append(dag[k], i)
}
i++
if i >= n {
break
}
frag = string(runes[k : i+1])
frag = runes[k : i+1]
}
if len(tmpList) == 0 {
tmpList = append(tmpList, k)
if len(dag[k]) == 0 {
dag[k] = append(dag[k], k)
}
dag[k] = tmpList
}
return dag
}
func Calc(sentence string, dag map[int][]int) map[int]*route {
runes := []rune(sentence)
number := len(runes)
rs := make(map[int]*route)
rs[number] = &route{Freq: 0.0, Index: 0}
logTotal := math.Log(Trie.Total)
for idx := number - 1; idx >= 0; idx-- {
candidates := make(routes, 0)
// route is the best continuation found by calc from one rune position.
type route struct {
// frequency is the accumulated log-frequency score of the route.
frequency float64
// index is the position of the last rune of the word chosen at this step.
index int
}
func (seg *Segmenter) calc(runes []rune) map[int]route {
dag := seg.dag(runes)
n := len(runes)
rs := make(map[int]route)
rs[n] = route{frequency: 0.0, index: 0}
var r route
for idx := n - 1; idx >= 0; idx-- {
for _, i := range dag[idx] {
word := string(runes[idx : i+1])
var r *route
if _, ok := Trie.Freq[word]; ok {
r = &route{Freq: math.Log(Trie.Freq[word]) - logTotal + rs[i+1].Freq, Index: i}
if freq, ok := seg.dict.Frequency(string(runes[idx : i+1])); ok {
r = route{frequency: math.Log(freq) - seg.dict.logTotal + rs[i+1].frequency, index: i}
} else {
r = &route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i}
r = route{frequency: math.Log(1.0) - seg.dict.logTotal + rs[i+1].frequency, index: i}
}
if v, ok := rs[idx]; !ok {
rs[idx] = r
} else {
if v.frequency < r.frequency || (v.frequency == r.frequency && v.index < r.index) {
rs[idx] = r
}
}
candidates = append(candidates, r)
}
sort.Sort(sort.Reverse(candidates))
rs[idx] = candidates[0]
}
return rs
}
type cutFunc func(sentence string) chan string
type cutFunc func(sentence string) <-chan string
func cutDAG(sentence string) chan string {
func (seg *Segmenter) cutDAG(sentence string) <-chan string {
result := make(chan string)
go func() {
dag := DAG(sentence)
routes := Calc(sentence, dag)
x := 0
var y int
runes := []rune(sentence)
routes := seg.calc(runes)
var y int
length := len(runes)
buf := make([]rune, 0)
for {
if x >= length {
break
}
y = routes[x].Index + 1
l_word := runes[x:y]
var buf []rune
for x := 0; x < length; {
y = routes[x].index + 1
frag := runes[x:y]
if y-x == 1 {
buf = append(buf, l_word...)
buf = append(buf, frag...)
} else {
if len(buf) > 0 {
bufString := string(buf)
if len(buf) == 1 {
result <- string(buf)
buf = make([]rune, 0)
result <- bufString
} else {
bufString := string(buf)
if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 {
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
for x := range finalseg.Cut(bufString) {
result <- x
}
} else {
for _, elem := range buf {
result <- string(elem) // TODO: I don't get this?
result <- string(elem)
}
}
buf = make([]rune, 0)
}
buf = make([]rune, 0)
}
result <- string(l_word)
result <- string(frag)
}
x = y
}
if len(buf) > 0 {
bufString := string(buf)
if len(buf) == 1 {
result <- string(buf)
result <- bufString
} else {
bufString := string(buf)
if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 {
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
for t := range finalseg.Cut(bufString) {
result <- t
}
} else {
for _, elem := range buf {
result <- string(elem) // TODO: I don't get this?
result <- string(elem)
}
}
}
@@ -194,32 +155,27 @@ func cutDAG(sentence string) chan string {
return result
}
func cutDAGNoHMM(sentence string) chan string {
func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string {
result := make(chan string)
go func() {
dag := DAG(sentence)
routes := Calc(sentence, dag)
x := 0
var y int
runes := []rune(sentence)
routes := seg.calc(runes)
var y int
length := len(runes)
buf := make([]rune, 0)
for {
if x >= length {
break
}
y = routes[x].Index + 1
l_word := runes[x:y]
if reEng.MatchString(string(l_word)) && len(l_word) == 1 {
buf = append(buf, l_word...)
var buf []rune
for x := 0; x < length; {
y = routes[x].index + 1
frag := runes[x:y]
if reEng.MatchString(string(frag)) && len(frag) == 1 {
buf = append(buf, frag...)
x = y
} else {
if len(buf) > 0 {
result <- string(buf)
buf = make([]rune, 0)
}
result <- string(l_word)
result <- string(frag)
x = y
}
}
@@ -232,101 +188,37 @@ func cutDAGNoHMM(sentence string) chan string {
return result
}
func cutAll(sentence string) chan string {
// Cut cuts a sentence into words using accurate mode.
// Parameter hmm controls whether to use the Hidden Markov Model.
// Accurate mode attempts to cut the sentence into the most accurate
// segmentations, which is suitable for text analysis.
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan string {
result := make(chan string)
var cut cutFunc
if hmm {
cut = seg.cutDAG
} else {
cut = seg.cutDAGNoHMM
}
go func() {
runes := []rune(sentence)
dag := DAG(sentence)
old_j := -1
ks := make([]int, 0)
for k := range dag {
ks = append(ks, k)
}
sort.Ints(ks)
for k := range ks {
l := dag[k]
if len(l) == 1 && k > old_j {
result <- string(runes[k : l[0]+1])
old_j = l[0]
} else {
for _, j := range l {
if j > k {
result <- string(runes[k : j+1])
old_j = j
}
}
}
}
close(result)
}()
return result
}
/*
Cut sentence.
isCutAll controls use full cut mode or accurate mode.
Full Mode gets all the possible words from the sentence. Fast but not accurate.
Accurate Mode attempts to cut the sentence into the most accurate segmentations,
which is suitable for text analysis.
HMM contols whether to use the Hidden Markov Mode.
*/
func Cut(sentence string, isCutAll bool, HMM bool) chan string {
result := make(chan string)
go func() {
var reHan, reSkip *regexp.Regexp
if isCutAll {
reHan = reHanCutAll
reSkip = reSkipCutAll
} else {
reHan = reHanDefault
reSkip = reSkipDefault
}
blocks := RegexpSplit(reHan, sentence)
var cut cutFunc
if HMM {
cut = cutDAG
} else {
cut = cutDAGNoHMM
}
if isCutAll {
cut = cutAll
}
for _, blk := range blocks {
if len(blk) == 0 {
for _, block := range util.RegexpSplit(reHanDefault, sentence, -1) {
if len(block) == 0 {
continue
}
if reHan.MatchString(blk) {
for x := range cut(blk) {
if reHanDefault.MatchString(block) {
for x := range cut(block) {
result <- x
}
} else {
type skipSplitFunc func(sentence string) []string
var ssf skipSplitFunc
if isCutAll {
ssf = func(sentence string) []string {
return reSkip.Split(sentence, -1)
}
} else {
ssf = func(sentence string) []string {
return RegexpSplit(reSkip, sentence)
}
continue
}
for _, subBlock := range util.RegexpSplit(reSkipDefault, block, -1) {
if reSkipDefault.MatchString(subBlock) {
result <- subBlock
continue
}
for _, x := range ssf(blk) {
if reSkip.MatchString(x) {
result <- x
} else if !isCutAll {
for _, xx := range x {
result <- string(xx)
}
} else {
result <- x
}
for _, r := range subBlock {
result <- string(r)
}
}
}
@@ -335,22 +227,79 @@ func Cut(sentence string, isCutAll bool, HMM bool) chan string {
return result
}
// Cut sentence using Search Engine Mode, based on the Accurate Mode, attempts
// to cut long words into several short words, which can raise the recall rate.
// Suitable for search engines.
func CutForSearch(sentence string, hmm bool) chan string {
func (seg *Segmenter) cutAll(sentence string) <-chan string {
result := make(chan string)
go func() {
for word := range Cut(sentence, false, hmm) {
runes := []rune(sentence)
dag := seg.dag(runes)
start := -1
ks := make([]int, len(dag))
for k := range dag {
ks[k] = k
}
var l []int
for k := range ks {
l = dag[k]
if len(l) == 1 && k > start {
result <- string(runes[k : l[0]+1])
start = l[0]
continue
}
for _, j := range l {
if j > k {
result <- string(runes[k : j+1])
start = j
}
}
}
close(result)
}()
return result
}
// CutAll cuts a sentence into words using full mode.
// Full mode gets all the possible words from the sentence.
// Fast but not accurate.
//
// The sentence is split with reHanCutAll. Blocks matching that expression go
// through cutAll; every other non-empty block is split with reSkipCutAll and
// its pieces are emitted as they are. The returned channel is closed once all
// words have been sent.
func (seg *Segmenter) CutAll(sentence string) <-chan string {
	out := make(chan string)
	go func() {
		for _, block := range util.RegexpSplit(reHanCutAll, sentence, -1) {
			switch {
			case len(block) == 0:
				// nothing to emit for an empty block
			case reHanCutAll.MatchString(block):
				for word := range seg.cutAll(block) {
					out <- word
				}
			default:
				for _, piece := range reSkipCutAll.Split(block, -1) {
					out <- piece
				}
			}
		}
		close(out)
	}()
	return out
}
// CutForSearch cuts sentence into words using search engine mode.
// Search engine mode, based on the accurate mode, attempts to cut long words
// into several short words, which can raise the recall rate.
// Suitable for search engines.
func (seg *Segmenter) CutForSearch(sentence string, hmm bool) <-chan string {
result := make(chan string)
go func() {
for word := range seg.Cut(sentence, hmm) {
runes := []rune(word)
for _, increment := range []int{2, 3} {
if len(runes) > increment {
var gram2 string
for i := 0; i < len(runes)-increment+1; i++ {
gram2 = string(runes[i : i+increment])
if v, ok := Trie.Freq[gram2]; ok && v > 0.0 {
result <- gram2
}
if len(runes) <= increment {
continue
}
var gram string
for i := 0; i < len(runes)-increment+1; i++ {
gram = string(runes[i : i+increment])
if v, ok := seg.dict.Frequency(gram); ok && v > 0.0 {
result <- gram
}
}
}

View File

@@ -1,12 +1,10 @@
package jiebago
import (
"regexp"
"testing"
)
import "testing"
var (
test_contents = []string{
seg Segmenter
testContents = []string{
"这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。",
"我不喜欢日本和服。",
"雷猴回归人间。",
@@ -618,11 +616,11 @@ var (
)
func init() {
SetDictionary("dict.txt")
seg.LoadDictionary("dict.txt")
}
func chanToArray(ch chan string) []string {
result := make([]string, 0)
func chanToArray(ch <-chan string) []string {
var result []string
for word := range ch {
result = append(result, word)
}
@@ -630,43 +628,32 @@ func chanToArray(ch chan string) []string {
}
func TestCutDAG(t *testing.T) {
result := chanToArray(cutDAG("BP神经网络如何训练才能在分类时增加区分度"))
result := chanToArray(seg.cutDAG("BP神经网络如何训练才能在分类时增加区分度"))
if len(result) != 11 {
t.Error(result)
t.Fatal(result)
}
}
func TestCutDAGNoHmm(t *testing.T) {
result := chanToArray(cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度"))
result := chanToArray(seg.cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度"))
if len(result) != 11 {
t.Error(result)
}
}
func TestRegexpSplit(t *testing.T) {
result := RegexpSplit(regexp.MustCompile(`\p{Han}+`),
"BP神经网络如何训练才能在分类时增加区分度")
if len(result) != 3 {
t.Error(result)
}
result = RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`),
",BP神经网络如何训练才能在分类时#增加区分度?")
if len(result) != 3 {
t.Error(result)
t.Fatal(result)
}
}
func TestDefaultCut(t *testing.T) {
var result []string
for index, content := range test_contents {
result = chanToArray(Cut(content, false, true))
for index, content := range testContents {
result = chanToArray(seg.Cut(content, true))
if len(result) != len(defaultCutResult[index]) {
t.Errorf("default cut for %s length should be %d not %d\n",
content, len(defaultCutResult[index]), len(result))
t.Errorf("expect: %v\n", defaultCutResult[index])
t.Fatalf("got: %v\n", result)
}
for i, r := range result {
if r != defaultCutResult[index][i] {
t.Error(r)
t.Fatal(r)
}
}
}
@@ -674,15 +661,17 @@ func TestDefaultCut(t *testing.T) {
func TestCutAll(t *testing.T) {
var result []string
for index, content := range test_contents {
result = chanToArray(Cut(content, true, true))
for index, content := range testContents {
result = chanToArray(seg.CutAll(content))
if len(result) != len(cutAllResult[index]) {
t.Errorf("cut all for %s length should be %d not %d\n",
content, len(cutAllResult[index]), len(result))
t.Errorf("expect: %v\n", defaultCutResult[index])
t.Fatalf("got: %v\n", result)
}
for i, c := range result {
if c != cutAllResult[index][i] {
t.Error(c)
t.Fatal(c)
}
}
}
@@ -690,15 +679,15 @@ func TestCutAll(t *testing.T) {
func TestDefaultCutNoHMM(t *testing.T) {
var result []string
for index, content := range test_contents {
result = chanToArray(Cut(content, false, false))
for index, content := range testContents {
result = chanToArray(seg.Cut(content, false))
if len(result) != len(defaultCutNoHMMResult[index]) {
t.Errorf("default cut no hmm for %s length should be %d not %d\n",
t.Fatalf("default cut no hmm for %s length should be %d not %d\n",
content, len(defaultCutNoHMMResult[index]), len(result))
}
for i, c := range result {
if c != defaultCutNoHMMResult[index][i] {
t.Error(c)
t.Fatal(c)
}
}
}
@@ -706,88 +695,129 @@ func TestDefaultCutNoHMM(t *testing.T) {
func TestCutForSearch(t *testing.T) {
var result []string
for index, content := range test_contents {
result = chanToArray(CutForSearch(content, true))
for index, content := range testContents {
result = chanToArray(seg.CutForSearch(content, true))
if len(result) != len(cutForSearchResult[index]) {
t.Errorf("cut for search for %s length should be %d not %d\n",
t.Fatalf("cut for search for %s length should be %d not %d\n",
content, len(cutForSearchResult[index]), len(result))
}
for i, c := range result {
if c != cutForSearchResult[index][i] {
t.Error(c)
t.Fatal(c)
}
}
}
for index, content := range test_contents {
result = chanToArray(CutForSearch(content, false))
for index, content := range testContents {
result = chanToArray(seg.CutForSearch(content, false))
if len(result) != len(cutForSearchNoHMMResult[index]) {
t.Errorf("cut for search no hmm for %s length should be %d not %d\n",
t.Fatalf("cut for search no hmm for %s length should be %d not %d\n",
content, len(cutForSearchNoHMMResult[index]), len(result))
}
for i, c := range result {
if c != cutForSearchNoHMMResult[index][i] {
t.Error(c)
t.Fatal(c)
}
}
}
}
func TestSetdictionary(t *testing.T) {
func TestLoadDictionary(t *testing.T) {
var result []string
SetDictionary("foobar.txt")
for index, content := range test_contents {
result = chanToArray(Cut(content, false, true))
seg.LoadDictionary("foobar.txt")
for index, content := range testContents {
result = chanToArray(seg.Cut(content, true))
if len(result) != len(userDictCutResult[index]) {
t.Errorf("default cut with user dictionary for %s length should be %d not %d\n",
t.Fatalf("default cut with user dictionary for %s length should be %d not %d\n",
content, len(userDictCutResult[index]), len(result))
}
for i, c := range result {
if c != userDictCutResult[index][i] {
t.Error(c)
t.Fatal(c)
}
}
}
seg.LoadDictionary("dict.txt")
}
func TestLoadUserDict(t *testing.T) {
SetDictionary("dict.txt")
LoadUserDict("userdict.txt")
func TestLoadUserDictionary(t *testing.T) {
seg.LoadUserDictionary("userdict.txt")
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型"
result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", "", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"}
words := chanToArray(Cut(sentence, false, true))
words := chanToArray(seg.Cut(sentence, true))
if len(words) != len(result) {
t.Error(len(words))
t.Fatal(len(words))
}
for index, word := range words {
if word != result[index] {
t.Error(word)
t.Fatal(word)
}
}
sentence = "easy_install is great"
result = []string{"easy_install", " ", "is", " ", "great"}
words = chanToArray(Cut(sentence, false, true))
words = chanToArray(seg.Cut(sentence, true))
if len(words) != len(result) {
t.Error(len(words))
t.Fatal(len(words))
}
for index, word := range words {
if word != result[index] {
t.Error(word)
t.Fatal(word)
}
}
sentence = "python 的正则表达式是好用的"
result = []string{"python", " ", "的", "正则表达式", "是", "好用", "的"}
words = chanToArray(Cut(sentence, false, true))
words = chanToArray(seg.Cut(sentence, true))
if len(words) != len(result) {
t.Error(words)
t.Error(result)
t.Fatal(words)
t.Fatal(result)
}
for index, word := range words {
if word != result[index] {
t.Error(word)
t.Fatal(word)
}
}
seg.LoadDictionary("dict.txt")
}
// BenchmarkCutNoHMM measures Cut with the HMM disabled on a fixed sentence,
// draining the result channel on every iteration.
func BenchmarkCutNoHMM(b *testing.B) {
	const sentence = "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		chanToArray(seg.Cut(sentence, false))
	}
}
// BenchmarkCut measures Cut with the HMM enabled on a fixed sentence,
// draining the result channel on every iteration.
func BenchmarkCut(b *testing.B) {
	const sentence = "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		chanToArray(seg.Cut(sentence, true))
	}
}
// BenchmarkCutAll measures CutAll on a fixed sentence, draining the result
// channel on every iteration.
func BenchmarkCutAll(b *testing.B) {
	const sentence = "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		chanToArray(seg.CutAll(sentence))
	}
}
// BenchmarkCutForSearchNoHMM measures CutForSearch with the HMM disabled on
// a fixed sentence, draining the result channel on every iteration.
func BenchmarkCutForSearchNoHMM(b *testing.B) {
	const sentence = "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		chanToArray(seg.CutForSearch(sentence, false))
	}
}
// BenchmarkCutForSearch measures CutForSearch with the HMM enabled on a
// fixed sentence, draining the result channel on every iteration.
func BenchmarkCutForSearch(b *testing.B) {
	const sentence = "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		chanToArray(seg.CutForSearch(sentence, true))
	}
}

File diff suppressed because it is too large Load Diff

74
posseg/dictionary.go Normal file
View File

@@ -0,0 +1,74 @@
package posseg
import (
"math"
"sync"
"github.com/wangbin/jiebago/dictionary"
)
// Dictionary holds the word frequencies and POS tags used for word
// segmentation. Its embedded RWMutex guards the maps and totals.
type Dictionary struct {
	// total is the sum of the frequencies of all tokens added so far.
	total float64
	// logTotal caches math.Log(total).
	logTotal float64
	// freqMap maps a word, or a prefix of a word, to its frequency;
	// prefixes that are not words themselves are stored with frequency 0.
	freqMap map[string]float64
	// posMap maps a word to its POS tag.
	posMap map[string]string
	sync.RWMutex
}
// Load adds every token received from ch to the dictionary and then
// refreshes the cached logarithm of the total frequency. It returns once ch
// has been closed and drained.
func (d *Dictionary) Load(ch <-chan dictionary.Token) {
	d.Lock()
	defer d.Unlock()
	for token := range ch {
		d.addToken(token)
	}
	// Recompute while still holding the write lock: updateLogTotal reads
	// total and writes logTotal, so running it after Unlock would race with
	// concurrent Load/AddToken calls.
	d.updateLogTotal()
}
// AddToken adds a single token to the dictionary and refreshes the cached
// logarithm of the total frequency.
func (d *Dictionary) AddToken(token dictionary.Token) {
	d.Lock()
	defer d.Unlock()
	d.addToken(token)
	// Recompute while still holding the write lock: updateLogTotal reads
	// total and writes logTotal, so running it after Unlock would race with
	// concurrent Load/AddToken calls.
	d.updateLogTotal()
}
// addToken records the token's frequency, adds that frequency to the running
// total, registers every rune-prefix of the token text that is not yet known
// with frequency 0, and stores the POS tag when the token carries one.
// It performs no locking itself.
func (d *Dictionary) addToken(token dictionary.Token) {
	text, freq := token.Text(), token.Frequency()
	d.freqMap[text] = freq
	d.total += freq

	// Work on runes so that prefixes never split a multi-byte character.
	runes := []rune(text)
	for end := 1; end <= len(runes); end++ {
		prefix := string(runes[:end])
		if _, known := d.freqMap[prefix]; known {
			continue
		}
		d.freqMap[prefix] = 0.0
	}

	if pos := token.Pos(); len(pos) > 0 {
		d.posMap[text] = pos
	}
}
// updateLogTotal recomputes logTotal as the natural logarithm of total.
// It performs no locking itself.
func (d *Dictionary) updateLogTotal() {
d.logTotal = math.Log(d.total)
}
// Frequency returns the frequency stored for the given word and reports
// whether the word is present in the dictionary. The lookup is done under
// the read lock.
func (d *Dictionary) Frequency(key string) (float64, bool) {
	d.RLock()
	defer d.RUnlock()
	value, found := d.freqMap[key]
	return value, found
}
// Pos returns the POS tag stored for the given word and reports whether a
// tag is present. The lookup is done under the read lock.
func (d *Dictionary) Pos(key string) (string, bool) {
	d.RLock()
	defer d.RUnlock()
	tag, found := d.posMap[key]
	return tag, found
}
// loadDictionary passes d and fileName to dictionary.LoadDictionary and
// returns its error unchanged.
func (d *Dictionary) loadDictionary(fileName string) error {
return dictionary.LoadDictionary(d, fileName)
}

21
posseg/example_test.go Normal file
View File

@@ -0,0 +1,21 @@
package posseg_test
import (
"fmt"
"github.com/wangbin/jiebago/posseg"
)
// Example cuts a short sentence with a posseg.Segmenter and prints each
// word together with its POS tag.
func Example() {
	var seg posseg.Segmenter
	// Surface a dictionary load failure instead of silently ignoring it; the
	// example then fails visibly with the error text as its output.
	if err := seg.LoadDictionary("../dict.txt"); err != nil {
		fmt.Println(err)
		return
	}
	for segment := range seg.Cut("我爱北京天安门", true) {
		fmt.Printf("%s %s\n", segment.Text(), segment.Pos())
	}
	// Output:
	// 我 r
	// 爱 v
	// 北京 ns
	// 天安门 ns
}

View File

@@ -1,14 +1,16 @@
// Package posseg is the Golang implementation of Jieba's posseg module.
package posseg
import (
"github.com/wangbin/jiebago"
"math"
"regexp"
"github.com/wangbin/jiebago/util"
)
var (
wordTagMap = make(map[string]string)
reHanDetail = regexp.MustCompile(`\p{Han}+`)
reSkipDetail = regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`)
reHanDetail = regexp.MustCompile(`(\p{Han}+)`)
reSkipDetail = regexp.MustCompile(`([[\.[:digit:]]+|[:alnum:]]+)`)
reEng = regexp.MustCompile(`[[:alnum:]]`)
reNum = regexp.MustCompile(`[\.[:digit:]]+`)
reEng1 = regexp.MustCompile(`[[:alnum:]]$`)
@@ -16,81 +18,90 @@ var (
reSkipInternal = regexp.MustCompile(`(\r\n|\s)`)
)
type WordTag struct {
Word, Tag string
// Segment represents a word with its POS
type Segment struct {
text, pos string
}
// Set dictionary, it could be absolute path of dictionary file, or dictionary
// name in current diectory.
func SetDictionary(dictFileName string) error {
err := jiebago.SetDictionary(dictFileName)
if err != nil {
return err
}
dictFilePath, err := jiebago.DictPath(dictFileName)
if err != nil {
return err
}
wtfs, err := jiebago.ParseDictFile(dictFilePath)
for _, wtf := range wtfs {
wordTagMap[wtf.Word] = wtf.Tag
}
return nil
// Text returns the text of the segmented word.
func (s Segment) Text() string {
return s.text
}
func cutDetailInternal(sentence string) chan WordTag {
result := make(chan WordTag)
// Pos returns the POS tag of the segmented word.
func (s Segment) Pos() string {
return s.pos
}
// Segmenter segments Chinese text into words tagged with their POS.
// Its dictionary is set by LoadDictionary.
type Segmenter struct {
dict *Dictionary
}
// LoadDictionary loads the dictionary from the given file name.
// Every time LoadDictionary is called, the previously loaded dictionary is
// discarded and replaced by a fresh one before the file is read.
func (seg *Segmenter) LoadDictionary(fileName string) error {
	fresh := &Dictionary{
		freqMap: make(map[string]float64),
		posMap:  make(map[string]string),
	}
	seg.dict = fresh
	return fresh.loadDictionary(fileName)
}
// LoadUserDictionary loads a user-specified dictionary on top of whatever is
// already loaded: existing entries are kept, and entries that also appear in
// the user dictionary are overridden. It is normally called after
// LoadDictionary; if no dictionary has been loaded yet, an empty one is
// created first instead of dereferencing a nil dictionary.
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
	if seg.dict == nil {
		seg.dict = &Dictionary{
			freqMap: make(map[string]float64),
			posMap:  make(map[string]string),
		}
	}
	return seg.dict.loadDictionary(fileName)
}
func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment {
result := make(chan Segment)
go func() {
runes := []rune(sentence)
_, posList := viterbi(runes)
posList := viterbi(runes)
begin := 0
next := 0
for i, char := range runes {
pos := posList[i].State
switch pos {
case 'B':
pos := posList[i]
switch pos.position() {
case "B":
begin = i
case 'E':
result <- WordTag{string(runes[begin : i+1]), posList[i].Tag}
case "E":
result <- Segment{string(runes[begin : i+1]), pos.pos()}
next = i + 1
case 'S':
result <- WordTag{string(char), posList[i].Tag}
case "S":
result <- Segment{string(char), pos.pos()}
next = i + 1
}
}
if next < len(runes) {
result <- WordTag{string(runes[next:]), posList[next].Tag}
result <- Segment{string(runes[next:]), posList[next].pos()}
}
close(result)
}()
return result
}
func cutDetail(sentence string) chan WordTag {
result := make(chan WordTag)
func (seg *Segmenter) cutDetail(sentence string) <-chan Segment {
result := make(chan Segment)
go func() {
blocks := jiebago.RegexpSplit(reHanDetail, sentence)
for _, blk := range blocks {
for _, blk := range util.RegexpSplit(reHanDetail, sentence, -1) {
if reHanDetail.MatchString(blk) {
for wordTag := range cutDetailInternal(blk) {
result <- wordTag
for segment := range seg.cutDetailInternal(blk) {
result <- segment
}
} else {
for _, x := range jiebago.RegexpSplit(reSkipDetail, blk) {
if len(x) == 0 {
continue
}
switch {
case reNum.MatchString(x):
result <- WordTag{x, "m"}
case reEng.MatchString(x):
result <- WordTag{x, "eng"}
default:
result <- WordTag{x, "x"}
}
continue
}
for _, x := range util.RegexpSplit(reSkipDetail, blk, -1) {
if len(x) == 0 {
continue
}
switch {
case reNum.MatchString(x):
result <- Segment{x, "m"}
case reEng.MatchString(x):
result <- Segment{x, "eng"}
default:
result <- Segment{x, "x"}
}
}
}
@@ -99,88 +110,142 @@ func cutDetail(sentence string) chan WordTag {
return result
}
type cutFunc func(sentence string) chan WordTag
func cutDAG(sentence string) chan WordTag {
result := make(chan WordTag)
go func() {
dag := jiebago.DAG(sentence)
routes := jiebago.Calc(sentence, dag)
x := 0
var y int
runes := []rune(sentence)
length := len(runes)
buf := make([]rune, 0)
func (seg *Segmenter) dag(runes []rune) map[int][]int {
dag := make(map[int][]int)
n := len(runes)
var frag []rune
var i int
for k := 0; k < n; k++ {
dag[k] = make([]int, 0)
i = k
frag = runes[k : k+1]
for {
if x >= length {
freq, ok := seg.dict.Frequency(string(frag))
if !ok {
break
}
y = routes[x].Index + 1
l_word := runes[x:y]
if y-x == 1 {
buf = append(buf, l_word...)
} else {
if len(buf) > 0 {
if len(buf) == 1 {
sbuf := string(buf)
if tag, ok := wordTagMap[sbuf]; ok {
result <- WordTag{sbuf, tag}
} else {
result <- WordTag{sbuf, "x"}
}
buf = make([]rune, 0)
} else {
bufString := string(buf)
if v, ok := jiebago.Trie.Freq[bufString]; !ok || v == 0.0 {
for t := range cutDetail(bufString) {
result <- t
}
} else {
for _, elem := range buf {
selem := string(elem)
if tag, ok := wordTagMap[selem]; ok {
result <- WordTag{string(elem), tag}
} else {
result <- WordTag{string(elem), "x"}
}
if freq > 0.0 {
dag[k] = append(dag[k], i)
}
i++
if i >= n {
break
}
frag = runes[k : i+1]
}
if len(dag[k]) == 0 {
dag[k] = append(dag[k], k)
}
}
return dag
}
}
}
buf = make([]rune, 0)
}
}
sl_word := string(l_word)
if tag, ok := wordTagMap[sl_word]; ok {
result <- WordTag{sl_word, tag}
} else {
result <- WordTag{sl_word, "x"}
// route is the best candidate recorded by calc for one rune position:
// frequency is the accumulated log-frequency score from that position to the
// end of the input, and index is the index of the last rune of the word
// chosen at that position.
type route struct {
frequency float64
index int
}
func (seg *Segmenter) calc(runes []rune) map[int]route {
dag := seg.dag(runes)
n := len(runes)
rs := make(map[int]route)
rs[n] = route{frequency: 0.0, index: 0}
var r route
for idx := n - 1; idx >= 0; idx-- {
for _, i := range dag[idx] {
if freq, ok := seg.dict.Frequency(string(runes[idx : i+1])); ok {
r = route{frequency: math.Log(freq) - seg.dict.logTotal + rs[i+1].frequency, index: i}
} else {
r = route{frequency: math.Log(1.0) - seg.dict.logTotal + rs[i+1].frequency, index: i}
}
if v, ok := rs[idx]; !ok {
rs[idx] = r
} else {
if v.frequency < r.frequency || (v.frequency == r.frequency && v.index < r.index) {
rs[idx] = r
}
}
x = y
}
}
return rs
}
if len(buf) > 0 {
if len(buf) == 1 {
sbuf := string(buf)
if tag, ok := wordTagMap[sbuf]; ok {
result <- WordTag{sbuf, tag}
} else {
result <- WordTag{sbuf, "x"}
}
} else {
type cutFunc func(sentence string) <-chan Segment
func (seg *Segmenter) cutDAG(sentence string) <-chan Segment {
result := make(chan Segment)
go func() {
runes := []rune(sentence)
routes := seg.calc(runes)
var y int
length := len(runes)
var buf []rune
for x := 0; x < length; {
y = routes[x].index + 1
frag := runes[x:y]
if y-x == 1 {
buf = append(buf, frag...)
x = y
continue
}
if len(buf) > 0 {
bufString := string(buf)
if v, ok := jiebago.Trie.Freq[bufString]; !ok || v == 0.0 {
for t := range cutDetail(bufString) {
if len(buf) == 1 {
if tag, ok := seg.dict.Pos(bufString); ok {
result <- Segment{bufString, tag}
} else {
result <- Segment{bufString, "x"}
}
buf = make([]rune, 0)
continue
}
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
for t := range seg.cutDetail(bufString) {
result <- t
}
} else {
for _, elem := range buf {
selem := string(elem)
if tag, ok := wordTagMap[selem]; ok {
result <- WordTag{selem, tag}
if tag, ok := seg.dict.Pos(selem); ok {
result <- Segment{selem, tag}
} else {
result <- WordTag{selem, "x"}
result <- Segment{selem, "x"}
}
}
}
buf = make([]rune, 0)
}
word := string(frag)
if tag, ok := seg.dict.Pos(word); ok {
result <- Segment{word, tag}
} else {
result <- Segment{word, "x"}
}
x = y
}
if len(buf) > 0 {
bufString := string(buf)
if len(buf) == 1 {
if tag, ok := seg.dict.Pos(bufString); ok {
result <- Segment{bufString, tag}
} else {
result <- Segment{bufString, "x"}
}
} else {
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
for t := range seg.cutDetail(bufString) {
result <- t
}
} else {
for _, elem := range buf {
selem := string(elem)
if tag, ok := seg.dict.Pos(selem); ok {
result <- Segment{selem, tag}
} else {
result <- Segment{selem, "x"}
}
}
}
@@ -191,42 +256,38 @@ func cutDAG(sentence string) chan WordTag {
return result
}
func cutDAGNoHMM(sentence string) chan WordTag {
result := make(chan WordTag)
func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment {
result := make(chan Segment)
go func() {
dag := jiebago.DAG(sentence)
routes := jiebago.Calc(sentence, dag)
x := 0
var y int
runes := []rune(sentence)
routes := seg.calc(runes)
var y int
length := len(runes)
buf := make([]rune, 0)
for {
if x >= length {
break
}
y = routes[x].Index + 1
l_word := runes[x:y]
if reEng1.MatchString(string(l_word)) && len(l_word) == 1 {
buf = append(buf, l_word...)
var buf []rune
for x := 0; x < length; {
y = routes[x].index + 1
frag := runes[x:y]
if reEng1.MatchString(string(frag)) && len(frag) == 1 {
buf = append(buf, frag...)
x = y
continue
}
if len(buf) > 0 {
result <- Segment{string(buf), "eng"}
buf = make([]rune, 0)
}
word := string(frag)
if tag, ok := seg.dict.Pos(word); ok {
result <- Segment{word, tag}
} else {
if len(buf) > 0 {
result <- WordTag{string(buf), "eng"}
buf = make([]rune, 0)
}
sl_word := string(l_word)
if tag, ok := wordTagMap[sl_word]; ok {
result <- WordTag{sl_word, tag}
} else {
result <- WordTag{sl_word, "x"}
}
x = y
result <- Segment{word, "x"}
}
x = y
}
if len(buf) > 0 {
result <- WordTag{string(buf), "eng"}
result <- Segment{string(buf), "eng"}
buf = make([]rune, 0)
}
close(result)
@@ -234,44 +295,38 @@ func cutDAGNoHMM(sentence string) chan WordTag {
return result
}
// Tags the POS of each word after segmentation, using labels compatible with
// ictclas.
func Cut(sentence string, HMM bool) chan WordTag {
for key := range jiebago.UserWordTagTab {
wordTagMap[key] = jiebago.UserWordTagTab[key]
delete(jiebago.UserWordTagTab, key)
}
result := make(chan WordTag)
blocks := jiebago.RegexpSplit(reHanInternal, sentence)
// Cut cuts a sentence into words.
// Parameter hmm controls whether to use the Hidden Markov Model.
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan Segment {
result := make(chan Segment)
var cut cutFunc
if HMM {
cut = cutDAG
if hmm {
cut = seg.cutDAG
} else {
cut = cutDAGNoHMM
cut = seg.cutDAGNoHMM
}
go func() {
for _, blk := range blocks {
for _, blk := range util.RegexpSplit(reHanInternal, sentence, -1) {
if reHanInternal.MatchString(blk) {
for wordTag := range cut(blk) {
result <- wordTag
}
} else {
for _, x := range jiebago.RegexpSplit(reSkipInternal, blk) {
if reSkipInternal.MatchString(x) {
result <- WordTag{x, "x"}
} else {
for _, xx := range x {
s := string(xx)
switch {
case reNum.MatchString(s):
result <- WordTag{s, "m"}
case reEng.MatchString(x):
result <- WordTag{x, "eng"}
break
default:
result <- WordTag{s, "x"}
}
}
continue
}
for _, x := range util.RegexpSplit(reSkipInternal, blk, -1) {
if reSkipInternal.MatchString(x) {
result <- Segment{x, "x"}
continue
}
for _, xx := range x {
s := string(xx)
switch {
case reNum.MatchString(s):
result <- Segment{s, "m"}
case reEng.MatchString(x):
result <- Segment{x, "eng"}
default:
result <- Segment{s, "x"}
}
}
}

View File

@@ -1,12 +1,12 @@
package posseg
import (
"github.com/wangbin/jiebago"
"testing"
)
var (
test_contents = []string{
seg Segmenter
testContents = []string{
"这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。",
"我不喜欢日本和服。",
"雷猴回归人间。",
@@ -93,183 +93,187 @@ var (
"你认识那个和主席握手的的哥吗?他开一辆黑色的士。",
"枪杆子中出政权"}
defaultCutResult = [][]WordTag{[]WordTag{WordTag{"这", "r"}, WordTag{"是", "v"}, WordTag{"一个", "m"}, WordTag{"伸手不见五指", "i"}, WordTag{"的", "uj"}, WordTag{"黑夜", "n"}, WordTag{"。", "x"}, WordTag{"我", "r"}, WordTag{"叫", "v"}, WordTag{"孙悟空", "nr"}, WordTag{"", "x"}, WordTag{"我", "r"}, WordTag{"爱", "v"}, WordTag{"北京", "ns"}, WordTag{"", "x"}, WordTag{"我", "r"}, WordTag{"爱", "v"}, WordTag{"Python", "eng"}, WordTag{"和", "c"}, WordTag{"C++", "nz"}, WordTag{"。", "x"}},
[]WordTag{WordTag{"我", "r"}, WordTag{"不", "d"}, WordTag{"喜欢", "v"}, WordTag{"日本", "ns"}, WordTag{"和服", "nz"}, WordTag{"。", "x"}},
[]WordTag{WordTag{"雷猴", "n"}, WordTag{"回归", "v"}, WordTag{"人间", "n"}, WordTag{"。", "x"}},
[]WordTag{WordTag{"工信处", "n"}, WordTag{"女干事", "n"}, WordTag{"每月", "r"}, WordTag{"经过", "p"}, WordTag{"下属", "v"}, WordTag{"科室", "n"}, WordTag{"都", "d"}, WordTag{"要", "v"}, WordTag{"亲口", "n"}, WordTag{"交代", "n"}, WordTag{"24", "m"}, WordTag{"口", "n"}, WordTag{"交换机", "n"}, WordTag{"等", "u"}, WordTag{"技术性", "n"}, WordTag{"器件", "n"}, WordTag{"的", "uj"}, WordTag{"安装", "v"}, WordTag{"工作", "vn"}},
[]WordTag{WordTag{"我", "r"}, WordTag{"需要", "v"}, WordTag{"廉租房", "n"}},
[]WordTag{WordTag{"永和", "nz"}, WordTag{"服装", "vn"}, WordTag{"饰品", "n"}, WordTag{"有限公司", "n"}},
[]WordTag{WordTag{"我", "r"}, WordTag{"爱", "v"}, WordTag{"北京", "ns"}, WordTag{"天安门", "ns"}},
[]WordTag{WordTag{"abc", "eng"}},
[]WordTag{WordTag{"隐", "n"}, WordTag{"马尔可夫", "nr"}},
[]WordTag{WordTag{"雷猴", "n"}, WordTag{"是", "v"}, WordTag{"个", "q"}, WordTag{"好", "a"}, WordTag{"网站", "n"}},
[]WordTag{WordTag{"“", "x"}, WordTag{"Microsoft", "eng"}, WordTag{"”", "x"}, WordTag{"一", "m"}, WordTag{"词", "n"}, WordTag{"由", "p"}, WordTag{"“", "x"}, WordTag{"MICROcomputer", "eng"}, WordTag{"", "x"}, WordTag{"微型", "b"}, WordTag{"计算机", "n"}, WordTag{"", "x"}, WordTag{"”", "x"}, WordTag{"和", "c"}, WordTag{"“", "x"}, WordTag{"SOFTware", "eng"}, WordTag{"", "x"}, WordTag{"软件", "n"}, WordTag{"", "x"}, WordTag{"”", "x"}, WordTag{"两", "m"}, WordTag{"部分", "n"}, WordTag{"组成", "v"}},
[]WordTag{WordTag{"草泥马", "n"}, WordTag{"和", "c"}, WordTag{"欺实", "v"}, WordTag{"马", "n"}, WordTag{"是", "v"}, WordTag{"今年", "t"}, WordTag{"的", "uj"}, WordTag{"流行", "v"}, WordTag{"词汇", "n"}},
[]WordTag{WordTag{"伊藤", "nr"}, WordTag{"洋华堂", "n"}, WordTag{"总府", "n"}, WordTag{"店", "n"}},
[]WordTag{WordTag{"中国科学院计算技术研究所", "nt"}},
[]WordTag{WordTag{"罗密欧", "nr"}, WordTag{"与", "p"}, WordTag{"朱丽叶", "nr"}},
[]WordTag{WordTag{"我", "r"}, WordTag{"购买", "v"}, WordTag{"了", "ul"}, WordTag{"道具", "n"}, WordTag{"和", "c"}, WordTag{"服装", "vn"}},
[]WordTag{WordTag{"PS", "eng"}, WordTag{":", "x"}, WordTag{" ", "x"}, WordTag{"我", "r"}, WordTag{"觉得", "v"}, WordTag{"开源", "n"}, WordTag{"有", "v"}, WordTag{"一个", "m"}, WordTag{"好处", "d"}, WordTag{"", "x"}, WordTag{"就是", "d"}, WordTag{"能够", "v"}, WordTag{"敦促", "v"}, WordTag{"自己", "r"}, WordTag{"不断改进", "l"}, WordTag{"", "x"}, WordTag{"避免", "v"}, WordTag{"敞", "v"}, WordTag{"帚", "ng"}, WordTag{"自珍", "b"}},
[]WordTag{WordTag{"湖北省", "ns"}, WordTag{"石首市", "ns"}},
[]WordTag{WordTag{"湖北省", "ns"}, WordTag{"十堰市", "ns"}},
[]WordTag{WordTag{"总经理", "n"}, WordTag{"完成", "v"}, WordTag{"了", "ul"}, WordTag{"这件", "mq"}, WordTag{"事情", "n"}},
[]WordTag{WordTag{"电脑", "n"}, WordTag{"修好", "v"}, WordTag{"了", "ul"}},
[]WordTag{WordTag{"做好", "v"}, WordTag{"了", "ul"}, WordTag{"这件", "mq"}, WordTag{"事情", "n"}, WordTag{"就", "d"}, WordTag{"一了百了", "l"}, WordTag{"了", "ul"}},
[]WordTag{WordTag{"人们", "n"}, WordTag{"审美", "vn"}, WordTag{"的", "uj"}, WordTag{"观点", "n"}, WordTag{"是", "v"}, WordTag{"不同", "a"}, WordTag{"的", "uj"}},
[]WordTag{WordTag{"我们", "r"}, WordTag{"买", "v"}, WordTag{"了", "ul"}, WordTag{"一个", "m"}, WordTag{"美的", "nr"}, WordTag{"空调", "n"}},
[]WordTag{WordTag{"线程", "n"}, WordTag{"初始化", "l"}, WordTag{"时", "n"}, WordTag{"我们", "r"}, WordTag{"要", "v"}, WordTag{"注意", "v"}},
[]WordTag{WordTag{"一个", "m"}, WordTag{"分子", "n"}, WordTag{"是", "v"}, WordTag{"由", "p"}, WordTag{"好多", "m"}, WordTag{"原子", "n"}, WordTag{"组织", "v"}, WordTag{"成", "v"}, WordTag{"的", "uj"}},
[]WordTag{WordTag{"祝", "v"}, WordTag{"你", "r"}, WordTag{"马到功成", "i"}},
[]WordTag{WordTag{"他", "r"}, WordTag{"掉", "v"}, WordTag{"进", "v"}, WordTag{"了", "ul"}, WordTag{"无底洞", "ns"}, WordTag{"里", "f"}},
[]WordTag{WordTag{"中国", "ns"}, WordTag{"的", "uj"}, WordTag{"首都", "d"}, WordTag{"是", "v"}, WordTag{"北京", "ns"}},
[]WordTag{WordTag{"孙君意", "nr"}},
[]WordTag{WordTag{"外交部", "nt"}, WordTag{"发言人", "l"}, WordTag{"马朝旭", "nr"}},
[]WordTag{WordTag{"领导人", "n"}, WordTag{"会议", "n"}, WordTag{"和", "c"}, WordTag{"第四届", "m"}, WordTag{"东亚", "ns"}, WordTag{"峰会", "n"}},
[]WordTag{WordTag{"在", "p"}, WordTag{"过去", "t"}, WordTag{"的", "uj"}, WordTag{"这", "r"}, WordTag{"五年", "t"}},
[]WordTag{WordTag{"还", "d"}, WordTag{"需要", "v"}, WordTag{"很", "d"}, WordTag{"长", "a"}, WordTag{"的", "uj"}, WordTag{"路", "n"}, WordTag{"要", "v"}, WordTag{"走", "v"}},
[]WordTag{WordTag{"60", "m"}, WordTag{"周年", "t"}, WordTag{"首都", "d"}, WordTag{"阅兵", "v"}},
[]WordTag{WordTag{"你好", "l"}, WordTag{"人们", "n"}, WordTag{"审美", "vn"}, WordTag{"的", "uj"}, WordTag{"观点", "n"}, WordTag{"是", "v"}, WordTag{"不同", "a"}, WordTag{"的", "uj"}},
[]WordTag{WordTag{"买", "v"}, WordTag{"水果", "n"}, WordTag{"然后", "c"}, WordTag{"来", "v"}, WordTag{"世博园", "nr"}},
[]WordTag{WordTag{"买", "v"}, WordTag{"水果", "n"}, WordTag{"然后", "c"}, WordTag{"去", "v"}, WordTag{"世博园", "nr"}},
[]WordTag{WordTag{"但是", "c"}, WordTag{"后来", "t"}, WordTag{"我", "r"}, WordTag{"才", "d"}, WordTag{"知道", "v"}, WordTag{"你", "r"}, WordTag{"是", "v"}, WordTag{"对", "p"}, WordTag{"的", "uj"}},
[]WordTag{WordTag{"存在", "v"}, WordTag{"即", "v"}, WordTag{"合理", "vn"}},
[]WordTag{WordTag{"的的", "u"}, WordTag{"的的", "u"}, WordTag{"的", "uj"}, WordTag{"在的", "u"}, WordTag{"的的", "u"}, WordTag{"的", "uj"}, WordTag{"就", "d"}, WordTag{"以", "p"}, WordTag{"和和", "nz"}, WordTag{"和", "c"}},
[]WordTag{WordTag{"I", "x"}, WordTag{" ", "x"}, WordTag{"love", "eng"}, WordTag{"你", "r"}, WordTag{"", "x"}, WordTag{"不以为耻", "i"}, WordTag{"", "x"}, WordTag{"反", "zg"}, WordTag{"以为", "c"}, WordTag{"rong", "eng"}},
[]WordTag{WordTag{"因", "p"}},
[]WordTag{},
[]WordTag{WordTag{"hello", "eng"}, WordTag{"你好", "l"}, WordTag{"人们", "n"}, WordTag{"审美", "vn"}, WordTag{"的", "uj"}, WordTag{"观点", "n"}, WordTag{"是", "v"}, WordTag{"不同", "a"}, WordTag{"的", "uj"}},
[]WordTag{WordTag{"很好", "a"}, WordTag{"但", "c"}, WordTag{"主要", "b"}, WordTag{"是", "v"}, WordTag{"基于", "p"}, WordTag{"网页", "n"}, WordTag{"形式", "n"}},
[]WordTag{WordTag{"hello", "eng"}, WordTag{"你好", "l"}, WordTag{"人们", "n"}, WordTag{"审美", "vn"}, WordTag{"的", "uj"}, WordTag{"观点", "n"}, WordTag{"是", "v"}, WordTag{"不同", "a"}, WordTag{"的", "uj"}},
[]WordTag{WordTag{"为什么", "r"}, WordTag{"我", "r"}, WordTag{"不能", "v"}, WordTag{"拥有", "v"}, WordTag{"想要", "v"}, WordTag{"的", "uj"}, WordTag{"生活", "vn"}},
[]WordTag{WordTag{"后来", "t"}, WordTag{"我", "r"}, WordTag{"才", "d"}},
[]WordTag{WordTag{"此次", "r"}, WordTag{"来", "v"}, WordTag{"中国", "ns"}, WordTag{"是", "v"}, WordTag{"为了", "p"}},
[]WordTag{WordTag{"使用", "v"}, WordTag{"了", "ul"}, WordTag{"它", "r"}, WordTag{"就", "d"}, WordTag{"可以", "c"}, WordTag{"解决", "v"}, WordTag{"一些", "m"}, WordTag{"问题", "n"}},
[]WordTag{WordTag{",", "x"}, WordTag{"使用", "v"}, WordTag{"了", "ul"}, WordTag{"它", "r"}, WordTag{"就", "d"}, WordTag{"可以", "c"}, WordTag{"解决", "v"}, WordTag{"一些", "m"}, WordTag{"问题", "n"}},
[]WordTag{WordTag{"其实", "d"}, WordTag{"使用", "v"}, WordTag{"了", "ul"}, WordTag{"它", "r"}, WordTag{"就", "d"}, WordTag{"可以", "c"}, WordTag{"解决", "v"}, WordTag{"一些", "m"}, WordTag{"问题", "n"}},
[]WordTag{WordTag{"好人", "n"}, WordTag{"使用", "v"}, WordTag{"了", "ul"}, WordTag{"它", "r"}, WordTag{"就", "d"}, WordTag{"可以", "c"}, WordTag{"解决", "v"}, WordTag{"一些", "m"}, WordTag{"问题", "n"}},
[]WordTag{WordTag{"是因为", "c"}, WordTag{"和", "c"}, WordTag{"国家", "n"}},
[]WordTag{WordTag{"老年", "t"}, WordTag{"搜索", "v"}, WordTag{"还", "d"}, WordTag{"支持", "v"}},
[]WordTag{WordTag{"干脆", "d"}, WordTag{"就", "d"}, WordTag{"把", "p"}, WordTag{"那部", "r"}, WordTag{"蒙人", "n"}, WordTag{"的", "uj"}, WordTag{"闲法", "n"}, WordTag{"给", "p"}, WordTag{"废", "v"}, WordTag{"了", "ul"}, WordTag{"拉倒", "v"}, WordTag{"", "x"}, WordTag{"RT", "eng"}, WordTag{" ", "x"}, WordTag{"@", "x"}, WordTag{"laoshipukong", "eng"}, WordTag{" ", "x"}, WordTag{":", "x"}, WordTag{" ", "x"}, WordTag{"27", "m"}, WordTag{"日", "m"}, WordTag{"", "x"}, WordTag{"全国人大常委会", "nt"}, WordTag{"第三次", "m"}, WordTag{"审议", "v"}, WordTag{"侵权", "v"}, WordTag{"责任法", "n"}, WordTag{"草案", "n"}, WordTag{"", "x"}, WordTag{"删除", "v"}, WordTag{"了", "ul"}, WordTag{"有关", "vn"}, WordTag{"医疗", "n"}, WordTag{"损害", "v"}, WordTag{"责任", "n"}, WordTag{"“", "x"}, WordTag{"举证", "v"}, WordTag{"倒置", "v"}, WordTag{"”", "x"}, WordTag{"的", "uj"}, WordTag{"规定", "n"}, WordTag{"。", "x"}, WordTag{"在", "p"}, WordTag{"医患", "n"}, WordTag{"纠纷", "n"}, WordTag{"中本", "ns"}, WordTag{"已", "d"}, WordTag{"处于", "v"}, WordTag{"弱势", "n"}, WordTag{"地位", "n"}, WordTag{"的", "uj"}, WordTag{"消费者", "n"}, WordTag{"由此", "c"}, WordTag{"将", "d"}, WordTag{"陷入", "v"}, WordTag{"万劫不复", "i"}, WordTag{"的", "uj"}, WordTag{"境地", "s"}, WordTag{"。", "x"}, WordTag{" ", "x"}},
[]WordTag{WordTag{"大", "a"}},
[]WordTag{},
[]WordTag{WordTag{"他", "r"}, WordTag{"说", "v"}, WordTag{"的", "uj"}, WordTag{"确实", "ad"}, WordTag{"在", "p"}, WordTag{"理", "n"}},
[]WordTag{WordTag{"长春", "ns"}, WordTag{"市长", "n"}, WordTag{"春节", "t"}, WordTag{"讲话", "n"}},
[]WordTag{WordTag{"结婚", "v"}, WordTag{"的", "uj"}, WordTag{"和", "c"}, WordTag{"尚未", "d"}, WordTag{"结婚", "v"}, WordTag{"的", "uj"}},
[]WordTag{WordTag{"结合", "v"}, WordTag{"成", "n"}, WordTag{"分子", "n"}, WordTag{"时", "n"}},
[]WordTag{WordTag{"旅游", "vn"}, WordTag{"和", "c"}, WordTag{"服务", "vn"}, WordTag{"是", "v"}, WordTag{"最好", "a"}, WordTag{"的", "uj"}},
[]WordTag{WordTag{"这件", "mq"}, WordTag{"事情", "n"}, WordTag{"的确", "d"}, WordTag{"是", "v"}, WordTag{"我", "r"}, WordTag{"的", "uj"}, WordTag{"错", "n"}},
[]WordTag{WordTag{"供", "v"}, WordTag{"大家", "n"}, WordTag{"参考", "v"}, WordTag{"指正", "v"}},
[]WordTag{WordTag{"哈尔滨", "ns"}, WordTag{"政府", "n"}, WordTag{"公布", "v"}, WordTag{"塌", "v"}, WordTag{"桥", "n"}, WordTag{"原因", "n"}},
[]WordTag{WordTag{"我", "r"}, WordTag{"在", "p"}, WordTag{"机场", "n"}, WordTag{"入口处", "i"}},
[]WordTag{WordTag{"邢永臣", "nr"}, WordTag{"摄影", "n"}, WordTag{"报道", "v"}},
[]WordTag{WordTag{"BP", "eng"}, WordTag{"神经网络", "n"}, WordTag{"如何", "r"}, WordTag{"训练", "vn"}, WordTag{"才能", "v"}, WordTag{"在", "p"}, WordTag{"分类", "n"}, WordTag{"时", "n"}, WordTag{"增加", "v"}, WordTag{"区分度", "n"}, WordTag{"", "x"}},
[]WordTag{WordTag{"南京市", "ns"}, WordTag{"长江大桥", "ns"}},
[]WordTag{WordTag{"应", "v"}, WordTag{"一些", "m"}, WordTag{"使用者", "n"}, WordTag{"的", "uj"}, WordTag{"建议", "n"}, WordTag{"", "x"}, WordTag{"也", "d"}, WordTag{"为了", "p"}, WordTag{"便于", "v"}, WordTag{"利用", "n"}, WordTag{"NiuTrans", "eng"}, WordTag{"用于", "v"}, WordTag{"SMT", "eng"}, WordTag{"研究", "vn"}},
[]WordTag{WordTag{"长春市", "ns"}, WordTag{"长春", "ns"}, WordTag{"药店", "n"}},
[]WordTag{WordTag{"邓颖超", "nr"}, WordTag{"生前", "t"}, WordTag{"最", "d"}, WordTag{"喜欢", "v"}, WordTag{"的", "uj"}, WordTag{"衣服", "n"}},
[]WordTag{WordTag{"胡锦涛", "nr"}, WordTag{"是", "v"}, WordTag{"热爱", "a"}, WordTag{"世界", "n"}, WordTag{"和平", "nz"}, WordTag{"的", "uj"}, WordTag{"政治局", "n"}, WordTag{"常委", "j"}},
[]WordTag{WordTag{"程序员", "n"}, WordTag{"祝", "v"}, WordTag{"海林", "nz"}, WordTag{"和", "c"}, WordTag{"朱会震", "nr"}, WordTag{"是", "v"}, WordTag{"在", "p"}, WordTag{"孙健", "nr"}, WordTag{"的", "uj"}, WordTag{"左面", "f"}, WordTag{"和", "c"}, WordTag{"右面", "f"}, WordTag{",", "x"}, WordTag{" ", "x"}, WordTag{"范凯", "nr"}, WordTag{"在", "p"}, WordTag{"最", "a"}, WordTag{"右面", "f"}, WordTag{".", "m"}, WordTag{"再往", "d"}, WordTag{"左", "f"}, WordTag{"是", "v"}, WordTag{"李松洪", "nr"}},
[]WordTag{WordTag{"一次性", "d"}, WordTag{"交", "v"}, WordTag{"多少", "m"}, WordTag{"钱", "n"}},
[]WordTag{WordTag{"两块", "m"}, WordTag{"五", "m"}, WordTag{"一套", "m"}, WordTag{"", "x"}, WordTag{"三块", "m"}, WordTag{"八", "m"}, WordTag{"一斤", "m"}, WordTag{"", "x"}, WordTag{"四块", "m"}, WordTag{"七", "m"}, WordTag{"一本", "m"}, WordTag{"", "x"}, WordTag{"五块", "m"}, WordTag{"六", "m"}, WordTag{"一条", "m"}},
[]WordTag{WordTag{"小", "a"}, WordTag{"和尚", "nr"}, WordTag{"留", "v"}, WordTag{"了", "ul"}, WordTag{"一个", "m"}, WordTag{"像", "v"}, WordTag{"大", "a"}, WordTag{"和尚", "nr"}, WordTag{"一样", "r"}, WordTag{"的", "uj"}, WordTag{"和尚头", "nr"}},
[]WordTag{WordTag{"我", "r"}, WordTag{"是", "v"}, WordTag{"中华人民共和国", "ns"}, WordTag{"公民", "n"}, WordTag{";", "x"}, WordTag{"我", "r"}, WordTag{"爸爸", "n"}, WordTag{"是", "v"}, WordTag{"共和党", "nt"}, WordTag{"党员", "n"}, WordTag{";", "x"}, WordTag{" ", "x"}, WordTag{"地铁", "n"}, WordTag{"和平门", "ns"}, WordTag{"站", "v"}},
[]WordTag{WordTag{"张晓梅", "nr"}, WordTag{"去", "v"}, WordTag{"人民", "n"}, WordTag{"医院", "n"}, WordTag{"做", "v"}, WordTag{"了", "ul"}, WordTag{"个", "q"}, WordTag{"B超", "n"}, WordTag{"然后", "c"}, WordTag{"去", "v"}, WordTag{"买", "v"}, WordTag{"了", "ul"}, WordTag{"件", "q"}, WordTag{"T恤", "n"}},
[]WordTag{WordTag{"AT&T", "nz"}, WordTag{"是", "v"}, WordTag{"一件", "m"}, WordTag{"不错", "a"}, WordTag{"的", "uj"}, WordTag{"公司", "n"}, WordTag{"", "x"}, WordTag{"给", "p"}, WordTag{"你", "r"}, WordTag{"发", "v"}, WordTag{"offer", "eng"}, WordTag{"了", "ul"}, WordTag{"吗", "y"}, WordTag{"", "x"}},
[]WordTag{WordTag{"C++", "nz"}, WordTag{"和", "c"}, WordTag{"c#", "nz"}, WordTag{"是", "v"}, WordTag{"什么", "r"}, WordTag{"关系", "n"}, WordTag{"", "x"}, WordTag{"11", "m"}, WordTag{"+", "x"}, WordTag{"122", "m"}, WordTag{"=", "x"}, WordTag{"133", "m"}, WordTag{"", "x"}, WordTag{"是", "v"}, WordTag{"吗", "y"}, WordTag{"", "x"}, WordTag{"PI", "eng"}, WordTag{"=", "x"}, WordTag{"3.14159", "m"}},
[]WordTag{WordTag{"你", "r"}, WordTag{"认识", "v"}, WordTag{"那个", "r"}, WordTag{"和", "c"}, WordTag{"主席", "n"}, WordTag{"握手", "v"}, WordTag{"的", "uj"}, WordTag{"的哥", "n"}, WordTag{"吗", "y"}, WordTag{"", "x"}, WordTag{"他", "r"}, WordTag{"开", "v"}, WordTag{"一辆", "m"}, WordTag{"黑色", "n"}, WordTag{"的士", "n"}, WordTag{"。", "x"}},
[]WordTag{WordTag{"枪杆子", "n"}, WordTag{"中", "f"}, WordTag{"出", "v"}, WordTag{"政权", "n"}},
defaultCutResult = [][]Segment{[]Segment{Segment{"这", "r"}, Segment{"是", "v"}, Segment{"一个", "m"}, Segment{"伸手不见五指", "i"}, Segment{"的", "uj"}, Segment{"黑夜", "n"}, Segment{"。", "x"}, Segment{"我", "r"}, Segment{"叫", "v"}, Segment{"孙悟空", "nr"}, Segment{"", "x"}, Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"北京", "ns"}, Segment{"", "x"}, Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"Python", "eng"}, Segment{"和", "c"}, Segment{"C++", "nz"}, Segment{"。", "x"}},
[]Segment{Segment{"我", "r"}, Segment{"不", "d"}, Segment{"喜欢", "v"}, Segment{"日本", "ns"}, Segment{"和服", "nz"}, Segment{"。", "x"}},
[]Segment{Segment{"雷猴", "n"}, Segment{"回归", "v"}, Segment{"人间", "n"}, Segment{"。", "x"}},
[]Segment{Segment{"工信处", "n"}, Segment{"女干事", "n"}, Segment{"每月", "r"}, Segment{"经过", "p"}, Segment{"下属", "v"}, Segment{"科室", "n"}, Segment{"都", "d"}, Segment{"要", "v"}, Segment{"亲口", "n"}, Segment{"交代", "n"}, Segment{"24", "m"}, Segment{"口", "n"}, Segment{"交换机", "n"}, Segment{"等", "u"}, Segment{"技术性", "n"}, Segment{"器件", "n"}, Segment{"的", "uj"}, Segment{"安装", "v"}, Segment{"工作", "vn"}},
[]Segment{Segment{"我", "r"}, Segment{"需要", "v"}, Segment{"廉租房", "n"}},
[]Segment{Segment{"永和", "nz"}, Segment{"服装", "vn"}, Segment{"饰品", "n"}, Segment{"有限公司", "n"}},
[]Segment{Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"北京", "ns"}, Segment{"天安门", "ns"}},
[]Segment{Segment{"abc", "eng"}},
[]Segment{Segment{"隐", "n"}, Segment{"马尔可夫", "nr"}},
[]Segment{Segment{"雷猴", "n"}, Segment{"是", "v"}, Segment{"个", "q"}, Segment{"好", "a"}, Segment{"网站", "n"}},
[]Segment{Segment{"“", "x"}, Segment{"Microsoft", "eng"}, Segment{"”", "x"}, Segment{"一", "m"}, Segment{"词", "n"}, Segment{"由", "p"}, Segment{"“", "x"}, Segment{"MICROcomputer", "eng"}, Segment{"", "x"}, Segment{"微型", "b"}, Segment{"计算机", "n"}, Segment{"", "x"}, Segment{"”", "x"}, Segment{"和", "c"}, Segment{"“", "x"}, Segment{"SOFTware", "eng"}, Segment{"", "x"}, Segment{"软件", "n"}, Segment{"", "x"}, Segment{"”", "x"}, Segment{"两", "m"}, Segment{"部分", "n"}, Segment{"组成", "v"}},
[]Segment{Segment{"草泥马", "n"}, Segment{"和", "c"}, Segment{"欺实", "v"}, Segment{"马", "n"}, Segment{"是", "v"}, Segment{"今年", "t"}, Segment{"的", "uj"}, Segment{"流行", "v"}, Segment{"词汇", "n"}},
[]Segment{Segment{"伊藤", "nr"}, Segment{"洋华堂", "n"}, Segment{"总府", "n"}, Segment{"店", "n"}},
[]Segment{Segment{"中国科学院计算技术研究所", "nt"}},
[]Segment{Segment{"罗密欧", "nr"}, Segment{"与", "p"}, Segment{"朱丽叶", "nr"}},
[]Segment{Segment{"我", "r"}, Segment{"购买", "v"}, Segment{"了", "ul"}, Segment{"道具", "n"}, Segment{"和", "c"}, Segment{"服装", "vn"}},
[]Segment{Segment{"PS", "eng"}, Segment{":", "x"}, Segment{" ", "x"}, Segment{"我", "r"}, Segment{"觉得", "v"}, Segment{"开源", "n"}, Segment{"有", "v"}, Segment{"一个", "m"}, Segment{"好处", "d"}, Segment{"", "x"}, Segment{"就是", "d"}, Segment{"能够", "v"}, Segment{"敦促", "v"}, Segment{"自己", "r"}, Segment{"不断改进", "l"}, Segment{"", "x"}, Segment{"避免", "v"}, Segment{"敞", "v"}, Segment{"帚", "ng"}, Segment{"自珍", "b"}},
[]Segment{Segment{"湖北省", "ns"}, Segment{"石首市", "ns"}},
[]Segment{Segment{"湖北省", "ns"}, Segment{"十堰市", "ns"}},
[]Segment{Segment{"总经理", "n"}, Segment{"完成", "v"}, Segment{"了", "ul"}, Segment{"这件", "mq"}, Segment{"事情", "n"}},
[]Segment{Segment{"电脑", "n"}, Segment{"修好", "v"}, Segment{"了", "ul"}},
[]Segment{Segment{"做好", "v"}, Segment{"了", "ul"}, Segment{"这件", "mq"}, Segment{"事情", "n"}, Segment{"就", "d"}, Segment{"一了百了", "l"}, Segment{"了", "ul"}},
[]Segment{Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
[]Segment{Segment{"我们", "r"}, Segment{"买", "v"}, Segment{"了", "ul"}, Segment{"一个", "m"}, Segment{"美的", "nr"}, Segment{"空调", "n"}},
[]Segment{Segment{"线程", "n"}, Segment{"初始化", "l"}, Segment{"时", "n"}, Segment{"我们", "r"}, Segment{"要", "v"}, Segment{"注意", "v"}},
[]Segment{Segment{"一个", "m"}, Segment{"分子", "n"}, Segment{"是", "v"}, Segment{"由", "p"}, Segment{"好多", "m"}, Segment{"原子", "n"}, Segment{"组织", "v"}, Segment{"成", "v"}, Segment{"的", "uj"}},
[]Segment{Segment{"祝", "v"}, Segment{"你", "r"}, Segment{"马到功成", "i"}},
[]Segment{Segment{"他", "r"}, Segment{"掉", "v"}, Segment{"进", "v"}, Segment{"了", "ul"}, Segment{"无底洞", "ns"}, Segment{"里", "f"}},
[]Segment{Segment{"中国", "ns"}, Segment{"的", "uj"}, Segment{"首都", "d"}, Segment{"是", "v"}, Segment{"北京", "ns"}},
[]Segment{Segment{"孙君意", "nr"}},
[]Segment{Segment{"外交部", "nt"}, Segment{"发言人", "l"}, Segment{"马朝旭", "nr"}},
[]Segment{Segment{"领导人", "n"}, Segment{"会议", "n"}, Segment{"和", "c"}, Segment{"第四届", "m"}, Segment{"东亚", "ns"}, Segment{"峰会", "n"}},
[]Segment{Segment{"在", "p"}, Segment{"过去", "t"}, Segment{"的", "uj"}, Segment{"这", "r"}, Segment{"五年", "t"}},
[]Segment{Segment{"还", "d"}, Segment{"需要", "v"}, Segment{"很", "d"}, Segment{"长", "a"}, Segment{"的", "uj"}, Segment{"路", "n"}, Segment{"要", "v"}, Segment{"走", "v"}},
[]Segment{Segment{"60", "m"}, Segment{"周年", "t"}, Segment{"首都", "d"}, Segment{"阅兵", "v"}},
[]Segment{Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
[]Segment{Segment{"买", "v"}, Segment{"水果", "n"}, Segment{"然后", "c"}, Segment{"来", "v"}, Segment{"世博园", "nr"}},
[]Segment{Segment{"买", "v"}, Segment{"水果", "n"}, Segment{"然后", "c"}, Segment{"去", "v"}, Segment{"世博园", "nr"}},
[]Segment{Segment{"但是", "c"}, Segment{"后来", "t"}, Segment{"我", "r"}, Segment{"才", "d"}, Segment{"知道", "v"}, Segment{"你", "r"}, Segment{"是", "v"}, Segment{"对", "p"}, Segment{"的", "uj"}},
[]Segment{Segment{"存在", "v"}, Segment{"即", "v"}, Segment{"合理", "vn"}},
[]Segment{Segment{"的的", "u"}, Segment{"的的", "u"}, Segment{"的", "uj"}, Segment{"在的", "u"}, Segment{"的的", "u"}, Segment{"的", "uj"}, Segment{"就", "d"}, Segment{"以", "p"}, Segment{"和和", "nz"}, Segment{"和", "c"}},
[]Segment{Segment{"I", "x"}, Segment{" ", "x"}, Segment{"love", "eng"}, Segment{"你", "r"}, Segment{"", "x"}, Segment{"不以为耻", "i"}, Segment{"", "x"}, Segment{"反", "zg"}, Segment{"以为", "c"}, Segment{"rong", "eng"}},
[]Segment{Segment{"因", "p"}},
[]Segment{},
[]Segment{Segment{"hello", "eng"}, Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
[]Segment{Segment{"很好", "a"}, Segment{"但", "c"}, Segment{"主要", "b"}, Segment{"是", "v"}, Segment{"基于", "p"}, Segment{"网页", "n"}, Segment{"形式", "n"}},
[]Segment{Segment{"hello", "eng"}, Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
[]Segment{Segment{"为什么", "r"}, Segment{"我", "r"}, Segment{"不能", "v"}, Segment{"拥有", "v"}, Segment{"想要", "v"}, Segment{"的", "uj"}, Segment{"生活", "vn"}},
[]Segment{Segment{"后来", "t"}, Segment{"我", "r"}, Segment{"才", "d"}},
[]Segment{Segment{"此次", "r"}, Segment{"来", "v"}, Segment{"中国", "ns"}, Segment{"是", "v"}, Segment{"为了", "p"}},
[]Segment{Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
[]Segment{Segment{",", "x"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
[]Segment{Segment{"其实", "d"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
[]Segment{Segment{"好人", "n"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
[]Segment{Segment{"是因为", "c"}, Segment{"和", "c"}, Segment{"国家", "n"}},
[]Segment{Segment{"老年", "t"}, Segment{"搜索", "v"}, Segment{"还", "d"}, Segment{"支持", "v"}},
[]Segment{Segment{"干脆", "d"}, Segment{"就", "d"}, Segment{"把", "p"}, Segment{"那部", "r"}, Segment{"蒙人", "n"}, Segment{"的", "uj"}, Segment{"闲法", "n"}, Segment{"给", "p"}, Segment{"废", "v"}, Segment{"了", "ul"}, Segment{"拉倒", "v"}, Segment{"", "x"}, Segment{"RT", "eng"}, Segment{" ", "x"}, Segment{"@", "x"}, Segment{"laoshipukong", "eng"}, Segment{" ", "x"}, Segment{":", "x"}, Segment{" ", "x"}, Segment{"27", "m"}, Segment{"日", "m"}, Segment{"", "x"}, Segment{"全国人大常委会", "nt"}, Segment{"第三次", "m"}, Segment{"审议", "v"}, Segment{"侵权", "v"}, Segment{"责任法", "n"}, Segment{"草案", "n"}, Segment{"", "x"}, Segment{"删除", "v"}, Segment{"了", "ul"}, Segment{"有关", "vn"}, Segment{"医疗", "n"}, Segment{"损害", "v"}, Segment{"责任", "n"}, Segment{"“", "x"}, Segment{"举证", "v"}, Segment{"倒置", "v"}, Segment{"”", "x"}, Segment{"的", "uj"}, Segment{"规定", "n"}, Segment{"。", "x"}, Segment{"在", "p"}, Segment{"医患", "n"}, Segment{"纠纷", "n"}, Segment{"中本", "ns"}, Segment{"已", "d"}, Segment{"处于", "v"}, Segment{"弱势", "n"}, Segment{"地位", "n"}, Segment{"的", "uj"}, Segment{"消费者", "n"}, Segment{"由此", "c"}, Segment{"将", "d"}, Segment{"陷入", "v"}, Segment{"万劫不复", "i"}, Segment{"的", "uj"}, Segment{"境地", "s"}, Segment{"。", "x"}, Segment{" ", "x"}},
[]Segment{Segment{"大", "a"}},
[]Segment{},
[]Segment{Segment{"他", "r"}, Segment{"说", "v"}, Segment{"的", "uj"}, Segment{"确实", "ad"}, Segment{"在", "p"}, Segment{"理", "n"}},
[]Segment{Segment{"长春", "ns"}, Segment{"市长", "n"}, Segment{"春节", "t"}, Segment{"讲话", "n"}},
[]Segment{Segment{"结婚", "v"}, Segment{"的", "uj"}, Segment{"和", "c"}, Segment{"尚未", "d"}, Segment{"结婚", "v"}, Segment{"的", "uj"}},
[]Segment{Segment{"结合", "v"}, Segment{"成", "n"}, Segment{"分子", "n"}, Segment{"时", "n"}},
[]Segment{Segment{"旅游", "vn"}, Segment{"和", "c"}, Segment{"服务", "vn"}, Segment{"是", "v"}, Segment{"最好", "a"}, Segment{"的", "uj"}},
[]Segment{Segment{"这件", "mq"}, Segment{"事情", "n"}, Segment{"的确", "d"}, Segment{"是", "v"}, Segment{"我", "r"}, Segment{"的", "uj"}, Segment{"错", "n"}},
[]Segment{Segment{"供", "v"}, Segment{"大家", "n"}, Segment{"参考", "v"}, Segment{"指正", "v"}},
[]Segment{Segment{"哈尔滨", "ns"}, Segment{"政府", "n"}, Segment{"公布", "v"}, Segment{"塌", "v"}, Segment{"桥", "n"}, Segment{"原因", "n"}},
[]Segment{Segment{"我", "r"}, Segment{"在", "p"}, Segment{"机场", "n"}, Segment{"入口处", "i"}},
[]Segment{Segment{"邢永臣", "nr"}, Segment{"摄影", "n"}, Segment{"报道", "v"}},
[]Segment{Segment{"BP", "eng"}, Segment{"神经网络", "n"}, Segment{"如何", "r"}, Segment{"训练", "vn"}, Segment{"才能", "v"}, Segment{"在", "p"}, Segment{"分类", "n"}, Segment{"时", "n"}, Segment{"增加", "v"}, Segment{"区分度", "n"}, Segment{"", "x"}},
[]Segment{Segment{"南京市", "ns"}, Segment{"长江大桥", "ns"}},
[]Segment{Segment{"应", "v"}, Segment{"一些", "m"}, Segment{"使用者", "n"}, Segment{"的", "uj"}, Segment{"建议", "n"}, Segment{"", "x"}, Segment{"也", "d"}, Segment{"为了", "p"}, Segment{"便于", "v"}, Segment{"利用", "n"}, Segment{"NiuTrans", "eng"}, Segment{"用于", "v"}, Segment{"SMT", "eng"}, Segment{"研究", "vn"}},
[]Segment{Segment{"长春市", "ns"}, Segment{"长春", "ns"}, Segment{"药店", "n"}},
[]Segment{Segment{"邓颖超", "nr"}, Segment{"生前", "t"}, Segment{"最", "d"}, Segment{"喜欢", "v"}, Segment{"的", "uj"}, Segment{"衣服", "n"}},
[]Segment{Segment{"胡锦涛", "nr"}, Segment{"是", "v"}, Segment{"热爱", "a"}, Segment{"世界", "n"}, Segment{"和平", "nz"}, Segment{"的", "uj"}, Segment{"政治局", "n"}, Segment{"常委", "j"}},
[]Segment{Segment{"程序员", "n"}, Segment{"祝", "v"}, Segment{"海林", "nz"}, Segment{"和", "c"}, Segment{"朱会震", "nr"}, Segment{"是", "v"}, Segment{"在", "p"}, Segment{"孙健", "nr"}, Segment{"的", "uj"}, Segment{"左面", "f"}, Segment{"和", "c"}, Segment{"右面", "f"}, Segment{",", "x"}, Segment{" ", "x"}, Segment{"范凯", "nr"}, Segment{"在", "p"}, Segment{"最", "a"}, Segment{"右面", "f"}, Segment{".", "m"}, Segment{"再往", "d"}, Segment{"左", "f"}, Segment{"是", "v"}, Segment{"李松洪", "nr"}},
[]Segment{Segment{"一次性", "d"}, Segment{"交", "v"}, Segment{"多少", "m"}, Segment{"钱", "n"}},
[]Segment{Segment{"两块", "m"}, Segment{"五", "m"}, Segment{"一套", "m"}, Segment{"", "x"}, Segment{"三块", "m"}, Segment{"八", "m"}, Segment{"一斤", "m"}, Segment{"", "x"}, Segment{"四块", "m"}, Segment{"七", "m"}, Segment{"一本", "m"}, Segment{"", "x"}, Segment{"五块", "m"}, Segment{"六", "m"}, Segment{"一条", "m"}},
[]Segment{Segment{"小", "a"}, Segment{"和尚", "nr"}, Segment{"留", "v"}, Segment{"了", "ul"}, Segment{"一个", "m"}, Segment{"像", "v"}, Segment{"大", "a"}, Segment{"和尚", "nr"}, Segment{"一样", "r"}, Segment{"的", "uj"}, Segment{"和尚头", "nr"}},
[]Segment{Segment{"我", "r"}, Segment{"是", "v"}, Segment{"中华人民共和国", "ns"}, Segment{"公民", "n"}, Segment{";", "x"}, Segment{"我", "r"}, Segment{"爸爸", "n"}, Segment{"是", "v"}, Segment{"共和党", "nt"}, Segment{"党员", "n"}, Segment{";", "x"}, Segment{" ", "x"}, Segment{"地铁", "n"}, Segment{"和平门", "ns"}, Segment{"站", "v"}},
[]Segment{Segment{"张晓梅", "nr"}, Segment{"去", "v"}, Segment{"人民", "n"}, Segment{"医院", "n"}, Segment{"做", "v"}, Segment{"了", "ul"}, Segment{"个", "q"}, Segment{"B超", "n"}, Segment{"然后", "c"}, Segment{"去", "v"}, Segment{"买", "v"}, Segment{"了", "ul"}, Segment{"件", "q"}, Segment{"T恤", "n"}},
[]Segment{Segment{"AT&T", "nz"}, Segment{"是", "v"}, Segment{"一件", "m"}, Segment{"不错", "a"}, Segment{"的", "uj"}, Segment{"公司", "n"}, Segment{"", "x"}, Segment{"给", "p"}, Segment{"你", "r"}, Segment{"发", "v"}, Segment{"offer", "eng"}, Segment{"了", "ul"}, Segment{"吗", "y"}, Segment{"", "x"}},
[]Segment{Segment{"C++", "nz"}, Segment{"和", "c"}, Segment{"c#", "nz"}, Segment{"是", "v"}, Segment{"什么", "r"}, Segment{"关系", "n"}, Segment{"", "x"}, Segment{"11", "m"}, Segment{"+", "x"}, Segment{"122", "m"}, Segment{"=", "x"}, Segment{"133", "m"}, Segment{"", "x"}, Segment{"是", "v"}, Segment{"吗", "y"}, Segment{"", "x"}, Segment{"PI", "eng"}, Segment{"=", "x"}, Segment{"3.14159", "m"}},
[]Segment{Segment{"你", "r"}, Segment{"认识", "v"}, Segment{"那个", "r"}, Segment{"和", "c"}, Segment{"主席", "n"}, Segment{"握手", "v"}, Segment{"的", "uj"}, Segment{"的哥", "n"}, Segment{"吗", "y"}, Segment{"", "x"}, Segment{"他", "r"}, Segment{"开", "v"}, Segment{"一辆", "m"}, Segment{"黑色", "n"}, Segment{"的士", "n"}, Segment{"。", "x"}},
[]Segment{Segment{"枪杆子", "n"}, Segment{"中", "f"}, Segment{"出", "v"}, Segment{"政权", "n"}},
}
noHMMCutResult = [][]WordTag{
[]WordTag{WordTag{"这", "r"}, WordTag{"是", "v"}, WordTag{"一个", "m"}, WordTag{"伸手不见五指", "i"}, WordTag{"的", "uj"}, WordTag{"黑夜", "n"}, WordTag{"。", "x"}, WordTag{"我", "r"}, WordTag{"叫", "v"}, WordTag{"孙悟空", "nr"}, WordTag{"", "x"}, WordTag{"我", "r"}, WordTag{"爱", "v"}, WordTag{"北京", "ns"}, WordTag{"", "x"}, WordTag{"我", "r"}, WordTag{"爱", "v"}, WordTag{"Python", "eng"}, WordTag{"和", "c"}, WordTag{"C++", "nz"}, WordTag{"。", "x"}},
[]WordTag{WordTag{"我", "r"}, WordTag{"不", "d"}, WordTag{"喜欢", "v"}, WordTag{"日本", "ns"}, WordTag{"和服", "nz"}, WordTag{"。", "x"}},
[]WordTag{WordTag{"雷猴", "n"}, WordTag{"回归", "v"}, WordTag{"人间", "n"}, WordTag{"。", "x"}},
[]WordTag{WordTag{"工信处", "n"}, WordTag{"女干事", "n"}, WordTag{"每月", "r"}, WordTag{"经过", "p"}, WordTag{"下属", "v"}, WordTag{"科室", "n"}, WordTag{"都", "d"}, WordTag{"要", "v"}, WordTag{"亲口", "n"}, WordTag{"交代", "n"}, WordTag{"24", "eng"}, WordTag{"口", "q"}, WordTag{"交换机", "n"}, WordTag{"等", "u"}, WordTag{"技术性", "n"}, WordTag{"器件", "n"}, WordTag{"的", "uj"}, WordTag{"安装", "v"}, WordTag{"工作", "vn"}},
[]WordTag{WordTag{"我", "r"}, WordTag{"需要", "v"}, WordTag{"廉租房", "n"}},
[]WordTag{WordTag{"永和", "nz"}, WordTag{"服装", "vn"}, WordTag{"饰品", "n"}, WordTag{"有限公司", "n"}},
[]WordTag{WordTag{"我", "r"}, WordTag{"爱", "v"}, WordTag{"北京", "ns"}, WordTag{"天安门", "ns"}},
[]WordTag{WordTag{"abc", "eng"}},
[]WordTag{WordTag{"隐", "n"}, WordTag{"马尔可夫", "nr"}},
[]WordTag{WordTag{"雷猴", "n"}, WordTag{"是", "v"}, WordTag{"个", "q"}, WordTag{"好", "a"}, WordTag{"网站", "n"}},
[]WordTag{WordTag{"“", "x"}, WordTag{"Microsoft", "eng"}, WordTag{"”", "x"}, WordTag{"一", "m"}, WordTag{"词", "n"}, WordTag{"由", "p"}, WordTag{"“", "x"}, WordTag{"MICROcomputer", "eng"}, WordTag{"", "x"}, WordTag{"微型", "b"}, WordTag{"计算机", "n"}, WordTag{"", "x"}, WordTag{"”", "x"}, WordTag{"和", "c"}, WordTag{"“", "x"}, WordTag{"SOFTware", "eng"}, WordTag{"", "x"}, WordTag{"软件", "n"}, WordTag{"", "x"}, WordTag{"”", "x"}, WordTag{"两", "m"}, WordTag{"部分", "n"}, WordTag{"组成", "v"}},
[]WordTag{WordTag{"草泥马", "n"}, WordTag{"和", "c"}, WordTag{"欺", "vn"}, WordTag{"实", "n"}, WordTag{"马", "n"}, WordTag{"是", "v"}, WordTag{"今年", "t"}, WordTag{"的", "uj"}, WordTag{"流行", "v"}, WordTag{"词汇", "n"}},
[]WordTag{WordTag{"伊", "ns"}, WordTag{"藤", "nr"}, WordTag{"洋华堂", "n"}, WordTag{"总府", "n"}, WordTag{"店", "n"}},
[]WordTag{WordTag{"中国科学院计算技术研究所", "nt"}},
[]WordTag{WordTag{"罗密欧", "nr"}, WordTag{"与", "p"}, WordTag{"朱丽叶", "nr"}},
[]WordTag{WordTag{"我", "r"}, WordTag{"购买", "v"}, WordTag{"了", "ul"}, WordTag{"道具", "n"}, WordTag{"和", "c"}, WordTag{"服装", "vn"}},
[]WordTag{WordTag{"PS", "eng"}, WordTag{":", "x"}, WordTag{" ", "x"}, WordTag{"我", "r"}, WordTag{"觉得", "v"}, WordTag{"开源", "n"}, WordTag{"有", "v"}, WordTag{"一个", "m"}, WordTag{"好处", "d"}, WordTag{"", "x"}, WordTag{"就是", "d"}, WordTag{"能够", "v"}, WordTag{"敦促", "v"}, WordTag{"自己", "r"}, WordTag{"不断改进", "l"}, WordTag{"", "x"}, WordTag{"避免", "v"}, WordTag{"敞", "v"}, WordTag{"帚", "ng"}, WordTag{"自珍", "b"}},
[]WordTag{WordTag{"湖北省", "ns"}, WordTag{"石首市", "ns"}},
[]WordTag{WordTag{"湖北省", "ns"}, WordTag{"十堰市", "ns"}},
[]WordTag{WordTag{"总经理", "n"}, WordTag{"完成", "v"}, WordTag{"了", "ul"}, WordTag{"这件", "mq"}, WordTag{"事情", "n"}},
[]WordTag{WordTag{"电脑", "n"}, WordTag{"修好", "v"}, WordTag{"了", "ul"}},
[]WordTag{WordTag{"做好", "v"}, WordTag{"了", "ul"}, WordTag{"这件", "mq"}, WordTag{"事情", "n"}, WordTag{"就", "d"}, WordTag{"一了百了", "l"}, WordTag{"了", "ul"}},
[]WordTag{WordTag{"人们", "n"}, WordTag{"审美", "vn"}, WordTag{"的", "uj"}, WordTag{"观点", "n"}, WordTag{"是", "v"}, WordTag{"不同", "a"}, WordTag{"的", "uj"}},
[]WordTag{WordTag{"我们", "r"}, WordTag{"买", "v"}, WordTag{"了", "ul"}, WordTag{"一个", "m"}, WordTag{"美的", "nr"}, WordTag{"空调", "n"}},
[]WordTag{WordTag{"线程", "n"}, WordTag{"初始化", "l"}, WordTag{"时", "n"}, WordTag{"我们", "r"}, WordTag{"要", "v"}, WordTag{"注意", "v"}},
[]WordTag{WordTag{"一个", "m"}, WordTag{"分子", "n"}, WordTag{"是", "v"}, WordTag{"由", "p"}, WordTag{"好多", "m"}, WordTag{"原子", "n"}, WordTag{"组织", "v"}, WordTag{"成", "n"}, WordTag{"的", "uj"}},
[]WordTag{WordTag{"祝", "v"}, WordTag{"你", "r"}, WordTag{"马到功成", "i"}},
[]WordTag{WordTag{"他", "r"}, WordTag{"掉", "zg"}, WordTag{"进", "v"}, WordTag{"了", "ul"}, WordTag{"无底洞", "ns"}, WordTag{"里", "f"}},
[]WordTag{WordTag{"中国", "ns"}, WordTag{"的", "uj"}, WordTag{"首都", "d"}, WordTag{"是", "v"}, WordTag{"北京", "ns"}},
[]WordTag{WordTag{"孙", "zg"}, WordTag{"君", "nz"}, WordTag{"意", "n"}},
[]WordTag{WordTag{"外交部", "nt"}, WordTag{"发言人", "l"}, WordTag{"马朝旭", "nr"}},
[]WordTag{WordTag{"领导人", "n"}, WordTag{"会议", "n"}, WordTag{"和", "c"}, WordTag{"第四届", "m"}, WordTag{"东亚", "ns"}, WordTag{"峰会", "n"}},
[]WordTag{WordTag{"在", "p"}, WordTag{"过去", "t"}, WordTag{"的", "uj"}, WordTag{"这", "r"}, WordTag{"五年", "t"}},
[]WordTag{WordTag{"还", "d"}, WordTag{"需要", "v"}, WordTag{"很", "zg"}, WordTag{"长", "a"}, WordTag{"的", "uj"}, WordTag{"路", "n"}, WordTag{"要", "v"}, WordTag{"走", "v"}},
[]WordTag{WordTag{"60", "eng"}, WordTag{"周年", "t"}, WordTag{"首都", "d"}, WordTag{"阅兵", "v"}},
[]WordTag{WordTag{"你好", "l"}, WordTag{"人们", "n"}, WordTag{"审美", "vn"}, WordTag{"的", "uj"}, WordTag{"观点", "n"}, WordTag{"是", "v"}, WordTag{"不同", "a"}, WordTag{"的", "uj"}},
[]WordTag{WordTag{"买", "v"}, WordTag{"水果", "n"}, WordTag{"然后", "c"}, WordTag{"来", "v"}, WordTag{"世博园", "nr"}},
[]WordTag{WordTag{"买", "v"}, WordTag{"水果", "n"}, WordTag{"然后", "c"}, WordTag{"去", "v"}, WordTag{"世博园", "nr"}},
[]WordTag{WordTag{"但是", "c"}, WordTag{"后来", "t"}, WordTag{"我", "r"}, WordTag{"才", "d"}, WordTag{"知道", "v"}, WordTag{"你", "r"}, WordTag{"是", "v"}, WordTag{"对", "p"}, WordTag{"的", "uj"}},
[]WordTag{WordTag{"存在", "v"}, WordTag{"即", "v"}, WordTag{"合理", "vn"}},
[]WordTag{WordTag{"的", "uj"}, WordTag{"的", "uj"}, WordTag{"的", "uj"}, WordTag{"的", "uj"}, WordTag{"的", "uj"}, WordTag{"在", "p"}, WordTag{"的", "uj"}, WordTag{"的", "uj"}, WordTag{"的", "uj"}, WordTag{"的", "uj"}, WordTag{"就", "d"}, WordTag{"以", "p"}, WordTag{"和", "c"}, WordTag{"和", "c"}, WordTag{"和", "c"}},
[]WordTag{WordTag{"I", "eng"}, WordTag{" ", "x"}, WordTag{"love", "eng"}, WordTag{"你", "r"}, WordTag{"", "x"}, WordTag{"不以为耻", "i"}, WordTag{"", "x"}, WordTag{"反", "zg"}, WordTag{"以为", "c"}, WordTag{"rong", "eng"}},
[]WordTag{WordTag{"因", "p"}},
[]WordTag{},
[]WordTag{WordTag{"hello", "eng"}, WordTag{"你好", "l"}, WordTag{"人们", "n"}, WordTag{"审美", "vn"}, WordTag{"的", "uj"}, WordTag{"观点", "n"}, WordTag{"是", "v"}, WordTag{"不同", "a"}, WordTag{"的", "uj"}},
[]WordTag{WordTag{"很", "zg"}, WordTag{"好", "a"}, WordTag{"但", "c"}, WordTag{"主要", "b"}, WordTag{"是", "v"}, WordTag{"基于", "p"}, WordTag{"网页", "n"}, WordTag{"形式", "n"}},
[]WordTag{WordTag{"hello", "eng"}, WordTag{"你好", "l"}, WordTag{"人们", "n"}, WordTag{"审美", "vn"}, WordTag{"的", "uj"}, WordTag{"观点", "n"}, WordTag{"是", "v"}, WordTag{"不同", "a"}, WordTag{"的", "uj"}},
[]WordTag{WordTag{"为什么", "r"}, WordTag{"我", "r"}, WordTag{"不能", "v"}, WordTag{"拥有", "v"}, WordTag{"想要", "v"}, WordTag{"的", "uj"}, WordTag{"生活", "vn"}},
[]WordTag{WordTag{"后来", "t"}, WordTag{"我", "r"}, WordTag{"才", "d"}},
[]WordTag{WordTag{"此次", "r"}, WordTag{"来", "v"}, WordTag{"中国", "ns"}, WordTag{"是", "v"}, WordTag{"为了", "p"}},
[]WordTag{WordTag{"使用", "v"}, WordTag{"了", "ul"}, WordTag{"它", "r"}, WordTag{"就", "d"}, WordTag{"可以", "c"}, WordTag{"解决", "v"}, WordTag{"一些", "m"}, WordTag{"问题", "n"}},
[]WordTag{WordTag{",", "x"}, WordTag{"使用", "v"}, WordTag{"了", "ul"}, WordTag{"它", "r"}, WordTag{"就", "d"}, WordTag{"可以", "c"}, WordTag{"解决", "v"}, WordTag{"一些", "m"}, WordTag{"问题", "n"}},
[]WordTag{WordTag{"其实", "d"}, WordTag{"使用", "v"}, WordTag{"了", "ul"}, WordTag{"它", "r"}, WordTag{"就", "d"}, WordTag{"可以", "c"}, WordTag{"解决", "v"}, WordTag{"一些", "m"}, WordTag{"问题", "n"}},
[]WordTag{WordTag{"好人", "n"}, WordTag{"使用", "v"}, WordTag{"了", "ul"}, WordTag{"它", "r"}, WordTag{"就", "d"}, WordTag{"可以", "c"}, WordTag{"解决", "v"}, WordTag{"一些", "m"}, WordTag{"问题", "n"}},
[]WordTag{WordTag{"是因为", "c"}, WordTag{"和", "c"}, WordTag{"国家", "n"}},
[]WordTag{WordTag{"老年", "t"}, WordTag{"搜索", "v"}, WordTag{"还", "d"}, WordTag{"支持", "v"}},
[]WordTag{WordTag{"干脆", "d"}, WordTag{"就", "d"}, WordTag{"把", "p"}, WordTag{"那", "r"}, WordTag{"部", "n"}, WordTag{"蒙", "v"}, WordTag{"人", "n"}, WordTag{"的", "uj"}, WordTag{"闲", "n"}, WordTag{"法", "j"}, WordTag{"给", "p"}, WordTag{"废", "v"}, WordTag{"了", "ul"}, WordTag{"拉倒", "v"}, WordTag{"", "x"}, WordTag{"RT", "eng"}, WordTag{" ", "x"}, WordTag{"@", "x"}, WordTag{"laoshipukong", "eng"}, WordTag{" ", "x"}, WordTag{":", "x"}, WordTag{" ", "x"}, WordTag{"27", "eng"}, WordTag{"日", "m"}, WordTag{"", "x"}, WordTag{"全国人大常委会", "nt"}, WordTag{"第三次", "m"}, WordTag{"审议", "v"}, WordTag{"侵权", "v"}, WordTag{"责任法", "n"}, WordTag{"草案", "n"}, WordTag{"", "x"}, WordTag{"删除", "v"}, WordTag{"了", "ul"}, WordTag{"有关", "vn"}, WordTag{"医疗", "n"}, WordTag{"损害", "v"}, WordTag{"责任", "n"}, WordTag{"“", "x"}, WordTag{"举证", "v"}, WordTag{"倒置", "v"}, WordTag{"”", "x"}, WordTag{"的", "uj"}, WordTag{"规定", "n"}, WordTag{"。", "x"}, WordTag{"在", "p"}, WordTag{"医患", "n"}, WordTag{"纠纷", "n"}, WordTag{"中", "f"}, WordTag{"本", "r"}, WordTag{"已", "d"}, WordTag{"处于", "v"}, WordTag{"弱势", "n"}, WordTag{"地位", "n"}, WordTag{"的", "uj"}, WordTag{"消费者", "n"}, WordTag{"由此", "c"}, WordTag{"将", "d"}, WordTag{"陷入", "v"}, WordTag{"万劫不复", "i"}, WordTag{"的", "uj"}, WordTag{"境地", "s"}, WordTag{"。", "x"}, WordTag{" ", "x"}},
[]WordTag{WordTag{"大", "a"}},
[]WordTag{},
[]WordTag{WordTag{"他", "r"}, WordTag{"说", "v"}, WordTag{"的", "uj"}, WordTag{"确实", "ad"}, WordTag{"在", "p"}, WordTag{"理", "n"}},
[]WordTag{WordTag{"长春", "ns"}, WordTag{"市长", "n"}, WordTag{"春节", "t"}, WordTag{"讲话", "n"}},
[]WordTag{WordTag{"结婚", "v"}, WordTag{"的", "uj"}, WordTag{"和", "c"}, WordTag{"尚未", "d"}, WordTag{"结婚", "v"}, WordTag{"的", "uj"}},
[]WordTag{WordTag{"结合", "v"}, WordTag{"成", "n"}, WordTag{"分子", "n"}, WordTag{"时", "n"}},
[]WordTag{WordTag{"旅游", "vn"}, WordTag{"和", "c"}, WordTag{"服务", "vn"}, WordTag{"是", "v"}, WordTag{"最好", "a"}, WordTag{"的", "uj"}},
[]WordTag{WordTag{"这件", "mq"}, WordTag{"事情", "n"}, WordTag{"的确", "d"}, WordTag{"是", "v"}, WordTag{"我", "r"}, WordTag{"的", "uj"}, WordTag{"错", "v"}},
[]WordTag{WordTag{"供", "v"}, WordTag{"大家", "n"}, WordTag{"参考", "v"}, WordTag{"指正", "v"}},
[]WordTag{WordTag{"哈尔滨", "ns"}, WordTag{"政府", "n"}, WordTag{"公布", "v"}, WordTag{"塌", "v"}, WordTag{"桥", "n"}, WordTag{"原因", "n"}},
[]WordTag{WordTag{"我", "r"}, WordTag{"在", "p"}, WordTag{"机场", "n"}, WordTag{"入口处", "i"}},
[]WordTag{WordTag{"邢", "nr"}, WordTag{"永", "ns"}, WordTag{"臣", "n"}, WordTag{"摄影", "n"}, WordTag{"报道", "v"}},
[]WordTag{WordTag{"BP", "eng"}, WordTag{"神经网络", "n"}, WordTag{"如何", "r"}, WordTag{"训练", "vn"}, WordTag{"才能", "v"}, WordTag{"在", "p"}, WordTag{"分类", "n"}, WordTag{"时", "n"}, WordTag{"增加", "v"}, WordTag{"区分度", "n"}, WordTag{"", "x"}},
[]WordTag{WordTag{"南京市", "ns"}, WordTag{"长江大桥", "ns"}},
[]WordTag{WordTag{"应", "v"}, WordTag{"一些", "m"}, WordTag{"使用者", "n"}, WordTag{"的", "uj"}, WordTag{"建议", "n"}, WordTag{"", "x"}, WordTag{"也", "d"}, WordTag{"为了", "p"}, WordTag{"便于", "v"}, WordTag{"利用", "n"}, WordTag{"NiuTrans", "eng"}, WordTag{"用于", "v"}, WordTag{"SMT", "eng"}, WordTag{"研究", "vn"}},
[]WordTag{WordTag{"长春市", "ns"}, WordTag{"长春", "ns"}, WordTag{"药店", "n"}},
[]WordTag{WordTag{"邓颖超", "nr"}, WordTag{"生前", "t"}, WordTag{"最", "d"}, WordTag{"喜欢", "v"}, WordTag{"的", "uj"}, WordTag{"衣服", "n"}},
[]WordTag{WordTag{"胡锦涛", "nr"}, WordTag{"是", "v"}, WordTag{"热爱", "a"}, WordTag{"世界", "n"}, WordTag{"和平", "nz"}, WordTag{"的", "uj"}, WordTag{"政治局", "n"}, WordTag{"常委", "j"}},
[]WordTag{WordTag{"程序员", "n"}, WordTag{"祝", "v"}, WordTag{"海林", "nz"}, WordTag{"和", "c"}, WordTag{"朱", "nr"}, WordTag{"会", "v"}, WordTag{"震", "v"}, WordTag{"是", "v"}, WordTag{"在", "p"}, WordTag{"孙", "zg"}, WordTag{"健", "a"}, WordTag{"的", "uj"}, WordTag{"左面", "f"}, WordTag{"和", "c"}, WordTag{"右面", "f"}, WordTag{",", "x"}, WordTag{" ", "x"}, WordTag{"范", "nr"}, WordTag{"凯", "nr"}, WordTag{"在", "p"}, WordTag{"最", "d"}, WordTag{"右面", "f"}, WordTag{".", "x"}, WordTag{"再", "d"}, WordTag{"往", "zg"}, WordTag{"左", "m"}, WordTag{"是", "v"}, WordTag{"李", "nr"}, WordTag{"松", "v"}, WordTag{"洪", "nr"}},
[]WordTag{WordTag{"一次性", "d"}, WordTag{"交", "v"}, WordTag{"多少", "m"}, WordTag{"钱", "n"}},
[]WordTag{WordTag{"两块", "m"}, WordTag{"五", "m"}, WordTag{"一套", "m"}, WordTag{"", "x"}, WordTag{"三块", "m"}, WordTag{"八", "m"}, WordTag{"一斤", "m"}, WordTag{"", "x"}, WordTag{"四块", "m"}, WordTag{"七", "m"}, WordTag{"一本", "m"}, WordTag{"", "x"}, WordTag{"五块", "m"}, WordTag{"六", "m"}, WordTag{"一条", "m"}},
[]WordTag{WordTag{"小", "a"}, WordTag{"和尚", "nr"}, WordTag{"留", "v"}, WordTag{"了", "ul"}, WordTag{"一个", "m"}, WordTag{"像", "v"}, WordTag{"大", "a"}, WordTag{"和尚", "nr"}, WordTag{"一样", "r"}, WordTag{"的", "uj"}, WordTag{"和尚头", "nr"}},
[]WordTag{WordTag{"我", "r"}, WordTag{"是", "v"}, WordTag{"中华人民共和国", "ns"}, WordTag{"公民", "n"}, WordTag{";", "x"}, WordTag{"我", "r"}, WordTag{"爸爸", "n"}, WordTag{"是", "v"}, WordTag{"共和党", "nt"}, WordTag{"党员", "n"}, WordTag{";", "x"}, WordTag{" ", "x"}, WordTag{"地铁", "n"}, WordTag{"和平门", "ns"}, WordTag{"站", "v"}},
[]WordTag{WordTag{"张晓梅", "nr"}, WordTag{"去", "v"}, WordTag{"人民", "n"}, WordTag{"医院", "n"}, WordTag{"做", "v"}, WordTag{"了", "ul"}, WordTag{"个", "q"}, WordTag{"B超", "n"}, WordTag{"然后", "c"}, WordTag{"去", "v"}, WordTag{"买", "v"}, WordTag{"了", "ul"}, WordTag{"件", "zg"}, WordTag{"T恤", "n"}},
[]WordTag{WordTag{"AT&T", "nz"}, WordTag{"是", "v"}, WordTag{"一件", "m"}, WordTag{"不错", "a"}, WordTag{"的", "uj"}, WordTag{"公司", "n"}, WordTag{"", "x"}, WordTag{"给", "p"}, WordTag{"你", "r"}, WordTag{"发", "v"}, WordTag{"offer", "eng"}, WordTag{"了", "ul"}, WordTag{"吗", "y"}, WordTag{"", "x"}},
[]WordTag{WordTag{"C++", "nz"}, WordTag{"和", "c"}, WordTag{"c#", "nz"}, WordTag{"是", "v"}, WordTag{"什么", "r"}, WordTag{"关系", "n"}, WordTag{"", "x"}, WordTag{"11", "eng"}, WordTag{"+", "x"}, WordTag{"122", "eng"}, WordTag{"=", "x"}, WordTag{"133", "eng"}, WordTag{"", "x"}, WordTag{"是", "v"}, WordTag{"吗", "y"}, WordTag{"", "x"}, WordTag{"PI", "eng"}, WordTag{"=", "x"}, WordTag{"3", "eng"}, WordTag{".", "x"}, WordTag{"14159", "eng"}},
[]WordTag{WordTag{"你", "r"}, WordTag{"认识", "v"}, WordTag{"那个", "r"}, WordTag{"和", "c"}, WordTag{"主席", "n"}, WordTag{"握手", "v"}, WordTag{"的", "uj"}, WordTag{"的哥", "n"}, WordTag{"吗", "y"}, WordTag{"", "x"}, WordTag{"他", "r"}, WordTag{"开", "v"}, WordTag{"一辆", "m"}, WordTag{"黑色", "n"}, WordTag{"的士", "n"}, WordTag{"。", "x"}},
[]WordTag{WordTag{"枪杆子", "n"}, WordTag{"中", "f"}, WordTag{"出", "v"}, WordTag{"政权", "n"}},
noHMMCutResult = [][]Segment{
[]Segment{Segment{"这", "r"}, Segment{"是", "v"}, Segment{"一个", "m"}, Segment{"伸手不见五指", "i"}, Segment{"的", "uj"}, Segment{"黑夜", "n"}, Segment{"。", "x"}, Segment{"我", "r"}, Segment{"叫", "v"}, Segment{"孙悟空", "nr"}, Segment{"", "x"}, Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"北京", "ns"}, Segment{"", "x"}, Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"Python", "eng"}, Segment{"和", "c"}, Segment{"C++", "nz"}, Segment{"。", "x"}},
[]Segment{Segment{"我", "r"}, Segment{"不", "d"}, Segment{"喜欢", "v"}, Segment{"日本", "ns"}, Segment{"和服", "nz"}, Segment{"。", "x"}},
[]Segment{Segment{"雷猴", "n"}, Segment{"回归", "v"}, Segment{"人间", "n"}, Segment{"。", "x"}},
[]Segment{Segment{"工信处", "n"}, Segment{"女干事", "n"}, Segment{"每月", "r"}, Segment{"经过", "p"}, Segment{"下属", "v"}, Segment{"科室", "n"}, Segment{"都", "d"}, Segment{"要", "v"}, Segment{"亲口", "n"}, Segment{"交代", "n"}, Segment{"24", "eng"}, Segment{"口", "q"}, Segment{"交换机", "n"}, Segment{"等", "u"}, Segment{"技术性", "n"}, Segment{"器件", "n"}, Segment{"的", "uj"}, Segment{"安装", "v"}, Segment{"工作", "vn"}},
[]Segment{Segment{"我", "r"}, Segment{"需要", "v"}, Segment{"廉租房", "n"}},
[]Segment{Segment{"永和", "nz"}, Segment{"服装", "vn"}, Segment{"饰品", "n"}, Segment{"有限公司", "n"}},
[]Segment{Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"北京", "ns"}, Segment{"天安门", "ns"}},
[]Segment{Segment{"abc", "eng"}},
[]Segment{Segment{"隐", "n"}, Segment{"马尔可夫", "nr"}},
[]Segment{Segment{"雷猴", "n"}, Segment{"是", "v"}, Segment{"个", "q"}, Segment{"好", "a"}, Segment{"网站", "n"}},
[]Segment{Segment{"“", "x"}, Segment{"Microsoft", "eng"}, Segment{"”", "x"}, Segment{"一", "m"}, Segment{"词", "n"}, Segment{"由", "p"}, Segment{"“", "x"}, Segment{"MICROcomputer", "eng"}, Segment{"", "x"}, Segment{"微型", "b"}, Segment{"计算机", "n"}, Segment{"", "x"}, Segment{"”", "x"}, Segment{"和", "c"}, Segment{"“", "x"}, Segment{"SOFTware", "eng"}, Segment{"", "x"}, Segment{"软件", "n"}, Segment{"", "x"}, Segment{"”", "x"}, Segment{"两", "m"}, Segment{"部分", "n"}, Segment{"组成", "v"}},
[]Segment{Segment{"草泥马", "n"}, Segment{"和", "c"}, Segment{"欺", "vn"}, Segment{"实", "n"}, Segment{"马", "n"}, Segment{"是", "v"}, Segment{"今年", "t"}, Segment{"的", "uj"}, Segment{"流行", "v"}, Segment{"词汇", "n"}},
[]Segment{Segment{"伊", "ns"}, Segment{"藤", "nr"}, Segment{"洋华堂", "n"}, Segment{"总府", "n"}, Segment{"店", "n"}},
[]Segment{Segment{"中国科学院计算技术研究所", "nt"}},
[]Segment{Segment{"罗密欧", "nr"}, Segment{"与", "p"}, Segment{"朱丽叶", "nr"}},
[]Segment{Segment{"我", "r"}, Segment{"购买", "v"}, Segment{"了", "ul"}, Segment{"道具", "n"}, Segment{"和", "c"}, Segment{"服装", "vn"}},
[]Segment{Segment{"PS", "eng"}, Segment{":", "x"}, Segment{" ", "x"}, Segment{"我", "r"}, Segment{"觉得", "v"}, Segment{"开源", "n"}, Segment{"有", "v"}, Segment{"一个", "m"}, Segment{"好处", "d"}, Segment{"", "x"}, Segment{"就是", "d"}, Segment{"能够", "v"}, Segment{"敦促", "v"}, Segment{"自己", "r"}, Segment{"不断改进", "l"}, Segment{"", "x"}, Segment{"避免", "v"}, Segment{"敞", "v"}, Segment{"帚", "ng"}, Segment{"自珍", "b"}},
[]Segment{Segment{"湖北省", "ns"}, Segment{"石首市", "ns"}},
[]Segment{Segment{"湖北省", "ns"}, Segment{"十堰市", "ns"}},
[]Segment{Segment{"总经理", "n"}, Segment{"完成", "v"}, Segment{"了", "ul"}, Segment{"这件", "mq"}, Segment{"事情", "n"}},
[]Segment{Segment{"电脑", "n"}, Segment{"修好", "v"}, Segment{"了", "ul"}},
[]Segment{Segment{"做好", "v"}, Segment{"了", "ul"}, Segment{"这件", "mq"}, Segment{"事情", "n"}, Segment{"就", "d"}, Segment{"一了百了", "l"}, Segment{"了", "ul"}},
[]Segment{Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
[]Segment{Segment{"我们", "r"}, Segment{"买", "v"}, Segment{"了", "ul"}, Segment{"一个", "m"}, Segment{"美的", "nr"}, Segment{"空调", "n"}},
[]Segment{Segment{"线程", "n"}, Segment{"初始化", "l"}, Segment{"时", "n"}, Segment{"我们", "r"}, Segment{"要", "v"}, Segment{"注意", "v"}},
[]Segment{Segment{"一个", "m"}, Segment{"分子", "n"}, Segment{"是", "v"}, Segment{"由", "p"}, Segment{"好多", "m"}, Segment{"原子", "n"}, Segment{"组织", "v"}, Segment{"成", "n"}, Segment{"的", "uj"}},
[]Segment{Segment{"祝", "v"}, Segment{"你", "r"}, Segment{"马到功成", "i"}},
[]Segment{Segment{"他", "r"}, Segment{"掉", "zg"}, Segment{"进", "v"}, Segment{"了", "ul"}, Segment{"无底洞", "ns"}, Segment{"里", "f"}},
[]Segment{Segment{"中国", "ns"}, Segment{"的", "uj"}, Segment{"首都", "d"}, Segment{"是", "v"}, Segment{"北京", "ns"}},
[]Segment{Segment{"孙", "zg"}, Segment{"君", "nz"}, Segment{"意", "n"}},
[]Segment{Segment{"外交部", "nt"}, Segment{"发言人", "l"}, Segment{"马朝旭", "nr"}},
[]Segment{Segment{"领导人", "n"}, Segment{"会议", "n"}, Segment{"和", "c"}, Segment{"第四届", "m"}, Segment{"东亚", "ns"}, Segment{"峰会", "n"}},
[]Segment{Segment{"在", "p"}, Segment{"过去", "t"}, Segment{"的", "uj"}, Segment{"这", "r"}, Segment{"五年", "t"}},
[]Segment{Segment{"还", "d"}, Segment{"需要", "v"}, Segment{"很", "zg"}, Segment{"长", "a"}, Segment{"的", "uj"}, Segment{"路", "n"}, Segment{"要", "v"}, Segment{"走", "v"}},
[]Segment{Segment{"60", "eng"}, Segment{"周年", "t"}, Segment{"首都", "d"}, Segment{"阅兵", "v"}},
[]Segment{Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
[]Segment{Segment{"买", "v"}, Segment{"水果", "n"}, Segment{"然后", "c"}, Segment{"来", "v"}, Segment{"世博园", "nr"}},
[]Segment{Segment{"买", "v"}, Segment{"水果", "n"}, Segment{"然后", "c"}, Segment{"去", "v"}, Segment{"世博园", "nr"}},
[]Segment{Segment{"但是", "c"}, Segment{"后来", "t"}, Segment{"我", "r"}, Segment{"才", "d"}, Segment{"知道", "v"}, Segment{"你", "r"}, Segment{"是", "v"}, Segment{"对", "p"}, Segment{"的", "uj"}},
[]Segment{Segment{"存在", "v"}, Segment{"即", "v"}, Segment{"合理", "vn"}},
[]Segment{Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"在", "p"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"就", "d"}, Segment{"以", "p"}, Segment{"和", "c"}, Segment{"和", "c"}, Segment{"和", "c"}},
[]Segment{Segment{"I", "eng"}, Segment{" ", "x"}, Segment{"love", "eng"}, Segment{"你", "r"}, Segment{"", "x"}, Segment{"不以为耻", "i"}, Segment{"", "x"}, Segment{"反", "zg"}, Segment{"以为", "c"}, Segment{"rong", "eng"}},
[]Segment{Segment{"因", "p"}},
[]Segment{},
[]Segment{Segment{"hello", "eng"}, Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
[]Segment{Segment{"很", "zg"}, Segment{"好", "a"}, Segment{"但", "c"}, Segment{"主要", "b"}, Segment{"是", "v"}, Segment{"基于", "p"}, Segment{"网页", "n"}, Segment{"形式", "n"}},
[]Segment{Segment{"hello", "eng"}, Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
[]Segment{Segment{"为什么", "r"}, Segment{"我", "r"}, Segment{"不能", "v"}, Segment{"拥有", "v"}, Segment{"想要", "v"}, Segment{"的", "uj"}, Segment{"生活", "vn"}},
[]Segment{Segment{"后来", "t"}, Segment{"我", "r"}, Segment{"才", "d"}},
[]Segment{Segment{"此次", "r"}, Segment{"来", "v"}, Segment{"中国", "ns"}, Segment{"是", "v"}, Segment{"为了", "p"}},
[]Segment{Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
[]Segment{Segment{",", "x"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
[]Segment{Segment{"其实", "d"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
[]Segment{Segment{"好人", "n"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
[]Segment{Segment{"是因为", "c"}, Segment{"和", "c"}, Segment{"国家", "n"}},
[]Segment{Segment{"老年", "t"}, Segment{"搜索", "v"}, Segment{"还", "d"}, Segment{"支持", "v"}},
[]Segment{Segment{"干脆", "d"}, Segment{"就", "d"}, Segment{"把", "p"}, Segment{"那", "r"}, Segment{"部", "n"}, Segment{"蒙", "v"}, Segment{"人", "n"}, Segment{"的", "uj"}, Segment{"闲", "n"}, Segment{"法", "j"}, Segment{"给", "p"}, Segment{"废", "v"}, Segment{"了", "ul"}, Segment{"拉倒", "v"}, Segment{"", "x"}, Segment{"RT", "eng"}, Segment{" ", "x"}, Segment{"@", "x"}, Segment{"laoshipukong", "eng"}, Segment{" ", "x"}, Segment{":", "x"}, Segment{" ", "x"}, Segment{"27", "eng"}, Segment{"日", "m"}, Segment{"", "x"}, Segment{"全国人大常委会", "nt"}, Segment{"第三次", "m"}, Segment{"审议", "v"}, Segment{"侵权", "v"}, Segment{"责任法", "n"}, Segment{"草案", "n"}, Segment{"", "x"}, Segment{"删除", "v"}, Segment{"了", "ul"}, Segment{"有关", "vn"}, Segment{"医疗", "n"}, Segment{"损害", "v"}, Segment{"责任", "n"}, Segment{"“", "x"}, Segment{"举证", "v"}, Segment{"倒置", "v"}, Segment{"”", "x"}, Segment{"的", "uj"}, Segment{"规定", "n"}, Segment{"。", "x"}, Segment{"在", "p"}, Segment{"医患", "n"}, Segment{"纠纷", "n"}, Segment{"中", "f"}, Segment{"本", "r"}, Segment{"已", "d"}, Segment{"处于", "v"}, Segment{"弱势", "n"}, Segment{"地位", "n"}, Segment{"的", "uj"}, Segment{"消费者", "n"}, Segment{"由此", "c"}, Segment{"将", "d"}, Segment{"陷入", "v"}, Segment{"万劫不复", "i"}, Segment{"的", "uj"}, Segment{"境地", "s"}, Segment{"。", "x"}, Segment{" ", "x"}},
[]Segment{Segment{"大", "a"}},
[]Segment{},
[]Segment{Segment{"他", "r"}, Segment{"说", "v"}, Segment{"的", "uj"}, Segment{"确实", "ad"}, Segment{"在", "p"}, Segment{"理", "n"}},
[]Segment{Segment{"长春", "ns"}, Segment{"市长", "n"}, Segment{"春节", "t"}, Segment{"讲话", "n"}},
[]Segment{Segment{"结婚", "v"}, Segment{"的", "uj"}, Segment{"和", "c"}, Segment{"尚未", "d"}, Segment{"结婚", "v"}, Segment{"的", "uj"}},
[]Segment{Segment{"结合", "v"}, Segment{"成", "n"}, Segment{"分子", "n"}, Segment{"时", "n"}},
[]Segment{Segment{"旅游", "vn"}, Segment{"和", "c"}, Segment{"服务", "vn"}, Segment{"是", "v"}, Segment{"最好", "a"}, Segment{"的", "uj"}},
[]Segment{Segment{"这件", "mq"}, Segment{"事情", "n"}, Segment{"的确", "d"}, Segment{"是", "v"}, Segment{"我", "r"}, Segment{"的", "uj"}, Segment{"错", "v"}},
[]Segment{Segment{"供", "v"}, Segment{"大家", "n"}, Segment{"参考", "v"}, Segment{"指正", "v"}},
[]Segment{Segment{"哈尔滨", "ns"}, Segment{"政府", "n"}, Segment{"公布", "v"}, Segment{"塌", "v"}, Segment{"桥", "n"}, Segment{"原因", "n"}},
[]Segment{Segment{"我", "r"}, Segment{"在", "p"}, Segment{"机场", "n"}, Segment{"入口处", "i"}},
[]Segment{Segment{"邢", "nr"}, Segment{"永", "ns"}, Segment{"臣", "n"}, Segment{"摄影", "n"}, Segment{"报道", "v"}},
[]Segment{Segment{"BP", "eng"}, Segment{"神经网络", "n"}, Segment{"如何", "r"}, Segment{"训练", "vn"}, Segment{"才能", "v"}, Segment{"在", "p"}, Segment{"分类", "n"}, Segment{"时", "n"}, Segment{"增加", "v"}, Segment{"区分度", "n"}, Segment{"", "x"}},
[]Segment{Segment{"南京市", "ns"}, Segment{"长江大桥", "ns"}},
[]Segment{Segment{"应", "v"}, Segment{"一些", "m"}, Segment{"使用者", "n"}, Segment{"的", "uj"}, Segment{"建议", "n"}, Segment{"", "x"}, Segment{"也", "d"}, Segment{"为了", "p"}, Segment{"便于", "v"}, Segment{"利用", "n"}, Segment{"NiuTrans", "eng"}, Segment{"用于", "v"}, Segment{"SMT", "eng"}, Segment{"研究", "vn"}},
[]Segment{Segment{"长春市", "ns"}, Segment{"长春", "ns"}, Segment{"药店", "n"}},
[]Segment{Segment{"邓颖超", "nr"}, Segment{"生前", "t"}, Segment{"最", "d"}, Segment{"喜欢", "v"}, Segment{"的", "uj"}, Segment{"衣服", "n"}},
[]Segment{Segment{"胡锦涛", "nr"}, Segment{"是", "v"}, Segment{"热爱", "a"}, Segment{"世界", "n"}, Segment{"和平", "nz"}, Segment{"的", "uj"}, Segment{"政治局", "n"}, Segment{"常委", "j"}},
[]Segment{Segment{"程序员", "n"}, Segment{"祝", "v"}, Segment{"海林", "nz"}, Segment{"和", "c"}, Segment{"朱", "nr"}, Segment{"会", "v"}, Segment{"震", "v"}, Segment{"是", "v"}, Segment{"在", "p"}, Segment{"孙", "zg"}, Segment{"健", "a"}, Segment{"的", "uj"}, Segment{"左面", "f"}, Segment{"和", "c"}, Segment{"右面", "f"}, Segment{",", "x"}, Segment{" ", "x"}, Segment{"范", "nr"}, Segment{"凯", "nr"}, Segment{"在", "p"}, Segment{"最", "d"}, Segment{"右面", "f"}, Segment{".", "x"}, Segment{"再", "d"}, Segment{"往", "zg"}, Segment{"左", "m"}, Segment{"是", "v"}, Segment{"李", "nr"}, Segment{"松", "v"}, Segment{"洪", "nr"}},
[]Segment{Segment{"一次性", "d"}, Segment{"交", "v"}, Segment{"多少", "m"}, Segment{"钱", "n"}},
[]Segment{Segment{"两块", "m"}, Segment{"五", "m"}, Segment{"一套", "m"}, Segment{"", "x"}, Segment{"三块", "m"}, Segment{"八", "m"}, Segment{"一斤", "m"}, Segment{"", "x"}, Segment{"四块", "m"}, Segment{"七", "m"}, Segment{"一本", "m"}, Segment{"", "x"}, Segment{"五块", "m"}, Segment{"六", "m"}, Segment{"一条", "m"}},
[]Segment{Segment{"小", "a"}, Segment{"和尚", "nr"}, Segment{"留", "v"}, Segment{"了", "ul"}, Segment{"一个", "m"}, Segment{"像", "v"}, Segment{"大", "a"}, Segment{"和尚", "nr"}, Segment{"一样", "r"}, Segment{"的", "uj"}, Segment{"和尚头", "nr"}},
[]Segment{Segment{"我", "r"}, Segment{"是", "v"}, Segment{"中华人民共和国", "ns"}, Segment{"公民", "n"}, Segment{";", "x"}, Segment{"我", "r"}, Segment{"爸爸", "n"}, Segment{"是", "v"}, Segment{"共和党", "nt"}, Segment{"党员", "n"}, Segment{";", "x"}, Segment{" ", "x"}, Segment{"地铁", "n"}, Segment{"和平门", "ns"}, Segment{"站", "v"}},
[]Segment{Segment{"张晓梅", "nr"}, Segment{"去", "v"}, Segment{"人民", "n"}, Segment{"医院", "n"}, Segment{"做", "v"}, Segment{"了", "ul"}, Segment{"个", "q"}, Segment{"B超", "n"}, Segment{"然后", "c"}, Segment{"去", "v"}, Segment{"买", "v"}, Segment{"了", "ul"}, Segment{"件", "zg"}, Segment{"T恤", "n"}},
[]Segment{Segment{"AT&T", "nz"}, Segment{"是", "v"}, Segment{"一件", "m"}, Segment{"不错", "a"}, Segment{"的", "uj"}, Segment{"公司", "n"}, Segment{"", "x"}, Segment{"给", "p"}, Segment{"你", "r"}, Segment{"发", "v"}, Segment{"offer", "eng"}, Segment{"了", "ul"}, Segment{"吗", "y"}, Segment{"", "x"}},
[]Segment{Segment{"C++", "nz"}, Segment{"和", "c"}, Segment{"c#", "nz"}, Segment{"是", "v"}, Segment{"什么", "r"}, Segment{"关系", "n"}, Segment{"", "x"}, Segment{"11", "eng"}, Segment{"+", "x"}, Segment{"122", "eng"}, Segment{"=", "x"}, Segment{"133", "eng"}, Segment{"", "x"}, Segment{"是", "v"}, Segment{"吗", "y"}, Segment{"", "x"}, Segment{"PI", "eng"}, Segment{"=", "x"}, Segment{"3", "eng"}, Segment{".", "x"}, Segment{"14159", "eng"}},
[]Segment{Segment{"你", "r"}, Segment{"认识", "v"}, Segment{"那个", "r"}, Segment{"和", "c"}, Segment{"主席", "n"}, Segment{"握手", "v"}, Segment{"的", "uj"}, Segment{"的哥", "n"}, Segment{"吗", "y"}, Segment{"", "x"}, Segment{"他", "r"}, Segment{"开", "v"}, Segment{"一辆", "m"}, Segment{"黑色", "n"}, Segment{"的士", "n"}, Segment{"。", "x"}},
[]Segment{Segment{"枪杆子", "n"}, Segment{"中", "f"}, Segment{"出", "v"}, Segment{"政权", "n"}},
}
)
func chanToArray(ch chan WordTag) []WordTag {
result := make([]WordTag, 0)
// init loads the main dictionary file into the package-level seg during
// package initialization, i.e. before any test or benchmark body runs.
//
// NOTE(review): whatever LoadDictionary returns is discarded here — confirm
// whether it reports an error that should abort the test run.
func init() {
	const dictPath = "../dict.txt"
	seg.LoadDictionary(dictPath)
}
func chanToArray(ch <-chan Segment) []Segment {
var result []Segment
for word := range ch {
result = append(result, word)
}
@@ -277,136 +281,148 @@ func chanToArray(ch chan WordTag) []WordTag {
}
// TestCut cuts every test sentence twice and compares each result, element by
// element, with the expected tables: defaultCutResult for Cut(content, true)
// and noHMMCutResult for Cut(content, false).
// NOTE(review): judging by the table name, the boolean presumably toggles HMM
// based segmentation — confirm against Cut.
// NOTE(review): this span comes from a rendered diff, so the removed and the
// added form of several statements appear back to back.
func TestCut(t *testing.T) {
SetDictionary("../dict.txt")
for index, content := range test_contents {
result := chanToArray(Cut(content, true))
for index, content := range testContents {
result := chanToArray(seg.Cut(content, true))
// Check the lengths first: the loop below indexes the expected slice with
// positions taken from result.
if len(defaultCutResult[index]) != len(result) {
t.Error(content)
t.Errorf("default cut for %s length should be %d not %d\n",
content, len(defaultCutResult[index]), len(result))
t.Errorf("expect: %v\n", defaultCutResult[index])
t.Fatalf("got: %v\n", result)
}
// Element-wise comparison of word and tag against the expected entry.
for i, _ := range result {
for i := range result {
if result[i] != defaultCutResult[index][i] {
t.Error(content)
t.Fatalf("expect %s, got %s", defaultCutResult[index][i], result[i])
}
}
// Same two checks for the second mode (second argument false).
result = chanToArray(Cut(content, false))
result = chanToArray(seg.Cut(content, false))
if len(noHMMCutResult[index]) != len(result) {
t.Error(content)
t.Fatal(content)
}
for i, _ := range result {
for i := range result {
if result[i] != noHMMCutResult[index][i] {
t.Error(content)
t.Fatal(content)
}
}
}
}
// TestBug132 is the regression test for
// https://github.com/fxsjy/jieba/issues/132: the sentence below must be cut
// into exactly the four tagged words listed in cutResult.
// NOTE(review): this span comes from a rendered diff, so the removed and the
// added form of several statements appear back to back.
func TestBug132(t *testing.T) {
/*
https://github.com/fxsjy/jieba/issues/132
*/
SetDictionary("../dict.txt")
sentence := "又跛又啞"
// Expected word/tag pairs, in output order.
cutResult := []WordTag{
WordTag{"又", "d"},
WordTag{"跛", "a"},
WordTag{"又", "d"},
WordTag{"啞", "v"},
cutResult := []Segment{
Segment{"又", "d"},
Segment{"跛", "a"},
Segment{"又", "d"},
Segment{"啞", "v"},
}
result := chanToArray(Cut(sentence, true))
result := chanToArray(seg.Cut(sentence, true))
// Check the lengths first: the loop below indexes cutResult with positions
// taken from result.
if len(cutResult) != len(result) {
t.Error(result)
t.Fatal(result)
}
for i, _ := range result {
for i := range result {
if result[i] != cutResult[i] {
t.Error(result[i])
t.Fatal(result[i])
}
}
}
// TestBug137 is the regression test for
// https://github.com/fxsjy/jieba/issues/137: the sentence below must be cut
// into exactly the tagged words listed in cutResult, in that order.
// NOTE(review): this span comes from a rendered diff, so the removed and the
// added form of several statements appear back to back.
func TestBug137(t *testing.T) {
/*
https://github.com/fxsjy/jieba/issues/137
*/
SetDictionary("../dict.txt")
sentence := "前港督衛奕信在八八年十月宣布成立中央政策研究組"
// Expected word/tag pairs, in output order.
cutResult := []WordTag{
WordTag{"前", "f"},
WordTag{"港督", "n"},
WordTag{"衛奕", "z"},
WordTag{"信", "n"},
WordTag{"在", "p"},
WordTag{"八八年", "m"},
WordTag{"十月", "t"},
WordTag{"宣布", "v"},
WordTag{"成立", "v"},
WordTag{"中央", "n"},
WordTag{"政策", "n"},
WordTag{"研究", "vn"},
WordTag{"組", "x"},
cutResult := []Segment{
Segment{"前", "f"},
Segment{"港督", "n"},
Segment{"衛奕", "z"},
Segment{"信", "n"},
Segment{"在", "p"},
Segment{"八八年", "m"},
Segment{"十月", "t"},
Segment{"宣布", "v"},
Segment{"成立", "v"},
Segment{"中央", "n"},
Segment{"政策", "n"},
Segment{"研究", "vn"},
Segment{"組", "x"},
}
result := chanToArray(Cut(sentence, true))
result := chanToArray(seg.Cut(sentence, true))
// Check the lengths first: the loop below indexes cutResult with positions
// taken from result.
if len(cutResult) != len(result) {
t.Error(result)
t.Fatal(result)
}
for i, _ := range result {
for i := range result {
if result[i] != cutResult[i] {
t.Error(result[i])
t.Fatal(result[i])
}
}
}
// TestUserDict loads ../userdict.txt and checks that the sentence below is cut
// into exactly the tagged words listed in cutResult, in that order.
// NOTE(review): this span comes from a rendered diff, so the removed and the
// added form of several statements appear back to back.
func TestUserDict(t *testing.T) {
SetDictionary("../dict.txt")
jiebago.LoadUserDict("../userdict.txt")
seg.LoadUserDictionary("../userdict.txt")
// Reload the main dictionary when the test returns — presumably so the user
// entries do not leak into other tests that share seg; confirm.
defer seg.LoadDictionary("../dict.txt")
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型"
// Expected word/tag pairs, in output order.
cutResult := []WordTag{
WordTag{"李小福", "nr"},
WordTag{"是", "v"},
WordTag{"创新办", "i"},
WordTag{"主任", "b"},
WordTag{"也", "d"},
WordTag{"是", "v"},
WordTag{"云计算", "x"},
WordTag{"方面", "n"},
WordTag{"的", "uj"},
WordTag{"专家", "n"},
WordTag{";", "x"},
WordTag{" ", "x"},
WordTag{"什么", "r"},
WordTag{"是", "v"},
WordTag{"八一双鹿", "nz"},
WordTag{"例如", "v"},
WordTag{"我", "r"},
WordTag{"输入", "v"},
WordTag{"一个", "m"},
WordTag{"带", "v"},
WordTag{"“", "x"},
WordTag{"韩玉赏鉴", "nz"},
WordTag{"”", "x"},
WordTag{"的", "uj"},
WordTag{"标题", "n"},
WordTag{"", "x"},
WordTag{"在", "p"},
WordTag{"自定义词", "n"},
WordTag{"库中", "nrt"},
WordTag{"也", "d"},
WordTag{"增加", "v"},
WordTag{"了", "ul"},
WordTag{"此", "r"},
WordTag{"词", "n"},
WordTag{"为", "p"},
WordTag{"N", "eng"},
WordTag{"类型", "n"}}
cutResult := []Segment{
Segment{"李小福", "nr"},
Segment{"是", "v"},
Segment{"创新办", "i"},
Segment{"主任", "b"},
Segment{"也", "d"},
Segment{"是", "v"},
Segment{"云计算", "x"},
Segment{"方面", "n"},
Segment{"的", "uj"},
Segment{"专家", "n"},
Segment{";", "x"},
Segment{" ", "x"},
Segment{"什么", "r"},
Segment{"是", "v"},
Segment{"八一双鹿", "nz"},
Segment{"例如", "v"},
Segment{"我", "r"},
Segment{"输入", "v"},
Segment{"一个", "m"},
Segment{"带", "v"},
Segment{"“", "x"},
Segment{"韩玉赏鉴", "nz"},
Segment{"”", "x"},
Segment{"的", "uj"},
Segment{"标题", "n"},
Segment{"", "x"},
Segment{"在", "p"},
Segment{"自定义词", "n"},
Segment{"库中", "nrt"},
Segment{"也", "d"},
Segment{"增加", "v"},
Segment{"了", "ul"},
Segment{"此", "r"},
Segment{"词", "n"},
Segment{"为", "p"},
Segment{"N", "eng"},
Segment{"类型", "n"}}
result := chanToArray(Cut(sentence, true))
result := chanToArray(seg.Cut(sentence, true))
// Check the lengths first: the loop below indexes cutResult with positions
// taken from result.
if len(cutResult) != len(result) {
t.Error(result)
t.Fatal(result)
}
for i, _ := range result {
for i := range result {
if result[i] != cutResult[i] {
t.Error(result[i])
t.Fatal(result[i])
}
}
}
// BenchmarkCutNoHMM times seg.Cut on one fixed sentence with the second
// argument set to false. Each iteration passes the returned channel to
// chanToArray so the whole result is consumed.
func BenchmarkCutNoHMM(b *testing.B) {
	const text = "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"

	// Restart the clock so nothing before the loop is counted.
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		chanToArray(seg.Cut(text, false))
	}
}
// BenchmarkCut times seg.Cut on one fixed sentence with the second argument
// set to true. Each iteration passes the returned channel to chanToArray so
// the whole result is consumed.
func BenchmarkCut(b *testing.B) {
	const text = "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"

	// Restart the clock so nothing before the loop is counted.
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		chanToArray(seg.Cut(text, true))
	}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,264 +1,260 @@
package posseg
// probStart maps each (state, tag) pair to its start score; the entries are
// filled in by init.
// NOTE(review): the values look like log-probabilities, with -3.14e+100
// standing in for "impossible" — confirm against the code that reads the map.
var probStart = make(map[stateTag]float64)
func init() {
probStart[stateTag{'B', "a"}] = -4.762305214596967
probStart[stateTag{'B', "ad"}] = -6.680066036784177
probStart[stateTag{'B', "ag"}] = -3.14e+100
probStart[stateTag{'B', "an"}] = -8.697083223018778
probStart[stateTag{'B', "b"}] = -5.018374362109218
probStart[stateTag{'B', "bg"}] = -3.14e+100
probStart[stateTag{'B', "c"}] = -3.423880184954888
probStart[stateTag{'B', "d"}] = -3.9750475297585357
probStart[stateTag{'B', "df"}] = -8.888974230828882
probStart[stateTag{'B', "dg"}] = -3.14e+100
probStart[stateTag{'B', "e"}] = -8.563551830394255
probStart[stateTag{'B', "en"}] = -3.14e+100
probStart[stateTag{'B', "f"}] = -5.491630418482717
probStart[stateTag{'B', "g"}] = -3.14e+100
probStart[stateTag{'B', "h"}] = -13.533365129970255
probStart[stateTag{'B', "i"}] = -6.1157847275557105
probStart[stateTag{'B', "in"}] = -3.14e+100
probStart[stateTag{'B', "j"}] = -5.0576191284681915
probStart[stateTag{'B', "jn"}] = -3.14e+100
probStart[stateTag{'B', "k"}] = -3.14e+100
probStart[stateTag{'B', "l"}] = -4.905883584659895
probStart[stateTag{'B', "ln"}] = -3.14e+100
probStart[stateTag{'B', "m"}] = -3.6524299819046386
probStart[stateTag{'B', "mg"}] = -3.14e+100
probStart[stateTag{'B', "mq"}] = -6.78695300139688
probStart[stateTag{'B', "n"}] = -1.6966257797548328
probStart[stateTag{'B', "ng"}] = -3.14e+100
probStart[stateTag{'B', "nr"}] = -2.2310495913769506
probStart[stateTag{'B', "nrfg"}] = -5.873722175405573
probStart[stateTag{'B', "nrt"}] = -4.985642733519195
probStart[stateTag{'B', "ns"}] = -2.8228438314969213
probStart[stateTag{'B', "nt"}] = -4.846091668182416
probStart[stateTag{'B', "nz"}] = -3.94698846057672
probStart[stateTag{'B', "o"}] = -8.433498702146057
probStart[stateTag{'B', "p"}] = -4.200984132085048
probStart[stateTag{'B', "q"}] = -6.998123858956596
probStart[stateTag{'B', "qe"}] = -3.14e+100
probStart[stateTag{'B', "qg"}] = -3.14e+100
probStart[stateTag{'B', "r"}] = -3.4098187790818413
probStart[stateTag{'B', "rg"}] = -3.14e+100
probStart[stateTag{'B', "rr"}] = -12.434752841302146
probStart[stateTag{'B', "rz"}] = -7.946116471570005
probStart[stateTag{'B', "s"}] = -5.522673590839954
probStart[stateTag{'B', "t"}] = -3.3647479094528574
probStart[stateTag{'B', "tg"}] = -3.14e+100
probStart[stateTag{'B', "u"}] = -9.163917277503234
probStart[stateTag{'B', "ud"}] = -3.14e+100
probStart[stateTag{'B', "ug"}] = -3.14e+100
probStart[stateTag{'B', "uj"}] = -3.14e+100
probStart[stateTag{'B', "ul"}] = -3.14e+100
probStart[stateTag{'B', "uv"}] = -3.14e+100
probStart[stateTag{'B', "uz"}] = -3.14e+100
probStart[stateTag{'B', "v"}] = -2.6740584874265685
probStart[stateTag{'B', "vd"}] = -9.044728760238115
probStart[stateTag{'B', "vg"}] = -3.14e+100
probStart[stateTag{'B', "vi"}] = -12.434752841302146
probStart[stateTag{'B', "vn"}] = -4.3315610890163585
probStart[stateTag{'B', "vq"}] = -12.147070768850364
probStart[stateTag{'B', "w"}] = -3.14e+100
probStart[stateTag{'B', "x"}] = -3.14e+100
probStart[stateTag{'B', "y"}] = -9.844485675856319
probStart[stateTag{'B', "yg"}] = -3.14e+100
probStart[stateTag{'B', "z"}] = -7.045681111485645
probStart[stateTag{'B', "zg"}] = -3.14e+100
probStart[stateTag{'E', "a"}] = -3.14e+100
probStart[stateTag{'E', "ad"}] = -3.14e+100
probStart[stateTag{'E', "ag"}] = -3.14e+100
probStart[stateTag{'E', "an"}] = -3.14e+100
probStart[stateTag{'E', "b"}] = -3.14e+100
probStart[stateTag{'E', "bg"}] = -3.14e+100
probStart[stateTag{'E', "c"}] = -3.14e+100
probStart[stateTag{'E', "d"}] = -3.14e+100
probStart[stateTag{'E', "df"}] = -3.14e+100
probStart[stateTag{'E', "dg"}] = -3.14e+100
probStart[stateTag{'E', "e"}] = -3.14e+100
probStart[stateTag{'E', "en"}] = -3.14e+100
probStart[stateTag{'E', "f"}] = -3.14e+100
probStart[stateTag{'E', "g"}] = -3.14e+100
probStart[stateTag{'E', "h"}] = -3.14e+100
probStart[stateTag{'E', "i"}] = -3.14e+100
probStart[stateTag{'E', "in"}] = -3.14e+100
probStart[stateTag{'E', "j"}] = -3.14e+100
probStart[stateTag{'E', "jn"}] = -3.14e+100
probStart[stateTag{'E', "k"}] = -3.14e+100
probStart[stateTag{'E', "l"}] = -3.14e+100
probStart[stateTag{'E', "ln"}] = -3.14e+100
probStart[stateTag{'E', "m"}] = -3.14e+100
probStart[stateTag{'E', "mg"}] = -3.14e+100
probStart[stateTag{'E', "mq"}] = -3.14e+100
probStart[stateTag{'E', "n"}] = -3.14e+100
probStart[stateTag{'E', "ng"}] = -3.14e+100
probStart[stateTag{'E', "nr"}] = -3.14e+100
probStart[stateTag{'E', "nrfg"}] = -3.14e+100
probStart[stateTag{'E', "nrt"}] = -3.14e+100
probStart[stateTag{'E', "ns"}] = -3.14e+100
probStart[stateTag{'E', "nt"}] = -3.14e+100
probStart[stateTag{'E', "nz"}] = -3.14e+100
probStart[stateTag{'E', "o"}] = -3.14e+100
probStart[stateTag{'E', "p"}] = -3.14e+100
probStart[stateTag{'E', "q"}] = -3.14e+100
probStart[stateTag{'E', "qe"}] = -3.14e+100
probStart[stateTag{'E', "qg"}] = -3.14e+100
probStart[stateTag{'E', "r"}] = -3.14e+100
probStart[stateTag{'E', "rg"}] = -3.14e+100
probStart[stateTag{'E', "rr"}] = -3.14e+100
probStart[stateTag{'E', "rz"}] = -3.14e+100
probStart[stateTag{'E', "s"}] = -3.14e+100
probStart[stateTag{'E', "t"}] = -3.14e+100
probStart[stateTag{'E', "tg"}] = -3.14e+100
probStart[stateTag{'E', "u"}] = -3.14e+100
probStart[stateTag{'E', "ud"}] = -3.14e+100
probStart[stateTag{'E', "ug"}] = -3.14e+100
probStart[stateTag{'E', "uj"}] = -3.14e+100
probStart[stateTag{'E', "ul"}] = -3.14e+100
probStart[stateTag{'E', "uv"}] = -3.14e+100
probStart[stateTag{'E', "uz"}] = -3.14e+100
probStart[stateTag{'E', "v"}] = -3.14e+100
probStart[stateTag{'E', "vd"}] = -3.14e+100
probStart[stateTag{'E', "vg"}] = -3.14e+100
probStart[stateTag{'E', "vi"}] = -3.14e+100
probStart[stateTag{'E', "vn"}] = -3.14e+100
probStart[stateTag{'E', "vq"}] = -3.14e+100
probStart[stateTag{'E', "w"}] = -3.14e+100
probStart[stateTag{'E', "x"}] = -3.14e+100
probStart[stateTag{'E', "y"}] = -3.14e+100
probStart[stateTag{'E', "yg"}] = -3.14e+100
probStart[stateTag{'E', "z"}] = -3.14e+100
probStart[stateTag{'E', "zg"}] = -3.14e+100
probStart[stateTag{'M', "a"}] = -3.14e+100
probStart[stateTag{'M', "ad"}] = -3.14e+100
probStart[stateTag{'M', "ag"}] = -3.14e+100
probStart[stateTag{'M', "an"}] = -3.14e+100
probStart[stateTag{'M', "b"}] = -3.14e+100
probStart[stateTag{'M', "bg"}] = -3.14e+100
probStart[stateTag{'M', "c"}] = -3.14e+100
probStart[stateTag{'M', "d"}] = -3.14e+100
probStart[stateTag{'M', "df"}] = -3.14e+100
probStart[stateTag{'M', "dg"}] = -3.14e+100
probStart[stateTag{'M', "e"}] = -3.14e+100
probStart[stateTag{'M', "en"}] = -3.14e+100
probStart[stateTag{'M', "f"}] = -3.14e+100
probStart[stateTag{'M', "g"}] = -3.14e+100
probStart[stateTag{'M', "h"}] = -3.14e+100
probStart[stateTag{'M', "i"}] = -3.14e+100
probStart[stateTag{'M', "in"}] = -3.14e+100
probStart[stateTag{'M', "j"}] = -3.14e+100
probStart[stateTag{'M', "jn"}] = -3.14e+100
probStart[stateTag{'M', "k"}] = -3.14e+100
probStart[stateTag{'M', "l"}] = -3.14e+100
probStart[stateTag{'M', "ln"}] = -3.14e+100
probStart[stateTag{'M', "m"}] = -3.14e+100
probStart[stateTag{'M', "mg"}] = -3.14e+100
probStart[stateTag{'M', "mq"}] = -3.14e+100
probStart[stateTag{'M', "n"}] = -3.14e+100
probStart[stateTag{'M', "ng"}] = -3.14e+100
probStart[stateTag{'M', "nr"}] = -3.14e+100
probStart[stateTag{'M', "nrfg"}] = -3.14e+100
probStart[stateTag{'M', "nrt"}] = -3.14e+100
probStart[stateTag{'M', "ns"}] = -3.14e+100
probStart[stateTag{'M', "nt"}] = -3.14e+100
probStart[stateTag{'M', "nz"}] = -3.14e+100
probStart[stateTag{'M', "o"}] = -3.14e+100
probStart[stateTag{'M', "p"}] = -3.14e+100
probStart[stateTag{'M', "q"}] = -3.14e+100
probStart[stateTag{'M', "qe"}] = -3.14e+100
probStart[stateTag{'M', "qg"}] = -3.14e+100
probStart[stateTag{'M', "r"}] = -3.14e+100
probStart[stateTag{'M', "rg"}] = -3.14e+100
probStart[stateTag{'M', "rr"}] = -3.14e+100
probStart[stateTag{'M', "rz"}] = -3.14e+100
probStart[stateTag{'M', "s"}] = -3.14e+100
probStart[stateTag{'M', "t"}] = -3.14e+100
probStart[stateTag{'M', "tg"}] = -3.14e+100
probStart[stateTag{'M', "u"}] = -3.14e+100
probStart[stateTag{'M', "ud"}] = -3.14e+100
probStart[stateTag{'M', "ug"}] = -3.14e+100
probStart[stateTag{'M', "uj"}] = -3.14e+100
probStart[stateTag{'M', "ul"}] = -3.14e+100
probStart[stateTag{'M', "uv"}] = -3.14e+100
probStart[stateTag{'M', "uz"}] = -3.14e+100
probStart[stateTag{'M', "v"}] = -3.14e+100
probStart[stateTag{'M', "vd"}] = -3.14e+100
probStart[stateTag{'M', "vg"}] = -3.14e+100
probStart[stateTag{'M', "vi"}] = -3.14e+100
probStart[stateTag{'M', "vn"}] = -3.14e+100
probStart[stateTag{'M', "vq"}] = -3.14e+100
probStart[stateTag{'M', "w"}] = -3.14e+100
probStart[stateTag{'M', "x"}] = -3.14e+100
probStart[stateTag{'M', "y"}] = -3.14e+100
probStart[stateTag{'M', "yg"}] = -3.14e+100
probStart[stateTag{'M', "z"}] = -3.14e+100
probStart[stateTag{'M', "zg"}] = -3.14e+100
probStart[stateTag{'S', "a"}] = -3.9025396831295227
probStart[stateTag{'S', "ad"}] = -11.048458480182255
probStart[stateTag{'S', "ag"}] = -6.954113917960154
probStart[stateTag{'S', "an"}] = -12.84021794941031
probStart[stateTag{'S', "b"}] = -6.472888763970454
probStart[stateTag{'S', "bg"}] = -3.14e+100
probStart[stateTag{'S', "c"}] = -4.786966795861212
probStart[stateTag{'S', "d"}] = -3.903919764181873
probStart[stateTag{'S', "df"}] = -3.14e+100
probStart[stateTag{'S', "dg"}] = -8.948397651299683
probStart[stateTag{'S', "e"}] = -5.942513006281674
probStart[stateTag{'S', "en"}] = -3.14e+100
probStart[stateTag{'S', "f"}] = -5.194820249981676
probStart[stateTag{'S', "g"}] = -6.507826815331734
probStart[stateTag{'S', "h"}] = -8.650563207383884
probStart[stateTag{'S', "i"}] = -3.14e+100
probStart[stateTag{'S', "in"}] = -3.14e+100
probStart[stateTag{'S', "j"}] = -4.911992119644354
probStart[stateTag{'S', "jn"}] = -3.14e+100
probStart[stateTag{'S', "k"}] = -6.940320595827818
probStart[stateTag{'S', "l"}] = -3.14e+100
probStart[stateTag{'S', "ln"}] = -3.14e+100
probStart[stateTag{'S', "m"}] = -3.269200652116097
probStart[stateTag{'S', "mg"}] = -10.825314928868044
probStart[stateTag{'S', "mq"}] = -3.14e+100
probStart[stateTag{'S', "n"}] = -3.8551483897645107
probStart[stateTag{'S', "ng"}] = -4.913434861102905
probStart[stateTag{'S', "nr"}] = -4.483663103956885
probStart[stateTag{'S', "nrfg"}] = -3.14e+100
probStart[stateTag{'S', "nrt"}] = -3.14e+100
probStart[stateTag{'S', "ns"}] = -3.14e+100
probStart[stateTag{'S', "nt"}] = -12.147070768850364
probStart[stateTag{'S', "nz"}] = -3.14e+100
probStart[stateTag{'S', "o"}] = -8.464460927750023
probStart[stateTag{'S', "p"}] = -2.9868401813596317
probStart[stateTag{'S', "q"}] = -4.888658618255058
probStart[stateTag{'S', "qe"}] = -3.14e+100
probStart[stateTag{'S', "qg"}] = -3.14e+100
probStart[stateTag{'S', "r"}] = -2.7635336784127853
probStart[stateTag{'S', "rg"}] = -10.275268591948773
probStart[stateTag{'S', "rr"}] = -3.14e+100
probStart[stateTag{'S', "rz"}] = -3.14e+100
probStart[stateTag{'S', "s"}] = -3.14e+100
probStart[stateTag{'S', "t"}] = -3.14e+100
probStart[stateTag{'S', "tg"}] = -6.272842531880403
probStart[stateTag{'S', "u"}] = -6.940320595827818
probStart[stateTag{'S', "ud"}] = -7.728230161053767
probStart[stateTag{'S', "ug"}] = -7.5394037026636855
probStart[stateTag{'S', "uj"}] = -6.85251045118004
probStart[stateTag{'S', "ul"}] = -8.4153713175535
probStart[stateTag{'S', "uv"}] = -8.15808672228609
probStart[stateTag{'S', "uz"}] = -9.299258625372996
probStart[stateTag{'S', "v"}] = -3.053292303412302
probStart[stateTag{'S', "vd"}] = -3.14e+100
probStart[stateTag{'S', "vg"}] = -5.9430181843676895
probStart[stateTag{'S', "vi"}] = -3.14e+100
probStart[stateTag{'S', "vn"}] = -11.453923588290419
probStart[stateTag{'S', "vq"}] = -3.14e+100
probStart[stateTag{'S', "w"}] = -3.14e+100
probStart[stateTag{'S', "x"}] = -8.427419656069674
probStart[stateTag{'S', "y"}] = -6.1970794699489575
probStart[stateTag{'S', "yg"}] = -13.533365129970255
probStart[stateTag{'S', "z"}] = -3.14e+100
probStart[stateTag{'S', "zg"}] = -3.14e+100
var probStart = map[uint16]float64{
100: -4.762305214596967,
101: -6.680066036784177,
102: -3.14e+100,
103: -8.697083223018778,
104: -5.018374362109218,
105: -3.14e+100,
106: -3.423880184954888,
107: -3.9750475297585357,
108: -8.888974230828882,
109: -3.14e+100,
110: -8.563551830394255,
111: -3.14e+100,
112: -5.491630418482717,
113: -3.14e+100,
114: -13.533365129970255,
115: -6.1157847275557105,
116: -3.14e+100,
117: -5.0576191284681915,
118: -3.14e+100,
119: -3.14e+100,
120: -4.905883584659895,
121: -3.14e+100,
122: -3.6524299819046386,
123: -3.14e+100,
124: -6.78695300139688,
125: -1.6966257797548328,
126: -3.14e+100,
127: -2.2310495913769506,
128: -5.873722175405573,
129: -4.985642733519195,
130: -2.8228438314969213,
131: -4.846091668182416,
132: -3.94698846057672,
133: -8.433498702146057,
134: -4.200984132085048,
135: -6.998123858956596,
136: -3.14e+100,
137: -3.14e+100,
138: -3.4098187790818413,
139: -3.14e+100,
140: -12.434752841302146,
141: -7.946116471570005,
142: -5.522673590839954,
143: -3.3647479094528574,
144: -3.14e+100,
145: -9.163917277503234,
146: -3.14e+100,
147: -3.14e+100,
148: -3.14e+100,
149: -3.14e+100,
150: -3.14e+100,
151: -3.14e+100,
152: -2.6740584874265685,
153: -9.044728760238115,
154: -3.14e+100,
155: -12.434752841302146,
156: -4.3315610890163585,
157: -12.147070768850364,
158: -3.14e+100,
159: -3.14e+100,
160: -9.844485675856319,
161: -3.14e+100,
162: -7.045681111485645,
163: -3.14e+100,
200: -3.14e+100,
201: -3.14e+100,
202: -3.14e+100,
203: -3.14e+100,
204: -3.14e+100,
205: -3.14e+100,
206: -3.14e+100,
207: -3.14e+100,
208: -3.14e+100,
209: -3.14e+100,
210: -3.14e+100,
211: -3.14e+100,
212: -3.14e+100,
213: -3.14e+100,
214: -3.14e+100,
215: -3.14e+100,
216: -3.14e+100,
217: -3.14e+100,
218: -3.14e+100,
219: -3.14e+100,
220: -3.14e+100,
221: -3.14e+100,
222: -3.14e+100,
223: -3.14e+100,
224: -3.14e+100,
225: -3.14e+100,
226: -3.14e+100,
227: -3.14e+100,
228: -3.14e+100,
229: -3.14e+100,
230: -3.14e+100,
231: -3.14e+100,
232: -3.14e+100,
233: -3.14e+100,
234: -3.14e+100,
235: -3.14e+100,
236: -3.14e+100,
237: -3.14e+100,
238: -3.14e+100,
239: -3.14e+100,
240: -3.14e+100,
241: -3.14e+100,
242: -3.14e+100,
243: -3.14e+100,
244: -3.14e+100,
245: -3.14e+100,
246: -3.14e+100,
247: -3.14e+100,
248: -3.14e+100,
249: -3.14e+100,
250: -3.14e+100,
251: -3.14e+100,
252: -3.14e+100,
253: -3.14e+100,
254: -3.14e+100,
255: -3.14e+100,
256: -3.14e+100,
257: -3.14e+100,
258: -3.14e+100,
259: -3.14e+100,
260: -3.14e+100,
261: -3.14e+100,
262: -3.14e+100,
263: -3.14e+100,
300: -3.14e+100,
301: -3.14e+100,
302: -3.14e+100,
303: -3.14e+100,
304: -3.14e+100,
305: -3.14e+100,
306: -3.14e+100,
307: -3.14e+100,
308: -3.14e+100,
309: -3.14e+100,
310: -3.14e+100,
311: -3.14e+100,
312: -3.14e+100,
313: -3.14e+100,
314: -3.14e+100,
315: -3.14e+100,
316: -3.14e+100,
317: -3.14e+100,
318: -3.14e+100,
319: -3.14e+100,
320: -3.14e+100,
321: -3.14e+100,
322: -3.14e+100,
323: -3.14e+100,
324: -3.14e+100,
325: -3.14e+100,
326: -3.14e+100,
327: -3.14e+100,
328: -3.14e+100,
329: -3.14e+100,
330: -3.14e+100,
331: -3.14e+100,
332: -3.14e+100,
333: -3.14e+100,
334: -3.14e+100,
335: -3.14e+100,
336: -3.14e+100,
337: -3.14e+100,
338: -3.14e+100,
339: -3.14e+100,
340: -3.14e+100,
341: -3.14e+100,
342: -3.14e+100,
343: -3.14e+100,
344: -3.14e+100,
345: -3.14e+100,
346: -3.14e+100,
347: -3.14e+100,
348: -3.14e+100,
349: -3.14e+100,
350: -3.14e+100,
351: -3.14e+100,
352: -3.14e+100,
353: -3.14e+100,
354: -3.14e+100,
355: -3.14e+100,
356: -3.14e+100,
357: -3.14e+100,
358: -3.14e+100,
359: -3.14e+100,
360: -3.14e+100,
361: -3.14e+100,
362: -3.14e+100,
363: -3.14e+100,
400: -3.9025396831295227,
401: -11.048458480182255,
402: -6.954113917960154,
403: -12.84021794941031,
404: -6.472888763970454,
405: -3.14e+100,
406: -4.786966795861212,
407: -3.903919764181873,
408: -3.14e+100,
409: -8.948397651299683,
410: -5.942513006281674,
411: -3.14e+100,
412: -5.194820249981676,
413: -6.507826815331734,
414: -8.650563207383884,
415: -3.14e+100,
416: -3.14e+100,
417: -4.911992119644354,
418: -3.14e+100,
419: -6.940320595827818,
420: -3.14e+100,
421: -3.14e+100,
422: -3.269200652116097,
423: -10.825314928868044,
424: -3.14e+100,
425: -3.8551483897645107,
426: -4.913434861102905,
427: -4.483663103956885,
428: -3.14e+100,
429: -3.14e+100,
430: -3.14e+100,
431: -12.147070768850364,
432: -3.14e+100,
433: -8.464460927750023,
434: -2.9868401813596317,
435: -4.888658618255058,
436: -3.14e+100,
437: -3.14e+100,
438: -2.7635336784127853,
439: -10.275268591948773,
440: -3.14e+100,
441: -3.14e+100,
442: -3.14e+100,
443: -3.14e+100,
444: -6.272842531880403,
445: -6.940320595827818,
446: -7.728230161053767,
447: -7.5394037026636855,
448: -6.85251045118004,
449: -8.4153713175535,
450: -8.15808672228609,
451: -9.299258625372996,
452: -3.053292303412302,
453: -3.14e+100,
454: -5.9430181843676895,
455: -3.14e+100,
456: -11.453923588290419,
457: -3.14e+100,
458: -3.14e+100,
459: -8.427419656069674,
460: -6.1970794699489575,
461: -13.533365129970255,
462: -3.14e+100,
463: -3.14e+100,
}

File diff suppressed because it is too large Load Diff

View File

@@ -5,26 +5,13 @@ import (
"sort"
)
type stateTag struct {
State byte
Tag string
}
func (st stateTag) String() string {
return fmt.Sprintf("(%q, %s)", st.State, st.Tag)
}
func emptyStateTag() stateTag {
return stateTag{' ', ""}
}
type probState struct {
Prob float64
ST stateTag
prob float64
state uint16
}
func (ps probState) String() string {
return fmt.Sprintf("(%v: %f)", ps.ST, ps.Prob)
return fmt.Sprintf("(%v: %f)", ps.state, ps.prob)
}
type probStates []probState
@@ -34,94 +21,87 @@ func (pss probStates) Len() int {
}
func (pss probStates) Less(i, j int) bool {
if pss[i].Prob == pss[j].Prob {
if pss[i].ST.State == pss[j].ST.State {
return pss[i].ST.Tag < pss[j].ST.Tag
}
return pss[i].ST.State < pss[j].ST.State
if pss[i].prob == pss[j].prob {
return pss[i].state < pss[j].state
}
return pss[i].Prob < pss[j].Prob
return pss[i].prob < pss[j].prob
}
func (pss probStates) Swap(i, j int) {
pss[i], pss[j] = pss[j], pss[i]
}
func viterbi(obs []rune) (float64, []stateTag) {
func viterbi(obs []rune) []tag {
obsLength := len(obs)
V := make([]map[stateTag]float64, obsLength)
V[0] = make(map[stateTag]float64)
mem_path := make([]map[stateTag]stateTag, obsLength)
mem_path[0] = make(map[stateTag]stateTag)
V := make([]map[uint16]float64, obsLength)
V[0] = make(map[uint16]float64)
memPath := make([]map[uint16]uint16, obsLength)
memPath[0] = make(map[uint16]uint16)
ys := charStateTab.get(obs[0]) // default is all_states
for _, y := range ys {
V[0][y] = probEmit[y].get(obs[0]) + probStart[y]
mem_path[0][y] = emptyStateTag()
memPath[0][y] = 0
}
for t := 1; t < obsLength; t++ {
prev_states := make([]stateTag, 0)
for x, _ := range mem_path[t-1] {
var prevStates []uint16
for x := range memPath[t-1] {
if len(probTrans[x]) > 0 {
prev_states = append(prev_states, x)
prevStates = append(prevStates, x)
}
}
//use Go's map to implement Python's Set()
prev_states_expect_next := make(map[stateTag]stateTag)
for _, x := range prev_states {
for y, _ := range probTrans[x] {
prev_states_expect_next[y] = y
prevStatesExpectNext := make(map[uint16]int)
for _, x := range prevStates {
for y := range probTrans[x] {
prevStatesExpectNext[y] = 1
}
}
tmp_obs_states := charStateTab.get(obs[t])
tmpObsStates := charStateTab.get(obs[t])
obs_states := make([]stateTag, 0)
for index, _ := range tmp_obs_states {
if _, ok := prev_states_expect_next[tmp_obs_states[index]]; ok {
obs_states = append(obs_states, tmp_obs_states[index])
var obsStates []uint16
for index := range tmpObsStates {
if _, ok := prevStatesExpectNext[tmpObsStates[index]]; ok {
obsStates = append(obsStates, tmpObsStates[index])
}
}
if len(obs_states) == 0 {
for key := range prev_states_expect_next {
obs_states = append(obs_states, key)
if len(obsStates) == 0 {
for key := range prevStatesExpectNext {
obsStates = append(obsStates, key)
}
}
if len(obs_states) == 0 {
obs_states = probTransKeys
if len(obsStates) == 0 {
obsStates = probTransKeys
}
mem_path[t] = make(map[stateTag]stateTag) // TODO: value needed or not?
V[t] = make(map[stateTag]float64)
for _, y := range obs_states {
pss := make(probStates, 0)
for _, y0 := range prev_states {
ps := probState{
Prob: V[t-1][y0] + probTrans[y0].Get(y) + probEmit[y].get(obs[t]),
ST: y0}
pss = append(pss, ps)
memPath[t] = make(map[uint16]uint16)
V[t] = make(map[uint16]float64)
for _, y := range obsStates {
var max, ps probState
for i, y0 := range prevStates {
ps = probState{
prob: V[t-1][y0] + probTrans[y0].Get(y) + probEmit[y].get(obs[t]),
state: y0}
if i == 0 || ps.prob > max.prob || (ps.prob == max.prob && ps.state > max.state) {
max = ps
}
}
sort.Sort(sort.Reverse(pss))
V[t][y] = pss[0].Prob
mem_path[t][y] = pss[0].ST
V[t][y] = max.prob
memPath[t][y] = max.state
}
}
last := make(probStates, 0)
length := len(mem_path)
length := len(memPath)
vlength := len(V)
for y, _ := range mem_path[length-1] {
ps := probState{Prob: V[vlength-1][y], ST: y}
for y := range memPath[length-1] {
ps := probState{prob: V[vlength-1][y], state: y}
last = append(last, ps)
}
sort.Sort(sort.Reverse(last))
prob := last[0].Prob
state := last[0].ST
route := make([]stateTag, len(obs))
i := obsLength - 1
for {
if i < 0 {
break
}
route[i] = state
state = mem_path[i][state]
i -= 1
state := last[0].state
route := make([]tag, len(obs))
for i := obsLength - 1; i >= 0; i-- {
route[i] = tag(state)
state = memPath[i][state]
}
return prob, route
return route
}

View File

@@ -4,42 +4,68 @@ import (
"testing"
)
var (
route1 = []stateTag{
stateTag{'B', "nr"},
stateTag{'M', "nr"},
stateTag{'E', "nr"},
stateTag{'S', "v"},
stateTag{'B', "v"},
stateTag{'E', "v"},
stateTag{'B', "n"},
stateTag{'M', "n"},
stateTag{'E', "n"},
stateTag{'S', "d"},
stateTag{'S', "v"},
stateTag{'S', "n"},
stateTag{'B', "v"},
stateTag{'E', "v"},
stateTag{'B', "nr"},
stateTag{'M', "nr"},
stateTag{'M', "nr"},
stateTag{'M', "nr"},
stateTag{'E', "nr"},
stateTag{'S', "zg"}}
)
var defaultRoute []tag
// init fills defaultRoute with the tag sequence that the Viterbi test expects,
// one (state, part-of-speech) pair per rune of the test sentence.
func init() {
	pairs := [...][2]string{
		{"B", "nr"}, {"M", "nr"}, {"E", "nr"},
		{"S", "v"},
		{"B", "v"}, {"E", "v"},
		{"B", "n"}, {"M", "n"}, {"E", "n"},
		{"S", "d"},
		{"S", "v"},
		{"S", "n"},
		{"B", "v"}, {"E", "v"},
		{"B", "nr"}, {"M", "nr"}, {"M", "nr"}, {"M", "nr"}, {"E", "nr"},
		{"S", "zg"},
	}
	var t tag
	for _, pair := range pairs {
		// The error from newTag is discarded, exactly as before.
		t, _ = newTag(pair[0], pair[1])
		defaultRoute = append(defaultRoute, t)
	}
}
func TestViterbi(t *testing.T) {
ss := "李小福是创新办主任也是云计算方面的专家;"
prob, route := viterbi([]rune(ss))
if prob != MinFloat {
t.Error(prob)
route := viterbi([]rune(ss))
if len(route) != len(defaultRoute) {
t.Fatal(len(route))
}
if len(route) != len(route1) {
t.Error(len(route))
}
for index, _ := range route {
if route[index] != route1[index] {
t.Error(route[index])
for index := range route {
if route[index] != defaultRoute[index] {
t.Fatal(route[index])
}
}
}
// BenchmarkViterbi measures one viterbi pass over a fixed sentence.
//
// The string-to-rune conversion is loop-invariant, so it is done once before
// the timer is reset; the measurement then covers viterbi itself rather than
// a fresh []rune allocation per iteration.
func BenchmarkViterbi(b *testing.B) {
	obs := []rune("李小福是创新办主任也是云计算方面的专家;")
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		viterbi(obs)
	}
}

145
tokenizer.go Normal file
View File

@@ -0,0 +1,145 @@
package jiebago
import (
"fmt"
"regexp"
"strconv"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
// Name is the jieba tokenizer name.
const Name = "jieba"
var ideographRegexp = regexp.MustCompile(`\p{Han}+`)
// JiebaTokenizer is the bleve tokenizer for jiebago.
type JiebaTokenizer struct {
// seg cuts the input into words and provides the dictionary frequency lookup
// used in search mode.
seg Segmenter
// hmm is passed to seg.Cut on every Tokenize call; searchMode makes Tokenize
// also emit the 2- and 3-rune sub-words of each word that have a positive
// dictionary frequency.
hmm, searchMode bool
}
/*
NewJiebaTokenizer creates a new JiebaTokenizer.

Parameters:

dictFilePath: path of the dictionary file.

hmm: whether to use the Hidden Markov Model to cut unknown words,
i.e. words not found in the dictionary. For example, the word "安卓" (meaning
"Android" in English) is not in the dictionary file. If hmm is false, it is
cut into the two single-character words "安" and "卓"; if hmm is true, it is
treated as one single word, because Jieba uses a Hidden Markov Model with the
Viterbi algorithm to guess the most likely segmentation.

searchMode: whether to further cut long words into several shorter words.
In Chinese, a long word may contain other words. For example, "交换机" is the
Chinese word for "Switcher"; if searchMode is false, it is treated as a single
word. If searchMode is true, it is additionally split into "交换" and "换机",
which are valid Chinese words.

The tokenizer is returned together with any error from loading the
dictionary, so callers must check the error before using the tokenizer.
*/
func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
var seg Segmenter
err := seg.LoadDictionary(dictFilePath)
return &JiebaTokenizer{
seg: seg,
hmm: hmm,
searchMode: searchMode,
}, err
}
// Tokenize cuts input into a bleve token stream.
//
// Every word yielded by the segmenter becomes one token. In search mode, each
// word longer than two (respectively three) runes is first scanned for 2-rune
// (respectively 3-rune) sub-words with a positive dictionary frequency; those
// sub-word tokens are emitted before the token of the word containing them.
//
// Start and End are byte offsets, computed by accumulating the byte length of
// each word (this assumes the words yielded by Cut are consecutive slices of
// input — TODO confirm). Position starts at 1 and advances with every emitted
// token, sub-words included.
func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
	// Kept non-nil even when no token is produced, as before, so that callers
	// comparing streams structurally keep seeing an empty stream, not nil.
	rv := make(analysis.TokenStream, 0)
	start := 0 // byte offset of the current word
	pos := 1   // position assigned to the next token
	for word := range jt.seg.Cut(string(input), jt.hmm) {
		if jt.searchMode {
			runes := []rune(word)
			width := len(runes)
			for _, step := range [2]int{2, 3} {
				// Only strictly longer words are scanned; a word of exactly
				// step runes would just yield itself.
				if width <= step {
					continue
				}
				for i := 0; i < width-step+1; i++ {
					gram := string(runes[i : i+step])
					if value, ok := jt.seg.dict.Frequency(gram); ok && value > 0 {
						// Byte length of the runes preceding the gram.
						gramStart := start + len(string(runes[:i]))
						rv = append(rv, &analysis.Token{
							Term:     []byte(gram),
							Start:    gramStart,
							End:      gramStart + len(gram),
							Position: pos,
							Type:     detectTokenType(gram),
						})
						pos++
					}
				}
			}
		}
		end := start + len(word)
		rv = append(rv, &analysis.Token{
			Term:     []byte(word),
			Start:    start,
			End:      end,
			Position: pos,
			Type:     detectTokenType(word),
		})
		pos++
		start = end
	}
	return rv
}
/*
JiebaTokenizerConstructor creates a JiebaTokenizer from a bleve config map.

The config must contain at least one entry:

file: the path of the dictionary file (a string); an error is returned when
it is missing or not a string.

hmm: optional bool, whether to use the Hidden Markov Model, see
NewJiebaTokenizer for details. Defaults to true.

search: optional bool, whether to use search mode, see NewJiebaTokenizer for
details. Defaults to true.

The cache argument is not used.
*/
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
	analysis.Tokenizer, error) {
	dictFilePath, found := config["file"].(string)
	if !found {
		return nil, fmt.Errorf("must specify dictionary file path")
	}

	// Both switches default to true unless the config carries a bool for them.
	hmm, searchMode := true, true
	if flag, isBool := config["hmm"].(bool); isBool {
		hmm = flag
	}
	if flag, isBool := config["search"].(bool); isBool {
		searchMode = flag
	}
	return NewJiebaTokenizer(dictFilePath, hmm, searchMode)
}
// detectTokenType classifies term for bleve: Ideographic when it contains at
// least one Han character, Numeric when strconv.ParseFloat accepts it, and
// AlphaNumeric otherwise.
func detectTokenType(term string) analysis.TokenType {
	if ideographRegexp.MatchString(term) {
		return analysis.Ideographic
	}
	if _, parseErr := strconv.ParseFloat(term, 64); parseErr != nil {
		return analysis.AlphaNumeric
	}
	return analysis.Numeric
}
// init registers JiebaTokenizerConstructor with the bleve registry under Name.
func init() {
registry.RegisterTokenizer(Name, JiebaTokenizerConstructor)
}

View File

@@ -1,9 +1,10 @@
package tokenizers
package jiebago
import (
"github.com/blevesearch/bleve/analysis"
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
)
func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) {
@@ -5218,7 +5219,7 @@ func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) {
},
}
tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, false)
tokenizer, _ := NewJiebaTokenizer("dict.txt", true, false)
for _, test := range tests {
actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) {
@@ -11056,7 +11057,7 @@ func TestJiebaTokenizerSearchModeWithHMM(t *testing.T) {
},
}
tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, true)
tokenizer, _ := NewJiebaTokenizer("dict.txt", true, true)
for _, test := range tests {
actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) {
@@ -16473,7 +16474,7 @@ func TestJiebaTokenizerDefaultModeWithoutHMM(t *testing.T) {
},
}
tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, false)
tokenizer, _ := NewJiebaTokenizer("dict.txt", false, false)
for _, test := range tests {
actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) {
@@ -22505,11 +22506,11 @@ func TestJiebaTokenizerSearchModeWithoutHMM(t *testing.T) {
},
}
tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, true)
tokenizer, _ := NewJiebaTokenizer("dict.txt", false, true)
for _, test := range tests {
actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
t.Fatalf("Expected %v, got %v for %s", test.output, actual, string(test.input))
}
}
}

View File

@@ -1,110 +0,0 @@
package tokenizers
import (
"fmt"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/wangbin/jiebago"
"regexp"
"strconv"
)
const Name = "jieba"
var IdeographRegexp = regexp.MustCompile(`\p{Han}+`)
// JiebaTokenizer is a bleve tokenizer that segments text with the jiebago
// package-level functions.
type JiebaTokenizer struct {
// dictFileName is the dictionary file name given to NewJiebaTokenizer.
dictFileName string
// hmm is passed to jiebago.Cut on every Tokenize call; searchMode makes
// Tokenize also emit the 2- and 3-rune sub-words of each word that have a
// positive dictionary frequency.
hmm, searchMode bool
}
// NewJiebaTokenizer sets the jiebago dictionary from dictFileName and returns
// a tokenizer configured with hmm and searchMode. The tokenizer is returned
// together with any error from jiebago.SetDictionary, so callers must check
// the error before using the tokenizer.
func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) {
err := jiebago.SetDictionary(dictFileName)
return &JiebaTokenizer{
dictFileName: dictFileName,
hmm: hmm,
searchMode: searchMode,
}, err
}
// Tokenize cuts input into a bleve token stream.
//
// Every word yielded by jiebago.Cut (called with false as its second argument
// and jt.hmm as its third) becomes one token. In search mode, each word longer
// than two (respectively three) runes is first scanned for 2-rune
// (respectively 3-rune) sub-words with a positive frequency in jiebago.Trie;
// those sub-word tokens are emitted before the token of the word containing
// them.
//
// Start and End are byte offsets, computed by accumulating the byte length of
// each word (this assumes the words yielded by Cut are consecutive slices of
// input — TODO confirm). Position starts at 1 and advances with every emitted
// token, sub-words included.
func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
	// Kept non-nil even when no token is produced, as before.
	rv := make(analysis.TokenStream, 0)
	start := 0 // byte offset of the current word
	pos := 1   // position assigned to the next token
	for word := range jiebago.Cut(string(input), false, jt.hmm) {
		if jt.searchMode {
			runes := []rune(word)
			width := len(runes)
			for _, step := range [2]int{2, 3} {
				// Only strictly longer words are scanned; a word of exactly
				// step runes would just yield itself.
				if width <= step {
					continue
				}
				for i := 0; i < width-step+1; i++ {
					gram := string(runes[i : i+step])
					if value, ok := jiebago.Trie.Freq[gram]; ok && value > 0 {
						// Byte length of the runes preceding the gram.
						gramStart := start + len(string(runes[:i]))
						rv = append(rv, &analysis.Token{
							Term:     []byte(gram),
							Start:    gramStart,
							End:      gramStart + len(gram),
							Position: pos,
							Type:     detectTokenType(gram),
						})
						pos++
					}
				}
			}
		}
		end := start + len(word)
		rv = append(rv, &analysis.Token{
			Term:     []byte(word),
			Start:    start,
			End:      end,
			Position: pos,
			Type:     detectTokenType(word),
		})
		pos++
		start = end
	}
	return rv
}
// JiebaTokenizerConstructor builds a JiebaTokenizer from a bleve config map.
// The "file" entry (a string) is mandatory; the optional bool entries "hmm"
// and "search" both default to true. The cache argument is not used.
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
	analysis.Tokenizer, error) {
	dictFileName, found := config["file"].(string)
	if !found {
		return nil, fmt.Errorf("must specify dictionary file path")
	}

	// Both switches default to true unless the config carries a bool for them.
	hmm, searchMode := true, true
	if flag, isBool := config["hmm"].(bool); isBool {
		hmm = flag
	}
	if flag, isBool := config["search"].(bool); isBool {
		searchMode = flag
	}
	return NewJiebaTokenizer(dictFileName, hmm, searchMode)
}
// detectTokenType classifies term for bleve: Ideographic when it contains at
// least one Han character, Numeric when strconv.ParseFloat accepts it, and
// AlphaNumeric otherwise.
func detectTokenType(term string) analysis.TokenType {
	if IdeographRegexp.MatchString(term) {
		return analysis.Ideographic
	}
	if _, parseErr := strconv.ParseFloat(term, 64); parseErr != nil {
		return analysis.AlphaNumeric
	}
	return analysis.Numeric
}
// init registers JiebaTokenizerConstructor with the bleve registry under Name.
func init() {
registry.RegisterTokenizer(Name, JiebaTokenizerConstructor)
}

126
trie.go
View File

@@ -1,126 +0,0 @@
package jiebago
import (
"crypto/md5"
"encoding/gob"
"fmt"
"log"
"os"
"path/filepath"
"strings"
)
// Trie stores the total frequency and a map of all words to their frequencies.
var Trie *trie
// trie is the dictionary model: a flat map of words (and their prefixes) to
// frequencies plus the running total. The fields are exported, presumably so
// that encoding/gob can serialize them into the cache file written by load.
type trie struct {
// Total is the sum of the frequencies of all words passed to addWord.
Total float64
// Freq maps a word to its frequency. addWord also inserts every rune prefix
// of a word, with frequency 0.0 when the prefix was not already present.
Freq map[string]float64
}
// load fills t from the dictionary identified by dictFileName.
//
// The dictionary path is resolved with DictPath. A gob cache of the model is
// kept in os.TempDir(), named after the MD5 of the resolved path. The cache is
// used only when it is strictly newer than the dictionary file and decodes
// cleanly; otherwise the dictionary is parsed, every entry is added to t, and
// the cache is rewritten.
//
// An error is returned when the dictionary cannot be resolved, inspected or
// parsed, or when the cache file cannot be written; a missing, stale or
// corrupt cache is not an error.
func (t *trie) load(dictFileName string) error {
	dictFilePath, err := DictPath(dictFileName)
	if err != nil {
		return err
	}
	dictFileInfo, err := os.Stat(dictFilePath)
	if err != nil {
		return err
	}
	log.Printf("Building Trie..., from %s\n", dictFilePath)

	h := fmt.Sprintf("%x", md5.Sum([]byte(dictFilePath)))
	cacheFileName := fmt.Sprintf("jieba.%s.cache", h)
	cacheFilePath := filepath.Join(os.TempDir(), cacheFileName)

	// Trust the cache only when it was written after the dictionary last changed.
	if cacheFileInfo, statErr := os.Stat(cacheFilePath); statErr == nil &&
		cacheFileInfo.ModTime().After(dictFileInfo.ModTime()) {
		if t.readCacheFile(cacheFilePath) {
			log.Printf("loaded model from cache %s\n", cacheFilePath)
			return nil
		}
	}

	wtfs, err := ParseDictFile(dictFilePath)
	if err != nil {
		return err
	}
	for _, wtf := range wtfs {
		t.addWord(wtf)
	}
	if err := t.writeCacheFile(cacheFilePath); err != nil {
		return err
	}
	log.Printf("dumped model to cache %s\n", cacheFilePath)
	return nil
}

// readCacheFile replaces the contents of t with the model stored in
// cacheFilePath and reports whether that succeeded. The file is decoded into a
// scratch value first, so a truncated or corrupt cache leaves t untouched
// instead of half-filled (which would inflate Total once the dictionary is
// parsed on top of it).
func (t *trie) readCacheFile(cacheFilePath string) bool {
	cacheFile, err := os.Open(cacheFilePath)
	if err != nil {
		return false
	}
	defer cacheFile.Close()

	var cached trie
	if err := gob.NewDecoder(cacheFile).Decode(&cached); err != nil {
		return false
	}
	if cached.Freq == nil {
		// Keep the map writable for later addWord calls.
		cached.Freq = make(map[string]float64)
	}
	t.Total, t.Freq = cached.Total, cached.Freq
	return true
}

// writeCacheFile gob-encodes t into cacheFilePath, truncating any previous
// content. The Close error is returned too, because a failed close can mean
// the cache was not fully written.
func (t *trie) writeCacheFile(cacheFilePath string) error {
	cacheFile, err := os.OpenFile(cacheFilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
	if err != nil {
		return err
	}
	if err := gob.NewEncoder(cacheFile).Encode(t); err != nil {
		cacheFile.Close()
		return err
	}
	return cacheFile.Close()
}
// addWord stores the frequency of wtf.Word, adds that frequency to the
// running total, and makes sure every leading rune-prefix of the word has an
// entry in Freq (with frequency 0 when the prefix was not present before).
func (t *trie) addWord(wtf *WordTagFreq) {
	word, freq := wtf.Word, wtf.Freq
	t.Freq[word] = freq
	t.Total += freq

	runes := []rune(word)
	for end := 1; end <= len(runes); end++ {
		prefix := string(runes[:end])
		if _, known := t.Freq[prefix]; known {
			continue
		}
		t.Freq[prefix] = 0.0
	}
}
// Load user specified dictionary file.
func LoadUserDict(dictFilePath string) error {
wtfs, err := ParseDictFile(dictFilePath)
if err != nil {
return err
}
for _, wtf := range wtfs {
if len(wtf.Tag) > 0 {
UserWordTagTab[wtf.Word] = strings.TrimSpace(wtf.Tag)
}
Trie.addWord(wtf)
}
return nil
}
// Set the dictionary, could be absolute path of dictionary file, or dictionary
// name in current directory. This function must be called before cut any
// sentence.
func SetDictionary(dictFileName string) error {
Trie = &trie{Total: 0.0, Freq: make(map[string]float64)}
return Trie.load(dictFileName)
}

53
util/util.go Normal file
View File

@@ -0,0 +1,53 @@
// Package util contains some util functions used by jiebago.
package util
import "regexp"
// RegexpSplit slices s into substrings separated by matches of re and returns
// those substrings. If re contains capturing groups, the text of every group
// is inserted after the substring that precedes the match, as Python's
// re.split does; a group that did not participate in the match contributes
// an empty string.
//
// The count n limits the number of between-match substrings:
//
//	n > 0: at most n substrings; the last one is the unsplit remainder.
//	n == 0: the result is nil.
//	n < 0: all substrings.
//
// Empty matches are handled like regexp.Regexp.Split: an empty match at the
// very start of s produces no leading substring.
func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
	if n == 0 {
		return nil
	}
	if len(re.String()) > 0 && len(s) == 0 {
		return []string{""}
	}

	// Submatch indices are only needed when the pattern has groups.
	groups := re.NumSubexp()
	var matches [][]int
	if groups > 0 {
		matches = re.FindAllStringSubmatchIndex(s, n)
	} else {
		matches = re.FindAllStringIndex(s, n)
	}

	result := make([]string, 0, len(matches)*(groups+1)+1)
	beg, end := 0, 0
	// pieces counts only the between-match substrings, so that group texts
	// added to result do not eat into the n limit.
	pieces := 0
	for _, match := range matches {
		if n > 0 && pieces >= n-1 {
			break
		}
		end = match[0]
		if match[1] != 0 {
			result = append(result, s[beg:end])
			pieces++
		}
		beg = match[1]
		for g := 1; g <= groups; g++ {
			lo, hi := match[2*g], match[2*g+1]
			if lo < 0 {
				// Group did not participate in this match.
				result = append(result, "")
				continue
			}
			result = append(result, s[lo:hi])
		}
	}
	if end != len(s) {
		result = append(result, s[beg:])
	}
	return result
}

24
util/util_test.go Normal file
View File

@@ -0,0 +1,24 @@
package util
import (
"regexp"
"testing"
)
// TestRegexpSplit checks the number of substrings RegexpSplit returns for
// patterns without groups, with a repeated group, and with a group that
// spans the whole match.
func TestRegexpSplit(t *testing.T) {
	cases := []struct {
		pattern string
		input   string
		count   int
	}{
		{`\p{Han}+`, "BP神经网络如何训练才能在分类时增加区分度", 2},
		{`(\p{Han})+`, "BP神经网络如何训练才能在分类时增加区分度", 3},
		{`([\p{Han}#]+)`, ",BP神经网络如何训练才能在分类时#增加区分度?", 3},
	}
	for _, tc := range cases {
		result := RegexpSplit(regexp.MustCompile(tc.pattern), tc.input, -1)
		if len(result) != tc.count {
			t.Fatal(result)
		}
	}
}