1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-12 13:10:25 +08:00
Files
jieba/analyse/tokenizers/jieba_test.go
2015-03-17 15:30:13 +08:00

5230 lines
100 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package tokenizers
import (
"github.com/blevesearch/bleve/analysis"
"reflect"
"testing"
)
func TestJiebaTokenizer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
{
[]byte("这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("这是"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 12,
Term: []byte("一个"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 30,
Term: []byte("伸手不见五指"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 30,
End: 33,
Term: []byte("的"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 33,
End: 39,
Term: []byte("黑夜"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 39,
End: 42,
Term: []byte("。"),
Position: 6,
Type: analysis.AlphaNumeric,
},
{
Start: 42,
End: 45,
Term: []byte("我"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 45,
End: 48,
Term: []byte("叫"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 48,
End: 57,
Term: []byte("孙悟空"),
Position: 9,
Type: analysis.Ideographic,
},
{
Start: 57,
End: 60,
Term: []byte(""),
Position: 10,
Type: analysis.AlphaNumeric,
},
{
Start: 60,
End: 63,
Term: []byte("我"),
Position: 11,
Type: analysis.Ideographic,
},
{
Start: 63,
End: 66,
Term: []byte("爱"),
Position: 12,
Type: analysis.Ideographic,
},
{
Start: 66,
End: 72,
Term: []byte("北京"),
Position: 13,
Type: analysis.Ideographic,
},
{
Start: 72,
End: 75,
Term: []byte(""),
Position: 14,
Type: analysis.AlphaNumeric,
},
{
Start: 75,
End: 78,
Term: []byte("我"),
Position: 15,
Type: analysis.Ideographic,
},
{
Start: 78,
End: 81,
Term: []byte("爱"),
Position: 16,
Type: analysis.Ideographic,
},
{
Start: 81,
End: 87,
Term: []byte("Python"),
Position: 17,
Type: analysis.AlphaNumeric,
},
{
Start: 87,
End: 90,
Term: []byte("和"),
Position: 18,
Type: analysis.Ideographic,
},
{
Start: 90,
End: 93,
Term: []byte("C++"),
Position: 19,
Type: analysis.AlphaNumeric,
},
{
Start: 93,
End: 96,
Term: []byte("。"),
Position: 20,
Type: analysis.AlphaNumeric,
},
},
},
{
[]byte("我不喜欢日本和服。"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("我"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 3,
End: 6,
Term: []byte("不"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 12,
Term: []byte("喜欢"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 18,
Term: []byte("日本"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 24,
Term: []byte("和服"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 27,
Term: []byte("。"),
Position: 6,
Type: analysis.AlphaNumeric,
},
},
},
{
[]byte("雷猴回归人间。"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("雷猴"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 12,
Term: []byte("回归"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 18,
Term: []byte("人间"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 21,
Term: []byte("。"),
Position: 4,
Type: analysis.AlphaNumeric,
},
},
},
{
[]byte("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("工信处"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 18,
Term: []byte("女干事"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 24,
Term: []byte("每月"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 30,
Term: []byte("经过"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 30,
End: 36,
Term: []byte("下属"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 36,
End: 42,
Term: []byte("科室"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 42,
End: 45,
Term: []byte("都"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 45,
End: 48,
Term: []byte("要"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 48,
End: 54,
Term: []byte("亲口"),
Position: 9,
Type: analysis.Ideographic,
},
{
Start: 54,
End: 60,
Term: []byte("交代"),
Position: 10,
Type: analysis.Ideographic,
},
{
Start: 60,
End: 62,
Term: []byte("24"),
Position: 11,
Type: analysis.Numeric,
},
{
Start: 62,
End: 65,
Term: []byte("口"),
Position: 12,
Type: analysis.Ideographic,
},
{
Start: 65,
End: 74,
Term: []byte("交换机"),
Position: 13,
Type: analysis.Ideographic,
},
{
Start: 74,
End: 77,
Term: []byte("等"),
Position: 14,
Type: analysis.Ideographic,
},
{
Start: 77,
End: 86,
Term: []byte("技术性"),
Position: 15,
Type: analysis.Ideographic,
},
{
Start: 86,
End: 92,
Term: []byte("器件"),
Position: 16,
Type: analysis.Ideographic,
},
{
Start: 92,
End: 95,
Term: []byte("的"),
Position: 17,
Type: analysis.Ideographic,
},
{
Start: 95,
End: 101,
Term: []byte("安装"),
Position: 18,
Type: analysis.Ideographic,
},
{
Start: 101,
End: 107,
Term: []byte("工作"),
Position: 19,
Type: analysis.Ideographic,
},
},
},
{
[]byte("我需要廉租房"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("我"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 3,
End: 9,
Term: []byte("需要"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 18,
Term: []byte("廉租房"),
Position: 3,
Type: analysis.Ideographic,
},
},
},
{
[]byte("永和服装饰品有限公司"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("永和"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 12,
Term: []byte("服装"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 18,
Term: []byte("饰品"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 30,
Term: []byte("有限公司"),
Position: 4,
Type: analysis.Ideographic,
},
},
},
{
[]byte("我爱北京天安门"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("我"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 3,
End: 6,
Term: []byte("爱"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 12,
Term: []byte("北京"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 21,
Term: []byte("天安门"),
Position: 4,
Type: analysis.Ideographic,
},
},
},
{
[]byte("abc"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("abc"),
Position: 1,
Type: analysis.AlphaNumeric,
},
},
},
{
[]byte("隐马尔可夫"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("隐"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 3,
End: 15,
Term: []byte("马尔可夫"),
Position: 2,
Type: analysis.Ideographic,
},
},
},
{
[]byte("雷猴是个好网站"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("雷猴"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 9,
Term: []byte("是"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 12,
Term: []byte("个"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 15,
Term: []byte("好"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 21,
Term: []byte("网站"),
Position: 5,
Type: analysis.Ideographic,
},
},
},
{
[]byte("“Microsoft”一词由“MICROcomputer微型计算机”和“SOFTware软件”两部分组成"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("“"),
Position: 1,
Type: analysis.AlphaNumeric,
},
{
Start: 3,
End: 12,
Term: []byte("Microsoft"),
Position: 2,
Type: analysis.AlphaNumeric,
},
{
Start: 12,
End: 15,
Term: []byte("”"),
Position: 3,
Type: analysis.AlphaNumeric,
},
{
Start: 15,
End: 21,
Term: []byte("一词"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 21,
End: 24,
Term: []byte("由"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 27,
Term: []byte("“"),
Position: 6,
Type: analysis.AlphaNumeric,
},
{
Start: 27,
End: 40,
Term: []byte("MICROcomputer"),
Position: 7,
Type: analysis.AlphaNumeric,
},
{
Start: 40,
End: 43,
Term: []byte(""),
Position: 8,
Type: analysis.AlphaNumeric,
},
{
Start: 43,
End: 49,
Term: []byte("微型"),
Position: 9,
Type: analysis.Ideographic,
},
{
Start: 49,
End: 58,
Term: []byte("计算机"),
Position: 10,
Type: analysis.Ideographic,
},
{
Start: 58,
End: 61,
Term: []byte(""),
Position: 11,
Type: analysis.AlphaNumeric,
},
{
Start: 61,
End: 64,
Term: []byte("”"),
Position: 12,
Type: analysis.AlphaNumeric,
},
{
Start: 64,
End: 67,
Term: []byte("和"),
Position: 13,
Type: analysis.Ideographic,
},
{
Start: 67,
End: 70,
Term: []byte("“"),
Position: 14,
Type: analysis.AlphaNumeric,
},
{
Start: 70,
End: 78,
Term: []byte("SOFTware"),
Position: 15,
Type: analysis.AlphaNumeric,
},
{
Start: 78,
End: 81,
Term: []byte(""),
Position: 16,
Type: analysis.AlphaNumeric,
},
{
Start: 81,
End: 87,
Term: []byte("软件"),
Position: 17,
Type: analysis.Ideographic,
},
{
Start: 87,
End: 90,
Term: []byte(""),
Position: 18,
Type: analysis.AlphaNumeric,
},
{
Start: 90,
End: 93,
Term: []byte("”"),
Position: 19,
Type: analysis.AlphaNumeric,
},
{
Start: 93,
End: 96,
Term: []byte("两"),
Position: 20,
Type: analysis.Ideographic,
},
{
Start: 96,
End: 102,
Term: []byte("部分"),
Position: 21,
Type: analysis.Ideographic,
},
{
Start: 102,
End: 108,
Term: []byte("组成"),
Position: 22,
Type: analysis.Ideographic,
},
},
},
{
[]byte("草泥马和欺实马是今年的流行词汇"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("草泥马"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 12,
Term: []byte("和"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 18,
Term: []byte("欺实"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 21,
Term: []byte("马"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 21,
End: 24,
Term: []byte("是"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 30,
Term: []byte("今年"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 30,
End: 33,
Term: []byte("的"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 33,
End: 39,
Term: []byte("流行"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 39,
End: 45,
Term: []byte("词汇"),
Position: 9,
Type: analysis.Ideographic,
},
},
},
{
[]byte("伊藤洋华堂总府店"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("伊藤"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 15,
Term: []byte("洋华堂"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 21,
Term: []byte("总府"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 21,
End: 24,
Term: []byte("店"),
Position: 4,
Type: analysis.Ideographic,
},
},
},
{
[]byte("中国科学院计算技术研究所"),
analysis.TokenStream{
{
Start: 0,
End: 36,
Term: []byte("中国科学院计算技术研究所"),
Position: 1,
Type: analysis.Ideographic,
},
},
},
{
[]byte("罗密欧与朱丽叶"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("罗密欧"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 12,
Term: []byte("与"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 21,
Term: []byte("朱丽叶"),
Position: 3,
Type: analysis.Ideographic,
},
},
},
{
[]byte("我购买了道具和服装"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("我"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 3,
End: 9,
Term: []byte("购买"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 12,
Term: []byte("了"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 18,
Term: []byte("道具"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 21,
Term: []byte("和"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 21,
End: 27,
Term: []byte("服装"),
Position: 6,
Type: analysis.Ideographic,
},
},
},
{
[]byte("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍"),
analysis.TokenStream{
{
Start: 0,
End: 2,
Term: []byte("PS"),
Position: 1,
Type: analysis.AlphaNumeric,
},
{
Start: 2,
End: 3,
Term: []byte(":"),
Position: 2,
Type: analysis.AlphaNumeric,
},
{
Start: 3,
End: 4,
Term: []byte(" "),
Position: 3,
Type: analysis.AlphaNumeric,
},
{
Start: 4,
End: 7,
Term: []byte("我"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 7,
End: 13,
Term: []byte("觉得"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 13,
End: 19,
Term: []byte("开源"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 19,
End: 22,
Term: []byte("有"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 22,
End: 28,
Term: []byte("一个"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 28,
End: 34,
Term: []byte("好处"),
Position: 9,
Type: analysis.Ideographic,
},
{
Start: 34,
End: 37,
Term: []byte(""),
Position: 10,
Type: analysis.AlphaNumeric,
},
{
Start: 37,
End: 43,
Term: []byte("就是"),
Position: 11,
Type: analysis.Ideographic,
},
{
Start: 43,
End: 49,
Term: []byte("能够"),
Position: 12,
Type: analysis.Ideographic,
},
{
Start: 49,
End: 55,
Term: []byte("敦促"),
Position: 13,
Type: analysis.Ideographic,
},
{
Start: 55,
End: 61,
Term: []byte("自己"),
Position: 14,
Type: analysis.Ideographic,
},
{
Start: 61,
End: 73,
Term: []byte("不断改进"),
Position: 15,
Type: analysis.Ideographic,
},
{
Start: 73,
End: 76,
Term: []byte(""),
Position: 16,
Type: analysis.AlphaNumeric,
},
{
Start: 76,
End: 82,
Term: []byte("避免"),
Position: 17,
Type: analysis.Ideographic,
},
{
Start: 82,
End: 88,
Term: []byte("敞帚"),
Position: 18,
Type: analysis.Ideographic,
},
{
Start: 88,
End: 94,
Term: []byte("自珍"),
Position: 19,
Type: analysis.Ideographic,
},
},
},
{
[]byte("湖北省石首市"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("湖北省"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 18,
Term: []byte("石首市"),
Position: 2,
Type: analysis.Ideographic,
},
},
},
{
[]byte("湖北省十堰市"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("湖北省"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 18,
Term: []byte("十堰市"),
Position: 2,
Type: analysis.Ideographic,
},
},
},
{
[]byte("总经理完成了这件事情"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("总经理"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 15,
Term: []byte("完成"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 18,
Term: []byte("了"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 24,
Term: []byte("这件"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 30,
Term: []byte("事情"),
Position: 5,
Type: analysis.Ideographic,
},
},
},
{
[]byte("电脑修好了"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("电脑"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 12,
Term: []byte("修好"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 15,
Term: []byte("了"),
Position: 3,
Type: analysis.Ideographic,
},
},
},
{
[]byte("做好了这件事情就一了百了了"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("做好"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 9,
Term: []byte("了"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 15,
Term: []byte("这件"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 21,
Term: []byte("事情"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 21,
End: 24,
Term: []byte("就"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 36,
Term: []byte("一了百了"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 36,
End: 39,
Term: []byte("了"),
Position: 7,
Type: analysis.Ideographic,
},
},
},
{
[]byte("人们审美的观点是不同的"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("人们"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 12,
Term: []byte("审美"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 15,
Term: []byte("的"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 21,
Term: []byte("观点"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 21,
End: 24,
Term: []byte("是"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 30,
Term: []byte("不同"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 30,
End: 33,
Term: []byte("的"),
Position: 7,
Type: analysis.Ideographic,
},
},
},
{
[]byte("我们买了一个美的空调"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("我们"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 9,
Term: []byte("买"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 12,
Term: []byte("了"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 18,
Term: []byte("一个"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 24,
Term: []byte("美的"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 30,
Term: []byte("空调"),
Position: 6,
Type: analysis.Ideographic,
},
},
},
{
[]byte("线程初始化时我们要注意"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("线程"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 15,
Term: []byte("初始化"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 18,
Term: []byte("时"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 24,
Term: []byte("我们"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 27,
Term: []byte("要"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 27,
End: 33,
Term: []byte("注意"),
Position: 6,
Type: analysis.Ideographic,
},
},
},
{
[]byte("一个分子是由好多原子组织成的"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("一个"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 12,
Term: []byte("分子"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 15,
Term: []byte("是"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 18,
Term: []byte("由"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 24,
Term: []byte("好多"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 30,
Term: []byte("原子"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 30,
End: 36,
Term: []byte("组织"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 36,
End: 39,
Term: []byte("成"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 39,
End: 42,
Term: []byte("的"),
Position: 9,
Type: analysis.Ideographic,
},
},
},
{
[]byte("祝你马到功成"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("祝"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 3,
End: 6,
Term: []byte("你"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 18,
Term: []byte("马到功成"),
Position: 3,
Type: analysis.Ideographic,
},
},
},
{
[]byte("他掉进了无底洞里"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("他"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 3,
End: 6,
Term: []byte("掉"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 9,
Term: []byte("进"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 12,
Term: []byte("了"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 21,
Term: []byte("无底洞"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 21,
End: 24,
Term: []byte("里"),
Position: 6,
Type: analysis.Ideographic,
},
},
},
{
[]byte("中国的首都是北京"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("中国"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 9,
Term: []byte("的"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 15,
Term: []byte("首都"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 18,
Term: []byte("是"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 24,
Term: []byte("北京"),
Position: 5,
Type: analysis.Ideographic,
},
},
},
{
[]byte("孙君意"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("孙君意"),
Position: 1,
Type: analysis.Ideographic,
},
},
},
{
[]byte("外交部发言人马朝旭"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("外交部"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 18,
Term: []byte("发言人"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 27,
Term: []byte("马朝旭"),
Position: 3,
Type: analysis.Ideographic,
},
},
},
{
[]byte("领导人会议和第四届东亚峰会"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("领导人"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 15,
Term: []byte("会议"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 18,
Term: []byte("和"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 27,
Term: []byte("第四届"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 27,
End: 33,
Term: []byte("东亚"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 33,
End: 39,
Term: []byte("峰会"),
Position: 6,
Type: analysis.Ideographic,
},
},
},
{
[]byte("在过去的这五年"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("在"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 3,
End: 9,
Term: []byte("过去"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 12,
Term: []byte("的"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 15,
Term: []byte("这"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 21,
Term: []byte("五年"),
Position: 5,
Type: analysis.Ideographic,
},
},
},
{
[]byte("还需要很长的路要走"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("还"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 3,
End: 9,
Term: []byte("需要"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 15,
Term: []byte("很长"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 18,
Term: []byte("的"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 21,
Term: []byte("路"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 21,
End: 24,
Term: []byte("要"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 27,
Term: []byte("走"),
Position: 7,
Type: analysis.Ideographic,
},
},
},
{
[]byte("60周年首都阅兵"),
analysis.TokenStream{
{
Start: 0,
End: 2,
Term: []byte("60"),
Position: 1,
Type: analysis.Numeric,
},
{
Start: 2,
End: 8,
Term: []byte("周年"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 8,
End: 14,
Term: []byte("首都"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 14,
End: 20,
Term: []byte("阅兵"),
Position: 4,
Type: analysis.Ideographic,
},
},
},
{
[]byte("你好人们审美的观点是不同的"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("你好"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 12,
Term: []byte("人们"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 18,
Term: []byte("审美"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 21,
Term: []byte("的"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 21,
End: 27,
Term: []byte("观点"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 27,
End: 30,
Term: []byte("是"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 30,
End: 36,
Term: []byte("不同"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 36,
End: 39,
Term: []byte("的"),
Position: 8,
Type: analysis.Ideographic,
},
},
},
{
[]byte("买水果然后来世博园"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("买"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 3,
End: 9,
Term: []byte("水果"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 15,
Term: []byte("然后"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 18,
Term: []byte("来"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 27,
Term: []byte("世博园"),
Position: 5,
Type: analysis.Ideographic,
},
},
},
{
[]byte("买水果然后去世博园"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("买"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 3,
End: 9,
Term: []byte("水果"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 15,
Term: []byte("然后"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 18,
Term: []byte("去"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 27,
Term: []byte("世博园"),
Position: 5,
Type: analysis.Ideographic,
},
},
},
{
[]byte("但是后来我才知道你是对的"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("但是"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 12,
Term: []byte("后来"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 15,
Term: []byte("我"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 18,
Term: []byte("才"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 24,
Term: []byte("知道"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 27,
Term: []byte("你"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 27,
End: 30,
Term: []byte("是"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 30,
End: 33,
Term: []byte("对"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 33,
End: 36,
Term: []byte("的"),
Position: 9,
Type: analysis.Ideographic,
},
},
},
{
[]byte("存在即合理"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("存在"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 9,
Term: []byte("即"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 15,
Term: []byte("合理"),
Position: 3,
Type: analysis.Ideographic,
},
},
},
{
[]byte("的的的的的在的的的的就以和和和"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("的"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 3,
End: 6,
Term: []byte("的"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 9,
Term: []byte("的"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 12,
Term: []byte("的"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 15,
Term: []byte("的"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 18,
Term: []byte("在"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 21,
Term: []byte("的"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 21,
End: 24,
Term: []byte("的"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 27,
Term: []byte("的"),
Position: 9,
Type: analysis.Ideographic,
},
{
Start: 27,
End: 30,
Term: []byte("的"),
Position: 10,
Type: analysis.Ideographic,
},
{
Start: 30,
End: 33,
Term: []byte("就"),
Position: 11,
Type: analysis.Ideographic,
},
{
Start: 33,
End: 36,
Term: []byte("以"),
Position: 12,
Type: analysis.Ideographic,
},
{
Start: 36,
End: 39,
Term: []byte("和"),
Position: 13,
Type: analysis.Ideographic,
},
{
Start: 39,
End: 42,
Term: []byte("和"),
Position: 14,
Type: analysis.Ideographic,
},
{
Start: 42,
End: 45,
Term: []byte("和"),
Position: 15,
Type: analysis.Ideographic,
},
},
},
{
[]byte("I love你不以为耻反以为rong"),
analysis.TokenStream{
{
Start: 0,
End: 1,
Term: []byte("I"),
Position: 1,
Type: analysis.AlphaNumeric,
},
{
Start: 1,
End: 2,
Term: []byte(" "),
Position: 2,
Type: analysis.AlphaNumeric,
},
{
Start: 2,
End: 6,
Term: []byte("love"),
Position: 3,
Type: analysis.AlphaNumeric,
},
{
Start: 6,
End: 9,
Term: []byte("你"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 12,
Term: []byte(""),
Position: 5,
Type: analysis.AlphaNumeric,
},
{
Start: 12,
End: 24,
Term: []byte("不以为耻"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 27,
Term: []byte(""),
Position: 7,
Type: analysis.AlphaNumeric,
},
{
Start: 27,
End: 30,
Term: []byte("反"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 30,
End: 36,
Term: []byte("以为"),
Position: 9,
Type: analysis.Ideographic,
},
{
Start: 36,
End: 40,
Term: []byte("rong"),
Position: 10,
Type: analysis.AlphaNumeric,
},
},
},
{
[]byte("因"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("因"),
Position: 1,
Type: analysis.Ideographic,
},
},
},
{
[]byte(""),
analysis.TokenStream{},
},
{
[]byte("hello你好人们审美的观点是不同的"),
analysis.TokenStream{
{
Start: 0,
End: 5,
Term: []byte("hello"),
Position: 1,
Type: analysis.AlphaNumeric,
},
{
Start: 5,
End: 11,
Term: []byte("你好"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 11,
End: 17,
Term: []byte("人们"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 17,
End: 23,
Term: []byte("审美"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 23,
End: 26,
Term: []byte("的"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 26,
End: 32,
Term: []byte("观点"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 32,
End: 35,
Term: []byte("是"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 35,
End: 41,
Term: []byte("不同"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 41,
End: 44,
Term: []byte("的"),
Position: 9,
Type: analysis.Ideographic,
},
},
},
{
[]byte("很好但主要是基于网页形式"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("很"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 3,
End: 6,
Term: []byte("好"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 9,
Term: []byte("但"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 15,
Term: []byte("主要"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 18,
Term: []byte("是"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 24,
Term: []byte("基于"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 30,
Term: []byte("网页"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 30,
End: 36,
Term: []byte("形式"),
Position: 8,
Type: analysis.Ideographic,
},
},
},
{
[]byte("hello你好人们审美的观点是不同的"),
analysis.TokenStream{
{
Start: 0,
End: 5,
Term: []byte("hello"),
Position: 1,
Type: analysis.AlphaNumeric,
},
{
Start: 5,
End: 11,
Term: []byte("你好"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 11,
End: 17,
Term: []byte("人们"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 17,
End: 23,
Term: []byte("审美"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 23,
End: 26,
Term: []byte("的"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 26,
End: 32,
Term: []byte("观点"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 32,
End: 35,
Term: []byte("是"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 35,
End: 41,
Term: []byte("不同"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 41,
End: 44,
Term: []byte("的"),
Position: 9,
Type: analysis.Ideographic,
},
},
},
{
[]byte("为什么我不能拥有想要的生活"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("为什么"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 12,
Term: []byte("我"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 18,
Term: []byte("不能"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 24,
Term: []byte("拥有"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 30,
Term: []byte("想要"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 30,
End: 33,
Term: []byte("的"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 33,
End: 39,
Term: []byte("生活"),
Position: 7,
Type: analysis.Ideographic,
},
},
},
{
[]byte("后来我才"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("后来"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 9,
Term: []byte("我"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 12,
Term: []byte("才"),
Position: 3,
Type: analysis.Ideographic,
},
},
},
{
[]byte("此次来中国是为了"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("此次"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 9,
Term: []byte("来"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 15,
Term: []byte("中国"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 18,
Term: []byte("是"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 24,
Term: []byte("为了"),
Position: 5,
Type: analysis.Ideographic,
},
},
},
{
[]byte("使用了它就可以解决一些问题"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("使用"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 9,
Term: []byte("了"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 12,
Term: []byte("它"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 15,
Term: []byte("就"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 21,
Term: []byte("可以"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 21,
End: 27,
Term: []byte("解决"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 27,
End: 33,
Term: []byte("一些"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 33,
End: 39,
Term: []byte("问题"),
Position: 8,
Type: analysis.Ideographic,
},
},
},
{
[]byte(",使用了它就可以解决一些问题"),
analysis.TokenStream{
{
Start: 0,
End: 1,
Term: []byte(","),
Position: 1,
Type: analysis.AlphaNumeric,
},
{
Start: 1,
End: 7,
Term: []byte("使用"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 7,
End: 10,
Term: []byte("了"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 10,
End: 13,
Term: []byte("它"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 13,
End: 16,
Term: []byte("就"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 16,
End: 22,
Term: []byte("可以"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 22,
End: 28,
Term: []byte("解决"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 28,
End: 34,
Term: []byte("一些"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 34,
End: 40,
Term: []byte("问题"),
Position: 9,
Type: analysis.Ideographic,
},
},
},
{
[]byte("其实使用了它就可以解决一些问题"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("其实"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 12,
Term: []byte("使用"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 15,
Term: []byte("了"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 18,
Term: []byte("它"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 21,
Term: []byte("就"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 21,
End: 27,
Term: []byte("可以"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 27,
End: 33,
Term: []byte("解决"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 33,
End: 39,
Term: []byte("一些"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 39,
End: 45,
Term: []byte("问题"),
Position: 9,
Type: analysis.Ideographic,
},
},
},
{
[]byte("好人使用了它就可以解决一些问题"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("好人"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 12,
Term: []byte("使用"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 15,
Term: []byte("了"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 18,
Term: []byte("它"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 21,
Term: []byte("就"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 21,
End: 27,
Term: []byte("可以"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 27,
End: 33,
Term: []byte("解决"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 33,
End: 39,
Term: []byte("一些"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 39,
End: 45,
Term: []byte("问题"),
Position: 9,
Type: analysis.Ideographic,
},
},
},
{
[]byte("是因为和国家"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("是因为"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 12,
Term: []byte("和"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 18,
Term: []byte("国家"),
Position: 3,
Type: analysis.Ideographic,
},
},
},
{
[]byte("老年搜索还支持"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("老年"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 12,
Term: []byte("搜索"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 15,
Term: []byte("还"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 21,
Term: []byte("支持"),
Position: 4,
Type: analysis.Ideographic,
},
},
},
{
[]byte("干脆就把那部蒙人的闲法给废了拉倒RT @laoshipukong : 27日全国人大常委会第三次审议侵权责任法草案删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 "),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("干脆"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 9,
Term: []byte("就"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 12,
Term: []byte("把"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 18,
Term: []byte("那部"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 24,
Term: []byte("蒙人"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 27,
Term: []byte("的"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 27,
End: 33,
Term: []byte("闲法"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 33,
End: 36,
Term: []byte("给"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 36,
End: 39,
Term: []byte("废"),
Position: 9,
Type: analysis.Ideographic,
},
{
Start: 39,
End: 42,
Term: []byte("了"),
Position: 10,
Type: analysis.Ideographic,
},
{
Start: 42,
End: 48,
Term: []byte("拉倒"),
Position: 11,
Type: analysis.Ideographic,
},
{
Start: 48,
End: 51,
Term: []byte(""),
Position: 12,
Type: analysis.AlphaNumeric,
},
{
Start: 51,
End: 53,
Term: []byte("RT"),
Position: 13,
Type: analysis.AlphaNumeric,
},
{
Start: 53,
End: 54,
Term: []byte(" "),
Position: 14,
Type: analysis.AlphaNumeric,
},
{
Start: 54,
End: 55,
Term: []byte("@"),
Position: 15,
Type: analysis.AlphaNumeric,
},
{
Start: 55,
End: 67,
Term: []byte("laoshipukong"),
Position: 16,
Type: analysis.AlphaNumeric,
},
{
Start: 67,
End: 68,
Term: []byte(" "),
Position: 17,
Type: analysis.AlphaNumeric,
},
{
Start: 68,
End: 69,
Term: []byte(":"),
Position: 18,
Type: analysis.AlphaNumeric,
},
{
Start: 69,
End: 70,
Term: []byte(" "),
Position: 19,
Type: analysis.AlphaNumeric,
},
{
Start: 70,
End: 72,
Term: []byte("27"),
Position: 20,
Type: analysis.Numeric,
},
{
Start: 72,
End: 75,
Term: []byte("日"),
Position: 21,
Type: analysis.Ideographic,
},
{
Start: 75,
End: 78,
Term: []byte(""),
Position: 22,
Type: analysis.AlphaNumeric,
},
{
Start: 78,
End: 99,
Term: []byte("全国人大常委会"),
Position: 23,
Type: analysis.Ideographic,
},
{
Start: 99,
End: 108,
Term: []byte("第三次"),
Position: 24,
Type: analysis.Ideographic,
},
{
Start: 108,
End: 114,
Term: []byte("审议"),
Position: 25,
Type: analysis.Ideographic,
},
{
Start: 114,
End: 120,
Term: []byte("侵权"),
Position: 26,
Type: analysis.Ideographic,
},
{
Start: 120,
End: 129,
Term: []byte("责任法"),
Position: 27,
Type: analysis.Ideographic,
},
{
Start: 129,
End: 135,
Term: []byte("草案"),
Position: 28,
Type: analysis.Ideographic,
},
{
Start: 135,
End: 138,
Term: []byte(""),
Position: 29,
Type: analysis.AlphaNumeric,
},
{
Start: 138,
End: 144,
Term: []byte("删除"),
Position: 30,
Type: analysis.Ideographic,
},
{
Start: 144,
End: 147,
Term: []byte("了"),
Position: 31,
Type: analysis.Ideographic,
},
{
Start: 147,
End: 153,
Term: []byte("有关"),
Position: 32,
Type: analysis.Ideographic,
},
{
Start: 153,
End: 159,
Term: []byte("医疗"),
Position: 33,
Type: analysis.Ideographic,
},
{
Start: 159,
End: 165,
Term: []byte("损害"),
Position: 34,
Type: analysis.Ideographic,
},
{
Start: 165,
End: 171,
Term: []byte("责任"),
Position: 35,
Type: analysis.Ideographic,
},
{
Start: 171,
End: 174,
Term: []byte("“"),
Position: 36,
Type: analysis.AlphaNumeric,
},
{
Start: 174,
End: 180,
Term: []byte("举证"),
Position: 37,
Type: analysis.Ideographic,
},
{
Start: 180,
End: 186,
Term: []byte("倒置"),
Position: 38,
Type: analysis.Ideographic,
},
{
Start: 186,
End: 189,
Term: []byte("”"),
Position: 39,
Type: analysis.AlphaNumeric,
},
{
Start: 189,
End: 192,
Term: []byte("的"),
Position: 40,
Type: analysis.Ideographic,
},
{
Start: 192,
End: 198,
Term: []byte("规定"),
Position: 41,
Type: analysis.Ideographic,
},
{
Start: 198,
End: 201,
Term: []byte("。"),
Position: 42,
Type: analysis.AlphaNumeric,
},
{
Start: 201,
End: 204,
Term: []byte("在"),
Position: 43,
Type: analysis.Ideographic,
},
{
Start: 204,
End: 210,
Term: []byte("医患"),
Position: 44,
Type: analysis.Ideographic,
},
{
Start: 210,
End: 216,
Term: []byte("纠纷"),
Position: 45,
Type: analysis.Ideographic,
},
{
Start: 216,
End: 222,
Term: []byte("中本"),
Position: 46,
Type: analysis.Ideographic,
},
{
Start: 222,
End: 225,
Term: []byte("已"),
Position: 47,
Type: analysis.Ideographic,
},
{
Start: 225,
End: 231,
Term: []byte("处于"),
Position: 48,
Type: analysis.Ideographic,
},
{
Start: 231,
End: 237,
Term: []byte("弱势"),
Position: 49,
Type: analysis.Ideographic,
},
{
Start: 237,
End: 243,
Term: []byte("地位"),
Position: 50,
Type: analysis.Ideographic,
},
{
Start: 243,
End: 246,
Term: []byte("的"),
Position: 51,
Type: analysis.Ideographic,
},
{
Start: 246,
End: 255,
Term: []byte("消费者"),
Position: 52,
Type: analysis.Ideographic,
},
{
Start: 255,
End: 261,
Term: []byte("由此"),
Position: 53,
Type: analysis.Ideographic,
},
{
Start: 261,
End: 264,
Term: []byte("将"),
Position: 54,
Type: analysis.Ideographic,
},
{
Start: 264,
End: 270,
Term: []byte("陷入"),
Position: 55,
Type: analysis.Ideographic,
},
{
Start: 270,
End: 282,
Term: []byte("万劫不复"),
Position: 56,
Type: analysis.Ideographic,
},
{
Start: 282,
End: 285,
Term: []byte("的"),
Position: 57,
Type: analysis.Ideographic,
},
{
Start: 285,
End: 291,
Term: []byte("境地"),
Position: 58,
Type: analysis.Ideographic,
},
{
Start: 291,
End: 294,
Term: []byte("。"),
Position: 59,
Type: analysis.AlphaNumeric,
},
{
Start: 294,
End: 295,
Term: []byte(" "),
Position: 60,
Type: analysis.AlphaNumeric,
},
},
},
{
[]byte("大"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("大"),
Position: 1,
Type: analysis.Ideographic,
},
},
},
{
[]byte(""),
analysis.TokenStream{},
},
{
[]byte("他说的确实在理"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("他"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 3,
End: 6,
Term: []byte("说"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 9,
Term: []byte("的"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 15,
Term: []byte("确实"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 21,
Term: []byte("在理"),
Position: 5,
Type: analysis.Ideographic,
},
},
},
{
[]byte("长春市长春节讲话"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("长春"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 12,
Term: []byte("市长"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 18,
Term: []byte("春节"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 24,
Term: []byte("讲话"),
Position: 4,
Type: analysis.Ideographic,
},
},
},
{
[]byte("结婚的和尚未结婚的"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("结婚"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 9,
Term: []byte("的"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 12,
Term: []byte("和"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 18,
Term: []byte("尚未"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 24,
Term: []byte("结婚"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 27,
Term: []byte("的"),
Position: 6,
Type: analysis.Ideographic,
},
},
},
{
[]byte("结合成分子时"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("结合"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 9,
Term: []byte("成"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 15,
Term: []byte("分子"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 18,
Term: []byte("时"),
Position: 4,
Type: analysis.Ideographic,
},
},
},
{
[]byte("旅游和服务是最好的"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("旅游"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 9,
Term: []byte("和"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 15,
Term: []byte("服务"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 18,
Term: []byte("是"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 24,
Term: []byte("最好"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 27,
Term: []byte("的"),
Position: 6,
Type: analysis.Ideographic,
},
},
},
{
[]byte("这件事情的确是我的错"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("这件"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 12,
Term: []byte("事情"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 18,
Term: []byte("的确"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 21,
Term: []byte("是"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 21,
End: 24,
Term: []byte("我"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 27,
Term: []byte("的"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 27,
End: 30,
Term: []byte("错"),
Position: 7,
Type: analysis.Ideographic,
},
},
},
{
[]byte("供大家参考指正"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("供"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 3,
End: 9,
Term: []byte("大家"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 15,
Term: []byte("参考"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 21,
Term: []byte("指正"),
Position: 4,
Type: analysis.Ideographic,
},
},
},
{
[]byte("哈尔滨政府公布塌桥原因"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("哈尔滨"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 15,
Term: []byte("政府"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 21,
Term: []byte("公布"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 21,
End: 27,
Term: []byte("塌桥"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 27,
End: 33,
Term: []byte("原因"),
Position: 5,
Type: analysis.Ideographic,
},
},
},
{
[]byte("我在机场入口处"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("我"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 3,
End: 6,
Term: []byte("在"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 12,
Term: []byte("机场"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 21,
Term: []byte("入口处"),
Position: 4,
Type: analysis.Ideographic,
},
},
},
{
[]byte("邢永臣摄影报道"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("邢永臣"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 15,
Term: []byte("摄影"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 21,
Term: []byte("报道"),
Position: 3,
Type: analysis.Ideographic,
},
},
},
{
[]byte("BP神经网络如何训练才能在分类时增加区分度"),
analysis.TokenStream{
{
Start: 0,
End: 2,
Term: []byte("BP"),
Position: 1,
Type: analysis.AlphaNumeric,
},
{
Start: 2,
End: 14,
Term: []byte("神经网络"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 14,
End: 20,
Term: []byte("如何"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 20,
End: 26,
Term: []byte("训练"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 26,
End: 32,
Term: []byte("才能"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 32,
End: 35,
Term: []byte("在"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 35,
End: 41,
Term: []byte("分类"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 41,
End: 44,
Term: []byte("时"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 44,
End: 50,
Term: []byte("增加"),
Position: 9,
Type: analysis.Ideographic,
},
{
Start: 50,
End: 59,
Term: []byte("区分度"),
Position: 10,
Type: analysis.Ideographic,
},
{
Start: 59,
End: 62,
Term: []byte(""),
Position: 11,
Type: analysis.AlphaNumeric,
},
},
},
{
[]byte("南京市长江大桥"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("南京市"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 21,
Term: []byte("长江大桥"),
Position: 2,
Type: analysis.Ideographic,
},
},
},
{
[]byte("应一些使用者的建议也为了便于利用NiuTrans用于SMT研究"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("应"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 3,
End: 9,
Term: []byte("一些"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 18,
Term: []byte("使用者"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 21,
Term: []byte("的"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 21,
End: 27,
Term: []byte("建议"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 27,
End: 30,
Term: []byte(""),
Position: 6,
Type: analysis.AlphaNumeric,
},
{
Start: 30,
End: 33,
Term: []byte("也"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 33,
End: 39,
Term: []byte("为了"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 39,
End: 45,
Term: []byte("便于"),
Position: 9,
Type: analysis.Ideographic,
},
{
Start: 45,
End: 51,
Term: []byte("利用"),
Position: 10,
Type: analysis.Ideographic,
},
{
Start: 51,
End: 59,
Term: []byte("NiuTrans"),
Position: 11,
Type: analysis.AlphaNumeric,
},
{
Start: 59,
End: 65,
Term: []byte("用于"),
Position: 12,
Type: analysis.Ideographic,
},
{
Start: 65,
End: 68,
Term: []byte("SMT"),
Position: 13,
Type: analysis.AlphaNumeric,
},
{
Start: 68,
End: 74,
Term: []byte("研究"),
Position: 14,
Type: analysis.Ideographic,
},
},
},
{
[]byte("长春市长春药店"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("长春市"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 15,
Term: []byte("长春"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 21,
Term: []byte("药店"),
Position: 3,
Type: analysis.Ideographic,
},
},
},
{
[]byte("邓颖超生前最喜欢的衣服"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("邓颖超"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 15,
Term: []byte("生前"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 18,
Term: []byte("最"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 24,
Term: []byte("喜欢"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 27,
Term: []byte("的"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 27,
End: 33,
Term: []byte("衣服"),
Position: 6,
Type: analysis.Ideographic,
},
},
},
{
[]byte("胡锦涛是热爱世界和平的政治局常委"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("胡锦涛"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 12,
Term: []byte("是"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 18,
Term: []byte("热爱"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 24,
Term: []byte("世界"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 30,
Term: []byte("和平"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 30,
End: 33,
Term: []byte("的"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 33,
End: 42,
Term: []byte("政治局"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 42,
End: 48,
Term: []byte("常委"),
Position: 8,
Type: analysis.Ideographic,
},
},
},
{
[]byte("程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("程序员"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 12,
Term: []byte("祝"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 18,
Term: []byte("海林"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 21,
Term: []byte("和"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 21,
End: 30,
Term: []byte("朱会震"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 30,
End: 33,
Term: []byte("是"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 33,
End: 36,
Term: []byte("在"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 36,
End: 42,
Term: []byte("孙健"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 42,
End: 45,
Term: []byte("的"),
Position: 9,
Type: analysis.Ideographic,
},
{
Start: 45,
End: 51,
Term: []byte("左面"),
Position: 10,
Type: analysis.Ideographic,
},
{
Start: 51,
End: 54,
Term: []byte("和"),
Position: 11,
Type: analysis.Ideographic,
},
{
Start: 54,
End: 60,
Term: []byte("右面"),
Position: 12,
Type: analysis.Ideographic,
},
{
Start: 60,
End: 61,
Term: []byte(","),
Position: 13,
Type: analysis.AlphaNumeric,
},
{
Start: 61,
End: 62,
Term: []byte(" "),
Position: 14,
Type: analysis.AlphaNumeric,
},
{
Start: 62,
End: 68,
Term: []byte("范凯"),
Position: 15,
Type: analysis.Ideographic,
},
{
Start: 68,
End: 71,
Term: []byte("在"),
Position: 16,
Type: analysis.Ideographic,
},
{
Start: 71,
End: 74,
Term: []byte("最"),
Position: 17,
Type: analysis.Ideographic,
},
{
Start: 74,
End: 80,
Term: []byte("右面"),
Position: 18,
Type: analysis.Ideographic,
},
{
Start: 80,
End: 81,
Term: []byte("."),
Position: 19,
Type: analysis.AlphaNumeric,
},
{
Start: 81,
End: 87,
Term: []byte("再往"),
Position: 20,
Type: analysis.Ideographic,
},
{
Start: 87,
End: 90,
Term: []byte("左"),
Position: 21,
Type: analysis.Ideographic,
},
{
Start: 90,
End: 93,
Term: []byte("是"),
Position: 22,
Type: analysis.Ideographic,
},
{
Start: 93,
End: 102,
Term: []byte("李松洪"),
Position: 23,
Type: analysis.Ideographic,
},
},
},
{
[]byte("一次性交多少钱"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("一次性"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 12,
Term: []byte("交"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 18,
Term: []byte("多少"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 21,
Term: []byte("钱"),
Position: 4,
Type: analysis.Ideographic,
},
},
},
{
[]byte("两块五一套,三块八一斤,四块七一本,五块六一条"),
analysis.TokenStream{
{
Start: 0,
End: 6,
Term: []byte("两块"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 9,
Term: []byte("五"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 15,
Term: []byte("一套"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 18,
Term: []byte(""),
Position: 4,
Type: analysis.AlphaNumeric,
},
{
Start: 18,
End: 24,
Term: []byte("三块"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 27,
Term: []byte("八"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 27,
End: 33,
Term: []byte("一斤"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 33,
End: 36,
Term: []byte(""),
Position: 8,
Type: analysis.AlphaNumeric,
},
{
Start: 36,
End: 42,
Term: []byte("四块"),
Position: 9,
Type: analysis.Ideographic,
},
{
Start: 42,
End: 45,
Term: []byte("七"),
Position: 10,
Type: analysis.Ideographic,
},
{
Start: 45,
End: 51,
Term: []byte("一本"),
Position: 11,
Type: analysis.Ideographic,
},
{
Start: 51,
End: 54,
Term: []byte(""),
Position: 12,
Type: analysis.AlphaNumeric,
},
{
Start: 54,
End: 60,
Term: []byte("五块"),
Position: 13,
Type: analysis.Ideographic,
},
{
Start: 60,
End: 63,
Term: []byte("六"),
Position: 14,
Type: analysis.Ideographic,
},
{
Start: 63,
End: 69,
Term: []byte("一条"),
Position: 15,
Type: analysis.Ideographic,
},
},
},
{
[]byte("小和尚留了一个像大和尚一样的和尚头"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("小"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 3,
End: 9,
Term: []byte("和尚"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 12,
Term: []byte("留"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 15,
Term: []byte("了"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 21,
Term: []byte("一个"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 21,
End: 24,
Term: []byte("像"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 27,
Term: []byte("大"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 27,
End: 33,
Term: []byte("和尚"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 33,
End: 39,
Term: []byte("一样"),
Position: 9,
Type: analysis.Ideographic,
},
{
Start: 39,
End: 42,
Term: []byte("的"),
Position: 10,
Type: analysis.Ideographic,
},
{
Start: 42,
End: 51,
Term: []byte("和尚头"),
Position: 11,
Type: analysis.Ideographic,
},
},
},
{
[]byte("我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("我"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 3,
End: 6,
Term: []byte("是"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 27,
Term: []byte("中华人民共和国"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 27,
End: 33,
Term: []byte("公民"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 33,
End: 34,
Term: []byte(";"),
Position: 5,
Type: analysis.AlphaNumeric,
},
{
Start: 34,
End: 37,
Term: []byte("我"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 37,
End: 43,
Term: []byte("爸爸"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 43,
End: 46,
Term: []byte("是"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 46,
End: 55,
Term: []byte("共和党"),
Position: 9,
Type: analysis.Ideographic,
},
{
Start: 55,
End: 61,
Term: []byte("党员"),
Position: 10,
Type: analysis.Ideographic,
},
{
Start: 61,
End: 62,
Term: []byte(";"),
Position: 11,
Type: analysis.AlphaNumeric,
},
{
Start: 62,
End: 63,
Term: []byte(" "),
Position: 12,
Type: analysis.AlphaNumeric,
},
{
Start: 63,
End: 69,
Term: []byte("地铁"),
Position: 13,
Type: analysis.Ideographic,
},
{
Start: 69,
End: 78,
Term: []byte("和平门"),
Position: 14,
Type: analysis.Ideographic,
},
{
Start: 78,
End: 81,
Term: []byte("站"),
Position: 15,
Type: analysis.Ideographic,
},
},
},
{
[]byte("张晓梅去人民医院做了个B超然后去买了件T恤"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("张晓梅"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 12,
Term: []byte("去"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 18,
Term: []byte("人民"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 24,
Term: []byte("医院"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 27,
Term: []byte("做"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 27,
End: 30,
Term: []byte("了"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 30,
End: 33,
Term: []byte("个"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 33,
End: 37,
Term: []byte("B超"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 37,
End: 43,
Term: []byte("然后"),
Position: 9,
Type: analysis.Ideographic,
},
{
Start: 43,
End: 46,
Term: []byte("去"),
Position: 10,
Type: analysis.Ideographic,
},
{
Start: 46,
End: 49,
Term: []byte("买"),
Position: 11,
Type: analysis.Ideographic,
},
{
Start: 49,
End: 52,
Term: []byte("了"),
Position: 12,
Type: analysis.Ideographic,
},
{
Start: 52,
End: 55,
Term: []byte("件"),
Position: 13,
Type: analysis.Ideographic,
},
{
Start: 55,
End: 59,
Term: []byte("T恤"),
Position: 14,
Type: analysis.Ideographic,
},
},
},
{
[]byte("AT&T是一件不错的公司给你发offer了吗"),
analysis.TokenStream{
{
Start: 0,
End: 4,
Term: []byte("AT&T"),
Position: 1,
Type: analysis.AlphaNumeric,
},
{
Start: 4,
End: 7,
Term: []byte("是"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 7,
End: 13,
Term: []byte("一件"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 13,
End: 19,
Term: []byte("不错"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 19,
End: 22,
Term: []byte("的"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 22,
End: 28,
Term: []byte("公司"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 28,
End: 31,
Term: []byte(""),
Position: 7,
Type: analysis.AlphaNumeric,
},
{
Start: 31,
End: 34,
Term: []byte("给"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 34,
End: 37,
Term: []byte("你"),
Position: 9,
Type: analysis.Ideographic,
},
{
Start: 37,
End: 40,
Term: []byte("发"),
Position: 10,
Type: analysis.Ideographic,
},
{
Start: 40,
End: 45,
Term: []byte("offer"),
Position: 11,
Type: analysis.AlphaNumeric,
},
{
Start: 45,
End: 48,
Term: []byte("了"),
Position: 12,
Type: analysis.Ideographic,
},
{
Start: 48,
End: 51,
Term: []byte("吗"),
Position: 13,
Type: analysis.Ideographic,
},
{
Start: 51,
End: 54,
Term: []byte(""),
Position: 14,
Type: analysis.AlphaNumeric,
},
},
},
{
[]byte("C++和c#是什么关系11+122=133是吗PI=3.14159"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("C++"),
Position: 1,
Type: analysis.AlphaNumeric,
},
{
Start: 3,
End: 6,
Term: []byte("和"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 6,
End: 8,
Term: []byte("c#"),
Position: 3,
Type: analysis.AlphaNumeric,
},
{
Start: 8,
End: 11,
Term: []byte("是"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 11,
End: 17,
Term: []byte("什么"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 17,
End: 23,
Term: []byte("关系"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 23,
End: 26,
Term: []byte(""),
Position: 7,
Type: analysis.AlphaNumeric,
},
{
Start: 26,
End: 28,
Term: []byte("11"),
Position: 8,
Type: analysis.Numeric,
},
{
Start: 28,
End: 29,
Term: []byte("+"),
Position: 9,
Type: analysis.AlphaNumeric,
},
{
Start: 29,
End: 32,
Term: []byte("122"),
Position: 10,
Type: analysis.Numeric,
},
{
Start: 32,
End: 33,
Term: []byte("="),
Position: 11,
Type: analysis.AlphaNumeric,
},
{
Start: 33,
End: 36,
Term: []byte("133"),
Position: 12,
Type: analysis.Numeric,
},
{
Start: 36,
End: 39,
Term: []byte(""),
Position: 13,
Type: analysis.AlphaNumeric,
},
{
Start: 39,
End: 42,
Term: []byte("是"),
Position: 14,
Type: analysis.Ideographic,
},
{
Start: 42,
End: 45,
Term: []byte("吗"),
Position: 15,
Type: analysis.Ideographic,
},
{
Start: 45,
End: 48,
Term: []byte(""),
Position: 16,
Type: analysis.AlphaNumeric,
},
{
Start: 48,
End: 50,
Term: []byte("PI"),
Position: 17,
Type: analysis.AlphaNumeric,
},
{
Start: 50,
End: 51,
Term: []byte("="),
Position: 18,
Type: analysis.AlphaNumeric,
},
{
Start: 51,
End: 58,
Term: []byte("3.14159"),
Position: 19,
Type: analysis.Numeric,
},
},
},
{
[]byte("你认识那个和主席握手的的哥吗?他开一辆黑色的士。"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("你"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 3,
End: 9,
Term: []byte("认识"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 15,
Term: []byte("那个"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 18,
Term: []byte("和"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 18,
End: 24,
Term: []byte("主席"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 30,
Term: []byte("握手"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 30,
End: 33,
Term: []byte("的"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 33,
End: 39,
Term: []byte("的哥"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 39,
End: 42,
Term: []byte("吗"),
Position: 9,
Type: analysis.Ideographic,
},
{
Start: 42,
End: 45,
Term: []byte(""),
Position: 10,
Type: analysis.AlphaNumeric,
},
{
Start: 45,
End: 51,
Term: []byte("他开"),
Position: 11,
Type: analysis.Ideographic,
},
{
Start: 51,
End: 57,
Term: []byte("一辆"),
Position: 12,
Type: analysis.Ideographic,
},
{
Start: 57,
End: 63,
Term: []byte("黑色"),
Position: 13,
Type: analysis.Ideographic,
},
{
Start: 63,
End: 69,
Term: []byte("的士"),
Position: 14,
Type: analysis.Ideographic,
},
{
Start: 69,
End: 72,
Term: []byte("。"),
Position: 15,
Type: analysis.AlphaNumeric,
},
},
},
{
[]byte("枪杆子中出政权"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("枪杆子"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 12,
Term: []byte("中"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 12,
End: 15,
Term: []byte("出"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 21,
Term: []byte("政权"),
Position: 4,
Type: analysis.Ideographic,
},
},
},
{
[]byte("张三风同学走上了不归路"),
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("张三风"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 9,
End: 15,
Term: []byte("同学"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 21,
Term: []byte("走上"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 21,
End: 24,
Term: []byte("了"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 24,
End: 33,
Term: []byte("不归路"),
Position: 5,
Type: analysis.Ideographic,
},
},
},
{
[]byte("阿Q腰间挂着BB机手里拿着大哥大我一般吃饭不AA制的。"),
analysis.TokenStream{
{
Start: 0,
End: 4,
Term: []byte("阿Q"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 4,
End: 10,
Term: []byte("腰间"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 10,
End: 13,
Term: []byte("挂"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 13,
End: 16,
Term: []byte("着"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 16,
End: 21,
Term: []byte("BB机"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 21,
End: 27,
Term: []byte("手里"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 27,
End: 30,
Term: []byte("拿"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 30,
End: 33,
Term: []byte("着"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 33,
End: 42,
Term: []byte("大哥大"),
Position: 9,
Type: analysis.Ideographic,
},
{
Start: 42,
End: 45,
Term: []byte(""),
Position: 10,
Type: analysis.AlphaNumeric,
},
{
Start: 45,
End: 48,
Term: []byte("说"),
Position: 11,
Type: analysis.Ideographic,
},
{
Start: 48,
End: 51,
Term: []byte(""),
Position: 12,
Type: analysis.AlphaNumeric,
},
{
Start: 51,
End: 54,
Term: []byte("我"),
Position: 13,
Type: analysis.Ideographic,
},
{
Start: 54,
End: 60,
Term: []byte("一般"),
Position: 14,
Type: analysis.Ideographic,
},
{
Start: 60,
End: 66,
Term: []byte("吃饭"),
Position: 15,
Type: analysis.Ideographic,
},
{
Start: 66,
End: 69,
Term: []byte("不"),
Position: 16,
Type: analysis.Ideographic,
},
{
Start: 69,
End: 74,
Term: []byte("AA制"),
Position: 17,
Type: analysis.Ideographic,
},
{
Start: 74,
End: 77,
Term: []byte("的"),
Position: 18,
Type: analysis.Ideographic,
},
{
Start: 77,
End: 80,
Term: []byte("。"),
Position: 19,
Type: analysis.AlphaNumeric,
},
},
},
{
[]byte("在1号店能买到小S和大S八卦的书。"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("在"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 3,
End: 10,
Term: []byte("1号店"),
Position: 2,
Type: analysis.Ideographic,
},
{
Start: 10,
End: 13,
Term: []byte("能"),
Position: 3,
Type: analysis.Ideographic,
},
{
Start: 13,
End: 16,
Term: []byte("买"),
Position: 4,
Type: analysis.Ideographic,
},
{
Start: 16,
End: 19,
Term: []byte("到"),
Position: 5,
Type: analysis.Ideographic,
},
{
Start: 19,
End: 23,
Term: []byte("小S"),
Position: 6,
Type: analysis.Ideographic,
},
{
Start: 23,
End: 26,
Term: []byte("和"),
Position: 7,
Type: analysis.Ideographic,
},
{
Start: 26,
End: 30,
Term: []byte("大S"),
Position: 8,
Type: analysis.Ideographic,
},
{
Start: 30,
End: 36,
Term: []byte("八卦"),
Position: 9,
Type: analysis.Ideographic,
},
{
Start: 36,
End: 39,
Term: []byte("的"),
Position: 10,
Type: analysis.Ideographic,
},
{
Start: 39,
End: 42,
Term: []byte("书"),
Position: 11,
Type: analysis.Ideographic,
},
{
Start: 42,
End: 45,
Term: []byte("。"),
Position: 12,
Type: analysis.AlphaNumeric,
},
},
},
}
tokenizer, _ := NewJiebaTokenizer("../../dict.txt", true, false)
for _, test := range tests {
actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
}
}
}