diff --git a/analyse/tokenizers/jieba.go b/analyse/tokenizers/jieba.go index 0bedd55..1e1547d 100644 --- a/analyse/tokenizers/jieba.go +++ b/analyse/tokenizers/jieba.go @@ -43,11 +43,13 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream { if width > step { for i := 0; i < width-step+1; i++ { gram = string(runes[i : i+step]) + gramLen := len(gram) if value, ok := jiebago.Trie.Freq[gram]; ok && value > 0 { + gramStart := start + len(string(runes[:i])) token := analysis.Token{ Term: []byte(gram), - Start: start, - End: start + len(gram), + Start: gramStart, + End: gramStart + gramLen, Position: pos, Type: detectTokenType(gram), } diff --git a/analyse/tokenizers/jieba_test.go b/analyse/tokenizers/jieba_test.go index ee4878b..81ca918 100644 --- a/analyse/tokenizers/jieba_test.go +++ b/analyse/tokenizers/jieba_test.go @@ -6,7 +6,7 @@ import ( "testing" ) -func TestJiebaTokenizer(t *testing.T) { +func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) { tests := []struct { input []byte output analysis.TokenStream @@ -5227,3 +5227,5841 @@ func TestJiebaTokenizer(t *testing.T) { } } + +func TestJiebaTokenizerSearchModeWithHMM(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + { + []byte("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("这是"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("一个"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("伸手"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("不见"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("五指"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 30, + Term: []byte("伸手不见五指"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("黑夜"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("。"), + Position: 9, + Type: analysis.AlphaNumeric, + }, + { + Start: 42, + End: 45, + Term: []byte("我"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 48, + Term: []byte("叫"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 57, + Term: []byte("悟空"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 57, + Term: []byte("孙悟空"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 57, + End: 60, + Term: []byte(","), + Position: 14, + Type: analysis.AlphaNumeric, + }, + { + Start: 60, + End: 63, + Term: []byte("我"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 63, + End: 66, + Term: []byte("爱"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 66, + End: 72, + Term: []byte("北京"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 72, + End: 75, + Term: []byte(","), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 75, + End: 78, + Term: []byte("我"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 78, + End: 81, + Term: []byte("爱"), + Position: 20, + Type: analysis.Ideographic, + }, + { + Start: 81, + End: 87, + Term: []byte("Python"), + Position: 21, + Type: analysis.AlphaNumeric, + }, + { + Start: 87, + End: 90, + Term: []byte("和"), + Position: 22, + Type: analysis.Ideographic, + }, + { + Start: 90, + End: 93, + Term: []byte("C++"), + Position: 23, + Type: analysis.AlphaNumeric, + }, + { + Start: 93, + End: 96, + Term: []byte("。"), + Position: 24, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("我不喜欢日本和服。"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("不"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("喜欢"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("日本"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("和服"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("。"), + Position: 6, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("雷猴回归人间。"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("雷猴"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("回归"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("人间"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("。"), + Position: 4, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("工信处"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("干事"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("女干事"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("每月"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("经过"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("下属"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 42, + Term: []byte("科室"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("都"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 48, + Term: []byte("要"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 54, + Term: []byte("亲口"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 54, + End: 60, + Term: []byte("交代"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 60, + End: 62, + Term: []byte("24"), + Position: 12, + Type: analysis.Numeric, + }, + { + Start: 62, + End: 65, + Term: []byte("口"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 65, + End: 71, + Term: []byte("交换"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 68, + End: 74, + Term: []byte("换机"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 65, + End: 74, + Term: []byte("交换机"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 74, + End: 77, + Term: []byte("等"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 77, + End: 83, + Term: []byte("技术"), + Position: 18, + Type: analysis.Ideographic, + }, + { + Start: 77, + End: 86, + Term: []byte("技术性"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 86, + End: 92, + Term: []byte("器件"), + Position: 20, + Type: analysis.Ideographic, + }, + { + Start: 92, + End: 95, + Term: []byte("的"), + Position: 21, + Type: analysis.Ideographic, + }, + { + Start: 95, + End: 101, + Term: []byte("安装"), + Position: 22, + Type: analysis.Ideographic, + }, + { + Start: 101, + End: 107, + Term: []byte("工作"), + Position: 23, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我需要廉租房"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("需要"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("廉租"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("租房"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("廉租房"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("永和服装饰品有限公司"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("永和"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("服装"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("饰品"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("有限"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("公司"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 30, + Term: []byte("有限公司"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我爱北京天安门"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("爱"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("北京"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("天安"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 21, + Term: []byte("天安门"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("abc"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("abc"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("隐马尔可夫"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("隐"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("可夫"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 12, + Term: []byte("马尔可"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 15, + Term: []byte("马尔可夫"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("雷猴是个好网站"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("雷猴"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("是"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("个"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("好"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("网站"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("“"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 3, + End: 12, + Term: []byte("Microsoft"), + Position: 2, + Type: analysis.AlphaNumeric, + }, + { + Start: 12, + End: 15, + Term: []byte("”"), + Position: 3, + Type: analysis.AlphaNumeric, + }, + { + Start: 15, + End: 21, + Term: []byte("一词"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("由"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("“"), + Position: 6, + Type: analysis.AlphaNumeric, + }, + { + Start: 27, + End: 40, + Term: []byte("MICROcomputer"), + Position: 7, + Type: analysis.AlphaNumeric, + }, + { + Start: 40, + End: 43, + Term: []byte("("), + Position: 8, + Type: analysis.AlphaNumeric, + }, + { + Start: 43, + End: 49, + Term: []byte("微型"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 49, + End: 55, + Term: []byte("计算"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 52, + End: 58, + Term: []byte("算机"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 49, + End: 58, + Term: []byte("计算机"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 58, + End: 61, + Term: []byte(")"), + Position: 13, + Type: analysis.AlphaNumeric, + }, + { + Start: 61, + End: 64, + Term: []byte("”"), + Position: 14, + Type: analysis.AlphaNumeric, + }, + { + Start: 64, + End: 67, + Term: []byte("和"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 67, + End: 70, + Term: []byte("“"), + Position: 16, + Type: analysis.AlphaNumeric, + }, + { + Start: 70, + End: 78, + Term: []byte("SOFTware"), + Position: 17, + Type: analysis.AlphaNumeric, + }, + { + Start: 78, + End: 81, + Term: []byte("("), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 81, + End: 87, + Term: []byte("软件"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 87, + End: 90, + Term: []byte(")"), + Position: 20, + Type: analysis.AlphaNumeric, + }, + { + Start: 90, + End: 93, + Term: []byte("”"), + Position: 21, + Type: analysis.AlphaNumeric, + }, + { + Start: 93, + End: 96, + Term: []byte("两"), + Position: 22, + Type: analysis.Ideographic, + }, + { + Start: 96, + End: 102, + Term: []byte("部分"), + Position: 23, + Type: analysis.Ideographic, + }, + { + Start: 102, + End: 108, + Term: []byte("组成"), + Position: 24, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("草泥马和欺实马是今年的流行词汇"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("草泥马"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("和"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("欺实"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("马"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("是"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("今年"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("流行"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 45, + Term: []byte("词汇"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("伊藤洋华堂总府店"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("伊藤"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 15, + Term: []byte("洋华堂"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("总府"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("店"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("中国科学院计算技术研究所"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("中国"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("科学"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("学院"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("计算"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("技术"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("研究"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 15, + Term: []byte("科学院"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 36, + Term: []byte("研究所"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 36, + Term: []byte("中国科学院计算技术研究所"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("罗密欧与朱丽叶"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("罗密欧"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("与"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 21, + Term: []byte("朱丽叶"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我购买了道具和服装"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("购买"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("道具"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("和"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("服装"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍"), + analysis.TokenStream{ + { + Start: 0, + End: 2, + Term: []byte("PS"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 2, + End: 3, + Term: []byte(":"), + Position: 2, + Type: analysis.AlphaNumeric, + }, + { + Start: 3, + End: 4, + Term: []byte(" "), + Position: 3, + Type: analysis.AlphaNumeric, + }, + { + Start: 4, + End: 7, + Term: []byte("我"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 7, + End: 13, + Term: []byte("觉得"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 19, + Term: []byte("开源"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 19, + End: 22, + Term: []byte("有"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 22, + End: 28, + Term: []byte("一个"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 28, + End: 34, + Term: []byte("好处"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 34, + End: 37, + Term: []byte(","), + Position: 10, + Type: analysis.AlphaNumeric, + }, + { + Start: 37, + End: 43, + Term: []byte("就是"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 43, + End: 49, + Term: []byte("能够"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 49, + End: 55, + Term: []byte("敦促"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 55, + End: 61, + Term: []byte("自己"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 61, + End: 67, + Term: []byte("不断"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 67, + End: 73, + Term: []byte("改进"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 61, + End: 73, + Term: []byte("不断改进"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 73, + End: 76, + Term: []byte(","), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 76, + End: 82, + Term: []byte("避免"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 82, + End: 88, + Term: []byte("敞帚"), + Position: 20, + Type: analysis.Ideographic, + }, + { + Start: 88, + End: 94, + Term: []byte("自珍"), + Position: 21, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("湖北省石首市"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("湖北"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("湖北省"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("石首"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("石首市"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("湖北省十堰市"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("湖北"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("湖北省"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("十堰"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("十堰市"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("总经理完成了这件事情"), + analysis.TokenStream{ + { + Start: 3, + End: 9, + Term: []byte("经理"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("总经理"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("完成"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("了"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("这件"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("事情"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("电脑修好了"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("电脑"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("修好"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("做好了这件事情就一了百了了"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("做好"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("了"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("这件"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("事情"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("就"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 36, + Term: []byte("一了百了"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("了"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("人们审美的观点是不同的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("人们"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("审美"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("的"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("观点"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("是"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("不同"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我们买了一个美的空调"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("我们"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("买"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("一个"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("美的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("空调"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("线程初始化时我们要注意"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("线程"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("初始"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 15, + Term: []byte("初始化"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("时"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("我们"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("要"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("注意"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("一个分子是由好多原子组织成的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("一个"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("分子"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("是"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("由"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("好多"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("原子"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("组织"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("成"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("祝你马到功成"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("祝"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("你"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 18, + Term: []byte("马到功成"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("他掉进了无底洞里"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("他"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("掉"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("进"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("了"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("无底"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 21, + Term: []byte("无底洞"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("里"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("中国的首都是北京"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("中国"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("的"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("首都"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("北京"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("孙君意"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("孙君意"), + Position: 1, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("外交部发言人马朝旭"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("外交"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("外交部"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("发言"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("发言人"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("马朝旭"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("领导人会议和第四届东亚峰会"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("领导"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("领导人"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("会议"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("和"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("第四"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("四届"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("第四届"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("东亚"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("峰会"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("在过去的这五年"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("在"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("过去"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("的"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("这"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("五年"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("还需要很长的路要走"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("还"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("需要"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("很长"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("的"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("路"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("要"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("走"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("60周年首都阅兵"), + analysis.TokenStream{ + { + Start: 0, + End: 2, + Term: []byte("60"), + Position: 1, + Type: analysis.Numeric, + }, + { + Start: 2, + End: 8, + Term: []byte("周年"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 8, + End: 14, + Term: []byte("首都"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 14, + End: 20, + Term: []byte("阅兵"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("你好人们审美的观点是不同的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("你好"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("人们"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("审美"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("的"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("观点"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("是"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("不同"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("的"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("买水果然后来世博园"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("买"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("水果"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("然后"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("来"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("世博"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("博园"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("世博园"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("买水果然后去世博园"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("买"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("水果"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("然后"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("去"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("世博"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("博园"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("世博园"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("但是后来我才知道你是对的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("但是"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("后来"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("我"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("才"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("知道"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("你"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("是"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("对"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("存在即合理"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("存在"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("即"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("合理"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("的的的的的在的的的的就以和和和"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("的"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("的"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("的"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("的"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("在"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("的"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("的"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("就"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte("以"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("和"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("和"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("和"), + Position: 15, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("I love你,不以为耻,反以为rong"), + analysis.TokenStream{ + { + Start: 0, + End: 1, + Term: []byte("I"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 1, + End: 2, + Term: []byte(" "), + Position: 2, + Type: analysis.AlphaNumeric, + }, + { + Start: 2, + End: 6, + Term: []byte("love"), + Position: 3, + Type: analysis.AlphaNumeric, + }, + { + Start: 6, + End: 9, + Term: []byte("你"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte(","), + Position: 5, + Type: analysis.AlphaNumeric, + }, + { + Start: 12, + End: 18, + Term: []byte("不以"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("以为"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 24, + Term: []byte("不以为耻"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte(","), + Position: 9, + Type: analysis.AlphaNumeric, + }, + { + Start: 27, + End: 30, + Term: []byte("反"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("以为"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 40, + Term: []byte("rong"), + Position: 12, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("因"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("因"), + Position: 1, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte(""), + analysis.TokenStream{}, + }, + { + []byte("hello你好人们审美的观点是不同的"), + analysis.TokenStream{ + { + Start: 0, + End: 5, + Term: []byte("hello"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 5, + End: 11, + Term: []byte("你好"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 11, + End: 17, + Term: []byte("人们"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 17, + End: 23, + Term: []byte("审美"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 23, + End: 26, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 26, + End: 32, + Term: []byte("观点"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 32, + End: 35, + Term: []byte("是"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 35, + End: 41, + Term: []byte("不同"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 41, + End: 44, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("很好但主要是基于网页形式"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("很"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("好"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("但"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("主要"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("是"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("基于"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("网页"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("形式"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("hello你好人们审美的观点是不同的"), + analysis.TokenStream{ + { + Start: 0, + End: 5, + Term: []byte("hello"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 5, + End: 11, + Term: []byte("你好"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 11, + End: 17, + Term: []byte("人们"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 17, + End: 23, + Term: []byte("审美"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 23, + End: 26, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 26, + End: 32, + Term: []byte("观点"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 32, + End: 35, + Term: []byte("是"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 35, + End: 41, + Term: []byte("不同"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 41, + End: 44, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("为什么我不能拥有想要的生活"), + analysis.TokenStream{ + { + Start: 3, + End: 9, + Term: []byte("什么"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("为什么"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("我"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("不能"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("拥有"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("想要"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("生活"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("后来我才"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("后来"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("我"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("才"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("此次来中国是为了"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("此次"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("来"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("中国"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("为了"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("使用了它就可以解决一些问题"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("使用"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("了"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("它"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("就"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("可以"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("解决"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("一些"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("问题"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte(",使用了它就可以解决一些问题"), + analysis.TokenStream{ + { + Start: 0, + End: 1, + Term: []byte(","), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 1, + End: 7, + Term: []byte("使用"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 7, + End: 10, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 10, + End: 13, + Term: []byte("它"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 16, + Term: []byte("就"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 16, + End: 22, + Term: []byte("可以"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 22, + End: 28, + Term: []byte("解决"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 28, + End: 34, + Term: []byte("一些"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 34, + End: 40, + Term: []byte("问题"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("其实使用了它就可以解决一些问题"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("其实"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("使用"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("它"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("就"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("可以"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("解决"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("一些"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 45, + Term: []byte("问题"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("好人使用了它就可以解决一些问题"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("好人"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("使用"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("它"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("就"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("可以"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("解决"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("一些"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 45, + Term: []byte("问题"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("是因为和国家"), + analysis.TokenStream{ + { + Start: 3, + End: 9, + Term: []byte("因为"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("是因为"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("和"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("国家"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("老年搜索还支持"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("老年"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("搜索"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("还"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("支持"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 "), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("干脆"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("就"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("把"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("那部"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("蒙人"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("闲法"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte("给"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("废"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("了"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 48, + Term: []byte("拉倒"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 51, + Term: []byte("!"), + Position: 12, + Type: analysis.AlphaNumeric, + }, + { + Start: 51, + End: 53, + Term: []byte("RT"), + Position: 13, + Type: analysis.AlphaNumeric, + }, + { + Start: 53, + End: 54, + Term: []byte(" "), + Position: 14, + Type: analysis.AlphaNumeric, + }, + { + Start: 54, + End: 55, + Term: []byte("@"), + Position: 15, + Type: analysis.AlphaNumeric, + }, + { + Start: 55, + End: 67, + Term: []byte("laoshipukong"), + Position: 16, + Type: analysis.AlphaNumeric, + }, + { + Start: 67, + End: 68, + Term: []byte(" "), + Position: 17, + Type: analysis.AlphaNumeric, + }, + { + Start: 68, + End: 69, + Term: []byte(":"), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 69, + End: 70, + Term: []byte(" "), + Position: 19, + Type: analysis.AlphaNumeric, + }, + { + Start: 70, + End: 72, + Term: []byte("27"), + Position: 20, + Type: analysis.Numeric, + }, + { + Start: 72, + End: 75, + Term: []byte("日"), + Position: 21, + Type: analysis.Ideographic, + }, + { + Start: 75, + End: 78, + Term: []byte(","), + Position: 22, + Type: analysis.AlphaNumeric, + }, + { + Start: 78, + End: 84, + Term: []byte("全国"), + Position: 23, + Type: analysis.Ideographic, + }, + { + Start: 81, + End: 87, + Term: []byte("国人"), + Position: 24, + Type: analysis.Ideographic, + }, + { + Start: 84, + End: 90, + Term: []byte("人大"), + Position: 25, + Type: analysis.Ideographic, + }, + { + Start: 90, + End: 96, + Term: []byte("常委"), + Position: 26, + Type: analysis.Ideographic, + }, + { + Start: 93, + End: 99, + Term: []byte("委会"), + Position: 27, + Type: analysis.Ideographic, + }, + { + Start: 90, + End: 99, + Term: []byte("常委会"), + Position: 28, + Type: analysis.Ideographic, + }, + { + Start: 78, + End: 99, + Term: []byte("全国人大常委会"), + Position: 29, + Type: analysis.Ideographic, + }, + { + Start: 99, + End: 105, + Term: []byte("第三"), + Position: 30, + Type: analysis.Ideographic, + }, + { + Start: 102, + End: 108, + Term: []byte("三次"), + Position: 31, + Type: analysis.Ideographic, + }, + { + Start: 99, + End: 108, + Term: []byte("第三次"), + Position: 32, + Type: analysis.Ideographic, + }, + { + Start: 108, + End: 114, + Term: []byte("审议"), + Position: 33, + Type: analysis.Ideographic, + }, + { + Start: 114, + End: 120, + Term: []byte("侵权"), + Position: 34, + Type: analysis.Ideographic, + }, + { + Start: 120, + End: 126, + Term: []byte("责任"), + Position: 35, + Type: analysis.Ideographic, + }, + { + Start: 120, + End: 129, + Term: []byte("责任法"), + Position: 36, + Type: analysis.Ideographic, + }, + { + Start: 129, + End: 135, + Term: []byte("草案"), + Position: 37, + Type: analysis.Ideographic, + }, + { + Start: 135, + End: 138, + Term: []byte(","), + Position: 38, + Type: analysis.AlphaNumeric, + }, + { + Start: 138, + End: 144, + Term: []byte("删除"), + Position: 39, + Type: analysis.Ideographic, + }, + { + Start: 144, + End: 147, + Term: []byte("了"), + Position: 40, + Type: analysis.Ideographic, + }, + { + Start: 147, + End: 153, + Term: []byte("有关"), + Position: 41, + Type: analysis.Ideographic, + }, + { + Start: 153, + End: 159, + Term: []byte("医疗"), + Position: 42, + Type: analysis.Ideographic, + }, + { + Start: 159, + End: 165, + Term: []byte("损害"), + Position: 43, + Type: analysis.Ideographic, + }, + { + Start: 165, + End: 171, + Term: []byte("责任"), + Position: 44, + Type: analysis.Ideographic, + }, + { + Start: 171, + End: 174, + Term: []byte("“"), + Position: 45, + Type: analysis.AlphaNumeric, + }, + { + Start: 174, + End: 180, + Term: []byte("举证"), + Position: 46, + Type: analysis.Ideographic, + }, + { + Start: 180, + End: 186, + Term: []byte("倒置"), + Position: 47, + Type: analysis.Ideographic, + }, + { + Start: 186, + End: 189, + Term: []byte("”"), + Position: 48, + Type: analysis.AlphaNumeric, + }, + { + Start: 189, + End: 192, + Term: []byte("的"), + Position: 49, + Type: analysis.Ideographic, + }, + { + Start: 192, + End: 198, + Term: []byte("规定"), + Position: 50, + Type: analysis.Ideographic, + }, + { + Start: 198, + End: 201, + Term: []byte("。"), + Position: 51, + Type: analysis.AlphaNumeric, + }, + { + Start: 201, + End: 204, + Term: []byte("在"), + Position: 52, + Type: analysis.Ideographic, + }, + { + Start: 204, + End: 210, + Term: []byte("医患"), + Position: 53, + Type: analysis.Ideographic, + }, + { + Start: 210, + End: 216, + Term: []byte("纠纷"), + Position: 54, + Type: analysis.Ideographic, + }, + { + Start: 216, + End: 222, + Term: []byte("中本"), + Position: 55, + Type: analysis.Ideographic, + }, + { + Start: 222, + End: 225, + Term: []byte("已"), + Position: 56, + Type: analysis.Ideographic, + }, + { + Start: 225, + End: 231, + Term: []byte("处于"), + Position: 57, + Type: analysis.Ideographic, + }, + { + Start: 231, + End: 237, + Term: []byte("弱势"), + Position: 58, + Type: analysis.Ideographic, + }, + { + Start: 237, + End: 243, + Term: []byte("地位"), + Position: 59, + Type: analysis.Ideographic, + }, + { + Start: 243, + End: 246, + Term: []byte("的"), + Position: 60, + Type: analysis.Ideographic, + }, + { + Start: 246, + End: 252, + Term: []byte("消费"), + Position: 61, + Type: analysis.Ideographic, + }, + { + Start: 246, + End: 255, + Term: []byte("消费者"), + Position: 62, + Type: analysis.Ideographic, + }, + { + Start: 255, + End: 261, + Term: []byte("由此"), + Position: 63, + Type: analysis.Ideographic, + }, + { + Start: 261, + End: 264, + Term: []byte("将"), + Position: 64, + Type: analysis.Ideographic, + }, + { + Start: 264, + End: 270, + Term: []byte("陷入"), + Position: 65, + Type: analysis.Ideographic, + }, + { + Start: 276, + End: 282, + Term: []byte("不复"), + Position: 66, + Type: analysis.Ideographic, + }, + { + Start: 270, + End: 282, + Term: []byte("万劫不复"), + Position: 67, + Type: analysis.Ideographic, + }, + { + Start: 282, + End: 285, + Term: []byte("的"), + Position: 68, + Type: analysis.Ideographic, + }, + { + Start: 285, + End: 291, + Term: []byte("境地"), + Position: 69, + Type: analysis.Ideographic, + }, + { + Start: 291, + End: 294, + Term: []byte("。"), + Position: 70, + Type: analysis.AlphaNumeric, + }, + { + Start: 294, + End: 295, + Term: []byte(" "), + Position: 71, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("大"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("大"), + Position: 1, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte(""), + analysis.TokenStream{}, + }, + { + []byte("他说的确实在理"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("他"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("说"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("的"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("确实"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("在理"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("长春市长春节讲话"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("长春"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("市长"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("春节"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("讲话"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("结婚的和尚未结婚的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("结婚"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("的"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("和"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("尚未"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("结婚"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("结合成分子时"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("结合"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("成"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("分子"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("时"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("旅游和服务是最好的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("旅游"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("和"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("服务"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("最好"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("这件事情的确是我的错"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("这件"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("事情"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("的确"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("我"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("错"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("供大家参考指正"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("供"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("大家"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("参考"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("指正"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("哈尔滨政府公布塌桥原因"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("哈尔"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("哈尔滨"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("政府"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("公布"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("塌桥"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("原因"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我在机场入口处"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("在"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("机场"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("入口"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 21, + Term: []byte("入口处"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("邢永臣摄影报道"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("邢永臣"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("摄影"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("报道"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("BP神经网络如何训练才能在分类时增加区分度?"), + analysis.TokenStream{ + { + Start: 0, + End: 2, + Term: []byte("BP"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 2, + End: 8, + Term: []byte("神经"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 8, + End: 14, + Term: []byte("网络"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 2, + End: 11, + Term: []byte("神经网"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 2, + End: 14, + Term: []byte("神经网络"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 14, + End: 20, + Term: []byte("如何"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 20, + End: 26, + Term: []byte("训练"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 26, + End: 32, + Term: []byte("才能"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 32, + End: 35, + Term: []byte("在"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 35, + End: 41, + Term: []byte("分类"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 41, + End: 44, + Term: []byte("时"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 44, + End: 50, + Term: []byte("增加"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 50, + End: 56, + Term: []byte("区分"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 53, + End: 59, + Term: []byte("分度"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 50, + End: 59, + Term: []byte("区分度"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 59, + End: 62, + Term: []byte("?"), + Position: 16, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("南京市长江大桥"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("南京"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("京市"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("南京市"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("长江"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("大桥"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 21, + Term: []byte("长江大桥"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("应"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("一些"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("使用"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("用者"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("使用者"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("建议"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte(","), + Position: 8, + Type: analysis.AlphaNumeric, + }, + { + Start: 30, + End: 33, + Term: []byte("也"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("为了"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 45, + Term: []byte("便于"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 51, + Term: []byte("利用"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 59, + Term: []byte("NiuTrans"), + Position: 13, + Type: analysis.AlphaNumeric, + }, + { + Start: 59, + End: 65, + Term: []byte("用于"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 65, + End: 68, + Term: []byte("SMT"), + Position: 15, + Type: analysis.AlphaNumeric, + }, + { + Start: 68, + End: 74, + Term: []byte("研究"), + Position: 16, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("长春市长春药店"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("长春"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("长春市"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("长春"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("药店"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("邓颖超生前最喜欢的衣服"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("邓颖超"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("生前"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("最"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("喜欢"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("衣服"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("胡锦涛是热爱世界和平的政治局常委"), + analysis.TokenStream{ + { + Start: 3, + End: 9, + Term: []byte("锦涛"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("胡锦涛"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("是"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("热爱"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("世界"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("和平"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("政治"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 42, + Term: []byte("政治局"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 48, + Term: []byte("常委"), + Position: 10, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("程序"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("程序员"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("祝"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("海林"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("和"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 30, + Term: []byte("朱会震"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("是"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte("在"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 42, + Term: []byte("孙健"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("的"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 51, + Term: []byte("左面"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 54, + Term: []byte("和"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 54, + End: 60, + Term: []byte("右面"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 60, + End: 61, + Term: []byte(","), + Position: 14, + Type: analysis.AlphaNumeric, + }, + { + Start: 61, + End: 62, + Term: []byte(" "), + Position: 15, + Type: analysis.AlphaNumeric, + }, + { + Start: 62, + End: 68, + Term: []byte("范凯"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 68, + End: 71, + Term: []byte("在"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 71, + End: 74, + Term: []byte("最"), + Position: 18, + Type: analysis.Ideographic, + }, + { + Start: 74, + End: 80, + Term: []byte("右面"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 80, + End: 81, + Term: []byte("."), + Position: 20, + Type: analysis.AlphaNumeric, + }, + { + Start: 81, + End: 87, + Term: []byte("再往"), + Position: 21, + Type: analysis.Ideographic, + }, + { + Start: 87, + End: 90, + Term: []byte("左"), + Position: 22, + Type: analysis.Ideographic, + }, + { + Start: 90, + End: 93, + Term: []byte("是"), + Position: 23, + Type: analysis.Ideographic, + }, + { + Start: 93, + End: 102, + Term: []byte("李松洪"), + Position: 24, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("一次性交多少钱"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("一次"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("一次性"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("交"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("多少"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("钱"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("两块五一套,三块八一斤,四块七一本,五块六一条"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("两块"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("五"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("一套"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte(","), + Position: 4, + Type: analysis.AlphaNumeric, + }, + { + Start: 18, + End: 24, + Term: []byte("三块"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("八"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("一斤"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte(","), + Position: 8, + Type: analysis.AlphaNumeric, + }, + { + Start: 36, + End: 42, + Term: []byte("四块"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("七"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 51, + Term: []byte("一本"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 54, + Term: []byte(","), + Position: 12, + Type: analysis.AlphaNumeric, + }, + { + Start: 54, + End: 60, + Term: []byte("五块"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 60, + End: 63, + Term: []byte("六"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 63, + End: 69, + Term: []byte("一条"), + Position: 15, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("小和尚留了一个像大和尚一样的和尚头"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("小"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("和尚"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("留"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("了"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("一个"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("像"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("大"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("和尚"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("一样"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("的"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 48, + Term: []byte("和尚"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 51, + Term: []byte("和尚头"), + Position: 12, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("是"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("中华"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("华人"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("人民"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("共和"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("共和国"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 27, + Term: []byte("中华人民共和国"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("公民"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 34, + Term: []byte(";"), + Position: 10, + Type: analysis.AlphaNumeric, + }, + { + Start: 34, + End: 37, + Term: []byte("我"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 37, + End: 43, + Term: []byte("爸爸"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 43, + End: 46, + Term: []byte("是"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 46, + End: 52, + Term: []byte("共和"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 46, + End: 55, + Term: []byte("共和党"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 55, + End: 61, + Term: []byte("党员"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 61, + End: 62, + Term: []byte(";"), + Position: 17, + Type: analysis.AlphaNumeric, + }, + { + Start: 62, + End: 63, + Term: []byte(" "), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 63, + End: 69, + Term: []byte("地铁"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 69, + End: 75, + Term: []byte("和平"), + Position: 20, + Type: analysis.Ideographic, + }, + { + Start: 69, + End: 78, + Term: []byte("和平门"), + Position: 21, + Type: analysis.Ideographic, + }, + { + Start: 78, + End: 81, + Term: []byte("站"), + Position: 22, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("张晓梅去人民医院做了个B超然后去买了件T恤"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("张晓梅"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("去"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("人民"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("医院"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("做"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("了"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("个"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 37, + Term: []byte("B超"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 37, + End: 43, + Term: []byte("然后"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 43, + End: 46, + Term: []byte("去"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 46, + End: 49, + Term: []byte("买"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 49, + End: 52, + Term: []byte("了"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 52, + End: 55, + Term: []byte("件"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 55, + End: 59, + Term: []byte("T恤"), + Position: 14, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("AT&T是一件不错的公司,给你发offer了吗?"), + analysis.TokenStream{ + { + Start: 0, + End: 4, + Term: []byte("AT&T"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 4, + End: 7, + Term: []byte("是"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 7, + End: 13, + Term: []byte("一件"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 19, + Term: []byte("不错"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 19, + End: 22, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 22, + End: 28, + Term: []byte("公司"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 28, + End: 31, + Term: []byte(","), + Position: 7, + Type: analysis.AlphaNumeric, + }, + { + Start: 31, + End: 34, + Term: []byte("给"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 34, + End: 37, + Term: []byte("你"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 37, + End: 40, + Term: []byte("发"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 40, + End: 45, + Term: []byte("offer"), + Position: 11, + Type: analysis.AlphaNumeric, + }, + { + Start: 45, + End: 48, + Term: []byte("了"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 51, + Term: []byte("吗"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 54, + Term: []byte("?"), + Position: 14, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("C++和c#是什么关系?11+122=133,是吗?PI=3.14159"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("C++"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 3, + End: 6, + Term: []byte("和"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 8, + Term: []byte("c#"), + Position: 3, + Type: analysis.AlphaNumeric, + }, + { + Start: 8, + End: 11, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 11, + End: 17, + Term: []byte("什么"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 17, + End: 23, + Term: []byte("关系"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 23, + End: 26, + Term: []byte("?"), + Position: 7, + Type: analysis.AlphaNumeric, + }, + { + Start: 26, + End: 28, + Term: []byte("11"), + Position: 8, + Type: analysis.Numeric, + }, + { + Start: 28, + End: 29, + Term: []byte("+"), + Position: 9, + Type: analysis.AlphaNumeric, + }, + { + Start: 29, + End: 32, + Term: []byte("122"), + Position: 10, + Type: analysis.Numeric, + }, + { + Start: 32, + End: 33, + Term: []byte("="), + Position: 11, + Type: analysis.AlphaNumeric, + }, + { + Start: 33, + End: 36, + Term: []byte("133"), + Position: 12, + Type: analysis.Numeric, + }, + { + Start: 36, + End: 39, + Term: []byte(","), + Position: 13, + Type: analysis.AlphaNumeric, + }, + { + Start: 39, + End: 42, + Term: []byte("是"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("吗"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 48, + Term: []byte("?"), + Position: 16, + Type: analysis.AlphaNumeric, + }, + { + Start: 48, + End: 50, + Term: []byte("PI"), + Position: 17, + Type: analysis.AlphaNumeric, + }, + { + Start: 50, + End: 51, + Term: []byte("="), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 51, + End: 58, + Term: []byte("3.14159"), + Position: 19, + Type: analysis.Numeric, + }, + }, + }, + { + []byte("你认识那个和主席握手的的哥吗?他开一辆黑色的士。"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("你"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("认识"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("那个"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("和"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("主席"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("握手"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("的哥"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("吗"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("?"), + Position: 10, + Type: analysis.AlphaNumeric, + }, + { + Start: 45, + End: 51, + Term: []byte("他开"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 57, + Term: []byte("一辆"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 57, + End: 63, + Term: []byte("黑色"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 63, + End: 69, + Term: []byte("的士"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 69, + End: 72, + Term: []byte("。"), + Position: 15, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("枪杆子中出政权"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("枪杆"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("杆子"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("枪杆子"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("中"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("出"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("政权"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("张三风同学走上了不归路"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("张三风"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("同学"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("走上"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("了"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("归路"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 33, + Term: []byte("不归路"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。"), + analysis.TokenStream{ + { + Start: 0, + End: 4, + Term: []byte("阿Q"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 4, + End: 10, + Term: []byte("腰间"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 10, + End: 13, + Term: []byte("挂"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 16, + Term: []byte("着"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 16, + End: 21, + Term: []byte("BB机"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("手里"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("拿"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("着"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("大哥"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 42, + Term: []byte("大哥大"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte(","), + Position: 11, + Type: analysis.AlphaNumeric, + }, + { + Start: 45, + End: 48, + Term: []byte("说"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 51, + Term: []byte(":"), + Position: 13, + Type: analysis.AlphaNumeric, + }, + { + Start: 51, + End: 54, + Term: []byte("我"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 54, + End: 60, + Term: []byte("一般"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 60, + End: 66, + Term: []byte("吃饭"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 66, + End: 69, + Term: []byte("不"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 69, + End: 74, + Term: []byte("AA制"), + Position: 18, + Type: analysis.Ideographic, + }, + { + Start: 74, + End: 77, + Term: []byte("的"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 77, + End: 80, + Term: []byte("。"), + Position: 20, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("在1号店能买到小S和大S八卦的书。"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("在"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 10, + Term: []byte("1号店"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 10, + End: 13, + Term: []byte("能"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 16, + Term: []byte("买"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 16, + End: 19, + Term: []byte("到"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 19, + End: 23, + Term: []byte("小S"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 23, + End: 26, + Term: []byte("和"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 26, + End: 30, + Term: []byte("大S"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("八卦"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("的"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("书"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("。"), + Position: 12, + Type: analysis.AlphaNumeric, + }, + }, + }, + } + + tokenizer, _ := NewJiebaTokenizer("../../dict.txt", true, true) + for _, test := range tests { + actual := tokenizer.Tokenize(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input)) + } + } + +}