From 1aabc4a2f39dabefd3675454837aa0557c017295 Mon Sep 17 00:00:00 2001 From: Wang Bin Date: Mon, 16 Mar 2015 15:55:41 +0800 Subject: [PATCH 1/5] removed unnecessary MarshalBinary/UnmarshalBinary method --- trie.go | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/trie.go b/trie.go index b7d7ed2..679f1f4 100644 --- a/trie.go +++ b/trie.go @@ -1,7 +1,6 @@ package jiebago import ( - "bytes" "crypto/md5" "encoding/gob" "fmt" @@ -19,34 +18,6 @@ type trie struct { Freq map[string]float64 } -func (t trie) MarshalBinary() ([]byte, error) { - var b bytes.Buffer - enc := gob.NewEncoder(&b) - err := enc.Encode(t.Total) - if err != nil { - return nil, err - } - err = enc.Encode(t.Freq) - if err != nil { - return nil, err - } - return b.Bytes(), nil -} - -func (t *trie) UnmarshalBinary(data []byte) error { - b := bytes.NewBuffer(data) - dec := gob.NewDecoder(b) - err := dec.Decode(&t.Total) - if err != nil { - return err - } - err = dec.Decode(&t.Freq) - if err != nil { - return err - } - return nil -} - func (t *trie) load(dictFileName string) error { dictFilePath, err := DictPath(dictFileName) if err != nil { From 2c95c61d337341700f1b58c7594824256653b22c Mon Sep 17 00:00:00 2001 From: Wang Bin Date: Tue, 17 Mar 2015 15:30:13 +0800 Subject: [PATCH 2/5] added jieba tokenizer for bleve --- analyse/tokenizers/jieba.go | 108 + analyse/tokenizers/jieba_test.go | 5229 ++++++++++++++++++++++++++++++ 2 files changed, 5337 insertions(+) create mode 100644 analyse/tokenizers/jieba.go create mode 100644 analyse/tokenizers/jieba_test.go diff --git a/analyse/tokenizers/jieba.go b/analyse/tokenizers/jieba.go new file mode 100644 index 0000000..0bedd55 --- /dev/null +++ b/analyse/tokenizers/jieba.go @@ -0,0 +1,108 @@ +package tokenizers + +import ( + "fmt" + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + "github.com/wangbin/jiebago" + "regexp" + "strconv" +) + +const Name = "jieba" + +var IdeographRegexp = 
regexp.MustCompile(`\p{Han}+`) + +type JiebaTokenizer struct { + dictFileName string + hmm, searchMode bool +} + +func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) { + err := jiebago.SetDictionary(dictFileName) + return &JiebaTokenizer{ + dictFileName: dictFileName, + hmm: hmm, + searchMode: searchMode, + }, err +} + +func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream { + rv := make(analysis.TokenStream, 0) + runeStart := 0 + start := 0 + end := 0 + pos := 1 + var width int + var gram string + for word := range jiebago.Cut(string(input), false, jt.hmm) { + if jt.searchMode { + runes := []rune(word) + width = len(runes) + for _, step := range [2]int{2, 3} { + if width > step { + for i := 0; i < width-step+1; i++ { + gram = string(runes[i : i+step]) + if value, ok := jiebago.Trie.Freq[gram]; ok && value > 0 { + token := analysis.Token{ + Term: []byte(gram), + Start: start, + End: start + len(gram), + Position: pos, + Type: detectTokenType(gram), + } + rv = append(rv, &token) + pos++ + } + } + } + } + } + end = start + len(word) + token := analysis.Token{ + Term: []byte(word), + Start: start, + End: end, + Position: pos, + Type: detectTokenType(word), + } + rv = append(rv, &token) + pos++ + runeStart += width + start = end + } + return rv +} + +func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) ( + analysis.Tokenizer, error) { + dictFileName, ok := config["file"].(string) + if !ok { + return nil, fmt.Errorf("must specify dictionary file path") + } + hmm, ok := config["hmm"].(bool) + if !ok { + hmm = true + } + searchMode, ok := config["search"].(bool) + if !ok { + searchMode = true + } + + return NewJiebaTokenizer(dictFileName, hmm, searchMode) +} + +func detectTokenType(term string) analysis.TokenType { + if IdeographRegexp.MatchString(term) { + return analysis.Ideographic + } + _, err := strconv.ParseFloat(term, 64) + if err == nil { + return analysis.Numeric + } + 
return analysis.AlphaNumeric +} + +func init() { + registry.RegisterTokenizer(Name, JiebaTokenizerConstructor) +} diff --git a/analyse/tokenizers/jieba_test.go b/analyse/tokenizers/jieba_test.go new file mode 100644 index 0000000..ee4878b --- /dev/null +++ b/analyse/tokenizers/jieba_test.go @@ -0,0 +1,5229 @@ +package tokenizers + +import ( + "github.com/blevesearch/bleve/analysis" + "reflect" + "testing" +) + +func TestJiebaTokenizer(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + { + []byte("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("这是"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("一个"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 30, + Term: []byte("伸手不见五指"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("黑夜"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("。"), + Position: 6, + Type: analysis.AlphaNumeric, + }, + { + Start: 42, + End: 45, + Term: []byte("我"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 48, + Term: []byte("叫"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 57, + Term: []byte("孙悟空"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 57, + End: 60, + Term: []byte(","), + Position: 10, + Type: analysis.AlphaNumeric, + }, + { + Start: 60, + End: 63, + Term: []byte("我"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 63, + End: 66, + Term: []byte("爱"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 66, + End: 72, + Term: []byte("北京"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 72, + End: 75, + Term: []byte(","), + Position: 14, + Type: 
analysis.AlphaNumeric, + }, + { + Start: 75, + End: 78, + Term: []byte("我"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 78, + End: 81, + Term: []byte("爱"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 81, + End: 87, + Term: []byte("Python"), + Position: 17, + Type: analysis.AlphaNumeric, + }, + { + Start: 87, + End: 90, + Term: []byte("和"), + Position: 18, + Type: analysis.Ideographic, + }, + { + Start: 90, + End: 93, + Term: []byte("C++"), + Position: 19, + Type: analysis.AlphaNumeric, + }, + { + Start: 93, + End: 96, + Term: []byte("。"), + Position: 20, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("我不喜欢日本和服。"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("不"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("喜欢"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("日本"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("和服"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("。"), + Position: 6, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("雷猴回归人间。"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("雷猴"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("回归"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("人间"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("。"), + Position: 4, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("工信处"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("女干事"), + Position: 
2, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("每月"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("经过"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("下属"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 42, + Term: []byte("科室"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("都"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 48, + Term: []byte("要"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 54, + Term: []byte("亲口"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 54, + End: 60, + Term: []byte("交代"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 60, + End: 62, + Term: []byte("24"), + Position: 11, + Type: analysis.Numeric, + }, + { + Start: 62, + End: 65, + Term: []byte("口"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 65, + End: 74, + Term: []byte("交换机"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 74, + End: 77, + Term: []byte("等"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 77, + End: 86, + Term: []byte("技术性"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 86, + End: 92, + Term: []byte("器件"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 92, + End: 95, + Term: []byte("的"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 95, + End: 101, + Term: []byte("安装"), + Position: 18, + Type: analysis.Ideographic, + }, + { + Start: 101, + End: 107, + Term: []byte("工作"), + Position: 19, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我需要廉租房"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("需要"), + Position: 2, + Type: analysis.Ideographic, + }, + 
{ + Start: 9, + End: 18, + Term: []byte("廉租房"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("永和服装饰品有限公司"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("永和"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("服装"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("饰品"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 30, + Term: []byte("有限公司"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我爱北京天安门"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("爱"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("北京"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 21, + Term: []byte("天安门"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("abc"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("abc"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("隐马尔可夫"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("隐"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 15, + Term: []byte("马尔可夫"), + Position: 2, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("雷猴是个好网站"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("雷猴"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("是"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("个"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("好"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("网站"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + 
[]byte("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("“"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 3, + End: 12, + Term: []byte("Microsoft"), + Position: 2, + Type: analysis.AlphaNumeric, + }, + { + Start: 12, + End: 15, + Term: []byte("”"), + Position: 3, + Type: analysis.AlphaNumeric, + }, + { + Start: 15, + End: 21, + Term: []byte("一词"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("由"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("“"), + Position: 6, + Type: analysis.AlphaNumeric, + }, + { + Start: 27, + End: 40, + Term: []byte("MICROcomputer"), + Position: 7, + Type: analysis.AlphaNumeric, + }, + { + Start: 40, + End: 43, + Term: []byte("("), + Position: 8, + Type: analysis.AlphaNumeric, + }, + { + Start: 43, + End: 49, + Term: []byte("微型"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 49, + End: 58, + Term: []byte("计算机"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 58, + End: 61, + Term: []byte(")"), + Position: 11, + Type: analysis.AlphaNumeric, + }, + { + Start: 61, + End: 64, + Term: []byte("”"), + Position: 12, + Type: analysis.AlphaNumeric, + }, + { + Start: 64, + End: 67, + Term: []byte("和"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 67, + End: 70, + Term: []byte("“"), + Position: 14, + Type: analysis.AlphaNumeric, + }, + { + Start: 70, + End: 78, + Term: []byte("SOFTware"), + Position: 15, + Type: analysis.AlphaNumeric, + }, + { + Start: 78, + End: 81, + Term: []byte("("), + Position: 16, + Type: analysis.AlphaNumeric, + }, + { + Start: 81, + End: 87, + Term: []byte("软件"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 87, + End: 90, + Term: []byte(")"), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 90, + End: 93, + Term: []byte("”"), + Position: 19, + Type: 
analysis.AlphaNumeric, + }, + { + Start: 93, + End: 96, + Term: []byte("两"), + Position: 20, + Type: analysis.Ideographic, + }, + { + Start: 96, + End: 102, + Term: []byte("部分"), + Position: 21, + Type: analysis.Ideographic, + }, + { + Start: 102, + End: 108, + Term: []byte("组成"), + Position: 22, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("草泥马和欺实马是今年的流行词汇"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("草泥马"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("和"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("欺实"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("马"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("是"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("今年"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("流行"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 45, + Term: []byte("词汇"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("伊藤洋华堂总府店"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("伊藤"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 15, + Term: []byte("洋华堂"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("总府"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("店"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("中国科学院计算技术研究所"), + analysis.TokenStream{ + { + Start: 0, + End: 36, + Term: []byte("中国科学院计算技术研究所"), + Position: 1, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("罗密欧与朱丽叶"), + analysis.TokenStream{ + { + Start: 0, 
+ End: 9, + Term: []byte("罗密欧"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("与"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 21, + Term: []byte("朱丽叶"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我购买了道具和服装"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("购买"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("道具"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("和"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("服装"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍"), + analysis.TokenStream{ + { + Start: 0, + End: 2, + Term: []byte("PS"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 2, + End: 3, + Term: []byte(":"), + Position: 2, + Type: analysis.AlphaNumeric, + }, + { + Start: 3, + End: 4, + Term: []byte(" "), + Position: 3, + Type: analysis.AlphaNumeric, + }, + { + Start: 4, + End: 7, + Term: []byte("我"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 7, + End: 13, + Term: []byte("觉得"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 19, + Term: []byte("开源"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 19, + End: 22, + Term: []byte("有"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 22, + End: 28, + Term: []byte("一个"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 28, + End: 34, + Term: []byte("好处"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 34, + End: 37, + Term: []byte(","), + Position: 10, + Type: 
analysis.AlphaNumeric, + }, + { + Start: 37, + End: 43, + Term: []byte("就是"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 43, + End: 49, + Term: []byte("能够"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 49, + End: 55, + Term: []byte("敦促"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 55, + End: 61, + Term: []byte("自己"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 61, + End: 73, + Term: []byte("不断改进"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 73, + End: 76, + Term: []byte(","), + Position: 16, + Type: analysis.AlphaNumeric, + }, + { + Start: 76, + End: 82, + Term: []byte("避免"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 82, + End: 88, + Term: []byte("敞帚"), + Position: 18, + Type: analysis.Ideographic, + }, + { + Start: 88, + End: 94, + Term: []byte("自珍"), + Position: 19, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("湖北省石首市"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("湖北省"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("石首市"), + Position: 2, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("湖北省十堰市"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("湖北省"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("十堰市"), + Position: 2, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("总经理完成了这件事情"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("总经理"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("完成"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("这件"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("事情"), + Position: 5, + Type: 
analysis.Ideographic, + }, + }, + }, + { + []byte("电脑修好了"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("电脑"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("修好"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("做好了这件事情就一了百了了"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("做好"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("了"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("这件"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("事情"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("就"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 36, + Term: []byte("一了百了"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("了"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("人们审美的观点是不同的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("人们"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("审美"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("的"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("观点"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("是"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("不同"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我们买了一个美的空调"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: 
[]byte("我们"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("买"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("一个"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("美的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("空调"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("线程初始化时我们要注意"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("线程"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 15, + Term: []byte("初始化"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("时"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("我们"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("要"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("注意"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("一个分子是由好多原子组织成的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("一个"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("分子"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("是"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("由"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("好多"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("原子"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("组织"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 36, + 
End: 39, + Term: []byte("成"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("祝你马到功成"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("祝"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("你"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 18, + Term: []byte("马到功成"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("他掉进了无底洞里"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("他"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("掉"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("进"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("了"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 21, + Term: []byte("无底洞"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("里"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("中国的首都是北京"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("中国"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("的"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("首都"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("北京"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("孙君意"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("孙君意"), + Position: 1, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("外交部发言人马朝旭"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("外交部"), + 
Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("发言人"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("马朝旭"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("领导人会议和第四届东亚峰会"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("领导人"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("会议"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("和"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("第四届"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("东亚"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("峰会"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("在过去的这五年"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("在"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("过去"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("的"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("这"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("五年"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("还需要很长的路要走"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("还"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("需要"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("很长"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("的"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("路"), + Position: 5, + Type: 
analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("要"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("走"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("60周年首都阅兵"), + analysis.TokenStream{ + { + Start: 0, + End: 2, + Term: []byte("60"), + Position: 1, + Type: analysis.Numeric, + }, + { + Start: 2, + End: 8, + Term: []byte("周年"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 8, + End: 14, + Term: []byte("首都"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 14, + End: 20, + Term: []byte("阅兵"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("你好人们审美的观点是不同的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("你好"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("人们"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("审美"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("的"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("观点"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("是"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("不同"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("的"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("买水果然后来世博园"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("买"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("水果"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("然后"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("来"), + Position: 4, + Type: analysis.Ideographic, + }, + { + 
Start: 18, + End: 27, + Term: []byte("世博园"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("买水果然后去世博园"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("买"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("水果"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("然后"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("去"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("世博园"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("但是后来我才知道你是对的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("但是"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("后来"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("我"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("才"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("知道"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("你"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("是"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("对"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("存在即合理"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("存在"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("即"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("合理"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("的的的的的在的的的的就以和和和"), 
+ analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("的"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("的"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("的"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("的"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("在"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("的"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("的"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("就"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte("以"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("和"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("和"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("和"), + Position: 15, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("I love你,不以为耻,反以为rong"), + analysis.TokenStream{ + { + Start: 0, + End: 1, + Term: []byte("I"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 1, + End: 2, + Term: []byte(" "), + Position: 2, + Type: analysis.AlphaNumeric, + }, + { + Start: 2, + End: 6, + Term: []byte("love"), + Position: 3, + Type: analysis.AlphaNumeric, + }, + { + Start: 6, + End: 9, + Term: []byte("你"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 
12, + Term: []byte(","), + Position: 5, + Type: analysis.AlphaNumeric, + }, + { + Start: 12, + End: 24, + Term: []byte("不以为耻"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte(","), + Position: 7, + Type: analysis.AlphaNumeric, + }, + { + Start: 27, + End: 30, + Term: []byte("反"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("以为"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 40, + Term: []byte("rong"), + Position: 10, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("因"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("因"), + Position: 1, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte(""), + analysis.TokenStream{}, + }, + { + []byte("hello你好人们审美的观点是不同的"), + analysis.TokenStream{ + { + Start: 0, + End: 5, + Term: []byte("hello"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 5, + End: 11, + Term: []byte("你好"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 11, + End: 17, + Term: []byte("人们"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 17, + End: 23, + Term: []byte("审美"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 23, + End: 26, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 26, + End: 32, + Term: []byte("观点"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 32, + End: 35, + Term: []byte("是"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 35, + End: 41, + Term: []byte("不同"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 41, + End: 44, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("很好但主要是基于网页形式"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("很"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("好"), + Position: 2, + Type: 
analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("但"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("主要"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("是"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("基于"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("网页"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("形式"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("hello你好人们审美的观点是不同的"), + analysis.TokenStream{ + { + Start: 0, + End: 5, + Term: []byte("hello"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 5, + End: 11, + Term: []byte("你好"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 11, + End: 17, + Term: []byte("人们"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 17, + End: 23, + Term: []byte("审美"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 23, + End: 26, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 26, + End: 32, + Term: []byte("观点"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 32, + End: 35, + Term: []byte("是"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 35, + End: 41, + Term: []byte("不同"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 41, + End: 44, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("为什么我不能拥有想要的生活"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("为什么"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("我"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("不能"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: 
[]byte("拥有"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("想要"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("生活"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("后来我才"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("后来"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("我"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("才"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("此次来中国是为了"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("此次"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("来"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("中国"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("为了"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("使用了它就可以解决一些问题"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("使用"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("了"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("它"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("就"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("可以"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("解决"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("一些"), + Position: 7, + Type: 
analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("问题"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte(",使用了它就可以解决一些问题"), + analysis.TokenStream{ + { + Start: 0, + End: 1, + Term: []byte(","), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 1, + End: 7, + Term: []byte("使用"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 7, + End: 10, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 10, + End: 13, + Term: []byte("它"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 16, + Term: []byte("就"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 16, + End: 22, + Term: []byte("可以"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 22, + End: 28, + Term: []byte("解决"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 28, + End: 34, + Term: []byte("一些"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 34, + End: 40, + Term: []byte("问题"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("其实使用了它就可以解决一些问题"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("其实"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("使用"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("它"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("就"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("可以"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("解决"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("一些"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 45, + Term: []byte("问题"), + 
Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("好人使用了它就可以解决一些问题"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("好人"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("使用"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("它"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("就"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("可以"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("解决"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("一些"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 45, + Term: []byte("问题"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("是因为和国家"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("是因为"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("和"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("国家"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("老年搜索还支持"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("老年"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("搜索"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("还"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("支持"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 "), + analysis.TokenStream{ + { + Start: 0, 
+ End: 6, + Term: []byte("干脆"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("就"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("把"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("那部"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("蒙人"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("闲法"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte("给"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("废"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("了"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 48, + Term: []byte("拉倒"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 51, + Term: []byte("!"), + Position: 12, + Type: analysis.AlphaNumeric, + }, + { + Start: 51, + End: 53, + Term: []byte("RT"), + Position: 13, + Type: analysis.AlphaNumeric, + }, + { + Start: 53, + End: 54, + Term: []byte(" "), + Position: 14, + Type: analysis.AlphaNumeric, + }, + { + Start: 54, + End: 55, + Term: []byte("@"), + Position: 15, + Type: analysis.AlphaNumeric, + }, + { + Start: 55, + End: 67, + Term: []byte("laoshipukong"), + Position: 16, + Type: analysis.AlphaNumeric, + }, + { + Start: 67, + End: 68, + Term: []byte(" "), + Position: 17, + Type: analysis.AlphaNumeric, + }, + { + Start: 68, + End: 69, + Term: []byte(":"), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 69, + End: 70, + Term: []byte(" "), + Position: 19, + Type: analysis.AlphaNumeric, + }, + { + Start: 70, + End: 72, + Term: []byte("27"), + Position: 20, + Type: analysis.Numeric, + }, + { + 
Start: 72, + End: 75, + Term: []byte("日"), + Position: 21, + Type: analysis.Ideographic, + }, + { + Start: 75, + End: 78, + Term: []byte(","), + Position: 22, + Type: analysis.AlphaNumeric, + }, + { + Start: 78, + End: 99, + Term: []byte("全国人大常委会"), + Position: 23, + Type: analysis.Ideographic, + }, + { + Start: 99, + End: 108, + Term: []byte("第三次"), + Position: 24, + Type: analysis.Ideographic, + }, + { + Start: 108, + End: 114, + Term: []byte("审议"), + Position: 25, + Type: analysis.Ideographic, + }, + { + Start: 114, + End: 120, + Term: []byte("侵权"), + Position: 26, + Type: analysis.Ideographic, + }, + { + Start: 120, + End: 129, + Term: []byte("责任法"), + Position: 27, + Type: analysis.Ideographic, + }, + { + Start: 129, + End: 135, + Term: []byte("草案"), + Position: 28, + Type: analysis.Ideographic, + }, + { + Start: 135, + End: 138, + Term: []byte(","), + Position: 29, + Type: analysis.AlphaNumeric, + }, + { + Start: 138, + End: 144, + Term: []byte("删除"), + Position: 30, + Type: analysis.Ideographic, + }, + { + Start: 144, + End: 147, + Term: []byte("了"), + Position: 31, + Type: analysis.Ideographic, + }, + { + Start: 147, + End: 153, + Term: []byte("有关"), + Position: 32, + Type: analysis.Ideographic, + }, + { + Start: 153, + End: 159, + Term: []byte("医疗"), + Position: 33, + Type: analysis.Ideographic, + }, + { + Start: 159, + End: 165, + Term: []byte("损害"), + Position: 34, + Type: analysis.Ideographic, + }, + { + Start: 165, + End: 171, + Term: []byte("责任"), + Position: 35, + Type: analysis.Ideographic, + }, + { + Start: 171, + End: 174, + Term: []byte("“"), + Position: 36, + Type: analysis.AlphaNumeric, + }, + { + Start: 174, + End: 180, + Term: []byte("举证"), + Position: 37, + Type: analysis.Ideographic, + }, + { + Start: 180, + End: 186, + Term: []byte("倒置"), + Position: 38, + Type: analysis.Ideographic, + }, + { + Start: 186, + End: 189, + Term: []byte("”"), + Position: 39, + Type: analysis.AlphaNumeric, + }, + { + Start: 189, + End: 192, + Term: []byte("的"), 
+ Position: 40, + Type: analysis.Ideographic, + }, + { + Start: 192, + End: 198, + Term: []byte("规定"), + Position: 41, + Type: analysis.Ideographic, + }, + { + Start: 198, + End: 201, + Term: []byte("。"), + Position: 42, + Type: analysis.AlphaNumeric, + }, + { + Start: 201, + End: 204, + Term: []byte("在"), + Position: 43, + Type: analysis.Ideographic, + }, + { + Start: 204, + End: 210, + Term: []byte("医患"), + Position: 44, + Type: analysis.Ideographic, + }, + { + Start: 210, + End: 216, + Term: []byte("纠纷"), + Position: 45, + Type: analysis.Ideographic, + }, + { + Start: 216, + End: 222, + Term: []byte("中本"), + Position: 46, + Type: analysis.Ideographic, + }, + { + Start: 222, + End: 225, + Term: []byte("已"), + Position: 47, + Type: analysis.Ideographic, + }, + { + Start: 225, + End: 231, + Term: []byte("处于"), + Position: 48, + Type: analysis.Ideographic, + }, + { + Start: 231, + End: 237, + Term: []byte("弱势"), + Position: 49, + Type: analysis.Ideographic, + }, + { + Start: 237, + End: 243, + Term: []byte("地位"), + Position: 50, + Type: analysis.Ideographic, + }, + { + Start: 243, + End: 246, + Term: []byte("的"), + Position: 51, + Type: analysis.Ideographic, + }, + { + Start: 246, + End: 255, + Term: []byte("消费者"), + Position: 52, + Type: analysis.Ideographic, + }, + { + Start: 255, + End: 261, + Term: []byte("由此"), + Position: 53, + Type: analysis.Ideographic, + }, + { + Start: 261, + End: 264, + Term: []byte("将"), + Position: 54, + Type: analysis.Ideographic, + }, + { + Start: 264, + End: 270, + Term: []byte("陷入"), + Position: 55, + Type: analysis.Ideographic, + }, + { + Start: 270, + End: 282, + Term: []byte("万劫不复"), + Position: 56, + Type: analysis.Ideographic, + }, + { + Start: 282, + End: 285, + Term: []byte("的"), + Position: 57, + Type: analysis.Ideographic, + }, + { + Start: 285, + End: 291, + Term: []byte("境地"), + Position: 58, + Type: analysis.Ideographic, + }, + { + Start: 291, + End: 294, + Term: []byte("。"), + Position: 59, + Type: 
analysis.AlphaNumeric, + }, + { + Start: 294, + End: 295, + Term: []byte(" "), + Position: 60, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("大"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("大"), + Position: 1, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte(""), + analysis.TokenStream{}, + }, + { + []byte("他说的确实在理"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("他"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("说"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("的"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("确实"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("在理"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("长春市长春节讲话"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("长春"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("市长"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("春节"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("讲话"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("结婚的和尚未结婚的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("结婚"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("的"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("和"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("尚未"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("结婚"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, 
+ { + []byte("结合成分子时"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("结合"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("成"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("分子"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("时"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("旅游和服务是最好的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("旅游"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("和"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("服务"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("最好"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("这件事情的确是我的错"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("这件"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("事情"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("的确"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("我"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("错"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("供大家参考指正"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("供"), + Position: 1, + Type: analysis.Ideographic, 
+ }, + { + Start: 3, + End: 9, + Term: []byte("大家"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("参考"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("指正"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("哈尔滨政府公布塌桥原因"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("哈尔滨"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("政府"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("公布"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("塌桥"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("原因"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我在机场入口处"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("在"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("机场"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 21, + Term: []byte("入口处"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("邢永臣摄影报道"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("邢永臣"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("摄影"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("报道"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("BP神经网络如何训练才能在分类时增加区分度?"), + analysis.TokenStream{ + { + Start: 0, + End: 2, + Term: []byte("BP"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 2, + End: 14, + Term: []byte("神经网络"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 14, + End: 20, + Term: []byte("如何"), + 
Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 20, + End: 26, + Term: []byte("训练"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 26, + End: 32, + Term: []byte("才能"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 32, + End: 35, + Term: []byte("在"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 35, + End: 41, + Term: []byte("分类"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 41, + End: 44, + Term: []byte("时"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 44, + End: 50, + Term: []byte("增加"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 50, + End: 59, + Term: []byte("区分度"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 59, + End: 62, + Term: []byte("?"), + Position: 11, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("南京市长江大桥"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("南京市"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 21, + Term: []byte("长江大桥"), + Position: 2, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("应"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("一些"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("使用者"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("的"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("建议"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte(","), + Position: 6, + Type: analysis.AlphaNumeric, + }, + { + Start: 30, + End: 33, + Term: []byte("也"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("为了"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 
39, + End: 45, + Term: []byte("便于"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 51, + Term: []byte("利用"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 59, + Term: []byte("NiuTrans"), + Position: 11, + Type: analysis.AlphaNumeric, + }, + { + Start: 59, + End: 65, + Term: []byte("用于"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 65, + End: 68, + Term: []byte("SMT"), + Position: 13, + Type: analysis.AlphaNumeric, + }, + { + Start: 68, + End: 74, + Term: []byte("研究"), + Position: 14, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("长春市长春药店"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("长春市"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("长春"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("药店"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("邓颖超生前最喜欢的衣服"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("邓颖超"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("生前"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("最"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("喜欢"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("衣服"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("胡锦涛是热爱世界和平的政治局常委"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("胡锦涛"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("是"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("热爱"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, 
+ End: 24, + Term: []byte("世界"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("和平"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 42, + Term: []byte("政治局"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 48, + Term: []byte("常委"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("程序员"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("祝"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("海林"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("和"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 30, + Term: []byte("朱会震"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("是"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte("在"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 42, + Term: []byte("孙健"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 51, + Term: []byte("左面"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 54, + Term: []byte("和"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 54, + End: 60, + Term: []byte("右面"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 60, + End: 61, + Term: []byte(","), + Position: 13, + Type: analysis.AlphaNumeric, + }, + { + Start: 61, + End: 62, + Term: []byte(" "), + Position: 14, + Type: analysis.AlphaNumeric, + }, + { + Start: 62, + End: 68, + 
Term: []byte("范凯"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 68, + End: 71, + Term: []byte("在"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 71, + End: 74, + Term: []byte("最"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 74, + End: 80, + Term: []byte("右面"), + Position: 18, + Type: analysis.Ideographic, + }, + { + Start: 80, + End: 81, + Term: []byte("."), + Position: 19, + Type: analysis.AlphaNumeric, + }, + { + Start: 81, + End: 87, + Term: []byte("再往"), + Position: 20, + Type: analysis.Ideographic, + }, + { + Start: 87, + End: 90, + Term: []byte("左"), + Position: 21, + Type: analysis.Ideographic, + }, + { + Start: 90, + End: 93, + Term: []byte("是"), + Position: 22, + Type: analysis.Ideographic, + }, + { + Start: 93, + End: 102, + Term: []byte("李松洪"), + Position: 23, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("一次性交多少钱"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("一次性"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("交"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("多少"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("钱"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("两块五一套,三块八一斤,四块七一本,五块六一条"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("两块"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("五"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("一套"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte(","), + Position: 4, + Type: analysis.AlphaNumeric, + }, + { + Start: 18, + End: 24, + Term: []byte("三块"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("八"), + Position: 6, + Type: analysis.Ideographic, + }, 
+ { + Start: 27, + End: 33, + Term: []byte("一斤"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte(","), + Position: 8, + Type: analysis.AlphaNumeric, + }, + { + Start: 36, + End: 42, + Term: []byte("四块"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("七"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 51, + Term: []byte("一本"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 54, + Term: []byte(","), + Position: 12, + Type: analysis.AlphaNumeric, + }, + { + Start: 54, + End: 60, + Term: []byte("五块"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 60, + End: 63, + Term: []byte("六"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 63, + End: 69, + Term: []byte("一条"), + Position: 15, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("小和尚留了一个像大和尚一样的和尚头"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("小"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("和尚"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("留"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("了"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("一个"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("像"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("大"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("和尚"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("一样"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("的"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 51, + Term: 
[]byte("和尚头"), + Position: 11, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("是"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 27, + Term: []byte("中华人民共和国"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("公民"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 34, + Term: []byte(";"), + Position: 5, + Type: analysis.AlphaNumeric, + }, + { + Start: 34, + End: 37, + Term: []byte("我"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 37, + End: 43, + Term: []byte("爸爸"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 43, + End: 46, + Term: []byte("是"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 46, + End: 55, + Term: []byte("共和党"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 55, + End: 61, + Term: []byte("党员"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 61, + End: 62, + Term: []byte(";"), + Position: 11, + Type: analysis.AlphaNumeric, + }, + { + Start: 62, + End: 63, + Term: []byte(" "), + Position: 12, + Type: analysis.AlphaNumeric, + }, + { + Start: 63, + End: 69, + Term: []byte("地铁"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 69, + End: 78, + Term: []byte("和平门"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 78, + End: 81, + Term: []byte("站"), + Position: 15, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("张晓梅去人民医院做了个B超然后去买了件T恤"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("张晓梅"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("去"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("人民"), + Position: 3, + Type: 
analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("医院"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("做"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("了"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("个"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 37, + Term: []byte("B超"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 37, + End: 43, + Term: []byte("然后"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 43, + End: 46, + Term: []byte("去"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 46, + End: 49, + Term: []byte("买"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 49, + End: 52, + Term: []byte("了"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 52, + End: 55, + Term: []byte("件"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 55, + End: 59, + Term: []byte("T恤"), + Position: 14, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("AT&T是一件不错的公司,给你发offer了吗?"), + analysis.TokenStream{ + { + Start: 0, + End: 4, + Term: []byte("AT&T"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 4, + End: 7, + Term: []byte("是"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 7, + End: 13, + Term: []byte("一件"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 19, + Term: []byte("不错"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 19, + End: 22, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 22, + End: 28, + Term: []byte("公司"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 28, + End: 31, + Term: []byte(","), + Position: 7, + Type: analysis.AlphaNumeric, + }, + { + Start: 31, + End: 34, + Term: []byte("给"), + Position: 8, + Type: analysis.Ideographic, + }, + { 
+ Start: 34, + End: 37, + Term: []byte("你"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 37, + End: 40, + Term: []byte("发"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 40, + End: 45, + Term: []byte("offer"), + Position: 11, + Type: analysis.AlphaNumeric, + }, + { + Start: 45, + End: 48, + Term: []byte("了"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 51, + Term: []byte("吗"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 54, + Term: []byte("?"), + Position: 14, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("C++和c#是什么关系?11+122=133,是吗?PI=3.14159"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("C++"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 3, + End: 6, + Term: []byte("和"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 8, + Term: []byte("c#"), + Position: 3, + Type: analysis.AlphaNumeric, + }, + { + Start: 8, + End: 11, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 11, + End: 17, + Term: []byte("什么"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 17, + End: 23, + Term: []byte("关系"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 23, + End: 26, + Term: []byte("?"), + Position: 7, + Type: analysis.AlphaNumeric, + }, + { + Start: 26, + End: 28, + Term: []byte("11"), + Position: 8, + Type: analysis.Numeric, + }, + { + Start: 28, + End: 29, + Term: []byte("+"), + Position: 9, + Type: analysis.AlphaNumeric, + }, + { + Start: 29, + End: 32, + Term: []byte("122"), + Position: 10, + Type: analysis.Numeric, + }, + { + Start: 32, + End: 33, + Term: []byte("="), + Position: 11, + Type: analysis.AlphaNumeric, + }, + { + Start: 33, + End: 36, + Term: []byte("133"), + Position: 12, + Type: analysis.Numeric, + }, + { + Start: 36, + End: 39, + Term: []byte(","), + Position: 13, + Type: analysis.AlphaNumeric, + }, + { + Start: 39, + 
End: 42, + Term: []byte("是"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("吗"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 48, + Term: []byte("?"), + Position: 16, + Type: analysis.AlphaNumeric, + }, + { + Start: 48, + End: 50, + Term: []byte("PI"), + Position: 17, + Type: analysis.AlphaNumeric, + }, + { + Start: 50, + End: 51, + Term: []byte("="), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 51, + End: 58, + Term: []byte("3.14159"), + Position: 19, + Type: analysis.Numeric, + }, + }, + }, + { + []byte("你认识那个和主席握手的的哥吗?他开一辆黑色的士。"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("你"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("认识"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("那个"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("和"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("主席"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("握手"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("的哥"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("吗"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("?"), + Position: 10, + Type: analysis.AlphaNumeric, + }, + { + Start: 45, + End: 51, + Term: []byte("他开"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 57, + Term: []byte("一辆"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 57, + End: 63, + Term: []byte("黑色"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 63, + End: 69, + Term: 
[]byte("的士"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 69, + End: 72, + Term: []byte("。"), + Position: 15, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("枪杆子中出政权"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("枪杆子"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("中"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("出"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("政权"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("张三风同学走上了不归路"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("张三风"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("同学"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("走上"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("了"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 33, + Term: []byte("不归路"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。"), + analysis.TokenStream{ + { + Start: 0, + End: 4, + Term: []byte("阿Q"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 4, + End: 10, + Term: []byte("腰间"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 10, + End: 13, + Term: []byte("挂"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 16, + Term: []byte("着"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 16, + End: 21, + Term: []byte("BB机"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("手里"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("拿"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: 
[]byte("着"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 42, + Term: []byte("大哥大"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte(","), + Position: 10, + Type: analysis.AlphaNumeric, + }, + { + Start: 45, + End: 48, + Term: []byte("说"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 51, + Term: []byte(":"), + Position: 12, + Type: analysis.AlphaNumeric, + }, + { + Start: 51, + End: 54, + Term: []byte("我"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 54, + End: 60, + Term: []byte("一般"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 60, + End: 66, + Term: []byte("吃饭"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 66, + End: 69, + Term: []byte("不"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 69, + End: 74, + Term: []byte("AA制"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 74, + End: 77, + Term: []byte("的"), + Position: 18, + Type: analysis.Ideographic, + }, + { + Start: 77, + End: 80, + Term: []byte("。"), + Position: 19, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("在1号店能买到小S和大S八卦的书。"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("在"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 10, + Term: []byte("1号店"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 10, + End: 13, + Term: []byte("能"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 16, + Term: []byte("买"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 16, + End: 19, + Term: []byte("到"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 19, + End: 23, + Term: []byte("小S"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 23, + End: 26, + Term: []byte("和"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 26, + End: 30, + Term: []byte("大S"), + Position: 8, + 
Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("八卦"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("的"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("书"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("。"), + Position: 12, + Type: analysis.AlphaNumeric, + }, + }, + }, + } + + tokenizer, _ := NewJiebaTokenizer("../../dict.txt", true, false) + for _, test := range tests { + actual := tokenizer.Tokenize(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input)) + } + } + +} From a14788addbb7e0d8a591c8d8794bb32c7cc89593 Mon Sep 17 00:00:00 2001 From: Wang Bin Date: Tue, 17 Mar 2015 16:29:09 +0800 Subject: [PATCH 3/5] fixed a bug in tokenizer under search mode, added more tests --- analyse/tokenizers/jieba.go | 6 +- analyse/tokenizers/jieba_test.go | 5840 +++++++++++++++++++++++++++++- 2 files changed, 5843 insertions(+), 3 deletions(-) diff --git a/analyse/tokenizers/jieba.go b/analyse/tokenizers/jieba.go index 0bedd55..1e1547d 100644 --- a/analyse/tokenizers/jieba.go +++ b/analyse/tokenizers/jieba.go @@ -43,11 +43,13 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream { if width > step { for i := 0; i < width-step+1; i++ { gram = string(runes[i : i+step]) + gramLen := len(gram) if value, ok := jiebago.Trie.Freq[gram]; ok && value > 0 { + gramStart := start + len(string(runes[:i])) token := analysis.Token{ Term: []byte(gram), - Start: start, - End: start + len(gram), + Start: gramStart, + End: gramStart + gramLen, Position: pos, Type: detectTokenType(gram), } diff --git a/analyse/tokenizers/jieba_test.go b/analyse/tokenizers/jieba_test.go index ee4878b..81ca918 100644 --- a/analyse/tokenizers/jieba_test.go +++ b/analyse/tokenizers/jieba_test.go @@ -6,7 +6,7 @@ import ( "testing" ) -func
TestJiebaTokenizer(t *testing.T) { +func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) { tests := []struct { input []byte output analysis.TokenStream @@ -5227,3 +5227,5841 @@ func TestJiebaTokenizer(t *testing.T) { } } + +func TestJiebaTokenizerSearchModeWithHMM(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + { + []byte("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("这是"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("一个"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("伸手"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("不见"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("五指"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 30, + Term: []byte("伸手不见五指"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("黑夜"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("。"), + Position: 9, + Type: analysis.AlphaNumeric, + }, + { + Start: 42, + End: 45, + Term: []byte("我"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 48, + Term: []byte("叫"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 57, + Term: []byte("悟空"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 57, + Term: []byte("孙悟空"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 57, + End: 60, + Term: []byte(","), + Position: 14, + Type: analysis.AlphaNumeric, + }, + { + Start: 60, + End: 63, + Term: []byte("我"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 63, + End: 66, + Term: []byte("爱"), + 
Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 66, + End: 72, + Term: []byte("北京"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 72, + End: 75, + Term: []byte(","), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 75, + End: 78, + Term: []byte("我"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 78, + End: 81, + Term: []byte("爱"), + Position: 20, + Type: analysis.Ideographic, + }, + { + Start: 81, + End: 87, + Term: []byte("Python"), + Position: 21, + Type: analysis.AlphaNumeric, + }, + { + Start: 87, + End: 90, + Term: []byte("和"), + Position: 22, + Type: analysis.Ideographic, + }, + { + Start: 90, + End: 93, + Term: []byte("C++"), + Position: 23, + Type: analysis.AlphaNumeric, + }, + { + Start: 93, + End: 96, + Term: []byte("。"), + Position: 24, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("我不喜欢日本和服。"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("不"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("喜欢"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("日本"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("和服"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("。"), + Position: 6, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("雷猴回归人间。"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("雷猴"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("回归"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("人间"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("。"), + Position: 4, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + 
[]byte("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("工信处"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("干事"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("女干事"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("每月"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("经过"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("下属"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 42, + Term: []byte("科室"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("都"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 48, + Term: []byte("要"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 54, + Term: []byte("亲口"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 54, + End: 60, + Term: []byte("交代"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 60, + End: 62, + Term: []byte("24"), + Position: 12, + Type: analysis.Numeric, + }, + { + Start: 62, + End: 65, + Term: []byte("口"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 65, + End: 71, + Term: []byte("交换"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 68, + End: 74, + Term: []byte("换机"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 65, + End: 74, + Term: []byte("交换机"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 74, + End: 77, + Term: []byte("等"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 77, + End: 83, + Term: []byte("技术"), + Position: 18, + Type: analysis.Ideographic, + }, + { + Start: 77, + End: 86, + Term: []byte("技术性"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 86, + End: 
92, + Term: []byte("器件"), + Position: 20, + Type: analysis.Ideographic, + }, + { + Start: 92, + End: 95, + Term: []byte("的"), + Position: 21, + Type: analysis.Ideographic, + }, + { + Start: 95, + End: 101, + Term: []byte("安装"), + Position: 22, + Type: analysis.Ideographic, + }, + { + Start: 101, + End: 107, + Term: []byte("工作"), + Position: 23, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我需要廉租房"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("需要"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("廉租"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("租房"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("廉租房"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("永和服装饰品有限公司"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("永和"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("服装"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("饰品"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("有限"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("公司"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 30, + Term: []byte("有限公司"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我爱北京天安门"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("爱"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("北京"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("天安"), + 
Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 21, + Term: []byte("天安门"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("abc"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("abc"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("隐马尔可夫"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("隐"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("可夫"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 12, + Term: []byte("马尔可"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 15, + Term: []byte("马尔可夫"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("雷猴是个好网站"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("雷猴"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("是"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("个"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("好"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("网站"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("“"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 3, + End: 12, + Term: []byte("Microsoft"), + Position: 2, + Type: analysis.AlphaNumeric, + }, + { + Start: 12, + End: 15, + Term: []byte("”"), + Position: 3, + Type: analysis.AlphaNumeric, + }, + { + Start: 15, + End: 21, + Term: []byte("一词"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("由"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("“"), + Position: 6, + Type: 
analysis.AlphaNumeric, + }, + { + Start: 27, + End: 40, + Term: []byte("MICROcomputer"), + Position: 7, + Type: analysis.AlphaNumeric, + }, + { + Start: 40, + End: 43, + Term: []byte("("), + Position: 8, + Type: analysis.AlphaNumeric, + }, + { + Start: 43, + End: 49, + Term: []byte("微型"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 49, + End: 55, + Term: []byte("计算"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 52, + End: 58, + Term: []byte("算机"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 49, + End: 58, + Term: []byte("计算机"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 58, + End: 61, + Term: []byte(")"), + Position: 13, + Type: analysis.AlphaNumeric, + }, + { + Start: 61, + End: 64, + Term: []byte("”"), + Position: 14, + Type: analysis.AlphaNumeric, + }, + { + Start: 64, + End: 67, + Term: []byte("和"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 67, + End: 70, + Term: []byte("“"), + Position: 16, + Type: analysis.AlphaNumeric, + }, + { + Start: 70, + End: 78, + Term: []byte("SOFTware"), + Position: 17, + Type: analysis.AlphaNumeric, + }, + { + Start: 78, + End: 81, + Term: []byte("("), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 81, + End: 87, + Term: []byte("软件"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 87, + End: 90, + Term: []byte(")"), + Position: 20, + Type: analysis.AlphaNumeric, + }, + { + Start: 90, + End: 93, + Term: []byte("”"), + Position: 21, + Type: analysis.AlphaNumeric, + }, + { + Start: 93, + End: 96, + Term: []byte("两"), + Position: 22, + Type: analysis.Ideographic, + }, + { + Start: 96, + End: 102, + Term: []byte("部分"), + Position: 23, + Type: analysis.Ideographic, + }, + { + Start: 102, + End: 108, + Term: []byte("组成"), + Position: 24, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("草泥马和欺实马是今年的流行词汇"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("草泥马"), + Position: 1, + 
Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("和"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("欺实"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("马"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("是"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("今年"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("流行"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 45, + Term: []byte("词汇"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("伊藤洋华堂总府店"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("伊藤"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 15, + Term: []byte("洋华堂"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("总府"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("店"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("中国科学院计算技术研究所"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("中国"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("科学"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("学院"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("计算"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("技术"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("研究"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 15, + Term: []byte("科学院"), + 
Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 36, + Term: []byte("研究所"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 36, + Term: []byte("中国科学院计算技术研究所"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("罗密欧与朱丽叶"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("罗密欧"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("与"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 21, + Term: []byte("朱丽叶"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我购买了道具和服装"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("购买"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("道具"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("和"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("服装"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍"), + analysis.TokenStream{ + { + Start: 0, + End: 2, + Term: []byte("PS"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 2, + End: 3, + Term: []byte(":"), + Position: 2, + Type: analysis.AlphaNumeric, + }, + { + Start: 3, + End: 4, + Term: []byte(" "), + Position: 3, + Type: analysis.AlphaNumeric, + }, + { + Start: 4, + End: 7, + Term: []byte("我"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 7, + End: 13, + Term: []byte("觉得"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 19, + Term: []byte("开源"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 19, + End: 22, + Term: []byte("有"), + 
Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 22, + End: 28, + Term: []byte("一个"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 28, + End: 34, + Term: []byte("好处"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 34, + End: 37, + Term: []byte(","), + Position: 10, + Type: analysis.AlphaNumeric, + }, + { + Start: 37, + End: 43, + Term: []byte("就是"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 43, + End: 49, + Term: []byte("能够"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 49, + End: 55, + Term: []byte("敦促"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 55, + End: 61, + Term: []byte("自己"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 61, + End: 67, + Term: []byte("不断"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 67, + End: 73, + Term: []byte("改进"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 61, + End: 73, + Term: []byte("不断改进"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 73, + End: 76, + Term: []byte(","), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 76, + End: 82, + Term: []byte("避免"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 82, + End: 88, + Term: []byte("敞帚"), + Position: 20, + Type: analysis.Ideographic, + }, + { + Start: 88, + End: 94, + Term: []byte("自珍"), + Position: 21, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("湖北省石首市"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("湖北"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("湖北省"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("石首"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("石首市"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("湖北省十堰市"), + analysis.TokenStream{ + { + Start: 0, + End: 6, 
+ Term: []byte("湖北"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("湖北省"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("十堰"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("十堰市"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("总经理完成了这件事情"), + analysis.TokenStream{ + { + Start: 3, + End: 9, + Term: []byte("经理"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("总经理"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("完成"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("了"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("这件"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("事情"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("电脑修好了"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("电脑"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("修好"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("做好了这件事情就一了百了了"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("做好"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("了"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("这件"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("事情"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("就"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 36, + Term: []byte("一了百了"), + 
Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("了"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("人们审美的观点是不同的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("人们"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("审美"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("的"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("观点"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("是"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("不同"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我们买了一个美的空调"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("我们"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("买"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("一个"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("美的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("空调"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("线程初始化时我们要注意"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("线程"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("初始"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 15, + Term: []byte("初始化"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("时"), + Position: 4, + Type: 
analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("我们"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("要"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("注意"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("一个分子是由好多原子组织成的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("一个"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("分子"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("是"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("由"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("好多"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("原子"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("组织"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("成"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("祝你马到功成"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("祝"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("你"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 18, + Term: []byte("马到功成"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("他掉进了无底洞里"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("他"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("掉"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("进"), + Position: 3, + Type: analysis.Ideographic, + }, + { + 
Start: 9, + End: 12, + Term: []byte("了"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("无底"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 21, + Term: []byte("无底洞"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("里"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("中国的首都是北京"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("中国"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("的"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("首都"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("北京"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("孙君意"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("孙君意"), + Position: 1, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("外交部发言人马朝旭"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("外交"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("外交部"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("发言"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("发言人"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("马朝旭"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("领导人会议和第四届东亚峰会"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("领导"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("领导人"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("会议"), + Position: 3, + Type: 
analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("和"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("第四"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("四届"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("第四届"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("东亚"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("峰会"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("在过去的这五年"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("在"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("过去"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("的"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("这"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("五年"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("还需要很长的路要走"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("还"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("需要"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("很长"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("的"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("路"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("要"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("走"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("60周年首都阅兵"), + analysis.TokenStream{ + { + 
Start: 0, + End: 2, + Term: []byte("60"), + Position: 1, + Type: analysis.Numeric, + }, + { + Start: 2, + End: 8, + Term: []byte("周年"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 8, + End: 14, + Term: []byte("首都"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 14, + End: 20, + Term: []byte("阅兵"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("你好人们审美的观点是不同的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("你好"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("人们"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("审美"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("的"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("观点"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("是"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("不同"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("的"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("买水果然后来世博园"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("买"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("水果"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("然后"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("来"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("世博"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("博园"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("世博园"), + Position: 7, + Type: analysis.Ideographic, + }, 
+ }, + }, + { + []byte("买水果然后去世博园"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("买"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("水果"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("然后"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("去"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("世博"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("博园"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("世博园"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("但是后来我才知道你是对的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("但是"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("后来"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("我"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("才"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("知道"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("你"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("是"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("对"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("存在即合理"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("存在"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("即"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: 
[]byte("合理"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("的的的的的在的的的的就以和和和"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("的"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("的"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("的"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("的"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("在"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("的"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("的"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("就"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte("以"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("和"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("和"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("和"), + Position: 15, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("I love你,不以为耻,反以为rong"), + analysis.TokenStream{ + { + Start: 0, + End: 1, + Term: []byte("I"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 1, + End: 2, + Term: []byte(" "), + Position: 2, + Type: analysis.AlphaNumeric, + }, + { + Start: 2, + End: 6, + Term: []byte("love"), + Position: 3, + Type: analysis.AlphaNumeric, + }, + { + Start: 6, 
+ End: 9, + Term: []byte("你"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte(","), + Position: 5, + Type: analysis.AlphaNumeric, + }, + { + Start: 12, + End: 18, + Term: []byte("不以"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("以为"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 24, + Term: []byte("不以为耻"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte(","), + Position: 9, + Type: analysis.AlphaNumeric, + }, + { + Start: 27, + End: 30, + Term: []byte("反"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("以为"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 40, + Term: []byte("rong"), + Position: 12, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("因"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("因"), + Position: 1, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte(""), + analysis.TokenStream{}, + }, + { + []byte("hello你好人们审美的观点是不同的"), + analysis.TokenStream{ + { + Start: 0, + End: 5, + Term: []byte("hello"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 5, + End: 11, + Term: []byte("你好"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 11, + End: 17, + Term: []byte("人们"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 17, + End: 23, + Term: []byte("审美"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 23, + End: 26, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 26, + End: 32, + Term: []byte("观点"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 32, + End: 35, + Term: []byte("是"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 35, + End: 41, + Term: []byte("不同"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 41, + End: 44, + Term: 
[]byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("很好但主要是基于网页形式"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("很"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("好"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("但"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("主要"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("是"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("基于"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("网页"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("形式"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("hello你好人们审美的观点是不同的"), + analysis.TokenStream{ + { + Start: 0, + End: 5, + Term: []byte("hello"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 5, + End: 11, + Term: []byte("你好"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 11, + End: 17, + Term: []byte("人们"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 17, + End: 23, + Term: []byte("审美"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 23, + End: 26, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 26, + End: 32, + Term: []byte("观点"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 32, + End: 35, + Term: []byte("是"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 35, + End: 41, + Term: []byte("不同"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 41, + End: 44, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("为什么我不能拥有想要的生活"), + analysis.TokenStream{ + { + Start: 3, + End: 9, + Term: []byte("什么"), + 
Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("为什么"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("我"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("不能"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("拥有"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("想要"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("生活"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("后来我才"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("后来"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("我"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("才"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("此次来中国是为了"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("此次"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("来"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("中国"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("为了"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("使用了它就可以解决一些问题"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("使用"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("了"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("它"), + Position: 3, + Type: analysis.Ideographic, 
+ }, + { + Start: 12, + End: 15, + Term: []byte("就"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("可以"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("解决"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("一些"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("问题"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte(",使用了它就可以解决一些问题"), + analysis.TokenStream{ + { + Start: 0, + End: 1, + Term: []byte(","), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 1, + End: 7, + Term: []byte("使用"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 7, + End: 10, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 10, + End: 13, + Term: []byte("它"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 16, + Term: []byte("就"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 16, + End: 22, + Term: []byte("可以"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 22, + End: 28, + Term: []byte("解决"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 28, + End: 34, + Term: []byte("一些"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 34, + End: 40, + Term: []byte("问题"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("其实使用了它就可以解决一些问题"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("其实"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("使用"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("它"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("就"), + Position: 5, + Type: 
analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("可以"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("解决"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("一些"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 45, + Term: []byte("问题"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("好人使用了它就可以解决一些问题"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("好人"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("使用"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("它"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("就"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("可以"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("解决"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("一些"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 45, + Term: []byte("问题"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("是因为和国家"), + analysis.TokenStream{ + { + Start: 3, + End: 9, + Term: []byte("因为"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("是因为"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("和"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("国家"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("老年搜索还支持"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("老年"), + Position: 1, + Type: analysis.Ideographic, + }, + 
{ + Start: 6, + End: 12, + Term: []byte("搜索"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("还"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("支持"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 "), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("干脆"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("就"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("把"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("那部"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("蒙人"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("闲法"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte("给"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("废"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("了"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 48, + Term: []byte("拉倒"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 51, + Term: []byte("!"), + Position: 12, + Type: analysis.AlphaNumeric, + }, + { + Start: 51, + End: 53, + Term: []byte("RT"), + Position: 13, + Type: analysis.AlphaNumeric, + }, + { + Start: 53, + End: 54, + Term: []byte(" "), + Position: 14, + Type: analysis.AlphaNumeric, + }, + { + Start: 54, + End: 55, + Term: []byte("@"), + Position: 15, + Type: analysis.AlphaNumeric, + }, + { + Start: 55, + End: 67, + Term: 
[]byte("laoshipukong"), + Position: 16, + Type: analysis.AlphaNumeric, + }, + { + Start: 67, + End: 68, + Term: []byte(" "), + Position: 17, + Type: analysis.AlphaNumeric, + }, + { + Start: 68, + End: 69, + Term: []byte(":"), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 69, + End: 70, + Term: []byte(" "), + Position: 19, + Type: analysis.AlphaNumeric, + }, + { + Start: 70, + End: 72, + Term: []byte("27"), + Position: 20, + Type: analysis.Numeric, + }, + { + Start: 72, + End: 75, + Term: []byte("日"), + Position: 21, + Type: analysis.Ideographic, + }, + { + Start: 75, + End: 78, + Term: []byte(","), + Position: 22, + Type: analysis.AlphaNumeric, + }, + { + Start: 78, + End: 84, + Term: []byte("全国"), + Position: 23, + Type: analysis.Ideographic, + }, + { + Start: 81, + End: 87, + Term: []byte("国人"), + Position: 24, + Type: analysis.Ideographic, + }, + { + Start: 84, + End: 90, + Term: []byte("人大"), + Position: 25, + Type: analysis.Ideographic, + }, + { + Start: 90, + End: 96, + Term: []byte("常委"), + Position: 26, + Type: analysis.Ideographic, + }, + { + Start: 93, + End: 99, + Term: []byte("委会"), + Position: 27, + Type: analysis.Ideographic, + }, + { + Start: 90, + End: 99, + Term: []byte("常委会"), + Position: 28, + Type: analysis.Ideographic, + }, + { + Start: 78, + End: 99, + Term: []byte("全国人大常委会"), + Position: 29, + Type: analysis.Ideographic, + }, + { + Start: 99, + End: 105, + Term: []byte("第三"), + Position: 30, + Type: analysis.Ideographic, + }, + { + Start: 102, + End: 108, + Term: []byte("三次"), + Position: 31, + Type: analysis.Ideographic, + }, + { + Start: 99, + End: 108, + Term: []byte("第三次"), + Position: 32, + Type: analysis.Ideographic, + }, + { + Start: 108, + End: 114, + Term: []byte("审议"), + Position: 33, + Type: analysis.Ideographic, + }, + { + Start: 114, + End: 120, + Term: []byte("侵权"), + Position: 34, + Type: analysis.Ideographic, + }, + { + Start: 120, + End: 126, + Term: []byte("责任"), + Position: 35, + Type: 
analysis.Ideographic, + }, + { + Start: 120, + End: 129, + Term: []byte("责任法"), + Position: 36, + Type: analysis.Ideographic, + }, + { + Start: 129, + End: 135, + Term: []byte("草案"), + Position: 37, + Type: analysis.Ideographic, + }, + { + Start: 135, + End: 138, + Term: []byte(","), + Position: 38, + Type: analysis.AlphaNumeric, + }, + { + Start: 138, + End: 144, + Term: []byte("删除"), + Position: 39, + Type: analysis.Ideographic, + }, + { + Start: 144, + End: 147, + Term: []byte("了"), + Position: 40, + Type: analysis.Ideographic, + }, + { + Start: 147, + End: 153, + Term: []byte("有关"), + Position: 41, + Type: analysis.Ideographic, + }, + { + Start: 153, + End: 159, + Term: []byte("医疗"), + Position: 42, + Type: analysis.Ideographic, + }, + { + Start: 159, + End: 165, + Term: []byte("损害"), + Position: 43, + Type: analysis.Ideographic, + }, + { + Start: 165, + End: 171, + Term: []byte("责任"), + Position: 44, + Type: analysis.Ideographic, + }, + { + Start: 171, + End: 174, + Term: []byte("“"), + Position: 45, + Type: analysis.AlphaNumeric, + }, + { + Start: 174, + End: 180, + Term: []byte("举证"), + Position: 46, + Type: analysis.Ideographic, + }, + { + Start: 180, + End: 186, + Term: []byte("倒置"), + Position: 47, + Type: analysis.Ideographic, + }, + { + Start: 186, + End: 189, + Term: []byte("”"), + Position: 48, + Type: analysis.AlphaNumeric, + }, + { + Start: 189, + End: 192, + Term: []byte("的"), + Position: 49, + Type: analysis.Ideographic, + }, + { + Start: 192, + End: 198, + Term: []byte("规定"), + Position: 50, + Type: analysis.Ideographic, + }, + { + Start: 198, + End: 201, + Term: []byte("。"), + Position: 51, + Type: analysis.AlphaNumeric, + }, + { + Start: 201, + End: 204, + Term: []byte("在"), + Position: 52, + Type: analysis.Ideographic, + }, + { + Start: 204, + End: 210, + Term: []byte("医患"), + Position: 53, + Type: analysis.Ideographic, + }, + { + Start: 210, + End: 216, + Term: []byte("纠纷"), + Position: 54, + Type: analysis.Ideographic, + }, + { + Start: 216, 
+ End: 222, + Term: []byte("中本"), + Position: 55, + Type: analysis.Ideographic, + }, + { + Start: 222, + End: 225, + Term: []byte("已"), + Position: 56, + Type: analysis.Ideographic, + }, + { + Start: 225, + End: 231, + Term: []byte("处于"), + Position: 57, + Type: analysis.Ideographic, + }, + { + Start: 231, + End: 237, + Term: []byte("弱势"), + Position: 58, + Type: analysis.Ideographic, + }, + { + Start: 237, + End: 243, + Term: []byte("地位"), + Position: 59, + Type: analysis.Ideographic, + }, + { + Start: 243, + End: 246, + Term: []byte("的"), + Position: 60, + Type: analysis.Ideographic, + }, + { + Start: 246, + End: 252, + Term: []byte("消费"), + Position: 61, + Type: analysis.Ideographic, + }, + { + Start: 246, + End: 255, + Term: []byte("消费者"), + Position: 62, + Type: analysis.Ideographic, + }, + { + Start: 255, + End: 261, + Term: []byte("由此"), + Position: 63, + Type: analysis.Ideographic, + }, + { + Start: 261, + End: 264, + Term: []byte("将"), + Position: 64, + Type: analysis.Ideographic, + }, + { + Start: 264, + End: 270, + Term: []byte("陷入"), + Position: 65, + Type: analysis.Ideographic, + }, + { + Start: 276, + End: 282, + Term: []byte("不复"), + Position: 66, + Type: analysis.Ideographic, + }, + { + Start: 270, + End: 282, + Term: []byte("万劫不复"), + Position: 67, + Type: analysis.Ideographic, + }, + { + Start: 282, + End: 285, + Term: []byte("的"), + Position: 68, + Type: analysis.Ideographic, + }, + { + Start: 285, + End: 291, + Term: []byte("境地"), + Position: 69, + Type: analysis.Ideographic, + }, + { + Start: 291, + End: 294, + Term: []byte("。"), + Position: 70, + Type: analysis.AlphaNumeric, + }, + { + Start: 294, + End: 295, + Term: []byte(" "), + Position: 71, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("大"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("大"), + Position: 1, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte(""), + analysis.TokenStream{}, + }, + { + []byte("他说的确实在理"), + analysis.TokenStream{ + { + 
Start: 0, + End: 3, + Term: []byte("他"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("说"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("的"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("确实"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("在理"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("长春市长春节讲话"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("长春"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("市长"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("春节"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("讲话"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("结婚的和尚未结婚的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("结婚"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("的"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("和"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("尚未"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("结婚"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("结合成分子时"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("结合"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("成"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("分子"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("时"), + 
Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("旅游和服务是最好的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("旅游"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("和"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("服务"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("最好"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("这件事情的确是我的错"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("这件"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("事情"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("的确"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("我"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("错"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("供大家参考指正"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("供"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("大家"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("参考"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("指正"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("哈尔滨政府公布塌桥原因"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + 
Term: []byte("哈尔"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("哈尔滨"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("政府"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("公布"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("塌桥"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("原因"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我在机场入口处"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("在"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("机场"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("入口"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 21, + Term: []byte("入口处"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("邢永臣摄影报道"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("邢永臣"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("摄影"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("报道"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("BP神经网络如何训练才能在分类时增加区分度?"), + analysis.TokenStream{ + { + Start: 0, + End: 2, + Term: []byte("BP"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 2, + End: 8, + Term: []byte("神经"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 8, + End: 14, + Term: []byte("网络"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 2, + End: 11, + Term: []byte("神经网"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 2, + End: 14, + Term: []byte("神经网络"), 
+ Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 14, + End: 20, + Term: []byte("如何"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 20, + End: 26, + Term: []byte("训练"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 26, + End: 32, + Term: []byte("才能"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 32, + End: 35, + Term: []byte("在"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 35, + End: 41, + Term: []byte("分类"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 41, + End: 44, + Term: []byte("时"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 44, + End: 50, + Term: []byte("增加"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 50, + End: 56, + Term: []byte("区分"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 53, + End: 59, + Term: []byte("分度"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 50, + End: 59, + Term: []byte("区分度"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 59, + End: 62, + Term: []byte("?"), + Position: 16, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("南京市长江大桥"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("南京"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("京市"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("南京市"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("长江"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("大桥"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 21, + Term: []byte("长江大桥"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("应"), + Position: 1, + Type: analysis.Ideographic, + }, + { + 
Start: 3, + End: 9, + Term: []byte("一些"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("使用"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("用者"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("使用者"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("建议"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte(","), + Position: 8, + Type: analysis.AlphaNumeric, + }, + { + Start: 30, + End: 33, + Term: []byte("也"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("为了"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 45, + Term: []byte("便于"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 51, + Term: []byte("利用"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 59, + Term: []byte("NiuTrans"), + Position: 13, + Type: analysis.AlphaNumeric, + }, + { + Start: 59, + End: 65, + Term: []byte("用于"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 65, + End: 68, + Term: []byte("SMT"), + Position: 15, + Type: analysis.AlphaNumeric, + }, + { + Start: 68, + End: 74, + Term: []byte("研究"), + Position: 16, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("长春市长春药店"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("长春"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("长春市"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("长春"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("药店"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("邓颖超生前最喜欢的衣服"), 
+ analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("邓颖超"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("生前"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("最"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("喜欢"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("衣服"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("胡锦涛是热爱世界和平的政治局常委"), + analysis.TokenStream{ + { + Start: 3, + End: 9, + Term: []byte("锦涛"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("胡锦涛"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("是"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("热爱"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("世界"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("和平"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("政治"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 42, + Term: []byte("政治局"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 48, + Term: []byte("常委"), + Position: 10, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("程序"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("程序员"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + 
Term: []byte("祝"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("海林"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("和"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 30, + Term: []byte("朱会震"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("是"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte("在"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 42, + Term: []byte("孙健"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("的"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 51, + Term: []byte("左面"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 54, + Term: []byte("和"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 54, + End: 60, + Term: []byte("右面"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 60, + End: 61, + Term: []byte(","), + Position: 14, + Type: analysis.AlphaNumeric, + }, + { + Start: 61, + End: 62, + Term: []byte(" "), + Position: 15, + Type: analysis.AlphaNumeric, + }, + { + Start: 62, + End: 68, + Term: []byte("范凯"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 68, + End: 71, + Term: []byte("在"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 71, + End: 74, + Term: []byte("最"), + Position: 18, + Type: analysis.Ideographic, + }, + { + Start: 74, + End: 80, + Term: []byte("右面"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 80, + End: 81, + Term: []byte("."), + Position: 20, + Type: analysis.AlphaNumeric, + }, + { + Start: 81, + End: 87, + Term: []byte("再往"), + Position: 21, + Type: analysis.Ideographic, + }, + { + Start: 87, + End: 90, + Term: []byte("左"), + Position: 22, + Type: analysis.Ideographic, + }, + { + Start: 90, + End: 
93, + Term: []byte("是"), + Position: 23, + Type: analysis.Ideographic, + }, + { + Start: 93, + End: 102, + Term: []byte("李松洪"), + Position: 24, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("一次性交多少钱"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("一次"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("一次性"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("交"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("多少"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("钱"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("两块五一套,三块八一斤,四块七一本,五块六一条"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("两块"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("五"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("一套"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte(","), + Position: 4, + Type: analysis.AlphaNumeric, + }, + { + Start: 18, + End: 24, + Term: []byte("三块"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("八"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("一斤"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte(","), + Position: 8, + Type: analysis.AlphaNumeric, + }, + { + Start: 36, + End: 42, + Term: []byte("四块"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("七"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 51, + Term: []byte("一本"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 54, + Term: []byte(","), + Position: 12, + Type: analysis.AlphaNumeric, + 
}, + { + Start: 54, + End: 60, + Term: []byte("五块"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 60, + End: 63, + Term: []byte("六"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 63, + End: 69, + Term: []byte("一条"), + Position: 15, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("小和尚留了一个像大和尚一样的和尚头"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("小"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("和尚"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("留"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("了"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("一个"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("像"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("大"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("和尚"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("一样"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("的"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 48, + Term: []byte("和尚"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 51, + Term: []byte("和尚头"), + Position: 12, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("是"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("中华"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("华人"), + 
Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("人民"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("共和"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("共和国"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 27, + Term: []byte("中华人民共和国"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("公民"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 34, + Term: []byte(";"), + Position: 10, + Type: analysis.AlphaNumeric, + }, + { + Start: 34, + End: 37, + Term: []byte("我"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 37, + End: 43, + Term: []byte("爸爸"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 43, + End: 46, + Term: []byte("是"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 46, + End: 52, + Term: []byte("共和"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 46, + End: 55, + Term: []byte("共和党"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 55, + End: 61, + Term: []byte("党员"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 61, + End: 62, + Term: []byte(";"), + Position: 17, + Type: analysis.AlphaNumeric, + }, + { + Start: 62, + End: 63, + Term: []byte(" "), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 63, + End: 69, + Term: []byte("地铁"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 69, + End: 75, + Term: []byte("和平"), + Position: 20, + Type: analysis.Ideographic, + }, + { + Start: 69, + End: 78, + Term: []byte("和平门"), + Position: 21, + Type: analysis.Ideographic, + }, + { + Start: 78, + End: 81, + Term: []byte("站"), + Position: 22, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("张晓梅去人民医院做了个B超然后去买了件T恤"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("张晓梅"), + Position: 
1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("去"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("人民"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("医院"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("做"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("了"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("个"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 37, + Term: []byte("B超"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 37, + End: 43, + Term: []byte("然后"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 43, + End: 46, + Term: []byte("去"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 46, + End: 49, + Term: []byte("买"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 49, + End: 52, + Term: []byte("了"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 52, + End: 55, + Term: []byte("件"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 55, + End: 59, + Term: []byte("T恤"), + Position: 14, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("AT&T是一件不错的公司,给你发offer了吗?"), + analysis.TokenStream{ + { + Start: 0, + End: 4, + Term: []byte("AT&T"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 4, + End: 7, + Term: []byte("是"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 7, + End: 13, + Term: []byte("一件"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 19, + Term: []byte("不错"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 19, + End: 22, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 22, + End: 28, + Term: []byte("公司"), + Position: 6, + Type: 
analysis.Ideographic, + }, + { + Start: 28, + End: 31, + Term: []byte(","), + Position: 7, + Type: analysis.AlphaNumeric, + }, + { + Start: 31, + End: 34, + Term: []byte("给"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 34, + End: 37, + Term: []byte("你"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 37, + End: 40, + Term: []byte("发"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 40, + End: 45, + Term: []byte("offer"), + Position: 11, + Type: analysis.AlphaNumeric, + }, + { + Start: 45, + End: 48, + Term: []byte("了"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 51, + Term: []byte("吗"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 54, + Term: []byte("?"), + Position: 14, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("C++和c#是什么关系?11+122=133,是吗?PI=3.14159"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("C++"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 3, + End: 6, + Term: []byte("和"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 8, + Term: []byte("c#"), + Position: 3, + Type: analysis.AlphaNumeric, + }, + { + Start: 8, + End: 11, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 11, + End: 17, + Term: []byte("什么"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 17, + End: 23, + Term: []byte("关系"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 23, + End: 26, + Term: []byte("?"), + Position: 7, + Type: analysis.AlphaNumeric, + }, + { + Start: 26, + End: 28, + Term: []byte("11"), + Position: 8, + Type: analysis.Numeric, + }, + { + Start: 28, + End: 29, + Term: []byte("+"), + Position: 9, + Type: analysis.AlphaNumeric, + }, + { + Start: 29, + End: 32, + Term: []byte("122"), + Position: 10, + Type: analysis.Numeric, + }, + { + Start: 32, + End: 33, + Term: []byte("="), + Position: 11, + Type: 
analysis.AlphaNumeric, + }, + { + Start: 33, + End: 36, + Term: []byte("133"), + Position: 12, + Type: analysis.Numeric, + }, + { + Start: 36, + End: 39, + Term: []byte(","), + Position: 13, + Type: analysis.AlphaNumeric, + }, + { + Start: 39, + End: 42, + Term: []byte("是"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("吗"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 48, + Term: []byte("?"), + Position: 16, + Type: analysis.AlphaNumeric, + }, + { + Start: 48, + End: 50, + Term: []byte("PI"), + Position: 17, + Type: analysis.AlphaNumeric, + }, + { + Start: 50, + End: 51, + Term: []byte("="), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 51, + End: 58, + Term: []byte("3.14159"), + Position: 19, + Type: analysis.Numeric, + }, + }, + }, + { + []byte("你认识那个和主席握手的的哥吗?他开一辆黑色的士。"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("你"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("认识"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("那个"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("和"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("主席"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("握手"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("的哥"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("吗"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("?"), + Position: 10, + Type: analysis.AlphaNumeric, + }, + { + Start: 45, + End: 51, + Term: []byte("他开"), + Position: 11, + Type: analysis.Ideographic, + 
}, + { + Start: 51, + End: 57, + Term: []byte("一辆"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 57, + End: 63, + Term: []byte("黑色"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 63, + End: 69, + Term: []byte("的士"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 69, + End: 72, + Term: []byte("。"), + Position: 15, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("枪杆子中出政权"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("枪杆"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("杆子"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("枪杆子"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("中"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("出"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("政权"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("张三风同学走上了不归路"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("张三风"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("同学"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("走上"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("了"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("归路"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 33, + Term: []byte("不归路"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。"), + analysis.TokenStream{ + { + Start: 0, + End: 4, + Term: []byte("阿Q"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 4, + End: 10, + Term: []byte("腰间"), + Position: 2, + Type: analysis.Ideographic, + }, + 
{ + Start: 10, + End: 13, + Term: []byte("挂"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 16, + Term: []byte("着"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 16, + End: 21, + Term: []byte("BB机"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("手里"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("拿"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("着"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("大哥"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 42, + Term: []byte("大哥大"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte(","), + Position: 11, + Type: analysis.AlphaNumeric, + }, + { + Start: 45, + End: 48, + Term: []byte("说"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 51, + Term: []byte(":"), + Position: 13, + Type: analysis.AlphaNumeric, + }, + { + Start: 51, + End: 54, + Term: []byte("我"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 54, + End: 60, + Term: []byte("一般"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 60, + End: 66, + Term: []byte("吃饭"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 66, + End: 69, + Term: []byte("不"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 69, + End: 74, + Term: []byte("AA制"), + Position: 18, + Type: analysis.Ideographic, + }, + { + Start: 74, + End: 77, + Term: []byte("的"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 77, + End: 80, + Term: []byte("。"), + Position: 20, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("在1号店能买到小S和大S八卦的书。"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("在"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 10, + 
Term: []byte("1号店"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 10, + End: 13, + Term: []byte("能"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 16, + Term: []byte("买"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 16, + End: 19, + Term: []byte("到"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 19, + End: 23, + Term: []byte("小S"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 23, + End: 26, + Term: []byte("和"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 26, + End: 30, + Term: []byte("大S"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("八卦"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("的"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("书"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("。"), + Position: 12, + Type: analysis.AlphaNumeric, + }, + }, + }, + } + + tokenizer, _ := NewJiebaTokenizer("../../dict.txt", true, true) + for _, test := range tests { + actual := tokenizer.Tokenize(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input)) + } + } + +} From f596ac063da8cc6df0d8ed9fe1fcab73d6939967 Mon Sep 17 00:00:00 2001 From: Wang Bin Date: Tue, 17 Mar 2015 16:34:36 +0800 Subject: [PATCH 4/5] added more tests --- analyse/tokenizers/jieba_test.go | 11450 ++++++++++++++++++++++++++++- 1 file changed, 11449 insertions(+), 1 deletion(-) diff --git a/analyse/tokenizers/jieba_test.go b/analyse/tokenizers/jieba_test.go index 81ca918..ee9949e 100644 --- a/analyse/tokenizers/jieba_test.go +++ b/analyse/tokenizers/jieba_test.go @@ -11063,5 +11063,11453 @@ func TestJiebaTokenizerSearchModeWithHMM(t *testing.T) { t.Errorf("Expected %v, got %v for %s", test.output, actual, 
string(test.input)) } } - +} + +func TestJiebaTokenizerDefaultModeWithoutHMM(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + { + []byte("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("这"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("是"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("一个"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 30, + Term: []byte("伸手不见五指"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("黑夜"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("。"), + Position: 7, + Type: analysis.AlphaNumeric, + }, + { + Start: 42, + End: 45, + Term: []byte("我"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 48, + Term: []byte("叫"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 57, + Term: []byte("孙悟空"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 57, + End: 60, + Term: []byte(","), + Position: 11, + Type: analysis.AlphaNumeric, + }, + { + Start: 60, + End: 63, + Term: []byte("我"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 63, + End: 66, + Term: []byte("爱"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 66, + End: 72, + Term: []byte("北京"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 72, + End: 75, + Term: []byte(","), + Position: 15, + Type: analysis.AlphaNumeric, + }, + { + Start: 75, + End: 78, + Term: []byte("我"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 78, + End: 81, + Term: []byte("爱"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 81, + End: 87, + Term: []byte("Python"), + 
Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 87, + End: 90, + Term: []byte("和"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 90, + End: 93, + Term: []byte("C++"), + Position: 20, + Type: analysis.AlphaNumeric, + }, + { + Start: 93, + End: 96, + Term: []byte("。"), + Position: 21, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("我不喜欢日本和服。"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("不"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("喜欢"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("日本"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("和服"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("。"), + Position: 6, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("雷猴回归人间。"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("雷猴"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("回归"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("人间"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("。"), + Position: 4, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("工信处"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("女干事"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("每月"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("经过"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: 
[]byte("下属"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 42, + Term: []byte("科室"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("都"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 48, + Term: []byte("要"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 54, + Term: []byte("亲口"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 54, + End: 60, + Term: []byte("交代"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 60, + End: 62, + Term: []byte("24"), + Position: 11, + Type: analysis.Numeric, + }, + { + Start: 62, + End: 65, + Term: []byte("口"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 65, + End: 74, + Term: []byte("交换机"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 74, + End: 77, + Term: []byte("等"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 77, + End: 86, + Term: []byte("技术性"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 86, + End: 92, + Term: []byte("器件"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 92, + End: 95, + Term: []byte("的"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 95, + End: 101, + Term: []byte("安装"), + Position: 18, + Type: analysis.Ideographic, + }, + { + Start: 101, + End: 107, + Term: []byte("工作"), + Position: 19, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我需要廉租房"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("需要"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("廉租房"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("永和服装饰品有限公司"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("永和"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, 
+ End: 12, + Term: []byte("服装"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("饰品"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 30, + Term: []byte("有限公司"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我爱北京天安门"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("爱"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("北京"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 21, + Term: []byte("天安门"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("abc"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("abc"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("隐马尔可夫"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("隐"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 15, + Term: []byte("马尔可夫"), + Position: 2, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("雷猴是个好网站"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("雷猴"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("是"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("个"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("好"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("网站"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("“"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 3, + End: 12, + Term: []byte("Microsoft"), + Position: 2, + Type: 
analysis.AlphaNumeric, + }, + { + Start: 12, + End: 15, + Term: []byte("”"), + Position: 3, + Type: analysis.AlphaNumeric, + }, + { + Start: 15, + End: 18, + Term: []byte("一"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("词"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("由"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("“"), + Position: 7, + Type: analysis.AlphaNumeric, + }, + { + Start: 27, + End: 40, + Term: []byte("MICROcomputer"), + Position: 8, + Type: analysis.AlphaNumeric, + }, + { + Start: 40, + End: 43, + Term: []byte("("), + Position: 9, + Type: analysis.AlphaNumeric, + }, + { + Start: 43, + End: 49, + Term: []byte("微型"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 49, + End: 58, + Term: []byte("计算机"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 58, + End: 61, + Term: []byte(")"), + Position: 12, + Type: analysis.AlphaNumeric, + }, + { + Start: 61, + End: 64, + Term: []byte("”"), + Position: 13, + Type: analysis.AlphaNumeric, + }, + { + Start: 64, + End: 67, + Term: []byte("和"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 67, + End: 70, + Term: []byte("“"), + Position: 15, + Type: analysis.AlphaNumeric, + }, + { + Start: 70, + End: 78, + Term: []byte("SOFTware"), + Position: 16, + Type: analysis.AlphaNumeric, + }, + { + Start: 78, + End: 81, + Term: []byte("("), + Position: 17, + Type: analysis.AlphaNumeric, + }, + { + Start: 81, + End: 87, + Term: []byte("软件"), + Position: 18, + Type: analysis.Ideographic, + }, + { + Start: 87, + End: 90, + Term: []byte(")"), + Position: 19, + Type: analysis.AlphaNumeric, + }, + { + Start: 90, + End: 93, + Term: []byte("”"), + Position: 20, + Type: analysis.AlphaNumeric, + }, + { + Start: 93, + End: 96, + Term: []byte("两"), + Position: 21, + Type: analysis.Ideographic, + }, + { + Start: 96, + End: 102, + Term: 
[]byte("部分"), + Position: 22, + Type: analysis.Ideographic, + }, + { + Start: 102, + End: 108, + Term: []byte("组成"), + Position: 23, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("草泥马和欺实马是今年的流行词汇"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("草泥马"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("和"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("欺"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("实"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("马"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("是"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("今年"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("流行"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 45, + Term: []byte("词汇"), + Position: 10, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("伊藤洋华堂总府店"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("伊"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("藤"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 15, + Term: []byte("洋华堂"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("总府"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("店"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("中国科学院计算技术研究所"), + analysis.TokenStream{ + { + Start: 0, + End: 36, + Term: []byte("中国科学院计算技术研究所"), + Position: 1, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("罗密欧与朱丽叶"), + 
analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("罗密欧"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("与"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 21, + Term: []byte("朱丽叶"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我购买了道具和服装"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("购买"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("道具"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("和"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("服装"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍"), + analysis.TokenStream{ + { + Start: 0, + End: 2, + Term: []byte("PS"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 2, + End: 3, + Term: []byte(":"), + Position: 2, + Type: analysis.AlphaNumeric, + }, + { + Start: 3, + End: 4, + Term: []byte(" "), + Position: 3, + Type: analysis.AlphaNumeric, + }, + { + Start: 4, + End: 7, + Term: []byte("我"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 7, + End: 13, + Term: []byte("觉得"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 19, + Term: []byte("开源"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 19, + End: 22, + Term: []byte("有"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 22, + End: 28, + Term: []byte("一个"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 28, + End: 34, + Term: []byte("好处"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 34, + End: 37, + Term: []byte(","), + 
Position: 10, + Type: analysis.AlphaNumeric, + }, + { + Start: 37, + End: 43, + Term: []byte("就是"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 43, + End: 49, + Term: []byte("能够"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 49, + End: 55, + Term: []byte("敦促"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 55, + End: 61, + Term: []byte("自己"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 61, + End: 73, + Term: []byte("不断改进"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 73, + End: 76, + Term: []byte(","), + Position: 16, + Type: analysis.AlphaNumeric, + }, + { + Start: 76, + End: 82, + Term: []byte("避免"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 82, + End: 85, + Term: []byte("敞"), + Position: 18, + Type: analysis.Ideographic, + }, + { + Start: 85, + End: 88, + Term: []byte("帚"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 88, + End: 94, + Term: []byte("自珍"), + Position: 20, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("湖北省石首市"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("湖北省"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("石首市"), + Position: 2, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("湖北省十堰市"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("湖北省"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("十堰市"), + Position: 2, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("总经理完成了这件事情"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("总经理"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("完成"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("这件"), + Position: 4, + 
Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("事情"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("电脑修好了"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("电脑"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("修好"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("做好了这件事情就一了百了了"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("做好"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("了"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("这件"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("事情"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("就"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 36, + Term: []byte("一了百了"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("了"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("人们审美的观点是不同的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("人们"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("审美"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("的"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("观点"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("是"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("不同"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + 
}, + }, + }, + { + []byte("我们买了一个美的空调"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("我们"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("买"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("一个"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("美的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("空调"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("线程初始化时我们要注意"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("线程"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 15, + Term: []byte("初始化"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("时"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("我们"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("要"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("注意"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("一个分子是由好多原子组织成的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("一个"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("分子"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("是"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("由"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("好多"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("原子"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + 
End: 36, + Term: []byte("组织"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("成"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("祝你马到功成"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("祝"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("你"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 18, + Term: []byte("马到功成"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("他掉进了无底洞里"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("他"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("掉"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("进"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("了"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 21, + Term: []byte("无底洞"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("里"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("中国的首都是北京"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("中国"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("的"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("首都"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("北京"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("孙君意"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("孙"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 
3, + End: 6, + Term: []byte("君"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("意"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("外交部发言人马朝旭"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("外交部"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("发言人"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("马朝旭"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("领导人会议和第四届东亚峰会"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("领导人"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("会议"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("和"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("第四届"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("东亚"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("峰会"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("在过去的这五年"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("在"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("过去"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("的"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("这"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("五年"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("还需要很长的路要走"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("还"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("需要"), + Position: 2, + Type: 
analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("很"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("长"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("路"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("要"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("走"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("60周年首都阅兵"), + analysis.TokenStream{ + { + Start: 0, + End: 2, + Term: []byte("60"), + Position: 1, + Type: analysis.Numeric, + }, + { + Start: 2, + End: 8, + Term: []byte("周年"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 8, + End: 14, + Term: []byte("首都"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 14, + End: 20, + Term: []byte("阅兵"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("你好人们审美的观点是不同的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("你好"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("人们"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("审美"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("的"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("观点"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("是"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("不同"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("的"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("买水果然后来世博园"), + analysis.TokenStream{ + { + 
Start: 0, + End: 3, + Term: []byte("买"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("水果"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("然后"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("来"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("世博园"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("买水果然后去世博园"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("买"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("水果"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("然后"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("去"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("世博园"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("但是后来我才知道你是对的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("但是"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("后来"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("我"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("才"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("知道"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("你"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("是"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("对"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + 
}, + }, + { + []byte("存在即合理"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("存在"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("即"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("合理"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("的的的的的在的的的的就以和和和"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("的"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("的"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("的"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("的"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("在"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("的"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("的"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("就"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte("以"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("和"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("和"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("和"), + Position: 15, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("I love你,不以为耻,反以为rong"), + analysis.TokenStream{ + { + Start: 0, + End: 1, + 
Term: []byte("I"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 1, + End: 2, + Term: []byte(" "), + Position: 2, + Type: analysis.AlphaNumeric, + }, + { + Start: 2, + End: 6, + Term: []byte("love"), + Position: 3, + Type: analysis.AlphaNumeric, + }, + { + Start: 6, + End: 9, + Term: []byte("你"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte(","), + Position: 5, + Type: analysis.AlphaNumeric, + }, + { + Start: 12, + End: 24, + Term: []byte("不以为耻"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte(","), + Position: 7, + Type: analysis.AlphaNumeric, + }, + { + Start: 27, + End: 30, + Term: []byte("反"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("以为"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 40, + Term: []byte("rong"), + Position: 10, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("因"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("因"), + Position: 1, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte(""), + analysis.TokenStream{}, + }, + { + []byte("hello你好人们审美的观点是不同的"), + analysis.TokenStream{ + { + Start: 0, + End: 5, + Term: []byte("hello"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 5, + End: 11, + Term: []byte("你好"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 11, + End: 17, + Term: []byte("人们"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 17, + End: 23, + Term: []byte("审美"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 23, + End: 26, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 26, + End: 32, + Term: []byte("观点"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 32, + End: 35, + Term: []byte("是"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 35, + End: 41, + Term: []byte("不同"), + Position: 
8, + Type: analysis.Ideographic, + }, + { + Start: 41, + End: 44, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("很好但主要是基于网页形式"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("很"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("好"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("但"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("主要"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("是"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("基于"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("网页"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("形式"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("hello你好人们审美的观点是不同的"), + analysis.TokenStream{ + { + Start: 0, + End: 5, + Term: []byte("hello"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 5, + End: 11, + Term: []byte("你好"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 11, + End: 17, + Term: []byte("人们"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 17, + End: 23, + Term: []byte("审美"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 23, + End: 26, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 26, + End: 32, + Term: []byte("观点"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 32, + End: 35, + Term: []byte("是"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 35, + End: 41, + Term: []byte("不同"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 41, + End: 44, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("为什么我不能拥有想要的生活"), 
+ analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("为什么"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("我"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("不能"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("拥有"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("想要"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("生活"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("后来我才"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("后来"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("我"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("才"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("此次来中国是为了"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("此次"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("来"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("中国"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("为了"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("使用了它就可以解决一些问题"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("使用"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("了"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("它"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + 
End: 15, + Term: []byte("就"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("可以"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("解决"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("一些"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("问题"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte(",使用了它就可以解决一些问题"), + analysis.TokenStream{ + { + Start: 0, + End: 1, + Term: []byte(","), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 1, + End: 7, + Term: []byte("使用"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 7, + End: 10, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 10, + End: 13, + Term: []byte("它"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 16, + Term: []byte("就"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 16, + End: 22, + Term: []byte("可以"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 22, + End: 28, + Term: []byte("解决"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 28, + End: 34, + Term: []byte("一些"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 34, + End: 40, + Term: []byte("问题"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("其实使用了它就可以解决一些问题"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("其实"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("使用"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("它"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("就"), + Position: 5, + Type: analysis.Ideographic, + }, 
+ { + Start: 21, + End: 27, + Term: []byte("可以"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("解决"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("一些"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 45, + Term: []byte("问题"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("好人使用了它就可以解决一些问题"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("好人"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("使用"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("它"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("就"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("可以"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("解决"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("一些"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 45, + Term: []byte("问题"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("是因为和国家"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("是因为"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("和"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("国家"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("老年搜索还支持"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("老年"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("搜索"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, 
+ Term: []byte("还"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("支持"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 "), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("干脆"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("就"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("把"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("那"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("部"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("蒙"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("人"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("闲"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("法"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte("给"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("废"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("了"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 48, + Term: []byte("拉倒"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 51, + Term: []byte("!"), + Position: 15, + Type: analysis.AlphaNumeric, + }, + { + Start: 51, + End: 53, + Term: []byte("RT"), + Position: 16, + Type: analysis.AlphaNumeric, + }, + { + Start: 53, + End: 54, + Term: []byte(" "), + Position: 17, + Type: 
analysis.AlphaNumeric, + }, + { + Start: 54, + End: 55, + Term: []byte("@"), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 55, + End: 67, + Term: []byte("laoshipukong"), + Position: 19, + Type: analysis.AlphaNumeric, + }, + { + Start: 67, + End: 68, + Term: []byte(" "), + Position: 20, + Type: analysis.AlphaNumeric, + }, + { + Start: 68, + End: 69, + Term: []byte(":"), + Position: 21, + Type: analysis.AlphaNumeric, + }, + { + Start: 69, + End: 70, + Term: []byte(" "), + Position: 22, + Type: analysis.AlphaNumeric, + }, + { + Start: 70, + End: 72, + Term: []byte("27"), + Position: 23, + Type: analysis.Numeric, + }, + { + Start: 72, + End: 75, + Term: []byte("日"), + Position: 24, + Type: analysis.Ideographic, + }, + { + Start: 75, + End: 78, + Term: []byte(","), + Position: 25, + Type: analysis.AlphaNumeric, + }, + { + Start: 78, + End: 99, + Term: []byte("全国人大常委会"), + Position: 26, + Type: analysis.Ideographic, + }, + { + Start: 99, + End: 108, + Term: []byte("第三次"), + Position: 27, + Type: analysis.Ideographic, + }, + { + Start: 108, + End: 114, + Term: []byte("审议"), + Position: 28, + Type: analysis.Ideographic, + }, + { + Start: 114, + End: 120, + Term: []byte("侵权"), + Position: 29, + Type: analysis.Ideographic, + }, + { + Start: 120, + End: 129, + Term: []byte("责任法"), + Position: 30, + Type: analysis.Ideographic, + }, + { + Start: 129, + End: 135, + Term: []byte("草案"), + Position: 31, + Type: analysis.Ideographic, + }, + { + Start: 135, + End: 138, + Term: []byte(","), + Position: 32, + Type: analysis.AlphaNumeric, + }, + { + Start: 138, + End: 144, + Term: []byte("删除"), + Position: 33, + Type: analysis.Ideographic, + }, + { + Start: 144, + End: 147, + Term: []byte("了"), + Position: 34, + Type: analysis.Ideographic, + }, + { + Start: 147, + End: 153, + Term: []byte("有关"), + Position: 35, + Type: analysis.Ideographic, + }, + { + Start: 153, + End: 159, + Term: []byte("医疗"), + Position: 36, + Type: analysis.Ideographic, + }, + { + Start: 159, + 
End: 165, + Term: []byte("损害"), + Position: 37, + Type: analysis.Ideographic, + }, + { + Start: 165, + End: 171, + Term: []byte("责任"), + Position: 38, + Type: analysis.Ideographic, + }, + { + Start: 171, + End: 174, + Term: []byte("“"), + Position: 39, + Type: analysis.AlphaNumeric, + }, + { + Start: 174, + End: 180, + Term: []byte("举证"), + Position: 40, + Type: analysis.Ideographic, + }, + { + Start: 180, + End: 186, + Term: []byte("倒置"), + Position: 41, + Type: analysis.Ideographic, + }, + { + Start: 186, + End: 189, + Term: []byte("”"), + Position: 42, + Type: analysis.AlphaNumeric, + }, + { + Start: 189, + End: 192, + Term: []byte("的"), + Position: 43, + Type: analysis.Ideographic, + }, + { + Start: 192, + End: 198, + Term: []byte("规定"), + Position: 44, + Type: analysis.Ideographic, + }, + { + Start: 198, + End: 201, + Term: []byte("。"), + Position: 45, + Type: analysis.AlphaNumeric, + }, + { + Start: 201, + End: 204, + Term: []byte("在"), + Position: 46, + Type: analysis.Ideographic, + }, + { + Start: 204, + End: 210, + Term: []byte("医患"), + Position: 47, + Type: analysis.Ideographic, + }, + { + Start: 210, + End: 216, + Term: []byte("纠纷"), + Position: 48, + Type: analysis.Ideographic, + }, + { + Start: 216, + End: 219, + Term: []byte("中"), + Position: 49, + Type: analysis.Ideographic, + }, + { + Start: 219, + End: 222, + Term: []byte("本"), + Position: 50, + Type: analysis.Ideographic, + }, + { + Start: 222, + End: 225, + Term: []byte("已"), + Position: 51, + Type: analysis.Ideographic, + }, + { + Start: 225, + End: 231, + Term: []byte("处于"), + Position: 52, + Type: analysis.Ideographic, + }, + { + Start: 231, + End: 237, + Term: []byte("弱势"), + Position: 53, + Type: analysis.Ideographic, + }, + { + Start: 237, + End: 243, + Term: []byte("地位"), + Position: 54, + Type: analysis.Ideographic, + }, + { + Start: 243, + End: 246, + Term: []byte("的"), + Position: 55, + Type: analysis.Ideographic, + }, + { + Start: 246, + End: 255, + Term: []byte("消费者"), + Position: 56, 
+ Type: analysis.Ideographic, + }, + { + Start: 255, + End: 261, + Term: []byte("由此"), + Position: 57, + Type: analysis.Ideographic, + }, + { + Start: 261, + End: 264, + Term: []byte("将"), + Position: 58, + Type: analysis.Ideographic, + }, + { + Start: 264, + End: 270, + Term: []byte("陷入"), + Position: 59, + Type: analysis.Ideographic, + }, + { + Start: 270, + End: 282, + Term: []byte("万劫不复"), + Position: 60, + Type: analysis.Ideographic, + }, + { + Start: 282, + End: 285, + Term: []byte("的"), + Position: 61, + Type: analysis.Ideographic, + }, + { + Start: 285, + End: 291, + Term: []byte("境地"), + Position: 62, + Type: analysis.Ideographic, + }, + { + Start: 291, + End: 294, + Term: []byte("。"), + Position: 63, + Type: analysis.AlphaNumeric, + }, + { + Start: 294, + End: 295, + Term: []byte(" "), + Position: 64, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("大"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("大"), + Position: 1, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte(""), + analysis.TokenStream{}, + }, + { + []byte("他说的确实在理"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("他"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("说"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("的"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("确实"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("在"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("理"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("长春市长春节讲话"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("长春"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("市长"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: 
[]byte("春节"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("讲话"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("结婚的和尚未结婚的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("结婚"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("的"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("和"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("尚未"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("结婚"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("结合成分子时"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("结合"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("成"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("分子"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("时"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("旅游和服务是最好的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("旅游"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("和"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("服务"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("最好"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("这件事情的确是我的错"), + analysis.TokenStream{ + { + Start: 0, 
+ End: 6, + Term: []byte("这件"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("事情"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("的确"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("我"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("错"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("供大家参考指正"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("供"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("大家"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("参考"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("指正"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("哈尔滨政府公布塌桥原因"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("哈尔滨"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("政府"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("公布"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("塌"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("桥"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("原因"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我在机场入口处"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("在"), + 
Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("机场"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 21, + Term: []byte("入口处"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("邢永臣摄影报道"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("邢"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("永"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("臣"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("摄影"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("报道"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("BP神经网络如何训练才能在分类时增加区分度?"), + analysis.TokenStream{ + { + Start: 0, + End: 2, + Term: []byte("BP"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 2, + End: 14, + Term: []byte("神经网络"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 14, + End: 20, + Term: []byte("如何"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 20, + End: 26, + Term: []byte("训练"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 26, + End: 32, + Term: []byte("才能"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 32, + End: 35, + Term: []byte("在"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 35, + End: 41, + Term: []byte("分类"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 41, + End: 44, + Term: []byte("时"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 44, + End: 50, + Term: []byte("增加"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 50, + End: 59, + Term: []byte("区分度"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 59, + End: 62, + Term: []byte("?"), + Position: 11, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + 
[]byte("南京市长江大桥"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("南京市"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 21, + Term: []byte("长江大桥"), + Position: 2, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("应"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("一些"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("使用者"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("的"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("建议"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte(","), + Position: 6, + Type: analysis.AlphaNumeric, + }, + { + Start: 30, + End: 33, + Term: []byte("也"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("为了"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 45, + Term: []byte("便于"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 51, + Term: []byte("利用"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 59, + Term: []byte("NiuTrans"), + Position: 11, + Type: analysis.AlphaNumeric, + }, + { + Start: 59, + End: 65, + Term: []byte("用于"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 65, + End: 68, + Term: []byte("SMT"), + Position: 13, + Type: analysis.AlphaNumeric, + }, + { + Start: 68, + End: 74, + Term: []byte("研究"), + Position: 14, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("长春市长春药店"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("长春市"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("长春"), + Position: 2, + Type: analysis.Ideographic, + }, + { + 
Start: 15, + End: 21, + Term: []byte("药店"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("邓颖超生前最喜欢的衣服"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("邓颖超"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("生前"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("最"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("喜欢"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("衣服"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("胡锦涛是热爱世界和平的政治局常委"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("胡锦涛"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("是"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("热爱"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("世界"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("和平"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 42, + Term: []byte("政治局"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 48, + Term: []byte("常委"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("程序员"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("祝"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("海林"), + Position: 3, + Type: analysis.Ideographic, + 
}, + { + Start: 18, + End: 21, + Term: []byte("和"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("朱"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("会"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("震"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("是"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte("在"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("孙"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("健"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("的"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 51, + Term: []byte("左面"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 54, + Term: []byte("和"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 54, + End: 60, + Term: []byte("右面"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 60, + End: 61, + Term: []byte(","), + Position: 16, + Type: analysis.AlphaNumeric, + }, + { + Start: 61, + End: 62, + Term: []byte(" "), + Position: 17, + Type: analysis.AlphaNumeric, + }, + { + Start: 62, + End: 65, + Term: []byte("范"), + Position: 18, + Type: analysis.Ideographic, + }, + { + Start: 65, + End: 68, + Term: []byte("凯"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 68, + End: 71, + Term: []byte("在"), + Position: 20, + Type: analysis.Ideographic, + }, + { + Start: 71, + End: 74, + Term: []byte("最"), + Position: 21, + Type: analysis.Ideographic, + }, + { + Start: 74, + End: 80, + Term: []byte("右面"), + Position: 22, + Type: analysis.Ideographic, + }, + { + Start: 80, + End: 81, + Term: []byte("."), + Position: 23, + Type: analysis.AlphaNumeric, + 
}, + { + Start: 81, + End: 84, + Term: []byte("再"), + Position: 24, + Type: analysis.Ideographic, + }, + { + Start: 84, + End: 87, + Term: []byte("往"), + Position: 25, + Type: analysis.Ideographic, + }, + { + Start: 87, + End: 90, + Term: []byte("左"), + Position: 26, + Type: analysis.Ideographic, + }, + { + Start: 90, + End: 93, + Term: []byte("是"), + Position: 27, + Type: analysis.Ideographic, + }, + { + Start: 93, + End: 96, + Term: []byte("李"), + Position: 28, + Type: analysis.Ideographic, + }, + { + Start: 96, + End: 99, + Term: []byte("松"), + Position: 29, + Type: analysis.Ideographic, + }, + { + Start: 99, + End: 102, + Term: []byte("洪"), + Position: 30, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("一次性交多少钱"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("一次性"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("交"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("多少"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("钱"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("两块五一套,三块八一斤,四块七一本,五块六一条"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("两块"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("五"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("一套"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte(","), + Position: 4, + Type: analysis.AlphaNumeric, + }, + { + Start: 18, + End: 24, + Term: []byte("三块"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("八"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("一斤"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte(","), + Position: 8, + Type: 
analysis.AlphaNumeric, + }, + { + Start: 36, + End: 42, + Term: []byte("四块"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("七"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 51, + Term: []byte("一本"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 54, + Term: []byte(","), + Position: 12, + Type: analysis.AlphaNumeric, + }, + { + Start: 54, + End: 60, + Term: []byte("五块"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 60, + End: 63, + Term: []byte("六"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 63, + End: 69, + Term: []byte("一条"), + Position: 15, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("小和尚留了一个像大和尚一样的和尚头"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("小"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("和尚"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("留"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("了"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("一个"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("像"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("大"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("和尚"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("一样"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("的"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 51, + Term: []byte("和尚头"), + Position: 11, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站"), + analysis.TokenStream{ + { + Start: 0, + End: 3, 
+ Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("是"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 27, + Term: []byte("中华人民共和国"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("公民"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 34, + Term: []byte(";"), + Position: 5, + Type: analysis.AlphaNumeric, + }, + { + Start: 34, + End: 37, + Term: []byte("我"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 37, + End: 43, + Term: []byte("爸爸"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 43, + End: 46, + Term: []byte("是"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 46, + End: 55, + Term: []byte("共和党"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 55, + End: 61, + Term: []byte("党员"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 61, + End: 62, + Term: []byte(";"), + Position: 11, + Type: analysis.AlphaNumeric, + }, + { + Start: 62, + End: 63, + Term: []byte(" "), + Position: 12, + Type: analysis.AlphaNumeric, + }, + { + Start: 63, + End: 69, + Term: []byte("地铁"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 69, + End: 78, + Term: []byte("和平门"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 78, + End: 81, + Term: []byte("站"), + Position: 15, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("张晓梅去人民医院做了个B超然后去买了件T恤"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("张晓梅"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("去"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("人民"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("医院"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("做"), + 
Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("了"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("个"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 37, + Term: []byte("B超"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 37, + End: 43, + Term: []byte("然后"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 43, + End: 46, + Term: []byte("去"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 46, + End: 49, + Term: []byte("买"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 49, + End: 52, + Term: []byte("了"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 52, + End: 55, + Term: []byte("件"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 55, + End: 59, + Term: []byte("T恤"), + Position: 14, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("AT&T是一件不错的公司,给你发offer了吗?"), + analysis.TokenStream{ + { + Start: 0, + End: 4, + Term: []byte("AT&T"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 4, + End: 7, + Term: []byte("是"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 7, + End: 13, + Term: []byte("一件"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 19, + Term: []byte("不错"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 19, + End: 22, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 22, + End: 28, + Term: []byte("公司"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 28, + End: 31, + Term: []byte(","), + Position: 7, + Type: analysis.AlphaNumeric, + }, + { + Start: 31, + End: 34, + Term: []byte("给"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 34, + End: 37, + Term: []byte("你"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 37, + End: 40, + Term: []byte("发"), + Position: 10, + Type: 
analysis.Ideographic, + }, + { + Start: 40, + End: 45, + Term: []byte("offer"), + Position: 11, + Type: analysis.AlphaNumeric, + }, + { + Start: 45, + End: 48, + Term: []byte("了"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 51, + Term: []byte("吗"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 54, + Term: []byte("?"), + Position: 14, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("C++和c#是什么关系?11+122=133,是吗?PI=3.14159"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("C++"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 3, + End: 6, + Term: []byte("和"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 8, + Term: []byte("c#"), + Position: 3, + Type: analysis.AlphaNumeric, + }, + { + Start: 8, + End: 11, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 11, + End: 17, + Term: []byte("什么"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 17, + End: 23, + Term: []byte("关系"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 23, + End: 26, + Term: []byte("?"), + Position: 7, + Type: analysis.AlphaNumeric, + }, + { + Start: 26, + End: 28, + Term: []byte("11"), + Position: 8, + Type: analysis.Numeric, + }, + { + Start: 28, + End: 29, + Term: []byte("+"), + Position: 9, + Type: analysis.AlphaNumeric, + }, + { + Start: 29, + End: 32, + Term: []byte("122"), + Position: 10, + Type: analysis.Numeric, + }, + { + Start: 32, + End: 33, + Term: []byte("="), + Position: 11, + Type: analysis.AlphaNumeric, + }, + { + Start: 33, + End: 36, + Term: []byte("133"), + Position: 12, + Type: analysis.Numeric, + }, + { + Start: 36, + End: 39, + Term: []byte(","), + Position: 13, + Type: analysis.AlphaNumeric, + }, + { + Start: 39, + End: 42, + Term: []byte("是"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("吗"), + Position: 15, + Type: 
analysis.Ideographic, + }, + { + Start: 45, + End: 48, + Term: []byte("?"), + Position: 16, + Type: analysis.AlphaNumeric, + }, + { + Start: 48, + End: 50, + Term: []byte("PI"), + Position: 17, + Type: analysis.AlphaNumeric, + }, + { + Start: 50, + End: 51, + Term: []byte("="), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 51, + End: 52, + Term: []byte("3"), + Position: 19, + Type: analysis.Numeric, + }, + { + Start: 52, + End: 53, + Term: []byte("."), + Position: 20, + Type: analysis.AlphaNumeric, + }, + { + Start: 53, + End: 58, + Term: []byte("14159"), + Position: 21, + Type: analysis.Numeric, + }, + }, + }, + { + []byte("你认识那个和主席握手的的哥吗?他开一辆黑色的士。"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("你"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("认识"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("那个"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("和"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("主席"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("握手"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("的哥"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("吗"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("?"), + Position: 10, + Type: analysis.AlphaNumeric, + }, + { + Start: 45, + End: 48, + Term: []byte("他"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 51, + Term: []byte("开"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 57, + Term: []byte("一辆"), + Position: 13, + Type: analysis.Ideographic, + }, + 
{ + Start: 57, + End: 63, + Term: []byte("黑色"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 63, + End: 69, + Term: []byte("的士"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 69, + End: 72, + Term: []byte("。"), + Position: 16, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("枪杆子中出政权"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("枪杆子"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("中"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("出"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("政权"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("张三风同学走上了不归路"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("张"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("三"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("风"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("同学"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("走上"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("了"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 33, + Term: []byte("不归路"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。"), + analysis.TokenStream{ + { + Start: 0, + End: 4, + Term: []byte("阿Q"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 4, + End: 10, + Term: []byte("腰间"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 10, + End: 13, + Term: []byte("挂"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 16, + Term: []byte("着"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 
16, + End: 21, + Term: []byte("BB机"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("手里"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("拿"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("着"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 42, + Term: []byte("大哥大"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte(","), + Position: 10, + Type: analysis.AlphaNumeric, + }, + { + Start: 45, + End: 48, + Term: []byte("说"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 51, + Term: []byte(":"), + Position: 12, + Type: analysis.AlphaNumeric, + }, + { + Start: 51, + End: 54, + Term: []byte("我"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 54, + End: 60, + Term: []byte("一般"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 60, + End: 66, + Term: []byte("吃饭"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 66, + End: 69, + Term: []byte("不"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 69, + End: 74, + Term: []byte("AA制"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 74, + End: 77, + Term: []byte("的"), + Position: 18, + Type: analysis.Ideographic, + }, + { + Start: 77, + End: 80, + Term: []byte("。"), + Position: 19, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("在1号店能买到小S和大S八卦的书。"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("在"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 10, + Term: []byte("1号店"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 10, + End: 13, + Term: []byte("能"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 16, + Term: []byte("买"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 16, + End: 19, + Term: 
[]byte("到"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 19, + End: 23, + Term: []byte("小S"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 23, + End: 26, + Term: []byte("和"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 26, + End: 30, + Term: []byte("大S"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("八卦"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("的"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("书"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("。"), + Position: 12, + Type: analysis.AlphaNumeric, + }, + }, + }, + } + + tokenizer, _ := NewJiebaTokenizer("../../dict.txt", false, false) + for _, test := range tests { + actual := tokenizer.Tokenize(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input)) + } + } +} + +func TestJiebaTokenizerSearchModeWithoutHMM(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{{ + []byte("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("这"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("是"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("一个"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("伸手"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("不见"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("五指"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 30, + Term: []byte("伸手不见五指"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + 
Term: []byte("的"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("黑夜"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("。"), + Position: 10, + Type: analysis.AlphaNumeric, + }, + { + Start: 42, + End: 45, + Term: []byte("我"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 48, + Term: []byte("叫"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 57, + Term: []byte("悟空"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 57, + Term: []byte("孙悟空"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 57, + End: 60, + Term: []byte(","), + Position: 15, + Type: analysis.AlphaNumeric, + }, + { + Start: 60, + End: 63, + Term: []byte("我"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 63, + End: 66, + Term: []byte("爱"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 66, + End: 72, + Term: []byte("北京"), + Position: 18, + Type: analysis.Ideographic, + }, + { + Start: 72, + End: 75, + Term: []byte(","), + Position: 19, + Type: analysis.AlphaNumeric, + }, + { + Start: 75, + End: 78, + Term: []byte("我"), + Position: 20, + Type: analysis.Ideographic, + }, + { + Start: 78, + End: 81, + Term: []byte("爱"), + Position: 21, + Type: analysis.Ideographic, + }, + { + Start: 81, + End: 87, + Term: []byte("Python"), + Position: 22, + Type: analysis.AlphaNumeric, + }, + { + Start: 87, + End: 90, + Term: []byte("和"), + Position: 23, + Type: analysis.Ideographic, + }, + { + Start: 90, + End: 93, + Term: []byte("C++"), + Position: 24, + Type: analysis.AlphaNumeric, + }, + { + Start: 93, + End: 96, + Term: []byte("。"), + Position: 25, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("我不喜欢日本和服。"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("不"), + 
Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("喜欢"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("日本"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("和服"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("。"), + Position: 6, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("雷猴回归人间。"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("雷猴"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("回归"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("人间"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("。"), + Position: 4, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("工信处"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("干事"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("女干事"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("每月"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("经过"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("下属"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 42, + Term: []byte("科室"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("都"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 48, + Term: []byte("要"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 54, + Term: []byte("亲口"), + Position: 10, + Type: analysis.Ideographic, + }, + { + 
Start: 54, + End: 60, + Term: []byte("交代"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 60, + End: 62, + Term: []byte("24"), + Position: 12, + Type: analysis.Numeric, + }, + { + Start: 62, + End: 65, + Term: []byte("口"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 65, + End: 71, + Term: []byte("交换"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 68, + End: 74, + Term: []byte("换机"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 65, + End: 74, + Term: []byte("交换机"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 74, + End: 77, + Term: []byte("等"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 77, + End: 83, + Term: []byte("技术"), + Position: 18, + Type: analysis.Ideographic, + }, + { + Start: 77, + End: 86, + Term: []byte("技术性"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 86, + End: 92, + Term: []byte("器件"), + Position: 20, + Type: analysis.Ideographic, + }, + { + Start: 92, + End: 95, + Term: []byte("的"), + Position: 21, + Type: analysis.Ideographic, + }, + { + Start: 95, + End: 101, + Term: []byte("安装"), + Position: 22, + Type: analysis.Ideographic, + }, + { + Start: 101, + End: 107, + Term: []byte("工作"), + Position: 23, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我需要廉租房"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("需要"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("廉租"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("租房"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("廉租房"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("永和服装饰品有限公司"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("永和"), + Position: 1, + Type: 
analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("服装"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("饰品"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("有限"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("公司"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 30, + Term: []byte("有限公司"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我爱北京天安门"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("爱"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("北京"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("天安"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 21, + Term: []byte("天安门"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("abc"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("abc"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("隐马尔可夫"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("隐"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("可夫"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 12, + Term: []byte("马尔可"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 15, + Term: []byte("马尔可夫"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("雷猴是个好网站"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("雷猴"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("是"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("个"), + 
Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("好"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("网站"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("“"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 3, + End: 12, + Term: []byte("Microsoft"), + Position: 2, + Type: analysis.AlphaNumeric, + }, + { + Start: 12, + End: 15, + Term: []byte("”"), + Position: 3, + Type: analysis.AlphaNumeric, + }, + { + Start: 15, + End: 18, + Term: []byte("一"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("词"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("由"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("“"), + Position: 7, + Type: analysis.AlphaNumeric, + }, + { + Start: 27, + End: 40, + Term: []byte("MICROcomputer"), + Position: 8, + Type: analysis.AlphaNumeric, + }, + { + Start: 40, + End: 43, + Term: []byte("("), + Position: 9, + Type: analysis.AlphaNumeric, + }, + { + Start: 43, + End: 49, + Term: []byte("微型"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 49, + End: 55, + Term: []byte("计算"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 52, + End: 58, + Term: []byte("算机"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 49, + End: 58, + Term: []byte("计算机"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 58, + End: 61, + Term: []byte(")"), + Position: 14, + Type: analysis.AlphaNumeric, + }, + { + Start: 61, + End: 64, + Term: []byte("”"), + Position: 15, + Type: analysis.AlphaNumeric, + }, + { + Start: 64, + End: 67, + Term: []byte("和"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 67, + End: 
70, + Term: []byte("“"), + Position: 17, + Type: analysis.AlphaNumeric, + }, + { + Start: 70, + End: 78, + Term: []byte("SOFTware"), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 78, + End: 81, + Term: []byte("("), + Position: 19, + Type: analysis.AlphaNumeric, + }, + { + Start: 81, + End: 87, + Term: []byte("软件"), + Position: 20, + Type: analysis.Ideographic, + }, + { + Start: 87, + End: 90, + Term: []byte(")"), + Position: 21, + Type: analysis.AlphaNumeric, + }, + { + Start: 90, + End: 93, + Term: []byte("”"), + Position: 22, + Type: analysis.AlphaNumeric, + }, + { + Start: 93, + End: 96, + Term: []byte("两"), + Position: 23, + Type: analysis.Ideographic, + }, + { + Start: 96, + End: 102, + Term: []byte("部分"), + Position: 24, + Type: analysis.Ideographic, + }, + { + Start: 102, + End: 108, + Term: []byte("组成"), + Position: 25, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("草泥马和欺实马是今年的流行词汇"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("草泥马"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("和"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("欺"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("实"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("马"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("是"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("今年"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("流行"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 45, + Term: []byte("词汇"), + Position: 10, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("伊藤洋华堂总府店"), + 
analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("伊"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("藤"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 15, + Term: []byte("洋华堂"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("总府"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("店"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("中国科学院计算技术研究所"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("中国"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("科学"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("学院"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("计算"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("技术"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("研究"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 15, + Term: []byte("科学院"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 36, + Term: []byte("研究所"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 36, + Term: []byte("中国科学院计算技术研究所"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("罗密欧与朱丽叶"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("罗密欧"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("与"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 21, + Term: []byte("朱丽叶"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我购买了道具和服装"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, 
+ { + Start: 3, + End: 9, + Term: []byte("购买"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("道具"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("和"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("服装"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍"), + analysis.TokenStream{ + { + Start: 0, + End: 2, + Term: []byte("PS"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 2, + End: 3, + Term: []byte(":"), + Position: 2, + Type: analysis.AlphaNumeric, + }, + { + Start: 3, + End: 4, + Term: []byte(" "), + Position: 3, + Type: analysis.AlphaNumeric, + }, + { + Start: 4, + End: 7, + Term: []byte("我"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 7, + End: 13, + Term: []byte("觉得"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 19, + Term: []byte("开源"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 19, + End: 22, + Term: []byte("有"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 22, + End: 28, + Term: []byte("一个"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 28, + End: 34, + Term: []byte("好处"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 34, + End: 37, + Term: []byte(","), + Position: 10, + Type: analysis.AlphaNumeric, + }, + { + Start: 37, + End: 43, + Term: []byte("就是"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 43, + End: 49, + Term: []byte("能够"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 49, + End: 55, + Term: []byte("敦促"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 55, + End: 61, + Term: []byte("自己"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 61, + End: 
67, + Term: []byte("不断"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 67, + End: 73, + Term: []byte("改进"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 61, + End: 73, + Term: []byte("不断改进"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 73, + End: 76, + Term: []byte(","), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 76, + End: 82, + Term: []byte("避免"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 82, + End: 85, + Term: []byte("敞"), + Position: 20, + Type: analysis.Ideographic, + }, + { + Start: 85, + End: 88, + Term: []byte("帚"), + Position: 21, + Type: analysis.Ideographic, + }, + { + Start: 88, + End: 94, + Term: []byte("自珍"), + Position: 22, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("湖北省石首市"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("湖北"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("湖北省"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("石首"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("石首市"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("湖北省十堰市"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("湖北"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("湖北省"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("十堰"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("十堰市"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("总经理完成了这件事情"), + analysis.TokenStream{ + { + Start: 3, + End: 9, + Term: []byte("经理"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("总经理"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("完成"), 
+ Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("了"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("这件"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("事情"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("电脑修好了"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("电脑"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("修好"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("做好了这件事情就一了百了了"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("做好"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("了"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("这件"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("事情"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("就"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 36, + Term: []byte("一了百了"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("了"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("人们审美的观点是不同的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("人们"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("审美"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("的"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("观点"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("是"), + Position: 5, + Type: 
analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("不同"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我们买了一个美的空调"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("我们"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("买"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("一个"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("美的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("空调"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("线程初始化时我们要注意"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("线程"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("初始"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 15, + Term: []byte("初始化"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("时"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("我们"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("要"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("注意"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("一个分子是由好多原子组织成的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("一个"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("分子"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("是"), + Position: 3, + Type: analysis.Ideographic, + 
}, + { + Start: 15, + End: 18, + Term: []byte("由"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("好多"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("原子"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("组织"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("成"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("祝你马到功成"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("祝"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("你"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 18, + Term: []byte("马到功成"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("他掉进了无底洞里"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("他"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("掉"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("进"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("了"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("无底"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 21, + Term: []byte("无底洞"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("里"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("中国的首都是北京"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("中国"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("的"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: 
[]byte("首都"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("北京"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("孙君意"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("孙"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("君"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("意"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("外交部发言人马朝旭"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("外交"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("外交部"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("发言"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("发言人"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("马朝旭"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("领导人会议和第四届东亚峰会"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("领导"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("领导人"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("会议"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("和"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("第四"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("四届"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("第四届"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("东亚"), + Position: 8, + 
Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("峰会"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("在过去的这五年"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("在"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("过去"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("的"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("这"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("五年"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("还需要很长的路要走"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("还"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("需要"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("很"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("长"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("路"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("要"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("走"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("60周年首都阅兵"), + analysis.TokenStream{ + { + Start: 0, + End: 2, + Term: []byte("60"), + Position: 1, + Type: analysis.Numeric, + }, + { + Start: 2, + End: 8, + Term: []byte("周年"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 8, + End: 14, + Term: []byte("首都"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 14, + End: 20, + Term: []byte("阅兵"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + 
[]byte("你好人们审美的观点是不同的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("你好"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("人们"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("审美"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("的"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("观点"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("是"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("不同"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("的"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("买水果然后来世博园"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("买"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("水果"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("然后"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("来"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("世博"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("博园"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("世博园"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("买水果然后去世博园"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("买"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("水果"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("然后"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: 
[]byte("去"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("世博"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("博园"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("世博园"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("但是后来我才知道你是对的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("但是"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("后来"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("我"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("才"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("知道"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("你"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("是"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("对"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("存在即合理"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("存在"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("即"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("合理"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("的的的的的在的的的的就以和和和"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("的"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("的"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("的"), + Position: 3, + 
Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("的"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("在"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("的"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("的"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("就"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte("以"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("和"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("和"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("和"), + Position: 15, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("I love你,不以为耻,反以为rong"), + analysis.TokenStream{ + { + Start: 0, + End: 1, + Term: []byte("I"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 1, + End: 2, + Term: []byte(" "), + Position: 2, + Type: analysis.AlphaNumeric, + }, + { + Start: 2, + End: 6, + Term: []byte("love"), + Position: 3, + Type: analysis.AlphaNumeric, + }, + { + Start: 6, + End: 9, + Term: []byte("你"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte(","), + Position: 5, + Type: analysis.AlphaNumeric, + }, + { + Start: 12, + End: 18, + Term: []byte("不以"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("以为"), + Position: 7, + Type: analysis.Ideographic, + }, + { + 
Start: 12, + End: 24, + Term: []byte("不以为耻"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte(","), + Position: 9, + Type: analysis.AlphaNumeric, + }, + { + Start: 27, + End: 30, + Term: []byte("反"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("以为"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 40, + Term: []byte("rong"), + Position: 12, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("因"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("因"), + Position: 1, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte(""), + analysis.TokenStream{}, + }, + { + []byte("hello你好人们审美的观点是不同的"), + analysis.TokenStream{ + { + Start: 0, + End: 5, + Term: []byte("hello"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 5, + End: 11, + Term: []byte("你好"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 11, + End: 17, + Term: []byte("人们"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 17, + End: 23, + Term: []byte("审美"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 23, + End: 26, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 26, + End: 32, + Term: []byte("观点"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 32, + End: 35, + Term: []byte("是"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 35, + End: 41, + Term: []byte("不同"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 41, + End: 44, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("很好但主要是基于网页形式"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("很"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("好"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("但"), + Position: 3, + Type: 
analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("主要"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("是"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("基于"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("网页"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("形式"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("hello你好人们审美的观点是不同的"), + analysis.TokenStream{ + { + Start: 0, + End: 5, + Term: []byte("hello"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 5, + End: 11, + Term: []byte("你好"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 11, + End: 17, + Term: []byte("人们"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 17, + End: 23, + Term: []byte("审美"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 23, + End: 26, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 26, + End: 32, + Term: []byte("观点"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 32, + End: 35, + Term: []byte("是"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 35, + End: 41, + Term: []byte("不同"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 41, + End: 44, + Term: []byte("的"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("为什么我不能拥有想要的生活"), + analysis.TokenStream{ + { + Start: 3, + End: 9, + Term: []byte("什么"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("为什么"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("我"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("不能"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: 
[]byte("拥有"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("想要"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("生活"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("后来我才"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("后来"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("我"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("才"), + Position: 3, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("此次来中国是为了"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("此次"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("来"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("中国"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("为了"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("使用了它就可以解决一些问题"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("使用"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("了"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("它"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("就"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("可以"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("解决"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("一些"), + Position: 7, + Type: 
analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("问题"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte(",使用了它就可以解决一些问题"), + analysis.TokenStream{ + { + Start: 0, + End: 1, + Term: []byte(","), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 1, + End: 7, + Term: []byte("使用"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 7, + End: 10, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 10, + End: 13, + Term: []byte("它"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 16, + Term: []byte("就"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 16, + End: 22, + Term: []byte("可以"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 22, + End: 28, + Term: []byte("解决"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 28, + End: 34, + Term: []byte("一些"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 34, + End: 40, + Term: []byte("问题"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("其实使用了它就可以解决一些问题"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("其实"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("使用"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("它"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("就"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("可以"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("解决"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("一些"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 45, + Term: []byte("问题"), + 
Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("好人使用了它就可以解决一些问题"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("好人"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("使用"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("了"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("它"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("就"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("可以"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("解决"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("一些"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 45, + Term: []byte("问题"), + Position: 9, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("是因为和国家"), + analysis.TokenStream{ + { + Start: 3, + End: 9, + Term: []byte("因为"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("是因为"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("和"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("国家"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("老年搜索还支持"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("老年"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("搜索"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("还"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("支持"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 
27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 "), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("干脆"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("就"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("把"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("那"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("部"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("蒙"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("人"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("闲"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("法"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte("给"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("废"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("了"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 48, + Term: []byte("拉倒"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 51, + Term: []byte("!"), + Position: 15, + Type: analysis.AlphaNumeric, + }, + { + Start: 51, + End: 53, + Term: []byte("RT"), + Position: 16, + Type: analysis.AlphaNumeric, + }, + { + Start: 53, + End: 54, + Term: []byte(" "), + Position: 17, + Type: analysis.AlphaNumeric, + }, + { + Start: 54, + End: 55, + Term: []byte("@"), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 55, + End: 67, + Term: []byte("laoshipukong"), + Position: 19, + Type: 
analysis.AlphaNumeric, + }, + { + Start: 67, + End: 68, + Term: []byte(" "), + Position: 20, + Type: analysis.AlphaNumeric, + }, + { + Start: 68, + End: 69, + Term: []byte(":"), + Position: 21, + Type: analysis.AlphaNumeric, + }, + { + Start: 69, + End: 70, + Term: []byte(" "), + Position: 22, + Type: analysis.AlphaNumeric, + }, + { + Start: 70, + End: 72, + Term: []byte("27"), + Position: 23, + Type: analysis.Numeric, + }, + { + Start: 72, + End: 75, + Term: []byte("日"), + Position: 24, + Type: analysis.Ideographic, + }, + { + Start: 75, + End: 78, + Term: []byte(","), + Position: 25, + Type: analysis.AlphaNumeric, + }, + { + Start: 78, + End: 84, + Term: []byte("全国"), + Position: 26, + Type: analysis.Ideographic, + }, + { + Start: 81, + End: 87, + Term: []byte("国人"), + Position: 27, + Type: analysis.Ideographic, + }, + { + Start: 84, + End: 90, + Term: []byte("人大"), + Position: 28, + Type: analysis.Ideographic, + }, + { + Start: 90, + End: 96, + Term: []byte("常委"), + Position: 29, + Type: analysis.Ideographic, + }, + { + Start: 93, + End: 99, + Term: []byte("委会"), + Position: 30, + Type: analysis.Ideographic, + }, + { + Start: 90, + End: 99, + Term: []byte("常委会"), + Position: 31, + Type: analysis.Ideographic, + }, + { + Start: 78, + End: 99, + Term: []byte("全国人大常委会"), + Position: 32, + Type: analysis.Ideographic, + }, + { + Start: 99, + End: 105, + Term: []byte("第三"), + Position: 33, + Type: analysis.Ideographic, + }, + { + Start: 102, + End: 108, + Term: []byte("三次"), + Position: 34, + Type: analysis.Ideographic, + }, + { + Start: 99, + End: 108, + Term: []byte("第三次"), + Position: 35, + Type: analysis.Ideographic, + }, + { + Start: 108, + End: 114, + Term: []byte("审议"), + Position: 36, + Type: analysis.Ideographic, + }, + { + Start: 114, + End: 120, + Term: []byte("侵权"), + Position: 37, + Type: analysis.Ideographic, + }, + { + Start: 120, + End: 126, + Term: []byte("责任"), + Position: 38, + Type: analysis.Ideographic, + }, + { + Start: 120, + End: 129, + Term: 
[]byte("责任法"), + Position: 39, + Type: analysis.Ideographic, + }, + { + Start: 129, + End: 135, + Term: []byte("草案"), + Position: 40, + Type: analysis.Ideographic, + }, + { + Start: 135, + End: 138, + Term: []byte(","), + Position: 41, + Type: analysis.AlphaNumeric, + }, + { + Start: 138, + End: 144, + Term: []byte("删除"), + Position: 42, + Type: analysis.Ideographic, + }, + { + Start: 144, + End: 147, + Term: []byte("了"), + Position: 43, + Type: analysis.Ideographic, + }, + { + Start: 147, + End: 153, + Term: []byte("有关"), + Position: 44, + Type: analysis.Ideographic, + }, + { + Start: 153, + End: 159, + Term: []byte("医疗"), + Position: 45, + Type: analysis.Ideographic, + }, + { + Start: 159, + End: 165, + Term: []byte("损害"), + Position: 46, + Type: analysis.Ideographic, + }, + { + Start: 165, + End: 171, + Term: []byte("责任"), + Position: 47, + Type: analysis.Ideographic, + }, + { + Start: 171, + End: 174, + Term: []byte("“"), + Position: 48, + Type: analysis.AlphaNumeric, + }, + { + Start: 174, + End: 180, + Term: []byte("举证"), + Position: 49, + Type: analysis.Ideographic, + }, + { + Start: 180, + End: 186, + Term: []byte("倒置"), + Position: 50, + Type: analysis.Ideographic, + }, + { + Start: 186, + End: 189, + Term: []byte("”"), + Position: 51, + Type: analysis.AlphaNumeric, + }, + { + Start: 189, + End: 192, + Term: []byte("的"), + Position: 52, + Type: analysis.Ideographic, + }, + { + Start: 192, + End: 198, + Term: []byte("规定"), + Position: 53, + Type: analysis.Ideographic, + }, + { + Start: 198, + End: 201, + Term: []byte("。"), + Position: 54, + Type: analysis.AlphaNumeric, + }, + { + Start: 201, + End: 204, + Term: []byte("在"), + Position: 55, + Type: analysis.Ideographic, + }, + { + Start: 204, + End: 210, + Term: []byte("医患"), + Position: 56, + Type: analysis.Ideographic, + }, + { + Start: 210, + End: 216, + Term: []byte("纠纷"), + Position: 57, + Type: analysis.Ideographic, + }, + { + Start: 216, + End: 219, + Term: []byte("中"), + Position: 58, + Type: 
analysis.Ideographic, + }, + { + Start: 219, + End: 222, + Term: []byte("本"), + Position: 59, + Type: analysis.Ideographic, + }, + { + Start: 222, + End: 225, + Term: []byte("已"), + Position: 60, + Type: analysis.Ideographic, + }, + { + Start: 225, + End: 231, + Term: []byte("处于"), + Position: 61, + Type: analysis.Ideographic, + }, + { + Start: 231, + End: 237, + Term: []byte("弱势"), + Position: 62, + Type: analysis.Ideographic, + }, + { + Start: 237, + End: 243, + Term: []byte("地位"), + Position: 63, + Type: analysis.Ideographic, + }, + { + Start: 243, + End: 246, + Term: []byte("的"), + Position: 64, + Type: analysis.Ideographic, + }, + { + Start: 246, + End: 252, + Term: []byte("消费"), + Position: 65, + Type: analysis.Ideographic, + }, + { + Start: 246, + End: 255, + Term: []byte("消费者"), + Position: 66, + Type: analysis.Ideographic, + }, + { + Start: 255, + End: 261, + Term: []byte("由此"), + Position: 67, + Type: analysis.Ideographic, + }, + { + Start: 261, + End: 264, + Term: []byte("将"), + Position: 68, + Type: analysis.Ideographic, + }, + { + Start: 264, + End: 270, + Term: []byte("陷入"), + Position: 69, + Type: analysis.Ideographic, + }, + { + Start: 276, + End: 282, + Term: []byte("不复"), + Position: 70, + Type: analysis.Ideographic, + }, + { + Start: 270, + End: 282, + Term: []byte("万劫不复"), + Position: 71, + Type: analysis.Ideographic, + }, + { + Start: 282, + End: 285, + Term: []byte("的"), + Position: 72, + Type: analysis.Ideographic, + }, + { + Start: 285, + End: 291, + Term: []byte("境地"), + Position: 73, + Type: analysis.Ideographic, + }, + { + Start: 291, + End: 294, + Term: []byte("。"), + Position: 74, + Type: analysis.AlphaNumeric, + }, + { + Start: 294, + End: 295, + Term: []byte(" "), + Position: 75, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("大"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("大"), + Position: 1, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte(""), + analysis.TokenStream{}, + }, + { + 
[]byte("他说的确实在理"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("他"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("说"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("的"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("确实"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("在"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("理"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("长春市长春节讲话"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("长春"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("市长"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("春节"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("讲话"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("结婚的和尚未结婚的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("结婚"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("的"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("和"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("尚未"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("结婚"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("结合成分子时"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("结合"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("成"), + Position: 2, + Type: analysis.Ideographic, + }, + { + 
Start: 9, + End: 15, + Term: []byte("分子"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("时"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("旅游和服务是最好的"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("旅游"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("和"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("服务"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("最好"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("这件事情的确是我的错"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("这件"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("事情"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("的确"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("我"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("错"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("供大家参考指正"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("供"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("大家"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("参考"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: 
[]byte("指正"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("哈尔滨政府公布塌桥原因"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("哈尔"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("哈尔滨"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("政府"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("公布"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("塌"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("桥"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("原因"), + Position: 7, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我在机场入口处"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("在"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("机场"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("入口"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 21, + Term: []byte("入口处"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("邢永臣摄影报道"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("邢"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("永"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("臣"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("摄影"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("报道"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("BP神经网络如何训练才能在分类时增加区分度?"), + analysis.TokenStream{ + 
{ + Start: 0, + End: 2, + Term: []byte("BP"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 2, + End: 8, + Term: []byte("神经"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 8, + End: 14, + Term: []byte("网络"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 2, + End: 11, + Term: []byte("神经网"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 2, + End: 14, + Term: []byte("神经网络"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 14, + End: 20, + Term: []byte("如何"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 20, + End: 26, + Term: []byte("训练"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 26, + End: 32, + Term: []byte("才能"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 32, + End: 35, + Term: []byte("在"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 35, + End: 41, + Term: []byte("分类"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 41, + End: 44, + Term: []byte("时"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 44, + End: 50, + Term: []byte("增加"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 50, + End: 56, + Term: []byte("区分"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 53, + End: 59, + Term: []byte("分度"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 50, + End: 59, + Term: []byte("区分度"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 59, + End: 62, + Term: []byte("?"), + Position: 16, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("南京市长江大桥"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("南京"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("京市"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("南京市"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: 
[]byte("长江"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("大桥"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 21, + Term: []byte("长江大桥"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("应"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("一些"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("使用"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("用者"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 18, + Term: []byte("使用者"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("的"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("建议"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte(","), + Position: 8, + Type: analysis.AlphaNumeric, + }, + { + Start: 30, + End: 33, + Term: []byte("也"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("为了"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 45, + Term: []byte("便于"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 51, + Term: []byte("利用"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 59, + Term: []byte("NiuTrans"), + Position: 13, + Type: analysis.AlphaNumeric, + }, + { + Start: 59, + End: 65, + Term: []byte("用于"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 65, + End: 68, + Term: []byte("SMT"), + Position: 15, + Type: analysis.AlphaNumeric, + }, + { + Start: 68, + End: 74, + Term: []byte("研究"), + Position: 16, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("长春市长春药店"), + 
analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("长春"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("长春市"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("长春"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("药店"), + Position: 4, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("邓颖超生前最喜欢的衣服"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("邓颖超"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("生前"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("最"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("喜欢"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("衣服"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("胡锦涛是热爱世界和平的政治局常委"), + analysis.TokenStream{ + { + Start: 3, + End: 9, + Term: []byte("锦涛"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("胡锦涛"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("是"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("热爱"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("世界"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("和平"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("政治"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 42, + Term: []byte("政治局"), + 
Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 48, + Term: []byte("常委"), + Position: 10, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("程序"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("程序员"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("祝"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("海林"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("和"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("朱"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("会"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("震"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("是"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte("在"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("孙"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("健"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("的"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 51, + Term: []byte("左面"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 54, + Term: []byte("和"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 54, + End: 60, + Term: []byte("右面"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 60, + End: 61, + Term: []byte(","), + Position: 17, + Type: analysis.AlphaNumeric, + }, + { + Start: 61, + End: 62, + Term: []byte(" "), + Position: 18, + 
Type: analysis.AlphaNumeric, + }, + { + Start: 62, + End: 65, + Term: []byte("范"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 65, + End: 68, + Term: []byte("凯"), + Position: 20, + Type: analysis.Ideographic, + }, + { + Start: 68, + End: 71, + Term: []byte("在"), + Position: 21, + Type: analysis.Ideographic, + }, + { + Start: 71, + End: 74, + Term: []byte("最"), + Position: 22, + Type: analysis.Ideographic, + }, + { + Start: 74, + End: 80, + Term: []byte("右面"), + Position: 23, + Type: analysis.Ideographic, + }, + { + Start: 80, + End: 81, + Term: []byte("."), + Position: 24, + Type: analysis.AlphaNumeric, + }, + { + Start: 81, + End: 84, + Term: []byte("再"), + Position: 25, + Type: analysis.Ideographic, + }, + { + Start: 84, + End: 87, + Term: []byte("往"), + Position: 26, + Type: analysis.Ideographic, + }, + { + Start: 87, + End: 90, + Term: []byte("左"), + Position: 27, + Type: analysis.Ideographic, + }, + { + Start: 90, + End: 93, + Term: []byte("是"), + Position: 28, + Type: analysis.Ideographic, + }, + { + Start: 93, + End: 96, + Term: []byte("李"), + Position: 29, + Type: analysis.Ideographic, + }, + { + Start: 96, + End: 99, + Term: []byte("松"), + Position: 30, + Type: analysis.Ideographic, + }, + { + Start: 99, + End: 102, + Term: []byte("洪"), + Position: 31, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("一次性交多少钱"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("一次"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("一次性"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("交"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("多少"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 21, + Term: []byte("钱"), + Position: 5, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("两块五一套,三块八一斤,四块七一本,五块六一条"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: 
[]byte("两块"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("五"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("一套"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte(","), + Position: 4, + Type: analysis.AlphaNumeric, + }, + { + Start: 18, + End: 24, + Term: []byte("三块"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("八"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("一斤"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 36, + Term: []byte(","), + Position: 8, + Type: analysis.AlphaNumeric, + }, + { + Start: 36, + End: 42, + Term: []byte("四块"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("七"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 51, + Term: []byte("一本"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 54, + Term: []byte(","), + Position: 12, + Type: analysis.AlphaNumeric, + }, + { + Start: 54, + End: 60, + Term: []byte("五块"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 60, + End: 63, + Term: []byte("六"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 63, + End: 69, + Term: []byte("一条"), + Position: 15, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("小和尚留了一个像大和尚一样的和尚头"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("小"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("和尚"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("留"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("了"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("一个"), + Position: 5, + Type: 
analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("像"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("大"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("和尚"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("一样"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("的"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 48, + Term: []byte("和尚"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 51, + Term: []byte("和尚头"), + Position: 12, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("我"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("是"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 12, + Term: []byte("中华"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("华人"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("人民"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("共和"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 27, + Term: []byte("共和国"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 27, + Term: []byte("中华人民共和国"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("公民"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 34, + Term: []byte(";"), + Position: 10, + Type: analysis.AlphaNumeric, + }, + { + Start: 34, + End: 37, + Term: []byte("我"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 37, + End: 43, + Term: []byte("爸爸"), + Position: 12, + Type: 
analysis.Ideographic, + }, + { + Start: 43, + End: 46, + Term: []byte("是"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 46, + End: 52, + Term: []byte("共和"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 46, + End: 55, + Term: []byte("共和党"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 55, + End: 61, + Term: []byte("党员"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 61, + End: 62, + Term: []byte(";"), + Position: 17, + Type: analysis.AlphaNumeric, + }, + { + Start: 62, + End: 63, + Term: []byte(" "), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 63, + End: 69, + Term: []byte("地铁"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 69, + End: 75, + Term: []byte("和平"), + Position: 20, + Type: analysis.Ideographic, + }, + { + Start: 69, + End: 78, + Term: []byte("和平门"), + Position: 21, + Type: analysis.Ideographic, + }, + { + Start: 78, + End: 81, + Term: []byte("站"), + Position: 22, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("张晓梅去人民医院做了个B超然后去买了件T恤"), + analysis.TokenStream{ + { + Start: 0, + End: 9, + Term: []byte("张晓梅"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("去"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 18, + Term: []byte("人民"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("医院"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 27, + Term: []byte("做"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("了"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("个"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 37, + Term: []byte("B超"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 37, + End: 43, + Term: []byte("然后"), + Position: 9, + Type: analysis.Ideographic, 
+ }, + { + Start: 43, + End: 46, + Term: []byte("去"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 46, + End: 49, + Term: []byte("买"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 49, + End: 52, + Term: []byte("了"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 52, + End: 55, + Term: []byte("件"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 55, + End: 59, + Term: []byte("T恤"), + Position: 14, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("AT&T是一件不错的公司,给你发offer了吗?"), + analysis.TokenStream{ + { + Start: 0, + End: 4, + Term: []byte("AT&T"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 4, + End: 7, + Term: []byte("是"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 7, + End: 13, + Term: []byte("一件"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 19, + Term: []byte("不错"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 19, + End: 22, + Term: []byte("的"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 22, + End: 28, + Term: []byte("公司"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 28, + End: 31, + Term: []byte(","), + Position: 7, + Type: analysis.AlphaNumeric, + }, + { + Start: 31, + End: 34, + Term: []byte("给"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 34, + End: 37, + Term: []byte("你"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 37, + End: 40, + Term: []byte("发"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 40, + End: 45, + Term: []byte("offer"), + Position: 11, + Type: analysis.AlphaNumeric, + }, + { + Start: 45, + End: 48, + Term: []byte("了"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 51, + Term: []byte("吗"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 54, + Term: []byte("?"), + Position: 14, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + 
[]byte("C++和c#是什么关系?11+122=133,是吗?PI=3.14159"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("C++"), + Position: 1, + Type: analysis.AlphaNumeric, + }, + { + Start: 3, + End: 6, + Term: []byte("和"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 8, + Term: []byte("c#"), + Position: 3, + Type: analysis.AlphaNumeric, + }, + { + Start: 8, + End: 11, + Term: []byte("是"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 11, + End: 17, + Term: []byte("什么"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 17, + End: 23, + Term: []byte("关系"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 23, + End: 26, + Term: []byte("?"), + Position: 7, + Type: analysis.AlphaNumeric, + }, + { + Start: 26, + End: 28, + Term: []byte("11"), + Position: 8, + Type: analysis.Numeric, + }, + { + Start: 28, + End: 29, + Term: []byte("+"), + Position: 9, + Type: analysis.AlphaNumeric, + }, + { + Start: 29, + End: 32, + Term: []byte("122"), + Position: 10, + Type: analysis.Numeric, + }, + { + Start: 32, + End: 33, + Term: []byte("="), + Position: 11, + Type: analysis.AlphaNumeric, + }, + { + Start: 33, + End: 36, + Term: []byte("133"), + Position: 12, + Type: analysis.Numeric, + }, + { + Start: 36, + End: 39, + Term: []byte(","), + Position: 13, + Type: analysis.AlphaNumeric, + }, + { + Start: 39, + End: 42, + Term: []byte("是"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("吗"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 45, + End: 48, + Term: []byte("?"), + Position: 16, + Type: analysis.AlphaNumeric, + }, + { + Start: 48, + End: 50, + Term: []byte("PI"), + Position: 17, + Type: analysis.AlphaNumeric, + }, + { + Start: 50, + End: 51, + Term: []byte("="), + Position: 18, + Type: analysis.AlphaNumeric, + }, + { + Start: 51, + End: 52, + Term: []byte("3"), + Position: 19, + Type: analysis.Numeric, + }, + { + Start: 52, + End: 53, + Term: 
[]byte("."), + Position: 20, + Type: analysis.AlphaNumeric, + }, + { + Start: 53, + End: 58, + Term: []byte("14159"), + Position: 21, + Type: analysis.Numeric, + }, + }, + }, + { + []byte("你认识那个和主席握手的的哥吗?他开一辆黑色的士。"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("你"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 9, + Term: []byte("认识"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("那个"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 18, + Term: []byte("和"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 18, + End: 24, + Term: []byte("主席"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 30, + Term: []byte("握手"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("的"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("的哥"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("吗"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("?"), + Position: 10, + Type: analysis.AlphaNumeric, + }, + { + Start: 45, + End: 48, + Term: []byte("他"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 51, + Term: []byte("开"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 51, + End: 57, + Term: []byte("一辆"), + Position: 13, + Type: analysis.Ideographic, + }, + { + Start: 57, + End: 63, + Term: []byte("黑色"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 63, + End: 69, + Term: []byte("的士"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 69, + End: 72, + Term: []byte("。"), + Position: 16, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("枪杆子中出政权"), + analysis.TokenStream{ + { + Start: 0, + End: 6, + Term: []byte("枪杆"), + Position: 1, + Type: analysis.Ideographic, + }, + { + 
Start: 3, + End: 9, + Term: []byte("杆子"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 0, + End: 9, + Term: []byte("枪杆子"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 12, + Term: []byte("中"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 12, + End: 15, + Term: []byte("出"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("政权"), + Position: 6, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("张三风同学走上了不归路"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("张"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 6, + Term: []byte("三"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 6, + End: 9, + Term: []byte("风"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 9, + End: 15, + Term: []byte("同学"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 15, + End: 21, + Term: []byte("走上"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 24, + Term: []byte("了"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 27, + End: 33, + Term: []byte("归路"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 24, + End: 33, + Term: []byte("不归路"), + Position: 8, + Type: analysis.Ideographic, + }, + }, + }, + { + []byte("阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。"), + analysis.TokenStream{ + { + Start: 0, + End: 4, + Term: []byte("阿Q"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 4, + End: 10, + Term: []byte("腰间"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 10, + End: 13, + Term: []byte("挂"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 16, + Term: []byte("着"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 16, + End: 21, + Term: []byte("BB机"), + Position: 5, + Type: analysis.Ideographic, + }, + { + Start: 21, + End: 27, + Term: []byte("手里"), + Position: 6, + Type: 
analysis.Ideographic, + }, + { + Start: 27, + End: 30, + Term: []byte("拿"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 33, + Term: []byte("着"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 39, + Term: []byte("大哥"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 33, + End: 42, + Term: []byte("大哥大"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte(","), + Position: 11, + Type: analysis.AlphaNumeric, + }, + { + Start: 45, + End: 48, + Term: []byte("说"), + Position: 12, + Type: analysis.Ideographic, + }, + { + Start: 48, + End: 51, + Term: []byte(":"), + Position: 13, + Type: analysis.AlphaNumeric, + }, + { + Start: 51, + End: 54, + Term: []byte("我"), + Position: 14, + Type: analysis.Ideographic, + }, + { + Start: 54, + End: 60, + Term: []byte("一般"), + Position: 15, + Type: analysis.Ideographic, + }, + { + Start: 60, + End: 66, + Term: []byte("吃饭"), + Position: 16, + Type: analysis.Ideographic, + }, + { + Start: 66, + End: 69, + Term: []byte("不"), + Position: 17, + Type: analysis.Ideographic, + }, + { + Start: 69, + End: 74, + Term: []byte("AA制"), + Position: 18, + Type: analysis.Ideographic, + }, + { + Start: 74, + End: 77, + Term: []byte("的"), + Position: 19, + Type: analysis.Ideographic, + }, + { + Start: 77, + End: 80, + Term: []byte("。"), + Position: 20, + Type: analysis.AlphaNumeric, + }, + }, + }, + { + []byte("在1号店能买到小S和大S八卦的书。"), + analysis.TokenStream{ + { + Start: 0, + End: 3, + Term: []byte("在"), + Position: 1, + Type: analysis.Ideographic, + }, + { + Start: 3, + End: 10, + Term: []byte("1号店"), + Position: 2, + Type: analysis.Ideographic, + }, + { + Start: 10, + End: 13, + Term: []byte("能"), + Position: 3, + Type: analysis.Ideographic, + }, + { + Start: 13, + End: 16, + Term: []byte("买"), + Position: 4, + Type: analysis.Ideographic, + }, + { + Start: 16, + End: 19, + Term: []byte("到"), + Position: 5, + Type: analysis.Ideographic, + }, + 
{ + Start: 19, + End: 23, + Term: []byte("小S"), + Position: 6, + Type: analysis.Ideographic, + }, + { + Start: 23, + End: 26, + Term: []byte("和"), + Position: 7, + Type: analysis.Ideographic, + }, + { + Start: 26, + End: 30, + Term: []byte("大S"), + Position: 8, + Type: analysis.Ideographic, + }, + { + Start: 30, + End: 36, + Term: []byte("八卦"), + Position: 9, + Type: analysis.Ideographic, + }, + { + Start: 36, + End: 39, + Term: []byte("的"), + Position: 10, + Type: analysis.Ideographic, + }, + { + Start: 39, + End: 42, + Term: []byte("书"), + Position: 11, + Type: analysis.Ideographic, + }, + { + Start: 42, + End: 45, + Term: []byte("。"), + Position: 12, + Type: analysis.AlphaNumeric, + }, + }, + }, + } + + tokenizer, _ := NewJiebaTokenizer("../../dict.txt", false, true) + for _, test := range tests { + actual := tokenizer.Tokenize(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input)) + } + } } From 16929faf576212a81c34da74b6bea65b1f41cba9 Mon Sep 17 00:00:00 2001 From: Wang Bin Date: Wed, 18 Mar 2015 17:31:41 +0800 Subject: [PATCH 5/5] removed old tokenize module, updated README --- README.md | 255 +++++++++--- tokenize.go | 37 -- tokenize_test.go | 390 ------------------ {analyse/tokenizers => tokenizers}/jieba.go | 0 .../tokenizers => tokenizers}/jieba_test.go | 8 +- 5 files changed, 213 insertions(+), 477 deletions(-) delete mode 100644 tokenize.go delete mode 100644 tokenize_test.go rename {analyse/tokenizers => tokenizers}/jieba.go (100%) rename {analyse/tokenizers => tokenizers}/jieba_test.go (99%) diff --git a/README.md b/README.md index 48489d5..ade962f 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,18 @@ -结巴分词Go版 jiebago -=================== +#结巴分词 Go 语言版:jiebago + [![Build Status](https://travis-ci.org/wangbin/jiebago.png?branch=master)](https://travis-ci.org/wangbin/jiebago) 
[结巴分词](https://github.com/fxsjy/jieba)是[@fxsjy](https://github.com/fxsjy)用Python编写的中文分词组件,jiebago是结巴分词的Go语言实现,目前已经实现的功能包括:三种模式分词、自定义词典、关键词提取和词性标注。 -安装 -===== +## 安装 - go get github.com/wangbin/jiebago + + go get github.com/wangbin/jiebago/... -分词 -===== +## 分词 + package main @@ -53,8 +53,8 @@ 【搜索引擎模式】:小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / , / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 / -添加自定义词典 -============= +## 添加自定义词典 + var sentence = "李小福是创新办主任也是云计算方面的专家" fmt.Print("Before: ") @@ -69,12 +69,7 @@ After: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 / -关键词提取 -======== - -需要先安装analyse模块: - - go get github.com/wangbin/jiebago/analyse +## 关键词提取 示例代码: @@ -133,12 +128,7 @@ 全资 0.306324 商业 0.306138 -词性标注 -======= - -需要先安装posseg模块: - - go get github.com/wangbin/jiebago/posseg +## 词性标注 示例代码: @@ -166,8 +156,8 @@ 北京 ns 天安门 ns -并行分词 -======= + +## 并行分词 因为Go有强大的goroutine特性,并行分词实现起来非常简单,所以并没有内置到jiebaogo中,而是由使用者自己实现,下面是一个简单的例子: @@ -207,40 +197,213 @@ writer.Flush() -Tokenize -========= +## Tokenize:返回词语在原文的起始位置 - var sentence = "永和服装饰品有限公司" - // 默认模式 - for _, token := range jiebago.Tokenize(sentence, "default", true) { - fmt.Printf("word %s\t\t start: %d \t\t end:%d\n", token.Word, token.Start, token.End) + +注意新版的 Jiebago Tokenizer 实现了 Bleve 的 Tokenizer 接口,跟之前的实现有很大的变化: + +1. 接受的参数必须是 []byte。 +2. 
输出的 Token 的起始和终止位置是 byte 的位置,不是之前的 rune 的位置,所以和 Python 版的 Jieba.tokenize 输出不一致。 + +``` +package main + +import ( + "fmt" + "github.com/wangbin/jiebago/tokenizers" +) + +const DictPath = "/path/to/dict.txt" + +var sentence = []byte("永和服装饰品有限公司") + +func main() { + // default mode + tokenizer, _ := tokenizers.NewJiebaTokenizer(DictPath, true, false) for _, token := range tokenizer.Tokenize(sentence) { + fmt.Printf( + "Term: %s\t Start: %d \t End: %d\t Position: %d\t Type: %d\n", + token.Term, token.Start, token.End, token.Position, token.Type) } - // 搜索模式 - for _, token := range jiebago.Tokenize(sentence, "search", true) { - fmt.Printf("word %s\t\t start: %d \t\t end:%d\n", token.Word, token.Start, token.End) + + //search mode + tokenizer, _ = tokenizers.NewJiebaTokenizer(DictPath, true, true) + for _, token := range tokenizer.Tokenize(sentence) { + fmt.Printf( + "Term: %s\t Start: %d \t End: %d\t Position: %d\t Type: %d\n", + token.Term, token.Start, token.End, token.Position, token.Type) } +} + +``` +默认模式输出: + +``` +Term: 永和 Start: 0 End: 6 Position: 1 Type: 1 +Term: 服装 Start: 6 End: 12 Position: 2 Type: 1 +Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1 +Term: 有限公司 Start: 18 End: 30 Position: 4 Type: 1 +``` +搜索模式输出: + +``` +Term: 永和 Start: 0 End: 6 Position: 1 Type: 1 +Term: 服装 Start: 6 End: 12 Position: 2 Type: 1 +Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1 +Term: 有限 Start: 18 End: 24 Position: 4 Type: 1 +Term: 公司 Start: 24 End: 30 Position: 5 Type: 1 +Term: 有限公司 Start: 18 End: 30 Position: 6 Type: 1 +``` +### 配合 bleve 进行中文全文检索 + +[bleve](http://www.blevesearch.com/) 是一个 Go 语言实现的全文索引系统,jiebago 可以配合 bleve 使用实现中文的全文检索。一个简单的用法示例: + +``` +package main + +import ( + "fmt" + "github.com/blevesearch/bleve" + _ "github.com/wangbin/jiebago/tokenizers" + "log" +) + +func main() { + // open a new index + indexMapping := bleve.NewIndexMapping() + + err := indexMapping.AddCustomTokenizer("jieba", + map[string]interface{}{ + "file":
"/Users/wangbin/mygo/src/github.com/wangbin/jiebago/dict.txt", + "type": "jieba", + }) + if err != nil { + log.Fatal(err) + } + + err = indexMapping.AddCustomAnalyzer("jieba", + map[string]interface{}{ + "type": "custom", + "tokenizer": "jieba", + "token_filters": []string{ + "possessive_en", + "to_lower", + "stop_en", + }, + }) + + if err != nil { + log.Fatal(err) + } + + indexMapping.DefaultAnalyzer = "jieba" + + index, err := bleve.New("example.bleve", indexMapping) + + if err != nil { + log.Fatal(err) + } + + indexMapping.DefaultAnalyzer = "jieba" + + index, err := bleve.New("example.bleve", indexMapping) + + if err != nil { + log.Fatal(err) + } + + docs := []struct { + Title string + Name string + }{ + { + Title: "Doc 1", + Name: "This is the first document we’ve added", + }, + { + Title: "Doc 2", + Name: "The second one 你 中文测试中文 is even more interesting! 吃水果", + }, + { + Title: "Doc 3", + Name: "买水果然后来世博园。", + }, + { + Title: "Doc 4", + Name: "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", + }, + { + Title: "Doc 5", + Name: "咱俩交换一下吧。", + }, + } + // index docs + for _, doc := range docs { + index.Index(doc.Title, doc) + } + + // search for some text + for _, keyword := range []string{"水果世博园", "你", "first", "中文", "交换机", "交换"} { + query := bleve.NewMatchQuery(keyword) + search := bleve.NewSearchRequest(query) + search.Highlight = bleve.NewHighlight() + searchResults, err := index.Search(search) + if err != nil { + log.Fatal(err) + } + fmt.Printf("Result of %s: %s\n", keyword, searchResults) + } +} +``` 输出结果: - word 永和 start: 0 end:2 - word 服装 start: 2 end:4 - word 饰品 start: 4 end:6 - word 有限公司 start: 6 end:10 +``` +Result of 水果世博园: 2 matches, showing 1 through 2, took 377.988µs + 1. Doc 3 (1.099550) + Name + 买水果然后来世博园。 + 2. Doc 2 (0.031941) + Name + The second one 你 中文测试中文 is even more interesting! 
吃水果 - word 永和 start: 0 end:2 - word 服装 start: 0 end:2 - word 饰品 start: 0 end:2 - word 有限 start: 0 end:2 - word 公司 start: 2 end:4 - word 有限公司 start: 0 end:4 +Result of 你: 1 matches, showing 1 through 1, took 103.367µs + 1. Doc 2 (0.391161) + Name + The second one 中文测试中文 is even more interesting! 吃水果 -分词速度 -======= +Result of first: 1 matches, showing 1 through 1, took 373.317µs + 1. Doc 1 (0.512150) + Name + This is the first document we’ve added + +Result of 中文: 1 matches, showing 1 through 1, took 106.433µs + 1. Doc 2 (0.553186) + Name + The second one 你 中文测试中文 is even more interesting! 吃水果 + +Result of 交换机: 2 matches, showing 1 through 2, took 188.235µs + 1. Doc 4 (0.608495) + Name + 工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作 + 2. Doc 5 (0.086700) + Name + 咱俩交换一下吧。 + +Result of 交换: 2 matches, showing 1 through 2, took 148.822µs + 1. Doc 5 (0.534158) + Name + 咱俩交换一下吧。 + 2. Doc 4 (0.296297) + Name + 工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作 +``` + +## 分词速度 - 2MB / Second in Full Mode - 700KB / Second in Default Mode - Test Env: AMD Phenom(tm) II X6 1055T CPU @ 2.8GHz; 《金庸全集》 -许可证 -====== +## 许可证 + MIT: http://wangbin.mit-license.org diff --git a/tokenize.go b/tokenize.go deleted file mode 100644 index 3765207..0000000 --- a/tokenize.go +++ /dev/null @@ -1,37 +0,0 @@ -package jiebago - -type token struct { - Word string - Start int - End int -} - -// Return words with position. 
-func Tokenize(sentence string, mode string, HMM bool) []token { - tokens := make([]token, 0) - start := 0 - var width int - for word := range Cut(sentence, false, HMM) { - if mode == "default" { - width = len([]rune(word)) - tokens = append(tokens, token{word, start, start + width}) - start += width - - } else { - runes := []rune(word) - width = len(runes) - for _, step := range []int{2, 3} { - if width > step { - for i := 0; i < width-step+1; i++ { - gram := string(runes[i : i+step]) - if _, ok := Trie.Freq[gram]; ok { - tokens = append(tokens, token{gram, start + i, start + i + step}) - } - } - } - } - tokens = append(tokens, token{word, start, start + width}) - } - } - return tokens -} diff --git a/tokenize_test.go b/tokenize_test.go deleted file mode 100644 index a088bbd..0000000 --- a/tokenize_test.go +++ /dev/null @@ -1,390 +0,0 @@ -package jiebago - -import "testing" - -var ( - result = [][]token{ - []token{token{"\u8fd9\u662f", 0, 2}, token{"\u4e00\u4e2a", 2, 4}, token{"\u4f38\u624b\u4e0d\u89c1\u4e94\u6307", 4, 10}, token{"\u7684", 10, 11}, token{"\u9ed1\u591c", 11, 13}, token{"\u3002", 13, 14}, token{"\u6211", 14, 15}, token{"\u53eb", 15, 16}, token{"\u5b59\u609f\u7a7a", 16, 19}, token{"\uff0c", 19, 20}, token{"\u6211", 20, 21}, token{"\u7231", 21, 22}, token{"\u5317\u4eac", 22, 24}, token{"\uff0c", 24, 25}, token{"\u6211", 25, 26}, token{"\u7231", 26, 27}, token{"Python", 27, 33}, token{"\u548c", 33, 34}, token{"C++", 34, 37}, token{"\u3002", 37, 38}}, - []token{token{"\u6211", 0, 1}, token{"\u4e0d", 1, 2}, token{"\u559c\u6b22", 2, 4}, token{"\u65e5\u672c", 4, 6}, token{"\u548c\u670d", 6, 8}, token{"\u3002", 8, 9}}, - []token{token{"\u96f7\u7334", 0, 2}, token{"\u56de\u5f52", 2, 4}, token{"\u4eba\u95f4", 4, 6}, token{"\u3002", 6, 7}}, - []token{token{"\u5de5\u4fe1\u5904", 0, 3}, token{"\u5973\u5e72\u4e8b", 3, 6}, token{"\u6bcf\u6708", 6, 8}, token{"\u7ecf\u8fc7", 8, 10}, token{"\u4e0b\u5c5e", 10, 12}, token{"\u79d1\u5ba4", 12, 14}, token{"\u90fd", 14, 
15}, token{"\u8981", 15, 16}, token{"\u4eb2\u53e3", 16, 18}, token{"\u4ea4\u4ee3", 18, 20}, token{"24", 20, 22}, token{"\u53e3", 22, 23}, token{"\u4ea4\u6362\u673a", 23, 26}, token{"\u7b49", 26, 27}, token{"\u6280\u672f\u6027", 27, 30}, token{"\u5668\u4ef6", 30, 32}, token{"\u7684", 32, 33}, token{"\u5b89\u88c5", 33, 35}, token{"\u5de5\u4f5c", 35, 37}}, - []token{token{"\u6211", 0, 1}, token{"\u9700\u8981", 1, 3}, token{"\u5ec9\u79df\u623f", 3, 6}}, - []token{token{"\u6c38\u548c", 0, 2}, token{"\u670d\u88c5", 2, 4}, token{"\u9970\u54c1", 4, 6}, token{"\u6709\u9650\u516c\u53f8", 6, 10}}, - []token{token{"\u6211", 0, 1}, token{"\u7231", 1, 2}, token{"\u5317\u4eac", 2, 4}, token{"\u5929\u5b89\u95e8", 4, 7}}, - []token{token{"abc", 0, 3}}, - []token{token{"\u9690", 0, 1}, token{"\u9a6c\u5c14\u53ef\u592b", 1, 5}}, - []token{token{"\u96f7\u7334", 0, 2}, token{"\u662f", 2, 3}, token{"\u4e2a", 3, 4}, token{"\u597d", 4, 5}, token{"\u7f51\u7ad9", 5, 7}}, - []token{token{"\u201c", 0, 1}, token{"Microsoft", 1, 10}, token{"\u201d", 10, 11}, token{"\u4e00\u8bcd", 11, 13}, token{"\u7531", 13, 14}, token{"\u201c", 14, 15}, token{"MICROcomputer", 15, 28}, token{"\uff08", 28, 29}, token{"\u5fae\u578b", 29, 31}, token{"\u8ba1\u7b97\u673a", 31, 34}, token{"\uff09", 34, 35}, token{"\u201d", 35, 36}, token{"\u548c", 36, 37}, token{"\u201c", 37, 38}, token{"SOFTware", 38, 46}, token{"\uff08", 46, 47}, token{"\u8f6f\u4ef6", 47, 49}, token{"\uff09", 49, 50}, token{"\u201d", 50, 51}, token{"\u4e24", 51, 52}, token{"\u90e8\u5206", 52, 54}, token{"\u7ec4\u6210", 54, 56}}, - []token{token{"\u8349\u6ce5\u9a6c", 0, 3}, token{"\u548c", 3, 4}, token{"\u6b3a\u5b9e", 4, 6}, token{"\u9a6c", 6, 7}, token{"\u662f", 7, 8}, token{"\u4eca\u5e74", 8, 10}, token{"\u7684", 10, 11}, token{"\u6d41\u884c", 11, 13}, token{"\u8bcd\u6c47", 13, 15}}, - []token{token{"\u4f0a\u85e4", 0, 2}, token{"\u6d0b\u534e\u5802", 2, 5}, token{"\u603b\u5e9c", 5, 7}, token{"\u5e97", 7, 8}}, - 
[]token{token{"\u4e2d\u56fd\u79d1\u5b66\u9662\u8ba1\u7b97\u6280\u672f\u7814\u7a76\u6240", 0, 12}}, - []token{token{"\u7f57\u5bc6\u6b27", 0, 3}, token{"\u4e0e", 3, 4}, token{"\u6731\u4e3d\u53f6", 4, 7}}, - []token{token{"\u6211", 0, 1}, token{"\u8d2d\u4e70", 1, 3}, token{"\u4e86", 3, 4}, token{"\u9053\u5177", 4, 6}, token{"\u548c", 6, 7}, token{"\u670d\u88c5", 7, 9}}, - []token{token{"PS", 0, 2}, token{":", 2, 3}, token{" ", 3, 4}, token{"\u6211", 4, 5}, token{"\u89c9\u5f97", 5, 7}, token{"\u5f00\u6e90", 7, 9}, token{"\u6709", 9, 10}, token{"\u4e00\u4e2a", 10, 12}, token{"\u597d\u5904", 12, 14}, token{"\uff0c", 14, 15}, token{"\u5c31\u662f", 15, 17}, token{"\u80fd\u591f", 17, 19}, token{"\u6566\u4fc3", 19, 21}, token{"\u81ea\u5df1", 21, 23}, token{"\u4e0d\u65ad\u6539\u8fdb", 23, 27}, token{"\uff0c", 27, 28}, token{"\u907f\u514d", 28, 30}, token{"\u655e\u5e1a", 30, 32}, token{"\u81ea\u73cd", 32, 34}}, - []token{token{"\u6e56\u5317\u7701", 0, 3}, token{"\u77f3\u9996\u5e02", 3, 6}}, - []token{token{"\u6e56\u5317\u7701", 0, 3}, token{"\u5341\u5830\u5e02", 3, 6}}, - []token{token{"\u603b\u7ecf\u7406", 0, 3}, token{"\u5b8c\u6210", 3, 5}, token{"\u4e86", 5, 6}, token{"\u8fd9\u4ef6", 6, 8}, token{"\u4e8b\u60c5", 8, 10}}, - []token{token{"\u7535\u8111", 0, 2}, token{"\u4fee\u597d", 2, 4}, token{"\u4e86", 4, 5}}, - []token{token{"\u505a\u597d", 0, 2}, token{"\u4e86", 2, 3}, token{"\u8fd9\u4ef6", 3, 5}, token{"\u4e8b\u60c5", 5, 7}, token{"\u5c31", 7, 8}, token{"\u4e00\u4e86\u767e\u4e86", 8, 12}, token{"\u4e86", 12, 13}}, - []token{token{"\u4eba\u4eec", 0, 2}, token{"\u5ba1\u7f8e", 2, 4}, token{"\u7684", 4, 5}, token{"\u89c2\u70b9", 5, 7}, token{"\u662f", 7, 8}, token{"\u4e0d\u540c", 8, 10}, token{"\u7684", 10, 11}}, - []token{token{"\u6211\u4eec", 0, 2}, token{"\u4e70", 2, 3}, token{"\u4e86", 3, 4}, token{"\u4e00\u4e2a", 4, 6}, token{"\u7f8e\u7684", 6, 8}, token{"\u7a7a\u8c03", 8, 10}}, - []token{token{"\u7ebf\u7a0b", 0, 2}, token{"\u521d\u59cb\u5316", 2, 5}, token{"\u65f6", 
5, 6}, token{"\u6211\u4eec", 6, 8}, token{"\u8981", 8, 9}, token{"\u6ce8\u610f", 9, 11}}, - []token{token{"\u4e00\u4e2a", 0, 2}, token{"\u5206\u5b50", 2, 4}, token{"\u662f", 4, 5}, token{"\u7531", 5, 6}, token{"\u597d\u591a", 6, 8}, token{"\u539f\u5b50", 8, 10}, token{"\u7ec4\u7ec7", 10, 12}, token{"\u6210", 12, 13}, token{"\u7684", 13, 14}}, - []token{token{"\u795d", 0, 1}, token{"\u4f60", 1, 2}, token{"\u9a6c\u5230\u529f\u6210", 2, 6}}, - []token{token{"\u4ed6", 0, 1}, token{"\u6389", 1, 2}, token{"\u8fdb", 2, 3}, token{"\u4e86", 3, 4}, token{"\u65e0\u5e95\u6d1e", 4, 7}, token{"\u91cc", 7, 8}}, - []token{token{"\u4e2d\u56fd", 0, 2}, token{"\u7684", 2, 3}, token{"\u9996\u90fd", 3, 5}, token{"\u662f", 5, 6}, token{"\u5317\u4eac", 6, 8}}, - []token{token{"\u5b59\u541b\u610f", 0, 3}}, - []token{token{"\u5916\u4ea4\u90e8", 0, 3}, token{"\u53d1\u8a00\u4eba", 3, 6}, token{"\u9a6c\u671d\u65ed", 6, 9}}, - []token{token{"\u9886\u5bfc\u4eba", 0, 3}, token{"\u4f1a\u8bae", 3, 5}, token{"\u548c", 5, 6}, token{"\u7b2c\u56db\u5c4a", 6, 9}, token{"\u4e1c\u4e9a", 9, 11}, token{"\u5cf0\u4f1a", 11, 13}}, - []token{token{"\u5728", 0, 1}, token{"\u8fc7\u53bb", 1, 3}, token{"\u7684", 3, 4}, token{"\u8fd9", 4, 5}, token{"\u4e94\u5e74", 5, 7}}, - []token{token{"\u8fd8", 0, 1}, token{"\u9700\u8981", 1, 3}, token{"\u5f88\u957f", 3, 5}, token{"\u7684", 5, 6}, token{"\u8def", 6, 7}, token{"\u8981", 7, 8}, token{"\u8d70", 8, 9}}, - []token{token{"60", 0, 2}, token{"\u5468\u5e74", 2, 4}, token{"\u9996\u90fd", 4, 6}, token{"\u9605\u5175", 6, 8}}, - []token{token{"\u4f60\u597d", 0, 2}, token{"\u4eba\u4eec", 2, 4}, token{"\u5ba1\u7f8e", 4, 6}, token{"\u7684", 6, 7}, token{"\u89c2\u70b9", 7, 9}, token{"\u662f", 9, 10}, token{"\u4e0d\u540c", 10, 12}, token{"\u7684", 12, 13}}, - []token{token{"\u4e70", 0, 1}, token{"\u6c34\u679c", 1, 3}, token{"\u7136\u540e", 3, 5}, token{"\u6765", 5, 6}, token{"\u4e16\u535a\u56ed", 6, 9}}, - []token{token{"\u4e70", 0, 1}, token{"\u6c34\u679c", 1, 3}, 
token{"\u7136\u540e", 3, 5}, token{"\u53bb", 5, 6}, token{"\u4e16\u535a\u56ed", 6, 9}}, - []token{token{"\u4f46\u662f", 0, 2}, token{"\u540e\u6765", 2, 4}, token{"\u6211", 4, 5}, token{"\u624d", 5, 6}, token{"\u77e5\u9053", 6, 8}, token{"\u4f60", 8, 9}, token{"\u662f", 9, 10}, token{"\u5bf9", 10, 11}, token{"\u7684", 11, 12}}, - []token{token{"\u5b58\u5728", 0, 2}, token{"\u5373", 2, 3}, token{"\u5408\u7406", 3, 5}}, - []token{token{"\u7684", 0, 1}, token{"\u7684", 1, 2}, token{"\u7684", 2, 3}, token{"\u7684", 3, 4}, token{"\u7684", 4, 5}, token{"\u5728", 5, 6}, token{"\u7684", 6, 7}, token{"\u7684", 7, 8}, token{"\u7684", 8, 9}, token{"\u7684", 9, 10}, token{"\u5c31", 10, 11}, token{"\u4ee5", 11, 12}, token{"\u548c", 12, 13}, token{"\u548c", 13, 14}, token{"\u548c", 14, 15}}, - []token{token{"I", 0, 1}, token{" ", 1, 2}, token{"love", 2, 6}, token{"\u4f60", 6, 7}, token{"\uff0c", 7, 8}, token{"\u4e0d\u4ee5\u4e3a\u803b", 8, 12}, token{"\uff0c", 12, 13}, token{"\u53cd", 13, 14}, token{"\u4ee5\u4e3a", 14, 16}, token{"rong", 16, 20}}, - []token{token{"\u56e0", 0, 1}}, - []token{}, - []token{token{"hello", 0, 5}, token{"\u4f60\u597d", 5, 7}, token{"\u4eba\u4eec", 7, 9}, token{"\u5ba1\u7f8e", 9, 11}, token{"\u7684", 11, 12}, token{"\u89c2\u70b9", 12, 14}, token{"\u662f", 14, 15}, token{"\u4e0d\u540c", 15, 17}, token{"\u7684", 17, 18}}, - []token{token{"\u5f88", 0, 1}, token{"\u597d", 1, 2}, token{"\u4f46", 2, 3}, token{"\u4e3b\u8981", 3, 5}, token{"\u662f", 5, 6}, token{"\u57fa\u4e8e", 6, 8}, token{"\u7f51\u9875", 8, 10}, token{"\u5f62\u5f0f", 10, 12}}, - []token{token{"hello", 0, 5}, token{"\u4f60\u597d", 5, 7}, token{"\u4eba\u4eec", 7, 9}, token{"\u5ba1\u7f8e", 9, 11}, token{"\u7684", 11, 12}, token{"\u89c2\u70b9", 12, 14}, token{"\u662f", 14, 15}, token{"\u4e0d\u540c", 15, 17}, token{"\u7684", 17, 18}}, - []token{token{"\u4e3a\u4ec0\u4e48", 0, 3}, token{"\u6211", 3, 4}, token{"\u4e0d\u80fd", 4, 6}, token{"\u62e5\u6709", 6, 8}, token{"\u60f3\u8981", 8, 10}, 
token{"\u7684", 10, 11}, token{"\u751f\u6d3b", 11, 13}}, - []token{token{"\u540e\u6765", 0, 2}, token{"\u6211", 2, 3}, token{"\u624d", 3, 4}}, - []token{token{"\u6b64\u6b21", 0, 2}, token{"\u6765", 2, 3}, token{"\u4e2d\u56fd", 3, 5}, token{"\u662f", 5, 6}, token{"\u4e3a\u4e86", 6, 8}}, - []token{token{"\u4f7f\u7528", 0, 2}, token{"\u4e86", 2, 3}, token{"\u5b83", 3, 4}, token{"\u5c31", 4, 5}, token{"\u53ef\u4ee5", 5, 7}, token{"\u89e3\u51b3", 7, 9}, token{"\u4e00\u4e9b", 9, 11}, token{"\u95ee\u9898", 11, 13}}, - []token{token{",", 0, 1}, token{"\u4f7f\u7528", 1, 3}, token{"\u4e86", 3, 4}, token{"\u5b83", 4, 5}, token{"\u5c31", 5, 6}, token{"\u53ef\u4ee5", 6, 8}, token{"\u89e3\u51b3", 8, 10}, token{"\u4e00\u4e9b", 10, 12}, token{"\u95ee\u9898", 12, 14}}, - []token{token{"\u5176\u5b9e", 0, 2}, token{"\u4f7f\u7528", 2, 4}, token{"\u4e86", 4, 5}, token{"\u5b83", 5, 6}, token{"\u5c31", 6, 7}, token{"\u53ef\u4ee5", 7, 9}, token{"\u89e3\u51b3", 9, 11}, token{"\u4e00\u4e9b", 11, 13}, token{"\u95ee\u9898", 13, 15}}, - []token{token{"\u597d\u4eba", 0, 2}, token{"\u4f7f\u7528", 2, 4}, token{"\u4e86", 4, 5}, token{"\u5b83", 5, 6}, token{"\u5c31", 6, 7}, token{"\u53ef\u4ee5", 7, 9}, token{"\u89e3\u51b3", 9, 11}, token{"\u4e00\u4e9b", 11, 13}, token{"\u95ee\u9898", 13, 15}}, - []token{token{"\u662f\u56e0\u4e3a", 0, 3}, token{"\u548c", 3, 4}, token{"\u56fd\u5bb6", 4, 6}}, - []token{token{"\u8001\u5e74", 0, 2}, token{"\u641c\u7d22", 2, 4}, token{"\u8fd8", 4, 5}, token{"\u652f\u6301", 5, 7}}, - []token{token{"\u5e72\u8106", 0, 2}, token{"\u5c31", 2, 3}, token{"\u628a", 3, 4}, token{"\u90a3\u90e8", 4, 6}, token{"\u8499\u4eba", 6, 8}, token{"\u7684", 8, 9}, token{"\u95f2\u6cd5", 9, 11}, token{"\u7ed9", 11, 12}, token{"\u5e9f", 12, 13}, token{"\u4e86", 13, 14}, token{"\u62c9\u5012", 14, 16}, token{"\uff01", 16, 17}, token{"RT", 17, 19}, token{" ", 19, 20}, token{"@", 20, 21}, token{"laoshipukong", 21, 33}, token{" ", 33, 34}, token{":", 34, 35}, token{" ", 35, 36}, token{"27", 36, 38}, 
token{"\u65e5", 38, 39}, token{"\uff0c", 39, 40}, token{"\u5168\u56fd\u4eba\u5927\u5e38\u59d4\u4f1a", 40, 47}, token{"\u7b2c\u4e09\u6b21", 47, 50}, token{"\u5ba1\u8bae", 50, 52}, token{"\u4fb5\u6743", 52, 54}, token{"\u8d23\u4efb\u6cd5", 54, 57}, token{"\u8349\u6848", 57, 59}, token{"\uff0c", 59, 60}, token{"\u5220\u9664", 60, 62}, token{"\u4e86", 62, 63}, token{"\u6709\u5173", 63, 65}, token{"\u533b\u7597", 65, 67}, token{"\u635f\u5bb3", 67, 69}, token{"\u8d23\u4efb", 69, 71}, token{"\u201c", 71, 72}, token{"\u4e3e\u8bc1", 72, 74}, token{"\u5012\u7f6e", 74, 76}, token{"\u201d", 76, 77}, token{"\u7684", 77, 78}, token{"\u89c4\u5b9a", 78, 80}, token{"\u3002", 80, 81}, token{"\u5728", 81, 82}, token{"\u533b\u60a3", 82, 84}, token{"\u7ea0\u7eb7", 84, 86}, token{"\u4e2d\u672c", 86, 88}, token{"\u5df2", 88, 89}, token{"\u5904\u4e8e", 89, 91}, token{"\u5f31\u52bf", 91, 93}, token{"\u5730\u4f4d", 93, 95}, token{"\u7684", 95, 96}, token{"\u6d88\u8d39\u8005", 96, 99}, token{"\u7531\u6b64", 99, 101}, token{"\u5c06", 101, 102}, token{"\u9677\u5165", 102, 104}, token{"\u4e07\u52ab\u4e0d\u590d", 104, 108}, token{"\u7684", 108, 109}, token{"\u5883\u5730", 109, 111}, token{"\u3002", 111, 112}, token{" ", 112, 113}}, - []token{token{"\u5927", 0, 1}}, - []token{}, - []token{token{"\u4ed6", 0, 1}, token{"\u8bf4", 1, 2}, token{"\u7684", 2, 3}, token{"\u786e\u5b9e", 3, 5}, token{"\u5728\u7406", 5, 7}}, - []token{token{"\u957f\u6625", 0, 2}, token{"\u5e02\u957f", 2, 4}, token{"\u6625\u8282", 4, 6}, token{"\u8bb2\u8bdd", 6, 8}}, - []token{token{"\u7ed3\u5a5a", 0, 2}, token{"\u7684", 2, 3}, token{"\u548c", 3, 4}, token{"\u5c1a\u672a", 4, 6}, token{"\u7ed3\u5a5a", 6, 8}, token{"\u7684", 8, 9}}, - []token{token{"\u7ed3\u5408", 0, 2}, token{"\u6210", 2, 3}, token{"\u5206\u5b50", 3, 5}, token{"\u65f6", 5, 6}}, - []token{token{"\u65c5\u6e38", 0, 2}, token{"\u548c", 2, 3}, token{"\u670d\u52a1", 3, 5}, token{"\u662f", 5, 6}, token{"\u6700\u597d", 6, 8}, token{"\u7684", 8, 9}}, - 
[]token{token{"\u8fd9\u4ef6", 0, 2}, token{"\u4e8b\u60c5", 2, 4}, token{"\u7684\u786e", 4, 6}, token{"\u662f", 6, 7}, token{"\u6211", 7, 8}, token{"\u7684", 8, 9}, token{"\u9519", 9, 10}}, - []token{token{"\u4f9b", 0, 1}, token{"\u5927\u5bb6", 1, 3}, token{"\u53c2\u8003", 3, 5}, token{"\u6307\u6b63", 5, 7}}, - []token{token{"\u54c8\u5c14\u6ee8", 0, 3}, token{"\u653f\u5e9c", 3, 5}, token{"\u516c\u5e03", 5, 7}, token{"\u584c\u6865", 7, 9}, token{"\u539f\u56e0", 9, 11}}, - []token{token{"\u6211", 0, 1}, token{"\u5728", 1, 2}, token{"\u673a\u573a", 2, 4}, token{"\u5165\u53e3\u5904", 4, 7}}, - []token{token{"\u90a2\u6c38\u81e3", 0, 3}, token{"\u6444\u5f71", 3, 5}, token{"\u62a5\u9053", 5, 7}}, - []token{token{"BP", 0, 2}, token{"\u795e\u7ecf\u7f51\u7edc", 2, 6}, token{"\u5982\u4f55", 6, 8}, token{"\u8bad\u7ec3", 8, 10}, token{"\u624d\u80fd", 10, 12}, token{"\u5728", 12, 13}, token{"\u5206\u7c7b", 13, 15}, token{"\u65f6", 15, 16}, token{"\u589e\u52a0", 16, 18}, token{"\u533a\u5206\u5ea6", 18, 21}, token{"\uff1f", 21, 22}}, - []token{token{"\u5357\u4eac\u5e02", 0, 3}, token{"\u957f\u6c5f\u5927\u6865", 3, 7}}, - []token{token{"\u5e94", 0, 1}, token{"\u4e00\u4e9b", 1, 3}, token{"\u4f7f\u7528\u8005", 3, 6}, token{"\u7684", 6, 7}, token{"\u5efa\u8bae", 7, 9}, token{"\uff0c", 9, 10}, token{"\u4e5f", 10, 11}, token{"\u4e3a\u4e86", 11, 13}, token{"\u4fbf\u4e8e", 13, 15}, token{"\u5229\u7528", 15, 17}, token{"NiuTrans", 17, 25}, token{"\u7528\u4e8e", 25, 27}, token{"SMT", 27, 30}, token{"\u7814\u7a76", 30, 32}}, - []token{token{"\u957f\u6625\u5e02", 0, 3}, token{"\u957f\u6625", 3, 5}, token{"\u836f\u5e97", 5, 7}}, - []token{token{"\u9093\u9896\u8d85", 0, 3}, token{"\u751f\u524d", 3, 5}, token{"\u6700", 5, 6}, token{"\u559c\u6b22", 6, 8}, token{"\u7684", 8, 9}, token{"\u8863\u670d", 9, 11}}, - []token{token{"\u80e1\u9526\u6d9b", 0, 3}, token{"\u662f", 3, 4}, token{"\u70ed\u7231", 4, 6}, token{"\u4e16\u754c", 6, 8}, token{"\u548c\u5e73", 8, 10}, token{"\u7684", 10, 11}, 
token{"\u653f\u6cbb\u5c40", 11, 14}, token{"\u5e38\u59d4", 14, 16}}, - []token{token{"\u7a0b\u5e8f\u5458", 0, 3}, token{"\u795d", 3, 4}, token{"\u6d77\u6797", 4, 6}, token{"\u548c", 6, 7}, token{"\u6731\u4f1a\u9707", 7, 10}, token{"\u662f", 10, 11}, token{"\u5728", 11, 12}, token{"\u5b59\u5065", 12, 14}, token{"\u7684", 14, 15}, token{"\u5de6\u9762", 15, 17}, token{"\u548c", 17, 18}, token{"\u53f3\u9762", 18, 20}, token{",", 20, 21}, token{" ", 21, 22}, token{"\u8303\u51ef", 22, 24}, token{"\u5728", 24, 25}, token{"\u6700", 25, 26}, token{"\u53f3\u9762", 26, 28}, token{".", 28, 29}, token{"\u518d\u5f80", 29, 31}, token{"\u5de6", 31, 32}, token{"\u662f", 32, 33}, token{"\u674e\u677e\u6d2a", 33, 36}}, - []token{token{"\u4e00\u6b21\u6027", 0, 3}, token{"\u4ea4", 3, 4}, token{"\u591a\u5c11", 4, 6}, token{"\u94b1", 6, 7}}, - []token{token{"\u4e24\u5757", 0, 2}, token{"\u4e94", 2, 3}, token{"\u4e00\u5957", 3, 5}, token{"\uff0c", 5, 6}, token{"\u4e09\u5757", 6, 8}, token{"\u516b", 8, 9}, token{"\u4e00\u65a4", 9, 11}, token{"\uff0c", 11, 12}, token{"\u56db\u5757", 12, 14}, token{"\u4e03", 14, 15}, token{"\u4e00\u672c", 15, 17}, token{"\uff0c", 17, 18}, token{"\u4e94\u5757", 18, 20}, token{"\u516d", 20, 21}, token{"\u4e00\u6761", 21, 23}}, - []token{token{"\u5c0f", 0, 1}, token{"\u548c\u5c1a", 1, 3}, token{"\u7559", 3, 4}, token{"\u4e86", 4, 5}, token{"\u4e00\u4e2a", 5, 7}, token{"\u50cf", 7, 8}, token{"\u5927", 8, 9}, token{"\u548c\u5c1a", 9, 11}, token{"\u4e00\u6837", 11, 13}, token{"\u7684", 13, 14}, token{"\u548c\u5c1a\u5934", 14, 17}}, - []token{token{"\u6211", 0, 1}, token{"\u662f", 1, 2}, token{"\u4e2d\u534e\u4eba\u6c11\u5171\u548c\u56fd", 2, 9}, token{"\u516c\u6c11", 9, 11}, token{";", 11, 12}, token{"\u6211", 12, 13}, token{"\u7238\u7238", 13, 15}, token{"\u662f", 15, 16}, token{"\u5171\u548c\u515a", 16, 19}, token{"\u515a\u5458", 19, 21}, token{";", 21, 22}, token{" ", 22, 23}, token{"\u5730\u94c1", 23, 25}, token{"\u548c\u5e73\u95e8", 25, 28}, token{"\u7ad9", 28, 
29}}, - []token{token{"\u5f20\u6653\u6885", 0, 3}, token{"\u53bb", 3, 4}, token{"\u4eba\u6c11", 4, 6}, token{"\u533b\u9662", 6, 8}, token{"\u505a", 8, 9}, token{"\u4e86", 9, 10}, token{"\u4e2a", 10, 11}, token{"B\u8d85", 11, 13}, token{"\u7136\u540e", 13, 15}, token{"\u53bb", 15, 16}, token{"\u4e70", 16, 17}, token{"\u4e86", 17, 18}, token{"\u4ef6", 18, 19}, token{"T\u6064", 19, 21}}, - []token{token{"AT&T", 0, 4}, token{"\u662f", 4, 5}, token{"\u4e00\u4ef6", 5, 7}, token{"\u4e0d\u9519", 7, 9}, token{"\u7684", 9, 10}, token{"\u516c\u53f8", 10, 12}, token{"\uff0c", 12, 13}, token{"\u7ed9", 13, 14}, token{"\u4f60", 14, 15}, token{"\u53d1", 15, 16}, token{"offer", 16, 21}, token{"\u4e86", 21, 22}, token{"\u5417", 22, 23}, token{"\uff1f", 23, 24}}, - []token{token{"C++", 0, 3}, token{"\u548c", 3, 4}, token{"c#", 4, 6}, token{"\u662f", 6, 7}, token{"\u4ec0\u4e48", 7, 9}, token{"\u5173\u7cfb", 9, 11}, token{"\uff1f", 11, 12}, token{"11", 12, 14}, token{"+", 14, 15}, token{"122", 15, 18}, token{"=", 18, 19}, token{"133", 19, 22}, token{"\uff0c", 22, 23}, token{"\u662f", 23, 24}, token{"\u5417", 24, 25}, token{"\uff1f", 25, 26}, token{"PI", 26, 28}, token{"=", 28, 29}, token{"3.14159", 29, 36}}, - []token{token{"\u4f60", 0, 1}, token{"\u8ba4\u8bc6", 1, 3}, token{"\u90a3\u4e2a", 3, 5}, token{"\u548c", 5, 6}, token{"\u4e3b\u5e2d", 6, 8}, token{"\u63e1\u624b", 8, 10}, token{"\u7684", 10, 11}, token{"\u7684\u54e5", 11, 13}, token{"\u5417", 13, 14}, token{"\uff1f", 14, 15}, token{"\u4ed6\u5f00", 15, 17}, token{"\u4e00\u8f86", 17, 19}, token{"\u9ed1\u8272", 19, 21}, token{"\u7684\u58eb", 21, 23}, token{"\u3002", 23, 24}}, - []token{token{"\u67aa\u6746\u5b50", 0, 3}, token{"\u4e2d", 3, 4}, token{"\u51fa", 4, 5}, token{"\u653f\u6743", 5, 7}}, - []token{token{"\u5f20\u4e09\u98ce", 0, 3}, token{"\u540c\u5b66", 3, 5}, token{"\u8d70\u4e0a", 5, 7}, token{"\u4e86", 7, 8}, token{"\u4e0d\u5f52\u8def", 8, 11}}, - []token{token{"\u963fQ", 0, 2}, token{"\u8170\u95f4", 2, 4}, token{"\u6302", 
4, 5}, token{"\u7740", 5, 6}, token{"BB\u673a", 6, 9}, token{"\u624b\u91cc", 9, 11}, token{"\u62ff", 11, 12}, token{"\u7740", 12, 13}, token{"\u5927\u54e5\u5927", 13, 16}, token{"\uff0c", 16, 17}, token{"\u8bf4", 17, 18}, token{"\uff1a", 18, 19}, token{"\u6211", 19, 20}, token{"\u4e00\u822c", 20, 22}, token{"\u5403\u996d", 22, 24}, token{"\u4e0d", 24, 25}, token{"AA\u5236", 25, 28}, token{"\u7684", 28, 29}, token{"\u3002", 29, 30}}, - []token{token{"\u5728", 0, 1}, token{"1\u53f7\u5e97", 1, 4}, token{"\u80fd", 4, 5}, token{"\u4e70", 5, 6}, token{"\u5230", 6, 7}, token{"\u5c0fS", 7, 9}, token{"\u548c", 9, 10}, token{"\u5927S", 10, 12}, token{"\u516b\u5366", 12, 14}, token{"\u7684", 14, 15}, token{"\u4e66", 15, 16}, token{"\u3002", 16, 17}}, - []token{token{"\u8fd9\u662f", 0, 2}, token{"\u4e00\u4e2a", 2, 4}, token{"\u4f38\u624b", 4, 6}, token{"\u4e0d\u89c1", 6, 8}, token{"\u4e94\u6307", 8, 10}, token{"\u4f38\u624b\u4e0d\u89c1\u4e94\u6307", 4, 10}, token{"\u7684", 10, 11}, token{"\u9ed1\u591c", 11, 13}, token{"\u3002", 13, 14}, token{"\u6211", 14, 15}, token{"\u53eb", 15, 16}, token{"\u609f\u7a7a", 17, 19}, token{"\u5b59\u609f\u7a7a", 16, 19}, token{"\uff0c", 19, 20}, token{"\u6211", 20, 21}, token{"\u7231", 21, 22}, token{"\u5317\u4eac", 22, 24}, token{"\uff0c", 24, 25}, token{"\u6211", 25, 26}, token{"\u7231", 26, 27}, token{"Python", 27, 33}, token{"\u548c", 33, 34}, token{"C++", 34, 37}, token{"\u3002", 37, 38}}, - []token{token{"\u6211", 0, 1}, token{"\u4e0d", 1, 2}, token{"\u559c\u6b22", 2, 4}, token{"\u65e5\u672c", 4, 6}, token{"\u548c\u670d", 6, 8}, token{"\u3002", 8, 9}}, - []token{token{"\u96f7\u7334", 0, 2}, token{"\u56de\u5f52", 2, 4}, token{"\u4eba\u95f4", 4, 6}, token{"\u3002", 6, 7}}, - []token{token{"\u5de5\u4fe1\u5904", 0, 3}, token{"\u5e72\u4e8b", 4, 6}, token{"\u5973\u5e72\u4e8b", 3, 6}, token{"\u6bcf\u6708", 6, 8}, token{"\u7ecf\u8fc7", 8, 10}, token{"\u4e0b\u5c5e", 10, 12}, token{"\u79d1\u5ba4", 12, 14}, token{"\u90fd", 14, 15}, token{"\u8981", 
15, 16}, token{"\u4eb2\u53e3", 16, 18}, token{"\u4ea4\u4ee3", 18, 20}, token{"24", 20, 22}, token{"\u53e3", 22, 23}, token{"\u4ea4\u6362", 23, 25}, token{"\u6362\u673a", 24, 26}, token{"\u4ea4\u6362\u673a", 23, 26}, token{"\u7b49", 26, 27}, token{"\u6280\u672f", 27, 29}, token{"\u6280\u672f\u6027", 27, 30}, token{"\u5668\u4ef6", 30, 32}, token{"\u7684", 32, 33}, token{"\u5b89\u88c5", 33, 35}, token{"\u5de5\u4f5c", 35, 37}}, - []token{token{"\u6211", 0, 1}, token{"\u9700\u8981", 1, 3}, token{"\u5ec9\u79df", 3, 5}, token{"\u79df\u623f", 4, 6}, token{"\u5ec9\u79df\u623f", 3, 6}}, - []token{token{"\u6c38\u548c", 0, 2}, token{"\u670d\u88c5", 2, 4}, token{"\u9970\u54c1", 4, 6}, token{"\u6709\u9650", 6, 8}, token{"\u516c\u53f8", 8, 10}, token{"\u6709\u9650\u516c\u53f8", 6, 10}}, - []token{token{"\u6211", 0, 1}, token{"\u7231", 1, 2}, token{"\u5317\u4eac", 2, 4}, token{"\u5929\u5b89", 4, 6}, token{"\u5929\u5b89\u95e8", 4, 7}}, - []token{token{"abc", 0, 3}}, - []token{token{"\u9690", 0, 1}, token{"\u53ef\u592b", 3, 5}, token{"\u9a6c\u5c14\u53ef", 1, 4}, token{"\u9a6c\u5c14\u53ef\u592b", 1, 5}}, - []token{token{"\u96f7\u7334", 0, 2}, token{"\u662f", 2, 3}, token{"\u4e2a", 3, 4}, token{"\u597d", 4, 5}, token{"\u7f51\u7ad9", 5, 7}}, - []token{token{"\u201c", 0, 1}, token{"Microsoft", 1, 10}, token{"\u201d", 10, 11}, token{"\u4e00\u8bcd", 11, 13}, token{"\u7531", 13, 14}, token{"\u201c", 14, 15}, token{"MICROcomputer", 15, 28}, token{"\uff08", 28, 29}, token{"\u5fae\u578b", 29, 31}, token{"\u8ba1\u7b97", 31, 33}, token{"\u7b97\u673a", 32, 34}, token{"\u8ba1\u7b97\u673a", 31, 34}, token{"\uff09", 34, 35}, token{"\u201d", 35, 36}, token{"\u548c", 36, 37}, token{"\u201c", 37, 38}, token{"SOFTware", 38, 46}, token{"\uff08", 46, 47}, token{"\u8f6f\u4ef6", 47, 49}, token{"\uff09", 49, 50}, token{"\u201d", 50, 51}, token{"\u4e24", 51, 52}, token{"\u90e8\u5206", 52, 54}, token{"\u7ec4\u6210", 54, 56}}, - []token{token{"\u8349\u6ce5\u9a6c", 0, 3}, token{"\u548c", 3, 4}, 
token{"\u6b3a\u5b9e", 4, 6}, token{"\u9a6c", 6, 7}, token{"\u662f", 7, 8}, token{"\u4eca\u5e74", 8, 10}, token{"\u7684", 10, 11}, token{"\u6d41\u884c", 11, 13}, token{"\u8bcd\u6c47", 13, 15}}, - []token{token{"\u4f0a\u85e4", 0, 2}, token{"\u6d0b\u534e\u5802", 2, 5}, token{"\u603b\u5e9c", 5, 7}, token{"\u5e97", 7, 8}}, - []token{token{"\u4e2d\u56fd", 0, 2}, token{"\u79d1\u5b66", 2, 4}, token{"\u5b66\u9662", 3, 5}, token{"\u8ba1\u7b97", 5, 7}, token{"\u6280\u672f", 7, 9}, token{"\u7814\u7a76", 9, 11}, token{"\u79d1\u5b66\u9662", 2, 5}, token{"\u7814\u7a76\u6240", 9, 12}, token{"\u4e2d\u56fd\u79d1\u5b66\u9662\u8ba1\u7b97\u6280\u672f\u7814\u7a76\u6240", 0, 12}}, - []token{token{"\u7f57\u5bc6\u6b27", 0, 3}, token{"\u4e0e", 3, 4}, token{"\u6731\u4e3d\u53f6", 4, 7}}, - []token{token{"\u6211", 0, 1}, token{"\u8d2d\u4e70", 1, 3}, token{"\u4e86", 3, 4}, token{"\u9053\u5177", 4, 6}, token{"\u548c", 6, 7}, token{"\u670d\u88c5", 7, 9}}, - []token{token{"PS", 0, 2}, token{":", 2, 3}, token{" ", 3, 4}, token{"\u6211", 4, 5}, token{"\u89c9\u5f97", 5, 7}, token{"\u5f00\u6e90", 7, 9}, token{"\u6709", 9, 10}, token{"\u4e00\u4e2a", 10, 12}, token{"\u597d\u5904", 12, 14}, token{"\uff0c", 14, 15}, token{"\u5c31\u662f", 15, 17}, token{"\u80fd\u591f", 17, 19}, token{"\u6566\u4fc3", 19, 21}, token{"\u81ea\u5df1", 21, 23}, token{"\u4e0d\u65ad", 23, 25}, token{"\u6539\u8fdb", 25, 27}, token{"\u4e0d\u65ad\u6539\u8fdb", 23, 27}, token{"\uff0c", 27, 28}, token{"\u907f\u514d", 28, 30}, token{"\u655e\u5e1a", 30, 32}, token{"\u81ea\u73cd", 32, 34}}, - []token{token{"\u6e56\u5317", 0, 2}, token{"\u6e56\u5317\u7701", 0, 3}, token{"\u77f3\u9996", 3, 5}, token{"\u77f3\u9996\u5e02", 3, 6}}, - []token{token{"\u6e56\u5317", 0, 2}, token{"\u6e56\u5317\u7701", 0, 3}, token{"\u5341\u5830", 3, 5}, token{"\u5341\u5830\u5e02", 3, 6}}, - []token{token{"\u7ecf\u7406", 1, 3}, token{"\u603b\u7ecf\u7406", 0, 3}, token{"\u5b8c\u6210", 3, 5}, token{"\u4e86", 5, 6}, token{"\u8fd9\u4ef6", 6, 8}, token{"\u4e8b\u60c5", 
8, 10}}, - []token{token{"\u7535\u8111", 0, 2}, token{"\u4fee\u597d", 2, 4}, token{"\u4e86", 4, 5}}, - []token{token{"\u505a\u597d", 0, 2}, token{"\u4e86", 2, 3}, token{"\u8fd9\u4ef6", 3, 5}, token{"\u4e8b\u60c5", 5, 7}, token{"\u5c31", 7, 8}, token{"\u4e00\u4e86\u767e\u4e86", 8, 12}, token{"\u4e86", 12, 13}}, - []token{token{"\u4eba\u4eec", 0, 2}, token{"\u5ba1\u7f8e", 2, 4}, token{"\u7684", 4, 5}, token{"\u89c2\u70b9", 5, 7}, token{"\u662f", 7, 8}, token{"\u4e0d\u540c", 8, 10}, token{"\u7684", 10, 11}}, - []token{token{"\u6211\u4eec", 0, 2}, token{"\u4e70", 2, 3}, token{"\u4e86", 3, 4}, token{"\u4e00\u4e2a", 4, 6}, token{"\u7f8e\u7684", 6, 8}, token{"\u7a7a\u8c03", 8, 10}}, - []token{token{"\u7ebf\u7a0b", 0, 2}, token{"\u521d\u59cb", 2, 4}, token{"\u521d\u59cb\u5316", 2, 5}, token{"\u65f6", 5, 6}, token{"\u6211\u4eec", 6, 8}, token{"\u8981", 8, 9}, token{"\u6ce8\u610f", 9, 11}}, - []token{token{"\u4e00\u4e2a", 0, 2}, token{"\u5206\u5b50", 2, 4}, token{"\u662f", 4, 5}, token{"\u7531", 5, 6}, token{"\u597d\u591a", 6, 8}, token{"\u539f\u5b50", 8, 10}, token{"\u7ec4\u7ec7", 10, 12}, token{"\u6210", 12, 13}, token{"\u7684", 13, 14}}, - []token{token{"\u795d", 0, 1}, token{"\u4f60", 1, 2}, token{"\u9a6c\u5230\u529f\u6210", 2, 6}}, - []token{token{"\u4ed6", 0, 1}, token{"\u6389", 1, 2}, token{"\u8fdb", 2, 3}, token{"\u4e86", 3, 4}, token{"\u65e0\u5e95", 4, 6}, token{"\u65e0\u5e95\u6d1e", 4, 7}, token{"\u91cc", 7, 8}}, - []token{token{"\u4e2d\u56fd", 0, 2}, token{"\u7684", 2, 3}, token{"\u9996\u90fd", 3, 5}, token{"\u662f", 5, 6}, token{"\u5317\u4eac", 6, 8}}, - []token{token{"\u5b59\u541b\u610f", 0, 3}}, - []token{token{"\u5916\u4ea4", 0, 2}, token{"\u5916\u4ea4\u90e8", 0, 3}, token{"\u53d1\u8a00", 3, 5}, token{"\u53d1\u8a00\u4eba", 3, 6}, token{"\u9a6c\u671d\u65ed", 6, 9}}, - []token{token{"\u9886\u5bfc", 0, 2}, token{"\u9886\u5bfc\u4eba", 0, 3}, token{"\u4f1a\u8bae", 3, 5}, token{"\u548c", 5, 6}, token{"\u7b2c\u56db", 6, 8}, token{"\u56db\u5c4a", 7, 9}, 
token{"\u7b2c\u56db\u5c4a", 6, 9}, token{"\u4e1c\u4e9a", 9, 11}, token{"\u5cf0\u4f1a", 11, 13}}, - []token{token{"\u5728", 0, 1}, token{"\u8fc7\u53bb", 1, 3}, token{"\u7684", 3, 4}, token{"\u8fd9", 4, 5}, token{"\u4e94\u5e74", 5, 7}}, - []token{token{"\u8fd8", 0, 1}, token{"\u9700\u8981", 1, 3}, token{"\u5f88\u957f", 3, 5}, token{"\u7684", 5, 6}, token{"\u8def", 6, 7}, token{"\u8981", 7, 8}, token{"\u8d70", 8, 9}}, - []token{token{"60", 0, 2}, token{"\u5468\u5e74", 2, 4}, token{"\u9996\u90fd", 4, 6}, token{"\u9605\u5175", 6, 8}}, - []token{token{"\u4f60\u597d", 0, 2}, token{"\u4eba\u4eec", 2, 4}, token{"\u5ba1\u7f8e", 4, 6}, token{"\u7684", 6, 7}, token{"\u89c2\u70b9", 7, 9}, token{"\u662f", 9, 10}, token{"\u4e0d\u540c", 10, 12}, token{"\u7684", 12, 13}}, - []token{token{"\u4e70", 0, 1}, token{"\u6c34\u679c", 1, 3}, token{"\u7136\u540e", 3, 5}, token{"\u6765", 5, 6}, token{"\u4e16\u535a", 6, 8}, token{"\u535a\u56ed", 7, 9}, token{"\u4e16\u535a\u56ed", 6, 9}}, - []token{token{"\u4e70", 0, 1}, token{"\u6c34\u679c", 1, 3}, token{"\u7136\u540e", 3, 5}, token{"\u53bb", 5, 6}, token{"\u4e16\u535a", 6, 8}, token{"\u535a\u56ed", 7, 9}, token{"\u4e16\u535a\u56ed", 6, 9}}, - []token{token{"\u4f46\u662f", 0, 2}, token{"\u540e\u6765", 2, 4}, token{"\u6211", 4, 5}, token{"\u624d", 5, 6}, token{"\u77e5\u9053", 6, 8}, token{"\u4f60", 8, 9}, token{"\u662f", 9, 10}, token{"\u5bf9", 10, 11}, token{"\u7684", 11, 12}}, - []token{token{"\u5b58\u5728", 0, 2}, token{"\u5373", 2, 3}, token{"\u5408\u7406", 3, 5}}, - []token{token{"\u7684", 0, 1}, token{"\u7684", 1, 2}, token{"\u7684", 2, 3}, token{"\u7684", 3, 4}, token{"\u7684", 4, 5}, token{"\u5728", 5, 6}, token{"\u7684", 6, 7}, token{"\u7684", 7, 8}, token{"\u7684", 8, 9}, token{"\u7684", 9, 10}, token{"\u5c31", 10, 11}, token{"\u4ee5", 11, 12}, token{"\u548c", 12, 13}, token{"\u548c", 13, 14}, token{"\u548c", 14, 15}}, - []token{token{"I", 0, 1}, token{" ", 1, 2}, token{"love", 2, 6}, token{"\u4f60", 6, 7}, token{"\uff0c", 7, 8}, 
token{"\u4e0d\u4ee5", 8, 10}, token{"\u4ee5\u4e3a", 9, 11}, token{"\u4e0d\u4ee5\u4e3a\u803b", 8, 12}, token{"\uff0c", 12, 13}, token{"\u53cd", 13, 14}, token{"\u4ee5\u4e3a", 14, 16}, token{"rong", 16, 20}}, - []token{token{"\u56e0", 0, 1}}, - []token{}, - []token{token{"hello", 0, 5}, token{"\u4f60\u597d", 5, 7}, token{"\u4eba\u4eec", 7, 9}, token{"\u5ba1\u7f8e", 9, 11}, token{"\u7684", 11, 12}, token{"\u89c2\u70b9", 12, 14}, token{"\u662f", 14, 15}, token{"\u4e0d\u540c", 15, 17}, token{"\u7684", 17, 18}}, - []token{token{"\u5f88", 0, 1}, token{"\u597d", 1, 2}, token{"\u4f46", 2, 3}, token{"\u4e3b\u8981", 3, 5}, token{"\u662f", 5, 6}, token{"\u57fa\u4e8e", 6, 8}, token{"\u7f51\u9875", 8, 10}, token{"\u5f62\u5f0f", 10, 12}}, - []token{token{"hello", 0, 5}, token{"\u4f60\u597d", 5, 7}, token{"\u4eba\u4eec", 7, 9}, token{"\u5ba1\u7f8e", 9, 11}, token{"\u7684", 11, 12}, token{"\u89c2\u70b9", 12, 14}, token{"\u662f", 14, 15}, token{"\u4e0d\u540c", 15, 17}, token{"\u7684", 17, 18}}, - []token{token{"\u4ec0\u4e48", 1, 3}, token{"\u4e3a\u4ec0\u4e48", 0, 3}, token{"\u6211", 3, 4}, token{"\u4e0d\u80fd", 4, 6}, token{"\u62e5\u6709", 6, 8}, token{"\u60f3\u8981", 8, 10}, token{"\u7684", 10, 11}, token{"\u751f\u6d3b", 11, 13}}, - []token{token{"\u540e\u6765", 0, 2}, token{"\u6211", 2, 3}, token{"\u624d", 3, 4}}, - []token{token{"\u6b64\u6b21", 0, 2}, token{"\u6765", 2, 3}, token{"\u4e2d\u56fd", 3, 5}, token{"\u662f", 5, 6}, token{"\u4e3a\u4e86", 6, 8}}, - []token{token{"\u4f7f\u7528", 0, 2}, token{"\u4e86", 2, 3}, token{"\u5b83", 3, 4}, token{"\u5c31", 4, 5}, token{"\u53ef\u4ee5", 5, 7}, token{"\u89e3\u51b3", 7, 9}, token{"\u4e00\u4e9b", 9, 11}, token{"\u95ee\u9898", 11, 13}}, - []token{token{",", 0, 1}, token{"\u4f7f\u7528", 1, 3}, token{"\u4e86", 3, 4}, token{"\u5b83", 4, 5}, token{"\u5c31", 5, 6}, token{"\u53ef\u4ee5", 6, 8}, token{"\u89e3\u51b3", 8, 10}, token{"\u4e00\u4e9b", 10, 12}, token{"\u95ee\u9898", 12, 14}}, - []token{token{"\u5176\u5b9e", 0, 2}, 
token{"\u4f7f\u7528", 2, 4}, token{"\u4e86", 4, 5}, token{"\u5b83", 5, 6}, token{"\u5c31", 6, 7}, token{"\u53ef\u4ee5", 7, 9}, token{"\u89e3\u51b3", 9, 11}, token{"\u4e00\u4e9b", 11, 13}, token{"\u95ee\u9898", 13, 15}}, - []token{token{"\u597d\u4eba", 0, 2}, token{"\u4f7f\u7528", 2, 4}, token{"\u4e86", 4, 5}, token{"\u5b83", 5, 6}, token{"\u5c31", 6, 7}, token{"\u53ef\u4ee5", 7, 9}, token{"\u89e3\u51b3", 9, 11}, token{"\u4e00\u4e9b", 11, 13}, token{"\u95ee\u9898", 13, 15}}, - []token{token{"\u56e0\u4e3a", 1, 3}, token{"\u662f\u56e0\u4e3a", 0, 3}, token{"\u548c", 3, 4}, token{"\u56fd\u5bb6", 4, 6}}, - []token{token{"\u8001\u5e74", 0, 2}, token{"\u641c\u7d22", 2, 4}, token{"\u8fd8", 4, 5}, token{"\u652f\u6301", 5, 7}}, - []token{token{"\u5e72\u8106", 0, 2}, token{"\u5c31", 2, 3}, token{"\u628a", 3, 4}, token{"\u90a3\u90e8", 4, 6}, token{"\u8499\u4eba", 6, 8}, token{"\u7684", 8, 9}, token{"\u95f2\u6cd5", 9, 11}, token{"\u7ed9", 11, 12}, token{"\u5e9f", 12, 13}, token{"\u4e86", 13, 14}, token{"\u62c9\u5012", 14, 16}, token{"\uff01", 16, 17}, token{"RT", 17, 19}, token{" ", 19, 20}, token{"@", 20, 21}, token{"laoshipukong", 21, 33}, token{" ", 33, 34}, token{":", 34, 35}, token{" ", 35, 36}, token{"27", 36, 38}, token{"\u65e5", 38, 39}, token{"\uff0c", 39, 40}, token{"\u5168\u56fd", 40, 42}, token{"\u56fd\u4eba", 41, 43}, token{"\u4eba\u5927", 42, 44}, token{"\u5e38\u59d4", 44, 46}, token{"\u59d4\u4f1a", 45, 47}, token{"\u5e38\u59d4\u4f1a", 44, 47}, token{"\u5168\u56fd\u4eba\u5927\u5e38\u59d4\u4f1a", 40, 47}, token{"\u7b2c\u4e09", 47, 49}, token{"\u4e09\u6b21", 48, 50}, token{"\u7b2c\u4e09\u6b21", 47, 50}, token{"\u5ba1\u8bae", 50, 52}, token{"\u4fb5\u6743", 52, 54}, token{"\u8d23\u4efb", 54, 56}, token{"\u8d23\u4efb\u6cd5", 54, 57}, token{"\u8349\u6848", 57, 59}, token{"\uff0c", 59, 60}, token{"\u5220\u9664", 60, 62}, token{"\u4e86", 62, 63}, token{"\u6709\u5173", 63, 65}, token{"\u533b\u7597", 65, 67}, token{"\u635f\u5bb3", 67, 69}, token{"\u8d23\u4efb", 69, 71}, 
token{"\u201c", 71, 72}, token{"\u4e3e\u8bc1", 72, 74}, token{"\u5012\u7f6e", 74, 76}, token{"\u201d", 76, 77}, token{"\u7684", 77, 78}, token{"\u89c4\u5b9a", 78, 80}, token{"\u3002", 80, 81}, token{"\u5728", 81, 82}, token{"\u533b\u60a3", 82, 84}, token{"\u7ea0\u7eb7", 84, 86}, token{"\u4e2d\u672c", 86, 88}, token{"\u5df2", 88, 89}, token{"\u5904\u4e8e", 89, 91}, token{"\u5f31\u52bf", 91, 93}, token{"\u5730\u4f4d", 93, 95}, token{"\u7684", 95, 96}, token{"\u6d88\u8d39", 96, 98}, token{"\u6d88\u8d39\u8005", 96, 99}, token{"\u7531\u6b64", 99, 101}, token{"\u5c06", 101, 102}, token{"\u9677\u5165", 102, 104}, token{"\u4e0d\u590d", 106, 108}, token{"\u4e07\u52ab\u4e0d\u590d", 104, 108}, token{"\u7684", 108, 109}, token{"\u5883\u5730", 109, 111}, token{"\u3002", 111, 112}, token{" ", 112, 113}}, - []token{token{"\u5927", 0, 1}}, - []token{}, - []token{token{"\u4ed6", 0, 1}, token{"\u8bf4", 1, 2}, token{"\u7684", 2, 3}, token{"\u786e\u5b9e", 3, 5}, token{"\u5728\u7406", 5, 7}}, - []token{token{"\u957f\u6625", 0, 2}, token{"\u5e02\u957f", 2, 4}, token{"\u6625\u8282", 4, 6}, token{"\u8bb2\u8bdd", 6, 8}}, - []token{token{"\u7ed3\u5a5a", 0, 2}, token{"\u7684", 2, 3}, token{"\u548c", 3, 4}, token{"\u5c1a\u672a", 4, 6}, token{"\u7ed3\u5a5a", 6, 8}, token{"\u7684", 8, 9}}, - []token{token{"\u7ed3\u5408", 0, 2}, token{"\u6210", 2, 3}, token{"\u5206\u5b50", 3, 5}, token{"\u65f6", 5, 6}}, - []token{token{"\u65c5\u6e38", 0, 2}, token{"\u548c", 2, 3}, token{"\u670d\u52a1", 3, 5}, token{"\u662f", 5, 6}, token{"\u6700\u597d", 6, 8}, token{"\u7684", 8, 9}}, - []token{token{"\u8fd9\u4ef6", 0, 2}, token{"\u4e8b\u60c5", 2, 4}, token{"\u7684\u786e", 4, 6}, token{"\u662f", 6, 7}, token{"\u6211", 7, 8}, token{"\u7684", 8, 9}, token{"\u9519", 9, 10}}, - []token{token{"\u4f9b", 0, 1}, token{"\u5927\u5bb6", 1, 3}, token{"\u53c2\u8003", 3, 5}, token{"\u6307\u6b63", 5, 7}}, - []token{token{"\u54c8\u5c14", 0, 2}, token{"\u54c8\u5c14\u6ee8", 0, 3}, token{"\u653f\u5e9c", 3, 5}, token{"\u516c\u5e03", 
5, 7}, token{"\u584c\u6865", 7, 9}, token{"\u539f\u56e0", 9, 11}}, - []token{token{"\u6211", 0, 1}, token{"\u5728", 1, 2}, token{"\u673a\u573a", 2, 4}, token{"\u5165\u53e3", 4, 6}, token{"\u5165\u53e3\u5904", 4, 7}}, - []token{token{"\u90a2\u6c38\u81e3", 0, 3}, token{"\u6444\u5f71", 3, 5}, token{"\u62a5\u9053", 5, 7}}, - []token{token{"BP", 0, 2}, token{"\u795e\u7ecf", 2, 4}, token{"\u7f51\u7edc", 4, 6}, token{"\u795e\u7ecf\u7f51", 2, 5}, token{"\u795e\u7ecf\u7f51\u7edc", 2, 6}, token{"\u5982\u4f55", 6, 8}, token{"\u8bad\u7ec3", 8, 10}, token{"\u624d\u80fd", 10, 12}, token{"\u5728", 12, 13}, token{"\u5206\u7c7b", 13, 15}, token{"\u65f6", 15, 16}, token{"\u589e\u52a0", 16, 18}, token{"\u533a\u5206", 18, 20}, token{"\u5206\u5ea6", 19, 21}, token{"\u533a\u5206\u5ea6", 18, 21}, token{"\uff1f", 21, 22}}, - []token{token{"\u5357\u4eac", 0, 2}, token{"\u4eac\u5e02", 1, 3}, token{"\u5357\u4eac\u5e02", 0, 3}, token{"\u957f\u6c5f", 3, 5}, token{"\u5927\u6865", 5, 7}, token{"\u957f\u6c5f\u5927\u6865", 3, 7}}, - []token{token{"\u5e94", 0, 1}, token{"\u4e00\u4e9b", 1, 3}, token{"\u4f7f\u7528", 3, 5}, token{"\u7528\u8005", 4, 6}, token{"\u4f7f\u7528\u8005", 3, 6}, token{"\u7684", 6, 7}, token{"\u5efa\u8bae", 7, 9}, token{"\uff0c", 9, 10}, token{"\u4e5f", 10, 11}, token{"\u4e3a\u4e86", 11, 13}, token{"\u4fbf\u4e8e", 13, 15}, token{"\u5229\u7528", 15, 17}, token{"NiuTrans", 17, 25}, token{"\u7528\u4e8e", 25, 27}, token{"SMT", 27, 30}, token{"\u7814\u7a76", 30, 32}}, - []token{token{"\u957f\u6625", 0, 2}, token{"\u957f\u6625\u5e02", 0, 3}, token{"\u957f\u6625", 3, 5}, token{"\u836f\u5e97", 5, 7}}, - []token{token{"\u9093\u9896\u8d85", 0, 3}, token{"\u751f\u524d", 3, 5}, token{"\u6700", 5, 6}, token{"\u559c\u6b22", 6, 8}, token{"\u7684", 8, 9}, token{"\u8863\u670d", 9, 11}}, - []token{token{"\u9526\u6d9b", 1, 3}, token{"\u80e1\u9526\u6d9b", 0, 3}, token{"\u662f", 3, 4}, token{"\u70ed\u7231", 4, 6}, token{"\u4e16\u754c", 6, 8}, token{"\u548c\u5e73", 8, 10}, token{"\u7684", 10, 11}, 
token{"\u653f\u6cbb", 11, 13}, token{"\u653f\u6cbb\u5c40", 11, 14}, token{"\u5e38\u59d4", 14, 16}}, - []token{token{"\u7a0b\u5e8f", 0, 2}, token{"\u7a0b\u5e8f\u5458", 0, 3}, token{"\u795d", 3, 4}, token{"\u6d77\u6797", 4, 6}, token{"\u548c", 6, 7}, token{"\u6731\u4f1a\u9707", 7, 10}, token{"\u662f", 10, 11}, token{"\u5728", 11, 12}, token{"\u5b59\u5065", 12, 14}, token{"\u7684", 14, 15}, token{"\u5de6\u9762", 15, 17}, token{"\u548c", 17, 18}, token{"\u53f3\u9762", 18, 20}, token{",", 20, 21}, token{" ", 21, 22}, token{"\u8303\u51ef", 22, 24}, token{"\u5728", 24, 25}, token{"\u6700", 25, 26}, token{"\u53f3\u9762", 26, 28}, token{".", 28, 29}, token{"\u518d\u5f80", 29, 31}, token{"\u5de6", 31, 32}, token{"\u662f", 32, 33}, token{"\u674e\u677e\u6d2a", 33, 36}}, - []token{token{"\u4e00\u6b21", 0, 2}, token{"\u4e00\u6b21\u6027", 0, 3}, token{"\u4ea4", 3, 4}, token{"\u591a\u5c11", 4, 6}, token{"\u94b1", 6, 7}}, - []token{token{"\u4e24\u5757", 0, 2}, token{"\u4e94", 2, 3}, token{"\u4e00\u5957", 3, 5}, token{"\uff0c", 5, 6}, token{"\u4e09\u5757", 6, 8}, token{"\u516b", 8, 9}, token{"\u4e00\u65a4", 9, 11}, token{"\uff0c", 11, 12}, token{"\u56db\u5757", 12, 14}, token{"\u4e03", 14, 15}, token{"\u4e00\u672c", 15, 17}, token{"\uff0c", 17, 18}, token{"\u4e94\u5757", 18, 20}, token{"\u516d", 20, 21}, token{"\u4e00\u6761", 21, 23}}, - []token{token{"\u5c0f", 0, 1}, token{"\u548c\u5c1a", 1, 3}, token{"\u7559", 3, 4}, token{"\u4e86", 4, 5}, token{"\u4e00\u4e2a", 5, 7}, token{"\u50cf", 7, 8}, token{"\u5927", 8, 9}, token{"\u548c\u5c1a", 9, 11}, token{"\u4e00\u6837", 11, 13}, token{"\u7684", 13, 14}, token{"\u548c\u5c1a", 14, 16}, token{"\u548c\u5c1a\u5934", 14, 17}}, - []token{token{"\u6211", 0, 1}, token{"\u662f", 1, 2}, token{"\u4e2d\u534e", 2, 4}, token{"\u534e\u4eba", 3, 5}, token{"\u4eba\u6c11", 4, 6}, token{"\u5171\u548c", 6, 8}, token{"\u5171\u548c\u56fd", 6, 9}, token{"\u4e2d\u534e\u4eba\u6c11\u5171\u548c\u56fd", 2, 9}, token{"\u516c\u6c11", 9, 11}, token{";", 11, 12}, 
token{"\u6211", 12, 13}, token{"\u7238\u7238", 13, 15}, token{"\u662f", 15, 16}, token{"\u5171\u548c", 16, 18}, token{"\u5171\u548c\u515a", 16, 19}, token{"\u515a\u5458", 19, 21}, token{";", 21, 22}, token{" ", 22, 23}, token{"\u5730\u94c1", 23, 25}, token{"\u548c\u5e73", 25, 27}, token{"\u548c\u5e73\u95e8", 25, 28}, token{"\u7ad9", 28, 29}}, - []token{token{"\u5f20\u6653\u6885", 0, 3}, token{"\u53bb", 3, 4}, token{"\u4eba\u6c11", 4, 6}, token{"\u533b\u9662", 6, 8}, token{"\u505a", 8, 9}, token{"\u4e86", 9, 10}, token{"\u4e2a", 10, 11}, token{"B\u8d85", 11, 13}, token{"\u7136\u540e", 13, 15}, token{"\u53bb", 15, 16}, token{"\u4e70", 16, 17}, token{"\u4e86", 17, 18}, token{"\u4ef6", 18, 19}, token{"T\u6064", 19, 21}}, - []token{token{"AT&T", 0, 4}, token{"\u662f", 4, 5}, token{"\u4e00\u4ef6", 5, 7}, token{"\u4e0d\u9519", 7, 9}, token{"\u7684", 9, 10}, token{"\u516c\u53f8", 10, 12}, token{"\uff0c", 12, 13}, token{"\u7ed9", 13, 14}, token{"\u4f60", 14, 15}, token{"\u53d1", 15, 16}, token{"offer", 16, 21}, token{"\u4e86", 21, 22}, token{"\u5417", 22, 23}, token{"\uff1f", 23, 24}}, - []token{token{"C++", 0, 3}, token{"\u548c", 3, 4}, token{"c#", 4, 6}, token{"\u662f", 6, 7}, token{"\u4ec0\u4e48", 7, 9}, token{"\u5173\u7cfb", 9, 11}, token{"\uff1f", 11, 12}, token{"11", 12, 14}, token{"+", 14, 15}, token{"122", 15, 18}, token{"=", 18, 19}, token{"133", 19, 22}, token{"\uff0c", 22, 23}, token{"\u662f", 23, 24}, token{"\u5417", 24, 25}, token{"\uff1f", 25, 26}, token{"PI", 26, 28}, token{"=", 28, 29}, token{"3.14159", 29, 36}}, - []token{token{"\u4f60", 0, 1}, token{"\u8ba4\u8bc6", 1, 3}, token{"\u90a3\u4e2a", 3, 5}, token{"\u548c", 5, 6}, token{"\u4e3b\u5e2d", 6, 8}, token{"\u63e1\u624b", 8, 10}, token{"\u7684", 10, 11}, token{"\u7684\u54e5", 11, 13}, token{"\u5417", 13, 14}, token{"\uff1f", 14, 15}, token{"\u4ed6\u5f00", 15, 17}, token{"\u4e00\u8f86", 17, 19}, token{"\u9ed1\u8272", 19, 21}, token{"\u7684\u58eb", 21, 23}, token{"\u3002", 23, 24}}, - 
[]token{token{"\u67aa\u6746", 0, 2}, token{"\u6746\u5b50", 1, 3}, token{"\u67aa\u6746\u5b50", 0, 3}, token{"\u4e2d", 3, 4}, token{"\u51fa", 4, 5}, token{"\u653f\u6743", 5, 7}}, - []token{token{"\u5f20\u4e09\u98ce", 0, 3}, token{"\u540c\u5b66", 3, 5}, token{"\u8d70\u4e0a", 5, 7}, token{"\u4e86", 7, 8}, token{"\u5f52\u8def", 9, 11}, token{"\u4e0d\u5f52\u8def", 8, 11}}, - []token{token{"\u963fQ", 0, 2}, token{"\u8170\u95f4", 2, 4}, token{"\u6302", 4, 5}, token{"\u7740", 5, 6}, token{"BB\u673a", 6, 9}, token{"\u624b\u91cc", 9, 11}, token{"\u62ff", 11, 12}, token{"\u7740", 12, 13}, token{"\u5927\u54e5", 13, 15}, token{"\u5927\u54e5\u5927", 13, 16}, token{"\uff0c", 16, 17}, token{"\u8bf4", 17, 18}, token{"\uff1a", 18, 19}, token{"\u6211", 19, 20}, token{"\u4e00\u822c", 20, 22}, token{"\u5403\u996d", 22, 24}, token{"\u4e0d", 24, 25}, token{"AA\u5236", 25, 28}, token{"\u7684", 28, 29}, token{"\u3002", 29, 30}}, - []token{token{"\u5728", 0, 1}, token{"1\u53f7\u5e97", 1, 4}, token{"\u80fd", 4, 5}, token{"\u4e70", 5, 6}, token{"\u5230", 6, 7}, token{"\u5c0fS", 7, 9}, token{"\u548c", 9, 10}, token{"\u5927S", 10, 12}, token{"\u516b\u5366", 12, 14}, token{"\u7684", 14, 15}, token{"\u4e66", 15, 16}, token{"\u3002", 16, 17}}, - } - noHmmResult = [][]token{ - []token{token{"\u8fd9", 0, 1}, token{"\u662f", 1, 2}, token{"\u4e00\u4e2a", 2, 4}, token{"\u4f38\u624b\u4e0d\u89c1\u4e94\u6307", 4, 10}, token{"\u7684", 10, 11}, token{"\u9ed1\u591c", 11, 13}, token{"\u3002", 13, 14}, token{"\u6211", 14, 15}, token{"\u53eb", 15, 16}, token{"\u5b59\u609f\u7a7a", 16, 19}, token{"\uff0c", 19, 20}, token{"\u6211", 20, 21}, token{"\u7231", 21, 22}, token{"\u5317\u4eac", 22, 24}, token{"\uff0c", 24, 25}, token{"\u6211", 25, 26}, token{"\u7231", 26, 27}, token{"Python", 27, 33}, token{"\u548c", 33, 34}, token{"C++", 34, 37}, token{"\u3002", 37, 38}}, - []token{token{"\u6211", 0, 1}, token{"\u4e0d", 1, 2}, token{"\u559c\u6b22", 2, 4}, token{"\u65e5\u672c", 4, 6}, token{"\u548c\u670d", 6, 8}, 
token{"\u3002", 8, 9}}, - []token{token{"\u96f7\u7334", 0, 2}, token{"\u56de\u5f52", 2, 4}, token{"\u4eba\u95f4", 4, 6}, token{"\u3002", 6, 7}}, - []token{token{"\u5de5\u4fe1\u5904", 0, 3}, token{"\u5973\u5e72\u4e8b", 3, 6}, token{"\u6bcf\u6708", 6, 8}, token{"\u7ecf\u8fc7", 8, 10}, token{"\u4e0b\u5c5e", 10, 12}, token{"\u79d1\u5ba4", 12, 14}, token{"\u90fd", 14, 15}, token{"\u8981", 15, 16}, token{"\u4eb2\u53e3", 16, 18}, token{"\u4ea4\u4ee3", 18, 20}, token{"24", 20, 22}, token{"\u53e3", 22, 23}, token{"\u4ea4\u6362\u673a", 23, 26}, token{"\u7b49", 26, 27}, token{"\u6280\u672f\u6027", 27, 30}, token{"\u5668\u4ef6", 30, 32}, token{"\u7684", 32, 33}, token{"\u5b89\u88c5", 33, 35}, token{"\u5de5\u4f5c", 35, 37}}, - []token{token{"\u6211", 0, 1}, token{"\u9700\u8981", 1, 3}, token{"\u5ec9\u79df\u623f", 3, 6}}, - []token{token{"\u6c38\u548c", 0, 2}, token{"\u670d\u88c5", 2, 4}, token{"\u9970\u54c1", 4, 6}, token{"\u6709\u9650\u516c\u53f8", 6, 10}}, - []token{token{"\u6211", 0, 1}, token{"\u7231", 1, 2}, token{"\u5317\u4eac", 2, 4}, token{"\u5929\u5b89\u95e8", 4, 7}}, - []token{token{"abc", 0, 3}}, - []token{token{"\u9690", 0, 1}, token{"\u9a6c\u5c14\u53ef\u592b", 1, 5}}, - []token{token{"\u96f7\u7334", 0, 2}, token{"\u662f", 2, 3}, token{"\u4e2a", 3, 4}, token{"\u597d", 4, 5}, token{"\u7f51\u7ad9", 5, 7}}, - []token{token{"\u201c", 0, 1}, token{"Microsoft", 1, 10}, token{"\u201d", 10, 11}, token{"\u4e00", 11, 12}, token{"\u8bcd", 12, 13}, token{"\u7531", 13, 14}, token{"\u201c", 14, 15}, token{"MICROcomputer", 15, 28}, token{"\uff08", 28, 29}, token{"\u5fae\u578b", 29, 31}, token{"\u8ba1\u7b97\u673a", 31, 34}, token{"\uff09", 34, 35}, token{"\u201d", 35, 36}, token{"\u548c", 36, 37}, token{"\u201c", 37, 38}, token{"SOFTware", 38, 46}, token{"\uff08", 46, 47}, token{"\u8f6f\u4ef6", 47, 49}, token{"\uff09", 49, 50}, token{"\u201d", 50, 51}, token{"\u4e24", 51, 52}, token{"\u90e8\u5206", 52, 54}, token{"\u7ec4\u6210", 54, 56}}, - []token{token{"\u8349\u6ce5\u9a6c", 0, 
3}, token{"\u548c", 3, 4}, token{"\u6b3a", 4, 5}, token{"\u5b9e", 5, 6}, token{"\u9a6c", 6, 7}, token{"\u662f", 7, 8}, token{"\u4eca\u5e74", 8, 10}, token{"\u7684", 10, 11}, token{"\u6d41\u884c", 11, 13}, token{"\u8bcd\u6c47", 13, 15}}, - []token{token{"\u4f0a", 0, 1}, token{"\u85e4", 1, 2}, token{"\u6d0b\u534e\u5802", 2, 5}, token{"\u603b\u5e9c", 5, 7}, token{"\u5e97", 7, 8}}, - []token{token{"\u4e2d\u56fd\u79d1\u5b66\u9662\u8ba1\u7b97\u6280\u672f\u7814\u7a76\u6240", 0, 12}}, - []token{token{"\u7f57\u5bc6\u6b27", 0, 3}, token{"\u4e0e", 3, 4}, token{"\u6731\u4e3d\u53f6", 4, 7}}, - []token{token{"\u6211", 0, 1}, token{"\u8d2d\u4e70", 1, 3}, token{"\u4e86", 3, 4}, token{"\u9053\u5177", 4, 6}, token{"\u548c", 6, 7}, token{"\u670d\u88c5", 7, 9}}, - []token{token{"PS", 0, 2}, token{":", 2, 3}, token{" ", 3, 4}, token{"\u6211", 4, 5}, token{"\u89c9\u5f97", 5, 7}, token{"\u5f00\u6e90", 7, 9}, token{"\u6709", 9, 10}, token{"\u4e00\u4e2a", 10, 12}, token{"\u597d\u5904", 12, 14}, token{"\uff0c", 14, 15}, token{"\u5c31\u662f", 15, 17}, token{"\u80fd\u591f", 17, 19}, token{"\u6566\u4fc3", 19, 21}, token{"\u81ea\u5df1", 21, 23}, token{"\u4e0d\u65ad\u6539\u8fdb", 23, 27}, token{"\uff0c", 27, 28}, token{"\u907f\u514d", 28, 30}, token{"\u655e", 30, 31}, token{"\u5e1a", 31, 32}, token{"\u81ea\u73cd", 32, 34}}, - []token{token{"\u6e56\u5317\u7701", 0, 3}, token{"\u77f3\u9996\u5e02", 3, 6}}, - []token{token{"\u6e56\u5317\u7701", 0, 3}, token{"\u5341\u5830\u5e02", 3, 6}}, - []token{token{"\u603b\u7ecf\u7406", 0, 3}, token{"\u5b8c\u6210", 3, 5}, token{"\u4e86", 5, 6}, token{"\u8fd9\u4ef6", 6, 8}, token{"\u4e8b\u60c5", 8, 10}}, - []token{token{"\u7535\u8111", 0, 2}, token{"\u4fee\u597d", 2, 4}, token{"\u4e86", 4, 5}}, - []token{token{"\u505a\u597d", 0, 2}, token{"\u4e86", 2, 3}, token{"\u8fd9\u4ef6", 3, 5}, token{"\u4e8b\u60c5", 5, 7}, token{"\u5c31", 7, 8}, token{"\u4e00\u4e86\u767e\u4e86", 8, 12}, token{"\u4e86", 12, 13}}, - []token{token{"\u4eba\u4eec", 0, 2}, token{"\u5ba1\u7f8e", 
2, 4}, token{"\u7684", 4, 5}, token{"\u89c2\u70b9", 5, 7}, token{"\u662f", 7, 8}, token{"\u4e0d\u540c", 8, 10}, token{"\u7684", 10, 11}}, - []token{token{"\u6211\u4eec", 0, 2}, token{"\u4e70", 2, 3}, token{"\u4e86", 3, 4}, token{"\u4e00\u4e2a", 4, 6}, token{"\u7f8e\u7684", 6, 8}, token{"\u7a7a\u8c03", 8, 10}}, - []token{token{"\u7ebf\u7a0b", 0, 2}, token{"\u521d\u59cb\u5316", 2, 5}, token{"\u65f6", 5, 6}, token{"\u6211\u4eec", 6, 8}, token{"\u8981", 8, 9}, token{"\u6ce8\u610f", 9, 11}}, - []token{token{"\u4e00\u4e2a", 0, 2}, token{"\u5206\u5b50", 2, 4}, token{"\u662f", 4, 5}, token{"\u7531", 5, 6}, token{"\u597d\u591a", 6, 8}, token{"\u539f\u5b50", 8, 10}, token{"\u7ec4\u7ec7", 10, 12}, token{"\u6210", 12, 13}, token{"\u7684", 13, 14}}, - []token{token{"\u795d", 0, 1}, token{"\u4f60", 1, 2}, token{"\u9a6c\u5230\u529f\u6210", 2, 6}}, - []token{token{"\u4ed6", 0, 1}, token{"\u6389", 1, 2}, token{"\u8fdb", 2, 3}, token{"\u4e86", 3, 4}, token{"\u65e0\u5e95\u6d1e", 4, 7}, token{"\u91cc", 7, 8}}, - []token{token{"\u4e2d\u56fd", 0, 2}, token{"\u7684", 2, 3}, token{"\u9996\u90fd", 3, 5}, token{"\u662f", 5, 6}, token{"\u5317\u4eac", 6, 8}}, - []token{token{"\u5b59", 0, 1}, token{"\u541b", 1, 2}, token{"\u610f", 2, 3}}, - []token{token{"\u5916\u4ea4\u90e8", 0, 3}, token{"\u53d1\u8a00\u4eba", 3, 6}, token{"\u9a6c\u671d\u65ed", 6, 9}}, - []token{token{"\u9886\u5bfc\u4eba", 0, 3}, token{"\u4f1a\u8bae", 3, 5}, token{"\u548c", 5, 6}, token{"\u7b2c\u56db\u5c4a", 6, 9}, token{"\u4e1c\u4e9a", 9, 11}, token{"\u5cf0\u4f1a", 11, 13}}, - []token{token{"\u5728", 0, 1}, token{"\u8fc7\u53bb", 1, 3}, token{"\u7684", 3, 4}, token{"\u8fd9", 4, 5}, token{"\u4e94\u5e74", 5, 7}}, - []token{token{"\u8fd8", 0, 1}, token{"\u9700\u8981", 1, 3}, token{"\u5f88", 3, 4}, token{"\u957f", 4, 5}, token{"\u7684", 5, 6}, token{"\u8def", 6, 7}, token{"\u8981", 7, 8}, token{"\u8d70", 8, 9}}, - []token{token{"60", 0, 2}, token{"\u5468\u5e74", 2, 4}, token{"\u9996\u90fd", 4, 6}, token{"\u9605\u5175", 6, 8}}, - 
[]token{token{"\u4f60\u597d", 0, 2}, token{"\u4eba\u4eec", 2, 4}, token{"\u5ba1\u7f8e", 4, 6}, token{"\u7684", 6, 7}, token{"\u89c2\u70b9", 7, 9}, token{"\u662f", 9, 10}, token{"\u4e0d\u540c", 10, 12}, token{"\u7684", 12, 13}}, - []token{token{"\u4e70", 0, 1}, token{"\u6c34\u679c", 1, 3}, token{"\u7136\u540e", 3, 5}, token{"\u6765", 5, 6}, token{"\u4e16\u535a\u56ed", 6, 9}}, - []token{token{"\u4e70", 0, 1}, token{"\u6c34\u679c", 1, 3}, token{"\u7136\u540e", 3, 5}, token{"\u53bb", 5, 6}, token{"\u4e16\u535a\u56ed", 6, 9}}, - []token{token{"\u4f46\u662f", 0, 2}, token{"\u540e\u6765", 2, 4}, token{"\u6211", 4, 5}, token{"\u624d", 5, 6}, token{"\u77e5\u9053", 6, 8}, token{"\u4f60", 8, 9}, token{"\u662f", 9, 10}, token{"\u5bf9", 10, 11}, token{"\u7684", 11, 12}}, - []token{token{"\u5b58\u5728", 0, 2}, token{"\u5373", 2, 3}, token{"\u5408\u7406", 3, 5}}, - []token{token{"\u7684", 0, 1}, token{"\u7684", 1, 2}, token{"\u7684", 2, 3}, token{"\u7684", 3, 4}, token{"\u7684", 4, 5}, token{"\u5728", 5, 6}, token{"\u7684", 6, 7}, token{"\u7684", 7, 8}, token{"\u7684", 8, 9}, token{"\u7684", 9, 10}, token{"\u5c31", 10, 11}, token{"\u4ee5", 11, 12}, token{"\u548c", 12, 13}, token{"\u548c", 13, 14}, token{"\u548c", 14, 15}}, - []token{token{"I", 0, 1}, token{" ", 1, 2}, token{"love", 2, 6}, token{"\u4f60", 6, 7}, token{"\uff0c", 7, 8}, token{"\u4e0d\u4ee5\u4e3a\u803b", 8, 12}, token{"\uff0c", 12, 13}, token{"\u53cd", 13, 14}, token{"\u4ee5\u4e3a", 14, 16}, token{"rong", 16, 20}}, - []token{token{"\u56e0", 0, 1}}, - []token{}, - []token{token{"hello", 0, 5}, token{"\u4f60\u597d", 5, 7}, token{"\u4eba\u4eec", 7, 9}, token{"\u5ba1\u7f8e", 9, 11}, token{"\u7684", 11, 12}, token{"\u89c2\u70b9", 12, 14}, token{"\u662f", 14, 15}, token{"\u4e0d\u540c", 15, 17}, token{"\u7684", 17, 18}}, - []token{token{"\u5f88", 0, 1}, token{"\u597d", 1, 2}, token{"\u4f46", 2, 3}, token{"\u4e3b\u8981", 3, 5}, token{"\u662f", 5, 6}, token{"\u57fa\u4e8e", 6, 8}, token{"\u7f51\u9875", 8, 10}, 
token{"\u5f62\u5f0f", 10, 12}}, - []token{token{"hello", 0, 5}, token{"\u4f60\u597d", 5, 7}, token{"\u4eba\u4eec", 7, 9}, token{"\u5ba1\u7f8e", 9, 11}, token{"\u7684", 11, 12}, token{"\u89c2\u70b9", 12, 14}, token{"\u662f", 14, 15}, token{"\u4e0d\u540c", 15, 17}, token{"\u7684", 17, 18}}, - []token{token{"\u4e3a\u4ec0\u4e48", 0, 3}, token{"\u6211", 3, 4}, token{"\u4e0d\u80fd", 4, 6}, token{"\u62e5\u6709", 6, 8}, token{"\u60f3\u8981", 8, 10}, token{"\u7684", 10, 11}, token{"\u751f\u6d3b", 11, 13}}, - []token{token{"\u540e\u6765", 0, 2}, token{"\u6211", 2, 3}, token{"\u624d", 3, 4}}, - []token{token{"\u6b64\u6b21", 0, 2}, token{"\u6765", 2, 3}, token{"\u4e2d\u56fd", 3, 5}, token{"\u662f", 5, 6}, token{"\u4e3a\u4e86", 6, 8}}, - []token{token{"\u4f7f\u7528", 0, 2}, token{"\u4e86", 2, 3}, token{"\u5b83", 3, 4}, token{"\u5c31", 4, 5}, token{"\u53ef\u4ee5", 5, 7}, token{"\u89e3\u51b3", 7, 9}, token{"\u4e00\u4e9b", 9, 11}, token{"\u95ee\u9898", 11, 13}}, - []token{token{",", 0, 1}, token{"\u4f7f\u7528", 1, 3}, token{"\u4e86", 3, 4}, token{"\u5b83", 4, 5}, token{"\u5c31", 5, 6}, token{"\u53ef\u4ee5", 6, 8}, token{"\u89e3\u51b3", 8, 10}, token{"\u4e00\u4e9b", 10, 12}, token{"\u95ee\u9898", 12, 14}}, - []token{token{"\u5176\u5b9e", 0, 2}, token{"\u4f7f\u7528", 2, 4}, token{"\u4e86", 4, 5}, token{"\u5b83", 5, 6}, token{"\u5c31", 6, 7}, token{"\u53ef\u4ee5", 7, 9}, token{"\u89e3\u51b3", 9, 11}, token{"\u4e00\u4e9b", 11, 13}, token{"\u95ee\u9898", 13, 15}}, - []token{token{"\u597d\u4eba", 0, 2}, token{"\u4f7f\u7528", 2, 4}, token{"\u4e86", 4, 5}, token{"\u5b83", 5, 6}, token{"\u5c31", 6, 7}, token{"\u53ef\u4ee5", 7, 9}, token{"\u89e3\u51b3", 9, 11}, token{"\u4e00\u4e9b", 11, 13}, token{"\u95ee\u9898", 13, 15}}, - []token{token{"\u662f\u56e0\u4e3a", 0, 3}, token{"\u548c", 3, 4}, token{"\u56fd\u5bb6", 4, 6}}, - []token{token{"\u8001\u5e74", 0, 2}, token{"\u641c\u7d22", 2, 4}, token{"\u8fd8", 4, 5}, token{"\u652f\u6301", 5, 7}}, - []token{token{"\u5e72\u8106", 0, 2}, 
token{"\u5c31", 2, 3}, token{"\u628a", 3, 4}, token{"\u90a3", 4, 5}, token{"\u90e8", 5, 6}, token{"\u8499", 6, 7}, token{"\u4eba", 7, 8}, token{"\u7684", 8, 9}, token{"\u95f2", 9, 10}, token{"\u6cd5", 10, 11}, token{"\u7ed9", 11, 12}, token{"\u5e9f", 12, 13}, token{"\u4e86", 13, 14}, token{"\u62c9\u5012", 14, 16}, token{"\uff01", 16, 17}, token{"RT", 17, 19}, token{" ", 19, 20}, token{"@", 20, 21}, token{"laoshipukong", 21, 33}, token{" ", 33, 34}, token{":", 34, 35}, token{" ", 35, 36}, token{"27", 36, 38}, token{"\u65e5", 38, 39}, token{"\uff0c", 39, 40}, token{"\u5168\u56fd\u4eba\u5927\u5e38\u59d4\u4f1a", 40, 47}, token{"\u7b2c\u4e09\u6b21", 47, 50}, token{"\u5ba1\u8bae", 50, 52}, token{"\u4fb5\u6743", 52, 54}, token{"\u8d23\u4efb\u6cd5", 54, 57}, token{"\u8349\u6848", 57, 59}, token{"\uff0c", 59, 60}, token{"\u5220\u9664", 60, 62}, token{"\u4e86", 62, 63}, token{"\u6709\u5173", 63, 65}, token{"\u533b\u7597", 65, 67}, token{"\u635f\u5bb3", 67, 69}, token{"\u8d23\u4efb", 69, 71}, token{"\u201c", 71, 72}, token{"\u4e3e\u8bc1", 72, 74}, token{"\u5012\u7f6e", 74, 76}, token{"\u201d", 76, 77}, token{"\u7684", 77, 78}, token{"\u89c4\u5b9a", 78, 80}, token{"\u3002", 80, 81}, token{"\u5728", 81, 82}, token{"\u533b\u60a3", 82, 84}, token{"\u7ea0\u7eb7", 84, 86}, token{"\u4e2d", 86, 87}, token{"\u672c", 87, 88}, token{"\u5df2", 88, 89}, token{"\u5904\u4e8e", 89, 91}, token{"\u5f31\u52bf", 91, 93}, token{"\u5730\u4f4d", 93, 95}, token{"\u7684", 95, 96}, token{"\u6d88\u8d39\u8005", 96, 99}, token{"\u7531\u6b64", 99, 101}, token{"\u5c06", 101, 102}, token{"\u9677\u5165", 102, 104}, token{"\u4e07\u52ab\u4e0d\u590d", 104, 108}, token{"\u7684", 108, 109}, token{"\u5883\u5730", 109, 111}, token{"\u3002", 111, 112}, token{" ", 112, 113}}, - []token{token{"\u5927", 0, 1}}, - []token{}, - []token{token{"\u4ed6", 0, 1}, token{"\u8bf4", 1, 2}, token{"\u7684", 2, 3}, token{"\u786e\u5b9e", 3, 5}, token{"\u5728", 5, 6}, token{"\u7406", 6, 7}}, - []token{token{"\u957f\u6625", 0, 2}, 
token{"\u5e02\u957f", 2, 4}, token{"\u6625\u8282", 4, 6}, token{"\u8bb2\u8bdd", 6, 8}}, - []token{token{"\u7ed3\u5a5a", 0, 2}, token{"\u7684", 2, 3}, token{"\u548c", 3, 4}, token{"\u5c1a\u672a", 4, 6}, token{"\u7ed3\u5a5a", 6, 8}, token{"\u7684", 8, 9}}, - []token{token{"\u7ed3\u5408", 0, 2}, token{"\u6210", 2, 3}, token{"\u5206\u5b50", 3, 5}, token{"\u65f6", 5, 6}}, - []token{token{"\u65c5\u6e38", 0, 2}, token{"\u548c", 2, 3}, token{"\u670d\u52a1", 3, 5}, token{"\u662f", 5, 6}, token{"\u6700\u597d", 6, 8}, token{"\u7684", 8, 9}}, - []token{token{"\u8fd9\u4ef6", 0, 2}, token{"\u4e8b\u60c5", 2, 4}, token{"\u7684\u786e", 4, 6}, token{"\u662f", 6, 7}, token{"\u6211", 7, 8}, token{"\u7684", 8, 9}, token{"\u9519", 9, 10}}, - []token{token{"\u4f9b", 0, 1}, token{"\u5927\u5bb6", 1, 3}, token{"\u53c2\u8003", 3, 5}, token{"\u6307\u6b63", 5, 7}}, - []token{token{"\u54c8\u5c14\u6ee8", 0, 3}, token{"\u653f\u5e9c", 3, 5}, token{"\u516c\u5e03", 5, 7}, token{"\u584c", 7, 8}, token{"\u6865", 8, 9}, token{"\u539f\u56e0", 9, 11}}, - []token{token{"\u6211", 0, 1}, token{"\u5728", 1, 2}, token{"\u673a\u573a", 2, 4}, token{"\u5165\u53e3\u5904", 4, 7}}, - []token{token{"\u90a2", 0, 1}, token{"\u6c38", 1, 2}, token{"\u81e3", 2, 3}, token{"\u6444\u5f71", 3, 5}, token{"\u62a5\u9053", 5, 7}}, - []token{token{"BP", 0, 2}, token{"\u795e\u7ecf\u7f51\u7edc", 2, 6}, token{"\u5982\u4f55", 6, 8}, token{"\u8bad\u7ec3", 8, 10}, token{"\u624d\u80fd", 10, 12}, token{"\u5728", 12, 13}, token{"\u5206\u7c7b", 13, 15}, token{"\u65f6", 15, 16}, token{"\u589e\u52a0", 16, 18}, token{"\u533a\u5206\u5ea6", 18, 21}, token{"\uff1f", 21, 22}}, - []token{token{"\u5357\u4eac\u5e02", 0, 3}, token{"\u957f\u6c5f\u5927\u6865", 3, 7}}, - []token{token{"\u5e94", 0, 1}, token{"\u4e00\u4e9b", 1, 3}, token{"\u4f7f\u7528\u8005", 3, 6}, token{"\u7684", 6, 7}, token{"\u5efa\u8bae", 7, 9}, token{"\uff0c", 9, 10}, token{"\u4e5f", 10, 11}, token{"\u4e3a\u4e86", 11, 13}, token{"\u4fbf\u4e8e", 13, 15}, token{"\u5229\u7528", 15, 
17}, token{"NiuTrans", 17, 25}, token{"\u7528\u4e8e", 25, 27}, token{"SMT", 27, 30}, token{"\u7814\u7a76", 30, 32}}, - []token{token{"\u957f\u6625\u5e02", 0, 3}, token{"\u957f\u6625", 3, 5}, token{"\u836f\u5e97", 5, 7}}, - []token{token{"\u9093\u9896\u8d85", 0, 3}, token{"\u751f\u524d", 3, 5}, token{"\u6700", 5, 6}, token{"\u559c\u6b22", 6, 8}, token{"\u7684", 8, 9}, token{"\u8863\u670d", 9, 11}}, - []token{token{"\u80e1\u9526\u6d9b", 0, 3}, token{"\u662f", 3, 4}, token{"\u70ed\u7231", 4, 6}, token{"\u4e16\u754c", 6, 8}, token{"\u548c\u5e73", 8, 10}, token{"\u7684", 10, 11}, token{"\u653f\u6cbb\u5c40", 11, 14}, token{"\u5e38\u59d4", 14, 16}}, - []token{token{"\u7a0b\u5e8f\u5458", 0, 3}, token{"\u795d", 3, 4}, token{"\u6d77\u6797", 4, 6}, token{"\u548c", 6, 7}, token{"\u6731", 7, 8}, token{"\u4f1a", 8, 9}, token{"\u9707", 9, 10}, token{"\u662f", 10, 11}, token{"\u5728", 11, 12}, token{"\u5b59", 12, 13}, token{"\u5065", 13, 14}, token{"\u7684", 14, 15}, token{"\u5de6\u9762", 15, 17}, token{"\u548c", 17, 18}, token{"\u53f3\u9762", 18, 20}, token{",", 20, 21}, token{" ", 21, 22}, token{"\u8303", 22, 23}, token{"\u51ef", 23, 24}, token{"\u5728", 24, 25}, token{"\u6700", 25, 26}, token{"\u53f3\u9762", 26, 28}, token{".", 28, 29}, token{"\u518d", 29, 30}, token{"\u5f80", 30, 31}, token{"\u5de6", 31, 32}, token{"\u662f", 32, 33}, token{"\u674e", 33, 34}, token{"\u677e", 34, 35}, token{"\u6d2a", 35, 36}}, - []token{token{"\u4e00\u6b21\u6027", 0, 3}, token{"\u4ea4", 3, 4}, token{"\u591a\u5c11", 4, 6}, token{"\u94b1", 6, 7}}, - []token{token{"\u4e24\u5757", 0, 2}, token{"\u4e94", 2, 3}, token{"\u4e00\u5957", 3, 5}, token{"\uff0c", 5, 6}, token{"\u4e09\u5757", 6, 8}, token{"\u516b", 8, 9}, token{"\u4e00\u65a4", 9, 11}, token{"\uff0c", 11, 12}, token{"\u56db\u5757", 12, 14}, token{"\u4e03", 14, 15}, token{"\u4e00\u672c", 15, 17}, token{"\uff0c", 17, 18}, token{"\u4e94\u5757", 18, 20}, token{"\u516d", 20, 21}, token{"\u4e00\u6761", 21, 23}}, - []token{token{"\u5c0f", 0, 1}, 
token{"\u548c\u5c1a", 1, 3}, token{"\u7559", 3, 4}, token{"\u4e86", 4, 5}, token{"\u4e00\u4e2a", 5, 7}, token{"\u50cf", 7, 8}, token{"\u5927", 8, 9}, token{"\u548c\u5c1a", 9, 11}, token{"\u4e00\u6837", 11, 13}, token{"\u7684", 13, 14}, token{"\u548c\u5c1a\u5934", 14, 17}}, - []token{token{"\u6211", 0, 1}, token{"\u662f", 1, 2}, token{"\u4e2d\u534e\u4eba\u6c11\u5171\u548c\u56fd", 2, 9}, token{"\u516c\u6c11", 9, 11}, token{";", 11, 12}, token{"\u6211", 12, 13}, token{"\u7238\u7238", 13, 15}, token{"\u662f", 15, 16}, token{"\u5171\u548c\u515a", 16, 19}, token{"\u515a\u5458", 19, 21}, token{";", 21, 22}, token{" ", 22, 23}, token{"\u5730\u94c1", 23, 25}, token{"\u548c\u5e73\u95e8", 25, 28}, token{"\u7ad9", 28, 29}}, - []token{token{"\u5f20\u6653\u6885", 0, 3}, token{"\u53bb", 3, 4}, token{"\u4eba\u6c11", 4, 6}, token{"\u533b\u9662", 6, 8}, token{"\u505a", 8, 9}, token{"\u4e86", 9, 10}, token{"\u4e2a", 10, 11}, token{"B\u8d85", 11, 13}, token{"\u7136\u540e", 13, 15}, token{"\u53bb", 15, 16}, token{"\u4e70", 16, 17}, token{"\u4e86", 17, 18}, token{"\u4ef6", 18, 19}, token{"T\u6064", 19, 21}}, - []token{token{"AT&T", 0, 4}, token{"\u662f", 4, 5}, token{"\u4e00\u4ef6", 5, 7}, token{"\u4e0d\u9519", 7, 9}, token{"\u7684", 9, 10}, token{"\u516c\u53f8", 10, 12}, token{"\uff0c", 12, 13}, token{"\u7ed9", 13, 14}, token{"\u4f60", 14, 15}, token{"\u53d1", 15, 16}, token{"offer", 16, 21}, token{"\u4e86", 21, 22}, token{"\u5417", 22, 23}, token{"\uff1f", 23, 24}}, - []token{token{"C++", 0, 3}, token{"\u548c", 3, 4}, token{"c#", 4, 6}, token{"\u662f", 6, 7}, token{"\u4ec0\u4e48", 7, 9}, token{"\u5173\u7cfb", 9, 11}, token{"\uff1f", 11, 12}, token{"11", 12, 14}, token{"+", 14, 15}, token{"122", 15, 18}, token{"=", 18, 19}, token{"133", 19, 22}, token{"\uff0c", 22, 23}, token{"\u662f", 23, 24}, token{"\u5417", 24, 25}, token{"\uff1f", 25, 26}, token{"PI", 26, 28}, token{"=", 28, 29}, token{"3", 29, 30}, token{".", 30, 31}, token{"14159", 31, 36}}, - []token{token{"\u4f60", 0, 1}, 
token{"\u8ba4\u8bc6", 1, 3}, token{"\u90a3\u4e2a", 3, 5}, token{"\u548c", 5, 6}, token{"\u4e3b\u5e2d", 6, 8}, token{"\u63e1\u624b", 8, 10}, token{"\u7684", 10, 11}, token{"\u7684\u54e5", 11, 13}, token{"\u5417", 13, 14}, token{"\uff1f", 14, 15}, token{"\u4ed6", 15, 16}, token{"\u5f00", 16, 17}, token{"\u4e00\u8f86", 17, 19}, token{"\u9ed1\u8272", 19, 21}, token{"\u7684\u58eb", 21, 23}, token{"\u3002", 23, 24}}, - []token{token{"\u67aa\u6746\u5b50", 0, 3}, token{"\u4e2d", 3, 4}, token{"\u51fa", 4, 5}, token{"\u653f\u6743", 5, 7}}, - []token{token{"\u5f20", 0, 1}, token{"\u4e09", 1, 2}, token{"\u98ce", 2, 3}, token{"\u540c\u5b66", 3, 5}, token{"\u8d70\u4e0a", 5, 7}, token{"\u4e86", 7, 8}, token{"\u4e0d\u5f52\u8def", 8, 11}}, - []token{token{"\u963fQ", 0, 2}, token{"\u8170\u95f4", 2, 4}, token{"\u6302", 4, 5}, token{"\u7740", 5, 6}, token{"BB\u673a", 6, 9}, token{"\u624b\u91cc", 9, 11}, token{"\u62ff", 11, 12}, token{"\u7740", 12, 13}, token{"\u5927\u54e5\u5927", 13, 16}, token{"\uff0c", 16, 17}, token{"\u8bf4", 17, 18}, token{"\uff1a", 18, 19}, token{"\u6211", 19, 20}, token{"\u4e00\u822c", 20, 22}, token{"\u5403\u996d", 22, 24}, token{"\u4e0d", 24, 25}, token{"AA\u5236", 25, 28}, token{"\u7684", 28, 29}, token{"\u3002", 29, 30}}, - []token{token{"\u5728", 0, 1}, token{"1\u53f7\u5e97", 1, 4}, token{"\u80fd", 4, 5}, token{"\u4e70", 5, 6}, token{"\u5230", 6, 7}, token{"\u5c0fS", 7, 9}, token{"\u548c", 9, 10}, token{"\u5927S", 10, 12}, token{"\u516b\u5366", 12, 14}, token{"\u7684", 14, 15}, token{"\u4e66", 15, 16}, token{"\u3002", 16, 17}}, - []token{token{"\u8fd9", 0, 1}, token{"\u662f", 1, 2}, token{"\u4e00\u4e2a", 2, 4}, token{"\u4f38\u624b", 4, 6}, token{"\u4e0d\u89c1", 6, 8}, token{"\u4e94\u6307", 8, 10}, token{"\u4f38\u624b\u4e0d\u89c1\u4e94\u6307", 4, 10}, token{"\u7684", 10, 11}, token{"\u9ed1\u591c", 11, 13}, token{"\u3002", 13, 14}, token{"\u6211", 14, 15}, token{"\u53eb", 15, 16}, token{"\u609f\u7a7a", 17, 19}, token{"\u5b59\u609f\u7a7a", 16, 19}, 
token{"\uff0c", 19, 20}, token{"\u6211", 20, 21}, token{"\u7231", 21, 22}, token{"\u5317\u4eac", 22, 24}, token{"\uff0c", 24, 25}, token{"\u6211", 25, 26}, token{"\u7231", 26, 27}, token{"Python", 27, 33}, token{"\u548c", 33, 34}, token{"C++", 34, 37}, token{"\u3002", 37, 38}}, - []token{token{"\u6211", 0, 1}, token{"\u4e0d", 1, 2}, token{"\u559c\u6b22", 2, 4}, token{"\u65e5\u672c", 4, 6}, token{"\u548c\u670d", 6, 8}, token{"\u3002", 8, 9}}, - []token{token{"\u96f7\u7334", 0, 2}, token{"\u56de\u5f52", 2, 4}, token{"\u4eba\u95f4", 4, 6}, token{"\u3002", 6, 7}}, - []token{token{"\u5de5\u4fe1\u5904", 0, 3}, token{"\u5e72\u4e8b", 4, 6}, token{"\u5973\u5e72\u4e8b", 3, 6}, token{"\u6bcf\u6708", 6, 8}, token{"\u7ecf\u8fc7", 8, 10}, token{"\u4e0b\u5c5e", 10, 12}, token{"\u79d1\u5ba4", 12, 14}, token{"\u90fd", 14, 15}, token{"\u8981", 15, 16}, token{"\u4eb2\u53e3", 16, 18}, token{"\u4ea4\u4ee3", 18, 20}, token{"24", 20, 22}, token{"\u53e3", 22, 23}, token{"\u4ea4\u6362", 23, 25}, token{"\u6362\u673a", 24, 26}, token{"\u4ea4\u6362\u673a", 23, 26}, token{"\u7b49", 26, 27}, token{"\u6280\u672f", 27, 29}, token{"\u6280\u672f\u6027", 27, 30}, token{"\u5668\u4ef6", 30, 32}, token{"\u7684", 32, 33}, token{"\u5b89\u88c5", 33, 35}, token{"\u5de5\u4f5c", 35, 37}}, - []token{token{"\u6211", 0, 1}, token{"\u9700\u8981", 1, 3}, token{"\u5ec9\u79df", 3, 5}, token{"\u79df\u623f", 4, 6}, token{"\u5ec9\u79df\u623f", 3, 6}}, - []token{token{"\u6c38\u548c", 0, 2}, token{"\u670d\u88c5", 2, 4}, token{"\u9970\u54c1", 4, 6}, token{"\u6709\u9650", 6, 8}, token{"\u516c\u53f8", 8, 10}, token{"\u6709\u9650\u516c\u53f8", 6, 10}}, - []token{token{"\u6211", 0, 1}, token{"\u7231", 1, 2}, token{"\u5317\u4eac", 2, 4}, token{"\u5929\u5b89", 4, 6}, token{"\u5929\u5b89\u95e8", 4, 7}}, - []token{token{"abc", 0, 3}}, - []token{token{"\u9690", 0, 1}, token{"\u53ef\u592b", 3, 5}, token{"\u9a6c\u5c14\u53ef", 1, 4}, token{"\u9a6c\u5c14\u53ef\u592b", 1, 5}}, - []token{token{"\u96f7\u7334", 0, 2}, token{"\u662f", 2, 
3}, token{"\u4e2a", 3, 4}, token{"\u597d", 4, 5}, token{"\u7f51\u7ad9", 5, 7}}, - []token{token{"\u201c", 0, 1}, token{"Microsoft", 1, 10}, token{"\u201d", 10, 11}, token{"\u4e00", 11, 12}, token{"\u8bcd", 12, 13}, token{"\u7531", 13, 14}, token{"\u201c", 14, 15}, token{"MICROcomputer", 15, 28}, token{"\uff08", 28, 29}, token{"\u5fae\u578b", 29, 31}, token{"\u8ba1\u7b97", 31, 33}, token{"\u7b97\u673a", 32, 34}, token{"\u8ba1\u7b97\u673a", 31, 34}, token{"\uff09", 34, 35}, token{"\u201d", 35, 36}, token{"\u548c", 36, 37}, token{"\u201c", 37, 38}, token{"SOFTware", 38, 46}, token{"\uff08", 46, 47}, token{"\u8f6f\u4ef6", 47, 49}, token{"\uff09", 49, 50}, token{"\u201d", 50, 51}, token{"\u4e24", 51, 52}, token{"\u90e8\u5206", 52, 54}, token{"\u7ec4\u6210", 54, 56}}, - []token{token{"\u8349\u6ce5\u9a6c", 0, 3}, token{"\u548c", 3, 4}, token{"\u6b3a", 4, 5}, token{"\u5b9e", 5, 6}, token{"\u9a6c", 6, 7}, token{"\u662f", 7, 8}, token{"\u4eca\u5e74", 8, 10}, token{"\u7684", 10, 11}, token{"\u6d41\u884c", 11, 13}, token{"\u8bcd\u6c47", 13, 15}}, - []token{token{"\u4f0a", 0, 1}, token{"\u85e4", 1, 2}, token{"\u6d0b\u534e\u5802", 2, 5}, token{"\u603b\u5e9c", 5, 7}, token{"\u5e97", 7, 8}}, - []token{token{"\u4e2d\u56fd", 0, 2}, token{"\u79d1\u5b66", 2, 4}, token{"\u5b66\u9662", 3, 5}, token{"\u8ba1\u7b97", 5, 7}, token{"\u6280\u672f", 7, 9}, token{"\u7814\u7a76", 9, 11}, token{"\u79d1\u5b66\u9662", 2, 5}, token{"\u7814\u7a76\u6240", 9, 12}, token{"\u4e2d\u56fd\u79d1\u5b66\u9662\u8ba1\u7b97\u6280\u672f\u7814\u7a76\u6240", 0, 12}}, - []token{token{"\u7f57\u5bc6\u6b27", 0, 3}, token{"\u4e0e", 3, 4}, token{"\u6731\u4e3d\u53f6", 4, 7}}, - []token{token{"\u6211", 0, 1}, token{"\u8d2d\u4e70", 1, 3}, token{"\u4e86", 3, 4}, token{"\u9053\u5177", 4, 6}, token{"\u548c", 6, 7}, token{"\u670d\u88c5", 7, 9}}, - []token{token{"PS", 0, 2}, token{":", 2, 3}, token{" ", 3, 4}, token{"\u6211", 4, 5}, token{"\u89c9\u5f97", 5, 7}, token{"\u5f00\u6e90", 7, 9}, token{"\u6709", 9, 10}, 
token{"\u4e00\u4e2a", 10, 12}, token{"\u597d\u5904", 12, 14}, token{"\uff0c", 14, 15}, token{"\u5c31\u662f", 15, 17}, token{"\u80fd\u591f", 17, 19}, token{"\u6566\u4fc3", 19, 21}, token{"\u81ea\u5df1", 21, 23}, token{"\u4e0d\u65ad", 23, 25}, token{"\u6539\u8fdb", 25, 27}, token{"\u4e0d\u65ad\u6539\u8fdb", 23, 27}, token{"\uff0c", 27, 28}, token{"\u907f\u514d", 28, 30}, token{"\u655e", 30, 31}, token{"\u5e1a", 31, 32}, token{"\u81ea\u73cd", 32, 34}}, - []token{token{"\u6e56\u5317", 0, 2}, token{"\u6e56\u5317\u7701", 0, 3}, token{"\u77f3\u9996", 3, 5}, token{"\u77f3\u9996\u5e02", 3, 6}}, - []token{token{"\u6e56\u5317", 0, 2}, token{"\u6e56\u5317\u7701", 0, 3}, token{"\u5341\u5830", 3, 5}, token{"\u5341\u5830\u5e02", 3, 6}}, - []token{token{"\u7ecf\u7406", 1, 3}, token{"\u603b\u7ecf\u7406", 0, 3}, token{"\u5b8c\u6210", 3, 5}, token{"\u4e86", 5, 6}, token{"\u8fd9\u4ef6", 6, 8}, token{"\u4e8b\u60c5", 8, 10}}, - []token{token{"\u7535\u8111", 0, 2}, token{"\u4fee\u597d", 2, 4}, token{"\u4e86", 4, 5}}, - []token{token{"\u505a\u597d", 0, 2}, token{"\u4e86", 2, 3}, token{"\u8fd9\u4ef6", 3, 5}, token{"\u4e8b\u60c5", 5, 7}, token{"\u5c31", 7, 8}, token{"\u4e00\u4e86\u767e\u4e86", 8, 12}, token{"\u4e86", 12, 13}}, - []token{token{"\u4eba\u4eec", 0, 2}, token{"\u5ba1\u7f8e", 2, 4}, token{"\u7684", 4, 5}, token{"\u89c2\u70b9", 5, 7}, token{"\u662f", 7, 8}, token{"\u4e0d\u540c", 8, 10}, token{"\u7684", 10, 11}}, - []token{token{"\u6211\u4eec", 0, 2}, token{"\u4e70", 2, 3}, token{"\u4e86", 3, 4}, token{"\u4e00\u4e2a", 4, 6}, token{"\u7f8e\u7684", 6, 8}, token{"\u7a7a\u8c03", 8, 10}}, - []token{token{"\u7ebf\u7a0b", 0, 2}, token{"\u521d\u59cb", 2, 4}, token{"\u521d\u59cb\u5316", 2, 5}, token{"\u65f6", 5, 6}, token{"\u6211\u4eec", 6, 8}, token{"\u8981", 8, 9}, token{"\u6ce8\u610f", 9, 11}}, - []token{token{"\u4e00\u4e2a", 0, 2}, token{"\u5206\u5b50", 2, 4}, token{"\u662f", 4, 5}, token{"\u7531", 5, 6}, token{"\u597d\u591a", 6, 8}, token{"\u539f\u5b50", 8, 10}, token{"\u7ec4\u7ec7", 
10, 12}, token{"\u6210", 12, 13}, token{"\u7684", 13, 14}}, - []token{token{"\u795d", 0, 1}, token{"\u4f60", 1, 2}, token{"\u9a6c\u5230\u529f\u6210", 2, 6}}, - []token{token{"\u4ed6", 0, 1}, token{"\u6389", 1, 2}, token{"\u8fdb", 2, 3}, token{"\u4e86", 3, 4}, token{"\u65e0\u5e95", 4, 6}, token{"\u65e0\u5e95\u6d1e", 4, 7}, token{"\u91cc", 7, 8}}, - []token{token{"\u4e2d\u56fd", 0, 2}, token{"\u7684", 2, 3}, token{"\u9996\u90fd", 3, 5}, token{"\u662f", 5, 6}, token{"\u5317\u4eac", 6, 8}}, - []token{token{"\u5b59", 0, 1}, token{"\u541b", 1, 2}, token{"\u610f", 2, 3}}, - []token{token{"\u5916\u4ea4", 0, 2}, token{"\u5916\u4ea4\u90e8", 0, 3}, token{"\u53d1\u8a00", 3, 5}, token{"\u53d1\u8a00\u4eba", 3, 6}, token{"\u9a6c\u671d\u65ed", 6, 9}}, - []token{token{"\u9886\u5bfc", 0, 2}, token{"\u9886\u5bfc\u4eba", 0, 3}, token{"\u4f1a\u8bae", 3, 5}, token{"\u548c", 5, 6}, token{"\u7b2c\u56db", 6, 8}, token{"\u56db\u5c4a", 7, 9}, token{"\u7b2c\u56db\u5c4a", 6, 9}, token{"\u4e1c\u4e9a", 9, 11}, token{"\u5cf0\u4f1a", 11, 13}}, - []token{token{"\u5728", 0, 1}, token{"\u8fc7\u53bb", 1, 3}, token{"\u7684", 3, 4}, token{"\u8fd9", 4, 5}, token{"\u4e94\u5e74", 5, 7}}, - []token{token{"\u8fd8", 0, 1}, token{"\u9700\u8981", 1, 3}, token{"\u5f88", 3, 4}, token{"\u957f", 4, 5}, token{"\u7684", 5, 6}, token{"\u8def", 6, 7}, token{"\u8981", 7, 8}, token{"\u8d70", 8, 9}}, - []token{token{"60", 0, 2}, token{"\u5468\u5e74", 2, 4}, token{"\u9996\u90fd", 4, 6}, token{"\u9605\u5175", 6, 8}}, - []token{token{"\u4f60\u597d", 0, 2}, token{"\u4eba\u4eec", 2, 4}, token{"\u5ba1\u7f8e", 4, 6}, token{"\u7684", 6, 7}, token{"\u89c2\u70b9", 7, 9}, token{"\u662f", 9, 10}, token{"\u4e0d\u540c", 10, 12}, token{"\u7684", 12, 13}}, - []token{token{"\u4e70", 0, 1}, token{"\u6c34\u679c", 1, 3}, token{"\u7136\u540e", 3, 5}, token{"\u6765", 5, 6}, token{"\u4e16\u535a", 6, 8}, token{"\u535a\u56ed", 7, 9}, token{"\u4e16\u535a\u56ed", 6, 9}}, - []token{token{"\u4e70", 0, 1}, token{"\u6c34\u679c", 1, 3}, 
token{"\u7136\u540e", 3, 5}, token{"\u53bb", 5, 6}, token{"\u4e16\u535a", 6, 8}, token{"\u535a\u56ed", 7, 9}, token{"\u4e16\u535a\u56ed", 6, 9}}, - []token{token{"\u4f46\u662f", 0, 2}, token{"\u540e\u6765", 2, 4}, token{"\u6211", 4, 5}, token{"\u624d", 5, 6}, token{"\u77e5\u9053", 6, 8}, token{"\u4f60", 8, 9}, token{"\u662f", 9, 10}, token{"\u5bf9", 10, 11}, token{"\u7684", 11, 12}}, - []token{token{"\u5b58\u5728", 0, 2}, token{"\u5373", 2, 3}, token{"\u5408\u7406", 3, 5}}, - []token{token{"\u7684", 0, 1}, token{"\u7684", 1, 2}, token{"\u7684", 2, 3}, token{"\u7684", 3, 4}, token{"\u7684", 4, 5}, token{"\u5728", 5, 6}, token{"\u7684", 6, 7}, token{"\u7684", 7, 8}, token{"\u7684", 8, 9}, token{"\u7684", 9, 10}, token{"\u5c31", 10, 11}, token{"\u4ee5", 11, 12}, token{"\u548c", 12, 13}, token{"\u548c", 13, 14}, token{"\u548c", 14, 15}}, - []token{token{"I", 0, 1}, token{" ", 1, 2}, token{"love", 2, 6}, token{"\u4f60", 6, 7}, token{"\uff0c", 7, 8}, token{"\u4e0d\u4ee5", 8, 10}, token{"\u4ee5\u4e3a", 9, 11}, token{"\u4e0d\u4ee5\u4e3a\u803b", 8, 12}, token{"\uff0c", 12, 13}, token{"\u53cd", 13, 14}, token{"\u4ee5\u4e3a", 14, 16}, token{"rong", 16, 20}}, - []token{token{"\u56e0", 0, 1}}, - []token{}, - []token{token{"hello", 0, 5}, token{"\u4f60\u597d", 5, 7}, token{"\u4eba\u4eec", 7, 9}, token{"\u5ba1\u7f8e", 9, 11}, token{"\u7684", 11, 12}, token{"\u89c2\u70b9", 12, 14}, token{"\u662f", 14, 15}, token{"\u4e0d\u540c", 15, 17}, token{"\u7684", 17, 18}}, - []token{token{"\u5f88", 0, 1}, token{"\u597d", 1, 2}, token{"\u4f46", 2, 3}, token{"\u4e3b\u8981", 3, 5}, token{"\u662f", 5, 6}, token{"\u57fa\u4e8e", 6, 8}, token{"\u7f51\u9875", 8, 10}, token{"\u5f62\u5f0f", 10, 12}}, - []token{token{"hello", 0, 5}, token{"\u4f60\u597d", 5, 7}, token{"\u4eba\u4eec", 7, 9}, token{"\u5ba1\u7f8e", 9, 11}, token{"\u7684", 11, 12}, token{"\u89c2\u70b9", 12, 14}, token{"\u662f", 14, 15}, token{"\u4e0d\u540c", 15, 17}, token{"\u7684", 17, 18}}, - []token{token{"\u4ec0\u4e48", 1, 3}, 
token{"\u4e3a\u4ec0\u4e48", 0, 3}, token{"\u6211", 3, 4}, token{"\u4e0d\u80fd", 4, 6}, token{"\u62e5\u6709", 6, 8}, token{"\u60f3\u8981", 8, 10}, token{"\u7684", 10, 11}, token{"\u751f\u6d3b", 11, 13}}, - []token{token{"\u540e\u6765", 0, 2}, token{"\u6211", 2, 3}, token{"\u624d", 3, 4}}, - []token{token{"\u6b64\u6b21", 0, 2}, token{"\u6765", 2, 3}, token{"\u4e2d\u56fd", 3, 5}, token{"\u662f", 5, 6}, token{"\u4e3a\u4e86", 6, 8}}, - []token{token{"\u4f7f\u7528", 0, 2}, token{"\u4e86", 2, 3}, token{"\u5b83", 3, 4}, token{"\u5c31", 4, 5}, token{"\u53ef\u4ee5", 5, 7}, token{"\u89e3\u51b3", 7, 9}, token{"\u4e00\u4e9b", 9, 11}, token{"\u95ee\u9898", 11, 13}}, - []token{token{",", 0, 1}, token{"\u4f7f\u7528", 1, 3}, token{"\u4e86", 3, 4}, token{"\u5b83", 4, 5}, token{"\u5c31", 5, 6}, token{"\u53ef\u4ee5", 6, 8}, token{"\u89e3\u51b3", 8, 10}, token{"\u4e00\u4e9b", 10, 12}, token{"\u95ee\u9898", 12, 14}}, - []token{token{"\u5176\u5b9e", 0, 2}, token{"\u4f7f\u7528", 2, 4}, token{"\u4e86", 4, 5}, token{"\u5b83", 5, 6}, token{"\u5c31", 6, 7}, token{"\u53ef\u4ee5", 7, 9}, token{"\u89e3\u51b3", 9, 11}, token{"\u4e00\u4e9b", 11, 13}, token{"\u95ee\u9898", 13, 15}}, - []token{token{"\u597d\u4eba", 0, 2}, token{"\u4f7f\u7528", 2, 4}, token{"\u4e86", 4, 5}, token{"\u5b83", 5, 6}, token{"\u5c31", 6, 7}, token{"\u53ef\u4ee5", 7, 9}, token{"\u89e3\u51b3", 9, 11}, token{"\u4e00\u4e9b", 11, 13}, token{"\u95ee\u9898", 13, 15}}, - []token{token{"\u56e0\u4e3a", 1, 3}, token{"\u662f\u56e0\u4e3a", 0, 3}, token{"\u548c", 3, 4}, token{"\u56fd\u5bb6", 4, 6}}, - []token{token{"\u8001\u5e74", 0, 2}, token{"\u641c\u7d22", 2, 4}, token{"\u8fd8", 4, 5}, token{"\u652f\u6301", 5, 7}}, - []token{token{"\u5e72\u8106", 0, 2}, token{"\u5c31", 2, 3}, token{"\u628a", 3, 4}, token{"\u90a3", 4, 5}, token{"\u90e8", 5, 6}, token{"\u8499", 6, 7}, token{"\u4eba", 7, 8}, token{"\u7684", 8, 9}, token{"\u95f2", 9, 10}, token{"\u6cd5", 10, 11}, token{"\u7ed9", 11, 12}, token{"\u5e9f", 12, 13}, token{"\u4e86", 13, 14}, 
token{"\u62c9\u5012", 14, 16}, token{"\uff01", 16, 17}, token{"RT", 17, 19}, token{" ", 19, 20}, token{"@", 20, 21}, token{"laoshipukong", 21, 33}, token{" ", 33, 34}, token{":", 34, 35}, token{" ", 35, 36}, token{"27", 36, 38}, token{"\u65e5", 38, 39}, token{"\uff0c", 39, 40}, token{"\u5168\u56fd", 40, 42}, token{"\u56fd\u4eba", 41, 43}, token{"\u4eba\u5927", 42, 44}, token{"\u5e38\u59d4", 44, 46}, token{"\u59d4\u4f1a", 45, 47}, token{"\u5e38\u59d4\u4f1a", 44, 47}, token{"\u5168\u56fd\u4eba\u5927\u5e38\u59d4\u4f1a", 40, 47}, token{"\u7b2c\u4e09", 47, 49}, token{"\u4e09\u6b21", 48, 50}, token{"\u7b2c\u4e09\u6b21", 47, 50}, token{"\u5ba1\u8bae", 50, 52}, token{"\u4fb5\u6743", 52, 54}, token{"\u8d23\u4efb", 54, 56}, token{"\u8d23\u4efb\u6cd5", 54, 57}, token{"\u8349\u6848", 57, 59}, token{"\uff0c", 59, 60}, token{"\u5220\u9664", 60, 62}, token{"\u4e86", 62, 63}, token{"\u6709\u5173", 63, 65}, token{"\u533b\u7597", 65, 67}, token{"\u635f\u5bb3", 67, 69}, token{"\u8d23\u4efb", 69, 71}, token{"\u201c", 71, 72}, token{"\u4e3e\u8bc1", 72, 74}, token{"\u5012\u7f6e", 74, 76}, token{"\u201d", 76, 77}, token{"\u7684", 77, 78}, token{"\u89c4\u5b9a", 78, 80}, token{"\u3002", 80, 81}, token{"\u5728", 81, 82}, token{"\u533b\u60a3", 82, 84}, token{"\u7ea0\u7eb7", 84, 86}, token{"\u4e2d", 86, 87}, token{"\u672c", 87, 88}, token{"\u5df2", 88, 89}, token{"\u5904\u4e8e", 89, 91}, token{"\u5f31\u52bf", 91, 93}, token{"\u5730\u4f4d", 93, 95}, token{"\u7684", 95, 96}, token{"\u6d88\u8d39", 96, 98}, token{"\u6d88\u8d39\u8005", 96, 99}, token{"\u7531\u6b64", 99, 101}, token{"\u5c06", 101, 102}, token{"\u9677\u5165", 102, 104}, token{"\u4e0d\u590d", 106, 108}, token{"\u4e07\u52ab\u4e0d\u590d", 104, 108}, token{"\u7684", 108, 109}, token{"\u5883\u5730", 109, 111}, token{"\u3002", 111, 112}, token{" ", 112, 113}}, - []token{token{"\u5927", 0, 1}}, - []token{}, - []token{token{"\u4ed6", 0, 1}, token{"\u8bf4", 1, 2}, token{"\u7684", 2, 3}, token{"\u786e\u5b9e", 3, 5}, token{"\u5728", 5, 6}, 
token{"\u7406", 6, 7}}, - []token{token{"\u957f\u6625", 0, 2}, token{"\u5e02\u957f", 2, 4}, token{"\u6625\u8282", 4, 6}, token{"\u8bb2\u8bdd", 6, 8}}, - []token{token{"\u7ed3\u5a5a", 0, 2}, token{"\u7684", 2, 3}, token{"\u548c", 3, 4}, token{"\u5c1a\u672a", 4, 6}, token{"\u7ed3\u5a5a", 6, 8}, token{"\u7684", 8, 9}}, - []token{token{"\u7ed3\u5408", 0, 2}, token{"\u6210", 2, 3}, token{"\u5206\u5b50", 3, 5}, token{"\u65f6", 5, 6}}, - []token{token{"\u65c5\u6e38", 0, 2}, token{"\u548c", 2, 3}, token{"\u670d\u52a1", 3, 5}, token{"\u662f", 5, 6}, token{"\u6700\u597d", 6, 8}, token{"\u7684", 8, 9}}, - []token{token{"\u8fd9\u4ef6", 0, 2}, token{"\u4e8b\u60c5", 2, 4}, token{"\u7684\u786e", 4, 6}, token{"\u662f", 6, 7}, token{"\u6211", 7, 8}, token{"\u7684", 8, 9}, token{"\u9519", 9, 10}}, - []token{token{"\u4f9b", 0, 1}, token{"\u5927\u5bb6", 1, 3}, token{"\u53c2\u8003", 3, 5}, token{"\u6307\u6b63", 5, 7}}, - []token{token{"\u54c8\u5c14", 0, 2}, token{"\u54c8\u5c14\u6ee8", 0, 3}, token{"\u653f\u5e9c", 3, 5}, token{"\u516c\u5e03", 5, 7}, token{"\u584c", 7, 8}, token{"\u6865", 8, 9}, token{"\u539f\u56e0", 9, 11}}, - []token{token{"\u6211", 0, 1}, token{"\u5728", 1, 2}, token{"\u673a\u573a", 2, 4}, token{"\u5165\u53e3", 4, 6}, token{"\u5165\u53e3\u5904", 4, 7}}, - []token{token{"\u90a2", 0, 1}, token{"\u6c38", 1, 2}, token{"\u81e3", 2, 3}, token{"\u6444\u5f71", 3, 5}, token{"\u62a5\u9053", 5, 7}}, - []token{token{"BP", 0, 2}, token{"\u795e\u7ecf", 2, 4}, token{"\u7f51\u7edc", 4, 6}, token{"\u795e\u7ecf\u7f51", 2, 5}, token{"\u795e\u7ecf\u7f51\u7edc", 2, 6}, token{"\u5982\u4f55", 6, 8}, token{"\u8bad\u7ec3", 8, 10}, token{"\u624d\u80fd", 10, 12}, token{"\u5728", 12, 13}, token{"\u5206\u7c7b", 13, 15}, token{"\u65f6", 15, 16}, token{"\u589e\u52a0", 16, 18}, token{"\u533a\u5206", 18, 20}, token{"\u5206\u5ea6", 19, 21}, token{"\u533a\u5206\u5ea6", 18, 21}, token{"\uff1f", 21, 22}}, - []token{token{"\u5357\u4eac", 0, 2}, token{"\u4eac\u5e02", 1, 3}, token{"\u5357\u4eac\u5e02", 0, 
3}, token{"\u957f\u6c5f", 3, 5}, token{"\u5927\u6865", 5, 7}, token{"\u957f\u6c5f\u5927\u6865", 3, 7}}, - []token{token{"\u5e94", 0, 1}, token{"\u4e00\u4e9b", 1, 3}, token{"\u4f7f\u7528", 3, 5}, token{"\u7528\u8005", 4, 6}, token{"\u4f7f\u7528\u8005", 3, 6}, token{"\u7684", 6, 7}, token{"\u5efa\u8bae", 7, 9}, token{"\uff0c", 9, 10}, token{"\u4e5f", 10, 11}, token{"\u4e3a\u4e86", 11, 13}, token{"\u4fbf\u4e8e", 13, 15}, token{"\u5229\u7528", 15, 17}, token{"NiuTrans", 17, 25}, token{"\u7528\u4e8e", 25, 27}, token{"SMT", 27, 30}, token{"\u7814\u7a76", 30, 32}}, - []token{token{"\u957f\u6625", 0, 2}, token{"\u957f\u6625\u5e02", 0, 3}, token{"\u957f\u6625", 3, 5}, token{"\u836f\u5e97", 5, 7}}, - []token{token{"\u9093\u9896\u8d85", 0, 3}, token{"\u751f\u524d", 3, 5}, token{"\u6700", 5, 6}, token{"\u559c\u6b22", 6, 8}, token{"\u7684", 8, 9}, token{"\u8863\u670d", 9, 11}}, - []token{token{"\u9526\u6d9b", 1, 3}, token{"\u80e1\u9526\u6d9b", 0, 3}, token{"\u662f", 3, 4}, token{"\u70ed\u7231", 4, 6}, token{"\u4e16\u754c", 6, 8}, token{"\u548c\u5e73", 8, 10}, token{"\u7684", 10, 11}, token{"\u653f\u6cbb", 11, 13}, token{"\u653f\u6cbb\u5c40", 11, 14}, token{"\u5e38\u59d4", 14, 16}}, - []token{token{"\u7a0b\u5e8f", 0, 2}, token{"\u7a0b\u5e8f\u5458", 0, 3}, token{"\u795d", 3, 4}, token{"\u6d77\u6797", 4, 6}, token{"\u548c", 6, 7}, token{"\u6731", 7, 8}, token{"\u4f1a", 8, 9}, token{"\u9707", 9, 10}, token{"\u662f", 10, 11}, token{"\u5728", 11, 12}, token{"\u5b59", 12, 13}, token{"\u5065", 13, 14}, token{"\u7684", 14, 15}, token{"\u5de6\u9762", 15, 17}, token{"\u548c", 17, 18}, token{"\u53f3\u9762", 18, 20}, token{",", 20, 21}, token{" ", 21, 22}, token{"\u8303", 22, 23}, token{"\u51ef", 23, 24}, token{"\u5728", 24, 25}, token{"\u6700", 25, 26}, token{"\u53f3\u9762", 26, 28}, token{".", 28, 29}, token{"\u518d", 29, 30}, token{"\u5f80", 30, 31}, token{"\u5de6", 31, 32}, token{"\u662f", 32, 33}, token{"\u674e", 33, 34}, token{"\u677e", 34, 35}, token{"\u6d2a", 35, 36}}, - 
[]token{token{"\u4e00\u6b21", 0, 2}, token{"\u4e00\u6b21\u6027", 0, 3}, token{"\u4ea4", 3, 4}, token{"\u591a\u5c11", 4, 6}, token{"\u94b1", 6, 7}}, - []token{token{"\u4e24\u5757", 0, 2}, token{"\u4e94", 2, 3}, token{"\u4e00\u5957", 3, 5}, token{"\uff0c", 5, 6}, token{"\u4e09\u5757", 6, 8}, token{"\u516b", 8, 9}, token{"\u4e00\u65a4", 9, 11}, token{"\uff0c", 11, 12}, token{"\u56db\u5757", 12, 14}, token{"\u4e03", 14, 15}, token{"\u4e00\u672c", 15, 17}, token{"\uff0c", 17, 18}, token{"\u4e94\u5757", 18, 20}, token{"\u516d", 20, 21}, token{"\u4e00\u6761", 21, 23}}, - []token{token{"\u5c0f", 0, 1}, token{"\u548c\u5c1a", 1, 3}, token{"\u7559", 3, 4}, token{"\u4e86", 4, 5}, token{"\u4e00\u4e2a", 5, 7}, token{"\u50cf", 7, 8}, token{"\u5927", 8, 9}, token{"\u548c\u5c1a", 9, 11}, token{"\u4e00\u6837", 11, 13}, token{"\u7684", 13, 14}, token{"\u548c\u5c1a", 14, 16}, token{"\u548c\u5c1a\u5934", 14, 17}}, - []token{token{"\u6211", 0, 1}, token{"\u662f", 1, 2}, token{"\u4e2d\u534e", 2, 4}, token{"\u534e\u4eba", 3, 5}, token{"\u4eba\u6c11", 4, 6}, token{"\u5171\u548c", 6, 8}, token{"\u5171\u548c\u56fd", 6, 9}, token{"\u4e2d\u534e\u4eba\u6c11\u5171\u548c\u56fd", 2, 9}, token{"\u516c\u6c11", 9, 11}, token{";", 11, 12}, token{"\u6211", 12, 13}, token{"\u7238\u7238", 13, 15}, token{"\u662f", 15, 16}, token{"\u5171\u548c", 16, 18}, token{"\u5171\u548c\u515a", 16, 19}, token{"\u515a\u5458", 19, 21}, token{";", 21, 22}, token{" ", 22, 23}, token{"\u5730\u94c1", 23, 25}, token{"\u548c\u5e73", 25, 27}, token{"\u548c\u5e73\u95e8", 25, 28}, token{"\u7ad9", 28, 29}}, - []token{token{"\u5f20\u6653\u6885", 0, 3}, token{"\u53bb", 3, 4}, token{"\u4eba\u6c11", 4, 6}, token{"\u533b\u9662", 6, 8}, token{"\u505a", 8, 9}, token{"\u4e86", 9, 10}, token{"\u4e2a", 10, 11}, token{"B\u8d85", 11, 13}, token{"\u7136\u540e", 13, 15}, token{"\u53bb", 15, 16}, token{"\u4e70", 16, 17}, token{"\u4e86", 17, 18}, token{"\u4ef6", 18, 19}, token{"T\u6064", 19, 21}}, - []token{token{"AT&T", 0, 4}, token{"\u662f", 4, 
5}, token{"\u4e00\u4ef6", 5, 7}, token{"\u4e0d\u9519", 7, 9}, token{"\u7684", 9, 10}, token{"\u516c\u53f8", 10, 12}, token{"\uff0c", 12, 13}, token{"\u7ed9", 13, 14}, token{"\u4f60", 14, 15}, token{"\u53d1", 15, 16}, token{"offer", 16, 21}, token{"\u4e86", 21, 22}, token{"\u5417", 22, 23}, token{"\uff1f", 23, 24}}, - []token{token{"C++", 0, 3}, token{"\u548c", 3, 4}, token{"c#", 4, 6}, token{"\u662f", 6, 7}, token{"\u4ec0\u4e48", 7, 9}, token{"\u5173\u7cfb", 9, 11}, token{"\uff1f", 11, 12}, token{"11", 12, 14}, token{"+", 14, 15}, token{"122", 15, 18}, token{"=", 18, 19}, token{"133", 19, 22}, token{"\uff0c", 22, 23}, token{"\u662f", 23, 24}, token{"\u5417", 24, 25}, token{"\uff1f", 25, 26}, token{"PI", 26, 28}, token{"=", 28, 29}, token{"3", 29, 30}, token{".", 30, 31}, token{"14159", 31, 36}}, - []token{token{"\u4f60", 0, 1}, token{"\u8ba4\u8bc6", 1, 3}, token{"\u90a3\u4e2a", 3, 5}, token{"\u548c", 5, 6}, token{"\u4e3b\u5e2d", 6, 8}, token{"\u63e1\u624b", 8, 10}, token{"\u7684", 10, 11}, token{"\u7684\u54e5", 11, 13}, token{"\u5417", 13, 14}, token{"\uff1f", 14, 15}, token{"\u4ed6", 15, 16}, token{"\u5f00", 16, 17}, token{"\u4e00\u8f86", 17, 19}, token{"\u9ed1\u8272", 19, 21}, token{"\u7684\u58eb", 21, 23}, token{"\u3002", 23, 24}}, - []token{token{"\u67aa\u6746", 0, 2}, token{"\u6746\u5b50", 1, 3}, token{"\u67aa\u6746\u5b50", 0, 3}, token{"\u4e2d", 3, 4}, token{"\u51fa", 4, 5}, token{"\u653f\u6743", 5, 7}}, - []token{token{"\u5f20", 0, 1}, token{"\u4e09", 1, 2}, token{"\u98ce", 2, 3}, token{"\u540c\u5b66", 3, 5}, token{"\u8d70\u4e0a", 5, 7}, token{"\u4e86", 7, 8}, token{"\u5f52\u8def", 9, 11}, token{"\u4e0d\u5f52\u8def", 8, 11}}, - []token{token{"\u963fQ", 0, 2}, token{"\u8170\u95f4", 2, 4}, token{"\u6302", 4, 5}, token{"\u7740", 5, 6}, token{"BB\u673a", 6, 9}, token{"\u624b\u91cc", 9, 11}, token{"\u62ff", 11, 12}, token{"\u7740", 12, 13}, token{"\u5927\u54e5", 13, 15}, token{"\u5927\u54e5\u5927", 13, 16}, token{"\uff0c", 16, 17}, token{"\u8bf4", 17, 18}, 
token{"\uff1a", 18, 19}, token{"\u6211", 19, 20}, token{"\u4e00\u822c", 20, 22}, token{"\u5403\u996d", 22, 24}, token{"\u4e0d", 24, 25}, token{"AA\u5236", 25, 28}, token{"\u7684", 28, 29}, token{"\u3002", 29, 30}}, - []token{token{"\u5728", 0, 1}, token{"1\u53f7\u5e97", 1, 4}, token{"\u80fd", 4, 5}, token{"\u4e70", 5, 6}, token{"\u5230", 6, 7}, token{"\u5c0fS", 7, 9}, token{"\u548c", 9, 10}, token{"\u5927S", 10, 12}, token{"\u516b\u5366", 12, 14}, token{"\u7684", 14, 15}, token{"\u4e66", 15, 16}, token{"\u3002", 16, 17}}, - } -) - -func TesttokenizeDefaultMode(t *testing.T) { - for index, sentence := range test_contents { - tokens := Tokenize(sentence, "default", true) - if len(tokens) != len(result[index]) { - t.Error(len(tokens)) - } - for i, token := range tokens { - if token != result[index][i] { - t.Error(token) - } - } - } -} - -func TesttokenizeNoHMM(t *testing.T) { - for index, sentence := range test_contents { - tokens := Tokenize(sentence, "default", false) - if len(tokens) != len(noHmmResult[index]) { - t.Error(len(tokens)) - } - for i, token := range tokens { - if token != noHmmResult[index][i] { - t.Error(token) - } - } - } -} diff --git a/analyse/tokenizers/jieba.go b/tokenizers/jieba.go similarity index 100% rename from analyse/tokenizers/jieba.go rename to tokenizers/jieba.go diff --git a/analyse/tokenizers/jieba_test.go b/tokenizers/jieba_test.go similarity index 99% rename from analyse/tokenizers/jieba_test.go rename to tokenizers/jieba_test.go index ee9949e..2f55775 100644 --- a/analyse/tokenizers/jieba_test.go +++ b/tokenizers/jieba_test.go @@ -5218,7 +5218,7 @@ func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) { }, } - tokenizer, _ := NewJiebaTokenizer("../../dict.txt", true, false) + tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, false) for _, test := range tests { actual := tokenizer.Tokenize(test.input) if !reflect.DeepEqual(actual, test.output) { @@ -11056,7 +11056,7 @@ func TestJiebaTokenizerSearchModeWithHMM(t *testing.T) { 
}, } - tokenizer, _ := NewJiebaTokenizer("../../dict.txt", true, true) + tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, true) for _, test := range tests { actual := tokenizer.Tokenize(test.input) if !reflect.DeepEqual(actual, test.output) { @@ -16473,7 +16473,7 @@ func TestJiebaTokenizerDefaultModeWithoutHMM(t *testing.T) { }, } - tokenizer, _ := NewJiebaTokenizer("../../dict.txt", false, false) + tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, false) for _, test := range tests { actual := tokenizer.Tokenize(test.input) if !reflect.DeepEqual(actual, test.output) { @@ -22505,7 +22505,7 @@ func TestJiebaTokenizerSearchModeWithoutHMM(t *testing.T) { }, } - tokenizer, _ := NewJiebaTokenizer("../../dict.txt", false, true) + tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, true) for _, test := range tests { actual := tokenizer.Tokenize(test.input) if !reflect.DeepEqual(actual, test.output) {