From edef39719dfae0dd9ced9a627b08d2ed7d12f03c Mon Sep 17 00:00:00 2001 From: Wang Bin Date: Thu, 30 Apr 2015 17:01:02 +0800 Subject: [PATCH] move jieba to a seperate module, tweak posseg module --- dictionary/dictionary.go | 20 +- jieba/jieba.go | 290 ++++++++++++++ jieba/jieba_test.go | 824 +++++++++++++++++++++++++++++++++++++++ posseg/posseg.go | 282 ++++++++------ posseg/posseg_test.go | 500 ++++++++++++------------ 5 files changed, 1530 insertions(+), 386 deletions(-) create mode 100644 jieba/jieba.go create mode 100644 jieba/jieba_test.go diff --git a/dictionary/dictionary.go b/dictionary/dictionary.go index 083d072..f1f14ec 100644 --- a/dictionary/dictionary.go +++ b/dictionary/dictionary.go @@ -13,17 +13,23 @@ import ( type Dictionary struct { total, logTotal float64 freqMap map[string]float64 + posMap map[string]string sync.RWMutex } func New() *Dictionary { - return &Dictionary{freqMap: make(map[string]float64)} + return &Dictionary{ + freqMap: make(map[string]float64), + posMap: make(map[string]string)} } func (d *Dictionary) addToken(token Token) { d.freqMap[token.text] = token.frequency d.total += token.frequency runes := []rune(token.text) + if len(token.pos) > 0 { + d.posMap[token.text] = token.pos + } n := len(runes) for i := 0; i < n; i++ { frag := string(runes[:i+1]) @@ -59,6 +65,13 @@ func (d Dictionary) Frequency(key string) (float64, bool) { return freq, ok } +func (d Dictionary) Pos(key string) (string, bool) { + d.RLock() + pos, ok := d.posMap[key] + d.RUnlock() + return pos, ok +} + func (d *Dictionary) LoadDictionary(fileName string) error { return d.loadDictionary(fileName, false) } @@ -88,20 +101,21 @@ func (d *Dictionary) loadDictionary(fileName string, isUserDictionary bool) erro if !isUserDictionary && len(d.freqMap) > 0 { d.freqMap = make(map[string]float64) + d.posMap = make(map[string]string) d.total = 0.0 d.logTotal = 0.0 } for scanner.Scan() { line = scanner.Text() fields = strings.Split(line, " ") - token.text = 
strings.Replace(fields[0], "\ufeff", "", 1) + token.text = strings.TrimSpace(strings.Replace(fields[0], "\ufeff", "", 1)) if length := len(fields); length > 1 { token.frequency, err = strconv.ParseFloat(fields[1], 64) if err != nil { return err } if length > 2 { - token.pos = fields[2] + token.pos = strings.TrimSpace(fields[2]) } } d.addToken(token) diff --git a/jieba/jieba.go b/jieba/jieba.go new file mode 100644 index 0000000..9375489 --- /dev/null +++ b/jieba/jieba.go @@ -0,0 +1,290 @@ +package jieba + +import ( + "math" + "regexp" + + "github.com/wangbin/jiebago/dictionary" + "github.com/wangbin/jiebago/finalseg" + "github.com/wangbin/jiebago/util" +) + +var ( + reEng = regexp.MustCompile(`[[:alnum:]]`) + reHanCutAll = regexp.MustCompile(`(\p{Han}+)`) + reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`) + reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`) + reSkipDefault = regexp.MustCompile(`(\r\n|\s)`) +) + +type Segmenter struct { + *dictionary.Dictionary +} + +func (seg *Segmenter) dag(runes []rune) map[int][]int { + dag := make(map[int][]int) + n := len(runes) + var frag []rune + var i int + for k := 0; k < n; k++ { + dag[k] = make([]int, 0) + i = k + frag = runes[k : k+1] + for { + freq, ok := seg.Frequency(string(frag)) + if !ok { + break + } + if freq > 0.0 { + dag[k] = append(dag[k], i) + } + i += 1 + if i >= n { + break + } + frag = runes[k : i+1] + } + if len(dag[k]) == 0 { + dag[k] = append(dag[k], k) + } + } + return dag +} + +type route struct { + frequency float64 + index int +} + +func (seg *Segmenter) calc(runes []rune) map[int]route { + dag := seg.dag(runes) + n := len(runes) + rs := make(map[int]route) + rs[n] = route{frequency: 0.0, index: 0} + logTotal := seg.LogTotal() + var r route + for idx := n - 1; idx >= 0; idx-- { + for _, i := range dag[idx] { + if freq, ok := seg.Frequency(string(runes[idx : i+1])); ok { + r = route{frequency: math.Log(freq) - logTotal + rs[i+1].frequency, index: i} + } else { + r = 
route{frequency: math.Log(1.0) - logTotal + rs[i+1].frequency, index: i} + } + if v, ok := rs[idx]; !ok { + rs[idx] = r + } else { + if v.frequency < r.frequency || (v.frequency == r.frequency && v.index < r.index) { + rs[idx] = r + } + } + } + } + return rs +} + +type cutFunc func(sentence string) <-chan string + +func (seg *Segmenter) cutDAG(sentence string) <-chan string { + result := make(chan string) + go func() { + runes := []rune(sentence) + routes := seg.calc(runes) + var y int + length := len(runes) + buf := make([]rune, 0) + for x := 0; x < length; { + y = routes[x].index + 1 + frag := runes[x:y] + if y-x == 1 { + buf = append(buf, frag...) + } else { + if len(buf) > 0 { + bufString := string(buf) + if len(buf) == 1 { + result <- bufString + } else { + if v, ok := seg.Frequency(bufString); !ok || v == 0.0 { + for x := range finalseg.Cut(bufString) { + result <- x + } + } else { + for _, elem := range buf { + result <- string(elem) + } + } + } + buf = make([]rune, 0) + } + result <- string(frag) + } + x = y + } + + if len(buf) > 0 { + bufString := string(buf) + if len(buf) == 1 { + result <- bufString + } else { + if v, ok := seg.Frequency(bufString); !ok || v == 0.0 { + for t := range finalseg.Cut(bufString) { + result <- t + } + } else { + for _, elem := range buf { + result <- string(elem) + } + } + } + } + close(result) + }() + return result +} + +func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string { + result := make(chan string) + + go func() { + runes := []rune(sentence) + routes := seg.calc(runes) + var y int + length := len(runes) + buf := make([]rune, 0) + for x := 0; x < length; { + y = routes[x].index + 1 + frag := runes[x:y] + if reEng.MatchString(string(frag)) && len(frag) == 1 { + buf = append(buf, frag...) 
+ x = y + } else { + if len(buf) > 0 { + result <- string(buf) + buf = make([]rune, 0) + } + result <- string(frag) + x = y + } + } + if len(buf) > 0 { + result <- string(buf) + buf = make([]rune, 0) + } + close(result) + }() + return result +} + +func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan string { + result := make(chan string) + var cut cutFunc + if hmm { + cut = seg.cutDAG + } else { + cut = seg.cutDAGNoHMM + } + + go func() { + for _, block := range util.RegexpSplit(reHanDefault, sentence, -1) { + if len(block) == 0 { + continue + } + if reHanDefault.MatchString(block) { + for x := range cut(block) { + result <- x + } + continue + } + for _, subBlock := range util.RegexpSplit(reSkipDefault, block, -1) { + if reSkipDefault.MatchString(subBlock) { + result <- subBlock + continue + } + for _, r := range subBlock { + result <- string(r) + } + } + } + close(result) + }() + return result +} + +func (seg *Segmenter) cutAll(sentence string) <-chan string { + result := make(chan string) + go func() { + runes := []rune(sentence) + dag := seg.dag(runes) + start := -1 + ks := make([]int, len(dag)) + for k := range dag { + ks[k] = k + } + var l []int + for k := range ks { + l = dag[k] + if len(l) == 1 && k > start { + result <- string(runes[k : l[0]+1]) + start = l[0] + continue + } + for _, j := range l { + if j > k { + result <- string(runes[k : j+1]) + start = j + } + } + } + close(result) + }() + return result +} + +func (seg *Segmenter) CutAll(sentence string) <-chan string { + result := make(chan string) + go func() { + for _, block := range util.RegexpSplit(reHanCutAll, sentence, -1) { + if len(block) == 0 { + continue + } + if reHanCutAll.MatchString(block) { + for x := range seg.cutAll(block) { + result <- x + } + continue + } + for _, subBlock := range reSkipCutAll.Split(block, -1) { + result <- subBlock + } + } + close(result) + }() + return result +} + +func (seg *Segmenter) CutForSearch(sentence string, hmm bool) <-chan string { + result := 
make(chan string) + go func() { + for word := range seg.Cut(sentence, hmm) { + runes := []rune(word) + for _, increment := range []int{2, 3} { + if len(runes) <= increment { + continue + } + var gram string + for i := 0; i < len(runes)-increment+1; i++ { + gram = string(runes[i : i+increment]) + if v, ok := seg.Frequency(gram); ok && v > 0.0 { + result <- gram + } + } + } + result <- word + } + close(result) + }() + return result +} + +func New() *Segmenter { + return &Segmenter{dictionary.New()} +} diff --git a/jieba/jieba_test.go b/jieba/jieba_test.go new file mode 100644 index 0000000..21e26a8 --- /dev/null +++ b/jieba/jieba_test.go @@ -0,0 +1,824 @@ +package jieba + +import "testing" + +var ( + seg *Segmenter + test_contents = []string{ + "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", + "我不喜欢日本和服。", + "雷猴回归人间。", + "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", + "我需要廉租房", + "永和服装饰品有限公司", + "我爱北京天安门", + "abc", + "隐马尔可夫", + "雷猴是个好网站", + "“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成", + "草泥马和欺实马是今年的流行词汇", + "伊藤洋华堂总府店", + "中国科学院计算技术研究所", + "罗密欧与朱丽叶", + "我购买了道具和服装", + "PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍", + "湖北省石首市", + "湖北省十堰市", + "总经理完成了这件事情", + "电脑修好了", + "做好了这件事情就一了百了了", + "人们审美的观点是不同的", + "我们买了一个美的空调", + "线程初始化时我们要注意", + "一个分子是由好多原子组织成的", + "祝你马到功成", + "他掉进了无底洞里", + "中国的首都是北京", + "孙君意", + "外交部发言人马朝旭", + "领导人会议和第四届东亚峰会", + "在过去的这五年", + "还需要很长的路要走", + "60周年首都阅兵", + "你好人们审美的观点是不同的", + "买水果然后来世博园", + "买水果然后去世博园", + "但是后来我才知道你是对的", + "存在即合理", + "的的的的的在的的的的就以和和和", + "I love你,不以为耻,反以为rong", + "因", + "", + "hello你好人们审美的观点是不同的", + "很好但主要是基于网页形式", + "hello你好人们审美的观点是不同的", + "为什么我不能拥有想要的生活", + "后来我才", + "此次来中国是为了", + "使用了它就可以解决一些问题", + ",使用了它就可以解决一些问题", + "其实使用了它就可以解决一些问题", + "好人使用了它就可以解决一些问题", + "是因为和国家", + "老年搜索还支持", + "干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ", + "大", + "", + "他说的确实在理", + "长春市长春节讲话", + "结婚的和尚未结婚的", + "结合成分子时", + "旅游和服务是最好的", + "这件事情的确是我的错", + "供大家参考指正", + "哈尔滨政府公布塌桥原因", + 
"我在机场入口处", + "邢永臣摄影报道", + "BP神经网络如何训练才能在分类时增加区分度?", + "南京市长江大桥", + "应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究", + "长春市长春药店", + "邓颖超生前最喜欢的衣服", + "胡锦涛是热爱世界和平的政治局常委", + "程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪", + "一次性交多少钱", + "两块五一套,三块八一斤,四块七一本,五块六一条", + "小和尚留了一个像大和尚一样的和尚头", + "我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站", + "张晓梅去人民医院做了个B超然后去买了件T恤", + "AT&T是一件不错的公司,给你发offer了吗?", + "C++和c#是什么关系?11+122=133,是吗?PI=3.14159", + "你认识那个和主席握手的的哥吗?他开一辆黑色的士。", + "枪杆子中出政权"} + + defaultCutResult = [][]string{[]string{"这是", "一个", "伸手不见五指", "的", "黑夜", "。", "我", "叫", "孙悟空", ",", "我", "爱", "北京", ",", "我", "爱", "Python", "和", "C++", "。"}, + []string{"我", "不", "喜欢", "日本", "和服", "。"}, + []string{"雷猴", "回归", "人间", "。"}, + []string{"工信处", "女干事", "每月", "经过", "下属", "科室", "都", "要", "亲口", "交代", "24", "口", "交换机", "等", "技术性", "器件", "的", "安装", "工作"}, + []string{"我", "需要", "廉租房"}, + []string{"永和", "服装", "饰品", "有限公司"}, + []string{"我", "爱", "北京", "天安门"}, + []string{"abc"}, + []string{"隐", "马尔可夫"}, + []string{"雷猴", "是", "个", "好", "网站"}, + []string{"“", "Microsoft", "”", "一词", "由", "“", "MICROcomputer", "(", "微型", "计算机", ")", "”", "和", "“", "SOFTware", "(", "软件", ")", "”", "两", "部分", "组成"}, + []string{"草泥马", "和", "欺实", "马", "是", "今年", "的", "流行", "词汇"}, + []string{"伊藤", "洋华堂", "总府", "店"}, + []string{"中国科学院计算技术研究所"}, + []string{"罗密欧", "与", "朱丽叶"}, + []string{"我", "购买", "了", "道具", "和", "服装"}, + []string{"PS", ":", " ", "我", "觉得", "开源", "有", "一个", "好处", ",", "就是", "能够", "敦促", "自己", "不断改进", ",", "避免", "敞帚", "自珍"}, + []string{"湖北省", "石首市"}, + []string{"湖北省", "十堰市"}, + []string{"总经理", "完成", "了", "这件", "事情"}, + []string{"电脑", "修好", "了"}, + []string{"做好", "了", "这件", "事情", "就", "一了百了", "了"}, + []string{"人们", "审美", "的", "观点", "是", "不同", "的"}, + []string{"我们", "买", "了", "一个", "美的", "空调"}, + []string{"线程", "初始化", "时", "我们", "要", "注意"}, + []string{"一个", "分子", "是", "由", "好多", "原子", "组织", "成", "的"}, + []string{"祝", "你", "马到功成"}, + []string{"他", "掉", "进", "了", "无底洞", "里"}, + []string{"中国", "的", "首都", "是", "北京"}, + []string{"孙君意"}, + 
[]string{"外交部", "发言人", "马朝旭"}, + []string{"领导人", "会议", "和", "第四届", "东亚", "峰会"}, + []string{"在", "过去", "的", "这", "五年"}, + []string{"还", "需要", "很长", "的", "路", "要", "走"}, + []string{"60", "周年", "首都", "阅兵"}, + []string{"你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, + []string{"买", "水果", "然后", "来", "世博园"}, + []string{"买", "水果", "然后", "去", "世博园"}, + []string{"但是", "后来", "我", "才", "知道", "你", "是", "对", "的"}, + []string{"存在", "即", "合理"}, + []string{"的", "的", "的", "的", "的", "在", "的", "的", "的", "的", "就", "以", "和", "和", "和"}, + []string{"I", " ", "love", "你", ",", "不以为耻", ",", "反", "以为", "rong"}, + []string{"因"}, + []string{}, + []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, + []string{"很", "好", "但", "主要", "是", "基于", "网页", "形式"}, + []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, + []string{"为什么", "我", "不能", "拥有", "想要", "的", "生活"}, + []string{"后来", "我", "才"}, + []string{"此次", "来", "中国", "是", "为了"}, + []string{"使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{",", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{"其实", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{"好人", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{"是因为", "和", "国家"}, + []string{"老年", "搜索", "还", "支持"}, + []string{"干脆", "就", "把", "那部", "蒙人", "的", "闲法", "给", "废", "了", "拉倒", "!", "RT", " ", "@", "laoshipukong", " ", ":", " ", "27", "日", ",", "全国人大常委会", "第三次", "审议", "侵权", "责任法", "草案", ",", "删除", "了", "有关", "医疗", "损害", "责任", "“", "举证", "倒置", "”", "的", "规定", "。", "在", "医患", "纠纷", "中本", "已", "处于", "弱势", "地位", "的", "消费者", "由此", "将", "陷入", "万劫不复", "的", "境地", "。", " "}, + []string{"大"}, + []string{}, + []string{"他", "说", "的", "确实", "在理"}, + []string{"长春", "市长", "春节", "讲话"}, + []string{"结婚", "的", "和", "尚未", "结婚", "的"}, + []string{"结合", "成", "分子", "时"}, + []string{"旅游", "和", "服务", "是", "最好", "的"}, + []string{"这件", "事情", "的确", "是", "我", "的", "错"}, + []string{"供", "大家", "参考", "指正"}, + []string{"哈尔滨", "政府", "公布", "塌桥", "原因"}, + []string{"我", "在", "机场", "入口处"}, 
+ []string{"邢永臣", "摄影", "报道"}, + []string{"BP", "神经网络", "如何", "训练", "才能", "在", "分类", "时", "增加", "区分度", "?"}, + []string{"南京市", "长江大桥"}, + []string{"应", "一些", "使用者", "的", "建议", ",", "也", "为了", "便于", "利用", "NiuTrans", "用于", "SMT", "研究"}, + []string{"长春市", "长春", "药店"}, + []string{"邓颖超", "生前", "最", "喜欢", "的", "衣服"}, + []string{"胡锦涛", "是", "热爱", "世界", "和平", "的", "政治局", "常委"}, + []string{"程序员", "祝", "海林", "和", "朱会震", "是", "在", "孙健", "的", "左面", "和", "右面", ",", " ", "范凯", "在", "最", "右面", ".", "再往", "左", "是", "李松洪"}, + []string{"一次性", "交", "多少", "钱"}, + []string{"两块", "五", "一套", ",", "三块", "八", "一斤", ",", "四块", "七", "一本", ",", "五块", "六", "一条"}, + []string{"小", "和尚", "留", "了", "一个", "像", "大", "和尚", "一样", "的", "和尚头"}, + []string{"我", "是", "中华人民共和国", "公民", ";", "我", "爸爸", "是", "共和党", "党员", ";", " ", "地铁", "和平门", "站"}, + []string{"张晓梅", "去", "人民", "医院", "做", "了", "个", "B超", "然后", "去", "买", "了", "件", "T恤"}, + []string{"AT&T", "是", "一件", "不错", "的", "公司", ",", "给", "你", "发", "offer", "了", "吗", "?"}, + []string{"C++", "和", "c#", "是", "什么", "关系", "?", "11", "+", "122", "=", "133", ",", "是", "吗", "?", "PI", "=", "3.14159"}, + []string{"你", "认识", "那个", "和", "主席", "握手", "的", "的哥", "吗", "?", "他开", "一辆", "黑色", "的士", "。"}, + []string{"枪杆子", "中", "出", "政权"}, + } + + cutAllResult = [][]string{[]string{"这", "是", "一个", "伸手", "伸手不见", "伸手不见五指", "不见", "五指", "的", "黑夜", "", "", "我", "叫", "孙悟空", "悟空", "", "", "我", "爱", "北京", "", "", "我", "爱", "Python", "和", "C++", ""}, + []string{"我", "不", "喜欢", "日本", "和服", "", ""}, + []string{"雷猴", "回归", "人间", "", ""}, + []string{"工信处", "处女", "女干事", "干事", "每月", "月经", "经过", "下属", "科室", "都", "要", "亲口", "口交", "交代", "24", "口交", "交换", "交换机", "换机", "等", "技术", "技术性", "性器", "器件", "的", "安装", "安装工", "装工", "工作"}, + []string{"我", "需要", "廉租", "廉租房", "租房"}, + []string{"永和", "和服", "服装", "装饰", "装饰品", "饰品", "有限", "有限公司", "公司"}, + []string{"我", "爱", "北京", "天安", "天安门"}, + []string{"abc"}, + []string{"隐", "马尔可", "马尔可夫", "可夫"}, + []string{"雷猴", "是", "个", "好", "网站"}, + []string{"", 
"Microsoft", "", "一", "词", "由", "", "MICROcomputer", "", "微型", "计算", "计算机", "算机", "", "", "", "和", "", "SOFTware", "", "软件", "", "", "", "两部", "部分", "分组", "组成"}, + []string{"草泥马", "和", "欺", "实", "马", "是", "今年", "的", "流行", "词汇"}, + []string{"伊", "藤", "洋华堂", "总府", "店"}, + []string{"中国", "中国科学院", "中国科学院计算技术研究所", "科学", "科学院", "学院", "计算", "计算技术", "技术", "研究", "研究所"}, + []string{"罗密欧", "与", "朱丽叶"}, + []string{"我", "购买", "了", "道具", "和服", "服装"}, + []string{"PS", "", "", "我", "觉得", "开源", "有", "一个", "好处", "", "", "就是", "能够", "敦促", "自己", "不断", "不断改进", "改进", "", "", "避免", "敞", "帚", "自珍"}, + []string{"湖北", "湖北省", "石首", "石首市"}, + []string{"湖北", "湖北省", "十堰", "十堰市"}, + []string{"总经理", "经理", "理完", "完成", "了", "这件", "事情"}, + []string{"电脑", "修好", "了"}, + []string{"做好", "了", "这件", "事情", "就", "一了百了", "了了"}, + []string{"人们", "审美", "美的", "观点", "是", "不同", "的"}, + []string{"我们", "买", "了", "一个", "美的", "空调"}, + []string{"线程", "初始", "初始化", "化时", "我们", "要", "注意"}, + []string{"一个", "分子", "是", "由", "好多", "原子", "组织", "织成", "的"}, + []string{"祝", "你", "马到功成"}, + []string{"他", "掉", "进", "了", "无底", "无底洞", "里"}, + []string{"中国", "的", "首都", "是", "北京"}, + []string{"孙", "君", "意"}, + []string{"外交", "外交部", "部发", "发言", "发言人", "人马", "马朝旭"}, + []string{"领导", "领导人", "会议", "议和", "第四", "第四届", "四届", "东亚", "峰会"}, + []string{"在", "过去", "的", "这", "五年"}, + []string{"还", "需要", "很", "长", "的", "路", "要", "走"}, + []string{"60", "周年", "首都", "阅兵"}, + []string{"你好", "好人", "人们", "审美", "美的", "观点", "是", "不同", "的"}, + []string{"买", "水果", "果然", "然后", "后来", "来世", "世博", "世博园", "博园"}, + []string{"买", "水果", "果然", "然后", "后去", "去世", "世博", "世博园", "博园"}, + []string{"但是", "后来", "我", "才", "知道", "你", "是", "对", "的"}, + []string{"存在", "即", "合理"}, + []string{"的", "的", "的", "的", "的", "在", "的", "的", "的", "的", "就", "以", "和", "和", "和"}, + []string{"I", "love", "你", "", "", "不以", "不以为耻", "以为", "耻", "", "", "反", "以为", "rong"}, + []string{"因"}, + []string{}, + []string{"hello", "你好", "好人", "人们", "审美", "美的", "观点", "是", "不同", "的"}, + []string{"很", "好", 
"但", "主要", "要是", "基于", "网页", "形式"}, + []string{"hello", "你好", "好人", "人们", "审美", "美的", "观点", "是", "不同", "的"}, + []string{"为什么", "什么", "我", "不能", "拥有", "想要", "的", "生活"}, + []string{"后来", "我", "才"}, + []string{"此次", "来", "中国", "国是", "为了"}, + []string{"使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{"", "", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{"其实", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{"好人", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{"是因为", "因为", "和", "国家"}, + []string{"老年", "搜索", "索还", "支持"}, + []string{"干脆", "就", "把", "那部", "蒙人", "的", "闲", "法", "给", "废", "了", "拉倒", "", "RT", "", "laoshipukong", "", "", "27", "日", "", "", "全国", "全国人大", "全国人大常委会", "国人", "人大", "人大常委会", "常委", "常委会", "委会", "第三", "第三次", "三次", "审议", "侵权", "权责", "责任", "责任法", "草案", "", "", "删除", "除了", "有关", "医疗", "损害", "责任", "", "", "举证", "倒置", "", "", "的", "规定", "", "", "在", "医患", "纠纷", "中", "本", "已", "处于", "弱势", "地位", "的", "消费", "消费者", "由此", "将", "陷入", "万劫不复", "不复", "的", "境地", "", "", ""}, + []string{"大"}, + []string{}, + []string{"他", "说", "的确", "确实", "实在", "理"}, + []string{"长春", "长春市", "市长", "长春", "春节", "讲话"}, + []string{"结婚", "的", "和尚", "尚未", "未结", "结婚", "的"}, + []string{"结合", "合成", "成分", "分子", "时"}, + []string{"旅游", "和服", "服务", "是", "最好", "的"}, + []string{"这件", "事情", "的确", "是", "我", "的", "错"}, + []string{"供", "大家", "参考", "指正"}, + []string{"哈尔", "哈尔滨", "政府", "公布", "塌", "桥", "原因"}, + []string{"我", "在", "机场", "入口", "入口处"}, + []string{"邢", "永", "臣", "摄影", "报道"}, + []string{"BP", "神经", "神经网", "神经网络", "网络", "如何", "训练", "才能", "在", "分类", "时", "增加", "加区", "区分", "区分度", "分度", "", ""}, + []string{"南京", "南京市", "京市", "市长", "长江", "长江大桥", "大桥"}, + []string{"应", "一些", "使用", "使用者", "用者", "的", "建议", "", "", "也", "为了", "便于", "利用", "NiuTrans", "用于", "SMT", "研究"}, + []string{"长春", "长春市", "市长", "长春", "春药", "药店"}, + []string{"邓颖超", "超生", "生前", "最", "喜欢", "的", "衣服"}, + []string{"胡锦涛", "锦涛", "是", "热爱", "世界", "和平", "的", "政治", "政治局", "常委"}, + []string{"程序", "程序员", "祝", 
"海林", "和", "朱", "会", "震", "是", "在", "孙", "健", "的", "左面", "和", "右面", "", "", "", "范", "凯", "在", "最", "右面", "", "", "再往", "左", "是", "李", "松", "洪"}, + []string{"一次", "一次性", "性交", "多少", "多少钱"}, + []string{"两块", "五一", "一套", "", "", "三块", "八一", "一斤", "", "", "四块", "七一", "一本", "", "", "五块", "六一", "一条"}, + []string{"小", "和尚", "留", "了", "一个", "像", "大", "和尚", "一样", "的", "和尚", "和尚头"}, + []string{"我", "是", "中华", "中华人民", "中华人民共和国", "华人", "人民", "人民共和国", "共和", "共和国", "国公", "公民", "", "", "我", "爸爸", "是", "共和", "共和党", "党员", "", "", "", "地铁", "和平", "和平门", "站"}, + []string{"张晓梅", "去", "人民", "民医院", "医院", "做", "了", "个", "B", "超然", "然后", "后去", "买", "了", "件", "T", "恤"}, + []string{"AT", "T", "是", "一件", "不错", "的", "公司", "", "", "给", "你", "发", "offer", "了", "吗", "", ""}, + []string{"C++", "和", "c#", "是", "什么", "关系", "", "11+122", "133", "", "是", "吗", "", "PI", "3", "14159"}, + []string{"你", "认识", "那个", "和", "主席", "握手", "的", "的哥", "吗", "", "", "他", "开", "一辆", "黑色", "的士", "", ""}, + []string{"枪杆", "枪杆子", "杆子", "中出", "政权"}, + } + + defaultCutNoHMMResult = [][]string{[]string{"这", "是", "一个", "伸手不见五指", "的", "黑夜", "。", "我", "叫", "孙悟空", ",", "我", "爱", "北京", ",", "我", "爱", "Python", "和", "C++", "。"}, + []string{"我", "不", "喜欢", "日本", "和服", "。"}, + []string{"雷猴", "回归", "人间", "。"}, + []string{"工信处", "女干事", "每月", "经过", "下属", "科室", "都", "要", "亲口", "交代", "24", "口", "交换机", "等", "技术性", "器件", "的", "安装", "工作"}, + []string{"我", "需要", "廉租房"}, + []string{"永和", "服装", "饰品", "有限公司"}, + []string{"我", "爱", "北京", "天安门"}, + []string{"abc"}, + []string{"隐", "马尔可夫"}, + []string{"雷猴", "是", "个", "好", "网站"}, + []string{"“", "Microsoft", "”", "一", "词", "由", "“", "MICROcomputer", "(", "微型", "计算机", ")", "”", "和", "“", "SOFTware", "(", "软件", ")", "”", "两", "部分", "组成"}, + []string{"草泥马", "和", "欺", "实", "马", "是", "今年", "的", "流行", "词汇"}, + []string{"伊", "藤", "洋华堂", "总府", "店"}, + []string{"中国科学院计算技术研究所"}, + []string{"罗密欧", "与", "朱丽叶"}, + []string{"我", "购买", "了", "道具", "和", "服装"}, + []string{"PS", ":", " ", "我", "觉得", "开源", "有", 
"一个", "好处", ",", "就是", "能够", "敦促", "自己", "不断改进", ",", "避免", "敞", "帚", "自珍"}, + []string{"湖北省", "石首市"}, + []string{"湖北省", "十堰市"}, + []string{"总经理", "完成", "了", "这件", "事情"}, + []string{"电脑", "修好", "了"}, + []string{"做好", "了", "这件", "事情", "就", "一了百了", "了"}, + []string{"人们", "审美", "的", "观点", "是", "不同", "的"}, + []string{"我们", "买", "了", "一个", "美的", "空调"}, + []string{"线程", "初始化", "时", "我们", "要", "注意"}, + []string{"一个", "分子", "是", "由", "好多", "原子", "组织", "成", "的"}, + []string{"祝", "你", "马到功成"}, + []string{"他", "掉", "进", "了", "无底洞", "里"}, + []string{"中国", "的", "首都", "是", "北京"}, + []string{"孙", "君", "意"}, + []string{"外交部", "发言人", "马朝旭"}, + []string{"领导人", "会议", "和", "第四届", "东亚", "峰会"}, + []string{"在", "过去", "的", "这", "五年"}, + []string{"还", "需要", "很", "长", "的", "路", "要", "走"}, + []string{"60", "周年", "首都", "阅兵"}, + []string{"你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, + []string{"买", "水果", "然后", "来", "世博园"}, + []string{"买", "水果", "然后", "去", "世博园"}, + []string{"但是", "后来", "我", "才", "知道", "你", "是", "对", "的"}, + []string{"存在", "即", "合理"}, + []string{"的", "的", "的", "的", "的", "在", "的", "的", "的", "的", "就", "以", "和", "和", "和"}, + []string{"I", " ", "love", "你", ",", "不以为耻", ",", "反", "以为", "rong"}, + []string{"因"}, + []string{}, + []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, + []string{"很", "好", "但", "主要", "是", "基于", "网页", "形式"}, + []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, + []string{"为什么", "我", "不能", "拥有", "想要", "的", "生活"}, + []string{"后来", "我", "才"}, + []string{"此次", "来", "中国", "是", "为了"}, + []string{"使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{",", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{"其实", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{"好人", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{"是因为", "和", "国家"}, + []string{"老年", "搜索", "还", "支持"}, + []string{"干脆", "就", "把", "那", "部", "蒙", "人", "的", "闲", "法", "给", "废", "了", "拉倒", "!", "RT", " ", "@", "laoshipukong", " ", ":", " ", "27", "日", ",", 
"全国人大常委会", "第三次", "审议", "侵权", "责任法", "草案", ",", "删除", "了", "有关", "医疗", "损害", "责任", "“", "举证", "倒置", "”", "的", "规定", "。", "在", "医患", "纠纷", "中", "本", "已", "处于", "弱势", "地位", "的", "消费者", "由此", "将", "陷入", "万劫不复", "的", "境地", "。", " "}, + []string{"大"}, + []string{}, + []string{"他", "说", "的", "确实", "在", "理"}, + []string{"长春", "市长", "春节", "讲话"}, + []string{"结婚", "的", "和", "尚未", "结婚", "的"}, + []string{"结合", "成", "分子", "时"}, + []string{"旅游", "和", "服务", "是", "最好", "的"}, + []string{"这件", "事情", "的确", "是", "我", "的", "错"}, + []string{"供", "大家", "参考", "指正"}, + []string{"哈尔滨", "政府", "公布", "塌", "桥", "原因"}, + []string{"我", "在", "机场", "入口处"}, + []string{"邢", "永", "臣", "摄影", "报道"}, + []string{"BP", "神经网络", "如何", "训练", "才能", "在", "分类", "时", "增加", "区分度", "?"}, + []string{"南京市", "长江大桥"}, + []string{"应", "一些", "使用者", "的", "建议", ",", "也", "为了", "便于", "利用", "NiuTrans", "用于", "SMT", "研究"}, + []string{"长春市", "长春", "药店"}, + []string{"邓颖超", "生前", "最", "喜欢", "的", "衣服"}, + []string{"胡锦涛", "是", "热爱", "世界", "和平", "的", "政治局", "常委"}, + []string{"程序员", "祝", "海林", "和", "朱", "会", "震", "是", "在", "孙", "健", "的", "左面", "和", "右面", ",", " ", "范", "凯", "在", "最", "右面", ".", "再", "往", "左", "是", "李", "松", "洪"}, + []string{"一次性", "交", "多少", "钱"}, + []string{"两块", "五", "一套", ",", "三块", "八", "一斤", ",", "四块", "七", "一本", ",", "五块", "六", "一条"}, + []string{"小", "和尚", "留", "了", "一个", "像", "大", "和尚", "一样", "的", "和尚头"}, + []string{"我", "是", "中华人民共和国", "公民", ";", "我", "爸爸", "是", "共和党", "党员", ";", " ", "地铁", "和平门", "站"}, + []string{"张晓梅", "去", "人民", "医院", "做", "了", "个", "B超", "然后", "去", "买", "了", "件", "T恤"}, + []string{"AT&T", "是", "一件", "不错", "的", "公司", ",", "给", "你", "发", "offer", "了", "吗", "?"}, + []string{"C++", "和", "c#", "是", "什么", "关系", "?", "11", "+", "122", "=", "133", ",", "是", "吗", "?", "PI", "=", "3", ".", "14159"}, + []string{"你", "认识", "那个", "和", "主席", "握手", "的", "的哥", "吗", "?", "他", "开", "一辆", "黑色", "的士", "。"}, + []string{"枪杆子", "中", "出", "政权"}, + } + + cutForSearchResult = [][]string{[]string{"这是", "一个", "伸手", 
"不见", "五指", "伸手不见五指", "的", "黑夜", "。", "我", "叫", "悟空", "孙悟空", ",", "我", "爱", "北京", ",", "我", "爱", "Python", "和", "C++", "。"}, + []string{"我", "不", "喜欢", "日本", "和服", "。"}, + []string{"雷猴", "回归", "人间", "。"}, + []string{"工信处", "干事", "女干事", "每月", "经过", "下属", "科室", "都", "要", "亲口", "交代", "24", "口", "交换", "换机", "交换机", "等", "技术", "技术性", "器件", "的", "安装", "工作"}, + []string{"我", "需要", "廉租", "租房", "廉租房"}, + []string{"永和", "服装", "饰品", "有限", "公司", "有限公司"}, + []string{"我", "爱", "北京", "天安", "天安门"}, + []string{"abc"}, + []string{"隐", "可夫", "马尔可", "马尔可夫"}, + []string{"雷猴", "是", "个", "好", "网站"}, + []string{"“", "Microsoft", "”", "一词", "由", "“", "MICROcomputer", "(", "微型", "计算", "算机", "计算机", ")", "”", "和", "“", "SOFTware", "(", "软件", ")", "”", "两", "部分", "组成"}, + []string{"草泥马", "和", "欺实", "马", "是", "今年", "的", "流行", "词汇"}, + []string{"伊藤", "洋华堂", "总府", "店"}, + []string{"中国", "科学", "学院", "计算", "技术", "研究", "科学院", "研究所", "中国科学院计算技术研究所"}, + []string{"罗密欧", "与", "朱丽叶"}, + []string{"我", "购买", "了", "道具", "和", "服装"}, + []string{"PS", ":", " ", "我", "觉得", "开源", "有", "一个", "好处", ",", "就是", "能够", "敦促", "自己", "不断", "改进", "不断改进", ",", "避免", "敞帚", "自珍"}, + []string{"湖北", "湖北省", "石首", "石首市"}, + []string{"湖北", "湖北省", "十堰", "十堰市"}, + []string{"经理", "总经理", "完成", "了", "这件", "事情"}, + []string{"电脑", "修好", "了"}, + []string{"做好", "了", "这件", "事情", "就", "一了百了", "了"}, + []string{"人们", "审美", "的", "观点", "是", "不同", "的"}, + []string{"我们", "买", "了", "一个", "美的", "空调"}, + []string{"线程", "初始", "初始化", "时", "我们", "要", "注意"}, + []string{"一个", "分子", "是", "由", "好多", "原子", "组织", "成", "的"}, + []string{"祝", "你", "马到功成"}, + []string{"他", "掉", "进", "了", "无底", "无底洞", "里"}, + []string{"中国", "的", "首都", "是", "北京"}, + []string{"孙君意"}, + []string{"外交", "外交部", "发言", "发言人", "马朝旭"}, + []string{"领导", "领导人", "会议", "和", "第四", "四届", "第四届", "东亚", "峰会"}, + []string{"在", "过去", "的", "这", "五年"}, + []string{"还", "需要", "很长", "的", "路", "要", "走"}, + []string{"60", "周年", "首都", "阅兵"}, + []string{"你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, + 
[]string{"买", "水果", "然后", "来", "世博", "博园", "世博园"}, + []string{"买", "水果", "然后", "去", "世博", "博园", "世博园"}, + []string{"但是", "后来", "我", "才", "知道", "你", "是", "对", "的"}, + []string{"存在", "即", "合理"}, + []string{"的", "的", "的", "的", "的", "在", "的", "的", "的", "的", "就", "以", "和", "和", "和"}, + []string{"I", " ", "love", "你", ",", "不以", "以为", "不以为耻", ",", "反", "以为", "rong"}, + []string{"因"}, + []string{}, + []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, + []string{"很", "好", "但", "主要", "是", "基于", "网页", "形式"}, + []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, + []string{"什么", "为什么", "我", "不能", "拥有", "想要", "的", "生活"}, + []string{"后来", "我", "才"}, + []string{"此次", "来", "中国", "是", "为了"}, + []string{"使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{",", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{"其实", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{"好人", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{"因为", "是因为", "和", "国家"}, + []string{"老年", "搜索", "还", "支持"}, + []string{"干脆", "就", "把", "那部", "蒙人", "的", "闲法", "给", "废", "了", "拉倒", "!", "RT", " ", "@", "laoshipukong", " ", ":", " ", "27", "日", ",", "全国", "国人", "人大", "常委", "委会", "常委会", "全国人大常委会", "第三", "三次", "第三次", "审议", "侵权", "责任", "责任法", "草案", ",", "删除", "了", "有关", "医疗", "损害", "责任", "“", "举证", "倒置", "”", "的", "规定", "。", "在", "医患", "纠纷", "中本", "已", "处于", "弱势", "地位", "的", "消费", "消费者", "由此", "将", "陷入", "不复", "万劫不复", "的", "境地", "。", " "}, + []string{"大"}, + []string{}, + []string{"他", "说", "的", "确实", "在理"}, + []string{"长春", "市长", "春节", "讲话"}, + []string{"结婚", "的", "和", "尚未", "结婚", "的"}, + []string{"结合", "成", "分子", "时"}, + []string{"旅游", "和", "服务", "是", "最好", "的"}, + []string{"这件", "事情", "的确", "是", "我", "的", "错"}, + []string{"供", "大家", "参考", "指正"}, + []string{"哈尔", "哈尔滨", "政府", "公布", "塌桥", "原因"}, + []string{"我", "在", "机场", "入口", "入口处"}, + []string{"邢永臣", "摄影", "报道"}, + []string{"BP", "神经", "网络", "神经网", "神经网络", "如何", "训练", "才能", "在", "分类", "时", "增加", "区分", "分度", "区分度", 
"?"}, + []string{"南京", "京市", "南京市", "长江", "大桥", "长江大桥"}, + []string{"应", "一些", "使用", "用者", "使用者", "的", "建议", ",", "也", "为了", "便于", "利用", "NiuTrans", "用于", "SMT", "研究"}, + []string{"长春", "长春市", "长春", "药店"}, + []string{"邓颖超", "生前", "最", "喜欢", "的", "衣服"}, + []string{"锦涛", "胡锦涛", "是", "热爱", "世界", "和平", "的", "政治", "政治局", "常委"}, + []string{"程序", "程序员", "祝", "海林", "和", "朱会震", "是", "在", "孙健", "的", "左面", "和", "右面", ",", " ", "范凯", "在", "最", "右面", ".", "再往", "左", "是", "李松洪"}, + []string{"一次", "一次性", "交", "多少", "钱"}, + []string{"两块", "五", "一套", ",", "三块", "八", "一斤", ",", "四块", "七", "一本", ",", "五块", "六", "一条"}, + []string{"小", "和尚", "留", "了", "一个", "像", "大", "和尚", "一样", "的", "和尚", "和尚头"}, + []string{"我", "是", "中华", "华人", "人民", "共和", "共和国", "中华人民共和国", "公民", ";", "我", "爸爸", "是", "共和", "共和党", "党员", ";", " ", "地铁", "和平", "和平门", "站"}, + []string{"张晓梅", "去", "人民", "医院", "做", "了", "个", "B超", "然后", "去", "买", "了", "件", "T恤"}, + []string{"AT&T", "是", "一件", "不错", "的", "公司", ",", "给", "你", "发", "offer", "了", "吗", "?"}, + []string{"C++", "和", "c#", "是", "什么", "关系", "?", "11", "+", "122", "=", "133", ",", "是", "吗", "?", "PI", "=", "3.14159"}, + []string{"你", "认识", "那个", "和", "主席", "握手", "的", "的哥", "吗", "?", "他开", "一辆", "黑色", "的士", "。"}, + []string{"枪杆", "杆子", "枪杆子", "中", "出", "政权"}, + } + + cutForSearchNoHMMResult = [][]string{[]string{"这", "是", "一个", "伸手", "不见", "五指", "伸手不见五指", "的", "黑夜", "。", "我", "叫", "悟空", "孙悟空", ",", "我", "爱", "北京", ",", "我", "爱", "Python", "和", "C++", "。"}, + []string{"我", "不", "喜欢", "日本", "和服", "。"}, + []string{"雷猴", "回归", "人间", "。"}, + []string{"工信处", "干事", "女干事", "每月", "经过", "下属", "科室", "都", "要", "亲口", "交代", "24", "口", "交换", "换机", "交换机", "等", "技术", "技术性", "器件", "的", "安装", "工作"}, + []string{"我", "需要", "廉租", "租房", "廉租房"}, + []string{"永和", "服装", "饰品", "有限", "公司", "有限公司"}, + []string{"我", "爱", "北京", "天安", "天安门"}, + []string{"abc"}, + []string{"隐", "可夫", "马尔可", "马尔可夫"}, + []string{"雷猴", "是", "个", "好", "网站"}, + []string{"“", "Microsoft", "”", "一", "词", "由", "“", 
"MICROcomputer", "(", "微型", "计算", "算机", "计算机", ")", "”", "和", "“", "SOFTware", "(", "软件", ")", "”", "两", "部分", "组成"}, + []string{"草泥马", "和", "欺", "实", "马", "是", "今年", "的", "流行", "词汇"}, + []string{"伊", "藤", "洋华堂", "总府", "店"}, + []string{"中国", "科学", "学院", "计算", "技术", "研究", "科学院", "研究所", "中国科学院计算技术研究所"}, + []string{"罗密欧", "与", "朱丽叶"}, + []string{"我", "购买", "了", "道具", "和", "服装"}, + []string{"PS", ":", " ", "我", "觉得", "开源", "有", "一个", "好处", ",", "就是", "能够", "敦促", "自己", "不断", "改进", "不断改进", ",", "避免", "敞", "帚", "自珍"}, + []string{"湖北", "湖北省", "石首", "石首市"}, + []string{"湖北", "湖北省", "十堰", "十堰市"}, + []string{"经理", "总经理", "完成", "了", "这件", "事情"}, + []string{"电脑", "修好", "了"}, + []string{"做好", "了", "这件", "事情", "就", "一了百了", "了"}, + []string{"人们", "审美", "的", "观点", "是", "不同", "的"}, + []string{"我们", "买", "了", "一个", "美的", "空调"}, + []string{"线程", "初始", "初始化", "时", "我们", "要", "注意"}, + []string{"一个", "分子", "是", "由", "好多", "原子", "组织", "成", "的"}, + []string{"祝", "你", "马到功成"}, + []string{"他", "掉", "进", "了", "无底", "无底洞", "里"}, + []string{"中国", "的", "首都", "是", "北京"}, + []string{"孙", "君", "意"}, + []string{"外交", "外交部", "发言", "发言人", "马朝旭"}, + []string{"领导", "领导人", "会议", "和", "第四", "四届", "第四届", "东亚", "峰会"}, + []string{"在", "过去", "的", "这", "五年"}, + []string{"还", "需要", "很", "长", "的", "路", "要", "走"}, + []string{"60", "周年", "首都", "阅兵"}, + []string{"你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, + []string{"买", "水果", "然后", "来", "世博", "博园", "世博园"}, + []string{"买", "水果", "然后", "去", "世博", "博园", "世博园"}, + []string{"但是", "后来", "我", "才", "知道", "你", "是", "对", "的"}, + []string{"存在", "即", "合理"}, + []string{"的", "的", "的", "的", "的", "在", "的", "的", "的", "的", "就", "以", "和", "和", "和"}, + []string{"I", " ", "love", "你", ",", "不以", "以为", "不以为耻", ",", "反", "以为", "rong"}, + []string{"因"}, + []string{}, + []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, + []string{"很", "好", "但", "主要", "是", "基于", "网页", "形式"}, + []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, + []string{"什么", "为什么", "我", "不能", 
"拥有", "想要", "的", "生活"}, + []string{"后来", "我", "才"}, + []string{"此次", "来", "中国", "是", "为了"}, + []string{"使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{",", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{"其实", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{"好人", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{"因为", "是因为", "和", "国家"}, + []string{"老年", "搜索", "还", "支持"}, + []string{"干脆", "就", "把", "那", "部", "蒙", "人", "的", "闲", "法", "给", "废", "了", "拉倒", "!", "RT", " ", "@", "laoshipukong", " ", ":", " ", "27", "日", ",", "全国", "国人", "人大", "常委", "委会", "常委会", "全国人大常委会", "第三", "三次", "第三次", "审议", "侵权", "责任", "责任法", "草案", ",", "删除", "了", "有关", "医疗", "损害", "责任", "“", "举证", "倒置", "”", "的", "规定", "。", "在", "医患", "纠纷", "中", "本", "已", "处于", "弱势", "地位", "的", "消费", "消费者", "由此", "将", "陷入", "不复", "万劫不复", "的", "境地", "。", " "}, + []string{"大"}, + []string{}, + []string{"他", "说", "的", "确实", "在", "理"}, + []string{"长春", "市长", "春节", "讲话"}, + []string{"结婚", "的", "和", "尚未", "结婚", "的"}, + []string{"结合", "成", "分子", "时"}, + []string{"旅游", "和", "服务", "是", "最好", "的"}, + []string{"这件", "事情", "的确", "是", "我", "的", "错"}, + []string{"供", "大家", "参考", "指正"}, + []string{"哈尔", "哈尔滨", "政府", "公布", "塌", "桥", "原因"}, + []string{"我", "在", "机场", "入口", "入口处"}, + []string{"邢", "永", "臣", "摄影", "报道"}, + []string{"BP", "神经", "网络", "神经网", "神经网络", "如何", "训练", "才能", "在", "分类", "时", "增加", "区分", "分度", "区分度", "?"}, + []string{"南京", "京市", "南京市", "长江", "大桥", "长江大桥"}, + []string{"应", "一些", "使用", "用者", "使用者", "的", "建议", ",", "也", "为了", "便于", "利用", "NiuTrans", "用于", "SMT", "研究"}, + []string{"长春", "长春市", "长春", "药店"}, + []string{"邓颖超", "生前", "最", "喜欢", "的", "衣服"}, + []string{"锦涛", "胡锦涛", "是", "热爱", "世界", "和平", "的", "政治", "政治局", "常委"}, + []string{"程序", "程序员", "祝", "海林", "和", "朱", "会", "震", "是", "在", "孙", "健", "的", "左面", "和", "右面", ",", " ", "范", "凯", "在", "最", "右面", ".", "再", "往", "左", "是", "李", "松", "洪"}, + []string{"一次", "一次性", "交", "多少", "钱"}, + []string{"两块", "五", "一套", ",", "三块", "八", 
"一斤", ",", "四块", "七", "一本", ",", "五块", "六", "一条"}, + []string{"小", "和尚", "留", "了", "一个", "像", "大", "和尚", "一样", "的", "和尚", "和尚头"}, + []string{"我", "是", "中华", "华人", "人民", "共和", "共和国", "中华人民共和国", "公民", ";", "我", "爸爸", "是", "共和", "共和党", "党员", ";", " ", "地铁", "和平", "和平门", "站"}, + []string{"张晓梅", "去", "人民", "医院", "做", "了", "个", "B超", "然后", "去", "买", "了", "件", "T恤"}, + []string{"AT&T", "是", "一件", "不错", "的", "公司", ",", "给", "你", "发", "offer", "了", "吗", "?"}, + []string{"C++", "和", "c#", "是", "什么", "关系", "?", "11", "+", "122", "=", "133", ",", "是", "吗", "?", "PI", "=", "3", ".", "14159"}, + []string{"你", "认识", "那个", "和", "主席", "握手", "的", "的哥", "吗", "?", "他", "开", "一辆", "黑色", "的士", "。"}, + []string{"枪杆", "杆子", "枪杆子", "中", "出", "政权"}, + } + + userDictCutResult = [][]string{ + []string{"这是", "一个", "伸手", "不见", "五指", "的", "黑夜", "。", "我", "叫", "孙悟空", ",", "我", "爱北京", ",", "我", "爱", "Python", "和", "C", "++", "。"}, + []string{"我", "不", "喜欢", "日本", "和", "服", "。"}, + []string{"雷猴", "回归人间", "。"}, + []string{"工信", "处女", "干事", "每", "月", "经过", "下", "属", "科室", "都", "要", "亲口", "交代", "24", "口交换机", "等", "技术性", "器件", "的", "安装", "工作"}, + []string{"我", "需要", "廉租房"}, + []string{"永和服", "装饰品", "有", "限公司"}, + []string{"我", "爱北京", "天安门"}, + []string{"abc"}, + []string{"隐马尔", "可夫"}, + []string{"雷猴", "是", "个", "好", "网站"}, + []string{"“", "Microsoft", "”", "一词", "由", "“", "MICROcomputer", "(", "微型", "计算机", ")", "”", "和", "“", "SOFTware", "(", "软件", ")", "”", "两部分", "组成"}, + []string{"草泥", "马", "和", "欺实", "马", "是", "今", "年", "的", "流行", "词汇"}, + []string{"伊藤洋华堂", "总府", "店"}, + []string{"中国", "科学院", "计算", "技术", "研究", "所"}, + []string{"罗密欧", "与", "朱丽叶"}, + []string{"我购", "买", "了", "道", "具", "和", "服装"}, + []string{"PS", ":", " ", "我觉", "得", "开源", "有", "一个", "好", "处", ",", "就", "是", "能够", "敦促", "自己", "不断", "改进", ",", "避免", "敞帚", "自珍"}, + []string{"湖北省", "石首市"}, + []string{"湖北省", "十堰市"}, + []string{"总经理", "完成", "了", "这件", "事情"}, + []string{"电脑", "修好", "了"}, + []string{"做", "好", "了", "这件", "事情", "就", "一", "了", 
"百", "了", "了"}, + []string{"人们", "审美", "的", "观点", "是", "不同", "的"}, + []string{"我们", "买", "了", "一个", "美", "的", "空调"}, + []string{"线程", "初始", "化时", "我们", "要", "注意"}, + []string{"一个", "分子", "是", "由", "好", "多", "原子", "组织成", "的"}, + []string{"祝", "你", "马到", "功成"}, + []string{"他", "掉", "进", "了", "无底", "洞里"}, + []string{"中国", "的", "首", "都", "是", "北京"}, + []string{"孙君意"}, + []string{"外交部", "发言人", "马朝旭"}, + []string{"领导", "人会议", "和", "第四届", "东亚峰", "会"}, + []string{"在", "过", "去", "的", "这五年"}, + []string{"还", "需要", "很长", "的", "路", "要", "走"}, + []string{"60", "周年首", "都", "阅兵"}, + []string{"你", "好人", "们", "审美", "的", "观点", "是", "不同", "的"}, + []string{"买水果", "然后", "来", "世博园"}, + []string{"买水果", "然后", "去", "世博园"}, + []string{"但", "是", "后", "来", "我", "才", "知道", "你", "是", "对", "的"}, + []string{"存在", "即", "合理"}, + []string{"的", "的", "的", "的", "的", "在", "的", "的", "的", "的", "就", "以", "和", "和", "和"}, + []string{"I", " ", "love", "你", ",", "不以", "为耻", ",", "反以", "为", "rong"}, + []string{"因"}, + []string{}, + []string{"hello", "你", "好人", "们", "审美", "的", "观点", "是", "不同", "的"}, + []string{"很", "好", "但", "主要", "是", "基于", "网页", "形式"}, + []string{"hello", "你", "好人", "们", "审美", "的", "观点", "是", "不同", "的"}, + []string{"为", "什么", "我", "不能", "拥有", "想", "要", "的", "生活"}, + []string{"后来", "我", "才"}, + []string{"此次", "来", "中国", "是", "为", "了"}, + []string{"使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{",", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{"其实", "使", "用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{"好人", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, + []string{"是", "因为", "和", "国家"}, + []string{"老年", "搜索", "还", "支持"}, + []string{"干脆", "就", "把", "那部", "蒙人", "的", "闲法", "给", "废", "了", "拉", "倒", "!", "RT", " ", "@", "laoshipukong", " ", ":", " ", "27", "日", ",", "全国人", "大常委会", "第三次", "审议", "侵权责", "任法", "草案", ",", "删除", "了", "有", "关医疗", "损害", "责任", "“", "举证", "倒", "置", "”", "的", "规定", "。", "在", "医患", "纠纷", "中本", "已", "处于", "弱势", "地位", "的", "消费者", "由", "此", "将", "陷入", 
// chanToArray drains ch until it is closed and returns every received word
// in arrival order. The result is an empty, non-nil slice when the channel
// is closed without ever delivering a value.
func chanToArray(ch <-chan string) []string {
	words := []string{}
	for {
		word, open := <-ch
		if !open {
			// The producer closed the channel: nothing more will arrive.
			return words
		}
		words = append(words, word)
	}
}
chanToArray(seg.cutDAG("BP神经网络如何训练才能在分类时增加区分度?")) + if len(result) != 11 { + t.Fatal(result) + } +} + +func TestCutDAGNoHmm(t *testing.T) { + result := chanToArray(seg.cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度?")) + if len(result) != 11 { + t.Fatal(result) + } +} + +func TestDefaultCut(t *testing.T) { + var result []string + for index, content := range test_contents { + result = chanToArray(seg.Cut(content, true)) + if len(result) != len(defaultCutResult[index]) { + t.Errorf("default cut for %s length should be %d not %d\n", + content, len(defaultCutResult[index]), len(result)) + t.Errorf("expect: %v\n", defaultCutResult[index]) + t.Fatalf("got: %v\n", result) + } + for i, r := range result { + if r != defaultCutResult[index][i] { + t.Fatal(r) + } + } + } +} + +func TestCutAll(t *testing.T) { + var result []string + for index, content := range test_contents { + result = chanToArray(seg.CutAll(content)) + if len(result) != len(cutAllResult[index]) { + t.Errorf("cut all for %s length should be %d not %d\n", + content, len(cutAllResult[index]), len(result)) + t.Errorf("expect: %v\n", defaultCutResult[index]) + t.Fatalf("got: %v\n", result) + } + for i, c := range result { + if c != cutAllResult[index][i] { + t.Fatal(c) + } + } + } +} + +func TestDefaultCutNoHMM(t *testing.T) { + var result []string + for index, content := range test_contents { + result = chanToArray(seg.Cut(content, false)) + if len(result) != len(defaultCutNoHMMResult[index]) { + t.Fatalf("default cut no hmm for %s length should be %d not %d\n", + content, len(defaultCutNoHMMResult[index]), len(result)) + } + for i, c := range result { + if c != defaultCutNoHMMResult[index][i] { + t.Fatal(c) + } + } + } +} + +func TestCutForSearch(t *testing.T) { + var result []string + for index, content := range test_contents { + result = chanToArray(seg.CutForSearch(content, true)) + if len(result) != len(cutForSearchResult[index]) { + t.Fatalf("cut for search for %s length should be %d not %d\n", + content, 
len(cutForSearchResult[index]), len(result)) + } + for i, c := range result { + if c != cutForSearchResult[index][i] { + t.Fatal(c) + } + } + } + for index, content := range test_contents { + result = chanToArray(seg.CutForSearch(content, false)) + if len(result) != len(cutForSearchNoHMMResult[index]) { + t.Fatalf("cut for search no hmm for %s length should be %d not %d\n", + content, len(cutForSearchNoHMMResult[index]), len(result)) + } + for i, c := range result { + if c != cutForSearchNoHMMResult[index][i] { + t.Fatal(c) + } + } + } +} + +func TestLoadDictionary(t *testing.T) { + var result []string + seg.LoadDictionary("../foobar.txt") + for index, content := range test_contents { + result = chanToArray(seg.Cut(content, true)) + if len(result) != len(userDictCutResult[index]) { + t.Fatalf("default cut with user dictionary for %s length should be %d not %d\n", + content, len(userDictCutResult[index]), len(result)) + } + for i, c := range result { + if c != userDictCutResult[index][i] { + t.Fatal(c) + } + } + } + seg.LoadDictionary("../dict.txt") +} + +func TestLoadUserDictionary(t *testing.T) { + seg.LoadUserDictionary("../userdict.txt") + + sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型" + result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", ",", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"} + + words := chanToArray(seg.Cut(sentence, true)) + if len(words) != len(result) { + t.Fatal(len(words)) + } + for index, word := range words { + if word != result[index] { + t.Fatal(word) + } + } + + sentence = "easy_install is great" + result = []string{"easy_install", " ", "is", " ", "great"} + words = chanToArray(seg.Cut(sentence, true)) + if len(words) != len(result) { + t.Fatal(len(words)) + } + for index, word := range words { + if word != result[index] { + t.Fatal(word) + } + } + + sentence = "python 的正则表达式是好用的" + 
result = []string{"python", " ", "的", "正则表达式", "是", "好用", "的"} + words = chanToArray(seg.Cut(sentence, true)) + if len(words) != len(result) { + t.Fatal(words) + t.Fatal(result) + } + for index, word := range words { + if word != result[index] { + t.Fatal(word) + } + } + seg.LoadDictionary("../dict.txt") +} + +func BenchmarkCutNoHMM(b *testing.B) { + sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作" + b.ResetTimer() + for i := 0; i < b.N; i++ { + chanToArray(seg.Cut(sentence, false)) + } +} + +func BenchmarkCut(b *testing.B) { + sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作" + b.ResetTimer() + for i := 0; i < b.N; i++ { + chanToArray(seg.Cut(sentence, true)) + } +} + +func BenchmarkCutAll(b *testing.B) { + sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作" + b.ResetTimer() + for i := 0; i < b.N; i++ { + chanToArray(seg.CutAll(sentence)) + } +} + +func BenchmarkCutForSearchNoHMM(b *testing.B) { + sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作" + b.ResetTimer() + for i := 0; i < b.N; i++ { + chanToArray(seg.CutForSearch(sentence, false)) + } +} + +func BenchmarkCutForSearch(b *testing.B) { + sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作" + b.ResetTimer() + for i := 0; i < b.N; i++ { + chanToArray(seg.CutForSearch(sentence, true)) + } +} diff --git a/posseg/posseg.go b/posseg/posseg.go index 2e505af..ef8a159 100644 --- a/posseg/posseg.go +++ b/posseg/posseg.go @@ -1,10 +1,11 @@ package posseg import ( - "fmt" - "github.com/wangbin/jiebago" + "math" "regexp" - "strings" + + "github.com/wangbin/jiebago/dictionary" + "github.com/wangbin/jiebago/util" ) var ( @@ -17,57 +18,28 @@ var ( reSkipInternal = regexp.MustCompile(`(\r\n|\s)`) ) -type Pair struct { - Word, Flag string +type Segment struct { + text, pos string } -func (p Pair) String() string { - return fmt.Sprintf("%s / %s", p.Word, p.Flag) +func (s Segment) Text() string { + return s.text } -type Posseg struct { - *jiebago.Jieba - flagMap map[string]string +func (s Segment) Pos() string { + return 
// New returns a Segmenter that wraps a dictionary freshly created by
// dictionary.New; no dictionary file is loaded here.
func New() *Segmenter {
	return &Segmenter{dictionary.New()}
}
result := make(chan Segment) go func() { - for _, blk := range jiebago.RegexpSplit(reHanDetail, sentence, -1) { + for _, blk := range util.RegexpSplit(reHanDetail, sentence, -1) { if reHanDetail.MatchString(blk) { - for wordTag := range p.cutDetailInternal(blk) { - result <- wordTag + for segment := range seg.cutDetailInternal(blk) { + result <- segment } - } else { - for _, x := range jiebago.RegexpSplit(reSkipDetail, blk, -1) { - if len(x) == 0 { - continue - } - switch { - case reNum.MatchString(x): - result <- Pair{x, "m"} - case reEng.MatchString(x): - result <- Pair{x, "eng"} - default: - result <- Pair{x, "x"} - } + continue + } + for _, x := range util.RegexpSplit(reSkipDetail, blk, -1) { + if len(x) == 0 { + continue + } + switch { + case reNum.MatchString(x): + result <- Segment{x, "m"} + case reEng.MatchString(x): + result <- Segment{x, "eng"} + default: + result <- Segment{x, "x"} } } } @@ -124,46 +96,105 @@ func (p *Posseg) cutDetail(sentence string) chan Pair { return result } -type cutFunc func(sentence string) chan Pair +func (seg *Segmenter) dag(runes []rune) map[int][]int { + dag := make(map[int][]int) + n := len(runes) + var frag []rune + var i int + for k := 0; k < n; k++ { + dag[k] = make([]int, 0) + i = k + frag = runes[k : k+1] + for { + freq, ok := seg.Frequency(string(frag)) + if !ok { + break + } + if freq > 0.0 { + dag[k] = append(dag[k], i) + } + i += 1 + if i >= n { + break + } + frag = runes[k : i+1] + } + if len(dag[k]) == 0 { + dag[k] = append(dag[k], k) + } + } + return dag +} -func (p *Posseg) cutDAG(sentence string) chan Pair { - result := make(chan Pair) +type route struct { + frequency float64 + index int +} + +func (seg *Segmenter) calc(runes []rune) map[int]route { + dag := seg.dag(runes) + n := len(runes) + rs := make(map[int]route) + rs[n] = route{frequency: 0.0, index: 0} + logTotal := seg.LogTotal() + var r route + for idx := n - 1; idx >= 0; idx-- { + for _, i := range dag[idx] { + if freq, ok := 
seg.Frequency(string(runes[idx : i+1])); ok { + r = route{frequency: math.Log(freq) - logTotal + rs[i+1].frequency, index: i} + } else { + r = route{frequency: math.Log(1.0) - logTotal + rs[i+1].frequency, index: i} + } + if v, ok := rs[idx]; !ok { + rs[idx] = r + } else { + if v.frequency < r.frequency || (v.frequency == r.frequency && v.index < r.index) { + rs[idx] = r + } + } + } + } + return rs +} + +type cutFunc func(sentence string) <-chan Segment + +func (seg *Segmenter) cutDAG(sentence string) <-chan Segment { + result := make(chan Segment) go func() { runes := []rune(sentence) - dag := jiebago.DAG(p, runes) - routes := jiebago.Routes(p, runes, dag) + routes := seg.calc(runes) var y int length := len(runes) buf := make([]rune, 0) for x := 0; x < length; { - y = routes[x].Index + 1 - l_word := runes[x:y] + y = routes[x].index + 1 + frag := runes[x:y] if y-x == 1 { - buf = append(buf, l_word...) + buf = append(buf, frag...) } else { if len(buf) > 0 { + bufString := string(buf) if len(buf) == 1 { - sbuf := string(buf) - if tag, ok := p.Flag(sbuf); ok { - result <- Pair{sbuf, tag} + if tag, ok := seg.Pos(bufString); ok { + result <- Segment{bufString, tag} } else { - result <- Pair{sbuf, "x"} + result <- Segment{bufString, "x"} } buf = make([]rune, 0) } else { - bufString := string(buf) - if v, ok := p.Freq(bufString); !ok || v == 0.0 { - for t := range p.cutDetail(bufString) { + if v, ok := seg.Frequency(bufString); !ok || v == 0.0 { + for t := range seg.cutDetail(bufString) { result <- t } } else { for _, elem := range buf { selem := string(elem) - if tag, ok := p.Flag(selem); ok { - result <- Pair{string(elem), tag} + if tag, ok := seg.Pos(selem); ok { + result <- Segment{selem, tag} } else { - result <- Pair{string(elem), "x"} + result <- Segment{selem, "x"} } } @@ -171,37 +202,36 @@ func (p *Posseg) cutDAG(sentence string) chan Pair { buf = make([]rune, 0) } } - sl_word := string(l_word) - if tag, ok := p.Flag(sl_word); ok { - result <- Pair{sl_word, tag} 
+ word := string(frag) + if tag, ok := seg.Pos(word); ok { + result <- Segment{word, tag} } else { - result <- Pair{sl_word, "x"} + result <- Segment{word, "x"} } } x = y } if len(buf) > 0 { + bufString := string(buf) if len(buf) == 1 { - sbuf := string(buf) - if tag, ok := p.Flag(sbuf); ok { - result <- Pair{sbuf, tag} + if tag, ok := seg.Pos(bufString); ok { + result <- Segment{bufString, tag} } else { - result <- Pair{sbuf, "x"} + result <- Segment{bufString, "x"} } } else { - bufString := string(buf) - if v, ok := p.Freq(bufString); !ok || v == 0.0 { - for t := range p.cutDetail(bufString) { + if v, ok := seg.Frequency(bufString); !ok || v == 0.0 { + for t := range seg.cutDetail(bufString) { result <- t } } else { for _, elem := range buf { selem := string(elem) - if tag, ok := p.Flag(selem); ok { - result <- Pair{selem, tag} + if tag, ok := seg.Pos(selem); ok { + result <- Segment{selem, tag} } else { - result <- Pair{selem, "x"} + result <- Segment{selem, "x"} } } } @@ -212,42 +242,37 @@ func (p *Posseg) cutDAG(sentence string) chan Pair { return result } -func (p *Posseg) cutDAGNoHMM(sentence string) chan Pair { - result := make(chan Pair) +func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment { + result := make(chan Segment) go func() { runes := []rune(sentence) - dag := jiebago.DAG(p, runes) - routes := jiebago.Routes(p, runes, dag) - x := 0 + routes := seg.calc(runes) var y int length := len(runes) buf := make([]rune, 0) - for { - if x >= length { - break - } - y = routes[x].Index + 1 - l_word := runes[x:y] - if reEng1.MatchString(string(l_word)) && len(l_word) == 1 { - buf = append(buf, l_word...) + for x := 0; x < length; { + y = routes[x].index + 1 + frag := runes[x:y] + if reEng1.MatchString(string(frag)) && len(frag) == 1 { + buf = append(buf, frag...) 
x = y } else { if len(buf) > 0 { - result <- Pair{string(buf), "eng"} + result <- Segment{string(buf), "eng"} buf = make([]rune, 0) } - sl_word := string(l_word) - if tag, ok := p.Flag(sl_word); ok { - result <- Pair{sl_word, tag} + word := string(frag) + if tag, ok := seg.Pos(word); ok { + result <- Segment{word, tag} } else { - result <- Pair{sl_word, "x"} + result <- Segment{word, "x"} } x = y } } if len(buf) > 0 { - result <- Pair{string(buf), "eng"} + result <- Segment{string(buf), "eng"} buf = make([]rune, 0) } close(result) @@ -255,37 +280,34 @@ func (p *Posseg) cutDAGNoHMM(sentence string) chan Pair { return result } -// Tags the POS of each word after segmentation, using labels compatible with -// ictclas. -func (p *Posseg) Cut(sentence string, HMM bool) chan Pair { - result := make(chan Pair) +func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan Segment { + result := make(chan Segment) var cut cutFunc - if HMM { - cut = p.cutDAG + if hmm { + cut = seg.cutDAG } else { - cut = p.cutDAGNoHMM + cut = seg.cutDAGNoHMM } go func() { - for _, blk := range jiebago.RegexpSplit(reHanInternal, sentence, -1) { + for _, blk := range util.RegexpSplit(reHanInternal, sentence, -1) { if reHanInternal.MatchString(blk) { for wordTag := range cut(blk) { result <- wordTag } } else { - for _, x := range jiebago.RegexpSplit(reSkipInternal, blk, -1) { + for _, x := range util.RegexpSplit(reSkipInternal, blk, -1) { if reSkipInternal.MatchString(x) { - result <- Pair{x, "x"} + result <- Segment{x, "x"} } else { for _, xx := range x { s := string(xx) switch { case reNum.MatchString(s): - result <- Pair{s, "m"} + result <- Segment{s, "m"} case reEng.MatchString(x): - result <- Pair{x, "eng"} - break + result <- Segment{x, "eng"} default: - result <- Pair{s, "x"} + result <- Segment{s, "x"} } } } diff --git a/posseg/posseg_test.go b/posseg/posseg_test.go index 2d0206d..dee3d11 100644 --- a/posseg/posseg_test.go +++ b/posseg/posseg_test.go @@ -5,6 +5,7 @@ import ( ) var ( + seg 
*Segmenter test_contents = []string{ "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", "我不喜欢日本和服。", @@ -92,183 +93,188 @@ var ( "你认识那个和主席握手的的哥吗?他开一辆黑色的士。", "枪杆子中出政权"} - defaultCutResult = [][]Pair{[]Pair{Pair{"这", "r"}, Pair{"是", "v"}, Pair{"一个", "m"}, Pair{"伸手不见五指", "i"}, Pair{"的", "uj"}, Pair{"黑夜", "n"}, Pair{"。", "x"}, Pair{"我", "r"}, Pair{"叫", "v"}, Pair{"孙悟空", "nr"}, Pair{",", "x"}, Pair{"我", "r"}, Pair{"爱", "v"}, Pair{"北京", "ns"}, Pair{",", "x"}, Pair{"我", "r"}, Pair{"爱", "v"}, Pair{"Python", "eng"}, Pair{"和", "c"}, Pair{"C++", "nz"}, Pair{"。", "x"}}, - []Pair{Pair{"我", "r"}, Pair{"不", "d"}, Pair{"喜欢", "v"}, Pair{"日本", "ns"}, Pair{"和服", "nz"}, Pair{"。", "x"}}, - []Pair{Pair{"雷猴", "n"}, Pair{"回归", "v"}, Pair{"人间", "n"}, Pair{"。", "x"}}, - []Pair{Pair{"工信处", "n"}, Pair{"女干事", "n"}, Pair{"每月", "r"}, Pair{"经过", "p"}, Pair{"下属", "v"}, Pair{"科室", "n"}, Pair{"都", "d"}, Pair{"要", "v"}, Pair{"亲口", "n"}, Pair{"交代", "n"}, Pair{"24", "m"}, Pair{"口", "n"}, Pair{"交换机", "n"}, Pair{"等", "u"}, Pair{"技术性", "n"}, Pair{"器件", "n"}, Pair{"的", "uj"}, Pair{"安装", "v"}, Pair{"工作", "vn"}}, - []Pair{Pair{"我", "r"}, Pair{"需要", "v"}, Pair{"廉租房", "n"}}, - []Pair{Pair{"永和", "nz"}, Pair{"服装", "vn"}, Pair{"饰品", "n"}, Pair{"有限公司", "n"}}, - []Pair{Pair{"我", "r"}, Pair{"爱", "v"}, Pair{"北京", "ns"}, Pair{"天安门", "ns"}}, - []Pair{Pair{"abc", "eng"}}, - []Pair{Pair{"隐", "n"}, Pair{"马尔可夫", "nr"}}, - []Pair{Pair{"雷猴", "n"}, Pair{"是", "v"}, Pair{"个", "q"}, Pair{"好", "a"}, Pair{"网站", "n"}}, - []Pair{Pair{"“", "x"}, Pair{"Microsoft", "eng"}, Pair{"”", "x"}, Pair{"一", "m"}, Pair{"词", "n"}, Pair{"由", "p"}, Pair{"“", "x"}, Pair{"MICROcomputer", "eng"}, Pair{"(", "x"}, Pair{"微型", "b"}, Pair{"计算机", "n"}, Pair{")", "x"}, Pair{"”", "x"}, Pair{"和", "c"}, Pair{"“", "x"}, Pair{"SOFTware", "eng"}, Pair{"(", "x"}, Pair{"软件", "n"}, Pair{")", "x"}, Pair{"”", "x"}, Pair{"两", "m"}, Pair{"部分", "n"}, Pair{"组成", "v"}}, - []Pair{Pair{"草泥马", "n"}, Pair{"和", "c"}, Pair{"欺实", "v"}, Pair{"马", "n"}, Pair{"是", "v"}, Pair{"今年", "t"}, 
Pair{"的", "uj"}, Pair{"流行", "v"}, Pair{"词汇", "n"}}, - []Pair{Pair{"伊藤", "nr"}, Pair{"洋华堂", "n"}, Pair{"总府", "n"}, Pair{"店", "n"}}, - []Pair{Pair{"中国科学院计算技术研究所", "nt"}}, - []Pair{Pair{"罗密欧", "nr"}, Pair{"与", "p"}, Pair{"朱丽叶", "nr"}}, - []Pair{Pair{"我", "r"}, Pair{"购买", "v"}, Pair{"了", "ul"}, Pair{"道具", "n"}, Pair{"和", "c"}, Pair{"服装", "vn"}}, - []Pair{Pair{"PS", "eng"}, Pair{":", "x"}, Pair{" ", "x"}, Pair{"我", "r"}, Pair{"觉得", "v"}, Pair{"开源", "n"}, Pair{"有", "v"}, Pair{"一个", "m"}, Pair{"好处", "d"}, Pair{",", "x"}, Pair{"就是", "d"}, Pair{"能够", "v"}, Pair{"敦促", "v"}, Pair{"自己", "r"}, Pair{"不断改进", "l"}, Pair{",", "x"}, Pair{"避免", "v"}, Pair{"敞", "v"}, Pair{"帚", "ng"}, Pair{"自珍", "b"}}, - []Pair{Pair{"湖北省", "ns"}, Pair{"石首市", "ns"}}, - []Pair{Pair{"湖北省", "ns"}, Pair{"十堰市", "ns"}}, - []Pair{Pair{"总经理", "n"}, Pair{"完成", "v"}, Pair{"了", "ul"}, Pair{"这件", "mq"}, Pair{"事情", "n"}}, - []Pair{Pair{"电脑", "n"}, Pair{"修好", "v"}, Pair{"了", "ul"}}, - []Pair{Pair{"做好", "v"}, Pair{"了", "ul"}, Pair{"这件", "mq"}, Pair{"事情", "n"}, Pair{"就", "d"}, Pair{"一了百了", "l"}, Pair{"了", "ul"}}, - []Pair{Pair{"人们", "n"}, Pair{"审美", "vn"}, Pair{"的", "uj"}, Pair{"观点", "n"}, Pair{"是", "v"}, Pair{"不同", "a"}, Pair{"的", "uj"}}, - []Pair{Pair{"我们", "r"}, Pair{"买", "v"}, Pair{"了", "ul"}, Pair{"一个", "m"}, Pair{"美的", "nr"}, Pair{"空调", "n"}}, - []Pair{Pair{"线程", "n"}, Pair{"初始化", "l"}, Pair{"时", "n"}, Pair{"我们", "r"}, Pair{"要", "v"}, Pair{"注意", "v"}}, - []Pair{Pair{"一个", "m"}, Pair{"分子", "n"}, Pair{"是", "v"}, Pair{"由", "p"}, Pair{"好多", "m"}, Pair{"原子", "n"}, Pair{"组织", "v"}, Pair{"成", "v"}, Pair{"的", "uj"}}, - []Pair{Pair{"祝", "v"}, Pair{"你", "r"}, Pair{"马到功成", "i"}}, - []Pair{Pair{"他", "r"}, Pair{"掉", "v"}, Pair{"进", "v"}, Pair{"了", "ul"}, Pair{"无底洞", "ns"}, Pair{"里", "f"}}, - []Pair{Pair{"中国", "ns"}, Pair{"的", "uj"}, Pair{"首都", "d"}, Pair{"是", "v"}, Pair{"北京", "ns"}}, - []Pair{Pair{"孙君意", "nr"}}, - []Pair{Pair{"外交部", "nt"}, Pair{"发言人", "l"}, Pair{"马朝旭", "nr"}}, - []Pair{Pair{"领导人", "n"}, Pair{"会议", "n"}, 
Pair{"和", "c"}, Pair{"第四届", "m"}, Pair{"东亚", "ns"}, Pair{"峰会", "n"}}, - []Pair{Pair{"在", "p"}, Pair{"过去", "t"}, Pair{"的", "uj"}, Pair{"这", "r"}, Pair{"五年", "t"}}, - []Pair{Pair{"还", "d"}, Pair{"需要", "v"}, Pair{"很", "d"}, Pair{"长", "a"}, Pair{"的", "uj"}, Pair{"路", "n"}, Pair{"要", "v"}, Pair{"走", "v"}}, - []Pair{Pair{"60", "m"}, Pair{"周年", "t"}, Pair{"首都", "d"}, Pair{"阅兵", "v"}}, - []Pair{Pair{"你好", "l"}, Pair{"人们", "n"}, Pair{"审美", "vn"}, Pair{"的", "uj"}, Pair{"观点", "n"}, Pair{"是", "v"}, Pair{"不同", "a"}, Pair{"的", "uj"}}, - []Pair{Pair{"买", "v"}, Pair{"水果", "n"}, Pair{"然后", "c"}, Pair{"来", "v"}, Pair{"世博园", "nr"}}, - []Pair{Pair{"买", "v"}, Pair{"水果", "n"}, Pair{"然后", "c"}, Pair{"去", "v"}, Pair{"世博园", "nr"}}, - []Pair{Pair{"但是", "c"}, Pair{"后来", "t"}, Pair{"我", "r"}, Pair{"才", "d"}, Pair{"知道", "v"}, Pair{"你", "r"}, Pair{"是", "v"}, Pair{"对", "p"}, Pair{"的", "uj"}}, - []Pair{Pair{"存在", "v"}, Pair{"即", "v"}, Pair{"合理", "vn"}}, - []Pair{Pair{"的的", "u"}, Pair{"的的", "u"}, Pair{"的", "uj"}, Pair{"在的", "u"}, Pair{"的的", "u"}, Pair{"的", "uj"}, Pair{"就", "d"}, Pair{"以", "p"}, Pair{"和和", "nz"}, Pair{"和", "c"}}, - []Pair{Pair{"I", "x"}, Pair{" ", "x"}, Pair{"love", "eng"}, Pair{"你", "r"}, Pair{",", "x"}, Pair{"不以为耻", "i"}, Pair{",", "x"}, Pair{"反", "zg"}, Pair{"以为", "c"}, Pair{"rong", "eng"}}, - []Pair{Pair{"因", "p"}}, - []Pair{}, - []Pair{Pair{"hello", "eng"}, Pair{"你好", "l"}, Pair{"人们", "n"}, Pair{"审美", "vn"}, Pair{"的", "uj"}, Pair{"观点", "n"}, Pair{"是", "v"}, Pair{"不同", "a"}, Pair{"的", "uj"}}, - []Pair{Pair{"很好", "a"}, Pair{"但", "c"}, Pair{"主要", "b"}, Pair{"是", "v"}, Pair{"基于", "p"}, Pair{"网页", "n"}, Pair{"形式", "n"}}, - []Pair{Pair{"hello", "eng"}, Pair{"你好", "l"}, Pair{"人们", "n"}, Pair{"审美", "vn"}, Pair{"的", "uj"}, Pair{"观点", "n"}, Pair{"是", "v"}, Pair{"不同", "a"}, Pair{"的", "uj"}}, - []Pair{Pair{"为什么", "r"}, Pair{"我", "r"}, Pair{"不能", "v"}, Pair{"拥有", "v"}, Pair{"想要", "v"}, Pair{"的", "uj"}, Pair{"生活", "vn"}}, - []Pair{Pair{"后来", "t"}, Pair{"我", "r"}, Pair{"才", "d"}}, - 
[]Pair{Pair{"此次", "r"}, Pair{"来", "v"}, Pair{"中国", "ns"}, Pair{"是", "v"}, Pair{"为了", "p"}}, - []Pair{Pair{"使用", "v"}, Pair{"了", "ul"}, Pair{"它", "r"}, Pair{"就", "d"}, Pair{"可以", "c"}, Pair{"解决", "v"}, Pair{"一些", "m"}, Pair{"问题", "n"}}, - []Pair{Pair{",", "x"}, Pair{"使用", "v"}, Pair{"了", "ul"}, Pair{"它", "r"}, Pair{"就", "d"}, Pair{"可以", "c"}, Pair{"解决", "v"}, Pair{"一些", "m"}, Pair{"问题", "n"}}, - []Pair{Pair{"其实", "d"}, Pair{"使用", "v"}, Pair{"了", "ul"}, Pair{"它", "r"}, Pair{"就", "d"}, Pair{"可以", "c"}, Pair{"解决", "v"}, Pair{"一些", "m"}, Pair{"问题", "n"}}, - []Pair{Pair{"好人", "n"}, Pair{"使用", "v"}, Pair{"了", "ul"}, Pair{"它", "r"}, Pair{"就", "d"}, Pair{"可以", "c"}, Pair{"解决", "v"}, Pair{"一些", "m"}, Pair{"问题", "n"}}, - []Pair{Pair{"是因为", "c"}, Pair{"和", "c"}, Pair{"国家", "n"}}, - []Pair{Pair{"老年", "t"}, Pair{"搜索", "v"}, Pair{"还", "d"}, Pair{"支持", "v"}}, - []Pair{Pair{"干脆", "d"}, Pair{"就", "d"}, Pair{"把", "p"}, Pair{"那部", "r"}, Pair{"蒙人", "n"}, Pair{"的", "uj"}, Pair{"闲法", "n"}, Pair{"给", "p"}, Pair{"废", "v"}, Pair{"了", "ul"}, Pair{"拉倒", "v"}, Pair{"!", "x"}, Pair{"RT", "eng"}, Pair{" ", "x"}, Pair{"@", "x"}, Pair{"laoshipukong", "eng"}, Pair{" ", "x"}, Pair{":", "x"}, Pair{" ", "x"}, Pair{"27", "m"}, Pair{"日", "m"}, Pair{",", "x"}, Pair{"全国人大常委会", "nt"}, Pair{"第三次", "m"}, Pair{"审议", "v"}, Pair{"侵权", "v"}, Pair{"责任法", "n"}, Pair{"草案", "n"}, Pair{",", "x"}, Pair{"删除", "v"}, Pair{"了", "ul"}, Pair{"有关", "vn"}, Pair{"医疗", "n"}, Pair{"损害", "v"}, Pair{"责任", "n"}, Pair{"“", "x"}, Pair{"举证", "v"}, Pair{"倒置", "v"}, Pair{"”", "x"}, Pair{"的", "uj"}, Pair{"规定", "n"}, Pair{"。", "x"}, Pair{"在", "p"}, Pair{"医患", "n"}, Pair{"纠纷", "n"}, Pair{"中本", "ns"}, Pair{"已", "d"}, Pair{"处于", "v"}, Pair{"弱势", "n"}, Pair{"地位", "n"}, Pair{"的", "uj"}, Pair{"消费者", "n"}, Pair{"由此", "c"}, Pair{"将", "d"}, Pair{"陷入", "v"}, Pair{"万劫不复", "i"}, Pair{"的", "uj"}, Pair{"境地", "s"}, Pair{"。", "x"}, Pair{" ", "x"}}, - []Pair{Pair{"大", "a"}}, - []Pair{}, - []Pair{Pair{"他", "r"}, Pair{"说", "v"}, Pair{"的", "uj"}, Pair{"确实", 
"ad"}, Pair{"在", "p"}, Pair{"理", "n"}}, - []Pair{Pair{"长春", "ns"}, Pair{"市长", "n"}, Pair{"春节", "t"}, Pair{"讲话", "n"}}, - []Pair{Pair{"结婚", "v"}, Pair{"的", "uj"}, Pair{"和", "c"}, Pair{"尚未", "d"}, Pair{"结婚", "v"}, Pair{"的", "uj"}}, - []Pair{Pair{"结合", "v"}, Pair{"成", "n"}, Pair{"分子", "n"}, Pair{"时", "n"}}, - []Pair{Pair{"旅游", "vn"}, Pair{"和", "c"}, Pair{"服务", "vn"}, Pair{"是", "v"}, Pair{"最好", "a"}, Pair{"的", "uj"}}, - []Pair{Pair{"这件", "mq"}, Pair{"事情", "n"}, Pair{"的确", "d"}, Pair{"是", "v"}, Pair{"我", "r"}, Pair{"的", "uj"}, Pair{"错", "n"}}, - []Pair{Pair{"供", "v"}, Pair{"大家", "n"}, Pair{"参考", "v"}, Pair{"指正", "v"}}, - []Pair{Pair{"哈尔滨", "ns"}, Pair{"政府", "n"}, Pair{"公布", "v"}, Pair{"塌", "v"}, Pair{"桥", "n"}, Pair{"原因", "n"}}, - []Pair{Pair{"我", "r"}, Pair{"在", "p"}, Pair{"机场", "n"}, Pair{"入口处", "i"}}, - []Pair{Pair{"邢永臣", "nr"}, Pair{"摄影", "n"}, Pair{"报道", "v"}}, - []Pair{Pair{"BP", "eng"}, Pair{"神经网络", "n"}, Pair{"如何", "r"}, Pair{"训练", "vn"}, Pair{"才能", "v"}, Pair{"在", "p"}, Pair{"分类", "n"}, Pair{"时", "n"}, Pair{"增加", "v"}, Pair{"区分度", "n"}, Pair{"?", "x"}}, - []Pair{Pair{"南京市", "ns"}, Pair{"长江大桥", "ns"}}, - []Pair{Pair{"应", "v"}, Pair{"一些", "m"}, Pair{"使用者", "n"}, Pair{"的", "uj"}, Pair{"建议", "n"}, Pair{",", "x"}, Pair{"也", "d"}, Pair{"为了", "p"}, Pair{"便于", "v"}, Pair{"利用", "n"}, Pair{"NiuTrans", "eng"}, Pair{"用于", "v"}, Pair{"SMT", "eng"}, Pair{"研究", "vn"}}, - []Pair{Pair{"长春市", "ns"}, Pair{"长春", "ns"}, Pair{"药店", "n"}}, - []Pair{Pair{"邓颖超", "nr"}, Pair{"生前", "t"}, Pair{"最", "d"}, Pair{"喜欢", "v"}, Pair{"的", "uj"}, Pair{"衣服", "n"}}, - []Pair{Pair{"胡锦涛", "nr"}, Pair{"是", "v"}, Pair{"热爱", "a"}, Pair{"世界", "n"}, Pair{"和平", "nz"}, Pair{"的", "uj"}, Pair{"政治局", "n"}, Pair{"常委", "j"}}, - []Pair{Pair{"程序员", "n"}, Pair{"祝", "v"}, Pair{"海林", "nz"}, Pair{"和", "c"}, Pair{"朱会震", "nr"}, Pair{"是", "v"}, Pair{"在", "p"}, Pair{"孙健", "nr"}, Pair{"的", "uj"}, Pair{"左面", "f"}, Pair{"和", "c"}, Pair{"右面", "f"}, Pair{",", "x"}, Pair{" ", "x"}, Pair{"范凯", "nr"}, Pair{"在", "p"}, Pair{"最", 
"a"}, Pair{"右面", "f"}, Pair{".", "m"}, Pair{"再往", "d"}, Pair{"左", "f"}, Pair{"是", "v"}, Pair{"李松洪", "nr"}}, - []Pair{Pair{"一次性", "d"}, Pair{"交", "v"}, Pair{"多少", "m"}, Pair{"钱", "n"}}, - []Pair{Pair{"两块", "m"}, Pair{"五", "m"}, Pair{"一套", "m"}, Pair{",", "x"}, Pair{"三块", "m"}, Pair{"八", "m"}, Pair{"一斤", "m"}, Pair{",", "x"}, Pair{"四块", "m"}, Pair{"七", "m"}, Pair{"一本", "m"}, Pair{",", "x"}, Pair{"五块", "m"}, Pair{"六", "m"}, Pair{"一条", "m"}}, - []Pair{Pair{"小", "a"}, Pair{"和尚", "nr"}, Pair{"留", "v"}, Pair{"了", "ul"}, Pair{"一个", "m"}, Pair{"像", "v"}, Pair{"大", "a"}, Pair{"和尚", "nr"}, Pair{"一样", "r"}, Pair{"的", "uj"}, Pair{"和尚头", "nr"}}, - []Pair{Pair{"我", "r"}, Pair{"是", "v"}, Pair{"中华人民共和国", "ns"}, Pair{"公民", "n"}, Pair{";", "x"}, Pair{"我", "r"}, Pair{"爸爸", "n"}, Pair{"是", "v"}, Pair{"共和党", "nt"}, Pair{"党员", "n"}, Pair{";", "x"}, Pair{" ", "x"}, Pair{"地铁", "n"}, Pair{"和平门", "ns"}, Pair{"站", "v"}}, - []Pair{Pair{"张晓梅", "nr"}, Pair{"去", "v"}, Pair{"人民", "n"}, Pair{"医院", "n"}, Pair{"做", "v"}, Pair{"了", "ul"}, Pair{"个", "q"}, Pair{"B超", "n"}, Pair{"然后", "c"}, Pair{"去", "v"}, Pair{"买", "v"}, Pair{"了", "ul"}, Pair{"件", "q"}, Pair{"T恤", "n"}}, - []Pair{Pair{"AT&T", "nz"}, Pair{"是", "v"}, Pair{"一件", "m"}, Pair{"不错", "a"}, Pair{"的", "uj"}, Pair{"公司", "n"}, Pair{",", "x"}, Pair{"给", "p"}, Pair{"你", "r"}, Pair{"发", "v"}, Pair{"offer", "eng"}, Pair{"了", "ul"}, Pair{"吗", "y"}, Pair{"?", "x"}}, - []Pair{Pair{"C++", "nz"}, Pair{"和", "c"}, Pair{"c#", "nz"}, Pair{"是", "v"}, Pair{"什么", "r"}, Pair{"关系", "n"}, Pair{"?", "x"}, Pair{"11", "m"}, Pair{"+", "x"}, Pair{"122", "m"}, Pair{"=", "x"}, Pair{"133", "m"}, Pair{",", "x"}, Pair{"是", "v"}, Pair{"吗", "y"}, Pair{"?", "x"}, Pair{"PI", "eng"}, Pair{"=", "x"}, Pair{"3.14159", "m"}}, - []Pair{Pair{"你", "r"}, Pair{"认识", "v"}, Pair{"那个", "r"}, Pair{"和", "c"}, Pair{"主席", "n"}, Pair{"握手", "v"}, Pair{"的", "uj"}, Pair{"的哥", "n"}, Pair{"吗", "y"}, Pair{"?", "x"}, Pair{"他", "r"}, Pair{"开", "v"}, Pair{"一辆", "m"}, Pair{"黑色", "n"}, Pair{"的士", "n"}, 
Pair{"。", "x"}}, - []Pair{Pair{"枪杆子", "n"}, Pair{"中", "f"}, Pair{"出", "v"}, Pair{"政权", "n"}}, + defaultCutResult = [][]Segment{[]Segment{Segment{"这", "r"}, Segment{"是", "v"}, Segment{"一个", "m"}, Segment{"伸手不见五指", "i"}, Segment{"的", "uj"}, Segment{"黑夜", "n"}, Segment{"。", "x"}, Segment{"我", "r"}, Segment{"叫", "v"}, Segment{"孙悟空", "nr"}, Segment{",", "x"}, Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"北京", "ns"}, Segment{",", "x"}, Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"Python", "eng"}, Segment{"和", "c"}, Segment{"C++", "nz"}, Segment{"。", "x"}}, + []Segment{Segment{"我", "r"}, Segment{"不", "d"}, Segment{"喜欢", "v"}, Segment{"日本", "ns"}, Segment{"和服", "nz"}, Segment{"。", "x"}}, + []Segment{Segment{"雷猴", "n"}, Segment{"回归", "v"}, Segment{"人间", "n"}, Segment{"。", "x"}}, + []Segment{Segment{"工信处", "n"}, Segment{"女干事", "n"}, Segment{"每月", "r"}, Segment{"经过", "p"}, Segment{"下属", "v"}, Segment{"科室", "n"}, Segment{"都", "d"}, Segment{"要", "v"}, Segment{"亲口", "n"}, Segment{"交代", "n"}, Segment{"24", "m"}, Segment{"口", "n"}, Segment{"交换机", "n"}, Segment{"等", "u"}, Segment{"技术性", "n"}, Segment{"器件", "n"}, Segment{"的", "uj"}, Segment{"安装", "v"}, Segment{"工作", "vn"}}, + []Segment{Segment{"我", "r"}, Segment{"需要", "v"}, Segment{"廉租房", "n"}}, + []Segment{Segment{"永和", "nz"}, Segment{"服装", "vn"}, Segment{"饰品", "n"}, Segment{"有限公司", "n"}}, + []Segment{Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"北京", "ns"}, Segment{"天安门", "ns"}}, + []Segment{Segment{"abc", "eng"}}, + []Segment{Segment{"隐", "n"}, Segment{"马尔可夫", "nr"}}, + []Segment{Segment{"雷猴", "n"}, Segment{"是", "v"}, Segment{"个", "q"}, Segment{"好", "a"}, Segment{"网站", "n"}}, + []Segment{Segment{"“", "x"}, Segment{"Microsoft", "eng"}, Segment{"”", "x"}, Segment{"一", "m"}, Segment{"词", "n"}, Segment{"由", "p"}, Segment{"“", "x"}, Segment{"MICROcomputer", "eng"}, Segment{"(", "x"}, Segment{"微型", "b"}, Segment{"计算机", "n"}, Segment{")", "x"}, Segment{"”", "x"}, Segment{"和", "c"}, Segment{"“", "x"}, Segment{"SOFTware", "eng"}, 
Segment{"(", "x"}, Segment{"软件", "n"}, Segment{")", "x"}, Segment{"”", "x"}, Segment{"两", "m"}, Segment{"部分", "n"}, Segment{"组成", "v"}}, + []Segment{Segment{"草泥马", "n"}, Segment{"和", "c"}, Segment{"欺实", "v"}, Segment{"马", "n"}, Segment{"是", "v"}, Segment{"今年", "t"}, Segment{"的", "uj"}, Segment{"流行", "v"}, Segment{"词汇", "n"}}, + []Segment{Segment{"伊藤", "nr"}, Segment{"洋华堂", "n"}, Segment{"总府", "n"}, Segment{"店", "n"}}, + []Segment{Segment{"中国科学院计算技术研究所", "nt"}}, + []Segment{Segment{"罗密欧", "nr"}, Segment{"与", "p"}, Segment{"朱丽叶", "nr"}}, + []Segment{Segment{"我", "r"}, Segment{"购买", "v"}, Segment{"了", "ul"}, Segment{"道具", "n"}, Segment{"和", "c"}, Segment{"服装", "vn"}}, + []Segment{Segment{"PS", "eng"}, Segment{":", "x"}, Segment{" ", "x"}, Segment{"我", "r"}, Segment{"觉得", "v"}, Segment{"开源", "n"}, Segment{"有", "v"}, Segment{"一个", "m"}, Segment{"好处", "d"}, Segment{",", "x"}, Segment{"就是", "d"}, Segment{"能够", "v"}, Segment{"敦促", "v"}, Segment{"自己", "r"}, Segment{"不断改进", "l"}, Segment{",", "x"}, Segment{"避免", "v"}, Segment{"敞", "v"}, Segment{"帚", "ng"}, Segment{"自珍", "b"}}, + []Segment{Segment{"湖北省", "ns"}, Segment{"石首市", "ns"}}, + []Segment{Segment{"湖北省", "ns"}, Segment{"十堰市", "ns"}}, + []Segment{Segment{"总经理", "n"}, Segment{"完成", "v"}, Segment{"了", "ul"}, Segment{"这件", "mq"}, Segment{"事情", "n"}}, + []Segment{Segment{"电脑", "n"}, Segment{"修好", "v"}, Segment{"了", "ul"}}, + []Segment{Segment{"做好", "v"}, Segment{"了", "ul"}, Segment{"这件", "mq"}, Segment{"事情", "n"}, Segment{"就", "d"}, Segment{"一了百了", "l"}, Segment{"了", "ul"}}, + []Segment{Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}}, + []Segment{Segment{"我们", "r"}, Segment{"买", "v"}, Segment{"了", "ul"}, Segment{"一个", "m"}, Segment{"美的", "nr"}, Segment{"空调", "n"}}, + []Segment{Segment{"线程", "n"}, Segment{"初始化", "l"}, Segment{"时", "n"}, Segment{"我们", "r"}, Segment{"要", "v"}, Segment{"注意", "v"}}, + []Segment{Segment{"一个", "m"}, 
Segment{"分子", "n"}, Segment{"是", "v"}, Segment{"由", "p"}, Segment{"好多", "m"}, Segment{"原子", "n"}, Segment{"组织", "v"}, Segment{"成", "v"}, Segment{"的", "uj"}}, + []Segment{Segment{"祝", "v"}, Segment{"你", "r"}, Segment{"马到功成", "i"}}, + []Segment{Segment{"他", "r"}, Segment{"掉", "v"}, Segment{"进", "v"}, Segment{"了", "ul"}, Segment{"无底洞", "ns"}, Segment{"里", "f"}}, + []Segment{Segment{"中国", "ns"}, Segment{"的", "uj"}, Segment{"首都", "d"}, Segment{"是", "v"}, Segment{"北京", "ns"}}, + []Segment{Segment{"孙君意", "nr"}}, + []Segment{Segment{"外交部", "nt"}, Segment{"发言人", "l"}, Segment{"马朝旭", "nr"}}, + []Segment{Segment{"领导人", "n"}, Segment{"会议", "n"}, Segment{"和", "c"}, Segment{"第四届", "m"}, Segment{"东亚", "ns"}, Segment{"峰会", "n"}}, + []Segment{Segment{"在", "p"}, Segment{"过去", "t"}, Segment{"的", "uj"}, Segment{"这", "r"}, Segment{"五年", "t"}}, + []Segment{Segment{"还", "d"}, Segment{"需要", "v"}, Segment{"很", "d"}, Segment{"长", "a"}, Segment{"的", "uj"}, Segment{"路", "n"}, Segment{"要", "v"}, Segment{"走", "v"}}, + []Segment{Segment{"60", "m"}, Segment{"周年", "t"}, Segment{"首都", "d"}, Segment{"阅兵", "v"}}, + []Segment{Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}}, + []Segment{Segment{"买", "v"}, Segment{"水果", "n"}, Segment{"然后", "c"}, Segment{"来", "v"}, Segment{"世博园", "nr"}}, + []Segment{Segment{"买", "v"}, Segment{"水果", "n"}, Segment{"然后", "c"}, Segment{"去", "v"}, Segment{"世博园", "nr"}}, + []Segment{Segment{"但是", "c"}, Segment{"后来", "t"}, Segment{"我", "r"}, Segment{"才", "d"}, Segment{"知道", "v"}, Segment{"你", "r"}, Segment{"是", "v"}, Segment{"对", "p"}, Segment{"的", "uj"}}, + []Segment{Segment{"存在", "v"}, Segment{"即", "v"}, Segment{"合理", "vn"}}, + []Segment{Segment{"的的", "u"}, Segment{"的的", "u"}, Segment{"的", "uj"}, Segment{"在的", "u"}, Segment{"的的", "u"}, Segment{"的", "uj"}, Segment{"就", "d"}, Segment{"以", "p"}, Segment{"和和", "nz"}, Segment{"和", "c"}}, + []Segment{Segment{"I", "x"}, 
Segment{" ", "x"}, Segment{"love", "eng"}, Segment{"你", "r"}, Segment{",", "x"}, Segment{"不以为耻", "i"}, Segment{",", "x"}, Segment{"反", "zg"}, Segment{"以为", "c"}, Segment{"rong", "eng"}}, + []Segment{Segment{"因", "p"}}, + []Segment{}, + []Segment{Segment{"hello", "eng"}, Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}}, + []Segment{Segment{"很好", "a"}, Segment{"但", "c"}, Segment{"主要", "b"}, Segment{"是", "v"}, Segment{"基于", "p"}, Segment{"网页", "n"}, Segment{"形式", "n"}}, + []Segment{Segment{"hello", "eng"}, Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}}, + []Segment{Segment{"为什么", "r"}, Segment{"我", "r"}, Segment{"不能", "v"}, Segment{"拥有", "v"}, Segment{"想要", "v"}, Segment{"的", "uj"}, Segment{"生活", "vn"}}, + []Segment{Segment{"后来", "t"}, Segment{"我", "r"}, Segment{"才", "d"}}, + []Segment{Segment{"此次", "r"}, Segment{"来", "v"}, Segment{"中国", "ns"}, Segment{"是", "v"}, Segment{"为了", "p"}}, + []Segment{Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}}, + []Segment{Segment{",", "x"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}}, + []Segment{Segment{"其实", "d"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}}, + []Segment{Segment{"好人", "n"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}}, + []Segment{Segment{"是因为", "c"}, Segment{"和", "c"}, Segment{"国家", "n"}}, + []Segment{Segment{"老年", "t"}, Segment{"搜索", "v"}, Segment{"还", 
"d"}, Segment{"支持", "v"}}, + []Segment{Segment{"干脆", "d"}, Segment{"就", "d"}, Segment{"把", "p"}, Segment{"那部", "r"}, Segment{"蒙人", "n"}, Segment{"的", "uj"}, Segment{"闲法", "n"}, Segment{"给", "p"}, Segment{"废", "v"}, Segment{"了", "ul"}, Segment{"拉倒", "v"}, Segment{"!", "x"}, Segment{"RT", "eng"}, Segment{" ", "x"}, Segment{"@", "x"}, Segment{"laoshipukong", "eng"}, Segment{" ", "x"}, Segment{":", "x"}, Segment{" ", "x"}, Segment{"27", "m"}, Segment{"日", "m"}, Segment{",", "x"}, Segment{"全国人大常委会", "nt"}, Segment{"第三次", "m"}, Segment{"审议", "v"}, Segment{"侵权", "v"}, Segment{"责任法", "n"}, Segment{"草案", "n"}, Segment{",", "x"}, Segment{"删除", "v"}, Segment{"了", "ul"}, Segment{"有关", "vn"}, Segment{"医疗", "n"}, Segment{"损害", "v"}, Segment{"责任", "n"}, Segment{"“", "x"}, Segment{"举证", "v"}, Segment{"倒置", "v"}, Segment{"”", "x"}, Segment{"的", "uj"}, Segment{"规定", "n"}, Segment{"。", "x"}, Segment{"在", "p"}, Segment{"医患", "n"}, Segment{"纠纷", "n"}, Segment{"中本", "ns"}, Segment{"已", "d"}, Segment{"处于", "v"}, Segment{"弱势", "n"}, Segment{"地位", "n"}, Segment{"的", "uj"}, Segment{"消费者", "n"}, Segment{"由此", "c"}, Segment{"将", "d"}, Segment{"陷入", "v"}, Segment{"万劫不复", "i"}, Segment{"的", "uj"}, Segment{"境地", "s"}, Segment{"。", "x"}, Segment{" ", "x"}}, + []Segment{Segment{"大", "a"}}, + []Segment{}, + []Segment{Segment{"他", "r"}, Segment{"说", "v"}, Segment{"的", "uj"}, Segment{"确实", "ad"}, Segment{"在", "p"}, Segment{"理", "n"}}, + []Segment{Segment{"长春", "ns"}, Segment{"市长", "n"}, Segment{"春节", "t"}, Segment{"讲话", "n"}}, + []Segment{Segment{"结婚", "v"}, Segment{"的", "uj"}, Segment{"和", "c"}, Segment{"尚未", "d"}, Segment{"结婚", "v"}, Segment{"的", "uj"}}, + []Segment{Segment{"结合", "v"}, Segment{"成", "n"}, Segment{"分子", "n"}, Segment{"时", "n"}}, + []Segment{Segment{"旅游", "vn"}, Segment{"和", "c"}, Segment{"服务", "vn"}, Segment{"是", "v"}, Segment{"最好", "a"}, Segment{"的", "uj"}}, + []Segment{Segment{"这件", "mq"}, Segment{"事情", "n"}, Segment{"的确", "d"}, Segment{"是", "v"}, Segment{"我", "r"}, Segment{"的", 
"uj"}, Segment{"错", "n"}}, + []Segment{Segment{"供", "v"}, Segment{"大家", "n"}, Segment{"参考", "v"}, Segment{"指正", "v"}}, + []Segment{Segment{"哈尔滨", "ns"}, Segment{"政府", "n"}, Segment{"公布", "v"}, Segment{"塌", "v"}, Segment{"桥", "n"}, Segment{"原因", "n"}}, + []Segment{Segment{"我", "r"}, Segment{"在", "p"}, Segment{"机场", "n"}, Segment{"入口处", "i"}}, + []Segment{Segment{"邢永臣", "nr"}, Segment{"摄影", "n"}, Segment{"报道", "v"}}, + []Segment{Segment{"BP", "eng"}, Segment{"神经网络", "n"}, Segment{"如何", "r"}, Segment{"训练", "vn"}, Segment{"才能", "v"}, Segment{"在", "p"}, Segment{"分类", "n"}, Segment{"时", "n"}, Segment{"增加", "v"}, Segment{"区分度", "n"}, Segment{"?", "x"}}, + []Segment{Segment{"南京市", "ns"}, Segment{"长江大桥", "ns"}}, + []Segment{Segment{"应", "v"}, Segment{"一些", "m"}, Segment{"使用者", "n"}, Segment{"的", "uj"}, Segment{"建议", "n"}, Segment{",", "x"}, Segment{"也", "d"}, Segment{"为了", "p"}, Segment{"便于", "v"}, Segment{"利用", "n"}, Segment{"NiuTrans", "eng"}, Segment{"用于", "v"}, Segment{"SMT", "eng"}, Segment{"研究", "vn"}}, + []Segment{Segment{"长春市", "ns"}, Segment{"长春", "ns"}, Segment{"药店", "n"}}, + []Segment{Segment{"邓颖超", "nr"}, Segment{"生前", "t"}, Segment{"最", "d"}, Segment{"喜欢", "v"}, Segment{"的", "uj"}, Segment{"衣服", "n"}}, + []Segment{Segment{"胡锦涛", "nr"}, Segment{"是", "v"}, Segment{"热爱", "a"}, Segment{"世界", "n"}, Segment{"和平", "nz"}, Segment{"的", "uj"}, Segment{"政治局", "n"}, Segment{"常委", "j"}}, + []Segment{Segment{"程序员", "n"}, Segment{"祝", "v"}, Segment{"海林", "nz"}, Segment{"和", "c"}, Segment{"朱会震", "nr"}, Segment{"是", "v"}, Segment{"在", "p"}, Segment{"孙健", "nr"}, Segment{"的", "uj"}, Segment{"左面", "f"}, Segment{"和", "c"}, Segment{"右面", "f"}, Segment{",", "x"}, Segment{" ", "x"}, Segment{"范凯", "nr"}, Segment{"在", "p"}, Segment{"最", "a"}, Segment{"右面", "f"}, Segment{".", "m"}, Segment{"再往", "d"}, Segment{"左", "f"}, Segment{"是", "v"}, Segment{"李松洪", "nr"}}, + []Segment{Segment{"一次性", "d"}, Segment{"交", "v"}, Segment{"多少", "m"}, Segment{"钱", "n"}}, + []Segment{Segment{"两块", "m"}, 
Segment{"五", "m"}, Segment{"一套", "m"}, Segment{",", "x"}, Segment{"三块", "m"}, Segment{"八", "m"}, Segment{"一斤", "m"}, Segment{",", "x"}, Segment{"四块", "m"}, Segment{"七", "m"}, Segment{"一本", "m"}, Segment{",", "x"}, Segment{"五块", "m"}, Segment{"六", "m"}, Segment{"一条", "m"}}, + []Segment{Segment{"小", "a"}, Segment{"和尚", "nr"}, Segment{"留", "v"}, Segment{"了", "ul"}, Segment{"一个", "m"}, Segment{"像", "v"}, Segment{"大", "a"}, Segment{"和尚", "nr"}, Segment{"一样", "r"}, Segment{"的", "uj"}, Segment{"和尚头", "nr"}}, + []Segment{Segment{"我", "r"}, Segment{"是", "v"}, Segment{"中华人民共和国", "ns"}, Segment{"公民", "n"}, Segment{";", "x"}, Segment{"我", "r"}, Segment{"爸爸", "n"}, Segment{"是", "v"}, Segment{"共和党", "nt"}, Segment{"党员", "n"}, Segment{";", "x"}, Segment{" ", "x"}, Segment{"地铁", "n"}, Segment{"和平门", "ns"}, Segment{"站", "v"}}, + []Segment{Segment{"张晓梅", "nr"}, Segment{"去", "v"}, Segment{"人民", "n"}, Segment{"医院", "n"}, Segment{"做", "v"}, Segment{"了", "ul"}, Segment{"个", "q"}, Segment{"B超", "n"}, Segment{"然后", "c"}, Segment{"去", "v"}, Segment{"买", "v"}, Segment{"了", "ul"}, Segment{"件", "q"}, Segment{"T恤", "n"}}, + []Segment{Segment{"AT&T", "nz"}, Segment{"是", "v"}, Segment{"一件", "m"}, Segment{"不错", "a"}, Segment{"的", "uj"}, Segment{"公司", "n"}, Segment{",", "x"}, Segment{"给", "p"}, Segment{"你", "r"}, Segment{"发", "v"}, Segment{"offer", "eng"}, Segment{"了", "ul"}, Segment{"吗", "y"}, Segment{"?", "x"}}, + []Segment{Segment{"C++", "nz"}, Segment{"和", "c"}, Segment{"c#", "nz"}, Segment{"是", "v"}, Segment{"什么", "r"}, Segment{"关系", "n"}, Segment{"?", "x"}, Segment{"11", "m"}, Segment{"+", "x"}, Segment{"122", "m"}, Segment{"=", "x"}, Segment{"133", "m"}, Segment{",", "x"}, Segment{"是", "v"}, Segment{"吗", "y"}, Segment{"?", "x"}, Segment{"PI", "eng"}, Segment{"=", "x"}, Segment{"3.14159", "m"}}, + []Segment{Segment{"你", "r"}, Segment{"认识", "v"}, Segment{"那个", "r"}, Segment{"和", "c"}, Segment{"主席", "n"}, Segment{"握手", "v"}, Segment{"的", "uj"}, Segment{"的哥", "n"}, Segment{"吗", "y"}, 
Segment{"?", "x"}, Segment{"他", "r"}, Segment{"开", "v"}, Segment{"一辆", "m"}, Segment{"黑色", "n"}, Segment{"的士", "n"}, Segment{"。", "x"}}, + []Segment{Segment{"枪杆子", "n"}, Segment{"中", "f"}, Segment{"出", "v"}, Segment{"政权", "n"}}, } - noHMMCutResult = [][]Pair{ - []Pair{Pair{"这", "r"}, Pair{"是", "v"}, Pair{"一个", "m"}, Pair{"伸手不见五指", "i"}, Pair{"的", "uj"}, Pair{"黑夜", "n"}, Pair{"。", "x"}, Pair{"我", "r"}, Pair{"叫", "v"}, Pair{"孙悟空", "nr"}, Pair{",", "x"}, Pair{"我", "r"}, Pair{"爱", "v"}, Pair{"北京", "ns"}, Pair{",", "x"}, Pair{"我", "r"}, Pair{"爱", "v"}, Pair{"Python", "eng"}, Pair{"和", "c"}, Pair{"C++", "nz"}, Pair{"。", "x"}}, - []Pair{Pair{"我", "r"}, Pair{"不", "d"}, Pair{"喜欢", "v"}, Pair{"日本", "ns"}, Pair{"和服", "nz"}, Pair{"。", "x"}}, - []Pair{Pair{"雷猴", "n"}, Pair{"回归", "v"}, Pair{"人间", "n"}, Pair{"。", "x"}}, - []Pair{Pair{"工信处", "n"}, Pair{"女干事", "n"}, Pair{"每月", "r"}, Pair{"经过", "p"}, Pair{"下属", "v"}, Pair{"科室", "n"}, Pair{"都", "d"}, Pair{"要", "v"}, Pair{"亲口", "n"}, Pair{"交代", "n"}, Pair{"24", "eng"}, Pair{"口", "q"}, Pair{"交换机", "n"}, Pair{"等", "u"}, Pair{"技术性", "n"}, Pair{"器件", "n"}, Pair{"的", "uj"}, Pair{"安装", "v"}, Pair{"工作", "vn"}}, - []Pair{Pair{"我", "r"}, Pair{"需要", "v"}, Pair{"廉租房", "n"}}, - []Pair{Pair{"永和", "nz"}, Pair{"服装", "vn"}, Pair{"饰品", "n"}, Pair{"有限公司", "n"}}, - []Pair{Pair{"我", "r"}, Pair{"爱", "v"}, Pair{"北京", "ns"}, Pair{"天安门", "ns"}}, - []Pair{Pair{"abc", "eng"}}, - []Pair{Pair{"隐", "n"}, Pair{"马尔可夫", "nr"}}, - []Pair{Pair{"雷猴", "n"}, Pair{"是", "v"}, Pair{"个", "q"}, Pair{"好", "a"}, Pair{"网站", "n"}}, - []Pair{Pair{"“", "x"}, Pair{"Microsoft", "eng"}, Pair{"”", "x"}, Pair{"一", "m"}, Pair{"词", "n"}, Pair{"由", "p"}, Pair{"“", "x"}, Pair{"MICROcomputer", "eng"}, Pair{"(", "x"}, Pair{"微型", "b"}, Pair{"计算机", "n"}, Pair{")", "x"}, Pair{"”", "x"}, Pair{"和", "c"}, Pair{"“", "x"}, Pair{"SOFTware", "eng"}, Pair{"(", "x"}, Pair{"软件", "n"}, Pair{")", "x"}, Pair{"”", "x"}, Pair{"两", "m"}, Pair{"部分", "n"}, Pair{"组成", "v"}}, - []Pair{Pair{"草泥马", "n"}, Pair{"和", 
"c"}, Pair{"欺", "vn"}, Pair{"实", "n"}, Pair{"马", "n"}, Pair{"是", "v"}, Pair{"今年", "t"}, Pair{"的", "uj"}, Pair{"流行", "v"}, Pair{"词汇", "n"}}, - []Pair{Pair{"伊", "ns"}, Pair{"藤", "nr"}, Pair{"洋华堂", "n"}, Pair{"总府", "n"}, Pair{"店", "n"}}, - []Pair{Pair{"中国科学院计算技术研究所", "nt"}}, - []Pair{Pair{"罗密欧", "nr"}, Pair{"与", "p"}, Pair{"朱丽叶", "nr"}}, - []Pair{Pair{"我", "r"}, Pair{"购买", "v"}, Pair{"了", "ul"}, Pair{"道具", "n"}, Pair{"和", "c"}, Pair{"服装", "vn"}}, - []Pair{Pair{"PS", "eng"}, Pair{":", "x"}, Pair{" ", "x"}, Pair{"我", "r"}, Pair{"觉得", "v"}, Pair{"开源", "n"}, Pair{"有", "v"}, Pair{"一个", "m"}, Pair{"好处", "d"}, Pair{",", "x"}, Pair{"就是", "d"}, Pair{"能够", "v"}, Pair{"敦促", "v"}, Pair{"自己", "r"}, Pair{"不断改进", "l"}, Pair{",", "x"}, Pair{"避免", "v"}, Pair{"敞", "v"}, Pair{"帚", "ng"}, Pair{"自珍", "b"}}, - []Pair{Pair{"湖北省", "ns"}, Pair{"石首市", "ns"}}, - []Pair{Pair{"湖北省", "ns"}, Pair{"十堰市", "ns"}}, - []Pair{Pair{"总经理", "n"}, Pair{"完成", "v"}, Pair{"了", "ul"}, Pair{"这件", "mq"}, Pair{"事情", "n"}}, - []Pair{Pair{"电脑", "n"}, Pair{"修好", "v"}, Pair{"了", "ul"}}, - []Pair{Pair{"做好", "v"}, Pair{"了", "ul"}, Pair{"这件", "mq"}, Pair{"事情", "n"}, Pair{"就", "d"}, Pair{"一了百了", "l"}, Pair{"了", "ul"}}, - []Pair{Pair{"人们", "n"}, Pair{"审美", "vn"}, Pair{"的", "uj"}, Pair{"观点", "n"}, Pair{"是", "v"}, Pair{"不同", "a"}, Pair{"的", "uj"}}, - []Pair{Pair{"我们", "r"}, Pair{"买", "v"}, Pair{"了", "ul"}, Pair{"一个", "m"}, Pair{"美的", "nr"}, Pair{"空调", "n"}}, - []Pair{Pair{"线程", "n"}, Pair{"初始化", "l"}, Pair{"时", "n"}, Pair{"我们", "r"}, Pair{"要", "v"}, Pair{"注意", "v"}}, - []Pair{Pair{"一个", "m"}, Pair{"分子", "n"}, Pair{"是", "v"}, Pair{"由", "p"}, Pair{"好多", "m"}, Pair{"原子", "n"}, Pair{"组织", "v"}, Pair{"成", "n"}, Pair{"的", "uj"}}, - []Pair{Pair{"祝", "v"}, Pair{"你", "r"}, Pair{"马到功成", "i"}}, - []Pair{Pair{"他", "r"}, Pair{"掉", "zg"}, Pair{"进", "v"}, Pair{"了", "ul"}, Pair{"无底洞", "ns"}, Pair{"里", "f"}}, - []Pair{Pair{"中国", "ns"}, Pair{"的", "uj"}, Pair{"首都", "d"}, Pair{"是", "v"}, Pair{"北京", "ns"}}, - []Pair{Pair{"孙", "zg"}, Pair{"君", 
"nz"}, Pair{"意", "n"}}, - []Pair{Pair{"外交部", "nt"}, Pair{"发言人", "l"}, Pair{"马朝旭", "nr"}}, - []Pair{Pair{"领导人", "n"}, Pair{"会议", "n"}, Pair{"和", "c"}, Pair{"第四届", "m"}, Pair{"东亚", "ns"}, Pair{"峰会", "n"}}, - []Pair{Pair{"在", "p"}, Pair{"过去", "t"}, Pair{"的", "uj"}, Pair{"这", "r"}, Pair{"五年", "t"}}, - []Pair{Pair{"还", "d"}, Pair{"需要", "v"}, Pair{"很", "zg"}, Pair{"长", "a"}, Pair{"的", "uj"}, Pair{"路", "n"}, Pair{"要", "v"}, Pair{"走", "v"}}, - []Pair{Pair{"60", "eng"}, Pair{"周年", "t"}, Pair{"首都", "d"}, Pair{"阅兵", "v"}}, - []Pair{Pair{"你好", "l"}, Pair{"人们", "n"}, Pair{"审美", "vn"}, Pair{"的", "uj"}, Pair{"观点", "n"}, Pair{"是", "v"}, Pair{"不同", "a"}, Pair{"的", "uj"}}, - []Pair{Pair{"买", "v"}, Pair{"水果", "n"}, Pair{"然后", "c"}, Pair{"来", "v"}, Pair{"世博园", "nr"}}, - []Pair{Pair{"买", "v"}, Pair{"水果", "n"}, Pair{"然后", "c"}, Pair{"去", "v"}, Pair{"世博园", "nr"}}, - []Pair{Pair{"但是", "c"}, Pair{"后来", "t"}, Pair{"我", "r"}, Pair{"才", "d"}, Pair{"知道", "v"}, Pair{"你", "r"}, Pair{"是", "v"}, Pair{"对", "p"}, Pair{"的", "uj"}}, - []Pair{Pair{"存在", "v"}, Pair{"即", "v"}, Pair{"合理", "vn"}}, - []Pair{Pair{"的", "uj"}, Pair{"的", "uj"}, Pair{"的", "uj"}, Pair{"的", "uj"}, Pair{"的", "uj"}, Pair{"在", "p"}, Pair{"的", "uj"}, Pair{"的", "uj"}, Pair{"的", "uj"}, Pair{"的", "uj"}, Pair{"就", "d"}, Pair{"以", "p"}, Pair{"和", "c"}, Pair{"和", "c"}, Pair{"和", "c"}}, - []Pair{Pair{"I", "eng"}, Pair{" ", "x"}, Pair{"love", "eng"}, Pair{"你", "r"}, Pair{",", "x"}, Pair{"不以为耻", "i"}, Pair{",", "x"}, Pair{"反", "zg"}, Pair{"以为", "c"}, Pair{"rong", "eng"}}, - []Pair{Pair{"因", "p"}}, - []Pair{}, - []Pair{Pair{"hello", "eng"}, Pair{"你好", "l"}, Pair{"人们", "n"}, Pair{"审美", "vn"}, Pair{"的", "uj"}, Pair{"观点", "n"}, Pair{"是", "v"}, Pair{"不同", "a"}, Pair{"的", "uj"}}, - []Pair{Pair{"很", "zg"}, Pair{"好", "a"}, Pair{"但", "c"}, Pair{"主要", "b"}, Pair{"是", "v"}, Pair{"基于", "p"}, Pair{"网页", "n"}, Pair{"形式", "n"}}, - []Pair{Pair{"hello", "eng"}, Pair{"你好", "l"}, Pair{"人们", "n"}, Pair{"审美", "vn"}, Pair{"的", "uj"}, Pair{"观点", "n"}, Pair{"是", 
"v"}, Pair{"不同", "a"}, Pair{"的", "uj"}}, - []Pair{Pair{"为什么", "r"}, Pair{"我", "r"}, Pair{"不能", "v"}, Pair{"拥有", "v"}, Pair{"想要", "v"}, Pair{"的", "uj"}, Pair{"生活", "vn"}}, - []Pair{Pair{"后来", "t"}, Pair{"我", "r"}, Pair{"才", "d"}}, - []Pair{Pair{"此次", "r"}, Pair{"来", "v"}, Pair{"中国", "ns"}, Pair{"是", "v"}, Pair{"为了", "p"}}, - []Pair{Pair{"使用", "v"}, Pair{"了", "ul"}, Pair{"它", "r"}, Pair{"就", "d"}, Pair{"可以", "c"}, Pair{"解决", "v"}, Pair{"一些", "m"}, Pair{"问题", "n"}}, - []Pair{Pair{",", "x"}, Pair{"使用", "v"}, Pair{"了", "ul"}, Pair{"它", "r"}, Pair{"就", "d"}, Pair{"可以", "c"}, Pair{"解决", "v"}, Pair{"一些", "m"}, Pair{"问题", "n"}}, - []Pair{Pair{"其实", "d"}, Pair{"使用", "v"}, Pair{"了", "ul"}, Pair{"它", "r"}, Pair{"就", "d"}, Pair{"可以", "c"}, Pair{"解决", "v"}, Pair{"一些", "m"}, Pair{"问题", "n"}}, - []Pair{Pair{"好人", "n"}, Pair{"使用", "v"}, Pair{"了", "ul"}, Pair{"它", "r"}, Pair{"就", "d"}, Pair{"可以", "c"}, Pair{"解决", "v"}, Pair{"一些", "m"}, Pair{"问题", "n"}}, - []Pair{Pair{"是因为", "c"}, Pair{"和", "c"}, Pair{"国家", "n"}}, - []Pair{Pair{"老年", "t"}, Pair{"搜索", "v"}, Pair{"还", "d"}, Pair{"支持", "v"}}, - []Pair{Pair{"干脆", "d"}, Pair{"就", "d"}, Pair{"把", "p"}, Pair{"那", "r"}, Pair{"部", "n"}, Pair{"蒙", "v"}, Pair{"人", "n"}, Pair{"的", "uj"}, Pair{"闲", "n"}, Pair{"法", "j"}, Pair{"给", "p"}, Pair{"废", "v"}, Pair{"了", "ul"}, Pair{"拉倒", "v"}, Pair{"!", "x"}, Pair{"RT", "eng"}, Pair{" ", "x"}, Pair{"@", "x"}, Pair{"laoshipukong", "eng"}, Pair{" ", "x"}, Pair{":", "x"}, Pair{" ", "x"}, Pair{"27", "eng"}, Pair{"日", "m"}, Pair{",", "x"}, Pair{"全国人大常委会", "nt"}, Pair{"第三次", "m"}, Pair{"审议", "v"}, Pair{"侵权", "v"}, Pair{"责任法", "n"}, Pair{"草案", "n"}, Pair{",", "x"}, Pair{"删除", "v"}, Pair{"了", "ul"}, Pair{"有关", "vn"}, Pair{"医疗", "n"}, Pair{"损害", "v"}, Pair{"责任", "n"}, Pair{"“", "x"}, Pair{"举证", "v"}, Pair{"倒置", "v"}, Pair{"”", "x"}, Pair{"的", "uj"}, Pair{"规定", "n"}, Pair{"。", "x"}, Pair{"在", "p"}, Pair{"医患", "n"}, Pair{"纠纷", "n"}, Pair{"中", "f"}, Pair{"本", "r"}, Pair{"已", "d"}, Pair{"处于", "v"}, Pair{"弱势", "n"}, 
Pair{"地位", "n"}, Pair{"的", "uj"}, Pair{"消费者", "n"}, Pair{"由此", "c"}, Pair{"将", "d"}, Pair{"陷入", "v"}, Pair{"万劫不复", "i"}, Pair{"的", "uj"}, Pair{"境地", "s"}, Pair{"。", "x"}, Pair{" ", "x"}}, - []Pair{Pair{"大", "a"}}, - []Pair{}, - []Pair{Pair{"他", "r"}, Pair{"说", "v"}, Pair{"的", "uj"}, Pair{"确实", "ad"}, Pair{"在", "p"}, Pair{"理", "n"}}, - []Pair{Pair{"长春", "ns"}, Pair{"市长", "n"}, Pair{"春节", "t"}, Pair{"讲话", "n"}}, - []Pair{Pair{"结婚", "v"}, Pair{"的", "uj"}, Pair{"和", "c"}, Pair{"尚未", "d"}, Pair{"结婚", "v"}, Pair{"的", "uj"}}, - []Pair{Pair{"结合", "v"}, Pair{"成", "n"}, Pair{"分子", "n"}, Pair{"时", "n"}}, - []Pair{Pair{"旅游", "vn"}, Pair{"和", "c"}, Pair{"服务", "vn"}, Pair{"是", "v"}, Pair{"最好", "a"}, Pair{"的", "uj"}}, - []Pair{Pair{"这件", "mq"}, Pair{"事情", "n"}, Pair{"的确", "d"}, Pair{"是", "v"}, Pair{"我", "r"}, Pair{"的", "uj"}, Pair{"错", "v"}}, - []Pair{Pair{"供", "v"}, Pair{"大家", "n"}, Pair{"参考", "v"}, Pair{"指正", "v"}}, - []Pair{Pair{"哈尔滨", "ns"}, Pair{"政府", "n"}, Pair{"公布", "v"}, Pair{"塌", "v"}, Pair{"桥", "n"}, Pair{"原因", "n"}}, - []Pair{Pair{"我", "r"}, Pair{"在", "p"}, Pair{"机场", "n"}, Pair{"入口处", "i"}}, - []Pair{Pair{"邢", "nr"}, Pair{"永", "ns"}, Pair{"臣", "n"}, Pair{"摄影", "n"}, Pair{"报道", "v"}}, - []Pair{Pair{"BP", "eng"}, Pair{"神经网络", "n"}, Pair{"如何", "r"}, Pair{"训练", "vn"}, Pair{"才能", "v"}, Pair{"在", "p"}, Pair{"分类", "n"}, Pair{"时", "n"}, Pair{"增加", "v"}, Pair{"区分度", "n"}, Pair{"?", "x"}}, - []Pair{Pair{"南京市", "ns"}, Pair{"长江大桥", "ns"}}, - []Pair{Pair{"应", "v"}, Pair{"一些", "m"}, Pair{"使用者", "n"}, Pair{"的", "uj"}, Pair{"建议", "n"}, Pair{",", "x"}, Pair{"也", "d"}, Pair{"为了", "p"}, Pair{"便于", "v"}, Pair{"利用", "n"}, Pair{"NiuTrans", "eng"}, Pair{"用于", "v"}, Pair{"SMT", "eng"}, Pair{"研究", "vn"}}, - []Pair{Pair{"长春市", "ns"}, Pair{"长春", "ns"}, Pair{"药店", "n"}}, - []Pair{Pair{"邓颖超", "nr"}, Pair{"生前", "t"}, Pair{"最", "d"}, Pair{"喜欢", "v"}, Pair{"的", "uj"}, Pair{"衣服", "n"}}, - []Pair{Pair{"胡锦涛", "nr"}, Pair{"是", "v"}, Pair{"热爱", "a"}, Pair{"世界", "n"}, Pair{"和平", "nz"}, Pair{"的", "uj"}, 
Pair{"政治局", "n"}, Pair{"常委", "j"}}, - []Pair{Pair{"程序员", "n"}, Pair{"祝", "v"}, Pair{"海林", "nz"}, Pair{"和", "c"}, Pair{"朱", "nr"}, Pair{"会", "v"}, Pair{"震", "v"}, Pair{"是", "v"}, Pair{"在", "p"}, Pair{"孙", "zg"}, Pair{"健", "a"}, Pair{"的", "uj"}, Pair{"左面", "f"}, Pair{"和", "c"}, Pair{"右面", "f"}, Pair{",", "x"}, Pair{" ", "x"}, Pair{"范", "nr"}, Pair{"凯", "nr"}, Pair{"在", "p"}, Pair{"最", "d"}, Pair{"右面", "f"}, Pair{".", "x"}, Pair{"再", "d"}, Pair{"往", "zg"}, Pair{"左", "m"}, Pair{"是", "v"}, Pair{"李", "nr"}, Pair{"松", "v"}, Pair{"洪", "nr"}}, - []Pair{Pair{"一次性", "d"}, Pair{"交", "v"}, Pair{"多少", "m"}, Pair{"钱", "n"}}, - []Pair{Pair{"两块", "m"}, Pair{"五", "m"}, Pair{"一套", "m"}, Pair{",", "x"}, Pair{"三块", "m"}, Pair{"八", "m"}, Pair{"一斤", "m"}, Pair{",", "x"}, Pair{"四块", "m"}, Pair{"七", "m"}, Pair{"一本", "m"}, Pair{",", "x"}, Pair{"五块", "m"}, Pair{"六", "m"}, Pair{"一条", "m"}}, - []Pair{Pair{"小", "a"}, Pair{"和尚", "nr"}, Pair{"留", "v"}, Pair{"了", "ul"}, Pair{"一个", "m"}, Pair{"像", "v"}, Pair{"大", "a"}, Pair{"和尚", "nr"}, Pair{"一样", "r"}, Pair{"的", "uj"}, Pair{"和尚头", "nr"}}, - []Pair{Pair{"我", "r"}, Pair{"是", "v"}, Pair{"中华人民共和国", "ns"}, Pair{"公民", "n"}, Pair{";", "x"}, Pair{"我", "r"}, Pair{"爸爸", "n"}, Pair{"是", "v"}, Pair{"共和党", "nt"}, Pair{"党员", "n"}, Pair{";", "x"}, Pair{" ", "x"}, Pair{"地铁", "n"}, Pair{"和平门", "ns"}, Pair{"站", "v"}}, - []Pair{Pair{"张晓梅", "nr"}, Pair{"去", "v"}, Pair{"人民", "n"}, Pair{"医院", "n"}, Pair{"做", "v"}, Pair{"了", "ul"}, Pair{"个", "q"}, Pair{"B超", "n"}, Pair{"然后", "c"}, Pair{"去", "v"}, Pair{"买", "v"}, Pair{"了", "ul"}, Pair{"件", "zg"}, Pair{"T恤", "n"}}, - []Pair{Pair{"AT&T", "nz"}, Pair{"是", "v"}, Pair{"一件", "m"}, Pair{"不错", "a"}, Pair{"的", "uj"}, Pair{"公司", "n"}, Pair{",", "x"}, Pair{"给", "p"}, Pair{"你", "r"}, Pair{"发", "v"}, Pair{"offer", "eng"}, Pair{"了", "ul"}, Pair{"吗", "y"}, Pair{"?", "x"}}, - []Pair{Pair{"C++", "nz"}, Pair{"和", "c"}, Pair{"c#", "nz"}, Pair{"是", "v"}, Pair{"什么", "r"}, Pair{"关系", "n"}, Pair{"?", "x"}, Pair{"11", "eng"}, Pair{"+", "x"}, 
Pair{"122", "eng"}, Pair{"=", "x"}, Pair{"133", "eng"}, Pair{",", "x"}, Pair{"是", "v"}, Pair{"吗", "y"}, Pair{"?", "x"}, Pair{"PI", "eng"}, Pair{"=", "x"}, Pair{"3", "eng"}, Pair{".", "x"}, Pair{"14159", "eng"}}, - []Pair{Pair{"你", "r"}, Pair{"认识", "v"}, Pair{"那个", "r"}, Pair{"和", "c"}, Pair{"主席", "n"}, Pair{"握手", "v"}, Pair{"的", "uj"}, Pair{"的哥", "n"}, Pair{"吗", "y"}, Pair{"?", "x"}, Pair{"他", "r"}, Pair{"开", "v"}, Pair{"一辆", "m"}, Pair{"黑色", "n"}, Pair{"的士", "n"}, Pair{"。", "x"}}, - []Pair{Pair{"枪杆子", "n"}, Pair{"中", "f"}, Pair{"出", "v"}, Pair{"政权", "n"}}, + noHMMCutResult = [][]Segment{ + []Segment{Segment{"这", "r"}, Segment{"是", "v"}, Segment{"一个", "m"}, Segment{"伸手不见五指", "i"}, Segment{"的", "uj"}, Segment{"黑夜", "n"}, Segment{"。", "x"}, Segment{"我", "r"}, Segment{"叫", "v"}, Segment{"孙悟空", "nr"}, Segment{",", "x"}, Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"北京", "ns"}, Segment{",", "x"}, Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"Python", "eng"}, Segment{"和", "c"}, Segment{"C++", "nz"}, Segment{"。", "x"}}, + []Segment{Segment{"我", "r"}, Segment{"不", "d"}, Segment{"喜欢", "v"}, Segment{"日本", "ns"}, Segment{"和服", "nz"}, Segment{"。", "x"}}, + []Segment{Segment{"雷猴", "n"}, Segment{"回归", "v"}, Segment{"人间", "n"}, Segment{"。", "x"}}, + []Segment{Segment{"工信处", "n"}, Segment{"女干事", "n"}, Segment{"每月", "r"}, Segment{"经过", "p"}, Segment{"下属", "v"}, Segment{"科室", "n"}, Segment{"都", "d"}, Segment{"要", "v"}, Segment{"亲口", "n"}, Segment{"交代", "n"}, Segment{"24", "eng"}, Segment{"口", "q"}, Segment{"交换机", "n"}, Segment{"等", "u"}, Segment{"技术性", "n"}, Segment{"器件", "n"}, Segment{"的", "uj"}, Segment{"安装", "v"}, Segment{"工作", "vn"}}, + []Segment{Segment{"我", "r"}, Segment{"需要", "v"}, Segment{"廉租房", "n"}}, + []Segment{Segment{"永和", "nz"}, Segment{"服装", "vn"}, Segment{"饰品", "n"}, Segment{"有限公司", "n"}}, + []Segment{Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"北京", "ns"}, Segment{"天安门", "ns"}}, + []Segment{Segment{"abc", "eng"}}, + []Segment{Segment{"隐", "n"}, Segment{"马尔可夫", 
"nr"}}, + []Segment{Segment{"雷猴", "n"}, Segment{"是", "v"}, Segment{"个", "q"}, Segment{"好", "a"}, Segment{"网站", "n"}}, + []Segment{Segment{"“", "x"}, Segment{"Microsoft", "eng"}, Segment{"”", "x"}, Segment{"一", "m"}, Segment{"词", "n"}, Segment{"由", "p"}, Segment{"“", "x"}, Segment{"MICROcomputer", "eng"}, Segment{"(", "x"}, Segment{"微型", "b"}, Segment{"计算机", "n"}, Segment{")", "x"}, Segment{"”", "x"}, Segment{"和", "c"}, Segment{"“", "x"}, Segment{"SOFTware", "eng"}, Segment{"(", "x"}, Segment{"软件", "n"}, Segment{")", "x"}, Segment{"”", "x"}, Segment{"两", "m"}, Segment{"部分", "n"}, Segment{"组成", "v"}}, + []Segment{Segment{"草泥马", "n"}, Segment{"和", "c"}, Segment{"欺", "vn"}, Segment{"实", "n"}, Segment{"马", "n"}, Segment{"是", "v"}, Segment{"今年", "t"}, Segment{"的", "uj"}, Segment{"流行", "v"}, Segment{"词汇", "n"}}, + []Segment{Segment{"伊", "ns"}, Segment{"藤", "nr"}, Segment{"洋华堂", "n"}, Segment{"总府", "n"}, Segment{"店", "n"}}, + []Segment{Segment{"中国科学院计算技术研究所", "nt"}}, + []Segment{Segment{"罗密欧", "nr"}, Segment{"与", "p"}, Segment{"朱丽叶", "nr"}}, + []Segment{Segment{"我", "r"}, Segment{"购买", "v"}, Segment{"了", "ul"}, Segment{"道具", "n"}, Segment{"和", "c"}, Segment{"服装", "vn"}}, + []Segment{Segment{"PS", "eng"}, Segment{":", "x"}, Segment{" ", "x"}, Segment{"我", "r"}, Segment{"觉得", "v"}, Segment{"开源", "n"}, Segment{"有", "v"}, Segment{"一个", "m"}, Segment{"好处", "d"}, Segment{",", "x"}, Segment{"就是", "d"}, Segment{"能够", "v"}, Segment{"敦促", "v"}, Segment{"自己", "r"}, Segment{"不断改进", "l"}, Segment{",", "x"}, Segment{"避免", "v"}, Segment{"敞", "v"}, Segment{"帚", "ng"}, Segment{"自珍", "b"}}, + []Segment{Segment{"湖北省", "ns"}, Segment{"石首市", "ns"}}, + []Segment{Segment{"湖北省", "ns"}, Segment{"十堰市", "ns"}}, + []Segment{Segment{"总经理", "n"}, Segment{"完成", "v"}, Segment{"了", "ul"}, Segment{"这件", "mq"}, Segment{"事情", "n"}}, + []Segment{Segment{"电脑", "n"}, Segment{"修好", "v"}, Segment{"了", "ul"}}, + []Segment{Segment{"做好", "v"}, Segment{"了", "ul"}, Segment{"这件", "mq"}, Segment{"事情", "n"}, Segment{"就", 
"d"}, Segment{"一了百了", "l"}, Segment{"了", "ul"}}, + []Segment{Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}}, + []Segment{Segment{"我们", "r"}, Segment{"买", "v"}, Segment{"了", "ul"}, Segment{"一个", "m"}, Segment{"美的", "nr"}, Segment{"空调", "n"}}, + []Segment{Segment{"线程", "n"}, Segment{"初始化", "l"}, Segment{"时", "n"}, Segment{"我们", "r"}, Segment{"要", "v"}, Segment{"注意", "v"}}, + []Segment{Segment{"一个", "m"}, Segment{"分子", "n"}, Segment{"是", "v"}, Segment{"由", "p"}, Segment{"好多", "m"}, Segment{"原子", "n"}, Segment{"组织", "v"}, Segment{"成", "n"}, Segment{"的", "uj"}}, + []Segment{Segment{"祝", "v"}, Segment{"你", "r"}, Segment{"马到功成", "i"}}, + []Segment{Segment{"他", "r"}, Segment{"掉", "zg"}, Segment{"进", "v"}, Segment{"了", "ul"}, Segment{"无底洞", "ns"}, Segment{"里", "f"}}, + []Segment{Segment{"中国", "ns"}, Segment{"的", "uj"}, Segment{"首都", "d"}, Segment{"是", "v"}, Segment{"北京", "ns"}}, + []Segment{Segment{"孙", "zg"}, Segment{"君", "nz"}, Segment{"意", "n"}}, + []Segment{Segment{"外交部", "nt"}, Segment{"发言人", "l"}, Segment{"马朝旭", "nr"}}, + []Segment{Segment{"领导人", "n"}, Segment{"会议", "n"}, Segment{"和", "c"}, Segment{"第四届", "m"}, Segment{"东亚", "ns"}, Segment{"峰会", "n"}}, + []Segment{Segment{"在", "p"}, Segment{"过去", "t"}, Segment{"的", "uj"}, Segment{"这", "r"}, Segment{"五年", "t"}}, + []Segment{Segment{"还", "d"}, Segment{"需要", "v"}, Segment{"很", "zg"}, Segment{"长", "a"}, Segment{"的", "uj"}, Segment{"路", "n"}, Segment{"要", "v"}, Segment{"走", "v"}}, + []Segment{Segment{"60", "eng"}, Segment{"周年", "t"}, Segment{"首都", "d"}, Segment{"阅兵", "v"}}, + []Segment{Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}}, + []Segment{Segment{"买", "v"}, Segment{"水果", "n"}, Segment{"然后", "c"}, Segment{"来", "v"}, Segment{"世博园", "nr"}}, + []Segment{Segment{"买", "v"}, Segment{"水果", "n"}, Segment{"然后", "c"}, 
Segment{"去", "v"}, Segment{"世博园", "nr"}}, + []Segment{Segment{"但是", "c"}, Segment{"后来", "t"}, Segment{"我", "r"}, Segment{"才", "d"}, Segment{"知道", "v"}, Segment{"你", "r"}, Segment{"是", "v"}, Segment{"对", "p"}, Segment{"的", "uj"}}, + []Segment{Segment{"存在", "v"}, Segment{"即", "v"}, Segment{"合理", "vn"}}, + []Segment{Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"在", "p"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"就", "d"}, Segment{"以", "p"}, Segment{"和", "c"}, Segment{"和", "c"}, Segment{"和", "c"}}, + []Segment{Segment{"I", "eng"}, Segment{" ", "x"}, Segment{"love", "eng"}, Segment{"你", "r"}, Segment{",", "x"}, Segment{"不以为耻", "i"}, Segment{",", "x"}, Segment{"反", "zg"}, Segment{"以为", "c"}, Segment{"rong", "eng"}}, + []Segment{Segment{"因", "p"}}, + []Segment{}, + []Segment{Segment{"hello", "eng"}, Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}}, + []Segment{Segment{"很", "zg"}, Segment{"好", "a"}, Segment{"但", "c"}, Segment{"主要", "b"}, Segment{"是", "v"}, Segment{"基于", "p"}, Segment{"网页", "n"}, Segment{"形式", "n"}}, + []Segment{Segment{"hello", "eng"}, Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}}, + []Segment{Segment{"为什么", "r"}, Segment{"我", "r"}, Segment{"不能", "v"}, Segment{"拥有", "v"}, Segment{"想要", "v"}, Segment{"的", "uj"}, Segment{"生活", "vn"}}, + []Segment{Segment{"后来", "t"}, Segment{"我", "r"}, Segment{"才", "d"}}, + []Segment{Segment{"此次", "r"}, Segment{"来", "v"}, Segment{"中国", "ns"}, Segment{"是", "v"}, Segment{"为了", "p"}}, + []Segment{Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}}, + []Segment{Segment{",", "x"}, Segment{"使用", 
"v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}}, + []Segment{Segment{"其实", "d"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}}, + []Segment{Segment{"好人", "n"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}}, + []Segment{Segment{"是因为", "c"}, Segment{"和", "c"}, Segment{"国家", "n"}}, + []Segment{Segment{"老年", "t"}, Segment{"搜索", "v"}, Segment{"还", "d"}, Segment{"支持", "v"}}, + []Segment{Segment{"干脆", "d"}, Segment{"就", "d"}, Segment{"把", "p"}, Segment{"那", "r"}, Segment{"部", "n"}, Segment{"蒙", "v"}, Segment{"人", "n"}, Segment{"的", "uj"}, Segment{"闲", "n"}, Segment{"法", "j"}, Segment{"给", "p"}, Segment{"废", "v"}, Segment{"了", "ul"}, Segment{"拉倒", "v"}, Segment{"!", "x"}, Segment{"RT", "eng"}, Segment{" ", "x"}, Segment{"@", "x"}, Segment{"laoshipukong", "eng"}, Segment{" ", "x"}, Segment{":", "x"}, Segment{" ", "x"}, Segment{"27", "eng"}, Segment{"日", "m"}, Segment{",", "x"}, Segment{"全国人大常委会", "nt"}, Segment{"第三次", "m"}, Segment{"审议", "v"}, Segment{"侵权", "v"}, Segment{"责任法", "n"}, Segment{"草案", "n"}, Segment{",", "x"}, Segment{"删除", "v"}, Segment{"了", "ul"}, Segment{"有关", "vn"}, Segment{"医疗", "n"}, Segment{"损害", "v"}, Segment{"责任", "n"}, Segment{"“", "x"}, Segment{"举证", "v"}, Segment{"倒置", "v"}, Segment{"”", "x"}, Segment{"的", "uj"}, Segment{"规定", "n"}, Segment{"。", "x"}, Segment{"在", "p"}, Segment{"医患", "n"}, Segment{"纠纷", "n"}, Segment{"中", "f"}, Segment{"本", "r"}, Segment{"已", "d"}, Segment{"处于", "v"}, Segment{"弱势", "n"}, Segment{"地位", "n"}, Segment{"的", "uj"}, Segment{"消费者", "n"}, Segment{"由此", "c"}, Segment{"将", "d"}, Segment{"陷入", "v"}, Segment{"万劫不复", "i"}, Segment{"的", "uj"}, Segment{"境地", "s"}, Segment{"。", "x"}, Segment{" ", "x"}}, + 
[]Segment{Segment{"大", "a"}}, + []Segment{}, + []Segment{Segment{"他", "r"}, Segment{"说", "v"}, Segment{"的", "uj"}, Segment{"确实", "ad"}, Segment{"在", "p"}, Segment{"理", "n"}}, + []Segment{Segment{"长春", "ns"}, Segment{"市长", "n"}, Segment{"春节", "t"}, Segment{"讲话", "n"}}, + []Segment{Segment{"结婚", "v"}, Segment{"的", "uj"}, Segment{"和", "c"}, Segment{"尚未", "d"}, Segment{"结婚", "v"}, Segment{"的", "uj"}}, + []Segment{Segment{"结合", "v"}, Segment{"成", "n"}, Segment{"分子", "n"}, Segment{"时", "n"}}, + []Segment{Segment{"旅游", "vn"}, Segment{"和", "c"}, Segment{"服务", "vn"}, Segment{"是", "v"}, Segment{"最好", "a"}, Segment{"的", "uj"}}, + []Segment{Segment{"这件", "mq"}, Segment{"事情", "n"}, Segment{"的确", "d"}, Segment{"是", "v"}, Segment{"我", "r"}, Segment{"的", "uj"}, Segment{"错", "v"}}, + []Segment{Segment{"供", "v"}, Segment{"大家", "n"}, Segment{"参考", "v"}, Segment{"指正", "v"}}, + []Segment{Segment{"哈尔滨", "ns"}, Segment{"政府", "n"}, Segment{"公布", "v"}, Segment{"塌", "v"}, Segment{"桥", "n"}, Segment{"原因", "n"}}, + []Segment{Segment{"我", "r"}, Segment{"在", "p"}, Segment{"机场", "n"}, Segment{"入口处", "i"}}, + []Segment{Segment{"邢", "nr"}, Segment{"永", "ns"}, Segment{"臣", "n"}, Segment{"摄影", "n"}, Segment{"报道", "v"}}, + []Segment{Segment{"BP", "eng"}, Segment{"神经网络", "n"}, Segment{"如何", "r"}, Segment{"训练", "vn"}, Segment{"才能", "v"}, Segment{"在", "p"}, Segment{"分类", "n"}, Segment{"时", "n"}, Segment{"增加", "v"}, Segment{"区分度", "n"}, Segment{"?", "x"}}, + []Segment{Segment{"南京市", "ns"}, Segment{"长江大桥", "ns"}}, + []Segment{Segment{"应", "v"}, Segment{"一些", "m"}, Segment{"使用者", "n"}, Segment{"的", "uj"}, Segment{"建议", "n"}, Segment{",", "x"}, Segment{"也", "d"}, Segment{"为了", "p"}, Segment{"便于", "v"}, Segment{"利用", "n"}, Segment{"NiuTrans", "eng"}, Segment{"用于", "v"}, Segment{"SMT", "eng"}, Segment{"研究", "vn"}}, + []Segment{Segment{"长春市", "ns"}, Segment{"长春", "ns"}, Segment{"药店", "n"}}, + []Segment{Segment{"邓颖超", "nr"}, Segment{"生前", "t"}, Segment{"最", "d"}, Segment{"喜欢", "v"}, Segment{"的", "uj"}, 
Segment{"衣服", "n"}}, + []Segment{Segment{"胡锦涛", "nr"}, Segment{"是", "v"}, Segment{"热爱", "a"}, Segment{"世界", "n"}, Segment{"和平", "nz"}, Segment{"的", "uj"}, Segment{"政治局", "n"}, Segment{"常委", "j"}}, + []Segment{Segment{"程序员", "n"}, Segment{"祝", "v"}, Segment{"海林", "nz"}, Segment{"和", "c"}, Segment{"朱", "nr"}, Segment{"会", "v"}, Segment{"震", "v"}, Segment{"是", "v"}, Segment{"在", "p"}, Segment{"孙", "zg"}, Segment{"健", "a"}, Segment{"的", "uj"}, Segment{"左面", "f"}, Segment{"和", "c"}, Segment{"右面", "f"}, Segment{",", "x"}, Segment{" ", "x"}, Segment{"范", "nr"}, Segment{"凯", "nr"}, Segment{"在", "p"}, Segment{"最", "d"}, Segment{"右面", "f"}, Segment{".", "x"}, Segment{"再", "d"}, Segment{"往", "zg"}, Segment{"左", "m"}, Segment{"是", "v"}, Segment{"李", "nr"}, Segment{"松", "v"}, Segment{"洪", "nr"}}, + []Segment{Segment{"一次性", "d"}, Segment{"交", "v"}, Segment{"多少", "m"}, Segment{"钱", "n"}}, + []Segment{Segment{"两块", "m"}, Segment{"五", "m"}, Segment{"一套", "m"}, Segment{",", "x"}, Segment{"三块", "m"}, Segment{"八", "m"}, Segment{"一斤", "m"}, Segment{",", "x"}, Segment{"四块", "m"}, Segment{"七", "m"}, Segment{"一本", "m"}, Segment{",", "x"}, Segment{"五块", "m"}, Segment{"六", "m"}, Segment{"一条", "m"}}, + []Segment{Segment{"小", "a"}, Segment{"和尚", "nr"}, Segment{"留", "v"}, Segment{"了", "ul"}, Segment{"一个", "m"}, Segment{"像", "v"}, Segment{"大", "a"}, Segment{"和尚", "nr"}, Segment{"一样", "r"}, Segment{"的", "uj"}, Segment{"和尚头", "nr"}}, + []Segment{Segment{"我", "r"}, Segment{"是", "v"}, Segment{"中华人民共和国", "ns"}, Segment{"公民", "n"}, Segment{";", "x"}, Segment{"我", "r"}, Segment{"爸爸", "n"}, Segment{"是", "v"}, Segment{"共和党", "nt"}, Segment{"党员", "n"}, Segment{";", "x"}, Segment{" ", "x"}, Segment{"地铁", "n"}, Segment{"和平门", "ns"}, Segment{"站", "v"}}, + []Segment{Segment{"张晓梅", "nr"}, Segment{"去", "v"}, Segment{"人民", "n"}, Segment{"医院", "n"}, Segment{"做", "v"}, Segment{"了", "ul"}, Segment{"个", "q"}, Segment{"B超", "n"}, Segment{"然后", "c"}, Segment{"去", "v"}, Segment{"买", "v"}, Segment{"了", "ul"}, 
Segment{"件", "zg"}, Segment{"T恤", "n"}}, + []Segment{Segment{"AT&T", "nz"}, Segment{"是", "v"}, Segment{"一件", "m"}, Segment{"不错", "a"}, Segment{"的", "uj"}, Segment{"公司", "n"}, Segment{",", "x"}, Segment{"给", "p"}, Segment{"你", "r"}, Segment{"发", "v"}, Segment{"offer", "eng"}, Segment{"了", "ul"}, Segment{"吗", "y"}, Segment{"?", "x"}}, + []Segment{Segment{"C++", "nz"}, Segment{"和", "c"}, Segment{"c#", "nz"}, Segment{"是", "v"}, Segment{"什么", "r"}, Segment{"关系", "n"}, Segment{"?", "x"}, Segment{"11", "eng"}, Segment{"+", "x"}, Segment{"122", "eng"}, Segment{"=", "x"}, Segment{"133", "eng"}, Segment{",", "x"}, Segment{"是", "v"}, Segment{"吗", "y"}, Segment{"?", "x"}, Segment{"PI", "eng"}, Segment{"=", "x"}, Segment{"3", "eng"}, Segment{".", "x"}, Segment{"14159", "eng"}}, + []Segment{Segment{"你", "r"}, Segment{"认识", "v"}, Segment{"那个", "r"}, Segment{"和", "c"}, Segment{"主席", "n"}, Segment{"握手", "v"}, Segment{"的", "uj"}, Segment{"的哥", "n"}, Segment{"吗", "y"}, Segment{"?", "x"}, Segment{"他", "r"}, Segment{"开", "v"}, Segment{"一辆", "m"}, Segment{"黑色", "n"}, Segment{"的士", "n"}, Segment{"。", "x"}}, + []Segment{Segment{"枪杆子", "n"}, Segment{"中", "f"}, Segment{"出", "v"}, Segment{"政权", "n"}}, } ) -func chanToArray(ch chan Pair) []Pair { - result := make([]Pair, 0) +func init() { + seg = New() + seg.LoadDictionary("../dict.txt") +} + +func chanToArray(ch <-chan Segment) []Segment { + result := make([]Segment, 0) for word := range ch { result = append(result, word) } @@ -276,12 +282,8 @@ func chanToArray(ch chan Pair) []Pair { } func TestCut(t *testing.T) { - p, err := Open("../dict.txt") - if err != nil { - t.Fatal(err) - } for index, content := range test_contents { - result := chanToArray(p.Cut(content, true)) + result := chanToArray(seg.Cut(content, true)) if len(defaultCutResult[index]) != len(result) { t.Errorf("default cut for %s length should be %d not %d\n", content, len(defaultCutResult[index]), len(result)) @@ -293,7 +295,7 @@ func TestCut(t *testing.T) { t.Fatalf("expect 
%s, got %s", defaultCutResult[index][i], result[i]) } } - result = chanToArray(p.Cut(content, false)) + result = chanToArray(seg.Cut(content, false)) if len(noHMMCutResult[index]) != len(result) { t.Fatal(content) } @@ -306,19 +308,16 @@ func TestCut(t *testing.T) { } } +// https://github.com/fxsjy/jieba/issues/132 func TestBug132(t *testing.T) { - /* - https://github.com/fxsjy/jieba/issues/132 - */ - p, _ := Open("../dict.txt") sentence := "又跛又啞" - cutResult := []Pair{ - Pair{"又", "d"}, - Pair{"跛", "a"}, - Pair{"又", "d"}, - Pair{"啞", "v"}, + cutResult := []Segment{ + Segment{"又", "d"}, + Segment{"跛", "a"}, + Segment{"又", "d"}, + Segment{"啞", "v"}, } - result := chanToArray(p.Cut(sentence, true)) + result := chanToArray(seg.Cut(sentence, true)) if len(cutResult) != len(result) { t.Fatal(result) } @@ -329,28 +328,25 @@ func TestBug132(t *testing.T) { } } +// https://github.com/fxsjy/jieba/issues/137 func TestBug137(t *testing.T) { - /* - https://github.com/fxsjy/jieba/issues/137 - */ - p, _ := Open("../dict.txt") sentence := "前港督衛奕信在八八年十月宣布成立中央政策研究組" - cutResult := []Pair{ - Pair{"前", "f"}, - Pair{"港督", "n"}, - Pair{"衛奕", "z"}, - Pair{"信", "n"}, - Pair{"在", "p"}, - Pair{"八八年", "m"}, - Pair{"十月", "t"}, - Pair{"宣布", "v"}, - Pair{"成立", "v"}, - Pair{"中央", "n"}, - Pair{"政策", "n"}, - Pair{"研究", "vn"}, - Pair{"組", "x"}, + cutResult := []Segment{ + Segment{"前", "f"}, + Segment{"港督", "n"}, + Segment{"衛奕", "z"}, + Segment{"信", "n"}, + Segment{"在", "p"}, + Segment{"八八年", "m"}, + Segment{"十月", "t"}, + Segment{"宣布", "v"}, + Segment{"成立", "v"}, + Segment{"中央", "n"}, + Segment{"政策", "n"}, + Segment{"研究", "vn"}, + Segment{"組", "x"}, } - result := chanToArray(p.Cut(sentence, true)) + result := chanToArray(seg.Cut(sentence, true)) if len(cutResult) != len(result) { t.Fatal(result) } @@ -362,50 +358,50 @@ func TestBug137(t *testing.T) { } func TestUserDict(t *testing.T) { - p, _ := Open("../dict.txt") - p.LoadUserDict("../userdict.txt") + seg.LoadUserDictionary("../userdict.txt") + 
defer seg.LoadDictionary("../dict.txt") sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型" - cutResult := []Pair{ - Pair{"李小福", "nr"}, - Pair{"是", "v"}, - Pair{"创新办", "i"}, - Pair{"主任", "b"}, - Pair{"也", "d"}, - Pair{"是", "v"}, - Pair{"云计算", "x"}, - Pair{"方面", "n"}, - Pair{"的", "uj"}, - Pair{"专家", "n"}, - Pair{";", "x"}, - Pair{" ", "x"}, - Pair{"什么", "r"}, - Pair{"是", "v"}, - Pair{"八一双鹿", "nz"}, - Pair{"例如", "v"}, - Pair{"我", "r"}, - Pair{"输入", "v"}, - Pair{"一个", "m"}, - Pair{"带", "v"}, - Pair{"“", "x"}, - Pair{"韩玉赏鉴", "nz"}, - Pair{"”", "x"}, - Pair{"的", "uj"}, - Pair{"标题", "n"}, - Pair{",", "x"}, - Pair{"在", "p"}, - Pair{"自定义词", "n"}, - Pair{"库中", "nrt"}, - Pair{"也", "d"}, - Pair{"增加", "v"}, - Pair{"了", "ul"}, - Pair{"此", "r"}, - Pair{"词", "n"}, - Pair{"为", "p"}, - Pair{"N", "eng"}, - Pair{"类型", "n"}} + cutResult := []Segment{ + Segment{"李小福", "nr"}, + Segment{"是", "v"}, + Segment{"创新办", "i"}, + Segment{"主任", "b"}, + Segment{"也", "d"}, + Segment{"是", "v"}, + Segment{"云计算", "x"}, + Segment{"方面", "n"}, + Segment{"的", "uj"}, + Segment{"专家", "n"}, + Segment{";", "x"}, + Segment{" ", "x"}, + Segment{"什么", "r"}, + Segment{"是", "v"}, + Segment{"八一双鹿", "nz"}, + Segment{"例如", "v"}, + Segment{"我", "r"}, + Segment{"输入", "v"}, + Segment{"一个", "m"}, + Segment{"带", "v"}, + Segment{"“", "x"}, + Segment{"韩玉赏鉴", "nz"}, + Segment{"”", "x"}, + Segment{"的", "uj"}, + Segment{"标题", "n"}, + Segment{",", "x"}, + Segment{"在", "p"}, + Segment{"自定义词", "n"}, + Segment{"库中", "nrt"}, + Segment{"也", "d"}, + Segment{"增加", "v"}, + Segment{"了", "ul"}, + Segment{"此", "r"}, + Segment{"词", "n"}, + Segment{"为", "p"}, + Segment{"N", "eng"}, + Segment{"类型", "n"}} - result := chanToArray(p.Cut(sentence, true)) + result := chanToArray(seg.Cut(sentence, true)) if len(cutResult) != len(result) { t.Fatal(result) } @@ -417,19 +413,17 @@ func TestUserDict(t *testing.T) { } func BenchmarkCutNoHMM(b *testing.B) { - p, _ := Open("dict.txt") sentence := 
"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作" b.ResetTimer() for i := 0; i < b.N; i++ { - chanToArray(p.Cut(sentence, false)) + chanToArray(seg.Cut(sentence, false)) } } func BenchmarkCut(b *testing.B) { - p, _ := Open("dict.txt") sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作" b.ResetTimer() for i := 0; i < b.N; i++ { - chanToArray(p.Cut(sentence, true)) + chanToArray(seg.Cut(sentence, true)) } }