removed old tokenize module, updated README

2026-06-28 08:02:45 +08:00 · 2015-03-18 17:31:41 +08:00
parent f596ac063d
commit 16929faf57
5 changed files with 213 additions and 477 deletions
--- a/README.md
+++ b/README.md
@@ -1,18 +1,18 @@
-结巴分词Go版 jiebago
-===================
+# 结巴分词 Go 语言版：jiebago
+

 [![Build Status](https://travis-ci.org/wangbin/jiebago.png?branch=master)](https://travis-ci.org/wangbin/jiebago)

 [结巴分词](https://github.com/fxsjy/jieba)是[@fxsjy](https://github.com/fxsjy)用Python编写的中文分词组件，jiebago是结巴分词的Go语言实现，目前已经实现的功能包括：三种模式分词、自定义词典、关键词提取和词性标注。


-安装
-=====
+## 安装

-	go get github.com/wangbin/jiebago
+
+	go get github.com/wangbin/jiebago/...
 	
-分词
-=====
+## 分词
+

    package main

@@ -53,8 +53,8 @@

    【搜索引擎模式】：小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / ， / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 /

-添加自定义词典
-=============
+## 添加自定义词典
+

    var sentence = "李小福是创新办主任也是云计算方面的专家"
 	fmt.Print("Before: ")
@@ -69,12 +69,7 @@

    After: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /

-关键词提取
-========
-
-需要先安装analyse模块：
-
-    go get github.com/wangbin/jiebago/analyse
+## 关键词提取
    
 示例代码：

@@ -133,12 +128,7 @@
    全资 0.306324
    商业 0.306138    

-词性标注
-=======
-
-需要先安装posseg模块：
-
-    go get github.com/wangbin/jiebago/posseg
+## 词性标注
    
 示例代码：

@@ -166,8 +156,8 @@
    北京 ns
    天安门 ns
    
-并行分词
-=======
+
+## 并行分词

 因为Go有强大的goroutine特性，并行分词实现起来非常简单，所以并没有内置到jiebago中，而是由使用者自己实现，下面是一个简单的例子：

@@ -207,40 +197,213 @@
    writer.Flush()


-Tokenize
-=========
+## Tokenize：返回词语在原文的起始位置

-    var sentence = "永和服装饰品有限公司"
-    // 默认模式
-    for _, token := range jiebago.Tokenize(sentence, "default", true) {
-        fmt.Printf("word %s\t\t start: %d \t\t end:%d\n", token.Word, token.Start, token.End)
+
+注意新版的 Jiebago Tokenizer 实现了 Bleve 的 Tokenizer 接口，跟之前的实现有很大的变化：
+
+1. 接受的参数必须是 []byte。
+2. 输出的 Token 的起始和终止位置是 byte 的位置，不是之前的 rune 的位置，所以和 Python 版的 Jieba.tokenize 输出不一致。
+
+```
+package main
+
+import (
+    "fmt"
+    "github.com/wangbin/jiebago/tokenizers"
+)
+
+const DictPath = "/path/to/dict.txt"
+
+var sentence = []byte("永和服装饰品有限公司")
+
+func main() {
+    // default mode
+    tokenizer, _ := tokenizers.NewJiebaTokenizer(DictPath, true, false)
+    for _, token := range tokenizer.Tokenize(sentence) {
+        fmt.Printf(
+            "Term: %s\t  Start: %d \t  End: %d\t Position: %d\t Type: %d\n",
+            token.Term, token.Start, token.End, token.Position, token.Type)
    }
-    // 搜索模式
-    for _, token := range jiebago.Tokenize(sentence, "search", true) {
-        fmt.Printf("word %s\t\t start: %d \t\t end:%d\n", token.Word, token.Start, token.End)
+    
+    //search mode
+    tokenizer, _ = tokenizers.NewJiebaTokenizer(DictPath, true, true) 
+    for _, token := range tokenizer.Tokenize(sentence) {
+        fmt.Printf(
+            "Term: %s\t  Start: %d \t  End: %d\t Position: %d\t Type: %d\n",
+            token.Term, token.Start, token.End, token.Position, token.Type)
    }

+}
+
+```
+默认模式输出：
+
+```
+Term: 永和        Start: 0        End: 6         Position: 1     Type: 1
+Term: 服装        Start: 6        End: 12        Position: 2     Type: 1
+Term: 饰品        Start: 12       End: 18        Position: 3     Type: 1
+Term: 有限公司    Start: 18       End: 30        Position: 4     Type: 1
+```
+搜索模式输出：
+
+```
+Term: 永和        Start: 0        End: 6         Position: 1     Type: 1
+Term: 服装        Start: 6        End: 12        Position: 2     Type: 1
+Term: 饰品        Start: 12       End: 18        Position: 3     Type: 1
+Term: 有限        Start: 18       End: 24        Position: 4     Type: 1
+Term: 公司        Start: 24       End: 30        Position: 5     Type: 1
+Term: 有限公司    Start: 18       End: 30        Position: 6     Type: 1
+```    
+### 配合 bleve 进行中文全文检索
+
+[bleve](http://www.blevesearch.com/) 是一个 Go 语言实现的全文索引系统，jiebago 可以配合 bleve 使用实现中文的全文检索。一个简单的用法示例：
+
+```
+package main
+
+import (
+    "fmt"
+    "github.com/blevesearch/bleve"
+    _ "github.com/wangbin/jiebago/tokenizers"
+    "log"
+)
+
+func main() {
+    // open a new index
+    indexMapping := bleve.NewIndexMapping()
+
+    err := indexMapping.AddCustomTokenizer("jieba",
+        map[string]interface{}{
+            "file": "/Users/wangbin/mygo/src/github.com/wangbin/jiebago/dict.txt",
+            "type": "jieba",
+        })
+    if err != nil {
+        log.Fatal(err)
+    }
+
+    err = indexMapping.AddCustomAnalyzer("jieba",
+        map[string]interface{}{
+            "type":      "custom",
+            "tokenizer": "jieba",
+            "token_filters": []string{
+                "possessive_en",
+                "to_lower",
+                "stop_en",
+            },
+        })
+
+    if err != nil {
+        log.Fatal(err)
+    }
+
+    indexMapping.DefaultAnalyzer = "jieba"
+
+    index, err := bleve.New("example.bleve", indexMapping)
+
+    if err != nil {
+        log.Fatal(err)
+    }
+
+    docs := []struct {
+        Title string
+        Name  string
+    }{
+        {
+            Title: "Doc 1",
+            Name:  "This is the first document we’ve added",
+        },
+        {
+            Title: "Doc 2",
+            Name:  "The second one 你 中文测试中文 is even more interesting! 吃水果",
+        },
+        {
+            Title: "Doc 3",
+            Name:  "买水果然后来世博园。",
+        },
+        {
+            Title: "Doc 4",
+            Name:  "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
+        },
+        {
+            Title: "Doc 5",
+            Name:  "咱俩交换一下吧。",
+        },
+    }
+    // index docs
+    for _, doc := range docs {
+        index.Index(doc.Title, doc)
+    }
+
+    // search for some text
+    for _, keyword := range []string{"水果世博园", "你", "first", "中文", "交换机", "交换"} {
+        query := bleve.NewMatchQuery(keyword)
+        search := bleve.NewSearchRequest(query)
+        search.Highlight = bleve.NewHighlight()
+        searchResults, err := index.Search(search)
+        if err != nil {
+            log.Fatal(err)
+        }
+        fmt.Printf("Result of %s: %s\n", keyword, searchResults)
+    }
+}
+```
 输出结果：

-    word 永和                start: 0                end:2
-    word 服装                start: 2                end:4
-    word 饰品                start: 4                end:6
-    word 有限公司            start: 6                end:10
+```
+Result of 水果世博园: 2 matches, showing 1 through 2, took 377.988µs
+    1. Doc 3 (1.099550)
+        Name
+                买<span class="highlight">水果</span>然后来<span class="highlight">世博</span>园。
+    2. Doc 2 (0.031941)
+        Name
+                The second one 你 中文测试中文 is even more interesting! 吃<span class="highlight">水果</span>

-    word 永和                start: 0                end:2
-    word 服装                start: 0                end:2
-    word 饰品                start: 0                end:2
-    word 有限                start: 0                end:2
-    word 公司                start: 2                end:4
-    word 有限公司            start: 0                end:4
+Result of 你: 1 matches, showing 1 through 1, took 103.367µs
+    1. Doc 2 (0.391161)
+        Name
+                The second one <span class="highlight">你</span> 中文测试中文 is even more interesting! 吃水果

-分词速度
-=======
+Result of first: 1 matches, showing 1 through 1, took 373.317µs
+    1. Doc 1 (0.512150)
+        Name
+                This is the <span class="highlight">first</span> document we’ve added
+
+Result of 中文: 1 matches, showing 1 through 1, took 106.433µs
+    1. Doc 2 (0.553186)
+        Name
+                The second one 你 <span class="highlight">中文</span>测试<span class="highlight">中文</span> is even more interesting! 吃水果
+
+Result of 交换机: 2 matches, showing 1 through 2, took 188.235µs
+    1. Doc 4 (0.608495)
+        Name
+                工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换</span>机等技术性器件的安装工作
+    2. Doc 5 (0.086700)
+        Name
+                咱俩<span class="highlight">交换</span>一下吧。
+
+Result of 交换: 2 matches, showing 1 through 2, took 148.822µs
+    1. Doc 5 (0.534158)
+        Name
+                咱俩<span class="highlight">交换</span>一下吧。
+    2. Doc 4 (0.296297)
+        Name
+                工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换</span>机等技术性器件的安装工作
+```
+
+## 分词速度

 - 2MB / Second in Full Mode
 - 700KB / Second in Default Mode
 - Test Env: AMD Phenom(tm) II X6 1055T CPU @ 2.8GHz; 《金庸全集》 

-许可证
-======
+## 许可证
+
 MIT: http://wangbin.mit-license.org