moved tokenizers to a separate module

2026-06-12 21:20:26 +08:00 · 2015-05-07 18:52:29 +08:00
parent 7440fa00df
commit 3d91f615cf
6 changed files with 62 additions and 51 deletions
--- a/tokenizers/example_bleve_test.go
+++ b/tokenizers/example_bleve_test.go
@@ -0,0 +1,126 @@
+package tokenizers_test
+
+import (
+	"fmt"
+	"log"
+	"os"
+
+	"github.com/blevesearch/bleve"
+	_ "github.com/wangbin/jiebago/tokenizers"
+)
+
+// Example_beleveSearch builds a bleve index whose default analyzer uses the
+// jieba tokenizer, indexes a few mixed English/Chinese documents, then runs
+// several query-string searches and prints the highlighted hits.
+func Example_beleveSearch() {
+	// open a new index
+	indexMapping := bleve.NewIndexMapping()
+
+	// register a custom tokenizer of type "jieba" backed by the dictionary file
+	err := indexMapping.AddCustomTokenizer("jieba",
+		map[string]interface{}{
+			"file": "../dict.txt",
+			"type": "jieba",
+		})
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	// create a custom analyzer
+	err = indexMapping.AddCustomAnalyzer("jieba",
+		map[string]interface{}{
+			"type":      "custom",
+			"tokenizer": "jieba",
+			"token_filters": []string{
+				"possessive_en",
+				"to_lower",
+				"stop_en",
+			},
+		})
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	indexMapping.DefaultAnalyzer = "jieba"
+	cacheDir := "jieba.beleve"
+	// start from a clean directory so that a previous run cannot affect the results
+	os.RemoveAll(cacheDir)
+	index, err := bleve.New(cacheDir, indexMapping)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	docs := []struct {
+		Title string
+		Name  string
+	}{
+		{
+			Title: "Doc 1",
+			Name:  "This is the first document we’ve added",
+		},
+		{
+			Title: "Doc 2",
+			Name:  "The second one 你 中文测试中文 is even more interesting! 吃水果",
+		},
+		{
+			Title: "Doc 3",
+			Name:  "买水果然后来世博园。",
+		},
+		{
+			Title: "Doc 4",
+			Name:  "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
+		},
+		{
+			Title: "Doc 5",
+			Name:  "咱俩交换一下吧。",
+		},
+	}
+	// index docs; stop at the first failure instead of searching an
+	// incompletely populated index
+	for _, doc := range docs {
+		if err := index.Index(doc.Title, doc); err != nil {
+			log.Fatal(err)
+		}
+	}
+
+	// search for some text
+	for _, keyword := range []string{"水果世博园", "你", "first", "中文", "交换机", "交换"} {
+		query := bleve.NewQueryStringQuery(keyword)
+		search := bleve.NewSearchRequest(query)
+		search.Highlight = bleve.NewHighlight()
+		searchResults, err := index.Search(search)
+		if err != nil {
+			log.Fatal(err)
+		}
+		fmt.Printf("Result of \"%s\": %d matches:\n", keyword, searchResults.Total)
+		for i, hit := range searchResults.Hits {
+			// one header line per hit, followed by its highlighted fragments
+			rv := fmt.Sprintf("%d. %s, (%f)\n", i+searchResults.Request.From+1, hit.ID, hit.Score)
+			for fragmentField, fragments := range hit.Fragments {
+				rv += fragmentField + ": "
+				for _, fragment := range fragments {
+					rv += fragment
+				}
+			}
+			fmt.Printf("%s\n", rv)
+		}
+	}
+	// Output:
+	// Result of "水果世博园": 2 matches:
+	// 1. Doc 3, (1.099550)
+	// Name: 买<span class="highlight">水果</span>然后来<span class="highlight">世博</span>园。
+	// 2. Doc 2, (0.031941)
+	// Name: The second one 你 中文测试中文 is even more interesting! 吃<span class="highlight">水果</span>
+	// Result of "你": 1 matches:
+	// 1. Doc 2, (0.391161)
+	// Name: The second one <span class="highlight">你</span> 中文测试中文 is even more interesting! 吃水果
+	// Result of "first": 1 matches:
+	// 1. Doc 1, (0.512150)
+	// Name: This is the <span class="highlight">first</span> document we’ve added
+	// Result of "中文": 1 matches:
+	// 1. Doc 2, (0.553186)
+	// Name: The second one 你 <span class="highlight">中文</span>测试<span class="highlight">中文</span> is even more interesting! 吃水果
+	// Result of "交换机": 2 matches:
+	// 1. Doc 4, (0.608495)
+	// Name: 工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换机</span>等技术性器件的安装工作
+	// 2. Doc 5, (0.086700)
+	// Name: 咱俩<span class="highlight">交换</span>一下吧。
+	// Result of "交换": 2 matches:
+	// 1. Doc 5, (0.534158)
+	// Name: 咱俩<span class="highlight">交换</span>一下吧。
+	// 2. Doc 4, (0.296297)
+	// Name: 工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换</span>机等技术性器件的安装工作
+}
--- a/tokenizers/example_test.go
+++ b/tokenizers/example_test.go
@@ -0,0 +1,42 @@
+package tokenizers_test
+
+import (
+	"fmt"
+
+	"github.com/wangbin/jiebago/tokenizers"
+)
+
+// Example tokenizes one sentence with the jieba tokenizer, first in default
+// mode and then in search mode, and prints every token that is produced.
+func Example() {
+	sentence := []byte("永和服装饰品有限公司")
+
+	// default mode
+	tokenizer, err := tokenizers.NewJiebaTokenizer("../dict.txt", true, false)
+	if err != nil {
+		// Surface the failure in the example output instead of tokenizing
+		// with a tokenizer whose dictionary could not be loaded.
+		fmt.Println(err)
+		return
+	}
+	fmt.Println("Default Mode:")
+	for _, token := range tokenizer.Tokenize(sentence) {
+		fmt.Printf(
+			"Term: %s Start: %d End: %d Position: %d Type: %d\n",
+			token.Term, token.Start, token.End, token.Position, token.Type)
+	}
+
+	// search mode
+	tokenizer, err = tokenizers.NewJiebaTokenizer("../dict.txt", true, true)
+	if err != nil {
+		fmt.Println(err)
+		return
+	}
+	fmt.Println("Search Mode:")
+	for _, token := range tokenizer.Tokenize(sentence) {
+		fmt.Printf(
+			"Term: %s Start: %d End: %d Position: %d Type: %d\n",
+			token.Term, token.Start, token.End, token.Position, token.Type)
+	}
+	// Output:
+	// Default Mode:
+	// Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
+	// Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
+	// Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
+	// Term: 有限公司 Start: 18 End: 30 Position: 4 Type: 1
+	// Search Mode:
+	// Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
+	// Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
+	// Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
+	// Term: 有限 Start: 18 End: 24 Position: 4 Type: 1
+	// Term: 公司 Start: 24 End: 30 Position: 5 Type: 1
+	// Term: 有限公司 Start: 18 End: 30 Position: 6 Type: 1
+}
--- a/tokenizers/tokenizer.go
+++ b/tokenizers/tokenizer.go
@@ -0,0 +1,147 @@
+package tokenizers
+
+import (
+	"fmt"
+	"regexp"
+	"strconv"
+
+	"github.com/blevesearch/bleve/analysis"
+	"github.com/blevesearch/bleve/registry"
+	"github.com/wangbin/jiebago"
+)
+
+// Name is the jieba tokenizer name; it is the name under which the tokenizer
+// constructor is registered with the bleve registry.
+const Name = "jieba"
+
+// ideographRegexp matches a run of one or more Han characters. It is used by
+// detectTokenType to recognize ideographic terms.
+var ideographRegexp = regexp.MustCompile(`\p{Han}+`)
+
+// JiebaTokenizer is the bleve tokenizer for jiebago.
+type JiebaTokenizer struct {
+	// seg is the jiebago segmenter that cuts text into words.
+	seg jiebago.Segmenter
+	// hmm is forwarded to the segmenter's Cut call on every Tokenize.
+	hmm bool
+	// searchMode makes Tokenize additionally emit the shorter dictionary
+	// words contained in a longer word.
+	searchMode bool
+}
+
+/*
+NewJiebaTokenizer creates a new JiebaTokenizer backed by the dictionary at
+dictFilePath. If the dictionary cannot be loaded, it returns a nil tokenizer
+together with the loading error.
+
+Parameters:
+
+    dictFilePath: path of the dictionary file.
+
+    hmm: whether to use the Hidden Markov Model to cut unknown words, i.e.
+    words not found in the dictionary. For example, the word "安卓" (meaning
+    "Android" in English) is not in the dictionary file. If hmm is false, it
+    will be cut into the two single-character words "安" and "卓"; if hmm is
+    true, it will be treated as one single word, because Jieba uses the Hidden
+    Markov Model with the Viterbi algorithm to guess the best possibility.
+
+    searchMode: whether to further cut long words into several short words.
+    In Chinese, a long word may contain other words. For example, "交换机" is
+    the Chinese word for "switch"; if searchMode is false, "交换机" is treated
+    as a single word. If searchMode is true, the shorter valid Chinese words
+    it contains, such as "交换" and "换机", are emitted in addition to it.
+*/
+func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
+	var seg jiebago.Segmenter
+	if err := seg.LoadDictionary(dictFilePath); err != nil {
+		// Do not hand out a tokenizer whose dictionary failed to load.
+		return nil, err
+	}
+	return &JiebaTokenizer{
+		seg:        seg,
+		hmm:        hmm,
+		searchMode: searchMode,
+	}, nil
+}
+
+// Tokenize cuts input into bleve token stream.
+func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
+	rv := make(analysis.TokenStream, 0)
+	runeStart := 0
+	start := 0
+	end := 0
+	pos := 1
+	var width int
+	var gram string
+	dict := jt.seg.Dictionary()
+	for word := range jt.seg.Cut(string(input), jt.hmm) {
+		if jt.searchMode {
+			runes := []rune(word)
+			width = len(runes)
+			for _, step := range [2]int{2, 3} {
+				if width > step {
+					for i := 0; i < width-step+1; i++ {
+						gram = string(runes[i : i+step])
+						gramLen := len(gram)
+						if frequency, ok := dict.Frequency(gram); ok && frequency > 0 {
+							gramStart := start + len(string(runes[:i]))
+							token := analysis.Token{
+								Term:     []byte(gram),
+								Start:    gramStart,
+								End:      gramStart + gramLen,
+								Position: pos,
+								Type:     detectTokenType(gram),
+							}
+							rv = append(rv, &token)
+							pos++
+						}
+					}
+				}
+			}
+		}
+		end = start + len(word)
+		token := analysis.Token{
+			Term:     []byte(word),
+			Start:    start,
+			End:      end,
+			Position: pos,
+			Type:     detectTokenType(word),
+		}
+		rv = append(rv, &token)
+		pos++
+		runeStart += width
+		start = end
+	}
+	return rv
+}
+
+/*
+JiebaTokenizerConstructor creates a JiebaTokenizer from a tokenizer
+configuration map.
+
+Parameter config must contain at least the "file" entry:
+
+    file: the path of the dictionary file (string, required).
+
+    hmm: optional bool, specifies whether to use the Hidden Markov Model, see
+    NewJiebaTokenizer for details. Defaults to true.
+
+    search: optional bool, specifies whether to use search mode, see
+    NewJiebaTokenizer for details. Defaults to true.
+
+An error is returned if "file" is missing or is not a string, if "hmm" or
+"search" is present with a value that is not a bool, or if the dictionary
+cannot be loaded. The cache argument is not used.
+*/
+func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
+	analysis.Tokenizer, error) {
+	dictFilePath, ok := config["file"].(string)
+	if !ok {
+		return nil, fmt.Errorf("must specify dictionary file path")
+	}
+	hmm, err := jiebaConfigBool(config, "hmm", true)
+	if err != nil {
+		return nil, err
+	}
+	searchMode, err := jiebaConfigBool(config, "search", true)
+	if err != nil {
+		return nil, err
+	}
+
+	return NewJiebaTokenizer(dictFilePath, hmm, searchMode)
+}
+
+// jiebaConfigBool reads the optional boolean entry key from config. It returns
+// def when the entry is absent or nil, and an error when the entry holds a
+// value of any other type, so that a mistyped option (for example the string
+// "false") is reported instead of being silently replaced by the default.
+func jiebaConfigBool(config map[string]interface{}, key string, def bool) (bool, error) {
+	value, present := config[key]
+	if !present || value == nil {
+		return def, nil
+	}
+	b, ok := value.(bool)
+	if !ok {
+		return false, fmt.Errorf("%s must be a bool, got %T", key, value)
+	}
+	return b, nil
+}
+
+// detectTokenType classifies term for the token stream: Ideographic when it
+// contains at least one Han character, Numeric when it parses as a number,
+// and AlphaNumeric otherwise.
+func detectTokenType(term string) analysis.TokenType {
+	if ideographRegexp.MatchString(term) {
+		return analysis.Ideographic
+	}
+	// strconv.ParseFloat also accepts the words "NaN", "Inf" and "Infinity"
+	// (in any case, optionally signed). Those are ordinary words in running
+	// text, so a numeric term must additionally contain a decimal digit.
+	if _, err := strconv.ParseFloat(term, 64); err == nil {
+		for i := 0; i < len(term); i++ {
+			if '0' <= term[i] && term[i] <= '9' {
+				return analysis.Numeric
+			}
+		}
+	}
+	return analysis.AlphaNumeric
+}
+
+// init registers JiebaTokenizerConstructor with the bleve registry under Name,
+// so that importing this package is enough to make the tokenizer available.
+func init() {
+	registry.RegisterTokenizer(Name, JiebaTokenizerConstructor)
+}
--- a/tokenizers/tokenizer_test.go
+++ b/tokenizers/tokenizer_test.go