优化

2026-06-05 00:32:51 +08:00 · 2022-11-30 12:18:15 +08:00
parent ab8b95ef87
commit 8bbc755ed4
48 changed files with 984 additions and 859 deletions
--- a/tokenizers/analyzer.go
+++ b/tokenizers/analyzer.go
@@ -0,0 +1,30 @@
+package tokenizers
+
+import (
+	"errors"
+
+	"github.com/blevesearch/bleve/analysis"
+	"github.com/blevesearch/bleve/registry"
+)
+
+type JiebaAnalyzer struct {
+}
+
+func analyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
+	tokenizerName, ok := config["tokenizer"].(string)
+	if !ok {
+		return nil, errors.New("must specify tokenizer")
+	}
+	tokenizer, err := cache.TokenizerNamed(tokenizerName)
+	if err != nil {
+		return nil, err
+	}
+	alz := &analysis.Analyzer{
+		Tokenizer: tokenizer,
+	}
+	return alz, nil
+}
+
+func init() {
+	registry.RegisterAnalyzer("jieba", analyzerConstructor)
+}
--- a/tokenizers/example_bleve_test.go
+++ b/tokenizers/example_bleve_test.go
@@ -6,7 +6,7 @@ import (
 	"os"

 	"github.com/blevesearch/bleve"
-	_ "github.com/wangbin/jiebago/tokenizers"
+	_ "github.com/fumiama/jieba/tokenizers"
 )

 func Example_beleveSearch() {
@@ -101,26 +101,26 @@ func Example_beleveSearch() {
 	// Output:
 	// Result of "水果世博园": 2 matches:
 	// 1. Doc 3, (1.099550)
-	// Name: 买<span class="highlight">水果</span>然后来<span class="highlight">世博</span>园。
+	// Name: 买<mark>水果</mark>然后来<mark>世博</mark>园。
 	// 2. Doc 2, (0.031941)
-	// Name: The second one 你 中文测试中文 is even more interesting! 吃<span class="highlight">水果</span>
+	// Name: The second one 你 中文测试中文 is even more interesting! 吃<mark>水果</mark>
 	// Result of "你": 1 matches:
 	// 1. Doc 2, (0.391161)
-	// Name: The second one <span class="highlight">你</span> 中文测试中文 is even more interesting! 吃水果
+	// Name: The second one <mark>你</mark> 中文测试中文 is even more interesting! 吃水果
 	// Result of "first": 1 matches:
 	// 1. Doc 1, (0.512150)
-	// Name: This is the <span class="highlight">first</span> document we’ve added
+	// Name: This is the <mark>first</mark> document we’ve added
 	// Result of "中文": 1 matches:
 	// 1. Doc 2, (0.553186)
-	// Name: The second one 你 <span class="highlight">中文</span>测试<span class="highlight">中文</span> is even more interesting! 吃水果
+	// Name: The second one 你 <mark>中文</mark>测试<mark>中文</mark> is even more interesting! 吃水果
 	// Result of "交换机": 2 matches:
 	// 1. Doc 4, (0.608495)
-	// Name: 工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换机</span>等技术性器件的安装工作
+	// Name: 工信处女干事每月经过下属科室都要亲口交代24口<mark>交换机</mark>等技术性器件的安装工作
 	// 2. Doc 5, (0.086700)
-	// Name: 咱俩<span class="highlight">交换</span>一下吧。
+	// Name: 咱俩<mark>交换</mark>一下吧。
 	// Result of "交换": 2 matches:
 	// 1. Doc 5, (0.534158)
-	// Name: 咱俩<span class="highlight">交换</span>一下吧。
+	// Name: 咱俩<mark>交换</mark>一下吧。
 	// 2. Doc 4, (0.296297)
-	// Name: 工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换</span>机等技术性器件的安装工作
+	// Name: 工信处女干事每月经过下属科室都要亲口交代24口<mark>交换</mark>机等技术性器件的安装工作
 }
--- a/tokenizers/example_test.go
+++ b/tokenizers/example_test.go
@@ -3,7 +3,7 @@ package tokenizers_test
 import (
 	"fmt"

-	"github.com/wangbin/jiebago/tokenizers"
+	"github.com/fumiama/jieba/tokenizers"
 )

 func Example() {
--- a/tokenizers/tokenizer.go
+++ b/tokenizers/tokenizer.go
@@ -7,7 +7,7 @@ import (

 	"github.com/blevesearch/bleve/analysis"
 	"github.com/blevesearch/bleve/registry"
-	"github.com/wangbin/jiebago"
+	jiebago "github.com/fumiama/jieba"
 )

 // Name is the jieba tokenizer name.
@@ -26,20 +26,20 @@ NewJiebaTokenizer creates a new JiebaTokenizer.

 Parameters:

-    dictFilePath: path of the dictioanry file.
+	dictFilePath: path of the dictionary file.

-    hmm: whether to use Hidden Markov Model to cut unknown words,
-    i.e. not found in dictionary. For example word "安卓" (means "Android" in
-    English) not in the dictionary file. If hmm is set to false, it will be
-    cutted into two single words "安" and "卓", if hmm is set to true, it will
-    be traded as one single word because Jieba using Hidden Markov Model with
-    Viterbi algorithm to guess the best possibility.
+	hmm: whether to use a Hidden Markov Model to cut unknown words,
+	i.e. words not found in the dictionary. For example, the word "安卓" (meaning
+	"Android" in English) is not in the dictionary file. If hmm is set to false,
+	it will be cut into two single words "安" and "卓"; if hmm is set to true, it
+	will be treated as one single word, because Jieba uses a Hidden Markov Model
+	with the Viterbi algorithm to guess the best possibility.

-    searchMode: whether to further cut long words into serveral short words.
-    In Chinese, some long words may contains other words, for example "交换机"
-    is a Chinese word for "Switcher", if sechMode is false, it will trade
-    "交换机" as a single word. If searchMode is true, it will further split
-    this word into "交换", "换机", which are valid Chinese words.
+	searchMode: whether to further cut long words into several short words.
+	In Chinese, some long words may contain other words. For example, "交换机"
+	is a Chinese word for "Switcher"; if searchMode is false, it will treat
+	"交换机" as a single word. If searchMode is true, it will further split
+	this word into "交换", "换机", which are valid Chinese words.
 */
 func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
 	var seg jiebago.Segmenter
@@ -60,7 +60,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
 	pos := 1
 	var width int
 	var gram string
-	for word := range jt.seg.Cut(string(input), jt.hmm) {
+	for _, word := range jt.seg.Cut(string(input), jt.hmm) {
 		if jt.searchMode {
 			runes := []rune(word)
 			width = len(runes)
@@ -107,11 +107,11 @@ JiebaTokenizerConstructor creates a JiebaTokenizer.

 Parameter config should contains at least one parameter:

-    file: the path of the dictionary file.
+	file: the path of the dictionary file.

-    hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
+	hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.

-    search: optional, speficy whether to use search mode, see NewJiebaTokenizer for details.
+	search: optional, specify whether to use search mode, see NewJiebaTokenizer for details.
 */
 func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
 	analysis.Tokenizer, error) {
--- a/tokenizers/tokenizer_test.go
+++ b/tokenizers/tokenizer_test.go