优化

2026-06-05 00:32:51 +08:00 · 2022-11-30 12:18:15 +08:00
parent ab8b95ef87
commit 8bbc755ed4
48 changed files with 984 additions and 859 deletions
--- a/tokenizers/tokenizer.go
+++ b/tokenizers/tokenizer.go
@@ -7,7 +7,7 @@ import (

 	"github.com/blevesearch/bleve/analysis"
 	"github.com/blevesearch/bleve/registry"
-	"github.com/wangbin/jiebago"
+	jiebago "github.com/fumiama/jieba"
 )

 // Name is the jieba tokenizer name.
@@ -26,20 +26,20 @@ NewJiebaTokenizer creates a new JiebaTokenizer.

 Parameters:

-    dictFilePath: path of the dictioanry file.
+	dictFilePath: path of the dictionary file.

-    hmm: whether to use Hidden Markov Model to cut unknown words,
-    i.e. not found in dictionary. For example word "安卓" (means "Android" in
-    English) not in the dictionary file. If hmm is set to false, it will be
-    cutted into two single words "安" and "卓", if hmm is set to true, it will
-    be traded as one single word because Jieba using Hidden Markov Model with
-    Viterbi algorithm to guess the best possibility.
+	hmm: whether to use a Hidden Markov Model to cut unknown words,
+	i.e. words not found in the dictionary. For example, the word "安卓" (meaning
+	"Android" in English) is not in the dictionary file. If hmm is set to false,
+	it will be cut into two single words "安" and "卓"; if hmm is set to true, it
+	will be treated as one single word, because Jieba uses a Hidden Markov Model
+	with the Viterbi algorithm to guess the best possibility.

-    searchMode: whether to further cut long words into serveral short words.
-    In Chinese, some long words may contains other words, for example "交换机"
-    is a Chinese word for "Switcher", if sechMode is false, it will trade
-    "交换机" as a single word. If searchMode is true, it will further split
-    this word into "交换", "换机", which are valid Chinese words.
+	searchMode: whether to further cut long words into several short words.
+	In Chinese, some long words may contain other words, for example "交换机"
+	is a Chinese word for "Switcher". If searchMode is false, it will treat
+	"交换机" as a single word. If searchMode is true, it will further split
+	this word into "交换", "换机", which are valid Chinese words.
 */
 func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
 	var seg jiebago.Segmenter
@@ -60,7 +60,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
 	pos := 1
 	var width int
 	var gram string
-	for word := range jt.seg.Cut(string(input), jt.hmm) {
+	for _, word := range jt.seg.Cut(string(input), jt.hmm) {
 		if jt.searchMode {
 			runes := []rune(word)
 			width = len(runes)
@@ -107,11 +107,11 @@ JiebaTokenizerConstructor creates a JiebaTokenizer.

 Parameter config should contain at least one parameter:

-    file: the path of the dictionary file.
+	file: the path of the dictionary file.

-    hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
+	hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.

-    search: optional, speficy whether to use search mode, see NewJiebaTokenizer for details.
+	search: optional, specify whether to use search mode, see NewJiebaTokenizer for details.
 */
 func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
 	analysis.Tokenizer, error) {