From 4d76899e79f8bf6b816d81fa8afe7352d79863b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=BA=90=E6=96=87=E9=9B=A8?= <41315874+fumiama@users.noreply.github.com> Date: Wed, 30 Nov 2022 15:39:50 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=20tokenizer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tokenizers/analyzer.go | 3 --- tokenizers/tokenizer.go | 16 ++++++---------- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/tokenizers/analyzer.go b/tokenizers/analyzer.go index 6af133a..8f8272c 100755 --- a/tokenizers/analyzer.go +++ b/tokenizers/analyzer.go @@ -7,9 +7,6 @@ import ( "github.com/blevesearch/bleve/registry" ) -type JiebaAnalyzer struct { -} - func analyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { tokenizerName, ok := config["tokenizer"].(string) if !ok { diff --git a/tokenizers/tokenizer.go b/tokenizers/tokenizer.go index e98dfc6..412fca0 100755 --- a/tokenizers/tokenizer.go +++ b/tokenizers/tokenizer.go @@ -8,6 +8,7 @@ import ( "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/registry" jieba "github.com/fumiama/jieba" + "github.com/fumiama/jieba/util/helper" ) // Name is the jieba tokenizer name. @@ -83,30 +84,26 @@ func NewJiebaTokenizerAt(dictFilePath string, hmm, searchMode bool) (analysis.To // Tokenize cuts input into bleve token stream. func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream { - rv := make(analysis.TokenStream, 0) - runeStart := 0 + rv := make(analysis.TokenStream, 0, 256) start := 0 end := 0 pos := 1 - var width int - var gram string for _, word := range jt.seg.Cut(string(input), jt.hmm) { if jt.searchMode { runes := []rune(word) - width = len(runes) + width := len(runes) for _, step := range [2]int{2, 3} { if width <= step { continue } for i := 0; i < width-step+1; i++ { - gram = string(runes[i : i+step]) - gramLen := len(gram) + gram := string(runes[i : i+step]) if frequency, ok := jt.seg.Frequency(gram); ok && frequency > 0 { gramStart := start + len(string(runes[:i])) token := analysis.Token{ - Term: []byte(gram), + Term: helper.StringToBytes(gram), Start: gramStart, - End: gramStart + gramLen, + End: gramStart + len(gram), Position: pos, Type: detectTokenType(gram), } @@ -126,7 +123,6 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream { } rv = append(rv, &token) pos++ - runeStart += width start = end } return rv