1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-23 12:40:39 +08:00

优化 tokenizer

This commit is contained in:
源文雨
2022-11-30 15:39:50 +08:00
parent a433e052c5
commit 4d76899e79
2 changed files with 6 additions and 13 deletions

View File

@@ -7,9 +7,6 @@ import (
"github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/registry"
) )
type JiebaAnalyzer struct {
}
func analyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { func analyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
tokenizerName, ok := config["tokenizer"].(string) tokenizerName, ok := config["tokenizer"].(string)
if !ok { if !ok {

View File

@@ -8,6 +8,7 @@ import (
"github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/registry"
jieba "github.com/fumiama/jieba" jieba "github.com/fumiama/jieba"
"github.com/fumiama/jieba/util/helper"
) )
// Name is the jieba tokenizer name. // Name is the jieba tokenizer name.
@@ -83,30 +84,26 @@ func NewJiebaTokenizerAt(dictFilePath string, hmm, searchMode bool) (analysis.To
// Tokenize cuts input into bleve token stream. // Tokenize cuts input into bleve token stream.
func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream { func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
rv := make(analysis.TokenStream, 0) rv := make(analysis.TokenStream, 0, 256)
runeStart := 0
start := 0 start := 0
end := 0 end := 0
pos := 1 pos := 1
var width int
var gram string
for _, word := range jt.seg.Cut(string(input), jt.hmm) { for _, word := range jt.seg.Cut(string(input), jt.hmm) {
if jt.searchMode { if jt.searchMode {
runes := []rune(word) runes := []rune(word)
width = len(runes) width := len(runes)
for _, step := range [2]int{2, 3} { for _, step := range [2]int{2, 3} {
if width <= step { if width <= step {
continue continue
} }
for i := 0; i < width-step+1; i++ { for i := 0; i < width-step+1; i++ {
gram = string(runes[i : i+step]) gram := string(runes[i : i+step])
gramLen := len(gram)
if frequency, ok := jt.seg.Frequency(gram); ok && frequency > 0 { if frequency, ok := jt.seg.Frequency(gram); ok && frequency > 0 {
gramStart := start + len(string(runes[:i])) gramStart := start + len(string(runes[:i]))
token := analysis.Token{ token := analysis.Token{
Term: []byte(gram), Term: helper.StringToBytes(gram),
Start: gramStart, Start: gramStart,
End: gramStart + gramLen, End: gramStart + len(gram),
Position: pos, Position: pos,
Type: detectTokenType(gram), Type: detectTokenType(gram),
} }
@@ -126,7 +123,6 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
} }
rv = append(rv, &token) rv = append(rv, &token)
pos++ pos++
runeStart += width
start = end start = end
} }
return rv return rv