1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00
This commit is contained in:
源文雨
2022-11-30 12:18:15 +08:00
parent ab8b95ef87
commit 8bbc755ed4
48 changed files with 984 additions and 859 deletions

30
tokenizers/analyzer.go Executable file
View File

@@ -0,0 +1,30 @@
package tokenizers
import (
"errors"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
// JiebaAnalyzer is an empty marker type declared alongside the "jieba"
// analyzer registration. It has no fields, and nothing in this file
// references it.
type JiebaAnalyzer struct{}
// analyzerConstructor builds a bleve analyzer whose tokenizer is looked up by
// name in the registry cache.
//
// config must contain a "tokenizer" entry holding a string; that string is
// the name passed to cache.TokenizerNamed. If the entry is missing or is not
// a string, the error "must specify tokenizer" is returned. Any error from
// the cache lookup is returned unchanged.
//
// The resulting analyzer has only its Tokenizer field set.
func analyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	name, isString := config["tokenizer"].(string)
	if !isString {
		return nil, errors.New("must specify tokenizer")
	}

	tok, lookupErr := cache.TokenizerNamed(name)
	if lookupErr != nil {
		return nil, lookupErr
	}

	return &analysis.Analyzer{Tokenizer: tok}, nil
}
// init registers analyzerConstructor with the bleve registry under the
// analyzer name "jieba" when the package is loaded.
func init() {
	const analyzerName = "jieba"
	registry.RegisterAnalyzer(analyzerName, analyzerConstructor)
}

20
tokenizers/example_bleve_test.go Normal file → Executable file
View File

@@ -6,7 +6,7 @@ import (
"os"
"github.com/blevesearch/bleve"
_ "github.com/wangbin/jiebago/tokenizers"
_ "github.com/fumiama/jieba/tokenizers"
)
func Example_beleveSearch() {
@@ -101,26 +101,26 @@ func Example_beleveSearch() {
// Output:
// Result of "水果世博园": 2 matches:
// 1. Doc 3, (1.099550)
// Name: 买<span class="highlight">水果</span>然后来<span class="highlight">世博</span>园。
// Name: 买<mark>水果</mark>然后来<mark>世博</mark>园。
// 2. Doc 2, (0.031941)
// Name: The second one 你 中文测试中文 is even more interesting! 吃<span class="highlight">水果</span>
// Name: The second one 你 中文测试中文 is even more interesting! 吃<mark>水果</mark>
// Result of "你": 1 matches:
// 1. Doc 2, (0.391161)
// Name: The second one <span class="highlight">你</span> 中文测试中文 is even more interesting! 吃水果
// Name: The second one <mark>你</mark> 中文测试中文 is even more interesting! 吃水果
// Result of "first": 1 matches:
// 1. Doc 1, (0.512150)
// Name: This is the <span class="highlight">first</span> document weve added
// Name: This is the <mark>first</mark> document weve added
// Result of "中文": 1 matches:
// 1. Doc 2, (0.553186)
// Name: The second one 你 <span class="highlight">中文</span>测试<span class="highlight">中文</span> is even more interesting! 吃水果
// Name: The second one 你 <mark>中文</mark>测试<mark>中文</mark> is even more interesting! 吃水果
// Result of "交换机": 2 matches:
// 1. Doc 4, (0.608495)
// Name: 工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换机</span>等技术性器件的安装工作
// Name: 工信处女干事每月经过下属科室都要亲口交代24口<mark>交换机</mark>等技术性器件的安装工作
// 2. Doc 5, (0.086700)
// Name: 咱俩<span class="highlight">交换</span>一下吧。
// Name: 咱俩<mark>交换</mark>一下吧。
// Result of "交换": 2 matches:
// 1. Doc 5, (0.534158)
// Name: 咱俩<span class="highlight">交换</span>一下吧。
// Name: 咱俩<mark>交换</mark>一下吧。
// 2. Doc 4, (0.296297)
// Name: 工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换</span>机等技术性器件的安装工作
// Name: 工信处女干事每月经过下属科室都要亲口交代24口<mark>交换</mark>机等技术性器件的安装工作
}

2
tokenizers/example_test.go Normal file → Executable file
View File

@@ -3,7 +3,7 @@ package tokenizers_test
import (
"fmt"
"github.com/wangbin/jiebago/tokenizers"
"github.com/fumiama/jieba/tokenizers"
)
func Example() {

34
tokenizers/tokenizer.go Normal file → Executable file
View File

@@ -7,7 +7,7 @@ import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/wangbin/jiebago"
jiebago "github.com/fumiama/jieba"
)
// Name is the jieba tokenizer name.
@@ -26,20 +26,20 @@ NewJiebaTokenizer creates a new JiebaTokenizer.
Parameters:
dictFilePath: path of the dictioanry file.
dictFilePath: path of the dictionary file.
hmm: whether to use Hidden Markov Model to cut unknown words,
i.e. not found in dictionary. For example word "安卓" (means "Android" in
English) not in the dictionary file. If hmm is set to false, it will be
cutted into two single words "安" and "卓", if hmm is set to true, it will
be traded as one single word because Jieba using Hidden Markov Model with
Viterbi algorithm to guess the best possibility.
hmm: whether to use Hidden Markov Model to cut unknown words,
i.e. not found in dictionary. For example word "安卓" (means "Android" in
English) not in the dictionary file. If hmm is set to false, it will be
cut into two single words "安" and "卓", if hmm is set to true, it will
be treated as one single word because Jieba uses a Hidden Markov Model with
the Viterbi algorithm to guess the best possibility.
searchMode: whether to further cut long words into serveral short words.
In Chinese, some long words may contains other words, for example "交换机"
is a Chinese word for "Switcher", if sechMode is false, it will trade
"交换机" as a single word. If searchMode is true, it will further split
this word into "交换", "换机", which are valid Chinese words.
searchMode: whether to further cut long words into several short words.
In Chinese, some long words may contain other words, for example "交换机"
is a Chinese word for "Switcher", if searchMode is false, it will treat
"交换机" as a single word. If searchMode is true, it will further split
this word into "交换", "换机", which are valid Chinese words.
*/
func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
var seg jiebago.Segmenter
@@ -60,7 +60,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
pos := 1
var width int
var gram string
for word := range jt.seg.Cut(string(input), jt.hmm) {
for _, word := range jt.seg.Cut(string(input), jt.hmm) {
if jt.searchMode {
runes := []rune(word)
width = len(runes)
@@ -107,11 +107,11 @@ JiebaTokenizerConstructor creates a JiebaTokenizer.
Parameter config should contain at least one parameter:
file: the path of the dictionary file.
file: the path of the dictionary file.
hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
search: optional, speficy whether to use search mode, see NewJiebaTokenizer for details.
search: optional, specify whether to use search mode, see NewJiebaTokenizer for details.
*/
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
analysis.Tokenizer, error) {

0
tokenizers/tokenizer_test.go Normal file → Executable file
View File