mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 00:32:51 +08:00
优化
This commit is contained in:
30
tokenizers/analyzer.go
Executable file
30
tokenizers/analyzer.go
Executable file
@@ -0,0 +1,30 @@
|
||||
package tokenizers
|
||||
|
||||
import (
|
||||
"errors"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
// JiebaAnalyzer is an empty placeholder type: it declares no fields, and
// nothing in the visible part of this file refers to it.
type JiebaAnalyzer struct{}
|
||||
|
||||
func analyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
tokenizerName, ok := config["tokenizer"].(string)
|
||||
if !ok {
|
||||
return nil, errors.New("must specify tokenizer")
|
||||
}
|
||||
tokenizer, err := cache.TokenizerNamed(tokenizerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
alz := &analysis.Analyzer{
|
||||
Tokenizer: tokenizer,
|
||||
}
|
||||
return alz, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterAnalyzer("jieba", analyzerConstructor)
|
||||
}
|
||||
20
tokenizers/example_bleve_test.go
Normal file → Executable file
20
tokenizers/example_bleve_test.go
Normal file → Executable file
@@ -6,7 +6,7 @@ import (
|
||||
"os"
|
||||
|
||||
"github.com/blevesearch/bleve"
|
||||
_ "github.com/wangbin/jiebago/tokenizers"
|
||||
_ "github.com/fumiama/jieba/tokenizers"
|
||||
)
|
||||
|
||||
func Example_beleveSearch() {
|
||||
@@ -101,26 +101,26 @@ func Example_beleveSearch() {
|
||||
// Output:
|
||||
// Result of "水果世博园": 2 matches:
|
||||
// 1. Doc 3, (1.099550)
|
||||
// Name: 买<span class="highlight">水果</span>然后来<span class="highlight">世博</span>园。
|
||||
// Name: 买<mark>水果</mark>然后来<mark>世博</mark>园。
|
||||
// 2. Doc 2, (0.031941)
|
||||
// Name: The second one 你 中文测试中文 is even more interesting! 吃<span class="highlight">水果</span>
|
||||
// Name: The second one 你 中文测试中文 is even more interesting! 吃<mark>水果</mark>
|
||||
// Result of "你": 1 matches:
|
||||
// 1. Doc 2, (0.391161)
|
||||
// Name: The second one <span class="highlight">你</span> 中文测试中文 is even more interesting! 吃水果
|
||||
// Name: The second one <mark>你</mark> 中文测试中文 is even more interesting! 吃水果
|
||||
// Result of "first": 1 matches:
|
||||
// 1. Doc 1, (0.512150)
|
||||
// Name: This is the <span class="highlight">first</span> document we’ve added
|
||||
// Name: This is the <mark>first</mark> document we’ve added
|
||||
// Result of "中文": 1 matches:
|
||||
// 1. Doc 2, (0.553186)
|
||||
// Name: The second one 你 <span class="highlight">中文</span>测试<span class="highlight">中文</span> is even more interesting! 吃水果
|
||||
// Name: The second one 你 <mark>中文</mark>测试<mark>中文</mark> is even more interesting! 吃水果
|
||||
// Result of "交换机": 2 matches:
|
||||
// 1. Doc 4, (0.608495)
|
||||
// Name: 工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换机</span>等技术性器件的安装工作
|
||||
// Name: 工信处女干事每月经过下属科室都要亲口交代24口<mark>交换机</mark>等技术性器件的安装工作
|
||||
// 2. Doc 5, (0.086700)
|
||||
// Name: 咱俩<span class="highlight">交换</span>一下吧。
|
||||
// Name: 咱俩<mark>交换</mark>一下吧。
|
||||
// Result of "交换": 2 matches:
|
||||
// 1. Doc 5, (0.534158)
|
||||
// Name: 咱俩<span class="highlight">交换</span>一下吧。
|
||||
// Name: 咱俩<mark>交换</mark>一下吧。
|
||||
// 2. Doc 4, (0.296297)
|
||||
// Name: 工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换</span>机等技术性器件的安装工作
|
||||
// Name: 工信处女干事每月经过下属科室都要亲口交代24口<mark>交换</mark>机等技术性器件的安装工作
|
||||
}
|
||||
|
||||
2
tokenizers/example_test.go
Normal file → Executable file
2
tokenizers/example_test.go
Normal file → Executable file
@@ -3,7 +3,7 @@ package tokenizers_test
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/wangbin/jiebago/tokenizers"
|
||||
"github.com/fumiama/jieba/tokenizers"
|
||||
)
|
||||
|
||||
func Example() {
|
||||
|
||||
34
tokenizers/tokenizer.go
Normal file → Executable file
34
tokenizers/tokenizer.go
Normal file → Executable file
@@ -7,7 +7,7 @@ import (
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
"github.com/wangbin/jiebago"
|
||||
jiebago "github.com/fumiama/jieba"
|
||||
)
|
||||
|
||||
// Name is the jieba tokenizer name.
|
||||
@@ -26,20 +26,20 @@ NewJiebaTokenizer creates a new JiebaTokenizer.
|
||||
|
||||
Parameters:
|
||||
|
||||
dictFilePath: path of the dictioanry file.
|
||||
dictFilePath: path of the dictionary file.
|
||||
|
||||
hmm: whether to use Hidden Markov Model to cut unknown words,
|
||||
i.e. not found in dictionary. For example word "安卓" (means "Android" in
|
||||
English) not in the dictionary file. If hmm is set to false, it will be
|
||||
cutted into two single words "安" and "卓", if hmm is set to true, it will
|
||||
be traded as one single word because Jieba using Hidden Markov Model with
|
||||
Viterbi algorithm to guess the best possibility.
|
||||
hmm: whether to use Hidden Markov Model to cut unknown words,
|
||||
i.e. not found in dictionary. For example word "安卓" (means "Android" in
|
||||
English) not in the dictionary file. If hmm is set to false, it will be
|
||||
cut into two single words "安" and "卓", if hmm is set to true, it will
|
||||
be treated as one single word because Jieba uses a Hidden Markov Model with
|
||||
Viterbi algorithm to guess the best possibility.
|
||||
|
||||
searchMode: whether to further cut long words into serveral short words.
|
||||
In Chinese, some long words may contains other words, for example "交换机"
|
||||
is a Chinese word for "Switcher", if sechMode is false, it will trade
|
||||
"交换机" as a single word. If searchMode is true, it will further split
|
||||
this word into "交换", "换机", which are valid Chinese words.
|
||||
searchMode: whether to further cut long words into several short words.
|
||||
In Chinese, some long words may contain other words, for example "交换机"
|
||||
is a Chinese word for "Switcher", if searchMode is false, it will treat
|
||||
"交换机" as a single word. If searchMode is true, it will further split
|
||||
this word into "交换", "换机", which are valid Chinese words.
|
||||
*/
|
||||
func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||
var seg jiebago.Segmenter
|
||||
@@ -60,7 +60,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||
pos := 1
|
||||
var width int
|
||||
var gram string
|
||||
for word := range jt.seg.Cut(string(input), jt.hmm) {
|
||||
for _, word := range jt.seg.Cut(string(input), jt.hmm) {
|
||||
if jt.searchMode {
|
||||
runes := []rune(word)
|
||||
width = len(runes)
|
||||
@@ -107,11 +107,11 @@ JiebaTokenizerConstructor creates a JiebaTokenizer.
|
||||
|
||||
Parameter config should contain at least one parameter:
|
||||
|
||||
file: the path of the dictionary file.
|
||||
file: the path of the dictionary file.
|
||||
|
||||
hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
|
||||
hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
|
||||
|
||||
search: optional, speficy whether to use search mode, see NewJiebaTokenizer for details.
|
||||
search: optional, specify whether to use search mode, see NewJiebaTokenizer for details.
|
||||
*/
|
||||
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
|
||||
analysis.Tokenizer, error) {
|
||||
|
||||
0
tokenizers/tokenizer_test.go
Normal file → Executable file
0
tokenizers/tokenizer_test.go
Normal file → Executable file
Reference in New Issue
Block a user