1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-12 21:20:26 +08:00

moved tokenizers to a separate module

This commit is contained in:
Wang Bin
2015-05-07 18:52:29 +08:00
parent 7440fa00df
commit 3d91f615cf
6 changed files with 62 additions and 51 deletions

View File

@@ -0,0 +1,126 @@
package tokenizers_test
import (
"fmt"
"log"
"os"
"github.com/blevesearch/bleve"
_ "github.com/wangbin/jiebago/tokenizers"
)
// Example_beleveSearch shows how to plug the jieba tokenizer into a bleve
// index: it registers a custom tokenizer and analyzer named "jieba", indexes
// a handful of mixed Chinese/English documents and prints the highlighted
// hits for several query strings.
func Example_beleveSearch() {
	// open a new index
	indexMapping := bleve.NewIndexMapping()
	err := indexMapping.AddCustomTokenizer("jieba",
		map[string]interface{}{
			"file": "../dict.txt",
			"type": "jieba",
		})
	if err != nil {
		log.Fatal(err)
	}

	// create a custom analyzer
	err = indexMapping.AddCustomAnalyzer("jieba",
		map[string]interface{}{
			"type":      "custom",
			"tokenizer": "jieba",
			"token_filters": []string{
				"possessive_en",
				"to_lower",
				"stop_en",
			},
		})
	if err != nil {
		log.Fatal(err)
	}

	indexMapping.DefaultAnalyzer = "jieba"
	cacheDir := "jieba.beleve"
	// Start from a clean directory; a stale index left by a previous run
	// would otherwise make bleve.New fail.
	if err := os.RemoveAll(cacheDir); err != nil {
		log.Fatal(err)
	}
	index, err := bleve.New(cacheDir, indexMapping)
	if err != nil {
		log.Fatal(err)
	}
	// Release the index's underlying storage when the example finishes.
	defer index.Close()

	docs := []struct {
		Title string
		Name  string
	}{
		{
			Title: "Doc 1",
			Name:  "This is the first document weve added",
		},
		{
			Title: "Doc 2",
			Name:  "The second one 你 中文测试中文 is even more interesting! 吃水果",
		},
		{
			Title: "Doc 3",
			Name:  "买水果然后来世博园。",
		},
		{
			Title: "Doc 4",
			Name:  "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
		},
		{
			Title: "Doc 5",
			Name:  "咱俩交换一下吧。",
		},
	}

	// index docs
	for _, doc := range docs {
		// A document that fails to index would silently change the search
		// results below, so stop immediately instead of ignoring the error.
		if err := index.Index(doc.Title, doc); err != nil {
			log.Fatal(err)
		}
	}

	// search for some text
	for _, keyword := range []string{"水果世博园", "你", "first", "中文", "交换机", "交换"} {
		query := bleve.NewQueryStringQuery(keyword)
		search := bleve.NewSearchRequest(query)
		search.Highlight = bleve.NewHighlight()
		searchResults, err := index.Search(search)
		if err != nil {
			log.Fatal(err)
		}
		fmt.Printf("Result of \"%s\": %d matches:\n", keyword, searchResults.Total)
		for i, hit := range searchResults.Hits {
			rv := fmt.Sprintf("%d. %s, (%f)\n", i+searchResults.Request.From+1, hit.ID, hit.Score)
			for fragmentField, fragments := range hit.Fragments {
				rv += fmt.Sprintf("%s: ", fragmentField)
				for _, fragment := range fragments {
					rv += fmt.Sprintf("%s", fragment)
				}
			}
			fmt.Printf("%s\n", rv)
		}
	}
	// Output:
	// Result of "水果世博园": 2 matches:
	// 1. Doc 3, (1.099550)
	// Name: 买<span class="highlight">水果</span>然后来<span class="highlight">世博</span>园。
	// 2. Doc 2, (0.031941)
	// Name: The second one 你 中文测试中文 is even more interesting! 吃<span class="highlight">水果</span>
	// Result of "你": 1 matches:
	// 1. Doc 2, (0.391161)
	// Name: The second one <span class="highlight">你</span> 中文测试中文 is even more interesting! 吃水果
	// Result of "first": 1 matches:
	// 1. Doc 1, (0.512150)
	// Name: This is the <span class="highlight">first</span> document weve added
	// Result of "中文": 1 matches:
	// 1. Doc 2, (0.553186)
	// Name: The second one 你 <span class="highlight">中文</span>测试<span class="highlight">中文</span> is even more interesting! 吃水果
	// Result of "交换机": 2 matches:
	// 1. Doc 4, (0.608495)
	// Name: 工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换机</span>等技术性器件的安装工作
	// 2. Doc 5, (0.086700)
	// Name: 咱俩<span class="highlight">交换</span>一下吧。
	// Result of "交换": 2 matches:
	// 1. Doc 5, (0.534158)
	// Name: 咱俩<span class="highlight">交换</span>一下吧。
	// 2. Doc 4, (0.296297)
	// Name: 工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换</span>机等技术性器件的安装工作
}

View File

@@ -0,0 +1,42 @@
package tokenizers_test
import (
"fmt"
"github.com/wangbin/jiebago/tokenizers"
)
// Example tokenizes one sentence twice, first in default mode and then in
// search mode, and prints every token with its byte offsets, position and
// type.
func Example() {
	sentence := []byte("永和服装饰品有限公司")

	// default mode
	tokenizer, err := tokenizers.NewJiebaTokenizer("../dict.txt", true, false)
	if err != nil {
		// Surface the real cause (e.g. a missing dictionary file) instead of
		// continuing and producing misleading output.
		fmt.Println(err)
		return
	}
	fmt.Println("Default Mode:")
	for _, token := range tokenizer.Tokenize(sentence) {
		fmt.Printf(
			"Term: %s Start: %d End: %d Position: %d Type: %d\n",
			token.Term, token.Start, token.End, token.Position, token.Type)
	}

	//search mode
	tokenizer, err = tokenizers.NewJiebaTokenizer("../dict.txt", true, true)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println("Search Mode:")
	for _, token := range tokenizer.Tokenize(sentence) {
		fmt.Printf(
			"Term: %s Start: %d End: %d Position: %d Type: %d\n",
			token.Term, token.Start, token.End, token.Position, token.Type)
	}
	// Output:
	// Default Mode:
	// Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
	// Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
	// Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
	// Term: 有限公司 Start: 18 End: 30 Position: 4 Type: 1
	// Search Mode:
	// Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
	// Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
	// Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
	// Term: 有限 Start: 18 End: 24 Position: 4 Type: 1
	// Term: 公司 Start: 24 End: 30 Position: 5 Type: 1
	// Term: 有限公司 Start: 18 End: 30 Position: 6 Type: 1
}

147
tokenizers/tokenizer.go Normal file
View File

@@ -0,0 +1,147 @@
package tokenizers
import (
"fmt"
"regexp"
"strconv"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/wangbin/jiebago"
)
// Name is the jieba tokenizer name; init registers the tokenizer constructor
// with the bleve registry under this name.
const Name = "jieba"
// ideographRegexp matches a run of one or more Han-script characters. It is
// used by detectTokenType to recognise terms that contain ideographs.
var ideographRegexp = regexp.MustCompile(`\p{Han}+`)
// JiebaTokenizer is the bleve tokenizer for jiebago.
type JiebaTokenizer struct {
	// seg is the jiebago segmenter that cuts the input into words and owns
	// the dictionary consulted in search mode.
	seg jiebago.Segmenter
	// hmm is forwarded to the segmenter's Cut call; see NewJiebaTokenizer.
	hmm bool
	// searchMode makes Tokenize additionally emit dictionary sub-words of
	// each word; see NewJiebaTokenizer.
	searchMode bool
}
/*
NewJiebaTokenizer creates a new JiebaTokenizer.

Parameters:

	dictFilePath: path of the dictionary file.

	hmm: whether to use the Hidden Markov Model to cut unknown words,
	i.e. words not found in the dictionary. For example the word "安卓" (meaning
	"Android" in English) is not in the dictionary file. If hmm is false it
	is cut into the two single-character words "安" and "卓"; if hmm is true
	it is treated as one word, because Jieba uses a Hidden Markov Model with
	the Viterbi algorithm to guess the most likely segmentation.

	searchMode: whether to further cut long words into several short words.
	In Chinese some long words contain other words, for example "交换机" is
	the Chinese word for "Switcher". If searchMode is false "交换机" is
	treated as a single word. If searchMode is true the tokenizer also emits
	the shorter words it contains, such as "交换" and "换机", in addition to
	the word itself.

If the dictionary cannot be loaded, the returned tokenizer is nil and the
error from loading is returned unchanged.
*/
func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
	var seg jiebago.Segmenter
	if err := seg.LoadDictionary(dictFilePath); err != nil {
		// Return a literal nil so callers never receive a tokenizer backed
		// by a dictionary that failed to load.
		return nil, err
	}
	return &JiebaTokenizer{
		seg:        seg,
		hmm:        hmm,
		searchMode: searchMode,
	}, nil
}
// Tokenize cuts input into a bleve token stream.
//
// Each word produced by the segmenter becomes one token. Start and End are
// byte offsets; they are accumulated from the byte length of every word, so
// this assumes Cut yields the pieces of input consecutively and in order.
// Position starts at 1 and increases by one for every emitted token.
//
// In search mode, before a word's own token is emitted, every 2-rune and then
// every 3-rune substring of the word that is strictly shorter than the word
// and has a positive frequency in the dictionary is emitted as a token too.
func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
	rv := make(analysis.TokenStream, 0)
	start := 0
	pos := 1
	dict := jt.seg.Dictionary()
	for word := range jt.seg.Cut(string(input), jt.hmm) {
		if jt.searchMode {
			// bounds[i] is the byte offset of the i-th rune of word, with a
			// final entry for len(word). Computing the boundaries once lets
			// each gram be sliced directly instead of re-encoding the rune
			// prefix for every candidate.
			bounds := make([]int, 0, len(word)+1)
			for offset := range word {
				bounds = append(bounds, offset)
			}
			bounds = append(bounds, len(word))
			width := len(bounds) - 1 // number of runes in word

			for _, step := range [2]int{2, 3} {
				// A gram as long as the word is the word itself, which is
				// emitted below; only strictly shorter grams are considered.
				if width <= step {
					continue
				}
				for i := 0; i+step <= width; i++ {
					gram := word[bounds[i]:bounds[i+step]]
					if frequency, ok := dict.Frequency(gram); ok && frequency > 0 {
						rv = append(rv, &analysis.Token{
							Term:     []byte(gram),
							Start:    start + bounds[i],
							End:      start + bounds[i+step],
							Position: pos,
							Type:     detectTokenType(gram),
						})
						pos++
					}
				}
			}
		}

		end := start + len(word)
		rv = append(rv, &analysis.Token{
			Term:     []byte(word),
			Start:    start,
			End:      end,
			Position: pos,
			Type:     detectTokenType(word),
		})
		pos++
		start = end
	}
	return rv
}
/*
JiebaTokenizerConstructor creates a JiebaTokenizer from a bleve configuration
map.

The config map must contain at least one entry:

	file: the path of the dictionary file (string, required).

	hmm: optional bool, whether to use the Hidden Markov Model, see
	NewJiebaTokenizer for details. Treated as true when absent or not a bool.

	search: optional bool, whether to use search mode, see NewJiebaTokenizer
	for details. Treated as true when absent or not a bool.

The cache argument is not used.
*/
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
	analysis.Tokenizer, error) {
	path, isString := config["file"].(string)
	if !isString {
		return nil, fmt.Errorf("must specify dictionary file path")
	}

	// Both switches are on unless the config carries an explicit bool.
	useHMM := true
	if v, isBool := config["hmm"].(bool); isBool {
		useHMM = v
	}
	useSearch := true
	if v, isBool := config["search"].(bool); isBool {
		useSearch = v
	}

	return NewJiebaTokenizer(path, useHMM, useSearch)
}
// detectTokenType classifies term for bleve: Ideographic when it contains at
// least one Han character, Numeric when it parses as a floating-point number,
// and AlphaNumeric otherwise.
func detectTokenType(term string) analysis.TokenType {
	if ideographRegexp.MatchString(term) {
		return analysis.Ideographic
	}
	// strconv.ParseFloat also accepts the spellings "NaN", "Inf" and
	// "Infinity" (any case, optionally signed). Those are ordinary words in
	// running text, so a term only counts as numeric when it also contains
	// at least one digit.
	if _, err := strconv.ParseFloat(term, 64); err == nil && hasASCIIDigit(term) {
		return analysis.Numeric
	}
	return analysis.AlphaNumeric
}

// hasASCIIDigit reports whether s contains at least one byte in '0'..'9'.
func hasASCIIDigit(s string) bool {
	for i := 0; i < len(s); i++ {
		if '0' <= s[i] && s[i] <= '9' {
			return true
		}
	}
	return false
}
// init registers JiebaTokenizerConstructor with the bleve registry under
// Name, so importing this package (even with a blank import) is enough to
// make the "jieba" tokenizer type available.
func init() {
registry.RegisterTokenizer(Name, JiebaTokenizerConstructor)
}

22516
tokenizers/tokenizer_test.go Normal file

File diff suppressed because it is too large Load Diff