1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00
Files
jieba/tokenizers/example_bleve_test.go
2015-05-07 18:52:29 +08:00

127 lines
3.6 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package tokenizers_test
import (
"fmt"
"log"
"os"
"github.com/blevesearch/bleve"
_ "github.com/wangbin/jiebago/tokenizers"
)
func Example_beleveSearch() {
// open a new index
indexMapping := bleve.NewIndexMapping()
err := indexMapping.AddCustomTokenizer("jieba",
map[string]interface{}{
"file": "../dict.txt",
"type": "jieba",
})
if err != nil {
log.Fatal(err)
}
// create a custom analyzer
err = indexMapping.AddCustomAnalyzer("jieba",
map[string]interface{}{
"type": "custom",
"tokenizer": "jieba",
"token_filters": []string{
"possessive_en",
"to_lower",
"stop_en",
},
})
if err != nil {
log.Fatal(err)
}
indexMapping.DefaultAnalyzer = "jieba"
cacheDir := "jieba.beleve"
os.RemoveAll(cacheDir)
index, err := bleve.New(cacheDir, indexMapping)
if err != nil {
log.Fatal(err)
}
docs := []struct {
Title string
Name string
}{
{
Title: "Doc 1",
Name: "This is the first document weve added",
},
{
Title: "Doc 2",
Name: "The second one 你 中文测试中文 is even more interesting! 吃水果",
},
{
Title: "Doc 3",
Name: "买水果然后来世博园。",
},
{
Title: "Doc 4",
Name: "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
},
{
Title: "Doc 5",
Name: "咱俩交换一下吧。",
},
}
// index docs
for _, doc := range docs {
index.Index(doc.Title, doc)
}
// search for some text
for _, keyword := range []string{"水果世博园", "你", "first", "中文", "交换机", "交换"} {
query := bleve.NewQueryStringQuery(keyword)
search := bleve.NewSearchRequest(query)
search.Highlight = bleve.NewHighlight()
searchResults, err := index.Search(search)
if err != nil {
log.Fatal(err)
}
fmt.Printf("Result of \"%s\": %d matches:\n", keyword, searchResults.Total)
for i, hit := range searchResults.Hits {
rv := fmt.Sprintf("%d. %s, (%f)\n", i+searchResults.Request.From+1, hit.ID, hit.Score)
for fragmentField, fragments := range hit.Fragments {
rv += fmt.Sprintf("%s: ", fragmentField)
for _, fragment := range fragments {
rv += fmt.Sprintf("%s", fragment)
}
}
fmt.Printf("%s\n", rv)
}
}
// Output:
// Result of "水果世博园": 2 matches:
// 1. Doc 3, (1.099550)
// Name: 买<span class="highlight">水果</span>然后来<span class="highlight">世博</span>园。
// 2. Doc 2, (0.031941)
// Name: The second one 你 中文测试中文 is even more interesting! 吃<span class="highlight">水果</span>
// Result of "你": 1 matches:
// 1. Doc 2, (0.391161)
// Name: The second one <span class="highlight">你</span> 中文测试中文 is even more interesting! 吃水果
// Result of "first": 1 matches:
// 1. Doc 1, (0.512150)
// Name: This is the <span class="highlight">first</span> document weve added
// Result of "中文": 1 matches:
// 1. Doc 2, (0.553186)
// Name: The second one 你 <span class="highlight">中文</span>测试<span class="highlight">中文</span> is even more interesting! 吃水果
// Result of "交换机": 2 matches:
// 1. Doc 4, (0.608495)
// Name: 工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换机</span>等技术性器件的安装工作
// 2. Doc 5, (0.086700)
// Name: 咱俩<span class="highlight">交换</span>一下吧。
// Result of "交换": 2 matches:
// 1. Doc 5, (0.534158)
// Name: 咱俩<span class="highlight">交换</span>一下吧。
// 2. Doc 4, (0.296297)
// Name: 工信处女干事每月经过下属科室都要亲口交代24口<span class="highlight">交换</span>机等技术性器件的安装工作
}