1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00

moved tokenizers to a separate module

This commit is contained in:
Wang Bin
2015-05-07 18:52:29 +08:00
parent 7440fa00df
commit 3d91f615cf
6 changed files with 62 additions and 51 deletions

View File

@@ -57,41 +57,3 @@ func Example_loadUserDictionary() {
// Before: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 / // Before: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
// After: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 / // After: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
} }
// Example_tokenize tokenizes one Chinese sentence with the jieba bleve
// tokenizer, first in default mode and then in search mode, and prints each
// token's term, byte offsets, position and type.
func Example_tokenize() {
	sentence := []byte("永和服装饰品有限公司")

	// default mode
	tokenizer, err := jiebago.NewJiebaTokenizer("dict.txt", true, false)
	if err != nil {
		// Surface a dictionary loading failure instead of silently continuing.
		fmt.Println(err)
		return
	}
	fmt.Println("Default Mode:")
	for _, token := range tokenizer.Tokenize(sentence) {
		fmt.Printf(
			"Term: %s Start: %d End: %d Position: %d Type: %d\n",
			token.Term, token.Start, token.End, token.Position, token.Type)
	}

	// search mode
	tokenizer, err = jiebago.NewJiebaTokenizer("dict.txt", true, true)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println("Search Mode:")
	for _, token := range tokenizer.Tokenize(sentence) {
		fmt.Printf(
			"Term: %s Start: %d End: %d Position: %d Type: %d\n",
			token.Term, token.Start, token.End, token.Position, token.Type)
	}
	// Output:
	// Default Mode:
	// Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
	// Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
	// Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
	// Term: 有限公司 Start: 18 End: 30 Position: 4 Type: 1
	// Search Mode:
	// Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
	// Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
	// Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
	// Term: 有限 Start: 18 End: 24 Position: 4 Type: 1
	// Term: 公司 Start: 24 End: 30 Position: 5 Type: 1
	// Term: 有限公司 Start: 18 End: 30 Position: 6 Type: 1
}

View File

@@ -22,6 +22,11 @@ type Segmenter struct {
dict *Dictionary dict *Dictionary
} }
// Dictionary returns the dictionary held by the segmenter. The returned
// pointer is the segmenter's own dict field, not a copy.
func (seg *Segmenter) Dictionary() *Dictionary {
return seg.dict
}
// LoadDictionary loads dictionary from given file name. Every time // LoadDictionary loads dictionary from given file name. Every time
// LoadDictionary is called, previously loaded dictionary will be cleared. // LoadDictionary is called, previously loaded dictionary will be cleared.
func (seg *Segmenter) LoadDictionary(fileName string) error { func (seg *Segmenter) LoadDictionary(fileName string) error {

View File

@@ -1,4 +1,4 @@
package jiebago_test package tokenizers_test
import ( import (
"fmt" "fmt"
@@ -6,7 +6,7 @@ import (
"os" "os"
"github.com/blevesearch/bleve" "github.com/blevesearch/bleve"
_ "github.com/wangbin/jiebago" _ "github.com/wangbin/jiebago/tokenizers"
) )
func Example_beleveSearch() { func Example_beleveSearch() {
@@ -15,7 +15,7 @@ func Example_beleveSearch() {
err := indexMapping.AddCustomTokenizer("jieba", err := indexMapping.AddCustomTokenizer("jieba",
map[string]interface{}{ map[string]interface{}{
"file": "dict.txt", "file": "../dict.txt",
"type": "jieba", "type": "jieba",
}) })
if err != nil { if err != nil {
@@ -79,7 +79,7 @@ func Example_beleveSearch() {
// search for some text // search for some text
for _, keyword := range []string{"水果世博园", "你", "first", "中文", "交换机", "交换"} { for _, keyword := range []string{"水果世博园", "你", "first", "中文", "交换机", "交换"} {
query := bleve.NewMatchQuery(keyword) query := bleve.NewQueryStringQuery(keyword)
search := bleve.NewSearchRequest(query) search := bleve.NewSearchRequest(query)
search.Highlight = bleve.NewHighlight() search.Highlight = bleve.NewHighlight()
searchResults, err := index.Search(search) searchResults, err := index.Search(search)

View File

@@ -0,0 +1,42 @@
package tokenizers_test
import (
"fmt"
"github.com/wangbin/jiebago/tokenizers"
)
// Example tokenizes one Chinese sentence with the jieba bleve tokenizer,
// first in default mode and then in search mode, and prints each token's
// term, byte offsets, position and type.
func Example() {
	sentence := []byte("永和服装饰品有限公司")

	// default mode
	tokenizer, err := tokenizers.NewJiebaTokenizer("../dict.txt", true, false)
	if err != nil {
		// Surface a dictionary loading failure instead of silently continuing.
		fmt.Println(err)
		return
	}
	fmt.Println("Default Mode:")
	for _, token := range tokenizer.Tokenize(sentence) {
		fmt.Printf(
			"Term: %s Start: %d End: %d Position: %d Type: %d\n",
			token.Term, token.Start, token.End, token.Position, token.Type)
	}

	// search mode
	tokenizer, err = tokenizers.NewJiebaTokenizer("../dict.txt", true, true)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println("Search Mode:")
	for _, token := range tokenizer.Tokenize(sentence) {
		fmt.Printf(
			"Term: %s Start: %d End: %d Position: %d Type: %d\n",
			token.Term, token.Start, token.End, token.Position, token.Type)
	}
	// Output:
	// Default Mode:
	// Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
	// Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
	// Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
	// Term: 有限公司 Start: 18 End: 30 Position: 4 Type: 1
	// Search Mode:
	// Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
	// Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
	// Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
	// Term: 有限 Start: 18 End: 24 Position: 4 Type: 1
	// Term: 公司 Start: 24 End: 30 Position: 5 Type: 1
	// Term: 有限公司 Start: 18 End: 30 Position: 6 Type: 1
}

View File

@@ -1,4 +1,4 @@
package jiebago package tokenizers
import ( import (
"fmt" "fmt"
@@ -7,6 +7,7 @@ import (
"github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/registry"
"github.com/wangbin/jiebago"
) )
// Name is the jieba tokenizer name. // Name is the jieba tokenizer name.
@@ -16,7 +17,7 @@ var ideographRegexp = regexp.MustCompile(`\p{Han}+`)
// JiebaTokenizer is the bleve tokenizer for jiebago. // JiebaTokenizer is the bleve tokenizer for jiebago.
type JiebaTokenizer struct { type JiebaTokenizer struct {
seg Segmenter seg jiebago.Segmenter
hmm, searchMode bool hmm, searchMode bool
} }
@@ -41,7 +42,7 @@ Parameters:
this word into "交换", "换机", which are valid Chinese words. this word into "交换", "换机", which are valid Chinese words.
*/ */
func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) { func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
var seg Segmenter var seg jiebago.Segmenter
err := seg.LoadDictionary(dictFilePath) err := seg.LoadDictionary(dictFilePath)
return &JiebaTokenizer{ return &JiebaTokenizer{
seg: seg, seg: seg,
@@ -59,6 +60,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
pos := 1 pos := 1
var width int var width int
var gram string var gram string
dict := jt.seg.Dictionary()
for word := range jt.seg.Cut(string(input), jt.hmm) { for word := range jt.seg.Cut(string(input), jt.hmm) {
if jt.searchMode { if jt.searchMode {
runes := []rune(word) runes := []rune(word)
@@ -68,7 +70,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
for i := 0; i < width-step+1; i++ { for i := 0; i < width-step+1; i++ {
gram = string(runes[i : i+step]) gram = string(runes[i : i+step])
gramLen := len(gram) gramLen := len(gram)
if value, ok := jt.seg.dict.Frequency(gram); ok && value > 0 { if frequency, ok := dict.Frequency(gram); ok && frequency > 0 {
gramStart := start + len(string(runes[:i])) gramStart := start + len(string(runes[:i]))
token := analysis.Token{ token := analysis.Token{
Term: []byte(gram), Term: []byte(gram),

View File

@@ -1,4 +1,4 @@
package jiebago package tokenizers
import ( import (
"reflect" "reflect"
@@ -5219,7 +5219,7 @@ func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) {
}, },
} }
tokenizer, _ := NewJiebaTokenizer("dict.txt", true, false) tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, false)
for _, test := range tests { for _, test := range tests {
actual := tokenizer.Tokenize(test.input) actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) { if !reflect.DeepEqual(actual, test.output) {
@@ -11057,7 +11057,7 @@ func TestJiebaTokenizerSearchModeWithHMM(t *testing.T) {
}, },
} }
tokenizer, _ := NewJiebaTokenizer("dict.txt", true, true) tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, true)
for _, test := range tests { for _, test := range tests {
actual := tokenizer.Tokenize(test.input) actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) { if !reflect.DeepEqual(actual, test.output) {
@@ -16474,7 +16474,7 @@ func TestJiebaTokenizerDefaultModeWithoutHMM(t *testing.T) {
}, },
} }
tokenizer, _ := NewJiebaTokenizer("dict.txt", false, false) tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, false)
for _, test := range tests { for _, test := range tests {
actual := tokenizer.Tokenize(test.input) actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) { if !reflect.DeepEqual(actual, test.output) {
@@ -22506,7 +22506,7 @@ func TestJiebaTokenizerSearchModeWithoutHMM(t *testing.T) {
}, },
} }
tokenizer, _ := NewJiebaTokenizer("dict.txt", false, true) tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, true)
for _, test := range tests { for _, test := range tests {
actual := tokenizer.Tokenize(test.input) actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) { if !reflect.DeepEqual(actual, test.output) {