mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 00:32:51 +08:00
moved tokenizers to a separate module
This commit is contained in:
@@ -57,41 +57,3 @@ func Example_loadUserDictionary() {
|
||||
// Before: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
|
||||
// After: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
|
||||
}
|
||||
|
||||
func Example_tokenize() {
|
||||
var seg jiebago.Segmenter
|
||||
seg.LoadDictionary("dict.txt")
|
||||
|
||||
sentence := []byte("永和服装饰品有限公司")
|
||||
|
||||
// default mode
|
||||
tokenizer, _ := jiebago.NewJiebaTokenizer("dict.txt", true, false)
|
||||
fmt.Println("Default Mode:")
|
||||
for _, token := range tokenizer.Tokenize(sentence) {
|
||||
fmt.Printf(
|
||||
"Term: %s Start: %d End: %d Position: %d Type: %d\n",
|
||||
token.Term, token.Start, token.End, token.Position, token.Type)
|
||||
}
|
||||
|
||||
//search mode
|
||||
tokenizer, _ = jiebago.NewJiebaTokenizer("dict.txt", true, true)
|
||||
fmt.Println("Search Mode:")
|
||||
for _, token := range tokenizer.Tokenize(sentence) {
|
||||
fmt.Printf(
|
||||
"Term: %s Start: %d End: %d Position: %d Type: %d\n",
|
||||
token.Term, token.Start, token.End, token.Position, token.Type)
|
||||
}
|
||||
// Output:
|
||||
// Default Mode:
|
||||
// Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
|
||||
// Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
|
||||
// Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
|
||||
// Term: 有限公司 Start: 18 End: 30 Position: 4 Type: 1
|
||||
// Search Mode:
|
||||
// Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
|
||||
// Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
|
||||
// Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
|
||||
// Term: 有限 Start: 18 End: 24 Position: 4 Type: 1
|
||||
// Term: 公司 Start: 24 End: 30 Position: 5 Type: 1
|
||||
// Term: 有限公司 Start: 18 End: 30 Position: 6 Type: 1
|
||||
}
|
||||
|
||||
5
jieba.go
5
jieba.go
@@ -22,6 +22,11 @@ type Segmenter struct {
|
||||
dict *Dictionary
|
||||
}
|
||||
|
||||
// Dictionary returns the segmenter's dictionary.
|
||||
func (seg *Segmenter) Dictionary() *Dictionary {
|
||||
return seg.dict
|
||||
}
|
||||
|
||||
// LoadDictionary loads dictionary from given file name. Every time
|
||||
// LoadDictionary is called, the previously loaded dictionary will be cleared.
|
||||
func (seg *Segmenter) LoadDictionary(fileName string) error {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package jiebago_test
|
||||
package tokenizers_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
"os"
|
||||
|
||||
"github.com/blevesearch/bleve"
|
||||
_ "github.com/wangbin/jiebago"
|
||||
_ "github.com/wangbin/jiebago/tokenizers"
|
||||
)
|
||||
|
||||
func Example_beleveSearch() {
|
||||
@@ -15,7 +15,7 @@ func Example_beleveSearch() {
|
||||
|
||||
err := indexMapping.AddCustomTokenizer("jieba",
|
||||
map[string]interface{}{
|
||||
"file": "dict.txt",
|
||||
"file": "../dict.txt",
|
||||
"type": "jieba",
|
||||
})
|
||||
if err != nil {
|
||||
@@ -79,7 +79,7 @@ func Example_beleveSearch() {
|
||||
|
||||
// search for some text
|
||||
for _, keyword := range []string{"水果世博园", "你", "first", "中文", "交换机", "交换"} {
|
||||
query := bleve.NewMatchQuery(keyword)
|
||||
query := bleve.NewQueryStringQuery(keyword)
|
||||
search := bleve.NewSearchRequest(query)
|
||||
search.Highlight = bleve.NewHighlight()
|
||||
searchResults, err := index.Search(search)
|
||||
42
tokenizers/example_test.go
Normal file
42
tokenizers/example_test.go
Normal file
@@ -0,0 +1,42 @@
|
||||
package tokenizers_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/wangbin/jiebago/tokenizers"
|
||||
)
|
||||
|
||||
func Example() {
|
||||
sentence := []byte("永和服装饰品有限公司")
|
||||
|
||||
// default mode
|
||||
tokenizer, _ := tokenizers.NewJiebaTokenizer("../dict.txt", true, false)
|
||||
fmt.Println("Default Mode:")
|
||||
for _, token := range tokenizer.Tokenize(sentence) {
|
||||
fmt.Printf(
|
||||
"Term: %s Start: %d End: %d Position: %d Type: %d\n",
|
||||
token.Term, token.Start, token.End, token.Position, token.Type)
|
||||
}
|
||||
|
||||
//search mode
|
||||
tokenizer, _ = tokenizers.NewJiebaTokenizer("../dict.txt", true, true)
|
||||
fmt.Println("Search Mode:")
|
||||
for _, token := range tokenizer.Tokenize(sentence) {
|
||||
fmt.Printf(
|
||||
"Term: %s Start: %d End: %d Position: %d Type: %d\n",
|
||||
token.Term, token.Start, token.End, token.Position, token.Type)
|
||||
}
|
||||
// Output:
|
||||
// Default Mode:
|
||||
// Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
|
||||
// Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
|
||||
// Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
|
||||
// Term: 有限公司 Start: 18 End: 30 Position: 4 Type: 1
|
||||
// Search Mode:
|
||||
// Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
|
||||
// Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
|
||||
// Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
|
||||
// Term: 有限 Start: 18 End: 24 Position: 4 Type: 1
|
||||
// Term: 公司 Start: 24 End: 30 Position: 5 Type: 1
|
||||
// Term: 有限公司 Start: 18 End: 30 Position: 6 Type: 1
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package jiebago
|
||||
package tokenizers
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
"github.com/wangbin/jiebago"
|
||||
)
|
||||
|
||||
// Name is the jieba tokenizer name.
|
||||
@@ -16,7 +17,7 @@ var ideographRegexp = regexp.MustCompile(`\p{Han}+`)
|
||||
|
||||
// JiebaTokenizer is the bleve tokenizer for jiebago.
|
||||
type JiebaTokenizer struct {
|
||||
seg Segmenter
|
||||
seg jiebago.Segmenter
|
||||
hmm, searchMode bool
|
||||
}
|
||||
|
||||
@@ -41,7 +42,7 @@ Parameters:
|
||||
this word into "交换", "换机", which are valid Chinese words.
|
||||
*/
|
||||
func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||
var seg Segmenter
|
||||
var seg jiebago.Segmenter
|
||||
err := seg.LoadDictionary(dictFilePath)
|
||||
return &JiebaTokenizer{
|
||||
seg: seg,
|
||||
@@ -59,6 +60,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||
pos := 1
|
||||
var width int
|
||||
var gram string
|
||||
dict := jt.seg.Dictionary()
|
||||
for word := range jt.seg.Cut(string(input), jt.hmm) {
|
||||
if jt.searchMode {
|
||||
runes := []rune(word)
|
||||
@@ -68,7 +70,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||
for i := 0; i < width-step+1; i++ {
|
||||
gram = string(runes[i : i+step])
|
||||
gramLen := len(gram)
|
||||
if value, ok := jt.seg.dict.Frequency(gram); ok && value > 0 {
|
||||
if frequency, ok := dict.Frequency(gram); ok && frequency > 0 {
|
||||
gramStart := start + len(string(runes[:i]))
|
||||
token := analysis.Token{
|
||||
Term: []byte(gram),
|
||||
@@ -1,4 +1,4 @@
|
||||
package jiebago
|
||||
package tokenizers
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
@@ -5219,7 +5219,7 @@ func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
tokenizer, _ := NewJiebaTokenizer("dict.txt", true, false)
|
||||
tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, false)
|
||||
for _, test := range tests {
|
||||
actual := tokenizer.Tokenize(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
@@ -11057,7 +11057,7 @@ func TestJiebaTokenizerSearchModeWithHMM(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
tokenizer, _ := NewJiebaTokenizer("dict.txt", true, true)
|
||||
tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, true)
|
||||
for _, test := range tests {
|
||||
actual := tokenizer.Tokenize(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
@@ -16474,7 +16474,7 @@ func TestJiebaTokenizerDefaultModeWithoutHMM(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
tokenizer, _ := NewJiebaTokenizer("dict.txt", false, false)
|
||||
tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, false)
|
||||
for _, test := range tests {
|
||||
actual := tokenizer.Tokenize(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
@@ -22506,7 +22506,7 @@ func TestJiebaTokenizerSearchModeWithoutHMM(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
tokenizer, _ := NewJiebaTokenizer("dict.txt", false, true)
|
||||
tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, true)
|
||||
for _, test := range tests {
|
||||
actual := tokenizer.Tokenize(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
Reference in New Issue
Block a user