mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 00:32:51 +08:00
moved tokenizers to a separate module
This commit is contained in:
@@ -57,41 +57,3 @@ func Example_loadUserDictionary() {
|
|||||||
// Before: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
|
// Before: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
|
||||||
// After: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
|
// After: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
|
||||||
}
|
}
|
||||||
|
|
||||||
func Example_tokenize() {
|
|
||||||
var seg jiebago.Segmenter
|
|
||||||
seg.LoadDictionary("dict.txt")
|
|
||||||
|
|
||||||
sentence := []byte("永和服装饰品有限公司")
|
|
||||||
|
|
||||||
// default mode
|
|
||||||
tokenizer, _ := jiebago.NewJiebaTokenizer("dict.txt", true, false)
|
|
||||||
fmt.Println("Default Mode:")
|
|
||||||
for _, token := range tokenizer.Tokenize(sentence) {
|
|
||||||
fmt.Printf(
|
|
||||||
"Term: %s Start: %d End: %d Position: %d Type: %d\n",
|
|
||||||
token.Term, token.Start, token.End, token.Position, token.Type)
|
|
||||||
}
|
|
||||||
|
|
||||||
//search mode
|
|
||||||
tokenizer, _ = jiebago.NewJiebaTokenizer("dict.txt", true, true)
|
|
||||||
fmt.Println("Search Mode:")
|
|
||||||
for _, token := range tokenizer.Tokenize(sentence) {
|
|
||||||
fmt.Printf(
|
|
||||||
"Term: %s Start: %d End: %d Position: %d Type: %d\n",
|
|
||||||
token.Term, token.Start, token.End, token.Position, token.Type)
|
|
||||||
}
|
|
||||||
// Output:
|
|
||||||
// Default Mode:
|
|
||||||
// Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
|
|
||||||
// Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
|
|
||||||
// Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
|
|
||||||
// Term: 有限公司 Start: 18 End: 30 Position: 4 Type: 1
|
|
||||||
// Search Mode:
|
|
||||||
// Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
|
|
||||||
// Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
|
|
||||||
// Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
|
|
||||||
// Term: 有限 Start: 18 End: 24 Position: 4 Type: 1
|
|
||||||
// Term: 公司 Start: 24 End: 30 Position: 5 Type: 1
|
|
||||||
// Term: 有限公司 Start: 18 End: 30 Position: 6 Type: 1
|
|
||||||
}
|
|
||||||
|
|||||||
5
jieba.go
5
jieba.go
@@ -22,6 +22,11 @@ type Segmenter struct {
|
|||||||
dict *Dictionary
|
dict *Dictionary
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Dictionary returns segmenter's dictionary
|
||||||
|
func (seg *Segmenter) Dictionary() *Dictionary {
|
||||||
|
return seg.dict
|
||||||
|
}
|
||||||
|
|
||||||
// LoadDictionary loads dictionary from given file name. Every time
|
// LoadDictionary loads dictionary from given file name. Every time
|
||||||
// LoadDictionary is called, previously loaded dictionary will be cleared.
|
// LoadDictionary is called, previously loaded dictionary will be cleared.
|
||||||
func (seg *Segmenter) LoadDictionary(fileName string) error {
|
func (seg *Segmenter) LoadDictionary(fileName string) error {
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
package jiebago_test
|
package tokenizers_test
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
@@ -6,7 +6,7 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve"
|
"github.com/blevesearch/bleve"
|
||||||
_ "github.com/wangbin/jiebago"
|
_ "github.com/wangbin/jiebago/tokenizers"
|
||||||
)
|
)
|
||||||
|
|
||||||
func Example_beleveSearch() {
|
func Example_beleveSearch() {
|
||||||
@@ -15,7 +15,7 @@ func Example_beleveSearch() {
|
|||||||
|
|
||||||
err := indexMapping.AddCustomTokenizer("jieba",
|
err := indexMapping.AddCustomTokenizer("jieba",
|
||||||
map[string]interface{}{
|
map[string]interface{}{
|
||||||
"file": "dict.txt",
|
"file": "../dict.txt",
|
||||||
"type": "jieba",
|
"type": "jieba",
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -79,7 +79,7 @@ func Example_beleveSearch() {
|
|||||||
|
|
||||||
// search for some text
|
// search for some text
|
||||||
for _, keyword := range []string{"水果世博园", "你", "first", "中文", "交换机", "交换"} {
|
for _, keyword := range []string{"水果世博园", "你", "first", "中文", "交换机", "交换"} {
|
||||||
query := bleve.NewMatchQuery(keyword)
|
query := bleve.NewQueryStringQuery(keyword)
|
||||||
search := bleve.NewSearchRequest(query)
|
search := bleve.NewSearchRequest(query)
|
||||||
search.Highlight = bleve.NewHighlight()
|
search.Highlight = bleve.NewHighlight()
|
||||||
searchResults, err := index.Search(search)
|
searchResults, err := index.Search(search)
|
||||||
42
tokenizers/example_test.go
Normal file
42
tokenizers/example_test.go
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
package tokenizers_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"github.com/wangbin/jiebago/tokenizers"
|
||||||
|
)
|
||||||
|
|
||||||
|
func Example() {
|
||||||
|
sentence := []byte("永和服装饰品有限公司")
|
||||||
|
|
||||||
|
// default mode
|
||||||
|
tokenizer, _ := tokenizers.NewJiebaTokenizer("../dict.txt", true, false)
|
||||||
|
fmt.Println("Default Mode:")
|
||||||
|
for _, token := range tokenizer.Tokenize(sentence) {
|
||||||
|
fmt.Printf(
|
||||||
|
"Term: %s Start: %d End: %d Position: %d Type: %d\n",
|
||||||
|
token.Term, token.Start, token.End, token.Position, token.Type)
|
||||||
|
}
|
||||||
|
|
||||||
|
//search mode
|
||||||
|
tokenizer, _ = tokenizers.NewJiebaTokenizer("../dict.txt", true, true)
|
||||||
|
fmt.Println("Search Mode:")
|
||||||
|
for _, token := range tokenizer.Tokenize(sentence) {
|
||||||
|
fmt.Printf(
|
||||||
|
"Term: %s Start: %d End: %d Position: %d Type: %d\n",
|
||||||
|
token.Term, token.Start, token.End, token.Position, token.Type)
|
||||||
|
}
|
||||||
|
// Output:
|
||||||
|
// Default Mode:
|
||||||
|
// Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
|
||||||
|
// Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
|
||||||
|
// Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
|
||||||
|
// Term: 有限公司 Start: 18 End: 30 Position: 4 Type: 1
|
||||||
|
// Search Mode:
|
||||||
|
// Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
|
||||||
|
// Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
|
||||||
|
// Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
|
||||||
|
// Term: 有限 Start: 18 End: 24 Position: 4 Type: 1
|
||||||
|
// Term: 公司 Start: 24 End: 30 Position: 5 Type: 1
|
||||||
|
// Term: 有限公司 Start: 18 End: 30 Position: 6 Type: 1
|
||||||
|
}
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
package jiebago
|
package tokenizers
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
@@ -7,6 +7,7 @@ import (
|
|||||||
|
|
||||||
"github.com/blevesearch/bleve/analysis"
|
"github.com/blevesearch/bleve/analysis"
|
||||||
"github.com/blevesearch/bleve/registry"
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
"github.com/wangbin/jiebago"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Name is the jieba tokenizer name.
|
// Name is the jieba tokenizer name.
|
||||||
@@ -16,7 +17,7 @@ var ideographRegexp = regexp.MustCompile(`\p{Han}+`)
|
|||||||
|
|
||||||
// JiebaTokenizer is the bleve tokenizer for jiebago.
|
// JiebaTokenizer is the bleve tokenizer for jiebago.
|
||||||
type JiebaTokenizer struct {
|
type JiebaTokenizer struct {
|
||||||
seg Segmenter
|
seg jiebago.Segmenter
|
||||||
hmm, searchMode bool
|
hmm, searchMode bool
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -41,7 +42,7 @@ Parameters:
|
|||||||
this word into "交换", "换机", which are valid Chinese words.
|
this word into "交换", "换机", which are valid Chinese words.
|
||||||
*/
|
*/
|
||||||
func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||||
var seg Segmenter
|
var seg jiebago.Segmenter
|
||||||
err := seg.LoadDictionary(dictFilePath)
|
err := seg.LoadDictionary(dictFilePath)
|
||||||
return &JiebaTokenizer{
|
return &JiebaTokenizer{
|
||||||
seg: seg,
|
seg: seg,
|
||||||
@@ -59,6 +60,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
|||||||
pos := 1
|
pos := 1
|
||||||
var width int
|
var width int
|
||||||
var gram string
|
var gram string
|
||||||
|
dict := jt.seg.Dictionary()
|
||||||
for word := range jt.seg.Cut(string(input), jt.hmm) {
|
for word := range jt.seg.Cut(string(input), jt.hmm) {
|
||||||
if jt.searchMode {
|
if jt.searchMode {
|
||||||
runes := []rune(word)
|
runes := []rune(word)
|
||||||
@@ -68,7 +70,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
|||||||
for i := 0; i < width-step+1; i++ {
|
for i := 0; i < width-step+1; i++ {
|
||||||
gram = string(runes[i : i+step])
|
gram = string(runes[i : i+step])
|
||||||
gramLen := len(gram)
|
gramLen := len(gram)
|
||||||
if value, ok := jt.seg.dict.Frequency(gram); ok && value > 0 {
|
if frequency, ok := dict.Frequency(gram); ok && frequency > 0 {
|
||||||
gramStart := start + len(string(runes[:i]))
|
gramStart := start + len(string(runes[:i]))
|
||||||
token := analysis.Token{
|
token := analysis.Token{
|
||||||
Term: []byte(gram),
|
Term: []byte(gram),
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
package jiebago
|
package tokenizers
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"reflect"
|
"reflect"
|
||||||
@@ -5219,7 +5219,7 @@ func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
tokenizer, _ := NewJiebaTokenizer("dict.txt", true, false)
|
tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, false)
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
actual := tokenizer.Tokenize(test.input)
|
actual := tokenizer.Tokenize(test.input)
|
||||||
if !reflect.DeepEqual(actual, test.output) {
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
@@ -11057,7 +11057,7 @@ func TestJiebaTokenizerSearchModeWithHMM(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
tokenizer, _ := NewJiebaTokenizer("dict.txt", true, true)
|
tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, true)
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
actual := tokenizer.Tokenize(test.input)
|
actual := tokenizer.Tokenize(test.input)
|
||||||
if !reflect.DeepEqual(actual, test.output) {
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
@@ -16474,7 +16474,7 @@ func TestJiebaTokenizerDefaultModeWithoutHMM(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
tokenizer, _ := NewJiebaTokenizer("dict.txt", false, false)
|
tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, false)
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
actual := tokenizer.Tokenize(test.input)
|
actual := tokenizer.Tokenize(test.input)
|
||||||
if !reflect.DeepEqual(actual, test.output) {
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
@@ -22506,7 +22506,7 @@ func TestJiebaTokenizerSearchModeWithoutHMM(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
tokenizer, _ := NewJiebaTokenizer("dict.txt", false, true)
|
tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, true)
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
actual := tokenizer.Tokenize(test.input)
|
actual := tokenizer.Tokenize(test.input)
|
||||||
if !reflect.DeepEqual(actual, test.output) {
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
Reference in New Issue
Block a user