From 3d91f615cfc45719d2f302872b17c51f9760fc1e Mon Sep 17 00:00:00 2001 From: Wang Bin Date: Thu, 7 May 2015 18:52:29 +0800 Subject: [PATCH] moved tokenizers to a seperated module --- example_test.go | 38 ----------------- jieba.go | 5 +++ .../example_bleve_test.go | 8 ++-- tokenizers/example_test.go | 42 +++++++++++++++++++ tokenizer.go => tokenizers/tokenizer.go | 10 +++-- .../tokenizer_test.go | 10 ++--- 6 files changed, 62 insertions(+), 51 deletions(-) rename example_bleve_test.go => tokenizers/example_bleve_test.go (96%) create mode 100644 tokenizers/example_test.go rename tokenizer.go => tokenizers/tokenizer.go (94%) rename tokenizer_test.go => tokenizers/tokenizer_test.go (99%) diff --git a/example_test.go b/example_test.go index e526bf8..d29c3fe 100644 --- a/example_test.go +++ b/example_test.go @@ -57,41 +57,3 @@ func Example_loadUserDictionary() { // Before: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 / // After: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 / } - -func Example_tokenize() { - var seg jiebago.Segmenter - seg.LoadDictionary("dict.txt") - - sentence := []byte("永和服装饰品有限公司") - - // default mode - tokenizer, _ := jiebago.NewJiebaTokenizer("dict.txt", true, false) - fmt.Println("Default Mode:") - for _, token := range tokenizer.Tokenize(sentence) { - fmt.Printf( - "Term: %s Start: %d End: %d Position: %d Type: %d\n", - token.Term, token.Start, token.End, token.Position, token.Type) - } - - //search mode - tokenizer, _ = jiebago.NewJiebaTokenizer("dict.txt", true, true) - fmt.Println("Search Mode:") - for _, token := range tokenizer.Tokenize(sentence) { - fmt.Printf( - "Term: %s Start: %d End: %d Position: %d Type: %d\n", - token.Term, token.Start, token.End, token.Position, token.Type) - } - // Output: - // Default Mode: - // Term: 永和 Start: 0 End: 6 Position: 1 Type: 1 - // Term: 服装 Start: 6 End: 12 Position: 2 Type: 1 - // Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1 - // Term: 有限公司 Start: 18 End: 30 Position: 4 Type: 1 - // Search Mode: - // Term: 永和 Start: 0 End: 6 Position: 1 Type: 1 - // Term: 服装 Start: 6 End: 12 Position: 2 Type: 1 - // Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1 - // Term: 有限 Start: 18 End: 24 Position: 4 Type: 1 - // Term: 公司 Start: 24 End: 30 Position: 5 Type: 1 - // Term: 有限公司 Start: 18 End: 30 Position: 6 Type: 1 -} diff --git a/jieba.go b/jieba.go index e4fce4e..f81f2ab 100644 --- a/jieba.go +++ b/jieba.go @@ -22,6 +22,11 @@ type Segmenter struct { dict *Dictionary } +// Dictionary returns segmenter's dictionary +func (seg *Segmenter) Dictionary() *Dictionary { + return seg.dict +} + // LoadDictionary loads dictionary from given file name. Everytime // LoadDictionary is called, previously loaded dictionary will be cleard. func (seg *Segmenter) LoadDictionary(fileName string) error { diff --git a/example_bleve_test.go b/tokenizers/example_bleve_test.go similarity index 96% rename from example_bleve_test.go rename to tokenizers/example_bleve_test.go index fa38859..f5da6b4 100644 --- a/example_bleve_test.go +++ b/tokenizers/example_bleve_test.go @@ -1,4 +1,4 @@ -package jiebago_test +package tokenizers_test import ( "fmt" @@ -6,7 +6,7 @@ import ( "os" "github.com/blevesearch/bleve" - _ "github.com/wangbin/jiebago" + _ "github.com/wangbin/jiebago/tokenizers" ) func Example_beleveSearch() { @@ -15,7 +15,7 @@ func Example_beleveSearch() { err := indexMapping.AddCustomTokenizer("jieba", map[string]interface{}{ - "file": "dict.txt", + "file": "../dict.txt", "type": "jieba", }) if err != nil { @@ -79,7 +79,7 @@ func Example_beleveSearch() { // search for some text for _, keyword := range []string{"水果世博园", "你", "first", "中文", "交换机", "交换"} { - query := bleve.NewMatchQuery(keyword) + query := bleve.NewQueryStringQuery(keyword) search := bleve.NewSearchRequest(query) search.Highlight = bleve.NewHighlight() searchResults, err := index.Search(search) diff --git a/tokenizers/example_test.go b/tokenizers/example_test.go new file mode 100644 index 0000000..2eeda34 --- /dev/null +++ b/tokenizers/example_test.go @@ -0,0 +1,42 @@ +package tokenizers_test + +import ( + "fmt" + + "github.com/wangbin/jiebago/tokenizers" +) + +func Example() { + sentence := []byte("永和服装饰品有限公司") + + // default mode + tokenizer, _ := tokenizers.NewJiebaTokenizer("../dict.txt", true, false) + fmt.Println("Default Mode:") + for _, token := range tokenizer.Tokenize(sentence) { + fmt.Printf( + "Term: %s Start: %d End: %d Position: %d Type: %d\n", + token.Term, token.Start, token.End, token.Position, token.Type) + } + + //search mode + tokenizer, _ = tokenizers.NewJiebaTokenizer("../dict.txt", true, true) + fmt.Println("Search Mode:") + for _, token := range tokenizer.Tokenize(sentence) { + fmt.Printf( + "Term: %s Start: %d End: %d Position: %d Type: %d\n", + token.Term, token.Start, token.End, token.Position, token.Type) + } + // Output: + // Default Mode: + // Term: 永和 Start: 0 End: 6 Position: 1 Type: 1 + // Term: 服装 Start: 6 End: 12 Position: 2 Type: 1 + // Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1 + // Term: 有限公司 Start: 18 End: 30 Position: 4 Type: 1 + // Search Mode: + // Term: 永和 Start: 0 End: 6 Position: 1 Type: 1 + // Term: 服装 Start: 6 End: 12 Position: 2 Type: 1 + // Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1 + // Term: 有限 Start: 18 End: 24 Position: 4 Type: 1 + // Term: 公司 Start: 24 End: 30 Position: 5 Type: 1 + // Term: 有限公司 Start: 18 End: 30 Position: 6 Type: 1 +} diff --git a/tokenizer.go b/tokenizers/tokenizer.go similarity index 94% rename from tokenizer.go rename to tokenizers/tokenizer.go index 91eb0c9..f1bdcb6 100644 --- a/tokenizer.go +++ b/tokenizers/tokenizer.go @@ -1,4 +1,4 @@ -package jiebago +package tokenizers import ( "fmt" @@ -7,6 +7,7 @@ import ( "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/registry" + "github.com/wangbin/jiebago" ) // Name is the jieba tokenizer name. @@ -16,7 +17,7 @@ var ideographRegexp = regexp.MustCompile(`\p{Han}+`) // JiebaTokenizer is the beleve tokenizer for jiebago. type JiebaTokenizer struct { - seg Segmenter + seg jiebago.Segmenter hmm, searchMode bool } @@ -41,7 +42,7 @@ Parameters: this word into "交换", "换机", which are valid Chinese words. */ func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) { - var seg Segmenter + var seg jiebago.Segmenter err := seg.LoadDictionary(dictFilePath) return &JiebaTokenizer{ seg: seg, @@ -59,6 +60,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream { pos := 1 var width int var gram string + dict := jt.seg.Dictionary() for word := range jt.seg.Cut(string(input), jt.hmm) { if jt.searchMode { runes := []rune(word) @@ -68,7 +70,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream { for i := 0; i < width-step+1; i++ { gram = string(runes[i : i+step]) gramLen := len(gram) - if value, ok := jt.seg.dict.Frequency(gram); ok && value > 0 { + if frequency, ok := dict.Frequency(gram); ok && frequency > 0 { gramStart := start + len(string(runes[:i])) token := analysis.Token{ Term: []byte(gram), diff --git a/tokenizer_test.go b/tokenizers/tokenizer_test.go similarity index 99% rename from tokenizer_test.go rename to tokenizers/tokenizer_test.go index adc6481..2ccd0d7 100644 --- a/tokenizer_test.go +++ b/tokenizers/tokenizer_test.go @@ -1,4 +1,4 @@ -package jiebago +package tokenizers import ( "reflect" @@ -5219,7 +5219,7 @@ func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) { }, } - tokenizer, _ := NewJiebaTokenizer("dict.txt", true, false) + tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, false) for _, test := range tests { actual := tokenizer.Tokenize(test.input) if !reflect.DeepEqual(actual, test.output) { @@ -11057,7 +11057,7 @@ func TestJiebaTokenizerSearchModeWithHMM(t *testing.T) { }, } - tokenizer, _ := NewJiebaTokenizer("dict.txt", true, true) + tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, true) for _, test := range tests { actual := tokenizer.Tokenize(test.input) if !reflect.DeepEqual(actual, test.output) { @@ -16474,7 +16474,7 @@ func TestJiebaTokenizerDefaultModeWithoutHMM(t *testing.T) { }, } - tokenizer, _ := NewJiebaTokenizer("dict.txt", false, false) + tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, false) for _, test := range tests { actual := tokenizer.Tokenize(test.input) if !reflect.DeepEqual(actual, test.output) { @@ -22506,7 +22506,7 @@ func TestJiebaTokenizerSearchModeWithoutHMM(t *testing.T) { }, } - tokenizer, _ := NewJiebaTokenizer("dict.txt", false, true) + tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, true) for _, test := range tests { actual := tokenizer.Tokenize(test.input) if !reflect.DeepEqual(actual, test.output) {