diff --git a/dictionary/dictionary_test.go b/dictionary/dictionary_test.go deleted file mode 100644 index e9b306d..0000000 --- a/dictionary/dictionary_test.go +++ /dev/null @@ -1,65 +0,0 @@ -package dictionary - -import ( - "math" - "testing" -) - -var d *Dictionary - -func init() { - d = New() -} - -func TestLoadDictionary(t *testing.T) { - if err := d.LoadDictionary("../dict.txt"); err != nil { - t.Fatalf("Failed to load dict.txt, err = %s", err) - } - n := len(d.freqMap) - - d.LoadDictionary("../foobar.txt") - if len(d.freqMap) == n { - t.Fatalf("Failed to load foobar.txt") - } -} - -func TestLoadUserDictionary(t *testing.T) { - err := d.LoadUserDictionary("../userdict.txt") - if err != nil { - t.Fatalf("Failed to load userdict.txt, err = %s", err) - } - if _, ok := d.Frequency("八一双鹿"); !ok { - t.Fatalf("Failed to load userdict.txt, no frequency for word \"八一双鹿\"") - } -} - -func TestFrequency(t *testing.T) { - d.LoadUserDictionary("../userdict.txt") - if f, _ := d.Frequency("八一双鹿"); f != 3.0 { - t.Fatalf("Wrong frequency for word \"八一双鹿\", expect 3.0, got %f", f) - } - -} - -func TestTotal(t *testing.T) { - d.LoadDictionary("../userdict.txt") - if d.Total() != 319.0 { - t.Fatalf("Wrong total for userdict.txt, expect 319.0, got %f", d.Total()) - } -} - -func TestLogTotal(t *testing.T) { - d.LoadDictionary("../userdict.txt") - if d.LogTotal() != math.Log(319.0) { - t.Fatalf("Wrong total for userdict.txt, expect %f, got %f", math.Log(319.0), d.LogTotal()) - } -} - -func TestAddToken(t *testing.T) { - d.LoadDictionary("../userdict.txt") - token := Token{text: "超敏C反应蛋白", frequency: 100.0, pos: "nz"} - d.AddToken(token) - if f, _ := d.Frequency("超敏C反应蛋白"); f != 100.0 { - t.Fatalf("Failed to add Token \"超敏C反应蛋白\", except frequency 100.0, got %f", f) - } -} diff --git a/dictionary/token.go b/dictionary/token.go index f4124a1..fbae97f 100644 --- a/dictionary/token.go +++ b/dictionary/token.go @@ -17,3 +17,7 @@ func (t Token) Frequency() float64 { func (t Token) Pos() string { return t.pos } + +func NewToken(text string, frequency float64, pos string) Token { + return Token{text: text, frequency: frequency, pos: pos} +} diff --git a/tokenizers/jieba.go b/tokenizer.go similarity index 89% rename from tokenizers/jieba.go rename to tokenizer.go index f8e9b4f..8684c96 100644 --- a/tokenizers/jieba.go +++ b/tokenizer.go @@ -1,12 +1,12 @@ -package tokenizers +package jiebago import ( "fmt" - "github.com/blevesearch/bleve/analysis" - "github.com/blevesearch/bleve/registry" - "github.com/wangbin/jiebago" "regexp" "strconv" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" ) const Name = "jieba" @@ -14,14 +14,15 @@ const Name = "jieba" var IdeographRegexp = regexp.MustCompile(`\p{Han}+`) type JiebaTokenizer struct { - j *jiebago.Jieba + seg Segmenter hmm, searchMode bool } func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) { - j, err := jiebago.Open(dictFileName) + var seg Segmenter + err := seg.LoadDictionary(dictFileName) return &JiebaTokenizer{ - j: j, + seg: seg, hmm: hmm, searchMode: searchMode, }, err @@ -35,7 +36,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream { pos := 1 var width int var gram string - for word := range jt.j.Cut(string(input), jt.hmm) { + for word := range jt.seg.Cut(string(input), jt.hmm) { if jt.searchMode { runes := []rune(word) width = len(runes) @@ -44,7 +45,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream { for i := 0; i < width-step+1; i++ { gram = string(runes[i : i+step]) gramLen := len(gram) - if value, ok := jt.j.Freq(gram); ok && value > 0 { + if value, ok := jt.seg.dict.Frequency(gram); ok && value > 0 { gramStart := start + len(string(runes[:i])) token := analysis.Token{ Term: []byte(gram), diff --git a/tokenizers/jieba_test.go b/tokenizer_test.go similarity index 99% rename from tokenizers/jieba_test.go rename to tokenizer_test.go index 006b594..adc6481 100644 --- a/tokenizers/jieba_test.go +++ b/tokenizer_test.go @@ -1,9 +1,10 @@ -package tokenizers +package jiebago import ( - "github.com/blevesearch/bleve/analysis" "reflect" "testing" + + "github.com/blevesearch/bleve/analysis" ) func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) { @@ -5218,7 +5219,7 @@ func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) { }, } - tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, false) + tokenizer, _ := NewJiebaTokenizer("dict.txt", true, false) for _, test := range tests { actual := tokenizer.Tokenize(test.input) if !reflect.DeepEqual(actual, test.output) { @@ -11056,7 +11057,7 @@ func TestJiebaTokenizerSearchModeWithHMM(t *testing.T) { }, } - tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, true) + tokenizer, _ := NewJiebaTokenizer("dict.txt", true, true) for _, test := range tests { actual := tokenizer.Tokenize(test.input) if !reflect.DeepEqual(actual, test.output) { @@ -16473,7 +16474,7 @@ func TestJiebaTokenizerDefaultModeWithoutHMM(t *testing.T) { }, } - tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, false) + tokenizer, _ := NewJiebaTokenizer("dict.txt", false, false) for _, test := range tests { actual := tokenizer.Tokenize(test.input) if !reflect.DeepEqual(actual, test.output) { @@ -22505,7 +22506,7 @@ func TestJiebaTokenizerSearchModeWithoutHMM(t *testing.T) { }, } - tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, true) + tokenizer, _ := NewJiebaTokenizer("dict.txt", false, true) for _, test := range tests { actual := tokenizer.Tokenize(test.input) if !reflect.DeepEqual(actual, test.output) {