1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-30 00:50:30 +08:00

move tokenizer.go to top directory

This commit is contained in:
Wang Bin
2015-05-04 18:20:35 +08:00
parent 52fad00403
commit 1a9466927a
4 changed files with 21 additions and 80 deletions

View File

@@ -1,65 +0,0 @@
package dictionary
import (
"math"
"testing"
)
// d is the Dictionary instance shared by all tests in this file.
var d *Dictionary

// init creates the shared dictionary via New before any test runs.
func init() { d = New() }
// TestLoadDictionary loads ../dict.txt and then ../foobar.txt into the shared
// dictionary, failing if either load returns an error or if the number of
// entries in freqMap is unchanged after the second load.
func TestLoadDictionary(t *testing.T) {
	if err := d.LoadDictionary("../dict.txt"); err != nil {
		t.Fatalf("Failed to load dict.txt, err = %s", err)
	}
	n := len(d.freqMap)

	// Report a load error directly instead of only noticing it through an
	// unchanged map size below.
	if err := d.LoadDictionary("../foobar.txt"); err != nil {
		t.Fatalf("Failed to load foobar.txt, err = %s", err)
	}
	if len(d.freqMap) == n {
		t.Fatalf("Failed to load foobar.txt")
	}
}
// TestLoadUserDictionary loads ../userdict.txt as a user dictionary and
// checks that the word "八一双鹿" has a frequency entry afterwards.
func TestLoadUserDictionary(t *testing.T) {
	if loadErr := d.LoadUserDictionary("../userdict.txt"); loadErr != nil {
		t.Fatalf("Failed to load userdict.txt, err = %s", loadErr)
	}

	_, found := d.Frequency("八一双鹿")
	if found {
		return
	}
	t.Fatalf("Failed to load userdict.txt, no frequency for word \"八一双鹿\"")
}
// TestFrequency loads ../userdict.txt as a user dictionary and checks that
// the frequency reported for "八一双鹿" is 3.0.
func TestFrequency(t *testing.T) {
	// Fail at the point of the real problem if the fixture cannot be loaded.
	if err := d.LoadUserDictionary("../userdict.txt"); err != nil {
		t.Fatalf("Failed to load userdict.txt, err = %s", err)
	}
	f, ok := d.Frequency("八一双鹿")
	if !ok {
		t.Fatalf("No frequency for word \"八一双鹿\"")
	}
	if f != 3.0 {
		t.Fatalf("Wrong frequency for word \"八一双鹿\", expect 3.0, got %f", f)
	}
}
// TestTotal loads ../userdict.txt with LoadDictionary and checks that
// Total reports 319.0.
func TestTotal(t *testing.T) {
	// Surface a load failure directly rather than as a wrong total.
	if err := d.LoadDictionary("../userdict.txt"); err != nil {
		t.Fatalf("Failed to load userdict.txt, err = %s", err)
	}
	if total := d.Total(); total != 319.0 {
		t.Fatalf("Wrong total for userdict.txt, expect 319.0, got %f", total)
	}
}
// TestLogTotal loads ../userdict.txt with LoadDictionary and checks that
// LogTotal equals math.Log(319.0).
func TestLogTotal(t *testing.T) {
	// Surface a load failure directly rather than as a wrong log total.
	if err := d.LoadDictionary("../userdict.txt"); err != nil {
		t.Fatalf("Failed to load userdict.txt, err = %s", err)
	}
	want := math.Log(319.0)
	if got := d.LogTotal(); got != want {
		t.Fatalf("Wrong log total for userdict.txt, expect %f, got %f", want, got)
	}
}
// TestAddToken loads ../userdict.txt, adds a token for "超敏C反应蛋白" with
// frequency 100.0, and checks that Frequency then reports 100.0 for it.
func TestAddToken(t *testing.T) {
	// Surface a load failure directly rather than continuing with stale state.
	if err := d.LoadDictionary("../userdict.txt"); err != nil {
		t.Fatalf("Failed to load userdict.txt, err = %s", err)
	}
	token := Token{text: "超敏C反应蛋白", frequency: 100.0, pos: "nz"}
	d.AddToken(token)
	if f, _ := d.Frequency("超敏C反应蛋白"); f != 100.0 {
		t.Fatalf("Failed to add Token \"超敏C反应蛋白\", expect frequency 100.0, got %f", f)
	}
}

View File

@@ -17,3 +17,7 @@ func (t Token) Frequency() float64 {
func (t Token) Pos() string { func (t Token) Pos() string {
return t.pos return t.pos
} }
// NewToken returns a Token whose text, frequency and pos fields are set to
// the given arguments.
func NewToken(text string, frequency float64, pos string) Token {
	var tok Token
	tok.text = text
	tok.frequency = frequency
	tok.pos = pos
	return tok
}

View File

@@ -1,12 +1,12 @@
package tokenizers package jiebago
import ( import (
"fmt" "fmt"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/wangbin/jiebago"
"regexp" "regexp"
"strconv" "strconv"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
) )
const Name = "jieba" const Name = "jieba"
@@ -14,14 +14,15 @@ const Name = "jieba"
var IdeographRegexp = regexp.MustCompile(`\p{Han}+`) var IdeographRegexp = regexp.MustCompile(`\p{Han}+`)
type JiebaTokenizer struct { type JiebaTokenizer struct {
j *jiebago.Jieba seg Segmenter
hmm, searchMode bool hmm, searchMode bool
} }
func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) { func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) {
j, err := jiebago.Open(dictFileName) var seg Segmenter
err := seg.LoadDictionary(dictFileName)
return &JiebaTokenizer{ return &JiebaTokenizer{
j: j, seg: seg,
hmm: hmm, hmm: hmm,
searchMode: searchMode, searchMode: searchMode,
}, err }, err
@@ -35,7 +36,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
pos := 1 pos := 1
var width int var width int
var gram string var gram string
for word := range jt.j.Cut(string(input), jt.hmm) { for word := range jt.seg.Cut(string(input), jt.hmm) {
if jt.searchMode { if jt.searchMode {
runes := []rune(word) runes := []rune(word)
width = len(runes) width = len(runes)
@@ -44,7 +45,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
for i := 0; i < width-step+1; i++ { for i := 0; i < width-step+1; i++ {
gram = string(runes[i : i+step]) gram = string(runes[i : i+step])
gramLen := len(gram) gramLen := len(gram)
if value, ok := jt.j.Freq(gram); ok && value > 0 { if value, ok := jt.seg.dict.Frequency(gram); ok && value > 0 {
gramStart := start + len(string(runes[:i])) gramStart := start + len(string(runes[:i]))
token := analysis.Token{ token := analysis.Token{
Term: []byte(gram), Term: []byte(gram),

View File

@@ -1,9 +1,10 @@
package tokenizers package jiebago
import ( import (
"github.com/blevesearch/bleve/analysis"
"reflect" "reflect"
"testing" "testing"
"github.com/blevesearch/bleve/analysis"
) )
func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) { func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) {
@@ -5218,7 +5219,7 @@ func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) {
}, },
} }
tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, false) tokenizer, _ := NewJiebaTokenizer("dict.txt", true, false)
for _, test := range tests { for _, test := range tests {
actual := tokenizer.Tokenize(test.input) actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) { if !reflect.DeepEqual(actual, test.output) {
@@ -11056,7 +11057,7 @@ func TestJiebaTokenizerSearchModeWithHMM(t *testing.T) {
}, },
} }
tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, true) tokenizer, _ := NewJiebaTokenizer("dict.txt", true, true)
for _, test := range tests { for _, test := range tests {
actual := tokenizer.Tokenize(test.input) actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) { if !reflect.DeepEqual(actual, test.output) {
@@ -16473,7 +16474,7 @@ func TestJiebaTokenizerDefaultModeWithoutHMM(t *testing.T) {
}, },
} }
tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, false) tokenizer, _ := NewJiebaTokenizer("dict.txt", false, false)
for _, test := range tests { for _, test := range tests {
actual := tokenizer.Tokenize(test.input) actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) { if !reflect.DeepEqual(actual, test.output) {
@@ -22505,7 +22506,7 @@ func TestJiebaTokenizerSearchModeWithoutHMM(t *testing.T) {
}, },
} }
tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, true) tokenizer, _ := NewJiebaTokenizer("dict.txt", false, true)
for _, test := range tests { for _, test := range tests {
actual := tokenizer.Tokenize(test.input) actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) { if !reflect.DeepEqual(actual, test.output) {