mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-30 00:50:30 +08:00
move tokenizer.go to top directory
This commit is contained in:
@@ -1,65 +0,0 @@
|
|||||||
package dictionary
|
|
||||||
|
|
||||||
import (
|
|
||||||
"math"
|
|
||||||
"testing"
|
|
||||||
)
|
|
||||||
|
|
||||||
var d *Dictionary
|
|
||||||
|
|
||||||
func init() {
|
|
||||||
d = New()
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestLoadDictionary(t *testing.T) {
|
|
||||||
if err := d.LoadDictionary("../dict.txt"); err != nil {
|
|
||||||
t.Fatalf("Failed to load dict.txt, err = %s", err)
|
|
||||||
}
|
|
||||||
n := len(d.freqMap)
|
|
||||||
|
|
||||||
d.LoadDictionary("../foobar.txt")
|
|
||||||
if len(d.freqMap) == n {
|
|
||||||
t.Fatalf("Failed to load foobar.txt")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestLoadUserDictionary(t *testing.T) {
|
|
||||||
err := d.LoadUserDictionary("../userdict.txt")
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("Failed to load userdict.txt, err = %s", err)
|
|
||||||
}
|
|
||||||
if _, ok := d.Frequency("八一双鹿"); !ok {
|
|
||||||
t.Fatalf("Failed to load userdict.txt, no frequency for word \"八一双鹿\"")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestFrequency(t *testing.T) {
|
|
||||||
d.LoadUserDictionary("../userdict.txt")
|
|
||||||
if f, _ := d.Frequency("八一双鹿"); f != 3.0 {
|
|
||||||
t.Fatalf("Wrong frequency for word \"八一双鹿\", expect 3.0, got %f", f)
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestTotal(t *testing.T) {
|
|
||||||
d.LoadDictionary("../userdict.txt")
|
|
||||||
if d.Total() != 319.0 {
|
|
||||||
t.Fatalf("Wrong total for userdict.txt, expect 319.0, got %f", d.Total())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestLogTotal(t *testing.T) {
|
|
||||||
d.LoadDictionary("../userdict.txt")
|
|
||||||
if d.LogTotal() != math.Log(319.0) {
|
|
||||||
t.Fatalf("Wrong total for userdict.txt, expect %f, got %f", math.Log(319.0), d.LogTotal())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestAddToken(t *testing.T) {
|
|
||||||
d.LoadDictionary("../userdict.txt")
|
|
||||||
token := Token{text: "超敏C反应蛋白", frequency: 100.0, pos: "nz"}
|
|
||||||
d.AddToken(token)
|
|
||||||
if f, _ := d.Frequency("超敏C反应蛋白"); f != 100.0 {
|
|
||||||
t.Fatalf("Failed to add Token \"超敏C反应蛋白\", except frequency 100.0, got %f", f)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -17,3 +17,7 @@ func (t Token) Frequency() float64 {
|
|||||||
func (t Token) Pos() string {
|
func (t Token) Pos() string {
|
||||||
return t.pos
|
return t.pos
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func NewToken(text string, frequency float64, pos string) Token {
|
||||||
|
return Token{text: text, frequency: frequency, pos: pos}
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,12 +1,12 @@
|
|||||||
package tokenizers
|
package jiebago
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"github.com/blevesearch/bleve/analysis"
|
|
||||||
"github.com/blevesearch/bleve/registry"
|
|
||||||
"github.com/wangbin/jiebago"
|
|
||||||
"regexp"
|
"regexp"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
)
|
)
|
||||||
|
|
||||||
const Name = "jieba"
|
const Name = "jieba"
|
||||||
@@ -14,14 +14,15 @@ const Name = "jieba"
|
|||||||
var IdeographRegexp = regexp.MustCompile(`\p{Han}+`)
|
var IdeographRegexp = regexp.MustCompile(`\p{Han}+`)
|
||||||
|
|
||||||
type JiebaTokenizer struct {
|
type JiebaTokenizer struct {
|
||||||
j *jiebago.Jieba
|
seg Segmenter
|
||||||
hmm, searchMode bool
|
hmm, searchMode bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||||
j, err := jiebago.Open(dictFileName)
|
var seg Segmenter
|
||||||
|
err := seg.LoadDictionary(dictFileName)
|
||||||
return &JiebaTokenizer{
|
return &JiebaTokenizer{
|
||||||
j: j,
|
seg: seg,
|
||||||
hmm: hmm,
|
hmm: hmm,
|
||||||
searchMode: searchMode,
|
searchMode: searchMode,
|
||||||
}, err
|
}, err
|
||||||
@@ -35,7 +36,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
|||||||
pos := 1
|
pos := 1
|
||||||
var width int
|
var width int
|
||||||
var gram string
|
var gram string
|
||||||
for word := range jt.j.Cut(string(input), jt.hmm) {
|
for word := range jt.seg.Cut(string(input), jt.hmm) {
|
||||||
if jt.searchMode {
|
if jt.searchMode {
|
||||||
runes := []rune(word)
|
runes := []rune(word)
|
||||||
width = len(runes)
|
width = len(runes)
|
||||||
@@ -44,7 +45,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
|||||||
for i := 0; i < width-step+1; i++ {
|
for i := 0; i < width-step+1; i++ {
|
||||||
gram = string(runes[i : i+step])
|
gram = string(runes[i : i+step])
|
||||||
gramLen := len(gram)
|
gramLen := len(gram)
|
||||||
if value, ok := jt.j.Freq(gram); ok && value > 0 {
|
if value, ok := jt.seg.dict.Frequency(gram); ok && value > 0 {
|
||||||
gramStart := start + len(string(runes[:i]))
|
gramStart := start + len(string(runes[:i]))
|
||||||
token := analysis.Token{
|
token := analysis.Token{
|
||||||
Term: []byte(gram),
|
Term: []byte(gram),
|
||||||
@@ -1,9 +1,10 @@
|
|||||||
package tokenizers
|
package jiebago
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"github.com/blevesearch/bleve/analysis"
|
|
||||||
"reflect"
|
"reflect"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) {
|
func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) {
|
||||||
@@ -5218,7 +5219,7 @@ func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, false)
|
tokenizer, _ := NewJiebaTokenizer("dict.txt", true, false)
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
actual := tokenizer.Tokenize(test.input)
|
actual := tokenizer.Tokenize(test.input)
|
||||||
if !reflect.DeepEqual(actual, test.output) {
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
@@ -11056,7 +11057,7 @@ func TestJiebaTokenizerSearchModeWithHMM(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, true)
|
tokenizer, _ := NewJiebaTokenizer("dict.txt", true, true)
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
actual := tokenizer.Tokenize(test.input)
|
actual := tokenizer.Tokenize(test.input)
|
||||||
if !reflect.DeepEqual(actual, test.output) {
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
@@ -16473,7 +16474,7 @@ func TestJiebaTokenizerDefaultModeWithoutHMM(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, false)
|
tokenizer, _ := NewJiebaTokenizer("dict.txt", false, false)
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
actual := tokenizer.Tokenize(test.input)
|
actual := tokenizer.Tokenize(test.input)
|
||||||
if !reflect.DeepEqual(actual, test.output) {
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
@@ -22505,7 +22506,7 @@ func TestJiebaTokenizerSearchModeWithoutHMM(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, true)
|
tokenizer, _ := NewJiebaTokenizer("dict.txt", false, true)
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
actual := tokenizer.Tokenize(test.input)
|
actual := tokenizer.Tokenize(test.input)
|
||||||
if !reflect.DeepEqual(actual, test.output) {
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
Reference in New Issue
Block a user