1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-30 09:00:30 +08:00

优化 dict, add fs.File 支持

This commit is contained in:
源文雨
2022-11-30 14:14:48 +08:00
parent c8785c7994
commit f3da9e6420
22 changed files with 190 additions and 91 deletions

View File

@@ -6,8 +6,8 @@ import (
func Example_extractTags() { func Example_extractTags() {
var t TagExtracter var t TagExtracter
t.LoadDictionary("../dict.txt") t.LoadDictionaryAt("../dict.txt")
t.LoadIdf("idf.txt") t.LoadIdfAt("idf.txt")
sentence := "这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。" sentence := "这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。"
segments := t.ExtractTags(sentence, 5) segments := t.ExtractTags(sentence, 5)
@@ -20,7 +20,7 @@ func Example_extractTags() {
} }
func Example_textRank() { func Example_textRank() {
t, err := NewTextRanker("../dict.txt") t, err := NewTextRankerAt("../dict.txt")
if err != nil { if err != nil {
panic(err) panic(err)
} }

View File

@@ -1,6 +1,7 @@
package analyse package analyse
import ( import (
"io/fs"
"sort" "sort"
"sync" "sync"
@@ -38,8 +39,12 @@ func (i *Idf) Load(tokens ...dictionary.Token) {
i.Unlock() i.Unlock()
} }
func (i *Idf) loadDictionary(fileName string) error { func (i *Idf) loadDictionary(file fs.File) error {
return dictionary.LoadDictionary(i, fileName) return dictionary.LoadDictionary(i, file)
}
func (i *Idf) loadDictionaryAt(fileName string) error {
return dictionary.LoadDictionaryAt(i, fileName)
} }
// Frequency returns the IDF of given word. // Frequency returns the IDF of given word.

View File

@@ -1,6 +1,7 @@
package analyse package analyse
import ( import (
"io/fs"
"sync" "sync"
"github.com/fumiama/jieba/dictionary" "github.com/fumiama/jieba/dictionary"
@@ -82,6 +83,10 @@ func (s *StopWord) Load(tokens ...dictionary.Token) {
s.Unlock() s.Unlock()
} }
func (s *StopWord) loadDictionary(fileName string) error { func (s *StopWord) loadDictionary(file fs.File) error {
return dictionary.LoadDictionary(s, fileName) return dictionary.LoadDictionary(s, file)
}
func (s *StopWord) loadDictionaryAt(file string) error {
return dictionary.LoadDictionaryAt(s, file)
} }

View File

@@ -2,6 +2,7 @@
package analyse package analyse
import ( import (
"io/fs"
"sort" "sort"
"strings" "strings"
"unicode/utf8" "unicode/utf8"
@@ -52,22 +53,41 @@ type TagExtracter struct {
} }
// LoadDictionary reads the given filename and create a new dictionary. // LoadDictionary reads the given filename and create a new dictionary.
func (t *TagExtracter) LoadDictionary(fileName string) error { func (t *TagExtracter) LoadDictionary(file fs.File) error {
t.stopWord = NewStopWord() t.stopWord = NewStopWord()
t.seg = new(jieba.Segmenter) t.seg = new(jieba.Segmenter)
return t.seg.LoadDictionary(fileName) return t.seg.LoadDictionary(file)
}
// LoadDictionaryAt reads the given filename and create a new dictionary.
func (t *TagExtracter) LoadDictionaryAt(fileName string) error {
t.stopWord = NewStopWord()
t.seg = new(jieba.Segmenter)
return t.seg.LoadDictionaryAt(fileName)
} }
// LoadIdf reads the given file and create a new Idf dictionary. // LoadIdf reads the given file and create a new Idf dictionary.
func (t *TagExtracter) LoadIdf(fileName string) error { func (t *TagExtracter) LoadIdf(file fs.File) error {
t.idf = NewIdf() t.idf = NewIdf()
return t.idf.loadDictionary(fileName) return t.idf.loadDictionary(file)
}
// LoadIdfAt reads the given file and create a new Idf dictionary.
func (t *TagExtracter) LoadIdfAt(fileName string) error {
t.idf = NewIdf()
return t.idf.loadDictionaryAt(fileName)
} }
// LoadStopWords reads the given file and create a new StopWord dictionary. // LoadStopWords reads the given file and create a new StopWord dictionary.
func (t *TagExtracter) LoadStopWords(fileName string) error { func (t *TagExtracter) LoadStopWords(file fs.File) error {
t.stopWord = NewStopWord() t.stopWord = NewStopWord()
return t.stopWord.loadDictionary(fileName) return t.stopWord.loadDictionary(file)
}
// LoadStopWordsAt reads the given file and create a new StopWord dictionary.
func (t *TagExtracter) LoadStopWordsAt(file string) error {
t.stopWord = NewStopWord()
return t.stopWord.loadDictionaryAt(file)
} }
// ExtractTags extracts the topK key words from sentence. // ExtractTags extracts the topK key words from sentence.

View File

@@ -256,8 +256,8 @@ var (
func TestExtractTags(t *testing.T) { func TestExtractTags(t *testing.T) {
var te TagExtracter var te TagExtracter
te.LoadDictionary("../dict.txt") te.LoadDictionaryAt("../dict.txt")
te.LoadIdf("idf.txt") te.LoadIdfAt("idf.txt")
for index, sentence := range testContents { for index, sentence := range testContents {
result := te.ExtractTags(sentence, 20) result := te.ExtractTags(sentence, 20)
@@ -274,8 +274,8 @@ func TestExtractTags(t *testing.T) {
func TestExtratTagsWithWeight(t *testing.T) { func TestExtratTagsWithWeight(t *testing.T) {
var te TagExtracter var te TagExtracter
te.LoadDictionary("../dict.txt") te.LoadDictionaryAt("../dict.txt")
te.LoadIdf("idf.txt") te.LoadIdfAt("idf.txt")
result := te.ExtractTags(Lyric, 10) result := te.ExtractTags(Lyric, 10)
for index, tag := range result { for index, tag := range result {
if LyciWeight[index].text != tag.text || if LyciWeight[index].text != tag.text ||
@@ -287,9 +287,9 @@ func TestExtratTagsWithWeight(t *testing.T) {
func TestExtractTagsWithStopWordsFile(t *testing.T) { func TestExtractTagsWithStopWordsFile(t *testing.T) {
var te TagExtracter var te TagExtracter
te.LoadDictionary("../dict.txt") te.LoadDictionaryAt("../dict.txt")
te.LoadIdf("idf.txt") te.LoadIdfAt("idf.txt")
te.LoadStopWords("stop_words.txt") te.LoadStopWordsAt("stop_words.txt")
result := te.ExtractTags(Lyric, 7) result := te.ExtractTags(Lyric, 7)
for index, tag := range result { for index, tag := range result {
if LyciWeight2[index].text != tag.text || if LyciWeight2[index].text != tag.text ||

View File

@@ -2,6 +2,7 @@ package analyse
import ( import (
"hash/crc64" "hash/crc64"
"io/fs"
"math" "math"
"sort" "sort"
@@ -173,7 +174,13 @@ func (t *TextRanker) TextRank(sentence string, topK int) Segments {
type TextRanker posseg.Segmenter type TextRanker posseg.Segmenter
// NewTextRanker reads a given file and create a new dictionary file for Textranker. // NewTextRanker reads a given file and create a new dictionary file for Textranker.
func NewTextRanker(fileName string) (TextRanker, error) { func NewTextRanker(file fs.File) (TextRanker, error) {
seg := posseg.Segmenter{} seg := posseg.Segmenter{}
return TextRanker(seg), seg.LoadDictionary(fileName) return TextRanker(seg), seg.LoadDictionary(file)
}
// NewTextRankerAt reads a given file and create a new dictionary file for Textranker.
func NewTextRankerAt(fileName string) (TextRanker, error) {
seg := posseg.Segmenter{}
return TextRanker(seg), seg.LoadDictionaryAt(fileName)
} }

View File

@@ -23,7 +23,7 @@ var (
) )
func TestTextRank(t *testing.T) { func TestTextRank(t *testing.T) {
tr, err := NewTextRanker("../dict.txt") tr, err := NewTextRankerAt("../dict.txt")
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }

View File

@@ -1,6 +1,7 @@
package jieba package jieba
import ( import (
"io/fs"
"math" "math"
"sync" "sync"
@@ -57,6 +58,10 @@ func (d *Dictionary) Frequency(key string) (float64, bool) {
return freq, ok return freq, ok
} }
func (d *Dictionary) loadDictionary(fileName string) error { func (d *Dictionary) loadDictionary(file fs.File) error {
return dictionary.LoadDictionary(d, fileName) return dictionary.LoadDictionary(d, file)
}
func (d *Dictionary) loadDictionaryAt(file string) error {
return dictionary.LoadDictionaryAt(d, file)
} }

View File

@@ -4,8 +4,8 @@ package dictionary
import ( import (
"bufio" "bufio"
"io/fs"
"os" "os"
"path/filepath"
"strconv" "strconv"
"strings" "strings"
) )
@@ -17,7 +17,7 @@ type DictLoader interface {
AddToken(Token) AddToken(Token)
} }
func loadDictionary(file *os.File) (tokens []Token, err error) { func loadDictionary(file fs.File) (tokens []Token, err error) {
scanner := bufio.NewScanner(file) scanner := bufio.NewScanner(file)
var token Token var token Token
var line string var line string
@@ -45,12 +45,18 @@ func loadDictionary(file *os.File) (tokens []Token, err error) {
} }
// LoadDictionary reads the given file and passes all tokens to a DictLoader. // LoadDictionary reads the given file and passes all tokens to a DictLoader.
func LoadDictionary(dl DictLoader, fileName string) error { func LoadDictionary(dl DictLoader, file fs.File) error {
filePath, err := dictPath(fileName) tokens, err := loadDictionary(file)
if err != nil { if err != nil {
return err return err
} }
dictFile, err := os.Open(filePath) dl.Load(tokens...)
return nil
}
// LoadDictionaryAt reads the given file and passes all tokens to a DictLoader.
func LoadDictionaryAt(dl DictLoader, file string) error {
dictFile, err := os.Open(file)
if err != nil { if err != nil {
return err return err
} }
@@ -62,16 +68,3 @@ func LoadDictionary(dl DictLoader, fileName string) error {
dl.Load(tokens...) dl.Load(tokens...)
return nil return nil
} }
func dictPath(dictFileName string) (string, error) {
if filepath.IsAbs(dictFileName) {
return dictFileName, nil
}
var dictFilePath string
cwd, err := os.Getwd()
if err != nil {
return dictFilePath, err
}
dictFilePath = filepath.Clean(filepath.Join(cwd, dictFileName))
return dictFilePath, nil
}

View File

@@ -33,7 +33,7 @@ func (d *Dict) AddToken(token Token) {
func TestLoadDictionary(t *testing.T) { func TestLoadDictionary(t *testing.T) {
d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)} d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)}
err := LoadDictionary(d, "../userdict.txt") err := LoadDictionaryAt(d, "../userdict.txt")
if err != nil { if err != nil {
t.Fatalf(err.Error()) t.Fatalf(err.Error())
} }
@@ -48,8 +48,8 @@ func TestLoadDictionary(t *testing.T) {
func TestAddToken(t *testing.T) { func TestAddToken(t *testing.T) {
d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)} d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)}
LoadDictionary(d, "../userdict.txt") LoadDictionaryAt(d, "../userdict.txt")
d.AddToken(Token{"好用", 99, "a"}) d.AddToken(Token{99, "好用", "a"})
if d.freqMap["好用"] != 99 { if d.freqMap["好用"] != 99 {
t.Fatalf("Failed to add token, got frequency %f, expected 99", d.freqMap["好用"]) t.Fatalf("Failed to add token, got frequency %f, expected 99", d.freqMap["好用"])
} }

View File

@@ -2,8 +2,8 @@ package dictionary
// Token represents a Chinese word with (optional) frequency and POS. // Token represents a Chinese word with (optional) frequency and POS.
type Token struct { type Token struct {
text string
frequency float64 frequency float64
text string
pos string pos string
} }

View File

@@ -36,7 +36,7 @@ func Example_parallelCut() {
runtime.GOMAXPROCS(numThreads) runtime.GOMAXPROCS(numThreads)
// Load dictionary // Load dictionary
segmenter.LoadDictionary("dict.txt") segmenter.LoadDictionaryAt("dict.txt")
// open file for segmentation // open file for segmentation
file, err := os.Open("README.md") file, err := os.Open("README.md")

View File

@@ -6,7 +6,7 @@ import (
func Example() { func Example() {
var seg Segmenter var seg Segmenter
seg.LoadDictionary("dict.txt") seg.LoadDictionaryAt("dict.txt")
fmt.Print("【全模式】:") fmt.Print("【全模式】:")
fmt.Println(seg.CutAll("我来到北京清华大学")) fmt.Println(seg.CutAll("我来到北京清华大学"))
@@ -28,7 +28,7 @@ func Example() {
func Example_suggestFrequency() { func Example_suggestFrequency() {
var seg Segmenter var seg Segmenter
seg.LoadDictionary("dict.txt") seg.LoadDictionaryAt("dict.txt")
sentence := "超敏C反应蛋白是什么" sentence := "超敏C反应蛋白是什么"
fmt.Print("Before:") fmt.Print("Before:")
@@ -76,13 +76,13 @@ func Example_suggestFrequency() {
func Example_loadUserDictionary() { func Example_loadUserDictionary() {
var seg Segmenter var seg Segmenter
seg.LoadDictionary("dict.txt") seg.LoadDictionaryAt("dict.txt")
sentence := "李小福是创新办主任也是云计算方面的专家" sentence := "李小福是创新办主任也是云计算方面的专家"
fmt.Print("Before:") fmt.Print("Before:")
fmt.Println(seg.Cut(sentence, true)) fmt.Println(seg.Cut(sentence, true))
seg.LoadUserDictionary("userdict.txt") seg.LoadUserDictionaryAt("userdict.txt")
fmt.Print("After:") fmt.Print("After:")
fmt.Println(seg.Cut(sentence, true)) fmt.Println(seg.Cut(sentence, true))

View File

@@ -2,6 +2,7 @@
package jieba package jieba
import ( import (
"io/fs"
"math" "math"
"regexp" "regexp"
"strings" "strings"
@@ -93,16 +94,30 @@ func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
// LoadDictionary loads dictionary from given file name. Everytime // LoadDictionary loads dictionary from given file name. Everytime
// LoadDictionary is called, previously loaded dictionary will be cleard. // LoadDictionary is called, previously loaded dictionary will be cleard.
func (seg *Segmenter) LoadDictionary(fileName string) error { func (seg *Segmenter) LoadDictionary(file fs.File) error {
seg.dict = &Dictionary{freqMap: make(map[string]float64)} seg.dict = &Dictionary{freqMap: make(map[string]float64)}
return seg.dict.loadDictionary(fileName) return seg.dict.loadDictionary(file)
}
// LoadDictionaryAt loads the dictionary from the given file name. Every time
// LoadDictionaryAt is called, the previously loaded dictionary will be cleared.
func (seg *Segmenter) LoadDictionaryAt(file string) error {
seg.dict = &Dictionary{freqMap: make(map[string]float64)}
return seg.dict.loadDictionaryAt(file)
} }
// LoadUserDictionary loads a user specified dictionary, it must be called // LoadUserDictionary loads a user specified dictionary, it must be called
// after LoadDictionary, and it will not clear any previous loaded dictionary, // after LoadDictionary, and it will not clear any previous loaded dictionary,
// instead it will override exist entries. // instead it will override exist entries.
func (seg *Segmenter) LoadUserDictionary(fileName string) error { func (seg *Segmenter) LoadUserDictionary(file fs.File) error {
return seg.dict.loadDictionary(fileName) return seg.dict.loadDictionary(file)
}
// LoadUserDictionaryAt loads a user specified dictionary, it must be called
// after LoadDictionary, and it will not clear any previous loaded dictionary,
// instead it will override exist entries.
func (seg *Segmenter) LoadUserDictionaryAt(file string) error {
return seg.dict.loadDictionaryAt(file)
} }
func (seg *Segmenter) dag(runes []rune) map[int][]int { func (seg *Segmenter) dag(runes []rune) map[int][]int {

View File

@@ -616,7 +616,7 @@ var (
) )
func init() { func init() {
seg.LoadDictionary("dict.txt") seg.LoadDictionaryAt("dict.txt")
} }
func TestCutDAG(t *testing.T) { func TestCutDAG(t *testing.T) {
@@ -715,7 +715,7 @@ func TestCutForSearch(t *testing.T) {
func TestLoadDictionary(t *testing.T) { func TestLoadDictionary(t *testing.T) {
var result []string var result []string
seg.LoadDictionary("foobar.txt") seg.LoadDictionaryAt("foobar.txt")
for index, content := range testContents { for index, content := range testContents {
result = seg.Cut(content, true) result = seg.Cut(content, true)
if len(result) != len(userDictCutResult[index]) { if len(result) != len(userDictCutResult[index]) {
@@ -728,11 +728,11 @@ func TestLoadDictionary(t *testing.T) {
} }
} }
} }
seg.LoadDictionary("dict.txt") seg.LoadDictionaryAt("dict.txt")
} }
func TestLoadUserDictionary(t *testing.T) { func TestLoadUserDictionary(t *testing.T) {
seg.LoadUserDictionary("userdict.txt") seg.LoadUserDictionaryAt("userdict.txt")
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型" sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型"
result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", "", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"} result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", "", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"}
@@ -771,7 +771,7 @@ func TestLoadUserDictionary(t *testing.T) {
t.Fatal(word) t.Fatal(word)
} }
} }
seg.LoadDictionary("dict.txt") seg.LoadDictionaryAt("dict.txt")
} }
func BenchmarkCutNoHMM(b *testing.B) { func BenchmarkCutNoHMM(b *testing.B) {

View File

@@ -1,6 +1,7 @@
package posseg package posseg
import ( import (
"io/fs"
"math" "math"
"sync" "sync"
@@ -69,6 +70,10 @@ func (d *Dictionary) Pos(key string) (string, bool) {
return pos, ok return pos, ok
} }
func (d *Dictionary) loadDictionary(fileName string) error { func (d *Dictionary) loadDictionary(file fs.File) error {
return dictionary.LoadDictionary(d, fileName) return dictionary.LoadDictionary(d, file)
}
func (d *Dictionary) loadDictionaryAt(file string) error {
return dictionary.LoadDictionaryAt(d, file)
} }

View File

@@ -8,7 +8,7 @@ import (
func Example() { func Example() {
var seg posseg.Segmenter var seg posseg.Segmenter
seg.LoadDictionary("../dict.txt") seg.LoadDictionaryAt("../dict.txt")
for segment := range seg.Cut("我爱北京天安门", true) { for segment := range seg.Cut("我爱北京天安门", true) {
fmt.Printf("%s %s\n", segment.Text(), segment.Pos()) fmt.Printf("%s %s\n", segment.Text(), segment.Pos())

View File

@@ -2,6 +2,7 @@
package posseg package posseg
import ( import (
"io/fs"
"math" "math"
"regexp" "regexp"
@@ -39,17 +40,31 @@ type Segmenter struct {
} }
// LoadDictionary loads dictionary from given file name. // LoadDictionary loads dictionary from given file name.
// Everytime LoadDictionary is called, previously loaded dictionary will be cleard. // Every time LoadDictionary is called, the previously loaded dictionary will be cleared.
func (seg *Segmenter) LoadDictionary(fileName string) error { func (seg *Segmenter) LoadDictionary(file fs.File) error {
seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)} seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
return seg.dict.loadDictionary(fileName) return seg.dict.loadDictionary(file)
}
// LoadDictionaryAt loads the dictionary from the given file name.
// Every time LoadDictionaryAt is called, the previously loaded dictionary will be cleared.
func (seg *Segmenter) LoadDictionaryAt(fileName string) error {
seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
return seg.dict.loadDictionaryAt(fileName)
} }
// LoadUserDictionary loads a user specified dictionary, it must be called // LoadUserDictionary loads a user specified dictionary, it must be called
// after LoadDictionary, and it will not clear any previous loaded dictionary, // after LoadDictionary, and it will not clear any previous loaded dictionary,
// instead it will override exist entries. // instead it will override exist entries.
func (seg *Segmenter) LoadUserDictionary(fileName string) error { func (seg *Segmenter) LoadUserDictionary(file fs.File) error {
return seg.dict.loadDictionary(fileName) return seg.dict.loadDictionary(file)
}
// LoadUserDictionaryAt loads a user specified dictionary, it must be called
// after LoadDictionary, and it will not clear any previous loaded dictionary,
// instead it will override exist entries.
func (seg *Segmenter) LoadUserDictionaryAt(fileName string) error {
return seg.dict.loadDictionaryAt(fileName)
} }
func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment { func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment {

View File

@@ -269,7 +269,7 @@ var (
) )
func init() { func init() {
seg.LoadDictionary("../dict.txt") seg.LoadDictionaryAt("../dict.txt")
} }
func chanToArray(ch <-chan Segment) []Segment { func chanToArray(ch <-chan Segment) []Segment {
@@ -357,8 +357,8 @@ func TestBug137(t *testing.T) {
} }
func TestUserDict(t *testing.T) { func TestUserDict(t *testing.T) {
seg.LoadUserDictionary("../userdict.txt") seg.LoadUserDictionaryAt("../userdict.txt")
defer seg.LoadDictionary("../dict.txt") defer seg.LoadDictionaryAt("../dict.txt")
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型" sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型"
cutResult := []Segment{ cutResult := []Segment{

View File

@@ -10,7 +10,7 @@ func Example() {
sentence := []byte("永和服装饰品有限公司") sentence := []byte("永和服装饰品有限公司")
// default mode // default mode
tokenizer, _ := tokenizers.NewJiebaTokenizer("../dict.txt", true, false) tokenizer, _ := tokenizers.NewJiebaTokenizerAt("../dict.txt", true, false)
fmt.Println("Default Mode:") fmt.Println("Default Mode:")
for _, token := range tokenizer.Tokenize(sentence) { for _, token := range tokenizer.Tokenize(sentence) {
fmt.Printf( fmt.Printf(
@@ -19,7 +19,7 @@ func Example() {
} }
//search mode //search mode
tokenizer, _ = tokenizers.NewJiebaTokenizer("../dict.txt", true, true) tokenizer, _ = tokenizers.NewJiebaTokenizerAt("../dict.txt", true, true)
fmt.Println("Search Mode:") fmt.Println("Search Mode:")
for _, token := range tokenizer.Tokenize(sentence) { for _, token := range tokenizer.Tokenize(sentence) {
fmt.Printf( fmt.Printf(

View File

@@ -1,7 +1,7 @@
package tokenizers package tokenizers
import ( import (
"fmt" "io/fs"
"regexp" "regexp"
"strconv" "strconv"
@@ -24,6 +24,36 @@ type JiebaTokenizer struct {
/* /*
NewJiebaTokenizer creates a new JiebaTokenizer. NewJiebaTokenizer creates a new JiebaTokenizer.
Parameters:
dictFile: the dictionary file.
hmm: whether to use the Hidden Markov Model to cut unknown words,
i.e. words not found in the dictionary. For example, the word "安卓" (which means
"Android" in English) is not in the dictionary file. If hmm is set to false, it will be
cut into two single words "安" and "卓"; if hmm is set to true, it will
be treated as one single word, because Jieba uses the Hidden Markov Model with
the Viterbi algorithm to guess the best possibility.
searchMode: whether to further cut long words into several short words.
In Chinese, some long words may contain other words. For example, "交换机"
is a Chinese word for "Switcher"; if searchMode is false, it will treat
"交换机" as a single word. If searchMode is true, it will further split
this word into "交换" and "换机", which are valid Chinese words.
*/
func NewJiebaTokenizer(dictFile fs.File, hmm, searchMode bool) (analysis.Tokenizer, error) {
var seg jieba.Segmenter
err := seg.LoadDictionary(dictFile)
return &JiebaTokenizer{
seg: seg,
hmm: hmm,
searchMode: searchMode,
}, err
}
/*
NewJiebaTokenizerAt creates a new JiebaTokenizer.
Parameters: Parameters:
dictFilePath: path of the dictionary file. dictFilePath: path of the dictionary file.
@@ -41,9 +71,9 @@ Parameters:
"交换机" as a single word. If searchMode is true, it will further split "交换机" as a single word. If searchMode is true, it will further split
this word into "交换", "换机", which are valid Chinese words. this word into "交换", "换机", which are valid Chinese words.
*/ */
func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) { func NewJiebaTokenizerAt(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
var seg jieba.Segmenter var seg jieba.Segmenter
err := seg.LoadDictionary(dictFilePath) err := seg.LoadDictionaryAt(dictFilePath)
return &JiebaTokenizer{ return &JiebaTokenizer{
seg: seg, seg: seg,
hmm: hmm, hmm: hmm,
@@ -107,18 +137,13 @@ JiebaTokenizerConstructor creates a JiebaTokenizer.
Parameter config should contains at least one parameter: Parameter config should contains at least one parameter:
file: the path of the dictionary file. file: the path of the dictionary file or fs.File.
hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details. hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
search: optional, specify whether to use search mode, see NewJiebaTokenizer for details. search: optional, specify whether to use search mode, see NewJiebaTokenizer for details.
*/ */
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) ( func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
analysis.Tokenizer, error) {
dictFilePath, ok := config["file"].(string)
if !ok {
return nil, fmt.Errorf("must specify dictionary file path")
}
hmm, ok := config["hmm"].(bool) hmm, ok := config["hmm"].(bool)
if !ok { if !ok {
hmm = true hmm = true
@@ -127,8 +152,12 @@ func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Ca
if !ok { if !ok {
searchMode = true searchMode = true
} }
dictFilePath, ok := config["file"].(string)
return NewJiebaTokenizer(dictFilePath, hmm, searchMode) if ok {
return NewJiebaTokenizerAt(dictFilePath, hmm, searchMode)
}
dictFile := config["file"].(fs.File)
return NewJiebaTokenizer(dictFile, hmm, searchMode)
} }
func detectTokenType(term string) analysis.TokenType { func detectTokenType(term string) analysis.TokenType {

View File

@@ -5219,7 +5219,7 @@ func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) {
}, },
} }
tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, false) tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", true, false)
for _, test := range tests { for _, test := range tests {
actual := tokenizer.Tokenize(test.input) actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) { if !reflect.DeepEqual(actual, test.output) {
@@ -11057,7 +11057,7 @@ func TestJiebaTokenizerSearchModeWithHMM(t *testing.T) {
}, },
} }
tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, true) tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", true, true)
for _, test := range tests { for _, test := range tests {
actual := tokenizer.Tokenize(test.input) actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) { if !reflect.DeepEqual(actual, test.output) {
@@ -16474,7 +16474,7 @@ func TestJiebaTokenizerDefaultModeWithoutHMM(t *testing.T) {
}, },
} }
tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, false) tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", false, false)
for _, test := range tests { for _, test := range tests {
actual := tokenizer.Tokenize(test.input) actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) { if !reflect.DeepEqual(actual, test.output) {
@@ -22506,7 +22506,7 @@ func TestJiebaTokenizerSearchModeWithoutHMM(t *testing.T) {
}, },
} }
tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, true) tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", false, true)
for _, test := range tests { for _, test := range tests {
actual := tokenizer.Tokenize(test.input) actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) { if !reflect.DeepEqual(actual, test.output) {