diff --git a/analyse/example_test.go b/analyse/example_test.go index d35d893..3394964 100755 --- a/analyse/example_test.go +++ b/analyse/example_test.go @@ -6,8 +6,8 @@ import ( func Example_extractTags() { var t TagExtracter - t.LoadDictionary("../dict.txt") - t.LoadIdf("idf.txt") + t.LoadDictionaryAt("../dict.txt") + t.LoadIdfAt("idf.txt") sentence := "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。" segments := t.ExtractTags(sentence, 5) @@ -20,7 +20,7 @@ func Example_extractTags() { } func Example_textRank() { - t, err := NewTextRanker("../dict.txt") + t, err := NewTextRankerAt("../dict.txt") if err != nil { panic(err) } diff --git a/analyse/idf.go b/analyse/idf.go index 10d0c98..fdbdac8 100755 --- a/analyse/idf.go +++ b/analyse/idf.go @@ -1,6 +1,7 @@ package analyse import ( + "io/fs" "sort" "sync" @@ -38,8 +39,12 @@ func (i *Idf) Load(tokens ...dictionary.Token) { i.Unlock() } -func (i *Idf) loadDictionary(fileName string) error { - return dictionary.LoadDictionary(i, fileName) +func (i *Idf) loadDictionary(file fs.File) error { + return dictionary.LoadDictionary(i, file) +} + +func (i *Idf) loadDictionaryAt(fileName string) error { + return dictionary.LoadDictionaryAt(i, fileName) } // Frequency returns the IDF of given word. diff --git a/analyse/stopwords.go b/analyse/stopwords.go index 0703e86..5a39391 100755 --- a/analyse/stopwords.go +++ b/analyse/stopwords.go @@ -1,6 +1,7 @@ package analyse import ( + "io/fs" "sync" "github.com/fumiama/jieba/dictionary" @@ -82,6 +83,10 @@ func (s *StopWord) Load(tokens ...dictionary.Token) { s.Unlock() } -func (s *StopWord) loadDictionary(fileName string) error { - return dictionary.LoadDictionary(s, fileName) +func (s *StopWord) loadDictionary(file fs.File) error { + return dictionary.LoadDictionary(s, file) +} + +func (s *StopWord) loadDictionaryAt(file string) error { + return dictionary.LoadDictionaryAt(s, file) } diff --git a/analyse/tag_extracker.go b/analyse/tag_extracker.go index 0a8dde5..ee938a6 100755 --- a/analyse/tag_extracker.go +++ b/analyse/tag_extracker.go @@ -2,6 +2,7 @@ package analyse import ( + "io/fs" "sort" "strings" "unicode/utf8" @@ -52,22 +53,41 @@ type TagExtracter struct { } // LoadDictionary reads the given filename and create a new dictionary. -func (t *TagExtracter) LoadDictionary(fileName string) error { +func (t *TagExtracter) LoadDictionary(file fs.File) error { t.stopWord = NewStopWord() t.seg = new(jieba.Segmenter) - return t.seg.LoadDictionary(fileName) + return t.seg.LoadDictionary(file) +} + +// LoadDictionaryAt reads the given filename and create a new dictionary. +func (t *TagExtracter) LoadDictionaryAt(fileName string) error { + t.stopWord = NewStopWord() + t.seg = new(jieba.Segmenter) + return t.seg.LoadDictionaryAt(fileName) } // LoadIdf reads the given file and create a new Idf dictionary. -func (t *TagExtracter) LoadIdf(fileName string) error { +func (t *TagExtracter) LoadIdf(file fs.File) error { t.idf = NewIdf() - return t.idf.loadDictionary(fileName) + return t.idf.loadDictionary(file) +} + +// LoadIdfAt reads the given file and create a new Idf dictionary. +func (t *TagExtracter) LoadIdfAt(fileName string) error { + t.idf = NewIdf() + return t.idf.loadDictionaryAt(fileName) } // LoadStopWords reads the given file and create a new StopWord dictionary. -func (t *TagExtracter) LoadStopWords(fileName string) error { +func (t *TagExtracter) LoadStopWords(file fs.File) error { t.stopWord = NewStopWord() - return t.stopWord.loadDictionary(fileName) + return t.stopWord.loadDictionary(file) +} + +// LoadStopWordsAt reads the given file and create a new StopWord dictionary. +func (t *TagExtracter) LoadStopWordsAt(file string) error { + t.stopWord = NewStopWord() + return t.stopWord.loadDictionaryAt(file) } // ExtractTags extracts the topK key words from sentence. diff --git a/analyse/tag_extracker_test.go b/analyse/tag_extracker_test.go index 24f80f0..bdb9b68 100755 --- a/analyse/tag_extracker_test.go +++ b/analyse/tag_extracker_test.go @@ -256,8 +256,8 @@ var ( func TestExtractTags(t *testing.T) { var te TagExtracter - te.LoadDictionary("../dict.txt") - te.LoadIdf("idf.txt") + te.LoadDictionaryAt("../dict.txt") + te.LoadIdfAt("idf.txt") for index, sentence := range testContents { result := te.ExtractTags(sentence, 20) @@ -274,8 +274,8 @@ func TestExtractTags(t *testing.T) { func TestExtratTagsWithWeight(t *testing.T) { var te TagExtracter - te.LoadDictionary("../dict.txt") - te.LoadIdf("idf.txt") + te.LoadDictionaryAt("../dict.txt") + te.LoadIdfAt("idf.txt") result := te.ExtractTags(Lyric, 10) for index, tag := range result { if LyciWeight[index].text != tag.text || @@ -287,9 +287,9 @@ func TestExtratTagsWithWeight(t *testing.T) { func TestExtractTagsWithStopWordsFile(t *testing.T) { var te TagExtracter - te.LoadDictionary("../dict.txt") - te.LoadIdf("idf.txt") - te.LoadStopWords("stop_words.txt") + te.LoadDictionaryAt("../dict.txt") + te.LoadIdfAt("idf.txt") + te.LoadStopWordsAt("stop_words.txt") result := te.ExtractTags(Lyric, 7) for index, tag := range result { if LyciWeight2[index].text != tag.text || diff --git a/analyse/textrank.go b/analyse/textrank.go index 3db0cdf..7e2cdab 100755 --- a/analyse/textrank.go +++ b/analyse/textrank.go @@ -2,6 +2,7 @@ package analyse import ( "hash/crc64" + "io/fs" "math" "sort" @@ -173,7 +174,13 @@ func (t *TextRanker) TextRank(sentence string, topK int) Segments { type TextRanker posseg.Segmenter // NewTextRanker reads a given file and create a new dictionary file for Textranker. -func NewTextRanker(fileName string) (TextRanker, error) { +func NewTextRanker(file fs.File) (TextRanker, error) { seg := posseg.Segmenter{} - return TextRanker(seg), seg.LoadDictionary(fileName) + return TextRanker(seg), seg.LoadDictionary(file) +} + +// NewTextRankerAt reads a given file and create a new dictionary file for Textranker. +func NewTextRankerAt(fileName string) (TextRanker, error) { + seg := posseg.Segmenter{} + return TextRanker(seg), seg.LoadDictionaryAt(fileName) } diff --git a/analyse/textrank_test.go b/analyse/textrank_test.go index 94176d1..6172c6d 100755 --- a/analyse/textrank_test.go +++ b/analyse/textrank_test.go @@ -23,7 +23,7 @@ var ( ) func TestTextRank(t *testing.T) { - tr, err := NewTextRanker("../dict.txt") + tr, err := NewTextRankerAt("../dict.txt") if err != nil { t.Fatal(err) } diff --git a/dictionary.go b/dictionary.go index 6f11919..ff7e6ed 100755 --- a/dictionary.go +++ b/dictionary.go @@ -1,6 +1,7 @@ package jieba import ( + "io/fs" "math" "sync" @@ -57,6 +58,10 @@ func (d *Dictionary) Frequency(key string) (float64, bool) { return freq, ok } -func (d *Dictionary) loadDictionary(fileName string) error { - return dictionary.LoadDictionary(d, fileName) +func (d *Dictionary) loadDictionary(file fs.File) error { + return dictionary.LoadDictionary(d, file) +} + +func (d *Dictionary) loadDictionaryAt(file string) error { + return dictionary.LoadDictionaryAt(d, file) } diff --git a/dictionary/dictionary.go b/dictionary/dictionary.go index e98e1fe..4f96183 100755 --- a/dictionary/dictionary.go +++ b/dictionary/dictionary.go @@ -4,8 +4,8 @@ package dictionary import ( "bufio" + "io/fs" "os" - "path/filepath" "strconv" "strings" ) @@ -17,7 +17,7 @@ type DictLoader interface { AddToken(Token) } -func loadDictionary(file *os.File) (tokens []Token, err error) { +func loadDictionary(file fs.File) (tokens []Token, err error) { scanner := bufio.NewScanner(file) var token Token var line string @@ -45,12 +45,18 @@ func loadDictionary(file *os.File) (tokens []Token, err error) { } // LoadDictionary reads the given file and passes all tokens to a DictLoader. -func LoadDictionary(dl DictLoader, fileName string) error { - filePath, err := dictPath(fileName) +func LoadDictionary(dl DictLoader, file fs.File) error { + tokens, err := loadDictionary(file) if err != nil { return err } - dictFile, err := os.Open(filePath) + dl.Load(tokens...) + return nil +} + +// LoadDictionaryAt reads the given file and passes all tokens to a DictLoader. +func LoadDictionaryAt(dl DictLoader, file string) error { + dictFile, err := os.Open(file) if err != nil { return err } @@ -62,16 +68,3 @@ func LoadDictionary(dl DictLoader, fileName string) error { dl.Load(tokens...) return nil } - -func dictPath(dictFileName string) (string, error) { - if filepath.IsAbs(dictFileName) { - return dictFileName, nil - } - var dictFilePath string - cwd, err := os.Getwd() - if err != nil { - return dictFilePath, err - } - dictFilePath = filepath.Clean(filepath.Join(cwd, dictFileName)) - return dictFilePath, nil -} diff --git a/dictionary/dictionary_test.go b/dictionary/dictionary_test.go index d429d2d..263fb66 100755 --- a/dictionary/dictionary_test.go +++ b/dictionary/dictionary_test.go @@ -33,7 +33,7 @@ func (d *Dict) AddToken(token Token) { func TestLoadDictionary(t *testing.T) { d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)} - err := LoadDictionary(d, "../userdict.txt") + err := LoadDictionaryAt(d, "../userdict.txt") if err != nil { t.Fatalf(err.Error()) } @@ -48,8 +48,8 @@ func TestLoadDictionary(t *testing.T) { func TestAddToken(t *testing.T) { d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)} - LoadDictionary(d, "../userdict.txt") - d.AddToken(Token{"好用", 99, "a"}) + LoadDictionaryAt(d, "../userdict.txt") + d.AddToken(Token{99, "好用", "a"}) if d.freqMap["好用"] != 99 { t.Fatalf("Failed to add token, got frequency %f, expected 99", d.freqMap["好用"]) } diff --git a/dictionary/token.go b/dictionary/token.go index 6bebae1..17d574e 100755 --- a/dictionary/token.go +++ b/dictionary/token.go @@ -2,12 +2,12 @@ package dictionary // Token represents a Chinese word with (optional) frequency and POS. type Token struct { - text string frequency float64 + text string pos string } -//Text returns token's text. +// Text returns token's text. func (t Token) Text() string { return t.text } diff --git a/example_parallel_cut_test.go b/example_parallel_cut_test.go index 373815c..56f4822 100755 --- a/example_parallel_cut_test.go +++ b/example_parallel_cut_test.go @@ -36,7 +36,7 @@ func Example_parallelCut() { runtime.GOMAXPROCS(numThreads) // Load dictionary - segmenter.LoadDictionary("dict.txt") + segmenter.LoadDictionaryAt("dict.txt") // open file for segmentation file, err := os.Open("README.md") diff --git a/example_test.go b/example_test.go index 7332f88..569e58e 100755 --- a/example_test.go +++ b/example_test.go @@ -6,7 +6,7 @@ import ( func Example() { var seg Segmenter - seg.LoadDictionary("dict.txt") + seg.LoadDictionaryAt("dict.txt") fmt.Print("【全模式】:") fmt.Println(seg.CutAll("我来到北京清华大学")) @@ -28,7 +28,7 @@ func Example() { func Example_suggestFrequency() { var seg Segmenter - seg.LoadDictionary("dict.txt") + seg.LoadDictionaryAt("dict.txt") sentence := "超敏C反应蛋白是什么?" fmt.Print("Before:") @@ -76,13 +76,13 @@ func Example_suggestFrequency() { func Example_loadUserDictionary() { var seg Segmenter - seg.LoadDictionary("dict.txt") + seg.LoadDictionaryAt("dict.txt") sentence := "李小福是创新办主任也是云计算方面的专家" fmt.Print("Before:") fmt.Println(seg.Cut(sentence, true)) - seg.LoadUserDictionary("userdict.txt") + seg.LoadUserDictionaryAt("userdict.txt") fmt.Print("After:") fmt.Println(seg.Cut(sentence, true)) diff --git a/jieba.go b/jieba.go index f1484c3..408ddf8 100755 --- a/jieba.go +++ b/jieba.go @@ -2,6 +2,7 @@ package jieba import ( + "io/fs" "math" "regexp" "strings" @@ -93,16 +94,30 @@ func (seg *Segmenter) SuggestFrequency(words ...string) float64 { // LoadDictionary loads dictionary from given file name. Everytime // LoadDictionary is called, previously loaded dictionary will be cleard. -func (seg *Segmenter) LoadDictionary(fileName string) error { +func (seg *Segmenter) LoadDictionary(file fs.File) error { seg.dict = &Dictionary{freqMap: make(map[string]float64)} - return seg.dict.loadDictionary(fileName) + return seg.dict.loadDictionary(file) +} + +// LoadDictionaryAt loads dictionary from given file name. Everytime +// LoadDictionaryAt is called, previously loaded dictionary will be cleard. +func (seg *Segmenter) LoadDictionaryAt(file string) error { + seg.dict = &Dictionary{freqMap: make(map[string]float64)} + return seg.dict.loadDictionaryAt(file) } // LoadUserDictionary loads a user specified dictionary, it must be called // after LoadDictionary, and it will not clear any previous loaded dictionary, // instead it will override exist entries. -func (seg *Segmenter) LoadUserDictionary(fileName string) error { - return seg.dict.loadDictionary(fileName) +func (seg *Segmenter) LoadUserDictionary(file fs.File) error { + return seg.dict.loadDictionary(file) +} + +// LoadUserDictionaryAt loads a user specified dictionary, it must be called +// after LoadDictionary, and it will not clear any previous loaded dictionary, +// instead it will override exist entries. +func (seg *Segmenter) LoadUserDictionaryAt(file string) error { + return seg.dict.loadDictionaryAt(file) } func (seg *Segmenter) dag(runes []rune) map[int][]int { diff --git a/jieba_test.go b/jieba_test.go index 206188c..de1ebe0 100755 --- a/jieba_test.go +++ b/jieba_test.go @@ -616,7 +616,7 @@ var ( ) func init() { - seg.LoadDictionary("dict.txt") + seg.LoadDictionaryAt("dict.txt") } func TestCutDAG(t *testing.T) { @@ -715,7 +715,7 @@ func TestCutForSearch(t *testing.T) { func TestLoadDictionary(t *testing.T) { var result []string - seg.LoadDictionary("foobar.txt") + seg.LoadDictionaryAt("foobar.txt") for index, content := range testContents { result = seg.Cut(content, true) if len(result) != len(userDictCutResult[index]) { @@ -728,11 +728,11 @@ func TestLoadDictionary(t *testing.T) { } } } - seg.LoadDictionary("dict.txt") + seg.LoadDictionaryAt("dict.txt") } func TestLoadUserDictionary(t *testing.T) { - seg.LoadUserDictionary("userdict.txt") + seg.LoadUserDictionaryAt("userdict.txt") sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型" result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", ",", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"} @@ -771,7 +771,7 @@ func TestLoadUserDictionary(t *testing.T) { t.Fatal(word) } } - seg.LoadDictionary("dict.txt") + seg.LoadDictionaryAt("dict.txt") } func BenchmarkCutNoHMM(b *testing.B) { diff --git a/posseg/dictionary.go b/posseg/dictionary.go index 4200132..8adcf3a 100755 --- a/posseg/dictionary.go +++ b/posseg/dictionary.go @@ -1,6 +1,7 @@ package posseg import ( + "io/fs" "math" "sync" @@ -69,6 +70,10 @@ func (d *Dictionary) Pos(key string) (string, bool) { return pos, ok } -func (d *Dictionary) loadDictionary(fileName string) error { - return dictionary.LoadDictionary(d, fileName) +func (d *Dictionary) loadDictionary(file fs.File) error { + return dictionary.LoadDictionary(d, file) +} + +func (d *Dictionary) loadDictionaryAt(file string) error { + return dictionary.LoadDictionaryAt(d, file) } diff --git a/posseg/example_test.go b/posseg/example_test.go index d1ee2bd..8ac1d84 100755 --- a/posseg/example_test.go +++ b/posseg/example_test.go @@ -8,7 +8,7 @@ import ( func Example() { var seg posseg.Segmenter - seg.LoadDictionary("../dict.txt") + seg.LoadDictionaryAt("../dict.txt") for segment := range seg.Cut("我爱北京天安门", true) { fmt.Printf("%s %s\n", segment.Text(), segment.Pos()) diff --git a/posseg/posseg.go b/posseg/posseg.go index 4f58dd4..6dad982 100755 --- a/posseg/posseg.go +++ b/posseg/posseg.go @@ -2,6 +2,7 @@ package posseg import ( + "io/fs" "math" "regexp" @@ -39,17 +40,31 @@ type Segmenter struct { } // LoadDictionary loads dictionary from given file name. -// Everytime LoadDictionary is called, previously loaded dictionary will be cleard. -func (seg *Segmenter) LoadDictionary(fileName string) error { +// Everytime LoadDictionaryAt is called, previously loaded dictionary will be cleard. +func (seg *Segmenter) LoadDictionary(file fs.File) error { seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)} - return seg.dict.loadDictionary(fileName) + return seg.dict.loadDictionary(file) +} + +// LoadDictionaryAt loads dictionary from given file name. +// Everytime LoadDictionaryAt is called, previously loaded dictionary will be cleard. +func (seg *Segmenter) LoadDictionaryAt(fileName string) error { + seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)} + return seg.dict.loadDictionaryAt(fileName) } // LoadUserDictionary loads a user specified dictionary, it must be called // after LoadDictionary, and it will not clear any previous loaded dictionary, // instead it will override exist entries. -func (seg *Segmenter) LoadUserDictionary(fileName string) error { - return seg.dict.loadDictionary(fileName) +func (seg *Segmenter) LoadUserDictionary(file fs.File) error { + return seg.dict.loadDictionary(file) +} + +// LoadUserDictionaryAt loads a user specified dictionary, it must be called +// after LoadDictionary, and it will not clear any previous loaded dictionary, +// instead it will override exist entries. +func (seg *Segmenter) LoadUserDictionaryAt(fileName string) error { + return seg.dict.loadDictionaryAt(fileName) } func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment { diff --git a/posseg/posseg_test.go b/posseg/posseg_test.go index 460d579..e307b68 100755 --- a/posseg/posseg_test.go +++ b/posseg/posseg_test.go @@ -269,7 +269,7 @@ var ( ) func init() { - seg.LoadDictionary("../dict.txt") + seg.LoadDictionaryAt("../dict.txt") } func chanToArray(ch <-chan Segment) []Segment { @@ -357,8 +357,8 @@ func TestBug137(t *testing.T) { } func TestUserDict(t *testing.T) { - seg.LoadUserDictionary("../userdict.txt") - defer seg.LoadDictionary("../dict.txt") + seg.LoadUserDictionaryAt("../userdict.txt") + defer seg.LoadDictionaryAt("../dict.txt") sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型" cutResult := []Segment{ diff --git a/tokenizers/example_test.go b/tokenizers/example_test.go index 62f490e..7d638cb 100755 --- a/tokenizers/example_test.go +++ b/tokenizers/example_test.go @@ -10,7 +10,7 @@ func Example() { sentence := []byte("永和服装饰品有限公司") // default mode - tokenizer, _ := tokenizers.NewJiebaTokenizer("../dict.txt", true, false) + tokenizer, _ := tokenizers.NewJiebaTokenizerAt("../dict.txt", true, false) fmt.Println("Default Mode:") for _, token := range tokenizer.Tokenize(sentence) { fmt.Printf( @@ -19,7 +19,7 @@ func Example() { } //search mode - tokenizer, _ = tokenizers.NewJiebaTokenizer("../dict.txt", true, true) + tokenizer, _ = tokenizers.NewJiebaTokenizerAt("../dict.txt", true, true) fmt.Println("Search Mode:") for _, token := range tokenizer.Tokenize(sentence) { fmt.Printf( diff --git a/tokenizers/tokenizer.go b/tokenizers/tokenizer.go index 0b1b657..e98dfc6 100755 --- a/tokenizers/tokenizer.go +++ b/tokenizers/tokenizer.go @@ -1,7 +1,7 @@ package tokenizers import ( - "fmt" + "io/fs" "regexp" "strconv" @@ -24,6 +24,36 @@ type JiebaTokenizer struct { /* NewJiebaTokenizer creates a new JiebaTokenizer. +Parameters: + + dictFile: the dictioanry file. + + hmm: whether to use Hidden Markov Model to cut unknown words, + i.e. not found in dictionary. For example word "安卓" (means "Android" in + English) not in the dictionary file. If hmm is set to false, it will be + cutted into two single words "安" and "卓", if hmm is set to true, it will + be traded as one single word because Jieba using Hidden Markov Model with + Viterbi algorithm to guess the best possibility. + + searchMode: whether to further cut long words into serveral short words. + In Chinese, some long words may contains other words, for example "交换机" + is a Chinese word for "Switcher", if sechMode is false, it will trade + "交换机" as a single word. If searchMode is true, it will further split + this word into "交换", "换机", which are valid Chinese words. +*/ +func NewJiebaTokenizer(dictFile fs.File, hmm, searchMode bool) (analysis.Tokenizer, error) { + var seg jieba.Segmenter + err := seg.LoadDictionary(dictFile) + return &JiebaTokenizer{ + seg: seg, + hmm: hmm, + searchMode: searchMode, + }, err +} + +/* +NewJiebaTokenizerAt creates a new JiebaTokenizer. + Parameters: dictFilePath: path of the dictioanry file. @@ -41,9 +71,9 @@ Parameters: "交换机" as a single word. If searchMode is true, it will further split this word into "交换", "换机", which are valid Chinese words. */ -func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) { +func NewJiebaTokenizerAt(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) { var seg jieba.Segmenter - err := seg.LoadDictionary(dictFilePath) + err := seg.LoadDictionaryAt(dictFilePath) return &JiebaTokenizer{ seg: seg, hmm: hmm, @@ -107,18 +137,13 @@ JiebaTokenizerConstructor creates a JiebaTokenizer. Parameter config should contains at least one parameter: - file: the path of the dictionary file. + file: the path of the dictionary file or fs.File. hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details. search: optional, speficy whether to use search mode, see NewJiebaTokenizer for details. */ -func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) ( - analysis.Tokenizer, error) { - dictFilePath, ok := config["file"].(string) - if !ok { - return nil, fmt.Errorf("must specify dictionary file path") - } +func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) { hmm, ok := config["hmm"].(bool) if !ok { hmm = true @@ -127,8 +152,12 @@ func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Ca if !ok { searchMode = true } - - return NewJiebaTokenizer(dictFilePath, hmm, searchMode) + dictFilePath, ok := config["file"].(string) + if ok { + return NewJiebaTokenizerAt(dictFilePath, hmm, searchMode) + } + dictFile := config["file"].(fs.File) + return NewJiebaTokenizer(dictFile, hmm, searchMode) } func detectTokenType(term string) analysis.TokenType { diff --git a/tokenizers/tokenizer_test.go b/tokenizers/tokenizer_test.go index 2ccd0d7..532a77a 100755 --- a/tokenizers/tokenizer_test.go +++ b/tokenizers/tokenizer_test.go @@ -5219,7 +5219,7 @@ func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) { }, } - tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, false) + tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", true, false) for _, test := range tests { actual := tokenizer.Tokenize(test.input) if !reflect.DeepEqual(actual, test.output) { @@ -11057,7 +11057,7 @@ func TestJiebaTokenizerSearchModeWithHMM(t *testing.T) { }, } - tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, true) + tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", true, true) for _, test := range tests { actual := tokenizer.Tokenize(test.input) if !reflect.DeepEqual(actual, test.output) { @@ -16474,7 +16474,7 @@ func TestJiebaTokenizerDefaultModeWithoutHMM(t *testing.T) { }, } - tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, false) + tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", false, false) for _, test := range tests { actual := tokenizer.Tokenize(test.input) if !reflect.DeepEqual(actual, test.output) { @@ -22506,7 +22506,7 @@ func TestJiebaTokenizerSearchModeWithoutHMM(t *testing.T) { }, } - tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, true) + tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", false, true) for _, test := range tests { actual := tokenizer.Tokenize(test.input) if !reflect.DeepEqual(actual, test.output) {