优化 dict, add fs.File 支持

2026-07-17 02:40:23 +08:00 · 2022-11-30 14:14:48 +08:00
parent c8785c7994
commit f3da9e6420
22 changed files with 190 additions and 91 deletions
--- a/analyse/example_test.go
+++ b/analyse/example_test.go
@@ -6,8 +6,8 @@ import (

 func Example_extractTags() {
 	var t TagExtracter
-	t.LoadDictionary("../dict.txt")
-	t.LoadIdf("idf.txt")
+	t.LoadDictionaryAt("../dict.txt")
+	t.LoadIdfAt("idf.txt")

 	sentence := "这是一个伸手不见五指的黑夜。我叫孙悟空，我爱北京，我爱Python和C++。"
 	segments := t.ExtractTags(sentence, 5)
@@ -20,7 +20,7 @@ func Example_extractTags() {
 }

 func Example_textRank() {
-	t, err := NewTextRanker("../dict.txt")
+	t, err := NewTextRankerAt("../dict.txt")
 	if err != nil {
 		panic(err)
 	}
--- a/analyse/idf.go
+++ b/analyse/idf.go
@@ -1,6 +1,7 @@
 package analyse

 import (
+	"io/fs"
 	"sort"
 	"sync"

@@ -38,8 +39,12 @@ func (i *Idf) Load(tokens ...dictionary.Token) {
 	i.Unlock()
 }

-func (i *Idf) loadDictionary(fileName string) error {
-	return dictionary.LoadDictionary(i, fileName)
+func (i *Idf) loadDictionary(file fs.File) error {
+	return dictionary.LoadDictionary(i, file)
+}
+
+func (i *Idf) loadDictionaryAt(fileName string) error {
+	return dictionary.LoadDictionaryAt(i, fileName)
 }

 // Frequency returns the IDF of given word.
--- a/analyse/stopwords.go
+++ b/analyse/stopwords.go
@@ -1,6 +1,7 @@
 package analyse

 import (
+	"io/fs"
 	"sync"

 	"github.com/fumiama/jieba/dictionary"
@@ -82,6 +83,10 @@ func (s *StopWord) Load(tokens ...dictionary.Token) {
 	s.Unlock()
 }

-func (s *StopWord) loadDictionary(fileName string) error {
-	return dictionary.LoadDictionary(s, fileName)
+func (s *StopWord) loadDictionary(file fs.File) error {
+	return dictionary.LoadDictionary(s, file)
+}
+
+func (s *StopWord) loadDictionaryAt(file string) error {
+	return dictionary.LoadDictionaryAt(s, file)
 }
--- a/analyse/tag_extracker.go
+++ b/analyse/tag_extracker.go
@@ -2,6 +2,7 @@
 package analyse

 import (
+	"io/fs"
 	"sort"
 	"strings"
 	"unicode/utf8"
@@ -52,22 +53,41 @@ type TagExtracter struct {
 }

 // LoadDictionary reads the given filename and create a new dictionary.
-func (t *TagExtracter) LoadDictionary(fileName string) error {
+func (t *TagExtracter) LoadDictionary(file fs.File) error {
 	t.stopWord = NewStopWord()
 	t.seg = new(jieba.Segmenter)
-	return t.seg.LoadDictionary(fileName)
+	return t.seg.LoadDictionary(file)
+}
+
+// LoadDictionaryAt reads the dictionary file at the given path and creates a new dictionary.
+func (t *TagExtracter) LoadDictionaryAt(fileName string) error {
+	t.stopWord = NewStopWord()
+	t.seg = new(jieba.Segmenter)
+	return t.seg.LoadDictionaryAt(fileName)
 }

 // LoadIdf reads the given file and create a new Idf dictionary.
-func (t *TagExtracter) LoadIdf(fileName string) error {
+func (t *TagExtracter) LoadIdf(file fs.File) error {
 	t.idf = NewIdf()
-	return t.idf.loadDictionary(fileName)
+	return t.idf.loadDictionary(file)
+}
+
+// LoadIdfAt reads the file at the given path and creates a new Idf dictionary.
+func (t *TagExtracter) LoadIdfAt(fileName string) error {
+	t.idf = NewIdf()
+	return t.idf.loadDictionaryAt(fileName)
 }

 // LoadStopWords reads the given file and create a new StopWord dictionary.
-func (t *TagExtracter) LoadStopWords(fileName string) error {
+func (t *TagExtracter) LoadStopWords(file fs.File) error {
 	t.stopWord = NewStopWord()
-	return t.stopWord.loadDictionary(fileName)
+	return t.stopWord.loadDictionary(file)
+}
+
+// LoadStopWordsAt reads the file at the given path and creates a new StopWord dictionary.
+func (t *TagExtracter) LoadStopWordsAt(file string) error {
+	t.stopWord = NewStopWord()
+	return t.stopWord.loadDictionaryAt(file)
 }

 // ExtractTags extracts the topK key words from sentence.
--- a/analyse/tag_extracker_test.go
+++ b/analyse/tag_extracker_test.go
@@ -256,8 +256,8 @@ var (

 func TestExtractTags(t *testing.T) {
 	var te TagExtracter
-	te.LoadDictionary("../dict.txt")
-	te.LoadIdf("idf.txt")
+	te.LoadDictionaryAt("../dict.txt")
+	te.LoadIdfAt("idf.txt")

 	for index, sentence := range testContents {
 		result := te.ExtractTags(sentence, 20)
@@ -274,8 +274,8 @@ func TestExtractTags(t *testing.T) {

 func TestExtratTagsWithWeight(t *testing.T) {
 	var te TagExtracter
-	te.LoadDictionary("../dict.txt")
-	te.LoadIdf("idf.txt")
+	te.LoadDictionaryAt("../dict.txt")
+	te.LoadIdfAt("idf.txt")
 	result := te.ExtractTags(Lyric, 10)
 	for index, tag := range result {
 		if LyciWeight[index].text != tag.text ||
@@ -287,9 +287,9 @@ func TestExtratTagsWithWeight(t *testing.T) {

 func TestExtractTagsWithStopWordsFile(t *testing.T) {
 	var te TagExtracter
-	te.LoadDictionary("../dict.txt")
-	te.LoadIdf("idf.txt")
-	te.LoadStopWords("stop_words.txt")
+	te.LoadDictionaryAt("../dict.txt")
+	te.LoadIdfAt("idf.txt")
+	te.LoadStopWordsAt("stop_words.txt")
 	result := te.ExtractTags(Lyric, 7)
 	for index, tag := range result {
 		if LyciWeight2[index].text != tag.text ||
--- a/analyse/textrank.go
+++ b/analyse/textrank.go
@@ -2,6 +2,7 @@ package analyse

 import (
 	"hash/crc64"
+	"io/fs"
 	"math"
 	"sort"

@@ -173,7 +174,13 @@ func (t *TextRanker) TextRank(sentence string, topK int) Segments {
 type TextRanker posseg.Segmenter

 // NewTextRanker reads a given file and create a new dictionary file for Textranker.
-func NewTextRanker(fileName string) (TextRanker, error) {
+func NewTextRanker(file fs.File) (TextRanker, error) {
 	seg := posseg.Segmenter{}
-	return TextRanker(seg), seg.LoadDictionary(fileName)
+	return TextRanker(seg), seg.LoadDictionary(file)
+}
+
+// NewTextRankerAt loads the dictionary file at the given path for a new TextRanker.
+func NewTextRankerAt(fileName string) (TextRanker, error) {
+	seg := posseg.Segmenter{}
+	return TextRanker(seg), seg.LoadDictionaryAt(fileName)
 }
--- a/analyse/textrank_test.go
+++ b/analyse/textrank_test.go
@@ -23,7 +23,7 @@ var (
 )

 func TestTextRank(t *testing.T) {
-	tr, err := NewTextRanker("../dict.txt")
+	tr, err := NewTextRankerAt("../dict.txt")
 	if err != nil {
 		t.Fatal(err)
 	}
--- a/dictionary.go
+++ b/dictionary.go
@@ -1,6 +1,7 @@
 package jieba

 import (
+	"io/fs"
 	"math"
 	"sync"

@@ -57,6 +58,10 @@ func (d *Dictionary) Frequency(key string) (float64, bool) {
 	return freq, ok
 }

-func (d *Dictionary) loadDictionary(fileName string) error {
-	return dictionary.LoadDictionary(d, fileName)
+func (d *Dictionary) loadDictionary(file fs.File) error {
+	return dictionary.LoadDictionary(d, file)
+}
+
+func (d *Dictionary) loadDictionaryAt(file string) error {
+	return dictionary.LoadDictionaryAt(d, file)
 }
--- a/dictionary/dictionary.go
+++ b/dictionary/dictionary.go
@@ -4,8 +4,8 @@ package dictionary

 import (
 	"bufio"
+	"io/fs"
 	"os"
-	"path/filepath"
 	"strconv"
 	"strings"
 )
@@ -17,7 +17,7 @@ type DictLoader interface {
 	AddToken(Token)
 }

-func loadDictionary(file *os.File) (tokens []Token, err error) {
+func loadDictionary(file fs.File) (tokens []Token, err error) {
 	scanner := bufio.NewScanner(file)
 	var token Token
 	var line string
@@ -45,12 +45,18 @@ func loadDictionary(file *os.File) (tokens []Token, err error) {
 }

 // LoadDictionary reads the given file and passes all tokens to a DictLoader.
-func LoadDictionary(dl DictLoader, fileName string) error {
-	filePath, err := dictPath(fileName)
+func LoadDictionary(dl DictLoader, file fs.File) error {
+	tokens, err := loadDictionary(file)
 	if err != nil {
 		return err
 	}
-	dictFile, err := os.Open(filePath)
+	dl.Load(tokens...)
+	return nil
+}
+
+// LoadDictionaryAt reads the given file and passes all tokens to a DictLoader.
+func LoadDictionaryAt(dl DictLoader, file string) error {
+	dictFile, err := os.Open(file)
 	if err != nil {
 		return err
 	}
@@ -62,16 +68,3 @@ func LoadDictionary(dl DictLoader, fileName string) error {
 	dl.Load(tokens...)
 	return nil
 }
-
-func dictPath(dictFileName string) (string, error) {
-	if filepath.IsAbs(dictFileName) {
-		return dictFileName, nil
-	}
-	var dictFilePath string
-	cwd, err := os.Getwd()
-	if err != nil {
-		return dictFilePath, err
-	}
-	dictFilePath = filepath.Clean(filepath.Join(cwd, dictFileName))
-	return dictFilePath, nil
-}
--- a/dictionary/dictionary_test.go
+++ b/dictionary/dictionary_test.go
@@ -33,7 +33,7 @@ func (d *Dict) AddToken(token Token) {

 func TestLoadDictionary(t *testing.T) {
 	d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)}
-	err := LoadDictionary(d, "../userdict.txt")
+	err := LoadDictionaryAt(d, "../userdict.txt")
 	if err != nil {
 		t.Fatalf(err.Error())
 	}
@@ -48,8 +48,8 @@ func TestLoadDictionary(t *testing.T) {

 func TestAddToken(t *testing.T) {
 	d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)}
-	LoadDictionary(d, "../userdict.txt")
-	d.AddToken(Token{"好用", 99, "a"})
+	LoadDictionaryAt(d, "../userdict.txt")
+	d.AddToken(Token{99, "好用", "a"})
 	if d.freqMap["好用"] != 99 {
 		t.Fatalf("Failed to add token, got frequency %f, expected 99", d.freqMap["好用"])
 	}
--- a/dictionary/token.go
+++ b/dictionary/token.go
@@ -2,12 +2,12 @@ package dictionary

 // Token represents a Chinese word with (optional) frequency and POS.
 type Token struct {
-	text      string
 	frequency float64
+	text      string
 	pos       string
 }

-//Text returns token's text.
+// Text returns token's text.
 func (t Token) Text() string {
 	return t.text
 }
--- a/example_parallel_cut_test.go
+++ b/example_parallel_cut_test.go
@@ -36,7 +36,7 @@ func Example_parallelCut() {
 	runtime.GOMAXPROCS(numThreads)

 	// Load dictionary
-	segmenter.LoadDictionary("dict.txt")
+	segmenter.LoadDictionaryAt("dict.txt")

 	// open file for segmentation
 	file, err := os.Open("README.md")
--- a/example_test.go
+++ b/example_test.go
@@ -6,7 +6,7 @@ import (

 func Example() {
 	var seg Segmenter
-	seg.LoadDictionary("dict.txt")
+	seg.LoadDictionaryAt("dict.txt")

 	fmt.Print("【全模式】：")
 	fmt.Println(seg.CutAll("我来到北京清华大学"))
@@ -28,7 +28,7 @@ func Example() {

 func Example_suggestFrequency() {
 	var seg Segmenter
-	seg.LoadDictionary("dict.txt")
+	seg.LoadDictionaryAt("dict.txt")

 	sentence := "超敏C反应蛋白是什么？"
 	fmt.Print("Before:")
@@ -76,13 +76,13 @@ func Example_suggestFrequency() {

 func Example_loadUserDictionary() {
 	var seg Segmenter
-	seg.LoadDictionary("dict.txt")
+	seg.LoadDictionaryAt("dict.txt")

 	sentence := "李小福是创新办主任也是云计算方面的专家"
 	fmt.Print("Before:")
 	fmt.Println(seg.Cut(sentence, true))

-	seg.LoadUserDictionary("userdict.txt")
+	seg.LoadUserDictionaryAt("userdict.txt")

 	fmt.Print("After:")
 	fmt.Println(seg.Cut(sentence, true))
--- a/jieba.go
+++ b/jieba.go
@@ -2,6 +2,7 @@
 package jieba

 import (
+	"io/fs"
 	"math"
 	"regexp"
 	"strings"
@@ -93,16 +94,30 @@ func (seg *Segmenter) SuggestFrequency(words ...string) float64 {

 // LoadDictionary loads dictionary from given file name. Everytime
 // LoadDictionary is called, previously loaded dictionary will be cleard.
-func (seg *Segmenter) LoadDictionary(fileName string) error {
+func (seg *Segmenter) LoadDictionary(file fs.File) error {
 	seg.dict = &Dictionary{freqMap: make(map[string]float64)}
-	return seg.dict.loadDictionary(fileName)
+	return seg.dict.loadDictionary(file)
+}
+
+// LoadDictionaryAt loads the dictionary from the file at the given path. Every time
+// LoadDictionaryAt is called, the previously loaded dictionary is cleared.
+func (seg *Segmenter) LoadDictionaryAt(file string) error {
+	seg.dict = &Dictionary{freqMap: make(map[string]float64)}
+	return seg.dict.loadDictionaryAt(file)
 }

 // LoadUserDictionary loads a user specified dictionary, it must be called
 // after LoadDictionary, and it will not clear any previous loaded dictionary,
 // instead it will override exist entries.
-func (seg *Segmenter) LoadUserDictionary(fileName string) error {
-	return seg.dict.loadDictionary(fileName)
+func (seg *Segmenter) LoadUserDictionary(file fs.File) error {
+	return seg.dict.loadDictionary(file)
+}
+
+// LoadUserDictionaryAt loads a user-specified dictionary. It must be called
+// after LoadDictionary or LoadDictionaryAt; it does not clear any previously
+// loaded dictionary, but overrides existing entries.
+func (seg *Segmenter) LoadUserDictionaryAt(file string) error {
+	return seg.dict.loadDictionaryAt(file)
 }

 func (seg *Segmenter) dag(runes []rune) map[int][]int {
--- a/jieba_test.go
+++ b/jieba_test.go
@@ -616,7 +616,7 @@ var (
 )

 func init() {
-	seg.LoadDictionary("dict.txt")
+	seg.LoadDictionaryAt("dict.txt")
 }

 func TestCutDAG(t *testing.T) {
@@ -715,7 +715,7 @@ func TestCutForSearch(t *testing.T) {

 func TestLoadDictionary(t *testing.T) {
 	var result []string
-	seg.LoadDictionary("foobar.txt")
+	seg.LoadDictionaryAt("foobar.txt")
 	for index, content := range testContents {
 		result = seg.Cut(content, true)
 		if len(result) != len(userDictCutResult[index]) {
@@ -728,11 +728,11 @@ func TestLoadDictionary(t *testing.T) {
 			}
 		}
 	}
-	seg.LoadDictionary("dict.txt")
+	seg.LoadDictionaryAt("dict.txt")
 }

 func TestLoadUserDictionary(t *testing.T) {
-	seg.LoadUserDictionary("userdict.txt")
+	seg.LoadUserDictionaryAt("userdict.txt")

 	sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题，在自定义词库中也增加了此词为N类型"
 	result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", "，", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"}
@@ -771,7 +771,7 @@ func TestLoadUserDictionary(t *testing.T) {
 			t.Fatal(word)
 		}
 	}
-	seg.LoadDictionary("dict.txt")
+	seg.LoadDictionaryAt("dict.txt")
 }

 func BenchmarkCutNoHMM(b *testing.B) {
--- a/posseg/dictionary.go
+++ b/posseg/dictionary.go
@@ -1,6 +1,7 @@
 package posseg

 import (
+	"io/fs"
 	"math"
 	"sync"

@@ -69,6 +70,10 @@ func (d *Dictionary) Pos(key string) (string, bool) {
 	return pos, ok
 }

-func (d *Dictionary) loadDictionary(fileName string) error {
-	return dictionary.LoadDictionary(d, fileName)
+func (d *Dictionary) loadDictionary(file fs.File) error {
+	return dictionary.LoadDictionary(d, file)
+}
+
+func (d *Dictionary) loadDictionaryAt(file string) error {
+	return dictionary.LoadDictionaryAt(d, file)
 }
--- a/posseg/example_test.go
+++ b/posseg/example_test.go
@@ -8,7 +8,7 @@ import (

 func Example() {
 	var seg posseg.Segmenter
-	seg.LoadDictionary("../dict.txt")
+	seg.LoadDictionaryAt("../dict.txt")

 	for segment := range seg.Cut("我爱北京天安门", true) {
 		fmt.Printf("%s %s\n", segment.Text(), segment.Pos())
--- a/posseg/posseg.go
+++ b/posseg/posseg.go
@@ -2,6 +2,7 @@
 package posseg

 import (
+	"io/fs"
 	"math"
 	"regexp"

@@ -39,17 +40,31 @@ type Segmenter struct {
 }

 // LoadDictionary loads dictionary from given file name.
-// Everytime LoadDictionary is called, previously loaded dictionary will be cleard.
-func (seg *Segmenter) LoadDictionary(fileName string) error {
+// Every time LoadDictionary is called, the previously loaded dictionary is cleared.
+func (seg *Segmenter) LoadDictionary(file fs.File) error {
 	seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
-	return seg.dict.loadDictionary(fileName)
+	return seg.dict.loadDictionary(file)
+}
+
+// LoadDictionaryAt loads the dictionary from the file at the given path.
+// Every time LoadDictionaryAt is called, the previously loaded dictionary is cleared.
+func (seg *Segmenter) LoadDictionaryAt(fileName string) error {
+	seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
+	return seg.dict.loadDictionaryAt(fileName)
 }

 // LoadUserDictionary loads a user specified dictionary, it must be called
 // after LoadDictionary, and it will not clear any previous loaded dictionary,
 // instead it will override exist entries.
-func (seg *Segmenter) LoadUserDictionary(fileName string) error {
-	return seg.dict.loadDictionary(fileName)
+func (seg *Segmenter) LoadUserDictionary(file fs.File) error {
+	return seg.dict.loadDictionary(file)
+}
+
+// LoadUserDictionaryAt loads a user-specified dictionary. It must be called
+// after LoadDictionary or LoadDictionaryAt; it does not clear any previously
+// loaded dictionary, but overrides existing entries.
+func (seg *Segmenter) LoadUserDictionaryAt(fileName string) error {
+	return seg.dict.loadDictionaryAt(fileName)
 }

 func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment {
--- a/posseg/posseg_test.go
+++ b/posseg/posseg_test.go
@@ -269,7 +269,7 @@ var (
 )

 func init() {
-	seg.LoadDictionary("../dict.txt")
+	seg.LoadDictionaryAt("../dict.txt")
 }

 func chanToArray(ch <-chan Segment) []Segment {
@@ -357,8 +357,8 @@ func TestBug137(t *testing.T) {
 }

 func TestUserDict(t *testing.T) {
-	seg.LoadUserDictionary("../userdict.txt")
-	defer seg.LoadDictionary("../dict.txt")
+	seg.LoadUserDictionaryAt("../userdict.txt")
+	defer seg.LoadDictionaryAt("../dict.txt")
 	sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题，在自定义词库中也增加了此词为N类型"

 	cutResult := []Segment{
--- a/tokenizers/example_test.go
+++ b/tokenizers/example_test.go
@@ -10,7 +10,7 @@ func Example() {
 	sentence := []byte("永和服装饰品有限公司")

 	// default mode
-	tokenizer, _ := tokenizers.NewJiebaTokenizer("../dict.txt", true, false)
+	tokenizer, _ := tokenizers.NewJiebaTokenizerAt("../dict.txt", true, false)
 	fmt.Println("Default Mode:")
 	for _, token := range tokenizer.Tokenize(sentence) {
 		fmt.Printf(
@@ -19,7 +19,7 @@ func Example() {
 	}

 	//search mode
-	tokenizer, _ = tokenizers.NewJiebaTokenizer("../dict.txt", true, true)
+	tokenizer, _ = tokenizers.NewJiebaTokenizerAt("../dict.txt", true, true)
 	fmt.Println("Search Mode:")
 	for _, token := range tokenizer.Tokenize(sentence) {
 		fmt.Printf(
--- a/tokenizers/tokenizer.go
+++ b/tokenizers/tokenizer.go
@@ -1,7 +1,7 @@
 package tokenizers

 import (
-	"fmt"
+	"io/fs"
 	"regexp"
 	"strconv"

@@ -24,6 +24,36 @@ type JiebaTokenizer struct {
 /*
 NewJiebaTokenizer creates a new JiebaTokenizer.

+Parameters:
+
+	dictFile: the dictionary file.
+
+	hmm: whether to use Hidden Markov Model to cut unknown words,
+	i.e. words not found in the dictionary. For example, the word "安卓" (meaning
+	"Android" in English) is not in the dictionary file. If hmm is set to false, it
+	will be cut into two single words "安" and "卓"; if hmm is set to true, it will
+	be treated as one single word because Jieba uses a Hidden Markov Model with
+	the Viterbi algorithm to guess the best possibility.
+
+	searchMode: whether to further cut long words into several short words.
+	In Chinese, some long words may contain other words, for example "交换机"
+	is a Chinese word for "Switcher"; if searchMode is false, it will treat
+	"交换机" as a single word. If searchMode is true, it will further split
+	this word into "交换", "换机", which are valid Chinese words.
+*/
+func NewJiebaTokenizer(dictFile fs.File, hmm, searchMode bool) (analysis.Tokenizer, error) {
+	var seg jieba.Segmenter
+	err := seg.LoadDictionary(dictFile)
+	return &JiebaTokenizer{
+		seg:        seg,
+		hmm:        hmm,
+		searchMode: searchMode,
+	}, err
+}
+
+/*
+NewJiebaTokenizerAt creates a new JiebaTokenizer.
+
 Parameters:

 	dictFilePath: path of the dictioanry file.
@@ -41,9 +71,9 @@ Parameters:
 	"交换机" as a single word. If searchMode is true, it will further split
 	this word into "交换", "换机", which are valid Chinese words.
 */
-func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
+func NewJiebaTokenizerAt(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
 	var seg jieba.Segmenter
-	err := seg.LoadDictionary(dictFilePath)
+	err := seg.LoadDictionaryAt(dictFilePath)
 	return &JiebaTokenizer{
 		seg:        seg,
 		hmm:        hmm,
@@ -107,18 +137,13 @@ JiebaTokenizerConstructor creates a JiebaTokenizer.

 Parameter config should contains at least one parameter:

-	file: the path of the dictionary file.
+	file: the path of the dictionary file (string) or an opened fs.File.

 	hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.

 	search: optional, speficy whether to use search mode, see NewJiebaTokenizer for details.
 */
-func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
-	analysis.Tokenizer, error) {
-	dictFilePath, ok := config["file"].(string)
-	if !ok {
-		return nil, fmt.Errorf("must specify dictionary file path")
-	}
+func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
 	hmm, ok := config["hmm"].(bool)
 	if !ok {
 		hmm = true
@@ -127,8 +152,12 @@ func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Ca
 	if !ok {
 		searchMode = true
 	}
-
-	return NewJiebaTokenizer(dictFilePath, hmm, searchMode)
+	dictFilePath, ok := config["file"].(string)
+	if ok {
+		return NewJiebaTokenizerAt(dictFilePath, hmm, searchMode)
+	}
+	dictFile := config["file"].(fs.File)
+	return NewJiebaTokenizer(dictFile, hmm, searchMode)
 }

 func detectTokenType(term string) analysis.TokenType {
--- a/tokenizers/tokenizer_test.go
+++ b/tokenizers/tokenizer_test.go
@@ -5219,7 +5219,7 @@ func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) {
 		},
 	}

-	tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, false)
+	tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", true, false)
 	for _, test := range tests {
 		actual := tokenizer.Tokenize(test.input)
 		if !reflect.DeepEqual(actual, test.output) {
@@ -11057,7 +11057,7 @@ func TestJiebaTokenizerSearchModeWithHMM(t *testing.T) {
 		},
 	}

-	tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, true)
+	tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", true, true)
 	for _, test := range tests {
 		actual := tokenizer.Tokenize(test.input)
 		if !reflect.DeepEqual(actual, test.output) {
@@ -16474,7 +16474,7 @@ func TestJiebaTokenizerDefaultModeWithoutHMM(t *testing.T) {
 		},
 	}

-	tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, false)
+	tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", false, false)
 	for _, test := range tests {
 		actual := tokenizer.Tokenize(test.input)
 		if !reflect.DeepEqual(actual, test.output) {
@@ -22506,7 +22506,7 @@ func TestJiebaTokenizerSearchModeWithoutHMM(t *testing.T) {
 		},
 	}

-	tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, true)
+	tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", false, true)
 	for _, test := range tests {
 		actual := tokenizer.Tokenize(test.input)
 		if !reflect.DeepEqual(actual, test.output) {