1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00

优化 dict, add fs.File 支持

This commit is contained in:
源文雨
2022-11-30 14:14:48 +08:00
parent c8785c7994
commit f3da9e6420
22 changed files with 190 additions and 91 deletions

View File

@@ -6,8 +6,8 @@ import (
func Example_extractTags() {
var t TagExtracter
t.LoadDictionary("../dict.txt")
t.LoadIdf("idf.txt")
t.LoadDictionaryAt("../dict.txt")
t.LoadIdfAt("idf.txt")
sentence := "这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。"
segments := t.ExtractTags(sentence, 5)
@@ -20,7 +20,7 @@ func Example_extractTags() {
}
func Example_textRank() {
t, err := NewTextRanker("../dict.txt")
t, err := NewTextRankerAt("../dict.txt")
if err != nil {
panic(err)
}

View File

@@ -1,6 +1,7 @@
package analyse
import (
"io/fs"
"sort"
"sync"
@@ -38,8 +39,12 @@ func (i *Idf) Load(tokens ...dictionary.Token) {
i.Unlock()
}
func (i *Idf) loadDictionary(fileName string) error {
return dictionary.LoadDictionary(i, fileName)
func (i *Idf) loadDictionary(file fs.File) error {
return dictionary.LoadDictionary(i, file)
}
// loadDictionaryAt loads the dictionary file located at fileName into i by
// delegating to dictionary.LoadDictionaryAt, and returns that call's error.
func (i *Idf) loadDictionaryAt(fileName string) error {
return dictionary.LoadDictionaryAt(i, fileName)
}
// Frequency returns the IDF of given word.

View File

@@ -1,6 +1,7 @@
package analyse
import (
"io/fs"
"sync"
"github.com/fumiama/jieba/dictionary"
@@ -82,6 +83,10 @@ func (s *StopWord) Load(tokens ...dictionary.Token) {
s.Unlock()
}
func (s *StopWord) loadDictionary(fileName string) error {
return dictionary.LoadDictionary(s, fileName)
func (s *StopWord) loadDictionary(file fs.File) error {
return dictionary.LoadDictionary(s, file)
}
// loadDictionaryAt loads the stop-word file located at path file into s by
// delegating to dictionary.LoadDictionaryAt, and returns that call's error.
func (s *StopWord) loadDictionaryAt(file string) error {
return dictionary.LoadDictionaryAt(s, file)
}

View File

@@ -2,6 +2,7 @@
package analyse
import (
"io/fs"
"sort"
"strings"
"unicode/utf8"
@@ -52,22 +53,41 @@ type TagExtracter struct {
}
// LoadDictionary reads the given file and creates a new dictionary.
func (t *TagExtracter) LoadDictionary(fileName string) error {
func (t *TagExtracter) LoadDictionary(file fs.File) error {
t.stopWord = NewStopWord()
t.seg = new(jieba.Segmenter)
return t.seg.LoadDictionary(fileName)
return t.seg.LoadDictionary(file)
}
// LoadDictionaryAt replaces the extractor's stop words with the result of
// NewStopWord and its segmenter with a new jieba.Segmenter, then loads the
// segmenter's dictionary from the file at path fileName.
func (t *TagExtracter) LoadDictionaryAt(fileName string) error {
	segmenter := new(jieba.Segmenter)
	t.stopWord, t.seg = NewStopWord(), segmenter
	return segmenter.LoadDictionaryAt(fileName)
}
// LoadIdf reads the given file and creates a new Idf dictionary.
func (t *TagExtracter) LoadIdf(fileName string) error {
func (t *TagExtracter) LoadIdf(file fs.File) error {
t.idf = NewIdf()
return t.idf.loadDictionary(fileName)
return t.idf.loadDictionary(file)
}
// LoadIdfAt replaces the extractor's IDF table with a new one from NewIdf and
// loads it from the file at path fileName. The table is replaced before
// loading, so it is reset even when loading returns an error.
func (t *TagExtracter) LoadIdfAt(fileName string) error {
t.idf = NewIdf()
return t.idf.loadDictionaryAt(fileName)
}
// LoadStopWords reads the given file and creates a new StopWord dictionary.
func (t *TagExtracter) LoadStopWords(fileName string) error {
func (t *TagExtracter) LoadStopWords(file fs.File) error {
t.stopWord = NewStopWord()
return t.stopWord.loadDictionary(fileName)
return t.stopWord.loadDictionary(file)
}
// LoadStopWordsAt replaces the extractor's stop words with a new StopWord from
// NewStopWord and loads it from the file at path file. The stop words are
// replaced before loading, so they are reset even when loading returns an error.
func (t *TagExtracter) LoadStopWordsAt(file string) error {
t.stopWord = NewStopWord()
return t.stopWord.loadDictionaryAt(file)
}
// ExtractTags extracts the topK key words from sentence.

View File

@@ -256,8 +256,8 @@ var (
func TestExtractTags(t *testing.T) {
var te TagExtracter
te.LoadDictionary("../dict.txt")
te.LoadIdf("idf.txt")
te.LoadDictionaryAt("../dict.txt")
te.LoadIdfAt("idf.txt")
for index, sentence := range testContents {
result := te.ExtractTags(sentence, 20)
@@ -274,8 +274,8 @@ func TestExtractTags(t *testing.T) {
func TestExtratTagsWithWeight(t *testing.T) {
var te TagExtracter
te.LoadDictionary("../dict.txt")
te.LoadIdf("idf.txt")
te.LoadDictionaryAt("../dict.txt")
te.LoadIdfAt("idf.txt")
result := te.ExtractTags(Lyric, 10)
for index, tag := range result {
if LyciWeight[index].text != tag.text ||
@@ -287,9 +287,9 @@ func TestExtratTagsWithWeight(t *testing.T) {
func TestExtractTagsWithStopWordsFile(t *testing.T) {
var te TagExtracter
te.LoadDictionary("../dict.txt")
te.LoadIdf("idf.txt")
te.LoadStopWords("stop_words.txt")
te.LoadDictionaryAt("../dict.txt")
te.LoadIdfAt("idf.txt")
te.LoadStopWordsAt("stop_words.txt")
result := te.ExtractTags(Lyric, 7)
for index, tag := range result {
if LyciWeight2[index].text != tag.text ||

View File

@@ -2,6 +2,7 @@ package analyse
import (
"hash/crc64"
"io/fs"
"math"
"sort"
@@ -173,7 +174,13 @@ func (t *TextRanker) TextRank(sentence string, topK int) Segments {
type TextRanker posseg.Segmenter
// NewTextRanker reads the given dictionary file and creates a new TextRanker from it.
func NewTextRanker(fileName string) (TextRanker, error) {
func NewTextRanker(file fs.File) (TextRanker, error) {
seg := posseg.Segmenter{}
return TextRanker(seg), seg.LoadDictionary(fileName)
return TextRanker(seg), seg.LoadDictionary(file)
}
// NewTextRankerAt loads the dictionary file at path fileName into a new
// posseg.Segmenter and returns it converted to a TextRanker, together with
// any error reported while loading.
func NewTextRankerAt(fileName string) (TextRanker, error) {
	var seg posseg.Segmenter
	// Load before converting: LoadDictionaryAt mutates seg through a pointer
	// receiver, and the Go spec does not order a plain read of seg against a
	// method call inside the same return statement. Sequencing the two
	// statements guarantees the returned copy carries the loaded dictionary.
	err := seg.LoadDictionaryAt(fileName)
	return TextRanker(seg), err
}

View File

@@ -23,7 +23,7 @@ var (
)
func TestTextRank(t *testing.T) {
tr, err := NewTextRanker("../dict.txt")
tr, err := NewTextRankerAt("../dict.txt")
if err != nil {
t.Fatal(err)
}

View File

@@ -1,6 +1,7 @@
package jieba
import (
"io/fs"
"math"
"sync"
@@ -57,6 +58,10 @@ func (d *Dictionary) Frequency(key string) (float64, bool) {
return freq, ok
}
func (d *Dictionary) loadDictionary(fileName string) error {
return dictionary.LoadDictionary(d, fileName)
func (d *Dictionary) loadDictionary(file fs.File) error {
return dictionary.LoadDictionary(d, file)
}
// loadDictionaryAt loads the dictionary file located at path file into d by
// delegating to dictionary.LoadDictionaryAt, and returns that call's error.
func (d *Dictionary) loadDictionaryAt(file string) error {
return dictionary.LoadDictionaryAt(d, file)
}

View File

@@ -4,8 +4,8 @@ package dictionary
import (
"bufio"
"io/fs"
"os"
"path/filepath"
"strconv"
"strings"
)
@@ -17,7 +17,7 @@ type DictLoader interface {
AddToken(Token)
}
func loadDictionary(file *os.File) (tokens []Token, err error) {
func loadDictionary(file fs.File) (tokens []Token, err error) {
scanner := bufio.NewScanner(file)
var token Token
var line string
@@ -45,12 +45,18 @@ func loadDictionary(file *os.File) (tokens []Token, err error) {
}
// LoadDictionary reads the given file and passes all tokens to a DictLoader.
func LoadDictionary(dl DictLoader, fileName string) error {
filePath, err := dictPath(fileName)
func LoadDictionary(dl DictLoader, file fs.File) error {
tokens, err := loadDictionary(file)
if err != nil {
return err
}
dictFile, err := os.Open(filePath)
dl.Load(tokens...)
return nil
}
// LoadDictionaryAt reads the given file and passes all tokens to a DictLoader.
func LoadDictionaryAt(dl DictLoader, file string) error {
dictFile, err := os.Open(file)
if err != nil {
return err
}
@@ -62,16 +68,3 @@ func LoadDictionary(dl DictLoader, fileName string) error {
dl.Load(tokens...)
return nil
}
// dictPath resolves dictFileName to an absolute path. An absolute input is
// returned unchanged; a relative input is joined onto the current working
// directory and cleaned. If the working directory cannot be determined, the
// error is returned together with an empty path.
func dictPath(dictFileName string) (string, error) {
	if !filepath.IsAbs(dictFileName) {
		workDir, err := os.Getwd()
		if err != nil {
			return "", err
		}
		return filepath.Clean(filepath.Join(workDir, dictFileName)), nil
	}
	return dictFileName, nil
}

View File

@@ -33,7 +33,7 @@ func (d *Dict) AddToken(token Token) {
func TestLoadDictionary(t *testing.T) {
d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)}
err := LoadDictionary(d, "../userdict.txt")
err := LoadDictionaryAt(d, "../userdict.txt")
if err != nil {
t.Fatalf(err.Error())
}
@@ -48,8 +48,8 @@ func TestLoadDictionary(t *testing.T) {
func TestAddToken(t *testing.T) {
d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)}
LoadDictionary(d, "../userdict.txt")
d.AddToken(Token{"好用", 99, "a"})
LoadDictionaryAt(d, "../userdict.txt")
d.AddToken(Token{99, "好用", "a"})
if d.freqMap["好用"] != 99 {
t.Fatalf("Failed to add token, got frequency %f, expected 99", d.freqMap["好用"])
}

View File

@@ -2,12 +2,12 @@ package dictionary
// Token represents a Chinese word with (optional) frequency and POS.
type Token struct {
text string
frequency float64
text string
pos string
}
//Text returns token's text.
// Text returns token's text.
func (t Token) Text() string {
return t.text
}

View File

@@ -36,7 +36,7 @@ func Example_parallelCut() {
runtime.GOMAXPROCS(numThreads)
// Load dictionary
segmenter.LoadDictionary("dict.txt")
segmenter.LoadDictionaryAt("dict.txt")
// open file for segmentation
file, err := os.Open("README.md")

View File

@@ -6,7 +6,7 @@ import (
func Example() {
var seg Segmenter
seg.LoadDictionary("dict.txt")
seg.LoadDictionaryAt("dict.txt")
fmt.Print("【全模式】:")
fmt.Println(seg.CutAll("我来到北京清华大学"))
@@ -28,7 +28,7 @@ func Example() {
func Example_suggestFrequency() {
var seg Segmenter
seg.LoadDictionary("dict.txt")
seg.LoadDictionaryAt("dict.txt")
sentence := "超敏C反应蛋白是什么"
fmt.Print("Before:")
@@ -76,13 +76,13 @@ func Example_suggestFrequency() {
func Example_loadUserDictionary() {
var seg Segmenter
seg.LoadDictionary("dict.txt")
seg.LoadDictionaryAt("dict.txt")
sentence := "李小福是创新办主任也是云计算方面的专家"
fmt.Print("Before:")
fmt.Println(seg.Cut(sentence, true))
seg.LoadUserDictionary("userdict.txt")
seg.LoadUserDictionaryAt("userdict.txt")
fmt.Print("After:")
fmt.Println(seg.Cut(sentence, true))

View File

@@ -2,6 +2,7 @@
package jieba
import (
"io/fs"
"math"
"regexp"
"strings"
@@ -93,16 +94,30 @@ func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
// LoadDictionary loads the dictionary from the given file. Every time
// LoadDictionary is called, the previously loaded dictionary will be cleared.
func (seg *Segmenter) LoadDictionary(fileName string) error {
func (seg *Segmenter) LoadDictionary(file fs.File) error {
seg.dict = &Dictionary{freqMap: make(map[string]float64)}
return seg.dict.loadDictionary(fileName)
return seg.dict.loadDictionary(file)
}
// LoadDictionaryAt loads the dictionary from the file at path file. Every
// call installs a brand-new, empty Dictionary first, so any previously loaded
// dictionary is discarded even if loading then fails.
func (seg *Segmenter) LoadDictionaryAt(file string) error {
	fresh := &Dictionary{freqMap: make(map[string]float64)}
	seg.dict = fresh
	return fresh.loadDictionaryAt(file)
}
// LoadUserDictionary loads a user-specified dictionary. It must be called
// after LoadDictionary; it does not clear any previously loaded dictionary,
// but it overrides existing entries.
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
return seg.dict.loadDictionary(fileName)
func (seg *Segmenter) LoadUserDictionary(file fs.File) error {
return seg.dict.loadDictionary(file)
}
// LoadUserDictionaryAt loads a user-specified dictionary from the file at
// path file into the segmenter's current dictionary. It must be called after
// LoadDictionary or LoadDictionaryAt; it does not clear any previously loaded
// dictionary, but it overrides existing entries.
func (seg *Segmenter) LoadUserDictionaryAt(file string) error {
return seg.dict.loadDictionaryAt(file)
}
func (seg *Segmenter) dag(runes []rune) map[int][]int {

View File

@@ -616,7 +616,7 @@ var (
)
func init() {
seg.LoadDictionary("dict.txt")
seg.LoadDictionaryAt("dict.txt")
}
func TestCutDAG(t *testing.T) {
@@ -715,7 +715,7 @@ func TestCutForSearch(t *testing.T) {
func TestLoadDictionary(t *testing.T) {
var result []string
seg.LoadDictionary("foobar.txt")
seg.LoadDictionaryAt("foobar.txt")
for index, content := range testContents {
result = seg.Cut(content, true)
if len(result) != len(userDictCutResult[index]) {
@@ -728,11 +728,11 @@ func TestLoadDictionary(t *testing.T) {
}
}
}
seg.LoadDictionary("dict.txt")
seg.LoadDictionaryAt("dict.txt")
}
func TestLoadUserDictionary(t *testing.T) {
seg.LoadUserDictionary("userdict.txt")
seg.LoadUserDictionaryAt("userdict.txt")
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型"
result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", "", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"}
@@ -771,7 +771,7 @@ func TestLoadUserDictionary(t *testing.T) {
t.Fatal(word)
}
}
seg.LoadDictionary("dict.txt")
seg.LoadDictionaryAt("dict.txt")
}
func BenchmarkCutNoHMM(b *testing.B) {

View File

@@ -1,6 +1,7 @@
package posseg
import (
"io/fs"
"math"
"sync"
@@ -69,6 +70,10 @@ func (d *Dictionary) Pos(key string) (string, bool) {
return pos, ok
}
func (d *Dictionary) loadDictionary(fileName string) error {
return dictionary.LoadDictionary(d, fileName)
func (d *Dictionary) loadDictionary(file fs.File) error {
return dictionary.LoadDictionary(d, file)
}
// loadDictionaryAt loads the dictionary file located at path file into d by
// delegating to dictionary.LoadDictionaryAt, and returns that call's error.
func (d *Dictionary) loadDictionaryAt(file string) error {
return dictionary.LoadDictionaryAt(d, file)
}

View File

@@ -8,7 +8,7 @@ import (
func Example() {
var seg posseg.Segmenter
seg.LoadDictionary("../dict.txt")
seg.LoadDictionaryAt("../dict.txt")
for segment := range seg.Cut("我爱北京天安门", true) {
fmt.Printf("%s %s\n", segment.Text(), segment.Pos())

View File

@@ -2,6 +2,7 @@
package posseg
import (
"io/fs"
"math"
"regexp"
@@ -39,17 +40,31 @@ type Segmenter struct {
}
// LoadDictionary loads dictionary from given file name.
// Everytime LoadDictionary is called, previously loaded dictionary will be cleard.
func (seg *Segmenter) LoadDictionary(fileName string) error {
// Every time LoadDictionary is called, the previously loaded dictionary will be cleared.
func (seg *Segmenter) LoadDictionary(file fs.File) error {
seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
return seg.dict.loadDictionary(fileName)
return seg.dict.loadDictionary(file)
}
// LoadDictionaryAt loads the dictionary from the file at path fileName.
// Every call installs a brand-new Dictionary with empty frequency and POS
// maps first, so any previously loaded dictionary is discarded even if
// loading then fails.
func (seg *Segmenter) LoadDictionaryAt(fileName string) error {
	fresh := &Dictionary{
		freqMap: make(map[string]float64),
		posMap:  make(map[string]string),
	}
	seg.dict = fresh
	return fresh.loadDictionaryAt(fileName)
}
// LoadUserDictionary loads a user-specified dictionary. It must be called
// after LoadDictionary; it does not clear any previously loaded dictionary,
// but it overrides existing entries.
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
return seg.dict.loadDictionary(fileName)
func (seg *Segmenter) LoadUserDictionary(file fs.File) error {
return seg.dict.loadDictionary(file)
}
// LoadUserDictionaryAt loads a user-specified dictionary from the file at
// path fileName into the segmenter's current dictionary. It must be called
// after LoadDictionary or LoadDictionaryAt; it does not clear any previously
// loaded dictionary, but it overrides existing entries.
func (seg *Segmenter) LoadUserDictionaryAt(fileName string) error {
return seg.dict.loadDictionaryAt(fileName)
}
func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment {

View File

@@ -269,7 +269,7 @@ var (
)
func init() {
seg.LoadDictionary("../dict.txt")
seg.LoadDictionaryAt("../dict.txt")
}
func chanToArray(ch <-chan Segment) []Segment {
@@ -357,8 +357,8 @@ func TestBug137(t *testing.T) {
}
func TestUserDict(t *testing.T) {
seg.LoadUserDictionary("../userdict.txt")
defer seg.LoadDictionary("../dict.txt")
seg.LoadUserDictionaryAt("../userdict.txt")
defer seg.LoadDictionaryAt("../dict.txt")
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型"
cutResult := []Segment{

View File

@@ -10,7 +10,7 @@ func Example() {
sentence := []byte("永和服装饰品有限公司")
// default mode
tokenizer, _ := tokenizers.NewJiebaTokenizer("../dict.txt", true, false)
tokenizer, _ := tokenizers.NewJiebaTokenizerAt("../dict.txt", true, false)
fmt.Println("Default Mode:")
for _, token := range tokenizer.Tokenize(sentence) {
fmt.Printf(
@@ -19,7 +19,7 @@ func Example() {
}
//search mode
tokenizer, _ = tokenizers.NewJiebaTokenizer("../dict.txt", true, true)
tokenizer, _ = tokenizers.NewJiebaTokenizerAt("../dict.txt", true, true)
fmt.Println("Search Mode:")
for _, token := range tokenizer.Tokenize(sentence) {
fmt.Printf(

View File

@@ -1,7 +1,7 @@
package tokenizers
import (
"fmt"
"io/fs"
"regexp"
"strconv"
@@ -24,6 +24,36 @@ type JiebaTokenizer struct {
/*
NewJiebaTokenizer creates a new JiebaTokenizer from an already opened
dictionary file.

Parameters:

	dictFile: the dictionary file.

	hmm: whether to use a Hidden Markov Model to cut unknown words, i.e.
	words not found in the dictionary. For example, the word "安卓" (meaning
	"Android" in English) is not in the dictionary file. If hmm is false, it
	is cut into the two single words "安" and "卓"; if hmm is true, it is
	treated as one single word, because Jieba uses a Hidden Markov Model
	with the Viterbi algorithm to guess the most probable segmentation.

	searchMode: whether to further cut long words into several short words.
	In Chinese, some long words contain other words. For example, "交换机"
	is the Chinese word for "switch"; if searchMode is false, "交换机" is
	treated as a single word. If searchMode is true, it is further split
	into "交换" and "换机", which are valid Chinese words.

The tokenizer is returned together with any error from loading the
dictionary; the tokenizer is non-nil even when the error is non-nil.
*/
func NewJiebaTokenizer(dictFile fs.File, hmm, searchMode bool) (analysis.Tokenizer, error) {
	var segmenter jieba.Segmenter
	loadErr := segmenter.LoadDictionary(dictFile)
	tokenizer := &JiebaTokenizer{seg: segmenter, hmm: hmm, searchMode: searchMode}
	return tokenizer, loadErr
}
/*
NewJiebaTokenizerAt creates a new JiebaTokenizer.
Parameters:
dictFilePath: path of the dictioanry file.
@@ -41,9 +71,9 @@ Parameters:
"交换机" as a single word. If searchMode is true, it will further split
this word into "交换", "换机", which are valid Chinese words.
*/
func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
func NewJiebaTokenizerAt(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
var seg jieba.Segmenter
err := seg.LoadDictionary(dictFilePath)
err := seg.LoadDictionaryAt(dictFilePath)
return &JiebaTokenizer{
seg: seg,
hmm: hmm,
@@ -107,18 +137,13 @@ JiebaTokenizerConstructor creates a JiebaTokenizer.
Parameter config should contains at least one parameter:
file: the path of the dictionary file.
file: the path of the dictionary file or fs.File.
hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
search: optional, specify whether to use search mode, see NewJiebaTokenizer for details.
*/
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
analysis.Tokenizer, error) {
dictFilePath, ok := config["file"].(string)
if !ok {
return nil, fmt.Errorf("must specify dictionary file path")
}
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
hmm, ok := config["hmm"].(bool)
if !ok {
hmm = true
@@ -127,8 +152,12 @@ func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Ca
if !ok {
searchMode = true
}
return NewJiebaTokenizer(dictFilePath, hmm, searchMode)
dictFilePath, ok := config["file"].(string)
if ok {
return NewJiebaTokenizerAt(dictFilePath, hmm, searchMode)
}
dictFile := config["file"].(fs.File)
return NewJiebaTokenizer(dictFile, hmm, searchMode)
}
func detectTokenType(term string) analysis.TokenType {

View File

@@ -5219,7 +5219,7 @@ func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) {
},
}
tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, false)
tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", true, false)
for _, test := range tests {
actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) {
@@ -11057,7 +11057,7 @@ func TestJiebaTokenizerSearchModeWithHMM(t *testing.T) {
},
}
tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, true)
tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", true, true)
for _, test := range tests {
actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) {
@@ -16474,7 +16474,7 @@ func TestJiebaTokenizerDefaultModeWithoutHMM(t *testing.T) {
},
}
tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, false)
tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", false, false)
for _, test := range tests {
actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) {
@@ -22506,7 +22506,7 @@ func TestJiebaTokenizerSearchModeWithoutHMM(t *testing.T) {
},
}
tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, true)
tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", false, true)
for _, test := range tests {
actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) {