1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-18 09:20:26 +08:00

优化 dict, add fs.File 支持

This commit is contained in:
源文雨
2022-11-30 14:14:48 +08:00
parent c8785c7994
commit f3da9e6420
22 changed files with 190 additions and 91 deletions

View File

@@ -6,8 +6,8 @@ import (
func Example_extractTags() {
var t TagExtracter
t.LoadDictionary("../dict.txt")
t.LoadIdf("idf.txt")
t.LoadDictionaryAt("../dict.txt")
t.LoadIdfAt("idf.txt")
sentence := "这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。"
segments := t.ExtractTags(sentence, 5)
@@ -20,7 +20,7 @@ func Example_extractTags() {
}
func Example_textRank() {
t, err := NewTextRanker("../dict.txt")
t, err := NewTextRankerAt("../dict.txt")
if err != nil {
panic(err)
}

View File

@@ -1,6 +1,7 @@
package analyse
import (
"io/fs"
"sort"
"sync"
@@ -38,8 +39,12 @@ func (i *Idf) Load(tokens ...dictionary.Token) {
i.Unlock()
}
// loadDictionary hands its argument, together with i as the loader argument,
// to dictionary.LoadDictionary and returns that call's error unchanged.
func (i *Idf) loadDictionary(fileName string) error {
return dictionary.LoadDictionary(i, fileName)
func (i *Idf) loadDictionary(file fs.File) error {
return dictionary.LoadDictionary(i, file)
}
// loadDictionaryAt forwards the path fileName, together with i as the loader
// argument, to dictionary.LoadDictionaryAt and reports that call's error.
func (i *Idf) loadDictionaryAt(fileName string) error {
	err := dictionary.LoadDictionaryAt(i, fileName)
	return err
}
// Frequency returns the IDF of given word.

View File

@@ -1,6 +1,7 @@
package analyse
import (
"io/fs"
"sync"
"github.com/fumiama/jieba/dictionary"
@@ -82,6 +83,10 @@ func (s *StopWord) Load(tokens ...dictionary.Token) {
s.Unlock()
}
// loadDictionary hands its argument, together with s as the loader argument,
// to dictionary.LoadDictionary and returns that call's error unchanged.
func (s *StopWord) loadDictionary(fileName string) error {
return dictionary.LoadDictionary(s, fileName)
func (s *StopWord) loadDictionary(file fs.File) error {
return dictionary.LoadDictionary(s, file)
}
// loadDictionaryAt forwards the path string file, together with s as the
// loader argument, to dictionary.LoadDictionaryAt and reports that call's
// error.
func (s *StopWord) loadDictionaryAt(file string) error {
	err := dictionary.LoadDictionaryAt(s, file)
	return err
}

View File

@@ -2,6 +2,7 @@
package analyse
import (
"io/fs"
"sort"
"strings"
"unicode/utf8"
@@ -52,22 +53,41 @@ type TagExtracter struct {
}
// LoadDictionary replaces t.stopWord with the value returned by NewStopWord,
// replaces t.seg with a newly allocated jieba.Segmenter, and then calls that
// segmenter's LoadDictionary with the given file, returning its error.
func (t *TagExtracter) LoadDictionary(fileName string) error {
func (t *TagExtracter) LoadDictionary(file fs.File) error {
t.stopWord = NewStopWord()
t.seg = new(jieba.Segmenter)
return t.seg.LoadDictionary(fileName)
return t.seg.LoadDictionary(file)
}
// LoadDictionaryAt installs a fresh stop-word value (from NewStopWord) and a
// newly allocated jieba.Segmenter on t, then asks that segmenter to load the
// dictionary located at the path fileName. The segmenter's error is returned
// as is.
func (t *TagExtracter) LoadDictionaryAt(fileName string) error {
	stop := NewStopWord()
	segmenter := new(jieba.Segmenter)
	t.stopWord, t.seg = stop, segmenter

	err := segmenter.LoadDictionaryAt(fileName)
	return err
}
// LoadIdf replaces t.idf with the value returned by NewIdf and then calls its
// loadDictionary with the given file, returning that call's error.
func (t *TagExtracter) LoadIdf(fileName string) error {
func (t *TagExtracter) LoadIdf(file fs.File) error {
t.idf = NewIdf()
return t.idf.loadDictionary(fileName)
return t.idf.loadDictionary(file)
}
// LoadIdfAt swaps in a fresh Idf value (from NewIdf) on t and then has it
// load the file found at the path fileName. The error from that load is
// returned to the caller.
func (t *TagExtracter) LoadIdfAt(fileName string) error {
	t.idf = NewIdf()
	if err := t.idf.loadDictionaryAt(fileName); err != nil {
		return err
	}
	return nil
}
// LoadStopWords replaces t.stopWord with the value returned by NewStopWord
// and then calls its loadDictionary with the given file, returning that
// call's error.
func (t *TagExtracter) LoadStopWords(fileName string) error {
func (t *TagExtracter) LoadStopWords(file fs.File) error {
t.stopWord = NewStopWord()
return t.stopWord.loadDictionary(fileName)
return t.stopWord.loadDictionary(file)
}
// LoadStopWordsAt swaps in a fresh StopWord value (from NewStopWord) on t and
// then has it load the file found at the path given by file. The error from
// that load is returned to the caller.
func (t *TagExtracter) LoadStopWordsAt(file string) error {
	t.stopWord = NewStopWord()
	if err := t.stopWord.loadDictionaryAt(file); err != nil {
		return err
	}
	return nil
}
// ExtractTags extracts the topK key words from sentence.

View File

@@ -256,8 +256,8 @@ var (
func TestExtractTags(t *testing.T) {
var te TagExtracter
te.LoadDictionary("../dict.txt")
te.LoadIdf("idf.txt")
te.LoadDictionaryAt("../dict.txt")
te.LoadIdfAt("idf.txt")
for index, sentence := range testContents {
result := te.ExtractTags(sentence, 20)
@@ -274,8 +274,8 @@ func TestExtractTags(t *testing.T) {
func TestExtratTagsWithWeight(t *testing.T) {
var te TagExtracter
te.LoadDictionary("../dict.txt")
te.LoadIdf("idf.txt")
te.LoadDictionaryAt("../dict.txt")
te.LoadIdfAt("idf.txt")
result := te.ExtractTags(Lyric, 10)
for index, tag := range result {
if LyciWeight[index].text != tag.text ||
@@ -287,9 +287,9 @@ func TestExtratTagsWithWeight(t *testing.T) {
func TestExtractTagsWithStopWordsFile(t *testing.T) {
var te TagExtracter
te.LoadDictionary("../dict.txt")
te.LoadIdf("idf.txt")
te.LoadStopWords("stop_words.txt")
te.LoadDictionaryAt("../dict.txt")
te.LoadIdfAt("idf.txt")
te.LoadStopWordsAt("stop_words.txt")
result := te.ExtractTags(Lyric, 7)
for index, tag := range result {
if LyciWeight2[index].text != tag.text ||

View File

@@ -2,6 +2,7 @@ package analyse
import (
"hash/crc64"
"io/fs"
"math"
"sort"
@@ -173,7 +174,13 @@ func (t *TextRanker) TextRank(sentence string, topK int) Segments {
type TextRanker posseg.Segmenter
// NewTextRanker creates an empty posseg.Segmenter, calls its LoadDictionary
// with the given file, and returns the segmenter converted to TextRanker
// together with the error from that call.
//
// NOTE(review): the return statement both reads seg (for the conversion) and
// calls a method on seg. The Go spec does not fix the order of a plain
// variable read relative to a method call in the same statement, so whether
// the returned copy is taken before or after loading is compiler-dependent.
// Consider calling LoadDictionary on its own line first.
func NewTextRanker(fileName string) (TextRanker, error) {
func NewTextRanker(file fs.File) (TextRanker, error) {
seg := posseg.Segmenter{}
return TextRanker(seg), seg.LoadDictionary(fileName)
return TextRanker(seg), seg.LoadDictionary(file)
}
// NewTextRankerAt creates an empty posseg.Segmenter, asks it to load the
// dictionary located at the path fileName, and returns the segmenter converted
// to TextRanker together with the error from that load.
//
// The TextRanker value is returned even when the error is non-nil, matching
// the previous behavior; callers should check the error before using it.
func NewTextRankerAt(fileName string) (TextRanker, error) {
	seg := posseg.Segmenter{}
	// Load before converting. Writing both in one return statement would leave
	// the order of "read seg" and "call seg.LoadDictionaryAt" unspecified by
	// the Go spec, so the returned copy could miss state the load stored in seg.
	err := seg.LoadDictionaryAt(fileName)
	return TextRanker(seg), err
}

View File

@@ -23,7 +23,7 @@ var (
)
func TestTextRank(t *testing.T) {
tr, err := NewTextRanker("../dict.txt")
tr, err := NewTextRankerAt("../dict.txt")
if err != nil {
t.Fatal(err)
}