mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 00:32:51 +08:00
优化 dict, add fs.File 支持
This commit is contained in:
@@ -6,8 +6,8 @@ import (
|
||||
|
||||
func Example_extractTags() {
|
||||
var t TagExtracter
|
||||
t.LoadDictionary("../dict.txt")
|
||||
t.LoadIdf("idf.txt")
|
||||
t.LoadDictionaryAt("../dict.txt")
|
||||
t.LoadIdfAt("idf.txt")
|
||||
|
||||
sentence := "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。"
|
||||
segments := t.ExtractTags(sentence, 5)
|
||||
@@ -20,7 +20,7 @@ func Example_extractTags() {
|
||||
}
|
||||
|
||||
func Example_textRank() {
|
||||
t, err := NewTextRanker("../dict.txt")
|
||||
t, err := NewTextRankerAt("../dict.txt")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package analyse
|
||||
|
||||
import (
|
||||
"io/fs"
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
@@ -38,8 +39,12 @@ func (i *Idf) Load(tokens ...dictionary.Token) {
|
||||
i.Unlock()
|
||||
}
|
||||
|
||||
func (i *Idf) loadDictionary(fileName string) error {
|
||||
return dictionary.LoadDictionary(i, fileName)
|
||||
func (i *Idf) loadDictionary(file fs.File) error {
|
||||
return dictionary.LoadDictionary(i, file)
|
||||
}
|
||||
|
||||
func (i *Idf) loadDictionaryAt(fileName string) error {
|
||||
return dictionary.LoadDictionaryAt(i, fileName)
|
||||
}
|
||||
|
||||
// Frequency returns the IDF of given word.
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package analyse
|
||||
|
||||
import (
|
||||
"io/fs"
|
||||
"sync"
|
||||
|
||||
"github.com/fumiama/jieba/dictionary"
|
||||
@@ -82,6 +83,10 @@ func (s *StopWord) Load(tokens ...dictionary.Token) {
|
||||
s.Unlock()
|
||||
}
|
||||
|
||||
func (s *StopWord) loadDictionary(fileName string) error {
|
||||
return dictionary.LoadDictionary(s, fileName)
|
||||
func (s *StopWord) loadDictionary(file fs.File) error {
|
||||
return dictionary.LoadDictionary(s, file)
|
||||
}
|
||||
|
||||
func (s *StopWord) loadDictionaryAt(file string) error {
|
||||
return dictionary.LoadDictionaryAt(s, file)
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
package analyse
|
||||
|
||||
import (
|
||||
"io/fs"
|
||||
"sort"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
@@ -52,22 +53,41 @@ type TagExtracter struct {
|
||||
}
|
||||
|
||||
// LoadDictionary reads the given file and creates a new dictionary.
|
||||
func (t *TagExtracter) LoadDictionary(fileName string) error {
|
||||
func (t *TagExtracter) LoadDictionary(file fs.File) error {
|
||||
t.stopWord = NewStopWord()
|
||||
t.seg = new(jieba.Segmenter)
|
||||
return t.seg.LoadDictionary(fileName)
|
||||
return t.seg.LoadDictionary(file)
|
||||
}
|
||||
|
||||
// LoadDictionaryAt reads the file at the given path and creates a new dictionary.
|
||||
func (t *TagExtracter) LoadDictionaryAt(fileName string) error {
|
||||
t.stopWord = NewStopWord()
|
||||
t.seg = new(jieba.Segmenter)
|
||||
return t.seg.LoadDictionaryAt(fileName)
|
||||
}
|
||||
|
||||
// LoadIdf reads the given file and creates a new Idf dictionary.
|
||||
func (t *TagExtracter) LoadIdf(fileName string) error {
|
||||
func (t *TagExtracter) LoadIdf(file fs.File) error {
|
||||
t.idf = NewIdf()
|
||||
return t.idf.loadDictionary(fileName)
|
||||
return t.idf.loadDictionary(file)
|
||||
}
|
||||
|
||||
// LoadIdfAt reads the file at the given path and creates a new Idf dictionary.
|
||||
func (t *TagExtracter) LoadIdfAt(fileName string) error {
|
||||
t.idf = NewIdf()
|
||||
return t.idf.loadDictionaryAt(fileName)
|
||||
}
|
||||
|
||||
// LoadStopWords reads the given file and creates a new StopWord dictionary.
|
||||
func (t *TagExtracter) LoadStopWords(fileName string) error {
|
||||
func (t *TagExtracter) LoadStopWords(file fs.File) error {
|
||||
t.stopWord = NewStopWord()
|
||||
return t.stopWord.loadDictionary(fileName)
|
||||
return t.stopWord.loadDictionary(file)
|
||||
}
|
||||
|
||||
// LoadStopWordsAt reads the file at the given path and creates a new StopWord dictionary.
|
||||
func (t *TagExtracter) LoadStopWordsAt(file string) error {
|
||||
t.stopWord = NewStopWord()
|
||||
return t.stopWord.loadDictionaryAt(file)
|
||||
}
|
||||
|
||||
// ExtractTags extracts the topK key words from sentence.
|
||||
|
||||
@@ -256,8 +256,8 @@ var (
|
||||
|
||||
func TestExtractTags(t *testing.T) {
|
||||
var te TagExtracter
|
||||
te.LoadDictionary("../dict.txt")
|
||||
te.LoadIdf("idf.txt")
|
||||
te.LoadDictionaryAt("../dict.txt")
|
||||
te.LoadIdfAt("idf.txt")
|
||||
|
||||
for index, sentence := range testContents {
|
||||
result := te.ExtractTags(sentence, 20)
|
||||
@@ -274,8 +274,8 @@ func TestExtractTags(t *testing.T) {
|
||||
|
||||
func TestExtratTagsWithWeight(t *testing.T) {
|
||||
var te TagExtracter
|
||||
te.LoadDictionary("../dict.txt")
|
||||
te.LoadIdf("idf.txt")
|
||||
te.LoadDictionaryAt("../dict.txt")
|
||||
te.LoadIdfAt("idf.txt")
|
||||
result := te.ExtractTags(Lyric, 10)
|
||||
for index, tag := range result {
|
||||
if LyciWeight[index].text != tag.text ||
|
||||
@@ -287,9 +287,9 @@ func TestExtratTagsWithWeight(t *testing.T) {
|
||||
|
||||
func TestExtractTagsWithStopWordsFile(t *testing.T) {
|
||||
var te TagExtracter
|
||||
te.LoadDictionary("../dict.txt")
|
||||
te.LoadIdf("idf.txt")
|
||||
te.LoadStopWords("stop_words.txt")
|
||||
te.LoadDictionaryAt("../dict.txt")
|
||||
te.LoadIdfAt("idf.txt")
|
||||
te.LoadStopWordsAt("stop_words.txt")
|
||||
result := te.ExtractTags(Lyric, 7)
|
||||
for index, tag := range result {
|
||||
if LyciWeight2[index].text != tag.text ||
|
||||
|
||||
@@ -2,6 +2,7 @@ package analyse
|
||||
|
||||
import (
|
||||
"hash/crc64"
|
||||
"io/fs"
|
||||
"math"
|
||||
"sort"
|
||||
|
||||
@@ -173,7 +174,13 @@ func (t *TextRanker) TextRank(sentence string, topK int) Segments {
|
||||
type TextRanker posseg.Segmenter
|
||||
|
||||
// NewTextRanker reads a given file and creates a new dictionary for TextRanker.
|
||||
func NewTextRanker(fileName string) (TextRanker, error) {
|
||||
func NewTextRanker(file fs.File) (TextRanker, error) {
|
||||
seg := posseg.Segmenter{}
|
||||
return TextRanker(seg), seg.LoadDictionary(fileName)
|
||||
return TextRanker(seg), seg.LoadDictionary(file)
|
||||
}
|
||||
|
||||
// NewTextRankerAt reads the file at the given path and creates a new dictionary for TextRanker.
|
||||
func NewTextRankerAt(fileName string) (TextRanker, error) {
|
||||
seg := posseg.Segmenter{}
|
||||
return TextRanker(seg), seg.LoadDictionaryAt(fileName)
|
||||
}
|
||||
|
||||
@@ -23,7 +23,7 @@ var (
|
||||
)
|
||||
|
||||
func TestTextRank(t *testing.T) {
|
||||
tr, err := NewTextRanker("../dict.txt")
|
||||
tr, err := NewTextRankerAt("../dict.txt")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package jieba
|
||||
|
||||
import (
|
||||
"io/fs"
|
||||
"math"
|
||||
"sync"
|
||||
|
||||
@@ -57,6 +58,10 @@ func (d *Dictionary) Frequency(key string) (float64, bool) {
|
||||
return freq, ok
|
||||
}
|
||||
|
||||
func (d *Dictionary) loadDictionary(fileName string) error {
|
||||
return dictionary.LoadDictionary(d, fileName)
|
||||
func (d *Dictionary) loadDictionary(file fs.File) error {
|
||||
return dictionary.LoadDictionary(d, file)
|
||||
}
|
||||
|
||||
func (d *Dictionary) loadDictionaryAt(file string) error {
|
||||
return dictionary.LoadDictionaryAt(d, file)
|
||||
}
|
||||
|
||||
@@ -4,8 +4,8 @@ package dictionary
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
@@ -17,7 +17,7 @@ type DictLoader interface {
|
||||
AddToken(Token)
|
||||
}
|
||||
|
||||
func loadDictionary(file *os.File) (tokens []Token, err error) {
|
||||
func loadDictionary(file fs.File) (tokens []Token, err error) {
|
||||
scanner := bufio.NewScanner(file)
|
||||
var token Token
|
||||
var line string
|
||||
@@ -45,12 +45,18 @@ func loadDictionary(file *os.File) (tokens []Token, err error) {
|
||||
}
|
||||
|
||||
// LoadDictionary reads the given file and passes all tokens to a DictLoader.
|
||||
func LoadDictionary(dl DictLoader, fileName string) error {
|
||||
filePath, err := dictPath(fileName)
|
||||
func LoadDictionary(dl DictLoader, file fs.File) error {
|
||||
tokens, err := loadDictionary(file)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
dictFile, err := os.Open(filePath)
|
||||
dl.Load(tokens...)
|
||||
return nil
|
||||
}
|
||||
|
||||
// LoadDictionaryAt reads the given file and passes all tokens to a DictLoader.
|
||||
func LoadDictionaryAt(dl DictLoader, file string) error {
|
||||
dictFile, err := os.Open(file)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -62,16 +68,3 @@ func LoadDictionary(dl DictLoader, fileName string) error {
|
||||
dl.Load(tokens...)
|
||||
return nil
|
||||
}
|
||||
|
||||
func dictPath(dictFileName string) (string, error) {
|
||||
if filepath.IsAbs(dictFileName) {
|
||||
return dictFileName, nil
|
||||
}
|
||||
var dictFilePath string
|
||||
cwd, err := os.Getwd()
|
||||
if err != nil {
|
||||
return dictFilePath, err
|
||||
}
|
||||
dictFilePath = filepath.Clean(filepath.Join(cwd, dictFileName))
|
||||
return dictFilePath, nil
|
||||
}
|
||||
|
||||
@@ -33,7 +33,7 @@ func (d *Dict) AddToken(token Token) {
|
||||
|
||||
func TestLoadDictionary(t *testing.T) {
|
||||
d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
||||
err := LoadDictionary(d, "../userdict.txt")
|
||||
err := LoadDictionaryAt(d, "../userdict.txt")
|
||||
if err != nil {
|
||||
t.Fatalf(err.Error())
|
||||
}
|
||||
@@ -48,8 +48,8 @@ func TestLoadDictionary(t *testing.T) {
|
||||
|
||||
func TestAddToken(t *testing.T) {
|
||||
d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
||||
LoadDictionary(d, "../userdict.txt")
|
||||
d.AddToken(Token{"好用", 99, "a"})
|
||||
LoadDictionaryAt(d, "../userdict.txt")
|
||||
d.AddToken(Token{99, "好用", "a"})
|
||||
if d.freqMap["好用"] != 99 {
|
||||
t.Fatalf("Failed to add token, got frequency %f, expected 99", d.freqMap["好用"])
|
||||
}
|
||||
|
||||
@@ -2,12 +2,12 @@ package dictionary
|
||||
|
||||
// Token represents a Chinese word with (optional) frequency and POS.
|
||||
type Token struct {
|
||||
text string
|
||||
frequency float64
|
||||
text string
|
||||
pos string
|
||||
}
|
||||
|
||||
//Text returns token's text.
|
||||
// Text returns token's text.
|
||||
func (t Token) Text() string {
|
||||
return t.text
|
||||
}
|
||||
|
||||
@@ -36,7 +36,7 @@ func Example_parallelCut() {
|
||||
runtime.GOMAXPROCS(numThreads)
|
||||
|
||||
// Load dictionary
|
||||
segmenter.LoadDictionary("dict.txt")
|
||||
segmenter.LoadDictionaryAt("dict.txt")
|
||||
|
||||
// open file for segmentation
|
||||
file, err := os.Open("README.md")
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
|
||||
func Example() {
|
||||
var seg Segmenter
|
||||
seg.LoadDictionary("dict.txt")
|
||||
seg.LoadDictionaryAt("dict.txt")
|
||||
|
||||
fmt.Print("【全模式】:")
|
||||
fmt.Println(seg.CutAll("我来到北京清华大学"))
|
||||
@@ -28,7 +28,7 @@ func Example() {
|
||||
|
||||
func Example_suggestFrequency() {
|
||||
var seg Segmenter
|
||||
seg.LoadDictionary("dict.txt")
|
||||
seg.LoadDictionaryAt("dict.txt")
|
||||
|
||||
sentence := "超敏C反应蛋白是什么?"
|
||||
fmt.Print("Before:")
|
||||
@@ -76,13 +76,13 @@ func Example_suggestFrequency() {
|
||||
|
||||
func Example_loadUserDictionary() {
|
||||
var seg Segmenter
|
||||
seg.LoadDictionary("dict.txt")
|
||||
seg.LoadDictionaryAt("dict.txt")
|
||||
|
||||
sentence := "李小福是创新办主任也是云计算方面的专家"
|
||||
fmt.Print("Before:")
|
||||
fmt.Println(seg.Cut(sentence, true))
|
||||
|
||||
seg.LoadUserDictionary("userdict.txt")
|
||||
seg.LoadUserDictionaryAt("userdict.txt")
|
||||
|
||||
fmt.Print("After:")
|
||||
fmt.Println(seg.Cut(sentence, true))
|
||||
|
||||
23
jieba.go
23
jieba.go
@@ -2,6 +2,7 @@
|
||||
package jieba
|
||||
|
||||
import (
|
||||
"io/fs"
|
||||
"math"
|
||||
"regexp"
|
||||
"strings"
|
||||
@@ -93,16 +94,30 @@ func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
|
||||
|
||||
// LoadDictionary loads dictionary from given file name. Every time
|
||||
// LoadDictionary is called, previously loaded dictionary will be cleared.
|
||||
func (seg *Segmenter) LoadDictionary(fileName string) error {
|
||||
func (seg *Segmenter) LoadDictionary(file fs.File) error {
|
||||
seg.dict = &Dictionary{freqMap: make(map[string]float64)}
|
||||
return seg.dict.loadDictionary(fileName)
|
||||
return seg.dict.loadDictionary(file)
|
||||
}
|
||||
|
||||
// LoadDictionaryAt loads dictionary from given file name. Every time
|
||||
// LoadDictionaryAt is called, previously loaded dictionary will be cleared.
|
||||
func (seg *Segmenter) LoadDictionaryAt(file string) error {
|
||||
seg.dict = &Dictionary{freqMap: make(map[string]float64)}
|
||||
return seg.dict.loadDictionaryAt(file)
|
||||
}
|
||||
|
||||
// LoadUserDictionary loads a user specified dictionary, it must be called
|
||||
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
||||
// instead it will override existing entries.
|
||||
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
|
||||
return seg.dict.loadDictionary(fileName)
|
||||
func (seg *Segmenter) LoadUserDictionary(file fs.File) error {
|
||||
return seg.dict.loadDictionary(file)
|
||||
}
|
||||
|
||||
// LoadUserDictionaryAt loads a user specified dictionary, it must be called
|
||||
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
||||
// instead it will override existing entries.
|
||||
func (seg *Segmenter) LoadUserDictionaryAt(file string) error {
|
||||
return seg.dict.loadDictionaryAt(file)
|
||||
}
|
||||
|
||||
func (seg *Segmenter) dag(runes []rune) map[int][]int {
|
||||
|
||||
@@ -616,7 +616,7 @@ var (
|
||||
)
|
||||
|
||||
func init() {
|
||||
seg.LoadDictionary("dict.txt")
|
||||
seg.LoadDictionaryAt("dict.txt")
|
||||
}
|
||||
|
||||
func TestCutDAG(t *testing.T) {
|
||||
@@ -715,7 +715,7 @@ func TestCutForSearch(t *testing.T) {
|
||||
|
||||
func TestLoadDictionary(t *testing.T) {
|
||||
var result []string
|
||||
seg.LoadDictionary("foobar.txt")
|
||||
seg.LoadDictionaryAt("foobar.txt")
|
||||
for index, content := range testContents {
|
||||
result = seg.Cut(content, true)
|
||||
if len(result) != len(userDictCutResult[index]) {
|
||||
@@ -728,11 +728,11 @@ func TestLoadDictionary(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
seg.LoadDictionary("dict.txt")
|
||||
seg.LoadDictionaryAt("dict.txt")
|
||||
}
|
||||
|
||||
func TestLoadUserDictionary(t *testing.T) {
|
||||
seg.LoadUserDictionary("userdict.txt")
|
||||
seg.LoadUserDictionaryAt("userdict.txt")
|
||||
|
||||
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
||||
result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", ",", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"}
|
||||
@@ -771,7 +771,7 @@ func TestLoadUserDictionary(t *testing.T) {
|
||||
t.Fatal(word)
|
||||
}
|
||||
}
|
||||
seg.LoadDictionary("dict.txt")
|
||||
seg.LoadDictionaryAt("dict.txt")
|
||||
}
|
||||
|
||||
func BenchmarkCutNoHMM(b *testing.B) {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package posseg
|
||||
|
||||
import (
|
||||
"io/fs"
|
||||
"math"
|
||||
"sync"
|
||||
|
||||
@@ -69,6 +70,10 @@ func (d *Dictionary) Pos(key string) (string, bool) {
|
||||
return pos, ok
|
||||
}
|
||||
|
||||
func (d *Dictionary) loadDictionary(fileName string) error {
|
||||
return dictionary.LoadDictionary(d, fileName)
|
||||
func (d *Dictionary) loadDictionary(file fs.File) error {
|
||||
return dictionary.LoadDictionary(d, file)
|
||||
}
|
||||
|
||||
func (d *Dictionary) loadDictionaryAt(file string) error {
|
||||
return dictionary.LoadDictionaryAt(d, file)
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
|
||||
func Example() {
|
||||
var seg posseg.Segmenter
|
||||
seg.LoadDictionary("../dict.txt")
|
||||
seg.LoadDictionaryAt("../dict.txt")
|
||||
|
||||
for segment := range seg.Cut("我爱北京天安门", true) {
|
||||
fmt.Printf("%s %s\n", segment.Text(), segment.Pos())
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
package posseg
|
||||
|
||||
import (
|
||||
"io/fs"
|
||||
"math"
|
||||
"regexp"
|
||||
|
||||
@@ -39,17 +40,31 @@ type Segmenter struct {
|
||||
}
|
||||
|
||||
// LoadDictionary loads dictionary from given file name.
|
||||
// Every time LoadDictionary is called, previously loaded dictionary will be cleared.
|
||||
func (seg *Segmenter) LoadDictionary(fileName string) error {
|
||||
// Every time LoadDictionary is called, previously loaded dictionary will be cleared.
|
||||
func (seg *Segmenter) LoadDictionary(file fs.File) error {
|
||||
seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
||||
return seg.dict.loadDictionary(fileName)
|
||||
return seg.dict.loadDictionary(file)
|
||||
}
|
||||
|
||||
// LoadDictionaryAt loads dictionary from given file name.
|
||||
// Every time LoadDictionaryAt is called, previously loaded dictionary will be cleared.
|
||||
func (seg *Segmenter) LoadDictionaryAt(fileName string) error {
|
||||
seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
||||
return seg.dict.loadDictionaryAt(fileName)
|
||||
}
|
||||
|
||||
// LoadUserDictionary loads a user specified dictionary, it must be called
|
||||
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
||||
// instead it will override existing entries.
|
||||
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
|
||||
return seg.dict.loadDictionary(fileName)
|
||||
func (seg *Segmenter) LoadUserDictionary(file fs.File) error {
|
||||
return seg.dict.loadDictionary(file)
|
||||
}
|
||||
|
||||
// LoadUserDictionaryAt loads a user specified dictionary, it must be called
|
||||
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
||||
// instead it will override existing entries.
|
||||
func (seg *Segmenter) LoadUserDictionaryAt(fileName string) error {
|
||||
return seg.dict.loadDictionaryAt(fileName)
|
||||
}
|
||||
|
||||
func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment {
|
||||
|
||||
@@ -269,7 +269,7 @@ var (
|
||||
)
|
||||
|
||||
func init() {
|
||||
seg.LoadDictionary("../dict.txt")
|
||||
seg.LoadDictionaryAt("../dict.txt")
|
||||
}
|
||||
|
||||
func chanToArray(ch <-chan Segment) []Segment {
|
||||
@@ -357,8 +357,8 @@ func TestBug137(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestUserDict(t *testing.T) {
|
||||
seg.LoadUserDictionary("../userdict.txt")
|
||||
defer seg.LoadDictionary("../dict.txt")
|
||||
seg.LoadUserDictionaryAt("../userdict.txt")
|
||||
defer seg.LoadDictionaryAt("../dict.txt")
|
||||
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
||||
|
||||
cutResult := []Segment{
|
||||
|
||||
@@ -10,7 +10,7 @@ func Example() {
|
||||
sentence := []byte("永和服装饰品有限公司")
|
||||
|
||||
// default mode
|
||||
tokenizer, _ := tokenizers.NewJiebaTokenizer("../dict.txt", true, false)
|
||||
tokenizer, _ := tokenizers.NewJiebaTokenizerAt("../dict.txt", true, false)
|
||||
fmt.Println("Default Mode:")
|
||||
for _, token := range tokenizer.Tokenize(sentence) {
|
||||
fmt.Printf(
|
||||
@@ -19,7 +19,7 @@ func Example() {
|
||||
}
|
||||
|
||||
//search mode
|
||||
tokenizer, _ = tokenizers.NewJiebaTokenizer("../dict.txt", true, true)
|
||||
tokenizer, _ = tokenizers.NewJiebaTokenizerAt("../dict.txt", true, true)
|
||||
fmt.Println("Search Mode:")
|
||||
for _, token := range tokenizer.Tokenize(sentence) {
|
||||
fmt.Printf(
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
package tokenizers
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"regexp"
|
||||
"strconv"
|
||||
|
||||
@@ -24,6 +24,36 @@ type JiebaTokenizer struct {
|
||||
/*
|
||||
NewJiebaTokenizer creates a new JiebaTokenizer.
|
||||
|
||||
Parameters:
|
||||
|
||||
dictFile: the dictionary file.
|
||||
|
||||
hmm: whether to use Hidden Markov Model to cut unknown words,
|
||||
i.e. not found in dictionary. For example word "安卓" (means "Android" in
|
||||
English) not in the dictionary file. If hmm is set to false, it will be
|
||||
cut into two single words "安" and "卓"; if hmm is set to true, it will
|
||||
be treated as one single word because Jieba uses a Hidden Markov Model with
|
||||
Viterbi algorithm to guess the best possibility.
|
||||
|
||||
searchMode: whether to further cut long words into several short words.
|
||||
In Chinese, some long words may contain other words; for example "交换机"
|
||||
is a Chinese word for "Switcher"; if searchMode is false, it will treat
|
||||
"交换机" as a single word. If searchMode is true, it will further split
|
||||
this word into "交换", "换机", which are valid Chinese words.
|
||||
*/
|
||||
func NewJiebaTokenizer(dictFile fs.File, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||
var seg jieba.Segmenter
|
||||
err := seg.LoadDictionary(dictFile)
|
||||
return &JiebaTokenizer{
|
||||
seg: seg,
|
||||
hmm: hmm,
|
||||
searchMode: searchMode,
|
||||
}, err
|
||||
}
|
||||
|
||||
/*
|
||||
NewJiebaTokenizerAt creates a new JiebaTokenizer.
|
||||
|
||||
Parameters:
|
||||
|
||||
dictFilePath: path of the dictionary file.
|
||||
@@ -41,9 +71,9 @@ Parameters:
|
||||
"交换机" as a single word. If searchMode is true, it will further split
|
||||
this word into "交换", "换机", which are valid Chinese words.
|
||||
*/
|
||||
func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||
func NewJiebaTokenizerAt(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||
var seg jieba.Segmenter
|
||||
err := seg.LoadDictionary(dictFilePath)
|
||||
err := seg.LoadDictionaryAt(dictFilePath)
|
||||
return &JiebaTokenizer{
|
||||
seg: seg,
|
||||
hmm: hmm,
|
||||
@@ -107,18 +137,13 @@ JiebaTokenizerConstructor creates a JiebaTokenizer.
|
||||
|
||||
Parameter config should contain at least one parameter:
|
||||
|
||||
file: the path of the dictionary file.
|
||||
file: the path of the dictionary file or fs.File.
|
||||
|
||||
hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
|
||||
|
||||
search: optional, specify whether to use search mode, see NewJiebaTokenizer for details.
|
||||
*/
|
||||
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
|
||||
analysis.Tokenizer, error) {
|
||||
dictFilePath, ok := config["file"].(string)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify dictionary file path")
|
||||
}
|
||||
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
|
||||
hmm, ok := config["hmm"].(bool)
|
||||
if !ok {
|
||||
hmm = true
|
||||
@@ -127,8 +152,12 @@ func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Ca
|
||||
if !ok {
|
||||
searchMode = true
|
||||
}
|
||||
|
||||
return NewJiebaTokenizer(dictFilePath, hmm, searchMode)
|
||||
dictFilePath, ok := config["file"].(string)
|
||||
if ok {
|
||||
return NewJiebaTokenizerAt(dictFilePath, hmm, searchMode)
|
||||
}
|
||||
dictFile := config["file"].(fs.File)
|
||||
return NewJiebaTokenizer(dictFile, hmm, searchMode)
|
||||
}
|
||||
|
||||
func detectTokenType(term string) analysis.TokenType {
|
||||
|
||||
@@ -5219,7 +5219,7 @@ func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, false)
|
||||
tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", true, false)
|
||||
for _, test := range tests {
|
||||
actual := tokenizer.Tokenize(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
@@ -11057,7 +11057,7 @@ func TestJiebaTokenizerSearchModeWithHMM(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, true)
|
||||
tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", true, true)
|
||||
for _, test := range tests {
|
||||
actual := tokenizer.Tokenize(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
@@ -16474,7 +16474,7 @@ func TestJiebaTokenizerDefaultModeWithoutHMM(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, false)
|
||||
tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", false, false)
|
||||
for _, test := range tests {
|
||||
actual := tokenizer.Tokenize(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
@@ -22506,7 +22506,7 @@ func TestJiebaTokenizerSearchModeWithoutHMM(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, true)
|
||||
tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", false, true)
|
||||
for _, test := range tests {
|
||||
actual := tokenizer.Tokenize(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
|
||||
Reference in New Issue
Block a user