mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-30 09:00:30 +08:00
优化 dict, add fs.File 支持
This commit is contained in:
@@ -6,8 +6,8 @@ import (
|
|||||||
|
|
||||||
func Example_extractTags() {
|
func Example_extractTags() {
|
||||||
var t TagExtracter
|
var t TagExtracter
|
||||||
t.LoadDictionary("../dict.txt")
|
t.LoadDictionaryAt("../dict.txt")
|
||||||
t.LoadIdf("idf.txt")
|
t.LoadIdfAt("idf.txt")
|
||||||
|
|
||||||
sentence := "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。"
|
sentence := "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。"
|
||||||
segments := t.ExtractTags(sentence, 5)
|
segments := t.ExtractTags(sentence, 5)
|
||||||
@@ -20,7 +20,7 @@ func Example_extractTags() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func Example_textRank() {
|
func Example_textRank() {
|
||||||
t, err := NewTextRanker("../dict.txt")
|
t, err := NewTextRankerAt("../dict.txt")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package analyse
|
package analyse
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"io/fs"
|
||||||
"sort"
|
"sort"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
@@ -38,8 +39,12 @@ func (i *Idf) Load(tokens ...dictionary.Token) {
|
|||||||
i.Unlock()
|
i.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (i *Idf) loadDictionary(fileName string) error {
|
func (i *Idf) loadDictionary(file fs.File) error {
|
||||||
return dictionary.LoadDictionary(i, fileName)
|
return dictionary.LoadDictionary(i, file)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (i *Idf) loadDictionaryAt(fileName string) error {
|
||||||
|
return dictionary.LoadDictionaryAt(i, fileName)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Frequency returns the IDF of given word.
|
// Frequency returns the IDF of given word.
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package analyse
|
package analyse
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"io/fs"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
"github.com/fumiama/jieba/dictionary"
|
"github.com/fumiama/jieba/dictionary"
|
||||||
@@ -82,6 +83,10 @@ func (s *StopWord) Load(tokens ...dictionary.Token) {
|
|||||||
s.Unlock()
|
s.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *StopWord) loadDictionary(fileName string) error {
|
func (s *StopWord) loadDictionary(file fs.File) error {
|
||||||
return dictionary.LoadDictionary(s, fileName)
|
return dictionary.LoadDictionary(s, file)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *StopWord) loadDictionaryAt(file string) error {
|
||||||
|
return dictionary.LoadDictionaryAt(s, file)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
package analyse
|
package analyse
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"io/fs"
|
||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
@@ -52,22 +53,41 @@ type TagExtracter struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// LoadDictionary reads the given filename and create a new dictionary.
|
// LoadDictionary reads the given filename and create a new dictionary.
|
||||||
func (t *TagExtracter) LoadDictionary(fileName string) error {
|
func (t *TagExtracter) LoadDictionary(file fs.File) error {
|
||||||
t.stopWord = NewStopWord()
|
t.stopWord = NewStopWord()
|
||||||
t.seg = new(jieba.Segmenter)
|
t.seg = new(jieba.Segmenter)
|
||||||
return t.seg.LoadDictionary(fileName)
|
return t.seg.LoadDictionary(file)
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadDictionaryAt reads the given filename and create a new dictionary.
|
||||||
|
func (t *TagExtracter) LoadDictionaryAt(fileName string) error {
|
||||||
|
t.stopWord = NewStopWord()
|
||||||
|
t.seg = new(jieba.Segmenter)
|
||||||
|
return t.seg.LoadDictionaryAt(fileName)
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadIdf reads the given file and create a new Idf dictionary.
|
// LoadIdf reads the given file and create a new Idf dictionary.
|
||||||
func (t *TagExtracter) LoadIdf(fileName string) error {
|
func (t *TagExtracter) LoadIdf(file fs.File) error {
|
||||||
t.idf = NewIdf()
|
t.idf = NewIdf()
|
||||||
return t.idf.loadDictionary(fileName)
|
return t.idf.loadDictionary(file)
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadIdfAt reads the given file and create a new Idf dictionary.
|
||||||
|
func (t *TagExtracter) LoadIdfAt(fileName string) error {
|
||||||
|
t.idf = NewIdf()
|
||||||
|
return t.idf.loadDictionaryAt(fileName)
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadStopWords reads the given file and create a new StopWord dictionary.
|
// LoadStopWords reads the given file and create a new StopWord dictionary.
|
||||||
func (t *TagExtracter) LoadStopWords(fileName string) error {
|
func (t *TagExtracter) LoadStopWords(file fs.File) error {
|
||||||
t.stopWord = NewStopWord()
|
t.stopWord = NewStopWord()
|
||||||
return t.stopWord.loadDictionary(fileName)
|
return t.stopWord.loadDictionary(file)
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadStopWordsAt reads the given file and create a new StopWord dictionary.
|
||||||
|
func (t *TagExtracter) LoadStopWordsAt(file string) error {
|
||||||
|
t.stopWord = NewStopWord()
|
||||||
|
return t.stopWord.loadDictionaryAt(file)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ExtractTags extracts the topK key words from sentence.
|
// ExtractTags extracts the topK key words from sentence.
|
||||||
|
|||||||
@@ -256,8 +256,8 @@ var (
|
|||||||
|
|
||||||
func TestExtractTags(t *testing.T) {
|
func TestExtractTags(t *testing.T) {
|
||||||
var te TagExtracter
|
var te TagExtracter
|
||||||
te.LoadDictionary("../dict.txt")
|
te.LoadDictionaryAt("../dict.txt")
|
||||||
te.LoadIdf("idf.txt")
|
te.LoadIdfAt("idf.txt")
|
||||||
|
|
||||||
for index, sentence := range testContents {
|
for index, sentence := range testContents {
|
||||||
result := te.ExtractTags(sentence, 20)
|
result := te.ExtractTags(sentence, 20)
|
||||||
@@ -274,8 +274,8 @@ func TestExtractTags(t *testing.T) {
|
|||||||
|
|
||||||
func TestExtratTagsWithWeight(t *testing.T) {
|
func TestExtratTagsWithWeight(t *testing.T) {
|
||||||
var te TagExtracter
|
var te TagExtracter
|
||||||
te.LoadDictionary("../dict.txt")
|
te.LoadDictionaryAt("../dict.txt")
|
||||||
te.LoadIdf("idf.txt")
|
te.LoadIdfAt("idf.txt")
|
||||||
result := te.ExtractTags(Lyric, 10)
|
result := te.ExtractTags(Lyric, 10)
|
||||||
for index, tag := range result {
|
for index, tag := range result {
|
||||||
if LyciWeight[index].text != tag.text ||
|
if LyciWeight[index].text != tag.text ||
|
||||||
@@ -287,9 +287,9 @@ func TestExtratTagsWithWeight(t *testing.T) {
|
|||||||
|
|
||||||
func TestExtractTagsWithStopWordsFile(t *testing.T) {
|
func TestExtractTagsWithStopWordsFile(t *testing.T) {
|
||||||
var te TagExtracter
|
var te TagExtracter
|
||||||
te.LoadDictionary("../dict.txt")
|
te.LoadDictionaryAt("../dict.txt")
|
||||||
te.LoadIdf("idf.txt")
|
te.LoadIdfAt("idf.txt")
|
||||||
te.LoadStopWords("stop_words.txt")
|
te.LoadStopWordsAt("stop_words.txt")
|
||||||
result := te.ExtractTags(Lyric, 7)
|
result := te.ExtractTags(Lyric, 7)
|
||||||
for index, tag := range result {
|
for index, tag := range result {
|
||||||
if LyciWeight2[index].text != tag.text ||
|
if LyciWeight2[index].text != tag.text ||
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ package analyse
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"hash/crc64"
|
"hash/crc64"
|
||||||
|
"io/fs"
|
||||||
"math"
|
"math"
|
||||||
"sort"
|
"sort"
|
||||||
|
|
||||||
@@ -173,7 +174,13 @@ func (t *TextRanker) TextRank(sentence string, topK int) Segments {
|
|||||||
type TextRanker posseg.Segmenter
|
type TextRanker posseg.Segmenter
|
||||||
|
|
||||||
// NewTextRanker reads a given file and create a new dictionary file for Textranker.
|
// NewTextRanker reads a given file and create a new dictionary file for Textranker.
|
||||||
func NewTextRanker(fileName string) (TextRanker, error) {
|
func NewTextRanker(file fs.File) (TextRanker, error) {
|
||||||
seg := posseg.Segmenter{}
|
seg := posseg.Segmenter{}
|
||||||
return TextRanker(seg), seg.LoadDictionary(fileName)
|
return TextRanker(seg), seg.LoadDictionary(file)
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewTextRankerAt reads a given file and create a new dictionary file for Textranker.
|
||||||
|
func NewTextRankerAt(fileName string) (TextRanker, error) {
|
||||||
|
seg := posseg.Segmenter{}
|
||||||
|
return TextRanker(seg), seg.LoadDictionaryAt(fileName)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ var (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func TestTextRank(t *testing.T) {
|
func TestTextRank(t *testing.T) {
|
||||||
tr, err := NewTextRanker("../dict.txt")
|
tr, err := NewTextRankerAt("../dict.txt")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package jieba
|
package jieba
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"io/fs"
|
||||||
"math"
|
"math"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
@@ -57,6 +58,10 @@ func (d *Dictionary) Frequency(key string) (float64, bool) {
|
|||||||
return freq, ok
|
return freq, ok
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *Dictionary) loadDictionary(fileName string) error {
|
func (d *Dictionary) loadDictionary(file fs.File) error {
|
||||||
return dictionary.LoadDictionary(d, fileName)
|
return dictionary.LoadDictionary(d, file)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *Dictionary) loadDictionaryAt(file string) error {
|
||||||
|
return dictionary.LoadDictionaryAt(d, file)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,8 +4,8 @@ package dictionary
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
|
"io/fs"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
@@ -17,7 +17,7 @@ type DictLoader interface {
|
|||||||
AddToken(Token)
|
AddToken(Token)
|
||||||
}
|
}
|
||||||
|
|
||||||
func loadDictionary(file *os.File) (tokens []Token, err error) {
|
func loadDictionary(file fs.File) (tokens []Token, err error) {
|
||||||
scanner := bufio.NewScanner(file)
|
scanner := bufio.NewScanner(file)
|
||||||
var token Token
|
var token Token
|
||||||
var line string
|
var line string
|
||||||
@@ -45,12 +45,18 @@ func loadDictionary(file *os.File) (tokens []Token, err error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// LoadDictionary reads the given file and passes all tokens to a DictLoader.
|
// LoadDictionary reads the given file and passes all tokens to a DictLoader.
|
||||||
func LoadDictionary(dl DictLoader, fileName string) error {
|
func LoadDictionary(dl DictLoader, file fs.File) error {
|
||||||
filePath, err := dictPath(fileName)
|
tokens, err := loadDictionary(file)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
dictFile, err := os.Open(filePath)
|
dl.Load(tokens...)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadDictionaryAt reads the given file and passes all tokens to a DictLoader.
|
||||||
|
func LoadDictionaryAt(dl DictLoader, file string) error {
|
||||||
|
dictFile, err := os.Open(file)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@@ -62,16 +68,3 @@ func LoadDictionary(dl DictLoader, fileName string) error {
|
|||||||
dl.Load(tokens...)
|
dl.Load(tokens...)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func dictPath(dictFileName string) (string, error) {
|
|
||||||
if filepath.IsAbs(dictFileName) {
|
|
||||||
return dictFileName, nil
|
|
||||||
}
|
|
||||||
var dictFilePath string
|
|
||||||
cwd, err := os.Getwd()
|
|
||||||
if err != nil {
|
|
||||||
return dictFilePath, err
|
|
||||||
}
|
|
||||||
dictFilePath = filepath.Clean(filepath.Join(cwd, dictFileName))
|
|
||||||
return dictFilePath, nil
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ func (d *Dict) AddToken(token Token) {
|
|||||||
|
|
||||||
func TestLoadDictionary(t *testing.T) {
|
func TestLoadDictionary(t *testing.T) {
|
||||||
d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
||||||
err := LoadDictionary(d, "../userdict.txt")
|
err := LoadDictionaryAt(d, "../userdict.txt")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf(err.Error())
|
t.Fatalf(err.Error())
|
||||||
}
|
}
|
||||||
@@ -48,8 +48,8 @@ func TestLoadDictionary(t *testing.T) {
|
|||||||
|
|
||||||
func TestAddToken(t *testing.T) {
|
func TestAddToken(t *testing.T) {
|
||||||
d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
||||||
LoadDictionary(d, "../userdict.txt")
|
LoadDictionaryAt(d, "../userdict.txt")
|
||||||
d.AddToken(Token{"好用", 99, "a"})
|
d.AddToken(Token{99, "好用", "a"})
|
||||||
if d.freqMap["好用"] != 99 {
|
if d.freqMap["好用"] != 99 {
|
||||||
t.Fatalf("Failed to add token, got frequency %f, expected 99", d.freqMap["好用"])
|
t.Fatalf("Failed to add token, got frequency %f, expected 99", d.freqMap["好用"])
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,8 +2,8 @@ package dictionary
|
|||||||
|
|
||||||
// Token represents a Chinese word with (optional) frequency and POS.
|
// Token represents a Chinese word with (optional) frequency and POS.
|
||||||
type Token struct {
|
type Token struct {
|
||||||
text string
|
|
||||||
frequency float64
|
frequency float64
|
||||||
|
text string
|
||||||
pos string
|
pos string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ func Example_parallelCut() {
|
|||||||
runtime.GOMAXPROCS(numThreads)
|
runtime.GOMAXPROCS(numThreads)
|
||||||
|
|
||||||
// Load dictionary
|
// Load dictionary
|
||||||
segmenter.LoadDictionary("dict.txt")
|
segmenter.LoadDictionaryAt("dict.txt")
|
||||||
|
|
||||||
// open file for segmentation
|
// open file for segmentation
|
||||||
file, err := os.Open("README.md")
|
file, err := os.Open("README.md")
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import (
|
|||||||
|
|
||||||
func Example() {
|
func Example() {
|
||||||
var seg Segmenter
|
var seg Segmenter
|
||||||
seg.LoadDictionary("dict.txt")
|
seg.LoadDictionaryAt("dict.txt")
|
||||||
|
|
||||||
fmt.Print("【全模式】:")
|
fmt.Print("【全模式】:")
|
||||||
fmt.Println(seg.CutAll("我来到北京清华大学"))
|
fmt.Println(seg.CutAll("我来到北京清华大学"))
|
||||||
@@ -28,7 +28,7 @@ func Example() {
|
|||||||
|
|
||||||
func Example_suggestFrequency() {
|
func Example_suggestFrequency() {
|
||||||
var seg Segmenter
|
var seg Segmenter
|
||||||
seg.LoadDictionary("dict.txt")
|
seg.LoadDictionaryAt("dict.txt")
|
||||||
|
|
||||||
sentence := "超敏C反应蛋白是什么?"
|
sentence := "超敏C反应蛋白是什么?"
|
||||||
fmt.Print("Before:")
|
fmt.Print("Before:")
|
||||||
@@ -76,13 +76,13 @@ func Example_suggestFrequency() {
|
|||||||
|
|
||||||
func Example_loadUserDictionary() {
|
func Example_loadUserDictionary() {
|
||||||
var seg Segmenter
|
var seg Segmenter
|
||||||
seg.LoadDictionary("dict.txt")
|
seg.LoadDictionaryAt("dict.txt")
|
||||||
|
|
||||||
sentence := "李小福是创新办主任也是云计算方面的专家"
|
sentence := "李小福是创新办主任也是云计算方面的专家"
|
||||||
fmt.Print("Before:")
|
fmt.Print("Before:")
|
||||||
fmt.Println(seg.Cut(sentence, true))
|
fmt.Println(seg.Cut(sentence, true))
|
||||||
|
|
||||||
seg.LoadUserDictionary("userdict.txt")
|
seg.LoadUserDictionaryAt("userdict.txt")
|
||||||
|
|
||||||
fmt.Print("After:")
|
fmt.Print("After:")
|
||||||
fmt.Println(seg.Cut(sentence, true))
|
fmt.Println(seg.Cut(sentence, true))
|
||||||
|
|||||||
23
jieba.go
23
jieba.go
@@ -2,6 +2,7 @@
|
|||||||
package jieba
|
package jieba
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"io/fs"
|
||||||
"math"
|
"math"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -93,16 +94,30 @@ func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
|
|||||||
|
|
||||||
// LoadDictionary loads dictionary from given file name. Everytime
|
// LoadDictionary loads dictionary from given file name. Everytime
|
||||||
// LoadDictionary is called, previously loaded dictionary will be cleard.
|
// LoadDictionary is called, previously loaded dictionary will be cleard.
|
||||||
func (seg *Segmenter) LoadDictionary(fileName string) error {
|
func (seg *Segmenter) LoadDictionary(file fs.File) error {
|
||||||
seg.dict = &Dictionary{freqMap: make(map[string]float64)}
|
seg.dict = &Dictionary{freqMap: make(map[string]float64)}
|
||||||
return seg.dict.loadDictionary(fileName)
|
return seg.dict.loadDictionary(file)
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadDictionaryAt loads dictionary from given file name. Everytime
|
||||||
|
// LoadDictionaryAt is called, previously loaded dictionary will be cleared.
|
||||||
|
func (seg *Segmenter) LoadDictionaryAt(file string) error {
|
||||||
|
seg.dict = &Dictionary{freqMap: make(map[string]float64)}
|
||||||
|
return seg.dict.loadDictionaryAt(file)
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadUserDictionary loads a user specified dictionary, it must be called
|
// LoadUserDictionary loads a user specified dictionary, it must be called
|
||||||
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
||||||
// instead it will override exist entries.
|
// instead it will override exist entries.
|
||||||
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
|
func (seg *Segmenter) LoadUserDictionary(file fs.File) error {
|
||||||
return seg.dict.loadDictionary(fileName)
|
return seg.dict.loadDictionary(file)
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadUserDictionaryAt loads a user specified dictionary, it must be called
|
||||||
|
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
||||||
|
// instead it will override exist entries.
|
||||||
|
func (seg *Segmenter) LoadUserDictionaryAt(file string) error {
|
||||||
|
return seg.dict.loadDictionaryAt(file)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (seg *Segmenter) dag(runes []rune) map[int][]int {
|
func (seg *Segmenter) dag(runes []rune) map[int][]int {
|
||||||
|
|||||||
@@ -616,7 +616,7 @@ var (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
seg.LoadDictionary("dict.txt")
|
seg.LoadDictionaryAt("dict.txt")
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestCutDAG(t *testing.T) {
|
func TestCutDAG(t *testing.T) {
|
||||||
@@ -715,7 +715,7 @@ func TestCutForSearch(t *testing.T) {
|
|||||||
|
|
||||||
func TestLoadDictionary(t *testing.T) {
|
func TestLoadDictionary(t *testing.T) {
|
||||||
var result []string
|
var result []string
|
||||||
seg.LoadDictionary("foobar.txt")
|
seg.LoadDictionaryAt("foobar.txt")
|
||||||
for index, content := range testContents {
|
for index, content := range testContents {
|
||||||
result = seg.Cut(content, true)
|
result = seg.Cut(content, true)
|
||||||
if len(result) != len(userDictCutResult[index]) {
|
if len(result) != len(userDictCutResult[index]) {
|
||||||
@@ -728,11 +728,11 @@ func TestLoadDictionary(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
seg.LoadDictionary("dict.txt")
|
seg.LoadDictionaryAt("dict.txt")
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestLoadUserDictionary(t *testing.T) {
|
func TestLoadUserDictionary(t *testing.T) {
|
||||||
seg.LoadUserDictionary("userdict.txt")
|
seg.LoadUserDictionaryAt("userdict.txt")
|
||||||
|
|
||||||
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
||||||
result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", ",", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"}
|
result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", ",", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"}
|
||||||
@@ -771,7 +771,7 @@ func TestLoadUserDictionary(t *testing.T) {
|
|||||||
t.Fatal(word)
|
t.Fatal(word)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
seg.LoadDictionary("dict.txt")
|
seg.LoadDictionaryAt("dict.txt")
|
||||||
}
|
}
|
||||||
|
|
||||||
func BenchmarkCutNoHMM(b *testing.B) {
|
func BenchmarkCutNoHMM(b *testing.B) {
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package posseg
|
package posseg
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"io/fs"
|
||||||
"math"
|
"math"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
@@ -69,6 +70,10 @@ func (d *Dictionary) Pos(key string) (string, bool) {
|
|||||||
return pos, ok
|
return pos, ok
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *Dictionary) loadDictionary(fileName string) error {
|
func (d *Dictionary) loadDictionary(file fs.File) error {
|
||||||
return dictionary.LoadDictionary(d, fileName)
|
return dictionary.LoadDictionary(d, file)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *Dictionary) loadDictionaryAt(file string) error {
|
||||||
|
return dictionary.LoadDictionaryAt(d, file)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ import (
|
|||||||
|
|
||||||
func Example() {
|
func Example() {
|
||||||
var seg posseg.Segmenter
|
var seg posseg.Segmenter
|
||||||
seg.LoadDictionary("../dict.txt")
|
seg.LoadDictionaryAt("../dict.txt")
|
||||||
|
|
||||||
for segment := range seg.Cut("我爱北京天安门", true) {
|
for segment := range seg.Cut("我爱北京天安门", true) {
|
||||||
fmt.Printf("%s %s\n", segment.Text(), segment.Pos())
|
fmt.Printf("%s %s\n", segment.Text(), segment.Pos())
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
package posseg
|
package posseg
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"io/fs"
|
||||||
"math"
|
"math"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
|
||||||
@@ -39,17 +40,31 @@ type Segmenter struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// LoadDictionary loads dictionary from given file name.
|
// LoadDictionary loads dictionary from given file name.
|
||||||
// Everytime LoadDictionary is called, previously loaded dictionary will be cleard.
|
// Every time LoadDictionary is called, the previously loaded dictionary will be cleared.
|
||||||
func (seg *Segmenter) LoadDictionary(fileName string) error {
|
func (seg *Segmenter) LoadDictionary(file fs.File) error {
|
||||||
seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
||||||
return seg.dict.loadDictionary(fileName)
|
return seg.dict.loadDictionary(file)
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadDictionaryAt loads dictionary from given file name.
|
||||||
|
// Every time LoadDictionaryAt is called, the previously loaded dictionary will be cleared.
|
||||||
|
func (seg *Segmenter) LoadDictionaryAt(fileName string) error {
|
||||||
|
seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
||||||
|
return seg.dict.loadDictionaryAt(fileName)
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadUserDictionary loads a user specified dictionary, it must be called
|
// LoadUserDictionary loads a user specified dictionary, it must be called
|
||||||
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
||||||
// instead it will override exist entries.
|
// instead it will override exist entries.
|
||||||
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
|
func (seg *Segmenter) LoadUserDictionary(file fs.File) error {
|
||||||
return seg.dict.loadDictionary(fileName)
|
return seg.dict.loadDictionary(file)
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadUserDictionaryAt loads a user specified dictionary, it must be called
|
||||||
|
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
||||||
|
// instead it will override exist entries.
|
||||||
|
func (seg *Segmenter) LoadUserDictionaryAt(fileName string) error {
|
||||||
|
return seg.dict.loadDictionaryAt(fileName)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment {
|
func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment {
|
||||||
|
|||||||
@@ -269,7 +269,7 @@ var (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
seg.LoadDictionary("../dict.txt")
|
seg.LoadDictionaryAt("../dict.txt")
|
||||||
}
|
}
|
||||||
|
|
||||||
func chanToArray(ch <-chan Segment) []Segment {
|
func chanToArray(ch <-chan Segment) []Segment {
|
||||||
@@ -357,8 +357,8 @@ func TestBug137(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestUserDict(t *testing.T) {
|
func TestUserDict(t *testing.T) {
|
||||||
seg.LoadUserDictionary("../userdict.txt")
|
seg.LoadUserDictionaryAt("../userdict.txt")
|
||||||
defer seg.LoadDictionary("../dict.txt")
|
defer seg.LoadDictionaryAt("../dict.txt")
|
||||||
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
||||||
|
|
||||||
cutResult := []Segment{
|
cutResult := []Segment{
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ func Example() {
|
|||||||
sentence := []byte("永和服装饰品有限公司")
|
sentence := []byte("永和服装饰品有限公司")
|
||||||
|
|
||||||
// default mode
|
// default mode
|
||||||
tokenizer, _ := tokenizers.NewJiebaTokenizer("../dict.txt", true, false)
|
tokenizer, _ := tokenizers.NewJiebaTokenizerAt("../dict.txt", true, false)
|
||||||
fmt.Println("Default Mode:")
|
fmt.Println("Default Mode:")
|
||||||
for _, token := range tokenizer.Tokenize(sentence) {
|
for _, token := range tokenizer.Tokenize(sentence) {
|
||||||
fmt.Printf(
|
fmt.Printf(
|
||||||
@@ -19,7 +19,7 @@ func Example() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
//search mode
|
//search mode
|
||||||
tokenizer, _ = tokenizers.NewJiebaTokenizer("../dict.txt", true, true)
|
tokenizer, _ = tokenizers.NewJiebaTokenizerAt("../dict.txt", true, true)
|
||||||
fmt.Println("Search Mode:")
|
fmt.Println("Search Mode:")
|
||||||
for _, token := range tokenizer.Tokenize(sentence) {
|
for _, token := range tokenizer.Tokenize(sentence) {
|
||||||
fmt.Printf(
|
fmt.Printf(
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
package tokenizers
|
package tokenizers
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"io/fs"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
|
||||||
@@ -24,6 +24,36 @@ type JiebaTokenizer struct {
|
|||||||
/*
|
/*
|
||||||
NewJiebaTokenizer creates a new JiebaTokenizer.
|
NewJiebaTokenizer creates a new JiebaTokenizer.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
|
||||||
|
dictFile: the dictionary file.
|
||||||
|
|
||||||
|
hmm: whether to use Hidden Markov Model to cut unknown words,
|
||||||
|
i.e. not found in dictionary. For example word "安卓" (means "Android" in
|
||||||
|
English) not in the dictionary file. If hmm is set to false, it will be
|
||||||
|
cut into two single words "安" and "卓"; if hmm is set to true, it will
|
||||||
|
be treated as one single word because Jieba uses the Hidden Markov Model with
|
||||||
|
Viterbi algorithm to guess the best possibility.
|
||||||
|
|
||||||
|
searchMode: whether to further cut long words into several short words.
|
||||||
|
In Chinese, some long words may contain other words, for example "交换机"
|
||||||
|
is a Chinese word for "Switcher", if searchMode is false, it will treat
|
||||||
|
"交换机" as a single word. If searchMode is true, it will further split
|
||||||
|
this word into "交换", "换机", which are valid Chinese words.
|
||||||
|
*/
|
||||||
|
func NewJiebaTokenizer(dictFile fs.File, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||||
|
var seg jieba.Segmenter
|
||||||
|
err := seg.LoadDictionary(dictFile)
|
||||||
|
return &JiebaTokenizer{
|
||||||
|
seg: seg,
|
||||||
|
hmm: hmm,
|
||||||
|
searchMode: searchMode,
|
||||||
|
}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
NewJiebaTokenizerAt creates a new JiebaTokenizer.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
|
|
||||||
dictFilePath: path of the dictioanry file.
|
dictFilePath: path of the dictioanry file.
|
||||||
@@ -41,9 +71,9 @@ Parameters:
|
|||||||
"交换机" as a single word. If searchMode is true, it will further split
|
"交换机" as a single word. If searchMode is true, it will further split
|
||||||
this word into "交换", "换机", which are valid Chinese words.
|
this word into "交换", "换机", which are valid Chinese words.
|
||||||
*/
|
*/
|
||||||
func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
func NewJiebaTokenizerAt(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||||
var seg jieba.Segmenter
|
var seg jieba.Segmenter
|
||||||
err := seg.LoadDictionary(dictFilePath)
|
err := seg.LoadDictionaryAt(dictFilePath)
|
||||||
return &JiebaTokenizer{
|
return &JiebaTokenizer{
|
||||||
seg: seg,
|
seg: seg,
|
||||||
hmm: hmm,
|
hmm: hmm,
|
||||||
@@ -107,18 +137,13 @@ JiebaTokenizerConstructor creates a JiebaTokenizer.
|
|||||||
|
|
||||||
Parameter config should contains at least one parameter:
|
Parameter config should contains at least one parameter:
|
||||||
|
|
||||||
file: the path of the dictionary file.
|
file: the path of the dictionary file (string), or an already opened fs.File.
|
||||||
|
|
||||||
hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
|
hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
|
||||||
|
|
||||||
search: optional, speficy whether to use search mode, see NewJiebaTokenizer for details.
|
search: optional, speficy whether to use search mode, see NewJiebaTokenizer for details.
|
||||||
*/
|
*/
|
||||||
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
|
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
|
||||||
analysis.Tokenizer, error) {
|
|
||||||
dictFilePath, ok := config["file"].(string)
|
|
||||||
if !ok {
|
|
||||||
return nil, fmt.Errorf("must specify dictionary file path")
|
|
||||||
}
|
|
||||||
hmm, ok := config["hmm"].(bool)
|
hmm, ok := config["hmm"].(bool)
|
||||||
if !ok {
|
if !ok {
|
||||||
hmm = true
|
hmm = true
|
||||||
@@ -127,8 +152,12 @@ func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Ca
|
|||||||
if !ok {
|
if !ok {
|
||||||
searchMode = true
|
searchMode = true
|
||||||
}
|
}
|
||||||
|
dictFilePath, ok := config["file"].(string)
|
||||||
return NewJiebaTokenizer(dictFilePath, hmm, searchMode)
|
if ok {
|
||||||
|
return NewJiebaTokenizerAt(dictFilePath, hmm, searchMode)
|
||||||
|
}
|
||||||
|
dictFile := config["file"].(fs.File)
|
||||||
|
return NewJiebaTokenizer(dictFile, hmm, searchMode)
|
||||||
}
|
}
|
||||||
|
|
||||||
func detectTokenType(term string) analysis.TokenType {
|
func detectTokenType(term string) analysis.TokenType {
|
||||||
|
|||||||
@@ -5219,7 +5219,7 @@ func TestJiebaTokenizerDefaultModeWithHMM(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, false)
|
tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", true, false)
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
actual := tokenizer.Tokenize(test.input)
|
actual := tokenizer.Tokenize(test.input)
|
||||||
if !reflect.DeepEqual(actual, test.output) {
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
@@ -11057,7 +11057,7 @@ func TestJiebaTokenizerSearchModeWithHMM(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
tokenizer, _ := NewJiebaTokenizer("../dict.txt", true, true)
|
tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", true, true)
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
actual := tokenizer.Tokenize(test.input)
|
actual := tokenizer.Tokenize(test.input)
|
||||||
if !reflect.DeepEqual(actual, test.output) {
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
@@ -16474,7 +16474,7 @@ func TestJiebaTokenizerDefaultModeWithoutHMM(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, false)
|
tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", false, false)
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
actual := tokenizer.Tokenize(test.input)
|
actual := tokenizer.Tokenize(test.input)
|
||||||
if !reflect.DeepEqual(actual, test.output) {
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
@@ -22506,7 +22506,7 @@ func TestJiebaTokenizerSearchModeWithoutHMM(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
tokenizer, _ := NewJiebaTokenizer("../dict.txt", false, true)
|
tokenizer, _ := NewJiebaTokenizerAt("../dict.txt", false, true)
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
actual := tokenizer.Tokenize(test.input)
|
actual := tokenizer.Tokenize(test.input)
|
||||||
if !reflect.DeepEqual(actual, test.output) {
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
|||||||
Reference in New Issue
Block a user