1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-22 20:20:44 +08:00

优化 jieba

This commit is contained in:
源文雨
2022-11-30 16:00:56 +08:00
parent 4d76899e79
commit 6982ead703
8 changed files with 114 additions and 109 deletions

View File

@@ -53,17 +53,17 @@ type TagExtracter struct {
} }
// LoadDictionary reads the given filename and creates a new dictionary. // LoadDictionary reads the given filename and creates a new dictionary.
func (t *TagExtracter) LoadDictionary(file fs.File) error { func (t *TagExtracter) LoadDictionary(file fs.File) (err error) {
t.stopWord = NewStopWord() t.stopWord = NewStopWord()
t.seg = new(jieba.Segmenter) t.seg, err = jieba.LoadDictionary(file)
return t.seg.LoadDictionary(file) return
} }
// LoadDictionaryAt reads the given filename and creates a new dictionary. // LoadDictionaryAt reads the given filename and creates a new dictionary.
func (t *TagExtracter) LoadDictionaryAt(fileName string) error { func (t *TagExtracter) LoadDictionaryAt(file string) (err error) {
t.stopWord = NewStopWord() t.stopWord = NewStopWord()
t.seg = new(jieba.Segmenter) t.seg, err = jieba.LoadDictionaryAt(file)
return t.seg.LoadDictionaryAt(fileName) return
} }
// LoadIdf reads the given file and creates a new Idf dictionary. // LoadIdf reads the given file and creates a new Idf dictionary.

View File

@@ -10,9 +10,9 @@ import (
// A Dictionary represents a thread-safe dictionary used for word segmentation. // A Dictionary represents a thread-safe dictionary used for word segmentation.
type Dictionary struct { type Dictionary struct {
sync.RWMutex
total, logTotal float64 total, logTotal float64
freqMap map[string]float64 freqMap map[string]float64
sync.RWMutex
} }
// Load loads all tokens // Load loads all tokens

View File

@@ -16,7 +16,7 @@ type line struct {
} }
var ( var (
segmenter = Segmenter{} segmenter *Segmenter
numThreads = runtime.NumCPU() numThreads = runtime.NumCPU()
task = make(chan line, numThreads) task = make(chan line, numThreads)
result = make(chan line, numThreads) result = make(chan line, numThreads)
@@ -35,9 +35,6 @@ func Example_parallelCut() {
// Set the number of goroutines // Set the number of goroutines
runtime.GOMAXPROCS(numThreads) runtime.GOMAXPROCS(numThreads)
// Load dictionary
segmenter.LoadDictionaryAt("dict.txt")
// open file for segmentation // open file for segmentation
file, err := os.Open("README.md") file, err := os.Open("README.md")
if err != nil { if err != nil {
@@ -45,6 +42,12 @@ func Example_parallelCut() {
} }
defer file.Close() defer file.Close()
// Load dictionary
segmenter, err = LoadDictionaryAt("dict.txt")
if err != nil {
log.Fatal(err)
}
// start worker routines // start worker routines
for i := 0; i < numThreads; i++ { for i := 0; i < numThreads; i++ {
go worker() go worker()

View File

@@ -5,8 +5,10 @@ import (
) )
func Example() { func Example() {
var seg Segmenter seg, err := LoadDictionaryAt("dict.txt")
seg.LoadDictionaryAt("dict.txt") if err != nil {
panic(err)
}
fmt.Print("【全模式】:") fmt.Print("【全模式】:")
fmt.Println(seg.CutAll("我来到北京清华大学")) fmt.Println(seg.CutAll("我来到北京清华大学"))
@@ -27,8 +29,10 @@ func Example() {
} }
func Example_suggestFrequency() { func Example_suggestFrequency() {
var seg Segmenter seg, err := LoadDictionaryAt("dict.txt")
seg.LoadDictionaryAt("dict.txt") if err != nil {
panic(err)
}
sentence := "超敏C反应蛋白是什么" sentence := "超敏C反应蛋白是什么"
fmt.Print("Before:") fmt.Print("Before:")
@@ -75,8 +79,10 @@ func Example_suggestFrequency() {
} }
func Example_loadUserDictionary() { func Example_loadUserDictionary() {
var seg Segmenter seg, err := LoadDictionaryAt("dict.txt")
seg.LoadDictionaryAt("dict.txt") if err != nil {
panic(err)
}
sentence := "李小福是创新办主任也是云计算方面的专家" sentence := "李小福是创新办主任也是云计算方面的专家"
fmt.Print("Before:") fmt.Print("Before:")

139
jieba.go
View File

@@ -21,23 +21,21 @@ var (
) )
// Segmenter is a Chinese words segmentation struct. // Segmenter is a Chinese words segmentation struct.
type Segmenter struct { type Segmenter Dictionary
dict *Dictionary
}
// Frequency returns a word's frequency and existence // Frequency returns a word's frequency and existence
func (seg *Segmenter) Frequency(word string) (float64, bool) { func (seg *Segmenter) Frequency(word string) (float64, bool) {
return seg.dict.Frequency(word) return (*Dictionary)(seg).Frequency(word)
} }
// AddWord adds a new word with frequency to dictionary // AddWord adds a new word with frequency to dictionary
func (seg *Segmenter) AddWord(word string, frequency float64) { func (seg *Segmenter) AddWord(word string, frequency float64) {
seg.dict.AddToken(dictionary.NewToken(word, frequency, "")) (*Dictionary)(seg).AddToken(dictionary.NewToken(word, frequency, ""))
} }
// DeleteWord removes a word from dictionary // DeleteWord removes a word from dictionary
func (seg *Segmenter) DeleteWord(word string) { func (seg *Segmenter) DeleteWord(word string) {
seg.dict.AddToken(dictionary.NewToken(word, 0.0, "")) (*Dictionary)(seg).AddToken(dictionary.NewToken(word, 0.0, ""))
} }
/* /*
@@ -58,79 +56,79 @@ func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
frequency := 1.0 frequency := 1.0
if len(words) > 1 { if len(words) > 1 {
for _, word := range words { for _, word := range words {
if freq, ok := seg.dict.Frequency(word); ok { if freq, ok := (*Dictionary)(seg).Frequency(word); ok {
frequency *= freq frequency *= freq
} }
frequency /= seg.dict.total frequency /= (*Dictionary)(seg).total
} }
frequency, _ = math.Modf(frequency * seg.dict.total) frequency, _ = math.Modf(frequency * (*Dictionary)(seg).total)
wordFreq := 0.0 wordFreq := 0.0
if freq, ok := seg.dict.Frequency(strings.Join(words, "")); ok { if freq, ok := (*Dictionary)(seg).Frequency(strings.Join(words, "")); ok {
wordFreq = freq wordFreq = freq
} }
if wordFreq < frequency { if wordFreq < frequency {
frequency = wordFreq frequency = wordFreq
} }
} else { return frequency
word := words[0] }
for _, segment := range seg.Cut(word, false) { word := words[0]
if freq, ok := seg.dict.Frequency(segment); ok { for _, segment := range seg.Cut(word, false) {
frequency *= freq if freq, ok := (*Dictionary)(seg).Frequency(segment); ok {
} frequency *= freq
frequency /= seg.dict.total
}
frequency, _ = math.Modf(frequency * seg.dict.total)
frequency += 1.0
wordFreq := 1.0
if freq, ok := seg.dict.Frequency(word); ok {
wordFreq = freq
}
if wordFreq > frequency {
frequency = wordFreq
} }
frequency /= (*Dictionary)(seg).total
}
frequency, _ = math.Modf(frequency * (*Dictionary)(seg).total)
frequency += 1.0
wordFreq := 1.0
if freq, ok := (*Dictionary)(seg).Frequency(word); ok {
wordFreq = freq
}
if wordFreq > frequency {
frequency = wordFreq
} }
return frequency return frequency
} }
// LoadDictionary loads dictionary from given file name. Every time // LoadDictionary loads dictionary from given file name. Every time
// LoadDictionary is called, previously loaded dictionary will be cleared. // LoadDictionary is called, previously loaded dictionary will be cleared.
func (seg *Segmenter) LoadDictionary(file fs.File) error { func LoadDictionary(file fs.File) (*Segmenter, error) {
seg.dict = &Dictionary{freqMap: make(map[string]float64)} d := &Dictionary{freqMap: make(map[string]float64)}
return seg.dict.loadDictionary(file) err := d.loadDictionary(file)
return (*Segmenter)(d), err
} }
// LoadDictionaryAt loads dictionary from given file name. Every time // LoadDictionaryAt loads dictionary from given file name. Every time
// LoadDictionaryAt is called, previously loaded dictionary will be cleared. // LoadDictionaryAt is called, previously loaded dictionary will be cleared.
func (seg *Segmenter) LoadDictionaryAt(file string) error { func LoadDictionaryAt(file string) (*Segmenter, error) {
seg.dict = &Dictionary{freqMap: make(map[string]float64)} d := &Dictionary{freqMap: make(map[string]float64)}
return seg.dict.loadDictionaryAt(file) err := d.loadDictionaryAt(file)
return (*Segmenter)(d), err
} }
// LoadUserDictionary loads a user specified dictionary, it must be called // LoadUserDictionary loads a user specified dictionary, it must be called
// after LoadDictionary, and it will not clear any previous loaded dictionary, // after LoadDictionary, and it will not clear any previous loaded dictionary,
// instead it will override existing entries. // instead it will override existing entries.
func (seg *Segmenter) LoadUserDictionary(file fs.File) error { func (seg *Segmenter) LoadUserDictionary(file fs.File) error {
return seg.dict.loadDictionary(file) return (*Dictionary)(seg).loadDictionary(file)
} }
// LoadUserDictionaryAt loads a user specified dictionary, it must be called // LoadUserDictionaryAt loads a user specified dictionary, it must be called
// after LoadDictionary, and it will not clear any previous loaded dictionary, // after LoadDictionary, and it will not clear any previous loaded dictionary,
// instead it will override existing entries. // instead it will override existing entries.
func (seg *Segmenter) LoadUserDictionaryAt(file string) error { func (seg *Segmenter) LoadUserDictionaryAt(file string) error {
return seg.dict.loadDictionaryAt(file) return (*Dictionary)(seg).loadDictionaryAt(file)
} }
func (seg *Segmenter) dag(runes []rune) map[int][]int { func (seg *Segmenter) dag(runes []rune) [][]int {
dag := make(map[int][]int)
n := len(runes) n := len(runes)
var frag []rune dag := make([][]int, n)
var i int
for k := 0; k < n; k++ { for k := 0; k < n; k++ {
dag[k] = make([]int, 0) dag[k] = make([]int, 0, 64)
i = k i := k
frag = runes[k : k+1] frag := runes[k : k+1]
for { for {
freq, ok := seg.dict.Frequency(string(frag)) freq, ok := (*Dictionary)(seg).Frequency(string(frag))
if !ok { if !ok {
break break
} }
@@ -155,20 +153,20 @@ type route struct {
index int index int
} }
func (seg *Segmenter) calc(runes []rune) map[int]route { func (seg *Segmenter) calc(runes []rune) []*route {
dag := seg.dag(runes) dag := seg.dag(runes)
n := len(runes) n := len(runes)
rs := make(map[int]route) rs := make([]*route, n+1)
rs[n] = route{frequency: 0.0, index: 0} rs[n] = &route{frequency: 0.0, index: 0}
var r route
for idx := n - 1; idx >= 0; idx-- { for idx := n - 1; idx >= 0; idx-- {
for _, i := range dag[idx] { for _, i := range dag[idx] {
if freq, ok := seg.dict.Frequency(string(runes[idx : i+1])); ok { var r *route
r = route{frequency: math.Log(freq) - seg.dict.logTotal + rs[i+1].frequency, index: i} if freq, ok := (*Dictionary)(seg).Frequency(string(runes[idx : i+1])); ok {
r = &route{frequency: math.Log(freq) - (*Dictionary)(seg).logTotal + rs[i+1].frequency, index: i}
} else { } else {
r = route{frequency: math.Log(1.0) - seg.dict.logTotal + rs[i+1].frequency, index: i} r = &route{frequency: math.Log(1.0) - (*Dictionary)(seg).logTotal + rs[i+1].frequency, index: i}
} }
if v, ok := rs[idx]; !ok { if v := rs[idx]; v == nil {
rs[idx] = r rs[idx] = r
} else { } else {
if v.frequency < r.frequency || (v.frequency == r.frequency && v.index < r.index) { if v.frequency < r.frequency || (v.frequency == r.frequency && v.index < r.index) {
@@ -190,14 +188,11 @@ type cutFunc func(sentence string) []string
func (seg *Segmenter) cutDAG(sentence string) []string { func (seg *Segmenter) cutDAG(sentence string) []string {
result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1) result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1)
runes := []rune(sentence) runes := []rune(sentence)
routes := seg.calc(runes) routes := seg.calc(runes)
var y int buf := make([]rune, 0, 256)
length := len(runes) for x := 0; x < len(runes); {
var buf []rune y := routes[x].index + 1
for x := 0; x < length; {
y = routes[x].index + 1
frag := runes[x:y] frag := runes[x:y]
if y-x == 1 { if y-x == 1 {
buf = append(buf, frag...) buf = append(buf, frag...)
@@ -207,7 +202,7 @@ func (seg *Segmenter) cutDAG(sentence string) []string {
if len(buf) == 1 { if len(buf) == 1 {
result = append(result, bufString) result = append(result, bufString)
} else { } else {
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 { if v, ok := (*Dictionary)(seg).Frequency(bufString); !ok || v == 0.0 {
result = append(result, finalseg.Cut(bufString)...) result = append(result, finalseg.Cut(bufString)...)
} else { } else {
for _, elem := range buf { for _, elem := range buf {
@@ -215,7 +210,7 @@ func (seg *Segmenter) cutDAG(sentence string) []string {
} }
} }
} }
buf = make([]rune, 0) buf = buf[:0]
} }
result = append(result, string(frag)) result = append(result, string(frag))
} }
@@ -227,7 +222,7 @@ func (seg *Segmenter) cutDAG(sentence string) []string {
if len(buf) == 1 { if len(buf) == 1 {
result = append(result, bufString) result = append(result, bufString)
} else { } else {
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 { if v, ok := (*Dictionary)(seg).Frequency(bufString); !ok || v == 0.0 {
result = append(result, finalseg.Cut(bufString)...) result = append(result, finalseg.Cut(bufString)...)
} else { } else {
for _, elem := range buf { for _, elem := range buf {
@@ -242,14 +237,11 @@ func (seg *Segmenter) cutDAG(sentence string) []string {
func (seg *Segmenter) cutDAGNoHMM(sentence string) []string { func (seg *Segmenter) cutDAGNoHMM(sentence string) []string {
result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1) result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1)
runes := []rune(sentence) runes := []rune(sentence)
routes := seg.calc(runes) routes := seg.calc(runes)
var y int buf := make([]rune, 0, 256)
length := len(runes) for x := 0; x < len(runes); {
var buf []rune y := routes[x].index + 1
for x := 0; x < length; {
y = routes[x].index + 1
frag := runes[x:y] frag := runes[x:y]
if reEng.MatchString(string(frag)) && len(frag) == 1 { if reEng.MatchString(string(frag)) && len(frag) == 1 {
buf = append(buf, frag...) buf = append(buf, frag...)
@@ -258,7 +250,7 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) []string {
} }
if len(buf) > 0 { if len(buf) > 0 {
result = append(result, string(buf)) result = append(result, string(buf))
buf = make([]rune, 0) buf = buf[:0]
} }
result = append(result, string(frag)) result = append(result, string(frag))
x = y x = y
@@ -307,17 +299,11 @@ func (seg *Segmenter) Cut(sentence string, hmm bool) []string {
func (seg *Segmenter) cutAll(sentence string) []string { func (seg *Segmenter) cutAll(sentence string) []string {
result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1) result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1)
runes := []rune(sentence) runes := []rune(sentence)
dag := seg.dag(runes) dag := seg.dag(runes)
start := -1 start := -1
ks := make([]int, len(dag)) for k := 0; k < len(dag); k++ {
for k := range dag { l := dag[k]
ks[k] = k
}
var l []int
for k := range ks {
l = dag[k]
if len(l) == 1 && k > start { if len(l) == 1 && k > start {
result = append(result, string(runes[k:l[0]+1])) result = append(result, string(runes[k:l[0]+1]))
start = l[0] start = l[0]
@@ -367,10 +353,9 @@ func (seg *Segmenter) CutForSearch(sentence string, hmm bool) []string {
if len(runes) <= increment { if len(runes) <= increment {
continue continue
} }
var gram string
for i := 0; i < len(runes)-increment+1; i++ { for i := 0; i < len(runes)-increment+1; i++ {
gram = string(runes[i : i+increment]) gram := string(runes[i : i+increment])
if v, ok := seg.dict.Frequency(gram); ok && v > 0.0 { if v, ok := (*Dictionary)(seg).Frequency(gram); ok && v > 0.0 {
result = append(result, gram) result = append(result, gram)
} }
} }

View File

@@ -3,7 +3,7 @@ package jieba
import "testing" import "testing"
var ( var (
seg Segmenter seg *Segmenter
testContents = []string{ testContents = []string{
"这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。", "这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。",
"我不喜欢日本和服。", "我不喜欢日本和服。",
@@ -616,7 +616,11 @@ var (
) )
func init() { func init() {
seg.LoadDictionaryAt("dict.txt") var err error
seg, err = LoadDictionaryAt("dict.txt")
if err != nil {
panic(err)
}
} }
func TestCutDAG(t *testing.T) { func TestCutDAG(t *testing.T) {
@@ -715,7 +719,11 @@ func TestCutForSearch(t *testing.T) {
func TestLoadDictionary(t *testing.T) { func TestLoadDictionary(t *testing.T) {
var result []string var result []string
seg.LoadDictionaryAt("foobar.txt") var err error
seg, err = LoadDictionaryAt("foobar.txt")
if err != nil {
t.Fatal(err)
}
for index, content := range testContents { for index, content := range testContents {
result = seg.Cut(content, true) result = seg.Cut(content, true)
if len(result) != len(userDictCutResult[index]) { if len(result) != len(userDictCutResult[index]) {
@@ -728,7 +736,10 @@ func TestLoadDictionary(t *testing.T) {
} }
} }
} }
seg.LoadDictionaryAt("dict.txt") seg, err = LoadDictionaryAt("dict.txt")
if err != nil {
t.Fatal(err)
}
} }
func TestLoadUserDictionary(t *testing.T) { func TestLoadUserDictionary(t *testing.T) {
@@ -771,7 +782,11 @@ func TestLoadUserDictionary(t *testing.T) {
t.Fatal(word) t.Fatal(word)
} }
} }
seg.LoadDictionaryAt("dict.txt") var err error
seg, err = LoadDictionaryAt("dict.txt")
if err != nil {
t.Fatal(err)
}
} }
func BenchmarkCutNoHMM(b *testing.B) { func BenchmarkCutNoHMM(b *testing.B) {

View File

@@ -123,12 +123,10 @@ func (seg *Segmenter) cutDetail(sentence string) (results []Segment) {
func (seg *Segmenter) dag(runes []rune) [][]int { func (seg *Segmenter) dag(runes []rune) [][]int {
n := len(runes) n := len(runes)
dag := make([][]int, n) dag := make([][]int, n)
var frag []rune
var i int
for k := 0; k < n; k++ { for k := 0; k < n; k++ {
dag[k] = make([]int, 0, 64) dag[k] = make([]int, 0, 64)
i = k i := k
frag = runes[k : k+1] frag := runes[k : k+1]
for { for {
freq, ok := (*Dictionary)(seg).Frequency(string(frag)) freq, ok := (*Dictionary)(seg).Frequency(string(frag))
if !ok { if !ok {
@@ -160,9 +158,9 @@ func (seg *Segmenter) calc(runes []rune) []*route {
n := len(runes) n := len(runes)
rs := make([]*route, n+1) rs := make([]*route, n+1)
rs[n] = &route{frequency: 0.0, index: 0} rs[n] = &route{frequency: 0.0, index: 0}
var r *route
for idx := n - 1; idx >= 0; idx-- { for idx := n - 1; idx >= 0; idx-- {
for _, i := range dag[idx] { for _, i := range dag[idx] {
var r *route
if freq, ok := (*Dictionary)(seg).Frequency(string(runes[idx : i+1])); ok { if freq, ok := (*Dictionary)(seg).Frequency(string(runes[idx : i+1])); ok {
r = &route{frequency: math.Log(freq) - (*Dictionary)(seg).logTotal + rs[i+1].frequency, index: i} r = &route{frequency: math.Log(freq) - (*Dictionary)(seg).logTotal + rs[i+1].frequency, index: i}
} else { } else {

View File

@@ -18,7 +18,7 @@ var ideographRegexp = regexp.MustCompile(`\p{Han}+`)
// JiebaTokenizer is the bleve tokenizer for jieba. // JiebaTokenizer is the bleve tokenizer for jieba.
type JiebaTokenizer struct { type JiebaTokenizer struct {
seg jieba.Segmenter seg *jieba.Segmenter
hmm, searchMode bool hmm, searchMode bool
} }
@@ -43,8 +43,7 @@ Parameters:
this word into "交换", "换机", which are valid Chinese words. this word into "交换", "换机", which are valid Chinese words.
*/ */
func NewJiebaTokenizer(dictFile fs.File, hmm, searchMode bool) (analysis.Tokenizer, error) { func NewJiebaTokenizer(dictFile fs.File, hmm, searchMode bool) (analysis.Tokenizer, error) {
var seg jieba.Segmenter seg, err := jieba.LoadDictionary(dictFile)
err := seg.LoadDictionary(dictFile)
return &JiebaTokenizer{ return &JiebaTokenizer{
seg: seg, seg: seg,
hmm: hmm, hmm: hmm,
@@ -73,8 +72,7 @@ Parameters:
this word into "交换", "换机", which are valid Chinese words. this word into "交换", "换机", which are valid Chinese words.
*/ */
func NewJiebaTokenizerAt(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) { func NewJiebaTokenizerAt(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
var seg jieba.Segmenter seg, err := jieba.LoadDictionaryAt(dictFilePath)
err := seg.LoadDictionaryAt(dictFilePath)
return &JiebaTokenizer{ return &JiebaTokenizer{
seg: seg, seg: seg,
hmm: hmm, hmm: hmm,