1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-07-02 10:00:27 +08:00

Use github.com/deckarep/golang-set instead of a Trie to reduce memory usage and improve performance. This corresponds to jieba commit 4a93f21918a26083c039970edb9457c589c3a0ab.

This commit is contained in:
Wang Bin
2015-02-03 15:20:30 +08:00
parent 9b2e9d29df
commit 9ee7ba2c13
3 changed files with 70 additions and 96 deletions

View File

@@ -9,7 +9,7 @@ import (
var ( var (
Dictionary = "dict.txt" Dictionary = "dict.txt"
TT *TopTrie trie *Trie
UserWordTagTab = make(map[string]string) UserWordTagTab = make(map[string]string)
) )
@@ -70,39 +70,29 @@ func GetDAG(sentence string) map[int][]int {
dag := make(map[int][]int) dag := make(map[int][]int)
runes := []rune(sentence) runes := []rune(sentence)
n := len(runes) n := len(runes)
p := TT.T i := 0
i, j := 0, 0 var frag string
var c rune for k := 0; k < n; k++ {
tmpList := make([]int, 0)
i = k
frag = string(runes[k])
for { for {
if !trie.Nodes.Contains(frag) {
break
}
if _, ok := trie.Freq[frag]; ok {
tmpList = append(tmpList, i)
}
i += 1
if i >= n { if i >= n {
break break
} }
c = runes[j] frag = string(runes[k : i+1])
if _, ok := p.Nodes[c]; ok {
p = p.Nodes[c]
if p.IsLeaf {
if _, inDag := dag[i]; !inDag {
dag[i] = []int{j}
} else {
dag[i] = append(dag[i], j)
} }
if len(tmpList) == 0 {
tmpList = append(tmpList, k)
} }
j += 1 dag[k] = tmpList
if j >= n {
i += 1
j = i
p = TT.T
}
} else {
p = TT.T
i += 1
j = i
}
}
for i := 0; i < n; i++ {
if _, ok := dag[i]; !ok {
dag[i] = []int{i}
}
} }
return dag return dag
} }
@@ -122,10 +112,10 @@ func Calc(sentence string, dag map[int][]int, idx int) map[int]*Route {
word = string(runes[idx : i+1]) word = string(runes[idx : i+1])
} }
var route *Route var route *Route
if _, ok := TT.Freq[word]; ok { if _, ok := trie.Freq[word]; ok {
route = &Route{TT.Freq[word] + routes[i+1].Freq, i} route = &Route{trie.Freq[word] + routes[i+1].Freq, i}
} else { } else {
route = &Route{TT.MinFreq + routes[i+1].Freq, i} route = &Route{trie.MinFreq + routes[i+1].Freq, i}
} }
candidates = append(candidates, route) candidates = append(candidates, route)
} }
@@ -161,7 +151,7 @@ func cut_DAG(sentence string) []string {
buf = make([]rune, 0) buf = make([]rune, 0)
} else { } else {
bufString := string(buf) bufString := string(buf)
if _, ok := TT.Freq[bufString]; !ok { if _, ok := trie.Freq[bufString]; !ok {
recognized := finalseg.Cut(bufString) recognized := finalseg.Cut(bufString)
for _, t := range recognized { for _, t := range recognized {
result = append(result, t) result = append(result, t)
@@ -184,7 +174,7 @@ func cut_DAG(sentence string) []string {
result = append(result, string(buf)) result = append(result, string(buf))
} else { } else {
bufString := string(buf) bufString := string(buf)
if _, ok := TT.Freq[bufString]; !ok { if _, ok := trie.Freq[bufString]; !ok {
recognized := finalseg.Cut(bufString) recognized := finalseg.Cut(bufString)
for _, t := range recognized { for _, t := range recognized {
result = append(result, t) result = append(result, t)
@@ -328,7 +318,7 @@ func CutForSearch(sentence string, hmm bool) []string {
var gram2 string var gram2 string
for i := 0; i < len(runes)-increment+1; i++ { for i := 0; i < len(runes)-increment+1; i++ {
gram2 = string(runes[i : i+increment]) gram2 = string(runes[i : i+increment])
if _, ok := TT.Freq[gram2]; ok { if _, ok := trie.Freq[gram2]; ok {
result = append(result, gram2) result = append(result, gram2)
} }
} }
@@ -340,6 +330,6 @@ func CutForSearch(sentence string, hmm bool) []string {
} }
func SetDictionary(dict_path string) (err error) { func SetDictionary(dict_path string) (err error) {
TT, err = newTopTrie(dict_path) trie, err = newTrie(dict_path)
return return
} }

View File

@@ -24,7 +24,7 @@ func Tokenize(sentence string, mode string, HMM bool) []Token {
if width > step { if width > step {
for i := 0; i < width-step+1; i++ { for i := 0; i < width-step+1; i++ {
gram := string(runes[i : i+step]) gram := string(runes[i : i+step])
if _, ok := TT.Freq[gram]; ok { if _, ok := trie.Freq[gram]; ok {
tokens = append(tokens, Token{gram, start + i, start + i + step}) tokens = append(tokens, Token{gram, start + i, start + i + step})
} }
} }

View File

@@ -5,6 +5,7 @@ import (
"crypto/md5" "crypto/md5"
"encoding/gob" "encoding/gob"
"fmt" "fmt"
mapset "github.com/deckarep/golang-set"
"log" "log"
"math" "math"
"os" "os"
@@ -14,56 +15,47 @@ import (
) )
type Trie struct { type Trie struct {
Nodes map[rune]*Trie Nodes mapset.Set
IsLeaf bool
}
func NewTrie() *Trie {
return &Trie{make(map[rune]*Trie), false}
}
type TopTrie struct {
T *Trie
MinFreq float64 MinFreq float64
Total float64 Total float64
Freq map[string]float64 Freq map[string]float64
} }
func newTopTrie(filename string) (*TopTrie, error) { func newTrie(fileName string) (*Trie, error) {
var file_path string var filePath string
var topTrie *TopTrie var trie *Trie
if filepath.IsAbs(filename) { if filepath.IsAbs(fileName) {
file_path = filename filePath = fileName
} else { } else {
pwd, err := os.Getwd() pwd, err := os.Getwd()
if err != nil { if err != nil {
return nil, err return nil, err
} }
file_path = filepath.Clean(filepath.Join(pwd, filename)) filePath = filepath.Clean(filepath.Join(pwd, fileName))
} }
fi, err := os.Stat(file_path) fi, err := os.Stat(filePath)
if err != nil { if err != nil {
return nil, err return nil, err
} }
log.Printf("Building Trie..., from %s\n", file_path) log.Printf("Building Trie..., from %s\n", filePath)
h := fmt.Sprintf("%x", md5.Sum([]byte(file_path))) h := fmt.Sprintf("%x", md5.Sum([]byte(filePath)))
cache_file_name := fmt.Sprintf("jieba.%s.cache", h) cacheFileName := fmt.Sprintf("jieba.%s.cache", h)
cache_path := filepath.Join(os.TempDir(), cache_file_name) cacheFilePath := filepath.Join(os.TempDir(), cacheFileName)
isDictCached := true isDictCached := true
cache_fi, err := os.Stat(cache_path) cacheFileInfo, err := os.Stat(cacheFilePath)
if err != nil { if err != nil {
isDictCached = false isDictCached = false
} }
if isDictCached { if isDictCached {
isDictCached = cache_fi.ModTime().After(fi.ModTime()) isDictCached = cacheFileInfo.ModTime().After(fi.ModTime())
} }
var cacheFile *os.File var cacheFile *os.File
if isDictCached { if isDictCached {
cacheFile, err = os.Open(cache_path) cacheFile, err = os.Open(cacheFilePath)
if err != nil { if err != nil {
isDictCached = false isDictCached = false
} }
@@ -71,17 +63,19 @@ func newTopTrie(filename string) (*TopTrie, error) {
} }
if isDictCached { if isDictCached {
dec := gob.NewDecoder(cacheFile) dec := gob.NewDecoder(cacheFile)
err = dec.Decode(&topTrie) err = dec.Decode(&trie)
if err != nil { if err != nil {
isDictCached = false isDictCached = false
} else { } else {
log.Printf("loaded model from cache %s\n", cache_path) log.Printf("loaded model from cache %s\n", cacheFilePath)
} }
} }
if !isDictCached { if !isDictCached {
topTrie = &TopTrie{T: NewTrie(), MinFreq: 100.0, Total: 0.0, Freq: make(map[string]float64)} trie = &Trie{Nodes: mapset.NewSet(), MinFreq: 0.0, Total: 0.0,
file, openError := os.Open(file_path) Freq: make(map[string]float64)}
file, openError := os.Open(filePath)
if openError != nil { if openError != nil {
return nil, openError return nil, openError
} }
@@ -93,55 +87,45 @@ func newTopTrie(filename string) (*TopTrie, error) {
words := strings.Split(line, " ") words := strings.Split(line, " ")
word, freqStr := words[0], words[1] word, freqStr := words[0], words[1]
freq, _ := strconv.ParseFloat(freqStr, 64) freq, _ := strconv.ParseFloat(freqStr, 64)
topTrie.Total += freq trie.addWord(word, freq)
topTrie.addWord(word, freq)
} }
if scanErr := scanner.Err(); scanErr != nil { if scanErr := scanner.Err(); scanErr != nil {
return nil, scanErr return nil, scanErr
} }
var val float64 var val float64
for key := range topTrie.Freq { for key := range trie.Freq {
val = math.Log(topTrie.Freq[key] / topTrie.Total) val = math.Log(trie.Freq[key] / trie.Total)
if val < topTrie.MinFreq { if val < trie.MinFreq {
topTrie.MinFreq = val trie.MinFreq = val
} }
topTrie.Freq[key] = val trie.Freq[key] = val
} }
// dump topTrie // dump trie
cacheFile, err = os.OpenFile(cache_path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) cacheFile, err = os.OpenFile(cacheFilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
if err != nil { if err != nil {
return topTrie, err return trie, err
} }
defer cacheFile.Close() defer cacheFile.Close()
enc := gob.NewEncoder(cacheFile) enc := gob.NewEncoder(cacheFile)
err := enc.Encode(topTrie) err := enc.Encode(trie)
if err != nil { if err != nil {
return topTrie, err return trie, err
} else { } else {
log.Printf("dumped model from cache %s\n", cache_path) log.Printf("dumped model from cache %s\n", cacheFilePath)
} }
} }
return topTrie, nil return trie, nil
} }
func (tt *TopTrie) addWord(word string, freq float64) { func (t *Trie) addWord(word string, freq float64) {
tt.Freq[word] = freq t.Freq[word] = freq
var p *Trie t.Total += freq
runes := []rune(word) runes := []rune(word)
count := len(runes) count := len(runes)
for index, key := range runes { for i := 0; i < count; i++ {
if index == 0 { t.Nodes.Add(string(runes[:i+1]))
p = tt.T
}
if _, ok := p.Nodes[key]; !ok {
p.Nodes[key] = NewTrie()
}
if index == count-1 {
p.Nodes[key].IsLeaf = true
}
p = p.Nodes[key]
} }
} }
@@ -149,11 +133,11 @@ func addWord(word string, freq float64, tag string) {
if len(tag) > 0 { if len(tag) > 0 {
UserWordTagTab[word] = strings.TrimSpace(tag) UserWordTagTab[word] = strings.TrimSpace(tag)
} }
TT.addWord(word, freq) trie.addWord(word, freq)
} }
func LoadUserDict(file_path string) error { func LoadUserDict(filePath string) error {
file, openError := os.Open(file_path) file, openError := os.Open(filePath)
if openError != nil { if openError != nil {
return openError return openError
} }