1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-11 20:50:29 +08:00

Use github.com/deckarep/golang-set instead of a Trie to reduce memory usage and improve performance. This corresponds to jieba commit 4a93f21918a26083c039970edb9457c589c3a0ab.

This commit is contained in:
Wang Bin
2015-02-03 15:20:30 +08:00
parent 9b2e9d29df
commit 9ee7ba2c13
3 changed files with 70 additions and 96 deletions

View File

@@ -5,6 +5,7 @@ import (
"crypto/md5"
"encoding/gob"
"fmt"
mapset "github.com/deckarep/golang-set"
"log"
"math"
"os"
@@ -14,56 +15,47 @@ import (
)
type Trie struct {
Nodes map[rune]*Trie
IsLeaf bool
}
func NewTrie() *Trie {
return &Trie{make(map[rune]*Trie), false}
}
type TopTrie struct {
T *Trie
Nodes mapset.Set
MinFreq float64
Total float64
Freq map[string]float64
}
func newTopTrie(filename string) (*TopTrie, error) {
var file_path string
var topTrie *TopTrie
if filepath.IsAbs(filename) {
file_path = filename
func newTrie(fileName string) (*Trie, error) {
var filePath string
var trie *Trie
if filepath.IsAbs(fileName) {
filePath = fileName
} else {
pwd, err := os.Getwd()
if err != nil {
return nil, err
}
file_path = filepath.Clean(filepath.Join(pwd, filename))
filePath = filepath.Clean(filepath.Join(pwd, fileName))
}
fi, err := os.Stat(file_path)
fi, err := os.Stat(filePath)
if err != nil {
return nil, err
}
log.Printf("Building Trie..., from %s\n", file_path)
h := fmt.Sprintf("%x", md5.Sum([]byte(file_path)))
cache_file_name := fmt.Sprintf("jieba.%s.cache", h)
cache_path := filepath.Join(os.TempDir(), cache_file_name)
log.Printf("Building Trie..., from %s\n", filePath)
h := fmt.Sprintf("%x", md5.Sum([]byte(filePath)))
cacheFileName := fmt.Sprintf("jieba.%s.cache", h)
cacheFilePath := filepath.Join(os.TempDir(), cacheFileName)
isDictCached := true
cache_fi, err := os.Stat(cache_path)
cacheFileInfo, err := os.Stat(cacheFilePath)
if err != nil {
isDictCached = false
}
if isDictCached {
isDictCached = cache_fi.ModTime().After(fi.ModTime())
isDictCached = cacheFileInfo.ModTime().After(fi.ModTime())
}
var cacheFile *os.File
if isDictCached {
cacheFile, err = os.Open(cache_path)
cacheFile, err = os.Open(cacheFilePath)
if err != nil {
isDictCached = false
}
@@ -71,17 +63,19 @@ func newTopTrie(filename string) (*TopTrie, error) {
}
if isDictCached {
dec := gob.NewDecoder(cacheFile)
err = dec.Decode(&topTrie)
err = dec.Decode(&trie)
if err != nil {
isDictCached = false
} else {
log.Printf("loaded model from cache %s\n", cache_path)
log.Printf("loaded model from cache %s\n", cacheFilePath)
}
}
if !isDictCached {
topTrie = &TopTrie{T: NewTrie(), MinFreq: 100.0, Total: 0.0, Freq: make(map[string]float64)}
file, openError := os.Open(file_path)
trie = &Trie{Nodes: mapset.NewSet(), MinFreq: 0.0, Total: 0.0,
Freq: make(map[string]float64)}
file, openError := os.Open(filePath)
if openError != nil {
return nil, openError
}
@@ -93,55 +87,45 @@ func newTopTrie(filename string) (*TopTrie, error) {
words := strings.Split(line, " ")
word, freqStr := words[0], words[1]
freq, _ := strconv.ParseFloat(freqStr, 64)
topTrie.Total += freq
topTrie.addWord(word, freq)
trie.addWord(word, freq)
}
if scanErr := scanner.Err(); scanErr != nil {
return nil, scanErr
}
var val float64
for key := range topTrie.Freq {
val = math.Log(topTrie.Freq[key] / topTrie.Total)
if val < topTrie.MinFreq {
topTrie.MinFreq = val
for key := range trie.Freq {
val = math.Log(trie.Freq[key] / trie.Total)
if val < trie.MinFreq {
trie.MinFreq = val
}
topTrie.Freq[key] = val
trie.Freq[key] = val
}
// dump topTrie
cacheFile, err = os.OpenFile(cache_path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
// dump trie
cacheFile, err = os.OpenFile(cacheFilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
if err != nil {
return topTrie, err
return trie, err
}
defer cacheFile.Close()
enc := gob.NewEncoder(cacheFile)
err := enc.Encode(topTrie)
err := enc.Encode(trie)
if err != nil {
return topTrie, err
return trie, err
} else {
log.Printf("dumped model from cache %s\n", cache_path)
log.Printf("dumped model from cache %s\n", cacheFilePath)
}
}
return topTrie, nil
return trie, nil
}
func (tt *TopTrie) addWord(word string, freq float64) {
tt.Freq[word] = freq
var p *Trie
func (t *Trie) addWord(word string, freq float64) {
t.Freq[word] = freq
t.Total += freq
runes := []rune(word)
count := len(runes)
for index, key := range runes {
if index == 0 {
p = tt.T
}
if _, ok := p.Nodes[key]; !ok {
p.Nodes[key] = NewTrie()
}
if index == count-1 {
p.Nodes[key].IsLeaf = true
}
p = p.Nodes[key]
for i := 0; i < count; i++ {
t.Nodes.Add(string(runes[:i+1]))
}
}
@@ -149,11 +133,11 @@ func addWord(word string, freq float64, tag string) {
if len(tag) > 0 {
UserWordTagTab[word] = strings.TrimSpace(tag)
}
TT.addWord(word, freq)
trie.addWord(word, freq)
}
func LoadUserDict(file_path string) error {
file, openError := os.Open(file_path)
func LoadUserDict(filePath string) error {
file, openError := os.Open(filePath)
if openError != nil {
return openError
}