mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-11 20:50:29 +08:00
Use github.com/deckarep/golang-set instead of a Trie to reduce memory usage and improve performance. This corresponds to jieba commit 4a93f21918a26083c039970edb9457c589c3a0ab.
This commit is contained in:
102
trie_node.go
102
trie_node.go
@@ -5,6 +5,7 @@ import (
|
||||
"crypto/md5"
|
||||
"encoding/gob"
|
||||
"fmt"
|
||||
mapset "github.com/deckarep/golang-set"
|
||||
"log"
|
||||
"math"
|
||||
"os"
|
||||
@@ -14,56 +15,47 @@ import (
|
||||
)
|
||||
|
||||
type Trie struct {
|
||||
Nodes map[rune]*Trie
|
||||
IsLeaf bool
|
||||
}
|
||||
|
||||
func NewTrie() *Trie {
|
||||
return &Trie{make(map[rune]*Trie), false}
|
||||
}
|
||||
|
||||
type TopTrie struct {
|
||||
T *Trie
|
||||
Nodes mapset.Set
|
||||
MinFreq float64
|
||||
Total float64
|
||||
Freq map[string]float64
|
||||
}
|
||||
|
||||
func newTopTrie(filename string) (*TopTrie, error) {
|
||||
var file_path string
|
||||
var topTrie *TopTrie
|
||||
if filepath.IsAbs(filename) {
|
||||
file_path = filename
|
||||
func newTrie(fileName string) (*Trie, error) {
|
||||
var filePath string
|
||||
var trie *Trie
|
||||
if filepath.IsAbs(fileName) {
|
||||
filePath = fileName
|
||||
} else {
|
||||
pwd, err := os.Getwd()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
file_path = filepath.Clean(filepath.Join(pwd, filename))
|
||||
filePath = filepath.Clean(filepath.Join(pwd, fileName))
|
||||
}
|
||||
|
||||
fi, err := os.Stat(file_path)
|
||||
fi, err := os.Stat(filePath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
log.Printf("Building Trie..., from %s\n", file_path)
|
||||
h := fmt.Sprintf("%x", md5.Sum([]byte(file_path)))
|
||||
cache_file_name := fmt.Sprintf("jieba.%s.cache", h)
|
||||
cache_path := filepath.Join(os.TempDir(), cache_file_name)
|
||||
log.Printf("Building Trie..., from %s\n", filePath)
|
||||
h := fmt.Sprintf("%x", md5.Sum([]byte(filePath)))
|
||||
cacheFileName := fmt.Sprintf("jieba.%s.cache", h)
|
||||
cacheFilePath := filepath.Join(os.TempDir(), cacheFileName)
|
||||
isDictCached := true
|
||||
cache_fi, err := os.Stat(cache_path)
|
||||
cacheFileInfo, err := os.Stat(cacheFilePath)
|
||||
|
||||
if err != nil {
|
||||
isDictCached = false
|
||||
}
|
||||
|
||||
if isDictCached {
|
||||
isDictCached = cache_fi.ModTime().After(fi.ModTime())
|
||||
isDictCached = cacheFileInfo.ModTime().After(fi.ModTime())
|
||||
}
|
||||
|
||||
var cacheFile *os.File
|
||||
if isDictCached {
|
||||
cacheFile, err = os.Open(cache_path)
|
||||
cacheFile, err = os.Open(cacheFilePath)
|
||||
if err != nil {
|
||||
isDictCached = false
|
||||
}
|
||||
@@ -71,17 +63,19 @@ func newTopTrie(filename string) (*TopTrie, error) {
|
||||
}
|
||||
if isDictCached {
|
||||
dec := gob.NewDecoder(cacheFile)
|
||||
err = dec.Decode(&topTrie)
|
||||
err = dec.Decode(&trie)
|
||||
if err != nil {
|
||||
isDictCached = false
|
||||
} else {
|
||||
log.Printf("loaded model from cache %s\n", cache_path)
|
||||
log.Printf("loaded model from cache %s\n", cacheFilePath)
|
||||
}
|
||||
}
|
||||
|
||||
if !isDictCached {
|
||||
topTrie = &TopTrie{T: NewTrie(), MinFreq: 100.0, Total: 0.0, Freq: make(map[string]float64)}
|
||||
file, openError := os.Open(file_path)
|
||||
trie = &Trie{Nodes: mapset.NewSet(), MinFreq: 0.0, Total: 0.0,
|
||||
Freq: make(map[string]float64)}
|
||||
|
||||
file, openError := os.Open(filePath)
|
||||
if openError != nil {
|
||||
return nil, openError
|
||||
}
|
||||
@@ -93,55 +87,45 @@ func newTopTrie(filename string) (*TopTrie, error) {
|
||||
words := strings.Split(line, " ")
|
||||
word, freqStr := words[0], words[1]
|
||||
freq, _ := strconv.ParseFloat(freqStr, 64)
|
||||
topTrie.Total += freq
|
||||
topTrie.addWord(word, freq)
|
||||
trie.addWord(word, freq)
|
||||
}
|
||||
if scanErr := scanner.Err(); scanErr != nil {
|
||||
return nil, scanErr
|
||||
}
|
||||
|
||||
var val float64
|
||||
for key := range topTrie.Freq {
|
||||
val = math.Log(topTrie.Freq[key] / topTrie.Total)
|
||||
if val < topTrie.MinFreq {
|
||||
topTrie.MinFreq = val
|
||||
for key := range trie.Freq {
|
||||
val = math.Log(trie.Freq[key] / trie.Total)
|
||||
if val < trie.MinFreq {
|
||||
trie.MinFreq = val
|
||||
}
|
||||
topTrie.Freq[key] = val
|
||||
trie.Freq[key] = val
|
||||
}
|
||||
|
||||
// dump topTrie
|
||||
cacheFile, err = os.OpenFile(cache_path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
|
||||
// dump trie
|
||||
cacheFile, err = os.OpenFile(cacheFilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
|
||||
if err != nil {
|
||||
return topTrie, err
|
||||
return trie, err
|
||||
}
|
||||
defer cacheFile.Close()
|
||||
enc := gob.NewEncoder(cacheFile)
|
||||
err := enc.Encode(topTrie)
|
||||
err := enc.Encode(trie)
|
||||
if err != nil {
|
||||
return topTrie, err
|
||||
return trie, err
|
||||
} else {
|
||||
log.Printf("dumped model from cache %s\n", cache_path)
|
||||
log.Printf("dumped model from cache %s\n", cacheFilePath)
|
||||
}
|
||||
}
|
||||
return topTrie, nil
|
||||
return trie, nil
|
||||
}
|
||||
|
||||
func (tt *TopTrie) addWord(word string, freq float64) {
|
||||
tt.Freq[word] = freq
|
||||
var p *Trie
|
||||
func (t *Trie) addWord(word string, freq float64) {
|
||||
t.Freq[word] = freq
|
||||
t.Total += freq
|
||||
runes := []rune(word)
|
||||
count := len(runes)
|
||||
for index, key := range runes {
|
||||
if index == 0 {
|
||||
p = tt.T
|
||||
}
|
||||
if _, ok := p.Nodes[key]; !ok {
|
||||
p.Nodes[key] = NewTrie()
|
||||
}
|
||||
if index == count-1 {
|
||||
p.Nodes[key].IsLeaf = true
|
||||
}
|
||||
p = p.Nodes[key]
|
||||
for i := 0; i < count; i++ {
|
||||
t.Nodes.Add(string(runes[:i+1]))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -149,11 +133,11 @@ func addWord(word string, freq float64, tag string) {
|
||||
if len(tag) > 0 {
|
||||
UserWordTagTab[word] = strings.TrimSpace(tag)
|
||||
}
|
||||
TT.addWord(word, freq)
|
||||
trie.addWord(word, freq)
|
||||
}
|
||||
|
||||
func LoadUserDict(file_path string) error {
|
||||
file, openError := os.Open(file_path)
|
||||
func LoadUserDict(filePath string) error {
|
||||
file, openError := os.Open(filePath)
|
||||
if openError != nil {
|
||||
return openError
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user