1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00

code refactor, simplified trie model, also added cache for dictionary file

This commit is contained in:
Wang Bin
2014-08-13 18:21:41 +08:00
parent ad077fcb06
commit d2acf94693
2 changed files with 98 additions and 47 deletions

View File

@@ -72,14 +72,15 @@ func GetDAG(sentence string) map[int][]int {
n := len(runes)
p := TT.T
i, j := 0, 0
var c rune
for {
if i >= n {
break
}
c := string(runes[j])
if node, ok := p[c]; ok {
p = node.SubNodes
if node.IsLeaf {
c = runes[j]
if _, ok := p.Nodes[c]; ok {
p = p.Nodes[c]
if p.IsLeaf {
if _, inDag := dag[i]; !inDag {
dag[i] = []int{j}
} else {

View File

@@ -2,24 +2,28 @@ package jiebago
import (
"bufio"
"crypto/md5"
"encoding/gob"
"fmt"
"log"
"math"
"os"
"path/filepath"
"strconv"
"strings"
"unicode/utf8"
)
type Node struct {
Name string
SubNodes Trie
IsLeaf bool
type Trie struct {
Nodes map[rune]*Trie
IsLeaf bool
}
type Trie map[string]*Node
func NewTrie() *Trie {
return &Trie{make(map[rune]*Trie), false}
}
type TopTrie struct {
T Trie
T *Trie
MinFreq float64
Total float64
Freq map[string]float64
@@ -38,60 +42,106 @@ func newTopTrie(filename string) (*TopTrie, error) {
file_path = filepath.Clean(filepath.Join(pwd, filename))
}
topTrie = &TopTrie{T: make(Trie), MinFreq: 100.0, Total: 0.0, Freq: make(map[string]float64)}
file, openError := os.Open(file_path)
if openError != nil {
return nil, openError
fi, err := os.Stat(file_path)
if err != nil {
return nil, err
}
defer file.Close()
log.Printf("Building Trie..., from %s\n", file_path)
h := fmt.Sprintf("%x", md5.Sum([]byte(file_path)))
cache_file_name := fmt.Sprintf("jieba.%s.cache", h)
cache_path := filepath.Join(os.TempDir(), cache_file_name)
isDictCached := true
cache_fi, err := os.Stat(cache_path)
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
words := strings.Split(line, " ")
word, freqStr := words[0], words[1]
freq, _ := strconv.ParseFloat(freqStr, 64)
topTrie.Total += freq
topTrie.addWord(word, freq)
}
if scanErr := scanner.Err(); scanErr != nil {
return nil, scanErr
if err != nil {
isDictCached = false
}
var val float64
for key := range topTrie.Freq {
val = math.Log(topTrie.Freq[key] / topTrie.Total)
if val < topTrie.MinFreq {
topTrie.MinFreq = val
if isDictCached {
isDictCached = cache_fi.ModTime().After(fi.ModTime())
}
var cacheFile *os.File
if isDictCached {
cacheFile, err = os.Open(cache_path)
if err != nil {
isDictCached = false
}
defer cacheFile.Close()
}
if isDictCached {
dec := gob.NewDecoder(cacheFile)
err = dec.Decode(&topTrie)
if err != nil {
isDictCached = false
} else {
log.Printf("loaded model from cache %s\n", cache_path)
}
topTrie.Freq[key] = val
}
if !isDictCached {
topTrie = &TopTrie{T: NewTrie(), MinFreq: 100.0, Total: 0.0, Freq: make(map[string]float64)}
file, openError := os.Open(file_path)
if openError != nil {
return nil, openError
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
words := strings.Split(line, " ")
word, freqStr := words[0], words[1]
freq, _ := strconv.ParseFloat(freqStr, 64)
topTrie.Total += freq
topTrie.addWord(word, freq)
}
if scanErr := scanner.Err(); scanErr != nil {
return nil, scanErr
}
var val float64
for key := range topTrie.Freq {
val = math.Log(topTrie.Freq[key] / topTrie.Total)
if val < topTrie.MinFreq {
topTrie.MinFreq = val
}
topTrie.Freq[key] = val
}
// dump topTrie
cacheFile, err = os.OpenFile(cache_path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
if err != nil {
return topTrie, err
}
defer cacheFile.Close()
enc := gob.NewEncoder(cacheFile)
err := enc.Encode(topTrie)
if err != nil {
return topTrie, err
} else {
log.Printf("dumped model from cache %s\n", cache_path)
}
}
return topTrie, nil
}
func (tt *TopTrie) addWord(word string, freq float64) {
tt.Freq[word] = freq
var p Trie
var node *Node
var key string
count := utf8.RuneCountInString(word)
for index, c := range []rune(word) {
var p *Trie
runes := []rune(word)
count := len(runes)
for index, key := range runes {
if index == 0 {
p = tt.T
}
key = string(c)
if _, ok := p[key]; ok {
node = p[key]
} else {
node = &Node{Name: key, IsLeaf: false}
p[key] = node
node.SubNodes = make(Trie)
if _, ok := p.Nodes[key]; !ok {
p.Nodes[key] = NewTrie()
}
if index == count-1 {
p[key].IsLeaf = true
p.Nodes[key].IsLeaf = true
}
p = node.SubNodes
p = p.Nodes[key]
}
}