1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-30 09:00:30 +08:00

code refactor, simplified trie model, also added cache for dictionary file

This commit is contained in:
Wang Bin
2014-08-13 18:21:41 +08:00
parent ad077fcb06
commit d2acf94693
2 changed files with 98 additions and 47 deletions

View File

@@ -72,14 +72,15 @@ func GetDAG(sentence string) map[int][]int {
n := len(runes) n := len(runes)
p := TT.T p := TT.T
i, j := 0, 0 i, j := 0, 0
var c rune
for { for {
if i >= n { if i >= n {
break break
} }
c := string(runes[j]) c = runes[j]
if node, ok := p[c]; ok { if _, ok := p.Nodes[c]; ok {
p = node.SubNodes p = p.Nodes[c]
if node.IsLeaf { if p.IsLeaf {
if _, inDag := dag[i]; !inDag { if _, inDag := dag[i]; !inDag {
dag[i] = []int{j} dag[i] = []int{j}
} else { } else {

View File

@@ -2,24 +2,28 @@ package jiebago
import ( import (
"bufio" "bufio"
"crypto/md5"
"encoding/gob"
"fmt"
"log"
"math" "math"
"os" "os"
"path/filepath" "path/filepath"
"strconv" "strconv"
"strings" "strings"
"unicode/utf8"
) )
type Node struct { type Trie struct {
Name string Nodes map[rune]*Trie
SubNodes Trie
IsLeaf bool IsLeaf bool
} }
type Trie map[string]*Node func NewTrie() *Trie {
return &Trie{make(map[rune]*Trie), false}
}
type TopTrie struct { type TopTrie struct {
T Trie T *Trie
MinFreq float64 MinFreq float64
Total float64 Total float64
Freq map[string]float64 Freq map[string]float64
@@ -38,7 +42,45 @@ func newTopTrie(filename string) (*TopTrie, error) {
file_path = filepath.Clean(filepath.Join(pwd, filename)) file_path = filepath.Clean(filepath.Join(pwd, filename))
} }
topTrie = &TopTrie{T: make(Trie), MinFreq: 100.0, Total: 0.0, Freq: make(map[string]float64)} fi, err := os.Stat(file_path)
if err != nil {
return nil, err
}
log.Printf("Building Trie..., from %s\n", file_path)
h := fmt.Sprintf("%x", md5.Sum([]byte(file_path)))
cache_file_name := fmt.Sprintf("jieba.%s.cache", h)
cache_path := filepath.Join(os.TempDir(), cache_file_name)
isDictCached := true
cache_fi, err := os.Stat(cache_path)
if err != nil {
isDictCached = false
}
if isDictCached {
isDictCached = cache_fi.ModTime().After(fi.ModTime())
}
var cacheFile *os.File
if isDictCached {
cacheFile, err = os.Open(cache_path)
if err != nil {
isDictCached = false
}
defer cacheFile.Close()
}
if isDictCached {
dec := gob.NewDecoder(cacheFile)
err = dec.Decode(&topTrie)
if err != nil {
isDictCached = false
} else {
log.Printf("loaded model from cache %s\n", cache_path)
}
}
if !isDictCached {
topTrie = &TopTrie{T: NewTrie(), MinFreq: 100.0, Total: 0.0, Freq: make(map[string]float64)}
file, openError := os.Open(file_path) file, openError := os.Open(file_path)
if openError != nil { if openError != nil {
return nil, openError return nil, openError
@@ -67,31 +109,39 @@ func newTopTrie(filename string) (*TopTrie, error) {
topTrie.Freq[key] = val topTrie.Freq[key] = val
} }
// dump topTrie
cacheFile, err = os.OpenFile(cache_path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
if err != nil {
return topTrie, err
}
defer cacheFile.Close()
enc := gob.NewEncoder(cacheFile)
err := enc.Encode(topTrie)
if err != nil {
return topTrie, err
} else {
log.Printf("dumped model from cache %s\n", cache_path)
}
}
return topTrie, nil return topTrie, nil
} }
func (tt *TopTrie) addWord(word string, freq float64) { func (tt *TopTrie) addWord(word string, freq float64) {
tt.Freq[word] = freq tt.Freq[word] = freq
var p Trie var p *Trie
var node *Node runes := []rune(word)
var key string count := len(runes)
count := utf8.RuneCountInString(word) for index, key := range runes {
for index, c := range []rune(word) {
if index == 0 { if index == 0 {
p = tt.T p = tt.T
} }
key = string(c) if _, ok := p.Nodes[key]; !ok {
if _, ok := p[key]; ok { p.Nodes[key] = NewTrie()
node = p[key]
} else {
node = &Node{Name: key, IsLeaf: false}
p[key] = node
node.SubNodes = make(Trie)
} }
if index == count-1 { if index == count-1 {
p[key].IsLeaf = true p.Nodes[key].IsLeaf = true
} }
p = node.SubNodes p = p.Nodes[key]
} }
} }