1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-08 10:30:23 +08:00

finished generalization of dictionary load

This commit is contained in:
Wang Bin
2015-03-28 10:51:00 +08:00
parent e155fe5467
commit 45c7854fac
2 changed files with 14 additions and 89 deletions

76
trie.go
View File

@@ -1,86 +1,10 @@
package jiebago
import (
"crypto/md5"
"encoding/gob"
"fmt"
"log"
"os"
"path/filepath"
)
// Jieba holds the dictionary state that load populates (via LoadDict or a
// gob cache file) and that load gob-encodes back to the cache.
type Jieba struct {
// NOTE(review): presumably the running sum of all frequencies added — confirm against Add.
Total float64
// Table keyed by string with float64 values.
// NOTE(review): presumably word -> frequency, since AddEntry passes entry.Word and entry.Freq to Add — confirm.
Freq map[string]float64
}
// load populates j from the dictionary named dictFileName, going through a
// gob cache file in the OS temp directory when a usable one exists.
//
// The dictionary path is resolved with DictPath. The cache file name is
// derived from the MD5 of that resolved path. The cache is used only when its
// modification time is after the dictionary's; if it is missing, stale,
// cannot be opened, or fails to decode, the dictionary is parsed with
// LoadDict and the cache file is rewritten.
//
// It returns an error when the dictionary cannot be resolved, stat'ed or
// parsed, or when the cache file cannot be created, encoded or closed.
func (j *Jieba) load(dictFileName string) error {
	dictFilePath, err := DictPath(dictFileName)
	if err != nil {
		return err
	}
	dictFileInfo, err := os.Stat(dictFilePath)
	if err != nil {
		return err
	}

	log.Printf("Building Trie..., from %s\n", dictFilePath)
	h := fmt.Sprintf("%x", md5.Sum([]byte(dictFilePath)))
	cacheFilePath := filepath.Join(os.TempDir(), fmt.Sprintf("jieba.%s.cache", h))

	// Fast path: a cache newer than the dictionary. Any failure here is
	// non-fatal and simply falls through to a rebuild from the dictionary.
	if cacheFileInfo, err := os.Stat(cacheFilePath); err == nil &&
		cacheFileInfo.ModTime().After(dictFileInfo.ModTime()) {
		if cacheFile, err := os.Open(cacheFilePath); err == nil {
			err = gob.NewDecoder(cacheFile).Decode(&j)
			// Read-only handle: a Close error cannot lose data, so it is ignored.
			cacheFile.Close()
			if err == nil {
				log.Printf("loaded model from cache %s\n", cacheFilePath)
				return nil
			}
		}
	}

	// Slow path: parse the dictionary, then persist the result for next time.
	if err := LoadDict(j, dictFilePath, false); err != nil {
		return err
	}

	// dump trie
	cacheFile, err := os.OpenFile(cacheFilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
	if err != nil {
		return err
	}
	if err := gob.NewEncoder(cacheFile).Encode(j); err != nil {
		cacheFile.Close()
		return err
	}
	// Close is where a buffered write failure surfaces; report it instead of
	// claiming the cache was written.
	if err := cacheFile.Close(); err != nil {
		return err
	}
	log.Printf("dumped model from cache %s\n", cacheFilePath)
	return nil
}
// AddEntry forwards the Word and Freq fields of entry to j.Add.
// NOTE(review): presumably this is what lets *Jieba be passed to LoadDict as a
// DictLoader — confirm against the interface definition.
func (j *Jieba) AddEntry(entry *Entry) {
j.Add(entry.Word, entry.Freq)
}

27
util.go
View File

@@ -5,6 +5,7 @@ import (
"crypto/md5"
"encoding/gob"
"fmt"
"log"
"os"
"path/filepath"
"regexp"
@@ -26,6 +27,8 @@ func DictPath(dictFileName string) (string, error) {
}
func LoadDict(l DictLoader, dictFilePath string, usingFlag bool) error {
log.Printf("Building Trie..., from %s\n", dictFilePath)
dictFile, err := os.Open(dictFilePath)
if err != nil {
return err
@@ -59,22 +62,14 @@ func cachePath(dictPath string) string {
fmt.Sprintf("jieba.%x.cache", md5.Sum([]byte(dictPath))))
}
// fileInfo stats filePath and returns its os.FileInfo.
//
// When missingOk is true, a "file does not exist" error is suppressed and
// (nil, nil) is returned, so the caller must nil-check the FileInfo before
// using it. Every other error, and a missing file when missingOk is false,
// is returned unchanged.
func fileInfo(filePath string, missingOk bool) (os.FileInfo, error) {
	fi, err := os.Stat(filePath)
	// os.Stat reports a missing file as a *os.PathError wrapping the
	// underlying error, so comparing with == os.ErrNotExist never matches;
	// os.IsNotExist unwraps it.
	if missingOk && os.IsNotExist(err) {
		return nil, nil
	}
	return fi, err
}
// cached reports whether the cache file at cachePath can be used in place of
// the dictionary at dictPath.
//
// It returns (false, err) when the dictionary itself cannot be stat'ed.
// A cache file that cannot be stat'ed (for example because it has not been
// written yet) is not an error: it yields (false, nil). Otherwise the result
// is true only when the cache's modification time is after the dictionary's.
func cached(dictPath, cachePath string) (bool, error) {
	dictFileInfo, err := os.Stat(dictPath)
	if err != nil {
		return false, err
	}
	cacheFileInfo, err := os.Stat(cachePath)
	if err != nil {
		// No usable cache; the caller falls back to the dictionary.
		return false, nil
	}
	return cacheFileInfo.ModTime().After(dictFileInfo.ModTime()), nil
}
@@ -87,7 +82,7 @@ func load(l DictLoader, cachePath string) error {
defer cacheFile.Close()
dec := gob.NewDecoder(cacheFile)
return dec.Decode(&l)
return dec.Decode(l)
}
func dump(l DictLoader, cachePath string) error {
@@ -114,6 +109,7 @@ func SetDict(l DictLoader, dictName string, pos bool) error {
if cached {
err = load(l, cachePath)
if err == nil {
log.Printf("loaded model from cache %s\n", cachePath)
return nil
}
cached = false
@@ -124,7 +120,12 @@ func SetDict(l DictLoader, dictName string, pos bool) error {
return err
}
return dump(l, cachePath)
err = dump(l, cachePath)
if err == nil {
log.Printf("dumped model from cache %s\n", cachePath)
return nil
}
return err
}
// Split sentence using regular expression.