1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-12 21:20:26 +08:00

small refactor, put parse dictionary function in separate file

This commit is contained in:
Wang Bin
2015-02-26 14:22:29 +08:00
parent 95a27da5cf
commit 60b2c9f763
2 changed files with 99 additions and 68 deletions

114
trie.go
View File

@@ -1,7 +1,6 @@
package jiebago
import (
"bufio"
"bytes"
"crypto/md5"
"encoding/gob"
@@ -9,7 +8,6 @@ import (
"log"
"os"
"path/filepath"
"strconv"
"strings"
)
@@ -48,36 +46,30 @@ func (t *Trie) UnmarshalBinary(data []byte) error {
return nil
}
func newTrie(fileName string) (*Trie, error) {
var filePath string
var trie *Trie
if filepath.IsAbs(fileName) {
filePath = fileName
} else {
pwd, err := os.Getwd()
if err != nil {
return nil, err
}
filePath = filepath.Clean(filepath.Join(pwd, fileName))
}
fi, err := os.Stat(filePath)
func newTrie(dictFileName string) (*Trie, error) {
dictFilePath, err := DictPath(dictFileName)
if err != nil {
return nil, err
}
log.Printf("Building Trie..., from %s\n", filePath)
h := fmt.Sprintf("%x", md5.Sum([]byte(filePath)))
dictFileInfo, err := os.Stat(dictFilePath)
if err != nil {
return nil, err
}
log.Printf("Building Trie..., from %s\n", dictFilePath)
h := fmt.Sprintf("%x", md5.Sum([]byte(dictFilePath)))
cacheFileName := fmt.Sprintf("jieba.%s.cache", h)
cacheFilePath := filepath.Join(os.TempDir(), cacheFileName)
isDictCached := true
cacheFileInfo, err := os.Stat(cacheFilePath)
cacheFileInfo, err := os.Stat(cacheFilePath)
if err != nil {
isDictCached = false
}
if isDictCached {
isDictCached = cacheFileInfo.ModTime().After(fi.ModTime())
isDictCached = cacheFileInfo.ModTime().After(dictFileInfo.ModTime())
}
var cacheFile *os.File
@@ -88,6 +80,9 @@ func newTrie(fileName string) (*Trie, error) {
}
defer cacheFile.Close()
}
var trie *Trie
if isDictCached {
dec := gob.NewDecoder(cacheFile)
err = dec.Decode(&trie)
@@ -101,27 +96,20 @@ func newTrie(fileName string) (*Trie, error) {
if !isDictCached {
trie = &Trie{Total: 0.0, Freq: make(map[string]float64)}
file, openError := os.Open(filePath)
if openError != nil {
return nil, openError
dictFile, err := os.Open(dictFilePath)
if err != nil {
return nil, err
}
defer file.Close()
defer dictFile.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
words := strings.Split(line, " ")
word, freqStr := words[0], words[1]
freq, err := strconv.ParseFloat(freqStr, 64)
if err != nil {
return nil, err
}
trie.addWord(word, freq)
}
if scanErr := scanner.Err(); scanErr != nil {
return nil, scanErr
wtfs, err := ParseDictFile(dictFile)
if err != nil {
return nil, err
}
for _, wtf := range wtfs {
trie.addWord(wtf)
}
// dump trie
cacheFile, err = os.OpenFile(cacheFilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
if err != nil {
@@ -129,7 +117,7 @@ func newTrie(fileName string) (*Trie, error) {
}
defer cacheFile.Close()
enc := gob.NewEncoder(cacheFile)
err := enc.Encode(trie)
err = enc.Encode(trie)
if err != nil {
return trie, err
} else {
@@ -139,10 +127,10 @@ func newTrie(fileName string) (*Trie, error) {
return trie, nil
}
func (t *Trie) addWord(word string, freq float64) {
t.Freq[word] = freq
t.Total += freq
runes := []rune(word)
func (t *Trie) addWord(wtf *WordTagFreq) {
t.Freq[wtf.Word] = wtf.Freq
t.Total += wtf.Freq
runes := []rune(wtf.Word)
count := len(runes)
for i := 0; i < count; i++ {
wfrag := string(runes[0 : i+1])
@@ -151,38 +139,28 @@ func (t *Trie) addWord(word string, freq float64) {
}
}
}
func addWord(word string, freq float64, tag string) {
if len(tag) > 0 {
UserWordTagTab[word] = strings.TrimSpace(tag)
func addWord(wtf *WordTagFreq) {
if len(wtf.Tag) > 0 {
UserWordTagTab[wtf.Word] = strings.TrimSpace(wtf.Tag)
}
T.addWord(word, freq)
T.addWord(wtf)
}
func LoadUserDict(filePath string) error {
file, openError := os.Open(filePath)
if openError != nil {
return openError
func LoadUserDict(dictFilePath string) error {
dictFile, err := os.Open(dictFilePath)
if err != nil {
return err
}
defer file.Close()
defer dictFile.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
words := strings.Split(line, " ")
word, freqStr := words[0], words[1]
word = strings.Replace(word, "\ufeff", "", 1)
freq, freqErr := strconv.ParseFloat(freqStr, 64)
if freqErr != nil {
continue // TODO: how to handle wrong type of frequency?
}
tag := ""
if len(words) == 3 {
tag = words[2]
}
addWord(word, freq, tag)
wtfs, err := ParseDictFile(dictFile)
if err != nil {
return err
}
return scanner.Err()
for _, wtf := range wtfs {
addWord(wtf)
}
return nil
}
func SetDictionary(dict_path string) (err error) {