1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-13 05:31:02 +08:00

small refactor, put parse dictionary function in separate file

This commit is contained in:
Wang Bin
2015-02-26 14:22:29 +08:00
parent 95a27da5cf
commit 60b2c9f763
2 changed files with 99 additions and 68 deletions

53
dict.go Normal file
View File

@@ -0,0 +1,53 @@
package jiebago
import (
"bufio"
"os"
"path/filepath"
"strconv"
"strings"
)
// WordTagFreq is one dictionary entry: a word together with its tag and
// its frequency.
type WordTagFreq struct {
	// Word is the entry's text.
	Word string
	// Tag is the label attached to the word; it is empty when none was given.
	Tag string
	// Freq is the word's frequency; it is zero when none was given.
	Freq float64
}
// DictPath turns dictFileName into an absolute path.
//
// An already-absolute name is returned exactly as given (it is not cleaned).
// A relative name is joined onto the current working directory and cleaned.
// If the working directory cannot be determined, the empty string and the
// underlying error are returned.
func DictPath(dictFileName string) (string, error) {
	if !filepath.IsAbs(dictFileName) {
		cwd, err := os.Getwd()
		if err != nil {
			return "", err
		}
		return filepath.Clean(filepath.Join(cwd, dictFileName)), nil
	}
	return dictFileName, nil
}
// ParseDictFile reads dictionary entries from dictFile, one entry per line.
//
// A line has the form "word [freq [tag]]". Columns are separated by any run
// of spaces or tabs, so doubled separators, tab-delimited files and trailing
// whitespace are all accepted. Lines that contain no word (blank lines, or a
// line holding only a byte-order mark) are skipped rather than producing an
// entry with an empty Word. The first byte-order mark found in the word
// column is removed.
//
// When freq is missing the entry's Freq is zero; when tag is missing the
// entry's Tag is empty. Columns after the third are ignored.
//
// It returns the entries in file order. If a freq column is not a valid
// float, or reading from dictFile fails, it returns a nil slice and the error.
func ParseDictFile(dictFile *os.File) (wtfs []*WordTagFreq, err error) {
	// Only ASCII space and tab count as separators, so that words containing
	// other Unicode whitespace are not split apart.
	isSeparator := func(r rune) bool { return r == ' ' || r == '\t' }

	scanner := bufio.NewScanner(dictFile)
	for scanner.Scan() {
		fields := strings.FieldsFunc(scanner.Text(), isSeparator)
		if len(fields) == 0 {
			continue // blank or whitespace-only line
		}

		word := strings.Replace(fields[0], "\ufeff", "", 1)
		if word == "" {
			continue // the line held nothing but a byte-order mark
		}

		wtf := &WordTagFreq{Word: word}
		if len(fields) > 1 {
			wtf.Freq, err = strconv.ParseFloat(fields[1], 64)
			if err != nil {
				return nil, err
			}
		}
		if len(fields) > 2 {
			wtf.Tag = fields[2]
		}
		wtfs = append(wtfs, wtf)
	}
	if err = scanner.Err(); err != nil {
		return nil, err
	}
	return wtfs, nil
}

114
trie.go
View File

@@ -1,7 +1,6 @@
package jiebago package jiebago
import ( import (
"bufio"
"bytes" "bytes"
"crypto/md5" "crypto/md5"
"encoding/gob" "encoding/gob"
@@ -9,7 +8,6 @@ import (
"log" "log"
"os" "os"
"path/filepath" "path/filepath"
"strconv"
"strings" "strings"
) )
@@ -48,36 +46,30 @@ func (t *Trie) UnmarshalBinary(data []byte) error {
return nil return nil
} }
func newTrie(fileName string) (*Trie, error) { func newTrie(dictFileName string) (*Trie, error) {
var filePath string dictFilePath, err := DictPath(dictFileName)
var trie *Trie
if filepath.IsAbs(fileName) {
filePath = fileName
} else {
pwd, err := os.Getwd()
if err != nil {
return nil, err
}
filePath = filepath.Clean(filepath.Join(pwd, fileName))
}
fi, err := os.Stat(filePath)
if err != nil { if err != nil {
return nil, err return nil, err
} }
log.Printf("Building Trie..., from %s\n", filePath)
h := fmt.Sprintf("%x", md5.Sum([]byte(filePath))) dictFileInfo, err := os.Stat(dictFilePath)
if err != nil {
return nil, err
}
log.Printf("Building Trie..., from %s\n", dictFilePath)
h := fmt.Sprintf("%x", md5.Sum([]byte(dictFilePath)))
cacheFileName := fmt.Sprintf("jieba.%s.cache", h) cacheFileName := fmt.Sprintf("jieba.%s.cache", h)
cacheFilePath := filepath.Join(os.TempDir(), cacheFileName) cacheFilePath := filepath.Join(os.TempDir(), cacheFileName)
isDictCached := true isDictCached := true
cacheFileInfo, err := os.Stat(cacheFilePath)
cacheFileInfo, err := os.Stat(cacheFilePath)
if err != nil { if err != nil {
isDictCached = false isDictCached = false
} }
if isDictCached { if isDictCached {
isDictCached = cacheFileInfo.ModTime().After(fi.ModTime()) isDictCached = cacheFileInfo.ModTime().After(dictFileInfo.ModTime())
} }
var cacheFile *os.File var cacheFile *os.File
@@ -88,6 +80,9 @@ func newTrie(fileName string) (*Trie, error) {
} }
defer cacheFile.Close() defer cacheFile.Close()
} }
var trie *Trie
if isDictCached { if isDictCached {
dec := gob.NewDecoder(cacheFile) dec := gob.NewDecoder(cacheFile)
err = dec.Decode(&trie) err = dec.Decode(&trie)
@@ -101,27 +96,20 @@ func newTrie(fileName string) (*Trie, error) {
if !isDictCached { if !isDictCached {
trie = &Trie{Total: 0.0, Freq: make(map[string]float64)} trie = &Trie{Total: 0.0, Freq: make(map[string]float64)}
file, openError := os.Open(filePath) dictFile, err := os.Open(dictFilePath)
if openError != nil { if err != nil {
return nil, openError return nil, err
} }
defer file.Close() defer dictFile.Close()
scanner := bufio.NewScanner(file) wtfs, err := ParseDictFile(dictFile)
for scanner.Scan() { if err != nil {
line := scanner.Text() return nil, err
words := strings.Split(line, " ")
word, freqStr := words[0], words[1]
freq, err := strconv.ParseFloat(freqStr, 64)
if err != nil {
return nil, err
}
trie.addWord(word, freq)
}
if scanErr := scanner.Err(); scanErr != nil {
return nil, scanErr
} }
for _, wtf := range wtfs {
trie.addWord(wtf)
}
// dump trie // dump trie
cacheFile, err = os.OpenFile(cacheFilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) cacheFile, err = os.OpenFile(cacheFilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
if err != nil { if err != nil {
@@ -129,7 +117,7 @@ func newTrie(fileName string) (*Trie, error) {
} }
defer cacheFile.Close() defer cacheFile.Close()
enc := gob.NewEncoder(cacheFile) enc := gob.NewEncoder(cacheFile)
err := enc.Encode(trie) err = enc.Encode(trie)
if err != nil { if err != nil {
return trie, err return trie, err
} else { } else {
@@ -139,10 +127,10 @@ func newTrie(fileName string) (*Trie, error) {
return trie, nil return trie, nil
} }
func (t *Trie) addWord(word string, freq float64) { func (t *Trie) addWord(wtf *WordTagFreq) {
t.Freq[word] = freq t.Freq[wtf.Word] = wtf.Freq
t.Total += freq t.Total += wtf.Freq
runes := []rune(word) runes := []rune(wtf.Word)
count := len(runes) count := len(runes)
for i := 0; i < count; i++ { for i := 0; i < count; i++ {
wfrag := string(runes[0 : i+1]) wfrag := string(runes[0 : i+1])
@@ -151,38 +139,28 @@ func (t *Trie) addWord(word string, freq float64) {
} }
} }
} }
func addWord(word string, freq float64, tag string) { func addWord(wtf *WordTagFreq) {
if len(tag) > 0 { if len(wtf.Tag) > 0 {
UserWordTagTab[word] = strings.TrimSpace(tag) UserWordTagTab[wtf.Word] = strings.TrimSpace(wtf.Tag)
} }
T.addWord(word, freq) T.addWord(wtf)
} }
func LoadUserDict(filePath string) error { func LoadUserDict(dictFilePath string) error {
file, openError := os.Open(filePath) dictFile, err := os.Open(dictFilePath)
if openError != nil { if err != nil {
return openError return err
} }
defer file.Close() defer dictFile.Close()
scanner := bufio.NewScanner(file) wtfs, err := ParseDictFile(dictFile)
for scanner.Scan() { if err != nil {
line := scanner.Text() return err
words := strings.Split(line, " ")
word, freqStr := words[0], words[1]
word = strings.Replace(word, "\ufeff", "", 1)
freq, freqErr := strconv.ParseFloat(freqStr, 64)
if freqErr != nil {
continue // TODO: how to handle wrong type of frequency?
}
tag := ""
if len(words) == 3 {
tag = words[2]
}
addWord(word, freq, tag)
} }
for _, wtf := range wtfs {
return scanner.Err() addWord(wtf)
}
return nil
} }
func SetDictionary(dict_path string) (err error) { func SetDictionary(dict_path string) (err error) {