diff --git a/dict.go b/dict.go new file mode 100644 index 0000000..a753d32 --- /dev/null +++ b/dict.go @@ -0,0 +1,53 @@ +package jiebago + +import ( + "bufio" + "os" + "path/filepath" + "strconv" + "strings" +) + +type WordTagFreq struct { + Word, Tag string + Freq float64 +} + +func DictPath(dictFileName string) (string, error) { + if filepath.IsAbs(dictFileName) { + return dictFileName, nil + } + var dictFilePath string + pwd, err := os.Getwd() + if err != nil { + return dictFilePath, err + } + dictFilePath = filepath.Clean(filepath.Join(pwd, dictFileName)) + return dictFilePath, nil +} + +func ParseDictFile(dictFile *os.File) (wtfs []*WordTagFreq, err error) { + scanner := bufio.NewScanner(dictFile) + for scanner.Scan() { + line := scanner.Text() + fields := strings.Split(line, " ") + length := len(fields) + word := fields[0] + word = strings.Replace(word, "\ufeff", "", 1) + wtf := &WordTagFreq{Word: word} + if length > 1 { + wtf.Freq, err = strconv.ParseFloat(fields[1], 64) + if err != nil { + return nil, err + } + } + if length > 2 { + wtf.Tag = fields[2] + } + wtfs = append(wtfs, wtf) + } + if err = scanner.Err(); err != nil { + return nil, err + } + return wtfs, nil +} diff --git a/trie.go b/trie.go index 9354727..389cd97 100644 --- a/trie.go +++ b/trie.go @@ -1,7 +1,6 @@ package jiebago import ( - "bufio" "bytes" "crypto/md5" "encoding/gob" @@ -9,7 +8,6 @@ import ( "log" "os" "path/filepath" - "strconv" "strings" ) @@ -48,36 +46,30 @@ func (t *Trie) UnmarshalBinary(data []byte) error { return nil } -func newTrie(fileName string) (*Trie, error) { - var filePath string - var trie *Trie - if filepath.IsAbs(fileName) { - filePath = fileName - } else { - pwd, err := os.Getwd() - if err != nil { - return nil, err - } - filePath = filepath.Clean(filepath.Join(pwd, fileName)) - } - - fi, err := os.Stat(filePath) +func newTrie(dictFileName string) (*Trie, error) { + dictFilePath, err := DictPath(dictFileName) if err != nil { return nil, err } - log.Printf("Building Trie..., from %s\n", filePath) - h := fmt.Sprintf("%x", md5.Sum([]byte(filePath))) + + dictFileInfo, err := os.Stat(dictFilePath) + if err != nil { + return nil, err + } + + log.Printf("Building Trie..., from %s\n", dictFilePath) + h := fmt.Sprintf("%x", md5.Sum([]byte(dictFilePath))) cacheFileName := fmt.Sprintf("jieba.%s.cache", h) cacheFilePath := filepath.Join(os.TempDir(), cacheFileName) isDictCached := true - cacheFileInfo, err := os.Stat(cacheFilePath) + cacheFileInfo, err := os.Stat(cacheFilePath) if err != nil { isDictCached = false } if isDictCached { - isDictCached = cacheFileInfo.ModTime().After(fi.ModTime()) + isDictCached = cacheFileInfo.ModTime().After(dictFileInfo.ModTime()) } var cacheFile *os.File @@ -88,6 +80,9 @@ func newTrie(fileName string) (*Trie, error) { } defer cacheFile.Close() } + + var trie *Trie + if isDictCached { dec := gob.NewDecoder(cacheFile) err = dec.Decode(&trie) @@ -101,27 +96,20 @@ func newTrie(fileName string) (*Trie, error) { if !isDictCached { trie = &Trie{Total: 0.0, Freq: make(map[string]float64)} - file, openError := os.Open(filePath) - if openError != nil { - return nil, openError + dictFile, err := os.Open(dictFilePath) + if err != nil { + return nil, err } - defer file.Close() + defer dictFile.Close() - scanner := bufio.NewScanner(file) - for scanner.Scan() { - line := scanner.Text() - words := strings.Split(line, " ") - word, freqStr := words[0], words[1] - freq, err := strconv.ParseFloat(freqStr, 64) - if err != nil { - return nil, err - } - trie.addWord(word, freq) - } - if scanErr := scanner.Err(); scanErr != nil { - return nil, scanErr + wtfs, err := ParseDictFile(dictFile) + if err != nil { + return nil, err } + for _, wtf := range wtfs { + trie.addWord(wtf) + } // dump trie cacheFile, err = os.OpenFile(cacheFilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) if err != nil { @@ -129,7 +117,7 @@ func newTrie(fileName string) (*Trie, error) { } defer cacheFile.Close() enc := gob.NewEncoder(cacheFile) - err := enc.Encode(trie) + err = enc.Encode(trie) if err != nil { return trie, err } else { @@ -139,10 +127,10 @@ func newTrie(fileName string) (*Trie, error) { return trie, nil } -func (t *Trie) addWord(word string, freq float64) { - t.Freq[word] = freq - t.Total += freq - runes := []rune(word) +func (t *Trie) addWord(wtf *WordTagFreq) { + t.Freq[wtf.Word] = wtf.Freq + t.Total += wtf.Freq + runes := []rune(wtf.Word) count := len(runes) for i := 0; i < count; i++ { wfrag := string(runes[0 : i+1]) @@ -151,38 +139,28 @@ func (t *Trie) addWord(word string, freq float64) { } } } -func addWord(word string, freq float64, tag string) { - if len(tag) > 0 { - UserWordTagTab[word] = strings.TrimSpace(tag) +func addWord(wtf *WordTagFreq) { + if len(wtf.Tag) > 0 { + UserWordTagTab[wtf.Word] = strings.TrimSpace(wtf.Tag) } - T.addWord(word, freq) + T.addWord(wtf) } -func LoadUserDict(filePath string) error { - file, openError := os.Open(filePath) - if openError != nil { - return openError +func LoadUserDict(dictFilePath string) error { + dictFile, err := os.Open(dictFilePath) + if err != nil { + return err } - defer file.Close() + defer dictFile.Close() - scanner := bufio.NewScanner(file) - for scanner.Scan() { - line := scanner.Text() - words := strings.Split(line, " ") - word, freqStr := words[0], words[1] - word = strings.Replace(word, "\ufeff", "", 1) - freq, freqErr := strconv.ParseFloat(freqStr, 64) - if freqErr != nil { - continue // TODO: how to handle wrong type of frequency? - } - tag := "" - if len(words) == 3 { - tag = words[2] - } - addWord(word, freq, tag) + wtfs, err := ParseDictFile(dictFile) + if err != nil { + return err } - - return scanner.Err() + for _, wtf := range wtfs { + addWord(wtf) + } + return nil } func SetDictionary(dict_path string) (err error) {