mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 08:40:36 +08:00
181 lines
3.7 KiB
Go
181 lines
3.7 KiB
Go
package jiebago
|
|
|
|
import (
|
|
"bufio"
|
|
"crypto/md5"
|
|
"encoding/gob"
|
|
"fmt"
|
|
"log"
|
|
"math"
|
|
"os"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
)
|
|
|
|
// Trie is one node of a prefix tree keyed by rune.
//
// The field order is significant for any positional composite literal of
// this type, so new fields should be appended rather than inserted.
type Trie struct {
	// Nodes maps the next rune of a word to the subtree that continues it.
	Nodes map[rune]*Trie
	// IsLeaf reports whether the path from the root to this node spells a
	// complete inserted word (it is set on the node of a word's last rune).
	IsLeaf bool
}
|
|
|
|
func NewTrie() *Trie {
|
|
return &Trie{make(map[rune]*Trie), false}
|
|
}
|
|
|
|
type TopTrie struct {
|
|
T *Trie
|
|
MinFreq float64
|
|
Total float64
|
|
Freq map[string]float64
|
|
}
|
|
|
|
func newTopTrie(filename string) (*TopTrie, error) {
|
|
var file_path string
|
|
var topTrie *TopTrie
|
|
if filepath.IsAbs(filename) {
|
|
file_path = filename
|
|
} else {
|
|
pwd, err := os.Getwd()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
file_path = filepath.Clean(filepath.Join(pwd, filename))
|
|
}
|
|
|
|
fi, err := os.Stat(file_path)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
log.Printf("Building Trie..., from %s\n", file_path)
|
|
h := fmt.Sprintf("%x", md5.Sum([]byte(file_path)))
|
|
cache_file_name := fmt.Sprintf("jieba.%s.cache", h)
|
|
cache_path := filepath.Join(os.TempDir(), cache_file_name)
|
|
isDictCached := true
|
|
cache_fi, err := os.Stat(cache_path)
|
|
|
|
if err != nil {
|
|
isDictCached = false
|
|
}
|
|
|
|
if isDictCached {
|
|
isDictCached = cache_fi.ModTime().After(fi.ModTime())
|
|
}
|
|
|
|
var cacheFile *os.File
|
|
if isDictCached {
|
|
cacheFile, err = os.Open(cache_path)
|
|
if err != nil {
|
|
isDictCached = false
|
|
}
|
|
defer cacheFile.Close()
|
|
}
|
|
if isDictCached {
|
|
dec := gob.NewDecoder(cacheFile)
|
|
err = dec.Decode(&topTrie)
|
|
if err != nil {
|
|
isDictCached = false
|
|
} else {
|
|
log.Printf("loaded model from cache %s\n", cache_path)
|
|
}
|
|
}
|
|
|
|
if !isDictCached {
|
|
topTrie = &TopTrie{T: NewTrie(), MinFreq: 100.0, Total: 0.0, Freq: make(map[string]float64)}
|
|
file, openError := os.Open(file_path)
|
|
if openError != nil {
|
|
return nil, openError
|
|
}
|
|
defer file.Close()
|
|
|
|
scanner := bufio.NewScanner(file)
|
|
for scanner.Scan() {
|
|
line := scanner.Text()
|
|
words := strings.Split(line, " ")
|
|
word, freqStr := words[0], words[1]
|
|
freq, _ := strconv.ParseFloat(freqStr, 64)
|
|
topTrie.Total += freq
|
|
topTrie.addWord(word, freq)
|
|
}
|
|
if scanErr := scanner.Err(); scanErr != nil {
|
|
return nil, scanErr
|
|
}
|
|
|
|
var val float64
|
|
for key := range topTrie.Freq {
|
|
val = math.Log(topTrie.Freq[key] / topTrie.Total)
|
|
if val < topTrie.MinFreq {
|
|
topTrie.MinFreq = val
|
|
}
|
|
topTrie.Freq[key] = val
|
|
}
|
|
|
|
// dump topTrie
|
|
cacheFile, err = os.OpenFile(cache_path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
|
|
if err != nil {
|
|
return topTrie, err
|
|
}
|
|
defer cacheFile.Close()
|
|
enc := gob.NewEncoder(cacheFile)
|
|
err := enc.Encode(topTrie)
|
|
if err != nil {
|
|
return topTrie, err
|
|
} else {
|
|
log.Printf("dumped model from cache %s\n", cache_path)
|
|
}
|
|
}
|
|
return topTrie, nil
|
|
}
|
|
|
|
func (tt *TopTrie) addWord(word string, freq float64) {
|
|
tt.Freq[word] = freq
|
|
var p *Trie
|
|
runes := []rune(word)
|
|
count := len(runes)
|
|
for index, key := range runes {
|
|
if index == 0 {
|
|
p = tt.T
|
|
}
|
|
if _, ok := p.Nodes[key]; !ok {
|
|
p.Nodes[key] = NewTrie()
|
|
}
|
|
if index == count-1 {
|
|
p.Nodes[key].IsLeaf = true
|
|
}
|
|
p = p.Nodes[key]
|
|
}
|
|
}
|
|
|
|
func addWord(word string, freq float64, tag string) {
|
|
if len(tag) > 0 {
|
|
UserWordTagTab[word] = strings.TrimSpace(tag)
|
|
}
|
|
TT.addWord(word, freq)
|
|
}
|
|
|
|
func LoadUserDict(file_path string) error {
|
|
file, openError := os.Open(file_path)
|
|
if openError != nil {
|
|
return openError
|
|
}
|
|
defer file.Close()
|
|
|
|
scanner := bufio.NewScanner(file)
|
|
for scanner.Scan() {
|
|
line := scanner.Text()
|
|
words := strings.Split(line, " ")
|
|
word, freqStr := words[0], words[1]
|
|
word = strings.Replace(word, "\ufeff", "", 1)
|
|
freq, freqErr := strconv.ParseFloat(freqStr, 64)
|
|
if freqErr != nil {
|
|
continue // TODO: how to handle wrong type of frequency?
|
|
}
|
|
tag := ""
|
|
if len(words) == 3 {
|
|
tag = words[2]
|
|
}
|
|
addWord(word, freq, tag)
|
|
}
|
|
|
|
return scanner.Err()
|
|
}
|