// Origin: git patch 0124ebadce1394f3736de8ed331fc9c51d127dea by Wang Bin
// (Wed, 29 Apr 2015 18:51:38 +0800), which moves the dictionary into its own
// module by creating dictionary/dictionary.go and dictionary/token.go.
// Both files declare "package dictionary"; they are presented here as a
// single, properly formatted unit.

import (
	"bufio"
	"math"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
)

// ---- dictionary/dictionary.go ----

// Dictionary stores token frequencies together with their running total.
// Whenever a token is stored, every rune-prefix of its text is registered as
// a key too, with frequency 0 if that prefix was not already present.
//
// The zero value is ready to use. A Dictionary embeds a sync.RWMutex, so it
// must not be copied after first use; share it through a pointer.
type Dictionary struct {
	total, logTotal float64
	freqMap         map[string]float64
	sync.RWMutex
}

// addToken stores token and registers each rune-prefix of its text.
// The caller must hold the write lock.
//
// The token's frequency is always added to the running total, even when the
// same text was stored before; this keeps the original accounting.
func (d *Dictionary) addToken(token Token) {
	if d.freqMap == nil {
		// Writing to a nil map panics; allocate lazily so the zero value of
		// Dictionary is usable.
		d.freqMap = make(map[string]float64)
	}
	d.freqMap[token.text] = token.frequency
	d.total += token.frequency

	runes := []rune(token.text)
	for i := range runes {
		prefix := string(runes[:i+1])
		if _, ok := d.freqMap[prefix]; !ok {
			d.freqMap[prefix] = 0
		}
	}
}

// updateLogTotal recomputes the cached natural logarithm of the total.
// The caller must hold the write lock. While the total is 0 the cached value
// is -Inf, which is what math.Log returns for 0.
func (d *Dictionary) updateLogTotal() {
	d.logTotal = math.Log(d.total)
}

// AddToken stores a single token and refreshes the cached log total.
func (d *Dictionary) AddToken(token Token) {
	d.Lock()
	defer d.Unlock()
	d.addToken(token)
	d.updateLogTotal()
}

// Total returns the sum of the frequencies of all tokens added so far.
func (d *Dictionary) Total() float64 {
	d.RLock()
	defer d.RUnlock()
	return d.total
}

// LogTotal returns the natural logarithm of Total, as of the last update.
func (d *Dictionary) LogTotal() float64 {
	d.RLock()
	defer d.RUnlock()
	return d.logTotal
}

// Frequency returns the frequency stored under key and whether key is known.
// A key that is only a prefix of some token is known with frequency 0.
func (d *Dictionary) Frequency(key string) (float64, bool) {
	d.RLock()
	defer d.RUnlock()
	freq, ok := d.freqMap[key]
	return freq, ok
}

// LoadDictionary replaces the dictionary's contents with the tokens read from
// fileName. A relative fileName is resolved against the working directory.
func (d *Dictionary) LoadDictionary(fileName string) error {
	return d.loadDictionary(fileName, false)
}

// LoadUserDictionary adds the tokens read from fileName to the current
// contents. A relative fileName is resolved against the working directory.
func (d *Dictionary) LoadUserDictionary(fileName string) error {
	return d.loadDictionary(fileName, true)
}

// loadDictionary reads one token per line from fileName. When
// isUserDictionary is false the existing entries and total are discarded
// first; otherwise the new tokens are merged into the existing ones.
//
// It returns the error from resolving or opening the file, the first
// frequency parse error, or the scanner's read error. Tokens read before a
// failure stay in the dictionary, and the cached log total is refreshed on
// every return path so it always agrees with the total.
func (d *Dictionary) loadDictionary(fileName string, isUserDictionary bool) error {
	filePath, err := dictPath(fileName)
	if err != nil {
		return err
	}
	dictFile, err := os.Open(filePath)
	if err != nil {
		return err
	}
	defer dictFile.Close()

	d.Lock()
	defer d.Unlock()
	// Deferred last, so it runs first: the log total is recomputed while the
	// write lock is still held, on success and on error alike.
	defer d.updateLogTotal()

	if !isUserDictionary {
		// A main dictionary replaces everything, including the running
		// total; otherwise each reload would inflate it.
		d.freqMap = make(map[string]float64)
		d.total = 0
	}

	scanner := bufio.NewScanner(dictFile)
	for scanner.Scan() {
		token, ok, err := parseTokenLine(scanner.Text())
		if err != nil {
			return err
		}
		if ok {
			d.addToken(token)
		}
	}
	return scanner.Err()
}

// parseTokenLine parses one dictionary line of the form
//
//	text [frequency [pos]]
//
// with fields separated by any run of whitespace. A byte order mark in the
// first field is removed. The boolean result is false for lines that carry
// no token (blank, or nothing left after removing the BOM). A missing
// frequency is 0 and a missing part of speech is empty. An unparsable
// frequency yields the strconv error.
func parseTokenLine(line string) (Token, bool, error) {
	fields := strings.Fields(line)
	if len(fields) == 0 {
		return Token{}, false, nil
	}
	text := strings.Replace(fields[0], "\ufeff", "", 1)
	if text == "" {
		return Token{}, false, nil
	}

	// A fresh Token per line: nothing leaks over from the previous line.
	token := Token{text: text}
	if len(fields) > 1 {
		freq, err := strconv.ParseFloat(fields[1], 64)
		if err != nil {
			return Token{}, false, err
		}
		token.frequency = freq
	}
	if len(fields) > 2 {
		token.pos = fields[2]
	}
	return token, true, nil
}

// dictPath returns dictFileName unchanged when it is absolute; otherwise it
// returns the cleaned path obtained by joining it onto the current working
// directory.
func dictPath(dictFileName string) (string, error) {
	if filepath.IsAbs(dictFileName) {
		return dictFileName, nil
	}
	cwd, err := os.Getwd()
	if err != nil {
		return "", err
	}
	// filepath.Join cleans its result, so no separate Clean call is needed.
	return filepath.Join(cwd, dictFileName), nil
}

// ---- dictionary/token.go ----

// Token is a dictionary entry: a text, its frequency and a part-of-speech
// tag.
type Token struct {
	text      string
	frequency float64
	pos       string
}

// NewToken returns a Token with the given text, frequency and part of
// speech. It is the only way for other packages to build a non-empty Token,
// because the fields are unexported.
func NewToken(text string, frequency float64, pos string) Token {
	return Token{text: text, frequency: frequency, pos: pos}
}

// Text returns the token's text.
func (t Token) Text() string {
	return t.text
}

// Frequency returns the token's frequency.
func (t Token) Frequency() float64 {
	return t.frequency
}

// Pos returns the token's part-of-speech tag.
func (t Token) Pos() string {
	return t.pos
}