mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 08:40:36 +08:00
- issue details from https://github.com/fxsjy/jieba/issues/132 - updated tests - also some code refactors
130 lines
2.5 KiB
Go
130 lines
2.5 KiB
Go
package jiebago
|
|
|
|
import (
|
|
"bufio"
|
|
"math"
|
|
"os"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
type Node struct {
|
|
Name string
|
|
SubNodes Trie
|
|
IsLeaf bool
|
|
}
|
|
|
|
// Trie is one level of the word tree: it maps a key to the Node stored
// under that key. TopTrie.addWord uses one character per key.
type Trie map[string]*Node
|
|
|
|
type TopTrie struct {
|
|
T Trie
|
|
MinFreq float64
|
|
Total float64
|
|
Freq map[string]float64
|
|
}
|
|
|
|
func newTopTrie(filename string) (*TopTrie, error) {
|
|
var file_path string
|
|
var topTrie *TopTrie
|
|
if filepath.IsAbs(filename) {
|
|
file_path = filename
|
|
} else {
|
|
pwd, err := os.Getwd()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
file_path = filepath.Clean(filepath.Join(pwd, filename))
|
|
}
|
|
|
|
topTrie = &TopTrie{T: make(Trie), MinFreq: 100.0, Total: 0.0, Freq: make(map[string]float64)}
|
|
file, openError := os.Open(file_path)
|
|
if openError != nil {
|
|
return nil, openError
|
|
}
|
|
defer file.Close()
|
|
|
|
reader := bufio.NewReader(file)
|
|
for {
|
|
line, readError := reader.ReadString('\n')
|
|
if readError != nil && len(line) == 0 {
|
|
break
|
|
}
|
|
words := strings.Split(line, " ")
|
|
word, freqStr := words[0], words[1]
|
|
freq, _ := strconv.ParseFloat(freqStr, 64)
|
|
topTrie.Total += freq
|
|
topTrie.addWord(word, freq)
|
|
}
|
|
var val float64
|
|
for key := range topTrie.Freq {
|
|
val = math.Log(topTrie.Freq[key] / topTrie.Total)
|
|
if val < topTrie.MinFreq {
|
|
topTrie.MinFreq = val
|
|
}
|
|
topTrie.Freq[key] = val
|
|
}
|
|
|
|
return topTrie, nil
|
|
}
|
|
|
|
func (tt *TopTrie) addWord(word string, freq float64) {
|
|
tt.Freq[word] = freq
|
|
var p Trie
|
|
var node *Node
|
|
var key string
|
|
count := utf8.RuneCountInString(word)
|
|
for index, c := range []rune(word) {
|
|
if index == 0 {
|
|
p = tt.T
|
|
}
|
|
key = string(c)
|
|
if _, ok := p[key]; ok {
|
|
node = p[key]
|
|
} else {
|
|
node = &Node{Name: key, IsLeaf: false}
|
|
p[key] = node
|
|
node.SubNodes = make(Trie)
|
|
}
|
|
if index == count-1 {
|
|
p[key].IsLeaf = true
|
|
}
|
|
p = node.SubNodes
|
|
}
|
|
}
|
|
|
|
func addWord(word string, freq float64, tag string) {
|
|
if len(tag) > 0 {
|
|
UserWordTagTab[word] = strings.TrimSpace(tag)
|
|
}
|
|
TT.addWord(word, freq)
|
|
}
|
|
|
|
func LoadUserDict(file_path string) error {
|
|
file, openError := os.Open(file_path)
|
|
if openError != nil {
|
|
return openError
|
|
}
|
|
defer file.Close()
|
|
|
|
scanner := bufio.NewScanner(file)
|
|
for scanner.Scan() {
|
|
line := scanner.Text()
|
|
words := strings.Split(line, " ")
|
|
word, freqStr := words[0], words[1]
|
|
word = strings.Replace(word, "\ufeff", "", 1)
|
|
freq, freqErr := strconv.ParseFloat(freqStr, 64)
|
|
if freqErr != nil {
|
|
continue // TODO: how to handle wrong type of frequency?
|
|
}
|
|
tag := ""
|
|
if len(words) == 3 {
|
|
tag = words[2]
|
|
}
|
|
addWord(word, freq, tag)
|
|
}
|
|
|
|
return scanner.Err()
|
|
}
|