mirror of
https://github.com/fumiama/jieba.git
synced 2026-07-02 10:00:27 +08:00
Use github.com/deckarep/golang-set instead of a Trie to reduce memory usage and improve performance. This corresponds to jieba commit 4a93f21918a26083c039970edb9457c589c3a0ab.
This commit is contained in:
60
jieba.go
60
jieba.go
@@ -9,7 +9,7 @@ import (
|
|||||||
|
|
||||||
var (
|
var (
|
||||||
Dictionary = "dict.txt"
|
Dictionary = "dict.txt"
|
||||||
TT *TopTrie
|
trie *Trie
|
||||||
UserWordTagTab = make(map[string]string)
|
UserWordTagTab = make(map[string]string)
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -70,39 +70,29 @@ func GetDAG(sentence string) map[int][]int {
|
|||||||
dag := make(map[int][]int)
|
dag := make(map[int][]int)
|
||||||
runes := []rune(sentence)
|
runes := []rune(sentence)
|
||||||
n := len(runes)
|
n := len(runes)
|
||||||
p := TT.T
|
i := 0
|
||||||
i, j := 0, 0
|
var frag string
|
||||||
var c rune
|
for k := 0; k < n; k++ {
|
||||||
|
tmpList := make([]int, 0)
|
||||||
|
i = k
|
||||||
|
frag = string(runes[k])
|
||||||
for {
|
for {
|
||||||
|
if !trie.Nodes.Contains(frag) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if _, ok := trie.Freq[frag]; ok {
|
||||||
|
tmpList = append(tmpList, i)
|
||||||
|
}
|
||||||
|
i += 1
|
||||||
if i >= n {
|
if i >= n {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
c = runes[j]
|
frag = string(runes[k : i+1])
|
||||||
if _, ok := p.Nodes[c]; ok {
|
|
||||||
p = p.Nodes[c]
|
|
||||||
if p.IsLeaf {
|
|
||||||
if _, inDag := dag[i]; !inDag {
|
|
||||||
dag[i] = []int{j}
|
|
||||||
} else {
|
|
||||||
dag[i] = append(dag[i], j)
|
|
||||||
}
|
}
|
||||||
|
if len(tmpList) == 0 {
|
||||||
|
tmpList = append(tmpList, k)
|
||||||
}
|
}
|
||||||
j += 1
|
dag[k] = tmpList
|
||||||
if j >= n {
|
|
||||||
i += 1
|
|
||||||
j = i
|
|
||||||
p = TT.T
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
p = TT.T
|
|
||||||
i += 1
|
|
||||||
j = i
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for i := 0; i < n; i++ {
|
|
||||||
if _, ok := dag[i]; !ok {
|
|
||||||
dag[i] = []int{i}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return dag
|
return dag
|
||||||
}
|
}
|
||||||
@@ -122,10 +112,10 @@ func Calc(sentence string, dag map[int][]int, idx int) map[int]*Route {
|
|||||||
word = string(runes[idx : i+1])
|
word = string(runes[idx : i+1])
|
||||||
}
|
}
|
||||||
var route *Route
|
var route *Route
|
||||||
if _, ok := TT.Freq[word]; ok {
|
if _, ok := trie.Freq[word]; ok {
|
||||||
route = &Route{TT.Freq[word] + routes[i+1].Freq, i}
|
route = &Route{trie.Freq[word] + routes[i+1].Freq, i}
|
||||||
} else {
|
} else {
|
||||||
route = &Route{TT.MinFreq + routes[i+1].Freq, i}
|
route = &Route{trie.MinFreq + routes[i+1].Freq, i}
|
||||||
}
|
}
|
||||||
candidates = append(candidates, route)
|
candidates = append(candidates, route)
|
||||||
}
|
}
|
||||||
@@ -161,7 +151,7 @@ func cut_DAG(sentence string) []string {
|
|||||||
buf = make([]rune, 0)
|
buf = make([]rune, 0)
|
||||||
} else {
|
} else {
|
||||||
bufString := string(buf)
|
bufString := string(buf)
|
||||||
if _, ok := TT.Freq[bufString]; !ok {
|
if _, ok := trie.Freq[bufString]; !ok {
|
||||||
recognized := finalseg.Cut(bufString)
|
recognized := finalseg.Cut(bufString)
|
||||||
for _, t := range recognized {
|
for _, t := range recognized {
|
||||||
result = append(result, t)
|
result = append(result, t)
|
||||||
@@ -184,7 +174,7 @@ func cut_DAG(sentence string) []string {
|
|||||||
result = append(result, string(buf))
|
result = append(result, string(buf))
|
||||||
} else {
|
} else {
|
||||||
bufString := string(buf)
|
bufString := string(buf)
|
||||||
if _, ok := TT.Freq[bufString]; !ok {
|
if _, ok := trie.Freq[bufString]; !ok {
|
||||||
recognized := finalseg.Cut(bufString)
|
recognized := finalseg.Cut(bufString)
|
||||||
for _, t := range recognized {
|
for _, t := range recognized {
|
||||||
result = append(result, t)
|
result = append(result, t)
|
||||||
@@ -328,7 +318,7 @@ func CutForSearch(sentence string, hmm bool) []string {
|
|||||||
var gram2 string
|
var gram2 string
|
||||||
for i := 0; i < len(runes)-increment+1; i++ {
|
for i := 0; i < len(runes)-increment+1; i++ {
|
||||||
gram2 = string(runes[i : i+increment])
|
gram2 = string(runes[i : i+increment])
|
||||||
if _, ok := TT.Freq[gram2]; ok {
|
if _, ok := trie.Freq[gram2]; ok {
|
||||||
result = append(result, gram2)
|
result = append(result, gram2)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -340,6 +330,6 @@ func CutForSearch(sentence string, hmm bool) []string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func SetDictionary(dict_path string) (err error) {
|
func SetDictionary(dict_path string) (err error) {
|
||||||
TT, err = newTopTrie(dict_path)
|
trie, err = newTrie(dict_path)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ func Tokenize(sentence string, mode string, HMM bool) []Token {
|
|||||||
if width > step {
|
if width > step {
|
||||||
for i := 0; i < width-step+1; i++ {
|
for i := 0; i < width-step+1; i++ {
|
||||||
gram := string(runes[i : i+step])
|
gram := string(runes[i : i+step])
|
||||||
if _, ok := TT.Freq[gram]; ok {
|
if _, ok := trie.Freq[gram]; ok {
|
||||||
tokens = append(tokens, Token{gram, start + i, start + i + step})
|
tokens = append(tokens, Token{gram, start + i, start + i + step})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
102
trie_node.go
102
trie_node.go
@@ -5,6 +5,7 @@ import (
|
|||||||
"crypto/md5"
|
"crypto/md5"
|
||||||
"encoding/gob"
|
"encoding/gob"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
mapset "github.com/deckarep/golang-set"
|
||||||
"log"
|
"log"
|
||||||
"math"
|
"math"
|
||||||
"os"
|
"os"
|
||||||
@@ -14,56 +15,47 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type Trie struct {
|
type Trie struct {
|
||||||
Nodes map[rune]*Trie
|
Nodes mapset.Set
|
||||||
IsLeaf bool
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewTrie() *Trie {
|
|
||||||
return &Trie{make(map[rune]*Trie), false}
|
|
||||||
}
|
|
||||||
|
|
||||||
type TopTrie struct {
|
|
||||||
T *Trie
|
|
||||||
MinFreq float64
|
MinFreq float64
|
||||||
Total float64
|
Total float64
|
||||||
Freq map[string]float64
|
Freq map[string]float64
|
||||||
}
|
}
|
||||||
|
|
||||||
func newTopTrie(filename string) (*TopTrie, error) {
|
func newTrie(fileName string) (*Trie, error) {
|
||||||
var file_path string
|
var filePath string
|
||||||
var topTrie *TopTrie
|
var trie *Trie
|
||||||
if filepath.IsAbs(filename) {
|
if filepath.IsAbs(fileName) {
|
||||||
file_path = filename
|
filePath = fileName
|
||||||
} else {
|
} else {
|
||||||
pwd, err := os.Getwd()
|
pwd, err := os.Getwd()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
file_path = filepath.Clean(filepath.Join(pwd, filename))
|
filePath = filepath.Clean(filepath.Join(pwd, fileName))
|
||||||
}
|
}
|
||||||
|
|
||||||
fi, err := os.Stat(file_path)
|
fi, err := os.Stat(filePath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
log.Printf("Building Trie..., from %s\n", file_path)
|
log.Printf("Building Trie..., from %s\n", filePath)
|
||||||
h := fmt.Sprintf("%x", md5.Sum([]byte(file_path)))
|
h := fmt.Sprintf("%x", md5.Sum([]byte(filePath)))
|
||||||
cache_file_name := fmt.Sprintf("jieba.%s.cache", h)
|
cacheFileName := fmt.Sprintf("jieba.%s.cache", h)
|
||||||
cache_path := filepath.Join(os.TempDir(), cache_file_name)
|
cacheFilePath := filepath.Join(os.TempDir(), cacheFileName)
|
||||||
isDictCached := true
|
isDictCached := true
|
||||||
cache_fi, err := os.Stat(cache_path)
|
cacheFileInfo, err := os.Stat(cacheFilePath)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
isDictCached = false
|
isDictCached = false
|
||||||
}
|
}
|
||||||
|
|
||||||
if isDictCached {
|
if isDictCached {
|
||||||
isDictCached = cache_fi.ModTime().After(fi.ModTime())
|
isDictCached = cacheFileInfo.ModTime().After(fi.ModTime())
|
||||||
}
|
}
|
||||||
|
|
||||||
var cacheFile *os.File
|
var cacheFile *os.File
|
||||||
if isDictCached {
|
if isDictCached {
|
||||||
cacheFile, err = os.Open(cache_path)
|
cacheFile, err = os.Open(cacheFilePath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
isDictCached = false
|
isDictCached = false
|
||||||
}
|
}
|
||||||
@@ -71,17 +63,19 @@ func newTopTrie(filename string) (*TopTrie, error) {
|
|||||||
}
|
}
|
||||||
if isDictCached {
|
if isDictCached {
|
||||||
dec := gob.NewDecoder(cacheFile)
|
dec := gob.NewDecoder(cacheFile)
|
||||||
err = dec.Decode(&topTrie)
|
err = dec.Decode(&trie)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
isDictCached = false
|
isDictCached = false
|
||||||
} else {
|
} else {
|
||||||
log.Printf("loaded model from cache %s\n", cache_path)
|
log.Printf("loaded model from cache %s\n", cacheFilePath)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if !isDictCached {
|
if !isDictCached {
|
||||||
topTrie = &TopTrie{T: NewTrie(), MinFreq: 100.0, Total: 0.0, Freq: make(map[string]float64)}
|
trie = &Trie{Nodes: mapset.NewSet(), MinFreq: 0.0, Total: 0.0,
|
||||||
file, openError := os.Open(file_path)
|
Freq: make(map[string]float64)}
|
||||||
|
|
||||||
|
file, openError := os.Open(filePath)
|
||||||
if openError != nil {
|
if openError != nil {
|
||||||
return nil, openError
|
return nil, openError
|
||||||
}
|
}
|
||||||
@@ -93,55 +87,45 @@ func newTopTrie(filename string) (*TopTrie, error) {
|
|||||||
words := strings.Split(line, " ")
|
words := strings.Split(line, " ")
|
||||||
word, freqStr := words[0], words[1]
|
word, freqStr := words[0], words[1]
|
||||||
freq, _ := strconv.ParseFloat(freqStr, 64)
|
freq, _ := strconv.ParseFloat(freqStr, 64)
|
||||||
topTrie.Total += freq
|
trie.addWord(word, freq)
|
||||||
topTrie.addWord(word, freq)
|
|
||||||
}
|
}
|
||||||
if scanErr := scanner.Err(); scanErr != nil {
|
if scanErr := scanner.Err(); scanErr != nil {
|
||||||
return nil, scanErr
|
return nil, scanErr
|
||||||
}
|
}
|
||||||
|
|
||||||
var val float64
|
var val float64
|
||||||
for key := range topTrie.Freq {
|
for key := range trie.Freq {
|
||||||
val = math.Log(topTrie.Freq[key] / topTrie.Total)
|
val = math.Log(trie.Freq[key] / trie.Total)
|
||||||
if val < topTrie.MinFreq {
|
if val < trie.MinFreq {
|
||||||
topTrie.MinFreq = val
|
trie.MinFreq = val
|
||||||
}
|
}
|
||||||
topTrie.Freq[key] = val
|
trie.Freq[key] = val
|
||||||
}
|
}
|
||||||
|
|
||||||
// dump topTrie
|
// dump trie
|
||||||
cacheFile, err = os.OpenFile(cache_path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
|
cacheFile, err = os.OpenFile(cacheFilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return topTrie, err
|
return trie, err
|
||||||
}
|
}
|
||||||
defer cacheFile.Close()
|
defer cacheFile.Close()
|
||||||
enc := gob.NewEncoder(cacheFile)
|
enc := gob.NewEncoder(cacheFile)
|
||||||
err := enc.Encode(topTrie)
|
err := enc.Encode(trie)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return topTrie, err
|
return trie, err
|
||||||
} else {
|
} else {
|
||||||
log.Printf("dumped model from cache %s\n", cache_path)
|
log.Printf("dumped model from cache %s\n", cacheFilePath)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return topTrie, nil
|
return trie, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (tt *TopTrie) addWord(word string, freq float64) {
|
func (t *Trie) addWord(word string, freq float64) {
|
||||||
tt.Freq[word] = freq
|
t.Freq[word] = freq
|
||||||
var p *Trie
|
t.Total += freq
|
||||||
runes := []rune(word)
|
runes := []rune(word)
|
||||||
count := len(runes)
|
count := len(runes)
|
||||||
for index, key := range runes {
|
for i := 0; i < count; i++ {
|
||||||
if index == 0 {
|
t.Nodes.Add(string(runes[:i+1]))
|
||||||
p = tt.T
|
|
||||||
}
|
|
||||||
if _, ok := p.Nodes[key]; !ok {
|
|
||||||
p.Nodes[key] = NewTrie()
|
|
||||||
}
|
|
||||||
if index == count-1 {
|
|
||||||
p.Nodes[key].IsLeaf = true
|
|
||||||
}
|
|
||||||
p = p.Nodes[key]
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -149,11 +133,11 @@ func addWord(word string, freq float64, tag string) {
|
|||||||
if len(tag) > 0 {
|
if len(tag) > 0 {
|
||||||
UserWordTagTab[word] = strings.TrimSpace(tag)
|
UserWordTagTab[word] = strings.TrimSpace(tag)
|
||||||
}
|
}
|
||||||
TT.addWord(word, freq)
|
trie.addWord(word, freq)
|
||||||
}
|
}
|
||||||
|
|
||||||
func LoadUserDict(file_path string) error {
|
func LoadUserDict(filePath string) error {
|
||||||
file, openError := os.Open(file_path)
|
file, openError := os.Open(filePath)
|
||||||
if openError != nil {
|
if openError != nil {
|
||||||
return openError
|
return openError
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user