mirror of
https://github.com/fumiama/jieba.git
synced 2026-07-02 01:50:29 +08:00
removed cache, added tests for LoadUserDict
This commit is contained in:
@@ -745,3 +745,45 @@ func TestSetdictionary(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestLoadUserDict(t *testing.T) {
|
||||||
|
SetDictionary("dict.txt")
|
||||||
|
LoadUserDict("userdict.txt")
|
||||||
|
|
||||||
|
sentence := "李小福是创新办主任也是云计算方面的专家;例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
||||||
|
result := []string{"\u674e\u5c0f\u798f", "\u662f", "\u521b\u65b0\u529e", "\u4e3b\u4efb", "\u4e5f", "\u662f", "\u4e91\u8ba1\u7b97", "\u65b9\u9762", "\u7684", "\u4e13\u5bb6", ";", "\u4f8b\u5982", "\u6211", "\u8f93\u5165", "\u4e00\u4e2a", "\u5e26", "\u201c", "\u97e9\u7389\u8d4f\u9274", "\u201d", "\u7684", "\u6807\u9898", "\uff0c", "\u5728", "\u81ea\u5b9a\u4e49\u8bcd", "\u5e93\u4e2d", "\u4e5f", "\u589e\u52a0", "\u4e86", "\u6b64", "\u8bcd\u4e3a", "N", "\u7c7b\u578b"}
|
||||||
|
words := Cut(sentence, false, true)
|
||||||
|
if len(words) != len(result) {
|
||||||
|
t.Error(len(words))
|
||||||
|
}
|
||||||
|
for index, word := range words {
|
||||||
|
if word != result[index] {
|
||||||
|
t.Error(word)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sentence = "easy_install is great"
|
||||||
|
result = []string{"easy_install", " ", "is", " ", "great"}
|
||||||
|
words = Cut(sentence, false, true)
|
||||||
|
if len(words) != len(result) {
|
||||||
|
t.Error(len(words))
|
||||||
|
}
|
||||||
|
for index, word := range words {
|
||||||
|
if word != result[index] {
|
||||||
|
t.Error(word)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sentence = "python 的正则表达式是好用的"
|
||||||
|
result = []string{"python", " ", "\u7684", "\u6b63\u5219\u8868\u8fbe\u5f0f", "\u662f", "\u597d\u7528", "\u7684"}
|
||||||
|
words = Cut(sentence, false, true)
|
||||||
|
if len(words) != len(result) {
|
||||||
|
t.Error(words)
|
||||||
|
t.Error(result)
|
||||||
|
}
|
||||||
|
for index, word := range words {
|
||||||
|
if word != result[index] {
|
||||||
|
t.Error(word)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
52
trie_node.go
52
trie_node.go
@@ -2,24 +2,14 @@ package jiebago
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"crypto/sha1"
|
|
||||||
"encoding/gob"
|
|
||||||
"fmt"
|
|
||||||
"math"
|
"math"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"runtime"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Names of the gob cache files that newTopTrie keeps in the OS temp directory.
const (
|
|
||||||
// CACHE_NAME is the cache file name used when the dictionary being loaded is
// the Dictionary file located next to the calling source file.
CACHE_NAME = "jieba.gob"
|
|
||||||
// USER_CACHE_PREFIX precedes the SHA-1 hash in the cache file name built for
// any other dictionary.
USER_CACHE_PREFIX = "jieba.user."
|
|
||||||
// USER_CACHE_SUFFIX follows the hash in that cache file name.
USER_CACHE_SUFFIX = ".gob"
|
|
||||||
)
|
|
||||||
|
|
||||||
type Node struct {
|
type Node struct {
|
||||||
Name string
|
Name string
|
||||||
SubNodes Trie
|
SubNodes Trie
|
||||||
@@ -35,16 +25,6 @@ type TopTrie struct {
|
|||||||
Freq map[string]float64
|
Freq map[string]float64
|
||||||
}
|
}
|
||||||
|
|
||||||
// hash returns the SHA-1 digest of s encoded as a lower-case hexadecimal
// string.
func hash(s string) string {
	digest := sha1.Sum([]byte(s))
	return fmt.Sprintf("%x", digest[:])
}
|
|
||||||
|
|
||||||
func getUserCacheName(prefix string, path string, suffix string) string {
|
|
||||||
return fmt.Sprintf("%s%s%s", prefix, hash(path), suffix)
|
|
||||||
}
|
|
||||||
|
|
||||||
func newTopTrie(filename string) (*TopTrie, error) {
|
func newTopTrie(filename string) (*TopTrie, error) {
|
||||||
var file_path string
|
var file_path string
|
||||||
var topTrie *TopTrie
|
var topTrie *TopTrie
|
||||||
@@ -58,32 +38,6 @@ func newTopTrie(filename string) (*TopTrie, error) {
|
|||||||
file_path = filepath.Clean(filepath.Join(pwd, filename))
|
file_path = filepath.Clean(filepath.Join(pwd, filename))
|
||||||
}
|
}
|
||||||
|
|
||||||
_, curFileName, _, _ := runtime.Caller(1)
|
|
||||||
_curpath := filepath.Dir(curFileName)
|
|
||||||
abs_path := filepath.Join(_curpath, Dictionary)
|
|
||||||
var cache_file string
|
|
||||||
if file_path == abs_path {
|
|
||||||
cache_file = filepath.Join(os.TempDir(), CACHE_NAME)
|
|
||||||
} else {
|
|
||||||
cache_file = filepath.Join(os.TempDir(),
|
|
||||||
getUserCacheName(USER_CACHE_PREFIX, abs_path, USER_CACHE_SUFFIX))
|
|
||||||
}
|
|
||||||
|
|
||||||
cacheFileStat, cacheErr := os.Stat(cache_file)
|
|
||||||
dictFileStat, _ := os.Stat(abs_path)
|
|
||||||
if cacheErr == nil {
|
|
||||||
if cacheFileStat.ModTime().After(dictFileStat.ModTime()) {
|
|
||||||
cacheFile, openError := os.Open(cache_file)
|
|
||||||
if openError == nil {
|
|
||||||
dec := gob.NewDecoder(cacheFile)
|
|
||||||
err := dec.Decode(&topTrie)
|
|
||||||
if err == nil {
|
|
||||||
return topTrie, nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
topTrie = &TopTrie{T: make(Trie), MinFreq: 100.0, Total: 0.0, Freq: make(map[string]float64)}
|
topTrie = &TopTrie{T: make(Trie), MinFreq: 100.0, Total: 0.0, Freq: make(map[string]float64)}
|
||||||
file, openError := os.Open(file_path)
|
file, openError := os.Open(file_path)
|
||||||
if openError != nil {
|
if openError != nil {
|
||||||
@@ -112,11 +66,6 @@ func newTopTrie(filename string) (*TopTrie, error) {
|
|||||||
topTrie.Freq[key] = val
|
topTrie.Freq[key] = val
|
||||||
}
|
}
|
||||||
|
|
||||||
cacheFile_, _ := os.OpenFile(cache_file, os.O_CREATE|os.O_WRONLY, 0644)
|
|
||||||
defer cacheFile_.Close()
|
|
||||||
enc := gob.NewEncoder(cacheFile_)
|
|
||||||
enc.Encode(topTrie)
|
|
||||||
|
|
||||||
return topTrie, nil
|
return topTrie, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -167,6 +116,7 @@ func LoadUserDict(file_path string) error {
|
|||||||
}
|
}
|
||||||
words := strings.Split(line, " ")
|
words := strings.Split(line, " ")
|
||||||
word, freqStr := words[0], words[1]
|
word, freqStr := words[0], words[1]
|
||||||
|
word = strings.Replace(word, "\ufeff", "", 1)
|
||||||
freq, _ := strconv.ParseFloat(freqStr, 64)
|
freq, _ := strconv.ParseFloat(freqStr, 64)
|
||||||
TT.addWord(word, freq)
|
TT.addWord(word, freq)
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user