diff --git a/jieba_test.go b/jieba_test.go index a8013b7..7163224 100644 --- a/jieba_test.go +++ b/jieba_test.go @@ -745,3 +745,45 @@ func TestSetdictionary(t *testing.T) { } } } + +func TestLoadUserDict(t *testing.T) { + SetDictionary("dict.txt") + LoadUserDict("userdict.txt") + + sentence := "李小福是创新办主任也是云计算方面的专家;例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型" + result := []string{"\u674e\u5c0f\u798f", "\u662f", "\u521b\u65b0\u529e", "\u4e3b\u4efb", "\u4e5f", "\u662f", "\u4e91\u8ba1\u7b97", "\u65b9\u9762", "\u7684", "\u4e13\u5bb6", ";", "\u4f8b\u5982", "\u6211", "\u8f93\u5165", "\u4e00\u4e2a", "\u5e26", "\u201c", "\u97e9\u7389\u8d4f\u9274", "\u201d", "\u7684", "\u6807\u9898", "\uff0c", "\u5728", "\u81ea\u5b9a\u4e49\u8bcd", "\u5e93\u4e2d", "\u4e5f", "\u589e\u52a0", "\u4e86", "\u6b64", "\u8bcd\u4e3a", "N", "\u7c7b\u578b"} + words := Cut(sentence, false, true) + if len(words) != len(result) { + t.Error(len(words)) + } + for index, word := range words { + if word != result[index] { + t.Error(word) + } + } + + sentence = "easy_install is great" + result = []string{"easy_install", " ", "is", " ", "great"} + words = Cut(sentence, false, true) + if len(words) != len(result) { + t.Error(len(words)) + } + for index, word := range words { + if word != result[index] { + t.Error(word) + } + } + + sentence = "python 的正则表达式是好用的" + result = []string{"python", " ", "\u7684", "\u6b63\u5219\u8868\u8fbe\u5f0f", "\u662f", "\u597d\u7528", "\u7684"} + words = Cut(sentence, false, true) + if len(words) != len(result) { + t.Error(words) + t.Error(result) + } + for index, word := range words { + if word != result[index] { + t.Error(word) + } + } +} diff --git a/trie_node.go b/trie_node.go index 8a2b197..b3bf182 100644 --- a/trie_node.go +++ b/trie_node.go @@ -2,24 +2,14 @@ package jiebago import ( "bufio" - "crypto/sha1" - "encoding/gob" - "fmt" "math" "os" "path/filepath" - "runtime" "strconv" "strings" "unicode/utf8" ) -const ( - CACHE_NAME = "jieba.gob" - USER_CACHE_PREFIX = "jieba.user." - USER_CACHE_SUFFIX = ".gob" -) - type Node struct { Name string SubNodes Trie @@ -35,16 +25,6 @@ type TopTrie struct { Freq map[string]float64 } -func hash(s string) string { - h := sha1.New() - h.Write([]byte(s)) - return fmt.Sprintf("%x", h.Sum(nil)) -} - -func getUserCacheName(prefix string, path string, suffix string) string { - return fmt.Sprintf("%s%s%s", prefix, hash(path), suffix) -} - func newTopTrie(filename string) (*TopTrie, error) { var file_path string var topTrie *TopTrie @@ -58,32 +38,6 @@ func newTopTrie(filename string) (*TopTrie, error) { file_path = filepath.Clean(filepath.Join(pwd, filename)) } - _, curFileName, _, _ := runtime.Caller(1) - _curpath := filepath.Dir(curFileName) - abs_path := filepath.Join(_curpath, Dictionary) - var cache_file string - if file_path == abs_path { - cache_file = filepath.Join(os.TempDir(), CACHE_NAME) - } else { - cache_file = filepath.Join(os.TempDir(), - getUserCacheName(USER_CACHE_PREFIX, abs_path, USER_CACHE_SUFFIX)) - } - - cacheFileStat, cacheErr := os.Stat(cache_file) - dictFileStat, _ := os.Stat(abs_path) - if cacheErr == nil { - if cacheFileStat.ModTime().After(dictFileStat.ModTime()) { - cacheFile, openError := os.Open(cache_file) - if openError == nil { - dec := gob.NewDecoder(cacheFile) - err := dec.Decode(&topTrie) - if err == nil { - return topTrie, nil - } - } - } - } - topTrie = &TopTrie{T: make(Trie), MinFreq: 100.0, Total: 0.0, Freq: make(map[string]float64)} file, openError := os.Open(file_path) if openError != nil { @@ -112,11 +66,6 @@ func newTopTrie(filename string) (*TopTrie, error) { topTrie.Freq[key] = val } - cacheFile_, _ := os.OpenFile(cache_file, os.O_CREATE|os.O_WRONLY, 0644) - defer cacheFile_.Close() - enc := gob.NewEncoder(cacheFile_) - enc.Encode(topTrie) - return topTrie, nil } @@ -167,6 +116,7 @@ func LoadUserDict(file_path string) error { } words := strings.Split(line, " ") word, freqStr := words[0], words[1] + word = strings.Replace(word, "\ufeff", "", 1) freq, _ := strconv.ParseFloat(freqStr, 64) TT.addWord(word, freq) }