From 79adffe328bdd23e9b3e360abcda1805e7b411eb Mon Sep 17 00:00:00 2001 From: Wang Bin Date: Sat, 28 Mar 2015 15:49:32 +0800 Subject: [PATCH] added a new interface for caching --- analyse/analyse.go | 21 +++------------------ analyse/stopwords.go | 33 +++++++++++++++++++++++++++++++- dictionary.go | 4 ++++ jieba.go | 6 ++++++ util.go | 45 +++++++++++++++++++++++--------------------- 5 files changed, 69 insertions(+), 40 deletions(-) diff --git a/analyse/analyse.go b/analyse/analyse.go index 83707a4..16be60d 100644 --- a/analyse/analyse.go +++ b/analyse/analyse.go @@ -38,11 +38,7 @@ func (ws wordWeights) Swap(i, j int) { type TagExtracter struct { *jiebago.Jieba *IDFLoader - stopWords map[string]int -} - -func (t *TagExtracter) AddEntry(entry *jiebago.Entry) { - t.stopWords[entry.Word] = 1 + *StopWordLoader } func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) { @@ -54,18 +50,7 @@ func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) { if err != nil { return nil, err } - return &TagExtracter{j, i, StopWords}, nil -} - -// Set the stop words file path, could be absolute path of stop words file, or -// file name in current directory. -func (t *TagExtracter) SetStopWords(stopWordsFileName string) error { - stopWordsFilePath, err := jiebago.DictPath(stopWordsFileName) - if err != nil { - return err - } - - return jiebago.LoadDict(t, stopWordsFilePath, false) + return &TagExtracter{j, i, NewStopWordLoader()}, nil } // Keyword extraction. @@ -77,7 +62,7 @@ func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags wordWeights) if utf8.RuneCountInString(w) < 2 { continue } - if _, ok := t.stopWords[w]; ok { + if t.IsStopWord(w) { continue } if f, ok := freq[w]; ok { diff --git a/analyse/stopwords.go b/analyse/stopwords.go index 930d333..42a1591 100644 --- a/analyse/stopwords.go +++ b/analyse/stopwords.go @@ -1,6 +1,8 @@ package analyse -var StopWords = map[string]int{ +import "github.com/wangbin/jiebago" + +var defaultStopWords = map[string]int{ "the": 1, "of": 1, "is": 1, @@ -33,3 +35,32 @@ var StopWords = map[string]int{ "has": 1, "or": 1, } + +type StopWordLoader struct { + stopWords map[string]int +} + +func (s *StopWordLoader) AddEntry(entry *jiebago.Entry) { + s.stopWords[entry.Word] = 1 +} + +func NewStopWordLoader() *StopWordLoader { + s := new(StopWordLoader) + s.stopWords = defaultStopWords + return s +} + +// Set the stop words file path, could be absolute path of stop words file, or +// file name in current directory. +func (s *StopWordLoader) SetStopWords(stopWordsFileName string) error { + stopWordsFilePath, err := jiebago.DictPath(stopWordsFileName) + if err != nil { + return err + } + return jiebago.LoadDict(s, stopWordsFilePath, false) +} + +func (s StopWordLoader) IsStopWord(word string) bool { + _, ok := s.stopWords[word] + return ok +} diff --git a/dictionary.go b/dictionary.go index a0bd55b..b86dac3 100644 --- a/dictionary.go +++ b/dictionary.go @@ -17,3 +17,7 @@ func NewEntry() *Entry { type DictLoader interface { AddEntry(*Entry) } + +type Cacher interface { + CacheNameFormat() string +} diff --git a/jieba.go b/jieba.go index fe9b2cc..0eb7420 100644 --- a/jieba.go +++ b/jieba.go @@ -9,6 +9,8 @@ import ( "sort" ) +const cacheNameFormat = "jieba.%x.cache" + var ( // Word/Tag Map load from user dictionary UserWordTagTab = make(map[string]string) @@ -57,6 +59,10 @@ func (j *Jieba) AddEntry(entry *Entry) { j.Add(entry.Word, entry.Freq) } +func (j *Jieba) CacheNameFormat() string { + return cacheNameFormat +} + func (j *Jieba) Add(word string, freq float64) { j.Freq[word] = freq j.Total += freq diff --git a/util.go b/util.go index decfb99..66796b7 100644 --- a/util.go +++ b/util.go @@ -57,9 +57,9 @@ func LoadDict(l DictLoader, dictFilePath string, usingFlag bool) error { return scanner.Err() } -func cachePath(dictPath string) string { +func cacheFilePath(c Cacher, dictPath string) string { return filepath.Join(os.TempDir(), - fmt.Sprintf("jieba.%x.cache", md5.Sum([]byte(dictPath)))) + fmt.Sprintf(c.CacheNameFormat(), md5.Sum([]byte(dictPath)))) } func cached(dictPath, cachePath string) (bool, error) { @@ -85,14 +85,14 @@ func load(l DictLoader, cachePath string) error { return dec.Decode(l) } -func dump(l DictLoader, cachePath string) error { +func dump(c Cacher, cachePath string) error { cacheFile, err := os.OpenFile(cachePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) if err != nil { return err } defer cacheFile.Close() enc := gob.NewEncoder(cacheFile) - return enc.Encode(l) + return enc.Encode(c) } func SetDict(l DictLoader, dictName string, pos bool) error { @@ -100,30 +100,33 @@ func SetDict(l DictLoader, dictName string, pos bool) error { if err != nil { return err } - cachePath := cachePath(dictPath) - cached, err := cached(dictPath, cachePath) - if err != nil { - return err - } - if cached { - err = load(l, cachePath) - if err == nil { - log.Printf("loaded model from cache %s\n", cachePath) - return nil + var cachePath string + if c, ok := l.(Cacher); ok { + cachePath = cacheFilePath(c, dictPath) + cached, err := cached(dictPath, cachePath) + if err != nil { + return err } - cached = false - } + if cached { + err = load(l, cachePath) + if err == nil { + log.Printf("loaded model from cache %s\n", cachePath) + return nil + } + } + } err = LoadDict(l, dictPath, pos) if err != nil { return err } - - err = dump(l, cachePath) - if err == nil { - log.Printf("dumped model from cache %s\n", cachePath) - return nil + if c, ok := l.(Cacher); ok { + err = dump(c, cachePath) + if err == nil { + log.Printf("dumped model from cache %s\n", cachePath) + return nil + } } return err }