1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00

added a new interface for caching

This commit is contained in:
Wang Bin
2015-03-28 15:49:32 +08:00
parent e11060513c
commit 79adffe328
5 changed files with 69 additions and 40 deletions

View File

@@ -38,11 +38,7 @@ func (ws wordWeights) Swap(i, j int) {
type TagExtracter struct {
*jiebago.Jieba
*IDFLoader
stopWords map[string]int
}
func (t *TagExtracter) AddEntry(entry *jiebago.Entry) {
t.stopWords[entry.Word] = 1
*StopWordLoader
}
func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) {
@@ -54,18 +50,7 @@ func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) {
if err != nil {
return nil, err
}
return &TagExtracter{j, i, StopWords}, nil
}
// Set the stop words file path, could be absolute path of stop words file, or
// file name in current directory.
func (t *TagExtracter) SetStopWords(stopWordsFileName string) error {
stopWordsFilePath, err := jiebago.DictPath(stopWordsFileName)
if err != nil {
return err
}
return jiebago.LoadDict(t, stopWordsFilePath, false)
return &TagExtracter{j, i, NewStopWordLoader()}, nil
}
// Keyword extraction.
@@ -77,7 +62,7 @@ func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags wordWeights)
if utf8.RuneCountInString(w) < 2 {
continue
}
if _, ok := t.stopWords[w]; ok {
if t.IsStopWord(w) {
continue
}
if f, ok := freq[w]; ok {

View File

@@ -1,6 +1,8 @@
package analyse
var StopWords = map[string]int{
import "github.com/wangbin/jiebago"
var defaultStopWords = map[string]int{
"the": 1,
"of": 1,
"is": 1,
@@ -33,3 +35,32 @@ var StopWords = map[string]int{
"has": 1,
"or": 1,
}
// StopWordLoader holds a set of stop words and can be filled from a
// dictionary file through its AddEntry and SetStopWords methods.
type StopWordLoader struct {
// stopWords is used as a set: IsStopWord only checks key presence, and
// AddEntry always stores 1 as the value.
stopWords map[string]int
}
// AddEntry records entry.Word as a stop word. Having this method is what
// allows a *StopWordLoader to be handed to jiebago.LoadDict as the loader
// (see SetStopWords).
func (s *StopWordLoader) AddEntry(entry *jiebago.Entry) {
	// A StopWordLoader that was not built by NewStopWordLoader has a nil
	// map, and assigning into a nil map panics, so allocate it on first use.
	if s.stopWords == nil {
		s.stopWords = make(map[string]int)
	}
	s.stopWords[entry.Word] = 1
}
// NewStopWordLoader returns a StopWordLoader pre-populated with the built-in
// default stop words.
func NewStopWordLoader() *StopWordLoader {
	// Copy the defaults instead of aliasing the package-level map: AddEntry
	// writes into the loader's map, and with a shared map every word loaded
	// by one loader would leak into the defaults and into all other loaders.
	words := make(map[string]int, len(defaultStopWords))
	for word, flag := range defaultStopWords {
		words[word] = flag
	}
	return &StopWordLoader{stopWords: words}
}
// SetStopWords reads stop words from a dictionary file into s.
// stopWordsFileName may be the absolute path of the stop words file, or a
// file name in the current directory; it is resolved with jiebago.DictPath
// and the resulting path is passed to jiebago.LoadDict with s as the loader.
// Any error from either step is returned.
func (s *StopWordLoader) SetStopWords(stopWordsFileName string) error {
	resolved, err := jiebago.DictPath(stopWordsFileName)
	if err == nil {
		err = jiebago.LoadDict(s, resolved, false)
	}
	return err
}
// IsStopWord reports whether word is present in the loader's stop word set.
// Only key presence matters; the stored value is ignored.
func (s StopWordLoader) IsStopWord(word string) bool {
	if _, found := s.stopWords[word]; found {
		return true
	}
	return false
}

View File

@@ -17,3 +17,7 @@ func NewEntry() *Entry {
type DictLoader interface {
AddEntry(*Entry)
}
// Cacher is implemented by dictionary loaders whose loaded state can be
// cached on disk. SetDict only reads or writes a cache file when the loader
// it is given also satisfies Cacher.
type Cacher interface {
// CacheNameFormat returns the fmt format string used to build the cache
// file name; cacheFilePath formats it with the MD5 sum of the dictionary
// path.
CacheNameFormat() string
}

View File

@@ -9,6 +9,8 @@ import (
"sort"
)
// cacheNameFormat is the cache file name format returned by
// (*Jieba).CacheNameFormat; the %x verb receives the MD5 sum of the
// dictionary path when the cache file path is built.
const cacheNameFormat = "jieba.%x.cache"
var (
// Word/Tag Map load from user dictionary
UserWordTagTab = make(map[string]string)
@@ -57,6 +59,10 @@ func (j *Jieba) AddEntry(entry *Entry) {
j.Add(entry.Word, entry.Freq)
}
// CacheNameFormat returns the format string for Jieba's cache file name.
// It provides the method required by the Cacher interface.
func (j *Jieba) CacheNameFormat() string {
return cacheNameFormat
}
func (j *Jieba) Add(word string, freq float64) {
j.Freq[word] = freq
j.Total += freq

45
util.go
View File

@@ -57,9 +57,9 @@ func LoadDict(l DictLoader, dictFilePath string, usingFlag bool) error {
return scanner.Err()
}
func cachePath(dictPath string) string {
func cacheFilePath(c Cacher, dictPath string) string {
return filepath.Join(os.TempDir(),
fmt.Sprintf("jieba.%x.cache", md5.Sum([]byte(dictPath))))
fmt.Sprintf(c.CacheNameFormat(), md5.Sum([]byte(dictPath))))
}
func cached(dictPath, cachePath string) (bool, error) {
@@ -85,14 +85,14 @@ func load(l DictLoader, cachePath string) error {
return dec.Decode(l)
}
func dump(l DictLoader, cachePath string) error {
func dump(c Cacher, cachePath string) error {
cacheFile, err := os.OpenFile(cachePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
if err != nil {
return err
}
defer cacheFile.Close()
enc := gob.NewEncoder(cacheFile)
return enc.Encode(l)
return enc.Encode(c)
}
func SetDict(l DictLoader, dictName string, pos bool) error {
@@ -100,30 +100,33 @@ func SetDict(l DictLoader, dictName string, pos bool) error {
if err != nil {
return err
}
cachePath := cachePath(dictPath)
cached, err := cached(dictPath, cachePath)
if err != nil {
return err
}
if cached {
err = load(l, cachePath)
if err == nil {
log.Printf("loaded model from cache %s\n", cachePath)
return nil
var cachePath string
if c, ok := l.(Cacher); ok {
cachePath = cacheFilePath(c, dictPath)
cached, err := cached(dictPath, cachePath)
if err != nil {
return err
}
cached = false
}
if cached {
err = load(l, cachePath)
if err == nil {
log.Printf("loaded model from cache %s\n", cachePath)
return nil
}
}
}
err = LoadDict(l, dictPath, pos)
if err != nil {
return err
}
err = dump(l, cachePath)
if err == nil {
log.Printf("dumped model from cache %s\n", cachePath)
return nil
if c, ok := l.(Cacher); ok {
err = dump(c, cachePath)
if err == nil {
log.Printf("dumped model from cache %s\n", cachePath)
return nil
}
}
return err
}