mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-13 05:31:02 +08:00
added a new interface for caching
This commit is contained in:
@@ -38,11 +38,7 @@ func (ws wordWeights) Swap(i, j int) {
|
|||||||
type TagExtracter struct {
|
type TagExtracter struct {
|
||||||
*jiebago.Jieba
|
*jiebago.Jieba
|
||||||
*IDFLoader
|
*IDFLoader
|
||||||
stopWords map[string]int
|
*StopWordLoader
|
||||||
}
|
|
||||||
|
|
||||||
func (t *TagExtracter) AddEntry(entry *jiebago.Entry) {
|
|
||||||
t.stopWords[entry.Word] = 1
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) {
|
func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) {
|
||||||
@@ -54,18 +50,7 @@ func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
return &TagExtracter{j, i, StopWords}, nil
|
return &TagExtracter{j, i, NewStopWordLoader()}, nil
|
||||||
}
|
|
||||||
|
|
||||||
// Set the stop words file path, could be absolute path of stop words file, or
|
|
||||||
// file name in current directory.
|
|
||||||
func (t *TagExtracter) SetStopWords(stopWordsFileName string) error {
|
|
||||||
stopWordsFilePath, err := jiebago.DictPath(stopWordsFileName)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
return jiebago.LoadDict(t, stopWordsFilePath, false)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Keyword extraction.
|
// Keyword extraction.
|
||||||
@@ -77,7 +62,7 @@ func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags wordWeights)
|
|||||||
if utf8.RuneCountInString(w) < 2 {
|
if utf8.RuneCountInString(w) < 2 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if _, ok := t.stopWords[w]; ok {
|
if t.IsStopWord(w) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if f, ok := freq[w]; ok {
|
if f, ok := freq[w]; ok {
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
package analyse
|
package analyse
|
||||||
|
|
||||||
var StopWords = map[string]int{
|
import "github.com/wangbin/jiebago"
|
||||||
|
|
||||||
|
var defaultStopWords = map[string]int{
|
||||||
"the": 1,
|
"the": 1,
|
||||||
"of": 1,
|
"of": 1,
|
||||||
"is": 1,
|
"is": 1,
|
||||||
@@ -33,3 +35,32 @@ var StopWords = map[string]int{
|
|||||||
"has": 1,
|
"has": 1,
|
||||||
"or": 1,
|
"or": 1,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// StopWordLoader holds a set of stop words, stored as the keys of a map.
type StopWordLoader struct {
	// stopWords is used as a set: lookups only test key presence, and
	// entries are stored with the placeholder value 1.
	stopWords map[string]int
}
|
||||||
|
|
||||||
|
func (s *StopWordLoader) AddEntry(entry *jiebago.Entry) {
|
||||||
|
s.stopWords[entry.Word] = 1
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewStopWordLoader() *StopWordLoader {
|
||||||
|
s := new(StopWordLoader)
|
||||||
|
s.stopWords = defaultStopWords
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set the stop words file path, could be absolute path of stop words file, or
|
||||||
|
// file name in current directory.
|
||||||
|
func (s *StopWordLoader) SetStopWords(stopWordsFileName string) error {
|
||||||
|
stopWordsFilePath, err := jiebago.DictPath(stopWordsFileName)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return jiebago.LoadDict(s, stopWordsFilePath, false)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s StopWordLoader) IsStopWord(word string) bool {
|
||||||
|
_, ok := s.stopWords[word]
|
||||||
|
return ok
|
||||||
|
}
|
||||||
|
|||||||
@@ -17,3 +17,7 @@ func NewEntry() *Entry {
|
|||||||
type DictLoader interface {
|
type DictLoader interface {
|
||||||
AddEntry(*Entry)
|
AddEntry(*Entry)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Cacher is implemented by dictionary loaders whose loaded state can be
// cached on disk.
type Cacher interface {
	// CacheNameFormat returns the fmt format string used to build the
	// cache file name. It is formatted with a single argument, the MD5
	// sum of the dictionary path (for example "jieba.%x.cache").
	CacheNameFormat() string
}
|
||||||
|
|||||||
6
jieba.go
6
jieba.go
@@ -9,6 +9,8 @@ import (
|
|||||||
"sort"
|
"sort"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const cacheNameFormat = "jieba.%x.cache"
|
||||||
|
|
||||||
var (
|
var (
|
||||||
// Word/Tag Map load from user dictionary
|
// Word/Tag Map load from user dictionary
|
||||||
UserWordTagTab = make(map[string]string)
|
UserWordTagTab = make(map[string]string)
|
||||||
@@ -57,6 +59,10 @@ func (j *Jieba) AddEntry(entry *Entry) {
|
|||||||
j.Add(entry.Word, entry.Freq)
|
j.Add(entry.Word, entry.Freq)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (j *Jieba) CacheNameFormat() string {
|
||||||
|
return cacheNameFormat
|
||||||
|
}
|
||||||
|
|
||||||
func (j *Jieba) Add(word string, freq float64) {
|
func (j *Jieba) Add(word string, freq float64) {
|
||||||
j.Freq[word] = freq
|
j.Freq[word] = freq
|
||||||
j.Total += freq
|
j.Total += freq
|
||||||
|
|||||||
45
util.go
45
util.go
@@ -57,9 +57,9 @@ func LoadDict(l DictLoader, dictFilePath string, usingFlag bool) error {
|
|||||||
return scanner.Err()
|
return scanner.Err()
|
||||||
}
|
}
|
||||||
|
|
||||||
func cachePath(dictPath string) string {
|
func cacheFilePath(c Cacher, dictPath string) string {
|
||||||
return filepath.Join(os.TempDir(),
|
return filepath.Join(os.TempDir(),
|
||||||
fmt.Sprintf("jieba.%x.cache", md5.Sum([]byte(dictPath))))
|
fmt.Sprintf(c.CacheNameFormat(), md5.Sum([]byte(dictPath))))
|
||||||
}
|
}
|
||||||
|
|
||||||
func cached(dictPath, cachePath string) (bool, error) {
|
func cached(dictPath, cachePath string) (bool, error) {
|
||||||
@@ -85,14 +85,14 @@ func load(l DictLoader, cachePath string) error {
|
|||||||
return dec.Decode(l)
|
return dec.Decode(l)
|
||||||
}
|
}
|
||||||
|
|
||||||
func dump(l DictLoader, cachePath string) error {
|
func dump(c Cacher, cachePath string) error {
|
||||||
cacheFile, err := os.OpenFile(cachePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
|
cacheFile, err := os.OpenFile(cachePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
defer cacheFile.Close()
|
defer cacheFile.Close()
|
||||||
enc := gob.NewEncoder(cacheFile)
|
enc := gob.NewEncoder(cacheFile)
|
||||||
return enc.Encode(l)
|
return enc.Encode(c)
|
||||||
}
|
}
|
||||||
|
|
||||||
func SetDict(l DictLoader, dictName string, pos bool) error {
|
func SetDict(l DictLoader, dictName string, pos bool) error {
|
||||||
@@ -100,30 +100,33 @@ func SetDict(l DictLoader, dictName string, pos bool) error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
cachePath := cachePath(dictPath)
|
|
||||||
cached, err := cached(dictPath, cachePath)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
if cached {
|
var cachePath string
|
||||||
err = load(l, cachePath)
|
if c, ok := l.(Cacher); ok {
|
||||||
if err == nil {
|
cachePath = cacheFilePath(c, dictPath)
|
||||||
log.Printf("loaded model from cache %s\n", cachePath)
|
cached, err := cached(dictPath, cachePath)
|
||||||
return nil
|
if err != nil {
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
cached = false
|
|
||||||
}
|
|
||||||
|
|
||||||
|
if cached {
|
||||||
|
err = load(l, cachePath)
|
||||||
|
if err == nil {
|
||||||
|
log.Printf("loaded model from cache %s\n", cachePath)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
err = LoadDict(l, dictPath, pos)
|
err = LoadDict(l, dictPath, pos)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
if c, ok := l.(Cacher); ok {
|
||||||
err = dump(l, cachePath)
|
err = dump(c, cachePath)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
log.Printf("dumped model from cache %s\n", cachePath)
|
log.Printf("dumped model from cache %s\n", cachePath)
|
||||||
return nil
|
return nil
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user