1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-30 09:00:30 +08:00

fix performance problem of extract_tags, corresponding to jieba commit #eb98eb92484d3d302cd96049be43c224fe45414a

This commit is contained in:
Wang Bin
2015-01-26 18:31:10 +08:00
parent 2b15490388
commit b828b25f67
3 changed files with 67 additions and 16 deletions

View File

@@ -57,10 +57,10 @@ func ExtractTags(sentence string, topK int) []string {
tis := make(TfIdfs, 0) tis := make(TfIdfs, 0)
for k, v := range freq { for k, v := range freq {
var ti TfIdf var ti TfIdf
if freq_, ok := idfFreq[k]; ok { if freq_, ok := idfLoader.Freq[k]; ok {
ti = TfIdf{word: k, freq: freq_ * v} ti = TfIdf{word: k, freq: freq_ * v}
} else { } else {
ti = TfIdf{word: k, freq: medianIdf * v} ti = TfIdf{word: k, freq: idfLoader.Median * v}
} }
tis = append(tis, ti) tis = append(tis, ti)
} }

View File

@@ -185,6 +185,7 @@ var (
func TestExtractTags(t *testing.T) { func TestExtractTags(t *testing.T) {
jiebago.SetDictionary("../dict.txt") jiebago.SetDictionary("../dict.txt")
SetIdf("idf.txt") SetIdf("idf.txt")
for index, sentence := range test_contents { for index, sentence := range test_contents {
result := ExtractTags(sentence, 20) result := ExtractTags(sentence, 20)
if len(result) != len(Tags[index]) { if len(result) != len(Tags[index]) {
@@ -192,8 +193,9 @@ func TestExtractTags(t *testing.T) {
} }
for i, tag := range result { for i, tag := range result {
if tag != Tags[index][i] { if tag != Tags[index][i] {
t.Error(tag) t.Errorf("%s != %s", tag, Tags[index][i])
} }
} }
} }
} }

View File

@@ -11,24 +11,61 @@ import (
var ( var (
stopWords map[string]string stopWords map[string]string
idfFreq map[string]float64 idfLoader *IDFLoader
medianIdf float64
) )
func init() { func init() {
idfFreq = make(map[string]float64) idfLoader = NewIDFLoader()
stopWords = map[string]string{ stopWords = map[string]string{
"the": "the", "of": "of", "is": "is", "and": "and", "to": "to", "in": "in", "that": "that", "we": "we", "for": "for", "an": "an", "are": "are", "by": "bye", "be": "be", "as": "as", "on": "on", "with": "with", "can": "can", "if": "of", "from": "from", "which": "which", "you": "you", "it": "it", "this": "this", "then": "then", "at": "at", "have": "have", "all": "all", "not": "not", "one": "one", "has": "has", "or": "or", "the": "the",
"of": "of",
"is": "is",
"and": "and",
"to": "to",
"in": "in",
"that": "that",
"we": "we",
"for": "for",
"an": "an",
"are": "are",
"by": "bye",
"be": "be",
"as": "as",
"on": "on",
"with": "with",
"can": "can",
"if": "of",
"from": "from",
"which": "which",
"you": "you",
"it": "it",
"this": "this",
"then": "then",
"at": "at",
"have": "have",
"all": "all",
"not": "not",
"one": "one",
"has": "has",
"or": "or",
} }
} }
func SetIdf(idfFilePath string) error { type IDFLoader struct {
if !filepath.IsAbs(idfFilePath) { Path string
pwd, err := os.Getwd() Freq map[string]float64
if err != nil { Median float64
return err }
}
idfFilePath = filepath.Clean(filepath.Join(pwd, idfFilePath)) func NewIDFLoader() *IDFLoader {
loader := new(IDFLoader)
loader.Freq = make(map[string]float64)
return loader
}
func (loader *IDFLoader) NewPath(idfFilePath string) error {
if loader.Path == idfFilePath {
return nil
} }
idfFile, err := os.Open(idfFilePath) idfFile, err := os.Open(idfFilePath)
if err != nil { if err != nil {
@@ -44,15 +81,27 @@ func SetIdf(idfFilePath string) error {
if err != nil { if err != nil {
continue continue
} }
idfFreq[word] = freq loader.Freq[word] = freq
freqs = append(freqs, freq) freqs = append(freqs, freq)
} }
if err := scanner.Err(); err != nil { if err := scanner.Err(); err != nil {
return err return err
} }
sort.Float64s(freqs) sort.Float64s(freqs)
medianIdf = freqs[len(freqs)/2] loader.Median = freqs[len(freqs)/2]
return nil return nil
}
func SetIdf(idfFilePath string) error {
if !filepath.IsAbs(idfFilePath) {
pwd, err := os.Getwd()
if err != nil {
return err
}
idfFilePath = filepath.Clean(filepath.Join(pwd, idfFilePath))
}
return idfLoader.NewPath(idfFilePath)
} }
func SetStopWords(stopWordsFilePath string) error { func SetStopWords(stopWordsFilePath string) error {