From b828b25f67ac9d842821281bb75039d99d543299 Mon Sep 17 00:00:00 2001 From: Wang Bin Date: Mon, 26 Jan 2015 18:31:10 +0800 Subject: [PATCH] fix performance problem of extract_tags, corresponding to jieba commit #eb98eb92484d3d302cd96049be43c224fe45414a --- analyse/analyse.go | 4 +-- analyse/analyse_test.go | 4 ++- analyse/idf.go | 75 ++++++++++++++++++++++++++++++++++------- 3 files changed, 67 insertions(+), 16 deletions(-) diff --git a/analyse/analyse.go b/analyse/analyse.go index 111214e..c61d4b2 100644 --- a/analyse/analyse.go +++ b/analyse/analyse.go @@ -57,10 +57,10 @@ func ExtractTags(sentence string, topK int) []string { tis := make(TfIdfs, 0) for k, v := range freq { var ti TfIdf - if freq_, ok := idfFreq[k]; ok { + if freq_, ok := idfLoader.Freq[k]; ok { ti = TfIdf{word: k, freq: freq_ * v} } else { - ti = TfIdf{word: k, freq: medianIdf * v} + ti = TfIdf{word: k, freq: idfLoader.Median * v} } tis = append(tis, ti) } diff --git a/analyse/analyse_test.go b/analyse/analyse_test.go index 059902f..e375152 100644 --- a/analyse/analyse_test.go +++ b/analyse/analyse_test.go @@ -185,6 +185,7 @@ var ( func TestExtractTags(t *testing.T) { jiebago.SetDictionary("../dict.txt") SetIdf("idf.txt") + for index, sentence := range test_contents { result := ExtractTags(sentence, 20) if len(result) != len(Tags[index]) { @@ -192,8 +193,9 @@ func TestExtractTags(t *testing.T) { } for i, tag := range result { if tag != Tags[index][i] { - t.Error(tag) + t.Errorf("%s != %s", tag, Tags[index][i]) } } } + } diff --git a/analyse/idf.go b/analyse/idf.go index 4509209..eeaed53 100644 --- a/analyse/idf.go +++ b/analyse/idf.go @@ -11,24 +11,61 @@ import ( var ( stopWords map[string]string - idfFreq map[string]float64 - medianIdf float64 + idfLoader *IDFLoader ) func init() { - idfFreq = make(map[string]float64) + idfLoader = NewIDFLoader() stopWords = map[string]string{ - "the": "the", "of": "of", "is": "is", "and": "and", "to": "to", "in": "in", "that": "that", "we": "we", "for": "for", 
"an": "an", "are": "are", "by": "bye", "be": "be", "as": "as", "on": "on", "with": "with", "can": "can", "if": "of", "from": "from", "which": "which", "you": "you", "it": "it", "this": "this", "then": "then", "at": "at", "have": "have", "all": "all", "not": "not", "one": "one", "has": "has", "or": "or", + "the": "the", + "of": "of", + "is": "is", + "and": "and", + "to": "to", + "in": "in", + "that": "that", + "we": "we", + "for": "for", + "an": "an", + "are": "are", + "by": "bye", + "be": "be", + "as": "as", + "on": "on", + "with": "with", + "can": "can", + "if": "of", + "from": "from", + "which": "which", + "you": "you", + "it": "it", + "this": "this", + "then": "then", + "at": "at", + "have": "have", + "all": "all", + "not": "not", + "one": "one", + "has": "has", + "or": "or", } } -func SetIdf(idfFilePath string) error { - if !filepath.IsAbs(idfFilePath) { - pwd, err := os.Getwd() - if err != nil { - return err - } - idfFilePath = filepath.Clean(filepath.Join(pwd, idfFilePath)) +type IDFLoader struct { + Path string + Freq map[string]float64 + Median float64 +} + +func NewIDFLoader() *IDFLoader { + loader := new(IDFLoader) + loader.Freq = make(map[string]float64) + return loader +} + +func (loader *IDFLoader) NewPath(idfFilePath string) error { + if loader.Path == idfFilePath { + return nil } idfFile, err := os.Open(idfFilePath) if err != nil { @@ -44,15 +81,27 @@ func SetIdf(idfFilePath string) error { if err != nil { continue } - idfFreq[word] = freq + loader.Freq[word] = freq freqs = append(freqs, freq) } if err := scanner.Err(); err != nil { return err } sort.Float64s(freqs) - medianIdf = freqs[len(freqs)/2] + loader.Median = freqs[len(freqs)/2] return nil + +} + +func SetIdf(idfFilePath string) error { + if !filepath.IsAbs(idfFilePath) { + pwd, err := os.Getwd() + if err != nil { + return err + } + idfFilePath = filepath.Clean(filepath.Join(pwd, idfFilePath)) + } + return idfLoader.NewPath(idfFilePath) } func SetStopWords(stopWordsFilePath string) error {