diff --git a/analyse/analyse.go b/analyse/analyse.go index 384dd3b..47423c8 100644 --- a/analyse/analyse.go +++ b/analyse/analyse.go @@ -35,16 +35,49 @@ func (ws wordWeights) Swap(i, j int) { ws[i], ws[j] = ws[j], ws[i] } +type TagExtracter struct { + *jiebago.Jieba + *IDFLoader + stopWords map[string]int +} + +func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) { + j, err := jiebago.NewJieba(dictFileName) + if err != nil { + return nil, err + } + i, err := NewIDFLoader(IDFFileName) + if err != nil { + return nil, err + } + return &TagExtracter{j, i, StopWords}, nil +} + +// SetStopWords adds every word listed in the stop words file (an absolute path, or a file name in the current directory) to the extracter's stop words. +// It returns the error reported by jiebago.DictPath or jiebago.ParseDictFile, or nil when both succeed. +func (t *TagExtracter) SetStopWords(stopWordsFileName string) error { + stopWordsFilePath, err := jiebago.DictPath(stopWordsFileName) + if err != nil { + return err + } + + wtfs, err := jiebago.ParseDictFile(stopWordsFilePath) + for _, wtf := range wtfs { + t.stopWords[wtf.Word] = 1 + } + return err +} + // Keyword extraction. 
-func ExtractTags(sentence string, topK int) (tags wordWeights) { +func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags wordWeights) { freq := make(map[string]float64) - for w := range jiebago.Cut(sentence, false, true) { + for w := range t.Cut(sentence, false, true) { w = strings.TrimSpace(w) if utf8.RuneCountInString(w) < 2 { continue } - if _, ok := stopWords[w]; ok { + if _, ok := t.stopWords[w]; ok { continue } if f, ok := freq[w]; ok { @@ -63,10 +96,10 @@ func ExtractTags(sentence string, topK int) (tags wordWeights) { ws := make(wordWeights, 0) for k, v := range freq { var ti wordWeight - if freq_, ok := loader.Freq[k]; ok { + if freq_, ok := t.IDFFreq[k]; ok { ti = wordWeight{Word: k, Weight: freq_ * v} } else { - ti = wordWeight{Word: k, Weight: loader.Median * v} + ti = wordWeight{Word: k, Weight: t.Median * v} } ws = append(ws, ti) } diff --git a/analyse/analyse_test.go b/analyse/analyse_test.go index ad55f81..05b2ecd 100644 --- a/analyse/analyse_test.go +++ b/analyse/analyse_test.go @@ -1,7 +1,6 @@ package analyse import ( - "github.com/wangbin/jiebago" "math" "testing" ) @@ -256,11 +255,10 @@ var ( ) func TestExtractTags(t *testing.T) { - jiebago.SetDictionary("../dict.txt") - SetIdf("idf.txt") + et, _ := NewTagExtracter("../dict.txt", "idf.txt") for index, sentence := range test_contents { - result := ExtractTags(sentence, 20) + result := et.ExtractTags(sentence, 20) if len(result) != len(Tags[index]) { t.Errorf("%s = %v", sentence, result) } @@ -273,9 +271,8 @@ func TestExtractTags(t *testing.T) { } func TestExtratTagsWithWeight(t *testing.T) { - jiebago.SetDictionary("../dict.txt") - SetIdf("idf.txt") - result := ExtractTags(Lyric, 10) + et, _ := NewTagExtracter("../dict.txt", "idf.txt") + result := et.ExtractTags(Lyric, 10) for index, tag := range result { if LyciWeight[index].Word != tag.Word || math.Abs(LyciWeight[index].Weight-tag.Weight) > 1e-6 { @@ -285,10 +282,9 @@ func TestExtratTagsWithWeight(t *testing.T) { } func 
TestExtractTagsWithStopWordsFile(t *testing.T) { - jiebago.SetDictionary("../dict.txt") - SetIdf("idf.txt") - SetStopWords("stop_words.txt") - result := ExtractTags(Lyric, 7) + et, _ := NewTagExtracter("../dict.txt", "idf.txt") + et.SetStopWords("stop_words.txt") + result := et.ExtractTags(Lyric, 7) for index, tag := range result { if LyciWeight2[index].Word != tag.Word || math.Abs(LyciWeight2[index].Weight-tag.Weight) > 1e-6 { diff --git a/analyse/idf.go b/analyse/idf.go index 125c795..a706cc0 100644 --- a/analyse/idf.go +++ b/analyse/idf.go @@ -5,53 +5,28 @@ import ( "sort" ) -var ( - loader *idfLoader -) - -func init() { - loader = newIDFLoader() +type IDFLoader struct { + IDFFreq map[string]float64 + Median float64 } -type idfLoader struct { - Path string - Freq map[string]float64 - Median float64 -} - -func newIDFLoader() *idfLoader { - loader := new(idfLoader) - loader.Freq = make(map[string]float64) - return loader -} - -func (loader *idfLoader) newPath(idfFilePath string) error { - if loader.Path == idfFilePath { - return nil - } - wtfs, err := jiebago.ParseDictFile(idfFilePath) +func NewIDFLoader(IDFFileName string) (*IDFLoader, error) { + IDFFilePath, err := jiebago.DictPath(IDFFileName) if err != nil { - return err + return nil, err + } + wtfs, err := jiebago.ParseDictFile(IDFFilePath) + if err != nil { + return nil, err } - freqs := make([]float64, 0) - - for _, wtf := range wtfs { - loader.Freq[wtf.Word] = wtf.Freq - freqs = append(freqs, wtf.Freq) + freqs := make([]float64, len(wtfs)) + loader := &IDFLoader{make(map[string]float64), 0.0} + for index, wtf := range wtfs { + loader.IDFFreq[wtf.Word] = wtf.Freq + freqs[index] = wtf.Freq } - sort.Float64s(freqs) loader.Median = freqs[len(freqs)/2] - return nil -} - -// Set the IDF file path, could be absolute path of IDF file, or IDF file -// name in current directory. 
-func SetIdf(idfFileName string) error { - idfFilePath, err := jiebago.DictPath(idfFileName) - if err != nil { - return err - } - return loader.newPath(idfFilePath) + return loader, nil } diff --git a/analyse/stopwords.go b/analyse/stopwords.go index cf797ab..930d333 100644 --- a/analyse/stopwords.go +++ b/analyse/stopwords.go @@ -1,58 +1,35 @@ package analyse -import ( - "github.com/wangbin/jiebago" -) - -var stopWords map[string]int - -func init() { - stopWords = map[string]int{ - "the": 1, - "of": 1, - "is": 1, - "and": 1, - "to": 1, - "in": 1, - "that": 1, - "we": 1, - "for": 1, - "an": 1, - "are": 1, - "by": 1, - "be": 1, - "as": 1, - "on": 1, - "with": 1, - "can": 1, - "if": 1, - "from": 1, - "which": 1, - "you": 1, - "it": 1, - "this": 1, - "then": 1, - "at": 1, - "have": 1, - "all": 1, - "not": 1, - "one": 1, - "has": 1, - "or": 1, - } -} - -// Set the stop words file path, could be absolute path of stop words file, or -// file name in current directory. -func SetStopWords(stopWordsFileName string) error { - stopWordsFilePath, err := jiebago.DictPath(stopWordsFileName) - if err != nil { - return err - } - - wtfs, err := jiebago.ParseDictFile(stopWordsFilePath) - for _, wtf := range wtfs { - stopWords[wtf.Word] = 1 - } - return nil +var StopWords = map[string]int{ + "the": 1, + "of": 1, + "is": 1, + "and": 1, + "to": 1, + "in": 1, + "that": 1, + "we": 1, + "for": 1, + "an": 1, + "are": 1, + "by": 1, + "be": 1, + "as": 1, + "on": 1, + "with": 1, + "can": 1, + "if": 1, + "from": 1, + "which": 1, + "you": 1, + "it": 1, + "this": 1, + "then": 1, + "at": 1, + "have": 1, + "all": 1, + "not": 1, + "one": 1, + "has": 1, + "or": 1, } diff --git a/analyse/textrank.go b/analyse/textrank.go index 21819de..2a41b3b 100644 --- a/analyse/textrank.go +++ b/analyse/textrank.go @@ -115,7 +115,7 @@ func (u *undirectWeightedGraph) rank() wordWeights { // Extract keywords from sentence using TextRank algorithm. the allowed POS list // could be manually speificed. 
-func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights { +func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights { posFilt := make(map[string]int) for _, pos := range allowPOS { posFilt[pos] = 1 @@ -124,7 +124,7 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights { cm := make(map[[2]string]float64) span := 5 wordTags := make([]posseg.WordTag, 0) - for wordTag := range posseg.Cut(sentence, true) { + for wordTag := range t.Cut(sentence, true) { wordTags = append(wordTags, wordTag) } for i, _ := range wordTags { @@ -156,13 +156,21 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights { // Extract keywords from sentence using TextRank algorithm. // topK specify how many top keywords to be returned at most. -func TextRank(sentence string, topK int) wordWeights { - return TextRankWithPOS(sentence, topK, defaultAllowPOS) +func (t *TextRanker) TextRank(sentence string, topK int) wordWeights { + return t.TextRankWithPOS(sentence, topK, defaultAllowPOS) } // Set the dictionary, could be absolute path of dictionary file, or dictionary // name in current directory. This function must be called before cut any // sentence. 
-func SetDictionary(dictFileName string) error { - return posseg.SetDictionary(dictFileName) +func NewTextRanker(dictFileName string) (*TextRanker, error) { + p, err := posseg.NewPosseg(dictFileName) + if err != nil { + return nil, err + } + return &TextRanker{p}, nil +} + +type TextRanker struct { + *posseg.Posseg } diff --git a/analyse/textrank_test.go b/analyse/textrank_test.go index 012f675..f8d0fa3 100644 --- a/analyse/textrank_test.go +++ b/analyse/textrank_test.go @@ -23,8 +23,8 @@ var ( ) func TestTextRank(t *testing.T) { - SetDictionary("../dict.txt") - results := TextRank(sentence, 10) + tr, _ := NewTextRanker("../dict.txt") + results := tr.TextRank(sentence, 10) for index, tw := range results { if tw.Word != tagRanks[index].Word || math.Abs(tw.Weight-tagRanks[index].Weight) > 1e-6 { t.Errorf("%v != %v", tw, tagRanks[index]) diff --git a/tokenizers/jieba.go b/tokenizers/jieba.go index 1e1547d..55e4292 100644 --- a/tokenizers/jieba.go +++ b/tokenizers/jieba.go @@ -14,16 +14,16 @@ const Name = "jieba" var IdeographRegexp = regexp.MustCompile(`\p{Han}+`) type JiebaTokenizer struct { - dictFileName string + j *jiebago.Jieba hmm, searchMode bool } func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) { - err := jiebago.SetDictionary(dictFileName) + j, err := jiebago.NewJieba(dictFileName) return &JiebaTokenizer{ - dictFileName: dictFileName, - hmm: hmm, - searchMode: searchMode, + j: j, + hmm: hmm, + searchMode: searchMode, }, err } @@ -35,7 +35,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream { pos := 1 var width int var gram string - for word := range jiebago.Cut(string(input), false, jt.hmm) { + for word := range jt.j.Cut(string(input), false, jt.hmm) { if jt.searchMode { runes := []rune(word) width = len(runes) @@ -44,7 +44,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream { for i := 0; i < width-step+1; i++ { gram = string(runes[i : i+step]) gramLen := len(gram) - 
if value, ok := jiebago.Trie.Freq[gram]; ok && value > 0 { + if value, ok := jt.j.Freq[gram]; ok && value > 0 { gramStart := start + len(string(runes[:i])) token := analysis.Token{ Term: []byte(gram),