From 52fad004035212bce77f6535819e54efb40bb6bb Mon Sep 17 00:00:00 2001 From: Wang Bin Date: Mon, 4 May 2015 16:39:37 +0800 Subject: [PATCH] refactor analyse module --- analyse/analyse.go | 98 ---------------------------------------- analyse/analyse_test.go | 74 ++++++++++++++++-------------- analyse/idf.go | 50 ++++++++++++++------ analyse/stopwords.go | 49 +++++++++++++------- analyse/tag_extracker.go | 80 ++++++++++++++++++-------------- analyse/textrank.go | 45 ++++++++---------- analyse/textrank_test.go | 27 +++++------ 7 files changed, 187 insertions(+), 236 deletions(-) delete mode 100644 analyse/analyse.go diff --git a/analyse/analyse.go b/analyse/analyse.go deleted file mode 100644 index 43bfa00..0000000 --- a/analyse/analyse.go +++ /dev/null @@ -1,98 +0,0 @@ -package analyse - -import ( - "fmt" - "github.com/wangbin/jiebago" - "sort" - "strings" - "unicode/utf8" -) - -type wordWeight struct { - Word string - Weight float64 -} - -func (w wordWeight) String() string { - return fmt.Sprintf("{%s: %f}", w.Word, w.Weight) -} - -type wordWeights []wordWeight - -func (ws wordWeights) Len() int { - return len(ws) -} - -func (ws wordWeights) Less(i, j int) bool { - if ws[i].Weight == ws[j].Weight { - return ws[i].Word < ws[j].Word - } - - return ws[i].Weight < ws[j].Weight -} - -func (ws wordWeights) Swap(i, j int) { - ws[i], ws[j] = ws[j], ws[i] -} - -type TagExtracter struct { - *jiebago.Jieba - *IDFLoader - *StopWordLoader -} - -func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) { - j, err := jiebago.Open(dictFileName) - if err != nil { - return nil, err - } - i, err := NewIDFLoader(IDFFileName) - if err != nil { - return nil, err - } - return &TagExtracter{j, i, NewStopWordLoader()}, nil -} - -// Keyword extraction. -func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags wordWeights) { - freq := make(map[string]float64) - - for w := range t.Cut(sentence, true) { - w = strings.TrimSpace(w) - if utf8.RuneCountInString(w) < 2 { - continue - } - if t.IsStopWord(w) { - continue - } - if f, ok := freq[w]; ok { - freq[w] = f + 1.0 - } else { - freq[w] = 1.0 - } - } - total := 0.0 - for _, f := range freq { - total += f - } - for k, v := range freq { - freq[k] = v / total - } - ws := make(wordWeights, 0) - for k, v := range freq { - var ti wordWeight - if freq_, ok := t.IDFFreq[k]; ok { - ti = wordWeight{Word: k, Weight: freq_ * v} - } else { - ti = wordWeight{Word: k, Weight: t.Median * v} - } - ws = append(ws, ti) - } - sort.Sort(sort.Reverse(ws)) - if len(ws) > topK { - tags = ws[:topK] - } else { - tags = ws - } - return tags -} diff --git a/analyse/analyse_test.go b/analyse/analyse_test.go index eea3ffe..ba14cfe 100644 --- a/analyse/analyse_test.go +++ b/analyse/analyse_test.go @@ -227,43 +227,45 @@ var ( 只是逼不得已 雖然沒有藉口 ` - LyciWeight = []wordWeight{ - wordWeight{Word: "所謂", Weight: 1.010262}, - wordWeight{Word: "是否", Weight: 0.738650}, - wordWeight{Word: "一般", Weight: 0.607600}, - wordWeight{Word: "雖然", Weight: 0.336754}, - wordWeight{Word: "退縮", Weight: 0.336754}, - wordWeight{Word: "肌迫", Weight: 0.336754}, - wordWeight{Word: "矯作", Weight: 0.336754}, - wordWeight{Word: "沒有", Weight: 0.336754}, - wordWeight{Word: "怯懦", Weight: 0.271099}, - wordWeight{Word: "隨便", Weight: 0.168377}, + LyciWeight = Segments{ + Segment{text: "所謂", weight: 1.010262}, + Segment{text: "是否", weight: 0.738650}, + Segment{text: "一般", weight: 0.607600}, + Segment{text: "雖然", weight: 0.336754}, + Segment{text: "退縮", weight: 0.336754}, + Segment{text: "肌迫", weight: 0.336754}, + Segment{text: "矯作", weight: 0.336754}, + Segment{text: "沒有", weight: 0.336754}, + Segment{text: "怯懦", weight: 0.271099}, + Segment{text: "隨便", weight: 0.168377}, } - LyciWeight2 = []wordWeight{ - wordWeight{Word: "所謂", Weight: 1.215739}, - wordWeight{Word: "一般", Weight: 0.731179}, - wordWeight{Word: "雖然", Weight: 0.405246}, - wordWeight{Word: "退縮", Weight: 0.405246}, - wordWeight{Word: "肌迫", Weight: 0.405246}, - wordWeight{Word: "矯作", Weight: 0.405246}, - wordWeight{Word: "怯懦", Weight: 0.326238}, - wordWeight{Word: "逼不得已", Weight: 0.202623}, - wordWeight{Word: "右銘", Weight: 0.202623}, - wordWeight{Word: "寬闊", Weight: 0.202623}, + LyciWeight2 = Segments{ + Segment{text: "所謂", weight: 1.215739}, + Segment{text: "一般", weight: 0.731179}, + Segment{text: "雖然", weight: 0.405246}, + Segment{text: "退縮", weight: 0.405246}, + Segment{text: "肌迫", weight: 0.405246}, + Segment{text: "矯作", weight: 0.405246}, + Segment{text: "怯懦", weight: 0.326238}, + Segment{text: "逼不得已", weight: 0.202623}, + Segment{text: "右銘", weight: 0.202623}, + Segment{text: "寬闊", weight: 0.202623}, } ) func TestExtractTags(t *testing.T) { - et, _ := NewTagExtracter("../dict.txt", "idf.txt") + var te TagExtracter + te.LoadDictionary("../dict.txt") + te.LoadIdf("idf.txt") for index, sentence := range test_contents { - result := et.ExtractTags(sentence, 20) + result := te.ExtractTags(sentence, 20) if len(result) != len(Tags[index]) { t.Fatalf("%s = %v", sentence, result) } for i, tag := range result { - if tag.Word != Tags[index][i] { + if tag.text != Tags[index][i] { t.Fatalf("%s != %s", tag, Tags[index][i]) } } @@ -271,23 +273,27 @@ func TestExtractTags(t *testing.T) { } func TestExtratTagsWithWeight(t *testing.T) { - et, _ := NewTagExtracter("../dict.txt", "idf.txt") - result := et.ExtractTags(Lyric, 10) + var te TagExtracter + te.LoadDictionary("../dict.txt") + te.LoadIdf("idf.txt") + result := te.ExtractTags(Lyric, 10) for index, tag := range result { - if LyciWeight[index].Word != tag.Word || - math.Abs(LyciWeight[index].Weight-tag.Weight) > 1e-6 { + if LyciWeight[index].text != tag.text || + math.Abs(LyciWeight[index].weight-tag.weight) > 1e-6 { t.Fatalf("%v != %v", tag, LyciWeight[index]) } } } func TestExtractTagsWithStopWordsFile(t *testing.T) { - et, _ := NewTagExtracter("../dict.txt", "idf.txt") - et.SetStopWords("stop_words.txt") - result := et.ExtractTags(Lyric, 7) + var te TagExtracter + te.LoadDictionary("../dict.txt") + te.LoadIdf("idf.txt") + te.LoadStopWords("stop_words.txt") + result := te.ExtractTags(Lyric, 7) for index, tag := range result { - if LyciWeight2[index].Word != tag.Word || - math.Abs(LyciWeight2[index].Weight-tag.Weight) > 1e-6 { + if LyciWeight2[index].text != tag.text || + math.Abs(LyciWeight2[index].weight-tag.weight) > 1e-6 { t.Fatalf("%v != %v", tag, LyciWeight2[index]) } } diff --git a/analyse/idf.go b/analyse/idf.go index 59a5a02..c054f5e 100644 --- a/analyse/idf.go +++ b/analyse/idf.go @@ -1,30 +1,50 @@ package analyse import ( - "github.com/wangbin/jiebago" "sort" + "sync" + + "github.com/wangbin/jiebago/dictionary" ) -type idf struct { +type Idf struct { freqMap map[string]float64 median float64 freqs []float64 + sync.RWMutex } -func (l *IDFLoader) AddEntry(entry jiebago.Entry) { - l.IDFFreq[entry.Word] = entry.Freq - l.freqs = append(l.freqs, entry.Freq) +func (i *Idf) AddToken(token dictionary.Token) { + i.Lock() + i.freqMap[token.Text()] = token.Frequency() + i.freqs = append(i.freqs, token.Frequency()) + sort.Float64s(i.freqs) + i.median = i.freqs[len(i.freqs)/2] + i.Unlock() } -func NewIDFLoader(IDFFileName string) (*IDFLoader, error) { - loader := &IDFLoader{make(map[string]float64), 0.0, make([]float64, 0)} - err := jiebago.LoadDict(loader, IDFFileName, false) - if err != nil { - return nil, err +func (i *Idf) Load(ch <-chan dictionary.Token) { + i.Lock() + for token := range ch { + i.freqMap[token.Text()] = token.Frequency() + i.freqs = append(i.freqs, token.Frequency()) } - - sort.Float64s(loader.freqs) - loader.Median = loader.freqs[len(loader.freqs)/2] - loader.freqs = []float64{} - return loader, nil + sort.Float64s(i.freqs) + i.median = i.freqs[len(i.freqs)/2] + i.Unlock() +} + +func (i *Idf) loadDictionary(fileName string) error { + return dictionary.LoadDictionary(i, fileName) +} + +func (i Idf) Frequency(key string) (float64, bool) { + i.RLock() + freq, ok := i.freqMap[key] + i.RUnlock() + return freq, ok +} + +func NewIdf() *Idf { + return &Idf{freqMap: make(map[string]float64), freqs: make([]float64, 0)} } diff --git a/analyse/stopwords.go b/analyse/stopwords.go index b0a39fe..ac2f598 100644 --- a/analyse/stopwords.go +++ b/analyse/stopwords.go @@ -1,8 +1,12 @@ package analyse -import "github.com/wangbin/jiebago" +import ( + "sync" -var defaultStopWords = map[string]int{ + "github.com/wangbin/jiebago/dictionary" +) + +var DefaultStopWordMap = map[string]int{ "the": 1, "of": 1, "is": 1, @@ -36,27 +40,38 @@ var defaultStopWords = map[string]int{ "or": 1, } -type StopWordLoader struct { - stopWords map[string]int +type StopWord struct { + stopWordMap map[string]int + sync.RWMutex } -func (s *StopWordLoader) AddEntry(entry jiebago.Entry) { - s.stopWords[entry.Word] = 1 +func (s *StopWord) AddToken(token dictionary.Token) { + s.Lock() + s.stopWordMap[token.Text()] = 1 + s.Unlock() } -func NewStopWordLoader() *StopWordLoader { - s := new(StopWordLoader) - s.stopWords = defaultStopWords +func NewStopWord() *StopWord { + s := new(StopWord) + s.stopWordMap = DefaultStopWordMap return s } -// Set the stop words file path, could be absolute path of stop words file, or -// file name in current directory. -func (s *StopWordLoader) SetStopWords(stopWordsFileName string) error { - return jiebago.LoadDict(s, stopWordsFileName, false) -} - -func (s StopWordLoader) IsStopWord(word string) bool { - _, ok := s.stopWords[word] +func (s StopWord) IsStopWord(word string) bool { + s.RLock() + _, ok := s.stopWordMap[word] + s.RUnlock() return ok } + +func (s *StopWord) Load(ch <-chan dictionary.Token) { + s.Lock() + for token := range ch { + s.stopWordMap[token.Text()] = 1 + } + s.Unlock() +} + +func (s *StopWord) loadDictionary(fileName string) error { + return dictionary.LoadDictionary(s, fileName) +} diff --git a/analyse/tag_extracker.go b/analyse/tag_extracker.go index 16a8c86..d9b6f96 100644 --- a/analyse/tag_extracker.go +++ b/analyse/tag_extracker.go @@ -6,7 +6,7 @@ import ( "strings" "unicode/utf8" - "github.com/wangbin/jiebago/dictionary" + "github.com/wangbin/jiebago" ) type Segment struct { @@ -14,11 +14,19 @@ type Segment struct { weight float64 } +func (s Segment) Text() string { + return s.text +} + +func (s Segment) Weight() float64 { + return s.weight +} + func (s Segment) String() string { return fmt.Sprintf("{%s: %f}", s.text, s.weight) } -type Segments []Segments +type Segments []Segment func (ss Segments) Len() int { return len(ss) @@ -26,7 +34,7 @@ func (ss Segments) Len() int { func (ss Segments) Less(i, j int) bool { if ss[i].weight == ss[j].weight { - return ss[i].text < ws[j].text + return ss[i].text < ss[j].text } return ss[i].weight < ss[j].weight @@ -37,57 +45,61 @@ func (ss Segments) Swap(i, j int) { } type TagExtracter struct { - seg *jieba.Segmenter - i *idf - *StopWordLoader + seg *jiebago.Segmenter + idf *Idf + stopWord *StopWord } -func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) { - j, err := jiebago.Open(dictFileName) - if err != nil { - return nil, err - } - i, err := NewIDFLoader(IDFFileName) - if err != nil { - return nil, err - } - return &TagExtracter{j, i, NewStopWordLoader()}, nil +func (t *TagExtracter) LoadDictionary(fileName string) error { + t.stopWord = NewStopWord() + t.seg = new(jiebago.Segmenter) + return t.seg.LoadDictionary(fileName) +} + +func (t *TagExtracter) LoadIdf(fileName string) error { + t.idf = NewIdf() + return t.idf.loadDictionary(fileName) +} + +func (t *TagExtracter) LoadStopWords(fileName string) error { + t.stopWord = NewStopWord() + return t.stopWord.loadDictionary(fileName) } // Keyword extraction. -func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags wordWeights) { - freq := make(map[string]float64) +func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) { + freqMap := make(map[string]float64) - for w := range t.Cut(sentence, true) { + for w := range t.seg.Cut(sentence, true) { w = strings.TrimSpace(w) if utf8.RuneCountInString(w) < 2 { continue } - if t.IsStopWord(w) { + if t.stopWord.IsStopWord(w) { continue } - if f, ok := freq[w]; ok { - freq[w] = f + 1.0 + if f, ok := freqMap[w]; ok { + freqMap[w] = f + 1.0 } else { - freq[w] = 1.0 + freqMap[w] = 1.0 } } total := 0.0 - for _, f := range freq { - total += f + for _, freq := range freqMap { + total += freq } - for k, v := range freq { - freq[k] = v / total + for k, v := range freqMap { + freqMap[k] = v / total } - ws := make(wordWeights, 0) - for k, v := range freq { - var ti wordWeight - if freq_, ok := t.IDFFreq[k]; ok { - ti = wordWeight{Word: k, Weight: freq_ * v} + ws := make(Segments, 0) + var s Segment + for k, v := range freqMap { + if freq, ok := t.idf.Frequency(k); ok { + s = Segment{text: k, weight: freq * v} } else { - ti = wordWeight{Word: k, Weight: t.Median * v} + s = Segment{text: k, weight: t.idf.median * v} } - ws = append(ws, ti) + ws = append(ws, s) } sort.Sort(sort.Reverse(ws)) if len(ws) > topK { diff --git a/analyse/textrank.go b/analyse/textrank.go index e6fb6c4..fc0a54d 100644 --- a/analyse/textrank.go +++ b/analyse/textrank.go @@ -2,9 +2,10 @@ package analyse import ( "fmt" - "github.com/wangbin/jiebago/posseg" "math" "sort" + + "github.com/wangbin/jiebago/posseg" ) const dampingFactor = 0.85 @@ -65,7 +66,7 @@ func (u *undirectWeightedGraph) addEdge(start, end string, weight float64) { } } -func (u *undirectWeightedGraph) rank() wordWeights { +func (u *undirectWeightedGraph) rank() Segments { if !sort.IsSorted(u.keys) { sort.Sort(u.keys) } @@ -105,9 +106,9 @@ func (u *undirectWeightedGraph) rank() wordWeights { maxRank = w } } - result := make(wordWeights, 0) + result := make(Segments, 0) for n, w := range ws { - result = append(result, wordWeight{Word: n, Weight: (w - minRank/10.0) / (maxRank - minRank/10.0)}) + result = append(result, Segment{text: n, weight: (w - minRank/10.0) / (maxRank - minRank/10.0)}) } sort.Sort(sort.Reverse(result)) return result @@ -115,7 +116,7 @@ func (u *undirectWeightedGraph) rank() wordWeights { // Extract keywords from sentence using TextRank algorithm. the allowed POS list // could be manually speificed. -func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights { +func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) Segments { posFilt := make(map[string]int) for _, pos := range allowPOS { posFilt[pos] = 1 @@ -123,20 +124,20 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin g := newUndirectWeightedGraph() cm := make(map[[2]string]float64) span := 5 - pairs := make([]posseg.Pair, 0) - for pair := range t.Cut(sentence, true) { + pairs := make([]posseg.Segment, 0) + for pair := range t.seg.Cut(sentence, true) { pairs = append(pairs, pair) } - for i, _ := range pairs { - if _, ok := posFilt[pairs[i].Flag]; ok { + for i := range pairs { + if _, ok := posFilt[pairs[i].Pos()]; ok { for j := i + 1; j < i+span && j <= len(pairs); j++ { - if _, ok := posFilt[pairs[j].Flag]; !ok { + if _, ok := posFilt[pairs[j].Pos()]; !ok { continue } - if _, ok := cm[[2]string{pairs[i].Word, pairs[j].Word}]; !ok { - cm[[2]string{pairs[i].Word, pairs[j].Word}] = 1.0 + if _, ok := cm[[2]string{pairs[i].Text(), pairs[j].Text()}]; !ok { + cm[[2]string{pairs[i].Text(), pairs[j].Text()}] = 1.0 } else { - cm[[2]string{pairs[i].Word, pairs[j].Word}] += 1.0 + cm[[2]string{pairs[i].Text(), pairs[j].Text()}] += 1.0 } } } @@ -153,21 +154,15 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin // Extract keywords from sentence using TextRank algorithm. // topK specify how many top keywords to be returned at most. -func (t *TextRanker) TextRank(sentence string, topK int) wordWeights { +func (t *TextRanker) TextRank(sentence string, topK int) Segments { return t.TextRankWithPOS(sentence, topK, defaultAllowPOS) } -// Set the dictionary, could be absolute path of dictionary file, or dictionary -// name in current directory. This function must be called before cut any -// sentence. -func NewTextRanker(dictFileName string) (*TextRanker, error) { - p, err := posseg.Open(dictFileName) - if err != nil { - return nil, err - } - return &TextRanker{p}, nil +type TextRanker struct { + seg *posseg.Segmenter } -type TextRanker struct { - *posseg.Posseg +func (t *TextRanker) LoadDictionary(fileName string) error { + t.seg = new(posseg.Segmenter) + return t.seg.LoadDictionary(fileName) } diff --git a/analyse/textrank_test.go b/analyse/textrank_test.go index dfbf8d5..f3b85f3 100644 --- a/analyse/textrank_test.go +++ b/analyse/textrank_test.go @@ -8,25 +8,26 @@ import ( var ( sentence = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。" - tagRanks = wordWeights{ - wordWeight{Word: "吉林", Weight: 1.0}, - wordWeight{Word: "欧亚", Weight: 0.87807810644}, - wordWeight{Word: "置业", Weight: 0.562048250306}, - wordWeight{Word: "实现", Weight: 0.520905743929}, - wordWeight{Word: "收入", Weight: 0.384283870648}, - wordWeight{Word: "增资", Weight: 0.360590945312}, - wordWeight{Word: "子公司", Weight: 0.353131980904}, - wordWeight{Word: "城市", Weight: 0.307509449283}, - wordWeight{Word: "全资", Weight: 0.306324426665}, - wordWeight{Word: "商业", Weight: 0.306138241063}, + tagRanks = Segments{ + Segment{text: "吉林", weight: 1.0}, + Segment{text: "欧亚", weight: 0.87807810644}, + Segment{text: "置业", weight: 0.562048250306}, + Segment{text: "实现", weight: 0.520905743929}, + Segment{text: "收入", weight: 0.384283870648}, + Segment{text: "增资", weight: 0.360590945312}, + Segment{text: "子公司", weight: 0.353131980904}, + Segment{text: "城市", weight: 0.307509449283}, + Segment{text: "全资", weight: 0.306324426665}, + Segment{text: "商业", weight: 0.306138241063}, } ) func TestTextRank(t *testing.T) { - tr, _ := NewTextRanker("../dict.txt") + var tr TextRanker + tr.LoadDictionary("../dict.txt") results := tr.TextRank(sentence, 10) for index, tw := range results { - if tw.Word != tagRanks[index].Word || math.Abs(tw.Weight-tagRanks[index].Weight) > 1e-6 { + if tw.text != tagRanks[index].text || math.Abs(tw.weight-tagRanks[index].weight) > 1e-6 { t.Fatalf("%v != %v", tw, tagRanks[index]) } }