refactor analyse module

2026-06-12 13:10:25 +08:00 · 2015-05-04 16:39:37 +08:00
parent 500e6bd10e
commit 52fad00403
7 changed files with 187 additions and 236 deletions
--- a/analyse/analyse.go
+++ b/analyse/analyse.go
@@ -1,98 +0,0 @@
-package analyse
-
-import (
-	"fmt"
-	"github.com/wangbin/jiebago"
-	"sort"
-	"strings"
-	"unicode/utf8"
-)
-
-type wordWeight struct {
-	Word   string
-	Weight float64
-}
-
-func (w wordWeight) String() string {
-	return fmt.Sprintf("{%s: %f}", w.Word, w.Weight)
-}
-
-type wordWeights []wordWeight
-
-func (ws wordWeights) Len() int {
-	return len(ws)
-}
-
-func (ws wordWeights) Less(i, j int) bool {
-	if ws[i].Weight == ws[j].Weight {
-		return ws[i].Word < ws[j].Word
-	}
-
-	return ws[i].Weight < ws[j].Weight
-}
-
-func (ws wordWeights) Swap(i, j int) {
-	ws[i], ws[j] = ws[j], ws[i]
-}
-
-type TagExtracter struct {
-	*jiebago.Jieba
-	*IDFLoader
-	*StopWordLoader
-}
-
-func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) {
-	j, err := jiebago.Open(dictFileName)
-	if err != nil {
-		return nil, err
-	}
-	i, err := NewIDFLoader(IDFFileName)
-	if err != nil {
-		return nil, err
-	}
-	return &TagExtracter{j, i, NewStopWordLoader()}, nil
-}
-
-// Keyword extraction.
-func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags wordWeights) {
-	freq := make(map[string]float64)
-
-	for w := range t.Cut(sentence, true) {
-		w = strings.TrimSpace(w)
-		if utf8.RuneCountInString(w) < 2 {
-			continue
-		}
-		if t.IsStopWord(w) {
-			continue
-		}
-		if f, ok := freq[w]; ok {
-			freq[w] = f + 1.0
-		} else {
-			freq[w] = 1.0
-		}
-	}
-	total := 0.0
-	for _, f := range freq {
-		total += f
-	}
-	for k, v := range freq {
-		freq[k] = v / total
-	}
-	ws := make(wordWeights, 0)
-	for k, v := range freq {
-		var ti wordWeight
-		if freq_, ok := t.IDFFreq[k]; ok {
-			ti = wordWeight{Word: k, Weight: freq_ * v}
-		} else {
-			ti = wordWeight{Word: k, Weight: t.Median * v}
-		}
-		ws = append(ws, ti)
-	}
-	sort.Sort(sort.Reverse(ws))
-	if len(ws) > topK {
-		tags = ws[:topK]
-	} else {
-		tags = ws
-	}
-	return tags
-}
--- a/analyse/analyse_test.go
+++ b/analyse/analyse_test.go
@@ -227,43 +227,45 @@ var (
 只是逼不得已
 雖然沒有藉口
 `
-	LyciWeight = []wordWeight{
-		wordWeight{Word: "所謂", Weight: 1.010262},
-		wordWeight{Word: "是否", Weight: 0.738650},
-		wordWeight{Word: "一般", Weight: 0.607600},
-		wordWeight{Word: "雖然", Weight: 0.336754},
-		wordWeight{Word: "退縮", Weight: 0.336754},
-		wordWeight{Word: "肌迫", Weight: 0.336754},
-		wordWeight{Word: "矯作", Weight: 0.336754},
-		wordWeight{Word: "沒有", Weight: 0.336754},
-		wordWeight{Word: "怯懦", Weight: 0.271099},
-		wordWeight{Word: "隨便", Weight: 0.168377},
+	LyciWeight = Segments{
+		Segment{text: "所謂", weight: 1.010262},
+		Segment{text: "是否", weight: 0.738650},
+		Segment{text: "一般", weight: 0.607600},
+		Segment{text: "雖然", weight: 0.336754},
+		Segment{text: "退縮", weight: 0.336754},
+		Segment{text: "肌迫", weight: 0.336754},
+		Segment{text: "矯作", weight: 0.336754},
+		Segment{text: "沒有", weight: 0.336754},
+		Segment{text: "怯懦", weight: 0.271099},
+		Segment{text: "隨便", weight: 0.168377},
 	}

-	LyciWeight2 = []wordWeight{
-		wordWeight{Word: "所謂", Weight: 1.215739},
-		wordWeight{Word: "一般", Weight: 0.731179},
-		wordWeight{Word: "雖然", Weight: 0.405246},
-		wordWeight{Word: "退縮", Weight: 0.405246},
-		wordWeight{Word: "肌迫", Weight: 0.405246},
-		wordWeight{Word: "矯作", Weight: 0.405246},
-		wordWeight{Word: "怯懦", Weight: 0.326238},
-		wordWeight{Word: "逼不得已", Weight: 0.202623},
-		wordWeight{Word: "右銘", Weight: 0.202623},
-		wordWeight{Word: "寬闊", Weight: 0.202623},
+	LyciWeight2 = Segments{
+		Segment{text: "所謂", weight: 1.215739},
+		Segment{text: "一般", weight: 0.731179},
+		Segment{text: "雖然", weight: 0.405246},
+		Segment{text: "退縮", weight: 0.405246},
+		Segment{text: "肌迫", weight: 0.405246},
+		Segment{text: "矯作", weight: 0.405246},
+		Segment{text: "怯懦", weight: 0.326238},
+		Segment{text: "逼不得已", weight: 0.202623},
+		Segment{text: "右銘", weight: 0.202623},
+		Segment{text: "寬闊", weight: 0.202623},
 	}
 )

 func TestExtractTags(t *testing.T) {
-	et, _ := NewTagExtracter("../dict.txt", "idf.txt")
+	var te TagExtracter
+	te.LoadDictionary("../dict.txt")
+	te.LoadIdf("idf.txt")

 	for index, sentence := range test_contents {
-		result := et.ExtractTags(sentence, 20)
+		result := te.ExtractTags(sentence, 20)
 		if len(result) != len(Tags[index]) {
 			t.Fatalf("%s = %v", sentence, result)
 		}
 		for i, tag := range result {
-			if tag.Word != Tags[index][i] {
+			if tag.text != Tags[index][i] {
 				t.Fatalf("%s != %s", tag, Tags[index][i])
 			}
 		}
@@ -271,23 +273,27 @@ func TestExtractTags(t *testing.T) {
 }

 func TestExtratTagsWithWeight(t *testing.T) {
-	et, _ := NewTagExtracter("../dict.txt", "idf.txt")
-	result := et.ExtractTags(Lyric, 10)
+	var te TagExtracter
+	te.LoadDictionary("../dict.txt")
+	te.LoadIdf("idf.txt")
+	result := te.ExtractTags(Lyric, 10)
 	for index, tag := range result {
-		if LyciWeight[index].Word != tag.Word ||
-			math.Abs(LyciWeight[index].Weight-tag.Weight) > 1e-6 {
+		if LyciWeight[index].text != tag.text ||
+			math.Abs(LyciWeight[index].weight-tag.weight) > 1e-6 {
 			t.Fatalf("%v != %v", tag, LyciWeight[index])
 		}
 	}
 }

 func TestExtractTagsWithStopWordsFile(t *testing.T) {
-	et, _ := NewTagExtracter("../dict.txt", "idf.txt")
-	et.SetStopWords("stop_words.txt")
-	result := et.ExtractTags(Lyric, 7)
+	var te TagExtracter
+	te.LoadDictionary("../dict.txt")
+	te.LoadIdf("idf.txt")
+	te.LoadStopWords("stop_words.txt")
+	result := te.ExtractTags(Lyric, 7)
 	for index, tag := range result {
-		if LyciWeight2[index].Word != tag.Word ||
-			math.Abs(LyciWeight2[index].Weight-tag.Weight) > 1e-6 {
+		if LyciWeight2[index].text != tag.text ||
+			math.Abs(LyciWeight2[index].weight-tag.weight) > 1e-6 {
 			t.Fatalf("%v != %v", tag, LyciWeight2[index])
 		}
 	}
--- a/analyse/idf.go
+++ b/analyse/idf.go
@@ -1,30 +1,50 @@
 package analyse

 import (
-	"github.com/wangbin/jiebago"
 	"sort"
+	"sync"
+
+	"github.com/wangbin/jiebago/dictionary"
 )

-type idf struct {
+type Idf struct {
 	freqMap map[string]float64
 	median  float64
 	freqs   []float64
+	sync.RWMutex
 }

-func (l *IDFLoader) AddEntry(entry jiebago.Entry) {
-	l.IDFFreq[entry.Word] = entry.Freq
-	l.freqs = append(l.freqs, entry.Freq)
+func (i *Idf) AddToken(token dictionary.Token) {
+	i.Lock()
+	i.freqMap[token.Text()] = token.Frequency()
+	i.freqs = append(i.freqs, token.Frequency())
+	sort.Float64s(i.freqs)
+	i.median = i.freqs[len(i.freqs)/2]
+	i.Unlock()
 }

-func NewIDFLoader(IDFFileName string) (*IDFLoader, error) {
-	loader := &IDFLoader{make(map[string]float64), 0.0, make([]float64, 0)}
-	err := jiebago.LoadDict(loader, IDFFileName, false)
-	if err != nil {
-		return nil, err
+func (i *Idf) Load(ch <-chan dictionary.Token) {
+	i.Lock()
+	for token := range ch {
+		i.freqMap[token.Text()] = token.Frequency()
+		i.freqs = append(i.freqs, token.Frequency())
 	}
-
-	sort.Float64s(loader.freqs)
-	loader.Median = loader.freqs[len(loader.freqs)/2]
-	loader.freqs = []float64{}
-	return loader, nil
+	sort.Float64s(i.freqs)
+	i.median = i.freqs[len(i.freqs)/2]
+	i.Unlock()
+}
+
+func (i *Idf) loadDictionary(fileName string) error {
+	return dictionary.LoadDictionary(i, fileName)
+}
+
+func (i Idf) Frequency(key string) (float64, bool) {
+	i.RLock()
+	freq, ok := i.freqMap[key]
+	i.RUnlock()
+	return freq, ok
+}
+
+func NewIdf() *Idf {
+	return &Idf{freqMap: make(map[string]float64), freqs: make([]float64, 0)}
 }
--- a/analyse/stopwords.go
+++ b/analyse/stopwords.go
@@ -1,8 +1,12 @@
 package analyse

-import "github.com/wangbin/jiebago"
+import (
+	"sync"

-var defaultStopWords = map[string]int{
+	"github.com/wangbin/jiebago/dictionary"
+)
+
+var DefaultStopWordMap = map[string]int{
 	"the":   1,
 	"of":    1,
 	"is":    1,
@@ -36,27 +40,38 @@ var defaultStopWords = map[string]int{
 	"or":    1,
 }

-type StopWordLoader struct {
-	stopWords map[string]int
+type StopWord struct {
+	stopWordMap map[string]int
+	sync.RWMutex
 }

-func (s *StopWordLoader) AddEntry(entry jiebago.Entry) {
-	s.stopWords[entry.Word] = 1
+func (s *StopWord) AddToken(token dictionary.Token) {
+	s.Lock()
+	s.stopWordMap[token.Text()] = 1
+	s.Unlock()
 }

-func NewStopWordLoader() *StopWordLoader {
-	s := new(StopWordLoader)
-	s.stopWords = defaultStopWords
+func NewStopWord() *StopWord {
+	s := new(StopWord)
+	s.stopWordMap = DefaultStopWordMap
 	return s
 }

-// Set the stop words file path, could be absolute path of stop words file, or
-// file name in current directory.
-func (s *StopWordLoader) SetStopWords(stopWordsFileName string) error {
-	return jiebago.LoadDict(s, stopWordsFileName, false)
-}
-
-func (s StopWordLoader) IsStopWord(word string) bool {
-	_, ok := s.stopWords[word]
+func (s StopWord) IsStopWord(word string) bool {
+	s.RLock()
+	_, ok := s.stopWordMap[word]
+	s.RUnlock()
 	return ok
 }
+
+func (s *StopWord) Load(ch <-chan dictionary.Token) {
+	s.Lock()
+	for token := range ch {
+		s.stopWordMap[token.Text()] = 1
+	}
+	s.Unlock()
+}
+
+func (s *StopWord) loadDictionary(fileName string) error {
+	return dictionary.LoadDictionary(s, fileName)
+}
--- a/analyse/tag_extracker.go
+++ b/analyse/tag_extracker.go
@@ -6,7 +6,7 @@ import (
 	"strings"
 	"unicode/utf8"

-	"github.com/wangbin/jiebago/dictionary"
+	"github.com/wangbin/jiebago"
 )

 type Segment struct {
@@ -14,11 +14,19 @@ type Segment struct {
 	weight float64
 }

+func (s Segment) Text() string {
+	return s.text
+}
+
+func (s Segment) Weight() float64 {
+	return s.weight
+}
+
 func (s Segment) String() string {
 	return fmt.Sprintf("{%s: %f}", s.text, s.weight)
 }

-type Segments []Segments
+type Segments []Segment

 func (ss Segments) Len() int {
 	return len(ss)
@@ -26,7 +34,7 @@ func (ss Segments) Len() int {

 func (ss Segments) Less(i, j int) bool {
 	if ss[i].weight == ss[j].weight {
-		return ss[i].text < ws[j].text
+		return ss[i].text < ss[j].text
 	}

 	return ss[i].weight < ss[j].weight
@@ -37,57 +45,61 @@ func (ss Segments) Swap(i, j int) {
 }

 type TagExtracter struct {
-	seg *jieba.Segmenter
-	i   *idf
-	*StopWordLoader
+	seg      *jiebago.Segmenter
+	idf      *Idf
+	stopWord *StopWord
 }

-func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) {
-	j, err := jiebago.Open(dictFileName)
-	if err != nil {
-		return nil, err
-	}
-	i, err := NewIDFLoader(IDFFileName)
-	if err != nil {
-		return nil, err
-	}
-	return &TagExtracter{j, i, NewStopWordLoader()}, nil
+func (t *TagExtracter) LoadDictionary(fileName string) error {
+	t.stopWord = NewStopWord()
+	t.seg = new(jiebago.Segmenter)
+	return t.seg.LoadDictionary(fileName)
+}
+
+func (t *TagExtracter) LoadIdf(fileName string) error {
+	t.idf = NewIdf()
+	return t.idf.loadDictionary(fileName)
+}
+
+func (t *TagExtracter) LoadStopWords(fileName string) error {
+	t.stopWord = NewStopWord()
+	return t.stopWord.loadDictionary(fileName)
 }

 // Keyword extraction.
-func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags wordWeights) {
-	freq := make(map[string]float64)
+func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
+	freqMap := make(map[string]float64)

-	for w := range t.Cut(sentence, true) {
+	for w := range t.seg.Cut(sentence, true) {
 		w = strings.TrimSpace(w)
 		if utf8.RuneCountInString(w) < 2 {
 			continue
 		}
-		if t.IsStopWord(w) {
+		if t.stopWord.IsStopWord(w) {
 			continue
 		}
-		if f, ok := freq[w]; ok {
-			freq[w] = f + 1.0
+		if f, ok := freqMap[w]; ok {
+			freqMap[w] = f + 1.0
 		} else {
-			freq[w] = 1.0
+			freqMap[w] = 1.0
 		}
 	}
 	total := 0.0
-	for _, f := range freq {
-		total += f
+	for _, freq := range freqMap {
+		total += freq
 	}
-	for k, v := range freq {
-		freq[k] = v / total
+	for k, v := range freqMap {
+		freqMap[k] = v / total
 	}
-	ws := make(wordWeights, 0)
-	for k, v := range freq {
-		var ti wordWeight
-		if freq_, ok := t.IDFFreq[k]; ok {
-			ti = wordWeight{Word: k, Weight: freq_ * v}
+	ws := make(Segments, 0)
+	var s Segment
+	for k, v := range freqMap {
+		if freq, ok := t.idf.Frequency(k); ok {
+			s = Segment{text: k, weight: freq * v}
 		} else {
-			ti = wordWeight{Word: k, Weight: t.Median * v}
+			s = Segment{text: k, weight: t.idf.median * v}
 		}
-		ws = append(ws, ti)
+		ws = append(ws, s)
 	}
 	sort.Sort(sort.Reverse(ws))
 	if len(ws) > topK {
--- a/analyse/textrank.go
+++ b/analyse/textrank.go
@@ -2,9 +2,10 @@ package analyse

 import (
 	"fmt"
-	"github.com/wangbin/jiebago/posseg"
 	"math"
 	"sort"
+
+	"github.com/wangbin/jiebago/posseg"
 )

 const dampingFactor = 0.85
@@ -65,7 +66,7 @@ func (u *undirectWeightedGraph) addEdge(start, end string, weight float64) {
 	}
 }

-func (u *undirectWeightedGraph) rank() wordWeights {
+func (u *undirectWeightedGraph) rank() Segments {
 	if !sort.IsSorted(u.keys) {
 		sort.Sort(u.keys)
 	}
@@ -105,9 +106,9 @@ func (u *undirectWeightedGraph) rank() wordWeights {
 			maxRank = w
 		}
 	}
-	result := make(wordWeights, 0)
+	result := make(Segments, 0)
 	for n, w := range ws {
-		result = append(result, wordWeight{Word: n, Weight: (w - minRank/10.0) / (maxRank - minRank/10.0)})
+		result = append(result, Segment{text: n, weight: (w - minRank/10.0) / (maxRank - minRank/10.0)})
 	}
 	sort.Sort(sort.Reverse(result))
 	return result
@@ -115,7 +116,7 @@ func (u *undirectWeightedGraph) rank() wordWeights {

 // Extract keywords from sentence using TextRank algorithm. The allowed POS list
 // can be manually specified.
-func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
+func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) Segments {
 	posFilt := make(map[string]int)
 	for _, pos := range allowPOS {
 		posFilt[pos] = 1
@@ -123,20 +124,20 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin
 	g := newUndirectWeightedGraph()
 	cm := make(map[[2]string]float64)
 	span := 5
-	pairs := make([]posseg.Pair, 0)
-	for pair := range t.Cut(sentence, true) {
+	pairs := make([]posseg.Segment, 0)
+	for pair := range t.seg.Cut(sentence, true) {
 		pairs = append(pairs, pair)
 	}
-	for i, _ := range pairs {
-		if _, ok := posFilt[pairs[i].Flag]; ok {
+	for i := range pairs {
+		if _, ok := posFilt[pairs[i].Pos()]; ok {
 			for j := i + 1; j < i+span && j <= len(pairs); j++ {
-				if _, ok := posFilt[pairs[j].Flag]; !ok {
+				if _, ok := posFilt[pairs[j].Pos()]; !ok {
 					continue
 				}
-				if _, ok := cm[[2]string{pairs[i].Word, pairs[j].Word}]; !ok {
-					cm[[2]string{pairs[i].Word, pairs[j].Word}] = 1.0
+				if _, ok := cm[[2]string{pairs[i].Text(), pairs[j].Text()}]; !ok {
+					cm[[2]string{pairs[i].Text(), pairs[j].Text()}] = 1.0
 				} else {
-					cm[[2]string{pairs[i].Word, pairs[j].Word}] += 1.0
+					cm[[2]string{pairs[i].Text(), pairs[j].Text()}] += 1.0
 				}
 			}
 		}
@@ -153,21 +154,15 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin

 // Extract keywords from sentence using TextRank algorithm.
 // topK specifies the maximum number of keywords to return.
-func (t *TextRanker) TextRank(sentence string, topK int) wordWeights {
+func (t *TextRanker) TextRank(sentence string, topK int) Segments {
 	return t.TextRankWithPOS(sentence, topK, defaultAllowPOS)
 }

-// Set the dictionary, could be absolute path of dictionary file, or dictionary
-// name in current directory. This function must be called before cut any
-// sentence.
-func NewTextRanker(dictFileName string) (*TextRanker, error) {
-	p, err := posseg.Open(dictFileName)
-	if err != nil {
-		return nil, err
-	}
-	return &TextRanker{p}, nil
+type TextRanker struct {
+	seg *posseg.Segmenter
 }

-type TextRanker struct {
-	*posseg.Posseg
+func (t *TextRanker) LoadDictionary(fileName string) error {
+	t.seg = new(posseg.Segmenter)
+	return t.seg.LoadDictionary(fileName)
 }
--- a/analyse/textrank_test.go
+++ b/analyse/textrank_test.go
@@ -8,25 +8,26 @@ import (
 var (
 	sentence = "此外，公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元，增资后，吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年，实现营业收入0万元，实现净利润-139.13万元。"

-	tagRanks = wordWeights{
-		wordWeight{Word: "吉林", Weight: 1.0},
-		wordWeight{Word: "欧亚", Weight: 0.87807810644},
-		wordWeight{Word: "置业", Weight: 0.562048250306},
-		wordWeight{Word: "实现", Weight: 0.520905743929},
-		wordWeight{Word: "收入", Weight: 0.384283870648},
-		wordWeight{Word: "增资", Weight: 0.360590945312},
-		wordWeight{Word: "子公司", Weight: 0.353131980904},
-		wordWeight{Word: "城市", Weight: 0.307509449283},
-		wordWeight{Word: "全资", Weight: 0.306324426665},
-		wordWeight{Word: "商业", Weight: 0.306138241063},
+	tagRanks = Segments{
+		Segment{text: "吉林", weight: 1.0},
+		Segment{text: "欧亚", weight: 0.87807810644},
+		Segment{text: "置业", weight: 0.562048250306},
+		Segment{text: "实现", weight: 0.520905743929},
+		Segment{text: "收入", weight: 0.384283870648},
+		Segment{text: "增资", weight: 0.360590945312},
+		Segment{text: "子公司", weight: 0.353131980904},
+		Segment{text: "城市", weight: 0.307509449283},
+		Segment{text: "全资", weight: 0.306324426665},
+		Segment{text: "商业", weight: 0.306138241063},
 	}
 )

 func TestTextRank(t *testing.T) {
-	tr, _ := NewTextRanker("../dict.txt")
+	var tr TextRanker
+	tr.LoadDictionary("../dict.txt")
 	results := tr.TextRank(sentence, 10)
 	for index, tw := range results {
-		if tw.Word != tagRanks[index].Word || math.Abs(tw.Weight-tagRanks[index].Weight) > 1e-6 {
+		if tw.text != tagRanks[index].text || math.Abs(tw.weight-tagRanks[index].weight) > 1e-6 {
 			t.Fatalf("%v != %v", tw, tagRanks[index])
 		}
 	}