Finished the OOP refactor

2026-07-18 11:39:57 +08:00 · 2015-03-24 18:34:07 +08:00
parent 73d87e4ed6
commit 1c378c28a7
7 changed files with 116 additions and 127 deletions
--- a/analyse/analyse.go
+++ b/analyse/analyse.go
@@ -35,16 +35,49 @@ func (ws wordWeights) Swap(i, j int) {
 	ws[i], ws[j] = ws[j], ws[i]
 }

+type TagExtracter struct {
+	*jiebago.Jieba           // segmenter; ExtractTags calls its Cut
+	*IDFLoader               // supplies IDFFreq and Median for weighting
+	stopWords map[string]int // words ExtractTags skips; SetStopWords adds to it
+}
+
+func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) {
+	j, err := jiebago.NewJieba(dictFileName)
+	if err != nil {
+		return nil, err
+	}
+	i, err := NewIDFLoader(IDFFileName)
+	if err != nil {
+		return nil, err
+	}
+	return &TagExtracter{j, i, StopWords}, nil // NOTE(review): StopWords is shared, not copied; SetStopWords writes into it
+}
+
+// SetStopWords adds the words in a stop words file to this extractor's set.
+// stopWordsFileName could be an absolute path or a file name in current directory.
+func (t *TagExtracter) SetStopWords(stopWordsFileName string) error {
+	stopWordsFilePath, err := jiebago.DictPath(stopWordsFileName)
+	if err != nil {
+		return err
+	}
+	// Report the parse error instead of dropping it; entries returned with it are still added.
+	wtfs, err := jiebago.ParseDictFile(stopWordsFilePath)
+	for _, wtf := range wtfs {
+		t.stopWords[wtf.Word] = 1
+	}
+	return err
+}
+
 // Keyword extraction.
-func ExtractTags(sentence string, topK int) (tags wordWeights) {
+func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags wordWeights) {
 	freq := make(map[string]float64)

-	for w := range jiebago.Cut(sentence, false, true) {
+	for w := range t.Cut(sentence, false, true) {
 		w = strings.TrimSpace(w)
 		if utf8.RuneCountInString(w) < 2 {
 			continue
 		}
-		if _, ok := stopWords[w]; ok {
+		if _, ok := t.stopWords[w]; ok {
 			continue
 		}
 		if f, ok := freq[w]; ok {
@@ -63,10 +96,10 @@ func ExtractTags(sentence string, topK int) (tags wordWeights) {
 	ws := make(wordWeights, 0)
 	for k, v := range freq {
 		var ti wordWeight
-		if freq_, ok := loader.Freq[k]; ok {
+		if freq_, ok := t.IDFFreq[k]; ok {
 			ti = wordWeight{Word: k, Weight: freq_ * v}
 		} else {
-			ti = wordWeight{Word: k, Weight: loader.Median * v}
+			ti = wordWeight{Word: k, Weight: t.Median * v}
 		}
 		ws = append(ws, ti)
 	}
--- a/analyse/analyse_test.go
+++ b/analyse/analyse_test.go
@@ -1,7 +1,6 @@
 package analyse

 import (
-	"github.com/wangbin/jiebago"
 	"math"
 	"testing"
 )
@@ -256,11 +255,10 @@ var (
 )

 func TestExtractTags(t *testing.T) {
-	jiebago.SetDictionary("../dict.txt")
-	SetIdf("idf.txt")
+	et, _ := NewTagExtracter("../dict.txt", "idf.txt")

 	for index, sentence := range test_contents {
-		result := ExtractTags(sentence, 20)
+		result := et.ExtractTags(sentence, 20)
 		if len(result) != len(Tags[index]) {
 			t.Errorf("%s = %v", sentence, result)
 		}
@@ -273,9 +271,8 @@ func TestExtractTags(t *testing.T) {
 }

 func TestExtratTagsWithWeight(t *testing.T) {
-	jiebago.SetDictionary("../dict.txt")
-	SetIdf("idf.txt")
-	result := ExtractTags(Lyric, 10)
+	et, _ := NewTagExtracter("../dict.txt", "idf.txt")
+	result := et.ExtractTags(Lyric, 10)
 	for index, tag := range result {
 		if LyciWeight[index].Word != tag.Word ||
 			math.Abs(LyciWeight[index].Weight-tag.Weight) > 1e-6 {
@@ -285,10 +282,9 @@ func TestExtratTagsWithWeight(t *testing.T) {
 }

 func TestExtractTagsWithStopWordsFile(t *testing.T) {
-	jiebago.SetDictionary("../dict.txt")
-	SetIdf("idf.txt")
-	SetStopWords("stop_words.txt")
-	result := ExtractTags(Lyric, 7)
+	et, _ := NewTagExtracter("../dict.txt", "idf.txt")
+	et.SetStopWords("stop_words.txt")
+	result := et.ExtractTags(Lyric, 7)
 	for index, tag := range result {
 		if LyciWeight2[index].Word != tag.Word ||
 			math.Abs(LyciWeight2[index].Weight-tag.Weight) > 1e-6 {
--- a/analyse/idf.go
+++ b/analyse/idf.go
@@ -5,53 +5,28 @@ import (
 	"sort"
 )

-var (
-	loader *idfLoader
-)
-
-func init() {
-	loader = newIDFLoader()
+type IDFLoader struct {
+	IDFFreq map[string]float64 // word -> Freq value parsed from the IDF file
+	Median  float64            // middle element of the sorted Freq values
 }

-type idfLoader struct {
-	Path   string
-	Freq   map[string]float64
-	Median float64
-}
-
-func newIDFLoader() *idfLoader {
-	loader := new(idfLoader)
-	loader.Freq = make(map[string]float64)
-	return loader
-}
-
-func (loader *idfLoader) newPath(idfFilePath string) error {
-	if loader.Path == idfFilePath {
-		return nil
-	}
-	wtfs, err := jiebago.ParseDictFile(idfFilePath)
+func NewIDFLoader(IDFFileName string) (*IDFLoader, error) {
+	IDFFilePath, err := jiebago.DictPath(IDFFileName)
 	if err != nil {
-		return err
+		return nil, err
+	}
+	wtfs, err := jiebago.ParseDictFile(IDFFilePath)
+	if err != nil {
+		return nil, err
 	}

-	freqs := make([]float64, 0)
-
-	for _, wtf := range wtfs {
-		loader.Freq[wtf.Word] = wtf.Freq
-		freqs = append(freqs, wtf.Freq)
+	freqs := make([]float64, len(wtfs)) // NOTE(review): empty when wtfs is empty, so freqs[len(freqs)/2] below indexes out of range
+	loader := &IDFLoader{make(map[string]float64), 0.0}
+	for index, wtf := range wtfs {
+		loader.IDFFreq[wtf.Word] = wtf.Freq
+		freqs[index] = wtf.Freq
 	}
-
 	sort.Float64s(freqs)
 	loader.Median = freqs[len(freqs)/2]
-	return nil
-}
-
-// Set the IDF file path, could be absolute path of IDF file, or IDF file
-// name in current directory.
-func SetIdf(idfFileName string) error {
-	idfFilePath, err := jiebago.DictPath(idfFileName)
-	if err != nil {
-		return err
-	}
-	return loader.newPath(idfFilePath)
+	return loader, nil
 }
--- a/analyse/stopwords.go
+++ b/analyse/stopwords.go
@@ -1,58 +1,35 @@
 package analyse

-import (
-	"github.com/wangbin/jiebago"
-)
-
-var stopWords map[string]int
-
-func init() {
-	stopWords = map[string]int{
-		"the":   1,
-		"of":    1,
-		"is":    1,
-		"and":   1,
-		"to":    1,
-		"in":    1,
-		"that":  1,
-		"we":    1,
-		"for":   1,
-		"an":    1,
-		"are":   1,
-		"by":    1,
-		"be":    1,
-		"as":    1,
-		"on":    1,
-		"with":  1,
-		"can":   1,
-		"if":    1,
-		"from":  1,
-		"which": 1,
-		"you":   1,
-		"it":    1,
-		"this":  1,
-		"then":  1,
-		"at":    1,
-		"have":  1,
-		"all":   1,
-		"not":   1,
-		"one":   1,
-		"has":   1,
-		"or":    1,
-	}
-}
-
-// Set the stop words file path, could be absolute path of stop words file, or
-// file name in current directory.
-func SetStopWords(stopWordsFileName string) error {
-	stopWordsFilePath, err := jiebago.DictPath(stopWordsFileName)
-	if err != nil {
-		return err
-	}
-
-	wtfs, err := jiebago.ParseDictFile(stopWordsFilePath)
-	for _, wtf := range wtfs {
-		stopWords[wtf.Word] = 1
-	}
-	return nil
+var StopWords = map[string]int{ // default stop words; NewTagExtracter stores this map in each extractor
+	"the":   1,
+	"of":    1,
+	"is":    1,
+	"and":   1,
+	"to":    1,
+	"in":    1,
+	"that":  1,
+	"we":    1,
+	"for":   1,
+	"an":    1,
+	"are":   1,
+	"by":    1,
+	"be":    1,
+	"as":    1,
+	"on":    1,
+	"with":  1,
+	"can":   1,
+	"if":    1,
+	"from":  1,
+	"which": 1,
+	"you":   1,
+	"it":    1,
+	"this":  1,
+	"then":  1,
+	"at":    1,
+	"have":  1,
+	"all":   1,
+	"not":   1,
+	"one":   1,
+	"has":   1,
+	"or":    1,
 }
--- a/analyse/textrank.go
+++ b/analyse/textrank.go
@@ -115,7 +115,7 @@ func (u *undirectWeightedGraph) rank() wordWeights {

 // Extract keywords from sentence using TextRank algorithm. The allowed POS list
 // could be manually specified.
-func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
+func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
 	posFilt := make(map[string]int)
 	for _, pos := range allowPOS {
 		posFilt[pos] = 1
@@ -124,7 +124,7 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
 	cm := make(map[[2]string]float64)
 	span := 5
 	wordTags := make([]posseg.WordTag, 0)
-	for wordTag := range posseg.Cut(sentence, true) {
+	for wordTag := range t.Cut(sentence, true) {
 		wordTags = append(wordTags, wordTag)
 	}
 	for i, _ := range wordTags {
@@ -156,13 +156,21 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {

 // Extract keywords from sentence using TextRank algorithm.
 // topK specify how many top keywords to be returned at most.
-func TextRank(sentence string, topK int) wordWeights {
-	return TextRankWithPOS(sentence, topK, defaultAllowPOS)
+func (t *TextRanker) TextRank(sentence string, topK int) wordWeights {
+	return t.TextRankWithPOS(sentence, topK, defaultAllowPOS)
 }

 // NewTextRanker creates a TextRanker from a dictionary, which could be the
 // absolute path of a dictionary file or a dictionary name in the current
 // directory. It returns the error from posseg.NewPosseg if there is one.
-func SetDictionary(dictFileName string) error {
-	return posseg.SetDictionary(dictFileName)
+func NewTextRanker(dictFileName string) (*TextRanker, error) {
+	p, err := posseg.NewPosseg(dictFileName)
+	if err != nil {
+		return nil, err
+	}
+	return &TextRanker{p}, nil
+}
+
+type TextRanker struct {
+	*posseg.Posseg // segmenter; TextRankWithPOS calls its Cut
 }
--- a/analyse/textrank_test.go
+++ b/analyse/textrank_test.go
@@ -23,8 +23,8 @@ var (
 )

 func TestTextRank(t *testing.T) {
-	SetDictionary("../dict.txt")
-	results := TextRank(sentence, 10)
+	tr, _ := NewTextRanker("../dict.txt")
+	results := tr.TextRank(sentence, 10)
 	for index, tw := range results {
 		if tw.Word != tagRanks[index].Word || math.Abs(tw.Weight-tagRanks[index].Weight) > 1e-6 {
 			t.Errorf("%v != %v", tw, tagRanks[index])
--- a/tokenizers/jieba.go
+++ b/tokenizers/jieba.go
@@ -14,16 +14,16 @@ const Name = "jieba"
 var IdeographRegexp = regexp.MustCompile(`\p{Han}+`)

 type JiebaTokenizer struct {
-	dictFileName    string
+	j               *jiebago.Jieba // segmenter used by Tokenize
 	hmm, searchMode bool
 }

 func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) {
-	err := jiebago.SetDictionary(dictFileName)
+	j, err := jiebago.NewJieba(dictFileName) // NOTE(review): a non-nil tokenizer is returned even when err != nil
 	return &JiebaTokenizer{
-		dictFileName: dictFileName,
-		hmm:          hmm,
-		searchMode:   searchMode,
+		j:          j,
+		hmm:        hmm,
+		searchMode: searchMode,
 	}, err
 }

@@ -35,7 +35,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
 	pos := 1
 	var width int
 	var gram string
-	for word := range jiebago.Cut(string(input), false, jt.hmm) {
+	for word := range jt.j.Cut(string(input), false, jt.hmm) {
 		if jt.searchMode {
 			runes := []rune(word)
 			width = len(runes)
@@ -44,7 +44,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
 					for i := 0; i < width-step+1; i++ {
 						gram = string(runes[i : i+step])
 						gramLen := len(gram)
-						if value, ok := jiebago.Trie.Freq[gram]; ok && value > 0 {
+						if value, ok := jt.j.Freq[gram]; ok && value > 0 {
 							gramStart := start + len(string(runes[:i]))
 							token := analysis.Token{
 								Term:     []byte(gram),