code refactor, added more documents

2026-06-05 00:32:51 +08:00 · 2015-05-06 12:55:04 +08:00
parent 87caff09cb
commit 122bad0a8d
23 changed files with 228 additions and 142 deletions
--- a/analyse/analyse_test.go
+++ b/analyse/analyse_test.go
@@ -6,7 +6,7 @@ import (
 )

 var (
-	test_contents = []string{
+	testContents = []string{
 		"这是一个伸手不见五指的黑夜。我叫孙悟空，我爱北京，我爱Python和C++。",
 		"我不喜欢日本和服。",
 		"雷猴回归人间。",
@@ -259,7 +259,7 @@ func TestExtractTags(t *testing.T) {
 	te.LoadDictionary("../dict.txt")
 	te.LoadIdf("idf.txt")

-	for index, sentence := range test_contents {
+	for index, sentence := range testContents {
 		result := te.ExtractTags(sentence, 20)
 		if len(result) != len(Tags[index]) {
 			t.Fatalf("%s = %v", sentence, result)
--- a/analyse/idf.go
+++ b/analyse/idf.go
@@ -7,6 +7,8 @@ import (
 	"github.com/wangbin/jiebago/dictionary"
 )

+// Idf represents a thread-safe dictionary for all words with their
+// IDFs (Inverse Document Frequency).
 type Idf struct {
 	freqMap map[string]float64
 	median  float64
@@ -14,6 +16,7 @@ type Idf struct {
 	sync.RWMutex
 }

+// AddToken adds a new word and its IDF to the dictionary.
 func (i *Idf) AddToken(token dictionary.Token) {
 	i.Lock()
 	i.freqMap[token.Text()] = token.Frequency()
@@ -23,6 +26,7 @@ func (i *Idf) AddToken(token dictionary.Token) {
 	i.Unlock()
 }

+// Load loads all tokens from the channel into its dictionary.
 func (i *Idf) Load(ch <-chan dictionary.Token) {
 	i.Lock()
 	for token := range ch {
@@ -38,6 +42,7 @@ func (i *Idf) loadDictionary(fileName string) error {
 	return dictionary.LoadDictionary(i, fileName)
 }

+// Frequency returns the IDF of the given word and reports whether the word is in the dictionary.
 func (i *Idf) Frequency(key string) (float64, bool) {
 	i.RLock()
 	freq, ok := i.freqMap[key]
@@ -45,6 +50,7 @@ func (i *Idf) Frequency(key string) (float64, bool) {
 	return freq, ok
 }

+// NewIdf creates a new Idf instance.
 func NewIdf() *Idf {
 	return &Idf{freqMap: make(map[string]float64), freqs: make([]float64, 0)}
 }
--- a/analyse/stopwords.go
+++ b/analyse/stopwords.go
@@ -6,6 +6,7 @@ import (
 	"github.com/wangbin/jiebago/dictionary"
 )

+// DefaultStopWordMap contains some stop words.
 var DefaultStopWordMap = map[string]int{
 	"the":   1,
 	"of":    1,
@@ -40,23 +41,27 @@ var DefaultStopWordMap = map[string]int{
 	"or":    1,
 }

+// StopWord is a thread-safe dictionary for all stop words.
 type StopWord struct {
 	stopWordMap map[string]int
 	sync.RWMutex
 }

+// AddToken adds a token to the StopWord dictionary.
 func (s *StopWord) AddToken(token dictionary.Token) {
 	s.Lock()
 	s.stopWordMap[token.Text()] = 1
 	s.Unlock()
 }

+// NewStopWord creates a new StopWord backed by DefaultStopWordMap itself (the map is shared, not copied).
 func NewStopWord() *StopWord {
 	s := new(StopWord)
 	s.stopWordMap = DefaultStopWordMap
 	return s
 }

+// IsStopWord checks whether the given word is a stop word.
 func (s *StopWord) IsStopWord(word string) bool {
 	s.RLock()
 	_, ok := s.stopWordMap[word]
@@ -64,6 +69,7 @@ func (s *StopWord) IsStopWord(word string) bool {
 	return ok
 }

+// Load loads all tokens from the given channel into the StopWord dictionary.
 func (s *StopWord) Load(ch <-chan dictionary.Token) {
 	s.Lock()
 	for token := range ch {
--- a/analyse/tag_extracker.go
+++ b/analyse/tag_extracker.go
@@ -1,7 +1,6 @@
 package analyse

 import (
-	"fmt"
 	"sort"
 	"strings"
 	"unicode/utf8"
@@ -9,23 +8,23 @@ import (
 	"github.com/wangbin/jiebago"
 )

+// Segment represents a word with weight.
 type Segment struct {
 	text   string
 	weight float64
 }

+// Text returns the segment's text.
 func (s Segment) Text() string {
 	return s.text
 }

+// Weight returns the segment's weight.
 func (s Segment) Weight() float64 {
 	return s.weight
 }

-func (s Segment) String() string {
-	return fmt.Sprintf("{%s: %f}", s.text, s.weight)
-}
-
+// Segments represents a slice of Segment.
 type Segments []Segment

 func (ss Segments) Len() int {
@@ -44,29 +43,33 @@ func (ss Segments) Swap(i, j int) {
 	ss[i], ss[j] = ss[j], ss[i]
 }

+// TagExtracter is used to extract tags from a sentence.
 type TagExtracter struct {
 	seg      *jiebago.Segmenter
 	idf      *Idf
 	stopWord *StopWord
 }

+// LoadDictionary reads the given file and creates a new dictionary.
 func (t *TagExtracter) LoadDictionary(fileName string) error {
 	t.stopWord = NewStopWord()
 	t.seg = new(jiebago.Segmenter)
 	return t.seg.LoadDictionary(fileName)
 }

+// LoadIdf reads the given file and creates a new Idf dictionary.
 func (t *TagExtracter) LoadIdf(fileName string) error {
 	t.idf = NewIdf()
 	return t.idf.loadDictionary(fileName)
 }

+// LoadStopWords reads the given file and creates a new StopWord dictionary.
 func (t *TagExtracter) LoadStopWords(fileName string) error {
 	t.stopWord = NewStopWord()
 	return t.stopWord.loadDictionary(fileName)
 }

-// Keyword extraction.
+// ExtractTags extracts the topK keywords from the sentence.
 func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
 	freqMap := make(map[string]float64)

--- a/analyse/textrank.go
+++ b/analyse/textrank.go
@@ -1,7 +1,6 @@
 package analyse

 import (
-	"fmt"
 	"math"
 	"sort"

@@ -20,10 +19,6 @@ type edge struct {
 	weight float64
 }

-func (e edge) String() string {
-	return fmt.Sprintf("(%s %s): %f", e.start, e.end, e.weight)
-}
-
 type edges []edge

 func (es edges) Len() int {
@@ -114,8 +109,8 @@ func (u *undirectWeightedGraph) rank() Segments {
 	return result
 }

-// Extract keywords from sentence using TextRank algorithm. the allowed POS list
-// could be manually speificed.
+// TextRankWithPOS extracts keywords from sentence using TextRank algorithm.
+// Parameter allowPOS specifies a custom list of allowed POS tags.
 func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) Segments {
 	posFilt := make(map[string]int)
 	for _, pos := range allowPOS {
@@ -124,7 +119,7 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin
 	g := newUndirectWeightedGraph()
 	cm := make(map[[2]string]float64)
 	span := 5
-	pairs := make([]posseg.Segment, 0)
+	var pairs []posseg.Segment
 	for pair := range t.seg.Cut(sentence, true) {
 		pairs = append(pairs, pair)
 	}
@@ -152,16 +147,18 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin
 	return tags
 }

-// Extract keywords from sentence using TextRank algorithm.
-// topK specify how many top keywords to be returned at most.
+// TextRank extracts keywords from sentence using the TextRank algorithm.
+// Parameter topK specifies how many top keywords to be returned at most.
 func (t *TextRanker) TextRank(sentence string, topK int) Segments {
 	return t.TextRankWithPOS(sentence, topK, defaultAllowPOS)
 }

+// TextRanker is used to extract tags from a sentence.
 type TextRanker struct {
 	seg *posseg.Segmenter
 }

+// LoadDictionary reads the given file and creates a new dictionary for TextRanker.
 func (t *TextRanker) LoadDictionary(fileName string) error {
 	t.seg = new(posseg.Segmenter)
 	return t.seg.LoadDictionary(fileName)