1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00

code refactor, added more documents

This commit is contained in:
Wang Bin
2015-05-06 12:55:04 +08:00
parent 87caff09cb
commit 122bad0a8d
23 changed files with 228 additions and 142 deletions

View File

@@ -6,7 +6,7 @@ import (
)
var (
test_contents = []string{
testContents = []string{
"这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。",
"我不喜欢日本和服。",
"雷猴回归人间。",
@@ -259,7 +259,7 @@ func TestExtractTags(t *testing.T) {
te.LoadDictionary("../dict.txt")
te.LoadIdf("idf.txt")
for index, sentence := range test_contents {
for index, sentence := range testContents {
result := te.ExtractTags(sentence, 20)
if len(result) != len(Tags[index]) {
t.Fatalf("%s = %v", sentence, result)

View File

@@ -7,6 +7,8 @@ import (
"github.com/wangbin/jiebago/dictionary"
)
// Idf represents a thread-safe dictionary for all words with their
// IDFs (Inverse Document Frequency).
type Idf struct {
freqMap map[string]float64
median float64
@@ -14,6 +16,7 @@ type Idf struct {
sync.RWMutex
}
// AddToken adds a new word with its IDF into the dictionary.
func (i *Idf) AddToken(token dictionary.Token) {
i.Lock()
i.freqMap[token.Text()] = token.Frequency()
@@ -23,6 +26,7 @@ func (i *Idf) AddToken(token dictionary.Token) {
i.Unlock()
}
// Load loads all tokens from the channel into the dictionary.
func (i *Idf) Load(ch <-chan dictionary.Token) {
i.Lock()
for token := range ch {
@@ -38,6 +42,7 @@ func (i *Idf) loadDictionary(fileName string) error {
return dictionary.LoadDictionary(i, fileName)
}
// Frequency returns the IDF of given word.
func (i *Idf) Frequency(key string) (float64, bool) {
i.RLock()
freq, ok := i.freqMap[key]
@@ -45,6 +50,7 @@ func (i *Idf) Frequency(key string) (float64, bool) {
return freq, ok
}
// NewIdf returns an empty Idf whose frequency map and frequency slice are
// already allocated, so tokens can be added to it right away.
func NewIdf() *Idf {
	idf := new(Idf)
	idf.freqMap = make(map[string]float64)
	idf.freqs = []float64{}
	return idf
}

View File

@@ -6,6 +6,7 @@ import (
"github.com/wangbin/jiebago/dictionary"
)
// DefaultStopWordMap contains some stop words.
var DefaultStopWordMap = map[string]int{
"the": 1,
"of": 1,
@@ -40,23 +41,27 @@ var DefaultStopWordMap = map[string]int{
"or": 1,
}
// StopWord is a dictionary of stop words whose methods take the embedded
// read-write mutex before touching the underlying map.
type StopWord struct {
// stopWordMap holds the stop words as keys. AddToken stores 1 as the value
// and IsStopWord only tests for key presence.
stopWordMap map[string]int
// RWMutex guards stopWordMap.
sync.RWMutex
}
// AddToken records the text of token as a stop word. The write lock is held
// while the map is updated.
func (s *StopWord) AddToken(token dictionary.Token) {
	s.Lock()
	defer s.Unlock()
	s.stopWordMap[token.Text()] = 1
}
// NewStopWord creates a new StopWord pre-populated with the default stop
// words.
//
// The entries of DefaultStopWordMap are copied into a map owned by the
// returned StopWord. Assigning the package-level map directly would make
// every StopWord share it: words added to one instance would show up in all
// others and in DefaultStopWordMap itself, and writes guarded by different
// instances' mutexes would race on the same map.
func NewStopWord() *StopWord {
	words := make(map[string]int, len(DefaultStopWordMap))
	for word, flag := range DefaultStopWordMap {
		words[word] = flag
	}
	return &StopWord{stopWordMap: words}
}
// IsStopWord checks if a given word is a stop word.
func (s *StopWord) IsStopWord(word string) bool {
s.RLock()
_, ok := s.stopWordMap[word]
@@ -64,6 +69,7 @@ func (s *StopWord) IsStopWord(word string) bool {
return ok
}
// Load loads all tokens from given channel into StopWord dictionary.
func (s *StopWord) Load(ch <-chan dictionary.Token) {
s.Lock()
for token := range ch {

View File

@@ -1,7 +1,6 @@
package analyse
import (
"fmt"
"sort"
"strings"
"unicode/utf8"
@@ -9,23 +8,23 @@ import (
"github.com/wangbin/jiebago"
)
// Segment pairs a piece of text with a numeric weight.
type Segment struct {
	text   string
	weight float64
}

// Text reports the text stored in the segment.
func (s Segment) Text() string {
	return s.text
}

// Weight reports the weight stored in the segment.
func (s Segment) Weight() float64 {
	return s.weight
}

// String renders the segment as "{text: weight}", with the weight printed
// in fixed-point notation.
func (s Segment) String() string {
	text, weight := s.Text(), s.Weight()
	return fmt.Sprintf("{%s: %f}", text, weight)
}
// Segments represents a slice of Segment. The Len and Swap methods declared
// on it suggest it is meant to be ordered with package sort
// (NOTE(review): Less is not visible in this view; confirm).
type Segments []Segment
func (ss Segments) Len() int {
@@ -44,29 +43,33 @@ func (ss Segments) Swap(i, j int) {
ss[i], ss[j] = ss[j], ss[i]
}
// TagExtracter is used to extract tags from sentence. Its fields are
// populated by the LoadDictionary, LoadIdf and LoadStopWords methods.
type TagExtracter struct {
// seg is the word segmenter, created by LoadDictionary.
seg *jiebago.Segmenter
// idf is the IDF dictionary, created by LoadIdf.
idf *Idf
// stopWord is the stop word dictionary, set by LoadDictionary and
// LoadStopWords.
stopWord *StopWord
}
// LoadDictionary reads the given file and creates a new segmenter dictionary
// from it, replacing any previously loaded one.
//
// The stop word dictionary is initialised with the defaults only when none
// has been set yet, so stop words loaded earlier through LoadStopWords are
// kept regardless of the order in which the Load* methods are called.
//
// It returns the error reported by the segmenter, if any.
func (t *TagExtracter) LoadDictionary(fileName string) error {
	if t.stopWord == nil {
		t.stopWord = NewStopWord()
	}
	t.seg = new(jiebago.Segmenter)
	return t.seg.LoadDictionary(fileName)
}
// LoadIdf reads the given file into a freshly created Idf dictionary, which
// replaces the extracter's current one. The error from loading the file is
// returned.
func (t *TagExtracter) LoadIdf(fileName string) error {
	idf := NewIdf()
	t.idf = idf
	return idf.loadDictionary(fileName)
}
// LoadStopWords reads the given file into a freshly created StopWord
// dictionary, which replaces the extracter's current one. The error from
// loading the file is returned.
func (t *TagExtracter) LoadStopWords(fileName string) error {
	stopWord := NewStopWord()
	t.stopWord = stopWord
	return stopWord.loadDictionary(fileName)
}
// Keyword extraction.
// ExtractTags extracts the topK key words from sentence.
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
freqMap := make(map[string]float64)

View File

@@ -1,7 +1,6 @@
package analyse
import (
"fmt"
"math"
"sort"
@@ -20,10 +19,6 @@ type edge struct {
weight float64
}
// String formats the edge as "(start end): weight".
func (e edge) String() string {
	const layout = "(%s %s): %f"
	return fmt.Sprintf(layout, e.start, e.end, e.weight)
}
type edges []edge
func (es edges) Len() int {
@@ -114,8 +109,8 @@ func (u *undirectWeightedGraph) rank() Segments {
return result
}
// Extract keywords from sentence using TextRank algorithm. The allowed POS list
// could be manually specified.
// TextRankWithPOS extracts keywords from sentence using TextRank algorithm.
// Parameter allowPOS allows a customized pos list.
func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) Segments {
posFilt := make(map[string]int)
for _, pos := range allowPOS {
@@ -124,7 +119,7 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin
g := newUndirectWeightedGraph()
cm := make(map[[2]string]float64)
span := 5
pairs := make([]posseg.Segment, 0)
var pairs []posseg.Segment
for pair := range t.seg.Cut(sentence, true) {
pairs = append(pairs, pair)
}
@@ -152,16 +147,18 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin
return tags
}
// Extract keywords from sentence using TextRank algorithm.
// topK specify how many top keywords to be returned at most.
// TextRank extracts keywords from sentence using the TextRank algorithm,
// passing the package's default allowed POS list to TextRankWithPOS.
// Parameter topK specifies how many top keywords are returned at most.
func (t *TextRanker) TextRank(sentence string, topK int) Segments {
	keywords := t.TextRankWithPOS(sentence, topK, defaultAllowPOS)
	return keywords
}
// TextRanker is used to extract tags from sentence.
type TextRanker struct {
// seg is the posseg segmenter created by LoadDictionary; TextRankWithPOS
// uses it to cut the sentence.
seg *posseg.Segmenter
}
// LoadDictionary reads the given file and creates a new dictionary for TextRanker.
func (t *TextRanker) LoadDictionary(fileName string) error {
t.seg = new(posseg.Segmenter)
return t.seg.LoadDictionary(fileName)