mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 00:32:51 +08:00
code refactor, added more documents
This commit is contained in:
@@ -6,7 +6,7 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
test_contents = []string{
|
||||
testContents = []string{
|
||||
"这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
|
||||
"我不喜欢日本和服。",
|
||||
"雷猴回归人间。",
|
||||
@@ -259,7 +259,7 @@ func TestExtractTags(t *testing.T) {
|
||||
te.LoadDictionary("../dict.txt")
|
||||
te.LoadIdf("idf.txt")
|
||||
|
||||
for index, sentence := range test_contents {
|
||||
for index, sentence := range testContents {
|
||||
result := te.ExtractTags(sentence, 20)
|
||||
if len(result) != len(Tags[index]) {
|
||||
t.Fatalf("%s = %v", sentence, result)
|
||||
|
||||
@@ -7,6 +7,8 @@ import (
|
||||
"github.com/wangbin/jiebago/dictionary"
|
||||
)
|
||||
|
||||
// Idf represents a thread-safe dictionary for all words with their
|
||||
// IDFs(Inverse Document Frequency).
|
||||
type Idf struct {
|
||||
freqMap map[string]float64
|
||||
median float64
|
||||
@@ -14,6 +16,7 @@ type Idf struct {
|
||||
sync.RWMutex
|
||||
}
|
||||
|
||||
// AddToken adds a new word with IDF into it's dictionary.
|
||||
func (i *Idf) AddToken(token dictionary.Token) {
|
||||
i.Lock()
|
||||
i.freqMap[token.Text()] = token.Frequency()
|
||||
@@ -23,6 +26,7 @@ func (i *Idf) AddToken(token dictionary.Token) {
|
||||
i.Unlock()
|
||||
}
|
||||
|
||||
// Load loads all tokens from channel into it's dictionary.
|
||||
func (i *Idf) Load(ch <-chan dictionary.Token) {
|
||||
i.Lock()
|
||||
for token := range ch {
|
||||
@@ -38,6 +42,7 @@ func (i *Idf) loadDictionary(fileName string) error {
|
||||
return dictionary.LoadDictionary(i, fileName)
|
||||
}
|
||||
|
||||
// Frequency returns the IDF of given word.
|
||||
func (i *Idf) Frequency(key string) (float64, bool) {
|
||||
i.RLock()
|
||||
freq, ok := i.freqMap[key]
|
||||
@@ -45,6 +50,7 @@ func (i *Idf) Frequency(key string) (float64, bool) {
|
||||
return freq, ok
|
||||
}
|
||||
|
||||
// NewIdf creates a new Idf instance.
|
||||
func NewIdf() *Idf {
|
||||
return &Idf{freqMap: make(map[string]float64), freqs: make([]float64, 0)}
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"github.com/wangbin/jiebago/dictionary"
|
||||
)
|
||||
|
||||
// DefaultStopWordMap contains some stop words.
|
||||
var DefaultStopWordMap = map[string]int{
|
||||
"the": 1,
|
||||
"of": 1,
|
||||
@@ -40,23 +41,27 @@ var DefaultStopWordMap = map[string]int{
|
||||
"or": 1,
|
||||
}
|
||||
|
||||
// StopWord is a thread-safe dictionary for all stop words.
|
||||
type StopWord struct {
|
||||
stopWordMap map[string]int
|
||||
sync.RWMutex
|
||||
}
|
||||
|
||||
// AddToken adds a token into StopWord dictionary.
|
||||
func (s *StopWord) AddToken(token dictionary.Token) {
|
||||
s.Lock()
|
||||
s.stopWordMap[token.Text()] = 1
|
||||
s.Unlock()
|
||||
}
|
||||
|
||||
// NewStopWord create a new StopWord with default stop words.
|
||||
func NewStopWord() *StopWord {
|
||||
s := new(StopWord)
|
||||
s.stopWordMap = DefaultStopWordMap
|
||||
return s
|
||||
}
|
||||
|
||||
// IsStopWord checks if a given word is stop word.
|
||||
func (s *StopWord) IsStopWord(word string) bool {
|
||||
s.RLock()
|
||||
_, ok := s.stopWordMap[word]
|
||||
@@ -64,6 +69,7 @@ func (s *StopWord) IsStopWord(word string) bool {
|
||||
return ok
|
||||
}
|
||||
|
||||
// Load loads all tokens from given channel into StopWord dictionary.
|
||||
func (s *StopWord) Load(ch <-chan dictionary.Token) {
|
||||
s.Lock()
|
||||
for token := range ch {
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
package analyse
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
@@ -9,23 +8,23 @@ import (
|
||||
"github.com/wangbin/jiebago"
|
||||
)
|
||||
|
||||
// Segment represents a word with weight.
|
||||
type Segment struct {
|
||||
text string
|
||||
weight float64
|
||||
}
|
||||
|
||||
// Text returns the segment's text.
|
||||
func (s Segment) Text() string {
|
||||
return s.text
|
||||
}
|
||||
|
||||
// Weight returns the segment's weight.
|
||||
func (s Segment) Weight() float64 {
|
||||
return s.weight
|
||||
}
|
||||
|
||||
func (s Segment) String() string {
|
||||
return fmt.Sprintf("{%s: %f}", s.text, s.weight)
|
||||
}
|
||||
|
||||
// Segments represents a slice of Segment.
|
||||
type Segments []Segment
|
||||
|
||||
func (ss Segments) Len() int {
|
||||
@@ -44,29 +43,33 @@ func (ss Segments) Swap(i, j int) {
|
||||
ss[i], ss[j] = ss[j], ss[i]
|
||||
}
|
||||
|
||||
// TagExtracter is used to extract tags from sentence.
|
||||
type TagExtracter struct {
|
||||
seg *jiebago.Segmenter
|
||||
idf *Idf
|
||||
stopWord *StopWord
|
||||
}
|
||||
|
||||
// LoadDictionary reads the given filename and create a new dictionary.
|
||||
func (t *TagExtracter) LoadDictionary(fileName string) error {
|
||||
t.stopWord = NewStopWord()
|
||||
t.seg = new(jiebago.Segmenter)
|
||||
return t.seg.LoadDictionary(fileName)
|
||||
}
|
||||
|
||||
// LoadIdf reads the given file and create a new Idf dictionary.
|
||||
func (t *TagExtracter) LoadIdf(fileName string) error {
|
||||
t.idf = NewIdf()
|
||||
return t.idf.loadDictionary(fileName)
|
||||
}
|
||||
|
||||
// LoadStopWords reads the given file and create a new StopWord dictionary.
|
||||
func (t *TagExtracter) LoadStopWords(fileName string) error {
|
||||
t.stopWord = NewStopWord()
|
||||
return t.stopWord.loadDictionary(fileName)
|
||||
}
|
||||
|
||||
// Keyword extraction.
|
||||
// ExtractTags extracts the topK key words from sentence.
|
||||
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
|
||||
freqMap := make(map[string]float64)
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
package analyse
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"sort"
|
||||
|
||||
@@ -20,10 +19,6 @@ type edge struct {
|
||||
weight float64
|
||||
}
|
||||
|
||||
func (e edge) String() string {
|
||||
return fmt.Sprintf("(%s %s): %f", e.start, e.end, e.weight)
|
||||
}
|
||||
|
||||
type edges []edge
|
||||
|
||||
func (es edges) Len() int {
|
||||
@@ -114,8 +109,8 @@ func (u *undirectWeightedGraph) rank() Segments {
|
||||
return result
|
||||
}
|
||||
|
||||
// Extract keywords from sentence using TextRank algorithm. the allowed POS list
|
||||
// could be manually speificed.
|
||||
// TextRankWithPOS extracts keywords from sentence using TextRank algorithm.
|
||||
// Parameter allowPOS allows a customized pos list.
|
||||
func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) Segments {
|
||||
posFilt := make(map[string]int)
|
||||
for _, pos := range allowPOS {
|
||||
@@ -124,7 +119,7 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin
|
||||
g := newUndirectWeightedGraph()
|
||||
cm := make(map[[2]string]float64)
|
||||
span := 5
|
||||
pairs := make([]posseg.Segment, 0)
|
||||
var pairs []posseg.Segment
|
||||
for pair := range t.seg.Cut(sentence, true) {
|
||||
pairs = append(pairs, pair)
|
||||
}
|
||||
@@ -152,16 +147,18 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin
|
||||
return tags
|
||||
}
|
||||
|
||||
// Extract keywords from sentence using TextRank algorithm.
|
||||
// topK specify how many top keywords to be returned at most.
|
||||
// TextRank extract keywords from sentence using TextRank algorithm.
|
||||
// Parameter topK specify how many top keywords to be returned at most.
|
||||
func (t *TextRanker) TextRank(sentence string, topK int) Segments {
|
||||
return t.TextRankWithPOS(sentence, topK, defaultAllowPOS)
|
||||
}
|
||||
|
||||
// TextRanker is used to extract tags from sentence.
|
||||
type TextRanker struct {
|
||||
seg *posseg.Segmenter
|
||||
}
|
||||
|
||||
// LoadDictionary reads a given file and create a new dictionary file for Textranker.
|
||||
func (t *TextRanker) LoadDictionary(fileName string) error {
|
||||
t.seg = new(posseg.Segmenter)
|
||||
return t.seg.LoadDictionary(fileName)
|
||||
|
||||
Reference in New Issue
Block a user