1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-07 17:50:26 +08:00

finished all OOP refactor

This commit is contained in:
Wang Bin
2015-03-24 18:34:07 +08:00
parent 73d87e4ed6
commit 1c378c28a7
7 changed files with 116 additions and 127 deletions

View File

@@ -35,16 +35,49 @@ func (ws wordWeights) Swap(i, j int) {
ws[i], ws[j] = ws[j], ws[i]
}
// TagExtracter extracts keywords from a sentence. It bundles a segmenter, an
// IDF table and a stop-word set so that each extracter carries its own state.
type TagExtracter struct {
// Embedded segmenter; ExtractTags calls its Cut method to split the sentence.
*jiebago.Jieba
// Embedded IDF table; supplies IDFFreq and Median used to weight each word.
*IDFLoader
// Words to skip during extraction. Only key presence is checked.
stopWords map[string]int
}
// NewTagExtracter builds a TagExtracter from a segmentation dictionary and an
// IDF table.
//
// dictFileName is passed to jiebago.NewJieba and IDFFileName to NewIDFLoader.
// If either fails, its error is returned together with a nil extracter.
//
// The extracter starts with a private copy of the default StopWords, so stop
// words added to one extracter later never leak into the package-level
// defaults or into other extracters.
func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) {
	j, err := jiebago.NewJieba(dictFileName)
	if err != nil {
		return nil, err
	}
	i, err := NewIDFLoader(IDFFileName)
	if err != nil {
		return nil, err
	}
	// Maps are reference types: storing StopWords directly would make every
	// extracter share (and mutate) the same global map.
	stopWords := make(map[string]int, len(StopWords))
	for word, flag := range StopWords {
		stopWords[word] = flag
	}
	return &TagExtracter{Jieba: j, IDFLoader: i, stopWords: stopWords}, nil
}
// SetStopWords adds every word listed in the given stop words file to this
// extracter's stop-word set. Existing stop words are kept.
//
// stopWordsFileName could be the absolute path of the stop words file, or a
// file name in the current directory; it is resolved with jiebago.DictPath.
//
// An error is returned when the path cannot be resolved or the file cannot be
// parsed. In both cases the stop-word set is left unchanged.
func (t *TagExtracter) SetStopWords(stopWordsFileName string) error {
	stopWordsFilePath, err := jiebago.DictPath(stopWordsFileName)
	if err != nil {
		return err
	}
	wtfs, err := jiebago.ParseDictFile(stopWordsFilePath)
	if err != nil {
		// Previously this error was dropped and the call reported success.
		return err
	}
	// Writing to a nil map panics; allocate on first use so an extracter that
	// was not built by NewTagExtracter still works.
	if t.stopWords == nil {
		t.stopWords = make(map[string]int, len(wtfs))
	}
	for _, wtf := range wtfs {
		t.stopWords[wtf.Word] = 1
	}
	return nil
}
// Keyword extraction.
func ExtractTags(sentence string, topK int) (tags wordWeights) {
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags wordWeights) {
freq := make(map[string]float64)
for w := range jiebago.Cut(sentence, false, true) {
for w := range t.Cut(sentence, false, true) {
w = strings.TrimSpace(w)
if utf8.RuneCountInString(w) < 2 {
continue
}
if _, ok := stopWords[w]; ok {
if _, ok := t.stopWords[w]; ok {
continue
}
if f, ok := freq[w]; ok {
@@ -63,10 +96,10 @@ func ExtractTags(sentence string, topK int) (tags wordWeights) {
ws := make(wordWeights, 0)
for k, v := range freq {
var ti wordWeight
if freq_, ok := loader.Freq[k]; ok {
if freq_, ok := t.IDFFreq[k]; ok {
ti = wordWeight{Word: k, Weight: freq_ * v}
} else {
ti = wordWeight{Word: k, Weight: loader.Median * v}
ti = wordWeight{Word: k, Weight: t.Median * v}
}
ws = append(ws, ti)
}

View File

@@ -1,7 +1,6 @@
package analyse
import (
"github.com/wangbin/jiebago"
"math"
"testing"
)
@@ -256,11 +255,10 @@ var (
)
func TestExtractTags(t *testing.T) {
jiebago.SetDictionary("../dict.txt")
SetIdf("idf.txt")
et, _ := NewTagExtracter("../dict.txt", "idf.txt")
for index, sentence := range test_contents {
result := ExtractTags(sentence, 20)
result := et.ExtractTags(sentence, 20)
if len(result) != len(Tags[index]) {
t.Errorf("%s = %v", sentence, result)
}
@@ -273,9 +271,8 @@ func TestExtractTags(t *testing.T) {
}
func TestExtratTagsWithWeight(t *testing.T) {
jiebago.SetDictionary("../dict.txt")
SetIdf("idf.txt")
result := ExtractTags(Lyric, 10)
et, _ := NewTagExtracter("../dict.txt", "idf.txt")
result := et.ExtractTags(Lyric, 10)
for index, tag := range result {
if LyciWeight[index].Word != tag.Word ||
math.Abs(LyciWeight[index].Weight-tag.Weight) > 1e-6 {
@@ -285,10 +282,9 @@ func TestExtratTagsWithWeight(t *testing.T) {
}
func TestExtractTagsWithStopWordsFile(t *testing.T) {
jiebago.SetDictionary("../dict.txt")
SetIdf("idf.txt")
SetStopWords("stop_words.txt")
result := ExtractTags(Lyric, 7)
et, _ := NewTagExtracter("../dict.txt", "idf.txt")
et.SetStopWords("stop_words.txt")
result := et.ExtractTags(Lyric, 7)
for index, tag := range result {
if LyciWeight2[index].Word != tag.Word ||
math.Abs(LyciWeight2[index].Weight-tag.Weight) > 1e-6 {

View File

@@ -5,53 +5,28 @@ import (
"sort"
)
var (
loader *idfLoader
)
func init() {
loader = newIDFLoader()
// IDFLoader holds the contents of a parsed IDF file.
type IDFLoader struct {
// IDFFreq maps each word in the IDF file to the value parsed for it.
IDFFreq map[string]float64
// Median is the middle element of the sorted values; keyword extraction
// uses it as the weight factor for words missing from IDFFreq.
Median float64
}
type idfLoader struct {
Path string
Freq map[string]float64
Median float64
}
func newIDFLoader() *idfLoader {
loader := new(idfLoader)
loader.Freq = make(map[string]float64)
return loader
}
func (loader *idfLoader) newPath(idfFilePath string) error {
if loader.Path == idfFilePath {
return nil
}
wtfs, err := jiebago.ParseDictFile(idfFilePath)
func NewIDFLoader(IDFFileName string) (*IDFLoader, error) {
IDFFilePath, err := jiebago.DictPath(IDFFileName)
if err != nil {
return err
return nil, err
}
wtfs, err := jiebago.ParseDictFile(IDFFilePath)
if err != nil {
return nil, err
}
freqs := make([]float64, 0)
for _, wtf := range wtfs {
loader.Freq[wtf.Word] = wtf.Freq
freqs = append(freqs, wtf.Freq)
freqs := make([]float64, len(wtfs))
loader := &IDFLoader{make(map[string]float64), 0.0}
for index, wtf := range wtfs {
loader.IDFFreq[wtf.Word] = wtf.Freq
freqs[index] = wtf.Freq
}
sort.Float64s(freqs)
loader.Median = freqs[len(freqs)/2]
return nil
}
// Set the IDF file path, could be absolute path of IDF file, or IDF file
// name in current directory.
func SetIdf(idfFileName string) error {
idfFilePath, err := jiebago.DictPath(idfFileName)
if err != nil {
return err
}
return loader.newPath(idfFilePath)
return loader, nil
}

View File

@@ -1,58 +1,35 @@
package analyse
import (
"github.com/wangbin/jiebago"
)
var stopWords map[string]int
func init() {
stopWords = map[string]int{
"the": 1,
"of": 1,
"is": 1,
"and": 1,
"to": 1,
"in": 1,
"that": 1,
"we": 1,
"for": 1,
"an": 1,
"are": 1,
"by": 1,
"be": 1,
"as": 1,
"on": 1,
"with": 1,
"can": 1,
"if": 1,
"from": 1,
"which": 1,
"you": 1,
"it": 1,
"this": 1,
"then": 1,
"at": 1,
"have": 1,
"all": 1,
"not": 1,
"one": 1,
"has": 1,
"or": 1,
}
}
// Set the stop words file path, could be absolute path of stop words file, or
// file name in current directory.
func SetStopWords(stopWordsFileName string) error {
stopWordsFilePath, err := jiebago.DictPath(stopWordsFileName)
if err != nil {
return err
}
wtfs, err := jiebago.ParseDictFile(stopWordsFilePath)
for _, wtf := range wtfs {
stopWords[wtf.Word] = 1
}
return nil
// StopWords is the default set of English stop words. NewTagExtracter seeds
// each extracter's stop-word set from this map. Only the keys matter to
// keyword extraction; every value is 1.
var StopWords = map[string]int{
"the": 1,
"of": 1,
"is": 1,
"and": 1,
"to": 1,
"in": 1,
"that": 1,
"we": 1,
"for": 1,
"an": 1,
"are": 1,
"by": 1,
"be": 1,
"as": 1,
"on": 1,
"with": 1,
"can": 1,
"if": 1,
"from": 1,
"which": 1,
"you": 1,
"it": 1,
"this": 1,
"then": 1,
"at": 1,
"have": 1,
"all": 1,
"not": 1,
"one": 1,
"has": 1,
"or": 1,
}

View File

@@ -115,7 +115,7 @@ func (u *undirectWeightedGraph) rank() wordWeights {
// Extract keywords from sentence using TextRank algorithm. The allowed POS list
// can be manually specified.
func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
posFilt := make(map[string]int)
for _, pos := range allowPOS {
posFilt[pos] = 1
@@ -124,7 +124,7 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
cm := make(map[[2]string]float64)
span := 5
wordTags := make([]posseg.WordTag, 0)
for wordTag := range posseg.Cut(sentence, true) {
for wordTag := range t.Cut(sentence, true) {
wordTags = append(wordTags, wordTag)
}
for i, _ := range wordTags {
@@ -156,13 +156,21 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
// Extract keywords from sentence using TextRank algorithm.
// topK specify how many top keywords to be returned at most.
func TextRank(sentence string, topK int) wordWeights {
return TextRankWithPOS(sentence, topK, defaultAllowPOS)
// TextRank extracts keywords from sentence with the TextRank algorithm,
// restricted to the package's default allowed POS list. topK is passed through
// to TextRankWithPOS unchanged.
func (t *TextRanker) TextRank(sentence string, topK int) wordWeights {
	ranked := t.TextRankWithPOS(sentence, topK, defaultAllowPOS)
	return ranked
}
// Set the dictionary, could be absolute path of dictionary file, or dictionary
// name in current directory. This function must be called before cut any
// sentence.
func SetDictionary(dictFileName string) error {
return posseg.SetDictionary(dictFileName)
// NewTextRanker creates a TextRanker backed by a POS segmenter built from
// dictFileName. If posseg.NewPosseg fails, its error is returned along with a
// nil ranker.
func NewTextRanker(dictFileName string) (*TextRanker, error) {
	seg, err := posseg.NewPosseg(dictFileName)
	if err != nil {
		return nil, err
	}
	ranker := &TextRanker{Posseg: seg}
	return ranker, nil
}
// TextRanker extracts keywords with the TextRank algorithm.
type TextRanker struct {
// Embedded POS segmenter; TextRankWithPOS calls its Cut method to obtain
// the tagged words of a sentence.
*posseg.Posseg
}

View File

@@ -23,8 +23,8 @@ var (
)
func TestTextRank(t *testing.T) {
SetDictionary("../dict.txt")
results := TextRank(sentence, 10)
tr, _ := NewTextRanker("../dict.txt")
results := tr.TextRank(sentence, 10)
for index, tw := range results {
if tw.Word != tagRanks[index].Word || math.Abs(tw.Weight-tagRanks[index].Weight) > 1e-6 {
t.Errorf("%v != %v", tw, tagRanks[index])

View File

@@ -14,16 +14,16 @@ const Name = "jieba"
var IdeographRegexp = regexp.MustCompile(`\p{Han}+`)
type JiebaTokenizer struct {
dictFileName string
j *jiebago.Jieba
hmm, searchMode bool
}
func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) {
err := jiebago.SetDictionary(dictFileName)
j, err := jiebago.NewJieba(dictFileName)
return &JiebaTokenizer{
dictFileName: dictFileName,
hmm: hmm,
searchMode: searchMode,
j: j,
hmm: hmm,
searchMode: searchMode,
}, err
}
@@ -35,7 +35,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
pos := 1
var width int
var gram string
for word := range jiebago.Cut(string(input), false, jt.hmm) {
for word := range jt.j.Cut(string(input), false, jt.hmm) {
if jt.searchMode {
runes := []rune(word)
width = len(runes)
@@ -44,7 +44,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
for i := 0; i < width-step+1; i++ {
gram = string(runes[i : i+step])
gramLen := len(gram)
if value, ok := jiebago.Trie.Freq[gram]; ok && value > 0 {
if value, ok := jt.j.Freq[gram]; ok && value > 0 {
gramStart := start + len(string(runes[:i]))
token := analysis.Token{
Term: []byte(gram),