1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-27 07:30:32 +08:00

finished all OOP refactor

This commit is contained in:
Wang Bin
2015-03-24 18:34:07 +08:00
parent 73d87e4ed6
commit 1c378c28a7
7 changed files with 116 additions and 127 deletions

View File

@@ -35,16 +35,49 @@ func (ws wordWeights) Swap(i, j int) {
ws[i], ws[j] = ws[j], ws[i] ws[i], ws[j] = ws[j], ws[i]
} }
// TagExtracter extracts keywords from text. It embeds a *jiebago.Jieba, whose
// Cut method ExtractTags uses to segment sentences, and an *IDFLoader, whose
// IDFFreq and Median fields ExtractTags uses to weight each word.
type TagExtracter struct {
*jiebago.Jieba
*IDFLoader
// stopWords holds the words ExtractTags skips; only the keys are consulted.
stopWords map[string]int
}
// NewTagExtracter builds a TagExtracter from a segmentation dictionary and an
// IDF file. dictFileName is passed to jiebago.NewJieba and IDFFileName to
// NewIDFLoader; the first error from either is returned with a nil extracter.
//
// The extracter starts with its own copy of the package-level StopWords, so
// that SetStopWords on one extracter changes neither StopWords itself nor the
// stop words of any other extracter.
func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) {
	j, err := jiebago.NewJieba(dictFileName)
	if err != nil {
		return nil, err
	}
	i, err := NewIDFLoader(IDFFileName)
	if err != nil {
		return nil, err
	}
	// Copy rather than alias the default table: maps are reference types, and
	// SetStopWords writes into the extracter's map.
	stopWords := make(map[string]int, len(StopWords))
	for word, flag := range StopWords {
		stopWords[word] = flag
	}
	return &TagExtracter{Jieba: j, IDFLoader: i, stopWords: stopWords}, nil
}
// SetStopWords adds every word listed in the given stop words file to this
// extracter's stop words. stopWordsFileName could be the absolute path of the
// stop words file, or a file name in the current directory. It returns the
// error from resolving the path or from parsing the file; in either case no
// stop word is added.
func (t *TagExtracter) SetStopWords(stopWordsFileName string) error {
	stopWordsFilePath, err := jiebago.DictPath(stopWordsFileName)
	if err != nil {
		return err
	}
	wtfs, err := jiebago.ParseDictFile(stopWordsFilePath)
	if err != nil {
		// Report parse failures instead of silently returning success.
		return err
	}
	// A TagExtracter not created through NewTagExtracter has a nil map, and
	// writing to a nil map panics.
	if t.stopWords == nil {
		t.stopWords = make(map[string]int, len(wtfs))
	}
	for _, wtf := range wtfs {
		t.stopWords[wtf.Word] = 1
	}
	return nil
}
// Keyword extraction. // Keyword extraction.
func ExtractTags(sentence string, topK int) (tags wordWeights) { func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags wordWeights) {
freq := make(map[string]float64) freq := make(map[string]float64)
for w := range jiebago.Cut(sentence, false, true) { for w := range t.Cut(sentence, false, true) {
w = strings.TrimSpace(w) w = strings.TrimSpace(w)
if utf8.RuneCountInString(w) < 2 { if utf8.RuneCountInString(w) < 2 {
continue continue
} }
if _, ok := stopWords[w]; ok { if _, ok := t.stopWords[w]; ok {
continue continue
} }
if f, ok := freq[w]; ok { if f, ok := freq[w]; ok {
@@ -63,10 +96,10 @@ func ExtractTags(sentence string, topK int) (tags wordWeights) {
ws := make(wordWeights, 0) ws := make(wordWeights, 0)
for k, v := range freq { for k, v := range freq {
var ti wordWeight var ti wordWeight
if freq_, ok := loader.Freq[k]; ok { if freq_, ok := t.IDFFreq[k]; ok {
ti = wordWeight{Word: k, Weight: freq_ * v} ti = wordWeight{Word: k, Weight: freq_ * v}
} else { } else {
ti = wordWeight{Word: k, Weight: loader.Median * v} ti = wordWeight{Word: k, Weight: t.Median * v}
} }
ws = append(ws, ti) ws = append(ws, ti)
} }

View File

@@ -1,7 +1,6 @@
package analyse package analyse
import ( import (
"github.com/wangbin/jiebago"
"math" "math"
"testing" "testing"
) )
@@ -256,11 +255,10 @@ var (
) )
func TestExtractTags(t *testing.T) { func TestExtractTags(t *testing.T) {
jiebago.SetDictionary("../dict.txt") et, _ := NewTagExtracter("../dict.txt", "idf.txt")
SetIdf("idf.txt")
for index, sentence := range test_contents { for index, sentence := range test_contents {
result := ExtractTags(sentence, 20) result := et.ExtractTags(sentence, 20)
if len(result) != len(Tags[index]) { if len(result) != len(Tags[index]) {
t.Errorf("%s = %v", sentence, result) t.Errorf("%s = %v", sentence, result)
} }
@@ -273,9 +271,8 @@ func TestExtractTags(t *testing.T) {
} }
func TestExtratTagsWithWeight(t *testing.T) { func TestExtratTagsWithWeight(t *testing.T) {
jiebago.SetDictionary("../dict.txt") et, _ := NewTagExtracter("../dict.txt", "idf.txt")
SetIdf("idf.txt") result := et.ExtractTags(Lyric, 10)
result := ExtractTags(Lyric, 10)
for index, tag := range result { for index, tag := range result {
if LyciWeight[index].Word != tag.Word || if LyciWeight[index].Word != tag.Word ||
math.Abs(LyciWeight[index].Weight-tag.Weight) > 1e-6 { math.Abs(LyciWeight[index].Weight-tag.Weight) > 1e-6 {
@@ -285,10 +282,9 @@ func TestExtratTagsWithWeight(t *testing.T) {
} }
func TestExtractTagsWithStopWordsFile(t *testing.T) { func TestExtractTagsWithStopWordsFile(t *testing.T) {
jiebago.SetDictionary("../dict.txt") et, _ := NewTagExtracter("../dict.txt", "idf.txt")
SetIdf("idf.txt") et.SetStopWords("stop_words.txt")
SetStopWords("stop_words.txt") result := et.ExtractTags(Lyric, 7)
result := ExtractTags(Lyric, 7)
for index, tag := range result { for index, tag := range result {
if LyciWeight2[index].Word != tag.Word || if LyciWeight2[index].Word != tag.Word ||
math.Abs(LyciWeight2[index].Weight-tag.Weight) > 1e-6 { math.Abs(LyciWeight2[index].Weight-tag.Weight) > 1e-6 {

View File

@@ -5,53 +5,28 @@ import (
"sort" "sort"
) )
var ( type IDFLoader struct {
loader *idfLoader IDFFreq map[string]float64
) Median float64
func init() {
loader = newIDFLoader()
} }
type idfLoader struct { func NewIDFLoader(IDFFileName string) (*IDFLoader, error) {
Path string IDFFilePath, err := jiebago.DictPath(IDFFileName)
Freq map[string]float64
Median float64
}
func newIDFLoader() *idfLoader {
loader := new(idfLoader)
loader.Freq = make(map[string]float64)
return loader
}
func (loader *idfLoader) newPath(idfFilePath string) error {
if loader.Path == idfFilePath {
return nil
}
wtfs, err := jiebago.ParseDictFile(idfFilePath)
if err != nil { if err != nil {
return err return nil, err
}
wtfs, err := jiebago.ParseDictFile(IDFFilePath)
if err != nil {
return nil, err
} }
freqs := make([]float64, 0) freqs := make([]float64, len(wtfs))
loader := &IDFLoader{make(map[string]float64), 0.0}
for _, wtf := range wtfs { for index, wtf := range wtfs {
loader.Freq[wtf.Word] = wtf.Freq loader.IDFFreq[wtf.Word] = wtf.Freq
freqs = append(freqs, wtf.Freq) freqs[index] = wtf.Freq
} }
sort.Float64s(freqs) sort.Float64s(freqs)
loader.Median = freqs[len(freqs)/2] loader.Median = freqs[len(freqs)/2]
return nil return loader, nil
}
// Set the IDF file path, could be absolute path of IDF file, or IDF file
// name in current directory.
func SetIdf(idfFileName string) error {
idfFilePath, err := jiebago.DictPath(idfFileName)
if err != nil {
return err
}
return loader.newPath(idfFilePath)
} }

View File

@@ -1,58 +1,35 @@
package analyse package analyse
import ( var StopWords = map[string]int{
"github.com/wangbin/jiebago" "the": 1,
) "of": 1,
"is": 1,
var stopWords map[string]int "and": 1,
"to": 1,
func init() { "in": 1,
stopWords = map[string]int{ "that": 1,
"the": 1, "we": 1,
"of": 1, "for": 1,
"is": 1, "an": 1,
"and": 1, "are": 1,
"to": 1, "by": 1,
"in": 1, "be": 1,
"that": 1, "as": 1,
"we": 1, "on": 1,
"for": 1, "with": 1,
"an": 1, "can": 1,
"are": 1, "if": 1,
"by": 1, "from": 1,
"be": 1, "which": 1,
"as": 1, "you": 1,
"on": 1, "it": 1,
"with": 1, "this": 1,
"can": 1, "then": 1,
"if": 1, "at": 1,
"from": 1, "have": 1,
"which": 1, "all": 1,
"you": 1, "not": 1,
"it": 1, "one": 1,
"this": 1, "has": 1,
"then": 1, "or": 1,
"at": 1,
"have": 1,
"all": 1,
"not": 1,
"one": 1,
"has": 1,
"or": 1,
}
}
// Set the stop words file path, could be absolute path of stop words file, or
// file name in current directory.
func SetStopWords(stopWordsFileName string) error {
stopWordsFilePath, err := jiebago.DictPath(stopWordsFileName)
if err != nil {
return err
}
wtfs, err := jiebago.ParseDictFile(stopWordsFilePath)
for _, wtf := range wtfs {
stopWords[wtf.Word] = 1
}
return nil
} }

View File

@@ -115,7 +115,7 @@ func (u *undirectWeightedGraph) rank() wordWeights {
// Extract keywords from sentence using TextRank algorithm. the allowed POS list // Extract keywords from sentence using TextRank algorithm. the allowed POS list
// could be manually specified. // could be manually specified.
func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights { func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
posFilt := make(map[string]int) posFilt := make(map[string]int)
for _, pos := range allowPOS { for _, pos := range allowPOS {
posFilt[pos] = 1 posFilt[pos] = 1
@@ -124,7 +124,7 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
cm := make(map[[2]string]float64) cm := make(map[[2]string]float64)
span := 5 span := 5
wordTags := make([]posseg.WordTag, 0) wordTags := make([]posseg.WordTag, 0)
for wordTag := range posseg.Cut(sentence, true) { for wordTag := range t.Cut(sentence, true) {
wordTags = append(wordTags, wordTag) wordTags = append(wordTags, wordTag)
} }
for i, _ := range wordTags { for i, _ := range wordTags {
@@ -156,13 +156,21 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
// Extract keywords from sentence using TextRank algorithm. // Extract keywords from sentence using TextRank algorithm.
// topK specify how many top keywords to be returned at most. // topK specify how many top keywords to be returned at most.
func TextRank(sentence string, topK int) wordWeights { func (t *TextRanker) TextRank(sentence string, topK int) wordWeights {
return TextRankWithPOS(sentence, topK, defaultAllowPOS) return t.TextRankWithPOS(sentence, topK, defaultAllowPOS)
} }
// Set the dictionary, could be absolute path of dictionary file, or dictionary // Set the dictionary, could be absolute path of dictionary file, or dictionary
// name in current directory. This function must be called before cut any // name in current directory. This function must be called before cut any
// sentence. // sentence.
func SetDictionary(dictFileName string) error { func NewTextRanker(dictFileName string) (*TextRanker, error) {
return posseg.SetDictionary(dictFileName) p, err := posseg.NewPosseg(dictFileName)
if err != nil {
return nil, err
}
return &TextRanker{p}, nil
}
// TextRanker extracts keywords with the TextRank algorithm. It embeds a
// *posseg.Posseg, whose Cut method TextRankWithPOS uses to split a sentence
// into tagged words.
type TextRanker struct {
*posseg.Posseg
}

View File

@@ -23,8 +23,8 @@ var (
) )
func TestTextRank(t *testing.T) { func TestTextRank(t *testing.T) {
SetDictionary("../dict.txt") tr, _ := NewTextRanker("../dict.txt")
results := TextRank(sentence, 10) results := tr.TextRank(sentence, 10)
for index, tw := range results { for index, tw := range results {
if tw.Word != tagRanks[index].Word || math.Abs(tw.Weight-tagRanks[index].Weight) > 1e-6 { if tw.Word != tagRanks[index].Word || math.Abs(tw.Weight-tagRanks[index].Weight) > 1e-6 {
t.Errorf("%v != %v", tw, tagRanks[index]) t.Errorf("%v != %v", tw, tagRanks[index])

View File

@@ -14,16 +14,16 @@ const Name = "jieba"
var IdeographRegexp = regexp.MustCompile(`\p{Han}+`) var IdeographRegexp = regexp.MustCompile(`\p{Han}+`)
type JiebaTokenizer struct { type JiebaTokenizer struct {
dictFileName string j *jiebago.Jieba
hmm, searchMode bool hmm, searchMode bool
} }
func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) { func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) {
err := jiebago.SetDictionary(dictFileName) j, err := jiebago.NewJieba(dictFileName)
return &JiebaTokenizer{ return &JiebaTokenizer{
dictFileName: dictFileName, j: j,
hmm: hmm, hmm: hmm,
searchMode: searchMode, searchMode: searchMode,
}, err }, err
} }
@@ -35,7 +35,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
pos := 1 pos := 1
var width int var width int
var gram string var gram string
for word := range jiebago.Cut(string(input), false, jt.hmm) { for word := range jt.j.Cut(string(input), false, jt.hmm) {
if jt.searchMode { if jt.searchMode {
runes := []rune(word) runes := []rune(word)
width = len(runes) width = len(runes)
@@ -44,7 +44,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
for i := 0; i < width-step+1; i++ { for i := 0; i < width-step+1; i++ {
gram = string(runes[i : i+step]) gram = string(runes[i : i+step])
gramLen := len(gram) gramLen := len(gram)
if value, ok := jiebago.Trie.Freq[gram]; ok && value > 0 { if value, ok := jt.j.Freq[gram]; ok && value > 0 {
gramStart := start + len(string(runes[:i])) gramStart := start + len(string(runes[:i]))
token := analysis.Token{ token := analysis.Token{
Term: []byte(gram), Term: []byte(gram),