mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-07 17:50:26 +08:00
finished all OOP refactor
This commit is contained in:
@@ -35,16 +35,49 @@ func (ws wordWeights) Swap(i, j int) {
|
||||
ws[i], ws[j] = ws[j], ws[i]
|
||||
}
|
||||
|
||||
// TagExtracter extracts keywords from a sentence. It embeds a *jiebago.Jieba
// (whose Cut is used by ExtractTags to segment the sentence) and an *IDFLoader
// (whose IDFFreq and Median weight each word), and keeps the set of stop words
// that ExtractTags skips. Build one with NewTagExtracter.
type TagExtracter struct {
|
||||
*jiebago.Jieba
|
||||
*IDFLoader
|
||||
stopWords map[string]int
|
||||
}
|
||||
|
||||
func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) {
|
||||
j, err := jiebago.NewJieba(dictFileName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
i, err := NewIDFLoader(IDFFileName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &TagExtracter{j, i, StopWords}, nil
|
||||
}
|
||||
|
||||
// Set the stop words file path, could be absolute path of stop words file, or
|
||||
// file name in current directory.
|
||||
func (t *TagExtracter) SetStopWords(stopWordsFileName string) error {
|
||||
stopWordsFilePath, err := jiebago.DictPath(stopWordsFileName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
wtfs, err := jiebago.ParseDictFile(stopWordsFilePath)
|
||||
for _, wtf := range wtfs {
|
||||
t.stopWords[wtf.Word] = 1
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Keyword extraction.
|
||||
func ExtractTags(sentence string, topK int) (tags wordWeights) {
|
||||
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags wordWeights) {
|
||||
freq := make(map[string]float64)
|
||||
|
||||
for w := range jiebago.Cut(sentence, false, true) {
|
||||
for w := range t.Cut(sentence, false, true) {
|
||||
w = strings.TrimSpace(w)
|
||||
if utf8.RuneCountInString(w) < 2 {
|
||||
continue
|
||||
}
|
||||
if _, ok := stopWords[w]; ok {
|
||||
if _, ok := t.stopWords[w]; ok {
|
||||
continue
|
||||
}
|
||||
if f, ok := freq[w]; ok {
|
||||
@@ -63,10 +96,10 @@ func ExtractTags(sentence string, topK int) (tags wordWeights) {
|
||||
ws := make(wordWeights, 0)
|
||||
for k, v := range freq {
|
||||
var ti wordWeight
|
||||
if freq_, ok := loader.Freq[k]; ok {
|
||||
if freq_, ok := t.IDFFreq[k]; ok {
|
||||
ti = wordWeight{Word: k, Weight: freq_ * v}
|
||||
} else {
|
||||
ti = wordWeight{Word: k, Weight: loader.Median * v}
|
||||
ti = wordWeight{Word: k, Weight: t.Median * v}
|
||||
}
|
||||
ws = append(ws, ti)
|
||||
}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
package analyse
|
||||
|
||||
import (
|
||||
"github.com/wangbin/jiebago"
|
||||
"math"
|
||||
"testing"
|
||||
)
|
||||
@@ -256,11 +255,10 @@ var (
|
||||
)
|
||||
|
||||
func TestExtractTags(t *testing.T) {
|
||||
jiebago.SetDictionary("../dict.txt")
|
||||
SetIdf("idf.txt")
|
||||
et, _ := NewTagExtracter("../dict.txt", "idf.txt")
|
||||
|
||||
for index, sentence := range test_contents {
|
||||
result := ExtractTags(sentence, 20)
|
||||
result := et.ExtractTags(sentence, 20)
|
||||
if len(result) != len(Tags[index]) {
|
||||
t.Errorf("%s = %v", sentence, result)
|
||||
}
|
||||
@@ -273,9 +271,8 @@ func TestExtractTags(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestExtratTagsWithWeight(t *testing.T) {
|
||||
jiebago.SetDictionary("../dict.txt")
|
||||
SetIdf("idf.txt")
|
||||
result := ExtractTags(Lyric, 10)
|
||||
et, _ := NewTagExtracter("../dict.txt", "idf.txt")
|
||||
result := et.ExtractTags(Lyric, 10)
|
||||
for index, tag := range result {
|
||||
if LyciWeight[index].Word != tag.Word ||
|
||||
math.Abs(LyciWeight[index].Weight-tag.Weight) > 1e-6 {
|
||||
@@ -285,10 +282,9 @@ func TestExtratTagsWithWeight(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestExtractTagsWithStopWordsFile(t *testing.T) {
|
||||
jiebago.SetDictionary("../dict.txt")
|
||||
SetIdf("idf.txt")
|
||||
SetStopWords("stop_words.txt")
|
||||
result := ExtractTags(Lyric, 7)
|
||||
et, _ := NewTagExtracter("../dict.txt", "idf.txt")
|
||||
et.SetStopWords("stop_words.txt")
|
||||
result := et.ExtractTags(Lyric, 7)
|
||||
for index, tag := range result {
|
||||
if LyciWeight2[index].Word != tag.Word ||
|
||||
math.Abs(LyciWeight2[index].Weight-tag.Weight) > 1e-6 {
|
||||
|
||||
@@ -5,53 +5,28 @@ import (
|
||||
"sort"
|
||||
)
|
||||
|
||||
var (
|
||||
loader *idfLoader
|
||||
)
|
||||
|
||||
func init() {
|
||||
loader = newIDFLoader()
|
||||
// IDFLoader holds an IDF table loaded by NewIDFLoader: IDFFreq maps each word
// of the IDF file to the frequency value parsed for it, and Median is the
// middle element of those values after sorting. ExtractTags uses Median as
// the weight factor for words that are missing from IDFFreq.
type IDFLoader struct {
|
||||
IDFFreq map[string]float64
|
||||
Median float64
|
||||
}
|
||||
|
||||
type idfLoader struct {
|
||||
Path string
|
||||
Freq map[string]float64
|
||||
Median float64
|
||||
}
|
||||
|
||||
func newIDFLoader() *idfLoader {
|
||||
loader := new(idfLoader)
|
||||
loader.Freq = make(map[string]float64)
|
||||
return loader
|
||||
}
|
||||
|
||||
func (loader *idfLoader) newPath(idfFilePath string) error {
|
||||
if loader.Path == idfFilePath {
|
||||
return nil
|
||||
}
|
||||
wtfs, err := jiebago.ParseDictFile(idfFilePath)
|
||||
func NewIDFLoader(IDFFileName string) (*IDFLoader, error) {
|
||||
IDFFilePath, err := jiebago.DictPath(IDFFileName)
|
||||
if err != nil {
|
||||
return err
|
||||
return nil, err
|
||||
}
|
||||
wtfs, err := jiebago.ParseDictFile(IDFFilePath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
freqs := make([]float64, 0)
|
||||
|
||||
for _, wtf := range wtfs {
|
||||
loader.Freq[wtf.Word] = wtf.Freq
|
||||
freqs = append(freqs, wtf.Freq)
|
||||
freqs := make([]float64, len(wtfs))
|
||||
loader := &IDFLoader{make(map[string]float64), 0.0}
|
||||
for index, wtf := range wtfs {
|
||||
loader.IDFFreq[wtf.Word] = wtf.Freq
|
||||
freqs[index] = wtf.Freq
|
||||
}
|
||||
|
||||
sort.Float64s(freqs)
|
||||
loader.Median = freqs[len(freqs)/2]
|
||||
return nil
|
||||
}
|
||||
|
||||
// Set the IDF file path, could be absolute path of IDF file, or IDF file
|
||||
// name in current directory.
|
||||
func SetIdf(idfFileName string) error {
|
||||
idfFilePath, err := jiebago.DictPath(idfFileName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return loader.newPath(idfFilePath)
|
||||
return loader, nil
|
||||
}
|
||||
|
||||
@@ -1,58 +1,35 @@
|
||||
package analyse
|
||||
|
||||
import (
|
||||
"github.com/wangbin/jiebago"
|
||||
)
|
||||
|
||||
var stopWords map[string]int
|
||||
|
||||
func init() {
|
||||
stopWords = map[string]int{
|
||||
"the": 1,
|
||||
"of": 1,
|
||||
"is": 1,
|
||||
"and": 1,
|
||||
"to": 1,
|
||||
"in": 1,
|
||||
"that": 1,
|
||||
"we": 1,
|
||||
"for": 1,
|
||||
"an": 1,
|
||||
"are": 1,
|
||||
"by": 1,
|
||||
"be": 1,
|
||||
"as": 1,
|
||||
"on": 1,
|
||||
"with": 1,
|
||||
"can": 1,
|
||||
"if": 1,
|
||||
"from": 1,
|
||||
"which": 1,
|
||||
"you": 1,
|
||||
"it": 1,
|
||||
"this": 1,
|
||||
"then": 1,
|
||||
"at": 1,
|
||||
"have": 1,
|
||||
"all": 1,
|
||||
"not": 1,
|
||||
"one": 1,
|
||||
"has": 1,
|
||||
"or": 1,
|
||||
}
|
||||
}
|
||||
|
||||
// Set the stop words file path, could be absolute path of stop words file, or
|
||||
// file name in current directory.
|
||||
func SetStopWords(stopWordsFileName string) error {
|
||||
stopWordsFilePath, err := jiebago.DictPath(stopWordsFileName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
wtfs, err := jiebago.ParseDictFile(stopWordsFilePath)
|
||||
for _, wtf := range wtfs {
|
||||
stopWords[wtf.Word] = 1
|
||||
}
|
||||
return nil
|
||||
// StopWords is the default set of English stop words. NewTagExtracter uses it
// as the initial stop words of every TagExtracter; only key presence is
// checked when filtering, the value 1 is a placeholder.
var StopWords = map[string]int{
|
||||
"the": 1,
|
||||
"of": 1,
|
||||
"is": 1,
|
||||
"and": 1,
|
||||
"to": 1,
|
||||
"in": 1,
|
||||
"that": 1,
|
||||
"we": 1,
|
||||
"for": 1,
|
||||
"an": 1,
|
||||
"are": 1,
|
||||
"by": 1,
|
||||
"be": 1,
|
||||
"as": 1,
|
||||
"on": 1,
|
||||
"with": 1,
|
||||
"can": 1,
|
||||
"if": 1,
|
||||
"from": 1,
|
||||
"which": 1,
|
||||
"you": 1,
|
||||
"it": 1,
|
||||
"this": 1,
|
||||
"then": 1,
|
||||
"at": 1,
|
||||
"have": 1,
|
||||
"all": 1,
|
||||
"not": 1,
|
||||
"one": 1,
|
||||
"has": 1,
|
||||
"or": 1,
|
||||
}
|
||||
|
||||
@@ -115,7 +115,7 @@ func (u *undirectWeightedGraph) rank() wordWeights {
|
||||
|
||||
// Extract keywords from sentence using TextRank algorithm. The allowed POS list
|
||||
// could be manually specified.
|
||||
func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
|
||||
func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
|
||||
posFilt := make(map[string]int)
|
||||
for _, pos := range allowPOS {
|
||||
posFilt[pos] = 1
|
||||
@@ -124,7 +124,7 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
|
||||
cm := make(map[[2]string]float64)
|
||||
span := 5
|
||||
wordTags := make([]posseg.WordTag, 0)
|
||||
for wordTag := range posseg.Cut(sentence, true) {
|
||||
for wordTag := range t.Cut(sentence, true) {
|
||||
wordTags = append(wordTags, wordTag)
|
||||
}
|
||||
for i, _ := range wordTags {
|
||||
@@ -156,13 +156,21 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
|
||||
|
||||
// Extract keywords from sentence using TextRank algorithm.
|
||||
// topK specify how many top keywords to be returned at most.
|
||||
func TextRank(sentence string, topK int) wordWeights {
|
||||
return TextRankWithPOS(sentence, topK, defaultAllowPOS)
|
||||
func (t *TextRanker) TextRank(sentence string, topK int) wordWeights {
|
||||
return t.TextRankWithPOS(sentence, topK, defaultAllowPOS)
|
||||
}
|
||||
|
||||
// Set the dictionary, could be absolute path of dictionary file, or dictionary
|
||||
// name in current directory. This function must be called before cutting any
|
||||
// sentence.
|
||||
func SetDictionary(dictFileName string) error {
|
||||
return posseg.SetDictionary(dictFileName)
|
||||
func NewTextRanker(dictFileName string) (*TextRanker, error) {
|
||||
p, err := posseg.NewPosseg(dictFileName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &TextRanker{p}, nil
|
||||
}
|
||||
|
||||
// TextRanker extracts keywords with the TextRank algorithm. It embeds a
// *posseg.Posseg, so the Cut call made by TextRankWithPOS is presumably the
// segmenter's promoted method (confirm against the posseg package). Build one
// with NewTextRanker.
type TextRanker struct {
|
||||
*posseg.Posseg
|
||||
}
|
||||
|
||||
@@ -23,8 +23,8 @@ var (
|
||||
)
|
||||
|
||||
func TestTextRank(t *testing.T) {
|
||||
SetDictionary("../dict.txt")
|
||||
results := TextRank(sentence, 10)
|
||||
tr, _ := NewTextRanker("../dict.txt")
|
||||
results := tr.TextRank(sentence, 10)
|
||||
for index, tw := range results {
|
||||
if tw.Word != tagRanks[index].Word || math.Abs(tw.Weight-tagRanks[index].Weight) > 1e-6 {
|
||||
t.Errorf("%v != %v", tw, tagRanks[index])
|
||||
|
||||
@@ -14,16 +14,16 @@ const Name = "jieba"
|
||||
var IdeographRegexp = regexp.MustCompile(`\p{Han}+`)
|
||||
|
||||
type JiebaTokenizer struct {
|
||||
dictFileName string
|
||||
j *jiebago.Jieba
|
||||
hmm, searchMode bool
|
||||
}
|
||||
|
||||
func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||
err := jiebago.SetDictionary(dictFileName)
|
||||
j, err := jiebago.NewJieba(dictFileName)
|
||||
return &JiebaTokenizer{
|
||||
dictFileName: dictFileName,
|
||||
hmm: hmm,
|
||||
searchMode: searchMode,
|
||||
j: j,
|
||||
hmm: hmm,
|
||||
searchMode: searchMode,
|
||||
}, err
|
||||
}
|
||||
|
||||
@@ -35,7 +35,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||
pos := 1
|
||||
var width int
|
||||
var gram string
|
||||
for word := range jiebago.Cut(string(input), false, jt.hmm) {
|
||||
for word := range jt.j.Cut(string(input), false, jt.hmm) {
|
||||
if jt.searchMode {
|
||||
runes := []rune(word)
|
||||
width = len(runes)
|
||||
@@ -44,7 +44,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||
for i := 0; i < width-step+1; i++ {
|
||||
gram = string(runes[i : i+step])
|
||||
gramLen := len(gram)
|
||||
if value, ok := jiebago.Trie.Freq[gram]; ok && value > 0 {
|
||||
if value, ok := jt.j.Freq[gram]; ok && value > 0 {
|
||||
gramStart := start + len(string(runes[:i]))
|
||||
token := analysis.Token{
|
||||
Term: []byte(gram),
|
||||
|
||||
Reference in New Issue
Block a user