mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-27 07:30:32 +08:00
finished all OOP refactor
This commit is contained in:
@@ -35,16 +35,49 @@ func (ws wordWeights) Swap(i, j int) {
|
|||||||
ws[i], ws[j] = ws[j], ws[i]
|
ws[i], ws[j] = ws[j], ws[i]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type TagExtracter struct {
|
||||||
|
*jiebago.Jieba
|
||||||
|
*IDFLoader
|
||||||
|
stopWords map[string]int
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) {
|
||||||
|
j, err := jiebago.NewJieba(dictFileName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
i, err := NewIDFLoader(IDFFileName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return &TagExtracter{j, i, StopWords}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set the stop words file path, could be absolute path of stop words file, or
|
||||||
|
// file name in current directory.
|
||||||
|
func (t *TagExtracter) SetStopWords(stopWordsFileName string) error {
|
||||||
|
stopWordsFilePath, err := jiebago.DictPath(stopWordsFileName)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
wtfs, err := jiebago.ParseDictFile(stopWordsFilePath)
|
||||||
|
for _, wtf := range wtfs {
|
||||||
|
t.stopWords[wtf.Word] = 1
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// Keyword extraction.
|
// Keyword extraction.
|
||||||
func ExtractTags(sentence string, topK int) (tags wordWeights) {
|
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags wordWeights) {
|
||||||
freq := make(map[string]float64)
|
freq := make(map[string]float64)
|
||||||
|
|
||||||
for w := range jiebago.Cut(sentence, false, true) {
|
for w := range t.Cut(sentence, false, true) {
|
||||||
w = strings.TrimSpace(w)
|
w = strings.TrimSpace(w)
|
||||||
if utf8.RuneCountInString(w) < 2 {
|
if utf8.RuneCountInString(w) < 2 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if _, ok := stopWords[w]; ok {
|
if _, ok := t.stopWords[w]; ok {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if f, ok := freq[w]; ok {
|
if f, ok := freq[w]; ok {
|
||||||
@@ -63,10 +96,10 @@ func ExtractTags(sentence string, topK int) (tags wordWeights) {
|
|||||||
ws := make(wordWeights, 0)
|
ws := make(wordWeights, 0)
|
||||||
for k, v := range freq {
|
for k, v := range freq {
|
||||||
var ti wordWeight
|
var ti wordWeight
|
||||||
if freq_, ok := loader.Freq[k]; ok {
|
if freq_, ok := t.IDFFreq[k]; ok {
|
||||||
ti = wordWeight{Word: k, Weight: freq_ * v}
|
ti = wordWeight{Word: k, Weight: freq_ * v}
|
||||||
} else {
|
} else {
|
||||||
ti = wordWeight{Word: k, Weight: loader.Median * v}
|
ti = wordWeight{Word: k, Weight: t.Median * v}
|
||||||
}
|
}
|
||||||
ws = append(ws, ti)
|
ws = append(ws, ti)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
package analyse
|
package analyse
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"github.com/wangbin/jiebago"
|
|
||||||
"math"
|
"math"
|
||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
@@ -256,11 +255,10 @@ var (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func TestExtractTags(t *testing.T) {
|
func TestExtractTags(t *testing.T) {
|
||||||
jiebago.SetDictionary("../dict.txt")
|
et, _ := NewTagExtracter("../dict.txt", "idf.txt")
|
||||||
SetIdf("idf.txt")
|
|
||||||
|
|
||||||
for index, sentence := range test_contents {
|
for index, sentence := range test_contents {
|
||||||
result := ExtractTags(sentence, 20)
|
result := et.ExtractTags(sentence, 20)
|
||||||
if len(result) != len(Tags[index]) {
|
if len(result) != len(Tags[index]) {
|
||||||
t.Errorf("%s = %v", sentence, result)
|
t.Errorf("%s = %v", sentence, result)
|
||||||
}
|
}
|
||||||
@@ -273,9 +271,8 @@ func TestExtractTags(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestExtratTagsWithWeight(t *testing.T) {
|
func TestExtratTagsWithWeight(t *testing.T) {
|
||||||
jiebago.SetDictionary("../dict.txt")
|
et, _ := NewTagExtracter("../dict.txt", "idf.txt")
|
||||||
SetIdf("idf.txt")
|
result := et.ExtractTags(Lyric, 10)
|
||||||
result := ExtractTags(Lyric, 10)
|
|
||||||
for index, tag := range result {
|
for index, tag := range result {
|
||||||
if LyciWeight[index].Word != tag.Word ||
|
if LyciWeight[index].Word != tag.Word ||
|
||||||
math.Abs(LyciWeight[index].Weight-tag.Weight) > 1e-6 {
|
math.Abs(LyciWeight[index].Weight-tag.Weight) > 1e-6 {
|
||||||
@@ -285,10 +282,9 @@ func TestExtratTagsWithWeight(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestExtractTagsWithStopWordsFile(t *testing.T) {
|
func TestExtractTagsWithStopWordsFile(t *testing.T) {
|
||||||
jiebago.SetDictionary("../dict.txt")
|
et, _ := NewTagExtracter("../dict.txt", "idf.txt")
|
||||||
SetIdf("idf.txt")
|
et.SetStopWords("stop_words.txt")
|
||||||
SetStopWords("stop_words.txt")
|
result := et.ExtractTags(Lyric, 7)
|
||||||
result := ExtractTags(Lyric, 7)
|
|
||||||
for index, tag := range result {
|
for index, tag := range result {
|
||||||
if LyciWeight2[index].Word != tag.Word ||
|
if LyciWeight2[index].Word != tag.Word ||
|
||||||
math.Abs(LyciWeight2[index].Weight-tag.Weight) > 1e-6 {
|
math.Abs(LyciWeight2[index].Weight-tag.Weight) > 1e-6 {
|
||||||
|
|||||||
@@ -5,53 +5,28 @@ import (
|
|||||||
"sort"
|
"sort"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
type IDFLoader struct {
|
||||||
loader *idfLoader
|
IDFFreq map[string]float64
|
||||||
)
|
Median float64
|
||||||
|
|
||||||
func init() {
|
|
||||||
loader = newIDFLoader()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type idfLoader struct {
|
func NewIDFLoader(IDFFileName string) (*IDFLoader, error) {
|
||||||
Path string
|
IDFFilePath, err := jiebago.DictPath(IDFFileName)
|
||||||
Freq map[string]float64
|
|
||||||
Median float64
|
|
||||||
}
|
|
||||||
|
|
||||||
func newIDFLoader() *idfLoader {
|
|
||||||
loader := new(idfLoader)
|
|
||||||
loader.Freq = make(map[string]float64)
|
|
||||||
return loader
|
|
||||||
}
|
|
||||||
|
|
||||||
func (loader *idfLoader) newPath(idfFilePath string) error {
|
|
||||||
if loader.Path == idfFilePath {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
wtfs, err := jiebago.ParseDictFile(idfFilePath)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return nil, err
|
||||||
|
}
|
||||||
|
wtfs, err := jiebago.ParseDictFile(IDFFilePath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
freqs := make([]float64, 0)
|
freqs := make([]float64, len(wtfs))
|
||||||
|
loader := &IDFLoader{make(map[string]float64), 0.0}
|
||||||
for _, wtf := range wtfs {
|
for index, wtf := range wtfs {
|
||||||
loader.Freq[wtf.Word] = wtf.Freq
|
loader.IDFFreq[wtf.Word] = wtf.Freq
|
||||||
freqs = append(freqs, wtf.Freq)
|
freqs[index] = wtf.Freq
|
||||||
}
|
}
|
||||||
|
|
||||||
sort.Float64s(freqs)
|
sort.Float64s(freqs)
|
||||||
loader.Median = freqs[len(freqs)/2]
|
loader.Median = freqs[len(freqs)/2]
|
||||||
return nil
|
return loader, nil
|
||||||
}
|
|
||||||
|
|
||||||
// Set the IDF file path, could be absolute path of IDF file, or IDF file
|
|
||||||
// name in current directory.
|
|
||||||
func SetIdf(idfFileName string) error {
|
|
||||||
idfFilePath, err := jiebago.DictPath(idfFileName)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
return loader.newPath(idfFilePath)
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,58 +1,35 @@
|
|||||||
package analyse
|
package analyse
|
||||||
|
|
||||||
import (
|
var StopWords = map[string]int{
|
||||||
"github.com/wangbin/jiebago"
|
"the": 1,
|
||||||
)
|
"of": 1,
|
||||||
|
"is": 1,
|
||||||
var stopWords map[string]int
|
"and": 1,
|
||||||
|
"to": 1,
|
||||||
func init() {
|
"in": 1,
|
||||||
stopWords = map[string]int{
|
"that": 1,
|
||||||
"the": 1,
|
"we": 1,
|
||||||
"of": 1,
|
"for": 1,
|
||||||
"is": 1,
|
"an": 1,
|
||||||
"and": 1,
|
"are": 1,
|
||||||
"to": 1,
|
"by": 1,
|
||||||
"in": 1,
|
"be": 1,
|
||||||
"that": 1,
|
"as": 1,
|
||||||
"we": 1,
|
"on": 1,
|
||||||
"for": 1,
|
"with": 1,
|
||||||
"an": 1,
|
"can": 1,
|
||||||
"are": 1,
|
"if": 1,
|
||||||
"by": 1,
|
"from": 1,
|
||||||
"be": 1,
|
"which": 1,
|
||||||
"as": 1,
|
"you": 1,
|
||||||
"on": 1,
|
"it": 1,
|
||||||
"with": 1,
|
"this": 1,
|
||||||
"can": 1,
|
"then": 1,
|
||||||
"if": 1,
|
"at": 1,
|
||||||
"from": 1,
|
"have": 1,
|
||||||
"which": 1,
|
"all": 1,
|
||||||
"you": 1,
|
"not": 1,
|
||||||
"it": 1,
|
"one": 1,
|
||||||
"this": 1,
|
"has": 1,
|
||||||
"then": 1,
|
"or": 1,
|
||||||
"at": 1,
|
|
||||||
"have": 1,
|
|
||||||
"all": 1,
|
|
||||||
"not": 1,
|
|
||||||
"one": 1,
|
|
||||||
"has": 1,
|
|
||||||
"or": 1,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Set the stop words file path, could be absolute path of stop words file, or
|
|
||||||
// file name in current directory.
|
|
||||||
func SetStopWords(stopWordsFileName string) error {
|
|
||||||
stopWordsFilePath, err := jiebago.DictPath(stopWordsFileName)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
wtfs, err := jiebago.ParseDictFile(stopWordsFilePath)
|
|
||||||
for _, wtf := range wtfs {
|
|
||||||
stopWords[wtf.Word] = 1
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -115,7 +115,7 @@ func (u *undirectWeightedGraph) rank() wordWeights {
|
|||||||
|
|
||||||
// Extract keywords from sentence using TextRank algorithm. the allowed POS list
|
// Extract keywords from sentence using TextRank algorithm. the allowed POS list
|
||||||
// could be manually specified.
|
// could be manually specified.
|
||||||
func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
|
func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
|
||||||
posFilt := make(map[string]int)
|
posFilt := make(map[string]int)
|
||||||
for _, pos := range allowPOS {
|
for _, pos := range allowPOS {
|
||||||
posFilt[pos] = 1
|
posFilt[pos] = 1
|
||||||
@@ -124,7 +124,7 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
|
|||||||
cm := make(map[[2]string]float64)
|
cm := make(map[[2]string]float64)
|
||||||
span := 5
|
span := 5
|
||||||
wordTags := make([]posseg.WordTag, 0)
|
wordTags := make([]posseg.WordTag, 0)
|
||||||
for wordTag := range posseg.Cut(sentence, true) {
|
for wordTag := range t.Cut(sentence, true) {
|
||||||
wordTags = append(wordTags, wordTag)
|
wordTags = append(wordTags, wordTag)
|
||||||
}
|
}
|
||||||
for i, _ := range wordTags {
|
for i, _ := range wordTags {
|
||||||
@@ -156,13 +156,21 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
|
|||||||
|
|
||||||
// Extract keywords from sentence using TextRank algorithm.
|
// Extract keywords from sentence using TextRank algorithm.
|
||||||
// topK specify how many top keywords to be returned at most.
|
// topK specify how many top keywords to be returned at most.
|
||||||
func TextRank(sentence string, topK int) wordWeights {
|
func (t *TextRanker) TextRank(sentence string, topK int) wordWeights {
|
||||||
return TextRankWithPOS(sentence, topK, defaultAllowPOS)
|
return t.TextRankWithPOS(sentence, topK, defaultAllowPOS)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set the dictionary, could be absolute path of dictionary file, or dictionary
|
// NewTextRanker creates a TextRanker from dictFileName, which could be the
|
||||||
// name in current directory. This function must be called before cut any
|
// absolute path of a dictionary file or a dictionary name in the current
|
||||||
// sentence.
|
// directory. It returns any error reported by posseg.NewPosseg.
|
||||||
func SetDictionary(dictFileName string) error {
|
func NewTextRanker(dictFileName string) (*TextRanker, error) {
|
||||||
return posseg.SetDictionary(dictFileName)
|
p, err := posseg.NewPosseg(dictFileName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return &TextRanker{p}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
type TextRanker struct {
|
||||||
|
*posseg.Posseg
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -23,8 +23,8 @@ var (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func TestTextRank(t *testing.T) {
|
func TestTextRank(t *testing.T) {
|
||||||
SetDictionary("../dict.txt")
|
tr, _ := NewTextRanker("../dict.txt")
|
||||||
results := TextRank(sentence, 10)
|
results := tr.TextRank(sentence, 10)
|
||||||
for index, tw := range results {
|
for index, tw := range results {
|
||||||
if tw.Word != tagRanks[index].Word || math.Abs(tw.Weight-tagRanks[index].Weight) > 1e-6 {
|
if tw.Word != tagRanks[index].Word || math.Abs(tw.Weight-tagRanks[index].Weight) > 1e-6 {
|
||||||
t.Errorf("%v != %v", tw, tagRanks[index])
|
t.Errorf("%v != %v", tw, tagRanks[index])
|
||||||
|
|||||||
@@ -14,16 +14,16 @@ const Name = "jieba"
|
|||||||
var IdeographRegexp = regexp.MustCompile(`\p{Han}+`)
|
var IdeographRegexp = regexp.MustCompile(`\p{Han}+`)
|
||||||
|
|
||||||
type JiebaTokenizer struct {
|
type JiebaTokenizer struct {
|
||||||
dictFileName string
|
j *jiebago.Jieba
|
||||||
hmm, searchMode bool
|
hmm, searchMode bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||||
err := jiebago.SetDictionary(dictFileName)
|
j, err := jiebago.NewJieba(dictFileName)
|
||||||
return &JiebaTokenizer{
|
return &JiebaTokenizer{
|
||||||
dictFileName: dictFileName,
|
j: j,
|
||||||
hmm: hmm,
|
hmm: hmm,
|
||||||
searchMode: searchMode,
|
searchMode: searchMode,
|
||||||
}, err
|
}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -35,7 +35,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
|||||||
pos := 1
|
pos := 1
|
||||||
var width int
|
var width int
|
||||||
var gram string
|
var gram string
|
||||||
for word := range jiebago.Cut(string(input), false, jt.hmm) {
|
for word := range jt.j.Cut(string(input), false, jt.hmm) {
|
||||||
if jt.searchMode {
|
if jt.searchMode {
|
||||||
runes := []rune(word)
|
runes := []rune(word)
|
||||||
width = len(runes)
|
width = len(runes)
|
||||||
@@ -44,7 +44,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
|||||||
for i := 0; i < width-step+1; i++ {
|
for i := 0; i < width-step+1; i++ {
|
||||||
gram = string(runes[i : i+step])
|
gram = string(runes[i : i+step])
|
||||||
gramLen := len(gram)
|
gramLen := len(gram)
|
||||||
if value, ok := jiebago.Trie.Freq[gram]; ok && value > 0 {
|
if value, ok := jt.j.Freq[gram]; ok && value > 0 {
|
||||||
gramStart := start + len(string(runes[:i]))
|
gramStart := start + len(string(runes[:i]))
|
||||||
token := analysis.Token{
|
token := analysis.Token{
|
||||||
Term: []byte(gram),
|
Term: []byte(gram),
|
||||||
|
|||||||
Reference in New Issue
Block a user