mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-24 21:54:24 +08:00
refactor analyse module
This commit is contained in:
@@ -1,98 +0,0 @@
|
|||||||
package analyse
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"github.com/wangbin/jiebago"
|
|
||||||
"sort"
|
|
||||||
"strings"
|
|
||||||
"unicode/utf8"
|
|
||||||
)
|
|
||||||
|
|
||||||
type wordWeight struct {
|
|
||||||
Word string
|
|
||||||
Weight float64
|
|
||||||
}
|
|
||||||
|
|
||||||
func (w wordWeight) String() string {
|
|
||||||
return fmt.Sprintf("{%s: %f}", w.Word, w.Weight)
|
|
||||||
}
|
|
||||||
|
|
||||||
type wordWeights []wordWeight
|
|
||||||
|
|
||||||
func (ws wordWeights) Len() int {
|
|
||||||
return len(ws)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (ws wordWeights) Less(i, j int) bool {
|
|
||||||
if ws[i].Weight == ws[j].Weight {
|
|
||||||
return ws[i].Word < ws[j].Word
|
|
||||||
}
|
|
||||||
|
|
||||||
return ws[i].Weight < ws[j].Weight
|
|
||||||
}
|
|
||||||
|
|
||||||
func (ws wordWeights) Swap(i, j int) {
|
|
||||||
ws[i], ws[j] = ws[j], ws[i]
|
|
||||||
}
|
|
||||||
|
|
||||||
type TagExtracter struct {
|
|
||||||
*jiebago.Jieba
|
|
||||||
*IDFLoader
|
|
||||||
*StopWordLoader
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) {
|
|
||||||
j, err := jiebago.Open(dictFileName)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
i, err := NewIDFLoader(IDFFileName)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
return &TagExtracter{j, i, NewStopWordLoader()}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Keyword extraction.
|
|
||||||
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags wordWeights) {
|
|
||||||
freq := make(map[string]float64)
|
|
||||||
|
|
||||||
for w := range t.Cut(sentence, true) {
|
|
||||||
w = strings.TrimSpace(w)
|
|
||||||
if utf8.RuneCountInString(w) < 2 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if t.IsStopWord(w) {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if f, ok := freq[w]; ok {
|
|
||||||
freq[w] = f + 1.0
|
|
||||||
} else {
|
|
||||||
freq[w] = 1.0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
total := 0.0
|
|
||||||
for _, f := range freq {
|
|
||||||
total += f
|
|
||||||
}
|
|
||||||
for k, v := range freq {
|
|
||||||
freq[k] = v / total
|
|
||||||
}
|
|
||||||
ws := make(wordWeights, 0)
|
|
||||||
for k, v := range freq {
|
|
||||||
var ti wordWeight
|
|
||||||
if freq_, ok := t.IDFFreq[k]; ok {
|
|
||||||
ti = wordWeight{Word: k, Weight: freq_ * v}
|
|
||||||
} else {
|
|
||||||
ti = wordWeight{Word: k, Weight: t.Median * v}
|
|
||||||
}
|
|
||||||
ws = append(ws, ti)
|
|
||||||
}
|
|
||||||
sort.Sort(sort.Reverse(ws))
|
|
||||||
if len(ws) > topK {
|
|
||||||
tags = ws[:topK]
|
|
||||||
} else {
|
|
||||||
tags = ws
|
|
||||||
}
|
|
||||||
return tags
|
|
||||||
}
|
|
||||||
@@ -227,43 +227,45 @@ var (
|
|||||||
只是逼不得已
|
只是逼不得已
|
||||||
雖然沒有藉口
|
雖然沒有藉口
|
||||||
`
|
`
|
||||||
LyciWeight = []wordWeight{
|
LyciWeight = Segments{
|
||||||
wordWeight{Word: "所謂", Weight: 1.010262},
|
Segment{text: "所謂", weight: 1.010262},
|
||||||
wordWeight{Word: "是否", Weight: 0.738650},
|
Segment{text: "是否", weight: 0.738650},
|
||||||
wordWeight{Word: "一般", Weight: 0.607600},
|
Segment{text: "一般", weight: 0.607600},
|
||||||
wordWeight{Word: "雖然", Weight: 0.336754},
|
Segment{text: "雖然", weight: 0.336754},
|
||||||
wordWeight{Word: "退縮", Weight: 0.336754},
|
Segment{text: "退縮", weight: 0.336754},
|
||||||
wordWeight{Word: "肌迫", Weight: 0.336754},
|
Segment{text: "肌迫", weight: 0.336754},
|
||||||
wordWeight{Word: "矯作", Weight: 0.336754},
|
Segment{text: "矯作", weight: 0.336754},
|
||||||
wordWeight{Word: "沒有", Weight: 0.336754},
|
Segment{text: "沒有", weight: 0.336754},
|
||||||
wordWeight{Word: "怯懦", Weight: 0.271099},
|
Segment{text: "怯懦", weight: 0.271099},
|
||||||
wordWeight{Word: "隨便", Weight: 0.168377},
|
Segment{text: "隨便", weight: 0.168377},
|
||||||
}
|
}
|
||||||
|
|
||||||
LyciWeight2 = []wordWeight{
|
LyciWeight2 = Segments{
|
||||||
wordWeight{Word: "所謂", Weight: 1.215739},
|
Segment{text: "所謂", weight: 1.215739},
|
||||||
wordWeight{Word: "一般", Weight: 0.731179},
|
Segment{text: "一般", weight: 0.731179},
|
||||||
wordWeight{Word: "雖然", Weight: 0.405246},
|
Segment{text: "雖然", weight: 0.405246},
|
||||||
wordWeight{Word: "退縮", Weight: 0.405246},
|
Segment{text: "退縮", weight: 0.405246},
|
||||||
wordWeight{Word: "肌迫", Weight: 0.405246},
|
Segment{text: "肌迫", weight: 0.405246},
|
||||||
wordWeight{Word: "矯作", Weight: 0.405246},
|
Segment{text: "矯作", weight: 0.405246},
|
||||||
wordWeight{Word: "怯懦", Weight: 0.326238},
|
Segment{text: "怯懦", weight: 0.326238},
|
||||||
wordWeight{Word: "逼不得已", Weight: 0.202623},
|
Segment{text: "逼不得已", weight: 0.202623},
|
||||||
wordWeight{Word: "右銘", Weight: 0.202623},
|
Segment{text: "右銘", weight: 0.202623},
|
||||||
wordWeight{Word: "寬闊", Weight: 0.202623},
|
Segment{text: "寬闊", weight: 0.202623},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestExtractTags(t *testing.T) {
|
func TestExtractTags(t *testing.T) {
|
||||||
et, _ := NewTagExtracter("../dict.txt", "idf.txt")
|
var te TagExtracter
|
||||||
|
te.LoadDictionary("../dict.txt")
|
||||||
|
te.LoadIdf("idf.txt")
|
||||||
|
|
||||||
for index, sentence := range test_contents {
|
for index, sentence := range test_contents {
|
||||||
result := et.ExtractTags(sentence, 20)
|
result := te.ExtractTags(sentence, 20)
|
||||||
if len(result) != len(Tags[index]) {
|
if len(result) != len(Tags[index]) {
|
||||||
t.Fatalf("%s = %v", sentence, result)
|
t.Fatalf("%s = %v", sentence, result)
|
||||||
}
|
}
|
||||||
for i, tag := range result {
|
for i, tag := range result {
|
||||||
if tag.Word != Tags[index][i] {
|
if tag.text != Tags[index][i] {
|
||||||
t.Fatalf("%s != %s", tag, Tags[index][i])
|
t.Fatalf("%s != %s", tag, Tags[index][i])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -271,23 +273,27 @@ func TestExtractTags(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestExtratTagsWithWeight(t *testing.T) {
|
func TestExtratTagsWithWeight(t *testing.T) {
|
||||||
et, _ := NewTagExtracter("../dict.txt", "idf.txt")
|
var te TagExtracter
|
||||||
result := et.ExtractTags(Lyric, 10)
|
te.LoadDictionary("../dict.txt")
|
||||||
|
te.LoadIdf("idf.txt")
|
||||||
|
result := te.ExtractTags(Lyric, 10)
|
||||||
for index, tag := range result {
|
for index, tag := range result {
|
||||||
if LyciWeight[index].Word != tag.Word ||
|
if LyciWeight[index].text != tag.text ||
|
||||||
math.Abs(LyciWeight[index].Weight-tag.Weight) > 1e-6 {
|
math.Abs(LyciWeight[index].weight-tag.weight) > 1e-6 {
|
||||||
t.Fatalf("%v != %v", tag, LyciWeight[index])
|
t.Fatalf("%v != %v", tag, LyciWeight[index])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestExtractTagsWithStopWordsFile(t *testing.T) {
|
func TestExtractTagsWithStopWordsFile(t *testing.T) {
|
||||||
et, _ := NewTagExtracter("../dict.txt", "idf.txt")
|
var te TagExtracter
|
||||||
et.SetStopWords("stop_words.txt")
|
te.LoadDictionary("../dict.txt")
|
||||||
result := et.ExtractTags(Lyric, 7)
|
te.LoadIdf("idf.txt")
|
||||||
|
te.LoadStopWords("stop_words.txt")
|
||||||
|
result := te.ExtractTags(Lyric, 7)
|
||||||
for index, tag := range result {
|
for index, tag := range result {
|
||||||
if LyciWeight2[index].Word != tag.Word ||
|
if LyciWeight2[index].text != tag.text ||
|
||||||
math.Abs(LyciWeight2[index].Weight-tag.Weight) > 1e-6 {
|
math.Abs(LyciWeight2[index].weight-tag.weight) > 1e-6 {
|
||||||
t.Fatalf("%v != %v", tag, LyciWeight2[index])
|
t.Fatalf("%v != %v", tag, LyciWeight2[index])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,30 +1,50 @@
|
|||||||
package analyse
|
package analyse
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"github.com/wangbin/jiebago"
|
|
||||||
"sort"
|
"sort"
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"github.com/wangbin/jiebago/dictionary"
|
||||||
)
|
)
|
||||||
|
|
||||||
type idf struct {
|
type Idf struct {
|
||||||
freqMap map[string]float64
|
freqMap map[string]float64
|
||||||
median float64
|
median float64
|
||||||
freqs []float64
|
freqs []float64
|
||||||
|
sync.RWMutex
|
||||||
}
|
}
|
||||||
|
|
||||||
func (l *IDFLoader) AddEntry(entry jiebago.Entry) {
|
func (i *Idf) AddToken(token dictionary.Token) {
|
||||||
l.IDFFreq[entry.Word] = entry.Freq
|
i.Lock()
|
||||||
l.freqs = append(l.freqs, entry.Freq)
|
i.freqMap[token.Text()] = token.Frequency()
|
||||||
|
i.freqs = append(i.freqs, token.Frequency())
|
||||||
|
sort.Float64s(i.freqs)
|
||||||
|
i.median = i.freqs[len(i.freqs)/2]
|
||||||
|
i.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewIDFLoader(IDFFileName string) (*IDFLoader, error) {
|
func (i *Idf) Load(ch <-chan dictionary.Token) {
|
||||||
loader := &IDFLoader{make(map[string]float64), 0.0, make([]float64, 0)}
|
i.Lock()
|
||||||
err := jiebago.LoadDict(loader, IDFFileName, false)
|
for token := range ch {
|
||||||
if err != nil {
|
i.freqMap[token.Text()] = token.Frequency()
|
||||||
return nil, err
|
i.freqs = append(i.freqs, token.Frequency())
|
||||||
}
|
}
|
||||||
|
sort.Float64s(i.freqs)
|
||||||
sort.Float64s(loader.freqs)
|
i.median = i.freqs[len(i.freqs)/2]
|
||||||
loader.Median = loader.freqs[len(loader.freqs)/2]
|
i.Unlock()
|
||||||
loader.freqs = []float64{}
|
}
|
||||||
return loader, nil
|
|
||||||
|
func (i *Idf) loadDictionary(fileName string) error {
|
||||||
|
return dictionary.LoadDictionary(i, fileName)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (i Idf) Frequency(key string) (float64, bool) {
|
||||||
|
i.RLock()
|
||||||
|
freq, ok := i.freqMap[key]
|
||||||
|
i.RUnlock()
|
||||||
|
return freq, ok
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewIdf() *Idf {
|
||||||
|
return &Idf{freqMap: make(map[string]float64), freqs: make([]float64, 0)}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,8 +1,12 @@
|
|||||||
package analyse
|
package analyse
|
||||||
|
|
||||||
import "github.com/wangbin/jiebago"
|
import (
|
||||||
|
"sync"
|
||||||
|
|
||||||
var defaultStopWords = map[string]int{
|
"github.com/wangbin/jiebago/dictionary"
|
||||||
|
)
|
||||||
|
|
||||||
|
var DefaultStopWordMap = map[string]int{
|
||||||
"the": 1,
|
"the": 1,
|
||||||
"of": 1,
|
"of": 1,
|
||||||
"is": 1,
|
"is": 1,
|
||||||
@@ -36,27 +40,38 @@ var defaultStopWords = map[string]int{
|
|||||||
"or": 1,
|
"or": 1,
|
||||||
}
|
}
|
||||||
|
|
||||||
type StopWordLoader struct {
|
type StopWord struct {
|
||||||
stopWords map[string]int
|
stopWordMap map[string]int
|
||||||
|
sync.RWMutex
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *StopWordLoader) AddEntry(entry jiebago.Entry) {
|
func (s *StopWord) AddToken(token dictionary.Token) {
|
||||||
s.stopWords[entry.Word] = 1
|
s.Lock()
|
||||||
|
s.stopWordMap[token.Text()] = 1
|
||||||
|
s.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewStopWordLoader() *StopWordLoader {
|
func NewStopWord() *StopWord {
|
||||||
s := new(StopWordLoader)
|
s := new(StopWord)
|
||||||
s.stopWords = defaultStopWords
|
s.stopWordMap = DefaultStopWordMap
|
||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set the stop words file path, could be absolute path of stop words file, or
|
func (s StopWord) IsStopWord(word string) bool {
|
||||||
// file name in current directory.
|
s.RLock()
|
||||||
func (s *StopWordLoader) SetStopWords(stopWordsFileName string) error {
|
_, ok := s.stopWordMap[word]
|
||||||
return jiebago.LoadDict(s, stopWordsFileName, false)
|
s.RUnlock()
|
||||||
}
|
|
||||||
|
|
||||||
func (s StopWordLoader) IsStopWord(word string) bool {
|
|
||||||
_, ok := s.stopWords[word]
|
|
||||||
return ok
|
return ok
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *StopWord) Load(ch <-chan dictionary.Token) {
|
||||||
|
s.Lock()
|
||||||
|
for token := range ch {
|
||||||
|
s.stopWordMap[token.Text()] = 1
|
||||||
|
}
|
||||||
|
s.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *StopWord) loadDictionary(fileName string) error {
|
||||||
|
return dictionary.LoadDictionary(s, fileName)
|
||||||
|
}
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
|
|
||||||
"github.com/wangbin/jiebago/dictionary"
|
"github.com/wangbin/jiebago"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Segment struct {
|
type Segment struct {
|
||||||
@@ -14,11 +14,19 @@ type Segment struct {
|
|||||||
weight float64
|
weight float64
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s Segment) Text() string {
|
||||||
|
return s.text
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s Segment) Weight() float64 {
|
||||||
|
return s.weight
|
||||||
|
}
|
||||||
|
|
||||||
func (s Segment) String() string {
|
func (s Segment) String() string {
|
||||||
return fmt.Sprintf("{%s: %f}", s.text, s.weight)
|
return fmt.Sprintf("{%s: %f}", s.text, s.weight)
|
||||||
}
|
}
|
||||||
|
|
||||||
type Segments []Segments
|
type Segments []Segment
|
||||||
|
|
||||||
func (ss Segments) Len() int {
|
func (ss Segments) Len() int {
|
||||||
return len(ss)
|
return len(ss)
|
||||||
@@ -26,7 +34,7 @@ func (ss Segments) Len() int {
|
|||||||
|
|
||||||
func (ss Segments) Less(i, j int) bool {
|
func (ss Segments) Less(i, j int) bool {
|
||||||
if ss[i].weight == ss[j].weight {
|
if ss[i].weight == ss[j].weight {
|
||||||
return ss[i].text < ws[j].text
|
return ss[i].text < ss[j].text
|
||||||
}
|
}
|
||||||
|
|
||||||
return ss[i].weight < ss[j].weight
|
return ss[i].weight < ss[j].weight
|
||||||
@@ -37,57 +45,61 @@ func (ss Segments) Swap(i, j int) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type TagExtracter struct {
|
type TagExtracter struct {
|
||||||
seg *jieba.Segmenter
|
seg *jiebago.Segmenter
|
||||||
i *idf
|
idf *Idf
|
||||||
*StopWordLoader
|
stopWord *StopWord
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) {
|
func (t *TagExtracter) LoadDictionary(fileName string) error {
|
||||||
j, err := jiebago.Open(dictFileName)
|
t.stopWord = NewStopWord()
|
||||||
if err != nil {
|
t.seg = new(jiebago.Segmenter)
|
||||||
return nil, err
|
return t.seg.LoadDictionary(fileName)
|
||||||
}
|
}
|
||||||
i, err := NewIDFLoader(IDFFileName)
|
|
||||||
if err != nil {
|
func (t *TagExtracter) LoadIdf(fileName string) error {
|
||||||
return nil, err
|
t.idf = NewIdf()
|
||||||
}
|
return t.idf.loadDictionary(fileName)
|
||||||
return &TagExtracter{j, i, NewStopWordLoader()}, nil
|
}
|
||||||
|
|
||||||
|
func (t *TagExtracter) LoadStopWords(fileName string) error {
|
||||||
|
t.stopWord = NewStopWord()
|
||||||
|
return t.stopWord.loadDictionary(fileName)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Keyword extraction.
|
// Keyword extraction.
|
||||||
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags wordWeights) {
|
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
|
||||||
freq := make(map[string]float64)
|
freqMap := make(map[string]float64)
|
||||||
|
|
||||||
for w := range t.Cut(sentence, true) {
|
for w := range t.seg.Cut(sentence, true) {
|
||||||
w = strings.TrimSpace(w)
|
w = strings.TrimSpace(w)
|
||||||
if utf8.RuneCountInString(w) < 2 {
|
if utf8.RuneCountInString(w) < 2 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if t.IsStopWord(w) {
|
if t.stopWord.IsStopWord(w) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if f, ok := freq[w]; ok {
|
if f, ok := freqMap[w]; ok {
|
||||||
freq[w] = f + 1.0
|
freqMap[w] = f + 1.0
|
||||||
} else {
|
} else {
|
||||||
freq[w] = 1.0
|
freqMap[w] = 1.0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
total := 0.0
|
total := 0.0
|
||||||
for _, f := range freq {
|
for _, freq := range freqMap {
|
||||||
total += f
|
total += freq
|
||||||
}
|
}
|
||||||
for k, v := range freq {
|
for k, v := range freqMap {
|
||||||
freq[k] = v / total
|
freqMap[k] = v / total
|
||||||
}
|
}
|
||||||
ws := make(wordWeights, 0)
|
ws := make(Segments, 0)
|
||||||
for k, v := range freq {
|
var s Segment
|
||||||
var ti wordWeight
|
for k, v := range freqMap {
|
||||||
if freq_, ok := t.IDFFreq[k]; ok {
|
if freq, ok := t.idf.Frequency(k); ok {
|
||||||
ti = wordWeight{Word: k, Weight: freq_ * v}
|
s = Segment{text: k, weight: freq * v}
|
||||||
} else {
|
} else {
|
||||||
ti = wordWeight{Word: k, Weight: t.Median * v}
|
s = Segment{text: k, weight: t.idf.median * v}
|
||||||
}
|
}
|
||||||
ws = append(ws, ti)
|
ws = append(ws, s)
|
||||||
}
|
}
|
||||||
sort.Sort(sort.Reverse(ws))
|
sort.Sort(sort.Reverse(ws))
|
||||||
if len(ws) > topK {
|
if len(ws) > topK {
|
||||||
|
|||||||
@@ -2,9 +2,10 @@ package analyse
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"github.com/wangbin/jiebago/posseg"
|
|
||||||
"math"
|
"math"
|
||||||
"sort"
|
"sort"
|
||||||
|
|
||||||
|
"github.com/wangbin/jiebago/posseg"
|
||||||
)
|
)
|
||||||
|
|
||||||
const dampingFactor = 0.85
|
const dampingFactor = 0.85
|
||||||
@@ -65,7 +66,7 @@ func (u *undirectWeightedGraph) addEdge(start, end string, weight float64) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (u *undirectWeightedGraph) rank() wordWeights {
|
func (u *undirectWeightedGraph) rank() Segments {
|
||||||
if !sort.IsSorted(u.keys) {
|
if !sort.IsSorted(u.keys) {
|
||||||
sort.Sort(u.keys)
|
sort.Sort(u.keys)
|
||||||
}
|
}
|
||||||
@@ -105,9 +106,9 @@ func (u *undirectWeightedGraph) rank() wordWeights {
|
|||||||
maxRank = w
|
maxRank = w
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
result := make(wordWeights, 0)
|
result := make(Segments, 0)
|
||||||
for n, w := range ws {
|
for n, w := range ws {
|
||||||
result = append(result, wordWeight{Word: n, Weight: (w - minRank/10.0) / (maxRank - minRank/10.0)})
|
result = append(result, Segment{text: n, weight: (w - minRank/10.0) / (maxRank - minRank/10.0)})
|
||||||
}
|
}
|
||||||
sort.Sort(sort.Reverse(result))
|
sort.Sort(sort.Reverse(result))
|
||||||
return result
|
return result
|
||||||
@@ -115,7 +116,7 @@ func (u *undirectWeightedGraph) rank() wordWeights {
|
|||||||
|
|
||||||
// Extract keywords from sentence using TextRank algorithm. the allowed POS list
|
// Extract keywords from sentence using TextRank algorithm. the allowed POS list
|
||||||
// could be manually speificed.
|
// could be manually speificed.
|
||||||
func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) wordWeights {
|
func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) Segments {
|
||||||
posFilt := make(map[string]int)
|
posFilt := make(map[string]int)
|
||||||
for _, pos := range allowPOS {
|
for _, pos := range allowPOS {
|
||||||
posFilt[pos] = 1
|
posFilt[pos] = 1
|
||||||
@@ -123,20 +124,20 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin
|
|||||||
g := newUndirectWeightedGraph()
|
g := newUndirectWeightedGraph()
|
||||||
cm := make(map[[2]string]float64)
|
cm := make(map[[2]string]float64)
|
||||||
span := 5
|
span := 5
|
||||||
pairs := make([]posseg.Pair, 0)
|
pairs := make([]posseg.Segment, 0)
|
||||||
for pair := range t.Cut(sentence, true) {
|
for pair := range t.seg.Cut(sentence, true) {
|
||||||
pairs = append(pairs, pair)
|
pairs = append(pairs, pair)
|
||||||
}
|
}
|
||||||
for i, _ := range pairs {
|
for i := range pairs {
|
||||||
if _, ok := posFilt[pairs[i].Flag]; ok {
|
if _, ok := posFilt[pairs[i].Pos()]; ok {
|
||||||
for j := i + 1; j < i+span && j <= len(pairs); j++ {
|
for j := i + 1; j < i+span && j <= len(pairs); j++ {
|
||||||
if _, ok := posFilt[pairs[j].Flag]; !ok {
|
if _, ok := posFilt[pairs[j].Pos()]; !ok {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if _, ok := cm[[2]string{pairs[i].Word, pairs[j].Word}]; !ok {
|
if _, ok := cm[[2]string{pairs[i].Text(), pairs[j].Text()}]; !ok {
|
||||||
cm[[2]string{pairs[i].Word, pairs[j].Word}] = 1.0
|
cm[[2]string{pairs[i].Text(), pairs[j].Text()}] = 1.0
|
||||||
} else {
|
} else {
|
||||||
cm[[2]string{pairs[i].Word, pairs[j].Word}] += 1.0
|
cm[[2]string{pairs[i].Text(), pairs[j].Text()}] += 1.0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -153,21 +154,15 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin
|
|||||||
|
|
||||||
// Extract keywords from sentence using TextRank algorithm.
|
// Extract keywords from sentence using TextRank algorithm.
|
||||||
// topK specify how many top keywords to be returned at most.
|
// topK specify how many top keywords to be returned at most.
|
||||||
func (t *TextRanker) TextRank(sentence string, topK int) wordWeights {
|
func (t *TextRanker) TextRank(sentence string, topK int) Segments {
|
||||||
return t.TextRankWithPOS(sentence, topK, defaultAllowPOS)
|
return t.TextRankWithPOS(sentence, topK, defaultAllowPOS)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set the dictionary, could be absolute path of dictionary file, or dictionary
|
type TextRanker struct {
|
||||||
// name in current directory. This function must be called before cut any
|
seg *posseg.Segmenter
|
||||||
// sentence.
|
|
||||||
func NewTextRanker(dictFileName string) (*TextRanker, error) {
|
|
||||||
p, err := posseg.Open(dictFileName)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
return &TextRanker{p}, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type TextRanker struct {
|
func (t *TextRanker) LoadDictionary(fileName string) error {
|
||||||
*posseg.Posseg
|
t.seg = new(posseg.Segmenter)
|
||||||
|
return t.seg.LoadDictionary(fileName)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,25 +8,26 @@ import (
|
|||||||
var (
|
var (
|
||||||
sentence = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
|
sentence = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
|
||||||
|
|
||||||
tagRanks = wordWeights{
|
tagRanks = Segments{
|
||||||
wordWeight{Word: "吉林", Weight: 1.0},
|
Segment{text: "吉林", weight: 1.0},
|
||||||
wordWeight{Word: "欧亚", Weight: 0.87807810644},
|
Segment{text: "欧亚", weight: 0.87807810644},
|
||||||
wordWeight{Word: "置业", Weight: 0.562048250306},
|
Segment{text: "置业", weight: 0.562048250306},
|
||||||
wordWeight{Word: "实现", Weight: 0.520905743929},
|
Segment{text: "实现", weight: 0.520905743929},
|
||||||
wordWeight{Word: "收入", Weight: 0.384283870648},
|
Segment{text: "收入", weight: 0.384283870648},
|
||||||
wordWeight{Word: "增资", Weight: 0.360590945312},
|
Segment{text: "增资", weight: 0.360590945312},
|
||||||
wordWeight{Word: "子公司", Weight: 0.353131980904},
|
Segment{text: "子公司", weight: 0.353131980904},
|
||||||
wordWeight{Word: "城市", Weight: 0.307509449283},
|
Segment{text: "城市", weight: 0.307509449283},
|
||||||
wordWeight{Word: "全资", Weight: 0.306324426665},
|
Segment{text: "全资", weight: 0.306324426665},
|
||||||
wordWeight{Word: "商业", Weight: 0.306138241063},
|
Segment{text: "商业", weight: 0.306138241063},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestTextRank(t *testing.T) {
|
func TestTextRank(t *testing.T) {
|
||||||
tr, _ := NewTextRanker("../dict.txt")
|
var tr TextRanker
|
||||||
|
tr.LoadDictionary("../dict.txt")
|
||||||
results := tr.TextRank(sentence, 10)
|
results := tr.TextRank(sentence, 10)
|
||||||
for index, tw := range results {
|
for index, tw := range results {
|
||||||
if tw.Word != tagRanks[index].Word || math.Abs(tw.Weight-tagRanks[index].Weight) > 1e-6 {
|
if tw.text != tagRanks[index].text || math.Abs(tw.weight-tagRanks[index].weight) > 1e-6 {
|
||||||
t.Fatalf("%v != %v", tw, tagRanks[index])
|
t.Fatalf("%v != %v", tw, tagRanks[index])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user