mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-09 02:50:24 +08:00
100 lines
1.7 KiB
Go
100 lines
1.7 KiB
Go
package analyse
|
|
|
|
import (
|
|
"fmt"
|
|
"sort"
|
|
"strings"
|
|
"unicode/utf8"
|
|
|
|
"github.com/wangbin/jiebago/dictionary"
|
|
)
|
|
|
|
// Segment pairs a piece of text with a numeric weight.
type Segment struct {
	text   string
	weight float64
}

// String renders the segment as "{text: weight}", formatting the weight with
// fmt's %f verb.
func (s Segment) String() string {
	return fmt.Sprintf("{%s: %f}", s.text, s.weight)
}

// Segments is a list of Segment values that implements sort.Interface,
// ordering by ascending weight and breaking ties by ascending text.
type Segments []Segment

// Compile-time check that Segments satisfies sort.Interface.
var _ sort.Interface = Segments(nil)

// Len returns the number of segments in the list.
func (ss Segments) Len() int {
	return len(ss)
}

// Less reports whether the segment at index i sorts before the one at index
// j: lower weight first, and for equal weights the lexically smaller text.
func (ss Segments) Less(i, j int) bool {
	if ss[i].weight == ss[j].weight {
		// Equal weights: fall back to the text so ties are ordered
		// deterministically.
		return ss[i].text < ss[j].text
	}
	return ss[i].weight < ss[j].weight
}

// Swap exchanges the segments at indexes i and j.
func (ss Segments) Swap(i, j int) {
	ss[i], ss[j] = ss[j], ss[i]
}
|
|
|
|
type TagExtracter struct {
|
|
seg *jieba.Segmenter
|
|
i *idf
|
|
*StopWordLoader
|
|
}
|
|
|
|
func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) {
|
|
j, err := jiebago.Open(dictFileName)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
i, err := NewIDFLoader(IDFFileName)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return &TagExtracter{j, i, NewStopWordLoader()}, nil
|
|
}
|
|
|
|
// Keyword extraction.
|
|
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags wordWeights) {
|
|
freq := make(map[string]float64)
|
|
|
|
for w := range t.Cut(sentence, true) {
|
|
w = strings.TrimSpace(w)
|
|
if utf8.RuneCountInString(w) < 2 {
|
|
continue
|
|
}
|
|
if t.IsStopWord(w) {
|
|
continue
|
|
}
|
|
if f, ok := freq[w]; ok {
|
|
freq[w] = f + 1.0
|
|
} else {
|
|
freq[w] = 1.0
|
|
}
|
|
}
|
|
total := 0.0
|
|
for _, f := range freq {
|
|
total += f
|
|
}
|
|
for k, v := range freq {
|
|
freq[k] = v / total
|
|
}
|
|
ws := make(wordWeights, 0)
|
|
for k, v := range freq {
|
|
var ti wordWeight
|
|
if freq_, ok := t.IDFFreq[k]; ok {
|
|
ti = wordWeight{Word: k, Weight: freq_ * v}
|
|
} else {
|
|
ti = wordWeight{Word: k, Weight: t.Median * v}
|
|
}
|
|
ws = append(ws, ti)
|
|
}
|
|
sort.Sort(sort.Reverse(ws))
|
|
if len(ws) > topK {
|
|
tags = ws[:topK]
|
|
} else {
|
|
tags = ws
|
|
}
|
|
return tags
|
|
}
|