mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-12 13:10:25 +08:00
tweak style
This commit is contained in:
@@ -5,9 +5,9 @@ import (
|
||||
"sort"
|
||||
)
|
||||
|
||||
type IDFLoader struct {
|
||||
IDFFreq map[string]float64
|
||||
Median float64
|
||||
// idf groups the frequency data consulted when weighting keywords.
//
// NOTE(review): the field descriptions below are inferred from the field
// names and types only; confirm them against the code that fills the struct.
type idf struct {
	// freqMap associates a string key with a float64 value
	// (presumably word -> inverse document frequency).
	freqMap map[string]float64
	// median is a single float64 (presumably the median of the loaded
	// frequencies, used as a fallback for unknown words) — TODO confirm.
	median float64
	// freqs is a list of float64 values (presumably every loaded
	// frequency, kept to compute median) — TODO confirm.
	freqs []float64
}
|
||||
|
||||
|
||||
99
analyse/tag_extracker.go
Normal file
99
analyse/tag_extracker.go
Normal file
@@ -0,0 +1,99 @@
|
||||
package analyse
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/wangbin/jiebago/dictionary"
|
||||
)
|
||||
|
||||
// Segment pairs a piece of text with a numeric weight.
type Segment struct {
	text   string
	weight float64
}

// String renders the segment as "{text: weight}", formatting the weight
// with the %f verb.
func (s Segment) String() string {
	return fmt.Sprintf("{%s: %f}", s.text, s.weight)
}

// Segments is a list of Segment values. Its Len, Less and Swap methods
// have the shape required by sort.Interface, ordering by ascending weight
// and breaking ties by ascending text.
//
// The element type must be Segment: a self-referential []Segments has no
// text or weight fields for the methods below to compare.
type Segments []Segment

// Len reports the number of segments.
func (ss Segments) Len() int {
	return len(ss)
}

// Less reports whether element i sorts before element j: lower weight
// first, and for equal weights the lexicographically smaller text first.
func (ss Segments) Less(i, j int) bool {
	if ss[i].weight == ss[j].weight {
		// Tie-break on text so equal weights still have a fixed order.
		return ss[i].text < ss[j].text
	}
	return ss[i].weight < ss[j].weight
}

// Swap exchanges the elements at indexes i and j.
func (ss Segments) Swap(i, j int) {
	ss[i], ss[j] = ss[j], ss[i]
}
|
||||
|
||||
type TagExtracter struct {
|
||||
seg *jieba.Segmenter
|
||||
i *idf
|
||||
*StopWordLoader
|
||||
}
|
||||
|
||||
func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) {
|
||||
j, err := jiebago.Open(dictFileName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
i, err := NewIDFLoader(IDFFileName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &TagExtracter{j, i, NewStopWordLoader()}, nil
|
||||
}
|
||||
|
||||
// Keyword extraction.
|
||||
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags wordWeights) {
|
||||
freq := make(map[string]float64)
|
||||
|
||||
for w := range t.Cut(sentence, true) {
|
||||
w = strings.TrimSpace(w)
|
||||
if utf8.RuneCountInString(w) < 2 {
|
||||
continue
|
||||
}
|
||||
if t.IsStopWord(w) {
|
||||
continue
|
||||
}
|
||||
if f, ok := freq[w]; ok {
|
||||
freq[w] = f + 1.0
|
||||
} else {
|
||||
freq[w] = 1.0
|
||||
}
|
||||
}
|
||||
total := 0.0
|
||||
for _, f := range freq {
|
||||
total += f
|
||||
}
|
||||
for k, v := range freq {
|
||||
freq[k] = v / total
|
||||
}
|
||||
ws := make(wordWeights, 0)
|
||||
for k, v := range freq {
|
||||
var ti wordWeight
|
||||
if freq_, ok := t.IDFFreq[k]; ok {
|
||||
ti = wordWeight{Word: k, Weight: freq_ * v}
|
||||
} else {
|
||||
ti = wordWeight{Word: k, Weight: t.Median * v}
|
||||
}
|
||||
ws = append(ws, ti)
|
||||
}
|
||||
sort.Sort(sort.Reverse(ws))
|
||||
if len(ws) > topK {
|
||||
tags = ws[:topK]
|
||||
} else {
|
||||
tags = ws
|
||||
}
|
||||
return tags
|
||||
}
|
||||
Reference in New Issue
Block a user