1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00
Files
jieba/analyse/tag_extracker.go
2022-12-03 10:54:06 +08:00

134 lines
3.0 KiB
Go
Executable File

// Package analyse is the Golang implementation of Jieba's analyse module.
package analyse
import (
"io"
"sort"
"strings"
"unicode/utf8"
jieba "github.com/fumiama/jieba"
)
// Segment pairs a piece of text with a numeric weight.
//
// Both fields are unexported; read them through Text and Weight.
type Segment struct {
	text   string
	weight float64
}

// Text reports the text stored in the segment.
func (s Segment) Text() string { return s.text }

// Weight reports the weight stored in the segment.
func (s Segment) Weight() float64 { return s.weight }
// Segments is a list of Segment values that can be sorted with package sort.
//
// The ordering is ascending by weight; segments whose weights compare equal
// are ordered ascending by text.
type Segments []Segment

// Len reports the number of segments in the list.
func (ss Segments) Len() int { return len(ss) }

// Less reports whether the segment at index i sorts before the one at index j:
// the lower weight goes first, and equal weights fall back to comparing text.
func (ss Segments) Less(i, j int) bool {
	a, b := ss[i], ss[j]
	if a.weight != b.weight {
		return a.weight < b.weight
	}
	return a.text < b.text
}

// Swap exchanges the segments at indexes i and j.
func (ss Segments) Swap(i, j int) {
	ss[j], ss[i] = ss[i], ss[j]
}
// TagExtracter extracts weighted keywords from a sentence.
//
// All three fields are populated by the Load* methods. ExtractTags uses each
// of them, so a dictionary and an IDF table must be loaded before calling it.
type TagExtracter struct {
// seg cuts a sentence into words; set by LoadDictionary and LoadDictionaryAt.
seg *jieba.Segmenter
// idf supplies the per-word value and the median fallback that ExtractTags
// multiplies into each weight; set by LoadIdf and LoadIdfAt.
idf *Idf
// stopWord decides which words ExtractTags skips; set by LoadStopWords and
// LoadStopWordsAt, and reset to NewStopWord() by LoadDictionary and
// LoadDictionaryAt.
stopWord *StopWord
}
// LoadDictionary builds the extracter's segmenter from the dictionary data
// read from file, using jieba.LoadDictionary.
//
// The stop-word set is replaced with a fresh NewStopWord() first, so any stop
// words loaded earlier are discarded. The segmenter field is overwritten with
// whatever jieba.LoadDictionary returns, including when it reports an error.
func (t *TagExtracter) LoadDictionary(file io.Reader) (err error) {
	t.stopWord = NewStopWord()

	seg, err := jieba.LoadDictionary(file)
	t.seg = seg
	return err
}
// LoadDictionaryAt builds the extracter's segmenter from the dictionary at
// the path file, using jieba.LoadDictionaryAt.
//
// The stop-word set is replaced with a fresh NewStopWord() first, so any stop
// words loaded earlier are discarded. The segmenter field is overwritten with
// whatever jieba.LoadDictionaryAt returns, including when it reports an error.
func (t *TagExtracter) LoadDictionaryAt(file string) (err error) {
	t.stopWord = NewStopWord()

	seg, err := jieba.LoadDictionaryAt(file)
	t.seg = seg
	return err
}
// LoadIdf replaces the extracter's IDF table with a fresh one created by
// NewIdf and populates it from file via its loadDictionary method.
//
// The new table is installed before loading starts, so it stays in place even
// when loading returns an error.
func (t *TagExtracter) LoadIdf(file io.Reader) error {
	table := NewIdf()
	t.idf = table
	return table.loadDictionary(file)
}
// LoadIdfAt replaces the extracter's IDF table with a fresh one created by
// NewIdf and populates it from the path fileName via its loadDictionaryAt
// method.
//
// The new table is installed before loading starts, so it stays in place even
// when loading returns an error.
func (t *TagExtracter) LoadIdfAt(fileName string) error {
	table := NewIdf()
	t.idf = table
	return table.loadDictionaryAt(fileName)
}
// LoadStopWords replaces the extracter's stop-word set with a fresh one
// created by NewStopWord and populates it from file via its loadDictionary
// method.
//
// The new set is installed before loading starts, so it stays in place even
// when loading returns an error.
func (t *TagExtracter) LoadStopWords(file io.Reader) error {
	words := NewStopWord()
	t.stopWord = words
	return words.loadDictionary(file)
}
// LoadStopWordsAt replaces the extracter's stop-word set with a fresh one
// created by NewStopWord and populates it from the path file via its
// loadDictionaryAt method.
//
// The new set is installed before loading starts, so it stays in place even
// when loading returns an error.
func (t *TagExtracter) LoadStopWordsAt(file string) error {
	words := NewStopWord()
	t.stopWord = words
	return words.loadDictionaryAt(file)
}
// ExtractTags extracts at most topK keywords from sentence, highest weight
// first.
//
// The sentence is cut into words with t.seg.Cut(sentence, true). Each word is
// trimmed of surrounding whitespace; words shorter than two runes and words
// for which t.stopWord.IsStopWord reports true are dropped. Every remaining
// distinct word receives the weight
//
//	value * count / total
//
// where count is the number of times the word was kept, total is the number
// of kept words overall, and value is the result of t.idf.Frequency(word), or
// t.idf.median when Frequency has no entry for the word.
//
// The result is sorted in the reverse of the order defined by Segments.Less
// and then truncated to topK entries. A topK of zero or less yields an empty
// result; a negative topK used to panic with a slice-bounds error.
//
// The segmenter, IDF table and stop-word set must have been loaded beforehand.
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
	// Count the kept words and their total in a single pass.
	freqMap := make(map[string]uint64, 256)
	total := uint64(0)
	for _, w := range t.seg.Cut(sentence, true) {
		w = strings.TrimSpace(w)
		if utf8.RuneCountInString(w) < 2 || t.stopWord.IsStopWord(w) {
			continue
		}
		freqMap[w]++
		total++
	}

	ws := make(Segments, len(freqMap))
	i := 0
	for k, v := range freqMap {
		value, ok := t.idf.Frequency(k)
		if !ok {
			// Words missing from the table fall back to the table's median.
			value = t.idf.median
		}
		ws[i].text = k
		ws[i].weight = value * float64(v) / float64(total)
		i++
	}

	// Map iteration order is random; sorting makes the output deterministic.
	sort.Sort(sort.Reverse(ws))

	// Clamp so that a negative topK cannot reach the slice expression below.
	if topK < 0 {
		topK = 0
	}
	if len(ws) > topK {
		ws = ws[:topK]
	}
	return ws
}