mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 00:32:51 +08:00
134 lines
3.0 KiB
Go
Executable File
134 lines
3.0 KiB
Go
Executable File
// Package analyse is the Golang implementation of Jieba's analyse module.
|
|
package analyse
|
|
|
|
import (
|
|
"io"
|
|
"sort"
|
|
"strings"
|
|
"unicode/utf8"
|
|
|
|
jieba "github.com/fumiama/jieba"
|
|
)
|
|
|
|
// Segment represents a word with weight.
|
|
type Segment struct {
|
|
text string
|
|
weight float64
|
|
}
|
|
|
|
// Text returns the segment's text.
|
|
func (s Segment) Text() string {
|
|
return s.text
|
|
}
|
|
|
|
// Weight returns the segment's weight.
|
|
func (s Segment) Weight() float64 {
|
|
return s.weight
|
|
}
|
|
|
|
// Segments represents a slice of Segment.
|
|
type Segments []Segment
|
|
|
|
func (ss Segments) Len() int {
|
|
return len(ss)
|
|
}
|
|
|
|
func (ss Segments) Less(i, j int) bool {
|
|
if ss[i].weight == ss[j].weight {
|
|
return ss[i].text < ss[j].text
|
|
}
|
|
|
|
return ss[i].weight < ss[j].weight
|
|
}
|
|
|
|
func (ss Segments) Swap(i, j int) {
|
|
ss[i], ss[j] = ss[j], ss[i]
|
|
}
|
|
|
|
// TagExtracter is used to extract tags from sentence.
|
|
type TagExtracter struct {
|
|
seg *jieba.Segmenter
|
|
idf *Idf
|
|
stopWord *StopWord
|
|
}
|
|
|
|
// LoadDictionary reads the given filename and create a new dictionary.
|
|
func (t *TagExtracter) LoadDictionary(file io.Reader) (err error) {
|
|
t.stopWord = NewStopWord()
|
|
t.seg, err = jieba.LoadDictionary(file)
|
|
return
|
|
}
|
|
|
|
// LoadDictionaryAt reads the given filename and create a new dictionary.
|
|
func (t *TagExtracter) LoadDictionaryAt(file string) (err error) {
|
|
t.stopWord = NewStopWord()
|
|
t.seg, err = jieba.LoadDictionaryAt(file)
|
|
return
|
|
}
|
|
|
|
// LoadIdf reads the given file and create a new Idf dictionary.
|
|
func (t *TagExtracter) LoadIdf(file io.Reader) error {
|
|
t.idf = NewIdf()
|
|
return t.idf.loadDictionary(file)
|
|
}
|
|
|
|
// LoadIdfAt reads the given file and create a new Idf dictionary.
|
|
func (t *TagExtracter) LoadIdfAt(fileName string) error {
|
|
t.idf = NewIdf()
|
|
return t.idf.loadDictionaryAt(fileName)
|
|
}
|
|
|
|
// LoadStopWords reads the given file and create a new StopWord dictionary.
|
|
func (t *TagExtracter) LoadStopWords(file io.Reader) error {
|
|
t.stopWord = NewStopWord()
|
|
return t.stopWord.loadDictionary(file)
|
|
}
|
|
|
|
// LoadStopWordsAt reads the given file and create a new StopWord dictionary.
|
|
func (t *TagExtracter) LoadStopWordsAt(file string) error {
|
|
t.stopWord = NewStopWord()
|
|
return t.stopWord.loadDictionaryAt(file)
|
|
}
|
|
|
|
// ExtractTags extracts the topK key words from sentence.
|
|
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
|
|
freqMap := make(map[string]uint64, 256)
|
|
|
|
for _, w := range t.seg.Cut(sentence, true) {
|
|
w = strings.TrimSpace(w)
|
|
if utf8.RuneCountInString(w) < 2 {
|
|
continue
|
|
}
|
|
if t.stopWord.IsStopWord(w) {
|
|
continue
|
|
}
|
|
if v, ok := freqMap[w]; ok {
|
|
freqMap[w] = v + 1
|
|
} else {
|
|
freqMap[w] = 1
|
|
}
|
|
}
|
|
total := uint64(0)
|
|
for _, freq := range freqMap {
|
|
total += freq
|
|
}
|
|
ws := make(Segments, len(freqMap))
|
|
i := 0
|
|
for k, v := range freqMap {
|
|
ws[i].text = k
|
|
if freq, ok := t.idf.Frequency(k); ok {
|
|
ws[i].weight = freq * float64(v) / float64(total)
|
|
} else {
|
|
ws[i].weight = t.idf.median * float64(v) / float64(total)
|
|
}
|
|
i++
|
|
}
|
|
sort.Sort(sort.Reverse(ws))
|
|
if len(ws) > topK {
|
|
tags = ws[:topK]
|
|
} else {
|
|
tags = ws
|
|
}
|
|
return tags
|
|
}
|