1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00

优化 tag_extracter

This commit is contained in:
源文雨
2022-11-30 13:35:21 +08:00
parent ae85ccb20a
commit d487545eb5
14 changed files with 60 additions and 62 deletions

View File

@@ -6,7 +6,7 @@ import (
"strings"
"unicode/utf8"
jiebago "github.com/fumiama/jieba"
jieba "github.com/fumiama/jieba"
)
// Segment represents a word with weight.
@@ -26,7 +26,7 @@ func (s Segment) Weight() float64 {
}
// Segments represents a slice of Segment.
type Segments []*Segment
type Segments []Segment
func (ss Segments) Len() int {
return len(ss)
@@ -46,7 +46,7 @@ func (ss Segments) Swap(i, j int) {
// TagExtracter is used to extract tags from a sentence.
type TagExtracter struct {
seg *jiebago.Segmenter
seg *jieba.Segmenter
idf *Idf
stopWord *StopWord
}
@@ -54,7 +54,7 @@ type TagExtracter struct {
// LoadDictionary reads the given file and creates a new dictionary.
func (t *TagExtracter) LoadDictionary(fileName string) error {
t.stopWord = NewStopWord()
t.seg = new(jiebago.Segmenter)
t.seg = new(jieba.Segmenter)
return t.seg.LoadDictionary(fileName)
}
@@ -72,7 +72,7 @@ func (t *TagExtracter) LoadStopWords(fileName string) error {
// ExtractTags extracts the topK keywords from the sentence.
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
freqMap := make(map[string]float64)
freqMap := make(map[string]uint64, 256)
for _, w := range t.seg.Cut(sentence, true) {
w = strings.TrimSpace(w)
@@ -82,28 +82,25 @@ func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
if t.stopWord.IsStopWord(w) {
continue
}
if f, ok := freqMap[w]; ok {
freqMap[w] = f + 1.0
if v, ok := freqMap[w]; ok {
freqMap[w] = v + 1
} else {
freqMap[w] = 1.0
freqMap[w] = 1
}
}
total := 0.0
total := uint64(0)
for _, freq := range freqMap {
total += freq
}
for k, v := range freqMap {
freqMap[k] = v / total
}
ws := make(Segments, 0)
var s Segment
for k, v := range freqMap {
if freq, ok := t.idf.Frequency(k); ok {
s = Segment{text: k, weight: freq * v}
s = Segment{text: k, weight: freq * float64(v) / float64(total)}
} else {
s = Segment{text: k, weight: t.idf.median * v}
s = Segment{text: k, weight: t.idf.median * float64(v) / float64(total)}
}
ws = append(ws, &s)
ws = append(ws, s)
}
sort.Sort(sort.Reverse(ws))
if len(ws) > topK {

View File

@@ -228,29 +228,29 @@ var (
雖然沒有藉口
`
LyciWeight = Segments{
&Segment{text: "所謂", weight: 1.010262},
&Segment{text: "是否", weight: 0.738650},
&Segment{text: "一般", weight: 0.607600},
&Segment{text: "雖然", weight: 0.336754},
&Segment{text: "退縮", weight: 0.336754},
&Segment{text: "肌迫", weight: 0.336754},
&Segment{text: "矯作", weight: 0.336754},
&Segment{text: "沒有", weight: 0.336754},
&Segment{text: "怯懦", weight: 0.271099},
&Segment{text: "隨便", weight: 0.168377},
Segment{text: "所謂", weight: 1.010262},
Segment{text: "是否", weight: 0.738650},
Segment{text: "一般", weight: 0.607600},
Segment{text: "雖然", weight: 0.336754},
Segment{text: "退縮", weight: 0.336754},
Segment{text: "肌迫", weight: 0.336754},
Segment{text: "矯作", weight: 0.336754},
Segment{text: "沒有", weight: 0.336754},
Segment{text: "怯懦", weight: 0.271099},
Segment{text: "隨便", weight: 0.168377},
}
LyciWeight2 = Segments{
&Segment{text: "所謂", weight: 1.215739},
&Segment{text: "一般", weight: 0.731179},
&Segment{text: "雖然", weight: 0.405246},
&Segment{text: "退縮", weight: 0.405246},
&Segment{text: "肌迫", weight: 0.405246},
&Segment{text: "矯作", weight: 0.405246},
&Segment{text: "怯懦", weight: 0.326238},
&Segment{text: "逼不得已", weight: 0.202623},
&Segment{text: "右銘", weight: 0.202623},
&Segment{text: "寬闊", weight: 0.202623},
Segment{text: "所謂", weight: 1.215739},
Segment{text: "一般", weight: 0.731179},
Segment{text: "雖然", weight: 0.405246},
Segment{text: "退縮", weight: 0.405246},
Segment{text: "肌迫", weight: 0.405246},
Segment{text: "矯作", weight: 0.405246},
Segment{text: "怯懦", weight: 0.326238},
Segment{text: "逼不得已", weight: 0.202623},
Segment{text: "右銘", weight: 0.202623},
Segment{text: "寬闊", weight: 0.202623},
}
)

View File

@@ -106,7 +106,8 @@ func (u *undirectWeightedGraph) rank() Segments {
result := make(Segments, len(ws))
i := 0
for n, w := range ws {
result[i] = &Segment{text: n, weight: (w - minRank/10.0) / (maxRank - minRank/10.0)}
result[i].text = n
result[i].weight = (w - minRank/10.0) / (maxRank - minRank/10.0)
i++
}
sort.Sort(sort.Reverse(result))

View File

@@ -9,16 +9,16 @@ var (
sentence = "此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元增资后吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年实现营业收入0万元实现净利润-139.13万元。"
tagRanks = Segments{
&Segment{text: "吉林", weight: 1.0},
&Segment{text: "欧亚", weight: 0.87807810644},
&Segment{text: "置业", weight: 0.562048250306},
&Segment{text: "实现", weight: 0.520905743929},
&Segment{text: "收入", weight: 0.384283870648},
&Segment{text: "增资", weight: 0.360590945312},
&Segment{text: "子公司", weight: 0.353131980904},
&Segment{text: "城市", weight: 0.307509449283},
&Segment{text: "全资", weight: 0.306324426665},
&Segment{text: "商业", weight: 0.306138241063},
Segment{text: "吉林", weight: 1.0},
Segment{text: "欧亚", weight: 0.87807810644},
Segment{text: "置业", weight: 0.562048250306},
Segment{text: "实现", weight: 0.520905743929},
Segment{text: "收入", weight: 0.384283870648},
Segment{text: "增资", weight: 0.360590945312},
Segment{text: "子公司", weight: 0.353131980904},
Segment{text: "城市", weight: 0.307509449283},
Segment{text: "全资", weight: 0.306324426665},
Segment{text: "商业", weight: 0.306138241063},
}
)