mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 00:32:51 +08:00
优化 tag_extracker
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
#结巴分词 Go 语言版:Jiebago
|
||||
# 结巴分词 Go 语言版:jieba
|
||||
|
||||
|
||||
[](https://travis-ci.org/wangbin/jiebago) [](https://godoc.org/github.com/fumiama/jieba)
|
||||
[](https://travis-ci.org/wangbin/jieba) [](https://godoc.org/github.com/fumiama/jieba)
|
||||
|
||||
[结巴分词](https://github.com/fxsjy/jieba) 是由 [@fxsjy](https://github.com/fxsjy) 使用 Python 编写的中文分词组件,Jiebago 是结巴分词的 Golang 语言实现。
|
||||
|
||||
@@ -23,7 +23,7 @@ import (
|
||||
"github.com/fumiama/jieba"
|
||||
)
|
||||
|
||||
var seg jiebago.Segmenter
|
||||
var seg jieba.Segmenter
|
||||
|
||||
func init() {
|
||||
seg.LoadDictionary("dict.txt")
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
jiebago "github.com/fumiama/jieba"
|
||||
jieba "github.com/fumiama/jieba"
|
||||
)
|
||||
|
||||
// Segment represents a word with weight.
|
||||
@@ -26,7 +26,7 @@ func (s Segment) Weight() float64 {
|
||||
}
|
||||
|
||||
// Segments represents a slice of Segment.
|
||||
type Segments []*Segment
|
||||
type Segments []Segment
|
||||
|
||||
func (ss Segments) Len() int {
|
||||
return len(ss)
|
||||
@@ -46,7 +46,7 @@ func (ss Segments) Swap(i, j int) {
|
||||
|
||||
// TagExtracter is used to extract tags from sentence.
|
||||
type TagExtracter struct {
|
||||
seg *jiebago.Segmenter
|
||||
seg *jieba.Segmenter
|
||||
idf *Idf
|
||||
stopWord *StopWord
|
||||
}
|
||||
@@ -54,7 +54,7 @@ type TagExtracter struct {
|
||||
// LoadDictionary reads the given filename and create a new dictionary.
|
||||
func (t *TagExtracter) LoadDictionary(fileName string) error {
|
||||
t.stopWord = NewStopWord()
|
||||
t.seg = new(jiebago.Segmenter)
|
||||
t.seg = new(jieba.Segmenter)
|
||||
return t.seg.LoadDictionary(fileName)
|
||||
}
|
||||
|
||||
@@ -72,7 +72,7 @@ func (t *TagExtracter) LoadStopWords(fileName string) error {
|
||||
|
||||
// ExtractTags extracts the topK key words from sentence.
|
||||
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
|
||||
freqMap := make(map[string]float64)
|
||||
freqMap := make(map[string]uint64, 256)
|
||||
|
||||
for _, w := range t.seg.Cut(sentence, true) {
|
||||
w = strings.TrimSpace(w)
|
||||
@@ -82,28 +82,25 @@ func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
|
||||
if t.stopWord.IsStopWord(w) {
|
||||
continue
|
||||
}
|
||||
if f, ok := freqMap[w]; ok {
|
||||
freqMap[w] = f + 1.0
|
||||
if v, ok := freqMap[w]; ok {
|
||||
freqMap[w] = v + 1
|
||||
} else {
|
||||
freqMap[w] = 1.0
|
||||
freqMap[w] = 1
|
||||
}
|
||||
}
|
||||
total := 0.0
|
||||
total := uint64(0)
|
||||
for _, freq := range freqMap {
|
||||
total += freq
|
||||
}
|
||||
for k, v := range freqMap {
|
||||
freqMap[k] = v / total
|
||||
}
|
||||
ws := make(Segments, 0)
|
||||
var s Segment
|
||||
for k, v := range freqMap {
|
||||
if freq, ok := t.idf.Frequency(k); ok {
|
||||
s = Segment{text: k, weight: freq * v}
|
||||
s = Segment{text: k, weight: freq * float64(v) / float64(total)}
|
||||
} else {
|
||||
s = Segment{text: k, weight: t.idf.median * v}
|
||||
s = Segment{text: k, weight: t.idf.median * float64(v) / float64(total)}
|
||||
}
|
||||
ws = append(ws, &s)
|
||||
ws = append(ws, s)
|
||||
}
|
||||
sort.Sort(sort.Reverse(ws))
|
||||
if len(ws) > topK {
|
||||
|
||||
@@ -228,29 +228,29 @@ var (
|
||||
雖然沒有藉口
|
||||
`
|
||||
LyciWeight = Segments{
|
||||
&Segment{text: "所謂", weight: 1.010262},
|
||||
&Segment{text: "是否", weight: 0.738650},
|
||||
&Segment{text: "一般", weight: 0.607600},
|
||||
&Segment{text: "雖然", weight: 0.336754},
|
||||
&Segment{text: "退縮", weight: 0.336754},
|
||||
&Segment{text: "肌迫", weight: 0.336754},
|
||||
&Segment{text: "矯作", weight: 0.336754},
|
||||
&Segment{text: "沒有", weight: 0.336754},
|
||||
&Segment{text: "怯懦", weight: 0.271099},
|
||||
&Segment{text: "隨便", weight: 0.168377},
|
||||
Segment{text: "所謂", weight: 1.010262},
|
||||
Segment{text: "是否", weight: 0.738650},
|
||||
Segment{text: "一般", weight: 0.607600},
|
||||
Segment{text: "雖然", weight: 0.336754},
|
||||
Segment{text: "退縮", weight: 0.336754},
|
||||
Segment{text: "肌迫", weight: 0.336754},
|
||||
Segment{text: "矯作", weight: 0.336754},
|
||||
Segment{text: "沒有", weight: 0.336754},
|
||||
Segment{text: "怯懦", weight: 0.271099},
|
||||
Segment{text: "隨便", weight: 0.168377},
|
||||
}
|
||||
|
||||
LyciWeight2 = Segments{
|
||||
&Segment{text: "所謂", weight: 1.215739},
|
||||
&Segment{text: "一般", weight: 0.731179},
|
||||
&Segment{text: "雖然", weight: 0.405246},
|
||||
&Segment{text: "退縮", weight: 0.405246},
|
||||
&Segment{text: "肌迫", weight: 0.405246},
|
||||
&Segment{text: "矯作", weight: 0.405246},
|
||||
&Segment{text: "怯懦", weight: 0.326238},
|
||||
&Segment{text: "逼不得已", weight: 0.202623},
|
||||
&Segment{text: "右銘", weight: 0.202623},
|
||||
&Segment{text: "寬闊", weight: 0.202623},
|
||||
Segment{text: "所謂", weight: 1.215739},
|
||||
Segment{text: "一般", weight: 0.731179},
|
||||
Segment{text: "雖然", weight: 0.405246},
|
||||
Segment{text: "退縮", weight: 0.405246},
|
||||
Segment{text: "肌迫", weight: 0.405246},
|
||||
Segment{text: "矯作", weight: 0.405246},
|
||||
Segment{text: "怯懦", weight: 0.326238},
|
||||
Segment{text: "逼不得已", weight: 0.202623},
|
||||
Segment{text: "右銘", weight: 0.202623},
|
||||
Segment{text: "寬闊", weight: 0.202623},
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -106,7 +106,8 @@ func (u *undirectWeightedGraph) rank() Segments {
|
||||
result := make(Segments, len(ws))
|
||||
i := 0
|
||||
for n, w := range ws {
|
||||
result[i] = &Segment{text: n, weight: (w - minRank/10.0) / (maxRank - minRank/10.0)}
|
||||
result[i].text = n
|
||||
result[i].weight = (w - minRank/10.0) / (maxRank - minRank/10.0)
|
||||
i++
|
||||
}
|
||||
sort.Sort(sort.Reverse(result))
|
||||
|
||||
@@ -9,16 +9,16 @@ var (
|
||||
sentence = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
|
||||
|
||||
tagRanks = Segments{
|
||||
&Segment{text: "吉林", weight: 1.0},
|
||||
&Segment{text: "欧亚", weight: 0.87807810644},
|
||||
&Segment{text: "置业", weight: 0.562048250306},
|
||||
&Segment{text: "实现", weight: 0.520905743929},
|
||||
&Segment{text: "收入", weight: 0.384283870648},
|
||||
&Segment{text: "增资", weight: 0.360590945312},
|
||||
&Segment{text: "子公司", weight: 0.353131980904},
|
||||
&Segment{text: "城市", weight: 0.307509449283},
|
||||
&Segment{text: "全资", weight: 0.306324426665},
|
||||
&Segment{text: "商业", weight: 0.306138241063},
|
||||
Segment{text: "吉林", weight: 1.0},
|
||||
Segment{text: "欧亚", weight: 0.87807810644},
|
||||
Segment{text: "置业", weight: 0.562048250306},
|
||||
Segment{text: "实现", weight: 0.520905743929},
|
||||
Segment{text: "收入", weight: 0.384283870648},
|
||||
Segment{text: "增资", weight: 0.360590945312},
|
||||
Segment{text: "子公司", weight: 0.353131980904},
|
||||
Segment{text: "城市", weight: 0.307509449283},
|
||||
Segment{text: "全资", weight: 0.306324426665},
|
||||
Segment{text: "商业", weight: 0.306138241063},
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package jiebago
|
||||
package jieba
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// Package dictionary contains an interface and wraps all io related work.
|
||||
// It is used by jiebago module to read/write files.
|
||||
// It is used by jieba module to read/write files.
|
||||
package dictionary
|
||||
|
||||
import (
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package jiebago
|
||||
package jieba
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package jiebago
|
||||
package jieba
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
@@ -37,7 +37,7 @@ func cutHan(sentence string) []string {
|
||||
}
|
||||
|
||||
// Cut cuts sentence into words using Hidden Markov Model with Viterbi
|
||||
// algorithm. It is used by Jiebago for unknonw words.
|
||||
// algorithm. It is used by jieba for unknown words.
|
||||
func Cut(sentence string) []string {
|
||||
result := make([]string, 0, 10)
|
||||
s := sentence
|
||||
|
||||
4
jieba.go
4
jieba.go
@@ -1,5 +1,5 @@
|
||||
// Package jiebago is the Golang implemention of [Jieba](https://github.com/fxsjy/jieba), Python Chinese text segmentation module.
|
||||
package jiebago
|
||||
// Package jieba is the Golang implementation of [Jieba](https://github.com/fxsjy/jieba), Python Chinese text segmentation module.
|
||||
package jieba
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package jiebago
|
||||
package jieba
|
||||
|
||||
import "testing"
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ import (
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
jiebago "github.com/fumiama/jieba"
|
||||
jieba "github.com/fumiama/jieba"
|
||||
)
|
||||
|
||||
// Name is the jieba tokenizer name.
|
||||
@@ -15,9 +15,9 @@ const Name = "jieba"
|
||||
|
||||
var ideographRegexp = regexp.MustCompile(`\p{Han}+`)
|
||||
|
||||
// JiebaTokenizer is the beleve tokenizer for jiebago.
|
||||
// JiebaTokenizer is the bleve tokenizer for jieba.
|
||||
type JiebaTokenizer struct {
|
||||
seg jiebago.Segmenter
|
||||
seg jieba.Segmenter
|
||||
hmm, searchMode bool
|
||||
}
|
||||
|
||||
@@ -42,7 +42,7 @@ Parameters:
|
||||
this word into "交换", "换机", which are valid Chinese words.
|
||||
*/
|
||||
func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||
var seg jiebago.Segmenter
|
||||
var seg jieba.Segmenter
|
||||
err := seg.LoadDictionary(dictFilePath)
|
||||
return &JiebaTokenizer{
|
||||
seg: seg,
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// Package util contains some util functions used by jiebago.
|
||||
// Package util contains some util functions used by jieba.
|
||||
package util
|
||||
|
||||
import "regexp"
|
||||
|
||||
Reference in New Issue
Block a user