From d487545eb53ba6a8742526c7343e515af34b8d89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=BA=90=E6=96=87=E9=9B=A8?= <41315874+fumiama@users.noreply.github.com> Date: Wed, 30 Nov 2022 13:35:21 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=20tag=5Fextracker?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 6 +++--- analyse/tag_extracker.go | 27 +++++++++++------------ analyse/tag_extracker_test.go | 40 +++++++++++++++++------------------ analyse/textrank.go | 3 ++- analyse/textrank_test.go | 20 +++++++++--------- dictionary.go | 2 +- dictionary/dictionary.go | 2 +- example_parallel_cut_test.go | 2 +- example_test.go | 2 +- finalseg/finalseg.go | 2 +- jieba.go | 4 ++-- jieba_test.go | 2 +- tokenizers/tokenizer.go | 8 +++---- util/util.go | 2 +- 14 files changed, 60 insertions(+), 62 deletions(-) diff --git a/README.md b/README.md index f7ea0b0..7f1679c 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ -#结巴分词 Go 语言版:Jiebago +#结巴分词 Go 语言版:jieba -[![Build Status](https://travis-ci.org/wangbin/jiebago.png?branch=master)](https://travis-ci.org/wangbin/jiebago) [![GoDoc](https://godoc.org/github.com/fumiama/jieba?status.svg)](https://godoc.org/github.com/fumiama/jieba) +[![Build Status](https://travis-ci.org/wangbin/jieba.png?branch=master)](https://travis-ci.org/wangbin/jieba) [![GoDoc](https://godoc.org/github.com/fumiama/jieba?status.svg)](https://godoc.org/github.com/fumiama/jieba) [结巴分词](https://github.com/fxsjy/jieba) 是由 [@fxsjy](https://github.com/fxsjy) 使用 Python 编写的中文分词组件,Iiebago 是结巴分词的 Golang 语言实现。 @@ -23,7 +23,7 @@ import ( "github.com/fumiama/jieba" ) -var seg jiebago.Segmenter +var seg jieba.Segmenter func init() { seg.LoadDictionary("dict.txt") diff --git a/analyse/tag_extracker.go b/analyse/tag_extracker.go index f6b2948..0a8dde5 100755 --- a/analyse/tag_extracker.go +++ b/analyse/tag_extracker.go @@ -6,7 +6,7 @@ import ( "strings" "unicode/utf8" - jiebago "github.com/fumiama/jieba" + jieba 
"github.com/fumiama/jieba" ) // Segment represents a word with weight. @@ -26,7 +26,7 @@ func (s Segment) Weight() float64 { } // Segments represents a slice of Segment. -type Segments []*Segment +type Segments []Segment func (ss Segments) Len() int { return len(ss) @@ -46,7 +46,7 @@ func (ss Segments) Swap(i, j int) { // TagExtracter is used to extract tags from sentence. type TagExtracter struct { - seg *jiebago.Segmenter + seg *jieba.Segmenter idf *Idf stopWord *StopWord } @@ -54,7 +54,7 @@ type TagExtracter struct { // LoadDictionary reads the given filename and create a new dictionary. func (t *TagExtracter) LoadDictionary(fileName string) error { t.stopWord = NewStopWord() - t.seg = new(jiebago.Segmenter) + t.seg = new(jieba.Segmenter) return t.seg.LoadDictionary(fileName) } @@ -72,7 +72,7 @@ func (t *TagExtracter) LoadStopWords(fileName string) error { // ExtractTags extracts the topK key words from sentence. func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) { - freqMap := make(map[string]float64) + freqMap := make(map[string]uint64, 256) for _, w := range t.seg.Cut(sentence, true) { w = strings.TrimSpace(w) @@ -82,28 +82,25 @@ func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) { if t.stopWord.IsStopWord(w) { continue } - if f, ok := freqMap[w]; ok { - freqMap[w] = f + 1.0 + if v, ok := freqMap[w]; ok { + freqMap[w] = v + 1 } else { - freqMap[w] = 1.0 + freqMap[w] = 1 } } - total := 0.0 + total := uint64(0) for _, freq := range freqMap { total += freq } - for k, v := range freqMap { - freqMap[k] = v / total - } ws := make(Segments, 0) var s Segment for k, v := range freqMap { if freq, ok := t.idf.Frequency(k); ok { - s = Segment{text: k, weight: freq * v} + s = Segment{text: k, weight: freq * float64(v) / float64(total)} } else { - s = Segment{text: k, weight: t.idf.median * v} + s = Segment{text: k, weight: t.idf.median * float64(v) / float64(total)} } - ws = append(ws, &s) + ws = append(ws, s) } 
sort.Sort(sort.Reverse(ws)) if len(ws) > topK { diff --git a/analyse/tag_extracker_test.go b/analyse/tag_extracker_test.go index 809c1cb..24f80f0 100755 --- a/analyse/tag_extracker_test.go +++ b/analyse/tag_extracker_test.go @@ -228,29 +228,29 @@ var ( 雖然沒有藉口 ` LyciWeight = Segments{ - &Segment{text: "所謂", weight: 1.010262}, - &Segment{text: "是否", weight: 0.738650}, - &Segment{text: "一般", weight: 0.607600}, - &Segment{text: "雖然", weight: 0.336754}, - &Segment{text: "退縮", weight: 0.336754}, - &Segment{text: "肌迫", weight: 0.336754}, - &Segment{text: "矯作", weight: 0.336754}, - &Segment{text: "沒有", weight: 0.336754}, - &Segment{text: "怯懦", weight: 0.271099}, - &Segment{text: "隨便", weight: 0.168377}, + Segment{text: "所謂", weight: 1.010262}, + Segment{text: "是否", weight: 0.738650}, + Segment{text: "一般", weight: 0.607600}, + Segment{text: "雖然", weight: 0.336754}, + Segment{text: "退縮", weight: 0.336754}, + Segment{text: "肌迫", weight: 0.336754}, + Segment{text: "矯作", weight: 0.336754}, + Segment{text: "沒有", weight: 0.336754}, + Segment{text: "怯懦", weight: 0.271099}, + Segment{text: "隨便", weight: 0.168377}, } LyciWeight2 = Segments{ - &Segment{text: "所謂", weight: 1.215739}, - &Segment{text: "一般", weight: 0.731179}, - &Segment{text: "雖然", weight: 0.405246}, - &Segment{text: "退縮", weight: 0.405246}, - &Segment{text: "肌迫", weight: 0.405246}, - &Segment{text: "矯作", weight: 0.405246}, - &Segment{text: "怯懦", weight: 0.326238}, - &Segment{text: "逼不得已", weight: 0.202623}, - &Segment{text: "右銘", weight: 0.202623}, - &Segment{text: "寬闊", weight: 0.202623}, + Segment{text: "所謂", weight: 1.215739}, + Segment{text: "一般", weight: 0.731179}, + Segment{text: "雖然", weight: 0.405246}, + Segment{text: "退縮", weight: 0.405246}, + Segment{text: "肌迫", weight: 0.405246}, + Segment{text: "矯作", weight: 0.405246}, + Segment{text: "怯懦", weight: 0.326238}, + Segment{text: "逼不得已", weight: 0.202623}, + Segment{text: "右銘", weight: 0.202623}, + Segment{text: "寬闊", weight: 0.202623}, } ) diff --git 
a/analyse/textrank.go b/analyse/textrank.go index 06c4839..3db0cdf 100755 --- a/analyse/textrank.go +++ b/analyse/textrank.go @@ -106,7 +106,8 @@ func (u *undirectWeightedGraph) rank() Segments { result := make(Segments, len(ws)) i := 0 for n, w := range ws { - result[i] = &Segment{text: n, weight: (w - minRank/10.0) / (maxRank - minRank/10.0)} + result[i].text = n + result[i].weight = (w - minRank/10.0) / (maxRank - minRank/10.0) i++ } sort.Sort(sort.Reverse(result)) diff --git a/analyse/textrank_test.go b/analyse/textrank_test.go index 5aa4c17..94176d1 100755 --- a/analyse/textrank_test.go +++ b/analyse/textrank_test.go @@ -9,16 +9,16 @@ var ( sentence = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。" tagRanks = Segments{ - &Segment{text: "吉林", weight: 1.0}, - &Segment{text: "欧亚", weight: 0.87807810644}, - &Segment{text: "置业", weight: 0.562048250306}, - &Segment{text: "实现", weight: 0.520905743929}, - &Segment{text: "收入", weight: 0.384283870648}, - &Segment{text: "增资", weight: 0.360590945312}, - &Segment{text: "子公司", weight: 0.353131980904}, - &Segment{text: "城市", weight: 0.307509449283}, - &Segment{text: "全资", weight: 0.306324426665}, - &Segment{text: "商业", weight: 0.306138241063}, + Segment{text: "吉林", weight: 1.0}, + Segment{text: "欧亚", weight: 0.87807810644}, + Segment{text: "置业", weight: 0.562048250306}, + Segment{text: "实现", weight: 0.520905743929}, + Segment{text: "收入", weight: 0.384283870648}, + Segment{text: "增资", weight: 0.360590945312}, + Segment{text: "子公司", weight: 0.353131980904}, + Segment{text: "城市", weight: 0.307509449283}, + Segment{text: "全资", weight: 0.306324426665}, + Segment{text: "商业", weight: 0.306138241063}, } ) diff --git a/dictionary.go b/dictionary.go index 7350e1a..715d9b4 100755 --- a/dictionary.go +++ b/dictionary.go @@ -1,4 +1,4 @@ -package jiebago +package jieba import ( "math" diff --git a/dictionary/dictionary.go b/dictionary/dictionary.go 
index d216415..6ddae2c 100755 --- a/dictionary/dictionary.go +++ b/dictionary/dictionary.go @@ -1,5 +1,5 @@ // Package dictionary contains a interface and wraps all io related work. -// It is used by jiebago module to read/write files. +// It is used by jieba module to read/write files. package dictionary import ( diff --git a/example_parallel_cut_test.go b/example_parallel_cut_test.go index 8e9f1a0..373815c 100755 --- a/example_parallel_cut_test.go +++ b/example_parallel_cut_test.go @@ -1,4 +1,4 @@ -package jiebago +package jieba import ( "bufio" diff --git a/example_test.go b/example_test.go index 174d327..7332f88 100755 --- a/example_test.go +++ b/example_test.go @@ -1,4 +1,4 @@ -package jiebago +package jieba import ( "fmt" diff --git a/finalseg/finalseg.go b/finalseg/finalseg.go index ae6679c..566070b 100755 --- a/finalseg/finalseg.go +++ b/finalseg/finalseg.go @@ -37,7 +37,7 @@ func cutHan(sentence string) []string { } // Cut cuts sentence into words using Hidden Markov Model with Viterbi -// algorithm. It is used by Jiebago for unknonw words. +// algorithm. It is used by jieba for unknown words. func Cut(sentence string) []string { result := make([]string, 0, 10) s := sentence diff --git a/jieba.go b/jieba.go index d5de40e..f1484c3 100755 --- a/jieba.go +++ b/jieba.go @@ -1,5 +1,5 @@ -// Package jiebago is the Golang implemention of [Jieba](https://github.com/fxsjy/jieba), Python Chinese text segmentation module. -package jiebago +// Package jieba is the Golang implementation of [Jieba](https://github.com/fxsjy/jieba), Python Chinese text segmentation module. 
+package jieba import ( "math" diff --git a/jieba_test.go b/jieba_test.go index 94d1ac7..206188c 100755 --- a/jieba_test.go +++ b/jieba_test.go @@ -1,4 +1,4 @@ -package jiebago +package jieba import "testing" diff --git a/tokenizers/tokenizer.go b/tokenizers/tokenizer.go index cc46891..0b1b657 100755 --- a/tokenizers/tokenizer.go +++ b/tokenizers/tokenizer.go @@ -7,7 +7,7 @@ import ( "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/registry" - jiebago "github.com/fumiama/jieba" + jieba "github.com/fumiama/jieba" ) // Name is the jieba tokenizer name. @@ -15,9 +15,9 @@ const Name = "jieba" var ideographRegexp = regexp.MustCompile(`\p{Han}+`) -// JiebaTokenizer is the beleve tokenizer for jiebago. +// JiebaTokenizer is the bleve tokenizer for jieba. type JiebaTokenizer struct { - seg jiebago.Segmenter + seg jieba.Segmenter hmm, searchMode bool } @@ -42,7 +42,7 @@ Parameters: this word into "交换", "换机", which are valid Chinese words. */ func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) { - var seg jiebago.Segmenter + var seg jieba.Segmenter err := seg.LoadDictionary(dictFilePath) return &JiebaTokenizer{ seg: seg, diff --git a/util/util.go b/util/util.go index e2fc8bf..754ccdf 100755 --- a/util/util.go +++ b/util/util.go @@ -1,4 +1,4 @@ -// Package util contains some util functions used by jiebago. +// Package util contains some util functions used by jieba. package util import "regexp"