From d487545eb53ba6a8742526c7343e515af34b8d89 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=BA=90=E6=96=87=E9=9B=A8?=
 <41315874+fumiama@users.noreply.github.com>
Date: Wed, 30 Nov 2022 13:35:21 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=20tag=5Fextracker?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                     |  6 +++---
 analyse/tag_extracker.go      | 27 +++++++++++------------
 analyse/tag_extracker_test.go | 40 +++++++++++++++++------------------
 analyse/textrank.go           |  3 ++-
 analyse/textrank_test.go      | 20 +++++++++---------
 dictionary.go                 |  2 +-
 dictionary/dictionary.go      |  2 +-
 example_parallel_cut_test.go  |  2 +-
 example_test.go               |  2 +-
 finalseg/finalseg.go          |  2 +-
 jieba.go                      |  4 ++--
 jieba_test.go                 |  2 +-
 tokenizers/tokenizer.go       |  8 +++----
 util/util.go                  |  2 +-
 14 files changed, 60 insertions(+), 62 deletions(-)

diff --git a/README.md b/README.md
index f7ea0b0..7f1679c 100755
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
-#结巴分词 Go 语言版：Jiebago
+# 结巴分词 Go 语言版：jieba
 
 
-[![Build Status](https://travis-ci.org/wangbin/jiebago.png?branch=master)](https://travis-ci.org/wangbin/jiebago) [![GoDoc](https://godoc.org/github.com/fumiama/jieba?status.svg)](https://godoc.org/github.com/fumiama/jieba)
+[![Build Status](https://travis-ci.org/wangbin/jiebago.png?branch=master)](https://travis-ci.org/wangbin/jiebago) [![GoDoc](https://godoc.org/github.com/fumiama/jieba?status.svg)](https://godoc.org/github.com/fumiama/jieba)
 
 [结巴分词](https://github.com/fxsjy/jieba) 是由 [@fxsjy](https://github.com/fxsjy) 使用 Python 编写的中文分词组件，Iiebago 是结巴分词的 Golang 语言实现。
 
@@ -23,7 +23,7 @@ import (
         "github.com/fumiama/jieba"
 )
 
-var seg jiebago.Segmenter
+var seg jieba.Segmenter
 
 func init() {
         seg.LoadDictionary("dict.txt")
diff --git a/analyse/tag_extracker.go b/analyse/tag_extracker.go
index f6b2948..0a8dde5 100755
--- a/analyse/tag_extracker.go
+++ b/analyse/tag_extracker.go
@@ -6,7 +6,7 @@ import (
 	"strings"
 	"unicode/utf8"
 
-	jiebago "github.com/fumiama/jieba"
+	jieba "github.com/fumiama/jieba"
 )
 
 // Segment represents a word with weight.
@@ -26,7 +26,7 @@ func (s Segment) Weight() float64 {
 }
 
 // Segments represents a slice of Segment.
-type Segments []*Segment
+type Segments []Segment
 
 func (ss Segments) Len() int {
 	return len(ss)
@@ -46,7 +46,7 @@ func (ss Segments) Swap(i, j int) {
 
 // TagExtracter is used to extract tags from sentence.
 type TagExtracter struct {
-	seg      *jiebago.Segmenter
+	seg      *jieba.Segmenter
 	idf      *Idf
 	stopWord *StopWord
 }
@@ -54,7 +54,7 @@ type TagExtracter struct {
 // LoadDictionary reads the given filename and create a new dictionary.
 func (t *TagExtracter) LoadDictionary(fileName string) error {
 	t.stopWord = NewStopWord()
-	t.seg = new(jiebago.Segmenter)
+	t.seg = new(jieba.Segmenter)
 	return t.seg.LoadDictionary(fileName)
 }
 
@@ -72,7 +72,7 @@ func (t *TagExtracter) LoadStopWords(fileName string) error {
 
 // ExtractTags extracts the topK key words from sentence.
 func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
-	freqMap := make(map[string]float64)
+	freqMap := make(map[string]uint64, 256)
 
 	for _, w := range t.seg.Cut(sentence, true) {
 		w = strings.TrimSpace(w)
@@ -82,28 +82,25 @@ func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
 		if t.stopWord.IsStopWord(w) {
 			continue
 		}
-		if f, ok := freqMap[w]; ok {
-			freqMap[w] = f + 1.0
+		if v, ok := freqMap[w]; ok {
+			freqMap[w] = v + 1
 		} else {
-			freqMap[w] = 1.0
+			freqMap[w] = 1
 		}
 	}
-	total := 0.0
+	total := uint64(0)
 	for _, freq := range freqMap {
 		total += freq
 	}
-	for k, v := range freqMap {
-		freqMap[k] = v / total
-	}
 	ws := make(Segments, 0)
 	var s Segment
 	for k, v := range freqMap {
 		if freq, ok := t.idf.Frequency(k); ok {
-			s = Segment{text: k, weight: freq * v}
+			s = Segment{text: k, weight: freq * float64(v) / float64(total)}
 		} else {
-			s = Segment{text: k, weight: t.idf.median * v}
+			s = Segment{text: k, weight: t.idf.median * float64(v) / float64(total)}
 		}
-		ws = append(ws, &s)
+		ws = append(ws, s)
 	}
 	sort.Sort(sort.Reverse(ws))
 	if len(ws) > topK {
diff --git a/analyse/tag_extracker_test.go b/analyse/tag_extracker_test.go
index 809c1cb..24f80f0 100755
--- a/analyse/tag_extracker_test.go
+++ b/analyse/tag_extracker_test.go
@@ -228,29 +228,29 @@ var (
 雖然沒有藉口
 `
 	LyciWeight = Segments{
-		&Segment{text: "所謂", weight: 1.010262},
-		&Segment{text: "是否", weight: 0.738650},
-		&Segment{text: "一般", weight: 0.607600},
-		&Segment{text: "雖然", weight: 0.336754},
-		&Segment{text: "退縮", weight: 0.336754},
-		&Segment{text: "肌迫", weight: 0.336754},
-		&Segment{text: "矯作", weight: 0.336754},
-		&Segment{text: "沒有", weight: 0.336754},
-		&Segment{text: "怯懦", weight: 0.271099},
-		&Segment{text: "隨便", weight: 0.168377},
+		Segment{text: "所謂", weight: 1.010262},
+		Segment{text: "是否", weight: 0.738650},
+		Segment{text: "一般", weight: 0.607600},
+		Segment{text: "雖然", weight: 0.336754},
+		Segment{text: "退縮", weight: 0.336754},
+		Segment{text: "肌迫", weight: 0.336754},
+		Segment{text: "矯作", weight: 0.336754},
+		Segment{text: "沒有", weight: 0.336754},
+		Segment{text: "怯懦", weight: 0.271099},
+		Segment{text: "隨便", weight: 0.168377},
 	}
 
 	LyciWeight2 = Segments{
-		&Segment{text: "所謂", weight: 1.215739},
-		&Segment{text: "一般", weight: 0.731179},
-		&Segment{text: "雖然", weight: 0.405246},
-		&Segment{text: "退縮", weight: 0.405246},
-		&Segment{text: "肌迫", weight: 0.405246},
-		&Segment{text: "矯作", weight: 0.405246},
-		&Segment{text: "怯懦", weight: 0.326238},
-		&Segment{text: "逼不得已", weight: 0.202623},
-		&Segment{text: "右銘", weight: 0.202623},
-		&Segment{text: "寬闊", weight: 0.202623},
+		Segment{text: "所謂", weight: 1.215739},
+		Segment{text: "一般", weight: 0.731179},
+		Segment{text: "雖然", weight: 0.405246},
+		Segment{text: "退縮", weight: 0.405246},
+		Segment{text: "肌迫", weight: 0.405246},
+		Segment{text: "矯作", weight: 0.405246},
+		Segment{text: "怯懦", weight: 0.326238},
+		Segment{text: "逼不得已", weight: 0.202623},
+		Segment{text: "右銘", weight: 0.202623},
+		Segment{text: "寬闊", weight: 0.202623},
 	}
 )
 
diff --git a/analyse/textrank.go b/analyse/textrank.go
index 06c4839..3db0cdf 100755
--- a/analyse/textrank.go
+++ b/analyse/textrank.go
@@ -106,7 +106,8 @@ func (u *undirectWeightedGraph) rank() Segments {
 	result := make(Segments, len(ws))
 	i := 0
 	for n, w := range ws {
-		result[i] = &Segment{text: n, weight: (w - minRank/10.0) / (maxRank - minRank/10.0)}
+		result[i].text = n
+		result[i].weight = (w - minRank/10.0) / (maxRank - minRank/10.0)
 		i++
 	}
 	sort.Sort(sort.Reverse(result))
diff --git a/analyse/textrank_test.go b/analyse/textrank_test.go
index 5aa4c17..94176d1 100755
--- a/analyse/textrank_test.go
+++ b/analyse/textrank_test.go
@@ -9,16 +9,16 @@ var (
 	sentence = "此外，公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元，增资后，吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年，实现营业收入0万元，实现净利润-139.13万元。"
 
 	tagRanks = Segments{
-		&Segment{text: "吉林", weight: 1.0},
-		&Segment{text: "欧亚", weight: 0.87807810644},
-		&Segment{text: "置业", weight: 0.562048250306},
-		&Segment{text: "实现", weight: 0.520905743929},
-		&Segment{text: "收入", weight: 0.384283870648},
-		&Segment{text: "增资", weight: 0.360590945312},
-		&Segment{text: "子公司", weight: 0.353131980904},
-		&Segment{text: "城市", weight: 0.307509449283},
-		&Segment{text: "全资", weight: 0.306324426665},
-		&Segment{text: "商业", weight: 0.306138241063},
+		Segment{text: "吉林", weight: 1.0},
+		Segment{text: "欧亚", weight: 0.87807810644},
+		Segment{text: "置业", weight: 0.562048250306},
+		Segment{text: "实现", weight: 0.520905743929},
+		Segment{text: "收入", weight: 0.384283870648},
+		Segment{text: "增资", weight: 0.360590945312},
+		Segment{text: "子公司", weight: 0.353131980904},
+		Segment{text: "城市", weight: 0.307509449283},
+		Segment{text: "全资", weight: 0.306324426665},
+		Segment{text: "商业", weight: 0.306138241063},
 	}
 )
 
diff --git a/dictionary.go b/dictionary.go
index 7350e1a..715d9b4 100755
--- a/dictionary.go
+++ b/dictionary.go
@@ -1,4 +1,4 @@
-package jiebago
+package jieba
 
 import (
 	"math"
diff --git a/dictionary/dictionary.go b/dictionary/dictionary.go
index d216415..6ddae2c 100755
--- a/dictionary/dictionary.go
+++ b/dictionary/dictionary.go
@@ -1,5 +1,5 @@
 // Package dictionary contains a interface and wraps all io related work.
-// It is used by jiebago module to read/write files.
+// It is used by jieba module to read/write files.
 package dictionary
 
 import (
diff --git a/example_parallel_cut_test.go b/example_parallel_cut_test.go
index 8e9f1a0..373815c 100755
--- a/example_parallel_cut_test.go
+++ b/example_parallel_cut_test.go
@@ -1,4 +1,4 @@
-package jiebago
+package jieba
 
 import (
 	"bufio"
diff --git a/example_test.go b/example_test.go
index 174d327..7332f88 100755
--- a/example_test.go
+++ b/example_test.go
@@ -1,4 +1,4 @@
-package jiebago
+package jieba
 
 import (
 	"fmt"
diff --git a/finalseg/finalseg.go b/finalseg/finalseg.go
index ae6679c..566070b 100755
--- a/finalseg/finalseg.go
+++ b/finalseg/finalseg.go
@@ -37,7 +37,7 @@ func cutHan(sentence string) []string {
 }
 
 // Cut cuts sentence into words using Hidden Markov Model with Viterbi
-// algorithm. It is used by Jiebago for unknonw words.
+// algorithm. It is used by jieba for unknown words.
 func Cut(sentence string) []string {
 	result := make([]string, 0, 10)
 	s := sentence
diff --git a/jieba.go b/jieba.go
index d5de40e..f1484c3 100755
--- a/jieba.go
+++ b/jieba.go
@@ -1,5 +1,5 @@
-// Package jiebago is the Golang implemention of [Jieba](https://github.com/fxsjy/jieba), Python Chinese text segmentation module.
-package jiebago
+// Package jieba is the Golang implementation of [Jieba](https://github.com/fxsjy/jieba), Python Chinese text segmentation module.
+package jieba
 
 import (
 	"math"
diff --git a/jieba_test.go b/jieba_test.go
index 94d1ac7..206188c 100755
--- a/jieba_test.go
+++ b/jieba_test.go
@@ -1,4 +1,4 @@
-package jiebago
+package jieba
 
 import "testing"
 
diff --git a/tokenizers/tokenizer.go b/tokenizers/tokenizer.go
index cc46891..0b1b657 100755
--- a/tokenizers/tokenizer.go
+++ b/tokenizers/tokenizer.go
@@ -7,7 +7,7 @@ import (
 
 	"github.com/blevesearch/bleve/analysis"
 	"github.com/blevesearch/bleve/registry"
-	jiebago "github.com/fumiama/jieba"
+	jieba "github.com/fumiama/jieba"
 )
 
 // Name is the jieba tokenizer name.
@@ -15,9 +15,9 @@ const Name = "jieba"
 
 var ideographRegexp = regexp.MustCompile(`\p{Han}+`)
 
-// JiebaTokenizer is the beleve tokenizer for jiebago.
+// JiebaTokenizer is the bleve tokenizer for jieba.
 type JiebaTokenizer struct {
-	seg             jiebago.Segmenter
+	seg             jieba.Segmenter
 	hmm, searchMode bool
 }
 
@@ -42,7 +42,7 @@ Parameters:
 	this word into "交换", "换机", which are valid Chinese words.
 */
 func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
-	var seg jiebago.Segmenter
+	var seg jieba.Segmenter
 	err := seg.LoadDictionary(dictFilePath)
 	return &JiebaTokenizer{
 		seg:        seg,
diff --git a/util/util.go b/util/util.go
index e2fc8bf..754ccdf 100755
--- a/util/util.go
+++ b/util/util.go
@@ -1,4 +1,4 @@
-// Package util contains some util functions used by jiebago.
+// Package util contains some util functions used by jieba.
 package util
 
 import "regexp"