mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 00:32:51 +08:00
Added a simple implementation of TextRank, corresponding to jieba commit 4030d8ed86dd3ff54e215ebe88c141b2a8345eda
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
package analyse
|
||||
|
||||
import (
|
||||
// "fmt"
|
||||
"fmt"
|
||||
"github.com/wangbin/jiebago"
|
||||
"sort"
|
||||
"strings"
|
||||
@@ -13,9 +13,9 @@ type TfIdf struct {
|
||||
Freq float64
|
||||
}
|
||||
|
||||
//func (t TfIdf) String() string {
|
||||
// return fmt.Sprintf("{%s: %f}", t.Word, t.Freq)
|
||||
//}
|
||||
func (t TfIdf) String() string {
|
||||
return fmt.Sprintf("{%s: %f}", t.Word, t.Freq)
|
||||
}
|
||||
|
||||
type TfIdfs []TfIdf
|
||||
|
||||
|
||||
126
analyse/textrank.go
Normal file
126
analyse/textrank.go
Normal file
@@ -0,0 +1,126 @@
|
||||
package analyse
|
||||
|
||||
import (
|
||||
mapset "github.com/deckarep/golang-set"
|
||||
"github.com/wangbin/jiebago/posseg"
|
||||
"math"
|
||||
"sort"
|
||||
)
|
||||
|
||||
// DampingFactor is the damping coefficient applied by rank on every score
// update: a node's new score is (1 - DampingFactor) + DampingFactor*s.
const DampingFactor = 0.85

// defaultAllowPOS is the part-of-speech tag filter that TextRank passes to
// TextRankWithPOS.
var defaultAllowPOS = []string{"ns", "n", "vn", "v"}
|
||||
|
||||
// edge is one direction of a weighted connection between two words: it is
// stored in the adjacency list of start and points at end.
type edge struct {
	start  string
	end    string
	weight float64
}

// undirectWeightedGraph is an adjacency-list graph keyed by word. Every
// undirected connection is stored twice, once in each endpoint's list.
// The zero value is ready to use; addEdge allocates the map on first use.
type undirectWeightedGraph struct {
	graph map[string][]edge
}

// addEdge records an undirected connection between start and end with the
// given weight by appending start->end to start's adjacency list and
// end->start to end's adjacency list.
func (u *undirectWeightedGraph) addEdge(start, end string, weight float64) {
	// Writing to a nil map panics, so allocate lazily for graphs created
	// with new(undirectWeightedGraph) or a zero-value literal.
	if u.graph == nil {
		u.graph = make(map[string][]edge)
	}
	// append on a missing key starts from a nil slice, so no existence
	// check is needed.
	u.graph[start] = append(u.graph[start], edge{start: start, end: end, weight: weight})
	// The reverse edge belongs to end's list (not start's), otherwise end
	// never becomes a node and start's list is corrupted.
	u.graph[end] = append(u.graph[end], edge{start: end, end: start, weight: weight})
}
|
||||
|
||||
func (u *undirectWeightedGraph) rank() TfIdfs {
|
||||
ws := make(map[string]float64)
|
||||
outSum := make(map[string]float64)
|
||||
|
||||
wsdef := 1.0
|
||||
if len(u.graph) > 0 {
|
||||
wsdef /= float64(len(u.graph))
|
||||
}
|
||||
|
||||
for n, out := range u.graph {
|
||||
ws[n] = wsdef
|
||||
sum := 0.0
|
||||
for _, e := range out {
|
||||
sum += e.weight
|
||||
}
|
||||
outSum[n] = sum
|
||||
}
|
||||
|
||||
for x := 0; x < 10; x++ {
|
||||
for n, inedges := range u.graph {
|
||||
s := 0.0
|
||||
for _, e := range inedges {
|
||||
s += e.weight / outSum[e.end] * ws[e.start]
|
||||
}
|
||||
ws[n] = (1 - DampingFactor) + DampingFactor*s
|
||||
}
|
||||
}
|
||||
|
||||
minRank := math.MaxFloat64
|
||||
maxRank := math.SmallestNonzeroFloat64
|
||||
for _, w := range ws {
|
||||
if w < minRank {
|
||||
minRank = w
|
||||
} else if w > maxRank {
|
||||
maxRank = w
|
||||
}
|
||||
}
|
||||
result := make(TfIdfs, 0)
|
||||
for n, w := range ws {
|
||||
result = append(result, TfIdf{Word: n, Freq: (w - minRank/10.0) / (maxRank - minRank/10.0)})
|
||||
}
|
||||
sort.Sort(sort.Reverse(result))
|
||||
return result
|
||||
}
|
||||
|
||||
func TextRankWithPOS(sentence string, topK int, allowPOS []string) TfIdfs {
|
||||
posFilt := mapset.NewSet()
|
||||
for _, pos := range allowPOS {
|
||||
posFilt.Add(pos)
|
||||
}
|
||||
g := new(undirectWeightedGraph)
|
||||
cm := make(map[[2]string]float64)
|
||||
span := 5
|
||||
wordTags := posseg.Cut(sentence, true)
|
||||
for i := range wordTags {
|
||||
if posFilt.Contains(wordTags[i].Tag) {
|
||||
for j := i + 1; j < i+span; i++ {
|
||||
if j > len(wordTags) {
|
||||
break
|
||||
}
|
||||
if !posFilt.Contains(wordTags[j].Tag) {
|
||||
continue
|
||||
}
|
||||
if _, ok := cm[[2]string{wordTags[i].Word, wordTags[j].Word}]; !ok {
|
||||
cm[[2]string{wordTags[i].Word, wordTags[j].Word}] = 1.0
|
||||
} else {
|
||||
cm[[2]string{wordTags[i].Word, wordTags[j].Word}] += 1.0
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for startEnd, weight := range cm {
|
||||
g.addEdge(startEnd[0], startEnd[1], weight)
|
||||
}
|
||||
tags := g.rank()
|
||||
if topK > 0 && len(tags) > topK {
|
||||
tags = tags[:topK]
|
||||
}
|
||||
return tags
|
||||
}
|
||||
|
||||
func TextRank(sentence string, topK int) TfIdfs {
|
||||
return TextRankWithPOS(sentence, topK, defaultAllowPOS)
|
||||
}
|
||||
36
analyse/textrank_test.go
Normal file
36
analyse/textrank_test.go
Normal file
@@ -0,0 +1,36 @@
|
||||
package analyse
|
||||
|
||||
import (
|
||||
"github.com/wangbin/jiebago"
|
||||
"math"
|
||||
"testing"
|
||||
)
|
||||
|
||||
var (
|
||||
sentence = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
|
||||
|
||||
tagRanks = TfIdfs{
|
||||
TfIdf{Word: "吉林", Freq: 1.000000},
|
||||
TfIdf{Word: "欧亚", Freq: 0.864834},
|
||||
TfIdf{Word: "置业", Freq: 0.553466},
|
||||
TfIdf{Word: "实现", Freq: 0.520661},
|
||||
TfIdf{Word: "收入", Freq: 0.379700},
|
||||
TfIdf{Word: "增资", Freq: 0.355086},
|
||||
TfIdf{Word: "子公司", Freq: 0.349758},
|
||||
TfIdf{Word: "全资", Freq: 0.308537},
|
||||
TfIdf{Word: "城市", Freq: 0.306104},
|
||||
TfIdf{Word: "商业", Freq: 0.304837},
|
||||
}
|
||||
)
|
||||
|
||||
func TesTextRank(t *testing.T) {
|
||||
jiebago.SetDictionary("../dict.txt")
|
||||
SetIdf("idf.txt")
|
||||
|
||||
results := TextRank(sentence, 10)
|
||||
for index, tw := range results {
|
||||
if tw.Word != tagRanks[index].Word || math.Abs(tw.Freq-tagRanks[index].Freq) > 1e-6 {
|
||||
t.Errorf("%v != %v", tw, tagRanks[index])
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user