1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-12 21:20:26 +08:00

fixed the bug of textrank, updated tests

This commit is contained in:
Wang Bin
2015-02-17 12:04:06 +08:00
parent 74dbb7d525
commit e60dcd3e9e
2 changed files with 54 additions and 23 deletions

View File

@@ -1,6 +1,7 @@
package analyse package analyse
import ( import (
"fmt"
mapset "github.com/deckarep/golang-set" mapset "github.com/deckarep/golang-set"
"github.com/wangbin/jiebago/posseg" "github.com/wangbin/jiebago/posseg"
"math" "math"
@@ -21,26 +22,57 @@ type edge struct {
weight float64 weight float64
} }
// String implements fmt.Stringer. It formats the edge as
// "(start end): weight", printing the weight with the %f verb.
func (e edge) String() string {
	from, to, w := e.start, e.end, e.weight
	return fmt.Sprintf("(%s %s): %f", from, to, w)
}
// edges is a list of graph edges. Its Len, Less and Swap methods give it the
// method set required by sort.Interface, ordering edges by ascending weight.
type edges []edge

// Len reports how many edges the list holds.
func (es edges) Len() int {
	count := len(es)
	return count
}

// Less reports whether the edge at index i has a strictly smaller weight
// than the edge at index j.
func (es edges) Less(i, j int) bool {
	left, right := es[i].weight, es[j].weight
	return left < right
}

// Swap exchanges the edges at indexes i and j in place.
func (es edges) Swap(i, j int) {
	held := es[j]
	es[j] = es[i]
	es[i] = held
}
type undirectWeightedGraph struct { type undirectWeightedGraph struct {
graph map[string][]edge graph map[string]edges
keys sort.StringSlice
}
func newUndirectWeightedGraph() *undirectWeightedGraph {
u := new(undirectWeightedGraph)
u.graph = make(map[string]edges)
u.keys = make(sort.StringSlice, 0)
return u
} }
func (u *undirectWeightedGraph) addEdge(start, end string, weight float64) { func (u *undirectWeightedGraph) addEdge(start, end string, weight float64) {
if _, ok := u.graph[start]; !ok { if _, ok := u.graph[start]; !ok {
u.graph[start] = []edge{edge{start: start, end: end, weight: weight}} u.keys = append(u.keys, start)
u.graph[start] = edges{edge{start: start, end: end, weight: weight}}
} else { } else {
u.graph[start] = append(u.graph[start], edge{start: start, end: end, weight: weight}) u.graph[start] = append(u.graph[start], edge{start: start, end: end, weight: weight})
} }
if _, ok := u.graph[end]; !ok { if _, ok := u.graph[end]; !ok {
u.graph[start] = []edge{edge{start: end, end: start, weight: weight}} u.keys = append(u.keys, end)
u.graph[end] = edges{edge{start: end, end: start, weight: weight}}
} else { } else {
u.graph[start] = append(u.graph[start], edge{start: end, end: start, weight: weight}) u.graph[end] = append(u.graph[end], edge{start: end, end: start, weight: weight})
} }
} }
func (u *undirectWeightedGraph) rank() TfIdfs { func (u *undirectWeightedGraph) rank() TfIdfs {
if !sort.IsSorted(u.keys) {
sort.Sort(u.keys)
}
ws := make(map[string]float64) ws := make(map[string]float64)
outSum := make(map[string]float64) outSum := make(map[string]float64)
@@ -48,7 +80,6 @@ func (u *undirectWeightedGraph) rank() TfIdfs {
if len(u.graph) > 0 { if len(u.graph) > 0 {
wsdef /= float64(len(u.graph)) wsdef /= float64(len(u.graph))
} }
for n, out := range u.graph { for n, out := range u.graph {
ws[n] = wsdef ws[n] = wsdef
sum := 0.0 sum := 0.0
@@ -59,15 +90,15 @@ func (u *undirectWeightedGraph) rank() TfIdfs {
} }
for x := 0; x < 10; x++ { for x := 0; x < 10; x++ {
for n, inedges := range u.graph { for _, n := range u.keys {
s := 0.0 s := 0.0
inedges := u.graph[n]
for _, e := range inedges { for _, e := range inedges {
s += e.weight / outSum[e.end] * ws[e.start] s += e.weight / outSum[e.end] * ws[e.end]
} }
ws[n] = (1 - DampingFactor) + DampingFactor*s ws[n] = (1 - DampingFactor) + DampingFactor*s
} }
} }
minRank := math.MaxFloat64 minRank := math.MaxFloat64
maxRank := math.SmallestNonzeroFloat64 maxRank := math.SmallestNonzeroFloat64
for _, w := range ws { for _, w := range ws {
@@ -90,13 +121,13 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) TfIdfs {
for _, pos := range allowPOS { for _, pos := range allowPOS {
posFilt.Add(pos) posFilt.Add(pos)
} }
g := new(undirectWeightedGraph) g := newUndirectWeightedGraph()
cm := make(map[[2]string]float64) cm := make(map[[2]string]float64)
span := 5 span := 5
wordTags := posseg.Cut(sentence, true) wordTags := posseg.Cut(sentence, true)
for i := range wordTags { for i, _ := range wordTags {
if posFilt.Contains(wordTags[i].Tag) { if posFilt.Contains(wordTags[i].Tag) {
for j := i + 1; j < i+span; i++ { for j := i + 1; j < i+span; j++ {
if j > len(wordTags) { if j > len(wordTags) {
break break
} }

View File

@@ -10,20 +10,20 @@ var (
sentence = "此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元增资后吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年实现营业收入0万元实现净利润-139.13万元。" sentence = "此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元增资后吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年实现营业收入0万元实现净利润-139.13万元。"
tagRanks = TfIdfs{ tagRanks = TfIdfs{
TfIdf{Word: "吉林", Freq: 1.000000}, TfIdf{Word: "吉林", Freq: 1.0},
TfIdf{Word: "欧亚", Freq: 0.864834}, TfIdf{Word: "欧亚", Freq: 0.87807810644},
TfIdf{Word: "置业", Freq: 0.553466}, TfIdf{Word: "置业", Freq: 0.562048250306},
TfIdf{Word: "实现", Freq: 0.520661}, TfIdf{Word: "实现", Freq: 0.520905743929},
TfIdf{Word: "收入", Freq: 0.379700}, TfIdf{Word: "收入", Freq: 0.384283870648},
TfIdf{Word: "增资", Freq: 0.355086}, TfIdf{Word: "增资", Freq: 0.360590945312},
TfIdf{Word: "子公司", Freq: 0.349758}, TfIdf{Word: "子公司", Freq: 0.353131980904},
TfIdf{Word: "全资", Freq: 0.308537}, TfIdf{Word: "城市", Freq: 0.307509449283},
TfIdf{Word: "城市", Freq: 0.306104}, TfIdf{Word: "全资", Freq: 0.306324426665},
TfIdf{Word: "商业", Freq: 0.304837}, TfIdf{Word: "商业", Freq: 0.306138241063},
} }
) )
func TesTextRank(t *testing.T) { func TestTextRank(t *testing.T) {
jiebago.SetDictionary("../dict.txt") jiebago.SetDictionary("../dict.txt")
SetIdf("idf.txt") SetIdf("idf.txt")