diff --git a/analyse/textrank.go b/analyse/textrank.go index 2cbab22..ed39f02 100644 --- a/analyse/textrank.go +++ b/analyse/textrank.go @@ -1,6 +1,7 @@ package analyse import ( + "fmt" mapset "github.com/deckarep/golang-set" "github.com/wangbin/jiebago/posseg" "math" @@ -21,26 +22,57 @@ type edge struct { weight float64 } +func (e edge) String() string { + return fmt.Sprintf("(%s %s): %f", e.start, e.end, e.weight) +} + +type edges []edge + +func (es edges) Len() int { + return len(es) +} + +func (es edges) Less(i, j int) bool { + return es[i].weight < es[j].weight +} + +func (es edges) Swap(i, j int) { + es[i], es[j] = es[j], es[i] +} + type undirectWeightedGraph struct { - graph map[string][]edge + graph map[string]edges + keys sort.StringSlice +} + +func newUndirectWeightedGraph() *undirectWeightedGraph { + u := new(undirectWeightedGraph) + u.graph = make(map[string]edges) + u.keys = make(sort.StringSlice, 0) + return u } func (u *undirectWeightedGraph) addEdge(start, end string, weight float64) { if _, ok := u.graph[start]; !ok { - u.graph[start] = []edge{edge{start: start, end: end, weight: weight}} + u.keys = append(u.keys, start) + u.graph[start] = edges{edge{start: start, end: end, weight: weight}} } else { u.graph[start] = append(u.graph[start], edge{start: start, end: end, weight: weight}) } if _, ok := u.graph[end]; !ok { - u.graph[start] = []edge{edge{start: end, end: start, weight: weight}} + u.keys = append(u.keys, end) + u.graph[end] = edges{edge{start: end, end: start, weight: weight}} } else { - u.graph[start] = append(u.graph[start], edge{start: end, end: start, weight: weight}) + u.graph[end] = append(u.graph[end], edge{start: end, end: start, weight: weight}) } - } func (u *undirectWeightedGraph) rank() TfIdfs { + if !sort.IsSorted(u.keys) { + sort.Sort(u.keys) + } + ws := make(map[string]float64) outSum := make(map[string]float64) @@ -48,7 +80,6 @@ func (u *undirectWeightedGraph) rank() TfIdfs { if len(u.graph) > 0 { wsdef /= float64(len(u.graph)) } - for n, out := range u.graph { ws[n] = wsdef sum := 0.0 @@ -59,15 +90,15 @@ func (u *undirectWeightedGraph) rank() TfIdfs { } for x := 0; x < 10; x++ { - for n, inedges := range u.graph { + for _, n := range u.keys { s := 0.0 + inedges := u.graph[n] for _, e := range inedges { - s += e.weight / outSum[e.end] * ws[e.start] + s += e.weight / outSum[e.end] * ws[e.end] } ws[n] = (1 - DampingFactor) + DampingFactor*s } } - minRank := math.MaxFloat64 maxRank := math.SmallestNonzeroFloat64 for _, w := range ws { @@ -90,13 +121,13 @@ func TextRankWithPOS(sentence string, topK int, allowPOS []string) TfIdfs { for _, pos := range allowPOS { posFilt.Add(pos) } - g := new(undirectWeightedGraph) + g := newUndirectWeightedGraph() cm := make(map[[2]string]float64) span := 5 wordTags := posseg.Cut(sentence, true) - for i := range wordTags { + for i, _ := range wordTags { if posFilt.Contains(wordTags[i].Tag) { - for j := i + 1; j < i+span; i++ { + for j := i + 1; j < i+span; j++ { if j > len(wordTags) { break } diff --git a/analyse/textrank_test.go b/analyse/textrank_test.go index d8f1179..2d80587 100644 --- a/analyse/textrank_test.go +++ b/analyse/textrank_test.go @@ -10,20 +10,20 @@ var ( sentence = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。" tagRanks = TfIdfs{ - TfIdf{Word: "吉林", Freq: 1.000000}, - TfIdf{Word: "欧亚", Freq: 0.864834}, - TfIdf{Word: "置业", Freq: 0.553466}, - TfIdf{Word: "实现", Freq: 0.520661}, - TfIdf{Word: "收入", Freq: 0.379700}, - TfIdf{Word: "增资", Freq: 0.355086}, - TfIdf{Word: "子公司", Freq: 0.349758}, - TfIdf{Word: "全资", Freq: 0.308537}, - TfIdf{Word: "城市", Freq: 0.306104}, - TfIdf{Word: "商业", Freq: 0.304837}, + TfIdf{Word: "吉林", Freq: 1.0}, + TfIdf{Word: "欧亚", Freq: 0.87807810644}, + TfIdf{Word: "置业", Freq: 0.562048250306}, + TfIdf{Word: "实现", Freq: 0.520905743929}, + TfIdf{Word: "收入", Freq: 0.384283870648}, + TfIdf{Word: "增资", Freq: 0.360590945312}, + TfIdf{Word: "子公司", Freq: 0.353131980904}, + TfIdf{Word: "城市", Freq: 0.307509449283}, + TfIdf{Word: "全资", Freq: 0.306324426665}, + TfIdf{Word: "商业", Freq: 0.306138241063}, } ) -func TesTextRank(t *testing.T) { +func TestTextRank(t *testing.T) { jiebago.SetDictionary("../dict.txt") SetIdf("idf.txt")