From 74dbb7d525fc3887dd714460edf6362dd2968cb3 Mon Sep 17 00:00:00 2001
From: Wang Bin
Date: Wed, 4 Feb 2015 18:23:47 +0800
Subject: [PATCH] added simple implementation of textrank, corresponding to
 jieba commit #4030d8ed86dd3ff54e215ebe88c141b2a8345eda

---
 analyse/analyse.go       |   8 +-
 analyse/textrank.go      | 153 +++++++++++++++++++++++++++++++++++++++
 analyse/textrank_test.go |  43 +++++++++++
 3 files changed, 200 insertions(+), 4 deletions(-)
 create mode 100644 analyse/textrank.go
 create mode 100644 analyse/textrank_test.go

diff --git a/analyse/analyse.go b/analyse/analyse.go
index c7412e9..e4502e9 100644
--- a/analyse/analyse.go
+++ b/analyse/analyse.go
@@ -1,7 +1,7 @@
 package analyse
 
 import (
-	// "fmt"
+	"fmt"
 	"github.com/wangbin/jiebago"
 	"sort"
 	"strings"
@@ -13,9 +13,9 @@ type TfIdf struct {
 	Freq float64
 }
 
-//func (t TfIdf) String() string {
-//	return fmt.Sprintf("{%s: %f}", t.Word, t.Freq)
-//}
+func (t TfIdf) String() string {
+	return fmt.Sprintf("{%s: %f}", t.Word, t.Freq)
+}
 
 type TfIdfs []TfIdf
 
diff --git a/analyse/textrank.go b/analyse/textrank.go
new file mode 100644
index 0000000..2cbab22
--- /dev/null
+++ b/analyse/textrank.go
@@ -0,0 +1,153 @@
+package analyse
+
+import (
+	"math"
+	"sort"
+
+	mapset "github.com/deckarep/golang-set"
+	"github.com/wangbin/jiebago/posseg"
+)
+
+const (
+	// DampingFactor is the damping coefficient d of the TextRank update
+	// score = (1 - d) + d * (weighted sum of the neighbours' scores).
+	DampingFactor = 0.85
+
+	// rankIterations is the fixed number of passes rank makes over the graph.
+	rankIterations = 10
+
+	// cooccurrenceSpan bounds how far apart two words may be and still count
+	// as co-occurring: word i is paired with words i+1 .. i+cooccurrenceSpan-1.
+	cooccurrenceSpan = 5
+)
+
+var (
+	// defaultAllowPOS is the part-of-speech filter used by TextRank.
+	defaultAllowPOS = []string{"ns", "n", "vn", "v"}
+)
+
+// edge is one direction of a weighted link between two words.
+type edge struct {
+	start  string
+	end    string
+	weight float64
+}
+
+// undirectWeightedGraph stores an undirected weighted graph as adjacency
+// lists: every edge is recorded once under each of its two endpoints.
+type undirectWeightedGraph struct {
+	graph map[string][]edge
+}
+
+// addEdge links start and end with the given weight, in both directions.
+// The adjacency map is allocated on first use, so the zero value of
+// undirectWeightedGraph is ready to use.
+func (u *undirectWeightedGraph) addEdge(start, end string, weight float64) {
+	if u.graph == nil {
+		u.graph = make(map[string][]edge)
+	}
+	// Appending to a missing key starts from a nil slice, so no existence
+	// check is needed.
+	u.graph[start] = append(u.graph[start], edge{start: start, end: end, weight: weight})
+	u.graph[end] = append(u.graph[end], edge{start: end, end: start, weight: weight})
+}
+
+// rank runs rankIterations passes of the TextRank update over the graph and
+// returns one entry per node. Scores are rescaled against the smallest and
+// largest raw score and ordered with sort.Reverse over the TfIdfs ordering.
+func (u *undirectWeightedGraph) rank() TfIdfs {
+	ws := make(map[string]float64, len(u.graph))
+	outSum := make(map[string]float64, len(u.graph))
+
+	wsdef := 1.0
+	if len(u.graph) > 0 {
+		wsdef /= float64(len(u.graph))
+	}
+
+	nodes := make([]string, 0, len(u.graph))
+	for n, out := range u.graph {
+		nodes = append(nodes, n)
+		ws[n] = wsdef
+		sum := 0.0
+		for _, e := range out {
+			sum += e.weight
+		}
+		outSum[n] = sum
+	}
+	// Scores are updated in place, so the visiting order changes the result.
+	// Sorting the keys keeps it independent of Go's random map order.
+	sort.Strings(nodes)
+
+	for x := 0; x < rankIterations; x++ {
+		for _, n := range nodes {
+			s := 0.0
+			for _, e := range u.graph[n] {
+				// Each neighbour passes on its own score, in proportion to
+				// the share this edge has of the neighbour's total weight.
+				s += e.weight / outSum[e.end] * ws[e.end]
+			}
+			ws[n] = (1 - DampingFactor) + DampingFactor*s
+		}
+	}
+
+	minRank := math.MaxFloat64
+	maxRank := math.SmallestNonzeroFloat64
+	for _, w := range ws {
+		// Independent checks: the same value can be both the new minimum
+		// and the new maximum, e.g. the first one visited.
+		if w < minRank {
+			minRank = w
+		}
+		if w > maxRank {
+			maxRank = w
+		}
+	}
+
+	result := make(TfIdfs, 0, len(nodes))
+	for _, n := range nodes {
+		result = append(result, TfIdf{Word: n, Freq: (ws[n] - minRank/10.0) / (maxRank - minRank/10.0)})
+	}
+	sort.Sort(sort.Reverse(result))
+	return result
+}
+
+// TextRankWithPOS extracts keywords from sentence with the TextRank
+// algorithm. Only words whose part-of-speech tag is in allowPOS are used;
+// two such words are linked when the second follows the first within
+// cooccurrenceSpan tokens, and the link weight counts how often that
+// happens. At most topK entries are returned; a non-positive topK returns
+// them all.
+func TextRankWithPOS(sentence string, topK int, allowPOS []string) TfIdfs {
+	posFilt := mapset.NewSet()
+	for _, pos := range allowPOS {
+		posFilt.Add(pos)
+	}
+	g := new(undirectWeightedGraph)
+	cm := make(map[[2]string]float64)
+	wordTags := posseg.Cut(sentence, true)
+	for i := range wordTags {
+		if !posFilt.Contains(wordTags[i].Tag) {
+			continue
+		}
+		for j := i + 1; j < i+cooccurrenceSpan && j < len(wordTags); j++ {
+			if !posFilt.Contains(wordTags[j].Tag) {
+				continue
+			}
+			// A missing key reads as zero, so this creates or increments.
+			cm[[2]string{wordTags[i].Word, wordTags[j].Word}]++
+		}
+	}
+	for startEnd, weight := range cm {
+		g.addEdge(startEnd[0], startEnd[1], weight)
+	}
+	tags := g.rank()
+	if topK > 0 && len(tags) > topK {
+		tags = tags[:topK]
+	}
+	return tags
+}
+
+// TextRank is TextRankWithPOS with the default part-of-speech filter.
+func TextRank(sentence string, topK int) TfIdfs {
+	return TextRankWithPOS(sentence, topK, defaultAllowPOS)
+}
diff --git a/analyse/textrank_test.go b/analyse/textrank_test.go
new file mode 100644
index 0000000..d8f1179
--- /dev/null
+++ b/analyse/textrank_test.go
@@ -0,0 +1,43 @@
+package analyse
+
+import (
+	"math"
+	"testing"
+
+	"github.com/wangbin/jiebago"
+)
+
+var (
+	sentence = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
+
+	tagRanks = TfIdfs{
+		TfIdf{Word: "吉林", Freq: 1.000000},
+		TfIdf{Word: "欧亚", Freq: 0.864834},
+		TfIdf{Word: "置业", Freq: 0.553466},
+		TfIdf{Word: "实现", Freq: 0.520661},
+		TfIdf{Word: "收入", Freq: 0.379700},
+		TfIdf{Word: "增资", Freq: 0.355086},
+		TfIdf{Word: "子公司", Freq: 0.349758},
+		TfIdf{Word: "全资", Freq: 0.308537},
+		TfIdf{Word: "城市", Freq: 0.306104},
+		TfIdf{Word: "商业", Freq: 0.304837},
+	}
+)
+
+// NOTE(review): the expected scores look like jieba's reference output; they
+// depend on node visiting order and may need regenerating - confirm.
+func TestTextRank(t *testing.T) {
+	jiebago.SetDictionary("../dict.txt")
+	SetIdf("idf.txt")
+
+	results := TextRank(sentence, 10)
+	// Guard the indexing below and catch a short or long result list.
+	if len(results) != len(tagRanks) {
+		t.Fatalf("got %d results, want %d", len(results), len(tagRanks))
+	}
+	for index, tw := range results {
+		if tw.Word != tagRanks[index].Word || math.Abs(tw.Freq-tagRanks[index].Freq) > 1e-6 {
+			t.Errorf("%v != %v", tw, tagRanks[index])
+		}
+	}
+}