From 21cdb2e863b910c890038b1c614f3625ab288ad8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=BA=90=E6=96=87=E9=9B=A8?= <41315874+fumiama@users.noreply.github.com> Date: Wed, 30 Nov 2022 12:38:47 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=20textrank?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- analyse/tag_extracker.go | 4 +- analyse/tag_extracker_test.go | 212 +++++++++++++++++----------------- analyse/textrank.go | 59 ++++++---- analyse/textrank_test.go | 20 ++-- util/helper/helper.go | 21 ++++ 5 files changed, 176 insertions(+), 140 deletions(-) create mode 100644 util/helper/helper.go diff --git a/analyse/tag_extracker.go b/analyse/tag_extracker.go index c57f0e3..f6b2948 100755 --- a/analyse/tag_extracker.go +++ b/analyse/tag_extracker.go @@ -26,7 +26,7 @@ func (s Segment) Weight() float64 { } // Segments represents a slice of Segment. -type Segments []Segment +type Segments []*Segment func (ss Segments) Len() int { return len(ss) @@ -103,7 +103,7 @@ func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) { } else { s = Segment{text: k, weight: t.idf.median * v} } - ws = append(ws, s) + ws = append(ws, &s) } sort.Sort(sort.Reverse(ws)) if len(ws) > topK { diff --git a/analyse/tag_extracker_test.go b/analyse/tag_extracker_test.go index 7a7d979..809c1cb 100755 --- a/analyse/tag_extracker_test.go +++ b/analyse/tag_extracker_test.go @@ -94,91 +94,91 @@ var ( "枪杆子中出政权"} Tags = [][]string{ - []string{"Python", "C++", "伸手不见五指", "孙悟空", "黑夜", "北京", "这是", "一个"}, - []string{"和服", "喜欢", "日本"}, - []string{"雷猴", "人间", "回归"}, - []string{"工信处", "女干事", "24", "交换机", "科室", "亲口", "器件", "技术性", "下属", "交代", "每月", "安装", "经过", "工作"}, - []string{"廉租房", "需要"}, - []string{"饰品", "永和", "服装", "有限公司"}, - []string{"天安门", "北京"}, - []string{"abc"}, - []string{"马尔可夫"}, - []string{"雷猴", "网站"}, - []string{"SOFTware", "Microsoft", "MICROcomputer", "微型", "一词", "软件", "计算机", "组成", "部分"}, - []string{"草泥马", "欺实", "词汇", "流行", "今年"}, - []string{"洋华堂", "总府", "伊藤"}, - []string{"中国科学院计算技术研究所"}, - []string{"朱丽叶", "罗密欧"}, - []string{"道具", "服装", "购买"}, - []string{"自珍", "敞帚", "PS", "开源", "不断改进", "敦促", "好处", "避免", "能够", "觉得", "就是", "自己", "一个"}, - []string{"石首市", "湖北省"}, - []string{"十堰市", "湖北省"}, - []string{"总经理", "这件", "完成", "事情"}, - []string{"修好", "电脑"}, - []string{"一了百了", "做好", "这件", "事情"}, - []string{"审美", "观点", "人们", "不同"}, - []string{"美的", "空调", "我们", "一个"}, - []string{"线程", "初始化", "注意", "我们"}, - []string{"好多", "原子", "分子", "组织", "一个"}, - []string{"马到功成"}, - []string{"无底洞"}, - []string{"首都", "北京", "中国"}, - []string{"孙君意"}, - []string{"马朝旭", "外交部", "发言人"}, - []string{"第四届", "东亚", "峰会", "领导人", "会议"}, - []string{"五年", "过去"}, - []string{"很长", "需要"}, - []string{"60", "阅兵", "周年", "首都"}, - []string{"审美", "你好", "观点", "人们", "不同"}, - []string{"世博园", "水果", "然后"}, - []string{"世博园", "水果", "然后"}, - []string{"后来", "但是", "知道"}, - []string{"合理", "存在"}, - []string{}, - []string{"rong", "love", "不以为耻", "以为"}, - []string{}, - []string{}, - []string{"hello", "审美", "你好", "观点", "人们", "不同"}, - []string{"网页", "基于", "形式", "主要"}, - []string{"hello", "审美", "你好", "观点", "人们", "不同"}, - []string{"想要", "拥有", "为什么", "生活", "不能"}, - []string{"后来"}, - []string{"此次", "为了", "中国"}, - []string{"解决", "使用", "一些", "问题", "可以"}, - []string{"解决", "使用", "一些", "问题", "可以"}, - []string{"解决", "其实", "使用", "一些", "问题", "可以"}, - []string{"好人", "解决", "使用", "一些", "问题", "可以"}, - []string{"是因为", "国家"}, - []string{"老年", "搜索", "支持"}, - []string{"闲法", "中本", "laoshipukong", "RT", "27", "责任法", "蒙人", "万劫不复", "举证", "倒置", "医患", "那部", "拉倒", "侵权", "全国人大常委会", "草案", "境地", "纠纷", "删除", "弱势"}, - []string{}, - []string{}, - []string{"在理", "确实"}, - []string{"长春", "春节", "讲话", "市长"}, - []string{"结婚", "尚未"}, - []string{"分子", "结合"}, - []string{"旅游", "最好", "服务"}, - []string{"的确", "这件", "事情"}, - []string{"指正", "参考", "大家"}, - []string{"塌桥", "哈尔滨", "公布", "原因", "政府"}, - []string{"入口处", "机场"}, - []string{"邢永臣", "摄影", "报道"}, - []string{"区分度", "BP", "神经网络", "训练", "分类", "才能", "如何", "增加"}, - []string{"长江大桥", "南京市"}, - []string{"SMT", "NiuTrans", "使用者", "便于", "用于", "建议", "利用", "为了", "研究", "一些"}, - []string{"长春市", "药店", "长春"}, - []string{"邓颖超", "生前", "衣服", "喜欢"}, - []string{"政治局", "热爱", "常委", "胡锦涛", "和平", "世界"}, - []string{"右面", "孙健", "范凯", "李松洪", "朱会震", "海林", "左面", "程序员", "再往"}, - []string{"一次性", "多少"}, - []string{"四块", "五块", "三块", "一斤", "两块", "一本", "一套", "一条"}, - []string{"和尚", "和尚头", "一样", "一个"}, - []string{"和平门", "共和党", "地铁", "党员", "公民", "爸爸", "中华人民共和国"}, - []string{"张晓梅", "T恤", "B超", "医院", "人民", "然后"}, - []string{"offer", "AT&T", "不错", "一件", "公司"}, - []string{"c#", "PI", "C++", "3.14159", "133", "122", "11", "关系", "什么"}, - []string{"的士", "的哥", "他开", "握手", "一辆", "黑色", "主席", "认识", "那个"}, - []string{"枪杆子", "政权"}, + {"Python", "C++", "伸手不见五指", "孙悟空", "黑夜", "北京", "这是", "一个"}, + {"和服", "喜欢", "日本"}, + {"雷猴", "人间", "回归"}, + {"工信处", "女干事", "24", "交换机", "科室", "亲口", "器件", "技术性", "下属", "交代", "每月", "安装", "经过", "工作"}, + {"廉租房", "需要"}, + {"饰品", "永和", "服装", "有限公司"}, + {"天安门", "北京"}, + {"abc"}, + {"马尔可夫"}, + {"雷猴", "网站"}, + {"SOFTware", "Microsoft", "MICROcomputer", "微型", "一词", "软件", "计算机", "组成", "部分"}, + {"草泥马", "欺实", "词汇", "流行", "今年"}, + {"洋华堂", "总府", "伊藤"}, + {"中国科学院计算技术研究所"}, + {"朱丽叶", "罗密欧"}, + {"道具", "服装", "购买"}, + {"自珍", "敞帚", "PS", "开源", "不断改进", "敦促", "好处", "避免", "能够", "觉得", "就是", "自己", "一个"}, + {"石首市", "湖北省"}, + {"十堰市", "湖北省"}, + {"总经理", "这件", "完成", "事情"}, + {"修好", "电脑"}, + {"一了百了", "做好", "这件", "事情"}, + {"审美", "观点", "人们", "不同"}, + {"美的", "空调", "我们", "一个"}, + {"线程", "初始化", "注意", "我们"}, + {"好多", "原子", "分子", "组织", "一个"}, + {"马到功成"}, + {"无底洞"}, + {"首都", "北京", "中国"}, + {"孙君意"}, + {"马朝旭", "外交部", "发言人"}, + {"第四届", "东亚", "峰会", "领导人", "会议"}, + {"五年", "过去"}, + {"很长", "需要"}, + {"60", "阅兵", "周年", "首都"}, + {"审美", "你好", "观点", "人们", "不同"}, + {"世博园", "水果", "然后"}, + {"世博园", "水果", "然后"}, + {"后来", "但是", "知道"}, + {"合理", "存在"}, + {}, + {"rong", "love", "不以为耻", "以为"}, + {}, + {}, + {"hello", "审美", "你好", "观点", "人们", "不同"}, + {"网页", "基于", "形式", "主要"}, + {"hello", "审美", "你好", "观点", "人们", "不同"}, + {"想要", "拥有", "为什么", "生活", "不能"}, + {"后来"}, + {"此次", "为了", "中国"}, + {"解决", "使用", "一些", "问题", "可以"}, + {"解决", "使用", "一些", "问题", "可以"}, + {"解决", "其实", "使用", "一些", "问题", "可以"}, + {"好人", "解决", "使用", "一些", "问题", "可以"}, + {"是因为", "国家"}, + {"老年", "搜索", "支持"}, + {"闲法", "中本", "laoshipukong", "RT", "27", "责任法", "蒙人", "万劫不复", "举证", "倒置", "医患", "那部", "拉倒", "侵权", "全国人大常委会", "草案", "境地", "纠纷", "删除", "弱势"}, + {}, + {}, + {"在理", "确实"}, + {"长春", "春节", "讲话", "市长"}, + {"结婚", "尚未"}, + {"分子", "结合"}, + {"旅游", "最好", "服务"}, + {"的确", "这件", "事情"}, + {"指正", "参考", "大家"}, + {"塌桥", "哈尔滨", "公布", "原因", "政府"}, + {"入口处", "机场"}, + {"邢永臣", "摄影", "报道"}, + {"区分度", "BP", "神经网络", "训练", "分类", "才能", "如何", "增加"}, + {"长江大桥", "南京市"}, + {"SMT", "NiuTrans", "使用者", "便于", "用于", "建议", "利用", "为了", "研究", "一些"}, + {"长春市", "药店", "长春"}, + {"邓颖超", "生前", "衣服", "喜欢"}, + {"政治局", "热爱", "常委", "胡锦涛", "和平", "世界"}, + {"右面", "孙健", "范凯", "李松洪", "朱会震", "海林", "左面", "程序员", "再往"}, + {"一次性", "多少"}, + {"四块", "五块", "三块", "一斤", "两块", "一本", "一套", "一条"}, + {"和尚", "和尚头", "一样", "一个"}, + {"和平门", "共和党", "地铁", "党员", "公民", "爸爸", "中华人民共和国"}, + {"张晓梅", "T恤", "B超", "医院", "人民", "然后"}, + {"offer", "AT&T", "不错", "一件", "公司"}, + {"c#", "PI", "C++", "3.14159", "133", "122", "11", "关系", "什么"}, + {"的士", "的哥", "他开", "握手", "一辆", "黑色", "主席", "认识", "那个"}, + {"枪杆子", "政权"}, } Lyric = ` @@ -228,29 +228,29 @@ var ( 雖然沒有藉口 ` LyciWeight = Segments{ - Segment{text: "所謂", weight: 1.010262}, - Segment{text: "是否", weight: 0.738650}, - Segment{text: "一般", weight: 0.607600}, - Segment{text: "雖然", weight: 0.336754}, - Segment{text: "退縮", weight: 0.336754}, - Segment{text: "肌迫", weight: 0.336754}, - Segment{text: "矯作", weight: 0.336754}, - Segment{text: "沒有", weight: 0.336754}, - Segment{text: "怯懦", weight: 0.271099}, - Segment{text: "隨便", weight: 0.168377}, + &Segment{text: "所謂", weight: 1.010262}, + &Segment{text: "是否", weight: 0.738650}, + &Segment{text: "一般", weight: 0.607600}, + &Segment{text: "雖然", weight: 0.336754}, + &Segment{text: "退縮", weight: 0.336754}, + &Segment{text: "肌迫", weight: 0.336754}, + &Segment{text: "矯作", weight: 0.336754}, + &Segment{text: "沒有", weight: 0.336754}, + &Segment{text: "怯懦", weight: 0.271099}, + &Segment{text: "隨便", weight: 0.168377}, } LyciWeight2 = Segments{ - Segment{text: "所謂", weight: 1.215739}, - Segment{text: "一般", weight: 0.731179}, - Segment{text: "雖然", weight: 0.405246}, - Segment{text: "退縮", weight: 0.405246}, - Segment{text: "肌迫", weight: 0.405246}, - Segment{text: "矯作", weight: 0.405246}, - Segment{text: "怯懦", weight: 0.326238}, - Segment{text: "逼不得已", weight: 0.202623}, - Segment{text: "右銘", weight: 0.202623}, - Segment{text: "寬闊", weight: 0.202623}, + &Segment{text: "所謂", weight: 1.215739}, + &Segment{text: "一般", weight: 0.731179}, + &Segment{text: "雖然", weight: 0.405246}, + &Segment{text: "退縮", weight: 0.405246}, + &Segment{text: "肌迫", weight: 0.405246}, + &Segment{text: "矯作", weight: 0.405246}, + &Segment{text: "怯懦", weight: 0.326238}, + &Segment{text: "逼不得已", weight: 0.202623}, + &Segment{text: "右銘", weight: 0.202623}, + &Segment{text: "寬闊", weight: 0.202623}, } ) @@ -266,7 +266,7 @@ func TestExtractTags(t *testing.T) { } for i, tag := range result { if tag.text != Tags[index][i] { - t.Fatalf("%s != %s", tag, Tags[index][i]) + t.Fatalf("%v != %v", tag, Tags[index][i]) } } } diff --git a/analyse/textrank.go b/analyse/textrank.go index d9399d0..a895558 100755 --- a/analyse/textrank.go +++ b/analyse/textrank.go @@ -1,25 +1,27 @@ package analyse import ( + "hash/crc64" "math" "sort" "github.com/fumiama/jieba/posseg" + "github.com/fumiama/jieba/util/helper" ) const dampingFactor = 0.85 var ( - defaultAllowPOS = []string{"ns", "n", "vn", "v"} + defaultAllowPOS = [...]string{"ns", "n", "vn", "v"} ) type edge struct { + weight float64 start string end string - weight float64 } -type edges []edge +type edges []*edge func (es edges) Len() int { return len(es) @@ -39,25 +41,25 @@ type undirectWeightedGraph struct { } func newUndirectWeightedGraph() *undirectWeightedGraph { - u := new(undirectWeightedGraph) - u.graph = make(map[string]edges) - u.keys = make(sort.StringSlice, 0) - return u + return &undirectWeightedGraph{ + graph: make(map[string]edges, 256), + keys: make(sort.StringSlice, 0, 256), + } } func (u *undirectWeightedGraph) addEdge(start, end string, weight float64) { if _, ok := u.graph[start]; !ok { u.keys = append(u.keys, start) - u.graph[start] = edges{edge{start: start, end: end, weight: weight}} + u.graph[start] = edges{&edge{start: start, end: end, weight: weight}} } else { - u.graph[start] = append(u.graph[start], edge{start: start, end: end, weight: weight}) + u.graph[start] = append(u.graph[start], &edge{start: start, end: end, weight: weight}) } if _, ok := u.graph[end]; !ok { u.keys = append(u.keys, end) - u.graph[end] = edges{edge{start: end, end: start, weight: weight}} + u.graph[end] = edges{&edge{start: end, end: start, weight: weight}} } else { - u.graph[end] = append(u.graph[end], edge{start: end, end: start, weight: weight}) + u.graph[end] = append(u.graph[end], &edge{start: end, end: start, weight: weight}) } } @@ -66,8 +68,8 @@ func (u *undirectWeightedGraph) rank() Segments { sort.Sort(u.keys) } - ws := make(map[string]float64) - outSum := make(map[string]float64) + ws := make(map[string]float64, len(u.graph)*2) + outSum := make(map[string]float64, len(u.graph)*2) wsdef := 1.0 if len(u.graph) > 0 { @@ -101,9 +103,11 @@ func (u *undirectWeightedGraph) rank() Segments { maxRank = w } } - result := make(Segments, 0) + result := make(Segments, len(ws)) + i := 0 for n, w := range ws { - result = append(result, Segment{text: n, weight: (w - minRank/10.0) / (maxRank - minRank/10.0)}) + result[i] = &Segment{text: n, weight: (w - minRank/10.0) / (maxRank - minRank/10.0)} + i++ } sort.Sort(sort.Reverse(result)) return result @@ -112,12 +116,20 @@ func (u *undirectWeightedGraph) rank() Segments { // TextRankWithPOS extracts keywords from sentence using TextRank algorithm. // Parameter allowPOS allows a customized pos list. func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) Segments { - posFilt := make(map[string]int) + posFilt := make(map[string]int, len(allowPOS)*2) for _, pos := range allowPOS { posFilt[pos] = 1 } g := newUndirectWeightedGraph() - cm := make(map[[2]string]float64) + cm := make(map[uint64]float64, 256) + hm := make(map[uint64][2]string, 256) + gethash := func(a, b string) uint64 { + h := crc64.New(crc64.MakeTable(crc64.ISO)) + h.Write(helper.StringToBytes(a)) + h.Write([]byte("\t")) + h.Write(helper.StringToBytes(b)) + return h.Sum64() + } span := 5 var pairs []posseg.Segment for pair := range t.seg.Cut(sentence, true) { @@ -129,15 +141,18 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin if _, ok := posFilt[pairs[j].Pos()]; !ok { continue } - if _, ok := cm[[2]string{pairs[i].Text(), pairs[j].Text()}]; !ok { - cm[[2]string{pairs[i].Text(), pairs[j].Text()}] = 1.0 + h := gethash(pairs[i].Text(), pairs[j].Text()) + if _, ok := cm[h]; !ok { + cm[h] = 1.0 + hm[h] = [2]string{pairs[i].Text(), pairs[j].Text()} } else { - cm[[2]string{pairs[i].Text(), pairs[j].Text()}] += 1.0 + cm[h] += 1.0 } } } } - for startEnd, weight := range cm { + for h, weight := range cm { + startEnd := hm[h] g.addEdge(startEnd[0], startEnd[1], weight) } tags := g.rank() @@ -150,7 +165,7 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin // TextRank extract keywords from sentence using TextRank algorithm. // Parameter topK specify how many top keywords to be returned at most. func (t *TextRanker) TextRank(sentence string, topK int) Segments { - return t.TextRankWithPOS(sentence, topK, defaultAllowPOS) + return t.TextRankWithPOS(sentence, topK, defaultAllowPOS[:]) } // TextRanker is used to extract tags from sentence. diff --git a/analyse/textrank_test.go b/analyse/textrank_test.go index f3b85f3..339e3cf 100755 --- a/analyse/textrank_test.go +++ b/analyse/textrank_test.go @@ -9,16 +9,16 @@ var ( sentence = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。" tagRanks = Segments{ - Segment{text: "吉林", weight: 1.0}, - Segment{text: "欧亚", weight: 0.87807810644}, - Segment{text: "置业", weight: 0.562048250306}, - Segment{text: "实现", weight: 0.520905743929}, - Segment{text: "收入", weight: 0.384283870648}, - Segment{text: "增资", weight: 0.360590945312}, - Segment{text: "子公司", weight: 0.353131980904}, - Segment{text: "城市", weight: 0.307509449283}, - Segment{text: "全资", weight: 0.306324426665}, - Segment{text: "商业", weight: 0.306138241063}, + &Segment{text: "吉林", weight: 1.0}, + &Segment{text: "欧亚", weight: 0.87807810644}, + &Segment{text: "置业", weight: 0.562048250306}, + &Segment{text: "实现", weight: 0.520905743929}, + &Segment{text: "收入", weight: 0.384283870648}, + &Segment{text: "增资", weight: 0.360590945312}, + &Segment{text: "子公司", weight: 0.353131980904}, + &Segment{text: "城市", weight: 0.307509449283}, + &Segment{text: "全资", weight: 0.306324426665}, + &Segment{text: "商业", weight: 0.306138241063}, } ) diff --git a/util/helper/helper.go b/util/helper/helper.go new file mode 100644 index 0000000..46a474b --- /dev/null +++ b/util/helper/helper.go @@ -0,0 +1,21 @@ +package helper + +import ( + "reflect" + "unsafe" +) + +// BytesToString 没有内存开销的转换 +func BytesToString(b []byte) string { + return *(*string)(unsafe.Pointer(&b)) +} + +// StringToBytes 没有内存开销的转换 +func StringToBytes(s string) (b []byte) { + bh := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + sh := (*reflect.StringHeader)(unsafe.Pointer(&s)) + bh.Data = sh.Data + bh.Len = sh.Len + bh.Cap = sh.Len + return b +}