From 08ac49d10bae5fac494ba04ff653db30490e9673 Mon Sep 17 00:00:00 2001 From: Wang Bin Date: Wed, 25 Feb 2015 16:32:28 +0800 Subject: [PATCH] small refactor, don't compile regular expression every time, corresponding to jieba commit #32a0e92a09614cf5c72f87b1a59a5c4369200516 --- finalseg/finalseg.go | 11 +++++++---- jieba.go | 43 ++++++++++++++++++++++++------------------- posseg/posseg.go | 43 ++++++++++++++++++++----------------------- 3 files changed, 51 insertions(+), 46 deletions(-) diff --git a/finalseg/finalseg.go b/finalseg/finalseg.go index d42d195..8725610 100644 --- a/finalseg/finalseg.go +++ b/finalseg/finalseg.go @@ -4,6 +4,11 @@ import ( "regexp" ) +var ( + reHan = regexp.MustCompile(`\p{Han}+`) + reSkip = regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`) +) + func cutHan(sentence string) []string { runes := []rune(sentence) result := make([]string, 0) @@ -30,14 +35,12 @@ func cutHan(sentence string) []string { func Cut(sentence string) []string { result := make([]string, 0) - re_han := regexp.MustCompile(`\p{Han}+`) - re_skip := regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`) s := sentence var hans string var hanLoc []int var nonhanLoc []int for { - hanLoc = re_han.FindStringIndex(s) + hanLoc = reHan.FindStringIndex(s) if hanLoc == nil { if len(s) == 0 { break @@ -50,7 +53,7 @@ func Cut(sentence string) []string { } continue } - nonhanLoc = re_skip.FindStringIndex(s) + nonhanLoc = reSkip.FindStringIndex(s) if nonhanLoc == nil { if len(s) == 0 { break diff --git a/jieba.go b/jieba.go index 8ae8962..214a9d4 100644 --- a/jieba.go +++ b/jieba.go @@ -11,6 +11,11 @@ import ( var ( Dictionary = "dict.txt" UserWordTagTab = make(map[string]string) + reEng = regexp.MustCompile(`[[:alnum:]]`) + reHanCutAll = regexp.MustCompile(`\p{Han}+`) + reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`) + reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`) + reSkipDefault = regexp.MustCompile(`(\r\n|\s)`) ) type Route struct { @@ -187,7 +192,7 @@ func cut_DAG(sentence string) []string { func cut_DAG_NO_HMM(sentence string) []string { result := make([]string, 0) - re_eng := regexp.MustCompile(`[[:alnum:]]`) + dag := GetDAG(sentence) routes := Calc(sentence, dag) x := 0 @@ -201,7 +206,7 @@ func cut_DAG_NO_HMM(sentence string) []string { } y = routes[x].Index + 1 l_word := runes[x:y] - if re_eng.MatchString(string(l_word)) && len(l_word) == 1 { + if reEng.MatchString(string(l_word)) && len(l_word) == 1 { buf = append(buf, l_word...) x = y } else { @@ -220,7 +225,7 @@ func cut_DAG_NO_HMM(sentence string) []string { return result } -func cut_All(sentence string) []string { +func cutAll(sentence string) []string { result := make([]string, 0) runes := []rune(sentence) dag := GetDAG(sentence) @@ -247,51 +252,51 @@ func cut_All(sentence string) []string { return result } -func Cut(sentence string, cut_all bool, HMM bool) []string { +func Cut(sentence string, isCutAll bool, HMM bool) []string { result := make([]string, 0) - var re_han, re_skip *regexp.Regexp - if cut_all { - re_han = regexp.MustCompile(`\p{Han}+`) - re_skip = regexp.MustCompile(`[^[:alnum:]+#\n]`) + var reHan, reSkip *regexp.Regexp + if isCutAll { + reHan = reHanCutAll + reSkip = reSkipCutAll } else { - re_han = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`) - re_skip = regexp.MustCompile(`(\r\n|\s)`) + reHan = reHanDefault + reSkip = reSkipDefault } - blocks := RegexpSplit(re_han, sentence) + blocks := RegexpSplit(reHan, sentence) var cut_block cutAction if HMM { cut_block = cut_DAG } else { cut_block = cut_DAG_NO_HMM } - if cut_all { - cut_block = cut_All + if isCutAll { + cut_block = cutAll } for _, blk := range blocks { if len(blk) == 0 { continue } - if re_han.MatchString(blk) { + if reHan.MatchString(blk) { for _, word := range cut_block(blk) { result = append(result, word) } } else { type skipSplitFunc func(sentence string) []string var ssf skipSplitFunc - if cut_all { + if isCutAll { ssf = func(sentence string) []string { - return re_skip.Split(sentence, -1) + return reSkip.Split(sentence, -1) } } else { ssf = func(sentence string) []string { - return RegexpSplit(re_skip, sentence) + return RegexpSplit(reSkip, sentence) } } for _, x := range ssf(blk) { - if re_skip.MatchString(x) { + if reSkip.MatchString(x) { result = append(result, x) - } else if !cut_all { + } else if !isCutAll { for _, xx := range x { result = append(result, string(xx)) } diff --git a/posseg/posseg.go b/posseg/posseg.go index 04a4a62..8f11f49 100644 --- a/posseg/posseg.go +++ b/posseg/posseg.go @@ -12,7 +12,14 @@ import ( ) var ( - WordTagTab = make(map[string]string) + WordTagTab = make(map[string]string) + reHanDetail = regexp.MustCompile(`\p{Han}+`) + reSkipDetail = regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`) + reEng = regexp.MustCompile(`[[:alnum:]]`) + reNum = regexp.MustCompile(`[\.[:digit:]]+`) + reEng1 = regexp.MustCompile(`[[:alnum:]]$`) + reHanInternal = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`) + reSkipInternal = regexp.MustCompile(`(\r\n|\s)`) ) type WordTag struct { @@ -80,26 +87,21 @@ func __cut(sentence string) []WordTag { func cutDetail(sentence string) []WordTag { result := make([]WordTag, 0) - re_han := regexp.MustCompile(`\p{Han}+`) - re_skip := regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`) - - re_eng := regexp.MustCompile(`[[:alnum:]]`) - re_num := regexp.MustCompile(`[\.[:digit:]]+`) - blocks := jiebago.RegexpSplit(re_han, sentence) + blocks := jiebago.RegexpSplit(reHanDetail, sentence) for _, blk := range blocks { - if re_han.MatchString(blk) { + if reHanDetail.MatchString(blk) { for _, wordTag := range __cut(blk) { result = append(result, wordTag) } } else { - for _, x := range jiebago.RegexpSplit(re_skip, blk) { + for _, x := range jiebago.RegexpSplit(reSkipDetail, blk) { if len(x) == 0 { continue } switch { - case re_num.MatchString(x): + case reNum.MatchString(x): result = append(result, WordTag{x, "m"}) - case re_eng.MatchString(x): + case reEng.MatchString(x): result = append(result, WordTag{x, "eng"}) default: result = append(result, WordTag{x, "x"}) @@ -203,7 +205,6 @@ func cut_DAG(sentence string) []WordTag { func cut_DAG_NO_HMM(sentence string) []WordTag { result := make([]WordTag, 0) - re_eng := regexp.MustCompile(`[[:alnum:]]`) dag := jiebago.GetDAG(sentence) routes := jiebago.Calc(sentence, dag) x := 0 @@ -217,7 +218,7 @@ func cut_DAG_NO_HMM(sentence string) []WordTag { } y = routes[x].Index + 1 l_word := runes[x:y] - if re_eng.MatchString(string(l_word)) && len(l_word) == 1 { + if reEng1.MatchString(string(l_word)) && len(l_word) == 1 { buf = append(buf, l_word...) x = y } else { @@ -243,11 +244,7 @@ func cut_DAG_NO_HMM(sentence string) []WordTag { func cut(sentence string, HMM bool) []WordTag { result := make([]WordTag, 0) - re_han := regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`) - re_skip := regexp.MustCompile(`(\r\n|\s)`) - re_eng := regexp.MustCompile(`[[:alnum:]]`) - re_num := regexp.MustCompile(`[\.[:digit:]]+`) - blocks := jiebago.RegexpSplit(re_han, sentence) + blocks := jiebago.RegexpSplit(reHanInternal, sentence) var cut_block cutAction if HMM { cut_block = cut_DAG @@ -255,21 +252,21 @@ func cut(sentence string, HMM bool) []WordTag { cut_block = cut_DAG_NO_HMM } for _, blk := range blocks { - if re_han.MatchString(blk) { + if reHanInternal.MatchString(blk) { for _, wordTag := range cut_block(blk) { result = append(result, wordTag) } } else { - for _, x := range jiebago.RegexpSplit(re_skip, blk) { - if re_skip.MatchString(x) { + for _, x := range jiebago.RegexpSplit(reSkipInternal, blk) { + if reSkipInternal.MatchString(x) { result = append(result, WordTag{x, "x"}) } else { for _, xx := range x { s := string(xx) switch { - case re_num.MatchString(s): + case reNum.MatchString(s): result = append(result, WordTag{s, "m"}) - case re_eng.MatchString(x): + case reEng.MatchString(x): result = append(result, WordTag{x, "eng"}) break default: