1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-23 20:50:27 +08:00

Small refactor: compile the regular expressions once at package level instead of on every call. Corresponds to jieba commit #32a0e92a09614cf5c72f87b1a59a5c4369200516.

This commit is contained in:
Wang Bin
2015-02-25 16:32:28 +08:00
parent 5702495bf6
commit 08ac49d10b
3 changed files with 51 additions and 46 deletions

View File

@@ -4,6 +4,11 @@ import (
"regexp" "regexp"
) )
// Package-level regular expressions, compiled once at package initialization
// so that Cut does not recompile them on every call.
var (
// reHan matches a run of one or more Han-script characters.
reHan = regexp.MustCompile(`\p{Han}+`)
// reSkip matches either a decimal number of the form digits.digits or a
// run of ASCII letters and digits.
reSkip = regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`)
)
func cutHan(sentence string) []string { func cutHan(sentence string) []string {
runes := []rune(sentence) runes := []rune(sentence)
result := make([]string, 0) result := make([]string, 0)
@@ -30,14 +35,12 @@ func cutHan(sentence string) []string {
func Cut(sentence string) []string { func Cut(sentence string) []string {
result := make([]string, 0) result := make([]string, 0)
re_han := regexp.MustCompile(`\p{Han}+`)
re_skip := regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`)
s := sentence s := sentence
var hans string var hans string
var hanLoc []int var hanLoc []int
var nonhanLoc []int var nonhanLoc []int
for { for {
hanLoc = re_han.FindStringIndex(s) hanLoc = reHan.FindStringIndex(s)
if hanLoc == nil { if hanLoc == nil {
if len(s) == 0 { if len(s) == 0 {
break break
@@ -50,7 +53,7 @@ func Cut(sentence string) []string {
} }
continue continue
} }
nonhanLoc = re_skip.FindStringIndex(s) nonhanLoc = reSkip.FindStringIndex(s)
if nonhanLoc == nil { if nonhanLoc == nil {
if len(s) == 0 { if len(s) == 0 {
break break

View File

@@ -11,6 +11,11 @@ import (
var ( var (
Dictionary = "dict.txt" Dictionary = "dict.txt"
UserWordTagTab = make(map[string]string) UserWordTagTab = make(map[string]string)
reEng = regexp.MustCompile(`[[:alnum:]]`)
reHanCutAll = regexp.MustCompile(`\p{Han}+`)
reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`)
reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
) )
type Route struct { type Route struct {
@@ -187,7 +192,7 @@ func cut_DAG(sentence string) []string {
func cut_DAG_NO_HMM(sentence string) []string { func cut_DAG_NO_HMM(sentence string) []string {
result := make([]string, 0) result := make([]string, 0)
re_eng := regexp.MustCompile(`[[:alnum:]]`)
dag := GetDAG(sentence) dag := GetDAG(sentence)
routes := Calc(sentence, dag) routes := Calc(sentence, dag)
x := 0 x := 0
@@ -201,7 +206,7 @@ func cut_DAG_NO_HMM(sentence string) []string {
} }
y = routes[x].Index + 1 y = routes[x].Index + 1
l_word := runes[x:y] l_word := runes[x:y]
if re_eng.MatchString(string(l_word)) && len(l_word) == 1 { if reEng.MatchString(string(l_word)) && len(l_word) == 1 {
buf = append(buf, l_word...) buf = append(buf, l_word...)
x = y x = y
} else { } else {
@@ -220,7 +225,7 @@ func cut_DAG_NO_HMM(sentence string) []string {
return result return result
} }
func cut_All(sentence string) []string { func cutAll(sentence string) []string {
result := make([]string, 0) result := make([]string, 0)
runes := []rune(sentence) runes := []rune(sentence)
dag := GetDAG(sentence) dag := GetDAG(sentence)
@@ -247,51 +252,51 @@ func cut_All(sentence string) []string {
return result return result
} }
func Cut(sentence string, cut_all bool, HMM bool) []string { func Cut(sentence string, isCutAll bool, HMM bool) []string {
result := make([]string, 0) result := make([]string, 0)
var re_han, re_skip *regexp.Regexp var reHan, reSkip *regexp.Regexp
if cut_all { if isCutAll {
re_han = regexp.MustCompile(`\p{Han}+`) reHan = reHanCutAll
re_skip = regexp.MustCompile(`[^[:alnum:]+#\n]`) reSkip = reSkipCutAll
} else { } else {
re_han = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`) reHan = reHanDefault
re_skip = regexp.MustCompile(`(\r\n|\s)`) reSkip = reSkipDefault
} }
blocks := RegexpSplit(re_han, sentence) blocks := RegexpSplit(reHan, sentence)
var cut_block cutAction var cut_block cutAction
if HMM { if HMM {
cut_block = cut_DAG cut_block = cut_DAG
} else { } else {
cut_block = cut_DAG_NO_HMM cut_block = cut_DAG_NO_HMM
} }
if cut_all { if isCutAll {
cut_block = cut_All cut_block = cutAll
} }
for _, blk := range blocks { for _, blk := range blocks {
if len(blk) == 0 { if len(blk) == 0 {
continue continue
} }
if re_han.MatchString(blk) { if reHan.MatchString(blk) {
for _, word := range cut_block(blk) { for _, word := range cut_block(blk) {
result = append(result, word) result = append(result, word)
} }
} else { } else {
type skipSplitFunc func(sentence string) []string type skipSplitFunc func(sentence string) []string
var ssf skipSplitFunc var ssf skipSplitFunc
if cut_all { if isCutAll {
ssf = func(sentence string) []string { ssf = func(sentence string) []string {
return re_skip.Split(sentence, -1) return reSkip.Split(sentence, -1)
} }
} else { } else {
ssf = func(sentence string) []string { ssf = func(sentence string) []string {
return RegexpSplit(re_skip, sentence) return RegexpSplit(reSkip, sentence)
} }
} }
for _, x := range ssf(blk) { for _, x := range ssf(blk) {
if re_skip.MatchString(x) { if reSkip.MatchString(x) {
result = append(result, x) result = append(result, x)
} else if !cut_all { } else if !isCutAll {
for _, xx := range x { for _, xx := range x {
result = append(result, string(xx)) result = append(result, string(xx))
} }

View File

@@ -12,7 +12,14 @@ import (
) )
var ( var (
WordTagTab = make(map[string]string) WordTagTab = make(map[string]string)
reHanDetail = regexp.MustCompile(`\p{Han}+`)
reSkipDetail = regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`)
reEng = regexp.MustCompile(`[[:alnum:]]`)
reNum = regexp.MustCompile(`[\.[:digit:]]+`)
reEng1 = regexp.MustCompile(`[[:alnum:]]$`)
reHanInternal = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
reSkipInternal = regexp.MustCompile(`(\r\n|\s)`)
) )
type WordTag struct { type WordTag struct {
@@ -80,26 +87,21 @@ func __cut(sentence string) []WordTag {
func cutDetail(sentence string) []WordTag { func cutDetail(sentence string) []WordTag {
result := make([]WordTag, 0) result := make([]WordTag, 0)
re_han := regexp.MustCompile(`\p{Han}+`) blocks := jiebago.RegexpSplit(reHanDetail, sentence)
re_skip := regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`)
re_eng := regexp.MustCompile(`[[:alnum:]]`)
re_num := regexp.MustCompile(`[\.[:digit:]]+`)
blocks := jiebago.RegexpSplit(re_han, sentence)
for _, blk := range blocks { for _, blk := range blocks {
if re_han.MatchString(blk) { if reHanDetail.MatchString(blk) {
for _, wordTag := range __cut(blk) { for _, wordTag := range __cut(blk) {
result = append(result, wordTag) result = append(result, wordTag)
} }
} else { } else {
for _, x := range jiebago.RegexpSplit(re_skip, blk) { for _, x := range jiebago.RegexpSplit(reSkipDetail, blk) {
if len(x) == 0 { if len(x) == 0 {
continue continue
} }
switch { switch {
case re_num.MatchString(x): case reNum.MatchString(x):
result = append(result, WordTag{x, "m"}) result = append(result, WordTag{x, "m"})
case re_eng.MatchString(x): case reEng.MatchString(x):
result = append(result, WordTag{x, "eng"}) result = append(result, WordTag{x, "eng"})
default: default:
result = append(result, WordTag{x, "x"}) result = append(result, WordTag{x, "x"})
@@ -203,7 +205,6 @@ func cut_DAG(sentence string) []WordTag {
func cut_DAG_NO_HMM(sentence string) []WordTag { func cut_DAG_NO_HMM(sentence string) []WordTag {
result := make([]WordTag, 0) result := make([]WordTag, 0)
re_eng := regexp.MustCompile(`[[:alnum:]]`)
dag := jiebago.GetDAG(sentence) dag := jiebago.GetDAG(sentence)
routes := jiebago.Calc(sentence, dag) routes := jiebago.Calc(sentence, dag)
x := 0 x := 0
@@ -217,7 +218,7 @@ func cut_DAG_NO_HMM(sentence string) []WordTag {
} }
y = routes[x].Index + 1 y = routes[x].Index + 1
l_word := runes[x:y] l_word := runes[x:y]
if re_eng.MatchString(string(l_word)) && len(l_word) == 1 { if reEng1.MatchString(string(l_word)) && len(l_word) == 1 {
buf = append(buf, l_word...) buf = append(buf, l_word...)
x = y x = y
} else { } else {
@@ -243,11 +244,7 @@ func cut_DAG_NO_HMM(sentence string) []WordTag {
func cut(sentence string, HMM bool) []WordTag { func cut(sentence string, HMM bool) []WordTag {
result := make([]WordTag, 0) result := make([]WordTag, 0)
re_han := regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`) blocks := jiebago.RegexpSplit(reHanInternal, sentence)
re_skip := regexp.MustCompile(`(\r\n|\s)`)
re_eng := regexp.MustCompile(`[[:alnum:]]`)
re_num := regexp.MustCompile(`[\.[:digit:]]+`)
blocks := jiebago.RegexpSplit(re_han, sentence)
var cut_block cutAction var cut_block cutAction
if HMM { if HMM {
cut_block = cut_DAG cut_block = cut_DAG
@@ -255,21 +252,21 @@ func cut(sentence string, HMM bool) []WordTag {
cut_block = cut_DAG_NO_HMM cut_block = cut_DAG_NO_HMM
} }
for _, blk := range blocks { for _, blk := range blocks {
if re_han.MatchString(blk) { if reHanInternal.MatchString(blk) {
for _, wordTag := range cut_block(blk) { for _, wordTag := range cut_block(blk) {
result = append(result, wordTag) result = append(result, wordTag)
} }
} else { } else {
for _, x := range jiebago.RegexpSplit(re_skip, blk) { for _, x := range jiebago.RegexpSplit(reSkipInternal, blk) {
if re_skip.MatchString(x) { if reSkipInternal.MatchString(x) {
result = append(result, WordTag{x, "x"}) result = append(result, WordTag{x, "x"})
} else { } else {
for _, xx := range x { for _, xx := range x {
s := string(xx) s := string(xx)
switch { switch {
case re_num.MatchString(s): case reNum.MatchString(s):
result = append(result, WordTag{s, "m"}) result = append(result, WordTag{s, "m"})
case re_eng.MatchString(x): case reEng.MatchString(x):
result = append(result, WordTag{x, "eng"}) result = append(result, WordTag{x, "eng"})
break break
default: default: