mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-23 20:50:27 +08:00
Small refactor: compile each regular expression once at package level instead of on every call. Corresponds to jieba commit 32a0e92a09614cf5c72f87b1a59a5c4369200516.
This commit is contained in:
@@ -4,6 +4,11 @@ import (
|
|||||||
"regexp"
|
"regexp"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
reHan = regexp.MustCompile(`\p{Han}+`)
|
||||||
|
reSkip = regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`)
|
||||||
|
)
|
||||||
|
|
||||||
func cutHan(sentence string) []string {
|
func cutHan(sentence string) []string {
|
||||||
runes := []rune(sentence)
|
runes := []rune(sentence)
|
||||||
result := make([]string, 0)
|
result := make([]string, 0)
|
||||||
@@ -30,14 +35,12 @@ func cutHan(sentence string) []string {
|
|||||||
|
|
||||||
func Cut(sentence string) []string {
|
func Cut(sentence string) []string {
|
||||||
result := make([]string, 0)
|
result := make([]string, 0)
|
||||||
re_han := regexp.MustCompile(`\p{Han}+`)
|
|
||||||
re_skip := regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`)
|
|
||||||
s := sentence
|
s := sentence
|
||||||
var hans string
|
var hans string
|
||||||
var hanLoc []int
|
var hanLoc []int
|
||||||
var nonhanLoc []int
|
var nonhanLoc []int
|
||||||
for {
|
for {
|
||||||
hanLoc = re_han.FindStringIndex(s)
|
hanLoc = reHan.FindStringIndex(s)
|
||||||
if hanLoc == nil {
|
if hanLoc == nil {
|
||||||
if len(s) == 0 {
|
if len(s) == 0 {
|
||||||
break
|
break
|
||||||
@@ -50,7 +53,7 @@ func Cut(sentence string) []string {
|
|||||||
}
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
nonhanLoc = re_skip.FindStringIndex(s)
|
nonhanLoc = reSkip.FindStringIndex(s)
|
||||||
if nonhanLoc == nil {
|
if nonhanLoc == nil {
|
||||||
if len(s) == 0 {
|
if len(s) == 0 {
|
||||||
break
|
break
|
||||||
|
|||||||
43
jieba.go
43
jieba.go
@@ -11,6 +11,11 @@ import (
|
|||||||
var (
|
var (
|
||||||
Dictionary = "dict.txt"
|
Dictionary = "dict.txt"
|
||||||
UserWordTagTab = make(map[string]string)
|
UserWordTagTab = make(map[string]string)
|
||||||
|
reEng = regexp.MustCompile(`[[:alnum:]]`)
|
||||||
|
reHanCutAll = regexp.MustCompile(`\p{Han}+`)
|
||||||
|
reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`)
|
||||||
|
reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
|
||||||
|
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
|
||||||
)
|
)
|
||||||
|
|
||||||
type Route struct {
|
type Route struct {
|
||||||
@@ -187,7 +192,7 @@ func cut_DAG(sentence string) []string {
|
|||||||
|
|
||||||
func cut_DAG_NO_HMM(sentence string) []string {
|
func cut_DAG_NO_HMM(sentence string) []string {
|
||||||
result := make([]string, 0)
|
result := make([]string, 0)
|
||||||
re_eng := regexp.MustCompile(`[[:alnum:]]`)
|
|
||||||
dag := GetDAG(sentence)
|
dag := GetDAG(sentence)
|
||||||
routes := Calc(sentence, dag)
|
routes := Calc(sentence, dag)
|
||||||
x := 0
|
x := 0
|
||||||
@@ -201,7 +206,7 @@ func cut_DAG_NO_HMM(sentence string) []string {
|
|||||||
}
|
}
|
||||||
y = routes[x].Index + 1
|
y = routes[x].Index + 1
|
||||||
l_word := runes[x:y]
|
l_word := runes[x:y]
|
||||||
if re_eng.MatchString(string(l_word)) && len(l_word) == 1 {
|
if reEng.MatchString(string(l_word)) && len(l_word) == 1 {
|
||||||
buf = append(buf, l_word...)
|
buf = append(buf, l_word...)
|
||||||
x = y
|
x = y
|
||||||
} else {
|
} else {
|
||||||
@@ -220,7 +225,7 @@ func cut_DAG_NO_HMM(sentence string) []string {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
func cut_All(sentence string) []string {
|
func cutAll(sentence string) []string {
|
||||||
result := make([]string, 0)
|
result := make([]string, 0)
|
||||||
runes := []rune(sentence)
|
runes := []rune(sentence)
|
||||||
dag := GetDAG(sentence)
|
dag := GetDAG(sentence)
|
||||||
@@ -247,51 +252,51 @@ func cut_All(sentence string) []string {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
func Cut(sentence string, cut_all bool, HMM bool) []string {
|
func Cut(sentence string, isCutAll bool, HMM bool) []string {
|
||||||
result := make([]string, 0)
|
result := make([]string, 0)
|
||||||
var re_han, re_skip *regexp.Regexp
|
var reHan, reSkip *regexp.Regexp
|
||||||
if cut_all {
|
if isCutAll {
|
||||||
re_han = regexp.MustCompile(`\p{Han}+`)
|
reHan = reHanCutAll
|
||||||
re_skip = regexp.MustCompile(`[^[:alnum:]+#\n]`)
|
reSkip = reSkipCutAll
|
||||||
} else {
|
} else {
|
||||||
re_han = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
|
reHan = reHanDefault
|
||||||
re_skip = regexp.MustCompile(`(\r\n|\s)`)
|
reSkip = reSkipDefault
|
||||||
}
|
}
|
||||||
blocks := RegexpSplit(re_han, sentence)
|
blocks := RegexpSplit(reHan, sentence)
|
||||||
var cut_block cutAction
|
var cut_block cutAction
|
||||||
if HMM {
|
if HMM {
|
||||||
cut_block = cut_DAG
|
cut_block = cut_DAG
|
||||||
} else {
|
} else {
|
||||||
cut_block = cut_DAG_NO_HMM
|
cut_block = cut_DAG_NO_HMM
|
||||||
}
|
}
|
||||||
if cut_all {
|
if isCutAll {
|
||||||
cut_block = cut_All
|
cut_block = cutAll
|
||||||
}
|
}
|
||||||
for _, blk := range blocks {
|
for _, blk := range blocks {
|
||||||
if len(blk) == 0 {
|
if len(blk) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if re_han.MatchString(blk) {
|
if reHan.MatchString(blk) {
|
||||||
for _, word := range cut_block(blk) {
|
for _, word := range cut_block(blk) {
|
||||||
result = append(result, word)
|
result = append(result, word)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
type skipSplitFunc func(sentence string) []string
|
type skipSplitFunc func(sentence string) []string
|
||||||
var ssf skipSplitFunc
|
var ssf skipSplitFunc
|
||||||
if cut_all {
|
if isCutAll {
|
||||||
ssf = func(sentence string) []string {
|
ssf = func(sentence string) []string {
|
||||||
return re_skip.Split(sentence, -1)
|
return reSkip.Split(sentence, -1)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
ssf = func(sentence string) []string {
|
ssf = func(sentence string) []string {
|
||||||
return RegexpSplit(re_skip, sentence)
|
return RegexpSplit(reSkip, sentence)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, x := range ssf(blk) {
|
for _, x := range ssf(blk) {
|
||||||
if re_skip.MatchString(x) {
|
if reSkip.MatchString(x) {
|
||||||
result = append(result, x)
|
result = append(result, x)
|
||||||
} else if !cut_all {
|
} else if !isCutAll {
|
||||||
for _, xx := range x {
|
for _, xx := range x {
|
||||||
result = append(result, string(xx))
|
result = append(result, string(xx))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,7 +12,14 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
WordTagTab = make(map[string]string)
|
WordTagTab = make(map[string]string)
|
||||||
|
reHanDetail = regexp.MustCompile(`\p{Han}+`)
|
||||||
|
reSkipDetail = regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`)
|
||||||
|
reEng = regexp.MustCompile(`[[:alnum:]]`)
|
||||||
|
reNum = regexp.MustCompile(`[\.[:digit:]]+`)
|
||||||
|
reEng1 = regexp.MustCompile(`[[:alnum:]]$`)
|
||||||
|
reHanInternal = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
|
||||||
|
reSkipInternal = regexp.MustCompile(`(\r\n|\s)`)
|
||||||
)
|
)
|
||||||
|
|
||||||
type WordTag struct {
|
type WordTag struct {
|
||||||
@@ -80,26 +87,21 @@ func __cut(sentence string) []WordTag {
|
|||||||
|
|
||||||
func cutDetail(sentence string) []WordTag {
|
func cutDetail(sentence string) []WordTag {
|
||||||
result := make([]WordTag, 0)
|
result := make([]WordTag, 0)
|
||||||
re_han := regexp.MustCompile(`\p{Han}+`)
|
blocks := jiebago.RegexpSplit(reHanDetail, sentence)
|
||||||
re_skip := regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`)
|
|
||||||
|
|
||||||
re_eng := regexp.MustCompile(`[[:alnum:]]`)
|
|
||||||
re_num := regexp.MustCompile(`[\.[:digit:]]+`)
|
|
||||||
blocks := jiebago.RegexpSplit(re_han, sentence)
|
|
||||||
for _, blk := range blocks {
|
for _, blk := range blocks {
|
||||||
if re_han.MatchString(blk) {
|
if reHanDetail.MatchString(blk) {
|
||||||
for _, wordTag := range __cut(blk) {
|
for _, wordTag := range __cut(blk) {
|
||||||
result = append(result, wordTag)
|
result = append(result, wordTag)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for _, x := range jiebago.RegexpSplit(re_skip, blk) {
|
for _, x := range jiebago.RegexpSplit(reSkipDetail, blk) {
|
||||||
if len(x) == 0 {
|
if len(x) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
switch {
|
switch {
|
||||||
case re_num.MatchString(x):
|
case reNum.MatchString(x):
|
||||||
result = append(result, WordTag{x, "m"})
|
result = append(result, WordTag{x, "m"})
|
||||||
case re_eng.MatchString(x):
|
case reEng.MatchString(x):
|
||||||
result = append(result, WordTag{x, "eng"})
|
result = append(result, WordTag{x, "eng"})
|
||||||
default:
|
default:
|
||||||
result = append(result, WordTag{x, "x"})
|
result = append(result, WordTag{x, "x"})
|
||||||
@@ -203,7 +205,6 @@ func cut_DAG(sentence string) []WordTag {
|
|||||||
|
|
||||||
func cut_DAG_NO_HMM(sentence string) []WordTag {
|
func cut_DAG_NO_HMM(sentence string) []WordTag {
|
||||||
result := make([]WordTag, 0)
|
result := make([]WordTag, 0)
|
||||||
re_eng := regexp.MustCompile(`[[:alnum:]]`)
|
|
||||||
dag := jiebago.GetDAG(sentence)
|
dag := jiebago.GetDAG(sentence)
|
||||||
routes := jiebago.Calc(sentence, dag)
|
routes := jiebago.Calc(sentence, dag)
|
||||||
x := 0
|
x := 0
|
||||||
@@ -217,7 +218,7 @@ func cut_DAG_NO_HMM(sentence string) []WordTag {
|
|||||||
}
|
}
|
||||||
y = routes[x].Index + 1
|
y = routes[x].Index + 1
|
||||||
l_word := runes[x:y]
|
l_word := runes[x:y]
|
||||||
if re_eng.MatchString(string(l_word)) && len(l_word) == 1 {
|
if reEng1.MatchString(string(l_word)) && len(l_word) == 1 {
|
||||||
buf = append(buf, l_word...)
|
buf = append(buf, l_word...)
|
||||||
x = y
|
x = y
|
||||||
} else {
|
} else {
|
||||||
@@ -243,11 +244,7 @@ func cut_DAG_NO_HMM(sentence string) []WordTag {
|
|||||||
|
|
||||||
func cut(sentence string, HMM bool) []WordTag {
|
func cut(sentence string, HMM bool) []WordTag {
|
||||||
result := make([]WordTag, 0)
|
result := make([]WordTag, 0)
|
||||||
re_han := regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
|
blocks := jiebago.RegexpSplit(reHanInternal, sentence)
|
||||||
re_skip := regexp.MustCompile(`(\r\n|\s)`)
|
|
||||||
re_eng := regexp.MustCompile(`[[:alnum:]]`)
|
|
||||||
re_num := regexp.MustCompile(`[\.[:digit:]]+`)
|
|
||||||
blocks := jiebago.RegexpSplit(re_han, sentence)
|
|
||||||
var cut_block cutAction
|
var cut_block cutAction
|
||||||
if HMM {
|
if HMM {
|
||||||
cut_block = cut_DAG
|
cut_block = cut_DAG
|
||||||
@@ -255,21 +252,21 @@ func cut(sentence string, HMM bool) []WordTag {
|
|||||||
cut_block = cut_DAG_NO_HMM
|
cut_block = cut_DAG_NO_HMM
|
||||||
}
|
}
|
||||||
for _, blk := range blocks {
|
for _, blk := range blocks {
|
||||||
if re_han.MatchString(blk) {
|
if reHanInternal.MatchString(blk) {
|
||||||
for _, wordTag := range cut_block(blk) {
|
for _, wordTag := range cut_block(blk) {
|
||||||
result = append(result, wordTag)
|
result = append(result, wordTag)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for _, x := range jiebago.RegexpSplit(re_skip, blk) {
|
for _, x := range jiebago.RegexpSplit(reSkipInternal, blk) {
|
||||||
if re_skip.MatchString(x) {
|
if reSkipInternal.MatchString(x) {
|
||||||
result = append(result, WordTag{x, "x"})
|
result = append(result, WordTag{x, "x"})
|
||||||
} else {
|
} else {
|
||||||
for _, xx := range x {
|
for _, xx := range x {
|
||||||
s := string(xx)
|
s := string(xx)
|
||||||
switch {
|
switch {
|
||||||
case re_num.MatchString(s):
|
case reNum.MatchString(s):
|
||||||
result = append(result, WordTag{s, "m"})
|
result = append(result, WordTag{s, "m"})
|
||||||
case re_eng.MatchString(x):
|
case reEng.MatchString(x):
|
||||||
result = append(result, WordTag{x, "eng"})
|
result = append(result, WordTag{x, "eng"})
|
||||||
break
|
break
|
||||||
default:
|
default:
|
||||||
|
|||||||
Reference in New Issue
Block a user