优化 finalseg

2026-06-05 00:32:51 +08:00 · 2022-11-30 14:47:50 +08:00
parent 6b239b5918
commit b2508252d5
5 changed files with 52 additions and 72 deletions
--- a/finalseg/finalseg.go
+++ b/finalseg/finalseg.go
@@ -11,10 +11,9 @@ var (
 )

 func cutHan(sentence string) []string {
-	result := make([]string, 0, 10)
-
 	runes := []rune(sentence)
-	_, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
+	result := make([]string, 0, len(runes))
+	_, posList := viterbi(runes, 'B', 'M', 'E', 'S')
 	begin, next := 0, 0
 	for i, char := range runes {
 		pos := posList[i]
@@ -37,29 +36,23 @@ func cutHan(sentence string) []string {
 }

 // Cut cuts sentence into words using Hidden Markov Model with Viterbi
-// algorithm. It is used by jieba for unknonw words.
-func Cut(sentence string) []string {
-	result := make([]string, 0, 10)
-	s := sentence
-	var hans string
-	var hanLoc []int
-	var nonhanLoc []int
-
+// algorithm. It is used by jieba for unknown words.
+func Cut(s string) []string {
+	result := make([]string, 0, len(s))
+lop:
 	for {
-		hanLoc = reHan.FindStringIndex(s)
+		hanLoc := reHan.FindStringIndex(s)
 		if hanLoc == nil {
 			if len(s) == 0 {
 				break
 			}
 		} else if hanLoc[0] == 0 {
-			hans = s[hanLoc[0]:hanLoc[1]]
+			hans := s[hanLoc[0]:hanLoc[1]]
 			s = s[hanLoc[1]:]
-			for _, han := range cutHan(hans) {
-				result = append(result, han)
-			}
+			result = append(result, cutHan(hans)...)
 			continue
 		}
-		nonhanLoc = reSkip.FindStringIndex(s)
+		nonhanLoc := reSkip.FindStringIndex(s)
 		if nonhanLoc == nil {
 			if len(s) == 0 {
 				break
@@ -73,18 +66,19 @@ func Cut(sentence string) []string {
 			}
 		}
 		var loc []int
-		if hanLoc == nil && nonhanLoc == nil {
+		switch {
+		case hanLoc == nil && nonhanLoc == nil:
 			if len(s) > 0 {
 				result = append(result, s)
-				break
+				break lop
 			}
-		} else if hanLoc == nil {
+		case hanLoc == nil:
 			loc = nonhanLoc
-		} else if nonhanLoc == nil {
+		case nonhanLoc == nil:
 			loc = hanLoc
-		} else if hanLoc[0] < nonhanLoc[0] {
+		case hanLoc[0] < nonhanLoc[0]:
 			loc = hanLoc
-		} else {
+		default:
 			loc = nonhanLoc
 		}
 		result = append(result, s[:loc[0]])
--- a/finalseg/finalseg_test.go
+++ b/finalseg/finalseg_test.go
@@ -7,8 +7,7 @@ import (

 func TestViterbi(t *testing.T) {
 	obs := "我们是程序员"
-	states := []byte{'B', 'M', 'E', 'S'}
-	prob, path := viterbi([]rune(obs), states)
+	prob, path := viterbi([]rune(obs), 'B', 'M', 'E', 'S')
 	if math.Abs(prob+39.68824128493802) > 1e-10 {
 		t.Fatal(prob)
 	}
--- a/finalseg/prob_emit.go
+++ b/finalseg/prob_emit.go
@@ -1,9 +1,7 @@
 package finalseg

-var probEmit = make(map[byte]map[rune]float64)
-
-func init() {
-	probEmit['B'] = map[rune]float64{'\u4e00': -3.6544978750449433,
+var probEmit = map[byte]map[rune]float64{
+	'B': {'\u4e00': -3.6544978750449433,
 		'\u4e01': -8.125041941842026,
 		'\u4e03': -7.817392401429855,
 		'\u4e07': -6.3096425804013165,
@@ -6859,8 +6857,8 @@ func init() {
 		'\u9f99': -7.892474414343774,
 		'\u9f9a': -9.557108305917183,
 		'\u9f9c': -10.895131537474946,
-		'\u9f9f': -10.895131537474946}
-	probEmit['E'] = map[rune]float64{'\u4e00': -6.044987536255073,
+		'\u9f9f': -10.895131537474946},
+	'E': {'\u4e00': -6.044987536255073,
 		'\u4e01': -9.075800412310807,
 		'\u4e03': -9.198842005220659,
 		'\u4e07': -7.655326112989935,
@@ -14298,8 +14296,8 @@ func init() {
 		'\u9f9a': -15.137257331238825,
 		'\u9f9b': -12.729311722586953,
 		'\u9f9c': -10.574067217491615,
-		'\u9f9f': -10.574067217491615}
-	probEmit['M'] = map[rune]float64{'\u4e00': -4.428158526435913,
+		'\u9f9f': -10.574067217491615},
+	'M': {'\u4e00': -4.428158526435913,
 		'\u4e01': -7.932945687598502,
 		'\u4e03': -6.559715525951586,
 		'\u4e07': -6.139922374120667,
@@ -20707,8 +20705,8 @@ func init() {
 		'\u9f99': -6.908072798071771,
 		'\u9f9a': -14.14915250738439,
 		'\u9f9c': -11.058110054026073,
-		'\u9f9f': -11.058110054026073}
-	probEmit['S'] = map[rune]float64{'\u2236': -15.828865681131282,
+		'\u9f9f': -11.058110054026073},
+	'S': {'\u2236': -15.828865681131282,
 		'\u4e00': -4.92368982120877,
 		'\u4e01': -9.024528361347633,
 		'\u4e02': -16.522012861691227,
@@ -35226,6 +35224,5 @@ func init() {
 		'\u9f9c': -10.409437488834186,
 		'\u9f9f': -10.409437488834186,
 		'\u9fa0': -15.605722129817071,
-		'\u9fa2': -10.61937952828986}
-
+		'\u9fa2': -10.61937952828986},
 }
--- a/finalseg/prob_trans.go
+++ b/finalseg/prob_trans.go
@@ -1,14 +1,8 @@
 package finalseg

-var probTrans = make(map[byte]map[byte]float64)
-
-func init() {
-	probTrans['B'] = map[byte]float64{'E': -0.510825623765990,
-		'M': -0.916290731874155}
-	probTrans['E'] = map[byte]float64{'B': -0.5897149736854513,
-		'S': -0.8085250474669937}
-	probTrans['M'] = map[byte]float64{'E': -0.33344856811948514,
-		'M': -1.2603623820268226}
-	probTrans['S'] = map[byte]float64{'B': -0.7211965654669841,
-		'S': -0.6658631448798212}
+var probTrans = map[byte]map[byte]float64{
+	'B': {'E': -0.510825623765990, 'M': -0.916290731874155},
+	'E': {'B': -0.5897149736854513, 'S': -0.8085250474669937},
+	'M': {'E': -0.33344856811948514, 'M': -1.2603623820268226},
+	'S': {'B': -0.7211965654669841, 'S': -0.6658631448798212},
 }
--- a/finalseg/viterbi.go
+++ b/finalseg/viterbi.go
@@ -8,20 +8,19 @@ import (
 const minFloat = -3.14e100

 var (
-	prevStatus = make(map[byte][]byte)
-	probStart  = make(map[byte]float64)
-)
-
-func init() {
-	prevStatus['B'] = []byte{'E', 'S'}
-	prevStatus['M'] = []byte{'M', 'B'}
-	prevStatus['S'] = []byte{'S', 'E'}
-	prevStatus['E'] = []byte{'B', 'M'}
-	probStart['B'] = -0.26268660809250016
-	probStart['E'] = -3.14e+100
-	probStart['M'] = -3.14e+100
-	probStart['S'] = -1.4652633398537678
+	prevStatus = map[byte][2]byte{
+		'B': {'E', 'S'},
+		'M': {'M', 'B'},
+		'S': {'S', 'E'},
+		'E': {'B', 'M'},
 	}
+	probStart = map[byte]float64{
+		'B': -0.26268660809250016,
+		'E': -3.14e+100,
+		'M': -3.14e+100,
+		'S': -1.4652633398537678,
+	}
+)

 type probState struct {
 	prob  float64
@@ -49,10 +48,10 @@ func (ps probStates) Swap(i, j int) {
 	ps[i], ps[j] = ps[j], ps[i]
 }

-func viterbi(obs []rune, states []byte) (float64, []byte) {
-	path := make(map[byte][]byte)
-	V := make([]map[byte]float64, len(obs))
-	V[0] = make(map[byte]float64)
+func viterbi(obs []rune, states ...byte) (float64, []byte) {
+	path := [256][]byte{}
+	newPath := [256][]byte{}
+	V := make([][256]float64, len(obs))
 	for _, y := range states {
 		if val, ok := probEmit[y][obs[0]]; ok {
 			V[0][y] = val + probStart[y]
@@ -61,12 +60,9 @@ func viterbi(obs []rune, states []byte) (float64, []byte) {
 		}
 		path[y] = []byte{y}
 	}
-
 	for t := 1; t < len(obs); t++ {
-		newPath := make(map[byte][]byte)
-		V[t] = make(map[byte]float64)
 		for _, y := range states {
-			ps0 := make(probStates, 0)
+			ps0 := make(probStates, 0, 2)
 			var emP float64
 			if val, ok := probEmit[y][obs[t]]; ok {
 				emP = val
@@ -91,9 +87,9 @@ func viterbi(obs []rune, states []byte) (float64, []byte) {
 		}
 		path = newPath
 	}
-	ps := make(probStates, 0)
-	for _, y := range []byte{'E', 'S'} {
-		ps = append(ps, &probState{V[len(obs)-1][y], y})
+	ps := probStates{
+		&probState{V[len(obs)-1]['E'], 'E'},
+		&probState{V[len(obs)-1]['S'], 'S'},
 	}
 	sort.Sort(sort.Reverse(ps))
 	v := ps[0]