diff --git a/finalseg/finalseg.go b/finalseg/finalseg.go index 566070b..7b7688c 100755 --- a/finalseg/finalseg.go +++ b/finalseg/finalseg.go @@ -11,10 +11,9 @@ var ( ) func cutHan(sentence string) []string { - result := make([]string, 0, 10) - runes := []rune(sentence) - _, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'}) + result := make([]string, 0, len(runes)) + _, posList := viterbi(runes, 'B', 'M', 'E', 'S') begin, next := 0, 0 for i, char := range runes { pos := posList[i] @@ -37,29 +36,23 @@ func cutHan(sentence string) []string { } // Cut cuts sentence into words using Hidden Markov Model with Viterbi -// algorithm. It is used by jieba for unknonw words. -func Cut(sentence string) []string { - result := make([]string, 0, 10) - s := sentence - var hans string - var hanLoc []int - var nonhanLoc []int - +// algorithm. It is used by jieba for unknown words. +func Cut(s string) []string { + result := make([]string, 0, len(s)) +lop: for { - hanLoc = reHan.FindStringIndex(s) + hanLoc := reHan.FindStringIndex(s) if hanLoc == nil { if len(s) == 0 { break } } else if hanLoc[0] == 0 { - hans = s[hanLoc[0]:hanLoc[1]] + hans := s[hanLoc[0]:hanLoc[1]] s = s[hanLoc[1]:] - for _, han := range cutHan(hans) { - result = append(result, han) - } + result = append(result, cutHan(hans)...) continue } - nonhanLoc = reSkip.FindStringIndex(s) + nonhanLoc := reSkip.FindStringIndex(s) if nonhanLoc == nil { if len(s) == 0 { break @@ -73,18 +66,19 @@ func Cut(sentence string) []string { } } var loc []int - if hanLoc == nil && nonhanLoc == nil { + switch { + case hanLoc == nil && nonhanLoc == nil: if len(s) > 0 { result = append(result, s) - break + break lop } - } else if hanLoc == nil { + case hanLoc == nil: loc = nonhanLoc - } else if nonhanLoc == nil { + case nonhanLoc == nil: loc = hanLoc - } else if hanLoc[0] < nonhanLoc[0] { + case hanLoc[0] < nonhanLoc[0]: loc = hanLoc - } else { + default: loc = nonhanLoc } result = append(result, s[:loc[0]]) diff --git a/finalseg/finalseg_test.go b/finalseg/finalseg_test.go index 120a984..5fd41f2 100755 --- a/finalseg/finalseg_test.go +++ b/finalseg/finalseg_test.go @@ -7,8 +7,7 @@ import ( func TestViterbi(t *testing.T) { obs := "我们是程序员" - states := []byte{'B', 'M', 'E', 'S'} - prob, path := viterbi([]rune(obs), states) + prob, path := viterbi([]rune(obs), 'B', 'M', 'E', 'S') if math.Abs(prob+39.68824128493802) > 1e-10 { t.Fatal(prob) } diff --git a/finalseg/prob_emit.go b/finalseg/prob_emit.go index 3832a22..67b8b9b 100755 --- a/finalseg/prob_emit.go +++ b/finalseg/prob_emit.go @@ -1,9 +1,7 @@ package finalseg -var probEmit = make(map[byte]map[rune]float64) - -func init() { - probEmit['B'] = map[rune]float64{'\u4e00': -3.6544978750449433, +var probEmit = map[byte]map[rune]float64{ + 'B': {'\u4e00': -3.6544978750449433, '\u4e01': -8.125041941842026, '\u4e03': -7.817392401429855, '\u4e07': -6.3096425804013165, @@ -6859,8 +6857,8 @@ func init() { '\u9f99': -7.892474414343774, '\u9f9a': -9.557108305917183, '\u9f9c': -10.895131537474946, - '\u9f9f': -10.895131537474946} - probEmit['E'] = map[rune]float64{'\u4e00': -6.044987536255073, + '\u9f9f': -10.895131537474946}, + 'E': {'\u4e00': -6.044987536255073, '\u4e01': -9.075800412310807, '\u4e03': -9.198842005220659, '\u4e07': -7.655326112989935, @@ -14298,8 +14296,8 @@ func init() { '\u9f9a': -15.137257331238825, '\u9f9b': -12.729311722586953, '\u9f9c': -10.574067217491615, - '\u9f9f': -10.574067217491615} - probEmit['M'] = map[rune]float64{'\u4e00': -4.428158526435913, + '\u9f9f': -10.574067217491615}, + 'M': {'\u4e00': -4.428158526435913, '\u4e01': -7.932945687598502, '\u4e03': -6.559715525951586, '\u4e07': -6.139922374120667, @@ -20707,8 +20705,8 @@ func init() { '\u9f99': -6.908072798071771, '\u9f9a': -14.14915250738439, '\u9f9c': -11.058110054026073, - '\u9f9f': -11.058110054026073} - probEmit['S'] = map[rune]float64{'\u2236': -15.828865681131282, + '\u9f9f': -11.058110054026073}, + 'S': {'\u2236': -15.828865681131282, '\u4e00': -4.92368982120877, '\u4e01': -9.024528361347633, '\u4e02': -16.522012861691227, @@ -35226,6 +35224,5 @@ func init() { '\u9f9c': -10.409437488834186, '\u9f9f': -10.409437488834186, '\u9fa0': -15.605722129817071, - '\u9fa2': -10.61937952828986} - + '\u9fa2': -10.61937952828986}, } diff --git a/finalseg/prob_trans.go b/finalseg/prob_trans.go index 78dc694..071240b 100755 --- a/finalseg/prob_trans.go +++ b/finalseg/prob_trans.go @@ -1,14 +1,8 @@ package finalseg -var probTrans = make(map[byte]map[byte]float64) - -func init() { - probTrans['B'] = map[byte]float64{'E': -0.510825623765990, - 'M': -0.916290731874155} - probTrans['E'] = map[byte]float64{'B': -0.5897149736854513, - 'S': -0.8085250474669937} - probTrans['M'] = map[byte]float64{'E': -0.33344856811948514, - 'M': -1.2603623820268226} - probTrans['S'] = map[byte]float64{'B': -0.7211965654669841, - 'S': -0.6658631448798212} +var probTrans = map[byte]map[byte]float64{ + 'B': {'E': -0.510825623765990, 'M': -0.916290731874155}, + 'E': {'B': -0.5897149736854513, 'S': -0.8085250474669937}, + 'M': {'E': -0.33344856811948514, 'M': -1.2603623820268226}, + 'S': {'B': -0.7211965654669841, 'S': -0.6658631448798212}, } diff --git a/finalseg/viterbi.go b/finalseg/viterbi.go index 2eef566..27d2c51 100755 --- a/finalseg/viterbi.go +++ b/finalseg/viterbi.go @@ -8,21 +8,20 @@ import ( const minFloat = -3.14e100 var ( - prevStatus = make(map[byte][]byte) - probStart = make(map[byte]float64) + prevStatus = map[byte][2]byte{ + 'B': {'E', 'S'}, + 'M': {'M', 'B'}, + 'S': {'S', 'E'}, + 'E': {'B', 'M'}, + } + probStart = map[byte]float64{ + 'B': -0.26268660809250016, + 'E': -3.14e+100, + 'M': -3.14e+100, + 'S': -1.4652633398537678, + } ) -func init() { - prevStatus['B'] = []byte{'E', 'S'} - prevStatus['M'] = []byte{'M', 'B'} - prevStatus['S'] = []byte{'S', 'E'} - prevStatus['E'] = []byte{'B', 'M'} - probStart['B'] = -0.26268660809250016 - probStart['E'] = -3.14e+100 - probStart['M'] = -3.14e+100 - probStart['S'] = -1.4652633398537678 -} - type probState struct { prob float64 state byte @@ -49,10 +48,10 @@ func (ps probStates) Swap(i, j int) { ps[i], ps[j] = ps[j], ps[i] } -func viterbi(obs []rune, states []byte) (float64, []byte) { - path := make(map[byte][]byte) - V := make([]map[byte]float64, len(obs)) - V[0] = make(map[byte]float64) +func viterbi(obs []rune, states ...byte) (float64, []byte) { + path := [256][]byte{} + newPath := [256][]byte{} + V := make([][256]float64, len(obs)) for _, y := range states { if val, ok := probEmit[y][obs[0]]; ok { V[0][y] = val + probStart[y] @@ -61,12 +60,9 @@ func viterbi(obs []rune, states []byte) (float64, []byte) { } path[y] = []byte{y} } - for t := 1; t < len(obs); t++ { - newPath := make(map[byte][]byte) - V[t] = make(map[byte]float64) for _, y := range states { - ps0 := make(probStates, 0) + ps0 := make(probStates, 0, 2) var emP float64 if val, ok := probEmit[y][obs[t]]; ok { emP = val @@ -91,9 +87,9 @@ func viterbi(obs []rune, states []byte) (float64, []byte) { } path = newPath } - ps := make(probStates, 0) - for _, y := range []byte{'E', 'S'} { - ps = append(ps, &probState{V[len(obs)-1][y], y}) + ps := probStates{ + &probState{V[len(obs)-1]['E'], 'E'}, + &probState{V[len(obs)-1]['S'], 'S'}, } sort.Sort(sort.Reverse(ps)) v := ps[0]