1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00

优化 finalseg

This commit is contained in:
源文雨
2022-11-30 14:47:50 +08:00
parent 6b239b5918
commit b2508252d5
5 changed files with 52 additions and 72 deletions

View File

@@ -11,10 +11,9 @@ var (
)
func cutHan(sentence string) []string {
result := make([]string, 0, 10)
runes := []rune(sentence)
_, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
result := make([]string, 0, len(runes))
_, posList := viterbi(runes, 'B', 'M', 'E', 'S')
begin, next := 0, 0
for i, char := range runes {
pos := posList[i]
@@ -37,29 +36,23 @@ func cutHan(sentence string) []string {
}
// Cut cuts sentence into words using Hidden Markov Model with Viterbi
// algorithm. It is used by jieba for unknonw words.
func Cut(sentence string) []string {
result := make([]string, 0, 10)
s := sentence
var hans string
var hanLoc []int
var nonhanLoc []int
// algorithm. It is used by jieba for unknown words.
func Cut(s string) []string {
result := make([]string, 0, len(s))
lop:
for {
hanLoc = reHan.FindStringIndex(s)
hanLoc := reHan.FindStringIndex(s)
if hanLoc == nil {
if len(s) == 0 {
break
}
} else if hanLoc[0] == 0 {
hans = s[hanLoc[0]:hanLoc[1]]
hans := s[hanLoc[0]:hanLoc[1]]
s = s[hanLoc[1]:]
for _, han := range cutHan(hans) {
result = append(result, han)
}
result = append(result, cutHan(hans)...)
continue
}
nonhanLoc = reSkip.FindStringIndex(s)
nonhanLoc := reSkip.FindStringIndex(s)
if nonhanLoc == nil {
if len(s) == 0 {
break
@@ -73,18 +66,19 @@ func Cut(sentence string) []string {
}
}
var loc []int
if hanLoc == nil && nonhanLoc == nil {
switch {
case hanLoc == nil && nonhanLoc == nil:
if len(s) > 0 {
result = append(result, s)
break
break lop
}
} else if hanLoc == nil {
case hanLoc == nil:
loc = nonhanLoc
} else if nonhanLoc == nil {
case nonhanLoc == nil:
loc = hanLoc
} else if hanLoc[0] < nonhanLoc[0] {
case hanLoc[0] < nonhanLoc[0]:
loc = hanLoc
} else {
default:
loc = nonhanLoc
}
result = append(result, s[:loc[0]])

View File

@@ -7,8 +7,7 @@ import (
func TestViterbi(t *testing.T) {
obs := "我们是程序员"
states := []byte{'B', 'M', 'E', 'S'}
prob, path := viterbi([]rune(obs), states)
prob, path := viterbi([]rune(obs), 'B', 'M', 'E', 'S')
if math.Abs(prob+39.68824128493802) > 1e-10 {
t.Fatal(prob)
}

View File

@@ -1,9 +1,7 @@
package finalseg
var probEmit = make(map[byte]map[rune]float64)
func init() {
probEmit['B'] = map[rune]float64{'\u4e00': -3.6544978750449433,
var probEmit = map[byte]map[rune]float64{
'B': {'\u4e00': -3.6544978750449433,
'\u4e01': -8.125041941842026,
'\u4e03': -7.817392401429855,
'\u4e07': -6.3096425804013165,
@@ -6859,8 +6857,8 @@ func init() {
'\u9f99': -7.892474414343774,
'\u9f9a': -9.557108305917183,
'\u9f9c': -10.895131537474946,
'\u9f9f': -10.895131537474946}
probEmit['E'] = map[rune]float64{'\u4e00': -6.044987536255073,
'\u9f9f': -10.895131537474946},
'E': {'\u4e00': -6.044987536255073,
'\u4e01': -9.075800412310807,
'\u4e03': -9.198842005220659,
'\u4e07': -7.655326112989935,
@@ -14298,8 +14296,8 @@ func init() {
'\u9f9a': -15.137257331238825,
'\u9f9b': -12.729311722586953,
'\u9f9c': -10.574067217491615,
'\u9f9f': -10.574067217491615}
probEmit['M'] = map[rune]float64{'\u4e00': -4.428158526435913,
'\u9f9f': -10.574067217491615},
'M': {'\u4e00': -4.428158526435913,
'\u4e01': -7.932945687598502,
'\u4e03': -6.559715525951586,
'\u4e07': -6.139922374120667,
@@ -20707,8 +20705,8 @@ func init() {
'\u9f99': -6.908072798071771,
'\u9f9a': -14.14915250738439,
'\u9f9c': -11.058110054026073,
'\u9f9f': -11.058110054026073}
probEmit['S'] = map[rune]float64{'\u2236': -15.828865681131282,
'\u9f9f': -11.058110054026073},
'S': {'\u2236': -15.828865681131282,
'\u4e00': -4.92368982120877,
'\u4e01': -9.024528361347633,
'\u4e02': -16.522012861691227,
@@ -35226,6 +35224,5 @@ func init() {
'\u9f9c': -10.409437488834186,
'\u9f9f': -10.409437488834186,
'\u9fa0': -15.605722129817071,
'\u9fa2': -10.61937952828986}
'\u9fa2': -10.61937952828986},
}

View File

@@ -1,14 +1,8 @@
package finalseg
var probTrans = make(map[byte]map[byte]float64)
func init() {
probTrans['B'] = map[byte]float64{'E': -0.510825623765990,
'M': -0.916290731874155}
probTrans['E'] = map[byte]float64{'B': -0.5897149736854513,
'S': -0.8085250474669937}
probTrans['M'] = map[byte]float64{'E': -0.33344856811948514,
'M': -1.2603623820268226}
probTrans['S'] = map[byte]float64{'B': -0.7211965654669841,
'S': -0.6658631448798212}
// probTrans is a nested lookup table keyed by two state bytes drawn from
// 'B', 'M', 'E' and 'S'. Every stored value is a negative float64, and
// exponentiating the two values in each row sums to approximately 1, so the
// entries look like natural-log probabilities.
// NOTE(review): presumably the outer key is the source state and the inner
// key the destination state of a transition; the code that reads this table
// is not visible here, so confirm against its caller.
// State pairs that are absent from a row have no entry at all; a plain map
// index on them yields 0, so readers must use the two-value form to detect
// a missing pair.
var probTrans = map[byte]map[byte]float64{
'B': {'E': -0.510825623765990, 'M': -0.916290731874155},
'E': {'B': -0.5897149736854513, 'S': -0.8085250474669937},
'M': {'E': -0.33344856811948514, 'M': -1.2603623820268226},
'S': {'B': -0.7211965654669841, 'S': -0.6658631448798212},
}

View File

@@ -8,20 +8,19 @@ import (
const minFloat = -3.14e100
var (
prevStatus = make(map[byte][]byte)
probStart = make(map[byte]float64)
)
func init() {
prevStatus['B'] = []byte{'E', 'S'}
prevStatus['M'] = []byte{'M', 'B'}
prevStatus['S'] = []byte{'S', 'E'}
prevStatus['E'] = []byte{'B', 'M'}
probStart['B'] = -0.26268660809250016
probStart['E'] = -3.14e+100
probStart['M'] = -3.14e+100
probStart['S'] = -1.4652633398537678
// prevStatus maps each state byte to a fixed-size array of exactly two
// state bytes.
// NOTE(review): presumably these are the states allowed to precede the key
// state; the loop that consumes this table is not visible here, so confirm.
prevStatus = map[byte][2]byte{
'B': {'E', 'S'},
'M': {'M', 'B'},
'S': {'S', 'E'},
'E': {'B', 'M'},
}
// probStart holds the per-state value that viterbi adds to the emission
// value of the first observation. 'E' and 'M' carry -3.14e+100, the same
// value as the minFloat constant, while 'B' and 'S' carry ordinary negative
// values.
probStart = map[byte]float64{
'B': -0.26268660809250016,
'E': -3.14e+100,
'M': -3.14e+100,
'S': -1.4652633398537678,
}
)
type probState struct {
prob float64
@@ -49,10 +48,10 @@ func (ps probStates) Swap(i, j int) {
ps[i], ps[j] = ps[j], ps[i]
}
func viterbi(obs []rune, states []byte) (float64, []byte) {
path := make(map[byte][]byte)
V := make([]map[byte]float64, len(obs))
V[0] = make(map[byte]float64)
func viterbi(obs []rune, states ...byte) (float64, []byte) {
path := [256][]byte{}
newPath := [256][]byte{}
V := make([][256]float64, len(obs))
for _, y := range states {
if val, ok := probEmit[y][obs[0]]; ok {
V[0][y] = val + probStart[y]
@@ -61,12 +60,9 @@ func viterbi(obs []rune, states []byte) (float64, []byte) {
}
path[y] = []byte{y}
}
for t := 1; t < len(obs); t++ {
newPath := make(map[byte][]byte)
V[t] = make(map[byte]float64)
for _, y := range states {
ps0 := make(probStates, 0)
ps0 := make(probStates, 0, 2)
var emP float64
if val, ok := probEmit[y][obs[t]]; ok {
emP = val
@@ -91,9 +87,9 @@ func viterbi(obs []rune, states []byte) (float64, []byte) {
}
path = newPath
}
ps := make(probStates, 0)
for _, y := range []byte{'E', 'S'} {
ps = append(ps, &probState{V[len(obs)-1][y], y})
ps := probStates{
&probState{V[len(obs)-1]['E'], 'E'},
&probState{V[len(obs)-1]['S'], 'S'},
}
sort.Sort(sort.Reverse(ps))
v := ps[0]