mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 00:32:51 +08:00
优化 finalseg
This commit is contained in:
@@ -11,10 +11,9 @@ var (
|
||||
)
|
||||
|
||||
func cutHan(sentence string) []string {
|
||||
result := make([]string, 0, 10)
|
||||
|
||||
runes := []rune(sentence)
|
||||
_, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
|
||||
result := make([]string, 0, len(runes))
|
||||
_, posList := viterbi(runes, 'B', 'M', 'E', 'S')
|
||||
begin, next := 0, 0
|
||||
for i, char := range runes {
|
||||
pos := posList[i]
|
||||
@@ -37,29 +36,23 @@ func cutHan(sentence string) []string {
|
||||
}
|
||||
|
||||
// Cut cuts sentence into words using Hidden Markov Model with Viterbi
|
||||
// algorithm. It is used by jieba for unknonw words.
|
||||
func Cut(sentence string) []string {
|
||||
result := make([]string, 0, 10)
|
||||
s := sentence
|
||||
var hans string
|
||||
var hanLoc []int
|
||||
var nonhanLoc []int
|
||||
|
||||
// algorithm. It is used by jieba for unknown words.
|
||||
func Cut(s string) []string {
|
||||
result := make([]string, 0, len(s))
|
||||
lop:
|
||||
for {
|
||||
hanLoc = reHan.FindStringIndex(s)
|
||||
hanLoc := reHan.FindStringIndex(s)
|
||||
if hanLoc == nil {
|
||||
if len(s) == 0 {
|
||||
break
|
||||
}
|
||||
} else if hanLoc[0] == 0 {
|
||||
hans = s[hanLoc[0]:hanLoc[1]]
|
||||
hans := s[hanLoc[0]:hanLoc[1]]
|
||||
s = s[hanLoc[1]:]
|
||||
for _, han := range cutHan(hans) {
|
||||
result = append(result, han)
|
||||
}
|
||||
result = append(result, cutHan(hans)...)
|
||||
continue
|
||||
}
|
||||
nonhanLoc = reSkip.FindStringIndex(s)
|
||||
nonhanLoc := reSkip.FindStringIndex(s)
|
||||
if nonhanLoc == nil {
|
||||
if len(s) == 0 {
|
||||
break
|
||||
@@ -73,18 +66,19 @@ func Cut(sentence string) []string {
|
||||
}
|
||||
}
|
||||
var loc []int
|
||||
if hanLoc == nil && nonhanLoc == nil {
|
||||
switch {
|
||||
case hanLoc == nil && nonhanLoc == nil:
|
||||
if len(s) > 0 {
|
||||
result = append(result, s)
|
||||
break
|
||||
break lop
|
||||
}
|
||||
} else if hanLoc == nil {
|
||||
case hanLoc == nil:
|
||||
loc = nonhanLoc
|
||||
} else if nonhanLoc == nil {
|
||||
case nonhanLoc == nil:
|
||||
loc = hanLoc
|
||||
} else if hanLoc[0] < nonhanLoc[0] {
|
||||
case hanLoc[0] < nonhanLoc[0]:
|
||||
loc = hanLoc
|
||||
} else {
|
||||
default:
|
||||
loc = nonhanLoc
|
||||
}
|
||||
result = append(result, s[:loc[0]])
|
||||
|
||||
@@ -7,8 +7,7 @@ import (
|
||||
|
||||
func TestViterbi(t *testing.T) {
|
||||
obs := "我们是程序员"
|
||||
states := []byte{'B', 'M', 'E', 'S'}
|
||||
prob, path := viterbi([]rune(obs), states)
|
||||
prob, path := viterbi([]rune(obs), 'B', 'M', 'E', 'S')
|
||||
if math.Abs(prob+39.68824128493802) > 1e-10 {
|
||||
t.Fatal(prob)
|
||||
}
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
package finalseg
|
||||
|
||||
var probEmit = make(map[byte]map[rune]float64)
|
||||
|
||||
func init() {
|
||||
probEmit['B'] = map[rune]float64{'\u4e00': -3.6544978750449433,
|
||||
var probEmit = map[byte]map[rune]float64{
|
||||
'B': {'\u4e00': -3.6544978750449433,
|
||||
'\u4e01': -8.125041941842026,
|
||||
'\u4e03': -7.817392401429855,
|
||||
'\u4e07': -6.3096425804013165,
|
||||
@@ -6859,8 +6857,8 @@ func init() {
|
||||
'\u9f99': -7.892474414343774,
|
||||
'\u9f9a': -9.557108305917183,
|
||||
'\u9f9c': -10.895131537474946,
|
||||
'\u9f9f': -10.895131537474946}
|
||||
probEmit['E'] = map[rune]float64{'\u4e00': -6.044987536255073,
|
||||
'\u9f9f': -10.895131537474946},
|
||||
'E': {'\u4e00': -6.044987536255073,
|
||||
'\u4e01': -9.075800412310807,
|
||||
'\u4e03': -9.198842005220659,
|
||||
'\u4e07': -7.655326112989935,
|
||||
@@ -14298,8 +14296,8 @@ func init() {
|
||||
'\u9f9a': -15.137257331238825,
|
||||
'\u9f9b': -12.729311722586953,
|
||||
'\u9f9c': -10.574067217491615,
|
||||
'\u9f9f': -10.574067217491615}
|
||||
probEmit['M'] = map[rune]float64{'\u4e00': -4.428158526435913,
|
||||
'\u9f9f': -10.574067217491615},
|
||||
'M': {'\u4e00': -4.428158526435913,
|
||||
'\u4e01': -7.932945687598502,
|
||||
'\u4e03': -6.559715525951586,
|
||||
'\u4e07': -6.139922374120667,
|
||||
@@ -20707,8 +20705,8 @@ func init() {
|
||||
'\u9f99': -6.908072798071771,
|
||||
'\u9f9a': -14.14915250738439,
|
||||
'\u9f9c': -11.058110054026073,
|
||||
'\u9f9f': -11.058110054026073}
|
||||
probEmit['S'] = map[rune]float64{'\u2236': -15.828865681131282,
|
||||
'\u9f9f': -11.058110054026073},
|
||||
'S': {'\u2236': -15.828865681131282,
|
||||
'\u4e00': -4.92368982120877,
|
||||
'\u4e01': -9.024528361347633,
|
||||
'\u4e02': -16.522012861691227,
|
||||
@@ -35226,6 +35224,5 @@ func init() {
|
||||
'\u9f9c': -10.409437488834186,
|
||||
'\u9f9f': -10.409437488834186,
|
||||
'\u9fa0': -15.605722129817071,
|
||||
'\u9fa2': -10.61937952828986}
|
||||
|
||||
'\u9fa2': -10.61937952828986},
|
||||
}
|
||||
|
||||
@@ -1,14 +1,8 @@
|
||||
package finalseg
|
||||
|
||||
var probTrans = make(map[byte]map[byte]float64)
|
||||
|
||||
func init() {
|
||||
probTrans['B'] = map[byte]float64{'E': -0.510825623765990,
|
||||
'M': -0.916290731874155}
|
||||
probTrans['E'] = map[byte]float64{'B': -0.5897149736854513,
|
||||
'S': -0.8085250474669937}
|
||||
probTrans['M'] = map[byte]float64{'E': -0.33344856811948514,
|
||||
'M': -1.2603623820268226}
|
||||
probTrans['S'] = map[byte]float64{'B': -0.7211965654669841,
|
||||
'S': -0.6658631448798212}
|
||||
var probTrans = map[byte]map[byte]float64{
|
||||
'B': {'E': -0.510825623765990, 'M': -0.916290731874155},
|
||||
'E': {'B': -0.5897149736854513, 'S': -0.8085250474669937},
|
||||
'M': {'E': -0.33344856811948514, 'M': -1.2603623820268226},
|
||||
'S': {'B': -0.7211965654669841, 'S': -0.6658631448798212},
|
||||
}
|
||||
|
||||
@@ -8,20 +8,19 @@ import (
|
||||
const minFloat = -3.14e100
|
||||
|
||||
var (
|
||||
prevStatus = make(map[byte][]byte)
|
||||
probStart = make(map[byte]float64)
|
||||
)
|
||||
|
||||
func init() {
|
||||
prevStatus['B'] = []byte{'E', 'S'}
|
||||
prevStatus['M'] = []byte{'M', 'B'}
|
||||
prevStatus['S'] = []byte{'S', 'E'}
|
||||
prevStatus['E'] = []byte{'B', 'M'}
|
||||
probStart['B'] = -0.26268660809250016
|
||||
probStart['E'] = -3.14e+100
|
||||
probStart['M'] = -3.14e+100
|
||||
probStart['S'] = -1.4652633398537678
|
||||
prevStatus = map[byte][2]byte{
|
||||
'B': {'E', 'S'},
|
||||
'M': {'M', 'B'},
|
||||
'S': {'S', 'E'},
|
||||
'E': {'B', 'M'},
|
||||
}
|
||||
probStart = map[byte]float64{
|
||||
'B': -0.26268660809250016,
|
||||
'E': -3.14e+100,
|
||||
'M': -3.14e+100,
|
||||
'S': -1.4652633398537678,
|
||||
}
|
||||
)
|
||||
|
||||
type probState struct {
|
||||
prob float64
|
||||
@@ -49,10 +48,10 @@ func (ps probStates) Swap(i, j int) {
|
||||
ps[i], ps[j] = ps[j], ps[i]
|
||||
}
|
||||
|
||||
func viterbi(obs []rune, states []byte) (float64, []byte) {
|
||||
path := make(map[byte][]byte)
|
||||
V := make([]map[byte]float64, len(obs))
|
||||
V[0] = make(map[byte]float64)
|
||||
func viterbi(obs []rune, states ...byte) (float64, []byte) {
|
||||
path := [256][]byte{}
|
||||
newPath := [256][]byte{}
|
||||
V := make([][256]float64, len(obs))
|
||||
for _, y := range states {
|
||||
if val, ok := probEmit[y][obs[0]]; ok {
|
||||
V[0][y] = val + probStart[y]
|
||||
@@ -61,12 +60,9 @@ func viterbi(obs []rune, states []byte) (float64, []byte) {
|
||||
}
|
||||
path[y] = []byte{y}
|
||||
}
|
||||
|
||||
for t := 1; t < len(obs); t++ {
|
||||
newPath := make(map[byte][]byte)
|
||||
V[t] = make(map[byte]float64)
|
||||
for _, y := range states {
|
||||
ps0 := make(probStates, 0)
|
||||
ps0 := make(probStates, 0, 2)
|
||||
var emP float64
|
||||
if val, ok := probEmit[y][obs[t]]; ok {
|
||||
emP = val
|
||||
@@ -91,9 +87,9 @@ func viterbi(obs []rune, states []byte) (float64, []byte) {
|
||||
}
|
||||
path = newPath
|
||||
}
|
||||
ps := make(probStates, 0)
|
||||
for _, y := range []byte{'E', 'S'} {
|
||||
ps = append(ps, &probState{V[len(obs)-1][y], y})
|
||||
ps := probStates{
|
||||
&probState{V[len(obs)-1]['E'], 'E'},
|
||||
&probState{V[len(obs)-1]['S'], 'S'},
|
||||
}
|
||||
sort.Sort(sort.Reverse(ps))
|
||||
v := ps[0]
|
||||
|
||||
Reference in New Issue
Block a user