mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-27 07:30:32 +08:00
initial commit
This commit is contained in:
85
finalseg/finalseg.go
Normal file
85
finalseg/finalseg.go
Normal file
@@ -0,0 +1,85 @@
|
||||
package finalseg
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
)
|
||||
|
||||
func cutHan(sentence string) []string {
|
||||
runes := []rune(sentence)
|
||||
result := make([]string, 0)
|
||||
_, pos_list := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
|
||||
begin, next := 0, 0
|
||||
for i, char := range runes {
|
||||
pos := pos_list[i]
|
||||
switch pos {
|
||||
case 'B':
|
||||
begin = i
|
||||
case 'E':
|
||||
result = append(result, string(runes[begin:i+1]))
|
||||
next = i + 1
|
||||
case 'S':
|
||||
result = append(result, string(char))
|
||||
next = i + 1
|
||||
}
|
||||
}
|
||||
if next < len(runes) {
|
||||
result = append(result, string(runes[next:]))
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func Cut(sentence string) []string {
|
||||
result := make([]string, 0)
|
||||
re_han := regexp.MustCompile(`\p{Han}+`)
|
||||
re_skip := regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`)
|
||||
s := sentence
|
||||
var hans string
|
||||
var hanLoc []int
|
||||
var nonhanLoc []int
|
||||
for {
|
||||
hanLoc = re_han.FindStringIndex(s)
|
||||
if hanLoc == nil {
|
||||
if len(s) == 0 {
|
||||
break
|
||||
}
|
||||
} else if hanLoc[0] == 0 {
|
||||
hans = s[hanLoc[0]:hanLoc[1]]
|
||||
s = s[hanLoc[1]:]
|
||||
for _, han := range cutHan(hans) {
|
||||
result = append(result, han)
|
||||
}
|
||||
continue
|
||||
}
|
||||
nonhanLoc = re_skip.FindStringIndex(s)
|
||||
if nonhanLoc == nil {
|
||||
if len(s) == 0 {
|
||||
break
|
||||
}
|
||||
} else if nonhanLoc[0] == 0 {
|
||||
nonhans := s[nonhanLoc[0]:nonhanLoc[1]]
|
||||
s = s[nonhanLoc[1]:]
|
||||
if nonhans != "" {
|
||||
result = append(result, nonhans)
|
||||
continue
|
||||
}
|
||||
}
|
||||
var loc []int
|
||||
if hanLoc == nil && nonhanLoc == nil {
|
||||
if len(s) > 0 {
|
||||
result = append(result, s)
|
||||
break
|
||||
}
|
||||
} else if hanLoc == nil {
|
||||
loc = nonhanLoc
|
||||
} else if nonhanLoc == nil {
|
||||
loc = hanLoc
|
||||
} else if hanLoc[0] < nonhanLoc[0] {
|
||||
loc = hanLoc
|
||||
} else {
|
||||
loc = nonhanLoc
|
||||
}
|
||||
result = append(result, s[:loc[0]])
|
||||
s = s[loc[0]:]
|
||||
}
|
||||
return result
|
||||
}
|
||||
63
finalseg/finalseg_test.go
Normal file
63
finalseg/finalseg_test.go
Normal file
@@ -0,0 +1,63 @@
|
||||
package finalseg
|
||||
|
||||
import (
|
||||
"math"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestViterbi(t *testing.T) {
|
||||
obs := "我们是程序员"
|
||||
states := []byte{'B', 'M', 'E', 'S'}
|
||||
prob, path := viterbi([]rune(obs), states)
|
||||
if math.Abs(prob+39.68824128493802) > 1e-10 {
|
||||
t.Error(prob)
|
||||
}
|
||||
for index, state := range []byte{'B', 'E', 'S', 'B', 'M', 'E'} {
|
||||
if path[index] != state {
|
||||
t.Error(path)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCutHan(t *testing.T) {
|
||||
obs := "我们是程序员"
|
||||
result := cutHan(obs)
|
||||
if len(result) != 3 {
|
||||
t.Error(result)
|
||||
}
|
||||
if result[0] != "我们" {
|
||||
t.Error(result[0])
|
||||
}
|
||||
if result[1] != "是" {
|
||||
t.Error(result[1])
|
||||
}
|
||||
if result[2] != "程序员" {
|
||||
t.Error(result[2])
|
||||
}
|
||||
}
|
||||
|
||||
func TestCut(t *testing.T) {
|
||||
sentence := "我们是程序员"
|
||||
result := Cut(sentence)
|
||||
if len(result) != 3 {
|
||||
t.Error(len(result))
|
||||
}
|
||||
if result[0] != "我们" {
|
||||
t.Error(result[0])
|
||||
}
|
||||
if result[1] != "是" {
|
||||
t.Error(result[1])
|
||||
}
|
||||
if result[2] != "程序员" {
|
||||
t.Error(result[2])
|
||||
}
|
||||
result2 := Cut("I'm a programmer!")
|
||||
if len(result2) != 8 {
|
||||
t.Error(result2)
|
||||
}
|
||||
result3 := Cut("程序员average年龄28.6岁。")
|
||||
if len(result3) != 6 {
|
||||
t.Error(result3)
|
||||
}
|
||||
|
||||
}
|
||||
35231
finalseg/prob_emit.go
Normal file
35231
finalseg/prob_emit.go
Normal file
File diff suppressed because it is too large
Load Diff
10
finalseg/prob_start.go
Normal file
10
finalseg/prob_start.go
Normal file
@@ -0,0 +1,10 @@
|
||||
package finalseg
|
||||
|
||||
// ProbStart maps each state ('B', 'M', 'E', 'S') to the score that viterbi
// adds to the emission score of the first observation. 'E' and 'M' carry
// -3.14e+100, the same magnitude the package uses for missing table entries,
// so they are effectively ruled out as the first state.
//
// The table is a composite literal so that it is fully populated during
// package-variable initialization rather than later, in an init function.
var ProbStart = map[byte]float64{
	'B': -0.26268660809250016,
	'E': -3.14e+100,
	'M': -3.14e+100,
	'S': -1.4652633398537678,
}
|
||||
14
finalseg/prob_trans.go
Normal file
14
finalseg/prob_trans.go
Normal file
@@ -0,0 +1,14 @@
|
||||
package finalseg
|
||||
|
||||
// ProbTrans maps a previous state to the scores of the states that may
// follow it: ProbTrans[from][to]. viterbi adds the looked-up score when
// extending a path from state "from" to state "to"; pairs that are absent
// here are scored by viterbi with its missing-entry value instead.
//
// The table is a composite literal so that it is fully populated during
// package-variable initialization rather than later, in an init function.
var ProbTrans = map[byte]map[byte]float64{
	'B': {
		'E': -0.510825623765990,
		'M': -0.916290731874155,
	},
	'E': {
		'B': -0.5897149736854513,
		'S': -0.8085250474669937,
	},
	'M': {
		'E': -0.33344856811948514,
		'M': -1.2603623820268226,
	},
	'S': {
		'B': -0.7211965654669841,
		'S': -0.6658631448798212,
	},
}
|
||||
94
finalseg/viterbi.go
Normal file
94
finalseg/viterbi.go
Normal file
@@ -0,0 +1,94 @@
|
||||
package finalseg
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
)
|
||||
|
||||
// MIN_FLOAT is the score viterbi substitutes for an emission or transition
// that has no entry in the probability tables.
const MIN_FLOAT = -3.14e100
|
||||
|
||||
// PrevStatus lists, for each state, the states viterbi considers as its
// immediate predecessor when extending a path.
//
// The table is a composite literal so that it is fully populated during
// package-variable initialization rather than later, in an init function.
var PrevStatus = map[byte][]byte{
	'B': {'E', 'S'},
	'M': {'M', 'B'},
	'S': {'S', 'E'},
	'E': {'B', 'M'},
}
|
||||
|
||||
// Viterbi is one candidate considered by viterbi: a score together with the
// state it belongs to.
type Viterbi struct {
	prob  float64
	state byte
}

// String formats v as "(prob, state)", printing prob with %f and state as
// its character (for example 'B') rather than as a number.
func (v Viterbi) String() string {
	// %c, not %s: state is a byte, and %s on a byte prints "%!s(uint8=...)".
	return fmt.Sprintf("(%f, %c)", v.prob, v.state)
}
|
||||
|
||||
type Viterbis []*Viterbi
|
||||
|
||||
func (vs Viterbis) Len() int {
|
||||
return len(vs)
|
||||
}
|
||||
|
||||
func (vs Viterbis) Less(i, j int) bool {
|
||||
if vs[i].prob == vs[j].prob {
|
||||
return vs[i].state < vs[j].state
|
||||
}
|
||||
return vs[i].prob < vs[j].prob
|
||||
}
|
||||
|
||||
func (vs Viterbis) Swap(i, j int) {
|
||||
vs[i], vs[j] = vs[j], vs[i]
|
||||
}
|
||||
|
||||
func viterbi(obs []rune, states []byte) (float64, []byte) {
|
||||
path := make(map[byte][]byte)
|
||||
V := make([]map[byte]float64, len(obs))
|
||||
V[0] = make(map[byte]float64)
|
||||
for _, y := range states {
|
||||
if val, ok := ProbEmit[y][obs[0]]; ok {
|
||||
V[0][y] = val + ProbStart[y]
|
||||
} else {
|
||||
V[0][y] = MIN_FLOAT + ProbStart[y]
|
||||
}
|
||||
path[y] = []byte{y}
|
||||
}
|
||||
|
||||
for t := 1; t < len(obs); t++ {
|
||||
newPath := make(map[byte][]byte)
|
||||
V[t] = make(map[byte]float64)
|
||||
for _, y := range states {
|
||||
vs0 := make(Viterbis, 0)
|
||||
var em_p float64
|
||||
if val, ok := ProbEmit[y][obs[t]]; ok {
|
||||
em_p = val
|
||||
} else {
|
||||
em_p = MIN_FLOAT
|
||||
}
|
||||
for _, y0 := range PrevStatus[y] {
|
||||
var transP float64
|
||||
if tp, ok := ProbTrans[y0][y]; ok {
|
||||
transP = tp
|
||||
} else {
|
||||
transP = MIN_FLOAT
|
||||
}
|
||||
prob0 := V[t-1][y0] + transP + em_p
|
||||
vs0 = append(vs0, &Viterbi{prob: prob0, state: y0})
|
||||
}
|
||||
sort.Sort(sort.Reverse(vs0))
|
||||
V[t][y] = vs0[0].prob
|
||||
pp := make([]byte, len(path[vs0[0].state]))
|
||||
copy(pp, path[vs0[0].state])
|
||||
newPath[y] = append(pp, y)
|
||||
}
|
||||
path = newPath
|
||||
}
|
||||
vs := make(Viterbis, 0)
|
||||
for _, y := range []byte{'E', 'S'} {
|
||||
vs = append(vs, &Viterbi{V[len(obs)-1][y], y})
|
||||
}
|
||||
sort.Sort(sort.Reverse(vs))
|
||||
v := vs[0]
|
||||
return v.prob, path[v.state]
|
||||
}
|
||||
Reference in New Issue
Block a user