1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-27 07:30:32 +08:00

initial commit

This commit is contained in:
Wang Bin
2013-10-31 18:20:04 +08:00
commit 8c785ad36a
24 changed files with 831685 additions and 0 deletions

85
finalseg/finalseg.go Normal file
View File

@@ -0,0 +1,85 @@
package finalseg
import (
"regexp"
)
// cutHan splits sentence into words using the per-rune state tags returned by
// viterbi for the states B, M, E and S.
//
// A 'B' tag records where the current word starts, an 'E' tag emits the runes
// from that start up to and including the current rune, and an 'S' tag emits
// the current rune as a word of its own. Any other tag (such as 'M') emits
// nothing. Runes left over after the last emitted word are returned as one
// trailing word. An empty sentence yields an empty, non-nil slice.
func cutHan(sentence string) []string {
	runes := []rune(sentence)
	result := make([]string, 0, len(runes))
	if len(runes) == 0 {
		// Nothing to tag, so skip the viterbi call entirely.
		return result
	}

	_, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
	begin, next := 0, 0
	for i, char := range runes {
		switch posList[i] {
		case 'B':
			begin = i
		case 'E':
			result = append(result, string(runes[begin:i+1]))
			next = i + 1
		case 'S':
			result = append(result, string(char))
			next = i + 1
		}
	}
	// The tag sequence ended inside a word (no closing 'E'): keep the tail.
	if next < len(runes) {
		result = append(result, string(runes[next:]))
	}
	return result
}
// Patterns used by Cut, compiled once at package initialization instead of on
// every call.
var (
	// reHan matches a run of one or more Han-script characters.
	reHan = regexp.MustCompile(`\p{Han}+`)
	// reSkip matches a decimal number such as "28.6" or a run of ASCII
	// letters and digits.
	reSkip = regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`)
)

// Cut splits sentence into tokens and returns them in order.
//
// The input is consumed from left to right:
//   - a run of Han characters at the front is segmented into words by cutHan;
//   - a run matching reSkip at the front is emitted unchanged as one token;
//   - otherwise everything up to the next Han or reSkip match (or the end of
//     the input) is emitted as a single token.
//
// An empty sentence yields an empty, non-nil slice.
func Cut(sentence string) []string {
	result := make([]string, 0)
	s := sentence
	for len(s) > 0 {
		hanLoc := reHan.FindStringIndex(s)
		if hanLoc != nil && hanLoc[0] == 0 {
			result = append(result, cutHan(s[:hanLoc[1]])...)
			s = s[hanLoc[1]:]
			continue
		}

		skipLoc := reSkip.FindStringIndex(s)
		if skipLoc != nil && skipLoc[0] == 0 {
			result = append(result, s[:skipLoc[1]])
			s = s[skipLoc[1]:]
			continue
		}

		// Neither pattern matches at the front: emit the gap that precedes
		// the nearest match, or the whole remainder when nothing matches.
		end := len(s)
		if hanLoc != nil {
			end = hanLoc[0]
		}
		if skipLoc != nil && skipLoc[0] < end {
			end = skipLoc[0]
		}
		result = append(result, s[:end])
		s = s[end:]
	}
	return result
}

63
finalseg/finalseg_test.go Normal file
View File

@@ -0,0 +1,63 @@
package finalseg
import (
"math"
"testing"
)
// TestViterbi checks the score and the state path that viterbi returns for a
// known six-character sentence.
func TestViterbi(t *testing.T) {
	obs := "我们是程序员"
	states := []byte{'B', 'M', 'E', 'S'}
	want := []byte{'B', 'E', 'S', 'B', 'M', 'E'}

	prob, path := viterbi([]rune(obs), states)
	if math.Abs(prob+39.68824128493802) > 1e-10 {
		t.Error(prob)
	}
	if len(path) != len(want) {
		// Stop here: indexing a shorter path below would panic.
		t.Fatalf("path = %q, want %q", path, want)
	}
	for index, state := range want {
		if path[index] != state {
			t.Errorf("path[%d] = %q, want %q (full path %q)", index, path[index], state, path)
		}
	}
}
// TestCutHan checks that cutHan splits a known Han sentence into the expected
// three words.
func TestCutHan(t *testing.T) {
	obs := "我们是程序员"
	want := []string{"我们", "是", "程序员"}

	result := cutHan(obs)
	if len(result) != len(want) {
		// Stop here: indexing a shorter result below would panic.
		t.Fatalf("cutHan(%q) = %q, want %q", obs, result, want)
	}
	for i, word := range want {
		if result[i] != word {
			t.Errorf("word %d = %q, want %q", i, result[i], word)
		}
	}
}
// TestCut checks Cut on a pure-Han sentence (exact words) and on two sentences
// containing non-Han text (token counts only).
func TestCut(t *testing.T) {
	sentence := "我们是程序员"
	want := []string{"我们", "是", "程序员"}

	result := Cut(sentence)
	if len(result) != len(want) {
		// Stop here: indexing a shorter result below would panic.
		t.Fatalf("Cut(%q) = %q, want %q", sentence, result, want)
	}
	for i, word := range want {
		if result[i] != word {
			t.Errorf("word %d = %q, want %q", i, result[i], word)
		}
	}

	result2 := Cut("I'm a programmer!")
	if len(result2) != 8 {
		t.Error(result2)
	}

	result3 := Cut("程序员average年龄28.6岁。")
	if len(result3) != 6 {
		t.Error(result3)
	}
}

35231
finalseg/prob_emit.go Normal file

File diff suppressed because it is too large Load Diff

10
finalseg/prob_start.go Normal file
View File

@@ -0,0 +1,10 @@
package finalseg
// ProbStart holds the start score of each state tag ('B', 'E', 'M', 'S').
// viterbi adds this score to the emission score of the first observation.
var ProbStart = map[byte]float64{
	'B': -0.26268660809250016,
	'E': -3.14e+100,
	'M': -3.14e+100,
	'S': -1.4652633398537678,
}

14
finalseg/prob_trans.go Normal file
View File

@@ -0,0 +1,14 @@
package finalseg
// ProbTrans holds the transition scores between state tags, indexed as
// ProbTrans[from][to]. Transitions that are not listed have no entry.
var ProbTrans = map[byte]map[byte]float64{
	'B': {
		'E': -0.510825623765990,
		'M': -0.916290731874155,
	},
	'E': {
		'B': -0.5897149736854513,
		'S': -0.8085250474669937,
	},
	'M': {
		'E': -0.33344856811948514,
		'M': -1.2603623820268226,
	},
	'S': {
		'B': -0.7211965654669841,
		'S': -0.6658631448798212,
	},
}

94
finalseg/viterbi.go Normal file
View File

@@ -0,0 +1,94 @@
package finalseg
import (
"fmt"
"sort"
)
// MIN_FLOAT is the score viterbi substitutes when an emission or transition
// lookup has no entry.
const MIN_FLOAT = -3.14e100

// PrevStatus lists, for each state tag, the tags that viterbi considers as
// its immediate predecessor.
var PrevStatus = map[byte][]byte{
	'B': {'E', 'S'},
	'M': {'M', 'B'},
	'S': {'S', 'E'},
	'E': {'B', 'M'},
}
// Viterbi pairs a score with the state tag it belongs to.
type Viterbi struct {
	prob  float64
	state byte
}

// String formats v as "(prob, state)", printing the state tag as a character.
func (v Viterbi) String() string {
	// %c, not %s: state is a byte, which %s would render as "%!s(uint8=...)".
	return fmt.Sprintf("(%f, %c)", v.prob, v.state)
}
// Viterbis is a list of candidates that can be ordered with package sort:
// ascending by prob, with equal probs ordered by ascending state byte.
type Viterbis []*Viterbi

// Len returns the number of candidates.
func (vs Viterbis) Len() int { return len(vs) }

// Less reports whether candidate i orders before candidate j.
func (vs Viterbis) Less(i, j int) bool {
	a, b := vs[i], vs[j]
	if a.prob != b.prob {
		return a.prob < b.prob
	}
	// Equal scores: fall back to the state byte so the order is total.
	return a.state < b.state
}

// Swap exchanges candidates i and j.
func (vs Viterbis) Swap(i, j int) {
	tmp := vs[i]
	vs[i] = vs[j]
	vs[j] = tmp
}
// viterbi tags every rune of obs with one of the given states and returns the
// score of the best tag sequence together with that sequence (one byte per
// rune).
//
// Scores are combined by addition: the start score from ProbStart, the
// emission score from ProbEmit and the transition score from ProbTrans.
// A missing emission or transition entry counts as MIN_FLOAT. For each state
// only the predecessors listed in PrevStatus are considered, and only 'E' and
// 'S' are considered as the final state. Candidates with equal scores are
// ordered by Viterbis.Less.
//
// For empty obs there is nothing to tag; viterbi returns 0 and a nil path.
func viterbi(obs []rune, states []byte) (float64, []byte) {
	if len(obs) == 0 {
		return 0, nil
	}

	// path[y] is the best tag sequence found so far that ends in state y.
	path := make(map[byte][]byte, len(states))
	// V[t][y] is the score of that sequence after t+1 observations.
	V := make([]map[byte]float64, len(obs))

	V[0] = make(map[byte]float64, len(states))
	for _, y := range states {
		emitP, ok := ProbEmit[y][obs[0]]
		if !ok {
			emitP = MIN_FLOAT
		}
		V[0][y] = emitP + ProbStart[y]
		path[y] = []byte{y}
	}

	for t := 1; t < len(obs); t++ {
		newPath := make(map[byte][]byte, len(states))
		V[t] = make(map[byte]float64, len(states))
		for _, y := range states {
			emitP, ok := ProbEmit[y][obs[t]]
			if !ok {
				emitP = MIN_FLOAT
			}

			prevStates := PrevStatus[y]
			candidates := make(Viterbis, 0, len(prevStates))
			for _, y0 := range prevStates {
				transP, ok := ProbTrans[y0][y]
				if !ok {
					transP = MIN_FLOAT
				}
				candidates = append(candidates, &Viterbi{
					prob:  V[t-1][y0] + transP + emitP,
					state: y0,
				})
			}
			// Descending order puts the best predecessor first.
			sort.Sort(sort.Reverse(candidates))
			best := candidates[0]
			V[t][y] = best.prob

			// Copy before extending: the predecessor's path may also be
			// extended for another state in this same step.
			prevPath := path[best.state]
			extended := make([]byte, len(prevPath), len(prevPath)+1)
			copy(extended, prevPath)
			newPath[y] = append(extended, y)
		}
		path = newPath
	}

	last := V[len(obs)-1]
	finals := make(Viterbis, 0, 2)
	for _, y := range []byte{'E', 'S'} {
		finals = append(finals, &Viterbi{prob: last[y], state: y})
	}
	sort.Sort(sort.Reverse(finals))
	best := finals[0]
	return best.prob, path[best.state]
}