mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 00:32:51 +08:00
优化
This commit is contained in:
138
finalseg/finalseg.go
Normal file → Executable file
138
finalseg/finalseg.go
Normal file → Executable file
@@ -10,88 +10,86 @@ var (
|
||||
reSkip = regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`)
|
||||
)
|
||||
|
||||
func cutHan(sentence string) chan string {
|
||||
result := make(chan string)
|
||||
go func() {
|
||||
runes := []rune(sentence)
|
||||
_, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
|
||||
begin, next := 0, 0
|
||||
for i, char := range runes {
|
||||
pos := posList[i]
|
||||
switch pos {
|
||||
case 'B':
|
||||
begin = i
|
||||
case 'E':
|
||||
result <- string(runes[begin : i+1])
|
||||
next = i + 1
|
||||
case 'S':
|
||||
result <- string(char)
|
||||
next = i + 1
|
||||
}
|
||||
func cutHan(sentence string) []string {
|
||||
result := make([]string, 0, 10)
|
||||
|
||||
runes := []rune(sentence)
|
||||
_, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
|
||||
begin, next := 0, 0
|
||||
for i, char := range runes {
|
||||
pos := posList[i]
|
||||
switch pos {
|
||||
case 'B':
|
||||
begin = i
|
||||
case 'E':
|
||||
result = append(result, string(runes[begin:i+1]))
|
||||
next = i + 1
|
||||
case 'S':
|
||||
result = append(result, string(char))
|
||||
next = i + 1
|
||||
}
|
||||
if next < len(runes) {
|
||||
result <- string(runes[next:])
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
}
|
||||
if next < len(runes) {
|
||||
result = append(result, string(runes[next:]))
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// Cut cuts sentence into words using Hidden Markov Model with Viterbi
|
||||
// algorithm. It is used by Jiebago for unknown words.
|
||||
func Cut(sentence string) chan string {
|
||||
result := make(chan string)
|
||||
func Cut(sentence string) []string {
|
||||
result := make([]string, 0, 10)
|
||||
s := sentence
|
||||
var hans string
|
||||
var hanLoc []int
|
||||
var nonhanLoc []int
|
||||
go func() {
|
||||
for {
|
||||
hanLoc = reHan.FindStringIndex(s)
|
||||
if hanLoc == nil {
|
||||
if len(s) == 0 {
|
||||
break
|
||||
}
|
||||
} else if hanLoc[0] == 0 {
|
||||
hans = s[hanLoc[0]:hanLoc[1]]
|
||||
s = s[hanLoc[1]:]
|
||||
for han := range cutHan(hans) {
|
||||
result <- han
|
||||
}
|
||||
|
||||
for {
|
||||
hanLoc = reHan.FindStringIndex(s)
|
||||
if hanLoc == nil {
|
||||
if len(s) == 0 {
|
||||
break
|
||||
}
|
||||
} else if hanLoc[0] == 0 {
|
||||
hans = s[hanLoc[0]:hanLoc[1]]
|
||||
s = s[hanLoc[1]:]
|
||||
for _, han := range cutHan(hans) {
|
||||
result = append(result, han)
|
||||
}
|
||||
continue
|
||||
}
|
||||
nonhanLoc = reSkip.FindStringIndex(s)
|
||||
if nonhanLoc == nil {
|
||||
if len(s) == 0 {
|
||||
break
|
||||
}
|
||||
} else if nonhanLoc[0] == 0 {
|
||||
nonhans := s[nonhanLoc[0]:nonhanLoc[1]]
|
||||
s = s[nonhanLoc[1]:]
|
||||
if nonhans != "" {
|
||||
result = append(result, nonhans)
|
||||
continue
|
||||
}
|
||||
nonhanLoc = reSkip.FindStringIndex(s)
|
||||
if nonhanLoc == nil {
|
||||
if len(s) == 0 {
|
||||
break
|
||||
}
|
||||
} else if nonhanLoc[0] == 0 {
|
||||
nonhans := s[nonhanLoc[0]:nonhanLoc[1]]
|
||||
s = s[nonhanLoc[1]:]
|
||||
if nonhans != "" {
|
||||
result <- nonhans
|
||||
continue
|
||||
}
|
||||
}
|
||||
var loc []int
|
||||
if hanLoc == nil && nonhanLoc == nil {
|
||||
if len(s) > 0 {
|
||||
result <- s
|
||||
break
|
||||
}
|
||||
} else if hanLoc == nil {
|
||||
loc = nonhanLoc
|
||||
} else if nonhanLoc == nil {
|
||||
loc = hanLoc
|
||||
} else if hanLoc[0] < nonhanLoc[0] {
|
||||
loc = hanLoc
|
||||
} else {
|
||||
loc = nonhanLoc
|
||||
}
|
||||
result <- s[:loc[0]]
|
||||
s = s[loc[0]:]
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
var loc []int
|
||||
if hanLoc == nil && nonhanLoc == nil {
|
||||
if len(s) > 0 {
|
||||
result = append(result, s)
|
||||
break
|
||||
}
|
||||
} else if hanLoc == nil {
|
||||
loc = nonhanLoc
|
||||
} else if nonhanLoc == nil {
|
||||
loc = hanLoc
|
||||
} else if hanLoc[0] < nonhanLoc[0] {
|
||||
loc = hanLoc
|
||||
} else {
|
||||
loc = nonhanLoc
|
||||
}
|
||||
result = append(result, s[:loc[0]])
|
||||
s = s[loc[0]:]
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
16
finalseg/finalseg_test.go
Normal file → Executable file
16
finalseg/finalseg_test.go
Normal file → Executable file
@@ -5,14 +5,6 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func chanToArray(ch chan string) []string {
|
||||
var result []string
|
||||
for word := range ch {
|
||||
result = append(result, word)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func TestViterbi(t *testing.T) {
|
||||
obs := "我们是程序员"
|
||||
states := []byte{'B', 'M', 'E', 'S'}
|
||||
@@ -29,7 +21,7 @@ func TestViterbi(t *testing.T) {
|
||||
|
||||
func TestCutHan(t *testing.T) {
|
||||
obs := "我们是程序员"
|
||||
result := chanToArray(cutHan(obs))
|
||||
result := cutHan(obs)
|
||||
if len(result) != 3 {
|
||||
t.Fatal(result)
|
||||
}
|
||||
@@ -46,7 +38,7 @@ func TestCutHan(t *testing.T) {
|
||||
|
||||
func TestCut(t *testing.T) {
|
||||
sentence := "我们是程序员"
|
||||
result := chanToArray(Cut(sentence))
|
||||
result := Cut(sentence)
|
||||
if len(result) != 3 {
|
||||
t.Fatal(len(result))
|
||||
}
|
||||
@@ -59,11 +51,11 @@ func TestCut(t *testing.T) {
|
||||
if result[2] != "程序员" {
|
||||
t.Fatal(result[2])
|
||||
}
|
||||
result2 := chanToArray(Cut("I'm a programmer!"))
|
||||
result2 := Cut("I'm a programmer!")
|
||||
if len(result2) != 8 {
|
||||
t.Fatal(result2)
|
||||
}
|
||||
result3 := chanToArray(Cut("程序员average年龄28.6岁。"))
|
||||
result3 := Cut("程序员average年龄28.6岁。")
|
||||
if len(result3) != 6 {
|
||||
t.Fatal(result3)
|
||||
}
|
||||
|
||||
0
finalseg/prob_emit.go
Normal file → Executable file
0
finalseg/prob_emit.go
Normal file → Executable file
0
finalseg/prob_trans.go
Normal file → Executable file
0
finalseg/prob_trans.go
Normal file → Executable file
0
finalseg/viterbi.go
Normal file → Executable file
0
finalseg/viterbi.go
Normal file → Executable file
Reference in New Issue
Block a user