1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00
This commit is contained in:
源文雨
2022-11-30 12:18:15 +08:00
parent ab8b95ef87
commit 8bbc755ed4
48 changed files with 984 additions and 859 deletions

138
finalseg/finalseg.go Normal file → Executable file
View File

@@ -10,88 +10,86 @@ var (
reSkip = regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`)
)
func cutHan(sentence string) chan string {
result := make(chan string)
go func() {
runes := []rune(sentence)
_, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
begin, next := 0, 0
for i, char := range runes {
pos := posList[i]
switch pos {
case 'B':
begin = i
case 'E':
result <- string(runes[begin : i+1])
next = i + 1
case 'S':
result <- string(char)
next = i + 1
}
func cutHan(sentence string) []string {
result := make([]string, 0, 10)
runes := []rune(sentence)
_, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
begin, next := 0, 0
for i, char := range runes {
pos := posList[i]
switch pos {
case 'B':
begin = i
case 'E':
result = append(result, string(runes[begin:i+1]))
next = i + 1
case 'S':
result = append(result, string(char))
next = i + 1
}
if next < len(runes) {
result <- string(runes[next:])
}
close(result)
}()
}
if next < len(runes) {
result = append(result, string(runes[next:]))
}
return result
}
// Cut cuts sentence into words using Hidden Markov Model with Viterbi
// algorithm. It is used by Jiebago for unknown words.
func Cut(sentence string) chan string {
result := make(chan string)
func Cut(sentence string) []string {
result := make([]string, 0, 10)
s := sentence
var hans string
var hanLoc []int
var nonhanLoc []int
go func() {
for {
hanLoc = reHan.FindStringIndex(s)
if hanLoc == nil {
if len(s) == 0 {
break
}
} else if hanLoc[0] == 0 {
hans = s[hanLoc[0]:hanLoc[1]]
s = s[hanLoc[1]:]
for han := range cutHan(hans) {
result <- han
}
for {
hanLoc = reHan.FindStringIndex(s)
if hanLoc == nil {
if len(s) == 0 {
break
}
} else if hanLoc[0] == 0 {
hans = s[hanLoc[0]:hanLoc[1]]
s = s[hanLoc[1]:]
for _, han := range cutHan(hans) {
result = append(result, han)
}
continue
}
nonhanLoc = reSkip.FindStringIndex(s)
if nonhanLoc == nil {
if len(s) == 0 {
break
}
} else if nonhanLoc[0] == 0 {
nonhans := s[nonhanLoc[0]:nonhanLoc[1]]
s = s[nonhanLoc[1]:]
if nonhans != "" {
result = append(result, nonhans)
continue
}
nonhanLoc = reSkip.FindStringIndex(s)
if nonhanLoc == nil {
if len(s) == 0 {
break
}
} else if nonhanLoc[0] == 0 {
nonhans := s[nonhanLoc[0]:nonhanLoc[1]]
s = s[nonhanLoc[1]:]
if nonhans != "" {
result <- nonhans
continue
}
}
var loc []int
if hanLoc == nil && nonhanLoc == nil {
if len(s) > 0 {
result <- s
break
}
} else if hanLoc == nil {
loc = nonhanLoc
} else if nonhanLoc == nil {
loc = hanLoc
} else if hanLoc[0] < nonhanLoc[0] {
loc = hanLoc
} else {
loc = nonhanLoc
}
result <- s[:loc[0]]
s = s[loc[0]:]
}
close(result)
}()
var loc []int
if hanLoc == nil && nonhanLoc == nil {
if len(s) > 0 {
result = append(result, s)
break
}
} else if hanLoc == nil {
loc = nonhanLoc
} else if nonhanLoc == nil {
loc = hanLoc
} else if hanLoc[0] < nonhanLoc[0] {
loc = hanLoc
} else {
loc = nonhanLoc
}
result = append(result, s[:loc[0]])
s = s[loc[0]:]
}
return result
}

16
finalseg/finalseg_test.go Normal file → Executable file
View File

@@ -5,14 +5,6 @@ import (
"testing"
)
// chanToArray receives from ch until it is closed and returns every
// received word in arrival order. The result is nil when the channel is
// closed without delivering any value.
func chanToArray(ch chan string) []string {
	var words []string
	for {
		word, open := <-ch
		if !open {
			// Channel closed and drained: nothing more to collect.
			return words
		}
		words = append(words, word)
	}
}
func TestViterbi(t *testing.T) {
obs := "我们是程序员"
states := []byte{'B', 'M', 'E', 'S'}
@@ -29,7 +21,7 @@ func TestViterbi(t *testing.T) {
func TestCutHan(t *testing.T) {
obs := "我们是程序员"
result := chanToArray(cutHan(obs))
result := cutHan(obs)
if len(result) != 3 {
t.Fatal(result)
}
@@ -46,7 +38,7 @@ func TestCutHan(t *testing.T) {
func TestCut(t *testing.T) {
sentence := "我们是程序员"
result := chanToArray(Cut(sentence))
result := Cut(sentence)
if len(result) != 3 {
t.Fatal(len(result))
}
@@ -59,11 +51,11 @@ func TestCut(t *testing.T) {
if result[2] != "程序员" {
t.Fatal(result[2])
}
result2 := chanToArray(Cut("I'm a programmer!"))
result2 := Cut("I'm a programmer!")
if len(result2) != 8 {
t.Fatal(result2)
}
result3 := chanToArray(Cut("程序员average年龄28.6岁。"))
result3 := Cut("程序员average年龄28.6岁。")
if len(result3) != 6 {
t.Fatal(result3)
}

0
finalseg/prob_emit.go Normal file → Executable file
View File

0
finalseg/prob_trans.go Normal file → Executable file
View File

0
finalseg/viterbi.go Normal file → Executable file
View File