1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00
Files
jieba/finalseg/finalseg.go
2022-11-30 14:47:50 +08:00

90 lines
1.8 KiB
Go
Executable File

// Package finalseg is the Golang implementation of Jieba's finalseg module.
package finalseg
import (
"regexp"
)
var (
// reHan matches a run of one or more consecutive characters of the Unicode
// Han script.
reHan = regexp.MustCompile(`\p{Han}+`)
// reSkip matches a decimal number written as digits, a dot and more digits
// or, when that form does not apply, a run of ASCII letters and digits.
reSkip = regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`)
)
// cutHan splits sentence into words according to the per-rune tags returned
// by viterbi for the states 'B', 'M', 'E' and 'S'.
//
// A 'B' tag records where the current word starts, an 'E' tag emits the runes
// from that recorded start through the current one, and an 'S' tag emits the
// current rune as a word of its own. Any other tag leaves the state untouched.
// Runes remaining after the last emitted word are returned as one final word.
func cutHan(sentence string) []string {
	chars := []rune(sentence)
	words := make([]string, 0, len(chars))
	_, tags := viterbi(chars, 'B', 'M', 'E', 'S')

	wordStart, consumed := 0, 0
	for idx := range chars {
		tag := tags[idx]
		if tag == 'B' {
			wordStart = idx
			continue
		}
		if tag == 'E' {
			words = append(words, string(chars[wordStart:idx+1]))
		} else if tag == 'S' {
			words = append(words, string(chars[idx]))
		} else {
			continue
		}
		// Only 'E' and 'S' reach this point: everything up to and including
		// the current rune has now been emitted.
		consumed = idx + 1
	}

	// The tag sequence may end without closing the last word; keep its runes.
	if consumed < len(chars) {
		words = append(words, string(chars[consumed:]))
	}
	return words
}
// Cut cuts sentence into words using Hidden Markov Model with Viterbi
// algorithm. It is used by jieba for unknown words.
//
// s is partitioned from left to right: every run matched by reHan is
// segmented by cutHan, and the text between those runs is divided by
// appendNonHan into reSkip matches and the literal text separating them.
// Tokens are returned in input order and no empty token is produced. An empty
// s yields an empty, non-nil slice.
func Cut(s string) []string {
	// Capacity hint only: len(s) bytes cannot hold more than len(s)
	// non-empty tokens.
	result := make([]string, 0, len(s))

	// A single pass locates every Han run, so the input is not rescanned
	// once per emitted token.
	prev := 0
	for _, han := range reHan.FindAllStringIndex(s, -1) {
		result = appendNonHan(result, s[prev:han[0]])
		result = append(result, cutHan(s[han[0]:han[1]])...)
		prev = han[1]
	}
	// Whatever follows the last Han run (or all of s when there is none).
	return appendNonHan(result, s[prev:])
}

// appendNonHan appends the tokens of block to dst and returns the extended
// slice. Each reSkip match becomes one token, and each non-empty stretch of
// text before, between or after the matches becomes one token as well. An
// empty block appends nothing.
func appendNonHan(dst []string, block string) []string {
	prev := 0
	for _, skip := range reSkip.FindAllStringIndex(block, -1) {
		if skip[0] > prev {
			dst = append(dst, block[prev:skip[0]])
		}
		dst = append(dst, block[skip[0]:skip[1]])
		prev = skip[1]
	}
	if prev < len(block) {
		dst = append(dst, block[prev:])
	}
	return dst
}