mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 00:32:51 +08:00
315 lines
6.6 KiB
Go
315 lines
6.6 KiB
Go
// Package jiebago is a Golang implementation of jieba (a Python Chinese word segmentation module).
|
|
package jiebago
|
|
|
|
import (
|
|
"errors"
|
|
"github.com/wangbin/jiebago/finalseg"
|
|
"math"
|
|
"regexp"
|
|
"sort"
|
|
)
|
|
|
|
// Package-level sentinel error and the regular expressions used to split
// input text into blocks before segmentation.
var (
	// ErrInitialized is returned by SetDict when the segmenter already holds
	// dictionary data.
	ErrInitialized = errors.New("already initialized")

	// reEng matches a single ASCII letter or digit; cutDAGNoHMM uses it to
	// glue consecutive one-rune alphanumeric segments together.
	reEng = regexp.MustCompile(`[[:alnum:]]`)

	// Accurate mode (Cut): reHanDefault matches a run of Han characters,
	// ASCII alphanumerics and the characters + # & . _ ; reSkipDefault
	// matches a CRLF pair or a single whitespace character.
	reHanDefault  = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
	reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)

	// Full mode (CutAll): reHanCutAll matches a run of Han characters;
	// reSkipCutAll matches any single character that is not an ASCII
	// alphanumeric, '+', '#' or a newline.
	reHanCutAll  = regexp.MustCompile(`(\p{Han}+)`)
	reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`)
)
|
|
|
|
// Segmenter is the read-only view of a frequency dictionary; *Jieba passes
// itself as this view to the DAG and Routes helpers.
type Segmenter interface {
	// Freq returns the frequency stored for word and whether word is present.
	Freq(word string) (float64, bool)
	// Total returns the accumulated frequency of the dictionary.
	Total() float64
	// LogTotal returns the natural logarithm of Total.
	LogTotal() float64
}
|
|
|
|
// Jieba is a frequency dictionary plus the cutting methods built on it.
type Jieba struct {
	// total is the running sum of every frequency passed to Add.
	total float64
	// logTotal caches math.Log(total); Add refreshes it on every call.
	logTotal float64
	// freqMap maps a word to its frequency. Add also stores every prefix
	// of a word here with frequency 0 when the prefix is not yet present.
	freqMap map[string]float64
}
|
|
|
|
func (j Jieba) Freq(key string) (float64, bool) {
|
|
freq, ok := j.freqMap[key]
|
|
return freq, ok
|
|
}
|
|
|
|
func (j Jieba) Total() float64 {
|
|
return j.total
|
|
}
|
|
|
|
func (j Jieba) LogTotal() float64 {
|
|
return j.logTotal
|
|
}
|
|
|
|
func (j *Jieba) AddEntry(entry Entry) {
|
|
j.Add(entry.Word, entry.Freq)
|
|
}
|
|
|
|
func (j *Jieba) Add(word string, freq float64) {
|
|
j.freqMap[word] = freq
|
|
j.total += freq
|
|
runes := []rune(word)
|
|
for i := 0; i < len(runes); i++ {
|
|
frag := string(runes[0 : i+1])
|
|
if _, ok := j.Freq(frag); !ok {
|
|
j.freqMap[frag] = 0.0
|
|
}
|
|
}
|
|
j.logTotal = math.Log(j.total)
|
|
}
|
|
|
|
// Load user specified dictionary file.
|
|
func (j *Jieba) LoadUserDict(dictFileName string) error {
|
|
return LoadDict(j, dictFileName, false)
|
|
}
|
|
|
|
func (j *Jieba) SetDict(dictFileName string) error {
|
|
if len(j.freqMap) > 0 || j.total > 0.0 {
|
|
return ErrInitialized
|
|
}
|
|
return LoadDict(j, dictFileName, false)
|
|
}
|
|
|
|
func New() *Jieba {
|
|
return &Jieba{total: 0.0, freqMap: make(map[string]float64)}
|
|
}
|
|
|
|
// Set the dictionary, could be absolute path of dictionary file, or dictionary
|
|
// name in current directory. This function must be called before cut any
|
|
// sentence.
|
|
func Open(dictFileName string) (*Jieba, error) {
|
|
j := New()
|
|
err := LoadDict(j, dictFileName, false)
|
|
return j, err
|
|
}
|
|
|
|
// cutFunc is the shape shared by the block-level cutters (cutDAG and
// cutDAGNoHMM): it takes a piece of text and returns a channel of words.
type cutFunc func(string) chan string
|
|
|
|
func (j *Jieba) cutDAG(sentence string) chan string {
|
|
result := make(chan string)
|
|
go func() {
|
|
runes := []rune(sentence)
|
|
dag := DAG(j, runes)
|
|
routes := Routes(j, runes, dag)
|
|
x := 0
|
|
var y int
|
|
length := len(runes)
|
|
buf := make([]rune, 0)
|
|
for {
|
|
if x >= length {
|
|
break
|
|
}
|
|
y = routes[x].Index + 1
|
|
l_word := runes[x:y]
|
|
if y-x == 1 {
|
|
buf = append(buf, l_word...)
|
|
} else {
|
|
if len(buf) > 0 {
|
|
if len(buf) == 1 {
|
|
result <- string(buf)
|
|
buf = make([]rune, 0)
|
|
} else {
|
|
bufString := string(buf)
|
|
if v, ok := j.Freq(bufString); !ok || v == 0.0 {
|
|
for x := range finalseg.Cut(bufString) {
|
|
result <- x
|
|
}
|
|
} else {
|
|
for _, elem := range buf {
|
|
result <- string(elem) // TODO: I don't get this?
|
|
}
|
|
}
|
|
buf = make([]rune, 0)
|
|
}
|
|
}
|
|
result <- string(l_word)
|
|
}
|
|
x = y
|
|
}
|
|
|
|
if len(buf) > 0 {
|
|
if len(buf) == 1 {
|
|
result <- string(buf)
|
|
} else {
|
|
bufString := string(buf)
|
|
if v, ok := j.Freq(bufString); !ok || v == 0.0 {
|
|
for t := range finalseg.Cut(bufString) {
|
|
result <- t
|
|
}
|
|
} else {
|
|
for _, elem := range buf {
|
|
result <- string(elem) // TODO: I don't get this?
|
|
}
|
|
}
|
|
}
|
|
}
|
|
close(result)
|
|
}()
|
|
return result
|
|
}
|
|
|
|
func (j *Jieba) cutDAGNoHMM(sentence string) chan string {
|
|
result := make(chan string)
|
|
|
|
go func() {
|
|
runes := []rune(sentence)
|
|
dag := DAG(j, runes)
|
|
routes := Routes(j, runes, dag)
|
|
x := 0
|
|
var y int
|
|
length := len(runes)
|
|
buf := make([]rune, 0)
|
|
for {
|
|
if x >= length {
|
|
break
|
|
}
|
|
y = routes[x].Index + 1
|
|
l_word := runes[x:y]
|
|
if reEng.MatchString(string(l_word)) && len(l_word) == 1 {
|
|
buf = append(buf, l_word...)
|
|
x = y
|
|
} else {
|
|
if len(buf) > 0 {
|
|
result <- string(buf)
|
|
buf = make([]rune, 0)
|
|
}
|
|
result <- string(l_word)
|
|
x = y
|
|
}
|
|
}
|
|
if len(buf) > 0 {
|
|
result <- string(buf)
|
|
buf = make([]rune, 0)
|
|
}
|
|
close(result)
|
|
}()
|
|
return result
|
|
}
|
|
|
|
func (j *Jieba) cutAll(sentence string) chan string {
|
|
result := make(chan string)
|
|
|
|
go func() {
|
|
runes := []rune(sentence)
|
|
dag := DAG(j, runes)
|
|
old_j := -1
|
|
ks := make([]int, 0)
|
|
for k := range dag {
|
|
ks = append(ks, k)
|
|
}
|
|
sort.Ints(ks)
|
|
for k := range ks {
|
|
l := dag[k]
|
|
if len(l) == 1 && k > old_j {
|
|
result <- string(runes[k : l[0]+1])
|
|
old_j = l[0]
|
|
} else {
|
|
for _, j := range l {
|
|
if j > k {
|
|
result <- string(runes[k : j+1])
|
|
old_j = j
|
|
}
|
|
}
|
|
}
|
|
}
|
|
close(result)
|
|
}()
|
|
return result
|
|
}
|
|
|
|
/*
|
|
Cut sentence.
|
|
|
|
isCutAll controls use full cut mode or accurate mode.
|
|
|
|
Full Mode gets all the possible words from the sentence. Fast but not accurate.
|
|
|
|
Accurate Mode attempts to cut the sentence into the most accurate segmentations,
|
|
which is suitable for text analysis.
|
|
|
|
HMM contols whether to use the Hidden Markov Mode.
|
|
*/
|
|
func (j *Jieba) Cut(sentence string, hmm bool) chan string {
|
|
result := make(chan string)
|
|
var cut cutFunc
|
|
if hmm {
|
|
cut = j.cutDAG
|
|
} else {
|
|
cut = j.cutDAGNoHMM
|
|
}
|
|
|
|
go func() {
|
|
for _, block := range RegexpSplit(reHanDefault, sentence, -1) {
|
|
if len(block) == 0 {
|
|
continue
|
|
}
|
|
if reHanDefault.MatchString(block) {
|
|
for x := range cut(block) {
|
|
result <- x
|
|
}
|
|
} else {
|
|
for _, subBlock := range RegexpSplit(reSkipDefault, block, -1) {
|
|
if reSkipDefault.MatchString(subBlock) {
|
|
result <- subBlock
|
|
} else {
|
|
for _, r := range subBlock {
|
|
result <- string(r)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
close(result)
|
|
}()
|
|
return result
|
|
}
|
|
|
|
func (j *Jieba) CutAll(sentence string) chan string {
|
|
result := make(chan string)
|
|
go func() {
|
|
for _, block := range RegexpSplit(reHanCutAll, sentence, -1) {
|
|
if len(block) == 0 {
|
|
continue
|
|
}
|
|
if reHanCutAll.MatchString(block) {
|
|
for x := range j.cutAll(block) {
|
|
result <- x
|
|
}
|
|
} else {
|
|
for _, subBlock := range reSkipCutAll.Split(block, -1) {
|
|
result <- subBlock
|
|
}
|
|
}
|
|
}
|
|
close(result)
|
|
}()
|
|
return result
|
|
}
|
|
|
|
// Cut sentence using Search Engine Mode, based on the Accurate Mode, attempts
|
|
// to cut long words into several short words, which can raise the recall rate.
|
|
// Suitable for search engines.
|
|
func (j *Jieba) CutForSearch(sentence string, hmm bool) chan string {
|
|
result := make(chan string)
|
|
go func() {
|
|
for word := range j.Cut(sentence, hmm) {
|
|
runes := []rune(word)
|
|
for _, increment := range []int{2, 3} {
|
|
if len(runes) > increment {
|
|
var gram2 string
|
|
for i := 0; i < len(runes)-increment+1; i++ {
|
|
gram2 = string(runes[i : i+increment])
|
|
if v, ok := j.Freq(gram2); ok && v > 0.0 {
|
|
result <- gram2
|
|
}
|
|
}
|
|
}
|
|
}
|
|
result <- word
|
|
}
|
|
close(result)
|
|
}()
|
|
return result
|
|
}
|