mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 00:32:51 +08:00
386 lines
8.0 KiB
Go
386 lines
8.0 KiB
Go
// Golang implemention of jieba (Python Chinese word segmentation module).
|
|
package jiebago
|
|
|
|
import (
|
|
"fmt"
|
|
"github.com/wangbin/jiebago/finalseg"
|
|
"math"
|
|
"regexp"
|
|
"sort"
|
|
)
|
|
|
|
const cacheNameFormat = "jieba.%x.cache"
|
|
|
|
var (
|
|
// Word/Tag Map load from user dictionary
|
|
UserWordTagTab = make(map[string]string)
|
|
reEng = regexp.MustCompile(`[[:alnum:]]`)
|
|
reHanCutAll = regexp.MustCompile(`\p{Han}+`)
|
|
reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`)
|
|
reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
|
|
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
|
|
)
|
|
|
|
type route struct {
|
|
Freq float64
|
|
Index int
|
|
}
|
|
|
|
func (r route) String() string {
|
|
return fmt.Sprintf("(%f, %d)", r.Freq, r.Index)
|
|
}
|
|
|
|
type routes []*route
|
|
|
|
func (rs routes) Len() int {
|
|
return len(rs)
|
|
}
|
|
|
|
func (rs routes) Less(i, j int) bool {
|
|
if rs[i].Freq < rs[j].Freq {
|
|
return true
|
|
}
|
|
if rs[i].Freq == rs[j].Freq {
|
|
return rs[i].Index < rs[j].Index
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (rs routes) Swap(i, j int) {
|
|
rs[i], rs[j] = rs[j], rs[i]
|
|
}
|
|
|
|
type Jieba struct {
|
|
Total float64
|
|
Freq map[string]float64
|
|
}
|
|
|
|
func (j *Jieba) AddEntry(entry *Entry) {
|
|
j.Add(entry.Word, entry.Freq)
|
|
}
|
|
|
|
func (j *Jieba) CacheNameFormat() string {
|
|
return cacheNameFormat
|
|
}
|
|
|
|
func (j *Jieba) Add(word string, freq float64) {
|
|
j.Freq[word] = freq
|
|
j.Total += freq
|
|
runes := []rune(word)
|
|
for i := 0; i < len(runes); i++ {
|
|
frag := string(runes[0 : i+1])
|
|
if _, ok := j.Freq[frag]; !ok {
|
|
j.Freq[frag] = 0.0
|
|
}
|
|
}
|
|
}
|
|
|
|
// Load user specified dictionary file.
|
|
func (j *Jieba) LoadUserDict(dictFilePath string) error {
|
|
return LoadDict(j, dictFilePath, false)
|
|
}
|
|
|
|
// Set the dictionary, could be absolute path of dictionary file, or dictionary
|
|
// name in current directory. This function must be called before cut any
|
|
// sentence.
|
|
func NewJieba(dictFileName string) (*Jieba, error) {
|
|
j := &Jieba{Total: 0.0, Freq: make(map[string]float64)}
|
|
err := SetDict(j, dictFileName, false)
|
|
return j, err
|
|
}
|
|
|
|
// Build a directed acyclic graph (DAG) for sentence.
|
|
func (j *Jieba) DAG(sentence string) map[int][]int {
|
|
dag := make(map[int][]int)
|
|
runes := []rune(sentence)
|
|
n := len(runes)
|
|
i := 0
|
|
var frag string
|
|
for k := 0; k < n; k++ {
|
|
tmpList := make([]int, 0)
|
|
i = k
|
|
frag = string(runes[k])
|
|
for {
|
|
if freq, ok := j.Freq[frag]; !ok {
|
|
break
|
|
} else {
|
|
if freq > 0.0 {
|
|
tmpList = append(tmpList, i)
|
|
}
|
|
}
|
|
i += 1
|
|
if i >= n {
|
|
break
|
|
}
|
|
frag = string(runes[k : i+1])
|
|
}
|
|
if len(tmpList) == 0 {
|
|
tmpList = append(tmpList, k)
|
|
}
|
|
dag[k] = tmpList
|
|
}
|
|
return dag
|
|
}
|
|
|
|
func (j *Jieba) Calc(sentence string, dag map[int][]int) map[int]*route {
|
|
runes := []rune(sentence)
|
|
number := len(runes)
|
|
rs := make(map[int]*route)
|
|
rs[number] = &route{Freq: 0.0, Index: 0}
|
|
logTotal := math.Log(j.Total)
|
|
for idx := number - 1; idx >= 0; idx-- {
|
|
candidates := make(routes, 0)
|
|
for _, i := range dag[idx] {
|
|
word := string(runes[idx : i+1])
|
|
var r *route
|
|
if _, ok := j.Freq[word]; ok {
|
|
r = &route{Freq: math.Log(j.Freq[word]) - logTotal + rs[i+1].Freq, Index: i}
|
|
} else {
|
|
r = &route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i}
|
|
}
|
|
candidates = append(candidates, r)
|
|
}
|
|
sort.Sort(sort.Reverse(candidates))
|
|
rs[idx] = candidates[0]
|
|
}
|
|
return rs
|
|
}
|
|
|
|
type cutFunc func(sentence string) chan string
|
|
|
|
func (j *Jieba) cutDAG(sentence string) chan string {
|
|
result := make(chan string)
|
|
go func() {
|
|
dag := j.DAG(sentence)
|
|
routes := j.Calc(sentence, dag)
|
|
x := 0
|
|
var y int
|
|
runes := []rune(sentence)
|
|
length := len(runes)
|
|
buf := make([]rune, 0)
|
|
for {
|
|
if x >= length {
|
|
break
|
|
}
|
|
y = routes[x].Index + 1
|
|
l_word := runes[x:y]
|
|
if y-x == 1 {
|
|
buf = append(buf, l_word...)
|
|
} else {
|
|
if len(buf) > 0 {
|
|
if len(buf) == 1 {
|
|
result <- string(buf)
|
|
buf = make([]rune, 0)
|
|
} else {
|
|
bufString := string(buf)
|
|
if v, ok := j.Freq[bufString]; !ok || v == 0.0 {
|
|
for x := range finalseg.Cut(bufString) {
|
|
result <- x
|
|
}
|
|
} else {
|
|
for _, elem := range buf {
|
|
result <- string(elem) // TODO: I don't get this?
|
|
}
|
|
}
|
|
buf = make([]rune, 0)
|
|
}
|
|
}
|
|
result <- string(l_word)
|
|
}
|
|
x = y
|
|
}
|
|
|
|
if len(buf) > 0 {
|
|
if len(buf) == 1 {
|
|
result <- string(buf)
|
|
} else {
|
|
bufString := string(buf)
|
|
if v, ok := j.Freq[bufString]; !ok || v == 0.0 {
|
|
for t := range finalseg.Cut(bufString) {
|
|
result <- t
|
|
}
|
|
} else {
|
|
for _, elem := range buf {
|
|
result <- string(elem) // TODO: I don't get this?
|
|
}
|
|
}
|
|
}
|
|
}
|
|
close(result)
|
|
}()
|
|
return result
|
|
}
|
|
|
|
func (j *Jieba) cutDAGNoHMM(sentence string) chan string {
|
|
result := make(chan string)
|
|
|
|
go func() {
|
|
dag := j.DAG(sentence)
|
|
routes := j.Calc(sentence, dag)
|
|
x := 0
|
|
var y int
|
|
runes := []rune(sentence)
|
|
length := len(runes)
|
|
buf := make([]rune, 0)
|
|
for {
|
|
if x >= length {
|
|
break
|
|
}
|
|
y = routes[x].Index + 1
|
|
l_word := runes[x:y]
|
|
if reEng.MatchString(string(l_word)) && len(l_word) == 1 {
|
|
buf = append(buf, l_word...)
|
|
x = y
|
|
} else {
|
|
if len(buf) > 0 {
|
|
result <- string(buf)
|
|
buf = make([]rune, 0)
|
|
}
|
|
result <- string(l_word)
|
|
x = y
|
|
}
|
|
}
|
|
if len(buf) > 0 {
|
|
result <- string(buf)
|
|
buf = make([]rune, 0)
|
|
}
|
|
close(result)
|
|
}()
|
|
return result
|
|
}
|
|
|
|
func (j *Jieba) cutAll(sentence string) chan string {
|
|
result := make(chan string)
|
|
|
|
go func() {
|
|
runes := []rune(sentence)
|
|
dag := j.DAG(sentence)
|
|
old_j := -1
|
|
ks := make([]int, 0)
|
|
for k := range dag {
|
|
ks = append(ks, k)
|
|
}
|
|
sort.Ints(ks)
|
|
for k := range ks {
|
|
l := dag[k]
|
|
if len(l) == 1 && k > old_j {
|
|
result <- string(runes[k : l[0]+1])
|
|
old_j = l[0]
|
|
} else {
|
|
for _, j := range l {
|
|
if j > k {
|
|
result <- string(runes[k : j+1])
|
|
old_j = j
|
|
}
|
|
}
|
|
}
|
|
}
|
|
close(result)
|
|
}()
|
|
return result
|
|
}
|
|
|
|
/*
|
|
Cut sentence.
|
|
|
|
isCutAll controls use full cut mode or accurate mode.
|
|
|
|
Full Mode gets all the possible words from the sentence. Fast but not accurate.
|
|
|
|
Accurate Mode attempts to cut the sentence into the most accurate segmentations,
|
|
which is suitable for text analysis.
|
|
|
|
HMM contols whether to use the Hidden Markov Mode.
|
|
*/
|
|
func (j *Jieba) Cut(sentence string, isCutAll bool, HMM bool) chan string {
|
|
result := make(chan string)
|
|
go func() {
|
|
var reHan, reSkip *regexp.Regexp
|
|
if isCutAll {
|
|
reHan = reHanCutAll
|
|
reSkip = reSkipCutAll
|
|
} else {
|
|
reHan = reHanDefault
|
|
reSkip = reSkipDefault
|
|
}
|
|
var cut cutFunc
|
|
if HMM {
|
|
cut = j.cutDAG
|
|
} else {
|
|
cut = j.cutDAGNoHMM
|
|
}
|
|
if isCutAll {
|
|
cut = j.cutAll
|
|
}
|
|
for blk := range RegexpSplit(reHan, sentence) {
|
|
if len(blk) == 0 {
|
|
continue
|
|
}
|
|
if reHan.MatchString(blk) {
|
|
for x := range cut(blk) {
|
|
result <- x
|
|
}
|
|
} else {
|
|
type skipSplitFunc func(sentence string) chan string
|
|
var ssf skipSplitFunc
|
|
if isCutAll {
|
|
ssf = func(sentence string) chan string {
|
|
ch := make(chan string)
|
|
go func() {
|
|
for _, s := range reSkip.Split(sentence, -1) {
|
|
ch <- s
|
|
}
|
|
close(ch)
|
|
}()
|
|
return ch
|
|
}
|
|
} else {
|
|
ssf = func(sentence string) chan string {
|
|
return RegexpSplit(reSkip, sentence)
|
|
}
|
|
}
|
|
|
|
for x := range ssf(blk) {
|
|
if reSkip.MatchString(x) {
|
|
result <- x
|
|
} else if !isCutAll {
|
|
for _, xx := range x {
|
|
result <- string(xx)
|
|
}
|
|
} else {
|
|
result <- x
|
|
}
|
|
}
|
|
}
|
|
}
|
|
close(result)
|
|
}()
|
|
return result
|
|
}
|
|
|
|
// Cut sentence using Search Engine Mode, based on the Accurate Mode, attempts
|
|
// to cut long words into several short words, which can raise the recall rate.
|
|
// Suitable for search engines.
|
|
func (j *Jieba) CutForSearch(sentence string, hmm bool) chan string {
|
|
result := make(chan string)
|
|
go func() {
|
|
for word := range j.Cut(sentence, false, hmm) {
|
|
runes := []rune(word)
|
|
for _, increment := range []int{2, 3} {
|
|
if len(runes) > increment {
|
|
var gram2 string
|
|
for i := 0; i < len(runes)-increment+1; i++ {
|
|
gram2 = string(runes[i : i+increment])
|
|
if v, ok := j.Freq[gram2]; ok && v > 0.0 {
|
|
result <- gram2
|
|
}
|
|
}
|
|
}
|
|
}
|
|
result <- word
|
|
}
|
|
close(result)
|
|
}()
|
|
return result
|
|
}
|