mirror of
https://github.com/fumiama/jieba.git
synced 2026-07-01 17:40:29 +08:00
优化 Segmenter
This commit is contained in:
380
posseg/posseg.go
380
posseg/posseg.go
@@ -35,107 +35,102 @@ func (s Segment) Pos() string {
|
||||
}
|
||||
|
||||
// Segmenter is a Chinese words segmentation struct.
|
||||
type Segmenter struct {
|
||||
dict *Dictionary
|
||||
}
|
||||
type Segmenter Dictionary
|
||||
|
||||
// LoadDictionary loads a dictionary from the given file.
|
||||
// Every time LoadDictionary is called, the previously loaded dictionary will be cleared.
|
||||
func (seg *Segmenter) LoadDictionary(file fs.File) error {
|
||||
seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
||||
return seg.dict.loadDictionary(file)
|
||||
func LoadDictionary(file fs.File) (*Segmenter, error) {
|
||||
dict := &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
||||
err := dict.loadDictionary(file)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return (*Segmenter)(dict), nil
|
||||
}
|
||||
|
||||
// LoadDictionaryAt loads a dictionary from the given file name.
|
||||
// Every time LoadDictionaryAt is called, the previously loaded dictionary will be cleared.
|
||||
func (seg *Segmenter) LoadDictionaryAt(fileName string) error {
|
||||
seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
||||
return seg.dict.loadDictionaryAt(fileName)
|
||||
func LoadDictionaryAt(file string) (*Segmenter, error) {
|
||||
dict := &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
||||
err := dict.loadDictionaryAt(file)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return (*Segmenter)(dict), nil
|
||||
}
|
||||
|
||||
// LoadUserDictionary loads a user specified dictionary, it must be called
|
||||
// after LoadDictionary, and it will not clear any previously loaded dictionary,
|
||||
// instead it will override existing entries.
|
||||
func (seg *Segmenter) LoadUserDictionary(file fs.File) error {
|
||||
return seg.dict.loadDictionary(file)
|
||||
return (*Dictionary)(seg).loadDictionary(file)
|
||||
}
|
||||
|
||||
// LoadUserDictionaryAt loads a user specified dictionary, it must be called
|
||||
// after LoadDictionary, and it will not clear any previously loaded dictionary,
|
||||
// instead it will override existing entries.
|
||||
func (seg *Segmenter) LoadUserDictionaryAt(fileName string) error {
|
||||
return seg.dict.loadDictionaryAt(fileName)
|
||||
return (*Dictionary)(seg).loadDictionaryAt(fileName)
|
||||
}
|
||||
|
||||
func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment {
|
||||
result := make(chan Segment)
|
||||
|
||||
go func() {
|
||||
runes := []rune(sentence)
|
||||
posList := viterbi(runes)
|
||||
begin := 0
|
||||
next := 0
|
||||
for i, char := range runes {
|
||||
pos := posList[i]
|
||||
switch pos.position() {
|
||||
case "B":
|
||||
begin = i
|
||||
case "E":
|
||||
result <- Segment{string(runes[begin : i+1]), pos.pos()}
|
||||
next = i + 1
|
||||
case "S":
|
||||
result <- Segment{string(char), pos.pos()}
|
||||
next = i + 1
|
||||
}
|
||||
func (seg *Segmenter) cutDetailInternal(sentence string) (results []Segment) {
|
||||
runes := []rune(sentence)
|
||||
posList := viterbi(runes)
|
||||
begin := 0
|
||||
next := 0
|
||||
for i, char := range runes {
|
||||
pos := posList[i]
|
||||
switch pos.position() {
|
||||
case "B":
|
||||
begin = i
|
||||
case "E":
|
||||
results = append(results, Segment{string(runes[begin : i+1]), pos.pos()})
|
||||
next = i + 1
|
||||
case "S":
|
||||
results = append(results, Segment{string(char), pos.pos()})
|
||||
next = i + 1
|
||||
}
|
||||
if next < len(runes) {
|
||||
result <- Segment{string(runes[next:]), posList[next].pos()}
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
return result
|
||||
}
|
||||
if next < len(runes) {
|
||||
results = append(results, Segment{string(runes[next:]), posList[next].pos()})
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (seg *Segmenter) cutDetail(sentence string) <-chan Segment {
|
||||
result := make(chan Segment)
|
||||
go func() {
|
||||
for _, blk := range util.RegexpSplit(reHanDetail, sentence, -1) {
|
||||
if reHanDetail.MatchString(blk) {
|
||||
for segment := range seg.cutDetailInternal(blk) {
|
||||
result <- segment
|
||||
}
|
||||
func (seg *Segmenter) cutDetail(sentence string) (results []Segment) {
|
||||
for _, blk := range util.RegexpSplit(reHanDetail, sentence, -1) {
|
||||
if reHanDetail.MatchString(blk) {
|
||||
results = append(results, seg.cutDetailInternal(blk)...)
|
||||
continue
|
||||
}
|
||||
for _, x := range util.RegexpSplit(reSkipDetail, blk, -1) {
|
||||
if len(x) == 0 {
|
||||
continue
|
||||
}
|
||||
for _, x := range util.RegexpSplit(reSkipDetail, blk, -1) {
|
||||
if len(x) == 0 {
|
||||
continue
|
||||
}
|
||||
switch {
|
||||
case reNum.MatchString(x):
|
||||
result <- Segment{x, "m"}
|
||||
case reEng.MatchString(x):
|
||||
result <- Segment{x, "eng"}
|
||||
default:
|
||||
result <- Segment{x, "x"}
|
||||
}
|
||||
switch {
|
||||
case reNum.MatchString(x):
|
||||
results = append(results, Segment{x, "m"})
|
||||
case reEng.MatchString(x):
|
||||
results = append(results, Segment{x, "eng"})
|
||||
default:
|
||||
results = append(results, Segment{x, "x"})
|
||||
}
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
return result
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (seg *Segmenter) dag(runes []rune) map[int][]int {
|
||||
dag := make(map[int][]int)
|
||||
func (seg *Segmenter) dag(runes []rune) [][]int {
|
||||
n := len(runes)
|
||||
dag := make([][]int, n)
|
||||
var frag []rune
|
||||
var i int
|
||||
for k := 0; k < n; k++ {
|
||||
dag[k] = make([]int, 0)
|
||||
dag[k] = make([]int, 0, 64)
|
||||
i = k
|
||||
frag = runes[k : k+1]
|
||||
for {
|
||||
freq, ok := seg.dict.Frequency(string(frag))
|
||||
freq, ok := (*Dictionary)(seg).Frequency(string(frag))
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
@@ -160,20 +155,20 @@ type route struct {
|
||||
index int
|
||||
}
|
||||
|
||||
func (seg *Segmenter) calc(runes []rune) map[int]route {
|
||||
func (seg *Segmenter) calc(runes []rune) []*route {
|
||||
dag := seg.dag(runes)
|
||||
n := len(runes)
|
||||
rs := make(map[int]route)
|
||||
rs[n] = route{frequency: 0.0, index: 0}
|
||||
var r route
|
||||
rs := make([]*route, n+1)
|
||||
rs[n] = &route{frequency: 0.0, index: 0}
|
||||
var r *route
|
||||
for idx := n - 1; idx >= 0; idx-- {
|
||||
for _, i := range dag[idx] {
|
||||
if freq, ok := seg.dict.Frequency(string(runes[idx : i+1])); ok {
|
||||
r = route{frequency: math.Log(freq) - seg.dict.logTotal + rs[i+1].frequency, index: i}
|
||||
if freq, ok := (*Dictionary)(seg).Frequency(string(runes[idx : i+1])); ok {
|
||||
r = &route{frequency: math.Log(freq) - (*Dictionary)(seg).logTotal + rs[i+1].frequency, index: i}
|
||||
} else {
|
||||
r = route{frequency: math.Log(1.0) - seg.dict.logTotal + rs[i+1].frequency, index: i}
|
||||
r = &route{frequency: math.Log(1.0) - (*Dictionary)(seg).logTotal + rs[i+1].frequency, index: i}
|
||||
}
|
||||
if v, ok := rs[idx]; !ok {
|
||||
if v := rs[idx]; v == nil {
|
||||
rs[idx] = r
|
||||
} else {
|
||||
if v.frequency < r.frequency || (v.frequency == r.frequency && v.index < r.index) {
|
||||
@@ -185,168 +180,139 @@ func (seg *Segmenter) calc(runes []rune) map[int]route {
|
||||
return rs
|
||||
}
|
||||
|
||||
type cutFunc func(sentence string) <-chan Segment
|
||||
|
||||
func (seg *Segmenter) cutDAG(sentence string) <-chan Segment {
|
||||
result := make(chan Segment)
|
||||
|
||||
go func() {
|
||||
runes := []rune(sentence)
|
||||
routes := seg.calc(runes)
|
||||
var y int
|
||||
length := len(runes)
|
||||
var buf []rune
|
||||
for x := 0; x < length; {
|
||||
y = routes[x].index + 1
|
||||
frag := runes[x:y]
|
||||
if y-x == 1 {
|
||||
buf = append(buf, frag...)
|
||||
x = y
|
||||
continue
|
||||
}
|
||||
if len(buf) > 0 {
|
||||
bufString := string(buf)
|
||||
if len(buf) == 1 {
|
||||
if tag, ok := seg.dict.Pos(bufString); ok {
|
||||
result <- Segment{bufString, tag}
|
||||
} else {
|
||||
result <- Segment{bufString, "x"}
|
||||
}
|
||||
buf = make([]rune, 0)
|
||||
continue
|
||||
}
|
||||
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
|
||||
for t := range seg.cutDetail(bufString) {
|
||||
result <- t
|
||||
}
|
||||
} else {
|
||||
for _, elem := range buf {
|
||||
selem := string(elem)
|
||||
if tag, ok := seg.dict.Pos(selem); ok {
|
||||
result <- Segment{selem, tag}
|
||||
} else {
|
||||
result <- Segment{selem, "x"}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
buf = make([]rune, 0)
|
||||
}
|
||||
word := string(frag)
|
||||
if tag, ok := seg.dict.Pos(word); ok {
|
||||
result <- Segment{word, tag}
|
||||
} else {
|
||||
result <- Segment{word, "x"}
|
||||
}
|
||||
func (seg *Segmenter) cutDAG(sentence string) (results []Segment) {
|
||||
runes := []rune(sentence)
|
||||
routes := seg.calc(runes)
|
||||
buf := make([]rune, 0, 256)
|
||||
for x := 0; x < len(runes); {
|
||||
y := routes[x].index + 1
|
||||
frag := runes[x:y]
|
||||
if y-x == 1 {
|
||||
buf = append(buf, frag...)
|
||||
x = y
|
||||
continue
|
||||
}
|
||||
|
||||
if len(buf) > 0 {
|
||||
bufString := string(buf)
|
||||
if len(buf) == 1 {
|
||||
if tag, ok := seg.dict.Pos(bufString); ok {
|
||||
result <- Segment{bufString, tag}
|
||||
if tag, ok := (*Dictionary)(seg).Pos(bufString); ok {
|
||||
results = append(results, Segment{bufString, tag})
|
||||
} else {
|
||||
result <- Segment{bufString, "x"}
|
||||
results = append(results, Segment{bufString, "x"})
|
||||
}
|
||||
} else {
|
||||
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
|
||||
for t := range seg.cutDetail(bufString) {
|
||||
result <- t
|
||||
}
|
||||
} else {
|
||||
for _, elem := range buf {
|
||||
selem := string(elem)
|
||||
if tag, ok := seg.dict.Pos(selem); ok {
|
||||
result <- Segment{selem, tag}
|
||||
} else {
|
||||
result <- Segment{selem, "x"}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
return result
|
||||
}
|
||||
|
||||
func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment {
|
||||
result := make(chan Segment)
|
||||
|
||||
go func() {
|
||||
runes := []rune(sentence)
|
||||
routes := seg.calc(runes)
|
||||
var y int
|
||||
length := len(runes)
|
||||
var buf []rune
|
||||
for x := 0; x < length; {
|
||||
y = routes[x].index + 1
|
||||
frag := runes[x:y]
|
||||
if reEng1.MatchString(string(frag)) && len(frag) == 1 {
|
||||
buf = append(buf, frag...)
|
||||
x = y
|
||||
buf = buf[:0]
|
||||
continue
|
||||
}
|
||||
if len(buf) > 0 {
|
||||
result <- Segment{string(buf), "eng"}
|
||||
buf = make([]rune, 0)
|
||||
}
|
||||
word := string(frag)
|
||||
if tag, ok := seg.dict.Pos(word); ok {
|
||||
result <- Segment{word, tag}
|
||||
if v, ok := (*Dictionary)(seg).Frequency(bufString); !ok || v == 0.0 {
|
||||
results = append(results, seg.cutDetail(bufString)...)
|
||||
} else {
|
||||
result <- Segment{word, "x"}
|
||||
for _, elem := range buf {
|
||||
selem := string(elem)
|
||||
if tag, ok := (*Dictionary)(seg).Pos(selem); ok {
|
||||
results = append(results, Segment{selem, tag})
|
||||
} else {
|
||||
results = append(results, Segment{selem, "x"})
|
||||
}
|
||||
}
|
||||
}
|
||||
x = y
|
||||
buf = buf[:0]
|
||||
}
|
||||
word := string(frag)
|
||||
if tag, ok := (*Dictionary)(seg).Pos(word); ok {
|
||||
results = append(results, Segment{word, tag})
|
||||
} else {
|
||||
results = append(results, Segment{word, "x"})
|
||||
}
|
||||
x = y
|
||||
}
|
||||
|
||||
if len(buf) > 0 {
|
||||
bufString := string(buf)
|
||||
if len(buf) == 1 {
|
||||
if tag, ok := (*Dictionary)(seg).Pos(bufString); ok {
|
||||
results = append(results, Segment{bufString, tag})
|
||||
} else {
|
||||
results = append(results, Segment{bufString, "x"})
|
||||
}
|
||||
return
|
||||
}
|
||||
if v, ok := (*Dictionary)(seg).Frequency(bufString); !ok || v == 0.0 {
|
||||
results = append(results, seg.cutDetail(bufString)...)
|
||||
return
|
||||
}
|
||||
for _, elem := range buf {
|
||||
selem := string(elem)
|
||||
if tag, ok := (*Dictionary)(seg).Pos(selem); ok {
|
||||
results = append(results, Segment{selem, tag})
|
||||
} else {
|
||||
results = append(results, Segment{selem, "x"})
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (seg *Segmenter) cutDAGNoHMM(sentence string) (results []Segment) {
|
||||
runes := []rune(sentence)
|
||||
routes := seg.calc(runes)
|
||||
buf := make([]rune, 0, 256)
|
||||
for x := 0; x < len(runes); {
|
||||
y := routes[x].index + 1
|
||||
frag := runes[x:y]
|
||||
if reEng1.MatchString(string(frag)) && len(frag) == 1 {
|
||||
buf = append(buf, frag...)
|
||||
x = y
|
||||
continue
|
||||
}
|
||||
if len(buf) > 0 {
|
||||
result <- Segment{string(buf), "eng"}
|
||||
buf = make([]rune, 0)
|
||||
results = append(results, Segment{string(buf), "eng"})
|
||||
buf = buf[:0]
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
return result
|
||||
word := string(frag)
|
||||
if tag, ok := (*Dictionary)(seg).Pos(word); ok {
|
||||
results = append(results, Segment{word, tag})
|
||||
} else {
|
||||
results = append(results, Segment{word, "x"})
|
||||
}
|
||||
x = y
|
||||
}
|
||||
if len(buf) > 0 {
|
||||
results = append(results, Segment{string(buf), "eng"})
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Cut cuts a sentence into words.
|
||||
// Parameter hmm controls whether to use the Hidden Markov Model.
|
||||
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan Segment {
|
||||
result := make(chan Segment)
|
||||
var cut cutFunc
|
||||
func (seg *Segmenter) Cut(sentence string, hmm bool) (results []Segment) {
|
||||
var cut func(sentence string) []Segment
|
||||
if hmm {
|
||||
cut = seg.cutDAG
|
||||
} else {
|
||||
cut = seg.cutDAGNoHMM
|
||||
}
|
||||
go func() {
|
||||
for _, blk := range util.RegexpSplit(reHanInternal, sentence, -1) {
|
||||
if reHanInternal.MatchString(blk) {
|
||||
for wordTag := range cut(blk) {
|
||||
result <- wordTag
|
||||
}
|
||||
for _, blk := range util.RegexpSplit(reHanInternal, sentence, -1) {
|
||||
if reHanInternal.MatchString(blk) {
|
||||
results = append(results, cut(blk)...)
|
||||
continue
|
||||
}
|
||||
for _, x := range util.RegexpSplit(reSkipInternal, blk, -1) {
|
||||
if reSkipInternal.MatchString(x) {
|
||||
results = append(results, Segment{x, "x"})
|
||||
continue
|
||||
}
|
||||
for _, x := range util.RegexpSplit(reSkipInternal, blk, -1) {
|
||||
if reSkipInternal.MatchString(x) {
|
||||
result <- Segment{x, "x"}
|
||||
continue
|
||||
}
|
||||
for _, xx := range x {
|
||||
s := string(xx)
|
||||
switch {
|
||||
case reNum.MatchString(s):
|
||||
result <- Segment{s, "m"}
|
||||
case reEng.MatchString(x):
|
||||
result <- Segment{x, "eng"}
|
||||
default:
|
||||
result <- Segment{s, "x"}
|
||||
}
|
||||
for _, xx := range x {
|
||||
s := string(xx)
|
||||
switch {
|
||||
case reNum.MatchString(s):
|
||||
results = append(results, Segment{s, "m"})
|
||||
case reEng.MatchString(x):
|
||||
results = append(results, Segment{x, "eng"})
|
||||
default:
|
||||
results = append(results, Segment{s, "x"})
|
||||
}
|
||||
}
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
return result
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user