1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-27 23:50:33 +08:00

move jieba to a separate module, tweak posseg module

This commit is contained in:
Wang Bin
2015-04-30 17:01:02 +08:00
parent d9f77563bf
commit edef39719d
5 changed files with 1530 additions and 386 deletions

View File

@@ -1,10 +1,11 @@
package posseg
import (
"fmt"
"github.com/wangbin/jiebago"
"math"
"regexp"
"strings"
"github.com/wangbin/jiebago/dictionary"
"github.com/wangbin/jiebago/util"
)
var (
@@ -17,57 +18,28 @@ var (
reSkipInternal = regexp.MustCompile(`(\r\n|\s)`)
)
type Pair struct {
Word, Flag string
type Segment struct {
text, pos string
}
func (p Pair) String() string {
return fmt.Sprintf("%s / %s", p.Word, p.Flag)
func (s Segment) Text() string {
return s.text
}
type Posseg struct {
*jiebago.Jieba
flagMap map[string]string
func (s Segment) Pos() string {
return s.pos
}
func (p *Posseg) AddEntry(entry jiebago.Entry) {
if len(entry.Flag) > 0 {
p.flagMap[entry.Word] = strings.TrimSpace(entry.Flag)
}
p.Add(entry.Word, entry.Freq)
// Segmenter is the receiver for the segmentation routines in this file
// (dag, calc, cutDetail, cutDAG, cutDAGNoHMM and Cut). It embeds a
// *dictionary.Dictionary, so the dictionary's methods are callable directly
// on a Segmenter.
// NOTE(review): Frequency, LogTotal and Pos are called on the Segmenter here
// and are presumably promoted from the embedded Dictionary — confirm.
type Segmenter struct {
*dictionary.Dictionary
}
func (p Posseg) Flag(word string) (string, bool) {
flag, ok := p.flagMap[word]
return flag, ok
// New returns a Segmenter backed by a dictionary freshly created with
// dictionary.New.
func New() *Segmenter {
	seg := &Segmenter{Dictionary: dictionary.New()}
	return seg
}
// Open creates a new Posseg and loads the dictionary file dictFileName into
// it through jiebago.LoadDict. dictFileName may be the absolute path of a
// dictionary file or the name of a dictionary in the current directory.
//
// The Posseg is returned together with whatever error the load reported, so
// callers must check the error before using it.
// NOTE(review): the meaning of the final `true` argument to LoadDict is not
// visible here — confirm against jiebago.LoadDict.
func Open(dictFileName string) (*Posseg, error) {
	posseg := New()
	loadErr := jiebago.LoadDict(posseg, dictFileName, true)
	return posseg, loadErr
}
// LoadUserDict loads the user-specified dictionary file dictFileName into p
// by delegating to jiebago.LoadDict, and returns the error it reports.
// NOTE(review): the final `true` argument is passed the same way as in Open;
// its exact meaning is defined by jiebago.LoadDict — confirm there.
func (p *Posseg) LoadUserDict(dictFileName string) error {
return jiebago.LoadDict(p, dictFileName, true)
}
// SetDict loads dictFileName as the dictionary of a Posseg that has not been
// populated yet. If p already holds any flag entries or reports a positive
// Total, nothing is loaded and jiebago.ErrInitialized is returned. Otherwise
// the result of jiebago.LoadDict is returned.
func (p *Posseg) SetDict(dictFileName string) error {
	// Either condition means a dictionary has already been loaded into p.
	alreadyLoaded := len(p.flagMap) != 0 || p.Total() > 0.0
	if alreadyLoaded {
		return jiebago.ErrInitialized
	}
	err := jiebago.LoadDict(p, dictFileName, false)
	return err
}
// New returns a Posseg that wraps a fresh jiebago.Jieba and starts with an
// empty word-to-flag map.
func New() *Posseg {
	return &Posseg{
		Jieba:   jiebago.New(),
		flagMap: make(map[string]string),
	}
}
func (p *Posseg) cutDetailInternal(sentence string) chan Pair {
result := make(chan Pair)
func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment {
result := make(chan Segment)
go func() {
runes := []rune(sentence)
@@ -80,42 +52,42 @@ func (p *Posseg) cutDetailInternal(sentence string) chan Pair {
case "B":
begin = i
case "E":
result <- Pair{string(runes[begin : i+1]), pos.POS()}
result <- Segment{string(runes[begin : i+1]), pos.POS()}
next = i + 1
case "S":
result <- Pair{string(char), pos.POS()}
result <- Segment{string(char), pos.POS()}
next = i + 1
}
}
if next < len(runes) {
result <- Pair{string(runes[next:]), posList[next].POS()}
result <- Segment{string(runes[next:]), posList[next].POS()}
}
close(result)
}()
return result
}
func (p *Posseg) cutDetail(sentence string) chan Pair {
result := make(chan Pair)
func (seg *Segmenter) cutDetail(sentence string) <-chan Segment {
result := make(chan Segment)
go func() {
for _, blk := range jiebago.RegexpSplit(reHanDetail, sentence, -1) {
for _, blk := range util.RegexpSplit(reHanDetail, sentence, -1) {
if reHanDetail.MatchString(blk) {
for wordTag := range p.cutDetailInternal(blk) {
result <- wordTag
for segment := range seg.cutDetailInternal(blk) {
result <- segment
}
} else {
for _, x := range jiebago.RegexpSplit(reSkipDetail, blk, -1) {
if len(x) == 0 {
continue
}
switch {
case reNum.MatchString(x):
result <- Pair{x, "m"}
case reEng.MatchString(x):
result <- Pair{x, "eng"}
default:
result <- Pair{x, "x"}
}
continue
}
for _, x := range util.RegexpSplit(reSkipDetail, blk, -1) {
if len(x) == 0 {
continue
}
switch {
case reNum.MatchString(x):
result <- Segment{x, "m"}
case reEng.MatchString(x):
result <- Segment{x, "eng"}
default:
result <- Segment{x, "x"}
}
}
}
@@ -124,46 +96,105 @@ func (p *Posseg) cutDetail(sentence string) chan Pair {
return result
}
type cutFunc func(sentence string) chan Pair
// dag builds the word graph for runes. For every start offset it records the
// inclusive end offsets of all fragments runes[start:end+1] that the
// dictionary reports with a positive frequency.
//
// Scanning from a start offset stops at the first fragment the dictionary
// does not know at all; fragments that are known but have a frequency of zero
// are skipped and scanning continues. When no fragment qualifies, the start
// offset itself is recorded, so every offset has at least the single-rune
// edge.
func (seg *Segmenter) dag(runes []rune) map[int][]int {
	total := len(runes)
	graph := make(map[int][]int)
	for start := 0; start < total; start++ {
		ends := []int{}
		for end := start; end < total; end++ {
			freq, known := seg.Frequency(string(runes[start : end+1]))
			if !known {
				break
			}
			if freq > 0.0 {
				ends = append(ends, end)
			}
		}
		if len(ends) == 0 {
			ends = append(ends, start)
		}
		graph[start] = ends
	}
	return graph
}
func (p *Posseg) cutDAG(sentence string) chan Pair {
result := make(chan Pair)
// route is one entry of the table built by calc for a start offset.
type route struct {
// frequency is the accumulated score of the best path found from the
// start offset: each word on the path contributes log(freq) minus
// the segmenter's LogTotal.
frequency float64
// index is the inclusive end offset of the word chosen at the start
// offset; callers continue at index+1.
index int
}
// calc picks, for every rune offset of runes, the best word starting at that
// offset, working backwards from the end of the input over the graph
// produced by seg.dag.
//
// The returned map has one entry per offset in [0, len(runes)] (the entry at
// len(runes) is the zero-score terminator). For an offset idx, route.index is
// the inclusive end offset of the chosen word and route.frequency is the
// score of the best path from idx to the end: the sum over its words of
// log(freq) - seg.LogTotal(). On equal scores the candidate with the larger
// end offset (the longer word) wins.
//
// A fragment that is unknown to the dictionary, or known with a non-positive
// frequency (dag falls back to such single runes), is scored as frequency 1.
// Taking the logarithm of a zero frequency would yield -Inf and make every
// path through that offset compare equal, defeating route selection.
func (seg *Segmenter) calc(runes []rune) map[int]route {
	n := len(runes)
	graph := seg.dag(runes)
	logTotal := seg.LogTotal()

	rs := make(map[int]route, n+1)
	rs[n] = route{frequency: 0.0, index: 0}

	for idx := n - 1; idx >= 0; idx-- {
		for _, i := range graph[idx] {
			freq, ok := seg.Frequency(string(runes[idx : i+1]))
			if !ok || freq <= 0.0 {
				// Keep the score finite: log(1) == 0.
				freq = 1.0
			}
			r := route{
				frequency: math.Log(freq) - logTotal + rs[i+1].frequency,
				index:     i,
			}
			best, seen := rs[idx]
			if !seen || best.frequency < r.frequency ||
				(best.frequency == r.frequency && best.index < r.index) {
				rs[idx] = r
			}
		}
	}
	return rs
}
// cutFunc is the signature shared by the segmentation strategies that Cut
// chooses between (cutDAG and cutDAGNoHMM): it takes a sentence and returns a
// receive-only channel on which the resulting Segments are delivered.
type cutFunc func(sentence string) <-chan Segment
func (seg *Segmenter) cutDAG(sentence string) <-chan Segment {
result := make(chan Segment)
go func() {
runes := []rune(sentence)
dag := jiebago.DAG(p, runes)
routes := jiebago.Routes(p, runes, dag)
routes := seg.calc(runes)
var y int
length := len(runes)
buf := make([]rune, 0)
for x := 0; x < length; {
y = routes[x].Index + 1
l_word := runes[x:y]
y = routes[x].index + 1
frag := runes[x:y]
if y-x == 1 {
buf = append(buf, l_word...)
buf = append(buf, frag...)
} else {
if len(buf) > 0 {
bufString := string(buf)
if len(buf) == 1 {
sbuf := string(buf)
if tag, ok := p.Flag(sbuf); ok {
result <- Pair{sbuf, tag}
if tag, ok := seg.Pos(bufString); ok {
result <- Segment{bufString, tag}
} else {
result <- Pair{sbuf, "x"}
result <- Segment{bufString, "x"}
}
buf = make([]rune, 0)
} else {
bufString := string(buf)
if v, ok := p.Freq(bufString); !ok || v == 0.0 {
for t := range p.cutDetail(bufString) {
if v, ok := seg.Frequency(bufString); !ok || v == 0.0 {
for t := range seg.cutDetail(bufString) {
result <- t
}
} else {
for _, elem := range buf {
selem := string(elem)
if tag, ok := p.Flag(selem); ok {
result <- Pair{string(elem), tag}
if tag, ok := seg.Pos(selem); ok {
result <- Segment{selem, tag}
} else {
result <- Pair{string(elem), "x"}
result <- Segment{selem, "x"}
}
}
@@ -171,37 +202,36 @@ func (p *Posseg) cutDAG(sentence string) chan Pair {
buf = make([]rune, 0)
}
}
sl_word := string(l_word)
if tag, ok := p.Flag(sl_word); ok {
result <- Pair{sl_word, tag}
word := string(frag)
if tag, ok := seg.Pos(word); ok {
result <- Segment{word, tag}
} else {
result <- Pair{sl_word, "x"}
result <- Segment{word, "x"}
}
}
x = y
}
if len(buf) > 0 {
bufString := string(buf)
if len(buf) == 1 {
sbuf := string(buf)
if tag, ok := p.Flag(sbuf); ok {
result <- Pair{sbuf, tag}
if tag, ok := seg.Pos(bufString); ok {
result <- Segment{bufString, tag}
} else {
result <- Pair{sbuf, "x"}
result <- Segment{bufString, "x"}
}
} else {
bufString := string(buf)
if v, ok := p.Freq(bufString); !ok || v == 0.0 {
for t := range p.cutDetail(bufString) {
if v, ok := seg.Frequency(bufString); !ok || v == 0.0 {
for t := range seg.cutDetail(bufString) {
result <- t
}
} else {
for _, elem := range buf {
selem := string(elem)
if tag, ok := p.Flag(selem); ok {
result <- Pair{selem, tag}
if tag, ok := seg.Pos(selem); ok {
result <- Segment{selem, tag}
} else {
result <- Pair{selem, "x"}
result <- Segment{selem, "x"}
}
}
}
@@ -212,42 +242,37 @@ func (p *Posseg) cutDAG(sentence string) chan Pair {
return result
}
func (p *Posseg) cutDAGNoHMM(sentence string) chan Pair {
result := make(chan Pair)
func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment {
result := make(chan Segment)
go func() {
runes := []rune(sentence)
dag := jiebago.DAG(p, runes)
routes := jiebago.Routes(p, runes, dag)
x := 0
routes := seg.calc(runes)
var y int
length := len(runes)
buf := make([]rune, 0)
for {
if x >= length {
break
}
y = routes[x].Index + 1
l_word := runes[x:y]
if reEng1.MatchString(string(l_word)) && len(l_word) == 1 {
buf = append(buf, l_word...)
for x := 0; x < length; {
y = routes[x].index + 1
frag := runes[x:y]
if reEng1.MatchString(string(frag)) && len(frag) == 1 {
buf = append(buf, frag...)
x = y
} else {
if len(buf) > 0 {
result <- Pair{string(buf), "eng"}
result <- Segment{string(buf), "eng"}
buf = make([]rune, 0)
}
sl_word := string(l_word)
if tag, ok := p.Flag(sl_word); ok {
result <- Pair{sl_word, tag}
word := string(frag)
if tag, ok := seg.Pos(word); ok {
result <- Segment{word, tag}
} else {
result <- Pair{sl_word, "x"}
result <- Segment{word, "x"}
}
x = y
}
}
if len(buf) > 0 {
result <- Pair{string(buf), "eng"}
result <- Segment{string(buf), "eng"}
buf = make([]rune, 0)
}
close(result)
@@ -255,37 +280,34 @@ func (p *Posseg) cutDAGNoHMM(sentence string) chan Pair {
return result
}
// Tags the POS of each word after segmentation, using labels compatible with
// ictclas.
func (p *Posseg) Cut(sentence string, HMM bool) chan Pair {
result := make(chan Pair)
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan Segment {
result := make(chan Segment)
var cut cutFunc
if HMM {
cut = p.cutDAG
if hmm {
cut = seg.cutDAG
} else {
cut = p.cutDAGNoHMM
cut = seg.cutDAGNoHMM
}
go func() {
for _, blk := range jiebago.RegexpSplit(reHanInternal, sentence, -1) {
for _, blk := range util.RegexpSplit(reHanInternal, sentence, -1) {
if reHanInternal.MatchString(blk) {
for wordTag := range cut(blk) {
result <- wordTag
}
} else {
for _, x := range jiebago.RegexpSplit(reSkipInternal, blk, -1) {
for _, x := range util.RegexpSplit(reSkipInternal, blk, -1) {
if reSkipInternal.MatchString(x) {
result <- Pair{x, "x"}
result <- Segment{x, "x"}
} else {
for _, xx := range x {
s := string(xx)
switch {
case reNum.MatchString(s):
result <- Pair{s, "m"}
result <- Segment{s, "m"}
case reEng.MatchString(x):
result <- Pair{x, "eng"}
break
result <- Segment{x, "eng"}
default:
result <- Pair{s, "x"}
result <- Segment{s, "x"}
}
}
}