1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-27 15:40:32 +08:00

refactor posseg, added Posseg struct

This commit is contained in:
Wang Bin
2015-03-24 16:54:02 +08:00
parent 0027927b6d
commit 73d87e4ed6
6 changed files with 146 additions and 109 deletions

View File

@@ -3,10 +3,10 @@ package posseg
import (
"github.com/wangbin/jiebago"
"regexp"
"strings"
)
var (
wordTagMap = make(map[string]string)
reHanDetail = regexp.MustCompile(`\p{Han}+`)
// NOTE(review): the bracket nesting here looks unintended. Under Go's regexp
// syntax the POSIX class is spelled [[:alnum:]] (as reEng does), so the second
// alternative `[:alnum:]]+` presumably parses as a plain set of the characters
// ":alnum" followed by one or more literal "]", and the first alternative also
// admits a literal "[". The likely intent is `[\.[:digit:]]+|[[:alnum:]]+` —
// confirm against the expected segmentation output before changing it.
reSkipDetail = regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`)
reEng = regexp.MustCompile(`[[:alnum:]]`)
@@ -20,26 +20,48 @@ type WordTag struct {
Word, Tag string
}
// Posseg tags words with their part of speech. It embeds a *jiebago.Jieba and
// keeps its own word-to-tag table alongside it.
type Posseg struct {
// Embedded segmenter; its members are promoted onto Posseg. Field order
// matters: NewPosseg builds this struct with a positional literal.
*jiebago.Jieba
// Flag maps a word to its tag. Entries are written by Add.
Flag map[string]string
}
// Add registers one dictionary entry with the tagger.
//
// When the entry carries a tag, the tag (with surrounding whitespace removed)
// is recorded in p.Flag under the entry's word. The entry is then always
// forwarded to the embedded segmenter through AddWord.
func (p *Posseg) Add(wtf *jiebago.WordTagFreq) {
	// Trim before testing for emptiness: a whitespace-only tag must not be
	// stored as "", otherwise lookups in p.Flag would succeed with an empty
	// tag instead of falling back to the default "x".
	if tag := strings.TrimSpace(wtf.Tag); tag != "" {
		p.Flag[wtf.Word] = tag
	}
	p.AddWord(wtf)
}
// SetDictionary sets the dictionary. dictFileName may be the absolute path of
// a dictionary file, or the name of a dictionary in the current directory.
func SetDictionary(dictFileName string) error {
err := jiebago.SetDictionary(dictFileName)
if err != nil {
return err
}
// NewPosseg creates a Posseg and fills it from the dictionary identified by
// dictFileName, which is resolved to a file path with jiebago.DictPath.
//
// It returns a nil *Posseg together with the error when the path cannot be
// resolved or when the dictionary file cannot be parsed.
func NewPosseg(dictFileName string) (*Posseg, error) {
	dictFilePath, err := jiebago.DictPath(dictFileName)
	if err != nil {
		return nil, err
	}
	wtfs, err := jiebago.ParseDictFile(dictFilePath)
	if err != nil {
		// Never hand back a tagger built from a dictionary that failed to
		// load; report the failure to the caller instead.
		return nil, err
	}

	p := &Posseg{
		Jieba: &jiebago.Jieba{Total: 0.0, Freq: make(map[string]float64)},
		Flag:  make(map[string]string),
	}
	// Tags are recorded on the instance (p.Flag, via Add) rather than in
	// package-level state, so independent Posseg values do not share tags.
	for _, wtf := range wtfs {
		p.Add(wtf)
	}
	return p, nil
}
// LoadUserDict parses the user-supplied dictionary file at dictFilePath and
// adds every entry it contains to p. A parse failure is returned unchanged and
// leaves p untouched.
func (p *Posseg) LoadUserDict(dictFilePath string) error {
	entries, parseErr := jiebago.ParseDictFile(dictFilePath)
	if parseErr != nil {
		return parseErr
	}

	for _, entry := range entries {
		p.Add(entry)
	}

	return nil
}
func cutDetailInternal(sentence string) chan WordTag {
func (p *Posseg) cutDetailInternal(sentence string) chan WordTag {
result := make(chan WordTag)
go func() {
@@ -68,13 +90,13 @@ func cutDetailInternal(sentence string) chan WordTag {
return result
}
func cutDetail(sentence string) chan WordTag {
func (p *Posseg) cutDetail(sentence string) chan WordTag {
result := make(chan WordTag)
go func() {
for blk := range jiebago.RegexpSplit(reHanDetail, sentence) {
if reHanDetail.MatchString(blk) {
for wordTag := range cutDetailInternal(blk) {
for wordTag := range p.cutDetailInternal(blk) {
result <- wordTag
}
} else {
@@ -100,12 +122,12 @@ func cutDetail(sentence string) chan WordTag {
// cutFunc is the signature of a segmentation routine: it takes a sentence and
// returns a channel on which the resulting WordTag values are delivered.
type cutFunc func(sentence string) chan WordTag
func cutDAG(sentence string) chan WordTag {
func (p *Posseg) cutDAG(sentence string) chan WordTag {
result := make(chan WordTag)
go func() {
dag := jiebago.DAG(sentence)
routes := jiebago.Calc(sentence, dag)
dag := p.DAG(sentence)
routes := p.Calc(sentence, dag)
x := 0
var y int
runes := []rune(sentence)
@@ -123,7 +145,7 @@ func cutDAG(sentence string) chan WordTag {
if len(buf) > 0 {
if len(buf) == 1 {
sbuf := string(buf)
if tag, ok := wordTagMap[sbuf]; ok {
if tag, ok := p.Flag[sbuf]; ok {
result <- WordTag{sbuf, tag}
} else {
result <- WordTag{sbuf, "x"}
@@ -131,14 +153,14 @@ func cutDAG(sentence string) chan WordTag {
buf = make([]rune, 0)
} else {
bufString := string(buf)
if v, ok := jiebago.Trie.Freq[bufString]; !ok || v == 0.0 {
for t := range cutDetail(bufString) {
if v, ok := p.Freq[bufString]; !ok || v == 0.0 {
for t := range p.cutDetail(bufString) {
result <- t
}
} else {
for _, elem := range buf {
selem := string(elem)
if tag, ok := wordTagMap[selem]; ok {
if tag, ok := p.Flag[selem]; ok {
result <- WordTag{string(elem), tag}
} else {
result <- WordTag{string(elem), "x"}
@@ -150,7 +172,7 @@ func cutDAG(sentence string) chan WordTag {
}
}
sl_word := string(l_word)
if tag, ok := wordTagMap[sl_word]; ok {
if tag, ok := p.Flag[sl_word]; ok {
result <- WordTag{sl_word, tag}
} else {
result <- WordTag{sl_word, "x"}
@@ -162,21 +184,21 @@ func cutDAG(sentence string) chan WordTag {
if len(buf) > 0 {
if len(buf) == 1 {
sbuf := string(buf)
if tag, ok := wordTagMap[sbuf]; ok {
if tag, ok := p.Flag[sbuf]; ok {
result <- WordTag{sbuf, tag}
} else {
result <- WordTag{sbuf, "x"}
}
} else {
bufString := string(buf)
if v, ok := jiebago.Trie.Freq[bufString]; !ok || v == 0.0 {
for t := range cutDetail(bufString) {
if v, ok := p.Freq[bufString]; !ok || v == 0.0 {
for t := range p.cutDetail(bufString) {
result <- t
}
} else {
for _, elem := range buf {
selem := string(elem)
if tag, ok := wordTagMap[selem]; ok {
if tag, ok := p.Flag[selem]; ok {
result <- WordTag{selem, tag}
} else {
result <- WordTag{selem, "x"}
@@ -190,12 +212,12 @@ func cutDAG(sentence string) chan WordTag {
return result
}
func cutDAGNoHMM(sentence string) chan WordTag {
func (p *Posseg) cutDAGNoHMM(sentence string) chan WordTag {
result := make(chan WordTag)
go func() {
dag := jiebago.DAG(sentence)
routes := jiebago.Calc(sentence, dag)
dag := p.DAG(sentence)
routes := p.Calc(sentence, dag)
x := 0
var y int
runes := []rune(sentence)
@@ -216,7 +238,7 @@ func cutDAGNoHMM(sentence string) chan WordTag {
buf = make([]rune, 0)
}
sl_word := string(l_word)
if tag, ok := wordTagMap[sl_word]; ok {
if tag, ok := p.Flag[sl_word]; ok {
result <- WordTag{sl_word, tag}
} else {
result <- WordTag{sl_word, "x"}
@@ -235,17 +257,13 @@ func cutDAGNoHMM(sentence string) chan WordTag {
// Tags the POS of each word after segmentation, using labels compatible with
// ictclas.
func Cut(sentence string, HMM bool) chan WordTag {
for key := range jiebago.UserWordTagTab {
wordTagMap[key] = jiebago.UserWordTagTab[key]
delete(jiebago.UserWordTagTab, key)
}
func (p *Posseg) Cut(sentence string, HMM bool) chan WordTag {
result := make(chan WordTag)
var cut cutFunc
if HMM {
cut = cutDAG
cut = p.cutDAG
} else {
cut = cutDAGNoHMM
cut = p.cutDAGNoHMM
}
go func() {
for blk := range jiebago.RegexpSplit(reHanInternal, sentence) {

View File

@@ -1,7 +1,6 @@
package posseg
import (
"github.com/wangbin/jiebago"
"testing"
)
@@ -277,18 +276,21 @@ func chanToArray(ch chan WordTag) []WordTag {
}
func TestCut(t *testing.T) {
SetDictionary("../dict.txt")
p, err := NewPosseg("../dict.txt")
if err != nil {
t.Fatal(err)
}
for index, content := range test_contents {
result := chanToArray(Cut(content, true))
result := chanToArray(p.Cut(content, true))
if len(defaultCutResult[index]) != len(result) {
t.Error(content)
}
for i, _ := range result {
if result[i] != defaultCutResult[index][i] {
t.Error(content)
t.Errorf("expect %s, got %s", defaultCutResult[index][i], result[i])
}
}
result = chanToArray(Cut(content, false))
result = chanToArray(p.Cut(content, false))
if len(noHMMCutResult[index]) != len(result) {
t.Error(content)
}
@@ -305,7 +307,7 @@ func TestBug132(t *testing.T) {
/*
https://github.com/fxsjy/jieba/issues/132
*/
SetDictionary("../dict.txt")
p, _ := NewPosseg("../dict.txt")
sentence := "又跛又啞"
cutResult := []WordTag{
WordTag{"又", "d"},
@@ -313,7 +315,7 @@ func TestBug132(t *testing.T) {
WordTag{"又", "d"},
WordTag{"啞", "v"},
}
result := chanToArray(Cut(sentence, true))
result := chanToArray(p.Cut(sentence, true))
if len(cutResult) != len(result) {
t.Error(result)
}
@@ -328,7 +330,7 @@ func TestBug137(t *testing.T) {
/*
https://github.com/fxsjy/jieba/issues/137
*/
SetDictionary("../dict.txt")
p, _ := NewPosseg("../dict.txt")
sentence := "前港督衛奕信在八八年十月宣布成立中央政策研究組"
cutResult := []WordTag{
WordTag{"前", "f"},
@@ -345,7 +347,7 @@ func TestBug137(t *testing.T) {
WordTag{"研究", "vn"},
WordTag{"組", "x"},
}
result := chanToArray(Cut(sentence, true))
result := chanToArray(p.Cut(sentence, true))
if len(cutResult) != len(result) {
t.Error(result)
}
@@ -357,8 +359,8 @@ func TestBug137(t *testing.T) {
}
func TestUserDict(t *testing.T) {
SetDictionary("../dict.txt")
jiebago.LoadUserDict("../userdict.txt")
p, _ := NewPosseg("../dict.txt")
p.LoadUserDict("../userdict.txt")
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型"
cutResult := []WordTag{
@@ -400,7 +402,7 @@ func TestUserDict(t *testing.T) {
WordTag{"N", "eng"},
WordTag{"类型", "n"}}
result := chanToArray(Cut(sentence, true))
result := chanToArray(p.Cut(sentence, true))
if len(cutResult) != len(result) {
t.Error(result)
}