mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-27 15:40:32 +08:00
refactor posseg, added Posseg struct
This commit is contained in:
@@ -3,10 +3,10 @@ package posseg
|
||||
import (
|
||||
"github.com/wangbin/jiebago"
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var (
|
||||
wordTagMap = make(map[string]string)
|
||||
reHanDetail = regexp.MustCompile(`\p{Han}+`)
|
||||
reSkipDetail = regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`)
|
||||
reEng = regexp.MustCompile(`[[:alnum:]]`)
|
||||
@@ -20,26 +20,48 @@ type WordTag struct {
|
||||
Word, Tag string
|
||||
}
|
||||
|
||||
type Posseg struct {
|
||||
*jiebago.Jieba
|
||||
Flag map[string]string
|
||||
}
|
||||
|
||||
func (p *Posseg) Add(wtf *jiebago.WordTagFreq) {
|
||||
if len(wtf.Tag) > 0 {
|
||||
p.Flag[wtf.Word] = strings.TrimSpace(wtf.Tag)
|
||||
}
|
||||
p.AddWord(wtf)
|
||||
}
|
||||
|
||||
// Set dictionary, it could be absolute path of dictionary file, or dictionary
|
||||
// name in current directory.
|
||||
func SetDictionary(dictFileName string) error {
|
||||
err := jiebago.SetDictionary(dictFileName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
func NewPosseg(dictFileName string) (*Posseg, error) {
|
||||
j := &jiebago.Jieba{Total: 0.0, Freq: make(map[string]float64)}
|
||||
p := &Posseg{j, make(map[string]string)}
|
||||
dictFilePath, err := jiebago.DictPath(dictFileName)
|
||||
if err != nil {
|
||||
return err
|
||||
return nil, err
|
||||
}
|
||||
wtfs, err := jiebago.ParseDictFile(dictFilePath)
|
||||
|
||||
for _, wtf := range wtfs {
|
||||
wordTagMap[wtf.Word] = wtf.Tag
|
||||
p.Add(wtf)
|
||||
}
|
||||
return p, nil
|
||||
}
|
||||
|
||||
// Load user specified dictionary file.
|
||||
func (p *Posseg) LoadUserDict(dictFilePath string) error {
|
||||
wtfs, err := jiebago.ParseDictFile(dictFilePath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, wtf := range wtfs {
|
||||
p.Add(wtf)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func cutDetailInternal(sentence string) chan WordTag {
|
||||
func (p *Posseg) cutDetailInternal(sentence string) chan WordTag {
|
||||
result := make(chan WordTag)
|
||||
|
||||
go func() {
|
||||
@@ -68,13 +90,13 @@ func cutDetailInternal(sentence string) chan WordTag {
|
||||
return result
|
||||
}
|
||||
|
||||
func cutDetail(sentence string) chan WordTag {
|
||||
func (p *Posseg) cutDetail(sentence string) chan WordTag {
|
||||
result := make(chan WordTag)
|
||||
|
||||
go func() {
|
||||
for blk := range jiebago.RegexpSplit(reHanDetail, sentence) {
|
||||
if reHanDetail.MatchString(blk) {
|
||||
for wordTag := range cutDetailInternal(blk) {
|
||||
for wordTag := range p.cutDetailInternal(blk) {
|
||||
result <- wordTag
|
||||
}
|
||||
} else {
|
||||
@@ -100,12 +122,12 @@ func cutDetail(sentence string) chan WordTag {
|
||||
|
||||
// cutFunc is the signature of a segmentation strategy: it takes a sentence and
// returns a channel of tagged words. Cut stores either cutDAG or cutDAGNoHMM
// in a variable of this type, depending on its HMM argument.
type cutFunc func(sentence string) chan WordTag
|
||||
|
||||
func cutDAG(sentence string) chan WordTag {
|
||||
func (p *Posseg) cutDAG(sentence string) chan WordTag {
|
||||
result := make(chan WordTag)
|
||||
|
||||
go func() {
|
||||
dag := jiebago.DAG(sentence)
|
||||
routes := jiebago.Calc(sentence, dag)
|
||||
dag := p.DAG(sentence)
|
||||
routes := p.Calc(sentence, dag)
|
||||
x := 0
|
||||
var y int
|
||||
runes := []rune(sentence)
|
||||
@@ -123,7 +145,7 @@ func cutDAG(sentence string) chan WordTag {
|
||||
if len(buf) > 0 {
|
||||
if len(buf) == 1 {
|
||||
sbuf := string(buf)
|
||||
if tag, ok := wordTagMap[sbuf]; ok {
|
||||
if tag, ok := p.Flag[sbuf]; ok {
|
||||
result <- WordTag{sbuf, tag}
|
||||
} else {
|
||||
result <- WordTag{sbuf, "x"}
|
||||
@@ -131,14 +153,14 @@ func cutDAG(sentence string) chan WordTag {
|
||||
buf = make([]rune, 0)
|
||||
} else {
|
||||
bufString := string(buf)
|
||||
if v, ok := jiebago.Trie.Freq[bufString]; !ok || v == 0.0 {
|
||||
for t := range cutDetail(bufString) {
|
||||
if v, ok := p.Freq[bufString]; !ok || v == 0.0 {
|
||||
for t := range p.cutDetail(bufString) {
|
||||
result <- t
|
||||
}
|
||||
} else {
|
||||
for _, elem := range buf {
|
||||
selem := string(elem)
|
||||
if tag, ok := wordTagMap[selem]; ok {
|
||||
if tag, ok := p.Flag[selem]; ok {
|
||||
result <- WordTag{string(elem), tag}
|
||||
} else {
|
||||
result <- WordTag{string(elem), "x"}
|
||||
@@ -150,7 +172,7 @@ func cutDAG(sentence string) chan WordTag {
|
||||
}
|
||||
}
|
||||
sl_word := string(l_word)
|
||||
if tag, ok := wordTagMap[sl_word]; ok {
|
||||
if tag, ok := p.Flag[sl_word]; ok {
|
||||
result <- WordTag{sl_word, tag}
|
||||
} else {
|
||||
result <- WordTag{sl_word, "x"}
|
||||
@@ -162,21 +184,21 @@ func cutDAG(sentence string) chan WordTag {
|
||||
if len(buf) > 0 {
|
||||
if len(buf) == 1 {
|
||||
sbuf := string(buf)
|
||||
if tag, ok := wordTagMap[sbuf]; ok {
|
||||
if tag, ok := p.Flag[sbuf]; ok {
|
||||
result <- WordTag{sbuf, tag}
|
||||
} else {
|
||||
result <- WordTag{sbuf, "x"}
|
||||
}
|
||||
} else {
|
||||
bufString := string(buf)
|
||||
if v, ok := jiebago.Trie.Freq[bufString]; !ok || v == 0.0 {
|
||||
for t := range cutDetail(bufString) {
|
||||
if v, ok := p.Freq[bufString]; !ok || v == 0.0 {
|
||||
for t := range p.cutDetail(bufString) {
|
||||
result <- t
|
||||
}
|
||||
} else {
|
||||
for _, elem := range buf {
|
||||
selem := string(elem)
|
||||
if tag, ok := wordTagMap[selem]; ok {
|
||||
if tag, ok := p.Flag[selem]; ok {
|
||||
result <- WordTag{selem, tag}
|
||||
} else {
|
||||
result <- WordTag{selem, "x"}
|
||||
@@ -190,12 +212,12 @@ func cutDAG(sentence string) chan WordTag {
|
||||
return result
|
||||
}
|
||||
|
||||
func cutDAGNoHMM(sentence string) chan WordTag {
|
||||
func (p *Posseg) cutDAGNoHMM(sentence string) chan WordTag {
|
||||
result := make(chan WordTag)
|
||||
|
||||
go func() {
|
||||
dag := jiebago.DAG(sentence)
|
||||
routes := jiebago.Calc(sentence, dag)
|
||||
dag := p.DAG(sentence)
|
||||
routes := p.Calc(sentence, dag)
|
||||
x := 0
|
||||
var y int
|
||||
runes := []rune(sentence)
|
||||
@@ -216,7 +238,7 @@ func cutDAGNoHMM(sentence string) chan WordTag {
|
||||
buf = make([]rune, 0)
|
||||
}
|
||||
sl_word := string(l_word)
|
||||
if tag, ok := wordTagMap[sl_word]; ok {
|
||||
if tag, ok := p.Flag[sl_word]; ok {
|
||||
result <- WordTag{sl_word, tag}
|
||||
} else {
|
||||
result <- WordTag{sl_word, "x"}
|
||||
@@ -235,17 +257,13 @@ func cutDAGNoHMM(sentence string) chan WordTag {
|
||||
|
||||
// Tags the POS of each word after segmentation, using labels compatible with
|
||||
// ictclas.
|
||||
func Cut(sentence string, HMM bool) chan WordTag {
|
||||
for key := range jiebago.UserWordTagTab {
|
||||
wordTagMap[key] = jiebago.UserWordTagTab[key]
|
||||
delete(jiebago.UserWordTagTab, key)
|
||||
}
|
||||
func (p *Posseg) Cut(sentence string, HMM bool) chan WordTag {
|
||||
result := make(chan WordTag)
|
||||
var cut cutFunc
|
||||
if HMM {
|
||||
cut = cutDAG
|
||||
cut = p.cutDAG
|
||||
} else {
|
||||
cut = cutDAGNoHMM
|
||||
cut = p.cutDAGNoHMM
|
||||
}
|
||||
go func() {
|
||||
for blk := range jiebago.RegexpSplit(reHanInternal, sentence) {
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
package posseg
|
||||
|
||||
import (
|
||||
"github.com/wangbin/jiebago"
|
||||
"testing"
|
||||
)
|
||||
|
||||
@@ -277,18 +276,21 @@ func chanToArray(ch chan WordTag) []WordTag {
|
||||
}
|
||||
|
||||
func TestCut(t *testing.T) {
|
||||
SetDictionary("../dict.txt")
|
||||
p, err := NewPosseg("../dict.txt")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for index, content := range test_contents {
|
||||
result := chanToArray(Cut(content, true))
|
||||
result := chanToArray(p.Cut(content, true))
|
||||
if len(defaultCutResult[index]) != len(result) {
|
||||
t.Error(content)
|
||||
}
|
||||
for i, _ := range result {
|
||||
if result[i] != defaultCutResult[index][i] {
|
||||
t.Error(content)
|
||||
t.Errorf("expect %s, got %s", defaultCutResult[index][i], result[i])
|
||||
}
|
||||
}
|
||||
result = chanToArray(Cut(content, false))
|
||||
result = chanToArray(p.Cut(content, false))
|
||||
if len(noHMMCutResult[index]) != len(result) {
|
||||
t.Error(content)
|
||||
}
|
||||
@@ -305,7 +307,7 @@ func TestBug132(t *testing.T) {
|
||||
/*
|
||||
https://github.com/fxsjy/jieba/issues/132
|
||||
*/
|
||||
SetDictionary("../dict.txt")
|
||||
p, _ := NewPosseg("../dict.txt")
|
||||
sentence := "又跛又啞"
|
||||
cutResult := []WordTag{
|
||||
WordTag{"又", "d"},
|
||||
@@ -313,7 +315,7 @@ func TestBug132(t *testing.T) {
|
||||
WordTag{"又", "d"},
|
||||
WordTag{"啞", "v"},
|
||||
}
|
||||
result := chanToArray(Cut(sentence, true))
|
||||
result := chanToArray(p.Cut(sentence, true))
|
||||
if len(cutResult) != len(result) {
|
||||
t.Error(result)
|
||||
}
|
||||
@@ -328,7 +330,7 @@ func TestBug137(t *testing.T) {
|
||||
/*
|
||||
https://github.com/fxsjy/jieba/issues/137
|
||||
*/
|
||||
SetDictionary("../dict.txt")
|
||||
p, _ := NewPosseg("../dict.txt")
|
||||
sentence := "前港督衛奕信在八八年十月宣布成立中央政策研究組"
|
||||
cutResult := []WordTag{
|
||||
WordTag{"前", "f"},
|
||||
@@ -345,7 +347,7 @@ func TestBug137(t *testing.T) {
|
||||
WordTag{"研究", "vn"},
|
||||
WordTag{"組", "x"},
|
||||
}
|
||||
result := chanToArray(Cut(sentence, true))
|
||||
result := chanToArray(p.Cut(sentence, true))
|
||||
if len(cutResult) != len(result) {
|
||||
t.Error(result)
|
||||
}
|
||||
@@ -357,8 +359,8 @@ func TestBug137(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestUserDict(t *testing.T) {
|
||||
SetDictionary("../dict.txt")
|
||||
jiebago.LoadUserDict("../userdict.txt")
|
||||
p, _ := NewPosseg("../dict.txt")
|
||||
p.LoadUserDict("../userdict.txt")
|
||||
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
||||
|
||||
cutResult := []WordTag{
|
||||
@@ -400,7 +402,7 @@ func TestUserDict(t *testing.T) {
|
||||
WordTag{"N", "eng"},
|
||||
WordTag{"类型", "n"}}
|
||||
|
||||
result := chanToArray(Cut(sentence, true))
|
||||
result := chanToArray(p.Cut(sentence, true))
|
||||
if len(cutResult) != len(result) {
|
||||
t.Error(result)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user