1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00

uniform the api

This commit is contained in:
Wang Bin
2015-03-30 17:52:09 +08:00
parent 7a7f8af517
commit c397cafe8a
4 changed files with 50 additions and 28 deletions

View File

@@ -161,7 +161,7 @@ func (t *TextRanker) TextRank(sentence string, topK int) wordWeights {
// name in current directory. This function must be called before cutting any
// sentence.
func NewTextRanker(dictFileName string) (*TextRanker, error) {
p, err := posseg.NewPosseg(dictFileName)
p, err := posseg.Open(dictFileName)
if err != nil {
return nil, err
}

View File

@@ -2,17 +2,19 @@
package jiebago
import (
"errors"
"github.com/wangbin/jiebago/finalseg"
"regexp"
"sort"
)
var (
reEng = regexp.MustCompile(`[[:alnum:]]`)
reHanCutAll = regexp.MustCompile(`\p{Han}+`)
reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`)
reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
ErrInitialized = errors.New("already initialized")
reEng = regexp.MustCompile(`[[:alnum:]]`)
reHanCutAll = regexp.MustCompile(`\p{Han}+`)
reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`)
reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
)
type Segmenter interface {
@@ -51,8 +53,15 @@ func (j *Jieba) Add(word string, freq float64) {
}
// LoadUserDict loads a user-specified dictionary file.
func (j *Jieba) LoadUserDict(dictFilePath string) error {
return LoadDict(j, dictFilePath, false)
func (j *Jieba) LoadUserDict(dictFileName string) error {
return LoadDict(j, dictFileName, false)
}
// SetDict loads dictFileName into the receiver through LoadDict.
//
// It refuses to run on a receiver that already holds data: when the
// frequency map has at least one entry or the running total is positive,
// it returns ErrInitialized and loads nothing. Otherwise the result of
// LoadDict is returned unchanged.
func (j *Jieba) SetDict(dictFileName string) error {
	switch {
	case len(j.freqMap) > 0, j.total > 0.0:
		// Something has already been added to this instance.
		return ErrInitialized
	default:
		return LoadDict(j, dictFileName, false)
	}
}
func New() *Jieba {

View File

@@ -22,30 +22,43 @@ type Pair struct {
type Posseg struct {
*jiebago.Jieba
Flag map[string]string
flagMap map[string]string
}
func (p *Posseg) AddEntry(entry jiebago.Entry) {
if len(entry.Flag) > 0 {
p.Flag[entry.Word] = strings.TrimSpace(entry.Flag)
p.flagMap[entry.Word] = strings.TrimSpace(entry.Flag)
}
p.Add(entry.Word, entry.Freq)
}
// Flag looks word up in the flag map. It returns the stored flag and true
// when the word is present, or the empty string and false when it is not.
func (p Posseg) Flag(word string) (string, bool) {
	if tag, found := p.flagMap[word]; found {
		return tag, true
	}
	return "", false
}
// Set the dictionary; it can be the absolute path of a dictionary file, or a
// dictionary name in the current directory.
func NewPosseg(dictFileName string) (*Posseg, error) {
p := &Posseg{jiebago.New(), make(map[string]string)}
func Open(dictFileName string) (*Posseg, error) {
p := New()
err := jiebago.LoadDict(p, dictFileName, true)
if err != nil {
return nil, err
}
return p, nil
return p, err
}
// LoadUserDict loads a user-specified dictionary file.
func (p *Posseg) LoadUserDict(dictFilePath string) error {
return jiebago.LoadDict(p, dictFilePath, true)
func (p *Posseg) LoadUserDict(dictFileName string) error {
return jiebago.LoadDict(p, dictFileName, true)
}
// SetDict loads dictFileName as the dictionary of a Posseg that has not been
// populated yet.
//
// It returns jiebago.ErrInitialized, without loading anything, when the flag
// map already has entries or Total() is positive. Otherwise the result of
// jiebago.LoadDict is returned unchanged.
//
// The third LoadDict argument is true, the same value Open and LoadUserDict
// pass for a Posseg. This method previously passed false, which is the value
// the plain Jieba loaders use.
// NOTE(review): presumably that argument enables reading part-of-speech
// flags from the file — confirm against jiebago.LoadDict.
func (p *Posseg) SetDict(dictFileName string) error {
	if len(p.flagMap) > 0 || p.Total() > 0.0 {
		return jiebago.ErrInitialized
	}
	return jiebago.LoadDict(p, dictFileName, true)
}
// New returns a Posseg that embeds a fresh jiebago.Jieba and starts with an
// empty, non-nil flag map. No dictionary is loaded.
func New() *Posseg {
	seg := &Posseg{
		Jieba:   jiebago.New(),
		flagMap: make(map[string]string),
	}
	return seg
}
func (p *Posseg) cutDetailInternal(sentence string) chan Pair {
@@ -128,7 +141,7 @@ func (p *Posseg) cutDAG(sentence string) chan Pair {
if len(buf) > 0 {
if len(buf) == 1 {
sbuf := string(buf)
if tag, ok := p.Flag[sbuf]; ok {
if tag, ok := p.Flag(sbuf); ok {
result <- Pair{sbuf, tag}
} else {
result <- Pair{sbuf, "x"}
@@ -143,7 +156,7 @@ func (p *Posseg) cutDAG(sentence string) chan Pair {
} else {
for _, elem := range buf {
selem := string(elem)
if tag, ok := p.Flag[selem]; ok {
if tag, ok := p.Flag(selem); ok {
result <- Pair{string(elem), tag}
} else {
result <- Pair{string(elem), "x"}
@@ -155,7 +168,7 @@ func (p *Posseg) cutDAG(sentence string) chan Pair {
}
}
sl_word := string(l_word)
if tag, ok := p.Flag[sl_word]; ok {
if tag, ok := p.Flag(sl_word); ok {
result <- Pair{sl_word, tag}
} else {
result <- Pair{sl_word, "x"}
@@ -167,7 +180,7 @@ func (p *Posseg) cutDAG(sentence string) chan Pair {
if len(buf) > 0 {
if len(buf) == 1 {
sbuf := string(buf)
if tag, ok := p.Flag[sbuf]; ok {
if tag, ok := p.Flag(sbuf); ok {
result <- Pair{sbuf, tag}
} else {
result <- Pair{sbuf, "x"}
@@ -181,7 +194,7 @@ func (p *Posseg) cutDAG(sentence string) chan Pair {
} else {
for _, elem := range buf {
selem := string(elem)
if tag, ok := p.Flag[selem]; ok {
if tag, ok := p.Flag(selem); ok {
result <- Pair{selem, tag}
} else {
result <- Pair{selem, "x"}
@@ -221,7 +234,7 @@ func (p *Posseg) cutDAGNoHMM(sentence string) chan Pair {
buf = make([]rune, 0)
}
sl_word := string(l_word)
if tag, ok := p.Flag[sl_word]; ok {
if tag, ok := p.Flag(sl_word); ok {
result <- Pair{sl_word, tag}
} else {
result <- Pair{sl_word, "x"}

View File

@@ -276,7 +276,7 @@ func chanToArray(ch chan Pair) []Pair {
}
func TestCut(t *testing.T) {
p, err := NewPosseg("../dict.txt")
p, err := Open("../dict.txt")
if err != nil {
t.Fatal(err)
}
@@ -307,7 +307,7 @@ func TestBug132(t *testing.T) {
/*
https://github.com/fxsjy/jieba/issues/132
*/
p, _ := NewPosseg("../dict.txt")
p, _ := Open("../dict.txt")
sentence := "又跛又啞"
cutResult := []Pair{
Pair{"又", "d"},
@@ -330,7 +330,7 @@ func TestBug137(t *testing.T) {
/*
https://github.com/fxsjy/jieba/issues/137
*/
p, _ := NewPosseg("../dict.txt")
p, _ := Open("../dict.txt")
sentence := "前港督衛奕信在八八年十月宣布成立中央政策研究組"
cutResult := []Pair{
Pair{"前", "f"},
@@ -359,7 +359,7 @@ func TestBug137(t *testing.T) {
}
func TestUserDict(t *testing.T) {
p, _ := NewPosseg("../dict.txt")
p, _ := Open("../dict.txt")
p.LoadUserDict("../userdict.txt")
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型"