1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-30 00:50:30 +08:00

uniform the api

This commit is contained in:
Wang Bin
2015-03-30 17:52:09 +08:00
parent 7a7f8af517
commit c397cafe8a
4 changed files with 50 additions and 28 deletions

View File

@@ -161,7 +161,7 @@ func (t *TextRanker) TextRank(sentence string, topK int) wordWeights {
// name in current directory. This function must be called before cutting any // name in current directory. This function must be called before cutting any
// sentence. // sentence.
func NewTextRanker(dictFileName string) (*TextRanker, error) { func NewTextRanker(dictFileName string) (*TextRanker, error) {
p, err := posseg.NewPosseg(dictFileName) p, err := posseg.Open(dictFileName)
if err != nil { if err != nil {
return nil, err return nil, err
} }

View File

@@ -2,17 +2,19 @@
package jiebago package jiebago
import ( import (
"errors"
"github.com/wangbin/jiebago/finalseg" "github.com/wangbin/jiebago/finalseg"
"regexp" "regexp"
"sort" "sort"
) )
var ( var (
reEng = regexp.MustCompile(`[[:alnum:]]`) ErrInitialized = errors.New("already initialized")
reHanCutAll = regexp.MustCompile(`\p{Han}+`) reEng = regexp.MustCompile(`[[:alnum:]]`)
reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`) reHanCutAll = regexp.MustCompile(`\p{Han}+`)
reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`) reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`)
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`) reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
) )
type Segmenter interface { type Segmenter interface {
@@ -51,8 +53,15 @@ func (j *Jieba) Add(word string, freq float64) {
} }
// Load user specified dictionary file. // Load user specified dictionary file.
func (j *Jieba) LoadUserDict(dictFilePath string) error { func (j *Jieba) LoadUserDict(dictFileName string) error {
return LoadDict(j, dictFilePath, false) return LoadDict(j, dictFileName, false)
}
func (j *Jieba) SetDict(dictFileName string) error {
if len(j.freqMap) > 0 || j.total > 0.0 {
return ErrInitialized
}
return LoadDict(j, dictFileName, false)
} }
func New() *Jieba { func New() *Jieba {

View File

@@ -22,30 +22,43 @@ type Pair struct {
type Posseg struct { type Posseg struct {
*jiebago.Jieba *jiebago.Jieba
Flag map[string]string flagMap map[string]string
} }
func (p *Posseg) AddEntry(entry jiebago.Entry) { func (p *Posseg) AddEntry(entry jiebago.Entry) {
if len(entry.Flag) > 0 { if len(entry.Flag) > 0 {
p.Flag[entry.Word] = strings.TrimSpace(entry.Flag) p.flagMap[entry.Word] = strings.TrimSpace(entry.Flag)
} }
p.Add(entry.Word, entry.Freq) p.Add(entry.Word, entry.Freq)
} }
func (p Posseg) Flag(word string) (string, bool) {
flag, ok := p.flagMap[word]
return flag, ok
}
// Set dictionary; it could be the absolute path of a dictionary file, or a dictionary // Set dictionary; it could be the absolute path of a dictionary file, or a dictionary
// name in the current directory. // name in the current directory.
func NewPosseg(dictFileName string) (*Posseg, error) { func Open(dictFileName string) (*Posseg, error) {
p := &Posseg{jiebago.New(), make(map[string]string)} p := New()
err := jiebago.LoadDict(p, dictFileName, true) err := jiebago.LoadDict(p, dictFileName, true)
if err != nil { return p, err
return nil, err
}
return p, nil
} }
// Load user specified dictionary file. // Load user specified dictionary file.
func (p *Posseg) LoadUserDict(dictFilePath string) error { func (p *Posseg) LoadUserDict(dictFileName string) error {
return jiebago.LoadDict(p, dictFilePath, true) return jiebago.LoadDict(p, dictFileName, true)
}
func (p *Posseg) SetDict(dictFileName string) error {
if len(p.flagMap) > 0 || p.Total() > 0.0 {
return jiebago.ErrInitialized
}
return jiebago.LoadDict(p, dictFileName, false)
}
func New() *Posseg {
return &Posseg{jiebago.New(), make(map[string]string)}
} }
func (p *Posseg) cutDetailInternal(sentence string) chan Pair { func (p *Posseg) cutDetailInternal(sentence string) chan Pair {
@@ -128,7 +141,7 @@ func (p *Posseg) cutDAG(sentence string) chan Pair {
if len(buf) > 0 { if len(buf) > 0 {
if len(buf) == 1 { if len(buf) == 1 {
sbuf := string(buf) sbuf := string(buf)
if tag, ok := p.Flag[sbuf]; ok { if tag, ok := p.Flag(sbuf); ok {
result <- Pair{sbuf, tag} result <- Pair{sbuf, tag}
} else { } else {
result <- Pair{sbuf, "x"} result <- Pair{sbuf, "x"}
@@ -143,7 +156,7 @@ func (p *Posseg) cutDAG(sentence string) chan Pair {
} else { } else {
for _, elem := range buf { for _, elem := range buf {
selem := string(elem) selem := string(elem)
if tag, ok := p.Flag[selem]; ok { if tag, ok := p.Flag(selem); ok {
result <- Pair{string(elem), tag} result <- Pair{string(elem), tag}
} else { } else {
result <- Pair{string(elem), "x"} result <- Pair{string(elem), "x"}
@@ -155,7 +168,7 @@ func (p *Posseg) cutDAG(sentence string) chan Pair {
} }
} }
sl_word := string(l_word) sl_word := string(l_word)
if tag, ok := p.Flag[sl_word]; ok { if tag, ok := p.Flag(sl_word); ok {
result <- Pair{sl_word, tag} result <- Pair{sl_word, tag}
} else { } else {
result <- Pair{sl_word, "x"} result <- Pair{sl_word, "x"}
@@ -167,7 +180,7 @@ func (p *Posseg) cutDAG(sentence string) chan Pair {
if len(buf) > 0 { if len(buf) > 0 {
if len(buf) == 1 { if len(buf) == 1 {
sbuf := string(buf) sbuf := string(buf)
if tag, ok := p.Flag[sbuf]; ok { if tag, ok := p.Flag(sbuf); ok {
result <- Pair{sbuf, tag} result <- Pair{sbuf, tag}
} else { } else {
result <- Pair{sbuf, "x"} result <- Pair{sbuf, "x"}
@@ -181,7 +194,7 @@ func (p *Posseg) cutDAG(sentence string) chan Pair {
} else { } else {
for _, elem := range buf { for _, elem := range buf {
selem := string(elem) selem := string(elem)
if tag, ok := p.Flag[selem]; ok { if tag, ok := p.Flag(selem); ok {
result <- Pair{selem, tag} result <- Pair{selem, tag}
} else { } else {
result <- Pair{selem, "x"} result <- Pair{selem, "x"}
@@ -221,7 +234,7 @@ func (p *Posseg) cutDAGNoHMM(sentence string) chan Pair {
buf = make([]rune, 0) buf = make([]rune, 0)
} }
sl_word := string(l_word) sl_word := string(l_word)
if tag, ok := p.Flag[sl_word]; ok { if tag, ok := p.Flag(sl_word); ok {
result <- Pair{sl_word, tag} result <- Pair{sl_word, tag}
} else { } else {
result <- Pair{sl_word, "x"} result <- Pair{sl_word, "x"}

View File

@@ -276,7 +276,7 @@ func chanToArray(ch chan Pair) []Pair {
} }
func TestCut(t *testing.T) { func TestCut(t *testing.T) {
p, err := NewPosseg("../dict.txt") p, err := Open("../dict.txt")
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
@@ -307,7 +307,7 @@ func TestBug132(t *testing.T) {
/* /*
https://github.com/fxsjy/jieba/issues/132 https://github.com/fxsjy/jieba/issues/132
*/ */
p, _ := NewPosseg("../dict.txt") p, _ := Open("../dict.txt")
sentence := "又跛又啞" sentence := "又跛又啞"
cutResult := []Pair{ cutResult := []Pair{
Pair{"又", "d"}, Pair{"又", "d"},
@@ -330,7 +330,7 @@ func TestBug137(t *testing.T) {
/* /*
https://github.com/fxsjy/jieba/issues/137 https://github.com/fxsjy/jieba/issues/137
*/ */
p, _ := NewPosseg("../dict.txt") p, _ := Open("../dict.txt")
sentence := "前港督衛奕信在八八年十月宣布成立中央政策研究組" sentence := "前港督衛奕信在八八年十月宣布成立中央政策研究組"
cutResult := []Pair{ cutResult := []Pair{
Pair{"前", "f"}, Pair{"前", "f"},
@@ -359,7 +359,7 @@ func TestBug137(t *testing.T) {
} }
func TestUserDict(t *testing.T) { func TestUserDict(t *testing.T) {
p, _ := NewPosseg("../dict.txt") p, _ := Open("../dict.txt")
p.LoadUserDict("../userdict.txt") p.LoadUserDict("../userdict.txt")
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型" sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型"