diff --git a/analyse/textrank.go b/analyse/textrank.go index df04e85..e6fb6c4 100644 --- a/analyse/textrank.go +++ b/analyse/textrank.go @@ -161,7 +161,7 @@ func (t *TextRanker) TextRank(sentence string, topK int) wordWeights { // name in current directory. This function must be called before cut any // sentence. func NewTextRanker(dictFileName string) (*TextRanker, error) { - p, err := posseg.NewPosseg(dictFileName) + p, err := posseg.Open(dictFileName) if err != nil { return nil, err } diff --git a/jieba.go b/jieba.go index c62e22e..5a1822c 100644 --- a/jieba.go +++ b/jieba.go @@ -2,17 +2,19 @@ package jiebago import ( + "errors" "github.com/wangbin/jiebago/finalseg" "regexp" "sort" ) var ( - reEng = regexp.MustCompile(`[[:alnum:]]`) - reHanCutAll = regexp.MustCompile(`\p{Han}+`) - reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`) - reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`) - reSkipDefault = regexp.MustCompile(`(\r\n|\s)`) + ErrInitialized = errors.New("already initialized") + reEng = regexp.MustCompile(`[[:alnum:]]`) + reHanCutAll = regexp.MustCompile(`\p{Han}+`) + reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`) + reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`) + reSkipDefault = regexp.MustCompile(`(\r\n|\s)`) ) type Segmenter interface { @@ -51,8 +53,15 @@ func (j *Jieba) Add(word string, freq float64) { } // Load user specified dictionary file. 
-func (j *Jieba) LoadUserDict(dictFilePath string) error { - return LoadDict(j, dictFilePath, false) +func (j *Jieba) LoadUserDict(dictFileName string) error { + return LoadDict(j, dictFileName, false) +} + +func (j *Jieba) SetDict(dictFileName string) error { + if len(j.freqMap) > 0 || j.total > 0.0 { + return ErrInitialized + } + return LoadDict(j, dictFileName, false) } func New() *Jieba { diff --git a/posseg/posseg.go b/posseg/posseg.go index 55c7033..a9b9978 100644 --- a/posseg/posseg.go +++ b/posseg/posseg.go @@ -22,30 +22,43 @@ type Pair struct { type Posseg struct { *jiebago.Jieba - Flag map[string]string + flagMap map[string]string } func (p *Posseg) AddEntry(entry jiebago.Entry) { if len(entry.Flag) > 0 { - p.Flag[entry.Word] = strings.TrimSpace(entry.Flag) + p.flagMap[entry.Word] = strings.TrimSpace(entry.Flag) } p.Add(entry.Word, entry.Freq) } +func (p *Posseg) Flag(word string) (string, bool) { + flag, ok := p.flagMap[word] + return flag, ok +} + // Set dictionary, it could be absolute path of dictionary file, or dictionary // name in current diectory. -func NewPosseg(dictFileName string) (*Posseg, error) { - p := &Posseg{jiebago.New(), make(map[string]string)} +func Open(dictFileName string) (*Posseg, error) { + p := New() err := jiebago.LoadDict(p, dictFileName, true) - if err != nil { - return nil, err - } - return p, nil + return p, err } // Load user specified dictionary file. 
-func (p *Posseg) LoadUserDict(dictFilePath string) error { - return jiebago.LoadDict(p, dictFilePath, true) +func (p *Posseg) LoadUserDict(dictFileName string) error { + return jiebago.LoadDict(p, dictFileName, true) +} + +func (p *Posseg) SetDict(dictFileName string) error { + if len(p.flagMap) > 0 || p.Total() > 0.0 { + return jiebago.ErrInitialized + } + return jiebago.LoadDict(p, dictFileName, true) +} + +func New() *Posseg { + return &Posseg{jiebago.New(), make(map[string]string)} } func (p *Posseg) cutDetailInternal(sentence string) chan Pair { @@ -128,7 +141,7 @@ func (p *Posseg) cutDAG(sentence string) chan Pair { if len(buf) > 0 { if len(buf) == 1 { sbuf := string(buf) - if tag, ok := p.Flag[sbuf]; ok { + if tag, ok := p.Flag(sbuf); ok { result <- Pair{sbuf, tag} } else { result <- Pair{sbuf, "x"} @@ -143,7 +156,7 @@ func (p *Posseg) cutDAG(sentence string) chan Pair { } else { for _, elem := range buf { selem := string(elem) - if tag, ok := p.Flag[selem]; ok { + if tag, ok := p.Flag(selem); ok { result <- Pair{string(elem), tag} } else { result <- Pair{string(elem), "x"} @@ -155,7 +168,7 @@ func (p *Posseg) cutDAG(sentence string) chan Pair { } } sl_word := string(l_word) - if tag, ok := p.Flag[sl_word]; ok { + if tag, ok := p.Flag(sl_word); ok { result <- Pair{sl_word, tag} } else { result <- Pair{sl_word, "x"} @@ -167,7 +180,7 @@ func (p *Posseg) cutDAG(sentence string) chan Pair { if len(buf) > 0 { if len(buf) == 1 { sbuf := string(buf) - if tag, ok := p.Flag[sbuf]; ok { + if tag, ok := p.Flag(sbuf); ok { result <- Pair{sbuf, tag} } else { result <- Pair{sbuf, "x"} @@ -181,7 +194,7 @@ func (p *Posseg) cutDAG(sentence string) chan Pair { } else { for _, elem := range buf { selem := string(elem) - if tag, ok := p.Flag[selem]; ok { + if tag, ok := p.Flag(selem); ok { result <- Pair{selem, tag} } else { result <- Pair{selem, "x"} @@ -221,7 +234,7 @@ func (p *Posseg) cutDAGNoHMM(sentence string) chan Pair { buf = make([]rune, 0) } sl_word := 
string(l_word) - if tag, ok := p.Flag[sl_word]; ok { + if tag, ok := p.Flag(sl_word); ok { result <- Pair{sl_word, tag} } else { result <- Pair{sl_word, "x"} diff --git a/posseg/posseg_test.go b/posseg/posseg_test.go index 0c22c50..726d5f8 100644 --- a/posseg/posseg_test.go +++ b/posseg/posseg_test.go @@ -276,7 +276,7 @@ func chanToArray(ch chan Pair) []Pair { } func TestCut(t *testing.T) { - p, err := NewPosseg("../dict.txt") + p, err := Open("../dict.txt") if err != nil { t.Fatal(err) } @@ -307,7 +307,7 @@ func TestBug132(t *testing.T) { /* https://github.com/fxsjy/jieba/issues/132 */ - p, _ := NewPosseg("../dict.txt") + p, _ := Open("../dict.txt") sentence := "又跛又啞" cutResult := []Pair{ Pair{"又", "d"}, @@ -330,7 +330,7 @@ func TestBug137(t *testing.T) { /* https://github.com/fxsjy/jieba/issues/137 */ - p, _ := NewPosseg("../dict.txt") + p, _ := Open("../dict.txt") sentence := "前港督衛奕信在八八年十月宣布成立中央政策研究組" cutResult := []Pair{ Pair{"前", "f"}, @@ -359,7 +359,7 @@ func TestBug137(t *testing.T) { } func TestUserDict(t *testing.T) { - p, _ := NewPosseg("../dict.txt") + p, _ := Open("../dict.txt") p.LoadUserDict("../userdict.txt") sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"