1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00

refactor posseg, added Posseg struct

This commit is contained in:
Wang Bin
2015-03-24 16:54:02 +08:00
parent 0027927b6d
commit 73d87e4ed6
6 changed files with 146 additions and 109 deletions

15
dictionary.go Normal file
View File

@@ -0,0 +1,15 @@
package jiebago
// Pair couples a dictionary word with the flag string recorded for it.
// NOTE(review): Flag presumably carries the part-of-speech tag — confirm
// against the code that populates it.
type Pair struct {
	Word, Flag string
}
// Token is a dictionary entry: an embedded *Pair plus the frequency value
// stored alongside it.
type Token struct {
// Embedded pointer, so Word and Flag are promoted onto Token. It must be
// non-nil before those promoted fields are read.
*Pair
// Freq is the frequency value attached to the entry.
// NOTE(review): presumably the raw count read from a dictionary file — confirm.
Freq float64
}
// DictLoader is implemented by types that accept dictionary entries one
// *Token at a time through Add.
// NOTE(review): presumably the sink a dictionary-file parser feeds — confirm
// against its callers.
type DictLoader interface {
// Add hands a single entry to the loader.
Add(*Token)
}

View File

@@ -49,7 +49,7 @@ func (rs routes) Swap(i, j int) {
}
// Build a directed acyclic graph (DAG) for sentence.
func DAG(sentence string) map[int][]int {
func (j *Jieba) DAG(sentence string) map[int][]int {
dag := make(map[int][]int)
runes := []rune(sentence)
n := len(runes)
@@ -60,7 +60,7 @@ func DAG(sentence string) map[int][]int {
i = k
frag = string(runes[k])
for {
if freq, ok := Trie.Freq[frag]; !ok {
if freq, ok := j.Freq[frag]; !ok {
break
} else {
if freq > 0.0 {
@@ -81,19 +81,19 @@ func DAG(sentence string) map[int][]int {
return dag
}
func Calc(sentence string, dag map[int][]int) map[int]*route {
func (j *Jieba) Calc(sentence string, dag map[int][]int) map[int]*route {
runes := []rune(sentence)
number := len(runes)
rs := make(map[int]*route)
rs[number] = &route{Freq: 0.0, Index: 0}
logTotal := math.Log(Trie.Total)
logTotal := math.Log(j.Total)
for idx := number - 1; idx >= 0; idx-- {
candidates := make(routes, 0)
for _, i := range dag[idx] {
word := string(runes[idx : i+1])
var r *route
if _, ok := Trie.Freq[word]; ok {
r = &route{Freq: math.Log(Trie.Freq[word]) - logTotal + rs[i+1].Freq, Index: i}
if _, ok := j.Freq[word]; ok {
r = &route{Freq: math.Log(j.Freq[word]) - logTotal + rs[i+1].Freq, Index: i}
} else {
r = &route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i}
}
@@ -107,11 +107,11 @@ func Calc(sentence string, dag map[int][]int) map[int]*route {
type cutFunc func(sentence string) chan string
func cutDAG(sentence string) chan string {
func (j *Jieba) cutDAG(sentence string) chan string {
result := make(chan string)
go func() {
dag := DAG(sentence)
routes := Calc(sentence, dag)
dag := j.DAG(sentence)
routes := j.Calc(sentence, dag)
x := 0
var y int
runes := []rune(sentence)
@@ -132,7 +132,7 @@ func cutDAG(sentence string) chan string {
buf = make([]rune, 0)
} else {
bufString := string(buf)
if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 {
if v, ok := j.Freq[bufString]; !ok || v == 0.0 {
for x := range finalseg.Cut(bufString) {
result <- x
}
@@ -154,7 +154,7 @@ func cutDAG(sentence string) chan string {
result <- string(buf)
} else {
bufString := string(buf)
if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 {
if v, ok := j.Freq[bufString]; !ok || v == 0.0 {
for t := range finalseg.Cut(bufString) {
result <- t
}
@@ -170,12 +170,12 @@ func cutDAG(sentence string) chan string {
return result
}
func cutDAGNoHMM(sentence string) chan string {
func (j *Jieba) cutDAGNoHMM(sentence string) chan string {
result := make(chan string)
go func() {
dag := DAG(sentence)
routes := Calc(sentence, dag)
dag := j.DAG(sentence)
routes := j.Calc(sentence, dag)
x := 0
var y int
runes := []rune(sentence)
@@ -208,12 +208,12 @@ func cutDAGNoHMM(sentence string) chan string {
return result
}
func cutAll(sentence string) chan string {
func (j *Jieba) cutAll(sentence string) chan string {
result := make(chan string)
go func() {
runes := []rune(sentence)
dag := DAG(sentence)
dag := j.DAG(sentence)
old_j := -1
ks := make([]int, 0)
for k := range dag {
@@ -251,7 +251,7 @@ which is suitable for text analysis.
HMM controls whether to use the Hidden Markov Model.
*/
func Cut(sentence string, isCutAll bool, HMM bool) chan string {
func (j *Jieba) Cut(sentence string, isCutAll bool, HMM bool) chan string {
result := make(chan string)
go func() {
var reHan, reSkip *regexp.Regexp
@@ -264,12 +264,12 @@ func Cut(sentence string, isCutAll bool, HMM bool) chan string {
}
var cut cutFunc
if HMM {
cut = cutDAG
cut = j.cutDAG
} else {
cut = cutDAGNoHMM
cut = j.cutDAGNoHMM
}
if isCutAll {
cut = cutAll
cut = j.cutAll
}
for blk := range RegexpSplit(reHan, sentence) {
if len(blk) == 0 {
@@ -320,17 +320,17 @@ func Cut(sentence string, isCutAll bool, HMM bool) chan string {
// Cut sentence using Search Engine Mode, based on the Accurate Mode, attempts
// to cut long words into several short words, which can raise the recall rate.
// Suitable for search engines.
func CutForSearch(sentence string, hmm bool) chan string {
func (j *Jieba) CutForSearch(sentence string, hmm bool) chan string {
result := make(chan string)
go func() {
for word := range Cut(sentence, false, hmm) {
for word := range j.Cut(sentence, false, hmm) {
runes := []rune(word)
for _, increment := range []int{2, 3} {
if len(runes) > increment {
var gram2 string
for i := 0; i < len(runes)-increment+1; i++ {
gram2 = string(runes[i : i+increment])
if v, ok := Trie.Freq[gram2]; ok && v > 0.0 {
if v, ok := j.Freq[gram2]; ok && v > 0.0 {
result <- gram2
}
}

View File

@@ -617,10 +617,6 @@ var (
}
)
func init() {
SetDictionary("dict.txt")
}
func chanToArray(ch chan string) []string {
result := make([]string, 0)
for word := range ch {
@@ -630,14 +626,18 @@ func chanToArray(ch chan string) []string {
}
func TestCutDAG(t *testing.T) {
result := chanToArray(cutDAG("BP神经网络如何训练才能在分类时增加区分度"))
j, _ := NewJieba("dict.txt")
result := chanToArray(j.cutDAG("BP神经网络如何训练才能在分类时增加区分度"))
if len(result) != 11 {
t.Error(result)
}
}
func TestCutDAGNoHmm(t *testing.T) {
result := chanToArray(cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度"))
j, _ := NewJieba("dict.txt")
result := chanToArray(j.cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度"))
if len(result) != 11 {
t.Error(result)
}
@@ -657,9 +657,11 @@ func TestRegexpSplit(t *testing.T) {
}
func TestDefaultCut(t *testing.T) {
j, _ := NewJieba("dict.txt")
var result []string
for index, content := range test_contents {
result = chanToArray(Cut(content, false, true))
result = chanToArray(j.Cut(content, false, true))
if len(result) != len(defaultCutResult[index]) {
t.Errorf("default cut for %s length should be %d not %d\n",
content, len(defaultCutResult[index]), len(result))
@@ -673,9 +675,11 @@ func TestDefaultCut(t *testing.T) {
}
func TestCutAll(t *testing.T) {
j, _ := NewJieba("dict.txt")
var result []string
for index, content := range test_contents {
result = chanToArray(Cut(content, true, true))
result = chanToArray(j.Cut(content, true, true))
if len(result) != len(cutAllResult[index]) {
t.Errorf("cut all for %s length should be %d not %d\n",
content, len(cutAllResult[index]), len(result))
@@ -689,9 +693,11 @@ func TestCutAll(t *testing.T) {
}
func TestDefaultCutNoHMM(t *testing.T) {
j, _ := NewJieba("dict.txt")
var result []string
for index, content := range test_contents {
result = chanToArray(Cut(content, false, false))
result = chanToArray(j.Cut(content, false, false))
if len(result) != len(defaultCutNoHMMResult[index]) {
t.Errorf("default cut no hmm for %s length should be %d not %d\n",
content, len(defaultCutNoHMMResult[index]), len(result))
@@ -705,9 +711,11 @@ func TestDefaultCutNoHMM(t *testing.T) {
}
func TestCutForSearch(t *testing.T) {
j, _ := NewJieba("dict.txt")
var result []string
for index, content := range test_contents {
result = chanToArray(CutForSearch(content, true))
result = chanToArray(j.CutForSearch(content, true))
if len(result) != len(cutForSearchResult[index]) {
t.Errorf("cut for search for %s length should be %d not %d\n",
content, len(cutForSearchResult[index]), len(result))
@@ -719,7 +727,7 @@ func TestCutForSearch(t *testing.T) {
}
}
for index, content := range test_contents {
result = chanToArray(CutForSearch(content, false))
result = chanToArray(j.CutForSearch(content, false))
if len(result) != len(cutForSearchNoHMMResult[index]) {
t.Errorf("cut for search no hmm for %s length should be %d not %d\n",
content, len(cutForSearchNoHMMResult[index]), len(result))
@@ -734,9 +742,9 @@ func TestCutForSearch(t *testing.T) {
func TestSetdictionary(t *testing.T) {
var result []string
SetDictionary("foobar.txt")
j, _ := NewJieba("foobar.txt")
for index, content := range test_contents {
result = chanToArray(Cut(content, false, true))
result = chanToArray(j.Cut(content, false, true))
if len(result) != len(userDictCutResult[index]) {
t.Errorf("default cut with user dictionary for %s length should be %d not %d\n",
content, len(userDictCutResult[index]), len(result))
@@ -750,13 +758,13 @@ func TestSetdictionary(t *testing.T) {
}
func TestLoadUserDict(t *testing.T) {
SetDictionary("dict.txt")
LoadUserDict("userdict.txt")
j, _ := NewJieba("dict.txt")
j.LoadUserDict("userdict.txt")
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型"
result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", "", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"}
words := chanToArray(Cut(sentence, false, true))
words := chanToArray(j.Cut(sentence, false, true))
if len(words) != len(result) {
t.Error(len(words))
}
@@ -768,7 +776,7 @@ func TestLoadUserDict(t *testing.T) {
sentence = "easy_install is great"
result = []string{"easy_install", " ", "is", " ", "great"}
words = chanToArray(Cut(sentence, false, true))
words = chanToArray(j.Cut(sentence, false, true))
if len(words) != len(result) {
t.Error(len(words))
}
@@ -780,7 +788,7 @@ func TestLoadUserDict(t *testing.T) {
sentence = "python 的正则表达式是好用的"
result = []string{"python", " ", "的", "正则表达式", "是", "好用", "的"}
words = chanToArray(Cut(sentence, false, true))
words = chanToArray(j.Cut(sentence, false, true))
if len(words) != len(result) {
t.Error(words)
t.Error(result)

View File

@@ -3,10 +3,10 @@ package posseg
import (
"github.com/wangbin/jiebago"
"regexp"
"strings"
)
var (
wordTagMap = make(map[string]string)
reHanDetail = regexp.MustCompile(`\p{Han}+`)
reSkipDetail = regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`)
reEng = regexp.MustCompile(`[[:alnum:]]`)
@@ -20,26 +20,48 @@ type WordTag struct {
Word, Tag string
}
// Posseg bundles an embedded *jiebago.Jieba with a word-to-tag lookup table.
type Posseg struct {
// Embedded, so the Jieba fields and methods used in this file (Freq, DAG,
// Calc, AddWord) are reachable directly on a *Posseg.
*jiebago.Jieba
// Flag maps a word to its tag. Add fills it and the cut functions read it,
// emitting "x" for words that have no entry.
Flag map[string]string
}
// Add registers one dictionary entry with the segmenter.
//
// The entry's tag is trimmed of surrounding whitespace and, when something
// remains, stored in p.Flag under the entry's word. The entry is then passed
// to the embedded Jieba's AddWord. Entries whose tag is empty or
// whitespace-only do not write to p.Flag, so a lookup for such a word misses
// instead of finding an empty tag.
func (p *Posseg) Add(wtf *jiebago.WordTagFreq) {
	// Trim before testing: a tag made only of whitespace must not be stored
	// as "", which would mask the "no tag" fallback at lookup time.
	if tag := strings.TrimSpace(wtf.Tag); tag != "" {
		p.Flag[wtf.Word] = tag
	}
	p.AddWord(wtf)
}
// Set dictionary, it could be absolute path of dictionary file, or dictionary
// name in current directory.
func SetDictionary(dictFileName string) error {
err := jiebago.SetDictionary(dictFileName)
if err != nil {
return err
}
func NewPosseg(dictFileName string) (*Posseg, error) {
j := &jiebago.Jieba{Total: 0.0, Freq: make(map[string]float64)}
p := &Posseg{j, make(map[string]string)}
dictFilePath, err := jiebago.DictPath(dictFileName)
if err != nil {
return err
return nil, err
}
wtfs, err := jiebago.ParseDictFile(dictFilePath)
for _, wtf := range wtfs {
wordTagMap[wtf.Word] = wtf.Tag
p.Add(wtf)
}
return p, nil
}
// LoadUserDict parses the dictionary file at dictFilePath and registers every
// parsed entry through p.Add. If parsing fails, the error is returned and
// nothing is added; otherwise it returns nil.
func (p *Posseg) LoadUserDict(dictFilePath string) error {
	entries, parseErr := jiebago.ParseDictFile(dictFilePath)
	if parseErr != nil {
		return parseErr
	}
	for _, entry := range entries {
		p.Add(entry)
	}
	return nil
}
func cutDetailInternal(sentence string) chan WordTag {
func (p *Posseg) cutDetailInternal(sentence string) chan WordTag {
result := make(chan WordTag)
go func() {
@@ -68,13 +90,13 @@ func cutDetailInternal(sentence string) chan WordTag {
return result
}
func cutDetail(sentence string) chan WordTag {
func (p *Posseg) cutDetail(sentence string) chan WordTag {
result := make(chan WordTag)
go func() {
for blk := range jiebago.RegexpSplit(reHanDetail, sentence) {
if reHanDetail.MatchString(blk) {
for wordTag := range cutDetailInternal(blk) {
for wordTag := range p.cutDetailInternal(blk) {
result <- wordTag
}
} else {
@@ -100,12 +122,12 @@ func cutDetail(sentence string) chan WordTag {
type cutFunc func(sentence string) chan WordTag
func cutDAG(sentence string) chan WordTag {
func (p *Posseg) cutDAG(sentence string) chan WordTag {
result := make(chan WordTag)
go func() {
dag := jiebago.DAG(sentence)
routes := jiebago.Calc(sentence, dag)
dag := p.DAG(sentence)
routes := p.Calc(sentence, dag)
x := 0
var y int
runes := []rune(sentence)
@@ -123,7 +145,7 @@ func cutDAG(sentence string) chan WordTag {
if len(buf) > 0 {
if len(buf) == 1 {
sbuf := string(buf)
if tag, ok := wordTagMap[sbuf]; ok {
if tag, ok := p.Flag[sbuf]; ok {
result <- WordTag{sbuf, tag}
} else {
result <- WordTag{sbuf, "x"}
@@ -131,14 +153,14 @@ func cutDAG(sentence string) chan WordTag {
buf = make([]rune, 0)
} else {
bufString := string(buf)
if v, ok := jiebago.Trie.Freq[bufString]; !ok || v == 0.0 {
for t := range cutDetail(bufString) {
if v, ok := p.Freq[bufString]; !ok || v == 0.0 {
for t := range p.cutDetail(bufString) {
result <- t
}
} else {
for _, elem := range buf {
selem := string(elem)
if tag, ok := wordTagMap[selem]; ok {
if tag, ok := p.Flag[selem]; ok {
result <- WordTag{string(elem), tag}
} else {
result <- WordTag{string(elem), "x"}
@@ -150,7 +172,7 @@ func cutDAG(sentence string) chan WordTag {
}
}
sl_word := string(l_word)
if tag, ok := wordTagMap[sl_word]; ok {
if tag, ok := p.Flag[sl_word]; ok {
result <- WordTag{sl_word, tag}
} else {
result <- WordTag{sl_word, "x"}
@@ -162,21 +184,21 @@ func cutDAG(sentence string) chan WordTag {
if len(buf) > 0 {
if len(buf) == 1 {
sbuf := string(buf)
if tag, ok := wordTagMap[sbuf]; ok {
if tag, ok := p.Flag[sbuf]; ok {
result <- WordTag{sbuf, tag}
} else {
result <- WordTag{sbuf, "x"}
}
} else {
bufString := string(buf)
if v, ok := jiebago.Trie.Freq[bufString]; !ok || v == 0.0 {
for t := range cutDetail(bufString) {
if v, ok := p.Freq[bufString]; !ok || v == 0.0 {
for t := range p.cutDetail(bufString) {
result <- t
}
} else {
for _, elem := range buf {
selem := string(elem)
if tag, ok := wordTagMap[selem]; ok {
if tag, ok := p.Flag[selem]; ok {
result <- WordTag{selem, tag}
} else {
result <- WordTag{selem, "x"}
@@ -190,12 +212,12 @@ func cutDAG(sentence string) chan WordTag {
return result
}
func cutDAGNoHMM(sentence string) chan WordTag {
func (p *Posseg) cutDAGNoHMM(sentence string) chan WordTag {
result := make(chan WordTag)
go func() {
dag := jiebago.DAG(sentence)
routes := jiebago.Calc(sentence, dag)
dag := p.DAG(sentence)
routes := p.Calc(sentence, dag)
x := 0
var y int
runes := []rune(sentence)
@@ -216,7 +238,7 @@ func cutDAGNoHMM(sentence string) chan WordTag {
buf = make([]rune, 0)
}
sl_word := string(l_word)
if tag, ok := wordTagMap[sl_word]; ok {
if tag, ok := p.Flag[sl_word]; ok {
result <- WordTag{sl_word, tag}
} else {
result <- WordTag{sl_word, "x"}
@@ -235,17 +257,13 @@ func cutDAGNoHMM(sentence string) chan WordTag {
// Tags the POS of each word after segmentation, using labels compatible with
// ictclas.
func Cut(sentence string, HMM bool) chan WordTag {
for key := range jiebago.UserWordTagTab {
wordTagMap[key] = jiebago.UserWordTagTab[key]
delete(jiebago.UserWordTagTab, key)
}
func (p *Posseg) Cut(sentence string, HMM bool) chan WordTag {
result := make(chan WordTag)
var cut cutFunc
if HMM {
cut = cutDAG
cut = p.cutDAG
} else {
cut = cutDAGNoHMM
cut = p.cutDAGNoHMM
}
go func() {
for blk := range jiebago.RegexpSplit(reHanInternal, sentence) {

View File

@@ -1,7 +1,6 @@
package posseg
import (
"github.com/wangbin/jiebago"
"testing"
)
@@ -277,18 +276,21 @@ func chanToArray(ch chan WordTag) []WordTag {
}
func TestCut(t *testing.T) {
SetDictionary("../dict.txt")
p, err := NewPosseg("../dict.txt")
if err != nil {
t.Fatal(err)
}
for index, content := range test_contents {
result := chanToArray(Cut(content, true))
result := chanToArray(p.Cut(content, true))
if len(defaultCutResult[index]) != len(result) {
t.Error(content)
}
for i, _ := range result {
if result[i] != defaultCutResult[index][i] {
t.Error(content)
t.Errorf("expect %s, got %s", defaultCutResult[index][i], result[i])
}
}
result = chanToArray(Cut(content, false))
result = chanToArray(p.Cut(content, false))
if len(noHMMCutResult[index]) != len(result) {
t.Error(content)
}
@@ -305,7 +307,7 @@ func TestBug132(t *testing.T) {
/*
https://github.com/fxsjy/jieba/issues/132
*/
SetDictionary("../dict.txt")
p, _ := NewPosseg("../dict.txt")
sentence := "又跛又啞"
cutResult := []WordTag{
WordTag{"又", "d"},
@@ -313,7 +315,7 @@ func TestBug132(t *testing.T) {
WordTag{"又", "d"},
WordTag{"啞", "v"},
}
result := chanToArray(Cut(sentence, true))
result := chanToArray(p.Cut(sentence, true))
if len(cutResult) != len(result) {
t.Error(result)
}
@@ -328,7 +330,7 @@ func TestBug137(t *testing.T) {
/*
https://github.com/fxsjy/jieba/issues/137
*/
SetDictionary("../dict.txt")
p, _ := NewPosseg("../dict.txt")
sentence := "前港督衛奕信在八八年十月宣布成立中央政策研究組"
cutResult := []WordTag{
WordTag{"前", "f"},
@@ -345,7 +347,7 @@ func TestBug137(t *testing.T) {
WordTag{"研究", "vn"},
WordTag{"組", "x"},
}
result := chanToArray(Cut(sentence, true))
result := chanToArray(p.Cut(sentence, true))
if len(cutResult) != len(result) {
t.Error(result)
}
@@ -357,8 +359,8 @@ func TestBug137(t *testing.T) {
}
func TestUserDict(t *testing.T) {
SetDictionary("../dict.txt")
jiebago.LoadUserDict("../userdict.txt")
p, _ := NewPosseg("../dict.txt")
p.LoadUserDict("../userdict.txt")
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型"
cutResult := []WordTag{
@@ -400,7 +402,7 @@ func TestUserDict(t *testing.T) {
WordTag{"N", "eng"},
WordTag{"类型", "n"}}
result := chanToArray(Cut(sentence, true))
result := chanToArray(p.Cut(sentence, true))
if len(cutResult) != len(result) {
t.Error(result)
}

38
trie.go
View File

@@ -7,18 +7,14 @@ import (
"log"
"os"
"path/filepath"
"strings"
)
// Trie stores the total frequency and a map of all words and their frequencies.
var Trie *trie
type trie struct {
type Jieba struct {
Total float64
Freq map[string]float64
}
func (t *trie) load(dictFileName string) error {
func (j *Jieba) load(dictFileName string) error {
dictFilePath, err := DictPath(dictFileName)
if err != nil {
return err
@@ -55,7 +51,7 @@ func (t *trie) load(dictFileName string) error {
if isDictCached {
dec := gob.NewDecoder(cacheFile)
err = dec.Decode(&t)
err = dec.Decode(&j)
if err != nil {
isDictCached = false
} else {
@@ -70,7 +66,7 @@ func (t *trie) load(dictFileName string) error {
}
for _, wtf := range wtfs {
t.addWord(wtf)
j.AddWord(wtf)
}
// dump trie
cacheFile, err = os.OpenFile(cacheFilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
@@ -79,7 +75,7 @@ func (t *trie) load(dictFileName string) error {
}
defer cacheFile.Close()
enc := gob.NewEncoder(cacheFile)
err = enc.Encode(t)
err = enc.Encode(j)
if err != nil {
return err
} else {
@@ -89,30 +85,27 @@ func (t *trie) load(dictFileName string) error {
return nil
}
func (t *trie) addWord(wtf *WordTagFreq) {
t.Freq[wtf.Word] = wtf.Freq
t.Total += wtf.Freq
func (j *Jieba) AddWord(wtf *WordTagFreq) {
j.Freq[wtf.Word] = wtf.Freq
j.Total += wtf.Freq
runes := []rune(wtf.Word)
count := len(runes)
for i := 0; i < count; i++ {
wfrag := string(runes[0 : i+1])
if _, ok := t.Freq[wfrag]; !ok {
t.Freq[wfrag] = 0.0
if _, ok := j.Freq[wfrag]; !ok {
j.Freq[wfrag] = 0.0
}
}
}
// Load user specified dictionary file.
func LoadUserDict(dictFilePath string) error {
func (j *Jieba) LoadUserDict(dictFilePath string) error {
wtfs, err := ParseDictFile(dictFilePath)
if err != nil {
return err
}
for _, wtf := range wtfs {
if len(wtf.Tag) > 0 {
UserWordTagTab[wtf.Word] = strings.TrimSpace(wtf.Tag)
}
Trie.addWord(wtf)
j.AddWord(wtf)
}
return nil
}
@@ -120,7 +113,8 @@ func LoadUserDict(dictFilePath string) error {
// Set the dictionary, could be absolute path of dictionary file, or dictionary
// name in current directory. This function must be called before cutting any
// sentence.
func SetDictionary(dictFileName string) error {
Trie = &trie{Total: 0.0, Freq: make(map[string]float64)}
return Trie.load(dictFileName)
func NewJieba(dictFileName string) (*Jieba, error) {
j := &Jieba{Total: 0.0, Freq: make(map[string]float64)}
err := j.load(dictFileName)
return j, err
}