mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 00:32:51 +08:00
refactor posseg, added Posseg struct
This commit is contained in:
15
dictionary.go
Normal file
15
dictionary.go
Normal file
@@ -0,0 +1,15 @@
|
||||
package jiebago
|
||||
|
||||
// Pair groups a Word with its Flag string.
// NOTE(review): Flag is presumably a part-of-speech tag — confirm against callers.
type Pair struct {
|
||||
Word string
|
||||
Flag string
|
||||
}
|
||||
|
||||
// Token embeds *Pair (so Word and Flag are promoted fields) and adds a Freq
// value.
// NOTE(review): Freq is presumably the word's dictionary frequency — confirm.
type Token struct {
|
||||
*Pair
|
||||
Freq float64
|
||||
}
|
||||
|
||||
// DictLoader is satisfied by any type that can accept a *Token through Add.
// NOTE(review): presumably implemented by types that build themselves from a
// dictionary file, one entry at a time — confirm against implementers.
type DictLoader interface {
|
||||
Add(*Token)
|
||||
}
|
||||
46
jieba.go
46
jieba.go
@@ -49,7 +49,7 @@ func (rs routes) Swap(i, j int) {
|
||||
}
|
||||
|
||||
// Build a directed acyclic graph (DAG) for sentence.
|
||||
func DAG(sentence string) map[int][]int {
|
||||
func (j *Jieba) DAG(sentence string) map[int][]int {
|
||||
dag := make(map[int][]int)
|
||||
runes := []rune(sentence)
|
||||
n := len(runes)
|
||||
@@ -60,7 +60,7 @@ func DAG(sentence string) map[int][]int {
|
||||
i = k
|
||||
frag = string(runes[k])
|
||||
for {
|
||||
if freq, ok := Trie.Freq[frag]; !ok {
|
||||
if freq, ok := j.Freq[frag]; !ok {
|
||||
break
|
||||
} else {
|
||||
if freq > 0.0 {
|
||||
@@ -81,19 +81,19 @@ func DAG(sentence string) map[int][]int {
|
||||
return dag
|
||||
}
|
||||
|
||||
func Calc(sentence string, dag map[int][]int) map[int]*route {
|
||||
func (j *Jieba) Calc(sentence string, dag map[int][]int) map[int]*route {
|
||||
runes := []rune(sentence)
|
||||
number := len(runes)
|
||||
rs := make(map[int]*route)
|
||||
rs[number] = &route{Freq: 0.0, Index: 0}
|
||||
logTotal := math.Log(Trie.Total)
|
||||
logTotal := math.Log(j.Total)
|
||||
for idx := number - 1; idx >= 0; idx-- {
|
||||
candidates := make(routes, 0)
|
||||
for _, i := range dag[idx] {
|
||||
word := string(runes[idx : i+1])
|
||||
var r *route
|
||||
if _, ok := Trie.Freq[word]; ok {
|
||||
r = &route{Freq: math.Log(Trie.Freq[word]) - logTotal + rs[i+1].Freq, Index: i}
|
||||
if _, ok := j.Freq[word]; ok {
|
||||
r = &route{Freq: math.Log(j.Freq[word]) - logTotal + rs[i+1].Freq, Index: i}
|
||||
} else {
|
||||
r = &route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i}
|
||||
}
|
||||
@@ -107,11 +107,11 @@ func Calc(sentence string, dag map[int][]int) map[int]*route {
|
||||
|
||||
type cutFunc func(sentence string) chan string
|
||||
|
||||
func cutDAG(sentence string) chan string {
|
||||
func (j *Jieba) cutDAG(sentence string) chan string {
|
||||
result := make(chan string)
|
||||
go func() {
|
||||
dag := DAG(sentence)
|
||||
routes := Calc(sentence, dag)
|
||||
dag := j.DAG(sentence)
|
||||
routes := j.Calc(sentence, dag)
|
||||
x := 0
|
||||
var y int
|
||||
runes := []rune(sentence)
|
||||
@@ -132,7 +132,7 @@ func cutDAG(sentence string) chan string {
|
||||
buf = make([]rune, 0)
|
||||
} else {
|
||||
bufString := string(buf)
|
||||
if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 {
|
||||
if v, ok := j.Freq[bufString]; !ok || v == 0.0 {
|
||||
for x := range finalseg.Cut(bufString) {
|
||||
result <- x
|
||||
}
|
||||
@@ -154,7 +154,7 @@ func cutDAG(sentence string) chan string {
|
||||
result <- string(buf)
|
||||
} else {
|
||||
bufString := string(buf)
|
||||
if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 {
|
||||
if v, ok := j.Freq[bufString]; !ok || v == 0.0 {
|
||||
for t := range finalseg.Cut(bufString) {
|
||||
result <- t
|
||||
}
|
||||
@@ -170,12 +170,12 @@ func cutDAG(sentence string) chan string {
|
||||
return result
|
||||
}
|
||||
|
||||
func cutDAGNoHMM(sentence string) chan string {
|
||||
func (j *Jieba) cutDAGNoHMM(sentence string) chan string {
|
||||
result := make(chan string)
|
||||
|
||||
go func() {
|
||||
dag := DAG(sentence)
|
||||
routes := Calc(sentence, dag)
|
||||
dag := j.DAG(sentence)
|
||||
routes := j.Calc(sentence, dag)
|
||||
x := 0
|
||||
var y int
|
||||
runes := []rune(sentence)
|
||||
@@ -208,12 +208,12 @@ func cutDAGNoHMM(sentence string) chan string {
|
||||
return result
|
||||
}
|
||||
|
||||
func cutAll(sentence string) chan string {
|
||||
func (j *Jieba) cutAll(sentence string) chan string {
|
||||
result := make(chan string)
|
||||
|
||||
go func() {
|
||||
runes := []rune(sentence)
|
||||
dag := DAG(sentence)
|
||||
dag := j.DAG(sentence)
|
||||
old_j := -1
|
||||
ks := make([]int, 0)
|
||||
for k := range dag {
|
||||
@@ -251,7 +251,7 @@ which is suitable for text analysis.
|
||||
|
||||
HMM controls whether to use the Hidden Markov Model.
|
||||
*/
|
||||
func Cut(sentence string, isCutAll bool, HMM bool) chan string {
|
||||
func (j *Jieba) Cut(sentence string, isCutAll bool, HMM bool) chan string {
|
||||
result := make(chan string)
|
||||
go func() {
|
||||
var reHan, reSkip *regexp.Regexp
|
||||
@@ -264,12 +264,12 @@ func Cut(sentence string, isCutAll bool, HMM bool) chan string {
|
||||
}
|
||||
var cut cutFunc
|
||||
if HMM {
|
||||
cut = cutDAG
|
||||
cut = j.cutDAG
|
||||
} else {
|
||||
cut = cutDAGNoHMM
|
||||
cut = j.cutDAGNoHMM
|
||||
}
|
||||
if isCutAll {
|
||||
cut = cutAll
|
||||
cut = j.cutAll
|
||||
}
|
||||
for blk := range RegexpSplit(reHan, sentence) {
|
||||
if len(blk) == 0 {
|
||||
@@ -320,17 +320,17 @@ func Cut(sentence string, isCutAll bool, HMM bool) chan string {
|
||||
// Cut sentence using Search Engine Mode, based on the Accurate Mode, attempts
|
||||
// to cut long words into several short words, which can raise the recall rate.
|
||||
// Suitable for search engines.
|
||||
func CutForSearch(sentence string, hmm bool) chan string {
|
||||
func (j *Jieba) CutForSearch(sentence string, hmm bool) chan string {
|
||||
result := make(chan string)
|
||||
go func() {
|
||||
for word := range Cut(sentence, false, hmm) {
|
||||
for word := range j.Cut(sentence, false, hmm) {
|
||||
runes := []rune(word)
|
||||
for _, increment := range []int{2, 3} {
|
||||
if len(runes) > increment {
|
||||
var gram2 string
|
||||
for i := 0; i < len(runes)-increment+1; i++ {
|
||||
gram2 = string(runes[i : i+increment])
|
||||
if v, ok := Trie.Freq[gram2]; ok && v > 0.0 {
|
||||
if v, ok := j.Freq[gram2]; ok && v > 0.0 {
|
||||
result <- gram2
|
||||
}
|
||||
}
|
||||
|
||||
@@ -617,10 +617,6 @@ var (
|
||||
}
|
||||
)
|
||||
|
||||
func init() {
|
||||
SetDictionary("dict.txt")
|
||||
}
|
||||
|
||||
func chanToArray(ch chan string) []string {
|
||||
result := make([]string, 0)
|
||||
for word := range ch {
|
||||
@@ -630,14 +626,18 @@ func chanToArray(ch chan string) []string {
|
||||
}
|
||||
|
||||
func TestCutDAG(t *testing.T) {
|
||||
result := chanToArray(cutDAG("BP神经网络如何训练才能在分类时增加区分度?"))
|
||||
j, _ := NewJieba("dict.txt")
|
||||
|
||||
result := chanToArray(j.cutDAG("BP神经网络如何训练才能在分类时增加区分度?"))
|
||||
if len(result) != 11 {
|
||||
t.Error(result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCutDAGNoHmm(t *testing.T) {
|
||||
result := chanToArray(cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度?"))
|
||||
j, _ := NewJieba("dict.txt")
|
||||
|
||||
result := chanToArray(j.cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度?"))
|
||||
if len(result) != 11 {
|
||||
t.Error(result)
|
||||
}
|
||||
@@ -657,9 +657,11 @@ func TestRegexpSplit(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestDefaultCut(t *testing.T) {
|
||||
j, _ := NewJieba("dict.txt")
|
||||
|
||||
var result []string
|
||||
for index, content := range test_contents {
|
||||
result = chanToArray(Cut(content, false, true))
|
||||
result = chanToArray(j.Cut(content, false, true))
|
||||
if len(result) != len(defaultCutResult[index]) {
|
||||
t.Errorf("default cut for %s length should be %d not %d\n",
|
||||
content, len(defaultCutResult[index]), len(result))
|
||||
@@ -673,9 +675,11 @@ func TestDefaultCut(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestCutAll(t *testing.T) {
|
||||
j, _ := NewJieba("dict.txt")
|
||||
|
||||
var result []string
|
||||
for index, content := range test_contents {
|
||||
result = chanToArray(Cut(content, true, true))
|
||||
result = chanToArray(j.Cut(content, true, true))
|
||||
if len(result) != len(cutAllResult[index]) {
|
||||
t.Errorf("cut all for %s length should be %d not %d\n",
|
||||
content, len(cutAllResult[index]), len(result))
|
||||
@@ -689,9 +693,11 @@ func TestCutAll(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestDefaultCutNoHMM(t *testing.T) {
|
||||
j, _ := NewJieba("dict.txt")
|
||||
|
||||
var result []string
|
||||
for index, content := range test_contents {
|
||||
result = chanToArray(Cut(content, false, false))
|
||||
result = chanToArray(j.Cut(content, false, false))
|
||||
if len(result) != len(defaultCutNoHMMResult[index]) {
|
||||
t.Errorf("default cut no hmm for %s length should be %d not %d\n",
|
||||
content, len(defaultCutNoHMMResult[index]), len(result))
|
||||
@@ -705,9 +711,11 @@ func TestDefaultCutNoHMM(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestCutForSearch(t *testing.T) {
|
||||
j, _ := NewJieba("dict.txt")
|
||||
|
||||
var result []string
|
||||
for index, content := range test_contents {
|
||||
result = chanToArray(CutForSearch(content, true))
|
||||
result = chanToArray(j.CutForSearch(content, true))
|
||||
if len(result) != len(cutForSearchResult[index]) {
|
||||
t.Errorf("cut for search for %s length should be %d not %d\n",
|
||||
content, len(cutForSearchResult[index]), len(result))
|
||||
@@ -719,7 +727,7 @@ func TestCutForSearch(t *testing.T) {
|
||||
}
|
||||
}
|
||||
for index, content := range test_contents {
|
||||
result = chanToArray(CutForSearch(content, false))
|
||||
result = chanToArray(j.CutForSearch(content, false))
|
||||
if len(result) != len(cutForSearchNoHMMResult[index]) {
|
||||
t.Errorf("cut for search no hmm for %s length should be %d not %d\n",
|
||||
content, len(cutForSearchNoHMMResult[index]), len(result))
|
||||
@@ -734,9 +742,9 @@ func TestCutForSearch(t *testing.T) {
|
||||
|
||||
func TestSetdictionary(t *testing.T) {
|
||||
var result []string
|
||||
SetDictionary("foobar.txt")
|
||||
j, _ := NewJieba("foobar.txt")
|
||||
for index, content := range test_contents {
|
||||
result = chanToArray(Cut(content, false, true))
|
||||
result = chanToArray(j.Cut(content, false, true))
|
||||
if len(result) != len(userDictCutResult[index]) {
|
||||
t.Errorf("default cut with user dictionary for %s length should be %d not %d\n",
|
||||
content, len(userDictCutResult[index]), len(result))
|
||||
@@ -750,13 +758,13 @@ func TestSetdictionary(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestLoadUserDict(t *testing.T) {
|
||||
SetDictionary("dict.txt")
|
||||
LoadUserDict("userdict.txt")
|
||||
j, _ := NewJieba("dict.txt")
|
||||
j.LoadUserDict("userdict.txt")
|
||||
|
||||
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
||||
result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", ",", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"}
|
||||
|
||||
words := chanToArray(Cut(sentence, false, true))
|
||||
words := chanToArray(j.Cut(sentence, false, true))
|
||||
if len(words) != len(result) {
|
||||
t.Error(len(words))
|
||||
}
|
||||
@@ -768,7 +776,7 @@ func TestLoadUserDict(t *testing.T) {
|
||||
|
||||
sentence = "easy_install is great"
|
||||
result = []string{"easy_install", " ", "is", " ", "great"}
|
||||
words = chanToArray(Cut(sentence, false, true))
|
||||
words = chanToArray(j.Cut(sentence, false, true))
|
||||
if len(words) != len(result) {
|
||||
t.Error(len(words))
|
||||
}
|
||||
@@ -780,7 +788,7 @@ func TestLoadUserDict(t *testing.T) {
|
||||
|
||||
sentence = "python 的正则表达式是好用的"
|
||||
result = []string{"python", " ", "的", "正则表达式", "是", "好用", "的"}
|
||||
words = chanToArray(Cut(sentence, false, true))
|
||||
words = chanToArray(j.Cut(sentence, false, true))
|
||||
if len(words) != len(result) {
|
||||
t.Error(words)
|
||||
t.Error(result)
|
||||
|
||||
@@ -3,10 +3,10 @@ package posseg
|
||||
import (
|
||||
"github.com/wangbin/jiebago"
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var (
|
||||
wordTagMap = make(map[string]string)
|
||||
reHanDetail = regexp.MustCompile(`\p{Han}+`)
|
||||
reSkipDetail = regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`)
|
||||
reEng = regexp.MustCompile(`[[:alnum:]]`)
|
||||
@@ -20,26 +20,48 @@ type WordTag struct {
|
||||
Word, Tag string
|
||||
}
|
||||
|
||||
// Posseg embeds *jiebago.Jieba, so the Jieba's fields and methods are promoted
// onto it, and additionally keeps Flag, a map from a word to its tag string.
type Posseg struct {
|
||||
*jiebago.Jieba
|
||||
Flag map[string]string
|
||||
}
|
||||
|
||||
// Add registers a dictionary entry with the Posseg. When the entry carries a
// non-empty tag, the tag (with surrounding whitespace trimmed) is stored in
// p.Flag under the entry's word. The entry is then always passed on to AddWord,
// regardless of whether a tag was recorded.
func (p *Posseg) Add(wtf *jiebago.WordTagFreq) {
|
||||
if len(wtf.Tag) > 0 {
|
||||
p.Flag[wtf.Word] = strings.TrimSpace(wtf.Tag)
|
||||
}
|
||||
p.AddWord(wtf)
|
||||
}
|
||||
|
||||
// Set dictionary, it could be absolute path of dictionary file, or dictionary
|
||||
// name in current directory.
|
||||
func SetDictionary(dictFileName string) error {
|
||||
err := jiebago.SetDictionary(dictFileName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
func NewPosseg(dictFileName string) (*Posseg, error) {
|
||||
j := &jiebago.Jieba{Total: 0.0, Freq: make(map[string]float64)}
|
||||
p := &Posseg{j, make(map[string]string)}
|
||||
dictFilePath, err := jiebago.DictPath(dictFileName)
|
||||
if err != nil {
|
||||
return err
|
||||
return nil, err
|
||||
}
|
||||
wtfs, err := jiebago.ParseDictFile(dictFilePath)
|
||||
|
||||
for _, wtf := range wtfs {
|
||||
wordTagMap[wtf.Word] = wtf.Tag
|
||||
p.Add(wtf)
|
||||
}
|
||||
return p, nil
|
||||
}
|
||||
|
||||
// LoadUserDict parses the user-specified dictionary file at dictFilePath with
// jiebago.ParseDictFile and feeds every parsed entry to p.Add. If parsing
// fails, the error is returned and no entry is added; otherwise it returns nil.
|
||||
func (p *Posseg) LoadUserDict(dictFilePath string) error {
|
||||
wtfs, err := jiebago.ParseDictFile(dictFilePath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, wtf := range wtfs {
|
||||
p.Add(wtf)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func cutDetailInternal(sentence string) chan WordTag {
|
||||
func (p *Posseg) cutDetailInternal(sentence string) chan WordTag {
|
||||
result := make(chan WordTag)
|
||||
|
||||
go func() {
|
||||
@@ -68,13 +90,13 @@ func cutDetailInternal(sentence string) chan WordTag {
|
||||
return result
|
||||
}
|
||||
|
||||
func cutDetail(sentence string) chan WordTag {
|
||||
func (p *Posseg) cutDetail(sentence string) chan WordTag {
|
||||
result := make(chan WordTag)
|
||||
|
||||
go func() {
|
||||
for blk := range jiebago.RegexpSplit(reHanDetail, sentence) {
|
||||
if reHanDetail.MatchString(blk) {
|
||||
for wordTag := range cutDetailInternal(blk) {
|
||||
for wordTag := range p.cutDetailInternal(blk) {
|
||||
result <- wordTag
|
||||
}
|
||||
} else {
|
||||
@@ -100,12 +122,12 @@ func cutDetail(sentence string) chan WordTag {
|
||||
|
||||
type cutFunc func(sentence string) chan WordTag
|
||||
|
||||
func cutDAG(sentence string) chan WordTag {
|
||||
func (p *Posseg) cutDAG(sentence string) chan WordTag {
|
||||
result := make(chan WordTag)
|
||||
|
||||
go func() {
|
||||
dag := jiebago.DAG(sentence)
|
||||
routes := jiebago.Calc(sentence, dag)
|
||||
dag := p.DAG(sentence)
|
||||
routes := p.Calc(sentence, dag)
|
||||
x := 0
|
||||
var y int
|
||||
runes := []rune(sentence)
|
||||
@@ -123,7 +145,7 @@ func cutDAG(sentence string) chan WordTag {
|
||||
if len(buf) > 0 {
|
||||
if len(buf) == 1 {
|
||||
sbuf := string(buf)
|
||||
if tag, ok := wordTagMap[sbuf]; ok {
|
||||
if tag, ok := p.Flag[sbuf]; ok {
|
||||
result <- WordTag{sbuf, tag}
|
||||
} else {
|
||||
result <- WordTag{sbuf, "x"}
|
||||
@@ -131,14 +153,14 @@ func cutDAG(sentence string) chan WordTag {
|
||||
buf = make([]rune, 0)
|
||||
} else {
|
||||
bufString := string(buf)
|
||||
if v, ok := jiebago.Trie.Freq[bufString]; !ok || v == 0.0 {
|
||||
for t := range cutDetail(bufString) {
|
||||
if v, ok := p.Freq[bufString]; !ok || v == 0.0 {
|
||||
for t := range p.cutDetail(bufString) {
|
||||
result <- t
|
||||
}
|
||||
} else {
|
||||
for _, elem := range buf {
|
||||
selem := string(elem)
|
||||
if tag, ok := wordTagMap[selem]; ok {
|
||||
if tag, ok := p.Flag[selem]; ok {
|
||||
result <- WordTag{string(elem), tag}
|
||||
} else {
|
||||
result <- WordTag{string(elem), "x"}
|
||||
@@ -150,7 +172,7 @@ func cutDAG(sentence string) chan WordTag {
|
||||
}
|
||||
}
|
||||
sl_word := string(l_word)
|
||||
if tag, ok := wordTagMap[sl_word]; ok {
|
||||
if tag, ok := p.Flag[sl_word]; ok {
|
||||
result <- WordTag{sl_word, tag}
|
||||
} else {
|
||||
result <- WordTag{sl_word, "x"}
|
||||
@@ -162,21 +184,21 @@ func cutDAG(sentence string) chan WordTag {
|
||||
if len(buf) > 0 {
|
||||
if len(buf) == 1 {
|
||||
sbuf := string(buf)
|
||||
if tag, ok := wordTagMap[sbuf]; ok {
|
||||
if tag, ok := p.Flag[sbuf]; ok {
|
||||
result <- WordTag{sbuf, tag}
|
||||
} else {
|
||||
result <- WordTag{sbuf, "x"}
|
||||
}
|
||||
} else {
|
||||
bufString := string(buf)
|
||||
if v, ok := jiebago.Trie.Freq[bufString]; !ok || v == 0.0 {
|
||||
for t := range cutDetail(bufString) {
|
||||
if v, ok := p.Freq[bufString]; !ok || v == 0.0 {
|
||||
for t := range p.cutDetail(bufString) {
|
||||
result <- t
|
||||
}
|
||||
} else {
|
||||
for _, elem := range buf {
|
||||
selem := string(elem)
|
||||
if tag, ok := wordTagMap[selem]; ok {
|
||||
if tag, ok := p.Flag[selem]; ok {
|
||||
result <- WordTag{selem, tag}
|
||||
} else {
|
||||
result <- WordTag{selem, "x"}
|
||||
@@ -190,12 +212,12 @@ func cutDAG(sentence string) chan WordTag {
|
||||
return result
|
||||
}
|
||||
|
||||
func cutDAGNoHMM(sentence string) chan WordTag {
|
||||
func (p *Posseg) cutDAGNoHMM(sentence string) chan WordTag {
|
||||
result := make(chan WordTag)
|
||||
|
||||
go func() {
|
||||
dag := jiebago.DAG(sentence)
|
||||
routes := jiebago.Calc(sentence, dag)
|
||||
dag := p.DAG(sentence)
|
||||
routes := p.Calc(sentence, dag)
|
||||
x := 0
|
||||
var y int
|
||||
runes := []rune(sentence)
|
||||
@@ -216,7 +238,7 @@ func cutDAGNoHMM(sentence string) chan WordTag {
|
||||
buf = make([]rune, 0)
|
||||
}
|
||||
sl_word := string(l_word)
|
||||
if tag, ok := wordTagMap[sl_word]; ok {
|
||||
if tag, ok := p.Flag[sl_word]; ok {
|
||||
result <- WordTag{sl_word, tag}
|
||||
} else {
|
||||
result <- WordTag{sl_word, "x"}
|
||||
@@ -235,17 +257,13 @@ func cutDAGNoHMM(sentence string) chan WordTag {
|
||||
|
||||
// Tags the POS of each word after segmentation, using labels compatible with
|
||||
// ictclas.
|
||||
func Cut(sentence string, HMM bool) chan WordTag {
|
||||
for key := range jiebago.UserWordTagTab {
|
||||
wordTagMap[key] = jiebago.UserWordTagTab[key]
|
||||
delete(jiebago.UserWordTagTab, key)
|
||||
}
|
||||
func (p *Posseg) Cut(sentence string, HMM bool) chan WordTag {
|
||||
result := make(chan WordTag)
|
||||
var cut cutFunc
|
||||
if HMM {
|
||||
cut = cutDAG
|
||||
cut = p.cutDAG
|
||||
} else {
|
||||
cut = cutDAGNoHMM
|
||||
cut = p.cutDAGNoHMM
|
||||
}
|
||||
go func() {
|
||||
for blk := range jiebago.RegexpSplit(reHanInternal, sentence) {
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
package posseg
|
||||
|
||||
import (
|
||||
"github.com/wangbin/jiebago"
|
||||
"testing"
|
||||
)
|
||||
|
||||
@@ -277,18 +276,21 @@ func chanToArray(ch chan WordTag) []WordTag {
|
||||
}
|
||||
|
||||
func TestCut(t *testing.T) {
|
||||
SetDictionary("../dict.txt")
|
||||
p, err := NewPosseg("../dict.txt")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for index, content := range test_contents {
|
||||
result := chanToArray(Cut(content, true))
|
||||
result := chanToArray(p.Cut(content, true))
|
||||
if len(defaultCutResult[index]) != len(result) {
|
||||
t.Error(content)
|
||||
}
|
||||
for i, _ := range result {
|
||||
if result[i] != defaultCutResult[index][i] {
|
||||
t.Error(content)
|
||||
t.Errorf("expect %s, got %s", defaultCutResult[index][i], result[i])
|
||||
}
|
||||
}
|
||||
result = chanToArray(Cut(content, false))
|
||||
result = chanToArray(p.Cut(content, false))
|
||||
if len(noHMMCutResult[index]) != len(result) {
|
||||
t.Error(content)
|
||||
}
|
||||
@@ -305,7 +307,7 @@ func TestBug132(t *testing.T) {
|
||||
/*
|
||||
https://github.com/fxsjy/jieba/issues/132
|
||||
*/
|
||||
SetDictionary("../dict.txt")
|
||||
p, _ := NewPosseg("../dict.txt")
|
||||
sentence := "又跛又啞"
|
||||
cutResult := []WordTag{
|
||||
WordTag{"又", "d"},
|
||||
@@ -313,7 +315,7 @@ func TestBug132(t *testing.T) {
|
||||
WordTag{"又", "d"},
|
||||
WordTag{"啞", "v"},
|
||||
}
|
||||
result := chanToArray(Cut(sentence, true))
|
||||
result := chanToArray(p.Cut(sentence, true))
|
||||
if len(cutResult) != len(result) {
|
||||
t.Error(result)
|
||||
}
|
||||
@@ -328,7 +330,7 @@ func TestBug137(t *testing.T) {
|
||||
/*
|
||||
https://github.com/fxsjy/jieba/issues/137
|
||||
*/
|
||||
SetDictionary("../dict.txt")
|
||||
p, _ := NewPosseg("../dict.txt")
|
||||
sentence := "前港督衛奕信在八八年十月宣布成立中央政策研究組"
|
||||
cutResult := []WordTag{
|
||||
WordTag{"前", "f"},
|
||||
@@ -345,7 +347,7 @@ func TestBug137(t *testing.T) {
|
||||
WordTag{"研究", "vn"},
|
||||
WordTag{"組", "x"},
|
||||
}
|
||||
result := chanToArray(Cut(sentence, true))
|
||||
result := chanToArray(p.Cut(sentence, true))
|
||||
if len(cutResult) != len(result) {
|
||||
t.Error(result)
|
||||
}
|
||||
@@ -357,8 +359,8 @@ func TestBug137(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestUserDict(t *testing.T) {
|
||||
SetDictionary("../dict.txt")
|
||||
jiebago.LoadUserDict("../userdict.txt")
|
||||
p, _ := NewPosseg("../dict.txt")
|
||||
p.LoadUserDict("../userdict.txt")
|
||||
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
||||
|
||||
cutResult := []WordTag{
|
||||
@@ -400,7 +402,7 @@ func TestUserDict(t *testing.T) {
|
||||
WordTag{"N", "eng"},
|
||||
WordTag{"类型", "n"}}
|
||||
|
||||
result := chanToArray(Cut(sentence, true))
|
||||
result := chanToArray(p.Cut(sentence, true))
|
||||
if len(cutResult) != len(result) {
|
||||
t.Error(result)
|
||||
}
|
||||
|
||||
38
trie.go
38
trie.go
@@ -7,18 +7,14 @@ import (
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Trie stores the total frequency and a map of all words and their frequencies.
|
||||
var Trie *trie
|
||||
|
||||
type trie struct {
|
||||
type Jieba struct {
|
||||
Total float64
|
||||
Freq map[string]float64
|
||||
}
|
||||
|
||||
func (t *trie) load(dictFileName string) error {
|
||||
func (j *Jieba) load(dictFileName string) error {
|
||||
dictFilePath, err := DictPath(dictFileName)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -55,7 +51,7 @@ func (t *trie) load(dictFileName string) error {
|
||||
|
||||
if isDictCached {
|
||||
dec := gob.NewDecoder(cacheFile)
|
||||
err = dec.Decode(&t)
|
||||
err = dec.Decode(&j)
|
||||
if err != nil {
|
||||
isDictCached = false
|
||||
} else {
|
||||
@@ -70,7 +66,7 @@ func (t *trie) load(dictFileName string) error {
|
||||
}
|
||||
|
||||
for _, wtf := range wtfs {
|
||||
t.addWord(wtf)
|
||||
j.AddWord(wtf)
|
||||
}
|
||||
// dump trie
|
||||
cacheFile, err = os.OpenFile(cacheFilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
|
||||
@@ -79,7 +75,7 @@ func (t *trie) load(dictFileName string) error {
|
||||
}
|
||||
defer cacheFile.Close()
|
||||
enc := gob.NewEncoder(cacheFile)
|
||||
err = enc.Encode(t)
|
||||
err = enc.Encode(j)
|
||||
if err != nil {
|
||||
return err
|
||||
} else {
|
||||
@@ -89,30 +85,27 @@ func (t *trie) load(dictFileName string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (t *trie) addWord(wtf *WordTagFreq) {
|
||||
t.Freq[wtf.Word] = wtf.Freq
|
||||
t.Total += wtf.Freq
|
||||
func (j *Jieba) AddWord(wtf *WordTagFreq) {
|
||||
j.Freq[wtf.Word] = wtf.Freq
|
||||
j.Total += wtf.Freq
|
||||
runes := []rune(wtf.Word)
|
||||
count := len(runes)
|
||||
for i := 0; i < count; i++ {
|
||||
wfrag := string(runes[0 : i+1])
|
||||
if _, ok := t.Freq[wfrag]; !ok {
|
||||
t.Freq[wfrag] = 0.0
|
||||
if _, ok := j.Freq[wfrag]; !ok {
|
||||
j.Freq[wfrag] = 0.0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Load user specified dictionary file.
|
||||
func LoadUserDict(dictFilePath string) error {
|
||||
func (j *Jieba) LoadUserDict(dictFilePath string) error {
|
||||
wtfs, err := ParseDictFile(dictFilePath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, wtf := range wtfs {
|
||||
if len(wtf.Tag) > 0 {
|
||||
UserWordTagTab[wtf.Word] = strings.TrimSpace(wtf.Tag)
|
||||
}
|
||||
Trie.addWord(wtf)
|
||||
j.AddWord(wtf)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -120,7 +113,8 @@ func LoadUserDict(dictFilePath string) error {
|
||||
// Set the dictionary, could be absolute path of dictionary file, or dictionary
|
||||
// name in current directory. This function must be called before cutting any
|
||||
// sentence.
|
||||
func SetDictionary(dictFileName string) error {
|
||||
Trie = &trie{Total: 0.0, Freq: make(map[string]float64)}
|
||||
return Trie.load(dictFileName)
|
||||
// NewJieba allocates a Jieba with a zero Total and an empty Freq map, then
// loads the dictionary named by dictFileName into it via load.
//
// The Jieba pointer is returned together with whatever error load produced;
// note that the pointer is non-nil even when the error is non-nil, so callers
// should check the error before using the result.
func NewJieba(dictFileName string) (*Jieba, error) {
|
||||
j := &Jieba{Total: 0.0, Freq: make(map[string]float64)}
|
||||
err := j.load(dictFileName)
|
||||
return j, err
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user