mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-27 07:30:32 +08:00
refactor posseg, added Posseg struct
This commit is contained in:
15
dictionary.go
Normal file
15
dictionary.go
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
package jiebago
|
||||||
|
|
||||||
|
type Pair struct {
|
||||||
|
Word string
|
||||||
|
Flag string
|
||||||
|
}
|
||||||
|
|
||||||
|
type Token struct {
|
||||||
|
*Pair
|
||||||
|
Freq float64
|
||||||
|
}
|
||||||
|
|
||||||
|
type DictLoader interface {
|
||||||
|
Add(*Token)
|
||||||
|
}
|
||||||
46
jieba.go
46
jieba.go
@@ -49,7 +49,7 @@ func (rs routes) Swap(i, j int) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Build a directed acyclic graph (DAG) for sentence.
|
// Build a directed acyclic graph (DAG) for sentence.
|
||||||
func DAG(sentence string) map[int][]int {
|
func (j *Jieba) DAG(sentence string) map[int][]int {
|
||||||
dag := make(map[int][]int)
|
dag := make(map[int][]int)
|
||||||
runes := []rune(sentence)
|
runes := []rune(sentence)
|
||||||
n := len(runes)
|
n := len(runes)
|
||||||
@@ -60,7 +60,7 @@ func DAG(sentence string) map[int][]int {
|
|||||||
i = k
|
i = k
|
||||||
frag = string(runes[k])
|
frag = string(runes[k])
|
||||||
for {
|
for {
|
||||||
if freq, ok := Trie.Freq[frag]; !ok {
|
if freq, ok := j.Freq[frag]; !ok {
|
||||||
break
|
break
|
||||||
} else {
|
} else {
|
||||||
if freq > 0.0 {
|
if freq > 0.0 {
|
||||||
@@ -81,19 +81,19 @@ func DAG(sentence string) map[int][]int {
|
|||||||
return dag
|
return dag
|
||||||
}
|
}
|
||||||
|
|
||||||
func Calc(sentence string, dag map[int][]int) map[int]*route {
|
func (j *Jieba) Calc(sentence string, dag map[int][]int) map[int]*route {
|
||||||
runes := []rune(sentence)
|
runes := []rune(sentence)
|
||||||
number := len(runes)
|
number := len(runes)
|
||||||
rs := make(map[int]*route)
|
rs := make(map[int]*route)
|
||||||
rs[number] = &route{Freq: 0.0, Index: 0}
|
rs[number] = &route{Freq: 0.0, Index: 0}
|
||||||
logTotal := math.Log(Trie.Total)
|
logTotal := math.Log(j.Total)
|
||||||
for idx := number - 1; idx >= 0; idx-- {
|
for idx := number - 1; idx >= 0; idx-- {
|
||||||
candidates := make(routes, 0)
|
candidates := make(routes, 0)
|
||||||
for _, i := range dag[idx] {
|
for _, i := range dag[idx] {
|
||||||
word := string(runes[idx : i+1])
|
word := string(runes[idx : i+1])
|
||||||
var r *route
|
var r *route
|
||||||
if _, ok := Trie.Freq[word]; ok {
|
if _, ok := j.Freq[word]; ok {
|
||||||
r = &route{Freq: math.Log(Trie.Freq[word]) - logTotal + rs[i+1].Freq, Index: i}
|
r = &route{Freq: math.Log(j.Freq[word]) - logTotal + rs[i+1].Freq, Index: i}
|
||||||
} else {
|
} else {
|
||||||
r = &route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i}
|
r = &route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i}
|
||||||
}
|
}
|
||||||
@@ -107,11 +107,11 @@ func Calc(sentence string, dag map[int][]int) map[int]*route {
|
|||||||
|
|
||||||
type cutFunc func(sentence string) chan string
|
type cutFunc func(sentence string) chan string
|
||||||
|
|
||||||
func cutDAG(sentence string) chan string {
|
func (j *Jieba) cutDAG(sentence string) chan string {
|
||||||
result := make(chan string)
|
result := make(chan string)
|
||||||
go func() {
|
go func() {
|
||||||
dag := DAG(sentence)
|
dag := j.DAG(sentence)
|
||||||
routes := Calc(sentence, dag)
|
routes := j.Calc(sentence, dag)
|
||||||
x := 0
|
x := 0
|
||||||
var y int
|
var y int
|
||||||
runes := []rune(sentence)
|
runes := []rune(sentence)
|
||||||
@@ -132,7 +132,7 @@ func cutDAG(sentence string) chan string {
|
|||||||
buf = make([]rune, 0)
|
buf = make([]rune, 0)
|
||||||
} else {
|
} else {
|
||||||
bufString := string(buf)
|
bufString := string(buf)
|
||||||
if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 {
|
if v, ok := j.Freq[bufString]; !ok || v == 0.0 {
|
||||||
for x := range finalseg.Cut(bufString) {
|
for x := range finalseg.Cut(bufString) {
|
||||||
result <- x
|
result <- x
|
||||||
}
|
}
|
||||||
@@ -154,7 +154,7 @@ func cutDAG(sentence string) chan string {
|
|||||||
result <- string(buf)
|
result <- string(buf)
|
||||||
} else {
|
} else {
|
||||||
bufString := string(buf)
|
bufString := string(buf)
|
||||||
if v, ok := Trie.Freq[bufString]; !ok || v == 0.0 {
|
if v, ok := j.Freq[bufString]; !ok || v == 0.0 {
|
||||||
for t := range finalseg.Cut(bufString) {
|
for t := range finalseg.Cut(bufString) {
|
||||||
result <- t
|
result <- t
|
||||||
}
|
}
|
||||||
@@ -170,12 +170,12 @@ func cutDAG(sentence string) chan string {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
func cutDAGNoHMM(sentence string) chan string {
|
func (j *Jieba) cutDAGNoHMM(sentence string) chan string {
|
||||||
result := make(chan string)
|
result := make(chan string)
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
dag := DAG(sentence)
|
dag := j.DAG(sentence)
|
||||||
routes := Calc(sentence, dag)
|
routes := j.Calc(sentence, dag)
|
||||||
x := 0
|
x := 0
|
||||||
var y int
|
var y int
|
||||||
runes := []rune(sentence)
|
runes := []rune(sentence)
|
||||||
@@ -208,12 +208,12 @@ func cutDAGNoHMM(sentence string) chan string {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
func cutAll(sentence string) chan string {
|
func (j *Jieba) cutAll(sentence string) chan string {
|
||||||
result := make(chan string)
|
result := make(chan string)
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
runes := []rune(sentence)
|
runes := []rune(sentence)
|
||||||
dag := DAG(sentence)
|
dag := j.DAG(sentence)
|
||||||
old_j := -1
|
old_j := -1
|
||||||
ks := make([]int, 0)
|
ks := make([]int, 0)
|
||||||
for k := range dag {
|
for k := range dag {
|
||||||
@@ -251,7 +251,7 @@ which is suitable for text analysis.
|
|||||||
|
|
||||||
HMM controls whether to use the Hidden Markov Model.
|
HMM controls whether to use the Hidden Markov Model.
|
||||||
*/
|
*/
|
||||||
func Cut(sentence string, isCutAll bool, HMM bool) chan string {
|
func (j *Jieba) Cut(sentence string, isCutAll bool, HMM bool) chan string {
|
||||||
result := make(chan string)
|
result := make(chan string)
|
||||||
go func() {
|
go func() {
|
||||||
var reHan, reSkip *regexp.Regexp
|
var reHan, reSkip *regexp.Regexp
|
||||||
@@ -264,12 +264,12 @@ func Cut(sentence string, isCutAll bool, HMM bool) chan string {
|
|||||||
}
|
}
|
||||||
var cut cutFunc
|
var cut cutFunc
|
||||||
if HMM {
|
if HMM {
|
||||||
cut = cutDAG
|
cut = j.cutDAG
|
||||||
} else {
|
} else {
|
||||||
cut = cutDAGNoHMM
|
cut = j.cutDAGNoHMM
|
||||||
}
|
}
|
||||||
if isCutAll {
|
if isCutAll {
|
||||||
cut = cutAll
|
cut = j.cutAll
|
||||||
}
|
}
|
||||||
for blk := range RegexpSplit(reHan, sentence) {
|
for blk := range RegexpSplit(reHan, sentence) {
|
||||||
if len(blk) == 0 {
|
if len(blk) == 0 {
|
||||||
@@ -320,17 +320,17 @@ func Cut(sentence string, isCutAll bool, HMM bool) chan string {
|
|||||||
// Cut sentence using Search Engine Mode, based on the Accurate Mode, attempts
|
// Cut sentence using Search Engine Mode, based on the Accurate Mode, attempts
|
||||||
// to cut long words into several short words, which can raise the recall rate.
|
// to cut long words into several short words, which can raise the recall rate.
|
||||||
// Suitable for search engines.
|
// Suitable for search engines.
|
||||||
func CutForSearch(sentence string, hmm bool) chan string {
|
func (j *Jieba) CutForSearch(sentence string, hmm bool) chan string {
|
||||||
result := make(chan string)
|
result := make(chan string)
|
||||||
go func() {
|
go func() {
|
||||||
for word := range Cut(sentence, false, hmm) {
|
for word := range j.Cut(sentence, false, hmm) {
|
||||||
runes := []rune(word)
|
runes := []rune(word)
|
||||||
for _, increment := range []int{2, 3} {
|
for _, increment := range []int{2, 3} {
|
||||||
if len(runes) > increment {
|
if len(runes) > increment {
|
||||||
var gram2 string
|
var gram2 string
|
||||||
for i := 0; i < len(runes)-increment+1; i++ {
|
for i := 0; i < len(runes)-increment+1; i++ {
|
||||||
gram2 = string(runes[i : i+increment])
|
gram2 = string(runes[i : i+increment])
|
||||||
if v, ok := Trie.Freq[gram2]; ok && v > 0.0 {
|
if v, ok := j.Freq[gram2]; ok && v > 0.0 {
|
||||||
result <- gram2
|
result <- gram2
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -617,10 +617,6 @@ var (
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
|
||||||
SetDictionary("dict.txt")
|
|
||||||
}
|
|
||||||
|
|
||||||
func chanToArray(ch chan string) []string {
|
func chanToArray(ch chan string) []string {
|
||||||
result := make([]string, 0)
|
result := make([]string, 0)
|
||||||
for word := range ch {
|
for word := range ch {
|
||||||
@@ -630,14 +626,18 @@ func chanToArray(ch chan string) []string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestCutDAG(t *testing.T) {
|
func TestCutDAG(t *testing.T) {
|
||||||
result := chanToArray(cutDAG("BP神经网络如何训练才能在分类时增加区分度?"))
|
j, _ := NewJieba("dict.txt")
|
||||||
|
|
||||||
|
result := chanToArray(j.cutDAG("BP神经网络如何训练才能在分类时增加区分度?"))
|
||||||
if len(result) != 11 {
|
if len(result) != 11 {
|
||||||
t.Error(result)
|
t.Error(result)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestCutDAGNoHmm(t *testing.T) {
|
func TestCutDAGNoHmm(t *testing.T) {
|
||||||
result := chanToArray(cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度?"))
|
j, _ := NewJieba("dict.txt")
|
||||||
|
|
||||||
|
result := chanToArray(j.cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度?"))
|
||||||
if len(result) != 11 {
|
if len(result) != 11 {
|
||||||
t.Error(result)
|
t.Error(result)
|
||||||
}
|
}
|
||||||
@@ -657,9 +657,11 @@ func TestRegexpSplit(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestDefaultCut(t *testing.T) {
|
func TestDefaultCut(t *testing.T) {
|
||||||
|
j, _ := NewJieba("dict.txt")
|
||||||
|
|
||||||
var result []string
|
var result []string
|
||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
result = chanToArray(Cut(content, false, true))
|
result = chanToArray(j.Cut(content, false, true))
|
||||||
if len(result) != len(defaultCutResult[index]) {
|
if len(result) != len(defaultCutResult[index]) {
|
||||||
t.Errorf("default cut for %s length should be %d not %d\n",
|
t.Errorf("default cut for %s length should be %d not %d\n",
|
||||||
content, len(defaultCutResult[index]), len(result))
|
content, len(defaultCutResult[index]), len(result))
|
||||||
@@ -673,9 +675,11 @@ func TestDefaultCut(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestCutAll(t *testing.T) {
|
func TestCutAll(t *testing.T) {
|
||||||
|
j, _ := NewJieba("dict.txt")
|
||||||
|
|
||||||
var result []string
|
var result []string
|
||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
result = chanToArray(Cut(content, true, true))
|
result = chanToArray(j.Cut(content, true, true))
|
||||||
if len(result) != len(cutAllResult[index]) {
|
if len(result) != len(cutAllResult[index]) {
|
||||||
t.Errorf("cut all for %s length should be %d not %d\n",
|
t.Errorf("cut all for %s length should be %d not %d\n",
|
||||||
content, len(cutAllResult[index]), len(result))
|
content, len(cutAllResult[index]), len(result))
|
||||||
@@ -689,9 +693,11 @@ func TestCutAll(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestDefaultCutNoHMM(t *testing.T) {
|
func TestDefaultCutNoHMM(t *testing.T) {
|
||||||
|
j, _ := NewJieba("dict.txt")
|
||||||
|
|
||||||
var result []string
|
var result []string
|
||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
result = chanToArray(Cut(content, false, false))
|
result = chanToArray(j.Cut(content, false, false))
|
||||||
if len(result) != len(defaultCutNoHMMResult[index]) {
|
if len(result) != len(defaultCutNoHMMResult[index]) {
|
||||||
t.Errorf("default cut no hmm for %s length should be %d not %d\n",
|
t.Errorf("default cut no hmm for %s length should be %d not %d\n",
|
||||||
content, len(defaultCutNoHMMResult[index]), len(result))
|
content, len(defaultCutNoHMMResult[index]), len(result))
|
||||||
@@ -705,9 +711,11 @@ func TestDefaultCutNoHMM(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestCutForSearch(t *testing.T) {
|
func TestCutForSearch(t *testing.T) {
|
||||||
|
j, _ := NewJieba("dict.txt")
|
||||||
|
|
||||||
var result []string
|
var result []string
|
||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
result = chanToArray(CutForSearch(content, true))
|
result = chanToArray(j.CutForSearch(content, true))
|
||||||
if len(result) != len(cutForSearchResult[index]) {
|
if len(result) != len(cutForSearchResult[index]) {
|
||||||
t.Errorf("cut for search for %s length should be %d not %d\n",
|
t.Errorf("cut for search for %s length should be %d not %d\n",
|
||||||
content, len(cutForSearchResult[index]), len(result))
|
content, len(cutForSearchResult[index]), len(result))
|
||||||
@@ -719,7 +727,7 @@ func TestCutForSearch(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
result = chanToArray(CutForSearch(content, false))
|
result = chanToArray(j.CutForSearch(content, false))
|
||||||
if len(result) != len(cutForSearchNoHMMResult[index]) {
|
if len(result) != len(cutForSearchNoHMMResult[index]) {
|
||||||
t.Errorf("cut for search no hmm for %s length should be %d not %d\n",
|
t.Errorf("cut for search no hmm for %s length should be %d not %d\n",
|
||||||
content, len(cutForSearchNoHMMResult[index]), len(result))
|
content, len(cutForSearchNoHMMResult[index]), len(result))
|
||||||
@@ -734,9 +742,9 @@ func TestCutForSearch(t *testing.T) {
|
|||||||
|
|
||||||
func TestSetdictionary(t *testing.T) {
|
func TestSetdictionary(t *testing.T) {
|
||||||
var result []string
|
var result []string
|
||||||
SetDictionary("foobar.txt")
|
j, _ := NewJieba("foobar.txt")
|
||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
result = chanToArray(Cut(content, false, true))
|
result = chanToArray(j.Cut(content, false, true))
|
||||||
if len(result) != len(userDictCutResult[index]) {
|
if len(result) != len(userDictCutResult[index]) {
|
||||||
t.Errorf("default cut with user dictionary for %s length should be %d not %d\n",
|
t.Errorf("default cut with user dictionary for %s length should be %d not %d\n",
|
||||||
content, len(userDictCutResult[index]), len(result))
|
content, len(userDictCutResult[index]), len(result))
|
||||||
@@ -750,13 +758,13 @@ func TestSetdictionary(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestLoadUserDict(t *testing.T) {
|
func TestLoadUserDict(t *testing.T) {
|
||||||
SetDictionary("dict.txt")
|
j, _ := NewJieba("dict.txt")
|
||||||
LoadUserDict("userdict.txt")
|
j.LoadUserDict("userdict.txt")
|
||||||
|
|
||||||
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
||||||
result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", ",", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"}
|
result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", ",", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"}
|
||||||
|
|
||||||
words := chanToArray(Cut(sentence, false, true))
|
words := chanToArray(j.Cut(sentence, false, true))
|
||||||
if len(words) != len(result) {
|
if len(words) != len(result) {
|
||||||
t.Error(len(words))
|
t.Error(len(words))
|
||||||
}
|
}
|
||||||
@@ -768,7 +776,7 @@ func TestLoadUserDict(t *testing.T) {
|
|||||||
|
|
||||||
sentence = "easy_install is great"
|
sentence = "easy_install is great"
|
||||||
result = []string{"easy_install", " ", "is", " ", "great"}
|
result = []string{"easy_install", " ", "is", " ", "great"}
|
||||||
words = chanToArray(Cut(sentence, false, true))
|
words = chanToArray(j.Cut(sentence, false, true))
|
||||||
if len(words) != len(result) {
|
if len(words) != len(result) {
|
||||||
t.Error(len(words))
|
t.Error(len(words))
|
||||||
}
|
}
|
||||||
@@ -780,7 +788,7 @@ func TestLoadUserDict(t *testing.T) {
|
|||||||
|
|
||||||
sentence = "python 的正则表达式是好用的"
|
sentence = "python 的正则表达式是好用的"
|
||||||
result = []string{"python", " ", "的", "正则表达式", "是", "好用", "的"}
|
result = []string{"python", " ", "的", "正则表达式", "是", "好用", "的"}
|
||||||
words = chanToArray(Cut(sentence, false, true))
|
words = chanToArray(j.Cut(sentence, false, true))
|
||||||
if len(words) != len(result) {
|
if len(words) != len(result) {
|
||||||
t.Error(words)
|
t.Error(words)
|
||||||
t.Error(result)
|
t.Error(result)
|
||||||
|
|||||||
@@ -3,10 +3,10 @@ package posseg
|
|||||||
import (
|
import (
|
||||||
"github.com/wangbin/jiebago"
|
"github.com/wangbin/jiebago"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
wordTagMap = make(map[string]string)
|
|
||||||
reHanDetail = regexp.MustCompile(`\p{Han}+`)
|
reHanDetail = regexp.MustCompile(`\p{Han}+`)
|
||||||
reSkipDetail = regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`)
|
reSkipDetail = regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`)
|
||||||
reEng = regexp.MustCompile(`[[:alnum:]]`)
|
reEng = regexp.MustCompile(`[[:alnum:]]`)
|
||||||
@@ -20,26 +20,48 @@ type WordTag struct {
|
|||||||
Word, Tag string
|
Word, Tag string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type Posseg struct {
|
||||||
|
*jiebago.Jieba
|
||||||
|
Flag map[string]string
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *Posseg) Add(wtf *jiebago.WordTagFreq) {
|
||||||
|
if len(wtf.Tag) > 0 {
|
||||||
|
p.Flag[wtf.Word] = strings.TrimSpace(wtf.Tag)
|
||||||
|
}
|
||||||
|
p.AddWord(wtf)
|
||||||
|
}
|
||||||
|
|
||||||
// Set dictionary, it could be absolute path of dictionary file, or dictionary
|
// Set dictionary, it could be absolute path of dictionary file, or dictionary
|
||||||
// name in current directory.
|
// name in current directory.
|
||||||
func SetDictionary(dictFileName string) error {
|
func NewPosseg(dictFileName string) (*Posseg, error) {
|
||||||
err := jiebago.SetDictionary(dictFileName)
|
j := &jiebago.Jieba{Total: 0.0, Freq: make(map[string]float64)}
|
||||||
if err != nil {
|
p := &Posseg{j, make(map[string]string)}
|
||||||
return err
|
|
||||||
}
|
|
||||||
dictFilePath, err := jiebago.DictPath(dictFileName)
|
dictFilePath, err := jiebago.DictPath(dictFileName)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return nil, err
|
||||||
}
|
}
|
||||||
wtfs, err := jiebago.ParseDictFile(dictFilePath)
|
wtfs, err := jiebago.ParseDictFile(dictFilePath)
|
||||||
|
|
||||||
for _, wtf := range wtfs {
|
for _, wtf := range wtfs {
|
||||||
wordTagMap[wtf.Word] = wtf.Tag
|
p.Add(wtf)
|
||||||
|
}
|
||||||
|
return p, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load user specified dictionary file.
|
||||||
|
func (p *Posseg) LoadUserDict(dictFilePath string) error {
|
||||||
|
wtfs, err := jiebago.ParseDictFile(dictFilePath)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, wtf := range wtfs {
|
||||||
|
p.Add(wtf)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func cutDetailInternal(sentence string) chan WordTag {
|
func (p *Posseg) cutDetailInternal(sentence string) chan WordTag {
|
||||||
result := make(chan WordTag)
|
result := make(chan WordTag)
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
@@ -68,13 +90,13 @@ func cutDetailInternal(sentence string) chan WordTag {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
func cutDetail(sentence string) chan WordTag {
|
func (p *Posseg) cutDetail(sentence string) chan WordTag {
|
||||||
result := make(chan WordTag)
|
result := make(chan WordTag)
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
for blk := range jiebago.RegexpSplit(reHanDetail, sentence) {
|
for blk := range jiebago.RegexpSplit(reHanDetail, sentence) {
|
||||||
if reHanDetail.MatchString(blk) {
|
if reHanDetail.MatchString(blk) {
|
||||||
for wordTag := range cutDetailInternal(blk) {
|
for wordTag := range p.cutDetailInternal(blk) {
|
||||||
result <- wordTag
|
result <- wordTag
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@@ -100,12 +122,12 @@ func cutDetail(sentence string) chan WordTag {
|
|||||||
|
|
||||||
type cutFunc func(sentence string) chan WordTag
|
type cutFunc func(sentence string) chan WordTag
|
||||||
|
|
||||||
func cutDAG(sentence string) chan WordTag {
|
func (p *Posseg) cutDAG(sentence string) chan WordTag {
|
||||||
result := make(chan WordTag)
|
result := make(chan WordTag)
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
dag := jiebago.DAG(sentence)
|
dag := p.DAG(sentence)
|
||||||
routes := jiebago.Calc(sentence, dag)
|
routes := p.Calc(sentence, dag)
|
||||||
x := 0
|
x := 0
|
||||||
var y int
|
var y int
|
||||||
runes := []rune(sentence)
|
runes := []rune(sentence)
|
||||||
@@ -123,7 +145,7 @@ func cutDAG(sentence string) chan WordTag {
|
|||||||
if len(buf) > 0 {
|
if len(buf) > 0 {
|
||||||
if len(buf) == 1 {
|
if len(buf) == 1 {
|
||||||
sbuf := string(buf)
|
sbuf := string(buf)
|
||||||
if tag, ok := wordTagMap[sbuf]; ok {
|
if tag, ok := p.Flag[sbuf]; ok {
|
||||||
result <- WordTag{sbuf, tag}
|
result <- WordTag{sbuf, tag}
|
||||||
} else {
|
} else {
|
||||||
result <- WordTag{sbuf, "x"}
|
result <- WordTag{sbuf, "x"}
|
||||||
@@ -131,14 +153,14 @@ func cutDAG(sentence string) chan WordTag {
|
|||||||
buf = make([]rune, 0)
|
buf = make([]rune, 0)
|
||||||
} else {
|
} else {
|
||||||
bufString := string(buf)
|
bufString := string(buf)
|
||||||
if v, ok := jiebago.Trie.Freq[bufString]; !ok || v == 0.0 {
|
if v, ok := p.Freq[bufString]; !ok || v == 0.0 {
|
||||||
for t := range cutDetail(bufString) {
|
for t := range p.cutDetail(bufString) {
|
||||||
result <- t
|
result <- t
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for _, elem := range buf {
|
for _, elem := range buf {
|
||||||
selem := string(elem)
|
selem := string(elem)
|
||||||
if tag, ok := wordTagMap[selem]; ok {
|
if tag, ok := p.Flag[selem]; ok {
|
||||||
result <- WordTag{string(elem), tag}
|
result <- WordTag{string(elem), tag}
|
||||||
} else {
|
} else {
|
||||||
result <- WordTag{string(elem), "x"}
|
result <- WordTag{string(elem), "x"}
|
||||||
@@ -150,7 +172,7 @@ func cutDAG(sentence string) chan WordTag {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
sl_word := string(l_word)
|
sl_word := string(l_word)
|
||||||
if tag, ok := wordTagMap[sl_word]; ok {
|
if tag, ok := p.Flag[sl_word]; ok {
|
||||||
result <- WordTag{sl_word, tag}
|
result <- WordTag{sl_word, tag}
|
||||||
} else {
|
} else {
|
||||||
result <- WordTag{sl_word, "x"}
|
result <- WordTag{sl_word, "x"}
|
||||||
@@ -162,21 +184,21 @@ func cutDAG(sentence string) chan WordTag {
|
|||||||
if len(buf) > 0 {
|
if len(buf) > 0 {
|
||||||
if len(buf) == 1 {
|
if len(buf) == 1 {
|
||||||
sbuf := string(buf)
|
sbuf := string(buf)
|
||||||
if tag, ok := wordTagMap[sbuf]; ok {
|
if tag, ok := p.Flag[sbuf]; ok {
|
||||||
result <- WordTag{sbuf, tag}
|
result <- WordTag{sbuf, tag}
|
||||||
} else {
|
} else {
|
||||||
result <- WordTag{sbuf, "x"}
|
result <- WordTag{sbuf, "x"}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
bufString := string(buf)
|
bufString := string(buf)
|
||||||
if v, ok := jiebago.Trie.Freq[bufString]; !ok || v == 0.0 {
|
if v, ok := p.Freq[bufString]; !ok || v == 0.0 {
|
||||||
for t := range cutDetail(bufString) {
|
for t := range p.cutDetail(bufString) {
|
||||||
result <- t
|
result <- t
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for _, elem := range buf {
|
for _, elem := range buf {
|
||||||
selem := string(elem)
|
selem := string(elem)
|
||||||
if tag, ok := wordTagMap[selem]; ok {
|
if tag, ok := p.Flag[selem]; ok {
|
||||||
result <- WordTag{selem, tag}
|
result <- WordTag{selem, tag}
|
||||||
} else {
|
} else {
|
||||||
result <- WordTag{selem, "x"}
|
result <- WordTag{selem, "x"}
|
||||||
@@ -190,12 +212,12 @@ func cutDAG(sentence string) chan WordTag {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
func cutDAGNoHMM(sentence string) chan WordTag {
|
func (p *Posseg) cutDAGNoHMM(sentence string) chan WordTag {
|
||||||
result := make(chan WordTag)
|
result := make(chan WordTag)
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
dag := jiebago.DAG(sentence)
|
dag := p.DAG(sentence)
|
||||||
routes := jiebago.Calc(sentence, dag)
|
routes := p.Calc(sentence, dag)
|
||||||
x := 0
|
x := 0
|
||||||
var y int
|
var y int
|
||||||
runes := []rune(sentence)
|
runes := []rune(sentence)
|
||||||
@@ -216,7 +238,7 @@ func cutDAGNoHMM(sentence string) chan WordTag {
|
|||||||
buf = make([]rune, 0)
|
buf = make([]rune, 0)
|
||||||
}
|
}
|
||||||
sl_word := string(l_word)
|
sl_word := string(l_word)
|
||||||
if tag, ok := wordTagMap[sl_word]; ok {
|
if tag, ok := p.Flag[sl_word]; ok {
|
||||||
result <- WordTag{sl_word, tag}
|
result <- WordTag{sl_word, tag}
|
||||||
} else {
|
} else {
|
||||||
result <- WordTag{sl_word, "x"}
|
result <- WordTag{sl_word, "x"}
|
||||||
@@ -235,17 +257,13 @@ func cutDAGNoHMM(sentence string) chan WordTag {
|
|||||||
|
|
||||||
// Tags the POS of each word after segmentation, using labels compatible with
|
// Tags the POS of each word after segmentation, using labels compatible with
|
||||||
// ictclas.
|
// ictclas.
|
||||||
func Cut(sentence string, HMM bool) chan WordTag {
|
func (p *Posseg) Cut(sentence string, HMM bool) chan WordTag {
|
||||||
for key := range jiebago.UserWordTagTab {
|
|
||||||
wordTagMap[key] = jiebago.UserWordTagTab[key]
|
|
||||||
delete(jiebago.UserWordTagTab, key)
|
|
||||||
}
|
|
||||||
result := make(chan WordTag)
|
result := make(chan WordTag)
|
||||||
var cut cutFunc
|
var cut cutFunc
|
||||||
if HMM {
|
if HMM {
|
||||||
cut = cutDAG
|
cut = p.cutDAG
|
||||||
} else {
|
} else {
|
||||||
cut = cutDAGNoHMM
|
cut = p.cutDAGNoHMM
|
||||||
}
|
}
|
||||||
go func() {
|
go func() {
|
||||||
for blk := range jiebago.RegexpSplit(reHanInternal, sentence) {
|
for blk := range jiebago.RegexpSplit(reHanInternal, sentence) {
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
package posseg
|
package posseg
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"github.com/wangbin/jiebago"
|
|
||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -277,18 +276,21 @@ func chanToArray(ch chan WordTag) []WordTag {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestCut(t *testing.T) {
|
func TestCut(t *testing.T) {
|
||||||
SetDictionary("../dict.txt")
|
p, err := NewPosseg("../dict.txt")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
result := chanToArray(Cut(content, true))
|
result := chanToArray(p.Cut(content, true))
|
||||||
if len(defaultCutResult[index]) != len(result) {
|
if len(defaultCutResult[index]) != len(result) {
|
||||||
t.Error(content)
|
t.Error(content)
|
||||||
}
|
}
|
||||||
for i, _ := range result {
|
for i, _ := range result {
|
||||||
if result[i] != defaultCutResult[index][i] {
|
if result[i] != defaultCutResult[index][i] {
|
||||||
t.Error(content)
|
t.Errorf("expect %s, got %s", defaultCutResult[index][i], result[i])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
result = chanToArray(Cut(content, false))
|
result = chanToArray(p.Cut(content, false))
|
||||||
if len(noHMMCutResult[index]) != len(result) {
|
if len(noHMMCutResult[index]) != len(result) {
|
||||||
t.Error(content)
|
t.Error(content)
|
||||||
}
|
}
|
||||||
@@ -305,7 +307,7 @@ func TestBug132(t *testing.T) {
|
|||||||
/*
|
/*
|
||||||
https://github.com/fxsjy/jieba/issues/132
|
https://github.com/fxsjy/jieba/issues/132
|
||||||
*/
|
*/
|
||||||
SetDictionary("../dict.txt")
|
p, _ := NewPosseg("../dict.txt")
|
||||||
sentence := "又跛又啞"
|
sentence := "又跛又啞"
|
||||||
cutResult := []WordTag{
|
cutResult := []WordTag{
|
||||||
WordTag{"又", "d"},
|
WordTag{"又", "d"},
|
||||||
@@ -313,7 +315,7 @@ func TestBug132(t *testing.T) {
|
|||||||
WordTag{"又", "d"},
|
WordTag{"又", "d"},
|
||||||
WordTag{"啞", "v"},
|
WordTag{"啞", "v"},
|
||||||
}
|
}
|
||||||
result := chanToArray(Cut(sentence, true))
|
result := chanToArray(p.Cut(sentence, true))
|
||||||
if len(cutResult) != len(result) {
|
if len(cutResult) != len(result) {
|
||||||
t.Error(result)
|
t.Error(result)
|
||||||
}
|
}
|
||||||
@@ -328,7 +330,7 @@ func TestBug137(t *testing.T) {
|
|||||||
/*
|
/*
|
||||||
https://github.com/fxsjy/jieba/issues/137
|
https://github.com/fxsjy/jieba/issues/137
|
||||||
*/
|
*/
|
||||||
SetDictionary("../dict.txt")
|
p, _ := NewPosseg("../dict.txt")
|
||||||
sentence := "前港督衛奕信在八八年十月宣布成立中央政策研究組"
|
sentence := "前港督衛奕信在八八年十月宣布成立中央政策研究組"
|
||||||
cutResult := []WordTag{
|
cutResult := []WordTag{
|
||||||
WordTag{"前", "f"},
|
WordTag{"前", "f"},
|
||||||
@@ -345,7 +347,7 @@ func TestBug137(t *testing.T) {
|
|||||||
WordTag{"研究", "vn"},
|
WordTag{"研究", "vn"},
|
||||||
WordTag{"組", "x"},
|
WordTag{"組", "x"},
|
||||||
}
|
}
|
||||||
result := chanToArray(Cut(sentence, true))
|
result := chanToArray(p.Cut(sentence, true))
|
||||||
if len(cutResult) != len(result) {
|
if len(cutResult) != len(result) {
|
||||||
t.Error(result)
|
t.Error(result)
|
||||||
}
|
}
|
||||||
@@ -357,8 +359,8 @@ func TestBug137(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestUserDict(t *testing.T) {
|
func TestUserDict(t *testing.T) {
|
||||||
SetDictionary("../dict.txt")
|
p, _ := NewPosseg("../dict.txt")
|
||||||
jiebago.LoadUserDict("../userdict.txt")
|
p.LoadUserDict("../userdict.txt")
|
||||||
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
||||||
|
|
||||||
cutResult := []WordTag{
|
cutResult := []WordTag{
|
||||||
@@ -400,7 +402,7 @@ func TestUserDict(t *testing.T) {
|
|||||||
WordTag{"N", "eng"},
|
WordTag{"N", "eng"},
|
||||||
WordTag{"类型", "n"}}
|
WordTag{"类型", "n"}}
|
||||||
|
|
||||||
result := chanToArray(Cut(sentence, true))
|
result := chanToArray(p.Cut(sentence, true))
|
||||||
if len(cutResult) != len(result) {
|
if len(cutResult) != len(result) {
|
||||||
t.Error(result)
|
t.Error(result)
|
||||||
}
|
}
|
||||||
|
|||||||
38
trie.go
38
trie.go
@@ -7,18 +7,14 @@ import (
|
|||||||
"log"
|
"log"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// Trie stores the total frequency and a map of all words and their frequencies.
|
type Jieba struct {
|
||||||
var Trie *trie
|
|
||||||
|
|
||||||
type trie struct {
|
|
||||||
Total float64
|
Total float64
|
||||||
Freq map[string]float64
|
Freq map[string]float64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t *trie) load(dictFileName string) error {
|
func (j *Jieba) load(dictFileName string) error {
|
||||||
dictFilePath, err := DictPath(dictFileName)
|
dictFilePath, err := DictPath(dictFileName)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -55,7 +51,7 @@ func (t *trie) load(dictFileName string) error {
|
|||||||
|
|
||||||
if isDictCached {
|
if isDictCached {
|
||||||
dec := gob.NewDecoder(cacheFile)
|
dec := gob.NewDecoder(cacheFile)
|
||||||
err = dec.Decode(&t)
|
err = dec.Decode(&j)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
isDictCached = false
|
isDictCached = false
|
||||||
} else {
|
} else {
|
||||||
@@ -70,7 +66,7 @@ func (t *trie) load(dictFileName string) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for _, wtf := range wtfs {
|
for _, wtf := range wtfs {
|
||||||
t.addWord(wtf)
|
j.AddWord(wtf)
|
||||||
}
|
}
|
||||||
// dump trie
|
// dump trie
|
||||||
cacheFile, err = os.OpenFile(cacheFilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
|
cacheFile, err = os.OpenFile(cacheFilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
|
||||||
@@ -79,7 +75,7 @@ func (t *trie) load(dictFileName string) error {
|
|||||||
}
|
}
|
||||||
defer cacheFile.Close()
|
defer cacheFile.Close()
|
||||||
enc := gob.NewEncoder(cacheFile)
|
enc := gob.NewEncoder(cacheFile)
|
||||||
err = enc.Encode(t)
|
err = enc.Encode(j)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
} else {
|
} else {
|
||||||
@@ -89,30 +85,27 @@ func (t *trie) load(dictFileName string) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t *trie) addWord(wtf *WordTagFreq) {
|
func (j *Jieba) AddWord(wtf *WordTagFreq) {
|
||||||
t.Freq[wtf.Word] = wtf.Freq
|
j.Freq[wtf.Word] = wtf.Freq
|
||||||
t.Total += wtf.Freq
|
j.Total += wtf.Freq
|
||||||
runes := []rune(wtf.Word)
|
runes := []rune(wtf.Word)
|
||||||
count := len(runes)
|
count := len(runes)
|
||||||
for i := 0; i < count; i++ {
|
for i := 0; i < count; i++ {
|
||||||
wfrag := string(runes[0 : i+1])
|
wfrag := string(runes[0 : i+1])
|
||||||
if _, ok := t.Freq[wfrag]; !ok {
|
if _, ok := j.Freq[wfrag]; !ok {
|
||||||
t.Freq[wfrag] = 0.0
|
j.Freq[wfrag] = 0.0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Load user specified dictionary file.
|
// Load user specified dictionary file.
|
||||||
func LoadUserDict(dictFilePath string) error {
|
func (j *Jieba) LoadUserDict(dictFilePath string) error {
|
||||||
wtfs, err := ParseDictFile(dictFilePath)
|
wtfs, err := ParseDictFile(dictFilePath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
for _, wtf := range wtfs {
|
for _, wtf := range wtfs {
|
||||||
if len(wtf.Tag) > 0 {
|
j.AddWord(wtf)
|
||||||
UserWordTagTab[wtf.Word] = strings.TrimSpace(wtf.Tag)
|
|
||||||
}
|
|
||||||
Trie.addWord(wtf)
|
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -120,7 +113,8 @@ func LoadUserDict(dictFilePath string) error {
|
|||||||
// Set the dictionary, could be absolute path of dictionary file, or dictionary
|
// Set the dictionary, could be absolute path of dictionary file, or dictionary
|
||||||
// name in current directory. This function must be called before cut any
|
// name in current directory. This function must be called before cut any
|
||||||
// sentence.
|
// sentence.
|
||||||
func SetDictionary(dictFileName string) error {
|
func NewJieba(dictFileName string) (*Jieba, error) {
|
||||||
Trie = &trie{Total: 0.0, Freq: make(map[string]float64)}
|
j := &Jieba{Total: 0.0, Freq: make(map[string]float64)}
|
||||||
return Trie.load(dictFileName)
|
err := j.load(dictFileName)
|
||||||
|
return j, err
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user