mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-30 09:00:30 +08:00
make struct Jieba's fields private
This commit is contained in:
@@ -42,7 +42,7 @@ type TagExtracter struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) {
|
func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) {
|
||||||
j, err := jiebago.NewJieba(dictFileName)
|
j, err := jiebago.Open(dictFileName)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -57,7 +57,7 @@ func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) {
|
|||||||
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags wordWeights) {
|
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags wordWeights) {
|
||||||
freq := make(map[string]float64)
|
freq := make(map[string]float64)
|
||||||
|
|
||||||
for w := range t.Cut(sentence, false, true) {
|
for w := range t.Cut(sentence, true) {
|
||||||
w = strings.TrimSpace(w)
|
w = strings.TrimSpace(w)
|
||||||
if utf8.RuneCountInString(w) < 2 {
|
if utf8.RuneCountInString(w) < 2 {
|
||||||
continue
|
continue
|
||||||
|
|||||||
48
jieba.go
48
jieba.go
@@ -2,7 +2,6 @@
|
|||||||
package jiebago
|
package jiebago
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
|
||||||
"github.com/wangbin/jiebago/finalseg"
|
"github.com/wangbin/jiebago/finalseg"
|
||||||
"math"
|
"math"
|
||||||
"regexp"
|
"regexp"
|
||||||
@@ -22,10 +21,6 @@ type route struct {
|
|||||||
Index int
|
Index int
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r route) String() string {
|
|
||||||
return fmt.Sprintf("(%f, %d)", r.Freq, r.Index)
|
|
||||||
}
|
|
||||||
|
|
||||||
type routes []*route
|
type routes []*route
|
||||||
|
|
||||||
func (rs routes) Len() int {
|
func (rs routes) Len() int {
|
||||||
@@ -47,8 +42,17 @@ func (rs routes) Swap(i, j int) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type Jieba struct {
|
type Jieba struct {
|
||||||
Total float64
|
total float64
|
||||||
Freq map[string]float64
|
freqMap map[string]float64
|
||||||
|
}
|
||||||
|
|
||||||
|
func (j Jieba) Freq(key string) (float64, bool) {
|
||||||
|
freq, ok := j.freqMap[key]
|
||||||
|
return freq, ok
|
||||||
|
}
|
||||||
|
|
||||||
|
func (j Jieba) Total() float64 {
|
||||||
|
return j.total
|
||||||
}
|
}
|
||||||
|
|
||||||
func (j *Jieba) AddEntry(entry Entry) {
|
func (j *Jieba) AddEntry(entry Entry) {
|
||||||
@@ -56,13 +60,13 @@ func (j *Jieba) AddEntry(entry Entry) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (j *Jieba) Add(word string, freq float64) {
|
func (j *Jieba) Add(word string, freq float64) {
|
||||||
j.Freq[word] = freq
|
j.freqMap[word] = freq
|
||||||
j.Total += freq
|
j.total += freq
|
||||||
runes := []rune(word)
|
runes := []rune(word)
|
||||||
for i := 0; i < len(runes); i++ {
|
for i := 0; i < len(runes); i++ {
|
||||||
frag := string(runes[0 : i+1])
|
frag := string(runes[0 : i+1])
|
||||||
if _, ok := j.Freq[frag]; !ok {
|
if _, ok := j.Freq(frag); !ok {
|
||||||
j.Freq[frag] = 0.0
|
j.freqMap[frag] = 0.0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -72,11 +76,15 @@ func (j *Jieba) LoadUserDict(dictFilePath string) error {
|
|||||||
return LoadDict(j, dictFilePath, false)
|
return LoadDict(j, dictFilePath, false)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func New() *Jieba {
|
||||||
|
return &Jieba{total: 0.0, freqMap: make(map[string]float64)}
|
||||||
|
}
|
||||||
|
|
||||||
// Set the dictionary, could be absolute path of dictionary file, or dictionary
|
// Set the dictionary, could be absolute path of dictionary file, or dictionary
|
||||||
// name in current directory. This function must be called before cutting any
|
// name in current directory. This function must be called before cutting any
|
||||||
// sentence.
|
// sentence.
|
||||||
func NewJieba(dictFileName string) (*Jieba, error) {
|
func Open(dictFileName string) (*Jieba, error) {
|
||||||
j := &Jieba{Total: 0.0, Freq: make(map[string]float64)}
|
j := &Jieba{total: 0.0, freqMap: make(map[string]float64)}
|
||||||
err := LoadDict(j, dictFileName, false)
|
err := LoadDict(j, dictFileName, false)
|
||||||
return j, err
|
return j, err
|
||||||
}
|
}
|
||||||
@@ -92,7 +100,7 @@ func (j *Jieba) DAG(sentence string) map[int][]int {
|
|||||||
i := k
|
i := k
|
||||||
frag = string(runes[k])
|
frag = string(runes[k])
|
||||||
for {
|
for {
|
||||||
if freq, ok := j.Freq[frag]; !ok {
|
if freq, ok := j.Freq(frag); !ok {
|
||||||
break
|
break
|
||||||
} else {
|
} else {
|
||||||
if freq > 0.0 {
|
if freq > 0.0 {
|
||||||
@@ -118,14 +126,14 @@ func (j *Jieba) Calc(sentence string, dag map[int][]int) map[int]*route {
|
|||||||
number := len(runes)
|
number := len(runes)
|
||||||
rs := make(map[int]*route)
|
rs := make(map[int]*route)
|
||||||
rs[number] = &route{Freq: 0.0, Index: 0}
|
rs[number] = &route{Freq: 0.0, Index: 0}
|
||||||
logTotal := math.Log(j.Total)
|
logTotal := math.Log(j.Total())
|
||||||
for idx := number - 1; idx >= 0; idx-- {
|
for idx := number - 1; idx >= 0; idx-- {
|
||||||
candidates := make(routes, 0)
|
candidates := make(routes, 0)
|
||||||
for _, i := range dag[idx] {
|
for _, i := range dag[idx] {
|
||||||
word := string(runes[idx : i+1])
|
word := string(runes[idx : i+1])
|
||||||
var r *route
|
var r *route
|
||||||
if _, ok := j.Freq[word]; ok {
|
if freq, ok := j.Freq(word); ok {
|
||||||
r = &route{Freq: math.Log(j.Freq[word]) - logTotal + rs[i+1].Freq, Index: i}
|
r = &route{Freq: math.Log(freq) - logTotal + rs[i+1].Freq, Index: i}
|
||||||
} else {
|
} else {
|
||||||
r = &route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i}
|
r = &route{Freq: math.Log(1.0) - logTotal + rs[i+1].Freq, Index: i}
|
||||||
}
|
}
|
||||||
@@ -164,7 +172,7 @@ func (j *Jieba) cutDAG(sentence string) chan string {
|
|||||||
buf = make([]rune, 0)
|
buf = make([]rune, 0)
|
||||||
} else {
|
} else {
|
||||||
bufString := string(buf)
|
bufString := string(buf)
|
||||||
if v, ok := j.Freq[bufString]; !ok || v == 0.0 {
|
if v, ok := j.Freq(bufString); !ok || v == 0.0 {
|
||||||
for x := range finalseg.Cut(bufString) {
|
for x := range finalseg.Cut(bufString) {
|
||||||
result <- x
|
result <- x
|
||||||
}
|
}
|
||||||
@@ -186,7 +194,7 @@ func (j *Jieba) cutDAG(sentence string) chan string {
|
|||||||
result <- string(buf)
|
result <- string(buf)
|
||||||
} else {
|
} else {
|
||||||
bufString := string(buf)
|
bufString := string(buf)
|
||||||
if v, ok := j.Freq[bufString]; !ok || v == 0.0 {
|
if v, ok := j.Freq(bufString); !ok || v == 0.0 {
|
||||||
for t := range finalseg.Cut(bufString) {
|
for t := range finalseg.Cut(bufString) {
|
||||||
result <- t
|
result <- t
|
||||||
}
|
}
|
||||||
@@ -352,7 +360,7 @@ func (j *Jieba) CutForSearch(sentence string, hmm bool) chan string {
|
|||||||
var gram2 string
|
var gram2 string
|
||||||
for i := 0; i < len(runes)-increment+1; i++ {
|
for i := 0; i < len(runes)-increment+1; i++ {
|
||||||
gram2 = string(runes[i : i+increment])
|
gram2 = string(runes[i : i+increment])
|
||||||
if v, ok := j.Freq[gram2]; ok && v > 0.0 {
|
if v, ok := j.Freq(gram2); ok && v > 0.0 {
|
||||||
result <- gram2
|
result <- gram2
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -626,7 +626,7 @@ func chanToArray(ch chan string) []string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestCutDAG(t *testing.T) {
|
func TestCutDAG(t *testing.T) {
|
||||||
j, _ := NewJieba("dict.txt")
|
j, _ := Open("dict.txt")
|
||||||
|
|
||||||
result := chanToArray(j.cutDAG("BP神经网络如何训练才能在分类时增加区分度?"))
|
result := chanToArray(j.cutDAG("BP神经网络如何训练才能在分类时增加区分度?"))
|
||||||
if len(result) != 11 {
|
if len(result) != 11 {
|
||||||
@@ -635,7 +635,7 @@ func TestCutDAG(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestCutDAGNoHmm(t *testing.T) {
|
func TestCutDAGNoHmm(t *testing.T) {
|
||||||
j, _ := NewJieba("dict.txt")
|
j, _ := Open("dict.txt")
|
||||||
|
|
||||||
result := chanToArray(j.cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度?"))
|
result := chanToArray(j.cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度?"))
|
||||||
if len(result) != 11 {
|
if len(result) != 11 {
|
||||||
@@ -657,7 +657,7 @@ func TestRegexpSplit(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestDefaultCut(t *testing.T) {
|
func TestDefaultCut(t *testing.T) {
|
||||||
j, _ := NewJieba("dict.txt")
|
j, _ := Open("dict.txt")
|
||||||
|
|
||||||
var result []string
|
var result []string
|
||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
@@ -675,7 +675,7 @@ func TestDefaultCut(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestCutAll(t *testing.T) {
|
func TestCutAll(t *testing.T) {
|
||||||
j, _ := NewJieba("dict.txt")
|
j, _ := Open("dict.txt")
|
||||||
|
|
||||||
var result []string
|
var result []string
|
||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
@@ -693,7 +693,7 @@ func TestCutAll(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestDefaultCutNoHMM(t *testing.T) {
|
func TestDefaultCutNoHMM(t *testing.T) {
|
||||||
j, _ := NewJieba("dict.txt")
|
j, _ := Open("dict.txt")
|
||||||
|
|
||||||
var result []string
|
var result []string
|
||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
@@ -711,7 +711,7 @@ func TestDefaultCutNoHMM(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestCutForSearch(t *testing.T) {
|
func TestCutForSearch(t *testing.T) {
|
||||||
j, _ := NewJieba("dict.txt")
|
j, _ := Open("dict.txt")
|
||||||
|
|
||||||
var result []string
|
var result []string
|
||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
@@ -742,7 +742,7 @@ func TestCutForSearch(t *testing.T) {
|
|||||||
|
|
||||||
func TestSetdictionary(t *testing.T) {
|
func TestSetdictionary(t *testing.T) {
|
||||||
var result []string
|
var result []string
|
||||||
j, _ := NewJieba("foobar.txt")
|
j, _ := Open("foobar.txt")
|
||||||
for index, content := range test_contents {
|
for index, content := range test_contents {
|
||||||
result = chanToArray(j.Cut(content, true))
|
result = chanToArray(j.Cut(content, true))
|
||||||
if len(result) != len(userDictCutResult[index]) {
|
if len(result) != len(userDictCutResult[index]) {
|
||||||
@@ -758,7 +758,7 @@ func TestSetdictionary(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestLoadUserDict(t *testing.T) {
|
func TestLoadUserDict(t *testing.T) {
|
||||||
j, _ := NewJieba("dict.txt")
|
j, _ := Open("dict.txt")
|
||||||
j.LoadUserDict("userdict.txt")
|
j.LoadUserDict("userdict.txt")
|
||||||
|
|
||||||
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
||||||
|
|||||||
66
loader.go
Normal file
66
loader.go
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
package jiebago
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Entry is one record of a dictionary file, as handed to a Loader.
type Entry struct {
	// Word is the first field of the line, with a byte-order mark removed.
	// Flag is the optional third field; LoadDict fills it only when it is
	// called with usingFlag set (presumably a part-of-speech tag — confirm
	// against the posseg package).
	Word, Flag string
	// Freq is the optional second field, parsed as a float64.
	Freq float64
}
|
||||||
|
|
||||||
|
type Loader interface {
|
||||||
|
AddEntry(Entry)
|
||||||
|
}
|
||||||
|
|
||||||
|
// dictPath resolves dictFileName to the path that should be opened.
//
// An absolute dictFileName is returned unchanged. A relative one is joined
// onto the current working directory; filepath.Join already cleans its result,
// so no separate filepath.Clean call is needed. An error is returned only when
// the working directory cannot be determined, in which case the path is "".
func dictPath(dictFileName string) (string, error) {
	if filepath.IsAbs(dictFileName) {
		return dictFileName, nil
	}
	cwd, err := os.Getwd()
	if err != nil {
		return "", err
	}
	return filepath.Join(cwd, dictFileName), nil
}
|
||||||
|
|
||||||
|
func LoadDict(l Loader, dictFileName string, usingFlag bool) error {
|
||||||
|
dictFilePath, err := dictPath(dictFileName)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
dictFile, err := os.Open(dictFilePath)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer dictFile.Close()
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(dictFile)
|
||||||
|
var entry Entry
|
||||||
|
var line string
|
||||||
|
var fields []string
|
||||||
|
for scanner.Scan() {
|
||||||
|
line = scanner.Text()
|
||||||
|
fields = strings.Split(line, " ")
|
||||||
|
entry.Word = strings.Replace(fields[0], "\ufeff", "", 1)
|
||||||
|
if length := len(fields); length > 1 {
|
||||||
|
entry.Freq, err = strconv.ParseFloat(fields[1], 64)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if usingFlag && length > 2 {
|
||||||
|
entry.Flag = fields[2]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
l.AddEntry(entry)
|
||||||
|
}
|
||||||
|
return scanner.Err()
|
||||||
|
}
|
||||||
@@ -35,7 +35,7 @@ func (p *Posseg) AddEntry(entry jiebago.Entry) {
|
|||||||
// Set dictionary, it could be absolute path of dictionary file, or dictionary
|
// Set dictionary, it could be absolute path of dictionary file, or dictionary
|
||||||
// name in current directory.
|
// name in current directory.
|
||||||
func NewPosseg(dictFileName string) (*Posseg, error) {
|
func NewPosseg(dictFileName string) (*Posseg, error) {
|
||||||
j := &jiebago.Jieba{Total: 0.0, Freq: make(map[string]float64)}
|
j := jiebago.New()
|
||||||
p := &Posseg{j, make(map[string]string)}
|
p := &Posseg{j, make(map[string]string)}
|
||||||
err := jiebago.LoadDict(p, dictFileName, true)
|
err := jiebago.LoadDict(p, dictFileName, true)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -137,7 +137,7 @@ func (p *Posseg) cutDAG(sentence string) chan Pair {
|
|||||||
buf = make([]rune, 0)
|
buf = make([]rune, 0)
|
||||||
} else {
|
} else {
|
||||||
bufString := string(buf)
|
bufString := string(buf)
|
||||||
if v, ok := p.Freq[bufString]; !ok || v == 0.0 {
|
if v, ok := p.Freq(bufString); !ok || v == 0.0 {
|
||||||
for t := range p.cutDetail(bufString) {
|
for t := range p.cutDetail(bufString) {
|
||||||
result <- t
|
result <- t
|
||||||
}
|
}
|
||||||
@@ -175,7 +175,7 @@ func (p *Posseg) cutDAG(sentence string) chan Pair {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
bufString := string(buf)
|
bufString := string(buf)
|
||||||
if v, ok := p.Freq[bufString]; !ok || v == 0.0 {
|
if v, ok := p.Freq(bufString); !ok || v == 0.0 {
|
||||||
for t := range p.cutDetail(bufString) {
|
for t := range p.cutDetail(bufString) {
|
||||||
result <- t
|
result <- t
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ type JiebaTokenizer struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||||
j, err := jiebago.NewJieba(dictFileName)
|
j, err := jiebago.Open(dictFileName)
|
||||||
return &JiebaTokenizer{
|
return &JiebaTokenizer{
|
||||||
j: j,
|
j: j,
|
||||||
hmm: hmm,
|
hmm: hmm,
|
||||||
@@ -44,7 +44,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
|||||||
for i := 0; i < width-step+1; i++ {
|
for i := 0; i < width-step+1; i++ {
|
||||||
gram = string(runes[i : i+step])
|
gram = string(runes[i : i+step])
|
||||||
gramLen := len(gram)
|
gramLen := len(gram)
|
||||||
if value, ok := jt.j.Freq[gram]; ok && value > 0 {
|
if value, ok := jt.j.Freq(gram); ok && value > 0 {
|
||||||
gramStart := start + len(string(runes[:i]))
|
gramStart := start + len(string(runes[:i]))
|
||||||
token := analysis.Token{
|
token := analysis.Token{
|
||||||
Term: []byte(gram),
|
Term: []byte(gram),
|
||||||
|
|||||||
Reference in New Issue
Block a user