mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-22 20:20:44 +08:00
优化 jieba
This commit is contained in:
@@ -53,17 +53,17 @@ type TagExtracter struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// LoadDictionary reads the given filename and create a new dictionary.
|
// LoadDictionary reads the given filename and create a new dictionary.
|
||||||
func (t *TagExtracter) LoadDictionary(file fs.File) error {
|
func (t *TagExtracter) LoadDictionary(file fs.File) (err error) {
|
||||||
t.stopWord = NewStopWord()
|
t.stopWord = NewStopWord()
|
||||||
t.seg = new(jieba.Segmenter)
|
t.seg, err = jieba.LoadDictionary(file)
|
||||||
return t.seg.LoadDictionary(file)
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadDictionaryAt reads the given filename and create a new dictionary.
|
// LoadDictionaryAt reads the given filename and create a new dictionary.
|
||||||
func (t *TagExtracter) LoadDictionaryAt(fileName string) error {
|
func (t *TagExtracter) LoadDictionaryAt(file string) (err error) {
|
||||||
t.stopWord = NewStopWord()
|
t.stopWord = NewStopWord()
|
||||||
t.seg = new(jieba.Segmenter)
|
t.seg, err = jieba.LoadDictionaryAt(file)
|
||||||
return t.seg.LoadDictionaryAt(fileName)
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadIdf reads the given file and create a new Idf dictionary.
|
// LoadIdf reads the given file and create a new Idf dictionary.
|
||||||
|
|||||||
@@ -10,9 +10,9 @@ import (
|
|||||||
|
|
||||||
// A Dictionary represents a thread-safe dictionary used for word segmentation.
|
// A Dictionary represents a thread-safe dictionary used for word segmentation.
|
||||||
type Dictionary struct {
|
type Dictionary struct {
|
||||||
|
sync.RWMutex
|
||||||
total, logTotal float64
|
total, logTotal float64
|
||||||
freqMap map[string]float64
|
freqMap map[string]float64
|
||||||
sync.RWMutex
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Load loads all tokens
|
// Load loads all tokens
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ type line struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
segmenter = Segmenter{}
|
segmenter *Segmenter
|
||||||
numThreads = runtime.NumCPU()
|
numThreads = runtime.NumCPU()
|
||||||
task = make(chan line, numThreads)
|
task = make(chan line, numThreads)
|
||||||
result = make(chan line, numThreads)
|
result = make(chan line, numThreads)
|
||||||
@@ -35,9 +35,6 @@ func Example_parallelCut() {
|
|||||||
// Set the number of goroutines
|
// Set the number of goroutines
|
||||||
runtime.GOMAXPROCS(numThreads)
|
runtime.GOMAXPROCS(numThreads)
|
||||||
|
|
||||||
// Load dictionary
|
|
||||||
segmenter.LoadDictionaryAt("dict.txt")
|
|
||||||
|
|
||||||
// open file for segmentation
|
// open file for segmentation
|
||||||
file, err := os.Open("README.md")
|
file, err := os.Open("README.md")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -45,6 +42,12 @@ func Example_parallelCut() {
|
|||||||
}
|
}
|
||||||
defer file.Close()
|
defer file.Close()
|
||||||
|
|
||||||
|
// Load dictionary
|
||||||
|
segmenter, err = LoadDictionaryAt("dict.txt")
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
// start worker routines
|
// start worker routines
|
||||||
for i := 0; i < numThreads; i++ {
|
for i := 0; i < numThreads; i++ {
|
||||||
go worker()
|
go worker()
|
||||||
|
|||||||
@@ -5,8 +5,10 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func Example() {
|
func Example() {
|
||||||
var seg Segmenter
|
seg, err := LoadDictionaryAt("dict.txt")
|
||||||
seg.LoadDictionaryAt("dict.txt")
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
fmt.Print("【全模式】:")
|
fmt.Print("【全模式】:")
|
||||||
fmt.Println(seg.CutAll("我来到北京清华大学"))
|
fmt.Println(seg.CutAll("我来到北京清华大学"))
|
||||||
@@ -27,8 +29,10 @@ func Example() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func Example_suggestFrequency() {
|
func Example_suggestFrequency() {
|
||||||
var seg Segmenter
|
seg, err := LoadDictionaryAt("dict.txt")
|
||||||
seg.LoadDictionaryAt("dict.txt")
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
sentence := "超敏C反应蛋白是什么?"
|
sentence := "超敏C反应蛋白是什么?"
|
||||||
fmt.Print("Before:")
|
fmt.Print("Before:")
|
||||||
@@ -75,8 +79,10 @@ func Example_suggestFrequency() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func Example_loadUserDictionary() {
|
func Example_loadUserDictionary() {
|
||||||
var seg Segmenter
|
seg, err := LoadDictionaryAt("dict.txt")
|
||||||
seg.LoadDictionaryAt("dict.txt")
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
sentence := "李小福是创新办主任也是云计算方面的专家"
|
sentence := "李小福是创新办主任也是云计算方面的专家"
|
||||||
fmt.Print("Before:")
|
fmt.Print("Before:")
|
||||||
|
|||||||
139
jieba.go
139
jieba.go
@@ -21,23 +21,21 @@ var (
|
|||||||
)
|
)
|
||||||
|
|
||||||
// Segmenter is a Chinese words segmentation struct.
|
// Segmenter is a Chinese words segmentation struct.
|
||||||
type Segmenter struct {
|
type Segmenter Dictionary
|
||||||
dict *Dictionary
|
|
||||||
}
|
|
||||||
|
|
||||||
// Frequency returns a word's frequency and existence
|
// Frequency returns a word's frequency and existence
|
||||||
func (seg *Segmenter) Frequency(word string) (float64, bool) {
|
func (seg *Segmenter) Frequency(word string) (float64, bool) {
|
||||||
return seg.dict.Frequency(word)
|
return (*Dictionary)(seg).Frequency(word)
|
||||||
}
|
}
|
||||||
|
|
||||||
// AddWord adds a new word with frequency to dictionary
|
// AddWord adds a new word with frequency to dictionary
|
||||||
func (seg *Segmenter) AddWord(word string, frequency float64) {
|
func (seg *Segmenter) AddWord(word string, frequency float64) {
|
||||||
seg.dict.AddToken(dictionary.NewToken(word, frequency, ""))
|
(*Dictionary)(seg).AddToken(dictionary.NewToken(word, frequency, ""))
|
||||||
}
|
}
|
||||||
|
|
||||||
// DeleteWord removes a word from dictionary
|
// DeleteWord removes a word from dictionary
|
||||||
func (seg *Segmenter) DeleteWord(word string) {
|
func (seg *Segmenter) DeleteWord(word string) {
|
||||||
seg.dict.AddToken(dictionary.NewToken(word, 0.0, ""))
|
(*Dictionary)(seg).AddToken(dictionary.NewToken(word, 0.0, ""))
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -58,79 +56,79 @@ func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
|
|||||||
frequency := 1.0
|
frequency := 1.0
|
||||||
if len(words) > 1 {
|
if len(words) > 1 {
|
||||||
for _, word := range words {
|
for _, word := range words {
|
||||||
if freq, ok := seg.dict.Frequency(word); ok {
|
if freq, ok := (*Dictionary)(seg).Frequency(word); ok {
|
||||||
frequency *= freq
|
frequency *= freq
|
||||||
}
|
}
|
||||||
frequency /= seg.dict.total
|
frequency /= (*Dictionary)(seg).total
|
||||||
}
|
}
|
||||||
frequency, _ = math.Modf(frequency * seg.dict.total)
|
frequency, _ = math.Modf(frequency * (*Dictionary)(seg).total)
|
||||||
wordFreq := 0.0
|
wordFreq := 0.0
|
||||||
if freq, ok := seg.dict.Frequency(strings.Join(words, "")); ok {
|
if freq, ok := (*Dictionary)(seg).Frequency(strings.Join(words, "")); ok {
|
||||||
wordFreq = freq
|
wordFreq = freq
|
||||||
}
|
}
|
||||||
if wordFreq < frequency {
|
if wordFreq < frequency {
|
||||||
frequency = wordFreq
|
frequency = wordFreq
|
||||||
}
|
}
|
||||||
} else {
|
return frequency
|
||||||
word := words[0]
|
}
|
||||||
for _, segment := range seg.Cut(word, false) {
|
word := words[0]
|
||||||
if freq, ok := seg.dict.Frequency(segment); ok {
|
for _, segment := range seg.Cut(word, false) {
|
||||||
frequency *= freq
|
if freq, ok := (*Dictionary)(seg).Frequency(segment); ok {
|
||||||
}
|
frequency *= freq
|
||||||
frequency /= seg.dict.total
|
|
||||||
}
|
|
||||||
frequency, _ = math.Modf(frequency * seg.dict.total)
|
|
||||||
frequency += 1.0
|
|
||||||
wordFreq := 1.0
|
|
||||||
if freq, ok := seg.dict.Frequency(word); ok {
|
|
||||||
wordFreq = freq
|
|
||||||
}
|
|
||||||
if wordFreq > frequency {
|
|
||||||
frequency = wordFreq
|
|
||||||
}
|
}
|
||||||
|
frequency /= (*Dictionary)(seg).total
|
||||||
|
}
|
||||||
|
frequency, _ = math.Modf(frequency * (*Dictionary)(seg).total)
|
||||||
|
frequency += 1.0
|
||||||
|
wordFreq := 1.0
|
||||||
|
if freq, ok := (*Dictionary)(seg).Frequency(word); ok {
|
||||||
|
wordFreq = freq
|
||||||
|
}
|
||||||
|
if wordFreq > frequency {
|
||||||
|
frequency = wordFreq
|
||||||
}
|
}
|
||||||
return frequency
|
return frequency
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadDictionary loads dictionary from given file name. Everytime
|
// LoadDictionary loads dictionary from given file name. Everytime
|
||||||
// LoadDictionary is called, previously loaded dictionary will be cleard.
|
// LoadDictionary is called, previously loaded dictionary will be cleard.
|
||||||
func (seg *Segmenter) LoadDictionary(file fs.File) error {
|
func LoadDictionary(file fs.File) (*Segmenter, error) {
|
||||||
seg.dict = &Dictionary{freqMap: make(map[string]float64)}
|
d := &Dictionary{freqMap: make(map[string]float64)}
|
||||||
return seg.dict.loadDictionary(file)
|
err := d.loadDictionary(file)
|
||||||
|
return (*Segmenter)(d), err
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadDictionaryAt loads dictionary from given file name. Everytime
|
// LoadDictionaryAt loads dictionary from given file name. Everytime
|
||||||
// LoadDictionaryAt is called, previously loaded dictionary will be cleard.
|
// LoadDictionaryAt is called, previously loaded dictionary will be cleard.
|
||||||
func (seg *Segmenter) LoadDictionaryAt(file string) error {
|
func LoadDictionaryAt(file string) (*Segmenter, error) {
|
||||||
seg.dict = &Dictionary{freqMap: make(map[string]float64)}
|
d := &Dictionary{freqMap: make(map[string]float64)}
|
||||||
return seg.dict.loadDictionaryAt(file)
|
err := d.loadDictionaryAt(file)
|
||||||
|
return (*Segmenter)(d), err
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadUserDictionary loads a user specified dictionary, it must be called
|
// LoadUserDictionary loads a user specified dictionary, it must be called
|
||||||
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
||||||
// instead it will override exist entries.
|
// instead it will override exist entries.
|
||||||
func (seg *Segmenter) LoadUserDictionary(file fs.File) error {
|
func (seg *Segmenter) LoadUserDictionary(file fs.File) error {
|
||||||
return seg.dict.loadDictionary(file)
|
return (*Dictionary)(seg).loadDictionary(file)
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadUserDictionaryAt loads a user specified dictionary, it must be called
|
// LoadUserDictionaryAt loads a user specified dictionary, it must be called
|
||||||
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
||||||
// instead it will override exist entries.
|
// instead it will override exist entries.
|
||||||
func (seg *Segmenter) LoadUserDictionaryAt(file string) error {
|
func (seg *Segmenter) LoadUserDictionaryAt(file string) error {
|
||||||
return seg.dict.loadDictionaryAt(file)
|
return (*Dictionary)(seg).loadDictionaryAt(file)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (seg *Segmenter) dag(runes []rune) map[int][]int {
|
func (seg *Segmenter) dag(runes []rune) [][]int {
|
||||||
dag := make(map[int][]int)
|
|
||||||
n := len(runes)
|
n := len(runes)
|
||||||
var frag []rune
|
dag := make([][]int, n)
|
||||||
var i int
|
|
||||||
for k := 0; k < n; k++ {
|
for k := 0; k < n; k++ {
|
||||||
dag[k] = make([]int, 0)
|
dag[k] = make([]int, 0, 64)
|
||||||
i = k
|
i := k
|
||||||
frag = runes[k : k+1]
|
frag := runes[k : k+1]
|
||||||
for {
|
for {
|
||||||
freq, ok := seg.dict.Frequency(string(frag))
|
freq, ok := (*Dictionary)(seg).Frequency(string(frag))
|
||||||
if !ok {
|
if !ok {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
@@ -155,20 +153,20 @@ type route struct {
|
|||||||
index int
|
index int
|
||||||
}
|
}
|
||||||
|
|
||||||
func (seg *Segmenter) calc(runes []rune) map[int]route {
|
func (seg *Segmenter) calc(runes []rune) []*route {
|
||||||
dag := seg.dag(runes)
|
dag := seg.dag(runes)
|
||||||
n := len(runes)
|
n := len(runes)
|
||||||
rs := make(map[int]route)
|
rs := make([]*route, n+1)
|
||||||
rs[n] = route{frequency: 0.0, index: 0}
|
rs[n] = &route{frequency: 0.0, index: 0}
|
||||||
var r route
|
|
||||||
for idx := n - 1; idx >= 0; idx-- {
|
for idx := n - 1; idx >= 0; idx-- {
|
||||||
for _, i := range dag[idx] {
|
for _, i := range dag[idx] {
|
||||||
if freq, ok := seg.dict.Frequency(string(runes[idx : i+1])); ok {
|
var r *route
|
||||||
r = route{frequency: math.Log(freq) - seg.dict.logTotal + rs[i+1].frequency, index: i}
|
if freq, ok := (*Dictionary)(seg).Frequency(string(runes[idx : i+1])); ok {
|
||||||
|
r = &route{frequency: math.Log(freq) - (*Dictionary)(seg).logTotal + rs[i+1].frequency, index: i}
|
||||||
} else {
|
} else {
|
||||||
r = route{frequency: math.Log(1.0) - seg.dict.logTotal + rs[i+1].frequency, index: i}
|
r = &route{frequency: math.Log(1.0) - (*Dictionary)(seg).logTotal + rs[i+1].frequency, index: i}
|
||||||
}
|
}
|
||||||
if v, ok := rs[idx]; !ok {
|
if v := rs[idx]; v == nil {
|
||||||
rs[idx] = r
|
rs[idx] = r
|
||||||
} else {
|
} else {
|
||||||
if v.frequency < r.frequency || (v.frequency == r.frequency && v.index < r.index) {
|
if v.frequency < r.frequency || (v.frequency == r.frequency && v.index < r.index) {
|
||||||
@@ -190,14 +188,11 @@ type cutFunc func(sentence string) []string
|
|||||||
|
|
||||||
func (seg *Segmenter) cutDAG(sentence string) []string {
|
func (seg *Segmenter) cutDAG(sentence string) []string {
|
||||||
result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1)
|
result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1)
|
||||||
|
|
||||||
runes := []rune(sentence)
|
runes := []rune(sentence)
|
||||||
routes := seg.calc(runes)
|
routes := seg.calc(runes)
|
||||||
var y int
|
buf := make([]rune, 0, 256)
|
||||||
length := len(runes)
|
for x := 0; x < len(runes); {
|
||||||
var buf []rune
|
y := routes[x].index + 1
|
||||||
for x := 0; x < length; {
|
|
||||||
y = routes[x].index + 1
|
|
||||||
frag := runes[x:y]
|
frag := runes[x:y]
|
||||||
if y-x == 1 {
|
if y-x == 1 {
|
||||||
buf = append(buf, frag...)
|
buf = append(buf, frag...)
|
||||||
@@ -207,7 +202,7 @@ func (seg *Segmenter) cutDAG(sentence string) []string {
|
|||||||
if len(buf) == 1 {
|
if len(buf) == 1 {
|
||||||
result = append(result, bufString)
|
result = append(result, bufString)
|
||||||
} else {
|
} else {
|
||||||
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
|
if v, ok := (*Dictionary)(seg).Frequency(bufString); !ok || v == 0.0 {
|
||||||
result = append(result, finalseg.Cut(bufString)...)
|
result = append(result, finalseg.Cut(bufString)...)
|
||||||
} else {
|
} else {
|
||||||
for _, elem := range buf {
|
for _, elem := range buf {
|
||||||
@@ -215,7 +210,7 @@ func (seg *Segmenter) cutDAG(sentence string) []string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
buf = make([]rune, 0)
|
buf = buf[:0]
|
||||||
}
|
}
|
||||||
result = append(result, string(frag))
|
result = append(result, string(frag))
|
||||||
}
|
}
|
||||||
@@ -227,7 +222,7 @@ func (seg *Segmenter) cutDAG(sentence string) []string {
|
|||||||
if len(buf) == 1 {
|
if len(buf) == 1 {
|
||||||
result = append(result, bufString)
|
result = append(result, bufString)
|
||||||
} else {
|
} else {
|
||||||
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
|
if v, ok := (*Dictionary)(seg).Frequency(bufString); !ok || v == 0.0 {
|
||||||
result = append(result, finalseg.Cut(bufString)...)
|
result = append(result, finalseg.Cut(bufString)...)
|
||||||
} else {
|
} else {
|
||||||
for _, elem := range buf {
|
for _, elem := range buf {
|
||||||
@@ -242,14 +237,11 @@ func (seg *Segmenter) cutDAG(sentence string) []string {
|
|||||||
|
|
||||||
func (seg *Segmenter) cutDAGNoHMM(sentence string) []string {
|
func (seg *Segmenter) cutDAGNoHMM(sentence string) []string {
|
||||||
result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1)
|
result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1)
|
||||||
|
|
||||||
runes := []rune(sentence)
|
runes := []rune(sentence)
|
||||||
routes := seg.calc(runes)
|
routes := seg.calc(runes)
|
||||||
var y int
|
buf := make([]rune, 0, 256)
|
||||||
length := len(runes)
|
for x := 0; x < len(runes); {
|
||||||
var buf []rune
|
y := routes[x].index + 1
|
||||||
for x := 0; x < length; {
|
|
||||||
y = routes[x].index + 1
|
|
||||||
frag := runes[x:y]
|
frag := runes[x:y]
|
||||||
if reEng.MatchString(string(frag)) && len(frag) == 1 {
|
if reEng.MatchString(string(frag)) && len(frag) == 1 {
|
||||||
buf = append(buf, frag...)
|
buf = append(buf, frag...)
|
||||||
@@ -258,7 +250,7 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) []string {
|
|||||||
}
|
}
|
||||||
if len(buf) > 0 {
|
if len(buf) > 0 {
|
||||||
result = append(result, string(buf))
|
result = append(result, string(buf))
|
||||||
buf = make([]rune, 0)
|
buf = buf[:0]
|
||||||
}
|
}
|
||||||
result = append(result, string(frag))
|
result = append(result, string(frag))
|
||||||
x = y
|
x = y
|
||||||
@@ -307,17 +299,11 @@ func (seg *Segmenter) Cut(sentence string, hmm bool) []string {
|
|||||||
|
|
||||||
func (seg *Segmenter) cutAll(sentence string) []string {
|
func (seg *Segmenter) cutAll(sentence string) []string {
|
||||||
result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1)
|
result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1)
|
||||||
|
|
||||||
runes := []rune(sentence)
|
runes := []rune(sentence)
|
||||||
dag := seg.dag(runes)
|
dag := seg.dag(runes)
|
||||||
start := -1
|
start := -1
|
||||||
ks := make([]int, len(dag))
|
for k := 0; k < len(dag); k++ {
|
||||||
for k := range dag {
|
l := dag[k]
|
||||||
ks[k] = k
|
|
||||||
}
|
|
||||||
var l []int
|
|
||||||
for k := range ks {
|
|
||||||
l = dag[k]
|
|
||||||
if len(l) == 1 && k > start {
|
if len(l) == 1 && k > start {
|
||||||
result = append(result, string(runes[k:l[0]+1]))
|
result = append(result, string(runes[k:l[0]+1]))
|
||||||
start = l[0]
|
start = l[0]
|
||||||
@@ -367,10 +353,9 @@ func (seg *Segmenter) CutForSearch(sentence string, hmm bool) []string {
|
|||||||
if len(runes) <= increment {
|
if len(runes) <= increment {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
var gram string
|
|
||||||
for i := 0; i < len(runes)-increment+1; i++ {
|
for i := 0; i < len(runes)-increment+1; i++ {
|
||||||
gram = string(runes[i : i+increment])
|
gram := string(runes[i : i+increment])
|
||||||
if v, ok := seg.dict.Frequency(gram); ok && v > 0.0 {
|
if v, ok := (*Dictionary)(seg).Frequency(gram); ok && v > 0.0 {
|
||||||
result = append(result, gram)
|
result = append(result, gram)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ package jieba
|
|||||||
import "testing"
|
import "testing"
|
||||||
|
|
||||||
var (
|
var (
|
||||||
seg Segmenter
|
seg *Segmenter
|
||||||
testContents = []string{
|
testContents = []string{
|
||||||
"这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
|
"这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
|
||||||
"我不喜欢日本和服。",
|
"我不喜欢日本和服。",
|
||||||
@@ -616,7 +616,11 @@ var (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
seg.LoadDictionaryAt("dict.txt")
|
var err error
|
||||||
|
seg, err = LoadDictionaryAt("dict.txt")
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestCutDAG(t *testing.T) {
|
func TestCutDAG(t *testing.T) {
|
||||||
@@ -715,7 +719,11 @@ func TestCutForSearch(t *testing.T) {
|
|||||||
|
|
||||||
func TestLoadDictionary(t *testing.T) {
|
func TestLoadDictionary(t *testing.T) {
|
||||||
var result []string
|
var result []string
|
||||||
seg.LoadDictionaryAt("foobar.txt")
|
var err error
|
||||||
|
seg, err = LoadDictionaryAt("foobar.txt")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
for index, content := range testContents {
|
for index, content := range testContents {
|
||||||
result = seg.Cut(content, true)
|
result = seg.Cut(content, true)
|
||||||
if len(result) != len(userDictCutResult[index]) {
|
if len(result) != len(userDictCutResult[index]) {
|
||||||
@@ -728,7 +736,10 @@ func TestLoadDictionary(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
seg.LoadDictionaryAt("dict.txt")
|
seg, err = LoadDictionaryAt("dict.txt")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestLoadUserDictionary(t *testing.T) {
|
func TestLoadUserDictionary(t *testing.T) {
|
||||||
@@ -771,7 +782,11 @@ func TestLoadUserDictionary(t *testing.T) {
|
|||||||
t.Fatal(word)
|
t.Fatal(word)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
seg.LoadDictionaryAt("dict.txt")
|
var err error
|
||||||
|
seg, err = LoadDictionaryAt("dict.txt")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func BenchmarkCutNoHMM(b *testing.B) {
|
func BenchmarkCutNoHMM(b *testing.B) {
|
||||||
|
|||||||
@@ -123,12 +123,10 @@ func (seg *Segmenter) cutDetail(sentence string) (results []Segment) {
|
|||||||
func (seg *Segmenter) dag(runes []rune) [][]int {
|
func (seg *Segmenter) dag(runes []rune) [][]int {
|
||||||
n := len(runes)
|
n := len(runes)
|
||||||
dag := make([][]int, n)
|
dag := make([][]int, n)
|
||||||
var frag []rune
|
|
||||||
var i int
|
|
||||||
for k := 0; k < n; k++ {
|
for k := 0; k < n; k++ {
|
||||||
dag[k] = make([]int, 0, 64)
|
dag[k] = make([]int, 0, 64)
|
||||||
i = k
|
i := k
|
||||||
frag = runes[k : k+1]
|
frag := runes[k : k+1]
|
||||||
for {
|
for {
|
||||||
freq, ok := (*Dictionary)(seg).Frequency(string(frag))
|
freq, ok := (*Dictionary)(seg).Frequency(string(frag))
|
||||||
if !ok {
|
if !ok {
|
||||||
@@ -160,9 +158,9 @@ func (seg *Segmenter) calc(runes []rune) []*route {
|
|||||||
n := len(runes)
|
n := len(runes)
|
||||||
rs := make([]*route, n+1)
|
rs := make([]*route, n+1)
|
||||||
rs[n] = &route{frequency: 0.0, index: 0}
|
rs[n] = &route{frequency: 0.0, index: 0}
|
||||||
var r *route
|
|
||||||
for idx := n - 1; idx >= 0; idx-- {
|
for idx := n - 1; idx >= 0; idx-- {
|
||||||
for _, i := range dag[idx] {
|
for _, i := range dag[idx] {
|
||||||
|
var r *route
|
||||||
if freq, ok := (*Dictionary)(seg).Frequency(string(runes[idx : i+1])); ok {
|
if freq, ok := (*Dictionary)(seg).Frequency(string(runes[idx : i+1])); ok {
|
||||||
r = &route{frequency: math.Log(freq) - (*Dictionary)(seg).logTotal + rs[i+1].frequency, index: i}
|
r = &route{frequency: math.Log(freq) - (*Dictionary)(seg).logTotal + rs[i+1].frequency, index: i}
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ var ideographRegexp = regexp.MustCompile(`\p{Han}+`)
|
|||||||
|
|
||||||
// JiebaTokenizer is the beleve tokenizer for jieba.
|
// JiebaTokenizer is the beleve tokenizer for jieba.
|
||||||
type JiebaTokenizer struct {
|
type JiebaTokenizer struct {
|
||||||
seg jieba.Segmenter
|
seg *jieba.Segmenter
|
||||||
hmm, searchMode bool
|
hmm, searchMode bool
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -43,8 +43,7 @@ Parameters:
|
|||||||
this word into "交换", "换机", which are valid Chinese words.
|
this word into "交换", "换机", which are valid Chinese words.
|
||||||
*/
|
*/
|
||||||
func NewJiebaTokenizer(dictFile fs.File, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
func NewJiebaTokenizer(dictFile fs.File, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||||
var seg jieba.Segmenter
|
seg, err := jieba.LoadDictionary(dictFile)
|
||||||
err := seg.LoadDictionary(dictFile)
|
|
||||||
return &JiebaTokenizer{
|
return &JiebaTokenizer{
|
||||||
seg: seg,
|
seg: seg,
|
||||||
hmm: hmm,
|
hmm: hmm,
|
||||||
@@ -73,8 +72,7 @@ Parameters:
|
|||||||
this word into "交换", "换机", which are valid Chinese words.
|
this word into "交换", "换机", which are valid Chinese words.
|
||||||
*/
|
*/
|
||||||
func NewJiebaTokenizerAt(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
func NewJiebaTokenizerAt(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||||
var seg jieba.Segmenter
|
seg, err := jieba.LoadDictionaryAt(dictFilePath)
|
||||||
err := seg.LoadDictionaryAt(dictFilePath)
|
|
||||||
return &JiebaTokenizer{
|
return &JiebaTokenizer{
|
||||||
seg: seg,
|
seg: seg,
|
||||||
hmm: hmm,
|
hmm: hmm,
|
||||||
|
|||||||
Reference in New Issue
Block a user