mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-23 04:30:44 +08:00
Use only one dict to store words and prefixes; this corresponds to jieba commit #f808ea0ebba7056fa1b55081b474329e556933a8
This commit is contained in:
15
jieba.go
15
jieba.go
@@ -82,11 +82,12 @@ func GetDAG(sentence string) map[int][]int {
|
|||||||
i = k
|
i = k
|
||||||
frag = string(runes[k])
|
frag = string(runes[k])
|
||||||
for {
|
for {
|
||||||
if !T.Nodes.Contains(frag) {
|
if freq, ok := T.Freq[frag]; !ok {
|
||||||
break
|
break
|
||||||
}
|
} else {
|
||||||
if _, ok := T.Freq[frag]; ok {
|
if freq > 0.0 {
|
||||||
tmpList = append(tmpList, i)
|
tmpList = append(tmpList, i)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
i += 1
|
i += 1
|
||||||
if i >= n {
|
if i >= n {
|
||||||
@@ -152,7 +153,7 @@ func cut_DAG(sentence string) []string {
|
|||||||
buf = make([]rune, 0)
|
buf = make([]rune, 0)
|
||||||
} else {
|
} else {
|
||||||
bufString := string(buf)
|
bufString := string(buf)
|
||||||
if _, ok := T.Freq[bufString]; !ok {
|
if v, ok := T.Freq[bufString]; !ok || v == 0.0 {
|
||||||
recognized := finalseg.Cut(bufString)
|
recognized := finalseg.Cut(bufString)
|
||||||
for _, t := range recognized {
|
for _, t := range recognized {
|
||||||
result = append(result, t)
|
result = append(result, t)
|
||||||
@@ -175,7 +176,7 @@ func cut_DAG(sentence string) []string {
|
|||||||
result = append(result, string(buf))
|
result = append(result, string(buf))
|
||||||
} else {
|
} else {
|
||||||
bufString := string(buf)
|
bufString := string(buf)
|
||||||
if _, ok := T.Freq[bufString]; !ok {
|
if v, ok := T.Freq[bufString]; !ok || v == 0.0 {
|
||||||
recognized := finalseg.Cut(bufString)
|
recognized := finalseg.Cut(bufString)
|
||||||
for _, t := range recognized {
|
for _, t := range recognized {
|
||||||
result = append(result, t)
|
result = append(result, t)
|
||||||
@@ -319,7 +320,7 @@ func CutForSearch(sentence string, hmm bool) []string {
|
|||||||
var gram2 string
|
var gram2 string
|
||||||
for i := 0; i < len(runes)-increment+1; i++ {
|
for i := 0; i < len(runes)-increment+1; i++ {
|
||||||
gram2 = string(runes[i : i+increment])
|
gram2 = string(runes[i : i+increment])
|
||||||
if _, ok := T.Freq[gram2]; ok {
|
if v, ok := T.Freq[gram2]; ok && v > 0.0 {
|
||||||
result = append(result, gram2)
|
result = append(result, gram2)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
30
trie_node.go
30
trie_node.go
@@ -6,7 +6,6 @@ import (
|
|||||||
"crypto/md5"
|
"crypto/md5"
|
||||||
"encoding/gob"
|
"encoding/gob"
|
||||||
"fmt"
|
"fmt"
|
||||||
mapset "github.com/deckarep/golang-set"
|
|
||||||
"log"
|
"log"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
@@ -17,7 +16,6 @@ import (
|
|||||||
var T *Trie
|
var T *Trie
|
||||||
|
|
||||||
type Trie struct {
|
type Trie struct {
|
||||||
Nodes mapset.Set
|
|
||||||
Total float64
|
Total float64
|
||||||
Freq map[string]float64
|
Freq map[string]float64
|
||||||
}
|
}
|
||||||
@@ -25,11 +23,7 @@ type Trie struct {
|
|||||||
func (t Trie) MarshalBinary() ([]byte, error) {
|
func (t Trie) MarshalBinary() ([]byte, error) {
|
||||||
var b bytes.Buffer
|
var b bytes.Buffer
|
||||||
enc := gob.NewEncoder(&b)
|
enc := gob.NewEncoder(&b)
|
||||||
err := enc.Encode(t.Nodes.ToSlice())
|
err := enc.Encode(t.Total)
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
err = enc.Encode(t.Total)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -43,13 +37,7 @@ func (t Trie) MarshalBinary() ([]byte, error) {
|
|||||||
func (t *Trie) UnmarshalBinary(data []byte) error {
|
func (t *Trie) UnmarshalBinary(data []byte) error {
|
||||||
b := bytes.NewBuffer(data)
|
b := bytes.NewBuffer(data)
|
||||||
dec := gob.NewDecoder(b)
|
dec := gob.NewDecoder(b)
|
||||||
var nodes []interface{}
|
err := dec.Decode(&t.Total)
|
||||||
err := dec.Decode(&nodes)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
t.Nodes = mapset.NewSetFromSlice(nodes)
|
|
||||||
err = dec.Decode(&t.Total)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@@ -111,8 +99,7 @@ func newTrie(fileName string) (*Trie, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !isDictCached {
|
if !isDictCached {
|
||||||
trie = &Trie{Nodes: mapset.NewSet(), Total: 0.0,
|
trie = &Trie{Total: 0.0, Freq: make(map[string]float64)}
|
||||||
Freq: make(map[string]float64)}
|
|
||||||
|
|
||||||
file, openError := os.Open(filePath)
|
file, openError := os.Open(filePath)
|
||||||
if openError != nil {
|
if openError != nil {
|
||||||
@@ -125,7 +112,10 @@ func newTrie(fileName string) (*Trie, error) {
|
|||||||
line := scanner.Text()
|
line := scanner.Text()
|
||||||
words := strings.Split(line, " ")
|
words := strings.Split(line, " ")
|
||||||
word, freqStr := words[0], words[1]
|
word, freqStr := words[0], words[1]
|
||||||
freq, _ := strconv.ParseFloat(freqStr, 64)
|
freq, err := strconv.ParseFloat(freqStr, 64)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
trie.addWord(word, freq)
|
trie.addWord(word, freq)
|
||||||
}
|
}
|
||||||
if scanErr := scanner.Err(); scanErr != nil {
|
if scanErr := scanner.Err(); scanErr != nil {
|
||||||
@@ -155,10 +145,12 @@ func (t *Trie) addWord(word string, freq float64) {
|
|||||||
runes := []rune(word)
|
runes := []rune(word)
|
||||||
count := len(runes)
|
count := len(runes)
|
||||||
for i := 0; i < count; i++ {
|
for i := 0; i < count; i++ {
|
||||||
t.Nodes.Add(string(runes[:i+1]))
|
wfrag := string(runes[0 : i+1])
|
||||||
|
if _, ok := t.Freq[wfrag]; !ok {
|
||||||
|
t.Freq[wfrag] = 0.0
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func addWord(word string, freq float64, tag string) {
|
func addWord(word string, freq float64, tag string) {
|
||||||
if len(tag) > 0 {
|
if len(tag) > 0 {
|
||||||
UserWordTagTab[word] = strings.TrimSpace(tag)
|
UserWordTagTab[word] = strings.TrimSpace(tag)
|
||||||
|
|||||||
Reference in New Issue
Block a user