1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-25 14:10:31 +08:00

tweak style

This commit is contained in:
Wang Bin
2015-05-04 15:11:55 +08:00
parent edef39719d
commit 500e6bd10e
13 changed files with 469 additions and 1602 deletions

69
posseg/dictionary.go Normal file
View File

@@ -0,0 +1,69 @@
package posseg
import (
"math"
"sync"
"github.com/wangbin/jiebago/dictionary"
)
// Dictionary stores the word statistics consulted by the part-of-speech
// segmenter. Its methods take the embedded RWMutex around map access.
type Dictionary struct {
	// total accumulates the frequency of every token that has been added.
	total float64
	// logTotal caches math.Log(total); it is refreshed by updateLogTotal.
	logTotal float64
	// freqMap maps a word to its frequency. Prefixes of added words are also
	// stored, with a frequency of 0 unless they were added as words themselves.
	freqMap map[string]float64
	// posMap maps a word to its part-of-speech tag, for tokens that carry one.
	posMap map[string]string
	sync.RWMutex
}
// Load drains ch, adding every token it yields to the dictionary, and then
// refreshes the cached logarithm of the total frequency.
//
// The write lock is held for the whole call, including the logTotal refresh,
// so total and logTotal are never touched outside the lock. Load returns only
// after the sender has closed ch.
func (d *Dictionary) Load(ch <-chan dictionary.Token) {
	d.Lock()
	// Deferred so the lock is released even if adding a token panics.
	defer d.Unlock()

	for token := range ch {
		d.addToken(token)
	}
	d.updateLogTotal()
}
// AddToken adds a single token to the dictionary and refreshes the cached
// logarithm of the total frequency.
//
// Both steps run under the write lock so that concurrent AddToken or Load
// calls cannot interleave between updating total and recomputing logTotal.
func (d *Dictionary) AddToken(token dictionary.Token) {
	d.Lock()
	// Deferred so the lock is released even if adding the token panics.
	defer d.Unlock()

	d.addToken(token)
	d.updateLogTotal()
}
// addToken records token in the dictionary without taking any lock; Load and
// AddToken call it while holding the write lock.
//
// It stores the token's frequency under its text, adds that frequency to
// total, registers every rune-wise prefix of the text that is not yet known
// with a frequency of 0, and, when the token carries a non-empty
// part-of-speech tag, stores the tag under the text.
func (d *Dictionary) addToken(token dictionary.Token) {
	// Writing to a nil map panics, so make a zero-value Dictionary usable by
	// allocating the maps on first use.
	if d.freqMap == nil {
		d.freqMap = make(map[string]float64)
	}
	if d.posMap == nil {
		d.posMap = make(map[string]string)
	}

	text, freq := token.Text(), token.Frequency()
	d.freqMap[text] = freq
	d.total += freq

	// Prefixes are cut on rune boundaries so multi-byte characters stay whole.
	runes := []rune(text)
	for end := 1; end <= len(runes); end++ {
		prefix := string(runes[:end])
		if _, known := d.freqMap[prefix]; !known {
			d.freqMap[prefix] = 0.0
		}
	}

	if pos := token.Pos(); len(pos) > 0 {
		d.posMap[text] = pos
	}
}
// updateLogTotal recomputes logTotal as the natural logarithm of total.
// It takes no lock itself.
// NOTE(review): while total is still 0, math.Log returns -Inf — confirm that
// readers of logTotal tolerate that.
func (d *Dictionary) updateLogTotal() {
d.logTotal = math.Log(d.total)
}
// Frequency returns the frequency stored for key and reports whether key is
// present in the dictionary. Prefixes of added words that were never added as
// words themselves are present with a frequency of 0.
//
// The receiver is a pointer so the read lock is taken on the dictionary's own
// mutex; a value receiver would copy the struct and lock only the copy.
func (d *Dictionary) Frequency(key string) (float64, bool) {
	d.RLock()
	defer d.RUnlock()

	freq, ok := d.freqMap[key]
	return freq, ok
}
// Pos returns the part-of-speech tag stored for key and reports whether a tag
// is present for it.
//
// The receiver is a pointer so the read lock is taken on the dictionary's own
// mutex; a value receiver would copy the struct and lock only the copy.
func (d *Dictionary) Pos(key string) (string, bool) {
	d.RLock()
	defer d.RUnlock()

	pos, ok := d.posMap[key]
	return pos, ok
}
// loadDictionary hands d and fileName to dictionary.LoadDictionary and
// returns its error unchanged.
// NOTE(review): presumably LoadDictionary reads fileName and feeds the parsed
// tokens back through d.Load — confirm in the dictionary package.
func (d *Dictionary) loadDictionary(fileName string) error {
return dictionary.LoadDictionary(d, fileName)
}

View File

@@ -4,7 +4,6 @@ import (
"math"
"regexp"
"github.com/wangbin/jiebago/dictionary"
"github.com/wangbin/jiebago/util"
)
@@ -31,11 +30,16 @@ func (s Segment) Pos() string {
}
type Segmenter struct {
*dictionary.Dictionary
dict *Dictionary
}
func New() *Segmenter {
return &Segmenter{dictionary.New()}
func (seg *Segmenter) LoadDictionary(fileName string) error {
seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
return seg.dict.loadDictionary(fileName)
}
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
return seg.dict.loadDictionary(fileName)
}
func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment {
@@ -106,7 +110,7 @@ func (seg *Segmenter) dag(runes []rune) map[int][]int {
i = k
frag = runes[k : k+1]
for {
freq, ok := seg.Frequency(string(frag))
freq, ok := seg.dict.Frequency(string(frag))
if !ok {
break
}
@@ -136,14 +140,13 @@ func (seg *Segmenter) calc(runes []rune) map[int]route {
n := len(runes)
rs := make(map[int]route)
rs[n] = route{frequency: 0.0, index: 0}
logTotal := seg.LogTotal()
var r route
for idx := n - 1; idx >= 0; idx-- {
for _, i := range dag[idx] {
if freq, ok := seg.Frequency(string(runes[idx : i+1])); ok {
r = route{frequency: math.Log(freq) - logTotal + rs[i+1].frequency, index: i}
if freq, ok := seg.dict.Frequency(string(runes[idx : i+1])); ok {
r = route{frequency: math.Log(freq) - seg.dict.logTotal + rs[i+1].frequency, index: i}
} else {
r = route{frequency: math.Log(1.0) - logTotal + rs[i+1].frequency, index: i}
r = route{frequency: math.Log(1.0) - seg.dict.logTotal + rs[i+1].frequency, index: i}
}
if v, ok := rs[idx]; !ok {
rs[idx] = r
@@ -177,21 +180,21 @@ func (seg *Segmenter) cutDAG(sentence string) <-chan Segment {
if len(buf) > 0 {
bufString := string(buf)
if len(buf) == 1 {
if tag, ok := seg.Pos(bufString); ok {
if tag, ok := seg.dict.Pos(bufString); ok {
result <- Segment{bufString, tag}
} else {
result <- Segment{bufString, "x"}
}
buf = make([]rune, 0)
} else {
if v, ok := seg.Frequency(bufString); !ok || v == 0.0 {
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
for t := range seg.cutDetail(bufString) {
result <- t
}
} else {
for _, elem := range buf {
selem := string(elem)
if tag, ok := seg.Pos(selem); ok {
if tag, ok := seg.dict.Pos(selem); ok {
result <- Segment{selem, tag}
} else {
result <- Segment{selem, "x"}
@@ -203,7 +206,7 @@ func (seg *Segmenter) cutDAG(sentence string) <-chan Segment {
}
}
word := string(frag)
if tag, ok := seg.Pos(word); ok {
if tag, ok := seg.dict.Pos(word); ok {
result <- Segment{word, tag}
} else {
result <- Segment{word, "x"}
@@ -215,20 +218,20 @@ func (seg *Segmenter) cutDAG(sentence string) <-chan Segment {
if len(buf) > 0 {
bufString := string(buf)
if len(buf) == 1 {
if tag, ok := seg.Pos(bufString); ok {
if tag, ok := seg.dict.Pos(bufString); ok {
result <- Segment{bufString, tag}
} else {
result <- Segment{bufString, "x"}
}
} else {
if v, ok := seg.Frequency(bufString); !ok || v == 0.0 {
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
for t := range seg.cutDetail(bufString) {
result <- t
}
} else {
for _, elem := range buf {
selem := string(elem)
if tag, ok := seg.Pos(selem); ok {
if tag, ok := seg.dict.Pos(selem); ok {
result <- Segment{selem, tag}
} else {
result <- Segment{selem, "x"}
@@ -263,7 +266,7 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment {
buf = make([]rune, 0)
}
word := string(frag)
if tag, ok := seg.Pos(word); ok {
if tag, ok := seg.dict.Pos(word); ok {
result <- Segment{word, tag}
} else {
result <- Segment{word, "x"}

View File

@@ -5,7 +5,7 @@ import (
)
var (
seg *Segmenter
seg Segmenter
test_contents = []string{
"这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。",
"我不喜欢日本和服。",
@@ -269,7 +269,6 @@ var (
)
func init() {
seg = New()
seg.LoadDictionary("../dict.txt")
}