1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-26 06:50:23 +08:00

code refactor, added more documents

This commit is contained in:
Wang Bin
2015-05-06 12:55:04 +08:00
parent 87caff09cb
commit 122bad0a8d
23 changed files with 228 additions and 142 deletions

View File

@@ -2,9 +2,9 @@ package posseg
import "fmt"
type Tag uint16
type tag uint16
func (t Tag) Tag() string {
func (t tag) position() string {
switch t / 100 {
case 4:
return "S"
@@ -19,31 +19,29 @@ func (t Tag) Tag() string {
}
}
func (t Tag) POS() string {
func (t tag) pos() string {
return poss[t%100]
}
func (t Tag) String() string {
return fmt.Sprintf("(%s, %s)", t.Tag(), t.POS())
}
func NewTag(tag, pos string) (Tag, error) {
tagIndex := -1
func newTag(position, pos string) (tag, error) {
positionIndex := -1
posIndex := -1
for i, t := range tags {
if tag == t {
tagIndex = (i + 1) * 100
for i, p := range positions {
if position == p {
positionIndex = (i + 1) * 100
break
}
}
for i, p := range poss {
if pos == p {
posIndex = i
break
}
}
if tagIndex < 0 || posIndex < 0 {
return 0, fmt.Errorf("Failed to convert %s %s to Tag", tag, pos)
if positionIndex < 0 || posIndex < 0 {
return 0, fmt.Errorf("Failed to convert %s %s to Tag", position, pos)
}
return Tag(tagIndex + posIndex), nil
return tag(positionIndex + posIndex), nil
}
type charStateTabMap map[rune][]uint16
@@ -51,9 +49,8 @@ type charStateTabMap map[rune][]uint16
func (m charStateTabMap) get(key rune) []uint16 {
if value, ok := m[key]; ok {
return value
} else {
return probTransKeys
}
return probTransKeys
}
var (
@@ -6708,6 +6705,6 @@ var (
'\u9fa0': []uint16{413},
}
tags = []string{"B", "E", "M", "S"}
poss = []string{"a", "ad", "ag", "an", "b", "bg", "c", "d", "df", "dg", "e", "en", "f", "g", "h", "i", "in", "j", "jn", "k", "l", "ln", "m", "mg", "mq", "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "o", "p", "q", "qe", "qg", "r", "rg", "rr", "rz", "s", "t", "tg", "u", "ud", "ug", "uj", "ul", "uv", "uz", "v", "vd", "vg", "vi", "vn", "vq", "w", "x", "y", "yg", "z", "zg"}
positions = []string{"B", "E", "M", "S"}
poss = []string{"a", "ad", "ag", "an", "b", "bg", "c", "d", "df", "dg", "e", "en", "f", "g", "h", "i", "in", "j", "jn", "k", "l", "ln", "m", "mg", "mq", "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "o", "p", "q", "qe", "qg", "r", "rg", "rr", "rz", "s", "t", "tg", "u", "ud", "ug", "uj", "ul", "uv", "uz", "v", "vd", "vg", "vi", "vn", "vq", "w", "x", "y", "yg", "z", "zg"}
)

View File

@@ -7,6 +7,7 @@ import (
"github.com/wangbin/jiebago/dictionary"
)
// A Dictionary represents a thread-safe dictionary used for word segmentation.
type Dictionary struct {
total, logTotal float64
freqMap map[string]float64
@@ -14,6 +15,7 @@ type Dictionary struct {
sync.RWMutex
}
// Load loads all tokens from given channel
func (d *Dictionary) Load(ch <-chan dictionary.Token) {
d.Lock()
for token := range ch {
@@ -23,6 +25,7 @@ func (d *Dictionary) Load(ch <-chan dictionary.Token) {
d.updateLogTotal()
}
// AddToken adds one token
func (d *Dictionary) AddToken(token dictionary.Token) {
d.Lock()
d.addToken(token)
@@ -50,6 +53,7 @@ func (d *Dictionary) updateLogTotal() {
d.logTotal = math.Log(d.total)
}
// Frequency returns the frequency and existence of the given word.
func (d *Dictionary) Frequency(key string) (float64, bool) {
d.RLock()
freq, ok := d.freqMap[key]
@@ -57,6 +61,7 @@ func (d *Dictionary) Frequency(key string) (float64, bool) {
return freq, ok
}
// Pos returns the POS and existence of the given word.
func (d *Dictionary) Pos(key string) (string, bool) {
d.RLock()
pos, ok := d.posMap[key]

View File

@@ -17,27 +17,36 @@ var (
reSkipInternal = regexp.MustCompile(`(\r\n|\s)`)
)
// Segment represents a word with its POS.
type Segment struct {
text, pos string
}
// Text returns the Segment's text.
func (s Segment) Text() string {
return s.text
}
// Pos returns the Segment's POS.
func (s Segment) Pos() string {
return s.pos
}
// Segmenter is a Chinese word segmentation struct.
type Segmenter struct {
dict *Dictionary
}
// LoadDictionary loads dictionary from given file name.
// Every time LoadDictionary is called, the previously loaded dictionary will be cleared.
func (seg *Segmenter) LoadDictionary(fileName string) error {
seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
return seg.dict.loadDictionary(fileName)
}
// LoadUserDictionary loads a user-specified dictionary. It must be called
// after LoadDictionary; it will not clear any previously loaded dictionary,
// but will instead override existing entries.
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
return seg.dict.loadDictionary(fileName)
}
@@ -52,19 +61,19 @@ func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment {
next := 0
for i, char := range runes {
pos := posList[i]
switch pos.Tag() {
switch pos.position() {
case "B":
begin = i
case "E":
result <- Segment{string(runes[begin : i+1]), pos.POS()}
result <- Segment{string(runes[begin : i+1]), pos.pos()}
next = i + 1
case "S":
result <- Segment{string(char), pos.POS()}
result <- Segment{string(char), pos.pos()}
next = i + 1
}
}
if next < len(runes) {
result <- Segment{string(runes[next:]), posList[next].POS()}
result <- Segment{string(runes[next:]), posList[next].pos()}
}
close(result)
}()
@@ -117,7 +126,7 @@ func (seg *Segmenter) dag(runes []rune) map[int][]int {
if freq > 0.0 {
dag[k] = append(dag[k], i)
}
i += 1
i++
if i >= n {
break
}
@@ -170,7 +179,7 @@ func (seg *Segmenter) cutDAG(sentence string) <-chan Segment {
routes := seg.calc(runes)
var y int
length := len(runes)
buf := make([]rune, 0)
var buf []rune
for x := 0; x < length; {
y = routes[x].index + 1
frag := runes[x:y]
@@ -253,7 +262,7 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment {
routes := seg.calc(runes)
var y int
length := len(runes)
buf := make([]rune, 0)
var buf []rune
for x := 0; x < length; {
y = routes[x].index + 1
frag := runes[x:y]
@@ -283,6 +292,8 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment {
return result
}
// Cut cuts a sentence into words.
// Parameter hmm controls whether to use the Hidden Markov Model.
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan Segment {
result := make(chan Segment)
var cut cutFunc

View File

@@ -5,8 +5,8 @@ import (
)
var (
seg Segmenter
test_contents = []string{
seg Segmenter
testContents = []string{
"这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。",
"我不喜欢日本和服。",
"雷猴回归人间。",
@@ -273,7 +273,7 @@ func init() {
}
func chanToArray(ch <-chan Segment) []Segment {
result := make([]Segment, 0)
var result []Segment
for word := range ch {
result = append(result, word)
}
@@ -281,7 +281,7 @@ func chanToArray(ch <-chan Segment) []Segment {
}
func TestCut(t *testing.T) {
for index, content := range test_contents {
for index, content := range testContents {
result := chanToArray(seg.Cut(content, true))
if len(defaultCutResult[index]) != len(result) {
t.Errorf("default cut for %s length should be %d not %d\n",
@@ -289,7 +289,7 @@ func TestCut(t *testing.T) {
t.Errorf("expect: %v\n", defaultCutResult[index])
t.Fatalf("got: %v\n", result)
}
for i, _ := range result {
for i := range result {
if result[i] != defaultCutResult[index][i] {
t.Fatalf("expect %s, got %s", defaultCutResult[index][i], result[i])
}
@@ -298,7 +298,7 @@ func TestCut(t *testing.T) {
if len(noHMMCutResult[index]) != len(result) {
t.Fatal(content)
}
for i, _ := range result {
for i := range result {
if result[i] != noHMMCutResult[index][i] {
t.Fatal(content)
}
@@ -320,7 +320,7 @@ func TestBug132(t *testing.T) {
if len(cutResult) != len(result) {
t.Fatal(result)
}
for i, _ := range result {
for i := range result {
if result[i] != cutResult[i] {
t.Fatal(result[i])
}
@@ -349,7 +349,7 @@ func TestBug137(t *testing.T) {
if len(cutResult) != len(result) {
t.Fatal(result)
}
for i, _ := range result {
for i := range result {
if result[i] != cutResult[i] {
t.Fatal(result[i])
}
@@ -404,7 +404,7 @@ func TestUserDict(t *testing.T) {
if len(cutResult) != len(result) {
t.Fatal(result)
}
for i, _ := range result {
for i := range result {
if result[i] != cutResult[i] {
t.Fatal(result[i])
}

View File

@@ -1,15 +1,14 @@
package posseg
const MinFloat = -3.14e100
const minFloat = -3.14e100
type runeFloatMap map[rune]float64
func (m runeFloatMap) get(key rune) float64 {
if value, ok := m[key]; ok {
return value
} else {
return MinFloat
}
return minFloat
}
var probEmit = map[uint16]runeFloatMap{

View File

@@ -11,9 +11,8 @@ type probTransMap map[uint16]float64
func (m probTransMap) Get(key uint16) float64 {
if value, ok := m[key]; ok {
return value
} else {
return inf
}
return inf
}
var (

View File

@@ -31,52 +31,52 @@ func (pss probStates) Swap(i, j int) {
pss[i], pss[j] = pss[j], pss[i]
}
func viterbi(obs []rune) []Tag {
func viterbi(obs []rune) []tag {
obsLength := len(obs)
V := make([]map[uint16]float64, obsLength)
V[0] = make(map[uint16]float64)
mem_path := make([]map[uint16]uint16, obsLength)
mem_path[0] = make(map[uint16]uint16)
memPath := make([]map[uint16]uint16, obsLength)
memPath[0] = make(map[uint16]uint16)
ys := charStateTab.get(obs[0]) // default is all_states
for _, y := range ys {
V[0][y] = probEmit[y].get(obs[0]) + probStart[y]
mem_path[0][y] = 0
memPath[0][y] = 0
}
for t := 1; t < obsLength; t++ {
prev_states := make([]uint16, 0)
for x := range mem_path[t-1] {
var prevStates []uint16
for x := range memPath[t-1] {
if len(probTrans[x]) > 0 {
prev_states = append(prev_states, x)
prevStates = append(prevStates, x)
}
}
//use Go's map to implement Python's Set()
prev_states_expect_next := make(map[uint16]int)
for _, x := range prev_states {
prevStatesExpectNext := make(map[uint16]int)
for _, x := range prevStates {
for y := range probTrans[x] {
prev_states_expect_next[y] = 1
prevStatesExpectNext[y] = 1
}
}
tmp_obs_states := charStateTab.get(obs[t])
tmpObsStates := charStateTab.get(obs[t])
obs_states := make([]uint16, 0)
for index := range tmp_obs_states {
if _, ok := prev_states_expect_next[tmp_obs_states[index]]; ok {
obs_states = append(obs_states, tmp_obs_states[index])
var obsStates []uint16
for index := range tmpObsStates {
if _, ok := prevStatesExpectNext[tmpObsStates[index]]; ok {
obsStates = append(obsStates, tmpObsStates[index])
}
}
if len(obs_states) == 0 {
for key := range prev_states_expect_next {
obs_states = append(obs_states, key)
if len(obsStates) == 0 {
for key := range prevStatesExpectNext {
obsStates = append(obsStates, key)
}
}
if len(obs_states) == 0 {
obs_states = probTransKeys
if len(obsStates) == 0 {
obsStates = probTransKeys
}
mem_path[t] = make(map[uint16]uint16)
memPath[t] = make(map[uint16]uint16)
V[t] = make(map[uint16]float64)
for _, y := range obs_states {
for _, y := range obsStates {
var max, ps probState
for i, y0 := range prev_states {
for i, y0 := range prevStates {
ps = probState{
prob: V[t-1][y0] + probTrans[y0].Get(y) + probEmit[y].get(obs[t]),
state: y0}
@@ -85,23 +85,23 @@ func viterbi(obs []rune) []Tag {
}
}
V[t][y] = max.prob
mem_path[t][y] = max.state
memPath[t][y] = max.state
}
}
last := make(probStates, 0)
length := len(mem_path)
length := len(memPath)
vlength := len(V)
for y := range mem_path[length-1] {
for y := range memPath[length-1] {
ps := probState{prob: V[vlength-1][y], state: y}
last = append(last, ps)
}
sort.Sort(sort.Reverse(last))
state := last[0].state
route := make([]Tag, len(obs))
route := make([]tag, len(obs))
for i := obsLength - 1; i >= 0; i-- {
route[i] = Tag(state)
state = mem_path[i][state]
route[i] = tag(state)
state = memPath[i][state]
}
return route
}

View File

@@ -4,49 +4,49 @@ import (
"testing"
)
var defaultRoute []Tag
var defaultRoute []tag
func init() {
var t Tag
t, _ = NewTag("B", "nr")
var t tag
t, _ = newTag("B", "nr")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("M", "nr")
t, _ = newTag("M", "nr")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("E", "nr")
t, _ = newTag("E", "nr")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("S", "v")
t, _ = newTag("S", "v")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("B", "v")
t, _ = newTag("B", "v")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("E", "v")
t, _ = newTag("E", "v")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("B", "n")
t, _ = newTag("B", "n")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("M", "n")
t, _ = newTag("M", "n")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("E", "n")
t, _ = newTag("E", "n")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("S", "d")
t, _ = newTag("S", "d")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("S", "v")
t, _ = newTag("S", "v")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("S", "n")
t, _ = newTag("S", "n")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("B", "v")
t, _ = newTag("B", "v")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("E", "v")
t, _ = newTag("E", "v")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("B", "nr")
t, _ = newTag("B", "nr")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("M", "nr")
t, _ = newTag("M", "nr")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("M", "nr")
t, _ = newTag("M", "nr")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("M", "nr")
t, _ = newTag("M", "nr")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("E", "nr")
t, _ = newTag("E", "nr")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("S", "zg")
t, _ = newTag("S", "zg")
defaultRoute = append(defaultRoute, t)
}