mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-26 06:50:23 +08:00
code refactor, added more documents
This commit is contained in:
@@ -2,9 +2,9 @@ package posseg
|
||||
|
||||
import "fmt"
|
||||
|
||||
type Tag uint16
|
||||
type tag uint16
|
||||
|
||||
func (t Tag) Tag() string {
|
||||
func (t tag) position() string {
|
||||
switch t / 100 {
|
||||
case 4:
|
||||
return "S"
|
||||
@@ -19,31 +19,29 @@ func (t Tag) Tag() string {
|
||||
}
|
||||
}
|
||||
|
||||
func (t Tag) POS() string {
|
||||
func (t tag) pos() string {
|
||||
return poss[t%100]
|
||||
}
|
||||
|
||||
func (t Tag) String() string {
|
||||
return fmt.Sprintf("(%s, %s)", t.Tag(), t.POS())
|
||||
}
|
||||
|
||||
func NewTag(tag, pos string) (Tag, error) {
|
||||
tagIndex := -1
|
||||
func newTag(position, pos string) (tag, error) {
|
||||
positionIndex := -1
|
||||
posIndex := -1
|
||||
for i, t := range tags {
|
||||
if tag == t {
|
||||
tagIndex = (i + 1) * 100
|
||||
for i, p := range positions {
|
||||
if position == p {
|
||||
positionIndex = (i + 1) * 100
|
||||
break
|
||||
}
|
||||
}
|
||||
for i, p := range poss {
|
||||
if pos == p {
|
||||
posIndex = i
|
||||
break
|
||||
}
|
||||
}
|
||||
if tagIndex < 0 || posIndex < 0 {
|
||||
return 0, fmt.Errorf("Failed to convert %s %s to Tag", tag, pos)
|
||||
if positionIndex < 0 || posIndex < 0 {
|
||||
return 0, fmt.Errorf("Failed to convert %s %s to Tag", position, pos)
|
||||
}
|
||||
return Tag(tagIndex + posIndex), nil
|
||||
return tag(positionIndex + posIndex), nil
|
||||
}
|
||||
|
||||
type charStateTabMap map[rune][]uint16
|
||||
@@ -51,9 +49,8 @@ type charStateTabMap map[rune][]uint16
|
||||
func (m charStateTabMap) get(key rune) []uint16 {
|
||||
if value, ok := m[key]; ok {
|
||||
return value
|
||||
} else {
|
||||
return probTransKeys
|
||||
}
|
||||
return probTransKeys
|
||||
}
|
||||
|
||||
var (
|
||||
@@ -6708,6 +6705,6 @@ var (
|
||||
'\u9fa0': []uint16{413},
|
||||
}
|
||||
|
||||
tags = []string{"B", "E", "M", "S"}
|
||||
poss = []string{"a", "ad", "ag", "an", "b", "bg", "c", "d", "df", "dg", "e", "en", "f", "g", "h", "i", "in", "j", "jn", "k", "l", "ln", "m", "mg", "mq", "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "o", "p", "q", "qe", "qg", "r", "rg", "rr", "rz", "s", "t", "tg", "u", "ud", "ug", "uj", "ul", "uv", "uz", "v", "vd", "vg", "vi", "vn", "vq", "w", "x", "y", "yg", "z", "zg"}
|
||||
positions = []string{"B", "E", "M", "S"}
|
||||
poss = []string{"a", "ad", "ag", "an", "b", "bg", "c", "d", "df", "dg", "e", "en", "f", "g", "h", "i", "in", "j", "jn", "k", "l", "ln", "m", "mg", "mq", "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "o", "p", "q", "qe", "qg", "r", "rg", "rr", "rz", "s", "t", "tg", "u", "ud", "ug", "uj", "ul", "uv", "uz", "v", "vd", "vg", "vi", "vn", "vq", "w", "x", "y", "yg", "z", "zg"}
|
||||
)
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"github.com/wangbin/jiebago/dictionary"
|
||||
)
|
||||
|
||||
// A Dictionary represents a thread-safe dictionary used for word segmentation.
|
||||
type Dictionary struct {
|
||||
total, logTotal float64
|
||||
freqMap map[string]float64
|
||||
@@ -14,6 +15,7 @@ type Dictionary struct {
|
||||
sync.RWMutex
|
||||
}
|
||||
|
||||
// Load loads all tokens from the given channel.
|
||||
func (d *Dictionary) Load(ch <-chan dictionary.Token) {
|
||||
d.Lock()
|
||||
for token := range ch {
|
||||
@@ -23,6 +25,7 @@ func (d *Dictionary) Load(ch <-chan dictionary.Token) {
|
||||
d.updateLogTotal()
|
||||
}
|
||||
|
||||
// AddToken adds one token
|
||||
func (d *Dictionary) AddToken(token dictionary.Token) {
|
||||
d.Lock()
|
||||
d.addToken(token)
|
||||
@@ -50,6 +53,7 @@ func (d *Dictionary) updateLogTotal() {
|
||||
d.logTotal = math.Log(d.total)
|
||||
}
|
||||
|
||||
// Frequency returns the frequency and existence of the given word.
|
||||
func (d *Dictionary) Frequency(key string) (float64, bool) {
|
||||
d.RLock()
|
||||
freq, ok := d.freqMap[key]
|
||||
@@ -57,6 +61,7 @@ func (d *Dictionary) Frequency(key string) (float64, bool) {
|
||||
return freq, ok
|
||||
}
|
||||
|
||||
// Pos returns the POS and existence of the given word.
|
||||
func (d *Dictionary) Pos(key string) (string, bool) {
|
||||
d.RLock()
|
||||
pos, ok := d.posMap[key]
|
||||
|
||||
@@ -17,27 +17,36 @@ var (
|
||||
reSkipInternal = regexp.MustCompile(`(\r\n|\s)`)
|
||||
)
|
||||
|
||||
// Segment represents a word with its POS.
|
||||
type Segment struct {
|
||||
text, pos string
|
||||
}
|
||||
|
||||
// Text returns the Segment's text.
|
||||
func (s Segment) Text() string {
|
||||
return s.text
|
||||
}
|
||||
|
||||
// Pos returns the Segment's POS.
|
||||
func (s Segment) Pos() string {
|
||||
return s.pos
|
||||
}
|
||||
|
||||
// Segmenter is a Chinese word segmentation struct.
|
||||
type Segmenter struct {
|
||||
dict *Dictionary
|
||||
}
|
||||
|
||||
// LoadDictionary loads dictionary from given file name.
|
||||
// Every time LoadDictionary is called, the previously loaded dictionary will be cleared.
|
||||
func (seg *Segmenter) LoadDictionary(fileName string) error {
|
||||
seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
||||
return seg.dict.loadDictionary(fileName)
|
||||
}
|
||||
|
||||
// LoadUserDictionary loads a user specified dictionary, it must be called
|
||||
// after LoadDictionary, and it will not clear any previously loaded dictionary,
|
||||
// instead it will override existing entries.
|
||||
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
|
||||
return seg.dict.loadDictionary(fileName)
|
||||
}
|
||||
@@ -52,19 +61,19 @@ func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment {
|
||||
next := 0
|
||||
for i, char := range runes {
|
||||
pos := posList[i]
|
||||
switch pos.Tag() {
|
||||
switch pos.position() {
|
||||
case "B":
|
||||
begin = i
|
||||
case "E":
|
||||
result <- Segment{string(runes[begin : i+1]), pos.POS()}
|
||||
result <- Segment{string(runes[begin : i+1]), pos.pos()}
|
||||
next = i + 1
|
||||
case "S":
|
||||
result <- Segment{string(char), pos.POS()}
|
||||
result <- Segment{string(char), pos.pos()}
|
||||
next = i + 1
|
||||
}
|
||||
}
|
||||
if next < len(runes) {
|
||||
result <- Segment{string(runes[next:]), posList[next].POS()}
|
||||
result <- Segment{string(runes[next:]), posList[next].pos()}
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
@@ -117,7 +126,7 @@ func (seg *Segmenter) dag(runes []rune) map[int][]int {
|
||||
if freq > 0.0 {
|
||||
dag[k] = append(dag[k], i)
|
||||
}
|
||||
i += 1
|
||||
i++
|
||||
if i >= n {
|
||||
break
|
||||
}
|
||||
@@ -170,7 +179,7 @@ func (seg *Segmenter) cutDAG(sentence string) <-chan Segment {
|
||||
routes := seg.calc(runes)
|
||||
var y int
|
||||
length := len(runes)
|
||||
buf := make([]rune, 0)
|
||||
var buf []rune
|
||||
for x := 0; x < length; {
|
||||
y = routes[x].index + 1
|
||||
frag := runes[x:y]
|
||||
@@ -253,7 +262,7 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment {
|
||||
routes := seg.calc(runes)
|
||||
var y int
|
||||
length := len(runes)
|
||||
buf := make([]rune, 0)
|
||||
var buf []rune
|
||||
for x := 0; x < length; {
|
||||
y = routes[x].index + 1
|
||||
frag := runes[x:y]
|
||||
@@ -283,6 +292,8 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment {
|
||||
return result
|
||||
}
|
||||
|
||||
// Cut cuts a sentence into words.
|
||||
// Parameter hmm controls whether to use the Hidden Markov Model.
|
||||
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan Segment {
|
||||
result := make(chan Segment)
|
||||
var cut cutFunc
|
||||
|
||||
@@ -5,8 +5,8 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
seg Segmenter
|
||||
test_contents = []string{
|
||||
seg Segmenter
|
||||
testContents = []string{
|
||||
"这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
|
||||
"我不喜欢日本和服。",
|
||||
"雷猴回归人间。",
|
||||
@@ -273,7 +273,7 @@ func init() {
|
||||
}
|
||||
|
||||
func chanToArray(ch <-chan Segment) []Segment {
|
||||
result := make([]Segment, 0)
|
||||
var result []Segment
|
||||
for word := range ch {
|
||||
result = append(result, word)
|
||||
}
|
||||
@@ -281,7 +281,7 @@ func chanToArray(ch <-chan Segment) []Segment {
|
||||
}
|
||||
|
||||
func TestCut(t *testing.T) {
|
||||
for index, content := range test_contents {
|
||||
for index, content := range testContents {
|
||||
result := chanToArray(seg.Cut(content, true))
|
||||
if len(defaultCutResult[index]) != len(result) {
|
||||
t.Errorf("default cut for %s length should be %d not %d\n",
|
||||
@@ -289,7 +289,7 @@ func TestCut(t *testing.T) {
|
||||
t.Errorf("expect: %v\n", defaultCutResult[index])
|
||||
t.Fatalf("got: %v\n", result)
|
||||
}
|
||||
for i, _ := range result {
|
||||
for i := range result {
|
||||
if result[i] != defaultCutResult[index][i] {
|
||||
t.Fatalf("expect %s, got %s", defaultCutResult[index][i], result[i])
|
||||
}
|
||||
@@ -298,7 +298,7 @@ func TestCut(t *testing.T) {
|
||||
if len(noHMMCutResult[index]) != len(result) {
|
||||
t.Fatal(content)
|
||||
}
|
||||
for i, _ := range result {
|
||||
for i := range result {
|
||||
if result[i] != noHMMCutResult[index][i] {
|
||||
t.Fatal(content)
|
||||
}
|
||||
@@ -320,7 +320,7 @@ func TestBug132(t *testing.T) {
|
||||
if len(cutResult) != len(result) {
|
||||
t.Fatal(result)
|
||||
}
|
||||
for i, _ := range result {
|
||||
for i := range result {
|
||||
if result[i] != cutResult[i] {
|
||||
t.Fatal(result[i])
|
||||
}
|
||||
@@ -349,7 +349,7 @@ func TestBug137(t *testing.T) {
|
||||
if len(cutResult) != len(result) {
|
||||
t.Fatal(result)
|
||||
}
|
||||
for i, _ := range result {
|
||||
for i := range result {
|
||||
if result[i] != cutResult[i] {
|
||||
t.Fatal(result[i])
|
||||
}
|
||||
@@ -404,7 +404,7 @@ func TestUserDict(t *testing.T) {
|
||||
if len(cutResult) != len(result) {
|
||||
t.Fatal(result)
|
||||
}
|
||||
for i, _ := range result {
|
||||
for i := range result {
|
||||
if result[i] != cutResult[i] {
|
||||
t.Fatal(result[i])
|
||||
}
|
||||
|
||||
@@ -1,15 +1,14 @@
|
||||
package posseg
|
||||
|
||||
const MinFloat = -3.14e100
|
||||
const minFloat = -3.14e100
|
||||
|
||||
type runeFloatMap map[rune]float64
|
||||
|
||||
func (m runeFloatMap) get(key rune) float64 {
|
||||
if value, ok := m[key]; ok {
|
||||
return value
|
||||
} else {
|
||||
return MinFloat
|
||||
}
|
||||
return minFloat
|
||||
}
|
||||
|
||||
var probEmit = map[uint16]runeFloatMap{
|
||||
|
||||
@@ -11,9 +11,8 @@ type probTransMap map[uint16]float64
|
||||
func (m probTransMap) Get(key uint16) float64 {
|
||||
if value, ok := m[key]; ok {
|
||||
return value
|
||||
} else {
|
||||
return inf
|
||||
}
|
||||
return inf
|
||||
}
|
||||
|
||||
var (
|
||||
|
||||
@@ -31,52 +31,52 @@ func (pss probStates) Swap(i, j int) {
|
||||
pss[i], pss[j] = pss[j], pss[i]
|
||||
}
|
||||
|
||||
func viterbi(obs []rune) []Tag {
|
||||
func viterbi(obs []rune) []tag {
|
||||
obsLength := len(obs)
|
||||
V := make([]map[uint16]float64, obsLength)
|
||||
V[0] = make(map[uint16]float64)
|
||||
mem_path := make([]map[uint16]uint16, obsLength)
|
||||
mem_path[0] = make(map[uint16]uint16)
|
||||
memPath := make([]map[uint16]uint16, obsLength)
|
||||
memPath[0] = make(map[uint16]uint16)
|
||||
ys := charStateTab.get(obs[0]) // default is all_states
|
||||
for _, y := range ys {
|
||||
V[0][y] = probEmit[y].get(obs[0]) + probStart[y]
|
||||
mem_path[0][y] = 0
|
||||
memPath[0][y] = 0
|
||||
}
|
||||
for t := 1; t < obsLength; t++ {
|
||||
prev_states := make([]uint16, 0)
|
||||
for x := range mem_path[t-1] {
|
||||
var prevStates []uint16
|
||||
for x := range memPath[t-1] {
|
||||
if len(probTrans[x]) > 0 {
|
||||
prev_states = append(prev_states, x)
|
||||
prevStates = append(prevStates, x)
|
||||
}
|
||||
}
|
||||
//use Go's map to implement Python's Set()
|
||||
prev_states_expect_next := make(map[uint16]int)
|
||||
for _, x := range prev_states {
|
||||
prevStatesExpectNext := make(map[uint16]int)
|
||||
for _, x := range prevStates {
|
||||
for y := range probTrans[x] {
|
||||
prev_states_expect_next[y] = 1
|
||||
prevStatesExpectNext[y] = 1
|
||||
}
|
||||
}
|
||||
tmp_obs_states := charStateTab.get(obs[t])
|
||||
tmpObsStates := charStateTab.get(obs[t])
|
||||
|
||||
obs_states := make([]uint16, 0)
|
||||
for index := range tmp_obs_states {
|
||||
if _, ok := prev_states_expect_next[tmp_obs_states[index]]; ok {
|
||||
obs_states = append(obs_states, tmp_obs_states[index])
|
||||
var obsStates []uint16
|
||||
for index := range tmpObsStates {
|
||||
if _, ok := prevStatesExpectNext[tmpObsStates[index]]; ok {
|
||||
obsStates = append(obsStates, tmpObsStates[index])
|
||||
}
|
||||
}
|
||||
if len(obs_states) == 0 {
|
||||
for key := range prev_states_expect_next {
|
||||
obs_states = append(obs_states, key)
|
||||
if len(obsStates) == 0 {
|
||||
for key := range prevStatesExpectNext {
|
||||
obsStates = append(obsStates, key)
|
||||
}
|
||||
}
|
||||
if len(obs_states) == 0 {
|
||||
obs_states = probTransKeys
|
||||
if len(obsStates) == 0 {
|
||||
obsStates = probTransKeys
|
||||
}
|
||||
mem_path[t] = make(map[uint16]uint16)
|
||||
memPath[t] = make(map[uint16]uint16)
|
||||
V[t] = make(map[uint16]float64)
|
||||
for _, y := range obs_states {
|
||||
for _, y := range obsStates {
|
||||
var max, ps probState
|
||||
for i, y0 := range prev_states {
|
||||
for i, y0 := range prevStates {
|
||||
ps = probState{
|
||||
prob: V[t-1][y0] + probTrans[y0].Get(y) + probEmit[y].get(obs[t]),
|
||||
state: y0}
|
||||
@@ -85,23 +85,23 @@ func viterbi(obs []rune) []Tag {
|
||||
}
|
||||
}
|
||||
V[t][y] = max.prob
|
||||
mem_path[t][y] = max.state
|
||||
memPath[t][y] = max.state
|
||||
}
|
||||
}
|
||||
last := make(probStates, 0)
|
||||
length := len(mem_path)
|
||||
length := len(memPath)
|
||||
vlength := len(V)
|
||||
for y := range mem_path[length-1] {
|
||||
for y := range memPath[length-1] {
|
||||
ps := probState{prob: V[vlength-1][y], state: y}
|
||||
last = append(last, ps)
|
||||
}
|
||||
sort.Sort(sort.Reverse(last))
|
||||
state := last[0].state
|
||||
route := make([]Tag, len(obs))
|
||||
route := make([]tag, len(obs))
|
||||
|
||||
for i := obsLength - 1; i >= 0; i-- {
|
||||
route[i] = Tag(state)
|
||||
state = mem_path[i][state]
|
||||
route[i] = tag(state)
|
||||
state = memPath[i][state]
|
||||
}
|
||||
return route
|
||||
}
|
||||
|
||||
@@ -4,49 +4,49 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
var defaultRoute []Tag
|
||||
var defaultRoute []tag
|
||||
|
||||
func init() {
|
||||
var t Tag
|
||||
t, _ = NewTag("B", "nr")
|
||||
var t tag
|
||||
t, _ = newTag("B", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("M", "nr")
|
||||
t, _ = newTag("M", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("E", "nr")
|
||||
t, _ = newTag("E", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("S", "v")
|
||||
t, _ = newTag("S", "v")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("B", "v")
|
||||
t, _ = newTag("B", "v")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("E", "v")
|
||||
t, _ = newTag("E", "v")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("B", "n")
|
||||
t, _ = newTag("B", "n")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("M", "n")
|
||||
t, _ = newTag("M", "n")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("E", "n")
|
||||
t, _ = newTag("E", "n")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("S", "d")
|
||||
t, _ = newTag("S", "d")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("S", "v")
|
||||
t, _ = newTag("S", "v")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("S", "n")
|
||||
t, _ = newTag("S", "n")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("B", "v")
|
||||
t, _ = newTag("B", "v")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("E", "v")
|
||||
t, _ = newTag("E", "v")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("B", "nr")
|
||||
t, _ = newTag("B", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("M", "nr")
|
||||
t, _ = newTag("M", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("M", "nr")
|
||||
t, _ = newTag("M", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("M", "nr")
|
||||
t, _ = newTag("M", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("E", "nr")
|
||||
t, _ = newTag("E", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("S", "zg")
|
||||
t, _ = newTag("S", "zg")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user