1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-07-01 01:20:29 +08:00

code refactor, added more documents

This commit is contained in:
Wang Bin
2015-05-06 12:55:04 +08:00
parent 87caff09cb
commit 122bad0a8d
23 changed files with 228 additions and 142 deletions

View File

@@ -17,27 +17,36 @@ var (
reSkipInternal = regexp.MustCompile(`(\r\n|\s)`)
)
// Segment pairs a segmented word with its part-of-speech (POS) tag.
type Segment struct {
	text string // the word itself
	pos  string // the POS tag attached to the word
}

// Text reports the word stored in the Segment.
func (s Segment) Text() string { return s.text }

// Pos reports the POS tag stored in the Segment.
func (s Segment) Pos() string { return s.pos }
// Segmenter is a Chinese word segmentation struct.
// Its dictionary is installed by LoadDictionary and can be extended
// afterwards with LoadUserDictionary.
type Segmenter struct {
// dict is the dictionary currently in use; it is nil until
// LoadDictionary has been called.
dict *Dictionary
}
// LoadDictionary loads the dictionary stored in the file named fileName.
// Every time LoadDictionary is called, the previously loaded dictionary is
// discarded and replaced by a fresh, empty one before loading starts, so the
// old entries are gone even when loading reports an error.
// It returns whatever error the underlying dictionary load reports.
func (seg *Segmenter) LoadDictionary(fileName string) error {
	fresh := new(Dictionary)
	fresh.freqMap = map[string]float64{}
	fresh.posMap = map[string]string{}
	// Install the empty dictionary first, then fill it from the file.
	seg.dict = fresh
	return fresh.loadDictionary(fileName)
}
// LoadUserDictionary loads a user-specified dictionary from the file named
// fileName. It is meant to be called after LoadDictionary: it does not clear
// the previously loaded dictionary but loads the user file into the same
// dictionary, overriding existing entries.
//
// If no dictionary has been loaded yet, an empty one is created first so that
// the call does not operate on a nil dictionary; the segmenter then only
// knows the entries from the user file.
// It returns whatever error the underlying dictionary load reports.
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
	if seg.dict == nil {
		// Same construction LoadDictionary uses for a fresh dictionary.
		seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
	}
	return seg.dict.loadDictionary(fileName)
}
@@ -52,19 +61,19 @@ func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment {
next := 0
for i, char := range runes {
pos := posList[i]
switch pos.Tag() {
switch pos.position() {
case "B":
begin = i
case "E":
result <- Segment{string(runes[begin : i+1]), pos.POS()}
result <- Segment{string(runes[begin : i+1]), pos.pos()}
next = i + 1
case "S":
result <- Segment{string(char), pos.POS()}
result <- Segment{string(char), pos.pos()}
next = i + 1
}
}
if next < len(runes) {
result <- Segment{string(runes[next:]), posList[next].POS()}
result <- Segment{string(runes[next:]), posList[next].pos()}
}
close(result)
}()
@@ -117,7 +126,7 @@ func (seg *Segmenter) dag(runes []rune) map[int][]int {
if freq > 0.0 {
dag[k] = append(dag[k], i)
}
i += 1
i++
if i >= n {
break
}
@@ -170,7 +179,7 @@ func (seg *Segmenter) cutDAG(sentence string) <-chan Segment {
routes := seg.calc(runes)
var y int
length := len(runes)
buf := make([]rune, 0)
var buf []rune
for x := 0; x < length; {
y = routes[x].index + 1
frag := runes[x:y]
@@ -253,7 +262,7 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment {
routes := seg.calc(runes)
var y int
length := len(runes)
buf := make([]rune, 0)
var buf []rune
for x := 0; x < length; {
y = routes[x].index + 1
frag := runes[x:y]
@@ -283,6 +292,8 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment {
return result
}
// Cut cuts a sentence into words.
// Parameter hmm controls whether to use the Hidden Markov Model.
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan Segment {
result := make(chan Segment)
var cut cutFunc