1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00

code refactor, added more documents

This commit is contained in:
Wang Bin
2015-05-06 12:55:04 +08:00
parent 87caff09cb
commit 122bad0a8d
23 changed files with 228 additions and 142 deletions

View File

@@ -6,7 +6,7 @@ import (
)
var (
test_contents = []string{
testContents = []string{
"这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。",
"我不喜欢日本和服。",
"雷猴回归人间。",
@@ -259,7 +259,7 @@ func TestExtractTags(t *testing.T) {
te.LoadDictionary("../dict.txt")
te.LoadIdf("idf.txt")
for index, sentence := range test_contents {
for index, sentence := range testContents {
result := te.ExtractTags(sentence, 20)
if len(result) != len(Tags[index]) {
t.Fatalf("%s = %v", sentence, result)

View File

@@ -7,6 +7,8 @@ import (
"github.com/wangbin/jiebago/dictionary"
)
// Idf represents a thread-safe dictionary for all words with their
// IDFs(Inverse Document Frequency).
type Idf struct {
freqMap map[string]float64
median float64
@@ -14,6 +16,7 @@ type Idf struct {
sync.RWMutex
}
// AddToken adds a new word with its IDF into the dictionary.
func (i *Idf) AddToken(token dictionary.Token) {
i.Lock()
i.freqMap[token.Text()] = token.Frequency()
@@ -23,6 +26,7 @@ func (i *Idf) AddToken(token dictionary.Token) {
i.Unlock()
}
// Load loads all tokens from the channel into the dictionary.
func (i *Idf) Load(ch <-chan dictionary.Token) {
i.Lock()
for token := range ch {
@@ -38,6 +42,7 @@ func (i *Idf) loadDictionary(fileName string) error {
return dictionary.LoadDictionary(i, fileName)
}
// Frequency returns the IDF of given word.
func (i *Idf) Frequency(key string) (float64, bool) {
i.RLock()
freq, ok := i.freqMap[key]
@@ -45,6 +50,7 @@ func (i *Idf) Frequency(key string) (float64, bool) {
return freq, ok
}
// NewIdf returns a pointer to an empty Idf whose frequency map and frequency
// list are both allocated (non-nil) and ready to receive tokens.
func NewIdf() *Idf {
	idf := new(Idf)
	idf.freqMap = map[string]float64{}
	idf.freqs = []float64{}
	return idf
}

View File

@@ -6,6 +6,7 @@ import (
"github.com/wangbin/jiebago/dictionary"
)
// DefaultStopWordMap contains some stop words.
var DefaultStopWordMap = map[string]int{
"the": 1,
"of": 1,
@@ -40,23 +41,27 @@ var DefaultStopWordMap = map[string]int{
"or": 1,
}
// StopWord is a thread-safe dictionary for all stop words.
type StopWord struct {
// stopWordMap holds each stop word as a key; AddToken stores the value 1 and
// IsStopWord only checks for key presence.
stopWordMap map[string]int
// RWMutex is embedded so methods lock the receiver directly (s.Lock, s.RLock).
sync.RWMutex
}
// AddToken registers the token's text as a stop word. The write to the
// underlying map happens while holding the write lock.
func (s *StopWord) AddToken(token dictionary.Token) {
	word := token.Text()

	s.Lock()
	s.stopWordMap[word] = 1
	s.Unlock()
}
// NewStopWord creates a new StopWord pre-populated with the default stop
// words.
//
// The defaults are copied into a map owned by the returned value. Assigning
// DefaultStopWordMap directly would make every StopWord share one map, so
// adding a token to one instance would silently change the package-level
// defaults and all other instances, while being protected only by that one
// instance's lock.
func NewStopWord() *StopWord {
	s := new(StopWord)
	s.stopWordMap = make(map[string]int, len(DefaultStopWordMap))
	for word, flag := range DefaultStopWordMap {
		s.stopWordMap[word] = flag
	}
	return s
}
// IsStopWord checks if a given word is stop word.
func (s *StopWord) IsStopWord(word string) bool {
s.RLock()
_, ok := s.stopWordMap[word]
@@ -64,6 +69,7 @@ func (s *StopWord) IsStopWord(word string) bool {
return ok
}
// Load loads all tokens from given channel into StopWord dictionary.
func (s *StopWord) Load(ch <-chan dictionary.Token) {
s.Lock()
for token := range ch {

View File

@@ -1,7 +1,6 @@
package analyse
import (
"fmt"
"sort"
"strings"
"unicode/utf8"
@@ -9,23 +8,23 @@ import (
"github.com/wangbin/jiebago"
)
// Segment pairs a word with the weight assigned to it.
type Segment struct {
	text   string
	weight float64
}

// Text reports the word held by the segment.
func (s Segment) Text() string { return s.text }

// Weight reports the weight held by the segment.
func (s Segment) Weight() float64 { return s.weight }
func (s Segment) String() string {
return fmt.Sprintf("{%s: %f}", s.text, s.weight)
}
// Segments represents a slice of Segment.
type Segments []Segment
func (ss Segments) Len() int {
@@ -44,29 +43,33 @@ func (ss Segments) Swap(i, j int) {
ss[i], ss[j] = ss[j], ss[i]
}
// TagExtracter is used to extract tags from sentence.
// Its fields are populated by LoadDictionary, LoadIdf and LoadStopWords.
type TagExtracter struct {
// seg is the word segmenter; LoadDictionary replaces it with a new jiebago.Segmenter.
seg *jiebago.Segmenter
// idf is the IDF table; LoadIdf replaces it with a fresh Idf.
idf *Idf
// stopWord is the stop-word set; both LoadDictionary and LoadStopWords reset it via NewStopWord.
stopWord *StopWord
}
// LoadDictionary installs a new segmenter backed by the dictionary read from
// fileName and returns any error reported while loading it. As a side effect
// the stop-word set is reset to a fresh StopWord.
func (t *TagExtracter) LoadDictionary(fileName string) error {
	t.stopWord = NewStopWord()

	segmenter := new(jiebago.Segmenter)
	t.seg = segmenter
	return segmenter.LoadDictionary(fileName)
}
// LoadIdf replaces the extracter's IDF table with a fresh one and fills it
// from fileName, returning any error reported while loading.
func (t *TagExtracter) LoadIdf(fileName string) error {
	table := NewIdf()
	t.idf = table
	return table.loadDictionary(fileName)
}
// LoadStopWords replaces the extracter's stop-word set with a fresh StopWord
// and loads the entries of fileName into it, returning any error reported
// while loading.
func (t *TagExtracter) LoadStopWords(fileName string) error {
	words := NewStopWord()
	t.stopWord = words
	return words.loadDictionary(fileName)
}
// Keyword extraction.
// ExtractTags extracts the topK key words from sentence.
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
freqMap := make(map[string]float64)

View File

@@ -1,7 +1,6 @@
package analyse
import (
"fmt"
"math"
"sort"
@@ -20,10 +19,6 @@ type edge struct {
weight float64
}
func (e edge) String() string {
return fmt.Sprintf("(%s %s): %f", e.start, e.end, e.weight)
}
type edges []edge
func (es edges) Len() int {
@@ -114,8 +109,8 @@ func (u *undirectWeightedGraph) rank() Segments {
return result
}
// Extract keywords from sentence using TextRank algorithm. the allowed POS list
// could be manually speificed.
// TextRankWithPOS extracts keywords from sentence using TextRank algorithm.
// Parameter allowPOS allows a customized pos list.
func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) Segments {
posFilt := make(map[string]int)
for _, pos := range allowPOS {
@@ -124,7 +119,7 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin
g := newUndirectWeightedGraph()
cm := make(map[[2]string]float64)
span := 5
pairs := make([]posseg.Segment, 0)
var pairs []posseg.Segment
for pair := range t.seg.Cut(sentence, true) {
pairs = append(pairs, pair)
}
@@ -152,16 +147,18 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin
return tags
}
// Extract keywords from sentence using TextRank algorithm.
// topK specify how many top keywords to be returned at most.
// TextRank extracts keywords from sentence using the TextRank algorithm,
// restricted to the package's default allowed POS list.
// Parameter topK specifies how many top keywords are returned at most.
func (t *TextRanker) TextRank(sentence string, topK int) Segments {
	tags := t.TextRankWithPOS(sentence, topK, defaultAllowPOS)
	return tags
}
// TextRanker is used to extract tags from sentence.
type TextRanker struct {
// seg is the POS-tagging segmenter; LoadDictionary assigns it and
// TextRankWithPOS reads words from its Cut method.
seg *posseg.Segmenter
}
// LoadDictionary reads a given file and create a new dictionary file for Textranker.
func (t *TextRanker) LoadDictionary(fileName string) error {
t.seg = new(posseg.Segmenter)
return t.seg.LoadDictionary(fileName)

View File

@@ -14,7 +14,7 @@ type Dictionary struct {
sync.RWMutex
}
// Load loads all tokens from channel
// Load loads all tokens from given channel
func (d *Dictionary) Load(ch <-chan dictionary.Token) {
d.Lock()
for token := range ch {
@@ -49,7 +49,7 @@ func (d *Dictionary) updateLogTotal() {
d.logTotal = math.Log(d.total)
}
// Frequency returns the frequency of give word, if not found, the second result is false
// Frequency returns the frequency and existence of the given word.
func (d *Dictionary) Frequency(key string) (float64, bool) {
d.RLock()
freq, ok := d.freqMap[key]

View File

@@ -8,6 +8,8 @@ import (
"strings"
)
// DictLoader represents an interface that can add a single token or load a
// batch of tokens from a channel.
type DictLoader interface {
Load(<-chan Token)
AddToken(Token)
@@ -49,6 +51,7 @@ func loadDictionary(file *os.File) (<-chan Token, <-chan error) {
}
// LoadDictionary reads the given file and passes all tokens to a DictLoader.
func LoadDictionary(dl DictLoader, fileName string) error {
filePath, err := dictPath(fileName)
if err != nil {

View File

@@ -1,23 +1,28 @@
package dictionary
// Token is a dictionary entry: a Chinese word together with its (optional)
// frequency and part-of-speech tag. Its fields are read through accessor
// methods.
type Token struct {
	text      string
	frequency float64
	pos       string
}

// Text returns the word stored in the token.
func (tok Token) Text() string { return tok.text }

// Frequency returns the frequency stored in the token.
func (tok Token) Frequency() float64 { return tok.frequency }

// Pos returns the part-of-speech tag stored in the token.
func (tok Token) Pos() string { return tok.pos }

// NewToken builds a Token from the given text, frequency and POS tag.
func NewToken(text string, frequency float64, pos string) Token {
	var tok Token
	tok.text, tok.frequency, tok.pos = text, frequency, pos
	return tok
}

View File

@@ -13,10 +13,10 @@ func cutHan(sentence string) chan string {
result := make(chan string)
go func() {
runes := []rune(sentence)
_, pos_list := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
_, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
begin, next := 0, 0
for i, char := range runes {
pos := pos_list[i]
pos := posList[i]
switch pos {
case 'B':
begin = i
@@ -36,6 +36,8 @@ func cutHan(sentence string) chan string {
return result
}
// Cut cuts sentence into words using a Hidden Markov Model with the Viterbi
// algorithm. It is used by Jiebago for unknown words.
func Cut(sentence string) chan string {
result := make(chan string)
s := sentence

View File

@@ -6,7 +6,7 @@ import (
)
func chanToArray(ch chan string) []string {
result := make([]string, 0)
var result []string
for word := range ch {
result = append(result, word)
}

View File

@@ -67,11 +67,11 @@ func viterbi(obs []rune, states []byte) (float64, []byte) {
V[t] = make(map[byte]float64)
for _, y := range states {
ps0 := make(probStates, 0)
var em_p float64
var emP float64
if val, ok := probEmit[y][obs[t]]; ok {
em_p = val
emP = val
} else {
em_p = minFloat
emP = minFloat
}
for _, y0 := range prevStatus[y] {
var transP float64
@@ -80,7 +80,7 @@ func viterbi(obs []rune, states []byte) (float64, []byte) {
} else {
transP = minFloat
}
prob0 := V[t-1][y0] + transP + em_p
prob0 := V[t-1][y0] + transP + emP
ps0 = append(ps0, &probState{prob: prob0, state: y0})
}
sort.Sort(sort.Reverse(ps0))

View File

@@ -16,15 +16,21 @@ var (
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
)
// Segmenter is a Chinese words segmentation struct.
type Segmenter struct {
// dict is the active word dictionary: LoadDictionary replaces it and
// LoadUserDictionary loads further entries into it.
dict *Dictionary
}
// LoadDictionary loads the dictionary stored in the given file. Every call
// starts from a brand-new, empty dictionary, so anything loaded previously is
// discarded.
func (seg *Segmenter) LoadDictionary(fileName string) error {
	fresh := new(Dictionary)
	fresh.freqMap = make(map[string]float64)
	seg.dict = fresh
	return fresh.loadDictionary(fileName)
}
// LoadUserDictionary loads a user-specified dictionary into the dictionary
// that is already in place. It must be called after LoadDictionary; earlier
// entries are kept, and entries that already exist are overridden.
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
	current := seg.dict
	return current.loadDictionary(fileName)
}
@@ -46,7 +52,7 @@ func (seg *Segmenter) dag(runes []rune) map[int][]int {
if freq > 0.0 {
dag[k] = append(dag[k], i)
}
i += 1
i++
if i >= n {
break
}
@@ -98,7 +104,7 @@ func (seg *Segmenter) cutDAG(sentence string) <-chan string {
routes := seg.calc(runes)
var y int
length := len(runes)
buf := make([]rune, 0)
var buf []rune
for x := 0; x < length; {
y = routes[x].index + 1
frag := runes[x:y]
@@ -156,7 +162,7 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string {
routes := seg.calc(runes)
var y int
length := len(runes)
buf := make([]rune, 0)
var buf []rune
for x := 0; x < length; {
y = routes[x].index + 1
frag := runes[x:y]
@@ -181,6 +187,10 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string {
return result
}
// Cut cuts a sentence into words using accurate mode.
// Parameter hmm controls whether to use the Hidden Markov Model.
// Accurate mode attempts to cut the sentence into the most accurate
// segmentations, which is suitable for text analysis.
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan string {
result := make(chan string)
var cut cutFunc
@@ -246,6 +256,9 @@ func (seg *Segmenter) cutAll(sentence string) <-chan string {
return result
}
// CutAll cuts a sentence into words using full mode.
// Full mode gets all the possible words from the sentence.
// Fast but not accurate.
func (seg *Segmenter) CutAll(sentence string) <-chan string {
result := make(chan string)
go func() {
@@ -268,6 +281,10 @@ func (seg *Segmenter) CutAll(sentence string) <-chan string {
return result
}
// CutForSearch cuts sentence into words using search engine mode.
// Search engine mode, based on the accurate mode, attempts to cut long words
// into several short words, which can raise the recall rate.
// Suitable for search engines.
func (seg *Segmenter) CutForSearch(sentence string, hmm bool) <-chan string {
result := make(chan string)
go func() {

View File

@@ -3,8 +3,8 @@ package jiebago
import "testing"
var (
seg Segmenter
test_contents = []string{
seg Segmenter
testContents = []string{
"这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。",
"我不喜欢日本和服。",
"雷猴回归人间。",
@@ -620,7 +620,7 @@ func init() {
}
func chanToArray(ch <-chan string) []string {
result := make([]string, 0)
var result []string
for word := range ch {
result = append(result, word)
}
@@ -643,7 +643,7 @@ func TestCutDAGNoHmm(t *testing.T) {
func TestDefaultCut(t *testing.T) {
var result []string
for index, content := range test_contents {
for index, content := range testContents {
result = chanToArray(seg.Cut(content, true))
if len(result) != len(defaultCutResult[index]) {
t.Errorf("default cut for %s length should be %d not %d\n",
@@ -661,7 +661,7 @@ func TestDefaultCut(t *testing.T) {
func TestCutAll(t *testing.T) {
var result []string
for index, content := range test_contents {
for index, content := range testContents {
result = chanToArray(seg.CutAll(content))
if len(result) != len(cutAllResult[index]) {
t.Errorf("cut all for %s length should be %d not %d\n",
@@ -679,7 +679,7 @@ func TestCutAll(t *testing.T) {
func TestDefaultCutNoHMM(t *testing.T) {
var result []string
for index, content := range test_contents {
for index, content := range testContents {
result = chanToArray(seg.Cut(content, false))
if len(result) != len(defaultCutNoHMMResult[index]) {
t.Fatalf("default cut no hmm for %s length should be %d not %d\n",
@@ -695,7 +695,7 @@ func TestDefaultCutNoHMM(t *testing.T) {
func TestCutForSearch(t *testing.T) {
var result []string
for index, content := range test_contents {
for index, content := range testContents {
result = chanToArray(seg.CutForSearch(content, true))
if len(result) != len(cutForSearchResult[index]) {
t.Fatalf("cut for search for %s length should be %d not %d\n",
@@ -707,7 +707,7 @@ func TestCutForSearch(t *testing.T) {
}
}
}
for index, content := range test_contents {
for index, content := range testContents {
result = chanToArray(seg.CutForSearch(content, false))
if len(result) != len(cutForSearchNoHMMResult[index]) {
t.Fatalf("cut for search no hmm for %s length should be %d not %d\n",
@@ -724,7 +724,7 @@ func TestCutForSearch(t *testing.T) {
func TestLoadDictionary(t *testing.T) {
var result []string
seg.LoadDictionary("foobar.txt")
for index, content := range test_contents {
for index, content := range testContents {
result = chanToArray(seg.Cut(content, true))
if len(result) != len(userDictCutResult[index]) {
t.Fatalf("default cut with user dictionary for %s length should be %d not %d\n",

View File

@@ -2,9 +2,9 @@ package posseg
import "fmt"
type Tag uint16
type tag uint16
func (t Tag) Tag() string {
func (t tag) position() string {
switch t / 100 {
case 4:
return "S"
@@ -19,31 +19,29 @@ func (t Tag) Tag() string {
}
}
func (t Tag) POS() string {
func (t tag) pos() string {
return poss[t%100]
}
func (t Tag) String() string {
return fmt.Sprintf("(%s, %s)", t.Tag(), t.POS())
}
func NewTag(tag, pos string) (Tag, error) {
tagIndex := -1
func newTag(position, pos string) (tag, error) {
positionIndex := -1
posIndex := -1
for i, t := range tags {
if tag == t {
tagIndex = (i + 1) * 100
for i, p := range positions {
if position == p {
positionIndex = (i + 1) * 100
break
}
}
for i, p := range poss {
if pos == p {
posIndex = i
break
}
}
if tagIndex < 0 || posIndex < 0 {
return 0, fmt.Errorf("Failed to convert %s %s to Tag", tag, pos)
if positionIndex < 0 || posIndex < 0 {
return 0, fmt.Errorf("Failed to convert %s %s to Tag", position, pos)
}
return Tag(tagIndex + posIndex), nil
return tag(positionIndex + posIndex), nil
}
type charStateTabMap map[rune][]uint16
@@ -51,9 +49,8 @@ type charStateTabMap map[rune][]uint16
func (m charStateTabMap) get(key rune) []uint16 {
if value, ok := m[key]; ok {
return value
} else {
return probTransKeys
}
return probTransKeys
}
var (
@@ -6708,6 +6705,6 @@ var (
'\u9fa0': []uint16{413},
}
tags = []string{"B", "E", "M", "S"}
poss = []string{"a", "ad", "ag", "an", "b", "bg", "c", "d", "df", "dg", "e", "en", "f", "g", "h", "i", "in", "j", "jn", "k", "l", "ln", "m", "mg", "mq", "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "o", "p", "q", "qe", "qg", "r", "rg", "rr", "rz", "s", "t", "tg", "u", "ud", "ug", "uj", "ul", "uv", "uz", "v", "vd", "vg", "vi", "vn", "vq", "w", "x", "y", "yg", "z", "zg"}
positions = []string{"B", "E", "M", "S"}
poss = []string{"a", "ad", "ag", "an", "b", "bg", "c", "d", "df", "dg", "e", "en", "f", "g", "h", "i", "in", "j", "jn", "k", "l", "ln", "m", "mg", "mq", "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "o", "p", "q", "qe", "qg", "r", "rg", "rr", "rz", "s", "t", "tg", "u", "ud", "ug", "uj", "ul", "uv", "uz", "v", "vd", "vg", "vi", "vn", "vq", "w", "x", "y", "yg", "z", "zg"}
)

View File

@@ -7,6 +7,7 @@ import (
"github.com/wangbin/jiebago/dictionary"
)
// A Dictionary represents a thread-safe dictionary used for word segmentation.
type Dictionary struct {
total, logTotal float64
freqMap map[string]float64
@@ -14,6 +15,7 @@ type Dictionary struct {
sync.RWMutex
}
// Load loads all tokens from given channel
func (d *Dictionary) Load(ch <-chan dictionary.Token) {
d.Lock()
for token := range ch {
@@ -23,6 +25,7 @@ func (d *Dictionary) Load(ch <-chan dictionary.Token) {
d.updateLogTotal()
}
// AddToken adds one token
func (d *Dictionary) AddToken(token dictionary.Token) {
d.Lock()
d.addToken(token)
@@ -50,6 +53,7 @@ func (d *Dictionary) updateLogTotal() {
d.logTotal = math.Log(d.total)
}
// Frequency returns the frequency and existence of the given word.
func (d *Dictionary) Frequency(key string) (float64, bool) {
d.RLock()
freq, ok := d.freqMap[key]
@@ -57,6 +61,7 @@ func (d *Dictionary) Frequency(key string) (float64, bool) {
return freq, ok
}
// Pos returns the POS and existence of the given word.
func (d *Dictionary) Pos(key string) (string, bool) {
d.RLock()
pos, ok := d.posMap[key]

View File

@@ -17,27 +17,36 @@ var (
reSkipInternal = regexp.MustCompile(`(\r\n|\s)`)
)
// Segment pairs a word with its part-of-speech (POS) tag.
type Segment struct {
	text string
	pos  string
}

// Text reports the word held by the segment.
func (s Segment) Text() string { return s.text }

// Pos reports the POS tag held by the segment.
func (s Segment) Pos() string { return s.pos }
// Segmenter is a Chinese words segmentation struct.
type Segmenter struct {
// dict is the active dictionary (frequencies and POS tags): LoadDictionary
// replaces it and LoadUserDictionary loads further entries into it.
dict *Dictionary
}
// LoadDictionary loads the dictionary stored in the given file. Every call
// starts from a brand-new dictionary with empty frequency and POS maps, so
// anything loaded previously is discarded.
func (seg *Segmenter) LoadDictionary(fileName string) error {
	fresh := new(Dictionary)
	fresh.freqMap = make(map[string]float64)
	fresh.posMap = make(map[string]string)
	seg.dict = fresh
	return fresh.loadDictionary(fileName)
}
// LoadUserDictionary loads a user-specified dictionary into the dictionary
// that is already in place. It must be called after LoadDictionary; earlier
// entries are kept, and entries that already exist are overridden.
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
	current := seg.dict
	return current.loadDictionary(fileName)
}
@@ -52,19 +61,19 @@ func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment {
next := 0
for i, char := range runes {
pos := posList[i]
switch pos.Tag() {
switch pos.position() {
case "B":
begin = i
case "E":
result <- Segment{string(runes[begin : i+1]), pos.POS()}
result <- Segment{string(runes[begin : i+1]), pos.pos()}
next = i + 1
case "S":
result <- Segment{string(char), pos.POS()}
result <- Segment{string(char), pos.pos()}
next = i + 1
}
}
if next < len(runes) {
result <- Segment{string(runes[next:]), posList[next].POS()}
result <- Segment{string(runes[next:]), posList[next].pos()}
}
close(result)
}()
@@ -117,7 +126,7 @@ func (seg *Segmenter) dag(runes []rune) map[int][]int {
if freq > 0.0 {
dag[k] = append(dag[k], i)
}
i += 1
i++
if i >= n {
break
}
@@ -170,7 +179,7 @@ func (seg *Segmenter) cutDAG(sentence string) <-chan Segment {
routes := seg.calc(runes)
var y int
length := len(runes)
buf := make([]rune, 0)
var buf []rune
for x := 0; x < length; {
y = routes[x].index + 1
frag := runes[x:y]
@@ -253,7 +262,7 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment {
routes := seg.calc(runes)
var y int
length := len(runes)
buf := make([]rune, 0)
var buf []rune
for x := 0; x < length; {
y = routes[x].index + 1
frag := runes[x:y]
@@ -283,6 +292,8 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment {
return result
}
// Cut cuts a sentence into words.
// Parameter hmm controls whether to use the Hidden Markov Model.
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan Segment {
result := make(chan Segment)
var cut cutFunc

View File

@@ -5,8 +5,8 @@ import (
)
var (
seg Segmenter
test_contents = []string{
seg Segmenter
testContents = []string{
"这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。",
"我不喜欢日本和服。",
"雷猴回归人间。",
@@ -273,7 +273,7 @@ func init() {
}
func chanToArray(ch <-chan Segment) []Segment {
result := make([]Segment, 0)
var result []Segment
for word := range ch {
result = append(result, word)
}
@@ -281,7 +281,7 @@ func chanToArray(ch <-chan Segment) []Segment {
}
func TestCut(t *testing.T) {
for index, content := range test_contents {
for index, content := range testContents {
result := chanToArray(seg.Cut(content, true))
if len(defaultCutResult[index]) != len(result) {
t.Errorf("default cut for %s length should be %d not %d\n",
@@ -289,7 +289,7 @@ func TestCut(t *testing.T) {
t.Errorf("expect: %v\n", defaultCutResult[index])
t.Fatalf("got: %v\n", result)
}
for i, _ := range result {
for i := range result {
if result[i] != defaultCutResult[index][i] {
t.Fatalf("expect %s, got %s", defaultCutResult[index][i], result[i])
}
@@ -298,7 +298,7 @@ func TestCut(t *testing.T) {
if len(noHMMCutResult[index]) != len(result) {
t.Fatal(content)
}
for i, _ := range result {
for i := range result {
if result[i] != noHMMCutResult[index][i] {
t.Fatal(content)
}
@@ -320,7 +320,7 @@ func TestBug132(t *testing.T) {
if len(cutResult) != len(result) {
t.Fatal(result)
}
for i, _ := range result {
for i := range result {
if result[i] != cutResult[i] {
t.Fatal(result[i])
}
@@ -349,7 +349,7 @@ func TestBug137(t *testing.T) {
if len(cutResult) != len(result) {
t.Fatal(result)
}
for i, _ := range result {
for i := range result {
if result[i] != cutResult[i] {
t.Fatal(result[i])
}
@@ -404,7 +404,7 @@ func TestUserDict(t *testing.T) {
if len(cutResult) != len(result) {
t.Fatal(result)
}
for i, _ := range result {
for i := range result {
if result[i] != cutResult[i] {
t.Fatal(result[i])
}

View File

@@ -1,15 +1,14 @@
package posseg
const MinFloat = -3.14e100
const minFloat = -3.14e100
type runeFloatMap map[rune]float64
func (m runeFloatMap) get(key rune) float64 {
if value, ok := m[key]; ok {
return value
} else {
return MinFloat
}
return minFloat
}
var probEmit = map[uint16]runeFloatMap{

View File

@@ -11,9 +11,8 @@ type probTransMap map[uint16]float64
func (m probTransMap) Get(key uint16) float64 {
if value, ok := m[key]; ok {
return value
} else {
return inf
}
return inf
}
var (

View File

@@ -31,52 +31,52 @@ func (pss probStates) Swap(i, j int) {
pss[i], pss[j] = pss[j], pss[i]
}
func viterbi(obs []rune) []Tag {
func viterbi(obs []rune) []tag {
obsLength := len(obs)
V := make([]map[uint16]float64, obsLength)
V[0] = make(map[uint16]float64)
mem_path := make([]map[uint16]uint16, obsLength)
mem_path[0] = make(map[uint16]uint16)
memPath := make([]map[uint16]uint16, obsLength)
memPath[0] = make(map[uint16]uint16)
ys := charStateTab.get(obs[0]) // default is all_states
for _, y := range ys {
V[0][y] = probEmit[y].get(obs[0]) + probStart[y]
mem_path[0][y] = 0
memPath[0][y] = 0
}
for t := 1; t < obsLength; t++ {
prev_states := make([]uint16, 0)
for x := range mem_path[t-1] {
var prevStates []uint16
for x := range memPath[t-1] {
if len(probTrans[x]) > 0 {
prev_states = append(prev_states, x)
prevStates = append(prevStates, x)
}
}
//use Go's map to implement Python's Set()
prev_states_expect_next := make(map[uint16]int)
for _, x := range prev_states {
prevStatesExpectNext := make(map[uint16]int)
for _, x := range prevStates {
for y := range probTrans[x] {
prev_states_expect_next[y] = 1
prevStatesExpectNext[y] = 1
}
}
tmp_obs_states := charStateTab.get(obs[t])
tmpObsStates := charStateTab.get(obs[t])
obs_states := make([]uint16, 0)
for index := range tmp_obs_states {
if _, ok := prev_states_expect_next[tmp_obs_states[index]]; ok {
obs_states = append(obs_states, tmp_obs_states[index])
var obsStates []uint16
for index := range tmpObsStates {
if _, ok := prevStatesExpectNext[tmpObsStates[index]]; ok {
obsStates = append(obsStates, tmpObsStates[index])
}
}
if len(obs_states) == 0 {
for key := range prev_states_expect_next {
obs_states = append(obs_states, key)
if len(obsStates) == 0 {
for key := range prevStatesExpectNext {
obsStates = append(obsStates, key)
}
}
if len(obs_states) == 0 {
obs_states = probTransKeys
if len(obsStates) == 0 {
obsStates = probTransKeys
}
mem_path[t] = make(map[uint16]uint16)
memPath[t] = make(map[uint16]uint16)
V[t] = make(map[uint16]float64)
for _, y := range obs_states {
for _, y := range obsStates {
var max, ps probState
for i, y0 := range prev_states {
for i, y0 := range prevStates {
ps = probState{
prob: V[t-1][y0] + probTrans[y0].Get(y) + probEmit[y].get(obs[t]),
state: y0}
@@ -85,23 +85,23 @@ func viterbi(obs []rune) []Tag {
}
}
V[t][y] = max.prob
mem_path[t][y] = max.state
memPath[t][y] = max.state
}
}
last := make(probStates, 0)
length := len(mem_path)
length := len(memPath)
vlength := len(V)
for y := range mem_path[length-1] {
for y := range memPath[length-1] {
ps := probState{prob: V[vlength-1][y], state: y}
last = append(last, ps)
}
sort.Sort(sort.Reverse(last))
state := last[0].state
route := make([]Tag, len(obs))
route := make([]tag, len(obs))
for i := obsLength - 1; i >= 0; i-- {
route[i] = Tag(state)
state = mem_path[i][state]
route[i] = tag(state)
state = memPath[i][state]
}
return route
}

View File

@@ -4,49 +4,49 @@ import (
"testing"
)
var defaultRoute []Tag
var defaultRoute []tag
func init() {
var t Tag
t, _ = NewTag("B", "nr")
var t tag
t, _ = newTag("B", "nr")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("M", "nr")
t, _ = newTag("M", "nr")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("E", "nr")
t, _ = newTag("E", "nr")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("S", "v")
t, _ = newTag("S", "v")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("B", "v")
t, _ = newTag("B", "v")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("E", "v")
t, _ = newTag("E", "v")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("B", "n")
t, _ = newTag("B", "n")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("M", "n")
t, _ = newTag("M", "n")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("E", "n")
t, _ = newTag("E", "n")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("S", "d")
t, _ = newTag("S", "d")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("S", "v")
t, _ = newTag("S", "v")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("S", "n")
t, _ = newTag("S", "n")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("B", "v")
t, _ = newTag("B", "v")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("E", "v")
t, _ = newTag("E", "v")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("B", "nr")
t, _ = newTag("B", "nr")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("M", "nr")
t, _ = newTag("M", "nr")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("M", "nr")
t, _ = newTag("M", "nr")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("M", "nr")
t, _ = newTag("M", "nr")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("E", "nr")
t, _ = newTag("E", "nr")
defaultRoute = append(defaultRoute, t)
t, _ = NewTag("S", "zg")
t, _ = newTag("S", "zg")
defaultRoute = append(defaultRoute, t)
}

View File

@@ -9,18 +9,40 @@ import (
"github.com/blevesearch/bleve/registry"
)
// Name is the jieba tokenizer name.
const Name = "jieba"
var IdeographRegexp = regexp.MustCompile(`\p{Han}+`)
var ideographRegexp = regexp.MustCompile(`\p{Han}+`)
// JiebaTokenizer is the bleve tokenizer for jiebago.
type JiebaTokenizer struct {
// seg is the segmenter whose dictionary NewJiebaTokenizer loads.
seg Segmenter
// hmm and searchMode hold the flags passed to NewJiebaTokenizer
// (presumably consulted by Tokenize — confirm against its body).
hmm, searchMode bool
}
func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) {
/*
NewJiebaTokenizer creates a new JiebaTokenizer.
Parameters:
dictFilePath: path of the dictionary file.
hmm: whether to use the Hidden Markov Model to cut unknown words,
i.e. words not found in the dictionary. For example, the word "安卓" (meaning
"Android" in English) is not in the dictionary file. If hmm is set to false,
it will be cut into two single words, "安" and "卓"; if hmm is set to true, it
will be treated as one single word, because Jieba uses a Hidden Markov Model
with the Viterbi algorithm to guess the most likely segmentation.
searchMode: whether to further cut long words into several short words.
In Chinese, some long words may contain other words; for example, "交换机"
is the Chinese word for "Switcher". If searchMode is false, "交换机" is
treated as a single word. If searchMode is true, it is further split into
"交换" and "换机", which are valid Chinese words.
*/
func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
var seg Segmenter
err := seg.LoadDictionary(dictFileName)
err := seg.LoadDictionary(dictFilePath)
return &JiebaTokenizer{
seg: seg,
hmm: hmm,
@@ -28,6 +50,7 @@ func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Toke
}, err
}
// Tokenize cuts input into bleve token stream.
func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
rv := make(analysis.TokenStream, 0)
runeStart := 0
@@ -77,9 +100,20 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
return rv
}
/*
JiebaTokenizerConstructor creates a JiebaTokenizer.
Parameter config should contain at least one parameter:
file: the path of the dictionary file.
hmm: optional, specifies whether to use the Hidden Markov Model, see NewJiebaTokenizer for details.
search: optional, specifies whether to use search mode, see NewJiebaTokenizer for details.
*/
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
analysis.Tokenizer, error) {
dictFileName, ok := config["file"].(string)
dictFilePath, ok := config["file"].(string)
if !ok {
return nil, fmt.Errorf("must specify dictionary file path")
}
@@ -92,11 +126,11 @@ func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Ca
searchMode = true
}
return NewJiebaTokenizer(dictFileName, hmm, searchMode)
return NewJiebaTokenizer(dictFilePath, hmm, searchMode)
}
func detectTokenType(term string) analysis.TokenType {
if IdeographRegexp.MatchString(term) {
if ideographRegexp.MatchString(term) {
return analysis.Ideographic
}
_, err := strconv.ParseFloat(term, 64)

View File

@@ -2,12 +2,14 @@ package util
import "regexp"
// RegexpSplit split slices s into substrings separated by the expression and
// returns a slice of the substrings between those expression matches.
// If capturing parentheses are used in expression, then the text of all groups
// in the expression are also returned as part of the resulting slice.
//
// This function acts consistent with Python's re.split function.
/*
RegexpSplit slices s into substrings separated by the expression and
returns a slice of the substrings between those expression matches.
If capturing parentheses are used in the expression, then the text of all
groups in the expression is also returned as part of the resulting slice.
This function behaves consistently with Python's re.split function.
*/
func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
if n == 0 {
return nil