diff --git a/analyse/analyse_test.go b/analyse/analyse_test.go index ba14cfe..7a7d979 100644 --- a/analyse/analyse_test.go +++ b/analyse/analyse_test.go @@ -6,7 +6,7 @@ import ( ) var ( - test_contents = []string{ + testContents = []string{ "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", "我不喜欢日本和服。", "雷猴回归人间。", @@ -259,7 +259,7 @@ func TestExtractTags(t *testing.T) { te.LoadDictionary("../dict.txt") te.LoadIdf("idf.txt") - for index, sentence := range test_contents { + for index, sentence := range testContents { result := te.ExtractTags(sentence, 20) if len(result) != len(Tags[index]) { t.Fatalf("%s = %v", sentence, result) diff --git a/analyse/idf.go b/analyse/idf.go index 37e50eb..5ab8f33 100644 --- a/analyse/idf.go +++ b/analyse/idf.go @@ -7,6 +7,8 @@ import ( "github.com/wangbin/jiebago/dictionary" ) +// Idf represents a thread-safe dictionary for all words with their +// IDFs(Inverse Document Frequency). type Idf struct { freqMap map[string]float64 median float64 @@ -14,6 +16,7 @@ type Idf struct { sync.RWMutex } +// AddToken adds a new word with IDF into it's dictionary. func (i *Idf) AddToken(token dictionary.Token) { i.Lock() i.freqMap[token.Text()] = token.Frequency() @@ -23,6 +26,7 @@ func (i *Idf) AddToken(token dictionary.Token) { i.Unlock() } +// Load loads all tokens from channel into it's dictionary. func (i *Idf) Load(ch <-chan dictionary.Token) { i.Lock() for token := range ch { @@ -38,6 +42,7 @@ func (i *Idf) loadDictionary(fileName string) error { return dictionary.LoadDictionary(i, fileName) } +// Frequency returns the IDF of given word. func (i *Idf) Frequency(key string) (float64, bool) { i.RLock() freq, ok := i.freqMap[key] @@ -45,6 +50,7 @@ func (i *Idf) Frequency(key string) (float64, bool) { return freq, ok } +// NewIdf creates a new Idf instance. 
func NewIdf() *Idf { return &Idf{freqMap: make(map[string]float64), freqs: make([]float64, 0)} } diff --git a/analyse/stopwords.go b/analyse/stopwords.go index 81012f7..b76d773 100644 --- a/analyse/stopwords.go +++ b/analyse/stopwords.go @@ -6,6 +6,7 @@ import ( "github.com/wangbin/jiebago/dictionary" ) +// DefaultStopWordMap contains some stop words. var DefaultStopWordMap = map[string]int{ "the": 1, "of": 1, @@ -40,23 +41,27 @@ var DefaultStopWordMap = map[string]int{ "or": 1, } +// StopWord is a thread-safe dictionary for all stop words. type StopWord struct { stopWordMap map[string]int sync.RWMutex } +// AddToken adds a token into StopWord dictionary. func (s *StopWord) AddToken(token dictionary.Token) { s.Lock() s.stopWordMap[token.Text()] = 1 s.Unlock() } +// NewStopWord create a new StopWord with default stop words. func NewStopWord() *StopWord { s := new(StopWord) s.stopWordMap = DefaultStopWordMap return s } +// IsStopWord checks if a given word is stop word. func (s *StopWord) IsStopWord(word string) bool { s.RLock() _, ok := s.stopWordMap[word] @@ -64,6 +69,7 @@ func (s *StopWord) IsStopWord(word string) bool { return ok } +// Load loads all tokens from given channel into StopWord dictionary. func (s *StopWord) Load(ch <-chan dictionary.Token) { s.Lock() for token := range ch { diff --git a/analyse/tag_extracker.go b/analyse/tag_extracker.go index d9b6f96..b7836f5 100644 --- a/analyse/tag_extracker.go +++ b/analyse/tag_extracker.go @@ -1,7 +1,6 @@ package analyse import ( - "fmt" "sort" "strings" "unicode/utf8" @@ -9,23 +8,23 @@ import ( "github.com/wangbin/jiebago" ) +// Segment represents a word with weight. type Segment struct { text string weight float64 } +// Text returns the segment's text. func (s Segment) Text() string { return s.text } +// Weight returns the segment's weight. 
func (s Segment) Weight() float64 { return s.weight } -func (s Segment) String() string { - return fmt.Sprintf("{%s: %f}", s.text, s.weight) -} - +// Segments represents a slice of Segment. type Segments []Segment func (ss Segments) Len() int { @@ -44,29 +43,33 @@ func (ss Segments) Swap(i, j int) { ss[i], ss[j] = ss[j], ss[i] } +// TagExtracter is used to extract tags from sentence. type TagExtracter struct { seg *jiebago.Segmenter idf *Idf stopWord *StopWord } +// LoadDictionary reads the given filename and create a new dictionary. func (t *TagExtracter) LoadDictionary(fileName string) error { t.stopWord = NewStopWord() t.seg = new(jiebago.Segmenter) return t.seg.LoadDictionary(fileName) } +// LoadIdf reads the given file and create a new Idf dictionary. func (t *TagExtracter) LoadIdf(fileName string) error { t.idf = NewIdf() return t.idf.loadDictionary(fileName) } +// LoadStopWords reads the given file and create a new StopWord dictionary. func (t *TagExtracter) LoadStopWords(fileName string) error { t.stopWord = NewStopWord() return t.stopWord.loadDictionary(fileName) } -// Keyword extraction. +// ExtractTags extracts the topK key words from sentence. func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) { freqMap := make(map[string]float64) diff --git a/analyse/textrank.go b/analyse/textrank.go index fc0a54d..41dd489 100644 --- a/analyse/textrank.go +++ b/analyse/textrank.go @@ -1,7 +1,6 @@ package analyse import ( - "fmt" "math" "sort" @@ -20,10 +19,6 @@ type edge struct { weight float64 } -func (e edge) String() string { - return fmt.Sprintf("(%s %s): %f", e.start, e.end, e.weight) -} - type edges []edge func (es edges) Len() int { @@ -114,8 +109,8 @@ func (u *undirectWeightedGraph) rank() Segments { return result } -// Extract keywords from sentence using TextRank algorithm. the allowed POS list -// could be manually speificed. +// TextRankWithPOS extracts keywords from sentence using TextRank algorithm. 
+// Parameter allowPOS allows a customized POS list. func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) Segments { posFilt := make(map[string]int) for _, pos := range allowPOS { @@ -124,7 +119,7 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin g := newUndirectWeightedGraph() cm := make(map[[2]string]float64) span := 5 - pairs := make([]posseg.Segment, 0) + var pairs []posseg.Segment for pair := range t.seg.Cut(sentence, true) { pairs = append(pairs, pair) } @@ -152,16 +147,18 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin return tags } -// Extract keywords from sentence using TextRank algorithm. -// topK specify how many top keywords to be returned at most. +// TextRank extracts keywords from sentence using the TextRank algorithm. +// Parameter topK specifies how many top keywords are returned at most. func (t *TextRanker) TextRank(sentence string, topK int) Segments { return t.TextRankWithPOS(sentence, topK, defaultAllowPOS) } +// TextRanker is used to extract tags from sentence. type TextRanker struct { seg *posseg.Segmenter } +// LoadDictionary reads the given file and creates a new dictionary for TextRanker. 
func (t *TextRanker) LoadDictionary(fileName string) error { t.seg = new(posseg.Segmenter) return t.seg.LoadDictionary(fileName) diff --git a/dictionary.go b/dictionary.go index d8805b8..ea50ebc 100644 --- a/dictionary.go +++ b/dictionary.go @@ -14,7 +14,7 @@ type Dictionary struct { sync.RWMutex } -// Load loads all tokens from channel +// Load loads all tokens from the given channel. func (d *Dictionary) Load(ch <-chan dictionary.Token) { d.Lock() for token := range ch { @@ -49,7 +49,7 @@ func (d *Dictionary) updateLogTotal() { d.logTotal = math.Log(d.total) } -// Frequency returns the frequency of give word, if not found, the second result is false +// Frequency returns the frequency and existence of the given word. func (d *Dictionary) Frequency(key string) (float64, bool) { d.RLock() freq, ok := d.freqMap[key] diff --git a/dictionary/dictionary.go b/dictionary/dictionary.go index 83cde0d..c62e2d3 100644 --- a/dictionary/dictionary.go +++ b/dictionary/dictionary.go @@ -8,6 +8,8 @@ import ( "strings" ) +// DictLoader represents an interface that can add one token or load a bunch of +// tokens from a channel. type DictLoader interface { Load(<-chan Token) AddToken(Token) @@ -49,6 +51,7 @@ func loadDictionary(file *os.File) (<-chan Token, <-chan error) { } +// LoadDictionary reads the given file and passes all tokens to a DictLoader. func LoadDictionary(dl DictLoader, fileName string) error { filePath, err := dictPath(fileName) if err != nil { diff --git a/dictionary/token.go b/dictionary/token.go index fbae97f..6bebae1 100644 --- a/dictionary/token.go +++ b/dictionary/token.go @@ -1,23 +1,28 @@ package dictionary +// Token represents a Chinese word with (optional) frequency and POS. type Token struct { text string frequency float64 pos string } +// Text returns token's text. func (t Token) Text() string { return t.text } +// Frequency returns token's frequency. func (t Token) Frequency() float64 { return t.frequency } +// Pos returns token's POS. 
func (t Token) Pos() string { return t.pos } +// NewToken creates a new token. func NewToken(text string, frequency float64, pos string) Token { return Token{text: text, frequency: frequency, pos: pos} } diff --git a/finalseg/finalseg.go b/finalseg/finalseg.go index 0b19ecd..aace620 100644 --- a/finalseg/finalseg.go +++ b/finalseg/finalseg.go @@ -13,10 +13,10 @@ func cutHan(sentence string) chan string { result := make(chan string) go func() { runes := []rune(sentence) - _, pos_list := viterbi(runes, []byte{'B', 'M', 'E', 'S'}) + _, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'}) begin, next := 0, 0 for i, char := range runes { - pos := pos_list[i] + pos := posList[i] switch pos { case 'B': begin = i @@ -36,6 +36,8 @@ func cutHan(sentence string) chan string { return result } +// Cut cuts sentence into words using Hidden Markov Model with Viterbi +// algorithm. It is used by Jiebago for unknown words. func Cut(sentence string) chan string { result := make(chan string) s := sentence diff --git a/finalseg/finalseg_test.go b/finalseg/finalseg_test.go index 69343ce..bafe4be 100644 --- a/finalseg/finalseg_test.go +++ b/finalseg/finalseg_test.go @@ -6,7 +6,7 @@ import ( ) func chanToArray(ch chan string) []string { - result := make([]string, 0) + var result []string for word := range ch { result = append(result, word) } diff --git a/finalseg/viterbi.go b/finalseg/viterbi.go index 636eb24..2eef566 100644 --- a/finalseg/viterbi.go +++ b/finalseg/viterbi.go @@ -67,11 +67,11 @@ func viterbi(obs []rune, states []byte) (float64, []byte) { V[t] = make(map[byte]float64) for _, y := range states { ps0 := make(probStates, 0) - var em_p float64 + var emP float64 if val, ok := probEmit[y][obs[t]]; ok { - em_p = val + emP = val } else { - em_p = minFloat + emP = minFloat } for _, y0 := range prevStatus[y] { var transP float64 @@ -80,7 +80,7 @@ func viterbi(obs []rune, states []byte) (float64, []byte) { } else { transP = minFloat } - prob0 := V[t-1][y0] + transP + em_p + prob0 
:= V[t-1][y0] + transP + emP ps0 = append(ps0, &probState{prob: prob0, state: y0}) } sort.Sort(sort.Reverse(ps0)) diff --git a/jieba.go b/jieba.go index 386a3a3..a8eb0e9 100644 --- a/jieba.go +++ b/jieba.go @@ -16,15 +16,21 @@ var ( reSkipDefault = regexp.MustCompile(`(\r\n|\s)`) ) +// Segmenter is a Chinese word segmentation struct. type Segmenter struct { dict *Dictionary } +// LoadDictionary loads a dictionary from the given file name. Every time +// LoadDictionary is called, the previously loaded dictionary is cleared. func (seg *Segmenter) LoadDictionary(fileName string) error { seg.dict = &Dictionary{freqMap: make(map[string]float64)} return seg.dict.loadDictionary(fileName) } +// LoadUserDictionary loads a user-specified dictionary. It must be called +// after LoadDictionary; it does not clear any previously loaded dictionary, +// but overrides existing entries. func (seg *Segmenter) LoadUserDictionary(fileName string) error { return seg.dict.loadDictionary(fileName) } @@ -46,7 +52,7 @@ func (seg *Segmenter) dag(runes []rune) map[int][]int { if freq > 0.0 { dag[k] = append(dag[k], i) } - i += 1 + i++ if i >= n { break } @@ -98,7 +104,7 @@ func (seg *Segmenter) cutDAG(sentence string) <-chan string { routes := seg.calc(runes) var y int length := len(runes) - buf := make([]rune, 0) + var buf []rune for x := 0; x < length; { y = routes[x].index + 1 frag := runes[x:y] @@ -156,7 +162,7 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string { routes := seg.calc(runes) var y int length := len(runes) - buf := make([]rune, 0) + var buf []rune for x := 0; x < length; { y = routes[x].index + 1 frag := runes[x:y] @@ -181,6 +187,10 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string { return result } +// Cut cuts a sentence into words using accurate mode. +// Parameter hmm controls whether to use the Hidden Markov Model. 
+// Accurate mode attempts to cut the sentence into the most accurate +// segmentations, which is suitable for text analysis. func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan string { result := make(chan string) var cut cutFunc @@ -246,6 +256,9 @@ func (seg *Segmenter) cutAll(sentence string) <-chan string { return result } +// CutAll cuts a sentence into words using full mode. +// Full mode gets all the possible words from the sentence. +// Fast but not accurate. func (seg *Segmenter) CutAll(sentence string) <-chan string { result := make(chan string) go func() { @@ -268,6 +281,10 @@ func (seg *Segmenter) CutAll(sentence string) <-chan string { return result } +// CutForSearch cuts sentence into words using search engine mode. +// Search engine mode, based on the accurate mode, attempts to cut long words +// into several short words, which can raise the recall rate. +// Suitable for search engines. func (seg *Segmenter) CutForSearch(sentence string, hmm bool) <-chan string { result := make(chan string) go func() { diff --git a/jieba_test.go b/jieba_test.go index 716ad20..6203848 100644 --- a/jieba_test.go +++ b/jieba_test.go @@ -3,8 +3,8 @@ package jiebago import "testing" var ( - seg Segmenter - test_contents = []string{ + seg Segmenter + testContents = []string{ "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", "我不喜欢日本和服。", "雷猴回归人间。", @@ -620,7 +620,7 @@ func init() { } func chanToArray(ch <-chan string) []string { - result := make([]string, 0) + var result []string for word := range ch { result = append(result, word) } @@ -643,7 +643,7 @@ func TestCutDAGNoHmm(t *testing.T) { func TestDefaultCut(t *testing.T) { var result []string - for index, content := range test_contents { + for index, content := range testContents { result = chanToArray(seg.Cut(content, true)) if len(result) != len(defaultCutResult[index]) { t.Errorf("default cut for %s length should be %d not %d\n", @@ -661,7 +661,7 @@ func TestDefaultCut(t *testing.T) { func TestCutAll(t *testing.T) { 
var result []string - for index, content := range test_contents { + for index, content := range testContents { result = chanToArray(seg.CutAll(content)) if len(result) != len(cutAllResult[index]) { t.Errorf("cut all for %s length should be %d not %d\n", @@ -679,7 +679,7 @@ func TestCutAll(t *testing.T) { func TestDefaultCutNoHMM(t *testing.T) { var result []string - for index, content := range test_contents { + for index, content := range testContents { result = chanToArray(seg.Cut(content, false)) if len(result) != len(defaultCutNoHMMResult[index]) { t.Fatalf("default cut no hmm for %s length should be %d not %d\n", @@ -695,7 +695,7 @@ func TestDefaultCutNoHMM(t *testing.T) { func TestCutForSearch(t *testing.T) { var result []string - for index, content := range test_contents { + for index, content := range testContents { result = chanToArray(seg.CutForSearch(content, true)) if len(result) != len(cutForSearchResult[index]) { t.Fatalf("cut for search for %s length should be %d not %d\n", @@ -707,7 +707,7 @@ func TestCutForSearch(t *testing.T) { } } } - for index, content := range test_contents { + for index, content := range testContents { result = chanToArray(seg.CutForSearch(content, false)) if len(result) != len(cutForSearchNoHMMResult[index]) { t.Fatalf("cut for search no hmm for %s length should be %d not %d\n", @@ -724,7 +724,7 @@ func TestCutForSearch(t *testing.T) { func TestLoadDictionary(t *testing.T) { var result []string seg.LoadDictionary("foobar.txt") - for index, content := range test_contents { + for index, content := range testContents { result = chanToArray(seg.Cut(content, true)) if len(result) != len(userDictCutResult[index]) { t.Fatalf("default cut with user dictionary for %s length should be %d not %d\n", diff --git a/posseg/char_state_tab.go b/posseg/char_state_tab.go index 6d3b727..e8ae043 100644 --- a/posseg/char_state_tab.go +++ b/posseg/char_state_tab.go @@ -2,9 +2,9 @@ package posseg import "fmt" -type Tag uint16 +type tag uint16 -func 
(t Tag) Tag() string { +func (t tag) position() string { switch t / 100 { case 4: return "S" @@ -19,31 +19,29 @@ func (t Tag) Tag() string { } } -func (t Tag) POS() string { +func (t tag) pos() string { return poss[t%100] } -func (t Tag) String() string { - return fmt.Sprintf("(%s, %s)", t.Tag(), t.POS()) -} - -func NewTag(tag, pos string) (Tag, error) { - tagIndex := -1 +func newTag(position, pos string) (tag, error) { + positionIndex := -1 posIndex := -1 - for i, t := range tags { - if tag == t { - tagIndex = (i + 1) * 100 + for i, p := range positions { + if position == p { + positionIndex = (i + 1) * 100 + break } } for i, p := range poss { if pos == p { posIndex = i + break } } - if tagIndex < 0 || posIndex < 0 { - return 0, fmt.Errorf("Failed to convert %s %s to Tag", tag, pos) + if positionIndex < 0 || posIndex < 0 { + return 0, fmt.Errorf("Failed to convert %s %s to Tag", position, pos) } - return Tag(tagIndex + posIndex), nil + return tag(positionIndex + posIndex), nil } type charStateTabMap map[rune][]uint16 @@ -51,9 +49,8 @@ type charStateTabMap map[rune][]uint16 func (m charStateTabMap) get(key rune) []uint16 { if value, ok := m[key]; ok { return value - } else { - return probTransKeys } + return probTransKeys } var ( @@ -6708,6 +6705,6 @@ var ( '\u9fa0': []uint16{413}, } - tags = []string{"B", "E", "M", "S"} - poss = []string{"a", "ad", "ag", "an", "b", "bg", "c", "d", "df", "dg", "e", "en", "f", "g", "h", "i", "in", "j", "jn", "k", "l", "ln", "m", "mg", "mq", "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "o", "p", "q", "qe", "qg", "r", "rg", "rr", "rz", "s", "t", "tg", "u", "ud", "ug", "uj", "ul", "uv", "uz", "v", "vd", "vg", "vi", "vn", "vq", "w", "x", "y", "yg", "z", "zg"} + positions = []string{"B", "E", "M", "S"} + poss = []string{"a", "ad", "ag", "an", "b", "bg", "c", "d", "df", "dg", "e", "en", "f", "g", "h", "i", "in", "j", "jn", "k", "l", "ln", "m", "mg", "mq", "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "o", "p", "q", "qe", "qg", 
"r", "rg", "rr", "rz", "s", "t", "tg", "u", "ud", "ug", "uj", "ul", "uv", "uz", "v", "vd", "vg", "vi", "vn", "vq", "w", "x", "y", "yg", "z", "zg"} ) diff --git a/posseg/dictionary.go b/posseg/dictionary.go index e4fee3b..9015bc5 100644 --- a/posseg/dictionary.go +++ b/posseg/dictionary.go @@ -7,6 +7,7 @@ import ( "github.com/wangbin/jiebago/dictionary" ) +// A Dictionary represents a thread-safe dictionary used for word segmentation. type Dictionary struct { total, logTotal float64 freqMap map[string]float64 @@ -14,6 +15,7 @@ type Dictionary struct { sync.RWMutex } +// Load loads all tokens from given channel func (d *Dictionary) Load(ch <-chan dictionary.Token) { d.Lock() for token := range ch { @@ -23,6 +25,7 @@ func (d *Dictionary) Load(ch <-chan dictionary.Token) { d.updateLogTotal() } +// AddToken adds one token func (d *Dictionary) AddToken(token dictionary.Token) { d.Lock() d.addToken(token) @@ -50,6 +53,7 @@ func (d *Dictionary) updateLogTotal() { d.logTotal = math.Log(d.total) } +// Frequency returns the frequency and existence of give word func (d *Dictionary) Frequency(key string) (float64, bool) { d.RLock() freq, ok := d.freqMap[key] @@ -57,6 +61,7 @@ func (d *Dictionary) Frequency(key string) (float64, bool) { return freq, ok } +// Pos returns the POS and existence of give word func (d *Dictionary) Pos(key string) (string, bool) { d.RLock() pos, ok := d.posMap[key] diff --git a/posseg/posseg.go b/posseg/posseg.go index ae095ec..c3b401f 100644 --- a/posseg/posseg.go +++ b/posseg/posseg.go @@ -17,27 +17,36 @@ var ( reSkipInternal = regexp.MustCompile(`(\r\n|\s)`) ) +// Segment represents a word with it's POS type Segment struct { text, pos string } +// Text returns the Segment's text. func (s Segment) Text() string { return s.text } +// Pos returns the Segment's POS. func (s Segment) Pos() string { return s.pos } +// Segmenter is a Chinese words segmentation struct. 
type Segmenter struct { dict *Dictionary } +// LoadDictionary loads a dictionary from the given file name. +// Every time LoadDictionary is called, the previously loaded dictionary is cleared. func (seg *Segmenter) LoadDictionary(fileName string) error { seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)} return seg.dict.loadDictionary(fileName) } +// LoadUserDictionary loads a user-specified dictionary. It must be called +// after LoadDictionary; it does not clear any previously loaded dictionary, +// but overrides existing entries. func (seg *Segmenter) LoadUserDictionary(fileName string) error { return seg.dict.loadDictionary(fileName) } @@ -52,19 +61,19 @@ func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment { next := 0 for i, char := range runes { pos := posList[i] - switch pos.Tag() { + switch pos.position() { case "B": begin = i case "E": - result <- Segment{string(runes[begin : i+1]), pos.POS()} + result <- Segment{string(runes[begin : i+1]), pos.pos()} next = i + 1 case "S": - result <- Segment{string(char), pos.POS()} + result <- Segment{string(char), pos.pos()} next = i + 1 } } if next < len(runes) { - result <- Segment{string(runes[next:]), posList[next].POS()} + result <- Segment{string(runes[next:]), posList[next].pos()} } close(result) }() @@ -117,7 +126,7 @@ func (seg *Segmenter) dag(runes []rune) map[int][]int { if freq > 0.0 { dag[k] = append(dag[k], i) } - i += 1 + i++ if i >= n { break } @@ -170,7 +179,7 @@ func (seg *Segmenter) cutDAG(sentence string) <-chan Segment { routes := seg.calc(runes) var y int length := len(runes) - buf := make([]rune, 0) + var buf []rune for x := 0; x < length; { y = routes[x].index + 1 frag := runes[x:y] @@ -253,7 +262,7 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment { routes := seg.calc(runes) var y int length := len(runes) - buf := make([]rune, 0) + var buf []rune for x := 0; x < length; { y = routes[x].index + 1 frag := runes[x:y] 
@@ -283,6 +292,8 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment { return result } +// Cut cuts a sentence into words. +// Parameter hmm controls whether to use the Hidden Markov Model. func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan Segment { result := make(chan Segment) var cut cutFunc diff --git a/posseg/posseg_test.go b/posseg/posseg_test.go index d861361..5158611 100644 --- a/posseg/posseg_test.go +++ b/posseg/posseg_test.go @@ -5,8 +5,8 @@ import ( ) var ( - seg Segmenter - test_contents = []string{ + seg Segmenter + testContents = []string{ "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", "我不喜欢日本和服。", "雷猴回归人间。", @@ -273,7 +273,7 @@ func init() { } func chanToArray(ch <-chan Segment) []Segment { - result := make([]Segment, 0) + var result []Segment for word := range ch { result = append(result, word) } @@ -281,7 +281,7 @@ func chanToArray(ch <-chan Segment) []Segment { } func TestCut(t *testing.T) { - for index, content := range test_contents { + for index, content := range testContents { result := chanToArray(seg.Cut(content, true)) if len(defaultCutResult[index]) != len(result) { t.Errorf("default cut for %s length should be %d not %d\n", @@ -289,7 +289,7 @@ func TestCut(t *testing.T) { t.Errorf("expect: %v\n", defaultCutResult[index]) t.Fatalf("got: %v\n", result) } - for i, _ := range result { + for i := range result { if result[i] != defaultCutResult[index][i] { t.Fatalf("expect %s, got %s", defaultCutResult[index][i], result[i]) } @@ -298,7 +298,7 @@ func TestCut(t *testing.T) { if len(noHMMCutResult[index]) != len(result) { t.Fatal(content) } - for i, _ := range result { + for i := range result { if result[i] != noHMMCutResult[index][i] { t.Fatal(content) } @@ -320,7 +320,7 @@ func TestBug132(t *testing.T) { if len(cutResult) != len(result) { t.Fatal(result) } - for i, _ := range result { + for i := range result { if result[i] != cutResult[i] { t.Fatal(result[i]) } @@ -349,7 +349,7 @@ func TestBug137(t *testing.T) { if 
len(cutResult) != len(result) { t.Fatal(result) } - for i, _ := range result { + for i := range result { if result[i] != cutResult[i] { t.Fatal(result[i]) } @@ -404,7 +404,7 @@ func TestUserDict(t *testing.T) { if len(cutResult) != len(result) { t.Fatal(result) } - for i, _ := range result { + for i := range result { if result[i] != cutResult[i] { t.Fatal(result[i]) } diff --git a/posseg/prob_emit.go b/posseg/prob_emit.go index 93fe1a9..b8e37f9 100644 --- a/posseg/prob_emit.go +++ b/posseg/prob_emit.go @@ -1,15 +1,14 @@ package posseg -const MinFloat = -3.14e100 +const minFloat = -3.14e100 type runeFloatMap map[rune]float64 func (m runeFloatMap) get(key rune) float64 { if value, ok := m[key]; ok { return value - } else { - return MinFloat } + return minFloat } var probEmit = map[uint16]runeFloatMap{ diff --git a/posseg/prob_trans.go b/posseg/prob_trans.go index 409e719..6a7df99 100644 --- a/posseg/prob_trans.go +++ b/posseg/prob_trans.go @@ -11,9 +11,8 @@ type probTransMap map[uint16]float64 func (m probTransMap) Get(key uint16) float64 { if value, ok := m[key]; ok { return value - } else { - return inf } + return inf } var ( diff --git a/posseg/viterbi.go b/posseg/viterbi.go index ad9de8e..7c03e41 100644 --- a/posseg/viterbi.go +++ b/posseg/viterbi.go @@ -31,52 +31,52 @@ func (pss probStates) Swap(i, j int) { pss[i], pss[j] = pss[j], pss[i] } -func viterbi(obs []rune) []Tag { +func viterbi(obs []rune) []tag { obsLength := len(obs) V := make([]map[uint16]float64, obsLength) V[0] = make(map[uint16]float64) - mem_path := make([]map[uint16]uint16, obsLength) - mem_path[0] = make(map[uint16]uint16) + memPath := make([]map[uint16]uint16, obsLength) + memPath[0] = make(map[uint16]uint16) ys := charStateTab.get(obs[0]) // default is all_states for _, y := range ys { V[0][y] = probEmit[y].get(obs[0]) + probStart[y] - mem_path[0][y] = 0 + memPath[0][y] = 0 } for t := 1; t < obsLength; t++ { - prev_states := make([]uint16, 0) - for x := range mem_path[t-1] { + var prevStates 
[]uint16 + for x := range memPath[t-1] { if len(probTrans[x]) > 0 { - prev_states = append(prev_states, x) + prevStates = append(prevStates, x) } } //use Go's map to implement Python's Set() - prev_states_expect_next := make(map[uint16]int) - for _, x := range prev_states { + prevStatesExpectNext := make(map[uint16]int) + for _, x := range prevStates { for y := range probTrans[x] { - prev_states_expect_next[y] = 1 + prevStatesExpectNext[y] = 1 } } - tmp_obs_states := charStateTab.get(obs[t]) + tmpObsStates := charStateTab.get(obs[t]) - obs_states := make([]uint16, 0) - for index := range tmp_obs_states { - if _, ok := prev_states_expect_next[tmp_obs_states[index]]; ok { - obs_states = append(obs_states, tmp_obs_states[index]) + var obsStates []uint16 + for index := range tmpObsStates { + if _, ok := prevStatesExpectNext[tmpObsStates[index]]; ok { + obsStates = append(obsStates, tmpObsStates[index]) } } - if len(obs_states) == 0 { - for key := range prev_states_expect_next { - obs_states = append(obs_states, key) + if len(obsStates) == 0 { + for key := range prevStatesExpectNext { + obsStates = append(obsStates, key) } } - if len(obs_states) == 0 { - obs_states = probTransKeys + if len(obsStates) == 0 { + obsStates = probTransKeys } - mem_path[t] = make(map[uint16]uint16) + memPath[t] = make(map[uint16]uint16) V[t] = make(map[uint16]float64) - for _, y := range obs_states { + for _, y := range obsStates { var max, ps probState - for i, y0 := range prev_states { + for i, y0 := range prevStates { ps = probState{ prob: V[t-1][y0] + probTrans[y0].Get(y) + probEmit[y].get(obs[t]), state: y0} @@ -85,23 +85,23 @@ func viterbi(obs []rune) []Tag { } } V[t][y] = max.prob - mem_path[t][y] = max.state + memPath[t][y] = max.state } } last := make(probStates, 0) - length := len(mem_path) + length := len(memPath) vlength := len(V) - for y := range mem_path[length-1] { + for y := range memPath[length-1] { ps := probState{prob: V[vlength-1][y], state: y} last = append(last, ps) } 
sort.Sort(sort.Reverse(last)) state := last[0].state - route := make([]Tag, len(obs)) + route := make([]tag, len(obs)) for i := obsLength - 1; i >= 0; i-- { - route[i] = Tag(state) - state = mem_path[i][state] + route[i] = tag(state) + state = memPath[i][state] } return route } diff --git a/posseg/viterbi_test.go b/posseg/viterbi_test.go index faa54b4..f3aa55f 100644 --- a/posseg/viterbi_test.go +++ b/posseg/viterbi_test.go @@ -4,49 +4,49 @@ import ( "testing" ) -var defaultRoute []Tag +var defaultRoute []tag func init() { - var t Tag - t, _ = NewTag("B", "nr") + var t tag + t, _ = newTag("B", "nr") defaultRoute = append(defaultRoute, t) - t, _ = NewTag("M", "nr") + t, _ = newTag("M", "nr") defaultRoute = append(defaultRoute, t) - t, _ = NewTag("E", "nr") + t, _ = newTag("E", "nr") defaultRoute = append(defaultRoute, t) - t, _ = NewTag("S", "v") + t, _ = newTag("S", "v") defaultRoute = append(defaultRoute, t) - t, _ = NewTag("B", "v") + t, _ = newTag("B", "v") defaultRoute = append(defaultRoute, t) - t, _ = NewTag("E", "v") + t, _ = newTag("E", "v") defaultRoute = append(defaultRoute, t) - t, _ = NewTag("B", "n") + t, _ = newTag("B", "n") defaultRoute = append(defaultRoute, t) - t, _ = NewTag("M", "n") + t, _ = newTag("M", "n") defaultRoute = append(defaultRoute, t) - t, _ = NewTag("E", "n") + t, _ = newTag("E", "n") defaultRoute = append(defaultRoute, t) - t, _ = NewTag("S", "d") + t, _ = newTag("S", "d") defaultRoute = append(defaultRoute, t) - t, _ = NewTag("S", "v") + t, _ = newTag("S", "v") defaultRoute = append(defaultRoute, t) - t, _ = NewTag("S", "n") + t, _ = newTag("S", "n") defaultRoute = append(defaultRoute, t) - t, _ = NewTag("B", "v") + t, _ = newTag("B", "v") defaultRoute = append(defaultRoute, t) - t, _ = NewTag("E", "v") + t, _ = newTag("E", "v") defaultRoute = append(defaultRoute, t) - t, _ = NewTag("B", "nr") + t, _ = newTag("B", "nr") defaultRoute = append(defaultRoute, t) - t, _ = NewTag("M", "nr") + t, _ = newTag("M", "nr") defaultRoute = 
append(defaultRoute, t) - t, _ = NewTag("M", "nr") + t, _ = newTag("M", "nr") defaultRoute = append(defaultRoute, t) - t, _ = NewTag("M", "nr") + t, _ = newTag("M", "nr") defaultRoute = append(defaultRoute, t) - t, _ = NewTag("E", "nr") + t, _ = newTag("E", "nr") defaultRoute = append(defaultRoute, t) - t, _ = NewTag("S", "zg") + t, _ = newTag("S", "zg") defaultRoute = append(defaultRoute, t) } diff --git a/tokenizer.go b/tokenizer.go index 8684c96..91eb0c9 100644 --- a/tokenizer.go +++ b/tokenizer.go @@ -9,18 +9,40 @@ import ( "github.com/blevesearch/bleve/registry" ) +// Name is the jieba tokenizer name. const Name = "jieba" -var IdeographRegexp = regexp.MustCompile(`\p{Han}+`) +var ideographRegexp = regexp.MustCompile(`\p{Han}+`) +// JiebaTokenizer is the bleve tokenizer for jiebago. type JiebaTokenizer struct { seg Segmenter hmm, searchMode bool } -func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) { +/* +NewJiebaTokenizer creates a new JiebaTokenizer. + +Parameters: + + dictFilePath: path of the dictionary file. + + hmm: whether to use the Hidden Markov Model to cut unknown words, + i.e. words not found in the dictionary. For example, the word "安卓" ("Android" in + English) is not in the dictionary file. If hmm is set to false, it will be + cut into two single words "安" and "卓"; if hmm is set to true, it will + be treated as one single word because Jieba uses the Hidden Markov Model with + the Viterbi algorithm to guess the best possibility. + + searchMode: whether to further cut long words into several short words. + In Chinese, some long words may contain other words; for example, "交换机" + is a Chinese word for "Switcher". If searchMode is false, it will treat + "交换机" as a single word. If searchMode is true, it will further split + this word into "交换", "换机", which are valid Chinese words. 
+*/ +func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) { var seg Segmenter - err := seg.LoadDictionary(dictFileName) + err := seg.LoadDictionary(dictFilePath) return &JiebaTokenizer{ seg: seg, hmm: hmm, @@ -28,6 +50,7 @@ func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Toke }, err } +// Tokenize cuts input into a bleve token stream. func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream { rv := make(analysis.TokenStream, 0) runeStart := 0 @@ -77,9 +100,20 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream { return rv } +/* +JiebaTokenizerConstructor creates a JiebaTokenizer. + +Parameter config should contain at least one parameter: + + file: the path of the dictionary file. + + hmm: optional, specifies whether to use the Hidden Markov Model; see NewJiebaTokenizer for details. + + search: optional, specifies whether to use search mode; see NewJiebaTokenizer for details. +*/ func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) ( analysis.Tokenizer, error) { - dictFileName, ok := config["file"].(string) + dictFilePath, ok := config["file"].(string) if !ok { return nil, fmt.Errorf("must specify dictionary file path") } @@ -92,11 +126,11 @@ func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Ca searchMode = true } - return NewJiebaTokenizer(dictFileName, hmm, searchMode) + return NewJiebaTokenizer(dictFilePath, hmm, searchMode) } func detectTokenType(term string) analysis.TokenType { - if IdeographRegexp.MatchString(term) { + if ideographRegexp.MatchString(term) { return analysis.Ideographic } _, err := strconv.ParseFloat(term, 64) diff --git a/util/util.go b/util/util.go index 9c2fac7..ae4f0ba 100644 --- a/util/util.go +++ b/util/util.go @@ -2,12 +2,14 @@ package util import "regexp" -// RegexpSplit split slices s into substrings separated by the expression and -// returns a slice of the substrings between those 
expression matches. -// If capturing parentheses are used in expression, then the text of all groups -// in the expression are also returned as part of the resulting slice. -// -// This function acts consistent with Python's re.split function. +/* +RegexpSplit slices s into substrings separated by the expression and +returns a slice of the substrings between those expression matches. +If capturing parentheses are used in the expression, then the text of all groups +in the expression are also returned as part of the resulting slice. + +This function behaves consistently with Python's re.split function. +*/ func RegexpSplit(re *regexp.Regexp, s string, n int) []string { if n == 0 { return nil