mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-26 06:50:23 +08:00
code refactor, added more documents
This commit is contained in:
@@ -6,7 +6,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
test_contents = []string{
|
testContents = []string{
|
||||||
"这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
|
"这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
|
||||||
"我不喜欢日本和服。",
|
"我不喜欢日本和服。",
|
||||||
"雷猴回归人间。",
|
"雷猴回归人间。",
|
||||||
@@ -259,7 +259,7 @@ func TestExtractTags(t *testing.T) {
|
|||||||
te.LoadDictionary("../dict.txt")
|
te.LoadDictionary("../dict.txt")
|
||||||
te.LoadIdf("idf.txt")
|
te.LoadIdf("idf.txt")
|
||||||
|
|
||||||
for index, sentence := range test_contents {
|
for index, sentence := range testContents {
|
||||||
result := te.ExtractTags(sentence, 20)
|
result := te.ExtractTags(sentence, 20)
|
||||||
if len(result) != len(Tags[index]) {
|
if len(result) != len(Tags[index]) {
|
||||||
t.Fatalf("%s = %v", sentence, result)
|
t.Fatalf("%s = %v", sentence, result)
|
||||||
|
|||||||
@@ -7,6 +7,8 @@ import (
|
|||||||
"github.com/wangbin/jiebago/dictionary"
|
"github.com/wangbin/jiebago/dictionary"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Idf represents a thread-safe dictionary for all words with their
|
||||||
|
// IDFs(Inverse Document Frequency).
|
||||||
type Idf struct {
|
type Idf struct {
|
||||||
freqMap map[string]float64
|
freqMap map[string]float64
|
||||||
median float64
|
median float64
|
||||||
@@ -14,6 +16,7 @@ type Idf struct {
|
|||||||
sync.RWMutex
|
sync.RWMutex
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// AddToken adds a new word with IDF into its dictionary.
|
||||||
func (i *Idf) AddToken(token dictionary.Token) {
|
func (i *Idf) AddToken(token dictionary.Token) {
|
||||||
i.Lock()
|
i.Lock()
|
||||||
i.freqMap[token.Text()] = token.Frequency()
|
i.freqMap[token.Text()] = token.Frequency()
|
||||||
@@ -23,6 +26,7 @@ func (i *Idf) AddToken(token dictionary.Token) {
|
|||||||
i.Unlock()
|
i.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Load loads all tokens from channel into its dictionary.
|
||||||
func (i *Idf) Load(ch <-chan dictionary.Token) {
|
func (i *Idf) Load(ch <-chan dictionary.Token) {
|
||||||
i.Lock()
|
i.Lock()
|
||||||
for token := range ch {
|
for token := range ch {
|
||||||
@@ -38,6 +42,7 @@ func (i *Idf) loadDictionary(fileName string) error {
|
|||||||
return dictionary.LoadDictionary(i, fileName)
|
return dictionary.LoadDictionary(i, fileName)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Frequency returns the IDF of given word.
|
||||||
func (i *Idf) Frequency(key string) (float64, bool) {
|
func (i *Idf) Frequency(key string) (float64, bool) {
|
||||||
i.RLock()
|
i.RLock()
|
||||||
freq, ok := i.freqMap[key]
|
freq, ok := i.freqMap[key]
|
||||||
@@ -45,6 +50,7 @@ func (i *Idf) Frequency(key string) (float64, bool) {
|
|||||||
return freq, ok
|
return freq, ok
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NewIdf creates a new Idf instance.
|
||||||
func NewIdf() *Idf {
|
func NewIdf() *Idf {
|
||||||
return &Idf{freqMap: make(map[string]float64), freqs: make([]float64, 0)}
|
return &Idf{freqMap: make(map[string]float64), freqs: make([]float64, 0)}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import (
|
|||||||
"github.com/wangbin/jiebago/dictionary"
|
"github.com/wangbin/jiebago/dictionary"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// DefaultStopWordMap contains some stop words.
|
||||||
var DefaultStopWordMap = map[string]int{
|
var DefaultStopWordMap = map[string]int{
|
||||||
"the": 1,
|
"the": 1,
|
||||||
"of": 1,
|
"of": 1,
|
||||||
@@ -40,23 +41,27 @@ var DefaultStopWordMap = map[string]int{
|
|||||||
"or": 1,
|
"or": 1,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// StopWord is a thread-safe dictionary for all stop words.
|
||||||
type StopWord struct {
|
type StopWord struct {
|
||||||
stopWordMap map[string]int
|
stopWordMap map[string]int
|
||||||
sync.RWMutex
|
sync.RWMutex
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// AddToken adds a token into StopWord dictionary.
|
||||||
func (s *StopWord) AddToken(token dictionary.Token) {
|
func (s *StopWord) AddToken(token dictionary.Token) {
|
||||||
s.Lock()
|
s.Lock()
|
||||||
s.stopWordMap[token.Text()] = 1
|
s.stopWordMap[token.Text()] = 1
|
||||||
s.Unlock()
|
s.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NewStopWord creates a new StopWord with default stop words.
|
||||||
func NewStopWord() *StopWord {
|
func NewStopWord() *StopWord {
|
||||||
s := new(StopWord)
|
s := new(StopWord)
|
||||||
s.stopWordMap = DefaultStopWordMap
|
s.stopWordMap = DefaultStopWordMap
|
||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// IsStopWord checks if a given word is a stop word.
|
||||||
func (s *StopWord) IsStopWord(word string) bool {
|
func (s *StopWord) IsStopWord(word string) bool {
|
||||||
s.RLock()
|
s.RLock()
|
||||||
_, ok := s.stopWordMap[word]
|
_, ok := s.stopWordMap[word]
|
||||||
@@ -64,6 +69,7 @@ func (s *StopWord) IsStopWord(word string) bool {
|
|||||||
return ok
|
return ok
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Load loads all tokens from given channel into StopWord dictionary.
|
||||||
func (s *StopWord) Load(ch <-chan dictionary.Token) {
|
func (s *StopWord) Load(ch <-chan dictionary.Token) {
|
||||||
s.Lock()
|
s.Lock()
|
||||||
for token := range ch {
|
for token := range ch {
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
package analyse
|
package analyse
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
|
||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
@@ -9,23 +8,23 @@ import (
|
|||||||
"github.com/wangbin/jiebago"
|
"github.com/wangbin/jiebago"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Segment represents a word with weight.
|
||||||
type Segment struct {
|
type Segment struct {
|
||||||
text string
|
text string
|
||||||
weight float64
|
weight float64
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Text returns the segment's text.
|
||||||
func (s Segment) Text() string {
|
func (s Segment) Text() string {
|
||||||
return s.text
|
return s.text
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Weight returns the segment's weight.
|
||||||
func (s Segment) Weight() float64 {
|
func (s Segment) Weight() float64 {
|
||||||
return s.weight
|
return s.weight
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s Segment) String() string {
|
// Segments represents a slice of Segment.
|
||||||
return fmt.Sprintf("{%s: %f}", s.text, s.weight)
|
|
||||||
}
|
|
||||||
|
|
||||||
type Segments []Segment
|
type Segments []Segment
|
||||||
|
|
||||||
func (ss Segments) Len() int {
|
func (ss Segments) Len() int {
|
||||||
@@ -44,29 +43,33 @@ func (ss Segments) Swap(i, j int) {
|
|||||||
ss[i], ss[j] = ss[j], ss[i]
|
ss[i], ss[j] = ss[j], ss[i]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TagExtracter is used to extract tags from sentence.
|
||||||
type TagExtracter struct {
|
type TagExtracter struct {
|
||||||
seg *jiebago.Segmenter
|
seg *jiebago.Segmenter
|
||||||
idf *Idf
|
idf *Idf
|
||||||
stopWord *StopWord
|
stopWord *StopWord
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LoadDictionary reads the given file and creates a new dictionary.
|
||||||
func (t *TagExtracter) LoadDictionary(fileName string) error {
|
func (t *TagExtracter) LoadDictionary(fileName string) error {
|
||||||
t.stopWord = NewStopWord()
|
t.stopWord = NewStopWord()
|
||||||
t.seg = new(jiebago.Segmenter)
|
t.seg = new(jiebago.Segmenter)
|
||||||
return t.seg.LoadDictionary(fileName)
|
return t.seg.LoadDictionary(fileName)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LoadIdf reads the given file and creates a new Idf dictionary.
|
||||||
func (t *TagExtracter) LoadIdf(fileName string) error {
|
func (t *TagExtracter) LoadIdf(fileName string) error {
|
||||||
t.idf = NewIdf()
|
t.idf = NewIdf()
|
||||||
return t.idf.loadDictionary(fileName)
|
return t.idf.loadDictionary(fileName)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LoadStopWords reads the given file and creates a new StopWord dictionary.
|
||||||
func (t *TagExtracter) LoadStopWords(fileName string) error {
|
func (t *TagExtracter) LoadStopWords(fileName string) error {
|
||||||
t.stopWord = NewStopWord()
|
t.stopWord = NewStopWord()
|
||||||
return t.stopWord.loadDictionary(fileName)
|
return t.stopWord.loadDictionary(fileName)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Keyword extraction.
|
// ExtractTags extracts the topK key words from sentence.
|
||||||
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
|
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
|
||||||
freqMap := make(map[string]float64)
|
freqMap := make(map[string]float64)
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
package analyse
|
package analyse
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
|
||||||
"math"
|
"math"
|
||||||
"sort"
|
"sort"
|
||||||
|
|
||||||
@@ -20,10 +19,6 @@ type edge struct {
|
|||||||
weight float64
|
weight float64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e edge) String() string {
|
|
||||||
return fmt.Sprintf("(%s %s): %f", e.start, e.end, e.weight)
|
|
||||||
}
|
|
||||||
|
|
||||||
type edges []edge
|
type edges []edge
|
||||||
|
|
||||||
func (es edges) Len() int {
|
func (es edges) Len() int {
|
||||||
@@ -114,8 +109,8 @@ func (u *undirectWeightedGraph) rank() Segments {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract keywords from sentence using TextRank algorithm. the allowed POS list
|
// TextRankWithPOS extracts keywords from sentence using TextRank algorithm.
|
||||||
// could be manually speificed.
|
// Parameter allowPOS allows a customized pos list.
|
||||||
func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) Segments {
|
func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) Segments {
|
||||||
posFilt := make(map[string]int)
|
posFilt := make(map[string]int)
|
||||||
for _, pos := range allowPOS {
|
for _, pos := range allowPOS {
|
||||||
@@ -124,7 +119,7 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin
|
|||||||
g := newUndirectWeightedGraph()
|
g := newUndirectWeightedGraph()
|
||||||
cm := make(map[[2]string]float64)
|
cm := make(map[[2]string]float64)
|
||||||
span := 5
|
span := 5
|
||||||
pairs := make([]posseg.Segment, 0)
|
var pairs []posseg.Segment
|
||||||
for pair := range t.seg.Cut(sentence, true) {
|
for pair := range t.seg.Cut(sentence, true) {
|
||||||
pairs = append(pairs, pair)
|
pairs = append(pairs, pair)
|
||||||
}
|
}
|
||||||
@@ -152,16 +147,18 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin
|
|||||||
return tags
|
return tags
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract keywords from sentence using TextRank algorithm.
|
// TextRank extracts keywords from sentence using TextRank algorithm.
|
||||||
// topK specify how many top keywords to be returned at most.
|
// Parameter topK specifies how many top keywords to be returned at most.
|
||||||
func (t *TextRanker) TextRank(sentence string, topK int) Segments {
|
func (t *TextRanker) TextRank(sentence string, topK int) Segments {
|
||||||
return t.TextRankWithPOS(sentence, topK, defaultAllowPOS)
|
return t.TextRankWithPOS(sentence, topK, defaultAllowPOS)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TextRanker is used to extract tags from sentence.
|
||||||
type TextRanker struct {
|
type TextRanker struct {
|
||||||
seg *posseg.Segmenter
|
seg *posseg.Segmenter
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LoadDictionary reads a given file and creates a new dictionary for TextRanker.
|
||||||
func (t *TextRanker) LoadDictionary(fileName string) error {
|
func (t *TextRanker) LoadDictionary(fileName string) error {
|
||||||
t.seg = new(posseg.Segmenter)
|
t.seg = new(posseg.Segmenter)
|
||||||
return t.seg.LoadDictionary(fileName)
|
return t.seg.LoadDictionary(fileName)
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ type Dictionary struct {
|
|||||||
sync.RWMutex
|
sync.RWMutex
|
||||||
}
|
}
|
||||||
|
|
||||||
// Load loads all tokens from channel
|
// Load loads all tokens from given channel
|
||||||
func (d *Dictionary) Load(ch <-chan dictionary.Token) {
|
func (d *Dictionary) Load(ch <-chan dictionary.Token) {
|
||||||
d.Lock()
|
d.Lock()
|
||||||
for token := range ch {
|
for token := range ch {
|
||||||
@@ -49,7 +49,7 @@ func (d *Dictionary) updateLogTotal() {
|
|||||||
d.logTotal = math.Log(d.total)
|
d.logTotal = math.Log(d.total)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Frequency returns the frequency of give word, if not found, the second result is false
|
// Frequency returns the frequency and existence of the given word
|
||||||
func (d *Dictionary) Frequency(key string) (float64, bool) {
|
func (d *Dictionary) Frequency(key string) (float64, bool) {
|
||||||
d.RLock()
|
d.RLock()
|
||||||
freq, ok := d.freqMap[key]
|
freq, ok := d.freqMap[key]
|
||||||
|
|||||||
@@ -8,6 +8,8 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// DictLoader represents an interface that could add one token or load a bunch of
|
||||||
|
// tokens from channel.
|
||||||
type DictLoader interface {
|
type DictLoader interface {
|
||||||
Load(<-chan Token)
|
Load(<-chan Token)
|
||||||
AddToken(Token)
|
AddToken(Token)
|
||||||
@@ -49,6 +51,7 @@ func loadDictionary(file *os.File) (<-chan Token, <-chan error) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LoadDictionary reads the given file and passes all tokens to a DictLoader.
|
||||||
func LoadDictionary(dl DictLoader, fileName string) error {
|
func LoadDictionary(dl DictLoader, fileName string) error {
|
||||||
filePath, err := dictPath(fileName)
|
filePath, err := dictPath(fileName)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -1,23 +1,28 @@
|
|||||||
package dictionary
|
package dictionary
|
||||||
|
|
||||||
|
// Token represents a Chinese word with (optional) frequency and POS.
|
||||||
type Token struct {
|
type Token struct {
|
||||||
text string
|
text string
|
||||||
frequency float64
|
frequency float64
|
||||||
pos string
|
pos string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Text returns token's text.
|
||||||
func (t Token) Text() string {
|
func (t Token) Text() string {
|
||||||
return t.text
|
return t.text
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Frequency returns token's frequency.
|
||||||
func (t Token) Frequency() float64 {
|
func (t Token) Frequency() float64 {
|
||||||
return t.frequency
|
return t.frequency
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Pos returns token's POS.
|
||||||
func (t Token) Pos() string {
|
func (t Token) Pos() string {
|
||||||
return t.pos
|
return t.pos
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NewToken creates a new token.
|
||||||
func NewToken(text string, frequency float64, pos string) Token {
|
func NewToken(text string, frequency float64, pos string) Token {
|
||||||
return Token{text: text, frequency: frequency, pos: pos}
|
return Token{text: text, frequency: frequency, pos: pos}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -13,10 +13,10 @@ func cutHan(sentence string) chan string {
|
|||||||
result := make(chan string)
|
result := make(chan string)
|
||||||
go func() {
|
go func() {
|
||||||
runes := []rune(sentence)
|
runes := []rune(sentence)
|
||||||
_, pos_list := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
|
_, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
|
||||||
begin, next := 0, 0
|
begin, next := 0, 0
|
||||||
for i, char := range runes {
|
for i, char := range runes {
|
||||||
pos := pos_list[i]
|
pos := posList[i]
|
||||||
switch pos {
|
switch pos {
|
||||||
case 'B':
|
case 'B':
|
||||||
begin = i
|
begin = i
|
||||||
@@ -36,6 +36,8 @@ func cutHan(sentence string) chan string {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Cut cuts sentence into words using Hidden Markov Model with Viterbi
|
||||||
|
// algorithm. It is used by Jiebago for unknown words.
|
||||||
func Cut(sentence string) chan string {
|
func Cut(sentence string) chan string {
|
||||||
result := make(chan string)
|
result := make(chan string)
|
||||||
s := sentence
|
s := sentence
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func chanToArray(ch chan string) []string {
|
func chanToArray(ch chan string) []string {
|
||||||
result := make([]string, 0)
|
var result []string
|
||||||
for word := range ch {
|
for word := range ch {
|
||||||
result = append(result, word)
|
result = append(result, word)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -67,11 +67,11 @@ func viterbi(obs []rune, states []byte) (float64, []byte) {
|
|||||||
V[t] = make(map[byte]float64)
|
V[t] = make(map[byte]float64)
|
||||||
for _, y := range states {
|
for _, y := range states {
|
||||||
ps0 := make(probStates, 0)
|
ps0 := make(probStates, 0)
|
||||||
var em_p float64
|
var emP float64
|
||||||
if val, ok := probEmit[y][obs[t]]; ok {
|
if val, ok := probEmit[y][obs[t]]; ok {
|
||||||
em_p = val
|
emP = val
|
||||||
} else {
|
} else {
|
||||||
em_p = minFloat
|
emP = minFloat
|
||||||
}
|
}
|
||||||
for _, y0 := range prevStatus[y] {
|
for _, y0 := range prevStatus[y] {
|
||||||
var transP float64
|
var transP float64
|
||||||
@@ -80,7 +80,7 @@ func viterbi(obs []rune, states []byte) (float64, []byte) {
|
|||||||
} else {
|
} else {
|
||||||
transP = minFloat
|
transP = minFloat
|
||||||
}
|
}
|
||||||
prob0 := V[t-1][y0] + transP + em_p
|
prob0 := V[t-1][y0] + transP + emP
|
||||||
ps0 = append(ps0, &probState{prob: prob0, state: y0})
|
ps0 = append(ps0, &probState{prob: prob0, state: y0})
|
||||||
}
|
}
|
||||||
sort.Sort(sort.Reverse(ps0))
|
sort.Sort(sort.Reverse(ps0))
|
||||||
|
|||||||
23
jieba.go
23
jieba.go
@@ -16,15 +16,21 @@ var (
|
|||||||
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
|
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Segmenter is a Chinese words segmentation struct.
|
||||||
type Segmenter struct {
|
type Segmenter struct {
|
||||||
dict *Dictionary
|
dict *Dictionary
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LoadDictionary loads dictionary from given file name. Every time
|
||||||
|
// LoadDictionary is called, previously loaded dictionary will be cleared.
|
||||||
func (seg *Segmenter) LoadDictionary(fileName string) error {
|
func (seg *Segmenter) LoadDictionary(fileName string) error {
|
||||||
seg.dict = &Dictionary{freqMap: make(map[string]float64)}
|
seg.dict = &Dictionary{freqMap: make(map[string]float64)}
|
||||||
return seg.dict.loadDictionary(fileName)
|
return seg.dict.loadDictionary(fileName)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LoadUserDictionary loads a user specified dictionary, it must be called
|
||||||
|
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
||||||
|
// instead it will override existing entries.
|
||||||
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
|
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
|
||||||
return seg.dict.loadDictionary(fileName)
|
return seg.dict.loadDictionary(fileName)
|
||||||
}
|
}
|
||||||
@@ -46,7 +52,7 @@ func (seg *Segmenter) dag(runes []rune) map[int][]int {
|
|||||||
if freq > 0.0 {
|
if freq > 0.0 {
|
||||||
dag[k] = append(dag[k], i)
|
dag[k] = append(dag[k], i)
|
||||||
}
|
}
|
||||||
i += 1
|
i++
|
||||||
if i >= n {
|
if i >= n {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
@@ -98,7 +104,7 @@ func (seg *Segmenter) cutDAG(sentence string) <-chan string {
|
|||||||
routes := seg.calc(runes)
|
routes := seg.calc(runes)
|
||||||
var y int
|
var y int
|
||||||
length := len(runes)
|
length := len(runes)
|
||||||
buf := make([]rune, 0)
|
var buf []rune
|
||||||
for x := 0; x < length; {
|
for x := 0; x < length; {
|
||||||
y = routes[x].index + 1
|
y = routes[x].index + 1
|
||||||
frag := runes[x:y]
|
frag := runes[x:y]
|
||||||
@@ -156,7 +162,7 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string {
|
|||||||
routes := seg.calc(runes)
|
routes := seg.calc(runes)
|
||||||
var y int
|
var y int
|
||||||
length := len(runes)
|
length := len(runes)
|
||||||
buf := make([]rune, 0)
|
var buf []rune
|
||||||
for x := 0; x < length; {
|
for x := 0; x < length; {
|
||||||
y = routes[x].index + 1
|
y = routes[x].index + 1
|
||||||
frag := runes[x:y]
|
frag := runes[x:y]
|
||||||
@@ -181,6 +187,10 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Cut cuts a sentence into words using accurate mode.
|
||||||
|
// Parameter hmm controls whether to use the Hidden Markov Model.
|
||||||
|
// Accurate mode attempts to cut the sentence into the most accurate
|
||||||
|
// segmentations, which is suitable for text analysis.
|
||||||
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan string {
|
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan string {
|
||||||
result := make(chan string)
|
result := make(chan string)
|
||||||
var cut cutFunc
|
var cut cutFunc
|
||||||
@@ -246,6 +256,9 @@ func (seg *Segmenter) cutAll(sentence string) <-chan string {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CutAll cuts a sentence into words using full mode.
|
||||||
|
// Full mode gets all the possible words from the sentence.
|
||||||
|
// Fast but not accurate.
|
||||||
func (seg *Segmenter) CutAll(sentence string) <-chan string {
|
func (seg *Segmenter) CutAll(sentence string) <-chan string {
|
||||||
result := make(chan string)
|
result := make(chan string)
|
||||||
go func() {
|
go func() {
|
||||||
@@ -268,6 +281,10 @@ func (seg *Segmenter) CutAll(sentence string) <-chan string {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CutForSearch cuts sentence into words using search engine mode.
|
||||||
|
// Search engine mode, based on the accurate mode, attempts to cut long words
|
||||||
|
// into several short words, which can raise the recall rate.
|
||||||
|
// Suitable for search engines.
|
||||||
func (seg *Segmenter) CutForSearch(sentence string, hmm bool) <-chan string {
|
func (seg *Segmenter) CutForSearch(sentence string, hmm bool) <-chan string {
|
||||||
result := make(chan string)
|
result := make(chan string)
|
||||||
go func() {
|
go func() {
|
||||||
|
|||||||
@@ -3,8 +3,8 @@ package jiebago
|
|||||||
import "testing"
|
import "testing"
|
||||||
|
|
||||||
var (
|
var (
|
||||||
seg Segmenter
|
seg Segmenter
|
||||||
test_contents = []string{
|
testContents = []string{
|
||||||
"这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
|
"这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
|
||||||
"我不喜欢日本和服。",
|
"我不喜欢日本和服。",
|
||||||
"雷猴回归人间。",
|
"雷猴回归人间。",
|
||||||
@@ -620,7 +620,7 @@ func init() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func chanToArray(ch <-chan string) []string {
|
func chanToArray(ch <-chan string) []string {
|
||||||
result := make([]string, 0)
|
var result []string
|
||||||
for word := range ch {
|
for word := range ch {
|
||||||
result = append(result, word)
|
result = append(result, word)
|
||||||
}
|
}
|
||||||
@@ -643,7 +643,7 @@ func TestCutDAGNoHmm(t *testing.T) {
|
|||||||
|
|
||||||
func TestDefaultCut(t *testing.T) {
|
func TestDefaultCut(t *testing.T) {
|
||||||
var result []string
|
var result []string
|
||||||
for index, content := range test_contents {
|
for index, content := range testContents {
|
||||||
result = chanToArray(seg.Cut(content, true))
|
result = chanToArray(seg.Cut(content, true))
|
||||||
if len(result) != len(defaultCutResult[index]) {
|
if len(result) != len(defaultCutResult[index]) {
|
||||||
t.Errorf("default cut for %s length should be %d not %d\n",
|
t.Errorf("default cut for %s length should be %d not %d\n",
|
||||||
@@ -661,7 +661,7 @@ func TestDefaultCut(t *testing.T) {
|
|||||||
|
|
||||||
func TestCutAll(t *testing.T) {
|
func TestCutAll(t *testing.T) {
|
||||||
var result []string
|
var result []string
|
||||||
for index, content := range test_contents {
|
for index, content := range testContents {
|
||||||
result = chanToArray(seg.CutAll(content))
|
result = chanToArray(seg.CutAll(content))
|
||||||
if len(result) != len(cutAllResult[index]) {
|
if len(result) != len(cutAllResult[index]) {
|
||||||
t.Errorf("cut all for %s length should be %d not %d\n",
|
t.Errorf("cut all for %s length should be %d not %d\n",
|
||||||
@@ -679,7 +679,7 @@ func TestCutAll(t *testing.T) {
|
|||||||
|
|
||||||
func TestDefaultCutNoHMM(t *testing.T) {
|
func TestDefaultCutNoHMM(t *testing.T) {
|
||||||
var result []string
|
var result []string
|
||||||
for index, content := range test_contents {
|
for index, content := range testContents {
|
||||||
result = chanToArray(seg.Cut(content, false))
|
result = chanToArray(seg.Cut(content, false))
|
||||||
if len(result) != len(defaultCutNoHMMResult[index]) {
|
if len(result) != len(defaultCutNoHMMResult[index]) {
|
||||||
t.Fatalf("default cut no hmm for %s length should be %d not %d\n",
|
t.Fatalf("default cut no hmm for %s length should be %d not %d\n",
|
||||||
@@ -695,7 +695,7 @@ func TestDefaultCutNoHMM(t *testing.T) {
|
|||||||
|
|
||||||
func TestCutForSearch(t *testing.T) {
|
func TestCutForSearch(t *testing.T) {
|
||||||
var result []string
|
var result []string
|
||||||
for index, content := range test_contents {
|
for index, content := range testContents {
|
||||||
result = chanToArray(seg.CutForSearch(content, true))
|
result = chanToArray(seg.CutForSearch(content, true))
|
||||||
if len(result) != len(cutForSearchResult[index]) {
|
if len(result) != len(cutForSearchResult[index]) {
|
||||||
t.Fatalf("cut for search for %s length should be %d not %d\n",
|
t.Fatalf("cut for search for %s length should be %d not %d\n",
|
||||||
@@ -707,7 +707,7 @@ func TestCutForSearch(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for index, content := range test_contents {
|
for index, content := range testContents {
|
||||||
result = chanToArray(seg.CutForSearch(content, false))
|
result = chanToArray(seg.CutForSearch(content, false))
|
||||||
if len(result) != len(cutForSearchNoHMMResult[index]) {
|
if len(result) != len(cutForSearchNoHMMResult[index]) {
|
||||||
t.Fatalf("cut for search no hmm for %s length should be %d not %d\n",
|
t.Fatalf("cut for search no hmm for %s length should be %d not %d\n",
|
||||||
@@ -724,7 +724,7 @@ func TestCutForSearch(t *testing.T) {
|
|||||||
func TestLoadDictionary(t *testing.T) {
|
func TestLoadDictionary(t *testing.T) {
|
||||||
var result []string
|
var result []string
|
||||||
seg.LoadDictionary("foobar.txt")
|
seg.LoadDictionary("foobar.txt")
|
||||||
for index, content := range test_contents {
|
for index, content := range testContents {
|
||||||
result = chanToArray(seg.Cut(content, true))
|
result = chanToArray(seg.Cut(content, true))
|
||||||
if len(result) != len(userDictCutResult[index]) {
|
if len(result) != len(userDictCutResult[index]) {
|
||||||
t.Fatalf("default cut with user dictionary for %s length should be %d not %d\n",
|
t.Fatalf("default cut with user dictionary for %s length should be %d not %d\n",
|
||||||
|
|||||||
@@ -2,9 +2,9 @@ package posseg
|
|||||||
|
|
||||||
import "fmt"
|
import "fmt"
|
||||||
|
|
||||||
type Tag uint16
|
type tag uint16
|
||||||
|
|
||||||
func (t Tag) Tag() string {
|
func (t tag) position() string {
|
||||||
switch t / 100 {
|
switch t / 100 {
|
||||||
case 4:
|
case 4:
|
||||||
return "S"
|
return "S"
|
||||||
@@ -19,31 +19,29 @@ func (t Tag) Tag() string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t Tag) POS() string {
|
func (t tag) pos() string {
|
||||||
return poss[t%100]
|
return poss[t%100]
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t Tag) String() string {
|
func newTag(position, pos string) (tag, error) {
|
||||||
return fmt.Sprintf("(%s, %s)", t.Tag(), t.POS())
|
positionIndex := -1
|
||||||
}
|
|
||||||
|
|
||||||
func NewTag(tag, pos string) (Tag, error) {
|
|
||||||
tagIndex := -1
|
|
||||||
posIndex := -1
|
posIndex := -1
|
||||||
for i, t := range tags {
|
for i, p := range positions {
|
||||||
if tag == t {
|
if position == p {
|
||||||
tagIndex = (i + 1) * 100
|
positionIndex = (i + 1) * 100
|
||||||
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for i, p := range poss {
|
for i, p := range poss {
|
||||||
if pos == p {
|
if pos == p {
|
||||||
posIndex = i
|
posIndex = i
|
||||||
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if tagIndex < 0 || posIndex < 0 {
|
if positionIndex < 0 || posIndex < 0 {
|
||||||
return 0, fmt.Errorf("Failed to convert %s %s to Tag", tag, pos)
|
return 0, fmt.Errorf("Failed to convert %s %s to Tag", position, pos)
|
||||||
}
|
}
|
||||||
return Tag(tagIndex + posIndex), nil
|
return tag(positionIndex + posIndex), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
type charStateTabMap map[rune][]uint16
|
type charStateTabMap map[rune][]uint16
|
||||||
@@ -51,9 +49,8 @@ type charStateTabMap map[rune][]uint16
|
|||||||
func (m charStateTabMap) get(key rune) []uint16 {
|
func (m charStateTabMap) get(key rune) []uint16 {
|
||||||
if value, ok := m[key]; ok {
|
if value, ok := m[key]; ok {
|
||||||
return value
|
return value
|
||||||
} else {
|
|
||||||
return probTransKeys
|
|
||||||
}
|
}
|
||||||
|
return probTransKeys
|
||||||
}
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
@@ -6708,6 +6705,6 @@ var (
|
|||||||
'\u9fa0': []uint16{413},
|
'\u9fa0': []uint16{413},
|
||||||
}
|
}
|
||||||
|
|
||||||
tags = []string{"B", "E", "M", "S"}
|
positions = []string{"B", "E", "M", "S"}
|
||||||
poss = []string{"a", "ad", "ag", "an", "b", "bg", "c", "d", "df", "dg", "e", "en", "f", "g", "h", "i", "in", "j", "jn", "k", "l", "ln", "m", "mg", "mq", "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "o", "p", "q", "qe", "qg", "r", "rg", "rr", "rz", "s", "t", "tg", "u", "ud", "ug", "uj", "ul", "uv", "uz", "v", "vd", "vg", "vi", "vn", "vq", "w", "x", "y", "yg", "z", "zg"}
|
poss = []string{"a", "ad", "ag", "an", "b", "bg", "c", "d", "df", "dg", "e", "en", "f", "g", "h", "i", "in", "j", "jn", "k", "l", "ln", "m", "mg", "mq", "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "o", "p", "q", "qe", "qg", "r", "rg", "rr", "rz", "s", "t", "tg", "u", "ud", "ug", "uj", "ul", "uv", "uz", "v", "vd", "vg", "vi", "vn", "vq", "w", "x", "y", "yg", "z", "zg"}
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ import (
|
|||||||
"github.com/wangbin/jiebago/dictionary"
|
"github.com/wangbin/jiebago/dictionary"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// A Dictionary represents a thread-safe dictionary used for word segmentation.
|
||||||
type Dictionary struct {
|
type Dictionary struct {
|
||||||
total, logTotal float64
|
total, logTotal float64
|
||||||
freqMap map[string]float64
|
freqMap map[string]float64
|
||||||
@@ -14,6 +15,7 @@ type Dictionary struct {
|
|||||||
sync.RWMutex
|
sync.RWMutex
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Load loads all tokens from given channel
|
||||||
func (d *Dictionary) Load(ch <-chan dictionary.Token) {
|
func (d *Dictionary) Load(ch <-chan dictionary.Token) {
|
||||||
d.Lock()
|
d.Lock()
|
||||||
for token := range ch {
|
for token := range ch {
|
||||||
@@ -23,6 +25,7 @@ func (d *Dictionary) Load(ch <-chan dictionary.Token) {
|
|||||||
d.updateLogTotal()
|
d.updateLogTotal()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// AddToken adds one token
|
||||||
func (d *Dictionary) AddToken(token dictionary.Token) {
|
func (d *Dictionary) AddToken(token dictionary.Token) {
|
||||||
d.Lock()
|
d.Lock()
|
||||||
d.addToken(token)
|
d.addToken(token)
|
||||||
@@ -50,6 +53,7 @@ func (d *Dictionary) updateLogTotal() {
|
|||||||
d.logTotal = math.Log(d.total)
|
d.logTotal = math.Log(d.total)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Frequency returns the frequency and existence of the given word
|
||||||
func (d *Dictionary) Frequency(key string) (float64, bool) {
|
func (d *Dictionary) Frequency(key string) (float64, bool) {
|
||||||
d.RLock()
|
d.RLock()
|
||||||
freq, ok := d.freqMap[key]
|
freq, ok := d.freqMap[key]
|
||||||
@@ -57,6 +61,7 @@ func (d *Dictionary) Frequency(key string) (float64, bool) {
|
|||||||
return freq, ok
|
return freq, ok
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Pos returns the POS and existence of the given word.
|
||||||
func (d *Dictionary) Pos(key string) (string, bool) {
|
func (d *Dictionary) Pos(key string) (string, bool) {
|
||||||
d.RLock()
|
d.RLock()
|
||||||
pos, ok := d.posMap[key]
|
pos, ok := d.posMap[key]
|
||||||
|
|||||||
@@ -17,27 +17,36 @@ var (
|
|||||||
reSkipInternal = regexp.MustCompile(`(\r\n|\s)`)
|
reSkipInternal = regexp.MustCompile(`(\r\n|\s)`)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Segment represents a word with its POS.
|
||||||
type Segment struct {
|
type Segment struct {
|
||||||
text, pos string
|
text, pos string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Text returns the Segment's text.
|
||||||
func (s Segment) Text() string {
|
func (s Segment) Text() string {
|
||||||
return s.text
|
return s.text
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Pos returns the Segment's POS.
|
||||||
func (s Segment) Pos() string {
|
func (s Segment) Pos() string {
|
||||||
return s.pos
|
return s.pos
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Segmenter is a Chinese words segmentation struct.
|
||||||
type Segmenter struct {
|
type Segmenter struct {
|
||||||
dict *Dictionary
|
dict *Dictionary
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LoadDictionary loads dictionary from given file name.
|
||||||
|
// Every time LoadDictionary is called, the previously loaded dictionary will be cleared.
|
||||||
func (seg *Segmenter) LoadDictionary(fileName string) error {
|
func (seg *Segmenter) LoadDictionary(fileName string) error {
|
||||||
seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
||||||
return seg.dict.loadDictionary(fileName)
|
return seg.dict.loadDictionary(fileName)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LoadUserDictionary loads a user specified dictionary, it must be called
|
||||||
|
// after LoadDictionary, and it will not clear any previously loaded dictionary,
|
||||||
|
// instead it will override existing entries.
|
||||||
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
|
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
|
||||||
return seg.dict.loadDictionary(fileName)
|
return seg.dict.loadDictionary(fileName)
|
||||||
}
|
}
|
||||||
@@ -52,19 +61,19 @@ func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment {
|
|||||||
next := 0
|
next := 0
|
||||||
for i, char := range runes {
|
for i, char := range runes {
|
||||||
pos := posList[i]
|
pos := posList[i]
|
||||||
switch pos.Tag() {
|
switch pos.position() {
|
||||||
case "B":
|
case "B":
|
||||||
begin = i
|
begin = i
|
||||||
case "E":
|
case "E":
|
||||||
result <- Segment{string(runes[begin : i+1]), pos.POS()}
|
result <- Segment{string(runes[begin : i+1]), pos.pos()}
|
||||||
next = i + 1
|
next = i + 1
|
||||||
case "S":
|
case "S":
|
||||||
result <- Segment{string(char), pos.POS()}
|
result <- Segment{string(char), pos.pos()}
|
||||||
next = i + 1
|
next = i + 1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if next < len(runes) {
|
if next < len(runes) {
|
||||||
result <- Segment{string(runes[next:]), posList[next].POS()}
|
result <- Segment{string(runes[next:]), posList[next].pos()}
|
||||||
}
|
}
|
||||||
close(result)
|
close(result)
|
||||||
}()
|
}()
|
||||||
@@ -117,7 +126,7 @@ func (seg *Segmenter) dag(runes []rune) map[int][]int {
|
|||||||
if freq > 0.0 {
|
if freq > 0.0 {
|
||||||
dag[k] = append(dag[k], i)
|
dag[k] = append(dag[k], i)
|
||||||
}
|
}
|
||||||
i += 1
|
i++
|
||||||
if i >= n {
|
if i >= n {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
@@ -170,7 +179,7 @@ func (seg *Segmenter) cutDAG(sentence string) <-chan Segment {
|
|||||||
routes := seg.calc(runes)
|
routes := seg.calc(runes)
|
||||||
var y int
|
var y int
|
||||||
length := len(runes)
|
length := len(runes)
|
||||||
buf := make([]rune, 0)
|
var buf []rune
|
||||||
for x := 0; x < length; {
|
for x := 0; x < length; {
|
||||||
y = routes[x].index + 1
|
y = routes[x].index + 1
|
||||||
frag := runes[x:y]
|
frag := runes[x:y]
|
||||||
@@ -253,7 +262,7 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment {
|
|||||||
routes := seg.calc(runes)
|
routes := seg.calc(runes)
|
||||||
var y int
|
var y int
|
||||||
length := len(runes)
|
length := len(runes)
|
||||||
buf := make([]rune, 0)
|
var buf []rune
|
||||||
for x := 0; x < length; {
|
for x := 0; x < length; {
|
||||||
y = routes[x].index + 1
|
y = routes[x].index + 1
|
||||||
frag := runes[x:y]
|
frag := runes[x:y]
|
||||||
@@ -283,6 +292,8 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Cut cuts a sentence into words.
|
||||||
|
// Parameter hmm controls whether to use the Hidden Markov Model.
|
||||||
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan Segment {
|
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan Segment {
|
||||||
result := make(chan Segment)
|
result := make(chan Segment)
|
||||||
var cut cutFunc
|
var cut cutFunc
|
||||||
|
|||||||
@@ -5,8 +5,8 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
seg Segmenter
|
seg Segmenter
|
||||||
test_contents = []string{
|
testContents = []string{
|
||||||
"这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
|
"这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
|
||||||
"我不喜欢日本和服。",
|
"我不喜欢日本和服。",
|
||||||
"雷猴回归人间。",
|
"雷猴回归人间。",
|
||||||
@@ -273,7 +273,7 @@ func init() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func chanToArray(ch <-chan Segment) []Segment {
|
func chanToArray(ch <-chan Segment) []Segment {
|
||||||
result := make([]Segment, 0)
|
var result []Segment
|
||||||
for word := range ch {
|
for word := range ch {
|
||||||
result = append(result, word)
|
result = append(result, word)
|
||||||
}
|
}
|
||||||
@@ -281,7 +281,7 @@ func chanToArray(ch <-chan Segment) []Segment {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestCut(t *testing.T) {
|
func TestCut(t *testing.T) {
|
||||||
for index, content := range test_contents {
|
for index, content := range testContents {
|
||||||
result := chanToArray(seg.Cut(content, true))
|
result := chanToArray(seg.Cut(content, true))
|
||||||
if len(defaultCutResult[index]) != len(result) {
|
if len(defaultCutResult[index]) != len(result) {
|
||||||
t.Errorf("default cut for %s length should be %d not %d\n",
|
t.Errorf("default cut for %s length should be %d not %d\n",
|
||||||
@@ -289,7 +289,7 @@ func TestCut(t *testing.T) {
|
|||||||
t.Errorf("expect: %v\n", defaultCutResult[index])
|
t.Errorf("expect: %v\n", defaultCutResult[index])
|
||||||
t.Fatalf("got: %v\n", result)
|
t.Fatalf("got: %v\n", result)
|
||||||
}
|
}
|
||||||
for i, _ := range result {
|
for i := range result {
|
||||||
if result[i] != defaultCutResult[index][i] {
|
if result[i] != defaultCutResult[index][i] {
|
||||||
t.Fatalf("expect %s, got %s", defaultCutResult[index][i], result[i])
|
t.Fatalf("expect %s, got %s", defaultCutResult[index][i], result[i])
|
||||||
}
|
}
|
||||||
@@ -298,7 +298,7 @@ func TestCut(t *testing.T) {
|
|||||||
if len(noHMMCutResult[index]) != len(result) {
|
if len(noHMMCutResult[index]) != len(result) {
|
||||||
t.Fatal(content)
|
t.Fatal(content)
|
||||||
}
|
}
|
||||||
for i, _ := range result {
|
for i := range result {
|
||||||
if result[i] != noHMMCutResult[index][i] {
|
if result[i] != noHMMCutResult[index][i] {
|
||||||
t.Fatal(content)
|
t.Fatal(content)
|
||||||
}
|
}
|
||||||
@@ -320,7 +320,7 @@ func TestBug132(t *testing.T) {
|
|||||||
if len(cutResult) != len(result) {
|
if len(cutResult) != len(result) {
|
||||||
t.Fatal(result)
|
t.Fatal(result)
|
||||||
}
|
}
|
||||||
for i, _ := range result {
|
for i := range result {
|
||||||
if result[i] != cutResult[i] {
|
if result[i] != cutResult[i] {
|
||||||
t.Fatal(result[i])
|
t.Fatal(result[i])
|
||||||
}
|
}
|
||||||
@@ -349,7 +349,7 @@ func TestBug137(t *testing.T) {
|
|||||||
if len(cutResult) != len(result) {
|
if len(cutResult) != len(result) {
|
||||||
t.Fatal(result)
|
t.Fatal(result)
|
||||||
}
|
}
|
||||||
for i, _ := range result {
|
for i := range result {
|
||||||
if result[i] != cutResult[i] {
|
if result[i] != cutResult[i] {
|
||||||
t.Fatal(result[i])
|
t.Fatal(result[i])
|
||||||
}
|
}
|
||||||
@@ -404,7 +404,7 @@ func TestUserDict(t *testing.T) {
|
|||||||
if len(cutResult) != len(result) {
|
if len(cutResult) != len(result) {
|
||||||
t.Fatal(result)
|
t.Fatal(result)
|
||||||
}
|
}
|
||||||
for i, _ := range result {
|
for i := range result {
|
||||||
if result[i] != cutResult[i] {
|
if result[i] != cutResult[i] {
|
||||||
t.Fatal(result[i])
|
t.Fatal(result[i])
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,15 +1,14 @@
|
|||||||
package posseg
|
package posseg
|
||||||
|
|
||||||
const MinFloat = -3.14e100
|
const minFloat = -3.14e100
|
||||||
|
|
||||||
type runeFloatMap map[rune]float64
|
type runeFloatMap map[rune]float64
|
||||||
|
|
||||||
func (m runeFloatMap) get(key rune) float64 {
|
func (m runeFloatMap) get(key rune) float64 {
|
||||||
if value, ok := m[key]; ok {
|
if value, ok := m[key]; ok {
|
||||||
return value
|
return value
|
||||||
} else {
|
|
||||||
return MinFloat
|
|
||||||
}
|
}
|
||||||
|
return minFloat
|
||||||
}
|
}
|
||||||
|
|
||||||
var probEmit = map[uint16]runeFloatMap{
|
var probEmit = map[uint16]runeFloatMap{
|
||||||
|
|||||||
@@ -11,9 +11,8 @@ type probTransMap map[uint16]float64
|
|||||||
func (m probTransMap) Get(key uint16) float64 {
|
func (m probTransMap) Get(key uint16) float64 {
|
||||||
if value, ok := m[key]; ok {
|
if value, ok := m[key]; ok {
|
||||||
return value
|
return value
|
||||||
} else {
|
|
||||||
return inf
|
|
||||||
}
|
}
|
||||||
|
return inf
|
||||||
}
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
|||||||
@@ -31,52 +31,52 @@ func (pss probStates) Swap(i, j int) {
|
|||||||
pss[i], pss[j] = pss[j], pss[i]
|
pss[i], pss[j] = pss[j], pss[i]
|
||||||
}
|
}
|
||||||
|
|
||||||
func viterbi(obs []rune) []Tag {
|
func viterbi(obs []rune) []tag {
|
||||||
obsLength := len(obs)
|
obsLength := len(obs)
|
||||||
V := make([]map[uint16]float64, obsLength)
|
V := make([]map[uint16]float64, obsLength)
|
||||||
V[0] = make(map[uint16]float64)
|
V[0] = make(map[uint16]float64)
|
||||||
mem_path := make([]map[uint16]uint16, obsLength)
|
memPath := make([]map[uint16]uint16, obsLength)
|
||||||
mem_path[0] = make(map[uint16]uint16)
|
memPath[0] = make(map[uint16]uint16)
|
||||||
ys := charStateTab.get(obs[0]) // default is all_states
|
ys := charStateTab.get(obs[0]) // default is all_states
|
||||||
for _, y := range ys {
|
for _, y := range ys {
|
||||||
V[0][y] = probEmit[y].get(obs[0]) + probStart[y]
|
V[0][y] = probEmit[y].get(obs[0]) + probStart[y]
|
||||||
mem_path[0][y] = 0
|
memPath[0][y] = 0
|
||||||
}
|
}
|
||||||
for t := 1; t < obsLength; t++ {
|
for t := 1; t < obsLength; t++ {
|
||||||
prev_states := make([]uint16, 0)
|
var prevStates []uint16
|
||||||
for x := range mem_path[t-1] {
|
for x := range memPath[t-1] {
|
||||||
if len(probTrans[x]) > 0 {
|
if len(probTrans[x]) > 0 {
|
||||||
prev_states = append(prev_states, x)
|
prevStates = append(prevStates, x)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//use Go's map to implement Python's Set()
|
//use Go's map to implement Python's Set()
|
||||||
prev_states_expect_next := make(map[uint16]int)
|
prevStatesExpectNext := make(map[uint16]int)
|
||||||
for _, x := range prev_states {
|
for _, x := range prevStates {
|
||||||
for y := range probTrans[x] {
|
for y := range probTrans[x] {
|
||||||
prev_states_expect_next[y] = 1
|
prevStatesExpectNext[y] = 1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
tmp_obs_states := charStateTab.get(obs[t])
|
tmpObsStates := charStateTab.get(obs[t])
|
||||||
|
|
||||||
obs_states := make([]uint16, 0)
|
var obsStates []uint16
|
||||||
for index := range tmp_obs_states {
|
for index := range tmpObsStates {
|
||||||
if _, ok := prev_states_expect_next[tmp_obs_states[index]]; ok {
|
if _, ok := prevStatesExpectNext[tmpObsStates[index]]; ok {
|
||||||
obs_states = append(obs_states, tmp_obs_states[index])
|
obsStates = append(obsStates, tmpObsStates[index])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(obs_states) == 0 {
|
if len(obsStates) == 0 {
|
||||||
for key := range prev_states_expect_next {
|
for key := range prevStatesExpectNext {
|
||||||
obs_states = append(obs_states, key)
|
obsStates = append(obsStates, key)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(obs_states) == 0 {
|
if len(obsStates) == 0 {
|
||||||
obs_states = probTransKeys
|
obsStates = probTransKeys
|
||||||
}
|
}
|
||||||
mem_path[t] = make(map[uint16]uint16)
|
memPath[t] = make(map[uint16]uint16)
|
||||||
V[t] = make(map[uint16]float64)
|
V[t] = make(map[uint16]float64)
|
||||||
for _, y := range obs_states {
|
for _, y := range obsStates {
|
||||||
var max, ps probState
|
var max, ps probState
|
||||||
for i, y0 := range prev_states {
|
for i, y0 := range prevStates {
|
||||||
ps = probState{
|
ps = probState{
|
||||||
prob: V[t-1][y0] + probTrans[y0].Get(y) + probEmit[y].get(obs[t]),
|
prob: V[t-1][y0] + probTrans[y0].Get(y) + probEmit[y].get(obs[t]),
|
||||||
state: y0}
|
state: y0}
|
||||||
@@ -85,23 +85,23 @@ func viterbi(obs []rune) []Tag {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
V[t][y] = max.prob
|
V[t][y] = max.prob
|
||||||
mem_path[t][y] = max.state
|
memPath[t][y] = max.state
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
last := make(probStates, 0)
|
last := make(probStates, 0)
|
||||||
length := len(mem_path)
|
length := len(memPath)
|
||||||
vlength := len(V)
|
vlength := len(V)
|
||||||
for y := range mem_path[length-1] {
|
for y := range memPath[length-1] {
|
||||||
ps := probState{prob: V[vlength-1][y], state: y}
|
ps := probState{prob: V[vlength-1][y], state: y}
|
||||||
last = append(last, ps)
|
last = append(last, ps)
|
||||||
}
|
}
|
||||||
sort.Sort(sort.Reverse(last))
|
sort.Sort(sort.Reverse(last))
|
||||||
state := last[0].state
|
state := last[0].state
|
||||||
route := make([]Tag, len(obs))
|
route := make([]tag, len(obs))
|
||||||
|
|
||||||
for i := obsLength - 1; i >= 0; i-- {
|
for i := obsLength - 1; i >= 0; i-- {
|
||||||
route[i] = Tag(state)
|
route[i] = tag(state)
|
||||||
state = mem_path[i][state]
|
state = memPath[i][state]
|
||||||
}
|
}
|
||||||
return route
|
return route
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,49 +4,49 @@ import (
|
|||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
|
|
||||||
var defaultRoute []Tag
|
var defaultRoute []tag
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
var t Tag
|
var t tag
|
||||||
t, _ = NewTag("B", "nr")
|
t, _ = newTag("B", "nr")
|
||||||
defaultRoute = append(defaultRoute, t)
|
defaultRoute = append(defaultRoute, t)
|
||||||
t, _ = NewTag("M", "nr")
|
t, _ = newTag("M", "nr")
|
||||||
defaultRoute = append(defaultRoute, t)
|
defaultRoute = append(defaultRoute, t)
|
||||||
t, _ = NewTag("E", "nr")
|
t, _ = newTag("E", "nr")
|
||||||
defaultRoute = append(defaultRoute, t)
|
defaultRoute = append(defaultRoute, t)
|
||||||
t, _ = NewTag("S", "v")
|
t, _ = newTag("S", "v")
|
||||||
defaultRoute = append(defaultRoute, t)
|
defaultRoute = append(defaultRoute, t)
|
||||||
t, _ = NewTag("B", "v")
|
t, _ = newTag("B", "v")
|
||||||
defaultRoute = append(defaultRoute, t)
|
defaultRoute = append(defaultRoute, t)
|
||||||
t, _ = NewTag("E", "v")
|
t, _ = newTag("E", "v")
|
||||||
defaultRoute = append(defaultRoute, t)
|
defaultRoute = append(defaultRoute, t)
|
||||||
t, _ = NewTag("B", "n")
|
t, _ = newTag("B", "n")
|
||||||
defaultRoute = append(defaultRoute, t)
|
defaultRoute = append(defaultRoute, t)
|
||||||
t, _ = NewTag("M", "n")
|
t, _ = newTag("M", "n")
|
||||||
defaultRoute = append(defaultRoute, t)
|
defaultRoute = append(defaultRoute, t)
|
||||||
t, _ = NewTag("E", "n")
|
t, _ = newTag("E", "n")
|
||||||
defaultRoute = append(defaultRoute, t)
|
defaultRoute = append(defaultRoute, t)
|
||||||
t, _ = NewTag("S", "d")
|
t, _ = newTag("S", "d")
|
||||||
defaultRoute = append(defaultRoute, t)
|
defaultRoute = append(defaultRoute, t)
|
||||||
t, _ = NewTag("S", "v")
|
t, _ = newTag("S", "v")
|
||||||
defaultRoute = append(defaultRoute, t)
|
defaultRoute = append(defaultRoute, t)
|
||||||
t, _ = NewTag("S", "n")
|
t, _ = newTag("S", "n")
|
||||||
defaultRoute = append(defaultRoute, t)
|
defaultRoute = append(defaultRoute, t)
|
||||||
t, _ = NewTag("B", "v")
|
t, _ = newTag("B", "v")
|
||||||
defaultRoute = append(defaultRoute, t)
|
defaultRoute = append(defaultRoute, t)
|
||||||
t, _ = NewTag("E", "v")
|
t, _ = newTag("E", "v")
|
||||||
defaultRoute = append(defaultRoute, t)
|
defaultRoute = append(defaultRoute, t)
|
||||||
t, _ = NewTag("B", "nr")
|
t, _ = newTag("B", "nr")
|
||||||
defaultRoute = append(defaultRoute, t)
|
defaultRoute = append(defaultRoute, t)
|
||||||
t, _ = NewTag("M", "nr")
|
t, _ = newTag("M", "nr")
|
||||||
defaultRoute = append(defaultRoute, t)
|
defaultRoute = append(defaultRoute, t)
|
||||||
t, _ = NewTag("M", "nr")
|
t, _ = newTag("M", "nr")
|
||||||
defaultRoute = append(defaultRoute, t)
|
defaultRoute = append(defaultRoute, t)
|
||||||
t, _ = NewTag("M", "nr")
|
t, _ = newTag("M", "nr")
|
||||||
defaultRoute = append(defaultRoute, t)
|
defaultRoute = append(defaultRoute, t)
|
||||||
t, _ = NewTag("E", "nr")
|
t, _ = newTag("E", "nr")
|
||||||
defaultRoute = append(defaultRoute, t)
|
defaultRoute = append(defaultRoute, t)
|
||||||
t, _ = NewTag("S", "zg")
|
t, _ = newTag("S", "zg")
|
||||||
defaultRoute = append(defaultRoute, t)
|
defaultRoute = append(defaultRoute, t)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
46
tokenizer.go
46
tokenizer.go
@@ -9,18 +9,40 @@ import (
|
|||||||
"github.com/blevesearch/bleve/registry"
|
"github.com/blevesearch/bleve/registry"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Name is the jieba tokenizer name.
|
||||||
const Name = "jieba"
|
const Name = "jieba"
|
||||||
|
|
||||||
var IdeographRegexp = regexp.MustCompile(`\p{Han}+`)
|
var ideographRegexp = regexp.MustCompile(`\p{Han}+`)
|
||||||
|
|
||||||
|
// JiebaTokenizer is the bleve tokenizer for jiebago.
|
||||||
type JiebaTokenizer struct {
|
type JiebaTokenizer struct {
|
||||||
seg Segmenter
|
seg Segmenter
|
||||||
hmm, searchMode bool
|
hmm, searchMode bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
/*
|
||||||
|
NewJiebaTokenizer creates a new JiebaTokenizer.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
|
||||||
|
dictFilePath: path of the dictionary file.
|
||||||
|
|
||||||
|
hmm: whether to use Hidden Markov Model to cut unknown words,
|
||||||
|
i.e. not found in dictionary. For example word "安卓" (means "Android" in
|
||||||
|
English) not in the dictionary file. If hmm is set to false, it will be
|
||||||
|
cut into two single words "安" and "卓", if hmm is set to true, it will
|
||||||
|
be treated as one single word because Jieba uses the Hidden Markov Model with
|
||||||
|
Viterbi algorithm to guess the best possibility.
|
||||||
|
|
||||||
|
searchMode: whether to further cut long words into several short words.
|
||||||
|
In Chinese, some long words may contain other words, for example "交换机"
|
||||||
|
is a Chinese word for "Switcher", if searchMode is false, it will treat
|
||||||
|
"交换机" as a single word. If searchMode is true, it will further split
|
||||||
|
this word into "交换", "换机", which are valid Chinese words.
|
||||||
|
*/
|
||||||
|
func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||||
var seg Segmenter
|
var seg Segmenter
|
||||||
err := seg.LoadDictionary(dictFileName)
|
err := seg.LoadDictionary(dictFilePath)
|
||||||
return &JiebaTokenizer{
|
return &JiebaTokenizer{
|
||||||
seg: seg,
|
seg: seg,
|
||||||
hmm: hmm,
|
hmm: hmm,
|
||||||
@@ -28,6 +50,7 @@ func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Toke
|
|||||||
}, err
|
}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Tokenize cuts input into bleve token stream.
|
||||||
func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||||
rv := make(analysis.TokenStream, 0)
|
rv := make(analysis.TokenStream, 0)
|
||||||
runeStart := 0
|
runeStart := 0
|
||||||
@@ -77,9 +100,20 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
|||||||
return rv
|
return rv
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
JiebaTokenizerConstructor creates a JiebaTokenizer.
|
||||||
|
|
||||||
|
Parameter config should contain at least one parameter:
|
||||||
|
|
||||||
|
file: the path of the dictionary file.
|
||||||
|
|
||||||
|
hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
|
||||||
|
|
||||||
|
search: optional, specify whether to use search mode, see NewJiebaTokenizer for details.
|
||||||
|
*/
|
||||||
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
|
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
|
||||||
analysis.Tokenizer, error) {
|
analysis.Tokenizer, error) {
|
||||||
dictFileName, ok := config["file"].(string)
|
dictFilePath, ok := config["file"].(string)
|
||||||
if !ok {
|
if !ok {
|
||||||
return nil, fmt.Errorf("must specify dictionary file path")
|
return nil, fmt.Errorf("must specify dictionary file path")
|
||||||
}
|
}
|
||||||
@@ -92,11 +126,11 @@ func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Ca
|
|||||||
searchMode = true
|
searchMode = true
|
||||||
}
|
}
|
||||||
|
|
||||||
return NewJiebaTokenizer(dictFileName, hmm, searchMode)
|
return NewJiebaTokenizer(dictFilePath, hmm, searchMode)
|
||||||
}
|
}
|
||||||
|
|
||||||
func detectTokenType(term string) analysis.TokenType {
|
func detectTokenType(term string) analysis.TokenType {
|
||||||
if IdeographRegexp.MatchString(term) {
|
if ideographRegexp.MatchString(term) {
|
||||||
return analysis.Ideographic
|
return analysis.Ideographic
|
||||||
}
|
}
|
||||||
_, err := strconv.ParseFloat(term, 64)
|
_, err := strconv.ParseFloat(term, 64)
|
||||||
|
|||||||
14
util/util.go
14
util/util.go
@@ -2,12 +2,14 @@ package util
|
|||||||
|
|
||||||
import "regexp"
|
import "regexp"
|
||||||
|
|
||||||
// RegexpSplit split slices s into substrings separated by the expression and
|
/*
|
||||||
// returns a slice of the substrings between those expression matches.
|
RegexpSplit slices s into substrings separated by the expression and
|
||||||
// If capturing parentheses are used in expression, then the text of all groups
|
returns a slice of the substrings between those expression matches.
|
||||||
// in the expression are also returned as part of the resulting slice.
|
If capturing parentheses are used in expression, then the text of all groups
|
||||||
//
|
in the expression are also returned as part of the resulting slice.
|
||||||
// This function acts consistent with Python's re.split function.
|
|
||||||
|
This function acts consistent with Python's re.split function.
|
||||||
|
*/
|
||||||
func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
|
func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
|
||||||
if n == 0 {
|
if n == 0 {
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
Reference in New Issue
Block a user