1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-26 06:50:23 +08:00

code refactor, added more documents

This commit is contained in:
Wang Bin
2015-05-06 12:55:04 +08:00
parent 87caff09cb
commit 122bad0a8d
23 changed files with 228 additions and 142 deletions

View File

@@ -6,7 +6,7 @@ import (
) )
var ( var (
test_contents = []string{ testContents = []string{
"这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。", "这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。",
"我不喜欢日本和服。", "我不喜欢日本和服。",
"雷猴回归人间。", "雷猴回归人间。",
@@ -259,7 +259,7 @@ func TestExtractTags(t *testing.T) {
te.LoadDictionary("../dict.txt") te.LoadDictionary("../dict.txt")
te.LoadIdf("idf.txt") te.LoadIdf("idf.txt")
for index, sentence := range test_contents { for index, sentence := range testContents {
result := te.ExtractTags(sentence, 20) result := te.ExtractTags(sentence, 20)
if len(result) != len(Tags[index]) { if len(result) != len(Tags[index]) {
t.Fatalf("%s = %v", sentence, result) t.Fatalf("%s = %v", sentence, result)

View File

@@ -7,6 +7,8 @@ import (
"github.com/wangbin/jiebago/dictionary" "github.com/wangbin/jiebago/dictionary"
) )
// Idf represents a thread-safe dictionary for all words with their
// IDFs (Inverse Document Frequency).
type Idf struct { type Idf struct {
freqMap map[string]float64 freqMap map[string]float64
median float64 median float64
@@ -14,6 +16,7 @@ type Idf struct {
sync.RWMutex sync.RWMutex
} }
// AddToken adds a new word with IDF into its dictionary.
func (i *Idf) AddToken(token dictionary.Token) { func (i *Idf) AddToken(token dictionary.Token) {
i.Lock() i.Lock()
i.freqMap[token.Text()] = token.Frequency() i.freqMap[token.Text()] = token.Frequency()
@@ -23,6 +26,7 @@ func (i *Idf) AddToken(token dictionary.Token) {
i.Unlock() i.Unlock()
} }
// Load loads all tokens from channel into its dictionary.
func (i *Idf) Load(ch <-chan dictionary.Token) { func (i *Idf) Load(ch <-chan dictionary.Token) {
i.Lock() i.Lock()
for token := range ch { for token := range ch {
@@ -38,6 +42,7 @@ func (i *Idf) loadDictionary(fileName string) error {
return dictionary.LoadDictionary(i, fileName) return dictionary.LoadDictionary(i, fileName)
} }
// Frequency returns the IDF of given word.
func (i *Idf) Frequency(key string) (float64, bool) { func (i *Idf) Frequency(key string) (float64, bool) {
i.RLock() i.RLock()
freq, ok := i.freqMap[key] freq, ok := i.freqMap[key]
@@ -45,6 +50,7 @@ func (i *Idf) Frequency(key string) (float64, bool) {
return freq, ok return freq, ok
} }
// NewIdf creates a new Idf instance.
func NewIdf() *Idf { func NewIdf() *Idf {
return &Idf{freqMap: make(map[string]float64), freqs: make([]float64, 0)} return &Idf{freqMap: make(map[string]float64), freqs: make([]float64, 0)}
} }

View File

@@ -6,6 +6,7 @@ import (
"github.com/wangbin/jiebago/dictionary" "github.com/wangbin/jiebago/dictionary"
) )
// DefaultStopWordMap contains some stop words.
var DefaultStopWordMap = map[string]int{ var DefaultStopWordMap = map[string]int{
"the": 1, "the": 1,
"of": 1, "of": 1,
@@ -40,23 +41,27 @@ var DefaultStopWordMap = map[string]int{
"or": 1, "or": 1,
} }
// StopWord is a thread-safe dictionary for all stop words.
type StopWord struct { type StopWord struct {
stopWordMap map[string]int stopWordMap map[string]int
sync.RWMutex sync.RWMutex
} }
// AddToken adds a token into StopWord dictionary.
func (s *StopWord) AddToken(token dictionary.Token) { func (s *StopWord) AddToken(token dictionary.Token) {
s.Lock() s.Lock()
s.stopWordMap[token.Text()] = 1 s.stopWordMap[token.Text()] = 1
s.Unlock() s.Unlock()
} }
// NewStopWord creates a new StopWord with default stop words.
func NewStopWord() *StopWord { func NewStopWord() *StopWord {
s := new(StopWord) s := new(StopWord)
s.stopWordMap = DefaultStopWordMap s.stopWordMap = DefaultStopWordMap
return s return s
} }
// IsStopWord checks if a given word is a stop word.
func (s *StopWord) IsStopWord(word string) bool { func (s *StopWord) IsStopWord(word string) bool {
s.RLock() s.RLock()
_, ok := s.stopWordMap[word] _, ok := s.stopWordMap[word]
@@ -64,6 +69,7 @@ func (s *StopWord) IsStopWord(word string) bool {
return ok return ok
} }
// Load loads all tokens from given channel into StopWord dictionary.
func (s *StopWord) Load(ch <-chan dictionary.Token) { func (s *StopWord) Load(ch <-chan dictionary.Token) {
s.Lock() s.Lock()
for token := range ch { for token := range ch {

View File

@@ -1,7 +1,6 @@
package analyse package analyse
import ( import (
"fmt"
"sort" "sort"
"strings" "strings"
"unicode/utf8" "unicode/utf8"
@@ -9,23 +8,23 @@ import (
"github.com/wangbin/jiebago" "github.com/wangbin/jiebago"
) )
// Segment represents a word with weight.
type Segment struct { type Segment struct {
text string text string
weight float64 weight float64
} }
// Text returns the segment's text.
func (s Segment) Text() string { func (s Segment) Text() string {
return s.text return s.text
} }
// Weight returns the segment's weight.
func (s Segment) Weight() float64 { func (s Segment) Weight() float64 {
return s.weight return s.weight
} }
func (s Segment) String() string { // Segments represents a slice of Segment.
return fmt.Sprintf("{%s: %f}", s.text, s.weight)
}
type Segments []Segment type Segments []Segment
func (ss Segments) Len() int { func (ss Segments) Len() int {
@@ -44,29 +43,33 @@ func (ss Segments) Swap(i, j int) {
ss[i], ss[j] = ss[j], ss[i] ss[i], ss[j] = ss[j], ss[i]
} }
// TagExtracter is used to extract tags from sentence.
type TagExtracter struct { type TagExtracter struct {
seg *jiebago.Segmenter seg *jiebago.Segmenter
idf *Idf idf *Idf
stopWord *StopWord stopWord *StopWord
} }
// LoadDictionary reads the given file and creates a new dictionary.
func (t *TagExtracter) LoadDictionary(fileName string) error { func (t *TagExtracter) LoadDictionary(fileName string) error {
t.stopWord = NewStopWord() t.stopWord = NewStopWord()
t.seg = new(jiebago.Segmenter) t.seg = new(jiebago.Segmenter)
return t.seg.LoadDictionary(fileName) return t.seg.LoadDictionary(fileName)
} }
// LoadIdf reads the given file and creates a new Idf dictionary.
func (t *TagExtracter) LoadIdf(fileName string) error { func (t *TagExtracter) LoadIdf(fileName string) error {
t.idf = NewIdf() t.idf = NewIdf()
return t.idf.loadDictionary(fileName) return t.idf.loadDictionary(fileName)
} }
// LoadStopWords reads the given file and creates a new StopWord dictionary.
func (t *TagExtracter) LoadStopWords(fileName string) error { func (t *TagExtracter) LoadStopWords(fileName string) error {
t.stopWord = NewStopWord() t.stopWord = NewStopWord()
return t.stopWord.loadDictionary(fileName) return t.stopWord.loadDictionary(fileName)
} }
// Keyword extraction. // ExtractTags extracts the topK key words from sentence.
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) { func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
freqMap := make(map[string]float64) freqMap := make(map[string]float64)

View File

@@ -1,7 +1,6 @@
package analyse package analyse
import ( import (
"fmt"
"math" "math"
"sort" "sort"
@@ -20,10 +19,6 @@ type edge struct {
weight float64 weight float64
} }
func (e edge) String() string {
return fmt.Sprintf("(%s %s): %f", e.start, e.end, e.weight)
}
type edges []edge type edges []edge
func (es edges) Len() int { func (es edges) Len() int {
@@ -114,8 +109,8 @@ func (u *undirectWeightedGraph) rank() Segments {
return result return result
} }
// Extract keywords from sentence using TextRank algorithm. the allowed POS list // TextRankWithPOS extracts keywords from sentence using TextRank algorithm.
// could be manually speificed. // Parameter allowPOS allows a customized pos list.
func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) Segments { func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) Segments {
posFilt := make(map[string]int) posFilt := make(map[string]int)
for _, pos := range allowPOS { for _, pos := range allowPOS {
@@ -124,7 +119,7 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin
g := newUndirectWeightedGraph() g := newUndirectWeightedGraph()
cm := make(map[[2]string]float64) cm := make(map[[2]string]float64)
span := 5 span := 5
pairs := make([]posseg.Segment, 0) var pairs []posseg.Segment
for pair := range t.seg.Cut(sentence, true) { for pair := range t.seg.Cut(sentence, true) {
pairs = append(pairs, pair) pairs = append(pairs, pair)
} }
@@ -152,16 +147,18 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin
return tags return tags
} }
// Extract keywords from sentence using TextRank algorithm. // TextRank extract keywords from sentence using TextRank algorithm.
// topK specify how many top keywords to be returned at most. // Parameter topK specify how many top keywords to be returned at most.
func (t *TextRanker) TextRank(sentence string, topK int) Segments { func (t *TextRanker) TextRank(sentence string, topK int) Segments {
return t.TextRankWithPOS(sentence, topK, defaultAllowPOS) return t.TextRankWithPOS(sentence, topK, defaultAllowPOS)
} }
// TextRanker is used to extract tags from sentence.
type TextRanker struct { type TextRanker struct {
seg *posseg.Segmenter seg *posseg.Segmenter
} }
// LoadDictionary reads a given file and creates a new dictionary for TextRanker.
func (t *TextRanker) LoadDictionary(fileName string) error { func (t *TextRanker) LoadDictionary(fileName string) error {
t.seg = new(posseg.Segmenter) t.seg = new(posseg.Segmenter)
return t.seg.LoadDictionary(fileName) return t.seg.LoadDictionary(fileName)

View File

@@ -14,7 +14,7 @@ type Dictionary struct {
sync.RWMutex sync.RWMutex
} }
// Load loads all tokens from channel // Load loads all tokens from given channel
func (d *Dictionary) Load(ch <-chan dictionary.Token) { func (d *Dictionary) Load(ch <-chan dictionary.Token) {
d.Lock() d.Lock()
for token := range ch { for token := range ch {
@@ -49,7 +49,7 @@ func (d *Dictionary) updateLogTotal() {
d.logTotal = math.Log(d.total) d.logTotal = math.Log(d.total)
} }
// Frequency returns the frequency of give word, if not found, the second result is false // Frequency returns the frequency and existence of give word
func (d *Dictionary) Frequency(key string) (float64, bool) { func (d *Dictionary) Frequency(key string) (float64, bool) {
d.RLock() d.RLock()
freq, ok := d.freqMap[key] freq, ok := d.freqMap[key]

View File

@@ -8,6 +8,8 @@ import (
"strings" "strings"
) )
// DictLoader represents an interface that could add one token or load a bunch
// of tokens from a channel.
type DictLoader interface { type DictLoader interface {
Load(<-chan Token) Load(<-chan Token)
AddToken(Token) AddToken(Token)
@@ -49,6 +51,7 @@ func loadDictionary(file *os.File) (<-chan Token, <-chan error) {
} }
// LoadDictionary reads the given file and passes all tokens to a DictLoader.
func LoadDictionary(dl DictLoader, fileName string) error { func LoadDictionary(dl DictLoader, fileName string) error {
filePath, err := dictPath(fileName) filePath, err := dictPath(fileName)
if err != nil { if err != nil {

View File

@@ -1,23 +1,28 @@
package dictionary package dictionary
// Token represents a Chinese word with (optional) frequency and POS.
type Token struct { type Token struct {
text string text string
frequency float64 frequency float64
pos string pos string
} }
// Text returns token's text.
func (t Token) Text() string { func (t Token) Text() string {
return t.text return t.text
} }
// Frequency returns token's frequency.
func (t Token) Frequency() float64 { func (t Token) Frequency() float64 {
return t.frequency return t.frequency
} }
// Pos returns token's POS.
func (t Token) Pos() string { func (t Token) Pos() string {
return t.pos return t.pos
} }
// NewToken creates a new token.
func NewToken(text string, frequency float64, pos string) Token { func NewToken(text string, frequency float64, pos string) Token {
return Token{text: text, frequency: frequency, pos: pos} return Token{text: text, frequency: frequency, pos: pos}
} }

View File

@@ -13,10 +13,10 @@ func cutHan(sentence string) chan string {
result := make(chan string) result := make(chan string)
go func() { go func() {
runes := []rune(sentence) runes := []rune(sentence)
_, pos_list := viterbi(runes, []byte{'B', 'M', 'E', 'S'}) _, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
begin, next := 0, 0 begin, next := 0, 0
for i, char := range runes { for i, char := range runes {
pos := pos_list[i] pos := posList[i]
switch pos { switch pos {
case 'B': case 'B':
begin = i begin = i
@@ -36,6 +36,8 @@ func cutHan(sentence string) chan string {
return result return result
} }
// Cut cuts sentence into words using Hidden Markov Model with Viterbi
// algorithm. It is used by Jiebago for unknown words.
func Cut(sentence string) chan string { func Cut(sentence string) chan string {
result := make(chan string) result := make(chan string)
s := sentence s := sentence

View File

@@ -6,7 +6,7 @@ import (
) )
func chanToArray(ch chan string) []string { func chanToArray(ch chan string) []string {
result := make([]string, 0) var result []string
for word := range ch { for word := range ch {
result = append(result, word) result = append(result, word)
} }

View File

@@ -67,11 +67,11 @@ func viterbi(obs []rune, states []byte) (float64, []byte) {
V[t] = make(map[byte]float64) V[t] = make(map[byte]float64)
for _, y := range states { for _, y := range states {
ps0 := make(probStates, 0) ps0 := make(probStates, 0)
var em_p float64 var emP float64
if val, ok := probEmit[y][obs[t]]; ok { if val, ok := probEmit[y][obs[t]]; ok {
em_p = val emP = val
} else { } else {
em_p = minFloat emP = minFloat
} }
for _, y0 := range prevStatus[y] { for _, y0 := range prevStatus[y] {
var transP float64 var transP float64
@@ -80,7 +80,7 @@ func viterbi(obs []rune, states []byte) (float64, []byte) {
} else { } else {
transP = minFloat transP = minFloat
} }
prob0 := V[t-1][y0] + transP + em_p prob0 := V[t-1][y0] + transP + emP
ps0 = append(ps0, &probState{prob: prob0, state: y0}) ps0 = append(ps0, &probState{prob: prob0, state: y0})
} }
sort.Sort(sort.Reverse(ps0)) sort.Sort(sort.Reverse(ps0))

View File

@@ -16,15 +16,21 @@ var (
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`) reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
) )
// Segmenter is a Chinese words segmentation struct.
type Segmenter struct { type Segmenter struct {
dict *Dictionary dict *Dictionary
} }
// LoadDictionary loads dictionary from given file name. Every time
// LoadDictionary is called, the previously loaded dictionary will be cleared.
func (seg *Segmenter) LoadDictionary(fileName string) error { func (seg *Segmenter) LoadDictionary(fileName string) error {
seg.dict = &Dictionary{freqMap: make(map[string]float64)} seg.dict = &Dictionary{freqMap: make(map[string]float64)}
return seg.dict.loadDictionary(fileName) return seg.dict.loadDictionary(fileName)
} }
// LoadUserDictionary loads a user specified dictionary. It must be called
// after LoadDictionary; it will not clear any previously loaded dictionary,
// instead it will override existing entries.
func (seg *Segmenter) LoadUserDictionary(fileName string) error { func (seg *Segmenter) LoadUserDictionary(fileName string) error {
return seg.dict.loadDictionary(fileName) return seg.dict.loadDictionary(fileName)
} }
@@ -46,7 +52,7 @@ func (seg *Segmenter) dag(runes []rune) map[int][]int {
if freq > 0.0 { if freq > 0.0 {
dag[k] = append(dag[k], i) dag[k] = append(dag[k], i)
} }
i += 1 i++
if i >= n { if i >= n {
break break
} }
@@ -98,7 +104,7 @@ func (seg *Segmenter) cutDAG(sentence string) <-chan string {
routes := seg.calc(runes) routes := seg.calc(runes)
var y int var y int
length := len(runes) length := len(runes)
buf := make([]rune, 0) var buf []rune
for x := 0; x < length; { for x := 0; x < length; {
y = routes[x].index + 1 y = routes[x].index + 1
frag := runes[x:y] frag := runes[x:y]
@@ -156,7 +162,7 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string {
routes := seg.calc(runes) routes := seg.calc(runes)
var y int var y int
length := len(runes) length := len(runes)
buf := make([]rune, 0) var buf []rune
for x := 0; x < length; { for x := 0; x < length; {
y = routes[x].index + 1 y = routes[x].index + 1
frag := runes[x:y] frag := runes[x:y]
@@ -181,6 +187,10 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string {
return result return result
} }
// Cut cuts a sentence into words using accurate mode.
// Parameter hmm controls whether to use the Hidden Markov Model.
// Accurate mode attempts to cut the sentence into the most accurate
// segmentations, which is suitable for text analysis.
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan string { func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan string {
result := make(chan string) result := make(chan string)
var cut cutFunc var cut cutFunc
@@ -246,6 +256,9 @@ func (seg *Segmenter) cutAll(sentence string) <-chan string {
return result return result
} }
// CutAll cuts a sentence into words using full mode.
// Full mode gets all the possible words from the sentence.
// Fast but not accurate.
func (seg *Segmenter) CutAll(sentence string) <-chan string { func (seg *Segmenter) CutAll(sentence string) <-chan string {
result := make(chan string) result := make(chan string)
go func() { go func() {
@@ -268,6 +281,10 @@ func (seg *Segmenter) CutAll(sentence string) <-chan string {
return result return result
} }
// CutForSearch cuts sentence into words using search engine mode.
// Search engine mode, based on the accurate mode, attempts to cut long words
// into several short words, which can raise the recall rate.
// Suitable for search engines.
func (seg *Segmenter) CutForSearch(sentence string, hmm bool) <-chan string { func (seg *Segmenter) CutForSearch(sentence string, hmm bool) <-chan string {
result := make(chan string) result := make(chan string)
go func() { go func() {

View File

@@ -3,8 +3,8 @@ package jiebago
import "testing" import "testing"
var ( var (
seg Segmenter seg Segmenter
test_contents = []string{ testContents = []string{
"这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。", "这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。",
"我不喜欢日本和服。", "我不喜欢日本和服。",
"雷猴回归人间。", "雷猴回归人间。",
@@ -620,7 +620,7 @@ func init() {
} }
func chanToArray(ch <-chan string) []string { func chanToArray(ch <-chan string) []string {
result := make([]string, 0) var result []string
for word := range ch { for word := range ch {
result = append(result, word) result = append(result, word)
} }
@@ -643,7 +643,7 @@ func TestCutDAGNoHmm(t *testing.T) {
func TestDefaultCut(t *testing.T) { func TestDefaultCut(t *testing.T) {
var result []string var result []string
for index, content := range test_contents { for index, content := range testContents {
result = chanToArray(seg.Cut(content, true)) result = chanToArray(seg.Cut(content, true))
if len(result) != len(defaultCutResult[index]) { if len(result) != len(defaultCutResult[index]) {
t.Errorf("default cut for %s length should be %d not %d\n", t.Errorf("default cut for %s length should be %d not %d\n",
@@ -661,7 +661,7 @@ func TestDefaultCut(t *testing.T) {
func TestCutAll(t *testing.T) { func TestCutAll(t *testing.T) {
var result []string var result []string
for index, content := range test_contents { for index, content := range testContents {
result = chanToArray(seg.CutAll(content)) result = chanToArray(seg.CutAll(content))
if len(result) != len(cutAllResult[index]) { if len(result) != len(cutAllResult[index]) {
t.Errorf("cut all for %s length should be %d not %d\n", t.Errorf("cut all for %s length should be %d not %d\n",
@@ -679,7 +679,7 @@ func TestCutAll(t *testing.T) {
func TestDefaultCutNoHMM(t *testing.T) { func TestDefaultCutNoHMM(t *testing.T) {
var result []string var result []string
for index, content := range test_contents { for index, content := range testContents {
result = chanToArray(seg.Cut(content, false)) result = chanToArray(seg.Cut(content, false))
if len(result) != len(defaultCutNoHMMResult[index]) { if len(result) != len(defaultCutNoHMMResult[index]) {
t.Fatalf("default cut no hmm for %s length should be %d not %d\n", t.Fatalf("default cut no hmm for %s length should be %d not %d\n",
@@ -695,7 +695,7 @@ func TestDefaultCutNoHMM(t *testing.T) {
func TestCutForSearch(t *testing.T) { func TestCutForSearch(t *testing.T) {
var result []string var result []string
for index, content := range test_contents { for index, content := range testContents {
result = chanToArray(seg.CutForSearch(content, true)) result = chanToArray(seg.CutForSearch(content, true))
if len(result) != len(cutForSearchResult[index]) { if len(result) != len(cutForSearchResult[index]) {
t.Fatalf("cut for search for %s length should be %d not %d\n", t.Fatalf("cut for search for %s length should be %d not %d\n",
@@ -707,7 +707,7 @@ func TestCutForSearch(t *testing.T) {
} }
} }
} }
for index, content := range test_contents { for index, content := range testContents {
result = chanToArray(seg.CutForSearch(content, false)) result = chanToArray(seg.CutForSearch(content, false))
if len(result) != len(cutForSearchNoHMMResult[index]) { if len(result) != len(cutForSearchNoHMMResult[index]) {
t.Fatalf("cut for search no hmm for %s length should be %d not %d\n", t.Fatalf("cut for search no hmm for %s length should be %d not %d\n",
@@ -724,7 +724,7 @@ func TestCutForSearch(t *testing.T) {
func TestLoadDictionary(t *testing.T) { func TestLoadDictionary(t *testing.T) {
var result []string var result []string
seg.LoadDictionary("foobar.txt") seg.LoadDictionary("foobar.txt")
for index, content := range test_contents { for index, content := range testContents {
result = chanToArray(seg.Cut(content, true)) result = chanToArray(seg.Cut(content, true))
if len(result) != len(userDictCutResult[index]) { if len(result) != len(userDictCutResult[index]) {
t.Fatalf("default cut with user dictionary for %s length should be %d not %d\n", t.Fatalf("default cut with user dictionary for %s length should be %d not %d\n",

View File

@@ -2,9 +2,9 @@ package posseg
import "fmt" import "fmt"
type Tag uint16 type tag uint16
func (t Tag) Tag() string { func (t tag) position() string {
switch t / 100 { switch t / 100 {
case 4: case 4:
return "S" return "S"
@@ -19,31 +19,29 @@ func (t Tag) Tag() string {
} }
} }
func (t Tag) POS() string { func (t tag) pos() string {
return poss[t%100] return poss[t%100]
} }
func (t Tag) String() string { func newTag(position, pos string) (tag, error) {
return fmt.Sprintf("(%s, %s)", t.Tag(), t.POS()) positionIndex := -1
}
func NewTag(tag, pos string) (Tag, error) {
tagIndex := -1
posIndex := -1 posIndex := -1
for i, t := range tags { for i, p := range positions {
if tag == t { if position == p {
tagIndex = (i + 1) * 100 positionIndex = (i + 1) * 100
break
} }
} }
for i, p := range poss { for i, p := range poss {
if pos == p { if pos == p {
posIndex = i posIndex = i
break
} }
} }
if tagIndex < 0 || posIndex < 0 { if positionIndex < 0 || posIndex < 0 {
return 0, fmt.Errorf("Failed to convert %s %s to Tag", tag, pos) return 0, fmt.Errorf("Failed to convert %s %s to Tag", position, pos)
} }
return Tag(tagIndex + posIndex), nil return tag(positionIndex + posIndex), nil
} }
type charStateTabMap map[rune][]uint16 type charStateTabMap map[rune][]uint16
@@ -51,9 +49,8 @@ type charStateTabMap map[rune][]uint16
func (m charStateTabMap) get(key rune) []uint16 { func (m charStateTabMap) get(key rune) []uint16 {
if value, ok := m[key]; ok { if value, ok := m[key]; ok {
return value return value
} else {
return probTransKeys
} }
return probTransKeys
} }
var ( var (
@@ -6708,6 +6705,6 @@ var (
'\u9fa0': []uint16{413}, '\u9fa0': []uint16{413},
} }
tags = []string{"B", "E", "M", "S"} positions = []string{"B", "E", "M", "S"}
poss = []string{"a", "ad", "ag", "an", "b", "bg", "c", "d", "df", "dg", "e", "en", "f", "g", "h", "i", "in", "j", "jn", "k", "l", "ln", "m", "mg", "mq", "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "o", "p", "q", "qe", "qg", "r", "rg", "rr", "rz", "s", "t", "tg", "u", "ud", "ug", "uj", "ul", "uv", "uz", "v", "vd", "vg", "vi", "vn", "vq", "w", "x", "y", "yg", "z", "zg"} poss = []string{"a", "ad", "ag", "an", "b", "bg", "c", "d", "df", "dg", "e", "en", "f", "g", "h", "i", "in", "j", "jn", "k", "l", "ln", "m", "mg", "mq", "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "o", "p", "q", "qe", "qg", "r", "rg", "rr", "rz", "s", "t", "tg", "u", "ud", "ug", "uj", "ul", "uv", "uz", "v", "vd", "vg", "vi", "vn", "vq", "w", "x", "y", "yg", "z", "zg"}
) )

View File

@@ -7,6 +7,7 @@ import (
"github.com/wangbin/jiebago/dictionary" "github.com/wangbin/jiebago/dictionary"
) )
// A Dictionary represents a thread-safe dictionary used for word segmentation.
type Dictionary struct { type Dictionary struct {
total, logTotal float64 total, logTotal float64
freqMap map[string]float64 freqMap map[string]float64
@@ -14,6 +15,7 @@ type Dictionary struct {
sync.RWMutex sync.RWMutex
} }
// Load loads all tokens from given channel
func (d *Dictionary) Load(ch <-chan dictionary.Token) { func (d *Dictionary) Load(ch <-chan dictionary.Token) {
d.Lock() d.Lock()
for token := range ch { for token := range ch {
@@ -23,6 +25,7 @@ func (d *Dictionary) Load(ch <-chan dictionary.Token) {
d.updateLogTotal() d.updateLogTotal()
} }
// AddToken adds one token
func (d *Dictionary) AddToken(token dictionary.Token) { func (d *Dictionary) AddToken(token dictionary.Token) {
d.Lock() d.Lock()
d.addToken(token) d.addToken(token)
@@ -50,6 +53,7 @@ func (d *Dictionary) updateLogTotal() {
d.logTotal = math.Log(d.total) d.logTotal = math.Log(d.total)
} }
// Frequency returns the frequency and existence of the given word.
func (d *Dictionary) Frequency(key string) (float64, bool) { func (d *Dictionary) Frequency(key string) (float64, bool) {
d.RLock() d.RLock()
freq, ok := d.freqMap[key] freq, ok := d.freqMap[key]
@@ -57,6 +61,7 @@ func (d *Dictionary) Frequency(key string) (float64, bool) {
return freq, ok return freq, ok
} }
// Pos returns the POS and existence of the given word.
func (d *Dictionary) Pos(key string) (string, bool) { func (d *Dictionary) Pos(key string) (string, bool) {
d.RLock() d.RLock()
pos, ok := d.posMap[key] pos, ok := d.posMap[key]

View File

@@ -17,27 +17,36 @@ var (
reSkipInternal = regexp.MustCompile(`(\r\n|\s)`) reSkipInternal = regexp.MustCompile(`(\r\n|\s)`)
) )
// Segment represents a word with its POS.
type Segment struct { type Segment struct {
text, pos string text, pos string
} }
// Text returns the Segment's text.
func (s Segment) Text() string { func (s Segment) Text() string {
return s.text return s.text
} }
// Pos returns the Segment's POS.
func (s Segment) Pos() string { func (s Segment) Pos() string {
return s.pos return s.pos
} }
// Segmenter is a Chinese words segmentation struct.
type Segmenter struct { type Segmenter struct {
dict *Dictionary dict *Dictionary
} }
// LoadDictionary loads dictionary from given file name.
// Every time LoadDictionary is called, the previously loaded dictionary will be cleared.
func (seg *Segmenter) LoadDictionary(fileName string) error { func (seg *Segmenter) LoadDictionary(fileName string) error {
seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)} seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
return seg.dict.loadDictionary(fileName) return seg.dict.loadDictionary(fileName)
} }
// LoadUserDictionary loads a user specified dictionary. It must be called
// after LoadDictionary; it will not clear any previously loaded dictionary,
// instead it will override existing entries.
func (seg *Segmenter) LoadUserDictionary(fileName string) error { func (seg *Segmenter) LoadUserDictionary(fileName string) error {
return seg.dict.loadDictionary(fileName) return seg.dict.loadDictionary(fileName)
} }
@@ -52,19 +61,19 @@ func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment {
next := 0 next := 0
for i, char := range runes { for i, char := range runes {
pos := posList[i] pos := posList[i]
switch pos.Tag() { switch pos.position() {
case "B": case "B":
begin = i begin = i
case "E": case "E":
result <- Segment{string(runes[begin : i+1]), pos.POS()} result <- Segment{string(runes[begin : i+1]), pos.pos()}
next = i + 1 next = i + 1
case "S": case "S":
result <- Segment{string(char), pos.POS()} result <- Segment{string(char), pos.pos()}
next = i + 1 next = i + 1
} }
} }
if next < len(runes) { if next < len(runes) {
result <- Segment{string(runes[next:]), posList[next].POS()} result <- Segment{string(runes[next:]), posList[next].pos()}
} }
close(result) close(result)
}() }()
@@ -117,7 +126,7 @@ func (seg *Segmenter) dag(runes []rune) map[int][]int {
if freq > 0.0 { if freq > 0.0 {
dag[k] = append(dag[k], i) dag[k] = append(dag[k], i)
} }
i += 1 i++
if i >= n { if i >= n {
break break
} }
@@ -170,7 +179,7 @@ func (seg *Segmenter) cutDAG(sentence string) <-chan Segment {
routes := seg.calc(runes) routes := seg.calc(runes)
var y int var y int
length := len(runes) length := len(runes)
buf := make([]rune, 0) var buf []rune
for x := 0; x < length; { for x := 0; x < length; {
y = routes[x].index + 1 y = routes[x].index + 1
frag := runes[x:y] frag := runes[x:y]
@@ -253,7 +262,7 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment {
routes := seg.calc(runes) routes := seg.calc(runes)
var y int var y int
length := len(runes) length := len(runes)
buf := make([]rune, 0) var buf []rune
for x := 0; x < length; { for x := 0; x < length; {
y = routes[x].index + 1 y = routes[x].index + 1
frag := runes[x:y] frag := runes[x:y]
@@ -283,6 +292,8 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment {
return result return result
} }
// Cut cuts a sentence into words.
// Parameter hmm controls whether to use the Hidden Markov Model.
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan Segment { func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan Segment {
result := make(chan Segment) result := make(chan Segment)
var cut cutFunc var cut cutFunc

View File

@@ -5,8 +5,8 @@ import (
) )
var ( var (
seg Segmenter seg Segmenter
test_contents = []string{ testContents = []string{
"这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。", "这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。",
"我不喜欢日本和服。", "我不喜欢日本和服。",
"雷猴回归人间。", "雷猴回归人间。",
@@ -273,7 +273,7 @@ func init() {
} }
func chanToArray(ch <-chan Segment) []Segment { func chanToArray(ch <-chan Segment) []Segment {
result := make([]Segment, 0) var result []Segment
for word := range ch { for word := range ch {
result = append(result, word) result = append(result, word)
} }
@@ -281,7 +281,7 @@ func chanToArray(ch <-chan Segment) []Segment {
} }
func TestCut(t *testing.T) { func TestCut(t *testing.T) {
for index, content := range test_contents { for index, content := range testContents {
result := chanToArray(seg.Cut(content, true)) result := chanToArray(seg.Cut(content, true))
if len(defaultCutResult[index]) != len(result) { if len(defaultCutResult[index]) != len(result) {
t.Errorf("default cut for %s length should be %d not %d\n", t.Errorf("default cut for %s length should be %d not %d\n",
@@ -289,7 +289,7 @@ func TestCut(t *testing.T) {
t.Errorf("expect: %v\n", defaultCutResult[index]) t.Errorf("expect: %v\n", defaultCutResult[index])
t.Fatalf("got: %v\n", result) t.Fatalf("got: %v\n", result)
} }
for i, _ := range result { for i := range result {
if result[i] != defaultCutResult[index][i] { if result[i] != defaultCutResult[index][i] {
t.Fatalf("expect %s, got %s", defaultCutResult[index][i], result[i]) t.Fatalf("expect %s, got %s", defaultCutResult[index][i], result[i])
} }
@@ -298,7 +298,7 @@ func TestCut(t *testing.T) {
if len(noHMMCutResult[index]) != len(result) { if len(noHMMCutResult[index]) != len(result) {
t.Fatal(content) t.Fatal(content)
} }
for i, _ := range result { for i := range result {
if result[i] != noHMMCutResult[index][i] { if result[i] != noHMMCutResult[index][i] {
t.Fatal(content) t.Fatal(content)
} }
@@ -320,7 +320,7 @@ func TestBug132(t *testing.T) {
if len(cutResult) != len(result) { if len(cutResult) != len(result) {
t.Fatal(result) t.Fatal(result)
} }
for i, _ := range result { for i := range result {
if result[i] != cutResult[i] { if result[i] != cutResult[i] {
t.Fatal(result[i]) t.Fatal(result[i])
} }
@@ -349,7 +349,7 @@ func TestBug137(t *testing.T) {
if len(cutResult) != len(result) { if len(cutResult) != len(result) {
t.Fatal(result) t.Fatal(result)
} }
for i, _ := range result { for i := range result {
if result[i] != cutResult[i] { if result[i] != cutResult[i] {
t.Fatal(result[i]) t.Fatal(result[i])
} }
@@ -404,7 +404,7 @@ func TestUserDict(t *testing.T) {
if len(cutResult) != len(result) { if len(cutResult) != len(result) {
t.Fatal(result) t.Fatal(result)
} }
for i, _ := range result { for i := range result {
if result[i] != cutResult[i] { if result[i] != cutResult[i] {
t.Fatal(result[i]) t.Fatal(result[i])
} }

View File

@@ -1,15 +1,14 @@
package posseg package posseg
const MinFloat = -3.14e100 const minFloat = -3.14e100
type runeFloatMap map[rune]float64 type runeFloatMap map[rune]float64
func (m runeFloatMap) get(key rune) float64 { func (m runeFloatMap) get(key rune) float64 {
if value, ok := m[key]; ok { if value, ok := m[key]; ok {
return value return value
} else {
return MinFloat
} }
return minFloat
} }
var probEmit = map[uint16]runeFloatMap{ var probEmit = map[uint16]runeFloatMap{

View File

@@ -11,9 +11,8 @@ type probTransMap map[uint16]float64
func (m probTransMap) Get(key uint16) float64 { func (m probTransMap) Get(key uint16) float64 {
if value, ok := m[key]; ok { if value, ok := m[key]; ok {
return value return value
} else {
return inf
} }
return inf
} }
var ( var (

View File

@@ -31,52 +31,52 @@ func (pss probStates) Swap(i, j int) {
pss[i], pss[j] = pss[j], pss[i] pss[i], pss[j] = pss[j], pss[i]
} }
func viterbi(obs []rune) []Tag { func viterbi(obs []rune) []tag {
obsLength := len(obs) obsLength := len(obs)
V := make([]map[uint16]float64, obsLength) V := make([]map[uint16]float64, obsLength)
V[0] = make(map[uint16]float64) V[0] = make(map[uint16]float64)
mem_path := make([]map[uint16]uint16, obsLength) memPath := make([]map[uint16]uint16, obsLength)
mem_path[0] = make(map[uint16]uint16) memPath[0] = make(map[uint16]uint16)
ys := charStateTab.get(obs[0]) // default is all_states ys := charStateTab.get(obs[0]) // default is all_states
for _, y := range ys { for _, y := range ys {
V[0][y] = probEmit[y].get(obs[0]) + probStart[y] V[0][y] = probEmit[y].get(obs[0]) + probStart[y]
mem_path[0][y] = 0 memPath[0][y] = 0
} }
for t := 1; t < obsLength; t++ { for t := 1; t < obsLength; t++ {
prev_states := make([]uint16, 0) var prevStates []uint16
for x := range mem_path[t-1] { for x := range memPath[t-1] {
if len(probTrans[x]) > 0 { if len(probTrans[x]) > 0 {
prev_states = append(prev_states, x) prevStates = append(prevStates, x)
} }
} }
//use Go's map to implement Python's Set() //use Go's map to implement Python's Set()
prev_states_expect_next := make(map[uint16]int) prevStatesExpectNext := make(map[uint16]int)
for _, x := range prev_states { for _, x := range prevStates {
for y := range probTrans[x] { for y := range probTrans[x] {
prev_states_expect_next[y] = 1 prevStatesExpectNext[y] = 1
} }
} }
tmp_obs_states := charStateTab.get(obs[t]) tmpObsStates := charStateTab.get(obs[t])
obs_states := make([]uint16, 0) var obsStates []uint16
for index := range tmp_obs_states { for index := range tmpObsStates {
if _, ok := prev_states_expect_next[tmp_obs_states[index]]; ok { if _, ok := prevStatesExpectNext[tmpObsStates[index]]; ok {
obs_states = append(obs_states, tmp_obs_states[index]) obsStates = append(obsStates, tmpObsStates[index])
} }
} }
if len(obs_states) == 0 { if len(obsStates) == 0 {
for key := range prev_states_expect_next { for key := range prevStatesExpectNext {
obs_states = append(obs_states, key) obsStates = append(obsStates, key)
} }
} }
if len(obs_states) == 0 { if len(obsStates) == 0 {
obs_states = probTransKeys obsStates = probTransKeys
} }
mem_path[t] = make(map[uint16]uint16) memPath[t] = make(map[uint16]uint16)
V[t] = make(map[uint16]float64) V[t] = make(map[uint16]float64)
for _, y := range obs_states { for _, y := range obsStates {
var max, ps probState var max, ps probState
for i, y0 := range prev_states { for i, y0 := range prevStates {
ps = probState{ ps = probState{
prob: V[t-1][y0] + probTrans[y0].Get(y) + probEmit[y].get(obs[t]), prob: V[t-1][y0] + probTrans[y0].Get(y) + probEmit[y].get(obs[t]),
state: y0} state: y0}
@@ -85,23 +85,23 @@ func viterbi(obs []rune) []Tag {
} }
} }
V[t][y] = max.prob V[t][y] = max.prob
mem_path[t][y] = max.state memPath[t][y] = max.state
} }
} }
last := make(probStates, 0) last := make(probStates, 0)
length := len(mem_path) length := len(memPath)
vlength := len(V) vlength := len(V)
for y := range mem_path[length-1] { for y := range memPath[length-1] {
ps := probState{prob: V[vlength-1][y], state: y} ps := probState{prob: V[vlength-1][y], state: y}
last = append(last, ps) last = append(last, ps)
} }
sort.Sort(sort.Reverse(last)) sort.Sort(sort.Reverse(last))
state := last[0].state state := last[0].state
route := make([]Tag, len(obs)) route := make([]tag, len(obs))
for i := obsLength - 1; i >= 0; i-- { for i := obsLength - 1; i >= 0; i-- {
route[i] = Tag(state) route[i] = tag(state)
state = mem_path[i][state] state = memPath[i][state]
} }
return route return route
} }

View File

@@ -4,49 +4,49 @@ import (
"testing" "testing"
) )
var defaultRoute []Tag var defaultRoute []tag
func init() { func init() {
var t Tag var t tag
t, _ = NewTag("B", "nr") t, _ = newTag("B", "nr")
defaultRoute = append(defaultRoute, t) defaultRoute = append(defaultRoute, t)
t, _ = NewTag("M", "nr") t, _ = newTag("M", "nr")
defaultRoute = append(defaultRoute, t) defaultRoute = append(defaultRoute, t)
t, _ = NewTag("E", "nr") t, _ = newTag("E", "nr")
defaultRoute = append(defaultRoute, t) defaultRoute = append(defaultRoute, t)
t, _ = NewTag("S", "v") t, _ = newTag("S", "v")
defaultRoute = append(defaultRoute, t) defaultRoute = append(defaultRoute, t)
t, _ = NewTag("B", "v") t, _ = newTag("B", "v")
defaultRoute = append(defaultRoute, t) defaultRoute = append(defaultRoute, t)
t, _ = NewTag("E", "v") t, _ = newTag("E", "v")
defaultRoute = append(defaultRoute, t) defaultRoute = append(defaultRoute, t)
t, _ = NewTag("B", "n") t, _ = newTag("B", "n")
defaultRoute = append(defaultRoute, t) defaultRoute = append(defaultRoute, t)
t, _ = NewTag("M", "n") t, _ = newTag("M", "n")
defaultRoute = append(defaultRoute, t) defaultRoute = append(defaultRoute, t)
t, _ = NewTag("E", "n") t, _ = newTag("E", "n")
defaultRoute = append(defaultRoute, t) defaultRoute = append(defaultRoute, t)
t, _ = NewTag("S", "d") t, _ = newTag("S", "d")
defaultRoute = append(defaultRoute, t) defaultRoute = append(defaultRoute, t)
t, _ = NewTag("S", "v") t, _ = newTag("S", "v")
defaultRoute = append(defaultRoute, t) defaultRoute = append(defaultRoute, t)
t, _ = NewTag("S", "n") t, _ = newTag("S", "n")
defaultRoute = append(defaultRoute, t) defaultRoute = append(defaultRoute, t)
t, _ = NewTag("B", "v") t, _ = newTag("B", "v")
defaultRoute = append(defaultRoute, t) defaultRoute = append(defaultRoute, t)
t, _ = NewTag("E", "v") t, _ = newTag("E", "v")
defaultRoute = append(defaultRoute, t) defaultRoute = append(defaultRoute, t)
t, _ = NewTag("B", "nr") t, _ = newTag("B", "nr")
defaultRoute = append(defaultRoute, t) defaultRoute = append(defaultRoute, t)
t, _ = NewTag("M", "nr") t, _ = newTag("M", "nr")
defaultRoute = append(defaultRoute, t) defaultRoute = append(defaultRoute, t)
t, _ = NewTag("M", "nr") t, _ = newTag("M", "nr")
defaultRoute = append(defaultRoute, t) defaultRoute = append(defaultRoute, t)
t, _ = NewTag("M", "nr") t, _ = newTag("M", "nr")
defaultRoute = append(defaultRoute, t) defaultRoute = append(defaultRoute, t)
t, _ = NewTag("E", "nr") t, _ = newTag("E", "nr")
defaultRoute = append(defaultRoute, t) defaultRoute = append(defaultRoute, t)
t, _ = NewTag("S", "zg") t, _ = newTag("S", "zg")
defaultRoute = append(defaultRoute, t) defaultRoute = append(defaultRoute, t)
} }

View File

@@ -9,18 +9,40 @@ import (
"github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/registry"
) )
// Name is the jieba tokenizer name.
const Name = "jieba" const Name = "jieba"
var IdeographRegexp = regexp.MustCompile(`\p{Han}+`) var ideographRegexp = regexp.MustCompile(`\p{Han}+`)
// JiebaTokenizer is the bleve tokenizer for jiebago.
type JiebaTokenizer struct { type JiebaTokenizer struct {
seg Segmenter seg Segmenter
hmm, searchMode bool hmm, searchMode bool
} }
func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) { /*
NewJiebaTokenizer creates a new JiebaTokenizer.
Parameters:
dictFilePath: path of the dictionary file.
hmm: whether to use Hidden Markov Model to cut unknown words,
i.e. not found in dictionary. For example, the word "安卓" (meaning "Android" in
English) is not in the dictionary file. If hmm is set to false, it will be
cut into two single words "安" and "卓"; if hmm is set to true, it will
be treated as one single word because Jieba uses Hidden Markov Model with
Viterbi algorithm to guess the best possibility.
searchMode: whether to further cut long words into several short words.
In Chinese, some long words may contain other words, for example "交换机"
is a Chinese word for "Switcher"; if searchMode is false, it will treat
"交换机" as a single word. If searchMode is true, it will further split
this word into "交换", "换机", which are valid Chinese words.
*/
func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
var seg Segmenter var seg Segmenter
err := seg.LoadDictionary(dictFileName) err := seg.LoadDictionary(dictFilePath)
return &JiebaTokenizer{ return &JiebaTokenizer{
seg: seg, seg: seg,
hmm: hmm, hmm: hmm,
@@ -28,6 +50,7 @@ func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Toke
}, err }, err
} }
// Tokenize cuts input into bleve token stream.
func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream { func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
rv := make(analysis.TokenStream, 0) rv := make(analysis.TokenStream, 0)
runeStart := 0 runeStart := 0
@@ -77,9 +100,20 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
return rv return rv
} }
/*
JiebaTokenizerConstructor creates a JiebaTokenizer.
Parameter config should contain at least one parameter:
file: the path of the dictionary file.
hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
search: optional, specify whether to use search mode, see NewJiebaTokenizer for details.
*/
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) ( func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
analysis.Tokenizer, error) { analysis.Tokenizer, error) {
dictFileName, ok := config["file"].(string) dictFilePath, ok := config["file"].(string)
if !ok { if !ok {
return nil, fmt.Errorf("must specify dictionary file path") return nil, fmt.Errorf("must specify dictionary file path")
} }
@@ -92,11 +126,11 @@ func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Ca
searchMode = true searchMode = true
} }
return NewJiebaTokenizer(dictFileName, hmm, searchMode) return NewJiebaTokenizer(dictFilePath, hmm, searchMode)
} }
func detectTokenType(term string) analysis.TokenType { func detectTokenType(term string) analysis.TokenType {
if IdeographRegexp.MatchString(term) { if ideographRegexp.MatchString(term) {
return analysis.Ideographic return analysis.Ideographic
} }
_, err := strconv.ParseFloat(term, 64) _, err := strconv.ParseFloat(term, 64)

View File

@@ -2,12 +2,14 @@ package util
import "regexp" import "regexp"
// RegexpSplit split slices s into substrings separated by the expression and /*
// returns a slice of the substrings between those expression matches. RegexpSplit split slices s into substrings separated by the expression and
// If capturing parentheses are used in expression, then the text of all groups returns a slice of the substrings between those expression matches.
// in the expression are also returned as part of the resulting slice. If capturing parentheses are used in expression, then the text of all groups
// in the expression are also returned as part of the resulting slice.
// This function acts consistent with Python's re.split function.
This function acts consistent with Python's re.split function.
*/
func RegexpSplit(re *regexp.Regexp, s string, n int) []string { func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
if n == 0 { if n == 0 {
return nil return nil