mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 00:32:51 +08:00
code refactor, added more documents
This commit is contained in:
@@ -6,7 +6,7 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
test_contents = []string{
|
||||
testContents = []string{
|
||||
"这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
|
||||
"我不喜欢日本和服。",
|
||||
"雷猴回归人间。",
|
||||
@@ -259,7 +259,7 @@ func TestExtractTags(t *testing.T) {
|
||||
te.LoadDictionary("../dict.txt")
|
||||
te.LoadIdf("idf.txt")
|
||||
|
||||
for index, sentence := range test_contents {
|
||||
for index, sentence := range testContents {
|
||||
result := te.ExtractTags(sentence, 20)
|
||||
if len(result) != len(Tags[index]) {
|
||||
t.Fatalf("%s = %v", sentence, result)
|
||||
|
||||
@@ -7,6 +7,8 @@ import (
|
||||
"github.com/wangbin/jiebago/dictionary"
|
||||
)
|
||||
|
||||
// Idf represents a thread-safe dictionary for all words with their
|
||||
// IDFs(Inverse Document Frequency).
|
||||
type Idf struct {
|
||||
freqMap map[string]float64
|
||||
median float64
|
||||
@@ -14,6 +16,7 @@ type Idf struct {
|
||||
sync.RWMutex
|
||||
}
|
||||
|
||||
// AddToken adds a new word with IDF into its dictionary.
|
||||
func (i *Idf) AddToken(token dictionary.Token) {
|
||||
i.Lock()
|
||||
i.freqMap[token.Text()] = token.Frequency()
|
||||
@@ -23,6 +26,7 @@ func (i *Idf) AddToken(token dictionary.Token) {
|
||||
i.Unlock()
|
||||
}
|
||||
|
||||
// Load loads all tokens from channel into its dictionary.
|
||||
func (i *Idf) Load(ch <-chan dictionary.Token) {
|
||||
i.Lock()
|
||||
for token := range ch {
|
||||
@@ -38,6 +42,7 @@ func (i *Idf) loadDictionary(fileName string) error {
|
||||
return dictionary.LoadDictionary(i, fileName)
|
||||
}
|
||||
|
||||
// Frequency returns the IDF of given word.
|
||||
func (i *Idf) Frequency(key string) (float64, bool) {
|
||||
i.RLock()
|
||||
freq, ok := i.freqMap[key]
|
||||
@@ -45,6 +50,7 @@ func (i *Idf) Frequency(key string) (float64, bool) {
|
||||
return freq, ok
|
||||
}
|
||||
|
||||
// NewIdf creates a new Idf instance.
|
||||
func NewIdf() *Idf {
|
||||
return &Idf{freqMap: make(map[string]float64), freqs: make([]float64, 0)}
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"github.com/wangbin/jiebago/dictionary"
|
||||
)
|
||||
|
||||
// DefaultStopWordMap contains some stop words.
|
||||
var DefaultStopWordMap = map[string]int{
|
||||
"the": 1,
|
||||
"of": 1,
|
||||
@@ -40,23 +41,27 @@ var DefaultStopWordMap = map[string]int{
|
||||
"or": 1,
|
||||
}
|
||||
|
||||
// StopWord is a thread-safe dictionary for all stop words.
|
||||
type StopWord struct {
|
||||
stopWordMap map[string]int
|
||||
sync.RWMutex
|
||||
}
|
||||
|
||||
// AddToken adds a token into StopWord dictionary.
|
||||
func (s *StopWord) AddToken(token dictionary.Token) {
|
||||
s.Lock()
|
||||
s.stopWordMap[token.Text()] = 1
|
||||
s.Unlock()
|
||||
}
|
||||
|
||||
// NewStopWord creates a new StopWord with default stop words.
|
||||
func NewStopWord() *StopWord {
|
||||
s := new(StopWord)
|
||||
s.stopWordMap = DefaultStopWordMap
|
||||
return s
|
||||
}
|
||||
|
||||
// IsStopWord checks if a given word is stop word.
|
||||
func (s *StopWord) IsStopWord(word string) bool {
|
||||
s.RLock()
|
||||
_, ok := s.stopWordMap[word]
|
||||
@@ -64,6 +69,7 @@ func (s *StopWord) IsStopWord(word string) bool {
|
||||
return ok
|
||||
}
|
||||
|
||||
// Load loads all tokens from given channel into StopWord dictionary.
|
||||
func (s *StopWord) Load(ch <-chan dictionary.Token) {
|
||||
s.Lock()
|
||||
for token := range ch {
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
package analyse
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
@@ -9,23 +8,23 @@ import (
|
||||
"github.com/wangbin/jiebago"
|
||||
)
|
||||
|
||||
// Segment represents a word with weight.
|
||||
type Segment struct {
|
||||
text string
|
||||
weight float64
|
||||
}
|
||||
|
||||
// Text returns the segment's text.
|
||||
func (s Segment) Text() string {
|
||||
return s.text
|
||||
}
|
||||
|
||||
// Weight returns the segment's weight.
|
||||
func (s Segment) Weight() float64 {
|
||||
return s.weight
|
||||
}
|
||||
|
||||
func (s Segment) String() string {
|
||||
return fmt.Sprintf("{%s: %f}", s.text, s.weight)
|
||||
}
|
||||
|
||||
// Segments represents a slice of Segment.
|
||||
type Segments []Segment
|
||||
|
||||
func (ss Segments) Len() int {
|
||||
@@ -44,29 +43,33 @@ func (ss Segments) Swap(i, j int) {
|
||||
ss[i], ss[j] = ss[j], ss[i]
|
||||
}
|
||||
|
||||
// TagExtracter is used to extract tags from sentence.
|
||||
type TagExtracter struct {
|
||||
seg *jiebago.Segmenter
|
||||
idf *Idf
|
||||
stopWord *StopWord
|
||||
}
|
||||
|
||||
// LoadDictionary reads the given file and creates a new dictionary.
|
||||
func (t *TagExtracter) LoadDictionary(fileName string) error {
|
||||
t.stopWord = NewStopWord()
|
||||
t.seg = new(jiebago.Segmenter)
|
||||
return t.seg.LoadDictionary(fileName)
|
||||
}
|
||||
|
||||
// LoadIdf reads the given file and creates a new Idf dictionary.
|
||||
func (t *TagExtracter) LoadIdf(fileName string) error {
|
||||
t.idf = NewIdf()
|
||||
return t.idf.loadDictionary(fileName)
|
||||
}
|
||||
|
||||
// LoadStopWords reads the given file and creates a new StopWord dictionary.
|
||||
func (t *TagExtracter) LoadStopWords(fileName string) error {
|
||||
t.stopWord = NewStopWord()
|
||||
return t.stopWord.loadDictionary(fileName)
|
||||
}
|
||||
|
||||
// Keyword extraction.
|
||||
// ExtractTags extracts the topK key words from sentence.
|
||||
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
|
||||
freqMap := make(map[string]float64)
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
package analyse
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"sort"
|
||||
|
||||
@@ -20,10 +19,6 @@ type edge struct {
|
||||
weight float64
|
||||
}
|
||||
|
||||
func (e edge) String() string {
|
||||
return fmt.Sprintf("(%s %s): %f", e.start, e.end, e.weight)
|
||||
}
|
||||
|
||||
type edges []edge
|
||||
|
||||
func (es edges) Len() int {
|
||||
@@ -114,8 +109,8 @@ func (u *undirectWeightedGraph) rank() Segments {
|
||||
return result
|
||||
}
|
||||
|
||||
// Extract keywords from sentence using TextRank algorithm. the allowed POS list
|
||||
// could be manually speificed.
|
||||
// TextRankWithPOS extracts keywords from sentence using TextRank algorithm.
|
||||
// Parameter allowPOS allows a customized pos list.
|
||||
func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) Segments {
|
||||
posFilt := make(map[string]int)
|
||||
for _, pos := range allowPOS {
|
||||
@@ -124,7 +119,7 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin
|
||||
g := newUndirectWeightedGraph()
|
||||
cm := make(map[[2]string]float64)
|
||||
span := 5
|
||||
pairs := make([]posseg.Segment, 0)
|
||||
var pairs []posseg.Segment
|
||||
for pair := range t.seg.Cut(sentence, true) {
|
||||
pairs = append(pairs, pair)
|
||||
}
|
||||
@@ -152,16 +147,18 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin
|
||||
return tags
|
||||
}
|
||||
|
||||
// Extract keywords from sentence using TextRank algorithm.
|
||||
// topK specify how many top keywords to be returned at most.
|
||||
// TextRank extracts keywords from sentence using TextRank algorithm.
|
||||
// Parameter topK specifies how many top keywords to be returned at most.
|
||||
func (t *TextRanker) TextRank(sentence string, topK int) Segments {
|
||||
return t.TextRankWithPOS(sentence, topK, defaultAllowPOS)
|
||||
}
|
||||
|
||||
// TextRanker is used to extract tags from sentence.
|
||||
type TextRanker struct {
|
||||
seg *posseg.Segmenter
|
||||
}
|
||||
|
||||
// LoadDictionary reads a given file and creates a new dictionary for TextRanker.
|
||||
func (t *TextRanker) LoadDictionary(fileName string) error {
|
||||
t.seg = new(posseg.Segmenter)
|
||||
return t.seg.LoadDictionary(fileName)
|
||||
|
||||
@@ -14,7 +14,7 @@ type Dictionary struct {
|
||||
sync.RWMutex
|
||||
}
|
||||
|
||||
// Load loads all tokens from channel
|
||||
// Load loads all tokens from given channel
|
||||
func (d *Dictionary) Load(ch <-chan dictionary.Token) {
|
||||
d.Lock()
|
||||
for token := range ch {
|
||||
@@ -49,7 +49,7 @@ func (d *Dictionary) updateLogTotal() {
|
||||
d.logTotal = math.Log(d.total)
|
||||
}
|
||||
|
||||
// Frequency returns the frequency of give word, if not found, the second result is false
|
||||
// Frequency returns the frequency and existence of given word
|
||||
func (d *Dictionary) Frequency(key string) (float64, bool) {
|
||||
d.RLock()
|
||||
freq, ok := d.freqMap[key]
|
||||
|
||||
@@ -8,6 +8,8 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
// DictLoader represents an interface that could add one token or load a bunch of
|
||||
// tokens from channel.
|
||||
type DictLoader interface {
|
||||
Load(<-chan Token)
|
||||
AddToken(Token)
|
||||
@@ -49,6 +51,7 @@ func loadDictionary(file *os.File) (<-chan Token, <-chan error) {
|
||||
|
||||
}
|
||||
|
||||
// LoadDictionary reads the given file and passes all tokens to a DictLoader.
|
||||
func LoadDictionary(dl DictLoader, fileName string) error {
|
||||
filePath, err := dictPath(fileName)
|
||||
if err != nil {
|
||||
|
||||
@@ -1,23 +1,28 @@
|
||||
package dictionary
|
||||
|
||||
// Token represents a Chinese word with (optional) frequency and POS.
|
||||
type Token struct {
|
||||
text string
|
||||
frequency float64
|
||||
pos string
|
||||
}
|
||||
|
||||
// Text returns token's text.
|
||||
func (t Token) Text() string {
|
||||
return t.text
|
||||
}
|
||||
|
||||
// Frequency returns token's frequency.
|
||||
func (t Token) Frequency() float64 {
|
||||
return t.frequency
|
||||
}
|
||||
|
||||
// Pos returns token's POS.
|
||||
func (t Token) Pos() string {
|
||||
return t.pos
|
||||
}
|
||||
|
||||
// NewToken creates a new token.
|
||||
func NewToken(text string, frequency float64, pos string) Token {
|
||||
return Token{text: text, frequency: frequency, pos: pos}
|
||||
}
|
||||
|
||||
@@ -13,10 +13,10 @@ func cutHan(sentence string) chan string {
|
||||
result := make(chan string)
|
||||
go func() {
|
||||
runes := []rune(sentence)
|
||||
_, pos_list := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
|
||||
_, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
|
||||
begin, next := 0, 0
|
||||
for i, char := range runes {
|
||||
pos := pos_list[i]
|
||||
pos := posList[i]
|
||||
switch pos {
|
||||
case 'B':
|
||||
begin = i
|
||||
@@ -36,6 +36,8 @@ func cutHan(sentence string) chan string {
|
||||
return result
|
||||
}
|
||||
|
||||
// Cut cuts sentence into words using Hidden Markov Model with Viterbi
|
||||
// algorithm. It is used by Jiebago for unknown words.
|
||||
func Cut(sentence string) chan string {
|
||||
result := make(chan string)
|
||||
s := sentence
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
)
|
||||
|
||||
func chanToArray(ch chan string) []string {
|
||||
result := make([]string, 0)
|
||||
var result []string
|
||||
for word := range ch {
|
||||
result = append(result, word)
|
||||
}
|
||||
|
||||
@@ -67,11 +67,11 @@ func viterbi(obs []rune, states []byte) (float64, []byte) {
|
||||
V[t] = make(map[byte]float64)
|
||||
for _, y := range states {
|
||||
ps0 := make(probStates, 0)
|
||||
var em_p float64
|
||||
var emP float64
|
||||
if val, ok := probEmit[y][obs[t]]; ok {
|
||||
em_p = val
|
||||
emP = val
|
||||
} else {
|
||||
em_p = minFloat
|
||||
emP = minFloat
|
||||
}
|
||||
for _, y0 := range prevStatus[y] {
|
||||
var transP float64
|
||||
@@ -80,7 +80,7 @@ func viterbi(obs []rune, states []byte) (float64, []byte) {
|
||||
} else {
|
||||
transP = minFloat
|
||||
}
|
||||
prob0 := V[t-1][y0] + transP + em_p
|
||||
prob0 := V[t-1][y0] + transP + emP
|
||||
ps0 = append(ps0, &probState{prob: prob0, state: y0})
|
||||
}
|
||||
sort.Sort(sort.Reverse(ps0))
|
||||
|
||||
23
jieba.go
23
jieba.go
@@ -16,15 +16,21 @@ var (
|
||||
reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
|
||||
)
|
||||
|
||||
// Segmenter is a Chinese words segmentation struct.
|
||||
type Segmenter struct {
|
||||
dict *Dictionary
|
||||
}
|
||||
|
||||
// LoadDictionary loads dictionary from given file name. Every time
|
||||
// LoadDictionary is called, previously loaded dictionary will be cleared.
|
||||
func (seg *Segmenter) LoadDictionary(fileName string) error {
|
||||
seg.dict = &Dictionary{freqMap: make(map[string]float64)}
|
||||
return seg.dict.loadDictionary(fileName)
|
||||
}
|
||||
|
||||
// LoadUserDictionary loads a user specified dictionary, it must be called
|
||||
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
||||
// instead it will override existing entries.
|
||||
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
|
||||
return seg.dict.loadDictionary(fileName)
|
||||
}
|
||||
@@ -46,7 +52,7 @@ func (seg *Segmenter) dag(runes []rune) map[int][]int {
|
||||
if freq > 0.0 {
|
||||
dag[k] = append(dag[k], i)
|
||||
}
|
||||
i += 1
|
||||
i++
|
||||
if i >= n {
|
||||
break
|
||||
}
|
||||
@@ -98,7 +104,7 @@ func (seg *Segmenter) cutDAG(sentence string) <-chan string {
|
||||
routes := seg.calc(runes)
|
||||
var y int
|
||||
length := len(runes)
|
||||
buf := make([]rune, 0)
|
||||
var buf []rune
|
||||
for x := 0; x < length; {
|
||||
y = routes[x].index + 1
|
||||
frag := runes[x:y]
|
||||
@@ -156,7 +162,7 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string {
|
||||
routes := seg.calc(runes)
|
||||
var y int
|
||||
length := len(runes)
|
||||
buf := make([]rune, 0)
|
||||
var buf []rune
|
||||
for x := 0; x < length; {
|
||||
y = routes[x].index + 1
|
||||
frag := runes[x:y]
|
||||
@@ -181,6 +187,10 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string {
|
||||
return result
|
||||
}
|
||||
|
||||
// Cut cuts a sentence into words using accurate mode.
|
||||
// Parameter hmm controls whether to use the Hidden Markov Model.
|
||||
// Accurate mode attempts to cut the sentence into the most accurate
|
||||
// segmentations, which is suitable for text analysis.
|
||||
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan string {
|
||||
result := make(chan string)
|
||||
var cut cutFunc
|
||||
@@ -246,6 +256,9 @@ func (seg *Segmenter) cutAll(sentence string) <-chan string {
|
||||
return result
|
||||
}
|
||||
|
||||
// CutAll cuts a sentence into words using full mode.
|
||||
// Full mode gets all the possible words from the sentence.
|
||||
// Fast but not accurate.
|
||||
func (seg *Segmenter) CutAll(sentence string) <-chan string {
|
||||
result := make(chan string)
|
||||
go func() {
|
||||
@@ -268,6 +281,10 @@ func (seg *Segmenter) CutAll(sentence string) <-chan string {
|
||||
return result
|
||||
}
|
||||
|
||||
// CutForSearch cuts sentence into words using search engine mode.
|
||||
// Search engine mode, based on the accurate mode, attempts to cut long words
|
||||
// into several short words, which can raise the recall rate.
|
||||
// Suitable for search engines.
|
||||
func (seg *Segmenter) CutForSearch(sentence string, hmm bool) <-chan string {
|
||||
result := make(chan string)
|
||||
go func() {
|
||||
|
||||
@@ -3,8 +3,8 @@ package jiebago
|
||||
import "testing"
|
||||
|
||||
var (
|
||||
seg Segmenter
|
||||
test_contents = []string{
|
||||
seg Segmenter
|
||||
testContents = []string{
|
||||
"这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
|
||||
"我不喜欢日本和服。",
|
||||
"雷猴回归人间。",
|
||||
@@ -620,7 +620,7 @@ func init() {
|
||||
}
|
||||
|
||||
func chanToArray(ch <-chan string) []string {
|
||||
result := make([]string, 0)
|
||||
var result []string
|
||||
for word := range ch {
|
||||
result = append(result, word)
|
||||
}
|
||||
@@ -643,7 +643,7 @@ func TestCutDAGNoHmm(t *testing.T) {
|
||||
|
||||
func TestDefaultCut(t *testing.T) {
|
||||
var result []string
|
||||
for index, content := range test_contents {
|
||||
for index, content := range testContents {
|
||||
result = chanToArray(seg.Cut(content, true))
|
||||
if len(result) != len(defaultCutResult[index]) {
|
||||
t.Errorf("default cut for %s length should be %d not %d\n",
|
||||
@@ -661,7 +661,7 @@ func TestDefaultCut(t *testing.T) {
|
||||
|
||||
func TestCutAll(t *testing.T) {
|
||||
var result []string
|
||||
for index, content := range test_contents {
|
||||
for index, content := range testContents {
|
||||
result = chanToArray(seg.CutAll(content))
|
||||
if len(result) != len(cutAllResult[index]) {
|
||||
t.Errorf("cut all for %s length should be %d not %d\n",
|
||||
@@ -679,7 +679,7 @@ func TestCutAll(t *testing.T) {
|
||||
|
||||
func TestDefaultCutNoHMM(t *testing.T) {
|
||||
var result []string
|
||||
for index, content := range test_contents {
|
||||
for index, content := range testContents {
|
||||
result = chanToArray(seg.Cut(content, false))
|
||||
if len(result) != len(defaultCutNoHMMResult[index]) {
|
||||
t.Fatalf("default cut no hmm for %s length should be %d not %d\n",
|
||||
@@ -695,7 +695,7 @@ func TestDefaultCutNoHMM(t *testing.T) {
|
||||
|
||||
func TestCutForSearch(t *testing.T) {
|
||||
var result []string
|
||||
for index, content := range test_contents {
|
||||
for index, content := range testContents {
|
||||
result = chanToArray(seg.CutForSearch(content, true))
|
||||
if len(result) != len(cutForSearchResult[index]) {
|
||||
t.Fatalf("cut for search for %s length should be %d not %d\n",
|
||||
@@ -707,7 +707,7 @@ func TestCutForSearch(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
for index, content := range test_contents {
|
||||
for index, content := range testContents {
|
||||
result = chanToArray(seg.CutForSearch(content, false))
|
||||
if len(result) != len(cutForSearchNoHMMResult[index]) {
|
||||
t.Fatalf("cut for search no hmm for %s length should be %d not %d\n",
|
||||
@@ -724,7 +724,7 @@ func TestCutForSearch(t *testing.T) {
|
||||
func TestLoadDictionary(t *testing.T) {
|
||||
var result []string
|
||||
seg.LoadDictionary("foobar.txt")
|
||||
for index, content := range test_contents {
|
||||
for index, content := range testContents {
|
||||
result = chanToArray(seg.Cut(content, true))
|
||||
if len(result) != len(userDictCutResult[index]) {
|
||||
t.Fatalf("default cut with user dictionary for %s length should be %d not %d\n",
|
||||
|
||||
@@ -2,9 +2,9 @@ package posseg
|
||||
|
||||
import "fmt"
|
||||
|
||||
type Tag uint16
|
||||
type tag uint16
|
||||
|
||||
func (t Tag) Tag() string {
|
||||
func (t tag) position() string {
|
||||
switch t / 100 {
|
||||
case 4:
|
||||
return "S"
|
||||
@@ -19,31 +19,29 @@ func (t Tag) Tag() string {
|
||||
}
|
||||
}
|
||||
|
||||
func (t Tag) POS() string {
|
||||
func (t tag) pos() string {
|
||||
return poss[t%100]
|
||||
}
|
||||
|
||||
func (t Tag) String() string {
|
||||
return fmt.Sprintf("(%s, %s)", t.Tag(), t.POS())
|
||||
}
|
||||
|
||||
func NewTag(tag, pos string) (Tag, error) {
|
||||
tagIndex := -1
|
||||
func newTag(position, pos string) (tag, error) {
|
||||
positionIndex := -1
|
||||
posIndex := -1
|
||||
for i, t := range tags {
|
||||
if tag == t {
|
||||
tagIndex = (i + 1) * 100
|
||||
for i, p := range positions {
|
||||
if position == p {
|
||||
positionIndex = (i + 1) * 100
|
||||
break
|
||||
}
|
||||
}
|
||||
for i, p := range poss {
|
||||
if pos == p {
|
||||
posIndex = i
|
||||
break
|
||||
}
|
||||
}
|
||||
if tagIndex < 0 || posIndex < 0 {
|
||||
return 0, fmt.Errorf("Failed to convert %s %s to Tag", tag, pos)
|
||||
if positionIndex < 0 || posIndex < 0 {
|
||||
return 0, fmt.Errorf("Failed to convert %s %s to Tag", position, pos)
|
||||
}
|
||||
return Tag(tagIndex + posIndex), nil
|
||||
return tag(positionIndex + posIndex), nil
|
||||
}
|
||||
|
||||
type charStateTabMap map[rune][]uint16
|
||||
@@ -51,9 +49,8 @@ type charStateTabMap map[rune][]uint16
|
||||
func (m charStateTabMap) get(key rune) []uint16 {
|
||||
if value, ok := m[key]; ok {
|
||||
return value
|
||||
} else {
|
||||
return probTransKeys
|
||||
}
|
||||
return probTransKeys
|
||||
}
|
||||
|
||||
var (
|
||||
@@ -6708,6 +6705,6 @@ var (
|
||||
'\u9fa0': []uint16{413},
|
||||
}
|
||||
|
||||
tags = []string{"B", "E", "M", "S"}
|
||||
poss = []string{"a", "ad", "ag", "an", "b", "bg", "c", "d", "df", "dg", "e", "en", "f", "g", "h", "i", "in", "j", "jn", "k", "l", "ln", "m", "mg", "mq", "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "o", "p", "q", "qe", "qg", "r", "rg", "rr", "rz", "s", "t", "tg", "u", "ud", "ug", "uj", "ul", "uv", "uz", "v", "vd", "vg", "vi", "vn", "vq", "w", "x", "y", "yg", "z", "zg"}
|
||||
positions = []string{"B", "E", "M", "S"}
|
||||
poss = []string{"a", "ad", "ag", "an", "b", "bg", "c", "d", "df", "dg", "e", "en", "f", "g", "h", "i", "in", "j", "jn", "k", "l", "ln", "m", "mg", "mq", "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "o", "p", "q", "qe", "qg", "r", "rg", "rr", "rz", "s", "t", "tg", "u", "ud", "ug", "uj", "ul", "uv", "uz", "v", "vd", "vg", "vi", "vn", "vq", "w", "x", "y", "yg", "z", "zg"}
|
||||
)
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"github.com/wangbin/jiebago/dictionary"
|
||||
)
|
||||
|
||||
// A Dictionary represents a thread-safe dictionary used for word segmentation.
|
||||
type Dictionary struct {
|
||||
total, logTotal float64
|
||||
freqMap map[string]float64
|
||||
@@ -14,6 +15,7 @@ type Dictionary struct {
|
||||
sync.RWMutex
|
||||
}
|
||||
|
||||
// Load loads all tokens from given channel
|
||||
func (d *Dictionary) Load(ch <-chan dictionary.Token) {
|
||||
d.Lock()
|
||||
for token := range ch {
|
||||
@@ -23,6 +25,7 @@ func (d *Dictionary) Load(ch <-chan dictionary.Token) {
|
||||
d.updateLogTotal()
|
||||
}
|
||||
|
||||
// AddToken adds one token
|
||||
func (d *Dictionary) AddToken(token dictionary.Token) {
|
||||
d.Lock()
|
||||
d.addToken(token)
|
||||
@@ -50,6 +53,7 @@ func (d *Dictionary) updateLogTotal() {
|
||||
d.logTotal = math.Log(d.total)
|
||||
}
|
||||
|
||||
// Frequency returns the frequency and existence of given word
|
||||
func (d *Dictionary) Frequency(key string) (float64, bool) {
|
||||
d.RLock()
|
||||
freq, ok := d.freqMap[key]
|
||||
@@ -57,6 +61,7 @@ func (d *Dictionary) Frequency(key string) (float64, bool) {
|
||||
return freq, ok
|
||||
}
|
||||
|
||||
// Pos returns the POS and existence of given word
|
||||
func (d *Dictionary) Pos(key string) (string, bool) {
|
||||
d.RLock()
|
||||
pos, ok := d.posMap[key]
|
||||
|
||||
@@ -17,27 +17,36 @@ var (
|
||||
reSkipInternal = regexp.MustCompile(`(\r\n|\s)`)
|
||||
)
|
||||
|
||||
// Segment represents a word with its POS
|
||||
type Segment struct {
|
||||
text, pos string
|
||||
}
|
||||
|
||||
// Text returns the Segment's text.
|
||||
func (s Segment) Text() string {
|
||||
return s.text
|
||||
}
|
||||
|
||||
// Pos returns the Segment's POS.
|
||||
func (s Segment) Pos() string {
|
||||
return s.pos
|
||||
}
|
||||
|
||||
// Segmenter is a Chinese words segmentation struct.
|
||||
type Segmenter struct {
|
||||
dict *Dictionary
|
||||
}
|
||||
|
||||
// LoadDictionary loads dictionary from given file name.
|
||||
// Every time LoadDictionary is called, previously loaded dictionary will be cleared.
|
||||
func (seg *Segmenter) LoadDictionary(fileName string) error {
|
||||
seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
||||
return seg.dict.loadDictionary(fileName)
|
||||
}
|
||||
|
||||
// LoadUserDictionary loads a user specified dictionary, it must be called
|
||||
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
||||
// instead it will override existing entries.
|
||||
func (seg *Segmenter) LoadUserDictionary(fileName string) error {
|
||||
return seg.dict.loadDictionary(fileName)
|
||||
}
|
||||
@@ -52,19 +61,19 @@ func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment {
|
||||
next := 0
|
||||
for i, char := range runes {
|
||||
pos := posList[i]
|
||||
switch pos.Tag() {
|
||||
switch pos.position() {
|
||||
case "B":
|
||||
begin = i
|
||||
case "E":
|
||||
result <- Segment{string(runes[begin : i+1]), pos.POS()}
|
||||
result <- Segment{string(runes[begin : i+1]), pos.pos()}
|
||||
next = i + 1
|
||||
case "S":
|
||||
result <- Segment{string(char), pos.POS()}
|
||||
result <- Segment{string(char), pos.pos()}
|
||||
next = i + 1
|
||||
}
|
||||
}
|
||||
if next < len(runes) {
|
||||
result <- Segment{string(runes[next:]), posList[next].POS()}
|
||||
result <- Segment{string(runes[next:]), posList[next].pos()}
|
||||
}
|
||||
close(result)
|
||||
}()
|
||||
@@ -117,7 +126,7 @@ func (seg *Segmenter) dag(runes []rune) map[int][]int {
|
||||
if freq > 0.0 {
|
||||
dag[k] = append(dag[k], i)
|
||||
}
|
||||
i += 1
|
||||
i++
|
||||
if i >= n {
|
||||
break
|
||||
}
|
||||
@@ -170,7 +179,7 @@ func (seg *Segmenter) cutDAG(sentence string) <-chan Segment {
|
||||
routes := seg.calc(runes)
|
||||
var y int
|
||||
length := len(runes)
|
||||
buf := make([]rune, 0)
|
||||
var buf []rune
|
||||
for x := 0; x < length; {
|
||||
y = routes[x].index + 1
|
||||
frag := runes[x:y]
|
||||
@@ -253,7 +262,7 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment {
|
||||
routes := seg.calc(runes)
|
||||
var y int
|
||||
length := len(runes)
|
||||
buf := make([]rune, 0)
|
||||
var buf []rune
|
||||
for x := 0; x < length; {
|
||||
y = routes[x].index + 1
|
||||
frag := runes[x:y]
|
||||
@@ -283,6 +292,8 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment {
|
||||
return result
|
||||
}
|
||||
|
||||
// Cut cuts a sentence into words.
|
||||
// Parameter hmm controls whether to use the Hidden Markov Model.
|
||||
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan Segment {
|
||||
result := make(chan Segment)
|
||||
var cut cutFunc
|
||||
|
||||
@@ -5,8 +5,8 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
seg Segmenter
|
||||
test_contents = []string{
|
||||
seg Segmenter
|
||||
testContents = []string{
|
||||
"这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
|
||||
"我不喜欢日本和服。",
|
||||
"雷猴回归人间。",
|
||||
@@ -273,7 +273,7 @@ func init() {
|
||||
}
|
||||
|
||||
func chanToArray(ch <-chan Segment) []Segment {
|
||||
result := make([]Segment, 0)
|
||||
var result []Segment
|
||||
for word := range ch {
|
||||
result = append(result, word)
|
||||
}
|
||||
@@ -281,7 +281,7 @@ func chanToArray(ch <-chan Segment) []Segment {
|
||||
}
|
||||
|
||||
func TestCut(t *testing.T) {
|
||||
for index, content := range test_contents {
|
||||
for index, content := range testContents {
|
||||
result := chanToArray(seg.Cut(content, true))
|
||||
if len(defaultCutResult[index]) != len(result) {
|
||||
t.Errorf("default cut for %s length should be %d not %d\n",
|
||||
@@ -289,7 +289,7 @@ func TestCut(t *testing.T) {
|
||||
t.Errorf("expect: %v\n", defaultCutResult[index])
|
||||
t.Fatalf("got: %v\n", result)
|
||||
}
|
||||
for i, _ := range result {
|
||||
for i := range result {
|
||||
if result[i] != defaultCutResult[index][i] {
|
||||
t.Fatalf("expect %s, got %s", defaultCutResult[index][i], result[i])
|
||||
}
|
||||
@@ -298,7 +298,7 @@ func TestCut(t *testing.T) {
|
||||
if len(noHMMCutResult[index]) != len(result) {
|
||||
t.Fatal(content)
|
||||
}
|
||||
for i, _ := range result {
|
||||
for i := range result {
|
||||
if result[i] != noHMMCutResult[index][i] {
|
||||
t.Fatal(content)
|
||||
}
|
||||
@@ -320,7 +320,7 @@ func TestBug132(t *testing.T) {
|
||||
if len(cutResult) != len(result) {
|
||||
t.Fatal(result)
|
||||
}
|
||||
for i, _ := range result {
|
||||
for i := range result {
|
||||
if result[i] != cutResult[i] {
|
||||
t.Fatal(result[i])
|
||||
}
|
||||
@@ -349,7 +349,7 @@ func TestBug137(t *testing.T) {
|
||||
if len(cutResult) != len(result) {
|
||||
t.Fatal(result)
|
||||
}
|
||||
for i, _ := range result {
|
||||
for i := range result {
|
||||
if result[i] != cutResult[i] {
|
||||
t.Fatal(result[i])
|
||||
}
|
||||
@@ -404,7 +404,7 @@ func TestUserDict(t *testing.T) {
|
||||
if len(cutResult) != len(result) {
|
||||
t.Fatal(result)
|
||||
}
|
||||
for i, _ := range result {
|
||||
for i := range result {
|
||||
if result[i] != cutResult[i] {
|
||||
t.Fatal(result[i])
|
||||
}
|
||||
|
||||
@@ -1,15 +1,14 @@
|
||||
package posseg
|
||||
|
||||
const MinFloat = -3.14e100
|
||||
const minFloat = -3.14e100
|
||||
|
||||
type runeFloatMap map[rune]float64
|
||||
|
||||
func (m runeFloatMap) get(key rune) float64 {
|
||||
if value, ok := m[key]; ok {
|
||||
return value
|
||||
} else {
|
||||
return MinFloat
|
||||
}
|
||||
return minFloat
|
||||
}
|
||||
|
||||
var probEmit = map[uint16]runeFloatMap{
|
||||
|
||||
@@ -11,9 +11,8 @@ type probTransMap map[uint16]float64
|
||||
func (m probTransMap) Get(key uint16) float64 {
|
||||
if value, ok := m[key]; ok {
|
||||
return value
|
||||
} else {
|
||||
return inf
|
||||
}
|
||||
return inf
|
||||
}
|
||||
|
||||
var (
|
||||
|
||||
@@ -31,52 +31,52 @@ func (pss probStates) Swap(i, j int) {
|
||||
pss[i], pss[j] = pss[j], pss[i]
|
||||
}
|
||||
|
||||
func viterbi(obs []rune) []Tag {
|
||||
func viterbi(obs []rune) []tag {
|
||||
obsLength := len(obs)
|
||||
V := make([]map[uint16]float64, obsLength)
|
||||
V[0] = make(map[uint16]float64)
|
||||
mem_path := make([]map[uint16]uint16, obsLength)
|
||||
mem_path[0] = make(map[uint16]uint16)
|
||||
memPath := make([]map[uint16]uint16, obsLength)
|
||||
memPath[0] = make(map[uint16]uint16)
|
||||
ys := charStateTab.get(obs[0]) // default is all_states
|
||||
for _, y := range ys {
|
||||
V[0][y] = probEmit[y].get(obs[0]) + probStart[y]
|
||||
mem_path[0][y] = 0
|
||||
memPath[0][y] = 0
|
||||
}
|
||||
for t := 1; t < obsLength; t++ {
|
||||
prev_states := make([]uint16, 0)
|
||||
for x := range mem_path[t-1] {
|
||||
var prevStates []uint16
|
||||
for x := range memPath[t-1] {
|
||||
if len(probTrans[x]) > 0 {
|
||||
prev_states = append(prev_states, x)
|
||||
prevStates = append(prevStates, x)
|
||||
}
|
||||
}
|
||||
//use Go's map to implement Python's Set()
|
||||
prev_states_expect_next := make(map[uint16]int)
|
||||
for _, x := range prev_states {
|
||||
prevStatesExpectNext := make(map[uint16]int)
|
||||
for _, x := range prevStates {
|
||||
for y := range probTrans[x] {
|
||||
prev_states_expect_next[y] = 1
|
||||
prevStatesExpectNext[y] = 1
|
||||
}
|
||||
}
|
||||
tmp_obs_states := charStateTab.get(obs[t])
|
||||
tmpObsStates := charStateTab.get(obs[t])
|
||||
|
||||
obs_states := make([]uint16, 0)
|
||||
for index := range tmp_obs_states {
|
||||
if _, ok := prev_states_expect_next[tmp_obs_states[index]]; ok {
|
||||
obs_states = append(obs_states, tmp_obs_states[index])
|
||||
var obsStates []uint16
|
||||
for index := range tmpObsStates {
|
||||
if _, ok := prevStatesExpectNext[tmpObsStates[index]]; ok {
|
||||
obsStates = append(obsStates, tmpObsStates[index])
|
||||
}
|
||||
}
|
||||
if len(obs_states) == 0 {
|
||||
for key := range prev_states_expect_next {
|
||||
obs_states = append(obs_states, key)
|
||||
if len(obsStates) == 0 {
|
||||
for key := range prevStatesExpectNext {
|
||||
obsStates = append(obsStates, key)
|
||||
}
|
||||
}
|
||||
if len(obs_states) == 0 {
|
||||
obs_states = probTransKeys
|
||||
if len(obsStates) == 0 {
|
||||
obsStates = probTransKeys
|
||||
}
|
||||
mem_path[t] = make(map[uint16]uint16)
|
||||
memPath[t] = make(map[uint16]uint16)
|
||||
V[t] = make(map[uint16]float64)
|
||||
for _, y := range obs_states {
|
||||
for _, y := range obsStates {
|
||||
var max, ps probState
|
||||
for i, y0 := range prev_states {
|
||||
for i, y0 := range prevStates {
|
||||
ps = probState{
|
||||
prob: V[t-1][y0] + probTrans[y0].Get(y) + probEmit[y].get(obs[t]),
|
||||
state: y0}
|
||||
@@ -85,23 +85,23 @@ func viterbi(obs []rune) []Tag {
|
||||
}
|
||||
}
|
||||
V[t][y] = max.prob
|
||||
mem_path[t][y] = max.state
|
||||
memPath[t][y] = max.state
|
||||
}
|
||||
}
|
||||
last := make(probStates, 0)
|
||||
length := len(mem_path)
|
||||
length := len(memPath)
|
||||
vlength := len(V)
|
||||
for y := range mem_path[length-1] {
|
||||
for y := range memPath[length-1] {
|
||||
ps := probState{prob: V[vlength-1][y], state: y}
|
||||
last = append(last, ps)
|
||||
}
|
||||
sort.Sort(sort.Reverse(last))
|
||||
state := last[0].state
|
||||
route := make([]Tag, len(obs))
|
||||
route := make([]tag, len(obs))
|
||||
|
||||
for i := obsLength - 1; i >= 0; i-- {
|
||||
route[i] = Tag(state)
|
||||
state = mem_path[i][state]
|
||||
route[i] = tag(state)
|
||||
state = memPath[i][state]
|
||||
}
|
||||
return route
|
||||
}
|
||||
|
||||
@@ -4,49 +4,49 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
var defaultRoute []Tag
|
||||
var defaultRoute []tag
|
||||
|
||||
func init() {
|
||||
var t Tag
|
||||
t, _ = NewTag("B", "nr")
|
||||
var t tag
|
||||
t, _ = newTag("B", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("M", "nr")
|
||||
t, _ = newTag("M", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("E", "nr")
|
||||
t, _ = newTag("E", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("S", "v")
|
||||
t, _ = newTag("S", "v")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("B", "v")
|
||||
t, _ = newTag("B", "v")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("E", "v")
|
||||
t, _ = newTag("E", "v")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("B", "n")
|
||||
t, _ = newTag("B", "n")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("M", "n")
|
||||
t, _ = newTag("M", "n")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("E", "n")
|
||||
t, _ = newTag("E", "n")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("S", "d")
|
||||
t, _ = newTag("S", "d")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("S", "v")
|
||||
t, _ = newTag("S", "v")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("S", "n")
|
||||
t, _ = newTag("S", "n")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("B", "v")
|
||||
t, _ = newTag("B", "v")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("E", "v")
|
||||
t, _ = newTag("E", "v")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("B", "nr")
|
||||
t, _ = newTag("B", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("M", "nr")
|
||||
t, _ = newTag("M", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("M", "nr")
|
||||
t, _ = newTag("M", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("M", "nr")
|
||||
t, _ = newTag("M", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("E", "nr")
|
||||
t, _ = newTag("E", "nr")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
t, _ = NewTag("S", "zg")
|
||||
t, _ = newTag("S", "zg")
|
||||
defaultRoute = append(defaultRoute, t)
|
||||
}
|
||||
|
||||
|
||||
46
tokenizer.go
46
tokenizer.go
@@ -9,18 +9,40 @@ import (
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
// Name is the jieba tokenizer name.
|
||||
const Name = "jieba"
|
||||
|
||||
var IdeographRegexp = regexp.MustCompile(`\p{Han}+`)
|
||||
var ideographRegexp = regexp.MustCompile(`\p{Han}+`)
|
||||
|
||||
// JiebaTokenizer is the bleve tokenizer for jiebago.
|
||||
type JiebaTokenizer struct {
|
||||
seg Segmenter
|
||||
hmm, searchMode bool
|
||||
}
|
||||
|
||||
func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||
/*
|
||||
NewJiebaTokenizer creates a new JiebaTokenizer.
|
||||
|
||||
Parameters:
|
||||
|
||||
dictFilePath: path of the dictionary file.
|
||||
|
||||
hmm: whether to use Hidden Markov Model to cut unknown words,
|
||||
i.e. not found in dictionary. For example word "安卓" (means "Android" in
|
||||
English) is not in the dictionary file. If hmm is set to false, it will be
|
||||
cut into two single words "安" and "卓"; if hmm is set to true, it will
|
||||
be treated as one single word because Jieba uses a Hidden Markov Model with
|
||||
Viterbi algorithm to guess the best possibility.
|
||||
|
||||
searchMode: whether to further cut long words into several short words.
|
||||
In Chinese, some long words may contain other words, for example "交换机"
|
||||
is a Chinese word for "Switcher", if searchMode is false, it will treat
|
||||
"交换机" as a single word. If searchMode is true, it will further split
|
||||
this word into "交换", "换机", which are valid Chinese words.
|
||||
*/
|
||||
func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||
var seg Segmenter
|
||||
err := seg.LoadDictionary(dictFileName)
|
||||
err := seg.LoadDictionary(dictFilePath)
|
||||
return &JiebaTokenizer{
|
||||
seg: seg,
|
||||
hmm: hmm,
|
||||
@@ -28,6 +50,7 @@ func NewJiebaTokenizer(dictFileName string, hmm, searchMode bool) (analysis.Toke
|
||||
}, err
|
||||
}
|
||||
|
||||
// Tokenize cuts input into bleve token stream.
|
||||
func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||
rv := make(analysis.TokenStream, 0)
|
||||
runeStart := 0
|
||||
@@ -77,9 +100,20 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||
return rv
|
||||
}
|
||||
|
||||
/*
|
||||
JiebaTokenizerConstructor creates a JiebaTokenizer.
|
||||
|
||||
Parameter config should contain at least one parameter:
|
||||
|
||||
file: the path of the dictionary file.
|
||||
|
||||
hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
|
||||
|
||||
search: optional, specify whether to use search mode, see NewJiebaTokenizer for details.
|
||||
*/
|
||||
func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
|
||||
analysis.Tokenizer, error) {
|
||||
dictFileName, ok := config["file"].(string)
|
||||
dictFilePath, ok := config["file"].(string)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify dictionary file path")
|
||||
}
|
||||
@@ -92,11 +126,11 @@ func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Ca
|
||||
searchMode = true
|
||||
}
|
||||
|
||||
return NewJiebaTokenizer(dictFileName, hmm, searchMode)
|
||||
return NewJiebaTokenizer(dictFilePath, hmm, searchMode)
|
||||
}
|
||||
|
||||
func detectTokenType(term string) analysis.TokenType {
|
||||
if IdeographRegexp.MatchString(term) {
|
||||
if ideographRegexp.MatchString(term) {
|
||||
return analysis.Ideographic
|
||||
}
|
||||
_, err := strconv.ParseFloat(term, 64)
|
||||
|
||||
14
util/util.go
14
util/util.go
@@ -2,12 +2,14 @@ package util
|
||||
|
||||
import "regexp"
|
||||
|
||||
// RegexpSplit split slices s into substrings separated by the expression and
|
||||
// returns a slice of the substrings between those expression matches.
|
||||
// If capturing parentheses are used in expression, then the text of all groups
|
||||
// in the expression are also returned as part of the resulting slice.
|
||||
//
|
||||
// This function acts consistent with Python's re.split function.
|
||||
/*
|
||||
RegexpSplit slices s into substrings separated by the expression and
|
||||
returns a slice of the substrings between those expression matches.
|
||||
If capturing parentheses are used in expression, then the text of all groups
|
||||
in the expression are also returned as part of the resulting slice.
|
||||
|
||||
This function behaves consistently with Python's re.split function.
|
||||
*/
|
||||
func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
|
||||
if n == 0 {
|
||||
return nil
|
||||
|
||||
Reference in New Issue
Block a user