mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-27 15:40:32 +08:00
优化 Segmenter
This commit is contained in:
@@ -133,10 +133,7 @@ func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []strin
|
|||||||
return h.Sum64()
|
return h.Sum64()
|
||||||
}
|
}
|
||||||
span := 5
|
span := 5
|
||||||
var pairs []posseg.Segment
|
pairs := (*posseg.Segmenter)(t).Cut(sentence, true)
|
||||||
for pair := range (*posseg.Segmenter)(t).Cut(sentence, true) {
|
|
||||||
pairs = append(pairs, pair)
|
|
||||||
}
|
|
||||||
for i := range pairs {
|
for i := range pairs {
|
||||||
if _, ok := posFilt[pairs[i].Pos()]; ok {
|
if _, ok := posFilt[pairs[i].Pos()]; ok {
|
||||||
for j := i + 1; j < i+span && j <= len(pairs); j++ {
|
for j := i + 1; j < i+span && j <= len(pairs); j++ {
|
||||||
@@ -174,13 +171,13 @@ func (t *TextRanker) TextRank(sentence string, topK int) Segments {
|
|||||||
type TextRanker posseg.Segmenter
|
type TextRanker posseg.Segmenter
|
||||||
|
|
||||||
// NewTextRanker reads a given file and creates a new dictionary file for TextRanker.
|
// NewTextRanker reads a given file and creates a new dictionary file for TextRanker.
|
||||||
func NewTextRanker(file fs.File) (TextRanker, error) {
|
func NewTextRanker(file fs.File) (*TextRanker, error) {
|
||||||
seg := posseg.Segmenter{}
|
seg, err := posseg.LoadDictionary(file)
|
||||||
return TextRanker(seg), seg.LoadDictionary(file)
|
return (*TextRanker)(seg), err
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewTextRankerAt reads a given file and creates a new dictionary file for TextRanker.
|
// NewTextRankerAt reads a given file and creates a new dictionary file for TextRanker.
|
||||||
func NewTextRankerAt(fileName string) (TextRanker, error) {
|
func NewTextRankerAt(file string) (*TextRanker, error) {
|
||||||
seg := posseg.Segmenter{}
|
seg, err := posseg.LoadDictionaryAt(file)
|
||||||
return TextRanker(seg), seg.LoadDictionaryAt(fileName)
|
return (*TextRanker)(seg), err
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ type Dictionary struct {
|
|||||||
sync.RWMutex
|
sync.RWMutex
|
||||||
}
|
}
|
||||||
|
|
||||||
// Load loads all tokens from given channel
|
// Load loads all tokens
|
||||||
func (d *Dictionary) Load(tokens ...dictionary.Token) {
|
func (d *Dictionary) Load(tokens ...dictionary.Token) {
|
||||||
d.Lock()
|
d.Lock()
|
||||||
for _, token := range tokens {
|
for _, token := range tokens {
|
||||||
|
|||||||
@@ -10,8 +10,7 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
// DictLoader is the interface that could add one token or load
|
// DictLoader is the interface that could add one token or load tokens
|
||||||
// tokens from channel.
|
|
||||||
type DictLoader interface {
|
type DictLoader interface {
|
||||||
Load(...Token)
|
Load(...Token)
|
||||||
AddToken(Token)
|
AddToken(Token)
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ func newTag(position, pos string) (tag, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if positionIndex < 0 || posIndex < 0 {
|
if positionIndex < 0 || posIndex < 0 {
|
||||||
return 0, fmt.Errorf("Failed to convert %s %s to Tag", position, pos)
|
return 0, fmt.Errorf("failed to convert %s %s to Tag", position, pos)
|
||||||
}
|
}
|
||||||
return tag(positionIndex + posIndex), nil
|
return tag(positionIndex + posIndex), nil
|
||||||
}
|
}
|
||||||
@@ -6705,6 +6705,6 @@ var (
|
|||||||
'\u9fa0': []uint16{413},
|
'\u9fa0': []uint16{413},
|
||||||
}
|
}
|
||||||
|
|
||||||
positions = []string{"B", "E", "M", "S"}
|
positions = [...]string{"B", "E", "M", "S"}
|
||||||
poss = []string{"a", "ad", "ag", "an", "b", "bg", "c", "d", "df", "dg", "e", "en", "f", "g", "h", "i", "in", "j", "jn", "k", "l", "ln", "m", "mg", "mq", "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "o", "p", "q", "qe", "qg", "r", "rg", "rr", "rz", "s", "t", "tg", "u", "ud", "ug", "uj", "ul", "uv", "uz", "v", "vd", "vg", "vi", "vn", "vq", "w", "x", "y", "yg", "z", "zg"}
|
poss = [...]string{"a", "ad", "ag", "an", "b", "bg", "c", "d", "df", "dg", "e", "en", "f", "g", "h", "i", "in", "j", "jn", "k", "l", "ln", "m", "mg", "mq", "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "o", "p", "q", "qe", "qg", "r", "rg", "rr", "rz", "s", "t", "tg", "u", "ud", "ug", "uj", "ul", "uv", "uz", "v", "vd", "vg", "vi", "vn", "vq", "w", "x", "y", "yg", "z", "zg"}
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -10,13 +10,13 @@ import (
|
|||||||
|
|
||||||
// A Dictionary represents a thread-safe dictionary used for word segmentation.
|
// A Dictionary represents a thread-safe dictionary used for word segmentation.
|
||||||
type Dictionary struct {
|
type Dictionary struct {
|
||||||
|
sync.RWMutex
|
||||||
total, logTotal float64
|
total, logTotal float64
|
||||||
freqMap map[string]float64
|
freqMap map[string]float64
|
||||||
posMap map[string]string
|
posMap map[string]string
|
||||||
sync.RWMutex
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Load loads all tokens from given channel
|
// Load loads all tokens
|
||||||
func (d *Dictionary) Load(tokens ...dictionary.Token) {
|
func (d *Dictionary) Load(tokens ...dictionary.Token) {
|
||||||
d.Lock()
|
d.Lock()
|
||||||
for _, token := range tokens {
|
for _, token := range tokens {
|
||||||
|
|||||||
@@ -7,10 +7,12 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func Example() {
|
func Example() {
|
||||||
var seg posseg.Segmenter
|
seg, err := posseg.LoadDictionaryAt("../dict.txt")
|
||||||
seg.LoadDictionaryAt("../dict.txt")
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
for segment := range seg.Cut("我爱北京天安门", true) {
|
for _, segment := range seg.Cut("我爱北京天安门", true) {
|
||||||
fmt.Printf("%s %s\n", segment.Text(), segment.Pos())
|
fmt.Printf("%s %s\n", segment.Text(), segment.Pos())
|
||||||
}
|
}
|
||||||
// Output:
|
// Output:
|
||||||
|
|||||||
380
posseg/posseg.go
380
posseg/posseg.go
@@ -35,107 +35,102 @@ func (s Segment) Pos() string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Segmenter is a Chinese words segmentation struct.
|
// Segmenter is a Chinese words segmentation struct.
|
||||||
type Segmenter struct {
|
type Segmenter Dictionary
|
||||||
dict *Dictionary
|
|
||||||
}
|
|
||||||
|
|
||||||
// LoadDictionary loads dictionary from given file name.
|
// LoadDictionary loads dictionary from given file name.
|
||||||
// Every time LoadDictionaryAt is called, the previously loaded dictionary will be cleared.
|
// Every time LoadDictionaryAt is called, the previously loaded dictionary will be cleared.
|
||||||
func (seg *Segmenter) LoadDictionary(file fs.File) error {
|
func LoadDictionary(file fs.File) (*Segmenter, error) {
|
||||||
seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
dict := &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
||||||
return seg.dict.loadDictionary(file)
|
err := dict.loadDictionary(file)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return (*Segmenter)(dict), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadDictionaryAt loads dictionary from given file name.
|
// LoadDictionaryAt loads dictionary from given file name.
|
||||||
// Every time LoadDictionaryAt is called, the previously loaded dictionary will be cleared.
|
// Every time LoadDictionaryAt is called, the previously loaded dictionary will be cleared.
|
||||||
func (seg *Segmenter) LoadDictionaryAt(fileName string) error {
|
func LoadDictionaryAt(file string) (*Segmenter, error) {
|
||||||
seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
dict := &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
||||||
return seg.dict.loadDictionaryAt(fileName)
|
err := dict.loadDictionaryAt(file)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return (*Segmenter)(dict), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadUserDictionary loads a user specified dictionary, it must be called
|
// LoadUserDictionary loads a user specified dictionary, it must be called
|
||||||
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
||||||
// instead it will override existing entries.
|
// instead it will override existing entries.
|
||||||
func (seg *Segmenter) LoadUserDictionary(file fs.File) error {
|
func (seg *Segmenter) LoadUserDictionary(file fs.File) error {
|
||||||
return seg.dict.loadDictionary(file)
|
return (*Dictionary)(seg).loadDictionary(file)
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadUserDictionaryAt loads a user specified dictionary, it must be called
|
// LoadUserDictionaryAt loads a user specified dictionary, it must be called
|
||||||
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
||||||
// instead it will override existing entries.
|
// instead it will override existing entries.
|
||||||
func (seg *Segmenter) LoadUserDictionaryAt(fileName string) error {
|
func (seg *Segmenter) LoadUserDictionaryAt(fileName string) error {
|
||||||
return seg.dict.loadDictionaryAt(fileName)
|
return (*Dictionary)(seg).loadDictionaryAt(fileName)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment {
|
func (seg *Segmenter) cutDetailInternal(sentence string) (results []Segment) {
|
||||||
result := make(chan Segment)
|
runes := []rune(sentence)
|
||||||
|
posList := viterbi(runes)
|
||||||
go func() {
|
begin := 0
|
||||||
runes := []rune(sentence)
|
next := 0
|
||||||
posList := viterbi(runes)
|
for i, char := range runes {
|
||||||
begin := 0
|
pos := posList[i]
|
||||||
next := 0
|
switch pos.position() {
|
||||||
for i, char := range runes {
|
case "B":
|
||||||
pos := posList[i]
|
begin = i
|
||||||
switch pos.position() {
|
case "E":
|
||||||
case "B":
|
results = append(results, Segment{string(runes[begin : i+1]), pos.pos()})
|
||||||
begin = i
|
next = i + 1
|
||||||
case "E":
|
case "S":
|
||||||
result <- Segment{string(runes[begin : i+1]), pos.pos()}
|
results = append(results, Segment{string(char), pos.pos()})
|
||||||
next = i + 1
|
next = i + 1
|
||||||
case "S":
|
|
||||||
result <- Segment{string(char), pos.pos()}
|
|
||||||
next = i + 1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if next < len(runes) {
|
}
|
||||||
result <- Segment{string(runes[next:]), posList[next].pos()}
|
if next < len(runes) {
|
||||||
}
|
results = append(results, Segment{string(runes[next:]), posList[next].pos()})
|
||||||
close(result)
|
}
|
||||||
}()
|
return
|
||||||
return result
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (seg *Segmenter) cutDetail(sentence string) <-chan Segment {
|
func (seg *Segmenter) cutDetail(sentence string) (results []Segment) {
|
||||||
result := make(chan Segment)
|
for _, blk := range util.RegexpSplit(reHanDetail, sentence, -1) {
|
||||||
go func() {
|
if reHanDetail.MatchString(blk) {
|
||||||
for _, blk := range util.RegexpSplit(reHanDetail, sentence, -1) {
|
results = append(results, seg.cutDetailInternal(blk)...)
|
||||||
if reHanDetail.MatchString(blk) {
|
continue
|
||||||
for segment := range seg.cutDetailInternal(blk) {
|
}
|
||||||
result <- segment
|
for _, x := range util.RegexpSplit(reSkipDetail, blk, -1) {
|
||||||
}
|
if len(x) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
for _, x := range util.RegexpSplit(reSkipDetail, blk, -1) {
|
switch {
|
||||||
if len(x) == 0 {
|
case reNum.MatchString(x):
|
||||||
continue
|
results = append(results, Segment{x, "m"})
|
||||||
}
|
case reEng.MatchString(x):
|
||||||
switch {
|
results = append(results, Segment{x, "eng"})
|
||||||
case reNum.MatchString(x):
|
default:
|
||||||
result <- Segment{x, "m"}
|
results = append(results, Segment{x, "x"})
|
||||||
case reEng.MatchString(x):
|
|
||||||
result <- Segment{x, "eng"}
|
|
||||||
default:
|
|
||||||
result <- Segment{x, "x"}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
close(result)
|
}
|
||||||
}()
|
return
|
||||||
return result
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (seg *Segmenter) dag(runes []rune) map[int][]int {
|
func (seg *Segmenter) dag(runes []rune) [][]int {
|
||||||
dag := make(map[int][]int)
|
|
||||||
n := len(runes)
|
n := len(runes)
|
||||||
|
dag := make([][]int, n)
|
||||||
var frag []rune
|
var frag []rune
|
||||||
var i int
|
var i int
|
||||||
for k := 0; k < n; k++ {
|
for k := 0; k < n; k++ {
|
||||||
dag[k] = make([]int, 0)
|
dag[k] = make([]int, 0, 64)
|
||||||
i = k
|
i = k
|
||||||
frag = runes[k : k+1]
|
frag = runes[k : k+1]
|
||||||
for {
|
for {
|
||||||
freq, ok := seg.dict.Frequency(string(frag))
|
freq, ok := (*Dictionary)(seg).Frequency(string(frag))
|
||||||
if !ok {
|
if !ok {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
@@ -160,20 +155,20 @@ type route struct {
|
|||||||
index int
|
index int
|
||||||
}
|
}
|
||||||
|
|
||||||
func (seg *Segmenter) calc(runes []rune) map[int]route {
|
func (seg *Segmenter) calc(runes []rune) []*route {
|
||||||
dag := seg.dag(runes)
|
dag := seg.dag(runes)
|
||||||
n := len(runes)
|
n := len(runes)
|
||||||
rs := make(map[int]route)
|
rs := make([]*route, n+1)
|
||||||
rs[n] = route{frequency: 0.0, index: 0}
|
rs[n] = &route{frequency: 0.0, index: 0}
|
||||||
var r route
|
var r *route
|
||||||
for idx := n - 1; idx >= 0; idx-- {
|
for idx := n - 1; idx >= 0; idx-- {
|
||||||
for _, i := range dag[idx] {
|
for _, i := range dag[idx] {
|
||||||
if freq, ok := seg.dict.Frequency(string(runes[idx : i+1])); ok {
|
if freq, ok := (*Dictionary)(seg).Frequency(string(runes[idx : i+1])); ok {
|
||||||
r = route{frequency: math.Log(freq) - seg.dict.logTotal + rs[i+1].frequency, index: i}
|
r = &route{frequency: math.Log(freq) - (*Dictionary)(seg).logTotal + rs[i+1].frequency, index: i}
|
||||||
} else {
|
} else {
|
||||||
r = route{frequency: math.Log(1.0) - seg.dict.logTotal + rs[i+1].frequency, index: i}
|
r = &route{frequency: math.Log(1.0) - (*Dictionary)(seg).logTotal + rs[i+1].frequency, index: i}
|
||||||
}
|
}
|
||||||
if v, ok := rs[idx]; !ok {
|
if v := rs[idx]; v == nil {
|
||||||
rs[idx] = r
|
rs[idx] = r
|
||||||
} else {
|
} else {
|
||||||
if v.frequency < r.frequency || (v.frequency == r.frequency && v.index < r.index) {
|
if v.frequency < r.frequency || (v.frequency == r.frequency && v.index < r.index) {
|
||||||
@@ -185,168 +180,139 @@ func (seg *Segmenter) calc(runes []rune) map[int]route {
|
|||||||
return rs
|
return rs
|
||||||
}
|
}
|
||||||
|
|
||||||
type cutFunc func(sentence string) <-chan Segment
|
func (seg *Segmenter) cutDAG(sentence string) (results []Segment) {
|
||||||
|
runes := []rune(sentence)
|
||||||
func (seg *Segmenter) cutDAG(sentence string) <-chan Segment {
|
routes := seg.calc(runes)
|
||||||
result := make(chan Segment)
|
buf := make([]rune, 0, 256)
|
||||||
|
for x := 0; x < len(runes); {
|
||||||
go func() {
|
y := routes[x].index + 1
|
||||||
runes := []rune(sentence)
|
frag := runes[x:y]
|
||||||
routes := seg.calc(runes)
|
if y-x == 1 {
|
||||||
var y int
|
buf = append(buf, frag...)
|
||||||
length := len(runes)
|
|
||||||
var buf []rune
|
|
||||||
for x := 0; x < length; {
|
|
||||||
y = routes[x].index + 1
|
|
||||||
frag := runes[x:y]
|
|
||||||
if y-x == 1 {
|
|
||||||
buf = append(buf, frag...)
|
|
||||||
x = y
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if len(buf) > 0 {
|
|
||||||
bufString := string(buf)
|
|
||||||
if len(buf) == 1 {
|
|
||||||
if tag, ok := seg.dict.Pos(bufString); ok {
|
|
||||||
result <- Segment{bufString, tag}
|
|
||||||
} else {
|
|
||||||
result <- Segment{bufString, "x"}
|
|
||||||
}
|
|
||||||
buf = make([]rune, 0)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
|
|
||||||
for t := range seg.cutDetail(bufString) {
|
|
||||||
result <- t
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for _, elem := range buf {
|
|
||||||
selem := string(elem)
|
|
||||||
if tag, ok := seg.dict.Pos(selem); ok {
|
|
||||||
result <- Segment{selem, tag}
|
|
||||||
} else {
|
|
||||||
result <- Segment{selem, "x"}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
buf = make([]rune, 0)
|
|
||||||
}
|
|
||||||
word := string(frag)
|
|
||||||
if tag, ok := seg.dict.Pos(word); ok {
|
|
||||||
result <- Segment{word, tag}
|
|
||||||
} else {
|
|
||||||
result <- Segment{word, "x"}
|
|
||||||
}
|
|
||||||
x = y
|
x = y
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(buf) > 0 {
|
if len(buf) > 0 {
|
||||||
bufString := string(buf)
|
bufString := string(buf)
|
||||||
if len(buf) == 1 {
|
if len(buf) == 1 {
|
||||||
if tag, ok := seg.dict.Pos(bufString); ok {
|
if tag, ok := (*Dictionary)(seg).Pos(bufString); ok {
|
||||||
result <- Segment{bufString, tag}
|
results = append(results, Segment{bufString, tag})
|
||||||
} else {
|
} else {
|
||||||
result <- Segment{bufString, "x"}
|
results = append(results, Segment{bufString, "x"})
|
||||||
}
|
}
|
||||||
} else {
|
buf = buf[:0]
|
||||||
if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
|
|
||||||
for t := range seg.cutDetail(bufString) {
|
|
||||||
result <- t
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for _, elem := range buf {
|
|
||||||
selem := string(elem)
|
|
||||||
if tag, ok := seg.dict.Pos(selem); ok {
|
|
||||||
result <- Segment{selem, tag}
|
|
||||||
} else {
|
|
||||||
result <- Segment{selem, "x"}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
close(result)
|
|
||||||
}()
|
|
||||||
return result
|
|
||||||
}
|
|
||||||
|
|
||||||
func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment {
|
|
||||||
result := make(chan Segment)
|
|
||||||
|
|
||||||
go func() {
|
|
||||||
runes := []rune(sentence)
|
|
||||||
routes := seg.calc(runes)
|
|
||||||
var y int
|
|
||||||
length := len(runes)
|
|
||||||
var buf []rune
|
|
||||||
for x := 0; x < length; {
|
|
||||||
y = routes[x].index + 1
|
|
||||||
frag := runes[x:y]
|
|
||||||
if reEng1.MatchString(string(frag)) && len(frag) == 1 {
|
|
||||||
buf = append(buf, frag...)
|
|
||||||
x = y
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if len(buf) > 0 {
|
if v, ok := (*Dictionary)(seg).Frequency(bufString); !ok || v == 0.0 {
|
||||||
result <- Segment{string(buf), "eng"}
|
results = append(results, seg.cutDetail(bufString)...)
|
||||||
buf = make([]rune, 0)
|
|
||||||
}
|
|
||||||
word := string(frag)
|
|
||||||
if tag, ok := seg.dict.Pos(word); ok {
|
|
||||||
result <- Segment{word, tag}
|
|
||||||
} else {
|
} else {
|
||||||
result <- Segment{word, "x"}
|
for _, elem := range buf {
|
||||||
|
selem := string(elem)
|
||||||
|
if tag, ok := (*Dictionary)(seg).Pos(selem); ok {
|
||||||
|
results = append(results, Segment{selem, tag})
|
||||||
|
} else {
|
||||||
|
results = append(results, Segment{selem, "x"})
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
x = y
|
buf = buf[:0]
|
||||||
|
}
|
||||||
|
word := string(frag)
|
||||||
|
if tag, ok := (*Dictionary)(seg).Pos(word); ok {
|
||||||
|
results = append(results, Segment{word, tag})
|
||||||
|
} else {
|
||||||
|
results = append(results, Segment{word, "x"})
|
||||||
|
}
|
||||||
|
x = y
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(buf) > 0 {
|
||||||
|
bufString := string(buf)
|
||||||
|
if len(buf) == 1 {
|
||||||
|
if tag, ok := (*Dictionary)(seg).Pos(bufString); ok {
|
||||||
|
results = append(results, Segment{bufString, tag})
|
||||||
|
} else {
|
||||||
|
results = append(results, Segment{bufString, "x"})
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if v, ok := (*Dictionary)(seg).Frequency(bufString); !ok || v == 0.0 {
|
||||||
|
results = append(results, seg.cutDetail(bufString)...)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, elem := range buf {
|
||||||
|
selem := string(elem)
|
||||||
|
if tag, ok := (*Dictionary)(seg).Pos(selem); ok {
|
||||||
|
results = append(results, Segment{selem, tag})
|
||||||
|
} else {
|
||||||
|
results = append(results, Segment{selem, "x"})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func (seg *Segmenter) cutDAGNoHMM(sentence string) (results []Segment) {
|
||||||
|
runes := []rune(sentence)
|
||||||
|
routes := seg.calc(runes)
|
||||||
|
buf := make([]rune, 0, 256)
|
||||||
|
for x := 0; x < len(runes); {
|
||||||
|
y := routes[x].index + 1
|
||||||
|
frag := runes[x:y]
|
||||||
|
if reEng1.MatchString(string(frag)) && len(frag) == 1 {
|
||||||
|
buf = append(buf, frag...)
|
||||||
|
x = y
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
if len(buf) > 0 {
|
if len(buf) > 0 {
|
||||||
result <- Segment{string(buf), "eng"}
|
results = append(results, Segment{string(buf), "eng"})
|
||||||
buf = make([]rune, 0)
|
buf = buf[:0]
|
||||||
}
|
}
|
||||||
close(result)
|
word := string(frag)
|
||||||
}()
|
if tag, ok := (*Dictionary)(seg).Pos(word); ok {
|
||||||
return result
|
results = append(results, Segment{word, tag})
|
||||||
|
} else {
|
||||||
|
results = append(results, Segment{word, "x"})
|
||||||
|
}
|
||||||
|
x = y
|
||||||
|
}
|
||||||
|
if len(buf) > 0 {
|
||||||
|
results = append(results, Segment{string(buf), "eng"})
|
||||||
|
}
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Cut cuts a sentence into words.
|
// Cut cuts a sentence into words.
|
||||||
// Parameter hmm controls whether to use the Hidden Markov Model.
|
// Parameter hmm controls whether to use the Hidden Markov Model.
|
||||||
func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan Segment {
|
func (seg *Segmenter) Cut(sentence string, hmm bool) (results []Segment) {
|
||||||
result := make(chan Segment)
|
var cut func(sentence string) []Segment
|
||||||
var cut cutFunc
|
|
||||||
if hmm {
|
if hmm {
|
||||||
cut = seg.cutDAG
|
cut = seg.cutDAG
|
||||||
} else {
|
} else {
|
||||||
cut = seg.cutDAGNoHMM
|
cut = seg.cutDAGNoHMM
|
||||||
}
|
}
|
||||||
go func() {
|
for _, blk := range util.RegexpSplit(reHanInternal, sentence, -1) {
|
||||||
for _, blk := range util.RegexpSplit(reHanInternal, sentence, -1) {
|
if reHanInternal.MatchString(blk) {
|
||||||
if reHanInternal.MatchString(blk) {
|
results = append(results, cut(blk)...)
|
||||||
for wordTag := range cut(blk) {
|
continue
|
||||||
result <- wordTag
|
}
|
||||||
}
|
for _, x := range util.RegexpSplit(reSkipInternal, blk, -1) {
|
||||||
|
if reSkipInternal.MatchString(x) {
|
||||||
|
results = append(results, Segment{x, "x"})
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
for _, x := range util.RegexpSplit(reSkipInternal, blk, -1) {
|
for _, xx := range x {
|
||||||
if reSkipInternal.MatchString(x) {
|
s := string(xx)
|
||||||
result <- Segment{x, "x"}
|
switch {
|
||||||
continue
|
case reNum.MatchString(s):
|
||||||
}
|
results = append(results, Segment{s, "m"})
|
||||||
for _, xx := range x {
|
case reEng.MatchString(x):
|
||||||
s := string(xx)
|
results = append(results, Segment{x, "eng"})
|
||||||
switch {
|
default:
|
||||||
case reNum.MatchString(s):
|
results = append(results, Segment{s, "x"})
|
||||||
result <- Segment{s, "m"}
|
|
||||||
case reEng.MatchString(x):
|
|
||||||
result <- Segment{x, "eng"}
|
|
||||||
default:
|
|
||||||
result <- Segment{s, "x"}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
close(result)
|
}
|
||||||
}()
|
return
|
||||||
return result
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
seg Segmenter
|
seg, _ = LoadDictionaryAt("../dict.txt")
|
||||||
testContents = []string{
|
testContents = []string{
|
||||||
"这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
|
"这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
|
||||||
"我不喜欢日本和服。",
|
"我不喜欢日本和服。",
|
||||||
@@ -268,21 +268,9 @@ var (
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
|
||||||
seg.LoadDictionaryAt("../dict.txt")
|
|
||||||
}
|
|
||||||
|
|
||||||
func chanToArray(ch <-chan Segment) []Segment {
|
|
||||||
var result []Segment
|
|
||||||
for word := range ch {
|
|
||||||
result = append(result, word)
|
|
||||||
}
|
|
||||||
return result
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestCut(t *testing.T) {
|
func TestCut(t *testing.T) {
|
||||||
for index, content := range testContents {
|
for index, content := range testContents {
|
||||||
result := chanToArray(seg.Cut(content, true))
|
result := seg.Cut(content, true)
|
||||||
if len(defaultCutResult[index]) != len(result) {
|
if len(defaultCutResult[index]) != len(result) {
|
||||||
t.Errorf("default cut for %s length should be %d not %d\n",
|
t.Errorf("default cut for %s length should be %d not %d\n",
|
||||||
content, len(defaultCutResult[index]), len(result))
|
content, len(defaultCutResult[index]), len(result))
|
||||||
@@ -294,7 +282,7 @@ func TestCut(t *testing.T) {
|
|||||||
t.Fatalf("expect %s, got %s", defaultCutResult[index][i], result[i])
|
t.Fatalf("expect %s, got %s", defaultCutResult[index][i], result[i])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
result = chanToArray(seg.Cut(content, false))
|
result = seg.Cut(content, false)
|
||||||
if len(noHMMCutResult[index]) != len(result) {
|
if len(noHMMCutResult[index]) != len(result) {
|
||||||
t.Fatal(content)
|
t.Fatal(content)
|
||||||
}
|
}
|
||||||
@@ -316,7 +304,7 @@ func TestBug132(t *testing.T) {
|
|||||||
{"又", "d"},
|
{"又", "d"},
|
||||||
{"啞", "v"},
|
{"啞", "v"},
|
||||||
}
|
}
|
||||||
result := chanToArray(seg.Cut(sentence, true))
|
result := seg.Cut(sentence, true)
|
||||||
if len(cutResult) != len(result) {
|
if len(cutResult) != len(result) {
|
||||||
t.Fatal(result)
|
t.Fatal(result)
|
||||||
}
|
}
|
||||||
@@ -345,7 +333,7 @@ func TestBug137(t *testing.T) {
|
|||||||
{"研究", "vn"},
|
{"研究", "vn"},
|
||||||
{"組", "x"},
|
{"組", "x"},
|
||||||
}
|
}
|
||||||
result := chanToArray(seg.Cut(sentence, true))
|
result := seg.Cut(sentence, true)
|
||||||
if len(cutResult) != len(result) {
|
if len(cutResult) != len(result) {
|
||||||
t.Fatal(result)
|
t.Fatal(result)
|
||||||
}
|
}
|
||||||
@@ -358,7 +346,9 @@ func TestBug137(t *testing.T) {
|
|||||||
|
|
||||||
func TestUserDict(t *testing.T) {
|
func TestUserDict(t *testing.T) {
|
||||||
seg.LoadUserDictionaryAt("../userdict.txt")
|
seg.LoadUserDictionaryAt("../userdict.txt")
|
||||||
defer seg.LoadDictionaryAt("../dict.txt")
|
defer func() {
|
||||||
|
seg, _ = LoadDictionaryAt("../dict.txt")
|
||||||
|
}()
|
||||||
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
||||||
|
|
||||||
cutResult := []Segment{
|
cutResult := []Segment{
|
||||||
@@ -400,7 +390,7 @@ func TestUserDict(t *testing.T) {
|
|||||||
{"N", "eng"},
|
{"N", "eng"},
|
||||||
{"类型", "n"}}
|
{"类型", "n"}}
|
||||||
|
|
||||||
result := chanToArray(seg.Cut(sentence, true))
|
result := seg.Cut(sentence, true)
|
||||||
if len(cutResult) != len(result) {
|
if len(cutResult) != len(result) {
|
||||||
t.Fatal(result)
|
t.Fatal(result)
|
||||||
}
|
}
|
||||||
@@ -415,7 +405,7 @@ func BenchmarkCutNoHMM(b *testing.B) {
|
|||||||
sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
|
sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
|
||||||
b.ResetTimer()
|
b.ResetTimer()
|
||||||
for i := 0; i < b.N; i++ {
|
for i := 0; i < b.N; i++ {
|
||||||
chanToArray(seg.Cut(sentence, false))
|
seg.Cut(sentence, false)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -423,6 +413,6 @@ func BenchmarkCut(b *testing.B) {
|
|||||||
sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
|
sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
|
||||||
b.ResetTimer()
|
b.ResetTimer()
|
||||||
for i := 0; i < b.N; i++ {
|
for i := 0; i < b.N; i++ {
|
||||||
chanToArray(seg.Cut(sentence, true))
|
seg.Cut(sentence, true)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user