1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-13 05:31:02 +08:00

Refactor the RegexpSplit function: move it to util.go and change its return type to chan string

This commit is contained in:
Wang Bin
2015-03-24 14:40:06 +08:00
parent 323b6714fa
commit 0027927b6d
5 changed files with 178 additions and 55 deletions

14
dict.go
View File

@@ -3,7 +3,6 @@ package jiebago
import (
"bufio"
"os"
"path/filepath"
"strconv"
"strings"
)
@@ -13,19 +12,6 @@ type WordTagFreq struct {
Freq float64
}
// DictPath resolves a dictionary file name to an absolute path.
//
// An already-absolute dictFileName is returned exactly as given (it is not
// cleaned). A relative name is joined onto the current working directory and
// the cleaned result is returned. The only error reported is a failure to
// determine the working directory, in which case the returned path is empty.
func DictPath(dictFileName string) (string, error) {
	if !filepath.IsAbs(dictFileName) {
		workDir, err := os.Getwd()
		if err != nil {
			return "", err
		}
		joined := filepath.Join(workDir, dictFileName)
		return filepath.Clean(joined), nil
	}
	return dictFileName, nil
}
func ParseDictFile(dictFilePath string) (wtfs []*WordTagFreq, err error) {
var dictFile *os.File
dictFile, err = os.Open(dictFilePath)

View File

@@ -48,30 +48,6 @@ func (rs routes) Swap(i, j int) {
rs[i], rs[j] = rs[j], rs[i]
}
// RegexpSplit cuts sentence into consecutive pieces, alternating between the
// substrings matched by r and the unmatched text between them. Both kinds of
// piece are kept, in their original order, so concatenating the result
// reproduces sentence. When r does not match at all, the result is a
// one-element slice holding sentence itself.
func RegexpSplit(r *regexp.Regexp, sentence string) []string {
	matches := r.FindAllStringIndex(sentence, -1)
	if len(matches) == 0 {
		return []string{sentence}
	}

	pieces := make([]string, 0)
	cursor := 0
	for _, m := range matches {
		begin, end := m[0], m[1]
		// Emit the unmatched gap before this match, if there is one.
		if begin != cursor {
			pieces = append(pieces, sentence[cursor:begin])
		}
		pieces = append(pieces, sentence[begin:end])
		cursor = end
	}

	// Whatever follows the final match is the last piece.
	if tail := sentence[cursor:]; len(tail) > 0 {
		pieces = append(pieces, tail)
	}
	return pieces
}
// Build a directed acyclic graph (DAG) for sentence.
func DAG(sentence string) map[int][]int {
dag := make(map[int][]int)
@@ -286,7 +262,6 @@ func Cut(sentence string, isCutAll bool, HMM bool) chan string {
reHan = reHanDefault
reSkip = reSkipDefault
}
blocks := RegexpSplit(reHan, sentence)
var cut cutFunc
if HMM {
cut = cutDAG
@@ -296,7 +271,7 @@ func Cut(sentence string, isCutAll bool, HMM bool) chan string {
if isCutAll {
cut = cutAll
}
for _, blk := range blocks {
for blk := range RegexpSplit(reHan, sentence) {
if len(blk) == 0 {
continue
}
@@ -305,19 +280,26 @@ func Cut(sentence string, isCutAll bool, HMM bool) chan string {
result <- x
}
} else {
type skipSplitFunc func(sentence string) []string
type skipSplitFunc func(sentence string) chan string
var ssf skipSplitFunc
if isCutAll {
ssf = func(sentence string) []string {
return reSkip.Split(sentence, -1)
ssf = func(sentence string) chan string {
ch := make(chan string)
go func() {
for _, s := range reSkip.Split(sentence, -1) {
ch <- s
}
close(ch)
}()
return ch
}
} else {
ssf = func(sentence string) []string {
ssf = func(sentence string) chan string {
return RegexpSplit(reSkip, sentence)
}
}
for _, x := range ssf(blk) {
for x := range ssf(blk) {
if reSkip.MatchString(x) {
result <- x
} else if !isCutAll {

View File

@@ -644,13 +644,13 @@ func TestCutDAGNoHmm(t *testing.T) {
}
func TestRegexpSplit(t *testing.T) {
result := RegexpSplit(regexp.MustCompile(`\p{Han}+`),
"BP神经网络如何训练才能在分类时增加区分度")
result := chanToArray(RegexpSplit(regexp.MustCompile(`\p{Han}+`),
"BP神经网络如何训练才能在分类时增加区分度"))
if len(result) != 3 {
t.Error(result)
}
result = RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`),
",BP神经网络如何训练才能在分类时#增加区分度?")
result = chanToArray(RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`),
",BP神经网络如何训练才能在分类时#增加区分度?"))
if len(result) != 3 {
t.Error(result)
}

View File

@@ -72,14 +72,13 @@ func cutDetail(sentence string) chan WordTag {
result := make(chan WordTag)
go func() {
blocks := jiebago.RegexpSplit(reHanDetail, sentence)
for _, blk := range blocks {
for blk := range jiebago.RegexpSplit(reHanDetail, sentence) {
if reHanDetail.MatchString(blk) {
for wordTag := range cutDetailInternal(blk) {
result <- wordTag
}
} else {
for _, x := range jiebago.RegexpSplit(reSkipDetail, blk) {
for x := range jiebago.RegexpSplit(reSkipDetail, blk) {
if len(x) == 0 {
continue
}
@@ -242,7 +241,6 @@ func Cut(sentence string, HMM bool) chan WordTag {
delete(jiebago.UserWordTagTab, key)
}
result := make(chan WordTag)
blocks := jiebago.RegexpSplit(reHanInternal, sentence)
var cut cutFunc
if HMM {
cut = cutDAG
@@ -250,13 +248,13 @@ func Cut(sentence string, HMM bool) chan WordTag {
cut = cutDAGNoHMM
}
go func() {
for _, blk := range blocks {
for blk := range jiebago.RegexpSplit(reHanInternal, sentence) {
if reHanInternal.MatchString(blk) {
for wordTag := range cut(blk) {
result <- wordTag
}
} else {
for _, x := range jiebago.RegexpSplit(reSkipInternal, blk) {
for x := range jiebago.RegexpSplit(reSkipInternal, blk) {
if reSkipInternal.MatchString(x) {
result <- WordTag{x, "x"}
} else {

157
util.go Normal file
View File

@@ -0,0 +1,157 @@
package jiebago
import (
// "bufio"
// "crypto/md5"
// "encoding/gob"
// "fmt"
"os"
"path/filepath"
"regexp"
// "strconv"
// "strings"
)
// DictPath resolves a dictionary file name to an absolute path.
//
// An already-absolute dictFileName is returned exactly as given (it is not
// cleaned). A relative name is joined onto the current working directory and
// the cleaned result is returned. The only error reported is a failure to
// determine the working directory, in which case the returned path is empty.
func DictPath(dictFileName string) (string, error) {
	if !filepath.IsAbs(dictFileName) {
		workDir, err := os.Getwd()
		if err != nil {
			return "", err
		}
		joined := filepath.Join(workDir, dictFileName)
		return filepath.Clean(joined), nil
	}
	return dictFileName, nil
}
/*
func cachePath(dictPath string) string {
return filepath.Join(os.TempDir(),
fmt.Sprintf("jieba.%x.cache", md5.Sum([]byte(f.dictFilePath))))
}
func fileInfo(filePath string, missingOk bool) (*os.FileInfo, err) {
fileInfo, err := os.Stat(filePath)
if missingOk && err.Err == os.ErrNotExist {
return fileInfo, nil
}
return fileInfo, err
}
func isCached(dictPath, cachePath string) (bool, error) {
dictFileInfo, err := fileInfo(dictPath, false)
if err != nil {
return false, err
}
cacheFileInfo, err := fileInfo(cachePath, true)
if err != nil {
return false, err
}
return cacheFileInfo.ModTime().After(dictFileInfo.ModTime()), nil
}
func load(cachePath string, d DictLoader) error {
dec := gob.NewDecoder(cacheFile)
return dec.Decode(&d)
}
func read(dictPath, d DictLoader, pos bool) error {
dictFile, err := os.Open(dictFilePath)
if err != nil {
return err
}
defer dictFile.Close()
scanner := bufio.NewScanner(dictFile)
var token *Token
var line string
var fields []string
for scanner.Scan() {
line = scanner.Text()
fields = strings.Split(line, " ")
token = &Token{Term: strings.Replace(fields[0], "\ufeff", "", 1)}
if length := len(fields); length > 1 {
token.Freq, err = strconv.ParseFloat(fields[1], 64)
if err != nil {
return err
}
if pos && length > 2 {
token.Pos = fields[2]
}
}
d.Add(token)
}
return scanner.Err()
}
func dump(cachePath string, d DictLoader) error {
cacheFile, err = os.OpenFile(cachePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
if err != nil {
return err
}
defer cacheFile.Close()
enc := gob.NewEncoder(cacheFile)
return enc.Encode(d)
}
func SetDict(s Segmenter, dictName string, pos bool) error {
dictPath, err := DictPath(dictName)
if err != nil {
return err
}
cachePath = cachePath(dictPath)
cached, err := isCached(dictPath, cachePath)
if err != nil {
return err
}
if cached {
err = load(cachePath, s)
if err == nil {
return nil
}
cached = false
}
err = read(dictPath, s, pos)
if err != nil {
return err
}
err = dump(cachePath, s)
if err != nil {
return err
}
}
func LoadUserDict(dictName string, s Segmenter, pos bool) error {
dictPath, err := DictPath(dictName)
if err != nil {
return err
}
return read(dictPath, s, pos)
}
*/
// RegexpSplit cuts sentence into consecutive pieces, alternating between the
// substrings matched by r and the unmatched text between them, and delivers
// them in order on the returned channel. Both kinds of piece are kept, so
// concatenating everything received reproduces sentence. An empty sentence
// with no match yields no values.
//
// The returned channel is already filled and closed when RegexpSplit returns.
// All match positions are computed in a single FindAllStringIndex call, so a
// producer goroutine would add no laziness; filling a buffered channel up
// front means a receiver that stops reading early cannot leave a goroutine
// blocked forever on a send.
func RegexpSplit(r *regexp.Regexp, sentence string) chan string {
	locs := r.FindAllStringIndex(sentence, -1)

	// Each match contributes at most two pieces (the gap before it and the
	// match itself), plus one optional trailing piece, so sends never block.
	result := make(chan string, 2*len(locs)+1)

	lastLoc := 0
	for _, loc := range locs {
		// Emit the unmatched text between the previous match and this one.
		if loc[0] != lastLoc {
			result <- sentence[lastLoc:loc[0]]
		}
		result <- sentence[loc[0]:loc[1]]
		lastLoc = loc[1]
	}
	// Whatever follows the final match is the last piece.
	if lastLoc < len(sentence) {
		result <- sentence[lastLoc:]
	}
	close(result)
	return result
}