From 0027927b6d348075063335da6a0b8c0eb34b5e2f Mon Sep 17 00:00:00 2001 From: Wang Bin Date: Tue, 24 Mar 2015 14:40:06 +0800 Subject: [PATCH] code refactor for RegexpSplit function, moved it to util.go, add return chan string --- dict.go | 14 ----- jieba.go | 44 ++++--------- jieba_test.go | 8 +-- posseg/posseg.go | 10 ++- util.go | 157 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 178 insertions(+), 55 deletions(-) create mode 100644 util.go diff --git a/dict.go b/dict.go index 9d4704a..4a5d3db 100644 --- a/dict.go +++ b/dict.go @@ -3,7 +3,6 @@ package jiebago import ( "bufio" "os" - "path/filepath" "strconv" "strings" ) @@ -13,19 +12,6 @@ type WordTagFreq struct { Freq float64 } -func DictPath(dictFileName string) (string, error) { - if filepath.IsAbs(dictFileName) { - return dictFileName, nil - } - var dictFilePath string - cwd, err := os.Getwd() - if err != nil { - return dictFilePath, err - } - dictFilePath = filepath.Clean(filepath.Join(cwd, dictFileName)) - return dictFilePath, nil -} - func ParseDictFile(dictFilePath string) (wtfs []*WordTagFreq, err error) { var dictFile *os.File dictFile, err = os.Open(dictFilePath) diff --git a/jieba.go b/jieba.go index 7893199..cad16e8 100644 --- a/jieba.go +++ b/jieba.go @@ -48,30 +48,6 @@ func (rs routes) Swap(i, j int) { rs[i], rs[j] = rs[j], rs[i] } -// Split sentence using regular expression. -func RegexpSplit(r *regexp.Regexp, sentence string) []string { - result := make([]string, 0) - locs := r.FindAllStringIndex(sentence, -1) - lastLoc := 0 - if len(locs) == 0 { - return []string{sentence} - } - for _, loc := range locs { - if loc[0] == lastLoc { - result = append(result, sentence[loc[0]:loc[1]]) - } else { - result = append(result, sentence[lastLoc:loc[0]]) - result = append(result, sentence[loc[0]:loc[1]]) - } - lastLoc = loc[1] - } - if lastLoc < len(sentence) { - result = append(result, sentence[lastLoc:]) - } - - return result -} - // Build a directed acyclic graph (DAG) for sentence. func DAG(sentence string) map[int][]int { dag := make(map[int][]int) @@ -286,7 +262,6 @@ func Cut(sentence string, isCutAll bool, HMM bool) chan string { reHan = reHanDefault reSkip = reSkipDefault } - blocks := RegexpSplit(reHan, sentence) var cut cutFunc if HMM { cut = cutDAG @@ -296,7 +271,7 @@ func Cut(sentence string, isCutAll bool, HMM bool) chan string { if isCutAll { cut = cutAll } - for _, blk := range blocks { + for blk := range RegexpSplit(reHan, sentence) { if len(blk) == 0 { continue } @@ -305,19 +280,26 @@ func Cut(sentence string, isCutAll bool, HMM bool) chan string { result <- x } } else { - type skipSplitFunc func(sentence string) []string + type skipSplitFunc func(sentence string) chan string var ssf skipSplitFunc if isCutAll { - ssf = func(sentence string) []string { - return reSkip.Split(sentence, -1) + ssf = func(sentence string) chan string { + ch := make(chan string) + go func() { + for _, s := range reSkip.Split(sentence, -1) { + ch <- s + } + close(ch) + }() + return ch } } else { - ssf = func(sentence string) []string { + ssf = func(sentence string) chan string { return RegexpSplit(reSkip, sentence) } } - for _, x := range ssf(blk) { + for x := range ssf(blk) { if reSkip.MatchString(x) { result <- x } else if !isCutAll { diff --git a/jieba_test.go b/jieba_test.go index d009dda..9d161c1 100644 --- a/jieba_test.go +++ b/jieba_test.go @@ -644,13 +644,13 @@ func TestCutDAGNoHmm(t *testing.T) { } func TestRegexpSplit(t *testing.T) { - result := RegexpSplit(regexp.MustCompile(`\p{Han}+`), - "BP神经网络如何训练才能在分类时增加区分度?") + result := chanToArray(RegexpSplit(regexp.MustCompile(`\p{Han}+`), + "BP神经网络如何训练才能在分类时增加区分度?")) if len(result) != 3 { t.Error(result) } - result = RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`), - ",BP神经网络如何训练才能在分类时#增加区分度?") + result = chanToArray(RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`), + ",BP神经网络如何训练才能在分类时#增加区分度?")) if len(result) != 3 { t.Error(result) } diff --git a/posseg/posseg.go b/posseg/posseg.go index 696becc..cc0ddcb 100644 --- a/posseg/posseg.go +++ b/posseg/posseg.go @@ -72,14 +72,13 @@ func cutDetail(sentence string) chan WordTag { result := make(chan WordTag) go func() { - blocks := jiebago.RegexpSplit(reHanDetail, sentence) - for _, blk := range blocks { + for blk := range jiebago.RegexpSplit(reHanDetail, sentence) { if reHanDetail.MatchString(blk) { for wordTag := range cutDetailInternal(blk) { result <- wordTag } } else { - for _, x := range jiebago.RegexpSplit(reSkipDetail, blk) { + for x := range jiebago.RegexpSplit(reSkipDetail, blk) { if len(x) == 0 { continue } @@ -242,7 +241,6 @@ func Cut(sentence string, HMM bool) chan WordTag { delete(jiebago.UserWordTagTab, key) } result := make(chan WordTag) - blocks := jiebago.RegexpSplit(reHanInternal, sentence) var cut cutFunc if HMM { cut = cutDAG @@ -250,13 +248,13 @@ func Cut(sentence string, HMM bool) chan WordTag { cut = cutDAGNoHMM } go func() { - for _, blk := range blocks { + for blk := range jiebago.RegexpSplit(reHanInternal, sentence) { if reHanInternal.MatchString(blk) { for wordTag := range cut(blk) { result <- wordTag } } else { - for _, x := range jiebago.RegexpSplit(reSkipInternal, blk) { + for x := range jiebago.RegexpSplit(reSkipInternal, blk) { if reSkipInternal.MatchString(x) { result <- WordTag{x, "x"} } else { diff --git a/util.go b/util.go new file mode 100644 index 0000000..51a3652 --- /dev/null +++ b/util.go @@ -0,0 +1,157 @@ +package jiebago + +import ( + // "bufio" + // "crypto/md5" + // "encoding/gob" + // "fmt" + "os" + "path/filepath" + "regexp" + // "strconv" + // "strings" +) + +func DictPath(dictFileName string) (string, error) { + if filepath.IsAbs(dictFileName) { + return dictFileName, nil + } + var dictFilePath string + cwd, err := os.Getwd() + if err != nil { + return dictFilePath, err + } + dictFilePath = filepath.Clean(filepath.Join(cwd, dictFileName)) + return dictFilePath, nil +} + +/* +func cachePath(dictPath string) string { + return filepath.Join(os.TempDir(), + fmt.Sprintf("jieba.%x.cache", md5.Sum([]byte(f.dictFilePath)))) +} + +func fileInfo(filePath string, missingOk bool) (*os.FileInfo, err) { + fileInfo, err := os.Stat(filePath) + if missingOk && err.Err == os.ErrNotExist { + return fileInfo, nil + } + return fileInfo, err +} + +func isCached(dictPath, cachePath string) (bool, error) { + dictFileInfo, err := fileInfo(dictPath, false) + if err != nil { + return false, err + } + cacheFileInfo, err := fileInfo(cachePath, true) + if err != nil { + return false, err + } + return cacheFileInfo.ModTime().After(dictFileInfo.ModTime()), nil +} + +func load(cachePath string, d DictLoader) error { + dec := gob.NewDecoder(cacheFile) + return dec.Decode(&d) +} + +func read(dictPath, d DictLoader, pos bool) error { + dictFile, err := os.Open(dictFilePath) + if err != nil { + return err + } + defer dictFile.Close() + scanner := bufio.NewScanner(dictFile) + var token *Token + var line string + var fields []string + for scanner.Scan() { + line = scanner.Text() + fields = strings.Split(line, " ") + token = &Token{Term: strings.Replace(fields[0], "\ufeff", "", 1)} + if length := len(fields); length > 1 { + token.Freq, err = strconv.ParseFloat(fields[1], 64) + if err != nil { + return err + } + if pos && length > 2 { + token.Pos = fields[2] + } + } + d.Add(token) + } + return scanner.Err() +} + +func dump(cachePath string, d DictLoader) error { + cacheFile, err = os.OpenFile(cachePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) + if err != nil { + return err + } + defer cacheFile.Close() + enc := gob.NewEncoder(cacheFile) + return enc.Encode(d) +} + +func SetDict(s Segmenter, dictName string, pos bool) error { + dictPath, err := DictPath(dictName) + if err != nil { + return err + } + cachePath = cachePath(dictPath) + cached, err := isCached(dictPath, cachePath) + if err != nil { + return err + } + + if cached { + err = load(cachePath, s) + if err == nil { + return nil + } + cached = false + } + + err = read(dictPath, s, pos) + if err != nil { + return err + } + + err = dump(cachePath, s) + if err != nil { + return err + } +} + +func LoadUserDict(dictName string, s Segmenter, pos bool) error { + dictPath, err := DictPath(dictName) + if err != nil { + return err + } + return read(dictPath, s, pos) +} +*/ + +// Split sentence using regular expression. +func RegexpSplit(r *regexp.Regexp, sentence string) chan string { + result := make(chan string) + go func() { + locs := r.FindAllStringIndex(sentence, -1) + lastLoc := 0 + for _, loc := range locs { + if loc[0] == lastLoc { + result <- sentence[loc[0]:loc[1]] + } else { + result <- sentence[lastLoc:loc[0]] + result <- sentence[loc[0]:loc[1]] + } + lastLoc = loc[1] + } + if lastLoc < len(sentence) { + result <- sentence[lastLoc:] + } + close(result) + }() + return result +}