mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-13 05:31:02 +08:00
Refactor RegexpSplit: move it to util.go and change it to return a chan string
This commit is contained in:
14
dict.go
14
dict.go
@@ -3,7 +3,6 @@ package jiebago
|
||||
import (
|
||||
"bufio"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
@@ -13,19 +12,6 @@ type WordTagFreq struct {
|
||||
Freq float64
|
||||
}
|
||||
|
||||
// DictPath resolves dictFileName to an absolute path.
//
// An absolute dictFileName is returned unchanged. A relative dictFileName is
// joined onto the current working directory and the joined path is cleaned.
// If the working directory cannot be determined, DictPath returns an empty
// path together with the error reported by os.Getwd.
func DictPath(dictFileName string) (string, error) {
	if filepath.IsAbs(dictFileName) {
		return dictFileName, nil
	}
	cwd, err := os.Getwd()
	if err != nil {
		return "", err
	}
	// filepath.Join already cleans its result, so no separate Clean call is
	// required here.
	return filepath.Join(cwd, dictFileName), nil
}
|
||||
|
||||
func ParseDictFile(dictFilePath string) (wtfs []*WordTagFreq, err error) {
|
||||
var dictFile *os.File
|
||||
dictFile, err = os.Open(dictFilePath)
|
||||
|
||||
44
jieba.go
44
jieba.go
@@ -48,30 +48,6 @@ func (rs routes) Swap(i, j int) {
|
||||
rs[i], rs[j] = rs[j], rs[i]
|
||||
}
|
||||
|
||||
// RegexpSplit splits sentence into consecutive pieces using r.
//
// Unlike regexp.Regexp.Split, the matched substrings are kept: the result
// alternates between the text found between matches and the matches
// themselves, in the order they appear in sentence. Concatenating the returned
// pieces yields sentence again. When r does not match at all, the result is a
// single-element slice holding sentence (even if sentence is empty).
func RegexpSplit(r *regexp.Regexp, sentence string) []string {
	locs := r.FindAllStringIndex(sentence, -1)
	if len(locs) == 0 {
		return []string{sentence}
	}
	// Every match contributes at most the gap before it plus the match
	// itself, and one trailing piece may follow the last match.
	result := make([]string, 0, 2*len(locs)+1)
	lastLoc := 0
	for _, loc := range locs {
		if loc[0] != lastLoc {
			// Unmatched text between the previous match and this one.
			result = append(result, sentence[lastLoc:loc[0]])
		}
		result = append(result, sentence[loc[0]:loc[1]])
		lastLoc = loc[1]
	}
	if lastLoc < len(sentence) {
		result = append(result, sentence[lastLoc:])
	}
	return result
}
|
||||
|
||||
// Build a directed acyclic graph (DAG) for sentence.
|
||||
func DAG(sentence string) map[int][]int {
|
||||
dag := make(map[int][]int)
|
||||
@@ -286,7 +262,6 @@ func Cut(sentence string, isCutAll bool, HMM bool) chan string {
|
||||
reHan = reHanDefault
|
||||
reSkip = reSkipDefault
|
||||
}
|
||||
blocks := RegexpSplit(reHan, sentence)
|
||||
var cut cutFunc
|
||||
if HMM {
|
||||
cut = cutDAG
|
||||
@@ -296,7 +271,7 @@ func Cut(sentence string, isCutAll bool, HMM bool) chan string {
|
||||
if isCutAll {
|
||||
cut = cutAll
|
||||
}
|
||||
for _, blk := range blocks {
|
||||
for blk := range RegexpSplit(reHan, sentence) {
|
||||
if len(blk) == 0 {
|
||||
continue
|
||||
}
|
||||
@@ -305,19 +280,26 @@ func Cut(sentence string, isCutAll bool, HMM bool) chan string {
|
||||
result <- x
|
||||
}
|
||||
} else {
|
||||
type skipSplitFunc func(sentence string) []string
|
||||
type skipSplitFunc func(sentence string) chan string
|
||||
var ssf skipSplitFunc
|
||||
if isCutAll {
|
||||
ssf = func(sentence string) []string {
|
||||
return reSkip.Split(sentence, -1)
|
||||
ssf = func(sentence string) chan string {
|
||||
ch := make(chan string)
|
||||
go func() {
|
||||
for _, s := range reSkip.Split(sentence, -1) {
|
||||
ch <- s
|
||||
}
|
||||
close(ch)
|
||||
}()
|
||||
return ch
|
||||
}
|
||||
} else {
|
||||
ssf = func(sentence string) []string {
|
||||
ssf = func(sentence string) chan string {
|
||||
return RegexpSplit(reSkip, sentence)
|
||||
}
|
||||
}
|
||||
|
||||
for _, x := range ssf(blk) {
|
||||
for x := range ssf(blk) {
|
||||
if reSkip.MatchString(x) {
|
||||
result <- x
|
||||
} else if !isCutAll {
|
||||
|
||||
@@ -644,13 +644,13 @@ func TestCutDAGNoHmm(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestRegexpSplit(t *testing.T) {
|
||||
result := RegexpSplit(regexp.MustCompile(`\p{Han}+`),
|
||||
"BP神经网络如何训练才能在分类时增加区分度?")
|
||||
result := chanToArray(RegexpSplit(regexp.MustCompile(`\p{Han}+`),
|
||||
"BP神经网络如何训练才能在分类时增加区分度?"))
|
||||
if len(result) != 3 {
|
||||
t.Error(result)
|
||||
}
|
||||
result = RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`),
|
||||
",BP神经网络如何训练才能在分类时#增加区分度?")
|
||||
result = chanToArray(RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`),
|
||||
",BP神经网络如何训练才能在分类时#增加区分度?"))
|
||||
if len(result) != 3 {
|
||||
t.Error(result)
|
||||
}
|
||||
|
||||
@@ -72,14 +72,13 @@ func cutDetail(sentence string) chan WordTag {
|
||||
result := make(chan WordTag)
|
||||
|
||||
go func() {
|
||||
blocks := jiebago.RegexpSplit(reHanDetail, sentence)
|
||||
for _, blk := range blocks {
|
||||
for blk := range jiebago.RegexpSplit(reHanDetail, sentence) {
|
||||
if reHanDetail.MatchString(blk) {
|
||||
for wordTag := range cutDetailInternal(blk) {
|
||||
result <- wordTag
|
||||
}
|
||||
} else {
|
||||
for _, x := range jiebago.RegexpSplit(reSkipDetail, blk) {
|
||||
for x := range jiebago.RegexpSplit(reSkipDetail, blk) {
|
||||
if len(x) == 0 {
|
||||
continue
|
||||
}
|
||||
@@ -242,7 +241,6 @@ func Cut(sentence string, HMM bool) chan WordTag {
|
||||
delete(jiebago.UserWordTagTab, key)
|
||||
}
|
||||
result := make(chan WordTag)
|
||||
blocks := jiebago.RegexpSplit(reHanInternal, sentence)
|
||||
var cut cutFunc
|
||||
if HMM {
|
||||
cut = cutDAG
|
||||
@@ -250,13 +248,13 @@ func Cut(sentence string, HMM bool) chan WordTag {
|
||||
cut = cutDAGNoHMM
|
||||
}
|
||||
go func() {
|
||||
for _, blk := range blocks {
|
||||
for blk := range jiebago.RegexpSplit(reHanInternal, sentence) {
|
||||
if reHanInternal.MatchString(blk) {
|
||||
for wordTag := range cut(blk) {
|
||||
result <- wordTag
|
||||
}
|
||||
} else {
|
||||
for _, x := range jiebago.RegexpSplit(reSkipInternal, blk) {
|
||||
for x := range jiebago.RegexpSplit(reSkipInternal, blk) {
|
||||
if reSkipInternal.MatchString(x) {
|
||||
result <- WordTag{x, "x"}
|
||||
} else {
|
||||
|
||||
157
util.go
Normal file
157
util.go
Normal file
@@ -0,0 +1,157 @@
|
||||
package jiebago
|
||||
|
||||
import (
|
||||
// "bufio"
|
||||
// "crypto/md5"
|
||||
// "encoding/gob"
|
||||
// "fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
// "strconv"
|
||||
// "strings"
|
||||
)
|
||||
|
||||
// DictPath resolves dictFileName to an absolute path.
//
// An absolute dictFileName is returned unchanged. A relative dictFileName is
// joined onto the current working directory and the joined path is cleaned.
// If the working directory cannot be determined, DictPath returns an empty
// path together with the error reported by os.Getwd.
func DictPath(dictFileName string) (string, error) {
	if filepath.IsAbs(dictFileName) {
		return dictFileName, nil
	}
	cwd, err := os.Getwd()
	if err != nil {
		return "", err
	}
	// filepath.Join already cleans its result, so no separate Clean call is
	// required here.
	return filepath.Join(cwd, dictFileName), nil
}
|
||||
|
||||
/*
|
||||
func cachePath(dictPath string) string {
|
||||
return filepath.Join(os.TempDir(),
|
||||
fmt.Sprintf("jieba.%x.cache", md5.Sum([]byte(f.dictFilePath))))
|
||||
}
|
||||
|
||||
func fileInfo(filePath string, missingOk bool) (*os.FileInfo, err) {
|
||||
fileInfo, err := os.Stat(filePath)
|
||||
if missingOk && err.Err == os.ErrNotExist {
|
||||
return fileInfo, nil
|
||||
}
|
||||
return fileInfo, err
|
||||
}
|
||||
|
||||
func isCached(dictPath, cachePath string) (bool, error) {
|
||||
dictFileInfo, err := fileInfo(dictPath, false)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
cacheFileInfo, err := fileInfo(cachePath, true)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
return cacheFileInfo.ModTime().After(dictFileInfo.ModTime()), nil
|
||||
}
|
||||
|
||||
func load(cachePath string, d DictLoader) error {
|
||||
dec := gob.NewDecoder(cacheFile)
|
||||
return dec.Decode(&d)
|
||||
}
|
||||
|
||||
func read(dictPath, d DictLoader, pos bool) error {
|
||||
dictFile, err := os.Open(dictFilePath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer dictFile.Close()
|
||||
scanner := bufio.NewScanner(dictFile)
|
||||
var token *Token
|
||||
var line string
|
||||
var fields []string
|
||||
for scanner.Scan() {
|
||||
line = scanner.Text()
|
||||
fields = strings.Split(line, " ")
|
||||
token = &Token{Term: strings.Replace(fields[0], "\ufeff", "", 1)}
|
||||
if length := len(fields); length > 1 {
|
||||
token.Freq, err = strconv.ParseFloat(fields[1], 64)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if pos && length > 2 {
|
||||
token.Pos = fields[2]
|
||||
}
|
||||
}
|
||||
d.Add(token)
|
||||
}
|
||||
return scanner.Err()
|
||||
}
|
||||
|
||||
func dump(cachePath string, d DictLoader) error {
|
||||
cacheFile, err = os.OpenFile(cachePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer cacheFile.Close()
|
||||
enc := gob.NewEncoder(cacheFile)
|
||||
return enc.Encode(d)
|
||||
}
|
||||
|
||||
func SetDict(s Segmenter, dictName string, pos bool) error {
|
||||
dictPath, err := DictPath(dictName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
cachePath = cachePath(dictPath)
|
||||
cached, err := isCached(dictPath, cachePath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if cached {
|
||||
err = load(cachePath, s)
|
||||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
cached = false
|
||||
}
|
||||
|
||||
err = read(dictPath, s, pos)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = dump(cachePath, s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
func LoadUserDict(dictName string, s Segmenter, pos bool) error {
|
||||
dictPath, err := DictPath(dictName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return read(dictPath, s, pos)
|
||||
}
|
||||
*/
|
||||
|
||||
// RegexpSplit splits sentence into consecutive pieces using r and delivers
// them, in order, on the returned channel.
//
// Unlike regexp.Regexp.Split, the matched substrings are kept: the channel
// alternates between the text found between matches and the matches
// themselves. Concatenating the received pieces yields sentence again. An
// empty sentence with no match produces no pieces at all.
//
// The returned channel is buffered, already holds every piece, and is closed
// before RegexpSplit returns. No goroutine is started, so a caller that stops
// ranging over the channel early does not leave a blocked sender behind.
func RegexpSplit(r *regexp.Regexp, sentence string) chan string {
	locs := r.FindAllStringIndex(sentence, -1)
	// Every match contributes at most the gap before it plus the match
	// itself, and one trailing piece may follow the last match; with this
	// capacity none of the sends below can block.
	result := make(chan string, 2*len(locs)+1)
	lastLoc := 0
	for _, loc := range locs {
		if loc[0] != lastLoc {
			// Unmatched text between the previous match and this one.
			result <- sentence[lastLoc:loc[0]]
		}
		result <- sentence[loc[0]:loc[1]]
		lastLoc = loc[1]
	}
	if lastLoc < len(sentence) {
		result <- sentence[lastLoc:]
	}
	close(result)
	return result
}
|
||||
Reference in New Issue
Block a user