1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-06 01:00:37 +08:00
Files
jieba/util.go

158 lines
3.2 KiB
Go

package jiebago
import (
// "bufio"
// "crypto/md5"
// "encoding/gob"
// "fmt"
"os"
"path/filepath"
"regexp"
// "strconv"
// "strings"
)
// DictPath resolves dictFileName to an absolute path.
//
// An absolute dictFileName is returned unchanged (it is not cleaned). A
// relative one is joined onto the current working directory; filepath.Join
// cleans the joined result. If the working directory cannot be determined,
// DictPath returns an empty path and the error from os.Getwd.
func DictPath(dictFileName string) (string, error) {
	if !filepath.IsAbs(dictFileName) {
		wd, err := os.Getwd()
		if err != nil {
			return "", err
		}
		// Join already applies Clean to its result, so no extra call is needed.
		return filepath.Join(wd, dictFileName), nil
	}
	return dictFileName, nil
}
/*
func cachePath(dictPath string) string {
return filepath.Join(os.TempDir(),
fmt.Sprintf("jieba.%x.cache", md5.Sum([]byte(f.dictFilePath))))
}
func fileInfo(filePath string, missingOk bool) (*os.FileInfo, err) {
fileInfo, err := os.Stat(filePath)
if missingOk && err.Err == os.ErrNotExist {
return fileInfo, nil
}
return fileInfo, err
}
func isCached(dictPath, cachePath string) (bool, error) {
dictFileInfo, err := fileInfo(dictPath, false)
if err != nil {
return false, err
}
cacheFileInfo, err := fileInfo(cachePath, true)
if err != nil {
return false, err
}
return cacheFileInfo.ModTime().After(dictFileInfo.ModTime()), nil
}
func load(cachePath string, d DictLoader) error {
dec := gob.NewDecoder(cacheFile)
return dec.Decode(&d)
}
func read(dictPath, d DictLoader, pos bool) error {
dictFile, err := os.Open(dictFilePath)
if err != nil {
return err
}
defer dictFile.Close()
scanner := bufio.NewScanner(dictFile)
var token *Token
var line string
var fields []string
for scanner.Scan() {
line = scanner.Text()
fields = strings.Split(line, " ")
token = &Token{Term: strings.Replace(fields[0], "\ufeff", "", 1)}
if length := len(fields); length > 1 {
token.Freq, err = strconv.ParseFloat(fields[1], 64)
if err != nil {
return err
}
if pos && length > 2 {
token.Pos = fields[2]
}
}
d.Add(token)
}
return scanner.Err()
}
func dump(cachePath string, d DictLoader) error {
cacheFile, err = os.OpenFile(cachePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
if err != nil {
return err
}
defer cacheFile.Close()
enc := gob.NewEncoder(cacheFile)
return enc.Encode(d)
}
func SetDict(s Segmenter, dictName string, pos bool) error {
dictPath, err := DictPath(dictName)
if err != nil {
return err
}
cachePath = cachePath(dictPath)
cached, err := isCached(dictPath, cachePath)
if err != nil {
return err
}
if cached {
err = load(cachePath, s)
if err == nil {
return nil
}
cached = false
}
err = read(dictPath, s, pos)
if err != nil {
return err
}
err = dump(cachePath, s)
if err != nil {
return err
}
}
func LoadUserDict(dictName string, s Segmenter, pos bool) error {
dictPath, err := DictPath(dictName)
if err != nil {
return err
}
return read(dictPath, s, pos)
}
*/
// RegexpSplit splits sentence around every match of r and delivers the
// pieces, in their original order, on the returned channel.
//
// Both the matched substrings and the unmatched text between them are
// delivered: for each match, any text between the previous match and this one
// is sent first, followed by the match itself. Text remaining after the last
// match is sent last. The channel is closed once every piece has been queued,
// so callers can simply range over it.
//
// The channel is buffered to hold every piece and is filled before it is
// returned. No goroutine is started, so a caller that stops reading early
// does not leave a producer blocked forever on a send.
func RegexpSplit(r *regexp.Regexp, sentence string) chan string {
	locs := r.FindAllStringIndex(sentence, -1)

	// Each match contributes at most two pieces (preceding gap + the match),
	// plus one possible trailing piece; with this capacity no send can block.
	result := make(chan string, 2*len(locs)+1)

	lastLoc := 0
	for _, loc := range locs {
		if loc[0] != lastLoc {
			// Unmatched text between the previous match and this one.
			result <- sentence[lastLoc:loc[0]]
		}
		result <- sentence[loc[0]:loc[1]]
		lastLoc = loc[1]
	}
	if lastLoc < len(sentence) {
		result <- sentence[lastLoc:]
	}
	close(result)
	return result
}