1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00
Files
jieba/util.go

152 lines
3.1 KiB
Go

package jiebago
import (
"bufio"
"crypto/md5"
"encoding/gob"
"fmt"
"os"
"path/filepath"
"regexp"
"strconv"
"strings"
)
func DictPath(dictFileName string) (string, error) {
if filepath.IsAbs(dictFileName) {
return dictFileName, nil
}
var dictFilePath string
cwd, err := os.Getwd()
if err != nil {
return dictFilePath, err
}
dictFilePath = filepath.Clean(filepath.Join(cwd, dictFileName))
return dictFilePath, nil
}
func LoadDict(l DictLoader, dictFilePath string, usingFlag bool) error {
dictFile, err := os.Open(dictFilePath)
if err != nil {
return err
}
defer dictFile.Close()
scanner := bufio.NewScanner(dictFile)
var entry *Entry
var line string
var fields []string
for scanner.Scan() {
line = scanner.Text()
fields = strings.Split(line, " ")
entry = NewEntry()
entry.Word = strings.Replace(fields[0], "\ufeff", "", 1)
if length := len(fields); length > 1 {
entry.Freq, err = strconv.ParseFloat(fields[1], 64)
if err != nil {
return err
}
if usingFlag && length > 2 {
entry.Flag = fields[2]
}
}
l.AddEntry(entry)
}
return scanner.Err()
}
func cachePath(dictPath string) string {
return filepath.Join(os.TempDir(),
fmt.Sprintf("jieba.%x.cache", md5.Sum([]byte(dictPath))))
}
func fileInfo(filePath string, missingOk bool) (os.FileInfo, error) {
fi, err := os.Stat(filePath)
if missingOk && err == os.ErrNotExist {
return fi, nil
}
return fi, err
}
func cached(dictPath, cachePath string) (bool, error) {
dictFileInfo, err := fileInfo(dictPath, false)
if err != nil {
return false, err
}
cacheFileInfo, err := fileInfo(cachePath, true)
if err != nil {
return false, err
}
return cacheFileInfo.ModTime().After(dictFileInfo.ModTime()), nil
}
func load(l DictLoader, cachePath string) error {
cacheFile, err := os.Open(cachePath)
if err != nil {
return err
}
defer cacheFile.Close()
dec := gob.NewDecoder(cacheFile)
return dec.Decode(&l)
}
func dump(l DictLoader, cachePath string) error {
cacheFile, err := os.OpenFile(cachePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
if err != nil {
return err
}
defer cacheFile.Close()
enc := gob.NewEncoder(cacheFile)
return enc.Encode(l)
}
func SetDict(l DictLoader, dictName string, pos bool) error {
dictPath, err := DictPath(dictName)
if err != nil {
return err
}
cachePath := cachePath(dictPath)
cached, err := cached(dictPath, cachePath)
if err != nil {
return err
}
if cached {
err = load(l, cachePath)
if err == nil {
return nil
}
cached = false
}
err = LoadDict(l, dictPath, pos)
if err != nil {
return err
}
return dump(l, cachePath)
}
// Split sentence using regular expression.
func RegexpSplit(r *regexp.Regexp, sentence string) chan string {
result := make(chan string)
go func() {
locs := r.FindAllStringIndex(sentence, -1)
lastLoc := 0
for _, loc := range locs {
if loc[0] == lastLoc {
result <- sentence[loc[0]:loc[1]]
} else {
result <- sentence[lastLoc:loc[0]]
result <- sentence[loc[0]:loc[1]]
}
lastLoc = loc[1]
}
if lastLoc < len(sentence) {
result <- sentence[lastLoc:]
}
close(result)
}()
return result
}