1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-08 02:00:24 +08:00

small refactor

This commit is contained in:
Wang Bin
2015-02-26 14:59:14 +08:00
parent 35094877da
commit c2abc24083
2 changed files with 15 additions and 59 deletions

View File

@@ -1,18 +1,12 @@
package posseg
import (
"bufio"
"fmt"
"github.com/wangbin/jiebago"
"os"
"path/filepath"
"regexp"
"runtime"
"strings"
)
var (
WordTagTab = make(map[string]string)
wordTagMap = make(map[string]string)
reHanDetail = regexp.MustCompile(`\p{Han}+`)
reSkipDetail = regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`)
reEng = regexp.MustCompile(`[[:alnum:]]`)
@@ -20,28 +14,12 @@ var (
reEng1 = regexp.MustCompile(`[[:alnum:]]$`)
reHanInternal = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
reSkipInternal = regexp.MustCompile(`(\r\n|\s)`)
dictionary = "dict.txt"
)
// WordTag couples a word with the tag assigned to it.
type WordTag struct {
	Word string
	Tag  string
}

// String formats the pair as "<Word>/<Tag>".
func (pair WordTag) String() string {
	return fmt.Sprintf("%s/%s", pair.Word, pair.Tag)
}
// init resolves the dictionary file relative to this source file (the parent
// of the directory containing it, joined with the dictionary name) and passes
// that path to load_model. It panics if the location cannot be determined or
// if load_model returns an error, since init has no way to report failure.
func init() {
	// Caller(0) reports this file itself. A skip of 1 would report whoever
	// invokes init, which is not guaranteed to be a file in this repository,
	// so the dictionary path would depend on toolchain internals.
	_, filename, _, ok := runtime.Caller(0)
	if !ok {
		panic("posseg: cannot determine source file location to find " + dictionary)
	}
	dictPath := filepath.Join(filepath.Dir(filepath.Dir(filename)), dictionary)
	if err := load_model(dictPath); err != nil {
		panic(fmt.Errorf("posseg: loading dictionary %q: %v", dictPath, err))
	}
}
/*
func SetDictionary(dictFileName string) error {
err := jiebago.SetDictionary(dictFileName)
if err != nil {
@@ -51,32 +29,10 @@ func SetDictionary(dictFileName string) error {
if err != nil {
return err
}
dictFile, err := os.Open(dictFilePath)
if err != nil {
return err
}
defer dictFile.Close()
wtfs, err := jiebago.ParseDictFile(dictFilePath)
wtfs, err := ParseDictFile(dictFile)
}
*/
func load_model(f_name string) error {
file, openError := os.Open(f_name)
if openError != nil {
return openError
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
words := strings.Split(strings.TrimSpace(line), " ")
word, tag := words[0], words[2]
WordTagTab[word] = tag
}
if err := scanner.Err(); err != nil {
return err
for _, wtf := range wtfs {
wordTagMap[wtf.Word] = wtf.Tag
}
return nil
}
@@ -157,7 +113,7 @@ func cut_DAG(sentence string) []WordTag {
if len(buf) > 0 {
if len(buf) == 1 {
sbuf := string(buf)
if tag, ok := WordTagTab[sbuf]; ok {
if tag, ok := wordTagMap[sbuf]; ok {
result = append(result, WordTag{sbuf, tag})
} else {
result = append(result, WordTag{sbuf, "x"})
@@ -173,7 +129,7 @@ func cut_DAG(sentence string) []WordTag {
} else {
for _, elem := range buf {
selem := string(elem)
if tag, ok := WordTagTab[selem]; ok {
if tag, ok := wordTagMap[selem]; ok {
result = append(result, WordTag{string(elem), tag})
} else {
result = append(result, WordTag{string(elem), "x"})
@@ -185,7 +141,7 @@ func cut_DAG(sentence string) []WordTag {
}
}
sl_word := string(l_word)
if tag, ok := WordTagTab[sl_word]; ok {
if tag, ok := wordTagMap[sl_word]; ok {
result = append(result, WordTag{sl_word, tag})
} else {
result = append(result, WordTag{sl_word, "x"})
@@ -197,7 +153,7 @@ func cut_DAG(sentence string) []WordTag {
if len(buf) > 0 {
if len(buf) == 1 {
sbuf := string(buf)
if tag, ok := WordTagTab[sbuf]; ok {
if tag, ok := wordTagMap[sbuf]; ok {
result = append(result, WordTag{sbuf, tag})
} else {
result = append(result, WordTag{sbuf, "x"})
@@ -212,7 +168,7 @@ func cut_DAG(sentence string) []WordTag {
} else {
for _, elem := range buf {
selem := string(elem)
if tag, ok := WordTagTab[selem]; ok {
if tag, ok := wordTagMap[selem]; ok {
result = append(result, WordTag{selem, tag})
} else {
result = append(result, WordTag{selem, "x"})
@@ -248,7 +204,7 @@ func cut_DAG_NO_HMM(sentence string) []WordTag {
buf = make([]rune, 0)
}
sl_word := string(l_word)
if tag, ok := WordTagTab[sl_word]; ok {
if tag, ok := wordTagMap[sl_word]; ok {
result = append(result, WordTag{sl_word, tag})
} else {
result = append(result, WordTag{sl_word, "x"})
@@ -303,7 +259,7 @@ func cut(sentence string, HMM bool) []WordTag {
func Cut(sentence string, HMM bool) []WordTag {
for key := range jiebago.UserWordTagTab {
WordTagTab[key] = jiebago.UserWordTagTab[key]
wordTagMap[key] = jiebago.UserWordTagTab[key]
delete(jiebago.UserWordTagTab, key)
}
return cut(sentence, HMM)

View File

@@ -269,7 +269,7 @@ var (
)
func TestCut(t *testing.T) {
jiebago.SetDictionary("../dict.txt")
SetDictionary("../dict.txt")
for index, content := range test_contents {
result := Cut(content, true)
if len(defaultCutResult[index]) != len(result) {
@@ -297,7 +297,7 @@ func TestBug132(t *testing.T) {
/*
https://github.com/fxsjy/jieba/issues/132
*/
jiebago.SetDictionary("../dict.txt")
SetDictionary("../dict.txt")
sentence := "又跛又啞"
cutResult := []WordTag{
WordTag{"又", "d"},
@@ -320,7 +320,7 @@ func TestBug137(t *testing.T) {
/*
https://github.com/fxsjy/jieba/issues/137
*/
jiebago.SetDictionary("../dict.txt")
SetDictionary("../dict.txt")
sentence := "前港督衛奕信在八八年十月宣布成立中央政策研究組"
cutResult := []WordTag{
WordTag{"前", "f"},
@@ -349,7 +349,7 @@ func TestBug137(t *testing.T) {
}
func TestUserDict(t *testing.T) {
jiebago.SetDictionary("../dict.txt")
SetDictionary("../dict.txt")
jiebago.LoadUserDict("../userdict.txt")
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型"