mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-08 02:00:24 +08:00
small refactor
This commit is contained in:
@@ -1,18 +1,12 @@
|
||||
package posseg
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"github.com/wangbin/jiebago"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"runtime"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var (
|
||||
WordTagTab = make(map[string]string)
|
||||
wordTagMap = make(map[string]string)
|
||||
reHanDetail = regexp.MustCompile(`\p{Han}+`)
|
||||
reSkipDetail = regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`)
|
||||
reEng = regexp.MustCompile(`[[:alnum:]]`)
|
||||
@@ -20,28 +14,12 @@ var (
|
||||
reEng1 = regexp.MustCompile(`[[:alnum:]]$`)
|
||||
reHanInternal = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
|
||||
reSkipInternal = regexp.MustCompile(`(\r\n|\s)`)
|
||||
dictionary = "dict.txt"
|
||||
)
|
||||
|
||||
type WordTag struct {
|
||||
Word, Tag string
|
||||
}
|
||||
|
||||
func (wt WordTag) String() string {
|
||||
return fmt.Sprintf("%s/%s", wt.Word, wt.Tag)
|
||||
}
|
||||
|
||||
func init() {
|
||||
_, filename, _, _ := runtime.Caller(1)
|
||||
dict_dir := filepath.Dir(filepath.Dir(filename))
|
||||
dict_path := filepath.Join(dict_dir, dictionary)
|
||||
err := load_model(dict_path)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
func SetDictionary(dictFileName string) error {
|
||||
err := jiebago.SetDictionary(dictFileName)
|
||||
if err != nil {
|
||||
@@ -51,32 +29,10 @@ func SetDictionary(dictFileName string) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
dictFile, err := os.Open(dictFilePath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer dictFile.Close()
|
||||
wtfs, err := jiebago.ParseDictFile(dictFilePath)
|
||||
|
||||
wtfs, err := ParseDictFile(dictFile)
|
||||
|
||||
}
|
||||
*/
|
||||
func load_model(f_name string) error {
|
||||
file, openError := os.Open(f_name)
|
||||
if openError != nil {
|
||||
return openError
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
words := strings.Split(strings.TrimSpace(line), " ")
|
||||
word, tag := words[0], words[2]
|
||||
WordTagTab[word] = tag
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
return err
|
||||
for _, wtf := range wtfs {
|
||||
wordTagMap[wtf.Word] = wtf.Tag
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -157,7 +113,7 @@ func cut_DAG(sentence string) []WordTag {
|
||||
if len(buf) > 0 {
|
||||
if len(buf) == 1 {
|
||||
sbuf := string(buf)
|
||||
if tag, ok := WordTagTab[sbuf]; ok {
|
||||
if tag, ok := wordTagMap[sbuf]; ok {
|
||||
result = append(result, WordTag{sbuf, tag})
|
||||
} else {
|
||||
result = append(result, WordTag{sbuf, "x"})
|
||||
@@ -173,7 +129,7 @@ func cut_DAG(sentence string) []WordTag {
|
||||
} else {
|
||||
for _, elem := range buf {
|
||||
selem := string(elem)
|
||||
if tag, ok := WordTagTab[selem]; ok {
|
||||
if tag, ok := wordTagMap[selem]; ok {
|
||||
result = append(result, WordTag{string(elem), tag})
|
||||
} else {
|
||||
result = append(result, WordTag{string(elem), "x"})
|
||||
@@ -185,7 +141,7 @@ func cut_DAG(sentence string) []WordTag {
|
||||
}
|
||||
}
|
||||
sl_word := string(l_word)
|
||||
if tag, ok := WordTagTab[sl_word]; ok {
|
||||
if tag, ok := wordTagMap[sl_word]; ok {
|
||||
result = append(result, WordTag{sl_word, tag})
|
||||
} else {
|
||||
result = append(result, WordTag{sl_word, "x"})
|
||||
@@ -197,7 +153,7 @@ func cut_DAG(sentence string) []WordTag {
|
||||
if len(buf) > 0 {
|
||||
if len(buf) == 1 {
|
||||
sbuf := string(buf)
|
||||
if tag, ok := WordTagTab[sbuf]; ok {
|
||||
if tag, ok := wordTagMap[sbuf]; ok {
|
||||
result = append(result, WordTag{sbuf, tag})
|
||||
} else {
|
||||
result = append(result, WordTag{sbuf, "x"})
|
||||
@@ -212,7 +168,7 @@ func cut_DAG(sentence string) []WordTag {
|
||||
} else {
|
||||
for _, elem := range buf {
|
||||
selem := string(elem)
|
||||
if tag, ok := WordTagTab[selem]; ok {
|
||||
if tag, ok := wordTagMap[selem]; ok {
|
||||
result = append(result, WordTag{selem, tag})
|
||||
} else {
|
||||
result = append(result, WordTag{selem, "x"})
|
||||
@@ -248,7 +204,7 @@ func cut_DAG_NO_HMM(sentence string) []WordTag {
|
||||
buf = make([]rune, 0)
|
||||
}
|
||||
sl_word := string(l_word)
|
||||
if tag, ok := WordTagTab[sl_word]; ok {
|
||||
if tag, ok := wordTagMap[sl_word]; ok {
|
||||
result = append(result, WordTag{sl_word, tag})
|
||||
} else {
|
||||
result = append(result, WordTag{sl_word, "x"})
|
||||
@@ -303,7 +259,7 @@ func cut(sentence string, HMM bool) []WordTag {
|
||||
|
||||
func Cut(sentence string, HMM bool) []WordTag {
|
||||
for key := range jiebago.UserWordTagTab {
|
||||
WordTagTab[key] = jiebago.UserWordTagTab[key]
|
||||
wordTagMap[key] = jiebago.UserWordTagTab[key]
|
||||
delete(jiebago.UserWordTagTab, key)
|
||||
}
|
||||
return cut(sentence, HMM)
|
||||
|
||||
@@ -269,7 +269,7 @@ var (
|
||||
)
|
||||
|
||||
func TestCut(t *testing.T) {
|
||||
jiebago.SetDictionary("../dict.txt")
|
||||
SetDictionary("../dict.txt")
|
||||
for index, content := range test_contents {
|
||||
result := Cut(content, true)
|
||||
if len(defaultCutResult[index]) != len(result) {
|
||||
@@ -297,7 +297,7 @@ func TestBug132(t *testing.T) {
|
||||
/*
|
||||
https://github.com/fxsjy/jieba/issues/132
|
||||
*/
|
||||
jiebago.SetDictionary("../dict.txt")
|
||||
SetDictionary("../dict.txt")
|
||||
sentence := "又跛又啞"
|
||||
cutResult := []WordTag{
|
||||
WordTag{"又", "d"},
|
||||
@@ -320,7 +320,7 @@ func TestBug137(t *testing.T) {
|
||||
/*
|
||||
https://github.com/fxsjy/jieba/issues/137
|
||||
*/
|
||||
jiebago.SetDictionary("../dict.txt")
|
||||
SetDictionary("../dict.txt")
|
||||
sentence := "前港督衛奕信在八八年十月宣布成立中央政策研究組"
|
||||
cutResult := []WordTag{
|
||||
WordTag{"前", "f"},
|
||||
@@ -349,7 +349,7 @@ func TestBug137(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestUserDict(t *testing.T) {
|
||||
jiebago.SetDictionary("../dict.txt")
|
||||
SetDictionary("../dict.txt")
|
||||
jiebago.LoadUserDict("../userdict.txt")
|
||||
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user