From c2abc240834454595fcad3398c3d2b1159911d47 Mon Sep 17 00:00:00 2001 From: Wang Bin Date: Thu, 26 Feb 2015 14:59:14 +0800 Subject: [PATCH] small refactor --- posseg/posseg.go | 66 ++++++++----------------------------------- posseg/posseg_test.go | 8 +++--- 2 files changed, 15 insertions(+), 59 deletions(-) diff --git a/posseg/posseg.go b/posseg/posseg.go index bb984ce..d31d5ad 100644 --- a/posseg/posseg.go +++ b/posseg/posseg.go @@ -1,18 +1,12 @@ package posseg import ( - "bufio" - "fmt" "github.com/wangbin/jiebago" - "os" - "path/filepath" "regexp" - "runtime" - "strings" ) var ( - WordTagTab = make(map[string]string) + wordTagMap = make(map[string]string) reHanDetail = regexp.MustCompile(`\p{Han}+`) reSkipDetail = regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`) reEng = regexp.MustCompile(`[[:alnum:]]`) @@ -20,28 +14,12 @@ var ( reEng1 = regexp.MustCompile(`[[:alnum:]]$`) reHanInternal = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`) reSkipInternal = regexp.MustCompile(`(\r\n|\s)`) - dictionary = "dict.txt" ) type WordTag struct { Word, Tag string } -func (wt WordTag) String() string { - return fmt.Sprintf("%s/%s", wt.Word, wt.Tag) -} - -func init() { - _, filename, _, _ := runtime.Caller(1) - dict_dir := filepath.Dir(filepath.Dir(filename)) - dict_path := filepath.Join(dict_dir, dictionary) - err := load_model(dict_path) - if err != nil { - panic(err) - } -} - -/* func SetDictionary(dictFileName string) error { err := jiebago.SetDictionary(dictFileName) if err != nil { @@ -51,32 +29,10 @@ func SetDictionary(dictFileName string) error { if err != nil { return err } - dictFile, err := os.Open(dictFilePath) - if err != nil { - return err - } - defer dictFile.Close() + wtfs, err := jiebago.ParseDictFile(dictFilePath) - wtfs, err := ParseDictFile(dictFile) - -} -*/ -func load_model(f_name string) error { - file, openError := os.Open(f_name) - if openError != nil { - return openError - } - defer file.Close() - - scanner := bufio.NewScanner(file) - for scanner.Scan() { - line := scanner.Text() - words := strings.Split(strings.TrimSpace(line), " ") - word, tag := words[0], words[2] - WordTagTab[word] = tag - } - if err := scanner.Err(); err != nil { - return err + for _, wtf := range wtfs { + wordTagMap[wtf.Word] = wtf.Tag } return nil } @@ -157,7 +113,7 @@ func cut_DAG(sentence string) []WordTag { if len(buf) > 0 { if len(buf) == 1 { sbuf := string(buf) - if tag, ok := WordTagTab[sbuf]; ok { + if tag, ok := wordTagMap[sbuf]; ok { result = append(result, WordTag{sbuf, tag}) } else { result = append(result, WordTag{sbuf, "x"}) @@ -173,7 +129,7 @@ func cut_DAG(sentence string) []WordTag { } else { for _, elem := range buf { selem := string(elem) - if tag, ok := WordTagTab[selem]; ok { + if tag, ok := wordTagMap[selem]; ok { result = append(result, WordTag{string(elem), tag}) } else { result = append(result, WordTag{string(elem), "x"}) @@ -185,7 +141,7 @@ func cut_DAG(sentence string) []WordTag { } } sl_word := string(l_word) - if tag, ok := WordTagTab[sl_word]; ok { + if tag, ok := wordTagMap[sl_word]; ok { result = append(result, WordTag{sl_word, tag}) } else { result = append(result, WordTag{sl_word, "x"}) @@ -197,7 +153,7 @@ func cut_DAG(sentence string) []WordTag { if len(buf) > 0 { if len(buf) == 1 { sbuf := string(buf) - if tag, ok := WordTagTab[sbuf]; ok { + if tag, ok := wordTagMap[sbuf]; ok { result = append(result, WordTag{sbuf, tag}) } else { result = append(result, WordTag{sbuf, "x"}) @@ -212,7 +168,7 @@ func cut_DAG(sentence string) []WordTag { } else { for _, elem := range buf { selem := string(elem) - if tag, ok := WordTagTab[selem]; ok { + if tag, ok := wordTagMap[selem]; ok { result = append(result, WordTag{selem, tag}) } else { result = append(result, WordTag{selem, "x"}) @@ -248,7 +204,7 @@ func cut_DAG_NO_HMM(sentence string) []WordTag { buf = make([]rune, 0) } sl_word := string(l_word) - if tag, ok := WordTagTab[sl_word]; ok { + if tag, ok := wordTagMap[sl_word]; ok { result = append(result, WordTag{sl_word, tag}) } else { result = append(result, WordTag{sl_word, "x"}) @@ -303,7 +259,7 @@ func cut(sentence string, HMM bool) []WordTag { func Cut(sentence string, HMM bool) []WordTag { for key := range jiebago.UserWordTagTab { - WordTagTab[key] = jiebago.UserWordTagTab[key] + wordTagMap[key] = jiebago.UserWordTagTab[key] delete(jiebago.UserWordTagTab, key) } return cut(sentence, HMM) diff --git a/posseg/posseg_test.go b/posseg/posseg_test.go index 47faee1..6308ed1 100644 --- a/posseg/posseg_test.go +++ b/posseg/posseg_test.go @@ -269,7 +269,7 @@ var ( ) func TestCut(t *testing.T) { - jiebago.SetDictionary("../dict.txt") + SetDictionary("../dict.txt") for index, content := range test_contents { result := Cut(content, true) if len(defaultCutResult[index]) != len(result) { @@ -297,7 +297,7 @@ func TestBug132(t *testing.T) { /* https://github.com/fxsjy/jieba/issues/132 */ - jiebago.SetDictionary("../dict.txt") + SetDictionary("../dict.txt") sentence := "又跛又啞" cutResult := []WordTag{ WordTag{"又", "d"}, @@ -320,7 +320,7 @@ func TestBug137(t *testing.T) { /* https://github.com/fxsjy/jieba/issues/137 */ - jiebago.SetDictionary("../dict.txt") + SetDictionary("../dict.txt") sentence := "前港督衛奕信在八八年十月宣布成立中央政策研究組" cutResult := []WordTag{ WordTag{"前", "f"}, @@ -349,7 +349,7 @@ func TestBug137(t *testing.T) { } func TestUserDict(t *testing.T) { - jiebago.SetDictionary("../dict.txt") + SetDictionary("../dict.txt") jiebago.LoadUserDict("../userdict.txt") sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"