diff --git a/jieba_test.go b/jieba_test.go
index 7163224..a2c807a 100644
--- a/jieba_test.go
+++ b/jieba_test.go
@@ -750,8 +750,9 @@ func TestLoadUserDict(t *testing.T) {
 	SetDictionary("dict.txt")
 	LoadUserDict("userdict.txt")
 
-	sentence := "李小福是创新办主任也是云计算方面的专家;例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
-	result := []string{"\u674e\u5c0f\u798f", "\u662f", "\u521b\u65b0\u529e", "\u4e3b\u4efb", "\u4e5f", "\u662f", "\u4e91\u8ba1\u7b97", "\u65b9\u9762", "\u7684", "\u4e13\u5bb6", ";", "\u4f8b\u5982", "\u6211", "\u8f93\u5165", "\u4e00\u4e2a", "\u5e26", "\u201c", "\u97e9\u7389\u8d4f\u9274", "\u201d", "\u7684", "\u6807\u9898", "\uff0c", "\u5728", "\u81ea\u5b9a\u4e49\u8bcd", "\u5e93\u4e2d", "\u4e5f", "\u589e\u52a0", "\u4e86", "\u6b64", "\u8bcd\u4e3a", "N", "\u7c7b\u578b"}
+	sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
+	result := []string{"\u674e\u5c0f\u798f", "\u662f", "\u521b\u65b0\u529e", "\u4e3b\u4efb", "\u4e5f", "\u662f", "\u4e91\u8ba1\u7b97", "\u65b9\u9762", "\u7684", "\u4e13\u5bb6", ";", " ", "\u4ec0\u4e48", "\u662f", "\u516b\u4e00\u53cc\u9e7f", "\u4f8b\u5982", "\u6211", "\u8f93\u5165", "\u4e00\u4e2a", "\u5e26", "\u201c", "\u97e9\u7389\u8d4f\u9274", "\u201d", "\u7684", "\u6807\u9898", "\uff0c", "\u5728", "\u81ea\u5b9a\u4e49\u8bcd", "\u5e93\u4e2d", "\u4e5f", "\u589e\u52a0", "\u4e86", "\u6b64", "\u8bcd\u4e3a", "N", "\u7c7b\u578b"}
+
 	words := Cut(sentence, false, true)
 	if len(words) != len(result) {
 		t.Error(len(words))
diff --git a/posseg/char_state_tab.go b/posseg/char_state_tab.go
index 0e3ddcd..165b34d 100644
--- a/posseg/char_state_tab.go
+++ b/posseg/char_state_tab.go
@@ -23740,6 +23740,7 @@ func init() {
 	CharStateTab['\u8ddb'] = []StateTag{
 		StateTag{'B', "n"},
 		StateTag{'B', "v"},
+		StateTag{'S', "a"},
 	}
 	CharStateTab['\u4edd'] = []StateTag{
 		StateTag{'E', "nr"},
diff --git a/posseg/posseg.go b/posseg/posseg.go
index 270f53e..18c178c 100644
--- a/posseg/posseg.go
+++ b/posseg/posseg.go
@@ -12,8 +12,7 @@ import (
 )
 
 var (
-	WordTagTab       = make(map[string]string)
-	isUserDictLoaded = false
+	WordTagTab = make(map[string]string)
 )
 
 type WordTag struct {
@@ -282,11 +281,11 @@ func cut(sentence string, HMM bool) []WordTag {
 }
 
+// Cut moves any pending entries from jiebago.UserWordTagTab into WordTagTab
+// and then delegates to cut with the same arguments.
 func Cut(sentence string, HMM bool) []WordTag {
-	if !isUserDictLoaded {
-		for key, value := range jiebago.UserWordTagTab {
-			WordTagTab[key] = value
-		}
-		isUserDictLoaded = true
+	for key, tag := range jiebago.UserWordTagTab {
+		WordTagTab[key] = tag
+		delete(jiebago.UserWordTagTab, key)
 	}
 	return cut(sentence, HMM)
 }
diff --git a/posseg/posseg_test.go b/posseg/posseg_test.go
index 620dd4e..4cd51db 100644
--- a/posseg/posseg_test.go
+++ b/posseg/posseg_test.go
@@ -292,3 +292,82 @@ func TestCut(t *testing.T) {
 	}
 
 }
+
+// TestUserDict checks that Cut reports the tags listed in ../userdict.txt for user-defined words.
+func TestUserDict(t *testing.T) {
+	jiebago.SetDictionary("../dict.txt")
+	jiebago.LoadUserDict("../userdict.txt")
+	sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
+	cutResult := []WordTag{
+		WordTag{"\u674e\u5c0f\u798f", "nr"},
+		WordTag{"\u662f", "v"},
+		WordTag{"\u521b\u65b0\u529e", "i"},
+		WordTag{"\u4e3b\u4efb", "b"},
+		WordTag{"\u4e5f", "d"},
+		WordTag{"\u662f", "v"},
+		WordTag{"\u4e91\u8ba1\u7b97", "x"},
+		WordTag{"\u65b9\u9762", "n"},
+		WordTag{"\u7684", "uj"},
+		WordTag{"\u4e13\u5bb6", "n"},
+		WordTag{";", "x"},
+		WordTag{" ", "x"},
+		WordTag{"\u4ec0\u4e48", "r"},
+		WordTag{"\u662f", "v"},
+		WordTag{"\u516b\u4e00\u53cc\u9e7f", "nz"},
+		WordTag{"\u4f8b\u5982", "v"},
+		WordTag{"\u6211", "r"},
+		WordTag{"\u8f93\u5165", "v"},
+		WordTag{"\u4e00\u4e2a", "m"},
+		WordTag{"\u5e26", "v"},
+		WordTag{"\u201c", "x"},
+		WordTag{"\u97e9\u7389\u8d4f\u9274", "nz"},
+		WordTag{"\u201d", "x"},
+		WordTag{"\u7684", "uj"},
+		WordTag{"\u6807\u9898", "n"},
+		WordTag{"\uff0c", "x"},
+		WordTag{"\u5728", "p"},
+		WordTag{"\u81ea\u5b9a\u4e49\u8bcd", "n"},
+		WordTag{"\u5e93\u4e2d", "nrt"},
+		WordTag{"\u4e5f", "d"},
+		WordTag{"\u589e\u52a0", "v"},
+		WordTag{"\u4e86", "ul"},
+		WordTag{"\u6b64", "r"},
+		WordTag{"\u8bcd", "n"},
+		WordTag{"\u4e3a", "p"},
+		WordTag{"N", "eng"},
+		WordTag{"\u7c7b\u578b", "n"},
+	}
+
+	result := Cut(sentence, true)
+	if len(cutResult) != len(result) {
+		t.Fatal(result)
+	}
+	for i := range result {
+		if result[i] != cutResult[i] {
+			t.Error(result[i])
+		}
+	}
+}
+
+func TestBug132(t *testing.T) {
+	/*
+		https://github.com/fxsjy/jieba/issues/132
+	*/
+	jiebago.SetDictionary("../dict.txt")
+	sentence := "又跛又啞"
+	cutResult := []WordTag{
+		WordTag{"\u53c8", "d"},
+		WordTag{"\u8ddb", "a"},
+		WordTag{"\u53c8", "d"},
+		WordTag{"\u555e", "v"},
+	}
+	result := Cut(sentence, true)
+	if len(cutResult) != len(result) {
+		t.Fatal(result)
+	}
+	for i := range result {
+		if result[i] != cutResult[i] {
+			t.Error(result[i])
+		}
+	}
+}
diff --git a/posseg/viterbi.go b/posseg/viterbi.go
index b5cecd2..745ae73 100644
--- a/posseg/viterbi.go
+++ b/posseg/viterbi.go
@@ -86,6 +86,11 @@ func Viterbi(obs []rune) (float64, []StateTag) {
 				obs_states = append(obs_states, tmp_obs_states[index])
 			}
 		}
+		if len(obs_states) == 0 {
+			for key := range prev_states_expect_next {
+				obs_states = append(obs_states, key)
+			}
+		}
 		if len(obs_states) == 0 {
 			obs_states = ProbTransKeys
 		}
diff --git a/trie_node.go b/trie_node.go
index b3bf182..d3c8c04 100644
--- a/trie_node.go
+++ b/trie_node.go
@@ -108,17 +108,25 @@ func LoadUserDict(file_path string) error {
 	}
 	defer file.Close()
 
-	reader := bufio.NewReader(file)
-	for {
-		line, readError := reader.ReadString('\n')
-		if readError != nil && len(line) == 0 {
-			break
-		}
+	scanner := bufio.NewScanner(file)
+	for scanner.Scan() {
+		line := scanner.Text()
 		words := strings.Split(line, " ")
+		if len(words) < 2 {
+			continue // blank or malformed line: there is no frequency field to read
+		}
 		word, freqStr := words[0], words[1]
 		word = strings.Replace(word, "\ufeff", "", 1)
-		freq, _ := strconv.ParseFloat(freqStr, 64)
-		TT.addWord(word, freq)
+		freq, freqErr := strconv.ParseFloat(freqStr, 64)
+		if freqErr != nil {
+			continue // skip entries whose frequency is not a valid number
+		}
+		tag := ""
+		if len(words) >= 3 {
+			tag = words[2]
+		}
+		addWord(word, freq, tag)
 	}
-	return nil
+
+	return scanner.Err()
 }
diff --git a/userdict.txt b/userdict.txt
index 73d2d58..9c831dc 100644
--- a/userdict.txt
+++ b/userdict.txt
@@ -3,4 +3,5 @@
 创新办 3 i
 easy_install 3 eng
 好用 300
-韩玉赏鉴 3 nz
\ No newline at end of file
+韩玉赏鉴 3 nz
+八一双鹿 3 nz