mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-30 09:00:30 +08:00
Fixed the bug reported in issue #132 of the original jieba issue tracker
- Issue details: https://github.com/fxsjy/jieba/issues/132 - Updated tests - Also some code refactoring
This commit is contained in:
@@ -750,8 +750,9 @@ func TestLoadUserDict(t *testing.T) {
|
|||||||
SetDictionary("dict.txt")
|
SetDictionary("dict.txt")
|
||||||
LoadUserDict("userdict.txt")
|
LoadUserDict("userdict.txt")
|
||||||
|
|
||||||
sentence := "李小福是创新办主任也是云计算方面的专家;例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
||||||
result := []string{"\u674e\u5c0f\u798f", "\u662f", "\u521b\u65b0\u529e", "\u4e3b\u4efb", "\u4e5f", "\u662f", "\u4e91\u8ba1\u7b97", "\u65b9\u9762", "\u7684", "\u4e13\u5bb6", ";", "\u4f8b\u5982", "\u6211", "\u8f93\u5165", "\u4e00\u4e2a", "\u5e26", "\u201c", "\u97e9\u7389\u8d4f\u9274", "\u201d", "\u7684", "\u6807\u9898", "\uff0c", "\u5728", "\u81ea\u5b9a\u4e49\u8bcd", "\u5e93\u4e2d", "\u4e5f", "\u589e\u52a0", "\u4e86", "\u6b64", "\u8bcd\u4e3a", "N", "\u7c7b\u578b"}
|
result := []string{"\u674e\u5c0f\u798f", "\u662f", "\u521b\u65b0\u529e", "\u4e3b\u4efb", "\u4e5f", "\u662f", "\u4e91\u8ba1\u7b97", "\u65b9\u9762", "\u7684", "\u4e13\u5bb6", ";", " ", "\u4ec0\u4e48", "\u662f", "\u516b\u4e00\u53cc\u9e7f", "\u4f8b\u5982", "\u6211", "\u8f93\u5165", "\u4e00\u4e2a", "\u5e26", "\u201c", "\u97e9\u7389\u8d4f\u9274", "\u201d", "\u7684", "\u6807\u9898", "\uff0c", "\u5728", "\u81ea\u5b9a\u4e49\u8bcd", "\u5e93\u4e2d", "\u4e5f", "\u589e\u52a0", "\u4e86", "\u6b64", "\u8bcd\u4e3a", "N", "\u7c7b\u578b"}
|
||||||
|
|
||||||
words := Cut(sentence, false, true)
|
words := Cut(sentence, false, true)
|
||||||
if len(words) != len(result) {
|
if len(words) != len(result) {
|
||||||
t.Error(len(words))
|
t.Error(len(words))
|
||||||
|
|||||||
@@ -23740,6 +23740,7 @@ func init() {
|
|||||||
CharStateTab['\u8ddb'] = []StateTag{
|
CharStateTab['\u8ddb'] = []StateTag{
|
||||||
StateTag{'B', "n"},
|
StateTag{'B', "n"},
|
||||||
StateTag{'B', "v"},
|
StateTag{'B', "v"},
|
||||||
|
StateTag{'S', "a"},
|
||||||
}
|
}
|
||||||
CharStateTab['\u4edd'] = []StateTag{
|
CharStateTab['\u4edd'] = []StateTag{
|
||||||
StateTag{'E', "nr"},
|
StateTag{'E', "nr"},
|
||||||
|
|||||||
@@ -12,8 +12,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
WordTagTab = make(map[string]string)
|
WordTagTab = make(map[string]string)
|
||||||
isUserDictLoaded = false
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type WordTag struct {
|
type WordTag struct {
|
||||||
@@ -282,11 +281,9 @@ func cut(sentence string, HMM bool) []WordTag {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func Cut(sentence string, HMM bool) []WordTag {
|
func Cut(sentence string, HMM bool) []WordTag {
|
||||||
if !isUserDictLoaded {
|
for key := range jiebago.UserWordTagTab {
|
||||||
for key, value := range jiebago.UserWordTagTab {
|
WordTagTab[key] = jiebago.UserWordTagTab[key]
|
||||||
WordTagTab[key] = value
|
delete(jiebago.UserWordTagTab, key)
|
||||||
}
|
|
||||||
isUserDictLoaded = true
|
|
||||||
}
|
}
|
||||||
return cut(sentence, HMM)
|
return cut(sentence, HMM)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -292,3 +292,81 @@ func TestCut(t *testing.T) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestUserDict(t *testing.T) {
|
||||||
|
jiebago.SetDictionary("../dict.txt")
|
||||||
|
jiebago.LoadUserDict("../userdict.txt")
|
||||||
|
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
||||||
|
cutResult := []WordTag{
|
||||||
|
WordTag{"\u674e\u5c0f\u798f", "nr"},
|
||||||
|
WordTag{"\u662f", "v"},
|
||||||
|
WordTag{"\u521b\u65b0\u529e", "i"},
|
||||||
|
WordTag{"\u4e3b\u4efb", "b"},
|
||||||
|
WordTag{"\u4e5f", "d"},
|
||||||
|
WordTag{"\u662f", "v"},
|
||||||
|
WordTag{"\u4e91\u8ba1\u7b97", "x"},
|
||||||
|
WordTag{"\u65b9\u9762", "n"},
|
||||||
|
WordTag{"\u7684", "uj"},
|
||||||
|
WordTag{"\u4e13\u5bb6", "n"},
|
||||||
|
WordTag{";", "x"},
|
||||||
|
WordTag{" ", "x"},
|
||||||
|
WordTag{"\u4ec0\u4e48", "r"},
|
||||||
|
WordTag{"\u662f", "v"},
|
||||||
|
WordTag{"\u516b\u4e00\u53cc\u9e7f", "nz"},
|
||||||
|
WordTag{"\u4f8b\u5982", "v"},
|
||||||
|
WordTag{"\u6211", "r"},
|
||||||
|
WordTag{"\u8f93\u5165", "v"},
|
||||||
|
WordTag{"\u4e00\u4e2a", "m"},
|
||||||
|
WordTag{"\u5e26", "v"},
|
||||||
|
WordTag{"\u201c", "x"},
|
||||||
|
WordTag{"\u97e9\u7389\u8d4f\u9274", "nz"},
|
||||||
|
WordTag{"\u201d", "x"},
|
||||||
|
WordTag{"\u7684", "uj"},
|
||||||
|
WordTag{"\u6807\u9898", "n"},
|
||||||
|
WordTag{"\uff0c", "x"},
|
||||||
|
WordTag{"\u5728", "p"},
|
||||||
|
WordTag{"\u81ea\u5b9a\u4e49\u8bcd", "n"},
|
||||||
|
WordTag{"\u5e93\u4e2d", "nrt"},
|
||||||
|
WordTag{"\u4e5f", "d"},
|
||||||
|
WordTag{"\u589e\u52a0", "v"},
|
||||||
|
WordTag{"\u4e86", "ul"},
|
||||||
|
WordTag{"\u6b64", "r"},
|
||||||
|
WordTag{"\u8bcd", "n"},
|
||||||
|
WordTag{"\u4e3a", "p"},
|
||||||
|
WordTag{"N", "eng"},
|
||||||
|
WordTag{"\u7c7b\u578b", "n"},
|
||||||
|
}
|
||||||
|
|
||||||
|
result := Cut(sentence, true)
|
||||||
|
if len(cutResult) != len(result) {
|
||||||
|
t.Error(result)
|
||||||
|
}
|
||||||
|
for i, _ := range result {
|
||||||
|
if result[i] != cutResult[i] {
|
||||||
|
t.Error(result[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBug132(t *testing.T) {
|
||||||
|
/*
|
||||||
|
https://github.com/fxsjy/jieba/issues/132
|
||||||
|
*/
|
||||||
|
jiebago.SetDictionary("../dict.txt")
|
||||||
|
sentence := "又跛又啞"
|
||||||
|
cutResult := []WordTag{
|
||||||
|
WordTag{"\u53c8", "d"},
|
||||||
|
WordTag{"\u8ddb", "a"},
|
||||||
|
WordTag{"\u53c8", "d"},
|
||||||
|
WordTag{"\u555e", "v"},
|
||||||
|
}
|
||||||
|
result := Cut(sentence, true)
|
||||||
|
if len(cutResult) != len(result) {
|
||||||
|
t.Error(result)
|
||||||
|
}
|
||||||
|
for i, _ := range result {
|
||||||
|
if result[i] != cutResult[i] {
|
||||||
|
t.Error(result[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -86,6 +86,11 @@ func Viterbi(obs []rune) (float64, []StateTag) {
|
|||||||
obs_states = append(obs_states, tmp_obs_states[index])
|
obs_states = append(obs_states, tmp_obs_states[index])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if len(obs_states) == 0 {
|
||||||
|
for key := range prev_states_expect_next {
|
||||||
|
obs_states = append(obs_states, key)
|
||||||
|
}
|
||||||
|
}
|
||||||
if len(obs_states) == 0 {
|
if len(obs_states) == 0 {
|
||||||
obs_states = ProbTransKeys
|
obs_states = ProbTransKeys
|
||||||
}
|
}
|
||||||
|
|||||||
23
trie_node.go
23
trie_node.go
@@ -108,17 +108,22 @@ func LoadUserDict(file_path string) error {
|
|||||||
}
|
}
|
||||||
defer file.Close()
|
defer file.Close()
|
||||||
|
|
||||||
reader := bufio.NewReader(file)
|
scanner := bufio.NewScanner(file)
|
||||||
for {
|
for scanner.Scan() {
|
||||||
line, readError := reader.ReadString('\n')
|
line := scanner.Text()
|
||||||
if readError != nil && len(line) == 0 {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
words := strings.Split(line, " ")
|
words := strings.Split(line, " ")
|
||||||
word, freqStr := words[0], words[1]
|
word, freqStr := words[0], words[1]
|
||||||
word = strings.Replace(word, "\ufeff", "", 1)
|
word = strings.Replace(word, "\ufeff", "", 1)
|
||||||
freq, _ := strconv.ParseFloat(freqStr, 64)
|
freq, freqErr := strconv.ParseFloat(freqStr, 64)
|
||||||
TT.addWord(word, freq)
|
if freqErr != nil {
|
||||||
|
continue // TODO: how to handle wrong type of frequency?
|
||||||
|
}
|
||||||
|
tag := ""
|
||||||
|
if len(words) == 3 {
|
||||||
|
tag = words[2]
|
||||||
|
}
|
||||||
|
addWord(word, freq, tag)
|
||||||
}
|
}
|
||||||
return nil
|
|
||||||
|
return scanner.Err()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,3 +4,4 @@
|
|||||||
easy_install 3 eng
|
easy_install 3 eng
|
||||||
好用 300
|
好用 300
|
||||||
韩玉赏鉴 3 nz
|
韩玉赏鉴 3 nz
|
||||||
|
八一双鹿 3 nz
|
||||||
|
|||||||
Reference in New Issue
Block a user