diff --git a/analyse/analyse.go b/analyse/analyse.go index 47423c8..83707a4 100644 --- a/analyse/analyse.go +++ b/analyse/analyse.go @@ -41,6 +41,10 @@ type TagExtracter struct { stopWords map[string]int } +func (t *TagExtracter) AddEntry(entry *jiebago.Entry) { + t.stopWords[entry.Word] = 1 +} + func NewTagExtracter(dictFileName, IDFFileName string) (*TagExtracter, error) { j, err := jiebago.NewJieba(dictFileName) if err != nil { @@ -61,11 +65,7 @@ func (t *TagExtracter) SetStopWords(stopWordsFileName string) error { return err } - wtfs, err := jiebago.ParseDictFile(stopWordsFilePath) - for _, wtf := range wtfs { - t.stopWords[wtf.Word] = 1 - } - return nil + return jiebago.LoadDict(t, stopWordsFilePath, false) } // Keyword extraction. diff --git a/analyse/idf.go b/analyse/idf.go index a706cc0..c3b6f40 100644 --- a/analyse/idf.go +++ b/analyse/idf.go @@ -8,6 +8,12 @@ import ( type IDFLoader struct { IDFFreq map[string]float64 Median float64 + freqs []float64 +} + +func (l *IDFLoader) AddEntry(entry *jiebago.Entry) { + l.IDFFreq[entry.Word] = entry.Freq + l.freqs = append(l.freqs, entry.Freq) } func NewIDFLoader(IDFFileName string) (*IDFLoader, error) { @@ -15,18 +21,14 @@ func NewIDFLoader(IDFFileName string) (*IDFLoader, error) { if err != nil { return nil, err } - wtfs, err := jiebago.ParseDictFile(IDFFilePath) + loader := &IDFLoader{make(map[string]float64), 0.0, make([]float64, 0)} + err = jiebago.LoadDict(loader, IDFFilePath, false) if err != nil { return nil, err } - freqs := make([]float64, len(wtfs)) - loader := &IDFLoader{make(map[string]float64), 0.0} - for index, wtf := range wtfs { - loader.IDFFreq[wtf.Word] = wtf.Freq - freqs[index] = wtf.Freq - } - sort.Float64s(freqs) - loader.Median = freqs[len(freqs)/2] + sort.Float64s(loader.freqs) + loader.Median = loader.freqs[len(loader.freqs)/2] + loader.freqs = []float64{} return loader, nil } diff --git a/dict.go b/dict.go deleted file mode 100644 index 5dd5fe9..0000000 --- a/dict.go +++ /dev/null @@ -1,38 +0,0 @@ -package jiebago - -import ( - "bufio" - "os" - "strconv" - "strings" -) - -func ParseDictFile(dictFilePath string) ([]*Entry, error) { - dictFile, err := os.Open(dictFilePath) - if err != nil { - return nil, err - } - defer dictFile.Close() - entries := make([]*Entry, 0) - scanner := bufio.NewScanner(dictFile) - for scanner.Scan() { - line := scanner.Text() - fields := strings.Split(line, " ") - length := len(fields) - word := fields[0] - word = strings.Replace(word, "\ufeff", "", 1) - entry := NewEntry() - entry.Word = word - if length > 1 { - entry.Freq, err = strconv.ParseFloat(fields[1], 64) - if err != nil { - return nil, err - } - } - if length > 2 { - entry.Flag = fields[2] - } - entries = append(entries, entry) - } - return entries, scanner.Err() -} diff --git a/dictionary.go b/dictionary.go index 97bb9ac..a0bd55b 100644 --- a/dictionary.go +++ b/dictionary.go @@ -14,7 +14,6 @@ func NewEntry() *Entry { return &Entry{new(Pair), 0.0} } -type Loader interface { - AddEntry(Entry) - CachePath(string) string +type DictLoader interface { + AddEntry(*Entry) } diff --git a/posseg/posseg.go b/posseg/posseg.go index f5b281e..00b694a 100644 --- a/posseg/posseg.go +++ b/posseg/posseg.go @@ -26,8 +26,8 @@ type Posseg struct { } func (p *Posseg) AddEntry(entry *jiebago.Entry) { - if len(entry.Tag) > 0 { - p.Flag[Entry.Word] = strings.TrimSpace(Entry.Flag) + if len(entry.Flag) > 0 { + p.Flag[entry.Word] = strings.TrimSpace(entry.Flag) } p.Add(entry.Word, entry.Freq) } @@ -41,24 +41,13 @@ func NewPosseg(dictFileName string) (*Posseg, error) { if err != nil { return nil, err } - wtfs, err := jiebago.ParseDictFile(dictFilePath) - - for _, wtf := range wtfs { - p.Add(wtf) - } - return p, nil + err = jiebago.LoadDict(p, dictFilePath, true) + return p, err } // Load user specified dictionary file. func (p *Posseg) LoadUserDict(dictFilePath string) error { - wtfs, err := jiebago.ParseDictFile(dictFilePath) - if err != nil { - return err - } - for _, wtf := range wtfs { - p.Add(wtf) - } - return nil + return jiebago.LoadDict(p, dictFilePath, true) } func (p *Posseg) cutDetailInternal(sentence string) chan WordTag { diff --git a/trie.go b/trie.go index 5351024..b214728 100644 --- a/trie.go +++ b/trie.go @@ -60,14 +60,10 @@ func (j *Jieba) load(dictFileName string) error { } if !isDictCached { - entries, err := ParseDictFile(dictFilePath) + err = LoadDict(j, dictFilePath, false) if err != nil { return err } - - for _, entry := range entries { - j.AddEntry(entry) - } // dump trie cacheFile, err = os.OpenFile(cacheFilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) if err != nil { @@ -103,14 +99,7 @@ func (j *Jieba) Add(word string, freq float64) { // Load user specified dictionary file. func (j *Jieba) LoadUserDict(dictFilePath string) error { - entries, err := ParseDictFile(dictFilePath) - if err != nil { - return err - } - for _, entry := range entries { - j.Add(entry.Word, entry.Freq) - } - return nil + return LoadDict(j, dictFilePath, false) } // Set the dictionary, could be absolute path of dictionary file, or dictionary diff --git a/util.go b/util.go index 51a3652..a9e0e1f 100644 --- a/util.go +++ b/util.go @@ -1,15 +1,15 @@ package jiebago import ( - // "bufio" + "bufio" // "crypto/md5" // "encoding/gob" // "fmt" "os" "path/filepath" "regexp" - // "strconv" - // "strings" + "strconv" + "strings" ) func DictPath(dictFileName string) (string, error) { @@ -25,6 +25,35 @@ func DictPath(dictFileName string) (string, error) { return dictFilePath, nil } +func LoadDict(l DictLoader, dictFilePath string, usingFlag bool) error { + dictFile, err := os.Open(dictFilePath) + if err != nil { + return err + } + defer dictFile.Close() + scanner := bufio.NewScanner(dictFile) + var entry *Entry + var line string + var fields []string + for scanner.Scan() { + line = scanner.Text() + fields = strings.Split(line, " ") + entry = NewEntry() + entry.Word = strings.Replace(fields[0], "\ufeff", "", 1) + if length := len(fields); length > 1 { + entry.Freq, err = strconv.ParseFloat(fields[1], 64) + if err != nil { + return err + } + if usingFlag && length > 2 { + entry.Flag = fields[2] + } + } + l.AddEntry(entry) + } + return scanner.Err() +} + /* func cachePath(dictPath string) string { return filepath.Join(os.TempDir(), @@ -56,34 +85,6 @@ func load(cachePath string, d DictLoader) error { return dec.Decode(&d) } -func read(dictPath, d DictLoader, pos bool) error { - dictFile, err := os.Open(dictFilePath) - if err != nil { - return err - } - defer dictFile.Close() - scanner := bufio.NewScanner(dictFile) - var token *Token - var line string - var fields []string - for scanner.Scan() { - line = scanner.Text() - fields = strings.Split(line, " ") - token = &Token{Term: strings.Replace(fields[0], "\ufeff", "", 1)} - if length := len(fields); length > 1 { - token.Freq, err = strconv.ParseFloat(fields[1], 64) - if err != nil { - return err - } - if pos && length > 2 { - token.Pos = fields[2] - } - } - d.Add(token) - } - return scanner.Err() -} - func dump(cachePath string, d DictLoader) error { cacheFile, err = os.OpenFile(cachePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) if err != nil {