mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 00:32:51 +08:00
fs.File -> io.Reader
This commit is contained in:
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
[GoDoc](https://godoc.org/github.com/fumiama/jieba)
|
[GoDoc](https://godoc.org/github.com/fumiama/jieba)
|
||||||
|
|
||||||
[结巴分词](https://github.com/fxsjy/jieba) 是由 [@fxsjy](https://github.com/fxsjy) 使用 Python 编写的中文分词组件,本仓库是结巴分词的 Golang 语言实现,修改于[jiebago](https://github.com/wangbin/jiebago),大幅优化了速度与性能,增加了从`fs.File`加载字典等功能。
|
[结巴分词](https://github.com/fxsjy/jieba) 是由 [@fxsjy](https://github.com/fxsjy) 使用 Python 编写的中文分词组件,本仓库是结巴分词的 Golang 语言实现,修改于[jiebago](https://github.com/wangbin/jiebago),大幅优化了速度与性能,增加了从`io.Reader`加载字典等功能。
|
||||||
|
|
||||||
|
|
||||||
## 使用
|
## 使用
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
package analyse
|
package analyse
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"io/fs"
|
"io"
|
||||||
"sort"
|
"sort"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
@@ -39,7 +39,7 @@ func (i *Idf) Load(tokens ...dictionary.Token) {
|
|||||||
i.Unlock()
|
i.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (i *Idf) loadDictionary(file fs.File) error {
|
func (i *Idf) loadDictionary(file io.Reader) error {
|
||||||
return dictionary.LoadDictionary(i, file)
|
return dictionary.LoadDictionary(i, file)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
package analyse
|
package analyse
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"io/fs"
|
"io"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
"github.com/fumiama/jieba/dictionary"
|
"github.com/fumiama/jieba/dictionary"
|
||||||
@@ -83,7 +83,7 @@ func (s *StopWord) Load(tokens ...dictionary.Token) {
|
|||||||
s.Unlock()
|
s.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *StopWord) loadDictionary(file fs.File) error {
|
func (s *StopWord) loadDictionary(file io.Reader) error {
|
||||||
return dictionary.LoadDictionary(s, file)
|
return dictionary.LoadDictionary(s, file)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
package analyse
|
package analyse
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"io/fs"
|
"io"
|
||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
@@ -53,7 +53,7 @@ type TagExtracter struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// LoadDictionary reads the given file and creates a new dictionary.
|
// LoadDictionary reads the given file and creates a new dictionary.
|
||||||
func (t *TagExtracter) LoadDictionary(file fs.File) (err error) {
|
func (t *TagExtracter) LoadDictionary(file io.Reader) (err error) {
|
||||||
t.stopWord = NewStopWord()
|
t.stopWord = NewStopWord()
|
||||||
t.seg, err = jieba.LoadDictionary(file)
|
t.seg, err = jieba.LoadDictionary(file)
|
||||||
return
|
return
|
||||||
@@ -67,7 +67,7 @@ func (t *TagExtracter) LoadDictionaryAt(file string) (err error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// LoadIdf reads the given file and creates a new Idf dictionary.
|
// LoadIdf reads the given file and creates a new Idf dictionary.
|
||||||
func (t *TagExtracter) LoadIdf(file fs.File) error {
|
func (t *TagExtracter) LoadIdf(file io.Reader) error {
|
||||||
t.idf = NewIdf()
|
t.idf = NewIdf()
|
||||||
return t.idf.loadDictionary(file)
|
return t.idf.loadDictionary(file)
|
||||||
}
|
}
|
||||||
@@ -79,7 +79,7 @@ func (t *TagExtracter) LoadIdfAt(fileName string) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// LoadStopWords reads the given file and creates a new StopWord dictionary.
|
// LoadStopWords reads the given file and creates a new StopWord dictionary.
|
||||||
func (t *TagExtracter) LoadStopWords(file fs.File) error {
|
func (t *TagExtracter) LoadStopWords(file io.Reader) error {
|
||||||
t.stopWord = NewStopWord()
|
t.stopWord = NewStopWord()
|
||||||
return t.stopWord.loadDictionary(file)
|
return t.stopWord.loadDictionary(file)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ package analyse
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"hash/crc64"
|
"hash/crc64"
|
||||||
"io/fs"
|
"io"
|
||||||
"math"
|
"math"
|
||||||
"sort"
|
"sort"
|
||||||
|
|
||||||
@@ -171,7 +171,7 @@ func (t *TextRanker) TextRank(sentence string, topK int) Segments {
|
|||||||
type TextRanker posseg.Segmenter
|
type TextRanker posseg.Segmenter
|
||||||
|
|
||||||
// NewTextRanker reads a given file and creates a new dictionary for TextRanker.
|
// NewTextRanker reads a given file and creates a new dictionary for TextRanker.
|
||||||
func NewTextRanker(file fs.File) (*TextRanker, error) {
|
func NewTextRanker(file io.Reader) (*TextRanker, error) {
|
||||||
seg, err := posseg.LoadDictionary(file)
|
seg, err := posseg.LoadDictionary(file)
|
||||||
return (*TextRanker)(seg), err
|
return (*TextRanker)(seg), err
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
package jieba
|
package jieba
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"io/fs"
|
"io"
|
||||||
"math"
|
"math"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
@@ -58,7 +58,7 @@ func (d *Dictionary) Frequency(key string) (float64, bool) {
|
|||||||
return freq, ok
|
return freq, ok
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *Dictionary) loadDictionary(file fs.File) error {
|
func (d *Dictionary) loadDictionary(file io.Reader) error {
|
||||||
return dictionary.LoadDictionary(d, file)
|
return dictionary.LoadDictionary(d, file)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ package dictionary
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"io/fs"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -16,7 +16,7 @@ type DictLoader interface {
|
|||||||
AddToken(Token)
|
AddToken(Token)
|
||||||
}
|
}
|
||||||
|
|
||||||
func loadDictionary(file fs.File) (tokens []Token, err error) {
|
func loadDictionary(file io.Reader) (tokens []Token, err error) {
|
||||||
scanner := bufio.NewScanner(file)
|
scanner := bufio.NewScanner(file)
|
||||||
var token Token
|
var token Token
|
||||||
var line string
|
var line string
|
||||||
@@ -44,7 +44,7 @@ func loadDictionary(file fs.File) (tokens []Token, err error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// LoadDictionary reads the given file and passes all tokens to a DictLoader.
|
// LoadDictionary reads the given file and passes all tokens to a DictLoader.
|
||||||
func LoadDictionary(dl DictLoader, file fs.File) error {
|
func LoadDictionary(dl DictLoader, file io.Reader) error {
|
||||||
tokens, err := loadDictionary(file)
|
tokens, err := loadDictionary(file)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
|
|||||||
6
jieba.go
6
jieba.go
@@ -2,7 +2,7 @@
|
|||||||
package jieba
|
package jieba
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"io/fs"
|
"io"
|
||||||
"math"
|
"math"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -92,7 +92,7 @@ func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
|
|||||||
|
|
||||||
// LoadDictionary loads a dictionary from the given file. Every time
|
// LoadDictionary loads a dictionary from the given file. Every time
|
||||||
// LoadDictionary is called, the previously loaded dictionary will be cleared.
|
// LoadDictionary is called, the previously loaded dictionary will be cleared.
|
||||||
func LoadDictionary(file fs.File) (*Segmenter, error) {
|
func LoadDictionary(file io.Reader) (*Segmenter, error) {
|
||||||
d := &Dictionary{freqMap: make(map[string]float64)}
|
d := &Dictionary{freqMap: make(map[string]float64)}
|
||||||
err := d.loadDictionary(file)
|
err := d.loadDictionary(file)
|
||||||
return (*Segmenter)(d), err
|
return (*Segmenter)(d), err
|
||||||
@@ -109,7 +109,7 @@ func LoadDictionaryAt(file string) (*Segmenter, error) {
|
|||||||
// LoadUserDictionary loads a user specified dictionary, it must be called
|
// LoadUserDictionary loads a user specified dictionary, it must be called
|
||||||
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
||||||
// instead it will override existing entries.
|
// instead it will override existing entries.
|
||||||
func (seg *Segmenter) LoadUserDictionary(file fs.File) error {
|
func (seg *Segmenter) LoadUserDictionary(file io.Reader) error {
|
||||||
return (*Dictionary)(seg).loadDictionary(file)
|
return (*Dictionary)(seg).loadDictionary(file)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
package posseg
|
package posseg
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"io/fs"
|
"io"
|
||||||
"math"
|
"math"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
@@ -70,7 +70,7 @@ func (d *Dictionary) Pos(key string) (string, bool) {
|
|||||||
return pos, ok
|
return pos, ok
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *Dictionary) loadDictionary(file fs.File) error {
|
func (d *Dictionary) loadDictionary(file io.Reader) error {
|
||||||
return dictionary.LoadDictionary(d, file)
|
return dictionary.LoadDictionary(d, file)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
package posseg
|
package posseg
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"io/fs"
|
"io"
|
||||||
"math"
|
"math"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
|
||||||
@@ -39,7 +39,7 @@ type Segmenter Dictionary
|
|||||||
|
|
||||||
// LoadDictionary loads a dictionary from the given file.
|
// LoadDictionary loads a dictionary from the given file.
|
||||||
// Every time LoadDictionaryAt is called, the previously loaded dictionary will be cleared.
|
// Every time LoadDictionaryAt is called, the previously loaded dictionary will be cleared.
|
||||||
func LoadDictionary(file fs.File) (*Segmenter, error) {
|
func LoadDictionary(file io.Reader) (*Segmenter, error) {
|
||||||
dict := &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
dict := &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
|
||||||
err := dict.loadDictionary(file)
|
err := dict.loadDictionary(file)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -62,7 +62,7 @@ func LoadDictionaryAt(file string) (*Segmenter, error) {
|
|||||||
// LoadUserDictionary loads a user specified dictionary, it must be called
|
// LoadUserDictionary loads a user specified dictionary, it must be called
|
||||||
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
// after LoadDictionary, and it will not clear any previous loaded dictionary,
|
||||||
// instead it will override existing entries.
|
// instead it will override existing entries.
|
||||||
func (seg *Segmenter) LoadUserDictionary(file fs.File) error {
|
func (seg *Segmenter) LoadUserDictionary(file io.Reader) error {
|
||||||
return (*Dictionary)(seg).loadDictionary(file)
|
return (*Dictionary)(seg).loadDictionary(file)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
package tokenizers
|
package tokenizers
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"io/fs"
|
"io"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
|
||||||
@@ -42,7 +42,7 @@ Parameters:
|
|||||||
"交换机" as a single word. If searchMode is true, it will further split
|
"交换机" as a single word. If searchMode is true, it will further split
|
||||||
this word into "交换", "换机", which are valid Chinese words.
|
this word into "交换", "换机", which are valid Chinese words.
|
||||||
*/
|
*/
|
||||||
func NewJiebaTokenizer(dictFile fs.File, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
func NewJiebaTokenizer(dictFile io.Reader, hmm, searchMode bool) (analysis.Tokenizer, error) {
|
||||||
seg, err := jieba.LoadDictionary(dictFile)
|
seg, err := jieba.LoadDictionary(dictFile)
|
||||||
return &JiebaTokenizer{
|
return &JiebaTokenizer{
|
||||||
seg: seg,
|
seg: seg,
|
||||||
@@ -131,7 +131,7 @@ JiebaTokenizerConstructor creates a JiebaTokenizer.
|
|||||||
|
|
||||||
Parameter config should contain at least one parameter:
|
Parameter config should contain at least one parameter:
|
||||||
|
|
||||||
file: the path of the dictionary file or fs.File.
|
file: the path of the dictionary file or io.Reader.
|
||||||
|
|
||||||
hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
|
hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
|
||||||
|
|
||||||
@@ -150,7 +150,7 @@ func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Ca
|
|||||||
if ok {
|
if ok {
|
||||||
return NewJiebaTokenizerAt(dictFilePath, hmm, searchMode)
|
return NewJiebaTokenizerAt(dictFilePath, hmm, searchMode)
|
||||||
}
|
}
|
||||||
dictFile := config["file"].(fs.File)
|
dictFile := config["file"].(io.Reader)
|
||||||
return NewJiebaTokenizer(dictFile, hmm, searchMode)
|
return NewJiebaTokenizer(dictFile, hmm, searchMode)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user