1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00

fs.File -> io.Reader

This commit is contained in:
源文雨
2022-12-03 10:54:06 +08:00
parent 35ac98dc5f
commit 36c17a10b5
11 changed files with 28 additions and 28 deletions

View File

@@ -2,7 +2,7 @@
[![GoDoc](https://godoc.org/github.com/fumiama/jieba?status.svg)](https://godoc.org/github.com/fumiama/jieba) [![GoDoc](https://godoc.org/github.com/fumiama/jieba?status.svg)](https://godoc.org/github.com/fumiama/jieba)
[结巴分词](https://github.com/fxsjy/jieba) 是由 [@fxsjy](https://github.com/fxsjy) 使用 Python 编写的中文分词组件,本仓库是结巴分词的 Golang 语言实现,修改于[jiebago](https://github.com/wangbin/jiebago),大幅优化了速度与性能,增加了从`fs.File`加载字典等功能。 [结巴分词](https://github.com/fxsjy/jieba) 是由 [@fxsjy](https://github.com/fxsjy) 使用 Python 编写的中文分词组件,本仓库是结巴分词的 Golang 语言实现,修改于[jiebago](https://github.com/wangbin/jiebago),大幅优化了速度与性能,增加了从`io.Reader`加载字典等功能。
## 使用 ## 使用

View File

@@ -1,7 +1,7 @@
package analyse package analyse
import ( import (
"io/fs" "io"
"sort" "sort"
"sync" "sync"
@@ -39,7 +39,7 @@ func (i *Idf) Load(tokens ...dictionary.Token) {
i.Unlock() i.Unlock()
} }
func (i *Idf) loadDictionary(file fs.File) error { func (i *Idf) loadDictionary(file io.Reader) error {
return dictionary.LoadDictionary(i, file) return dictionary.LoadDictionary(i, file)
} }

View File

@@ -1,7 +1,7 @@
package analyse package analyse
import ( import (
"io/fs" "io"
"sync" "sync"
"github.com/fumiama/jieba/dictionary" "github.com/fumiama/jieba/dictionary"
@@ -83,7 +83,7 @@ func (s *StopWord) Load(tokens ...dictionary.Token) {
s.Unlock() s.Unlock()
} }
func (s *StopWord) loadDictionary(file fs.File) error { func (s *StopWord) loadDictionary(file io.Reader) error {
return dictionary.LoadDictionary(s, file) return dictionary.LoadDictionary(s, file)
} }

View File

@@ -2,7 +2,7 @@
package analyse package analyse
import ( import (
"io/fs" "io"
"sort" "sort"
"strings" "strings"
"unicode/utf8" "unicode/utf8"
@@ -53,7 +53,7 @@ type TagExtracter struct {
} }
// LoadDictionary reads the given filename and creates a new dictionary. // LoadDictionary reads the given filename and creates a new dictionary.
func (t *TagExtracter) LoadDictionary(file fs.File) (err error) { func (t *TagExtracter) LoadDictionary(file io.Reader) (err error) {
t.stopWord = NewStopWord() t.stopWord = NewStopWord()
t.seg, err = jieba.LoadDictionary(file) t.seg, err = jieba.LoadDictionary(file)
return return
@@ -67,7 +67,7 @@ func (t *TagExtracter) LoadDictionaryAt(file string) (err error) {
} }
// LoadIdf reads the given file and creates a new Idf dictionary. // LoadIdf reads the given file and creates a new Idf dictionary.
func (t *TagExtracter) LoadIdf(file fs.File) error { func (t *TagExtracter) LoadIdf(file io.Reader) error {
t.idf = NewIdf() t.idf = NewIdf()
return t.idf.loadDictionary(file) return t.idf.loadDictionary(file)
} }
@@ -79,7 +79,7 @@ func (t *TagExtracter) LoadIdfAt(fileName string) error {
} }
// LoadStopWords reads the given file and creates a new StopWord dictionary. // LoadStopWords reads the given file and creates a new StopWord dictionary.
func (t *TagExtracter) LoadStopWords(file fs.File) error { func (t *TagExtracter) LoadStopWords(file io.Reader) error {
t.stopWord = NewStopWord() t.stopWord = NewStopWord()
return t.stopWord.loadDictionary(file) return t.stopWord.loadDictionary(file)
} }

View File

@@ -2,7 +2,7 @@ package analyse
import ( import (
"hash/crc64" "hash/crc64"
"io/fs" "io"
"math" "math"
"sort" "sort"
@@ -171,7 +171,7 @@ func (t *TextRanker) TextRank(sentence string, topK int) Segments {
type TextRanker posseg.Segmenter type TextRanker posseg.Segmenter
// NewTextRanker reads a given file and creates a new dictionary file for Textranker. // NewTextRanker reads a given file and creates a new dictionary file for Textranker.
func NewTextRanker(file fs.File) (*TextRanker, error) { func NewTextRanker(file io.Reader) (*TextRanker, error) {
seg, err := posseg.LoadDictionary(file) seg, err := posseg.LoadDictionary(file)
return (*TextRanker)(seg), err return (*TextRanker)(seg), err
} }

View File

@@ -1,7 +1,7 @@
package jieba package jieba
import ( import (
"io/fs" "io"
"math" "math"
"sync" "sync"
@@ -58,7 +58,7 @@ func (d *Dictionary) Frequency(key string) (float64, bool) {
return freq, ok return freq, ok
} }
func (d *Dictionary) loadDictionary(file fs.File) error { func (d *Dictionary) loadDictionary(file io.Reader) error {
return dictionary.LoadDictionary(d, file) return dictionary.LoadDictionary(d, file)
} }

View File

@@ -4,7 +4,7 @@ package dictionary
import ( import (
"bufio" "bufio"
"io/fs" "io"
"os" "os"
"strconv" "strconv"
"strings" "strings"
@@ -16,7 +16,7 @@ type DictLoader interface {
AddToken(Token) AddToken(Token)
} }
func loadDictionary(file fs.File) (tokens []Token, err error) { func loadDictionary(file io.Reader) (tokens []Token, err error) {
scanner := bufio.NewScanner(file) scanner := bufio.NewScanner(file)
var token Token var token Token
var line string var line string
@@ -44,7 +44,7 @@ func loadDictionary(file fs.File) (tokens []Token, err error) {
} }
// LoadDictionary reads the given file and passes all tokens to a DictLoader. // LoadDictionary reads the given file and passes all tokens to a DictLoader.
func LoadDictionary(dl DictLoader, file fs.File) error { func LoadDictionary(dl DictLoader, file io.Reader) error {
tokens, err := loadDictionary(file) tokens, err := loadDictionary(file)
if err != nil { if err != nil {
return err return err

View File

@@ -2,7 +2,7 @@
package jieba package jieba
import ( import (
"io/fs" "io"
"math" "math"
"regexp" "regexp"
"strings" "strings"
@@ -92,7 +92,7 @@ func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
// LoadDictionary loads dictionary from given file name. Every time // LoadDictionary loads dictionary from given file name. Every time
// LoadDictionary is called, previously loaded dictionary will be cleared. // LoadDictionary is called, previously loaded dictionary will be cleared.
func LoadDictionary(file fs.File) (*Segmenter, error) { func LoadDictionary(file io.Reader) (*Segmenter, error) {
d := &Dictionary{freqMap: make(map[string]float64)} d := &Dictionary{freqMap: make(map[string]float64)}
err := d.loadDictionary(file) err := d.loadDictionary(file)
return (*Segmenter)(d), err return (*Segmenter)(d), err
@@ -109,7 +109,7 @@ func LoadDictionaryAt(file string) (*Segmenter, error) {
// LoadUserDictionary loads a user specified dictionary, it must be called // LoadUserDictionary loads a user specified dictionary, it must be called
// after LoadDictionary, and it will not clear any previous loaded dictionary, // after LoadDictionary, and it will not clear any previous loaded dictionary,
// instead it will override existing entries. // instead it will override existing entries.
func (seg *Segmenter) LoadUserDictionary(file fs.File) error { func (seg *Segmenter) LoadUserDictionary(file io.Reader) error {
return (*Dictionary)(seg).loadDictionary(file) return (*Dictionary)(seg).loadDictionary(file)
} }

View File

@@ -1,7 +1,7 @@
package posseg package posseg
import ( import (
"io/fs" "io"
"math" "math"
"sync" "sync"
@@ -70,7 +70,7 @@ func (d *Dictionary) Pos(key string) (string, bool) {
return pos, ok return pos, ok
} }
func (d *Dictionary) loadDictionary(file fs.File) error { func (d *Dictionary) loadDictionary(file io.Reader) error {
return dictionary.LoadDictionary(d, file) return dictionary.LoadDictionary(d, file)
} }

View File

@@ -2,7 +2,7 @@
package posseg package posseg
import ( import (
"io/fs" "io"
"math" "math"
"regexp" "regexp"
@@ -39,7 +39,7 @@ type Segmenter Dictionary
// LoadDictionary loads dictionary from given file name. // LoadDictionary loads dictionary from given file name.
// Every time LoadDictionaryAt is called, previously loaded dictionary will be cleared. // Every time LoadDictionaryAt is called, previously loaded dictionary will be cleared.
func LoadDictionary(file fs.File) (*Segmenter, error) { func LoadDictionary(file io.Reader) (*Segmenter, error) {
dict := &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)} dict := &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
err := dict.loadDictionary(file) err := dict.loadDictionary(file)
if err != nil { if err != nil {
@@ -62,7 +62,7 @@ func LoadDictionaryAt(file string) (*Segmenter, error) {
// LoadUserDictionary loads a user specified dictionary, it must be called // LoadUserDictionary loads a user specified dictionary, it must be called
// after LoadDictionary, and it will not clear any previous loaded dictionary, // after LoadDictionary, and it will not clear any previous loaded dictionary,
// instead it will override existing entries. // instead it will override existing entries.
func (seg *Segmenter) LoadUserDictionary(file fs.File) error { func (seg *Segmenter) LoadUserDictionary(file io.Reader) error {
return (*Dictionary)(seg).loadDictionary(file) return (*Dictionary)(seg).loadDictionary(file)
} }

View File

@@ -1,7 +1,7 @@
package tokenizers package tokenizers
import ( import (
"io/fs" "io"
"regexp" "regexp"
"strconv" "strconv"
@@ -42,7 +42,7 @@ Parameters:
"交换机" as a single word. If searchMode is true, it will further split "交换机" as a single word. If searchMode is true, it will further split
this word into "交换", "换机", which are valid Chinese words. this word into "交换", "换机", which are valid Chinese words.
*/ */
func NewJiebaTokenizer(dictFile fs.File, hmm, searchMode bool) (analysis.Tokenizer, error) { func NewJiebaTokenizer(dictFile io.Reader, hmm, searchMode bool) (analysis.Tokenizer, error) {
seg, err := jieba.LoadDictionary(dictFile) seg, err := jieba.LoadDictionary(dictFile)
return &JiebaTokenizer{ return &JiebaTokenizer{
seg: seg, seg: seg,
@@ -131,7 +131,7 @@ JiebaTokenizerConstructor creates a JiebaTokenizer.
Parameter config should contain at least one parameter: Parameter config should contain at least one parameter:
file: the path of the dictionary file or fs.File. file: the path of the dictionary file or io.Reader.
hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details. hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
@@ -150,7 +150,7 @@ func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Ca
if ok { if ok {
return NewJiebaTokenizerAt(dictFilePath, hmm, searchMode) return NewJiebaTokenizerAt(dictFilePath, hmm, searchMode)
} }
dictFile := config["file"].(fs.File) dictFile := config["file"].(io.Reader)
return NewJiebaTokenizer(dictFile, hmm, searchMode) return NewJiebaTokenizer(dictFile, hmm, searchMode)
} }