mirror of
https://github.com/fumiama/jieba.git
synced 2026-06-05 00:32:51 +08:00
93 lines
1.6 KiB
Go
Executable File
93 lines
1.6 KiB
Go
Executable File
package analyse
|
|
|
|
import (
|
|
"io"
|
|
"sync"
|
|
|
|
"github.com/fumiama/jieba/dictionary"
|
|
)
|
|
|
|
// DefaultStopWordMap contains some stop words.
|
|
var DefaultStopWordMap = map[string]int{
|
|
"the": 1,
|
|
"of": 1,
|
|
"is": 1,
|
|
"and": 1,
|
|
"to": 1,
|
|
"in": 1,
|
|
"that": 1,
|
|
"we": 1,
|
|
"for": 1,
|
|
"an": 1,
|
|
"are": 1,
|
|
"by": 1,
|
|
"be": 1,
|
|
"as": 1,
|
|
"on": 1,
|
|
"with": 1,
|
|
"can": 1,
|
|
"if": 1,
|
|
"from": 1,
|
|
"which": 1,
|
|
"you": 1,
|
|
"it": 1,
|
|
"this": 1,
|
|
"then": 1,
|
|
"at": 1,
|
|
"have": 1,
|
|
"all": 1,
|
|
"not": 1,
|
|
"one": 1,
|
|
"has": 1,
|
|
"or": 1,
|
|
}
|
|
|
|
// StopWord is a thread-safe dictionary for all stop words.
|
|
type StopWord struct {
|
|
sync.RWMutex
|
|
stopWordMap map[string]int
|
|
}
|
|
|
|
// AddToken adds a token into StopWord dictionary.
|
|
func (s *StopWord) AddToken(token dictionary.Token) {
|
|
s.Lock()
|
|
s.stopWordMap[token.Text()] = 1
|
|
s.Unlock()
|
|
}
|
|
|
|
// NewStopWord create a new StopWord with default stop words.
|
|
func NewStopWord() *StopWord {
|
|
m := make(map[string]int, len(DefaultStopWordMap)*2)
|
|
for k, v := range DefaultStopWordMap {
|
|
m[k] = v
|
|
}
|
|
return &StopWord{
|
|
stopWordMap: m,
|
|
}
|
|
}
|
|
|
|
// IsStopWord checks if a given word is stop word.
|
|
func (s *StopWord) IsStopWord(word string) bool {
|
|
s.RLock()
|
|
_, ok := s.stopWordMap[word]
|
|
s.RUnlock()
|
|
return ok
|
|
}
|
|
|
|
// Load loads all tokens into StopWord dictionary.
|
|
func (s *StopWord) Load(tokens ...dictionary.Token) {
|
|
s.Lock()
|
|
for _, token := range tokens {
|
|
s.stopWordMap[token.Text()] = 1
|
|
}
|
|
s.Unlock()
|
|
}
|
|
|
|
func (s *StopWord) loadDictionary(file io.Reader) error {
|
|
return dictionary.LoadDictionary(s, file)
|
|
}
|
|
|
|
func (s *StopWord) loadDictionaryAt(file string) error {
|
|
return dictionary.LoadDictionaryAt(s, file)
|
|
}
|