// Package jieba is the Golang implemention of [Jieba](https://github.com/fxsjy/jieba), Python Chinese text segmentation module. package jieba import ( "io" "math" "regexp" "strings" "github.com/fumiama/jieba/dictionary" "github.com/fumiama/jieba/finalseg" "github.com/fumiama/jieba/util" ) var ( reEng = regexp.MustCompile(`[[:alnum:]]`) reHanCutAll = regexp.MustCompile(`(\p{Han}+)`) reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`) reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`) reSkipDefault = regexp.MustCompile(`(\r\n|\s)`) ) // Segmenter is a Chinese words segmentation struct. type Segmenter Dictionary // Frequency returns a word's frequency and existence func (seg *Segmenter) Frequency(word string) (float64, bool) { return (*Dictionary)(seg).Frequency(word) } // AddWord adds a new word with frequency to dictionary func (seg *Segmenter) AddWord(word string, frequency float64) { (*Dictionary)(seg).AddToken(dictionary.NewToken(word, frequency, "")) } // DeleteWord removes a word from dictionary func (seg *Segmenter) DeleteWord(word string) { (*Dictionary)(seg).AddToken(dictionary.NewToken(word, 0.0, "")) } /* SuggestFrequency returns a suggested frequncy of a word or a long word cutted into several short words. This method is useful when a word in the sentence is not cutted out correctly. If a word should not be further cutted, for example word "石墨烯" should not be cutted into "石墨" and "烯", SuggestFrequency("石墨烯") will return the maximu frequency for this word. If a word should be further cutted, for example word "今天天气" should be further cutted into two words "今天" and "天气", SuggestFrequency("今天", "天气") should return the minimum frequency for word "今天天气". */ func (seg *Segmenter) SuggestFrequency(words ...string) float64 { frequency := 1.0 if len(words) > 1 { for _, word := range words { if freq, ok := (*Dictionary)(seg).Frequency(word); ok { frequency *= freq } frequency /= (*Dictionary)(seg).total } frequency, _ = math.Modf(frequency * (*Dictionary)(seg).total) wordFreq := 0.0 if freq, ok := (*Dictionary)(seg).Frequency(strings.Join(words, "")); ok { wordFreq = freq } if wordFreq < frequency { frequency = wordFreq } return frequency } word := words[0] for _, segment := range seg.Cut(word, false) { if freq, ok := (*Dictionary)(seg).Frequency(segment); ok { frequency *= freq } frequency /= (*Dictionary)(seg).total } frequency, _ = math.Modf(frequency * (*Dictionary)(seg).total) frequency += 1.0 wordFreq := 1.0 if freq, ok := (*Dictionary)(seg).Frequency(word); ok { wordFreq = freq } if wordFreq > frequency { frequency = wordFreq } return frequency } // LoadDictionary loads dictionary from given file name. Everytime // LoadDictionary is called, previously loaded dictionary will be cleard. func LoadDictionary(file io.Reader) (*Segmenter, error) { d := &Dictionary{freqMap: make(map[string]float64)} err := d.loadDictionary(file) return (*Segmenter)(d), err } // LoadDictionaryAt loads dictionary from given file name. Everytime // LoadDictionaryAt is called, previously loaded dictionary will be cleard. func LoadDictionaryAt(file string) (*Segmenter, error) { d := &Dictionary{freqMap: make(map[string]float64)} err := d.loadDictionaryAt(file) return (*Segmenter)(d), err } // LoadUserDictionary loads a user specified dictionary, it must be called // after LoadDictionary, and it will not clear any previous loaded dictionary, // instead it will override exist entries. func (seg *Segmenter) LoadUserDictionary(file io.Reader) error { return (*Dictionary)(seg).loadDictionary(file) } // LoadUserDictionaryAt loads a user specified dictionary, it must be called // after LoadDictionary, and it will not clear any previous loaded dictionary, // instead it will override exist entries. func (seg *Segmenter) LoadUserDictionaryAt(file string) error { return (*Dictionary)(seg).loadDictionaryAt(file) } func (seg *Segmenter) dag(runes []rune) [][]int { n := len(runes) dag := make([][]int, n) for k := 0; k < n; k++ { dag[k] = make([]int, 0, 64) i := k frag := runes[k : k+1] for { freq, ok := (*Dictionary)(seg).Frequency(string(frag)) if !ok { break } if freq > 0.0 { dag[k] = append(dag[k], i) } i++ if i >= n { break } frag = runes[k : i+1] } if len(dag[k]) == 0 { dag[k] = append(dag[k], k) } } return dag } type route struct { frequency float64 index int } func (seg *Segmenter) calc(runes []rune) []*route { dag := seg.dag(runes) n := len(runes) rs := make([]*route, n+1) rs[n] = &route{frequency: 0.0, index: 0} for idx := n - 1; idx >= 0; idx-- { for _, i := range dag[idx] { var r *route if freq, ok := (*Dictionary)(seg).Frequency(string(runes[idx : i+1])); ok { r = &route{frequency: math.Log(freq) - (*Dictionary)(seg).logTotal + rs[i+1].frequency, index: i} } else { r = &route{frequency: math.Log(1.0) - (*Dictionary)(seg).logTotal + rs[i+1].frequency, index: i} } if v := rs[idx]; v == nil { rs[idx] = r } else { if v.frequency < r.frequency || (v.frequency == r.frequency && v.index < r.index) { rs[idx] = r } } } } return rs } // ratio words and letters in an article commonly const ( RatioLetterWord float32 = 1.5 RatioLetterWordFull float32 = 1 ) type cutFunc func(sentence string) []string func (seg *Segmenter) cutDAG(sentence string) []string { result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1) runes := []rune(sentence) routes := seg.calc(runes) buf := make([]rune, 0, 256) for x := 0; x < len(runes); { y := routes[x].index + 1 frag := runes[x:y] if y-x == 1 { buf = append(buf, frag...) } else { if len(buf) > 0 { bufString := string(buf) if len(buf) == 1 { result = append(result, bufString) } else { if v, ok := (*Dictionary)(seg).Frequency(bufString); !ok || v == 0.0 { result = append(result, finalseg.Cut(bufString)...) } else { for _, elem := range buf { result = append(result, string(elem)) } } } buf = buf[:0] } result = append(result, string(frag)) } x = y } if len(buf) > 0 { bufString := string(buf) if len(buf) == 1 { result = append(result, bufString) } else { if v, ok := (*Dictionary)(seg).Frequency(bufString); !ok || v == 0.0 { result = append(result, finalseg.Cut(bufString)...) } else { for _, elem := range buf { result = append(result, string(elem)) } } } } return result } func (seg *Segmenter) cutDAGNoHMM(sentence string) []string { result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1) runes := []rune(sentence) routes := seg.calc(runes) buf := make([]rune, 0, 256) for x := 0; x < len(runes); { y := routes[x].index + 1 frag := runes[x:y] if reEng.MatchString(string(frag)) && len(frag) == 1 { buf = append(buf, frag...) x = y continue } if len(buf) > 0 { result = append(result, string(buf)) buf = buf[:0] } result = append(result, string(frag)) x = y } if len(buf) > 0 { result = append(result, string(buf)) } return result } // Cut cuts a sentence into words using accurate mode. // Parameter hmm controls whether to use the Hidden Markov Model. // Accurate mode attempts to cut the sentence into the most accurate // segmentations, which is suitable for text analysis. func (seg *Segmenter) Cut(sentence string, hmm bool) []string { result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1) var cut cutFunc if hmm { cut = seg.cutDAG } else { cut = seg.cutDAGNoHMM } for _, block := range util.RegexpSplit(reHanDefault, sentence, -1) { if len(block) == 0 { continue } if reHanDefault.MatchString(block) { result = append(result, cut(block)...) continue } for _, subBlock := range util.RegexpSplit(reSkipDefault, block, -1) { if reSkipDefault.MatchString(subBlock) { result = append(result, subBlock) continue } for _, r := range subBlock { result = append(result, string(r)) } } } return result } func (seg *Segmenter) cutAll(sentence string) []string { result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1) runes := []rune(sentence) dag := seg.dag(runes) start := -1 for k := 0; k < len(dag); k++ { l := dag[k] if len(l) == 1 && k > start { result = append(result, string(runes[k:l[0]+1])) start = l[0] continue } for _, j := range l { if j > k { result = append(result, string(runes[k:j+1])) start = j } } } return result } // CutAll cuts a sentence into words using full mode. // Full mode gets all the possible words from the sentence. // Fast but not accurate. func (seg *Segmenter) CutAll(sentence string) []string { result := make([]string, 0, int(float32(len(sentence))/RatioLetterWordFull)+1) for _, block := range util.RegexpSplit(reHanCutAll, sentence, -1) { if len(block) == 0 { continue } if reHanCutAll.MatchString(block) { result = append(result, seg.cutAll(block)...) continue } result = append(result, reSkipCutAll.Split(block, -1)...) } return result } // CutForSearch cuts sentence into words using search engine mode. // Search engine mode, based on the accurate mode, attempts to cut long words // into several short words, which can raise the recall rate. // Suitable for search engines. func (seg *Segmenter) CutForSearch(sentence string, hmm bool) []string { result := make([]string, 0, int(float32(len(sentence))/RatioLetterWordFull)+1) for _, word := range seg.Cut(sentence, hmm) { runes := []rune(word) for _, increment := range []int{2, 3} { if len(runes) <= increment { continue } for i := 0; i < len(runes)-increment+1; i++ { gram := string(runes[i : i+increment]) if v, ok := (*Dictionary)(seg).Frequency(gram); ok && v > 0.0 { result = append(result, gram) } } } result = append(result, word) } return result }