1
0
mirror of https://github.com/fumiama/jieba.git synced 2026-06-05 00:32:51 +08:00
Files
jieba/posseg/posseg.go

293 lines
6.5 KiB
Go

package posseg
import (
"bufio"
"fmt"
"github.com/wangbin/jiebago"
"os"
"path/filepath"
"regexp"
"runtime"
"strings"
)
// WordTagTab maps a word to its tag. load_model fills it from the dictionary
// file and Cut adds the entries taken from jiebago.UserWordTagTab.
var WordTagTab = map[string]string{}
// WordTag pairs a segmented word with the tag assigned to it.
type WordTag struct {
	Word string // text of the segment
	Tag  string // tag attached to Word
}

// String renders the pair as "word/tag".
func (w WordTag) String() string {
	return fmt.Sprintf("%s/%s", w.Word, w.Tag)
}
// init locates the dictionary relative to this source file and loads it with
// load_model. It panics when the source location cannot be determined or when
// load_model returns an error.
func init() {
	// Caller(0) reports the file that contains this function. Ascending one
	// more frame would report whatever invoked init instead of a file that
	// lives inside this package.
	_, filename, _, ok := runtime.Caller(0)
	if !ok {
		panic("posseg: cannot determine source file location")
	}
	// The dictionary is looked up in the parent of this file's directory.
	dictDir := filepath.Dir(filepath.Dir(filename))
	dictPath := filepath.Join(dictDir, jiebago.Dictionary)
	if err := load_model(dictPath); err != nil {
		panic(err)
	}
}
// load_model reads the dictionary file f_name and records each word's tag in
// WordTagTab.
//
// Every non-blank line must hold at least three whitespace-separated fields:
// the first is the word and the third is its tag. Blank lines are skipped. A
// line with fewer than three fields is reported as an error rather than
// causing an index-out-of-range panic; entries read before that line remain
// in WordTagTab. Errors from opening or scanning the file are returned as is.
func load_model(f_name string) error {
	file, err := os.Open(f_name)
	if err != nil {
		return err
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	for lineNo := 1; scanner.Scan(); lineNo++ {
		// Fields tolerates leading, trailing and repeated whitespace.
		fields := strings.Fields(scanner.Text())
		if len(fields) == 0 {
			continue
		}
		if len(fields) < 3 {
			return fmt.Errorf("posseg: malformed dictionary line %d in %s: %q", lineNo, f_name, scanner.Text())
		}
		WordTagTab[fields[0]] = fields[2]
	}
	return scanner.Err()
}
// __cut segments sentence using the per-rune states returned by Viterbi.
//
// A 'B' state records where a word starts, 'E' emits the runes from the last
// recorded start up to and including the current rune, and 'S' emits the
// current rune as a word of its own; any other state emits nothing. Each
// emitted word carries the tag of the rune that completed it. Runes left over
// after the last emitted word are returned as one final word that carries the
// tag of the first leftover rune.
func __cut(sentence string) []WordTag {
	chars := []rune(sentence)
	_, states := Viterbi(chars)

	words := []WordTag{}
	wordStart, consumed := 0, 0
	for idx := range chars {
		cur := states[idx]
		if cur.State == 'B' {
			wordStart = idx
			continue
		}
		if cur.State == 'E' {
			words = append(words, WordTag{string(chars[wordStart : idx+1]), cur.Tag})
			consumed = idx + 1
		} else if cur.State == 'S' {
			// 'S' deliberately leaves wordStart untouched.
			words = append(words, WordTag{string(chars[idx]), cur.Tag})
			consumed = idx + 1
		}
	}

	if consumed < len(chars) {
		tail := WordTag{string(chars[consumed:]), states[consumed].Tag}
		words = append(words, tail)
	}
	return words
}
// Patterns used by cutDetail, compiled once instead of on every call.
var (
	// detailHanRe matches a run of Han characters.
	detailHanRe = regexp.MustCompile(`\p{Han}+`)
	// detailSkipRe is kept byte-for-byte as originally written.
	// NOTE(review): the first bracket expression also contains a literal '[',
	// and `[:alnum:]` sits outside a bracket expression, so it is parsed as the
	// character set ":alnum" followed by one or more ']' rather than as the
	// alphanumeric class. `[\.[:digit:]]+|[[:alnum:]]+` may have been intended;
	// confirm before changing, because it alters the resulting segmentation.
	detailSkipRe = regexp.MustCompile(`[[\.[:digit:]]+|[:alnum:]]+`)
	// detailEngRe matches any ASCII letter or digit.
	detailEngRe = regexp.MustCompile(`[[:alnum:]]`)
	// detailNumRe matches a run of digits and dots.
	detailNumRe = regexp.MustCompile(`[\.[:digit:]]+`)
)

// cutDetail splits sentence into Han and non-Han blocks with
// jiebago.RegexpSplit and tags every piece.
//
// Han blocks are handed to __cut. Other blocks are split again with
// detailSkipRe; empty pieces are dropped, a piece containing a digit or dot is
// tagged "m", otherwise a piece containing an ASCII letter or digit is tagged
// "eng", and anything else is tagged "x". The matches are unanchored, so "m"
// wins whenever a piece contains both digits and letters.
func cutDetail(sentence string) []WordTag {
	result := make([]WordTag, 0)
	for _, blk := range jiebago.RegexpSplit(detailHanRe, sentence) {
		if detailHanRe.MatchString(blk) {
			result = append(result, __cut(blk)...)
			continue
		}
		for _, x := range jiebago.RegexpSplit(detailSkipRe, blk) {
			if len(x) == 0 {
				continue
			}
			tag := "x"
			switch {
			case detailNumRe.MatchString(x):
				tag = "m"
			case detailEngRe.MatchString(x):
				tag = "eng"
			}
			result = append(result, WordTag{x, tag})
		}
	}
	return result
}
// cutAction is the signature shared by the block cutters that cut chooses
// between: it takes a piece of text and returns its tagged words.
type cutAction func(string) []WordTag
// cut_DAG segments sentence along the route that jiebago.Calc computes over
// the DAG from jiebago.GetDAG, and tags each word from WordTagTab ("x" when
// the table has no entry).
//
// Route steps that cover a single rune are collected in a buffer. The buffer
// is resolved by dagFlushSingles as soon as a longer word is reached, and once
// more at the end of the sentence.
func cut_DAG(sentence string) []WordTag {
	dag := jiebago.GetDAG(sentence)
	routes := jiebago.Calc(sentence, dag, 0)
	runes := []rune(sentence)

	result := make([]WordTag, 0)
	buf := make([]rune, 0)
	for x := 0; x < len(runes); {
		// routes[x].Index is the inclusive end of the word starting at x.
		y := routes[x].Index + 1
		word := runes[x:y]
		if y-x == 1 {
			buf = append(buf, word...)
		} else {
			result = dagFlushSingles(result, buf)
			// Safe to reuse: the flush copies the runes into strings.
			buf = buf[:0]
			w := string(word)
			result = append(result, WordTag{w, dagTagFor(w)})
		}
		x = y
	}
	return dagFlushSingles(result, buf)
}

// dagFlushSingles appends the words represented by the buffered single runes
// in buf to result and returns the extended slice. An empty buffer adds
// nothing. One rune is looked up directly; a longer run that is absent from
// jiebago.TT.Freq is handed to cutDetail, otherwise each rune is tagged on its
// own.
func dagFlushSingles(result []WordTag, buf []rune) []WordTag {
	switch len(buf) {
	case 0:
		return result
	case 1:
		w := string(buf)
		return append(result, WordTag{w, dagTagFor(w)})
	}
	text := string(buf)
	if _, known := jiebago.TT.Freq[text]; !known {
		return append(result, cutDetail(text)...)
	}
	for _, r := range buf {
		w := string(r)
		result = append(result, WordTag{w, dagTagFor(w)})
	}
	return result
}

// dagTagFor returns the tag stored for word in WordTagTab, or "x" when the
// word has no entry.
func dagTagFor(word string) string {
	if tag, ok := WordTagTab[word]; ok {
		return tag
	}
	return "x"
}
// cut_DAG_NO_HMM segments sentence along the route that jiebago.Calc computes
// over the DAG from jiebago.GetDAG, without any further processing of
// single-rune steps.
//
// Consecutive route steps that consist of one ASCII letter or digit are merged
// into a single word tagged "eng". Every other step becomes a word tagged from
// WordTagTab, or "x" when the table has no entry for it.
func cut_DAG_NO_HMM(sentence string) []WordTag {
	dag := jiebago.GetDAG(sentence)
	routes := jiebago.Calc(sentence, dag, 0)
	runes := []rune(sentence)

	result := make([]WordTag, 0)
	buf := make([]rune, 0)
	for x := 0; x < len(runes); {
		// routes[x].Index is the inclusive end of the word starting at x.
		y := routes[x].Index + 1
		word := runes[x:y]
		x = y

		if len(word) == 1 && isASCIIAlnum(word[0]) {
			buf = append(buf, word[0])
			continue
		}
		if len(buf) > 0 {
			result = append(result, WordTag{string(buf), "eng"})
			buf = buf[:0]
		}
		w := string(word)
		tag, ok := WordTagTab[w]
		if !ok {
			tag = "x"
		}
		result = append(result, WordTag{w, tag})
	}
	if len(buf) > 0 {
		result = append(result, WordTag{string(buf), "eng"})
	}
	return result
}

// isASCIIAlnum reports whether r is an ASCII letter or digit, the same set the
// pattern `[[:alnum:]]` accepts.
func isASCIIAlnum(r rune) bool {
	return (r >= '0' && r <= '9') || (r >= 'A' && r <= 'Z') || (r >= 'a' && r <= 'z')
}
// Patterns used by cut, compiled once instead of on every call.
var (
	// cutHanRe matches a run of Han characters, ASCII letters and digits, and
	// the characters '+', '#', '&', '.' and '_'.
	cutHanRe = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
	// cutSkipRe matches a CRLF pair or a single whitespace character.
	cutSkipRe = regexp.MustCompile(`(\r\n|\s)`)
	// cutEngRe matches any ASCII letter or digit.
	cutEngRe = regexp.MustCompile(`[[:alnum:]]`)
	// cutNumRe matches a run of digits and dots.
	cutNumRe = regexp.MustCompile(`[\.[:digit:]]+`)
)

// cut splits sentence into blocks with jiebago.RegexpSplit and tags every
// piece. Blocks matching cutHanRe are segmented by cut_DAG when HMM is true
// and by cut_DAG_NO_HMM otherwise. The remaining blocks are split on
// whitespace: each whitespace match is emitted whole with tag "x" and all
// other text is emitted rune by rune.
func cut(sentence string, HMM bool) []WordTag {
	var cutBlock cutAction = cut_DAG_NO_HMM
	if HMM {
		cutBlock = cut_DAG
	}

	result := make([]WordTag, 0)
	for _, blk := range jiebago.RegexpSplit(cutHanRe, sentence) {
		if cutHanRe.MatchString(blk) {
			result = append(result, cutBlock(blk)...)
			continue
		}
		for _, x := range jiebago.RegexpSplit(cutSkipRe, blk) {
			if cutSkipRe.MatchString(x) {
				result = append(result, WordTag{x, "x"})
				continue
			}
			// Does not depend on the rune, so evaluate it once per piece.
			engPiece := cutEngRe.MatchString(x)
			for _, r := range x {
				s := string(r)
				tag := "x"
				switch {
				case cutNumRe.MatchString(s):
					tag = "m"
				case engPiece:
					// Emit the current rune only. The previous code appended
					// the whole piece here and relied on a break that only
					// left the switch, so the piece was repeated per rune.
					tag = "eng"
				}
				result = append(result, WordTag{s, tag})
			}
		}
	}
	return result
}
// Cut returns the tagged words of sentence as produced by cut, to which HMM is
// passed through unchanged. Before cutting, every entry of
// jiebago.UserWordTagTab is moved into WordTagTab, which leaves the user table
// empty.
func Cut(sentence string, HMM bool) []WordTag {
	userTags := jiebago.UserWordTagTab
	for word, tag := range userTags {
		WordTagTab[word] = tag
		delete(userTags, word)
	}
	return cut(sentence, HMM)
}