diff --git a/backend/global/file.go b/backend/global/file.go index ba02344..72c88df 100644 --- a/backend/global/file.go +++ b/backend/global/file.go @@ -1,9 +1,14 @@ package global import ( + "bytes" "crypto/md5" "encoding/binary" + "encoding/hex" + "encoding/json" "errors" + "fmt" + "image" "io" "os" "regexp" @@ -11,6 +16,13 @@ import ( "strings" "time" + _ "image/jpeg" + _ "image/png" + + _ "golang.org/x/image/webp" + + "github.com/corona10/goimagehash" + base14 "github.com/fumiama/go-base16384" "github.com/fumiama/go-docx" ) @@ -175,12 +187,12 @@ type File struct { // AddFile from FileFolder+tempath and copy it to File.Path. // The para res must belong to a valid user -func (f *FileDatabase) AddFile(tempath string, reg *Regex) (*File, error) { +func (f *FileDatabase) AddFile(tempath string, reg *Regex, istemp bool, progress func(uint)) (*File, error) { user, err := UserDB.GetUserByID(reg.ID) if err != nil { return nil, err } - if !user.IsFileManager() { + if !user.IsFileManager() && !istemp { return nil, ErrInvalidRole } if strings.Contains(tempath, "..") { @@ -212,6 +224,7 @@ func (f *FileDatabase) AddFile(tempath string, reg *Regex) (*File, error) { if err != nil { return nil, err } + doc.Document.Body.DropDrawingOf("NilPicture") majorre, err := regexp.Compile(reg.Major) if err != nil { return nil, err @@ -220,6 +233,7 @@ func (f *FileDatabase) AddFile(tempath string, reg *Regex) (*File, error) { if len(docs) < 2 { return nil, ErrMajorSplitsTooShort } + // filling File struct file := &File{ ID: id, UID: *user.ID, @@ -307,7 +321,85 @@ func (f *FileDatabase) AddFile(tempath string, reg *Regex) (*File, error) { } } docs = docs[1:] + // parse questions + subre, err := regexp.Compile(reg.Sub) + if err != nil { + return nil, err + } + for _, majordoc := range docs { + majorq := QuestionJSON{} + for _, it := range majordoc.Document.Body.Items { + if p, ok := it.(*docx.Paragraph); ok { + text := p.String() + majorinfo := majorre.FindStringSubmatch(text) + if 
len(majorinfo) >= 6 { + name, points := majorinfo[2], majorinfo[5] + majorq.Name = name + majorq.Points, _ = strconv.Atoi(points) + } + } + } + subdocs := majordoc.SplitByParagraph(docx.SplitDocxByPlainTextRegex(subre)) + majorq.Sub = make([]QuestionJSON, 0, len(subdocs)) + for _, subdoc := range subdocs { + sb := bytes.NewBuffer(make([]byte, 0, 4096)) + for _, it := range subdoc.Document.Body.Items { + sb.WriteString(fmt.Sprint(it)) + } + m := md5.Sum(sb.Bytes()) + que := &Question{ + ID: binary.LittleEndian.Uint64(m[:8]), + Plain: base14.BytesToString(sb.Bytes()), + Images: func() []byte { + m := make(map[string]string) + _ = subdoc.RangeRelationships(func(r *docx.Relationship) error { + if r.Type != docx.REL_IMAGE { + return nil + } + if r.Target == "" { + return nil + } + i := strings.LastIndex(r.Target, "/") + if i < 0 { + return nil + } + name := r.Target[i+1:] + if name == "" { + return nil + } + md := subdoc.Media(name) + if md == nil { + return nil + } + img, _, err := image.Decode(bytes.NewReader(md.Data)) + if err != nil { + return nil + } + dh, err := goimagehash.DifferenceHash(img) + if err != nil { + return nil + } + var buf [8]byte + binary.LittleEndian.PutUint64(buf[:], dh.GetHash()) + m[name] = hex.EncodeToString(buf[:]) + return nil + }) + if len(m) == 0 { + return nil + } + data, err := json.Marshal(m) + if err != nil { + return nil + } + return data + }(), + Vector: func() []byte { + plain := base14.BytesToString(sb.Bytes()) + }(), + } + } + } return file, nil } @@ -322,8 +414,7 @@ type QuestionJSON struct { type Question struct { ID uint64 // ID is the first 8 bytes of the Plain's md5 Plain string // Plain is the plain text of the question (like markdown format) - XML []byte // XML is the OpenXML bytes of the question - Images []byte // Images is json of the image paths in XML, ex. ['md5.jpg', 'md5.png', ...] + Images []byte // Images is json of {media name: dhash hex, ...} for the images in XML, ex. {"image1.png": "1234567890abcdef", ...}
Vector []byte // Vector is json of {word: rate, ...} freq Dup []byte // Dup is json of Duplication struct } diff --git a/backend/utils/dict.zip b/backend/utils/dict.zip new file mode 100644 index 0000000..bf711da Binary files /dev/null and b/backend/utils/dict.zip differ diff --git a/backend/utils/jieba.go b/backend/utils/jieba.go new file mode 100644 index 0000000..1edb6bd --- /dev/null +++ b/backend/utils/jieba.go @@ -0,0 +1,29 @@ +package utils + +import ( + "archive/zip" + "bytes" + _ "embed" + + "github.com/fumiama/jieba" +) + +//go:embed dict.zip +var dictzip []byte + +// Segmenter is the jieba word segmenter +var Segmenter = func() *jieba.Segmenter { + r, err := zip.NewReader(bytes.NewReader(dictzip), int64(len(dictzip))) + if err != nil { + panic(err) + } + f, err := r.Open("dict.txt") + if err != nil { + panic(err) + } + seg, err := jieba.LoadDictionary(f) + if err != nil { + panic(err) + } + return seg +}() diff --git a/go.mod b/go.mod index 87587fc..161e526 100644 --- a/go.mod +++ b/go.mod @@ -6,18 +6,22 @@ require ( github.com/FloatTech/sqlite v1.6.0 github.com/FloatTech/ttl v0.0.0-20220715042055-15612be72f5b github.com/RomiChan/syncx v0.0.0-20221202055724-5f842c53020e + github.com/corona10/goimagehash v1.1.0 github.com/fumiama/go-base16384 v1.6.4 - github.com/fumiama/go-docx v0.0.0-20230310052825-daf7190ea69b + github.com/fumiama/go-docx v0.0.0-20230330141738-34f53a967c03 github.com/fumiama/imgsz v0.0.2 + github.com/fumiama/jieba v0.0.0-20221203025406-36c17a10b565 github.com/sirupsen/logrus v1.9.0 + golang.org/x/image v0.6.0 ) require ( github.com/google/uuid v1.3.0 // indirect github.com/mattn/go-isatty v0.0.16 // indirect + github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 // indirect github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0 // indirect - golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab // indirect - golang.org/x/text v0.3.7 // indirect + golang.org/x/sys v0.5.0 // indirect + golang.org/x/text v0.8.0 // indirect modernc.org/libc
v1.21.5 // indirect modernc.org/mathutil v1.5.0 // indirect modernc.org/memory v1.4.0 // indirect diff --git a/go.sum b/go.sum index 03ff942..959d03c 100644 --- a/go.sum +++ b/go.sum @@ -4,6 +4,8 @@ github.com/FloatTech/ttl v0.0.0-20220715042055-15612be72f5b h1:tvciXWq2nuvTbFeJG github.com/FloatTech/ttl v0.0.0-20220715042055-15612be72f5b/go.mod h1:fHZFWGquNXuHttu9dUYoKuNbm3dzLETnIOnm1muSfDs= github.com/RomiChan/syncx v0.0.0-20221202055724-5f842c53020e h1:wR3MXQ3VbUlPKOOUwLOYgh/QaJThBTYtsl673O3lqSA= github.com/RomiChan/syncx v0.0.0-20221202055724-5f842c53020e/go.mod h1:vD7Ra3Q9onRtojoY5sMCLQ7JBgjUsrXDnDKyFxqpf9w= +github.com/corona10/goimagehash v1.1.0 h1:teNMX/1e+Wn/AYSbLHX8mj+mF9r60R1kBeqE9MkoYwI= +github.com/corona10/goimagehash v1.1.0/go.mod h1:VkvE0mLn84L4aF8vCb6mafVajEb6QYMHl2ZJLn0mOGI= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -11,16 +13,20 @@ github.com/fumiama/bigfft v0.0.0-20211011143303-6e0bfa3c836b h1:Zt3pFQditAdWTHCO github.com/fumiama/bigfft v0.0.0-20211011143303-6e0bfa3c836b/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= github.com/fumiama/go-base16384 v1.6.4 h1:rYDRwD/th2cG4U7QLokpzmST1cCxZGXtHmolOUePt5o= github.com/fumiama/go-base16384 v1.6.4/go.mod h1:OEn+947GV5gsbTAnyuUW/SrfxJYUdYupSIQXOuGOcXM= -github.com/fumiama/go-docx v0.0.0-20230310052825-daf7190ea69b h1:1KYtEitNxRbh+O8mgGlji4nQKzf7SQHYxiuqDAwRCc8= -github.com/fumiama/go-docx v0.0.0-20230310052825-daf7190ea69b/go.mod h1:ssRF0IaB1hCcKIObp3FkZOsjTcAHpgii70JelNb4H8M= +github.com/fumiama/go-docx v0.0.0-20230330141738-34f53a967c03 h1:gQ9cMYk9QhTXh5sx74RwuNoDzJqVLYfOBsl/0rMMDns= +github.com/fumiama/go-docx v0.0.0-20230330141738-34f53a967c03/go.mod h1:ssRF0IaB1hCcKIObp3FkZOsjTcAHpgii70JelNb4H8M= github.com/fumiama/imgsz v0.0.2 h1:fAkC0FnIscdKOXwAxlyw3EUba5NzxZdSxGaq3Uyfxak= 
github.com/fumiama/imgsz v0.0.2/go.mod h1:dR71mI3I2O5u6+PCpd47M9TZptzP+39tRBcbdIkoqM4= +github.com/fumiama/jieba v0.0.0-20221203025406-36c17a10b565 h1:sQuR2+N5HurnvsZhiKdEg+Ig354TaqgCQRxd/0KgIOQ= +github.com/fumiama/jieba v0.0.0-20221203025406-36c17a10b565/go.mod h1:UUEvyLTJ7yoOA/viKG4wEis4ERydM7+Ny6gZUWgkS80= github.com/fumiama/sqlite3 v1.20.0-with-win386 h1:ZR1AXGBEtkfq9GAXehOVcwn+aaCG8itrkgEsz4ggx5k= github.com/fumiama/sqlite3 v1.20.0-with-win386/go.mod h1:Os58MHwYCcYZCy2PGChBrQtBAw5/LS1ZZOkfc+C/I7s= github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/mattn/go-isatty v0.0.16 h1:bq3VjFmv/sOjHtdEhmkEV4x1AJtvUvOJ2PFAZ5+peKQ= github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 h1:zYyBkD/k9seD2A7fsi6Oo2LfFZAehjjQMERAvZLEDnQ= +github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0= @@ -29,12 +35,43 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+ github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 
+golang.org/x/image v0.6.0 h1:bR8b5okrPI3g/gyZakLZHeWxAR8Dn5CyxXv1hLH5g/4= +golang.org/x/image v0.6.0/go.mod h1:MXLdDR43H7cDJq5GEGXEVeeNhPgi+YYEQ2pC1byI1x0= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab h1:2QkjZIsXupsJbJIdSjjUOgWK3aEtzyuh2mPt3l/CkeU= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk= 
+golang.org/x/sys v0.5.0 h1:MUK/U/4lj1t1oPg0HfuXDN/Z1wv31ZJ/YcPiGccS4DU= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.8.0 h1:57P1ETyNKtuIjB4SRd15iJxuhj8Gc416Y78H3qgMh68= +golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=