From 09fb239206bcce058daf1866133a2aedeb10120d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=BA=90=E6=96=87=E9=9B=A8?= <41315874+fumiama@users.noreply.github.com> Date: Fri, 31 Mar 2023 16:17:31 +0800 Subject: [PATCH] add GetDuplicateRate --- backend/global/file.go | 104 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 95 insertions(+), 9 deletions(-) diff --git a/backend/global/file.go b/backend/global/file.go index 72c88df..1960886 100644 --- a/backend/global/file.go +++ b/backend/global/file.go @@ -24,6 +24,8 @@ import ( "github.com/corona10/goimagehash" base14 "github.com/fumiama/go-base16384" "github.com/fumiama/go-docx" + + "github.com/fumiama/paper-manager/backend/utils" ) const ( @@ -394,10 +396,38 @@ func (f *FileDatabase) AddFile(tempath string, reg *Regex, istemp bool, progress return data }(), Vector: func() []byte { - plain := base14.BytesToString(sb.Bytes()) - + words := utils.Segmenter.Cut(base14.BytesToString(sb.Bytes()), true) + if len(words) == 0 { + return nil + } + v := make(map[string]uint8, len(words)*2) + for _, word := range words { + v[word]++ + } + data, err := json.Marshal(v) + if err != nil { + return nil + } + return data }(), } + var q Question + dupmap := make(map[string]float64, 64) + FileDB.mu.RLock() + err = FileDB.db.FindFor(FileTableQuestion, &q, "", func() error { + r, err := q.GetDuplicateRate(que) + if err != nil { + return err + } + var buf [8]byte + binary.LittleEndian.PutUint64(buf[:], q.ID) + dupmap[hex.EncodeToString(buf[:])] = r + return nil + }) + FileDB.mu.RUnlock() + if err == nil { + + } } } return file, nil @@ -415,13 +445,69 @@ type Question struct { ID uint64 // ID is the first 8 bytes of the Plain's md5 Plain string // Plain is the plain text of the question (like markdown format) Images []byte // Images is json of the image dhash in XML, ex. ['rId1': '1234567890abcdef', ...] - Vector []byte // Vector is json of {word: rate, ...} freq - Dup []byte // Dup is json of Duplication struct + Vector []byte // Vector is json of {word: freq, ...} + Dup []byte // Dup is json of {queid: rate, ...} } -// Duplication is the struct representation of Question.Dup -type Duplication struct { - ID string `json:"id"` // ID is hex string for json's 53 bits number - Rate float64 `json:"rate"` // Rate is the avg(non-leaf) or max(leaf) similarity - To []Duplication `json:"to,omitempty"` +// GetDuplicateRate calc q & que's dup rate +func (q *Question) GetDuplicateRate(que *Question) (float64, error) { + v1, v2 := make(map[string]uint8, 64), make(map[string]uint8, 64) + m1, m2 := make(map[string]string, 64), make(map[string]string, 64) + err := json.Unmarshal(q.Images, &m1) + if err != nil { + return 0, err + } + err = json.Unmarshal(que.Images, &m2) + if err != nil { + return 0, err + } + err = json.Unmarshal(q.Vector, &v1) + if err != nil { + return 0, err + } + err = json.Unmarshal(que.Vector, &v2) + if err != nil { + return 0, err + } + imgdsts := uint64(0) + for _, dhstr2 := range m2 { + d, err := hex.DecodeString(dhstr2) + if err != nil { + return 0, err + } + dh2 := goimagehash.NewImageHash(binary.LittleEndian.Uint64(d), goimagehash.DHash) + r := 0 + for _, dhstr1 := range m1 { + d, err := hex.DecodeString(dhstr1) + if err != nil { + return 0, err + } + dh1 := goimagehash.NewImageHash(binary.LittleEndian.Uint64(d), goimagehash.DHash) + dst, err := dh2.Distance(dh1) + if err != nil { + return 0, err + } + if dst > r { + r = dst + } + } + imgdsts += uint64(r) + } + imgdupr := float64(imgdsts) / float64(len(m2)) / 64.0 + v1space := make([]uint8, 0, len(v1)+len(v2)) + v2space := make([]uint8, 0, len(v1)+len(v2)) + for k, v := range v1 { + v1space = append(v1space, v) + if tv, ok := v2[k]; ok { + v2space = append(v2space, tv) + delete(v2, k) + } else { + v2space = append(v2space, 0) + } + } + for _, v := range v2 { + v1space = append(v1space, 0) + v2space = append(v2space, v) + } + return utils.Similarity(v1space, v2space) + imgdupr/2.0, nil }