mirror of
https://github.com/fumiama/paper-manager.git
synced 2026-06-12 12:10:25 +08:00
add GetDuplicateRate
This commit is contained in:
@@ -24,6 +24,8 @@ import (
|
|||||||
"github.com/corona10/goimagehash"
|
"github.com/corona10/goimagehash"
|
||||||
base14 "github.com/fumiama/go-base16384"
|
base14 "github.com/fumiama/go-base16384"
|
||||||
"github.com/fumiama/go-docx"
|
"github.com/fumiama/go-docx"
|
||||||
|
|
||||||
|
"github.com/fumiama/paper-manager/backend/utils"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@@ -394,10 +396,38 @@ func (f *FileDatabase) AddFile(tempath string, reg *Regex, istemp bool, progress
|
|||||||
return data
|
return data
|
||||||
}(),
|
}(),
|
||||||
Vector: func() []byte {
|
Vector: func() []byte {
|
||||||
plain := base14.BytesToString(sb.Bytes())
|
words := utils.Segmenter.Cut(base14.BytesToString(sb.Bytes()), true)
|
||||||
|
if len(words) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
v := make(map[string]uint8, len(words)*2)
|
||||||
|
for _, word := range words {
|
||||||
|
v[word]++
|
||||||
|
}
|
||||||
|
data, err := json.Marshal(v)
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return data
|
||||||
}(),
|
}(),
|
||||||
}
|
}
|
||||||
|
var q Question
|
||||||
|
dupmap := make(map[string]float64, 64)
|
||||||
|
FileDB.mu.RLock()
|
||||||
|
err = FileDB.db.FindFor(FileTableQuestion, &q, "", func() error {
|
||||||
|
r, err := q.GetDuplicateRate(que)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
var buf [8]byte
|
||||||
|
binary.LittleEndian.PutUint64(buf[:], q.ID)
|
||||||
|
dupmap[hex.EncodeToString(buf[:])] = r
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
FileDB.mu.RUnlock()
|
||||||
|
if err == nil {
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return file, nil
|
return file, nil
|
||||||
@@ -415,13 +445,69 @@ type Question struct {
|
|||||||
ID uint64 // ID is the first 8 bytes of the Plain's md5
|
ID uint64 // ID is the first 8 bytes of the Plain's md5
|
||||||
Plain string // Plain is the plain text of the question (like markdown format)
|
Plain string // Plain is the plain text of the question (like markdown format)
|
||||||
Images []byte // Images is json of the image dhash in XML, ex. ['rId1': '1234567890abcdef', ...]
|
Images []byte // Images is json of the image dhash in XML, ex. ['rId1': '1234567890abcdef', ...]
|
||||||
Vector []byte // Vector is json of {word: rate, ...} freq
|
Vector []byte // Vector is json of {word: freq, ...}
|
||||||
Dup []byte // Dup is json of Duplication struct
|
Dup []byte // Dup is json of {queid: rate, ...}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Duplication is the struct representation of Question.Dup
|
// GetDuplicateRate calc q & que's dup rate
|
||||||
type Duplication struct {
|
func (q *Question) GetDuplicateRate(que *Question) (float64, error) {
|
||||||
ID string `json:"id"` // ID is hex string for json's 53 bits number
|
v1, v2 := make(map[string]uint8, 64), make(map[string]uint8, 64)
|
||||||
Rate float64 `json:"rate"` // Rate is the avg(non-leaf) or max(leaf) similarity
|
m1, m2 := make(map[string]string, 64), make(map[string]string, 64)
|
||||||
To []Duplication `json:"to,omitempty"`
|
err := json.Unmarshal(q.Images, &m1)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
err = json.Unmarshal(que.Images, &m2)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
err = json.Unmarshal(q.Vector, &v1)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
err = json.Unmarshal(que.Vector, &v2)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
imgdsts := uint64(0)
|
||||||
|
for _, dhstr2 := range m2 {
|
||||||
|
d, err := hex.DecodeString(dhstr2)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
dh2 := goimagehash.NewImageHash(binary.LittleEndian.Uint64(d), goimagehash.DHash)
|
||||||
|
r := 0
|
||||||
|
for _, dhstr1 := range m1 {
|
||||||
|
d, err := hex.DecodeString(dhstr1)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
dh1 := goimagehash.NewImageHash(binary.LittleEndian.Uint64(d), goimagehash.DHash)
|
||||||
|
dst, err := dh2.Distance(dh1)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
if dst > r {
|
||||||
|
r = dst
|
||||||
|
}
|
||||||
|
}
|
||||||
|
imgdsts += uint64(r)
|
||||||
|
}
|
||||||
|
imgdupr := float64(imgdsts) / float64(len(m2)) / 64.0
|
||||||
|
v1space := make([]uint8, 0, len(v1)+len(v2))
|
||||||
|
v2space := make([]uint8, 0, len(v1)+len(v2))
|
||||||
|
for k, v := range v1 {
|
||||||
|
v1space = append(v1space, v)
|
||||||
|
if tv, ok := v2[k]; ok {
|
||||||
|
v2space = append(v2space, tv)
|
||||||
|
delete(v2, k)
|
||||||
|
} else {
|
||||||
|
v2space = append(v2space, 0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, v := range v2 {
|
||||||
|
v1space = append(v1space, 0)
|
||||||
|
v2space = append(v2space, v)
|
||||||
|
}
|
||||||
|
return utils.Similarity(v1space, v2space) + imgdupr/2.0, nil
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user