From 369cf02def4e3c673bc06935be8566f7488f459a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=BA=90=E6=96=87=E9=9B=A8?= <41315874+fumiama@users.noreply.github.com> Date: Wed, 14 Dec 2022 01:14:03 +0800 Subject: [PATCH] finish arm64 decode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit name old time/op new time/op delta EncodeTo/16-8 10.6ns ± 0% 10.4ns ± 0% -2.07% (p=0.008 n=5+5) EncodeTo/256-8 80.8ns ± 0% 55.7ns ± 0% -31.11% (p=0.008 n=5+5) EncodeTo/4K-8 1.21µs ± 0% 0.82µs ± 0% -32.67% (p=0.016 n=4+5) EncodeTo/32K-8 9.64µs ± 0% 6.47µs ± 0% -32.90% (p=0.008 n=5+5) DecodeTo/16-8 9.79ns ± 0% 26.02ns ± 0% +165.85% (p=0.016 n=4+5) DecodeTo/256-8 80.9ns ± 0% 111.6ns ± 0% +37.98% (p=0.008 n=5+5) DecodeTo/4K-8 1.22µs ± 0% 1.17µs ± 0% -3.73% (p=0.008 n=5+5) DecodeTo/32K-8 9.71µs ± 1% 8.80µs ± 1% -9.37% (p=0.008 n=5+5) Encoder/16-8 76.5ns ± 0% 76.5ns ± 0% ~ (p=0.810 n=5+5) Encoder/256-8 356ns ± 0% 291ns ± 0% -18.22% (p=0.008 n=5+5) Encoder/4K-8 4.05µs ± 0% 3.70µs ± 0% -8.76% (p=0.008 n=5+5) Encoder/32K-8 34.1µs ± 0% 29.2µs ± 0% -14.30% (p=0.008 n=5+5) Decoder/16-8 205ns ± 0% 207ns ± 0% +1.08% (p=0.008 n=5+5) Decoder/256-8 262ns ± 0% 244ns ± 1% -6.94% (p=0.008 n=5+5) Decoder/4K-8 1.49µs ± 0% 1.12µs ± 0% -24.87% (p=0.008 n=5+5) Decoder/32K-8 11.0µs ± 0% 8.0µs ± 0% -27.00% (p=0.008 n=5+5) name old speed new speed delta EncodeTo/16-8 1.50GB/s ± 0% 1.54GB/s ± 0% +2.11% (p=0.008 n=5+5) EncodeTo/256-8 3.17GB/s ± 0% 4.60GB/s ± 0% +45.15% (p=0.008 n=5+5) EncodeTo/4K-8 3.37GB/s ± 0% 5.01GB/s ± 0% +48.51% (p=0.008 n=5+5) EncodeTo/32K-8 3.40GB/s ± 0% 5.06GB/s ± 0% +49.02% (p=0.008 n=5+5) DecodeTo/16-8 2.25GB/s ± 0% 0.85GB/s ± 0% -62.39% (p=0.016 n=4+5) DecodeTo/256-8 3.66GB/s ± 0% 2.65GB/s ± 0% -27.54% (p=0.008 n=5+5) DecodeTo/4K-8 3.84GB/s ± 0% 3.99GB/s ± 0% +3.87% (p=0.008 n=5+5) DecodeTo/32K-8 3.86GB/s ± 1% 4.26GB/s ± 1% +10.33% (p=0.008 n=5+5) Encoder/16-8 209MB/s ± 0% 209MB/s ± 0% ~ (p=0.802 n=5+5) Encoder/256-8 720MB/s ± 0% 880MB/s ± 0% +22.28% (p=0.008 n=5+5) Encoder/4K-8 1.01GB/s ± 0% 1.11GB/s ± 0% +9.60% (p=0.008 n=5+5) Encoder/32K-8 962MB/s ± 0% 1122MB/s ± 0% +16.69% (p=0.008 n=5+5) Decoder/16-8 78.1MB/s ± 0% 77.3MB/s ± 0% -1.08% (p=0.008 n=5+5) Decoder/256-8 977MB/s ± 0% 1050MB/s ± 1% +7.47% (p=0.008 n=5+5) Decoder/4K-8 2.76GB/s ± 0% 3.67GB/s ± 0% +33.10% (p=0.008 n=5+5) Decoder/32K-8 2.98GB/s ± 0% 4.08GB/s ± 0% +36.98% (p=0.008 n=5+5) name old alloc/op new alloc/op delta EncodeTo/16-8 0.00B 0.00B ~ (all equal) EncodeTo/256-8 0.00B 0.00B ~ (all equal) EncodeTo/4K-8 0.00B 0.00B ~ (all equal) EncodeTo/32K-8 0.00B 0.00B ~ (all equal) DecodeTo/16-8 0.00B 48.00B ± 0% +Inf% (p=0.008 n=5+5) DecodeTo/256-8 0.00B 576.00B ± 0% +Inf% (p=0.008 n=5+5) DecodeTo/4K-8 0.00B 6144.00B ± 0% +Inf% (p=0.008 n=5+5) DecodeTo/32K-8 0.00B 49152.00B ± 0% +Inf% (p=0.008 n=5+5) Encoder/16-8 24.0B ± 0% 24.0B ± 0% ~ (all equal) Encoder/256-8 472B ± 0% 24B ± 0% -94.92% (p=0.008 n=5+5) Encoder/4K-8 24.0B ± 0% 24.0B ± 0% ~ (all equal) Encoder/32K-8 41.0kB ± 0% 0.0kB ± 0% -99.94% (p=0.008 n=5+5) Decoder/16-8 1.39kB ± 0% 1.39kB ± 0% ~ (all equal) Decoder/256-8 1.39kB ± 0% 1.39kB ± 0% ~ (all equal) Decoder/4K-8 4.98kB ± 0% 4.98kB ± 0% ~ (all equal) Decoder/32K-8 41.1kB ± 0% 41.1kB ± 0% ~ (all equal) name old allocs/op new allocs/op delta EncodeTo/16-8 0.00 0.00 ~ (all equal) EncodeTo/256-8 0.00 0.00 ~ (all equal) EncodeTo/4K-8 0.00 0.00 ~ (all equal) EncodeTo/32K-8 0.00 0.00 ~ (all equal) DecodeTo/16-8 0.00 1.00 ± 0% +Inf% (p=0.008 n=5+5) DecodeTo/256-8 0.00 1.00 ± 0% +Inf% (p=0.008 n=5+5) DecodeTo/4K-8 0.00 1.00 ± 0% +Inf% (p=0.008 n=5+5) DecodeTo/32K-8 0.00 1.00 ± 0% +Inf% (p=0.008 n=5+5) Encoder/16-8 1.00 ± 0% 1.00 ± 0% ~ (all equal) Encoder/256-8 2.00 ± 0% 1.00 ± 0% -50.00% (p=0.008 n=5+5) Encoder/4K-8 1.00 ± 0% 1.00 ± 0% ~ (all equal) Encoder/32K-8 2.00 ± 0% 1.00 ± 0% -50.00% (p=0.008 n=5+5) Decoder/16-8 3.00 ± 0% 3.00 ± 0% ~ (all equal) Decoder/256-8 3.00 ± 0% 3.00 ± 0% ~ (all equal) Decoder/4K-8 3.00 ± 0% 3.00 ± 0% ~ (all equal) Decoder/32K-8 3.00 ± 0% 3.00 ± 0% ~ (all equal) --- base14_arm64.go | 13 ++---- base14_arm64.s | 107 ++++++++++++++++++++++++++++++++++++++++++++++-- base14_test.go | 4 +- c/base1432.c | 55 +++++++++++++++++++++++++ 4 files changed, 166 insertions(+), 13 deletions(-) diff --git a/base14_arm64.go b/base14_arm64.go index 254b93a..8aab1a5 100644 --- a/base14_arm64.go +++ b/base14_arm64.go @@ -25,7 +25,6 @@ func encode(offset, outlen int, b, encd []byte) { return } n := valn - (uintptr)(*(*unsafe.Pointer)(unsafe.Pointer(&encd))) - println(sum, n) var tmp [4]byte binary.LittleEndian.PutUint32(tmp[:], uint32(sum)) copy(encd[n:], tmp[:]) @@ -34,12 +33,8 @@ func encode(offset, outlen int, b, encd []byte) { } func decode(offset, outlen int, b, decd []byte) { - /* - if offset != 0 && cap(b) == len(b) { - b = append(b, make([]byte, 8)...) - } - _decode(offset, outlen, b, decd) - */ - - decodeGeneric(offset, outlen, b, decd) + if offset != 0 && cap(b) == len(b) { + b = append(b, make([]byte, 8)...) + } + _decode(offset, outlen, b, decd) } diff --git a/base14_arm64.s b/base14_arm64.s index 8889c57..d704a7e 100644 --- a/base14_arm64.s +++ b/base14_arm64.s @@ -9,7 +9,7 @@ TEXT ·_encode(SB), NOSPLIT, $0-81 MOVD ·data+16(FP), R9 MOVD ·dlen+24(FP), R3 MOVD ·encd+40(FP), R5 - + SUBW $6, R3, R3 CMPW $0, R3 BLE enctil @@ -87,7 +87,7 @@ encrem: ANDW $0xf000000, R4, R4 ORRW R3, R4, R3 ADDW $0x4e0000, R3, R3 - ADDW $78, R3, R3 + ADDW $0x4e, R3, R3 MOVW R3, (R5)(R8<<2) MOVBU (R2)(R10), R3 UBFIZW $2, R3, $4, R3 @@ -120,4 +120,105 @@ enctil: // func _decode(offset, outlen int, b, decd []byte) TEXT ·_decode(SB), NOSPLIT, $0-64 - \ No newline at end of file + MOVD ·offset+0(FP), R0 + MOVD ·outlen+8(FP), R1 + MOVD ·data+16(FP), R2 + MOVD ·decd+40(FP), R5 + + SUBW $6, R1, R1 + CMPW $0, R1 + BLE dectil + MOVW $0xb200, R11 + MOVD R5, R9 + SUB $8, R2, R14 + SUB $4, R2, R13 + MOVD $2, R8 + MOVW $0, R10 + MOVK $(0xb1ff<<16), R11 +declop: + MOVW (R14)(R8<<2), R4 + ADDW $7, R10, R10 + MOVW (R13)(R8<<2), R3 + MOVW R8, R12 + REVW R4, R4 + CMPW R1, R10 + ADDW R11, R4, R4 + REVW R3, R3 + ADDW R11, R3, R3 + ADD $2, R8, R8 + LSLW $2, R4, R7 + UBFIZW $4, R4, $14, R4 + LSLW $6, R3, R6 + ANDW $-262144, R7, R7 + ORRW R4, R7, R7 + ANDW $-4194304, R6, R4 + UBFIZW $8, R3, $14, R6 + ORRW R3>>26, R7, R3 + ORRW R6, R4, R4 + REVW R3, R3 + REVW R4, R4 + STPW (R3, R4), (R9) + ADD $7, R9, R9 + BLT declop +decrem: + CBZW R0, decret + MOVW (R2)(R12.UXTW<<2), R1 + CMPW $1, R0 + SUBW $0x4e, R1, R3 + UBFX $14, R3, $2, R4 + ORRW R3<<2, R4, R3 + MOVB R3, (R5)(R10.SXTW) + BEQ decret + + MOVW $0xffb2, R7 + ADDW $1, R10, R4 + MOVK $(0xffb1<<16), R7 + ADDW R7, R1, R1 + CMPW $2, R0 + UBFX $20, R1, $8, R6 + LSRW $6, R1, R3 + ANDW $3, R6, R8 + ANDW $-4, R3, R3 + ORRW R8, R3, R3 + MOVB R3, (R5)(R4.SXTW) + BEQ decret + + ADDW $2, R10, R3 + LSRW $12, R1, R4 + ANDW $-16, R4, R4 + CMPW $3, R0 + ORRW R1>>28, R4, R1 + MOVB R1, (R5)(R3.SXTW) + BEQ decret + + ADDW $3, R10, R1 + ADDW $1, R12, R12 + ANDW $0xf0, R6, R6 + CMPW $4, R0 + MOVW (R2)(R12<<2), R3 + SUBW $0x4e, R3, R2 + UBFX $2, R2, $4, R4 + ORRW R6, R4, R4 + MOVB R4, (R5)(R1.SXTW) + BEQ decret + + ADDW $4, R10, R1 + UBFX $10, R2, $6, R4 + ORRW R2<<6, R4, R2 + CMPW $5, R0 + MOVB R2, (R5)(R1.SXTW) + BEQ decret + + ADDW R7, R3, R3 + ADDW $5, R10, R10 + LSRW $2, R3, R0 + UBFX $16, R3, $6, R3 + ANDW $-64, R0, R0 + ORRW R3, R0, R3 + MOVB R3, (R5)(R10.SXTW) +decret: + RET +dectil: + MOVW $0, R10 + MOVW $0, R12 + JMP decrem diff --git a/base14_test.go b/base14_test.go index 583eb3c..44e37d6 100644 --- a/base14_test.go +++ b/base14_test.go @@ -24,7 +24,9 @@ func TestBase14(t *testing.T) { for i := 1; i < 4096; i++ { rand.Read(buf[:i]) out := Decode(Encode(buf[:i])) - assert.Equal(t, hex.EncodeToString(buf[:i]), hex.EncodeToString(out)) + if !assert.Equal(t, hex.EncodeToString(buf[:i]), hex.EncodeToString(out)) { + t.Fatal() + } } } diff --git a/c/base1432.c b/c/base1432.c index cf065f3..2b6a531 100644 --- a/c/base1432.c +++ b/c/base1432.c @@ -104,3 +104,58 @@ int base16384_encode(int offset, int outlen, const char* data, int dlen, int dca } return outlen; } + +void base16384_decode(int offset, int outlen, const char* data, int dlen, int dcap, char* buf, int blen, int bcap) { + uint32_t* vals = (uint32_t*)data; + uint32_t n = 0; + int32_t i = 0; + for(; i <= outlen - 7; i+=7) { // n实际每次自增2 + register uint32_t sum = 0; + register uint32_t shift = htobe32(vals[n++]) - 0x4e004e00; + shift <<= 2; + sum |= shift & 0xfffc0000; + shift <<= 2; + sum |= shift & 0x0003fff0; + shift = htobe32(vals[n++]) - 0x4e004e00; + sum |= shift >> 26; + *(uint32_t*)(buf+i) = be32toh(sum); + sum = 0; + shift <<= 6; + sum |= shift & 0xffc00000; + shift <<= 2; + sum |= shift & 0x003fff00; + *(uint32_t*)(buf+i+4) = be32toh(sum); + } + if(offset--) { + // 这里有读取越界 + #ifdef WORDS_BIGENDIAN + register uint32_t sum = __builtin_bswap32(vals[n++]); + #else + register uint32_t sum = vals[n++]; + #endif + sum -= 0x0000004e; + buf[i++] = ((sum & 0x0000003f) << 2) | ((sum & 0x0000c000) >> 14); + if(offset--) { + sum -= 0x004e0000; + buf[i++] = ((sum & 0x00003f00) >> 6) | ((sum & 0x00300000) >> 20); + if(offset--) { + buf[i++] = ((sum & 0x000f0000) >> 12) | ((sum & 0xf0000000) >> 28); + if(offset--) { + buf[i] = (sum & 0x0f000000) >> 20; + // 这里有读取越界 + sum = vals[n]; + sum -= 0x0000004e; + buf[i++] |= (sum & 0x0000003c) >> 2; + if(offset--) { + buf[i++] = ((sum & 0x00000003) << 6) | ((sum & 0x0000fc00) >> 10); + if(offset--) { + sum -= 0x004e0000; + buf[i] = ((sum & 0x00000300) >> 2) | ((sum & 0x003f0000) >> 16); + } + } + } + } + } + } + return; +}