1
0
mirror of https://github.com/fumiama/go-base16384.git synced 2026-06-05 00:32:52 +08:00

arm64: 优化流水线和内存分配

goos: darwin
goarch: arm64
pkg: github.com/fumiama/go-base16384

name            old time/op    new time/op    delta
EncodeTo/16-8     10.6ns ± 0%     9.8ns ± 1%   -7.60%  (p=0.008 n=5+5)
EncodeTo/256-8    80.8ns ± 0%    56.0ns ± 0%  -30.72%  (p=0.008 n=5+5)
EncodeTo/4K-8     1.21µs ± 0%    0.82µs ± 1%  -32.34%  (p=0.016 n=4+5)
EncodeTo/32K-8    9.64µs ± 0%    6.52µs ± 1%  -32.36%  (p=0.008 n=5+5)
DecodeTo/16-8     9.79ns ± 0%    6.33ns ± 1%  -35.29%  (p=0.016 n=4+5)
DecodeTo/256-8    80.9ns ± 0%    52.6ns ± 1%  -34.98%  (p=0.008 n=5+5)
DecodeTo/4K-8     1.22µs ± 0%    0.78µs ± 1%  -35.91%  (p=0.008 n=5+5)
DecodeTo/32K-8    9.71µs ± 1%    6.21µs ± 1%  -36.01%  (p=0.008 n=5+5)
Encoder/16-8      76.5ns ± 0%    76.2ns ± 0%   -0.42%  (p=0.008 n=5+5)
Encoder/256-8      356ns ± 0%     290ns ± 0%  -18.39%  (p=0.008 n=5+5)
Encoder/4K-8      4.05µs ± 0%    3.70µs ± 0%   -8.65%  (p=0.008 n=5+5)
Encoder/32K-8     34.1µs ± 0%    29.2µs ± 0%  -14.22%  (p=0.008 n=5+5)
Decoder/16-8       205ns ± 0%     207ns ± 1%   +1.28%  (p=0.008 n=5+5)
Decoder/256-8      262ns ± 0%     246ns ± 1%   -6.05%  (p=0.008 n=5+5)
Decoder/4K-8      1.49µs ± 0%    1.12µs ± 0%  -24.48%  (p=0.008 n=5+5)
Decoder/32K-8     11.0µs ± 0%     8.1µs ± 1%  -26.64%  (p=0.008 n=5+5)

name            old speed      new speed      delta
EncodeTo/16-8   1.50GB/s ± 0%  1.63GB/s ± 1%   +8.22%  (p=0.008 n=5+5)
EncodeTo/256-8  3.17GB/s ± 0%  4.57GB/s ± 0%  +44.35%  (p=0.008 n=5+5)
EncodeTo/4K-8   3.37GB/s ± 0%  4.99GB/s ± 1%  +47.78%  (p=0.008 n=5+5)
EncodeTo/32K-8  3.40GB/s ± 0%  5.02GB/s ± 1%  +47.85%  (p=0.008 n=5+5)
DecodeTo/16-8   2.25GB/s ± 0%  3.47GB/s ± 1%  +54.53%  (p=0.016 n=4+5)
DecodeTo/256-8  3.66GB/s ± 0%  5.63GB/s ± 1%  +53.81%  (p=0.008 n=5+5)
DecodeTo/4K-8   3.84GB/s ± 0%  6.00GB/s ± 1%  +56.05%  (p=0.008 n=5+5)
DecodeTo/32K-8  3.86GB/s ± 1%  6.03GB/s ± 1%  +56.27%  (p=0.008 n=5+5)
Encoder/16-8     209MB/s ± 0%   210MB/s ± 0%   +0.42%  (p=0.008 n=5+5)
Encoder/256-8    720MB/s ± 0%   882MB/s ± 0%  +22.53%  (p=0.008 n=5+5)
Encoder/4K-8    1.01GB/s ± 0%  1.11GB/s ± 0%   +9.47%  (p=0.008 n=5+5)
Encoder/32K-8    962MB/s ± 0%  1121MB/s ± 0%  +16.58%  (p=0.008 n=5+5)
Decoder/16-8    78.1MB/s ± 0%  77.1MB/s ± 1%   -1.25%  (p=0.008 n=5+5)
Decoder/256-8    977MB/s ± 0%  1040MB/s ± 1%   +6.45%  (p=0.008 n=5+5)
Decoder/4K-8    2.76GB/s ± 0%  3.65GB/s ± 0%  +32.39%  (p=0.008 n=5+5)
Decoder/32K-8   2.98GB/s ± 0%  4.06GB/s ± 1%  +36.31%  (p=0.008 n=5+5)

name            old alloc/op   new alloc/op   delta
EncodeTo/16-8      0.00B          0.00B          ~     (all equal)
EncodeTo/256-8     0.00B          0.00B          ~     (all equal)
EncodeTo/4K-8      0.00B          0.00B          ~     (all equal)
EncodeTo/32K-8     0.00B          0.00B          ~     (all equal)
DecodeTo/16-8      0.00B          0.00B          ~     (all equal)
DecodeTo/256-8     0.00B          0.00B          ~     (all equal)
DecodeTo/4K-8      0.00B          0.00B          ~     (all equal)
DecodeTo/32K-8     0.00B          0.00B          ~     (all equal)
Encoder/16-8       24.0B ± 0%     24.0B ± 0%     ~     (all equal)
Encoder/256-8       472B ± 0%       24B ± 0%  -94.92%  (p=0.008 n=5+5)
Encoder/4K-8       24.0B ± 0%     24.0B ± 0%     ~     (all equal)
Encoder/32K-8     41.0kB ± 0%     0.0kB ± 0%  -99.94%  (p=0.008 n=5+5)
Decoder/16-8      1.39kB ± 0%    1.39kB ± 0%     ~     (all equal)
Decoder/256-8     1.39kB ± 0%    1.39kB ± 0%     ~     (all equal)
Decoder/4K-8      4.98kB ± 0%    4.98kB ± 0%     ~     (all equal)
Decoder/32K-8     41.1kB ± 0%    41.1kB ± 0%     ~     (all equal)

name            old allocs/op  new allocs/op  delta
EncodeTo/16-8       0.00           0.00          ~     (all equal)
EncodeTo/256-8      0.00           0.00          ~     (all equal)
EncodeTo/4K-8       0.00           0.00          ~     (all equal)
EncodeTo/32K-8      0.00           0.00          ~     (all equal)
DecodeTo/16-8       0.00           0.00          ~     (all equal)
DecodeTo/256-8      0.00           0.00          ~     (all equal)
DecodeTo/4K-8       0.00           0.00          ~     (all equal)
DecodeTo/32K-8      0.00           0.00          ~     (all equal)
Encoder/16-8        1.00 ± 0%      1.00 ± 0%     ~     (all equal)
Encoder/256-8       2.00 ± 0%      1.00 ± 0%  -50.00%  (p=0.008 n=5+5)
Encoder/4K-8        1.00 ± 0%      1.00 ± 0%     ~     (all equal)
Encoder/32K-8       2.00 ± 0%      1.00 ± 0%  -50.00%  (p=0.008 n=5+5)
Decoder/16-8        3.00 ± 0%      3.00 ± 0%     ~     (all equal)
Decoder/256-8       3.00 ± 0%      3.00 ± 0%     ~     (all equal)
Decoder/4K-8        3.00 ± 0%      3.00 ± 0%     ~     (all equal)
Decoder/32K-8       3.00 ± 0%      3.00 ± 0%     ~     (all equal)
This commit is contained in:
源文雨
2022-12-14 10:07:47 +08:00
parent 369cf02def
commit cdc9c6322a
2 changed files with 70 additions and 77 deletions

View File

@@ -5,26 +5,21 @@ package base14
import (
"encoding/binary"
"unsafe"
)
//go:noescape
//go:nosplit
func _encode(offset, outlen int, b, encd []byte) (sum uint64, valn uintptr)
func _encode(offset int, b, encd []byte) (sum uint64, n int)
//go:noescape
//go:nosplit
func _decode(offset, outlen int, b, decd []byte)
func encode(offset, outlen int, b, encd []byte) {
if len(b) == 7 {
b = append(b, 0)
}
sum, valn := _encode(offset, outlen, b, encd)
sum, n := _encode(offset, b, encd)
if offset == 0 {
return
}
n := valn - (uintptr)(*(*unsafe.Pointer)(unsafe.Pointer(&encd)))
var tmp [4]byte
binary.LittleEndian.PutUint32(tmp[:], uint32(sum))
copy(encd[n:], tmp[:])
@@ -33,8 +28,5 @@ func encode(offset, outlen int, b, encd []byte) {
}
func decode(offset, outlen int, b, decd []byte) {
if offset != 0 && cap(b) == len(b) {
b = append(b, make([]byte, 8)...)
}
_decode(offset, outlen, b, decd)
}

View File

@@ -3,12 +3,12 @@
#include "textflag.h"
// func _encode(offset, outlen int, b, encd []byte) (sum uint64, &vals[n] uintptr)
TEXT ·_encode(SB), NOSPLIT, $0-81
// func _encode(offset int, b, encd []byte) (sum uint64, n int)
TEXT ·_encode(SB), NOSPLIT, $0-72
MOVD ·offset+0(FP), R0
MOVD ·data+16(FP), R9
MOVD ·dlen+24(FP), R3
MOVD ·encd+40(FP), R5
MOVD ·data+8(FP), R9
MOVD ·dlen+16(FP), R3
MOVD ·encd+32(FP), R5
SUBW $6, R3, R3
CMPW $0, R3
@@ -109,8 +109,9 @@ encrem:
encsum:
ADDW $0x4e0000, R3, R3
ADDW $0x4e, R3, R3
MOVD R3, ·sum+64(FP)
MOVD R7, ·n+72(FP)
SUB R5, R7, R7
MOVD R3, ·sum+56(FP)
MOVD R7, ·n+64(FP)
encret:
RET
enctil:
@@ -125,16 +126,16 @@ TEXT ·_decode(SB), NOSPLIT, $0-64
MOVD ·data+16(FP), R2
MOVD ·decd+40(FP), R5
SUBW $6, R1, R1
CMPW $0, R1
BLE dectil
MOVW $0xb200, R11
MOVD R5, R9
SUB $8, R2, R14
SUB $4, R2, R13
MOVD $2, R8
MOVW $0, R10
MOVK $(0xb1ff<<16), R11
SUBW $6, R1, R1 // sub w1, w1, #6
CMPW $0, R1 // cmp w1, 0
BLE dectil // ble .L7
MOVW $0xb200, R11 // mov w11, 45568
MOVD R5, R9 // mov x9, x5
SUB $8, R2, R14 // sub x14, x2, #8
SUB $4, R2, R13 // sub x13, x2, #4
MOVD $2, R8 // mov x8, 2
MOVW $0, R10 // mov w10, 0
MOVK $(0xb1ff<<16), R11 // movk w11, 0xb1ff, lsl 16
declop:
MOVW (R14)(R8<<2), R4
ADDW $7, R10, R10
@@ -161,61 +162,61 @@ declop:
ADD $7, R9, R9
BLT declop
decrem:
CBZW R0, decret
MOVW (R2)(R12.UXTW<<2), R1
CMPW $1, R0
SUBW $0x4e, R1, R3
UBFX $14, R3, $2, R4
ORRW R3<<2, R4, R3
MOVB R3, (R5)(R10.SXTW)
BEQ decret
CBZW R0, decret // cbz w0, .L1
MOVW (R2)(R12.UXTW<<2), R1 // ldr w1, [x2, w12, uxtw 2]
CMPW $1, R0 // cmp w0, 1
SUBW $0x4e, R1, R3 // sub w3, w1, #78
UBFX $14, R3, $2, R4 // ubfx x4, x3, 14, 2
ORRW R3<<2, R4, R3 // orr w3, w4, w3, lsl 2
MOVB R3, (R5)(R10.SXTW) // strb w3, [x5, w10, sxtw]
BEQ decret // beq .L1
MOVW $0xffb2, R7
ADDW $1, R10, R4
MOVK $(0xffb1<<16), R7
ADDW R7, R1, R1
CMPW $2, R0
UBFX $20, R1, $8, R6
LSRW $6, R1, R3
ANDW $3, R6, R8
ANDW $-4, R3, R3
ORRW R8, R3, R3
MOVB R3, (R5)(R4.SXTW)
BEQ decret
MOVW $0xffb2, R7 // mov w7, 65458
ADDW $1, R10, R6 // add w6, w10, 1
MOVK $(0xffb1<<16), R7 // movk w7, 0xffb1, lsl 16
ADDW R7, R1, R1 // add w1, w1, w7
CMPW $2, R0 // cmp w0, 2
UBFX $20, R1, $8, R4 // ubfx x4, x1, 20, 8
LSRW $6, R1, R3 // lsr w3, w1, 6
ANDW $3, R4, R8 // and w8, w4, 3
ANDW $-4, R3, R3 // and w3, w3, -4
ORRW R8, R3, R3 // orr w3, w3, w8
MOVB R3, (R5)(R6.SXTW) // strb w3, [x5, w6, sxtw]
BEQ decret // beq .L1
ADDW $2, R10, R3
LSRW $12, R1, R4
ANDW $-16, R4, R4
CMPW $3, R0
ORRW R1>>28, R4, R1
MOVB R1, (R5)(R3.SXTW)
BEQ decret
ADDW $2, R10, R3 // add w3, w10, 2
LSRW $12, R1, R6 // lsr w6, w1, 12
ANDW $-16, R6, R6 // and w6, w6, -16
CMPW $3, R0 // cmp w0, 3
ORRW R1>>28, R6, R1 // orr w1, w6, w1, lsr 28
MOVB R1, (R5)(R3.SXTW) // strb w1, [x5, w3, sxtw]
BEQ decret // beq .L1
ADDW $3, R10, R1
ADDW $1, R12, R12
ANDW $0xf0, R6, R6
CMPW $4, R0
MOVW (R2)(R12<<2), R3
SUBW $0x4e, R3, R2
UBFX $2, R2, $4, R4
ORRW R6, R4, R4
MOVB R4, (R5)(R1.SXTW)
BEQ decret
ADDW $3, R10, R1 // add w1, w10, 3
ADDW $1, R12, R12 // add w12, w12, 1
ANDW $0xf0, R4, R4 // and w4, w4, 240
CMPW $4, R0 // cmp w0, 4
MOVW (R2)(R12<<2), R3 // ldr w3, [x2, x12, lsl 2]
SUBW $0x4e, R3, R2 // sub w2, w3, #78
UBFX $2, R2, $4, R6 // ubfx x6, x2, 2, 4
ORRW R6, R4, R4 // orr w4, w4, w6
MOVB R4, (R5)(R1.SXTW) // strb w4, [x5, w1, sxtw]
BEQ decret // beq .L1
ADDW $4, R10, R1
UBFX $10, R2, $6, R4
ORRW R2<<6, R4, R2
CMPW $5, R0
MOVB R2, (R5)(R1.SXTW)
BEQ decret
ADDW $4, R10, R1 // add w1, w10, 4
UBFX $10, R2, $6, R4 // ubfx x4, x2, 10, 6
ORRW R2<<6, R4, R2 // orr w2, w4, w2, lsl 6
CMPW $5, R0 // cmp w0, 5
MOVB R2, (R5)(R1.SXTW) // strb w2, [x5, w1, sxtw]
BEQ decret // beq .L1
ADDW R7, R3, R3
ADDW $5, R10, R10
LSRW $2, R3, R0
UBFX $16, R3, $6, R3
ANDW $-64, R0, R0
ORRW R3, R0, R3
MOVB R3, (R5)(R10.SXTW)
ADDW R7, R3, R3 // add w3, w3, w7
ADDW $5, R10, R10 // add w10, w10, 5
LSRW $2, R3, R0 // lsr w0, w3, 2
UBFX $16, R3, $6, R3 // ubfx x3, x3, 16, 6
ANDW $-64, R0, R0 // and w0, w0, -64
ORRW R3, R0, R3 // orr w3, w0, w3
MOVB R3, (R5)(R10.SXTW) // strb w3, [x5, w10, sxtw]
decret:
RET
dectil: