mirror of
https://github.com/fumiama/go-base16384.git
synced 2026-06-05 00:32:52 +08:00
arm64: 优化流水线和内存分配
goos: darwin goarch: arm64 pkg: github.com/fumiama/go-base16384 name old time/op new time/op delta EncodeTo/16-8 10.6ns ± 0% 9.8ns ± 1% -7.60% (p=0.008 n=5+5) EncodeTo/256-8 80.8ns ± 0% 56.0ns ± 0% -30.72% (p=0.008 n=5+5) EncodeTo/4K-8 1.21µs ± 0% 0.82µs ± 1% -32.34% (p=0.016 n=4+5) EncodeTo/32K-8 9.64µs ± 0% 6.52µs ± 1% -32.36% (p=0.008 n=5+5) DecodeTo/16-8 9.79ns ± 0% 6.33ns ± 1% -35.29% (p=0.016 n=4+5) DecodeTo/256-8 80.9ns ± 0% 52.6ns ± 1% -34.98% (p=0.008 n=5+5) DecodeTo/4K-8 1.22µs ± 0% 0.78µs ± 1% -35.91% (p=0.008 n=5+5) DecodeTo/32K-8 9.71µs ± 1% 6.21µs ± 1% -36.01% (p=0.008 n=5+5) Encoder/16-8 76.5ns ± 0% 76.2ns ± 0% -0.42% (p=0.008 n=5+5) Encoder/256-8 356ns ± 0% 290ns ± 0% -18.39% (p=0.008 n=5+5) Encoder/4K-8 4.05µs ± 0% 3.70µs ± 0% -8.65% (p=0.008 n=5+5) Encoder/32K-8 34.1µs ± 0% 29.2µs ± 0% -14.22% (p=0.008 n=5+5) Decoder/16-8 205ns ± 0% 207ns ± 1% +1.28% (p=0.008 n=5+5) Decoder/256-8 262ns ± 0% 246ns ± 1% -6.05% (p=0.008 n=5+5) Decoder/4K-8 1.49µs ± 0% 1.12µs ± 0% -24.48% (p=0.008 n=5+5) Decoder/32K-8 11.0µs ± 0% 8.1µs ± 1% -26.64% (p=0.008 n=5+5) name old speed new speed delta EncodeTo/16-8 1.50GB/s ± 0% 1.63GB/s ± 1% +8.22% (p=0.008 n=5+5) EncodeTo/256-8 3.17GB/s ± 0% 4.57GB/s ± 0% +44.35% (p=0.008 n=5+5) EncodeTo/4K-8 3.37GB/s ± 0% 4.99GB/s ± 1% +47.78% (p=0.008 n=5+5) EncodeTo/32K-8 3.40GB/s ± 0% 5.02GB/s ± 1% +47.85% (p=0.008 n=5+5) DecodeTo/16-8 2.25GB/s ± 0% 3.47GB/s ± 1% +54.53% (p=0.016 n=4+5) DecodeTo/256-8 3.66GB/s ± 0% 5.63GB/s ± 1% +53.81% (p=0.008 n=5+5) DecodeTo/4K-8 3.84GB/s ± 0% 6.00GB/s ± 1% +56.05% (p=0.008 n=5+5) DecodeTo/32K-8 3.86GB/s ± 1% 6.03GB/s ± 1% +56.27% (p=0.008 n=5+5) Encoder/16-8 209MB/s ± 0% 210MB/s ± 0% +0.42% (p=0.008 n=5+5) Encoder/256-8 720MB/s ± 0% 882MB/s ± 0% +22.53% (p=0.008 n=5+5) Encoder/4K-8 1.01GB/s ± 0% 1.11GB/s ± 0% +9.47% (p=0.008 n=5+5) Encoder/32K-8 962MB/s ± 0% 1121MB/s ± 0% +16.58% (p=0.008 n=5+5) Decoder/16-8 78.1MB/s ± 0% 77.1MB/s ± 1% -1.25% (p=0.008 n=5+5) Decoder/256-8 977MB/s ± 0% 1040MB/s ± 1% 
+6.45% (p=0.008 n=5+5) Decoder/4K-8 2.76GB/s ± 0% 3.65GB/s ± 0% +32.39% (p=0.008 n=5+5) Decoder/32K-8 2.98GB/s ± 0% 4.06GB/s ± 1% +36.31% (p=0.008 n=5+5) name old alloc/op new alloc/op delta EncodeTo/16-8 0.00B 0.00B ~ (all equal) EncodeTo/256-8 0.00B 0.00B ~ (all equal) EncodeTo/4K-8 0.00B 0.00B ~ (all equal) EncodeTo/32K-8 0.00B 0.00B ~ (all equal) DecodeTo/16-8 0.00B 0.00B ~ (all equal) DecodeTo/256-8 0.00B 0.00B ~ (all equal) DecodeTo/4K-8 0.00B 0.00B ~ (all equal) DecodeTo/32K-8 0.00B 0.00B ~ (all equal) Encoder/16-8 24.0B ± 0% 24.0B ± 0% ~ (all equal) Encoder/256-8 472B ± 0% 24B ± 0% -94.92% (p=0.008 n=5+5) Encoder/4K-8 24.0B ± 0% 24.0B ± 0% ~ (all equal) Encoder/32K-8 41.0kB ± 0% 0.0kB ± 0% -99.94% (p=0.008 n=5+5) Decoder/16-8 1.39kB ± 0% 1.39kB ± 0% ~ (all equal) Decoder/256-8 1.39kB ± 0% 1.39kB ± 0% ~ (all equal) Decoder/4K-8 4.98kB ± 0% 4.98kB ± 0% ~ (all equal) Decoder/32K-8 41.1kB ± 0% 41.1kB ± 0% ~ (all equal) name old allocs/op new allocs/op delta EncodeTo/16-8 0.00 0.00 ~ (all equal) EncodeTo/256-8 0.00 0.00 ~ (all equal) EncodeTo/4K-8 0.00 0.00 ~ (all equal) EncodeTo/32K-8 0.00 0.00 ~ (all equal) DecodeTo/16-8 0.00 0.00 ~ (all equal) DecodeTo/256-8 0.00 0.00 ~ (all equal) DecodeTo/4K-8 0.00 0.00 ~ (all equal) DecodeTo/32K-8 0.00 0.00 ~ (all equal) Encoder/16-8 1.00 ± 0% 1.00 ± 0% ~ (all equal) Encoder/256-8 2.00 ± 0% 1.00 ± 0% -50.00% (p=0.008 n=5+5) Encoder/4K-8 1.00 ± 0% 1.00 ± 0% ~ (all equal) Encoder/32K-8 2.00 ± 0% 1.00 ± 0% -50.00% (p=0.008 n=5+5) Decoder/16-8 3.00 ± 0% 3.00 ± 0% ~ (all equal) Decoder/256-8 3.00 ± 0% 3.00 ± 0% ~ (all equal) Decoder/4K-8 3.00 ± 0% 3.00 ± 0% ~ (all equal) Decoder/32K-8 3.00 ± 0% 3.00 ± 0% ~ (all equal)
This commit is contained in:
@@ -5,26 +5,21 @@ package base14
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
//go:noescape
|
||||
//go:nosplit
|
||||
func _encode(offset, outlen int, b, encd []byte) (sum uint64, valn uintptr)
|
||||
func _encode(offset int, b, encd []byte) (sum uint64, n int)
|
||||
|
||||
//go:noescape
|
||||
//go:nosplit
|
||||
func _decode(offset, outlen int, b, decd []byte)
|
||||
|
||||
func encode(offset, outlen int, b, encd []byte) {
|
||||
if len(b) == 7 {
|
||||
b = append(b, 0)
|
||||
}
|
||||
sum, valn := _encode(offset, outlen, b, encd)
|
||||
sum, n := _encode(offset, b, encd)
|
||||
if offset == 0 {
|
||||
return
|
||||
}
|
||||
n := valn - (uintptr)(*(*unsafe.Pointer)(unsafe.Pointer(&encd)))
|
||||
var tmp [4]byte
|
||||
binary.LittleEndian.PutUint32(tmp[:], uint32(sum))
|
||||
copy(encd[n:], tmp[:])
|
||||
@@ -33,8 +28,5 @@ func encode(offset, outlen int, b, encd []byte) {
|
||||
}
|
||||
|
||||
func decode(offset, outlen int, b, decd []byte) {
|
||||
if offset != 0 && cap(b) == len(b) {
|
||||
b = append(b, make([]byte, 8)...)
|
||||
}
|
||||
_decode(offset, outlen, b, decd)
|
||||
}
|
||||
|
||||
135
base14_arm64.s
135
base14_arm64.s
@@ -3,12 +3,12 @@
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// func _encode(offset, outlen int, b, encd []byte) (sum uint64, &vals[n] uintptr)
|
||||
TEXT ·_encode(SB), NOSPLIT, $0-81
|
||||
// func _encode(offset int, b, encd []byte) (sum uint64, n int)
|
||||
TEXT ·_encode(SB), NOSPLIT, $0-72
|
||||
MOVD ·offset+0(FP), R0
|
||||
MOVD ·data+16(FP), R9
|
||||
MOVD ·dlen+24(FP), R3
|
||||
MOVD ·encd+40(FP), R5
|
||||
MOVD ·data+8(FP), R9
|
||||
MOVD ·dlen+16(FP), R3
|
||||
MOVD ·encd+32(FP), R5
|
||||
|
||||
SUBW $6, R3, R3
|
||||
CMPW $0, R3
|
||||
@@ -109,8 +109,9 @@ encrem:
|
||||
encsum:
|
||||
ADDW $0x4e0000, R3, R3
|
||||
ADDW $0x4e, R3, R3
|
||||
MOVD R3, ·sum+64(FP)
|
||||
MOVD R7, ·n+72(FP)
|
||||
SUB R5, R7, R7
|
||||
MOVD R3, ·sum+56(FP)
|
||||
MOVD R7, ·n+64(FP)
|
||||
encret:
|
||||
RET
|
||||
enctil:
|
||||
@@ -125,16 +126,16 @@ TEXT ·_decode(SB), NOSPLIT, $0-64
|
||||
MOVD ·data+16(FP), R2
|
||||
MOVD ·decd+40(FP), R5
|
||||
|
||||
SUBW $6, R1, R1
|
||||
CMPW $0, R1
|
||||
BLE dectil
|
||||
MOVW $0xb200, R11
|
||||
MOVD R5, R9
|
||||
SUB $8, R2, R14
|
||||
SUB $4, R2, R13
|
||||
MOVD $2, R8
|
||||
MOVW $0, R10
|
||||
MOVK $(0xb1ff<<16), R11
|
||||
SUBW $6, R1, R1 // sub w1, w1, #6
|
||||
CMPW $0, R1 // cmp w1, 0
|
||||
BLE dectil // ble .L7
|
||||
MOVW $0xb200, R11 // mov w11, 45568
|
||||
MOVD R5, R9 // mov x9, x5
|
||||
SUB $8, R2, R14 // sub x14, x2, #8
|
||||
SUB $4, R2, R13 // sub x13, x2, #4
|
||||
MOVD $2, R8 // mov x8, 2
|
||||
MOVW $0, R10 // mov w10, 0
|
||||
MOVK $(0xb1ff<<16), R11 // movk w11, 0xb1ff, lsl 16
|
||||
declop:
|
||||
MOVW (R14)(R8<<2), R4
|
||||
ADDW $7, R10, R10
|
||||
@@ -161,61 +162,61 @@ declop:
|
||||
ADD $7, R9, R9
|
||||
BLT declop
|
||||
decrem:
|
||||
CBZW R0, decret
|
||||
MOVW (R2)(R12.UXTW<<2), R1
|
||||
CMPW $1, R0
|
||||
SUBW $0x4e, R1, R3
|
||||
UBFX $14, R3, $2, R4
|
||||
ORRW R3<<2, R4, R3
|
||||
MOVB R3, (R5)(R10.SXTW)
|
||||
BEQ decret
|
||||
CBZW R0, decret // cbz w0, .L1
|
||||
MOVW (R2)(R12.UXTW<<2), R1 // ldr w1, [x2, w12, uxtw 2]
|
||||
CMPW $1, R0 // cmp w0, 1
|
||||
SUBW $0x4e, R1, R3 // sub w3, w1, #78
|
||||
UBFX $14, R3, $2, R4 // ubfx x4, x3, 14, 2
|
||||
ORRW R3<<2, R4, R3 // orr w3, w4, w3, lsl 2
|
||||
MOVB R3, (R5)(R10.SXTW) // strb w3, [x5, w10, sxtw]
|
||||
BEQ decret // beq .L1
|
||||
|
||||
MOVW $0xffb2, R7
|
||||
ADDW $1, R10, R4
|
||||
MOVK $(0xffb1<<16), R7
|
||||
ADDW R7, R1, R1
|
||||
CMPW $2, R0
|
||||
UBFX $20, R1, $8, R6
|
||||
LSRW $6, R1, R3
|
||||
ANDW $3, R6, R8
|
||||
ANDW $-4, R3, R3
|
||||
ORRW R8, R3, R3
|
||||
MOVB R3, (R5)(R4.SXTW)
|
||||
BEQ decret
|
||||
MOVW $0xffb2, R7 // mov w7, 65458
|
||||
ADDW $1, R10, R6 // add w6, w10, 1
|
||||
MOVK $(0xffb1<<16), R7 // movk w7, 0xffb1, lsl 16
|
||||
ADDW R7, R1, R1 // add w1, w1, w7
|
||||
CMPW $2, R0 // cmp w0, 2
|
||||
UBFX $20, R1, $8, R4 // ubfx x4, x1, 20, 8
|
||||
LSRW $6, R1, R3 // lsr w3, w1, 6
|
||||
ANDW $3, R4, R8 // and w8, w4, 3
|
||||
ANDW $-4, R3, R3 // and w3, w3, -4
|
||||
ORRW R8, R3, R3 // orr w3, w3, w8
|
||||
MOVB R3, (R5)(R6.SXTW) // strb w3, [x5, w6, sxtw]
|
||||
BEQ decret // beq .L1
|
||||
|
||||
ADDW $2, R10, R3
|
||||
LSRW $12, R1, R4
|
||||
ANDW $-16, R4, R4
|
||||
CMPW $3, R0
|
||||
ORRW R1>>28, R4, R1
|
||||
MOVB R1, (R5)(R3.SXTW)
|
||||
BEQ decret
|
||||
ADDW $2, R10, R3 // add w3, w10, 2
|
||||
LSRW $12, R1, R6 // lsr w6, w1, 12
|
||||
ANDW $-16, R6, R6 // and w6, w6, -16
|
||||
CMPW $3, R0 // cmp w0, 3
|
||||
ORRW R1>>28, R6, R1 // orr w1, w6, w1, lsr 28
|
||||
MOVB R1, (R5)(R3.SXTW) // strb w1, [x5, w3, sxtw]
|
||||
BEQ decret // beq .L1
|
||||
|
||||
ADDW $3, R10, R1
|
||||
ADDW $1, R12, R12
|
||||
ANDW $0xf0, R6, R6
|
||||
CMPW $4, R0
|
||||
MOVW (R2)(R12<<2), R3
|
||||
SUBW $0x4e, R3, R2
|
||||
UBFX $2, R2, $4, R4
|
||||
ORRW R6, R4, R4
|
||||
MOVB R4, (R5)(R1.SXTW)
|
||||
BEQ decret
|
||||
ADDW $3, R10, R1 // add w1, w10, 3
|
||||
ADDW $1, R12, R12 // add w12, w12, 1
|
||||
ANDW $0xf0, R4, R4 // and w4, w4, 240
|
||||
CMPW $4, R0 // cmp w0, 4
|
||||
MOVW (R2)(R12<<2), R3 // ldr w3, [x2, x12, lsl 2]
|
||||
SUBW $0x4e, R3, R2 // sub w2, w3, #78
|
||||
UBFX $2, R2, $4, R6 // ubfx x6, x2, 2, 4
|
||||
ORRW R6, R4, R4 // orr w4, w4, w6
|
||||
MOVB R4, (R5)(R1.SXTW) // strb w4, [x5, w1, sxtw]
|
||||
BEQ decret // beq .L1
|
||||
|
||||
ADDW $4, R10, R1
|
||||
UBFX $10, R2, $6, R4
|
||||
ORRW R2<<6, R4, R2
|
||||
CMPW $5, R0
|
||||
MOVB R2, (R5)(R1.SXTW)
|
||||
BEQ decret
|
||||
ADDW $4, R10, R1 // add w1, w10, 4
|
||||
UBFX $10, R2, $6, R4 // ubfx x4, x2, 10, 6
|
||||
ORRW R2<<6, R4, R2 // orr w2, w4, w2, lsl 6
|
||||
CMPW $5, R0 // cmp w0, 5
|
||||
MOVB R2, (R5)(R1.SXTW) // strb w2, [x5, w1, sxtw]
|
||||
BEQ decret // beq .L1
|
||||
|
||||
ADDW R7, R3, R3
|
||||
ADDW $5, R10, R10
|
||||
LSRW $2, R3, R0
|
||||
UBFX $16, R3, $6, R3
|
||||
ANDW $-64, R0, R0
|
||||
ORRW R3, R0, R3
|
||||
MOVB R3, (R5)(R10.SXTW)
|
||||
ADDW R7, R3, R3 // add w3, w3, w7
|
||||
ADDW $5, R10, R10 // add w10, w10, 5
|
||||
LSRW $2, R3, R0 // lsr w0, w3, 2
|
||||
UBFX $16, R3, $6, R3 // ubfx x3, x3, 16, 6
|
||||
ANDW $-64, R0, R0 // and w0, w0, -64
|
||||
ORRW R3, R0, R3 // orr w3, w0, w3
|
||||
MOVB R3, (R5)(R10.SXTW) // strb w3, [x5, w10, sxtw]
|
||||
decret:
|
||||
RET
|
||||
dectil:
|
||||
|
||||
Reference in New Issue
Block a user