mirror of
https://github.com/fumiama/go-base16384.git
synced 2026-06-05 00:32:52 +08:00
name old time/op new time/op delta EncodeTo/16-8 10.6ns ± 0% 10.4ns ± 0% -2.07% (p=0.008 n=5+5) EncodeTo/256-8 80.8ns ± 0% 55.7ns ± 0% -31.11% (p=0.008 n=5+5) EncodeTo/4K-8 1.21µs ± 0% 0.82µs ± 0% -32.67% (p=0.016 n=4+5) EncodeTo/32K-8 9.64µs ± 0% 6.47µs ± 0% -32.90% (p=0.008 n=5+5) DecodeTo/16-8 9.79ns ± 0% 26.02ns ± 0% +165.85% (p=0.016 n=4+5) DecodeTo/256-8 80.9ns ± 0% 111.6ns ± 0% +37.98% (p=0.008 n=5+5) DecodeTo/4K-8 1.22µs ± 0% 1.17µs ± 0% -3.73% (p=0.008 n=5+5) DecodeTo/32K-8 9.71µs ± 1% 8.80µs ± 1% -9.37% (p=0.008 n=5+5) Encoder/16-8 76.5ns ± 0% 76.5ns ± 0% ~ (p=0.810 n=5+5) Encoder/256-8 356ns ± 0% 291ns ± 0% -18.22% (p=0.008 n=5+5) Encoder/4K-8 4.05µs ± 0% 3.70µs ± 0% -8.76% (p=0.008 n=5+5) Encoder/32K-8 34.1µs ± 0% 29.2µs ± 0% -14.30% (p=0.008 n=5+5) Decoder/16-8 205ns ± 0% 207ns ± 0% +1.08% (p=0.008 n=5+5) Decoder/256-8 262ns ± 0% 244ns ± 1% -6.94% (p=0.008 n=5+5) Decoder/4K-8 1.49µs ± 0% 1.12µs ± 0% -24.87% (p=0.008 n=5+5) Decoder/32K-8 11.0µs ± 0% 8.0µs ± 0% -27.00% (p=0.008 n=5+5) name old speed new speed delta EncodeTo/16-8 1.50GB/s ± 0% 1.54GB/s ± 0% +2.11% (p=0.008 n=5+5) EncodeTo/256-8 3.17GB/s ± 0% 4.60GB/s ± 0% +45.15% (p=0.008 n=5+5) EncodeTo/4K-8 3.37GB/s ± 0% 5.01GB/s ± 0% +48.51% (p=0.008 n=5+5) EncodeTo/32K-8 3.40GB/s ± 0% 5.06GB/s ± 0% +49.02% (p=0.008 n=5+5) DecodeTo/16-8 2.25GB/s ± 0% 0.85GB/s ± 0% -62.39% (p=0.016 n=4+5) DecodeTo/256-8 3.66GB/s ± 0% 2.65GB/s ± 0% -27.54% (p=0.008 n=5+5) DecodeTo/4K-8 3.84GB/s ± 0% 3.99GB/s ± 0% +3.87% (p=0.008 n=5+5) DecodeTo/32K-8 3.86GB/s ± 1% 4.26GB/s ± 1% +10.33% (p=0.008 n=5+5) Encoder/16-8 209MB/s ± 0% 209MB/s ± 0% ~ (p=0.802 n=5+5) Encoder/256-8 720MB/s ± 0% 880MB/s ± 0% +22.28% (p=0.008 n=5+5) Encoder/4K-8 1.01GB/s ± 0% 1.11GB/s ± 0% +9.60% (p=0.008 n=5+5) Encoder/32K-8 962MB/s ± 0% 1122MB/s ± 0% +16.69% (p=0.008 n=5+5) Decoder/16-8 78.1MB/s ± 0% 77.3MB/s ± 0% -1.08% (p=0.008 n=5+5) Decoder/256-8 977MB/s ± 0% 1050MB/s ± 1% +7.47% (p=0.008 n=5+5) Decoder/4K-8 2.76GB/s ± 0% 3.67GB/s ± 0% +33.10% 
(p=0.008 n=5+5) Decoder/32K-8 2.98GB/s ± 0% 4.08GB/s ± 0% +36.98% (p=0.008 n=5+5) name old alloc/op new alloc/op delta EncodeTo/16-8 0.00B 0.00B ~ (all equal) EncodeTo/256-8 0.00B 0.00B ~ (all equal) EncodeTo/4K-8 0.00B 0.00B ~ (all equal) EncodeTo/32K-8 0.00B 0.00B ~ (all equal) DecodeTo/16-8 0.00B 48.00B ± 0% +Inf% (p=0.008 n=5+5) DecodeTo/256-8 0.00B 576.00B ± 0% +Inf% (p=0.008 n=5+5) DecodeTo/4K-8 0.00B 6144.00B ± 0% +Inf% (p=0.008 n=5+5) DecodeTo/32K-8 0.00B 49152.00B ± 0% +Inf% (p=0.008 n=5+5) Encoder/16-8 24.0B ± 0% 24.0B ± 0% ~ (all equal) Encoder/256-8 472B ± 0% 24B ± 0% -94.92% (p=0.008 n=5+5) Encoder/4K-8 24.0B ± 0% 24.0B ± 0% ~ (all equal) Encoder/32K-8 41.0kB ± 0% 0.0kB ± 0% -99.94% (p=0.008 n=5+5) Decoder/16-8 1.39kB ± 0% 1.39kB ± 0% ~ (all equal) Decoder/256-8 1.39kB ± 0% 1.39kB ± 0% ~ (all equal) Decoder/4K-8 4.98kB ± 0% 4.98kB ± 0% ~ (all equal) Decoder/32K-8 41.1kB ± 0% 41.1kB ± 0% ~ (all equal) name old allocs/op new allocs/op delta EncodeTo/16-8 0.00 0.00 ~ (all equal) EncodeTo/256-8 0.00 0.00 ~ (all equal) EncodeTo/4K-8 0.00 0.00 ~ (all equal) EncodeTo/32K-8 0.00 0.00 ~ (all equal) DecodeTo/16-8 0.00 1.00 ± 0% +Inf% (p=0.008 n=5+5) DecodeTo/256-8 0.00 1.00 ± 0% +Inf% (p=0.008 n=5+5) DecodeTo/4K-8 0.00 1.00 ± 0% +Inf% (p=0.008 n=5+5) DecodeTo/32K-8 0.00 1.00 ± 0% +Inf% (p=0.008 n=5+5) Encoder/16-8 1.00 ± 0% 1.00 ± 0% ~ (all equal) Encoder/256-8 2.00 ± 0% 1.00 ± 0% -50.00% (p=0.008 n=5+5) Encoder/4K-8 1.00 ± 0% 1.00 ± 0% ~ (all equal) Encoder/32K-8 2.00 ± 0% 1.00 ± 0% -50.00% (p=0.008 n=5+5) Decoder/16-8 3.00 ± 0% 3.00 ± 0% ~ (all equal) Decoder/256-8 3.00 ± 0% 3.00 ± 0% ~ (all equal) Decoder/4K-8 3.00 ± 0% 3.00 ± 0% ~ (all equal) Decoder/32K-8 3.00 ± 0% 3.00 ± 0% ~ (all equal)
225 lines
4.9 KiB
ArmAsm
225 lines
4.9 KiB
ArmAsm
//go:build arm64
|
|
// +build arm64
|
|
|
|
#include "textflag.h"
|
|
|
|
// func _encode(offset, outlen int, b, encd []byte) (sum uint64, &vals[n] uintptr)
//
// Packs the bytes of b into encd as 32-bit words whose 16-bit halves each
// carry 14 payload bits plus a 0x4e00 bias.
//
// Main loop: every iteration reads two 32-bit words at b+i and b+i+4
// (8 bytes, of which 7 are consumed), byte-swaps them, regroups the bits
// into 4 x 14-bit fields, adds 0x4e004e00 to each output word, byte-swaps
// again and stores two 32-bit words into encd.
// NOTE(review): on the last iteration the second load touches b[i+7], which
// can be one byte past dlen — confirm the caller guarantees that byte is
// readable.
//
// Tail: `offset & 0xff` selects how many leftover input bytes (1..6; any
// other non-zero value is handled like 6) are packed starting at b[i].
// Tail words are built in native byte order (no REVW) with a 0x004e004e bias.
// With 4 or more leftover bytes the first tail word is stored directly into
// encd; the final tail word is NOT stored here — it is returned in `sum`
// together with the address it belongs at in `n` (presumably the Go caller
// performs the store; verify against the caller).
//
// Stack arguments (ABI0 layout as used below):
//   offset+0(FP)  leftover-byte count for the tail
//   data+16(FP)   base pointer of b
//   dlen+24(FP)   length of b
//   encd+40(FP)   base pointer of encd
//   sum+64(FP)    result: last tail word   (written only if offset&0xff != 0)
//   n+72(FP)      result: &encd word slot  (written only if offset&0xff != 0)
// outlen+8(FP) is not read by this routine.
//
// NOTE(review): the argument size is declared as 81, while the slots touched
// here end at byte 80 — confirm against the Go declaration (go vet asmdecl).
//
// Register roles:
//   R0  offset (tail count)        R2  base of b (live until the 5th tail byte)
//   R3  dlen-6 loop bound, then the tail accumulator
//   R5  base of encd               R9  running input pointer / b+i in the tail
//   R8  output word index + 2      R10 i = input bytes consumed
//   R11 0x4e004e00                 R12 n = output words written by the loop
//   R13 encd-4, R14 encd-8 (biased bases so (Rx)(R8<<2) hits words R8-1, R8-2)
//   R4, R6, R7, R15 scratch
TEXT ·_encode(SB), NOSPLIT, $0-81
	MOVD ·offset+0(FP), R0
	// The base of b must live in R2: the tail below addresses the leftover
	// bytes as (R2)(R10). Arguments arrive on the stack, so R2 has no defined
	// value unless it is loaded here.
	MOVD ·data+16(FP), R2
	MOVD ·dlen+24(FP), R3
	MOVD ·encd+40(FP), R5

	SUBW $6, R3, R3                 // R3 = dlen - 6 (32-bit loop bound)
	CMPW $0, R3
	BLE  enctil                     // fewer than 7 bytes: tail only
	MOVW $0x4e00, R11
	MOVD R2, R9                     // R9 = cursor over b, advanced by 7 per group
	SUB  $8, R5, R14                // (R14)(R8<<2) -> output word R8-2
	SUB  $4, R5, R13                // (R13)(R8<<2) -> output word R8-1
	MOVD $2, R8
	MOVW $0, R10                    // int32_t i = 0
	MOVK $(0x4e00<<16), R11         // R11 = 0x4e004e00

enclop:
	MOVW   (R9), R4                 // bytes i..i+3
	ADDW   $7, R10, R10             // i += 7
	MOVW   R8, R12                  // R12 = word count after this group
	CMPW   R3, R10                  // flags consumed by BLT at the bottom; nothing between sets flags
	REVW   R4, R4                   // to big-endian bit order
	ADD    $7, R9, R9
	LSRW   $2, R4, R6
	UBFX   $4, R4, $14, R15         // bits 4..17 -> low field
	ANDW   $0x3fff0000, R6, R6      // bits 18..31 -> high field
	UBFIZW $26, R4, $4, R7          // carry the low 4 bits into the next word
	ORRW   R15, R6, R6
	ADDW   R11, R6, R6              // add the 0x4e00 bias to both halves
	REVW   R6, R6
	MOVW   R6, (R14)(R8<<2)         // first output word of the pair
	MOVW   -3(R9), R4               // bytes i+4..i+7 (R9 already advanced by 7)
	REVW   R4, R4
	LSRW   $6, R4, R4
	ANDW   $0x3fffffc, R4, R4
	ORRW   R7, R4, R4               // prepend the 4 carried bits
	ANDW   $0x3fff0000, R4, R6      // high field
	UBFX   $2, R4, $14, R4          // low field
	ORRW   R6, R4, R4
	ADDW   R11, R4, R4
	REVW   R4, R4
	MOVW   R4, (R13)(R8<<2)         // second output word of the pair
	ADDW   $2, R8, R8
	BLT    enclop                   // while i < dlen - 6 (signed)

encrem:
	ANDSW $0xff, R0, R0             // only the low byte of offset is significant
	BEQ   encret                    // no leftover bytes: results are left unwritten

	// leftover byte 0
	MOVBU  (R2)(R10.SXTW), R3       // b[i]
	UXTW   R12, R8                  // R8 = n
	CMPW   $1, R0
	SXTW   R10, R10
	ADD    R8<<2, R5, R7            // R7 = &encd word n
	UBFIZW $14, R3, $2, R4          // (b0 & 3) << 14
	ORRW   R3>>2, R4, R3            // | b0 >> 2
	BEQ    encsum

	// leftover byte 1
	ADD    R10, R2, R9              // R9 = b + i
	CMPW   $2, R0
	MOVBU  1(R9), R6
	LSLW   $6, R6, R4
	UBFIZW $20, R6, $2, R6          // (b1 & 3) << 20
	ANDW   $0x3f00, R4, R4          // (b1 << 6) & 0x3f00
	ORRW   R3, R4, R3
	ORRW   R3, R6, R3
	BEQ    encsum

	// leftover byte 2
	MOVBU 2(R9), R4
	CMPW  $3, R0
	LSLW  $12, R4, R6
	ANDW  $0xf0000, R6, R6          // (b2 << 12) & 0xf0000
	ORRW  R4<<28, R6, R4            // | b2 << 28
	ORRW  R4, R3, R3
	BEQ   encsum

	// leftover byte 3: completes the first tail word and opens the second
	ADD    $3, R10, R10
	ADDW   $1, R12, R12
	CMPW   $4, R0
	ADD    R12<<2, R5, R7           // R7 = &encd word n+1
	MOVBU  (R2)(R10), R4            // b[i+3]
	LSLW   $20, R4, R4
	ANDW   $0xf000000, R4, R4       // (b3 << 20) & 0x0f000000
	ORRW   R3, R4, R3
	ADDW   $0x4e0000, R3, R3        // + 0x004e004e, split into two immediates
	ADDW   $0x4e, R3, R3
	MOVW   R3, (R5)(R8<<2)          // store the finished first tail word
	MOVBU  (R2)(R10), R3
	UBFIZW $2, R3, $4, R3           // (b3 & 0xf) << 2 starts the second word
	BEQ    encsum

	// leftover byte 4 (R2 is reused as scratch from here on)
	MOVBU  4(R9), R4
	CMPW   $5, R0
	UBFIZW $10, R4, $6, R2          // (b4 & 0x3f) << 10
	ORRW   R3, R2, R3
	ORRW   R4>>6, R3, R3            // | b4 >> 6
	BEQ    encsum

	// leftover byte 5
	MOVBU  5(R9), R4
	LSLW   $2, R4, R2
	UBFIZW $16, R4, $6, R4          // (b5 & 0x3f) << 16
	ANDW   $0x300, R2, R2           // (b5 << 2) & 0x300
	ORRW   R4, R2, R2
	ORRW   R2, R3, R3

encsum:
	ADDW $0x4e0000, R3, R3          // bias the last tail word by 0x004e004e
	ADDW $0x4e, R3, R3
	MOVD R3, ·sum+64(FP)            // W-ops above leave the upper 32 bits zero
	MOVD R7, ·n+72(FP)              // address where the caller should place sum

encret:
	RET

enctil:
	MOVW $0, R10                    // i = 0
	MOVW $0, R12                    // n = 0
	JMP  encrem
|
|
|
|
// func _decode(offset, outlen int, b, decd []byte)
//
// Inverse of the 14-bit packing: reads 32-bit words from b and writes the
// recovered bytes to decd.
//
// Main loop: each iteration loads two 32-bit words from b, byte-swaps them,
// subtracts 0x4e004e00 from each, regroups the 4 x 14-bit fields into 56
// payload bits and stores them with one 8-byte STPW before advancing the
// output pointer by 7. The 8th stored byte is always zero and is overwritten
// by the next group.
// NOTE(review): the last iteration therefore writes one byte past the 7
// decoded bytes — confirm decd is sized to tolerate that.
//
// Tail: `offset` selects how many extra bytes (1..6; any other non-zero value
// behaves like 6) are decoded from the next one or two words. Tail words are
// used in native byte order (no REVW) with a 0x004e004e bias.
//
// Stack arguments read: offset+0, outlen+8, data+16 (base of b),
// decd+40 (base of decd). No results are written.
//
// Register roles:
//   R0  offset (tail count)        R1  outlen-6 loop bound, then scratch
//   R2  base of b (scratch after the 2nd tail word is loaded)
//   R5  base of decd               R9  running output pointer
//   R8  input word index + 2       R10 i = output bytes produced
//   R11 0xb1ffb200 (= -0x4e004e00) R12 n = index of the next unread input word
//   R13 b-4, R14 b-8 (biased bases so (Rx)(R8<<2) hits words R8-1, R8-2)
//   R7  0xffb1ffb2 (= -0x004e004e) in the tail; R3, R4, R6 scratch
|
|
TEXT ·_decode(SB), NOSPLIT, $0-64
|
|
MOVD ·offset+0(FP), R0 // R0 = offset (number of tail bytes to emit)
|
|
MOVD ·outlen+8(FP), R1 // R1 = outlen
|
|
MOVD ·data+16(FP), R2 // R2 = base pointer of b
|
|
MOVD ·decd+40(FP), R5 // R5 = base pointer of decd
|
|
|
|
SUBW $6, R1, R1 // R1 = outlen - 6 (32-bit loop bound)
|
|
CMPW $0, R1
|
|
BLE dectil // no full 7-byte group: go straight to the tail
|
|
MOVW $0xb200, R11 // low half of the bias constant, completed by MOVK below
|
|
MOVD R5, R9 // R9 = output cursor, advanced by 7 per group
|
|
SUB $8, R2, R14 // (R14)(R8<<2) addresses input word R8-2
|
|
SUB $4, R2, R13 // (R13)(R8<<2) addresses input word R8-1
|
|
MOVD $2, R8 // R8 = input word index + 2
|
|
MOVW $0, R10 // i = 0
|
|
MOVK $(0xb1ff<<16), R11 // R11 = 0xb1ffb200, i.e. -0x4e004e00 modulo 2^32
|
|
declop:
|
|
MOVW (R14)(R8<<2), R4 // w0 = first word of the pair
|
|
ADDW $7, R10, R10 // i += 7
|
|
MOVW (R13)(R8<<2), R3 // w1 = second word of the pair
|
|
MOVW R8, R12 // R12 = index of the next unread word once this pair is done
|
|
REVW R4, R4 // byte-swap w0
|
|
CMPW R1, R10 // flags consumed by BLT at the bottom; nothing in between sets flags
|
|
ADDW R11, R4, R4 // w0 -= 0x4e004e00
|
|
REVW R3, R3 // byte-swap w1
|
|
ADDW R11, R3, R3 // w1 -= 0x4e004e00
|
|
ADD $2, R8, R8 // advance to the next word pair
|
|
LSLW $2, R4, R7
|
|
UBFIZW $4, R4, $14, R4 // (w0 & 0x3fff) << 4
|
|
LSLW $6, R3, R6
|
|
ANDW $-262144, R7, R7 // (w0 << 2) & 0xfffc0000
|
|
ORRW R4, R7, R7
|
|
ANDW $-4194304, R6, R4 // (w1 << 6) & 0xffc00000
|
|
UBFIZW $8, R3, $14, R6 // (w1 & 0x3fff) << 8
|
|
ORRW R3>>26, R7, R3 // first output word: low 4 bits come from the top of w1
|
|
ORRW R6, R4, R4 // second output word; its low 8 bits stay zero
|
|
REVW R3, R3 // back to memory byte order
|
|
REVW R4, R4
|
|
STPW (R3, R4), (R9) // store 8 bytes at the output cursor
|
|
ADD $7, R9, R9 // only 7 of them are payload
|
|
BLT declop // while i < outlen - 6 (signed)
|
|
decrem:
|
|
CBZW R0, decret // offset == 0: no tail
|
|
MOVW (R2)(R12.UXTW<<2), R1 // R1 = input word n, native byte order
|
|
CMPW $1, R0
|
|
SUBW $0x4e, R1, R3 // remove the bias from the low halfword only
|
|
UBFX $14, R3, $2, R4 // bits 14..15
|
|
ORRW R3<<2, R4, R3 // low byte = (v & 0x3f) << 2 | (v >> 14) & 3
|
|
MOVB R3, (R5)(R10.SXTW) // decd[i]
|
|
BEQ decret // offset == 1
|
|
|
|
MOVW $0xffb2, R7
|
|
ADDW $1, R10, R4 // i + 1
|
|
MOVK $(0xffb1<<16), R7 // R7 = 0xffb1ffb2, i.e. -0x004e004e modulo 2^32
|
|
ADDW R7, R1, R1 // v = word - 0x004e004e (both halfwords unbiased)
|
|
CMPW $2, R0
|
|
UBFX $20, R1, $8, R6 // R6 = bits 20..27 of v, reused for byte 3 below
|
|
LSRW $6, R1, R3
|
|
ANDW $3, R6, R8 // bits 20..21
|
|
ANDW $-4, R3, R3
|
|
ORRW R8, R3, R3 // low byte = (v >> 6) & 0xfc | (v >> 20) & 3
|
|
MOVB R3, (R5)(R4.SXTW) // decd[i+1]
|
|
BEQ decret // offset == 2
|
|
|
|
ADDW $2, R10, R3 // i + 2
|
|
LSRW $12, R1, R4
|
|
ANDW $-16, R4, R4
|
|
CMPW $3, R0
|
|
ORRW R1>>28, R4, R1 // low byte = (v >> 12) & 0xf0 | v >> 28
|
|
MOVB R1, (R5)(R3.SXTW) // decd[i+2]
|
|
BEQ decret // offset == 3
|
|
|
|
ADDW $3, R10, R1 // i + 3
|
|
ADDW $1, R12, R12 // move on to input word n+1
|
|
ANDW $0xf0, R6, R6 // bits 24..27 of the first word -> high nibble
|
|
CMPW $4, R0
|
|
MOVW (R2)(R12<<2), R3 // R3 = input word n+1, native byte order
|
|
SUBW $0x4e, R3, R2 // R2 stops being the base pointer here
|
|
UBFX $2, R2, $4, R4 // bits 2..5 -> low nibble
|
|
ORRW R6, R4, R4
|
|
MOVB R4, (R5)(R1.SXTW) // decd[i+3]
|
|
BEQ decret // offset == 4
|
|
|
|
ADDW $4, R10, R1 // i + 4
|
|
UBFX $10, R2, $6, R4 // bits 10..15
|
|
ORRW R2<<6, R4, R2 // low byte = (v & 3) << 6 | (v >> 10) & 0x3f
|
|
CMPW $5, R0
|
|
MOVB R2, (R5)(R1.SXTW) // decd[i+4]
|
|
BEQ decret // offset == 5
|
|
|
|
// Any remaining non-zero offset falls through and emits a sixth byte.
ADDW R7, R3, R3 // v = second word - 0x004e004e
|
|
ADDW $5, R10, R10 // i + 5
|
|
LSRW $2, R3, R0
|
|
UBFX $16, R3, $6, R3 // bits 16..21
|
|
ANDW $-64, R0, R0
|
|
ORRW R3, R0, R3 // low byte = (v >> 2) & 0xc0 | (v >> 16) & 0x3f
|
|
MOVB R3, (R5)(R10.SXTW) // decd[i+5]
|
|
decret:
|
|
RET
|
|
dectil:
|
|
MOVW $0, R10 // i = 0
|
|
MOVW $0, R12 // n = 0
|
|
JMP decrem
|