1
0
mirror of https://github.com/fumiama/go-base16384.git synced 2026-06-05 00:32:52 +08:00
Files
go-base16384/base14_arm64.s
源文雨 369cf02def finish arm64 decode
name            old time/op    new time/op     delta
EncodeTo/16-8     10.6ns ± 0%     10.4ns ± 0%    -2.07%  (p=0.008 n=5+5)
EncodeTo/256-8    80.8ns ± 0%     55.7ns ± 0%   -31.11%  (p=0.008 n=5+5)
EncodeTo/4K-8     1.21µs ± 0%     0.82µs ± 0%   -32.67%  (p=0.016 n=4+5)
EncodeTo/32K-8    9.64µs ± 0%     6.47µs ± 0%   -32.90%  (p=0.008 n=5+5)
DecodeTo/16-8     9.79ns ± 0%    26.02ns ± 0%  +165.85%  (p=0.016 n=4+5)
DecodeTo/256-8    80.9ns ± 0%    111.6ns ± 0%   +37.98%  (p=0.008 n=5+5)
DecodeTo/4K-8     1.22µs ± 0%     1.17µs ± 0%    -3.73%  (p=0.008 n=5+5)
DecodeTo/32K-8    9.71µs ± 1%     8.80µs ± 1%    -9.37%  (p=0.008 n=5+5)
Encoder/16-8      76.5ns ± 0%     76.5ns ± 0%      ~     (p=0.810 n=5+5)
Encoder/256-8      356ns ± 0%      291ns ± 0%   -18.22%  (p=0.008 n=5+5)
Encoder/4K-8      4.05µs ± 0%     3.70µs ± 0%    -8.76%  (p=0.008 n=5+5)
Encoder/32K-8     34.1µs ± 0%     29.2µs ± 0%   -14.30%  (p=0.008 n=5+5)
Decoder/16-8       205ns ± 0%      207ns ± 0%    +1.08%  (p=0.008 n=5+5)
Decoder/256-8      262ns ± 0%      244ns ± 1%    -6.94%  (p=0.008 n=5+5)
Decoder/4K-8      1.49µs ± 0%     1.12µs ± 0%   -24.87%  (p=0.008 n=5+5)
Decoder/32K-8     11.0µs ± 0%      8.0µs ± 0%   -27.00%  (p=0.008 n=5+5)

name            old speed      new speed       delta
EncodeTo/16-8   1.50GB/s ± 0%   1.54GB/s ± 0%    +2.11%  (p=0.008 n=5+5)
EncodeTo/256-8  3.17GB/s ± 0%   4.60GB/s ± 0%   +45.15%  (p=0.008 n=5+5)
EncodeTo/4K-8   3.37GB/s ± 0%   5.01GB/s ± 0%   +48.51%  (p=0.008 n=5+5)
EncodeTo/32K-8  3.40GB/s ± 0%   5.06GB/s ± 0%   +49.02%  (p=0.008 n=5+5)
DecodeTo/16-8   2.25GB/s ± 0%   0.85GB/s ± 0%   -62.39%  (p=0.016 n=4+5)
DecodeTo/256-8  3.66GB/s ± 0%   2.65GB/s ± 0%   -27.54%  (p=0.008 n=5+5)
DecodeTo/4K-8   3.84GB/s ± 0%   3.99GB/s ± 0%    +3.87%  (p=0.008 n=5+5)
DecodeTo/32K-8  3.86GB/s ± 1%   4.26GB/s ± 1%   +10.33%  (p=0.008 n=5+5)
Encoder/16-8     209MB/s ± 0%    209MB/s ± 0%      ~     (p=0.802 n=5+5)
Encoder/256-8    720MB/s ± 0%    880MB/s ± 0%   +22.28%  (p=0.008 n=5+5)
Encoder/4K-8    1.01GB/s ± 0%   1.11GB/s ± 0%    +9.60%  (p=0.008 n=5+5)
Encoder/32K-8    962MB/s ± 0%   1122MB/s ± 0%   +16.69%  (p=0.008 n=5+5)
Decoder/16-8    78.1MB/s ± 0%   77.3MB/s ± 0%    -1.08%  (p=0.008 n=5+5)
Decoder/256-8    977MB/s ± 0%   1050MB/s ± 1%    +7.47%  (p=0.008 n=5+5)
Decoder/4K-8    2.76GB/s ± 0%   3.67GB/s ± 0%   +33.10%  (p=0.008 n=5+5)
Decoder/32K-8   2.98GB/s ± 0%   4.08GB/s ± 0%   +36.98%  (p=0.008 n=5+5)

name            old alloc/op   new alloc/op    delta
EncodeTo/16-8      0.00B           0.00B           ~     (all equal)
EncodeTo/256-8     0.00B           0.00B           ~     (all equal)
EncodeTo/4K-8      0.00B           0.00B           ~     (all equal)
EncodeTo/32K-8     0.00B           0.00B           ~     (all equal)
DecodeTo/16-8      0.00B          48.00B ± 0%     +Inf%  (p=0.008 n=5+5)
DecodeTo/256-8     0.00B         576.00B ± 0%     +Inf%  (p=0.008 n=5+5)
DecodeTo/4K-8      0.00B        6144.00B ± 0%     +Inf%  (p=0.008 n=5+5)
DecodeTo/32K-8     0.00B       49152.00B ± 0%     +Inf%  (p=0.008 n=5+5)
Encoder/16-8       24.0B ± 0%      24.0B ± 0%      ~     (all equal)
Encoder/256-8       472B ± 0%        24B ± 0%   -94.92%  (p=0.008 n=5+5)
Encoder/4K-8       24.0B ± 0%      24.0B ± 0%      ~     (all equal)
Encoder/32K-8     41.0kB ± 0%      0.0kB ± 0%   -99.94%  (p=0.008 n=5+5)
Decoder/16-8      1.39kB ± 0%     1.39kB ± 0%      ~     (all equal)
Decoder/256-8     1.39kB ± 0%     1.39kB ± 0%      ~     (all equal)
Decoder/4K-8      4.98kB ± 0%     4.98kB ± 0%      ~     (all equal)
Decoder/32K-8     41.1kB ± 0%     41.1kB ± 0%      ~     (all equal)

name            old allocs/op  new allocs/op   delta
EncodeTo/16-8       0.00            0.00           ~     (all equal)
EncodeTo/256-8      0.00            0.00           ~     (all equal)
EncodeTo/4K-8       0.00            0.00           ~     (all equal)
EncodeTo/32K-8      0.00            0.00           ~     (all equal)
DecodeTo/16-8       0.00            1.00 ± 0%     +Inf%  (p=0.008 n=5+5)
DecodeTo/256-8      0.00            1.00 ± 0%     +Inf%  (p=0.008 n=5+5)
DecodeTo/4K-8       0.00            1.00 ± 0%     +Inf%  (p=0.008 n=5+5)
DecodeTo/32K-8      0.00            1.00 ± 0%     +Inf%  (p=0.008 n=5+5)
Encoder/16-8        1.00 ± 0%       1.00 ± 0%      ~     (all equal)
Encoder/256-8       2.00 ± 0%       1.00 ± 0%   -50.00%  (p=0.008 n=5+5)
Encoder/4K-8        1.00 ± 0%       1.00 ± 0%      ~     (all equal)
Encoder/32K-8       2.00 ± 0%       1.00 ± 0%   -50.00%  (p=0.008 n=5+5)
Decoder/16-8        3.00 ± 0%       3.00 ± 0%      ~     (all equal)
Decoder/256-8       3.00 ± 0%       3.00 ± 0%      ~     (all equal)
Decoder/4K-8        3.00 ± 0%       3.00 ± 0%      ~     (all equal)
Decoder/32K-8       3.00 ± 0%       3.00 ± 0%      ~     (all equal)
2022-12-14 01:14:03 +08:00

225 lines
4.9 KiB
ArmAsm

//go:build arm64
// +build arm64
#include "textflag.h"
// func _encode(offset, outlen int, b, encd []byte) (sum uint64, &vals[n] uintptr)
//
// base16384 encoder kernel. enclop turns every full 7-byte input group into
// two 32-bit output words stored at encd; encrem then folds the trailing
// `offset` input bytes into one or two more words.
//
// Results: when the low 8 bits of offset are non-zero, `sum` (64(FP)) holds
// the last output word (NOT stored to encd by this routine) and the second
// result (72(FP)) holds the address in encd where that word belongs. When
// the low 8 bits of offset are zero the routine returns without writing
// either result slot.
//
// outlen (8(FP)) is not read by this routine.
//
// Register roles:
//   R0  offset (only the low 8 bits are used)
//   R2  input base pointer (used by encrem; clobbered as scratch for offset >= 5)
//   R3  dlen-6 (loop bound); reused as the tail accumulator in encrem
//   R5  output base pointer
//   R9  running input pointer in enclop; recomputed as R2+i in encrem
//   R10 input byte index i
//   R8  output word index + 2 in enclop, R12 index of the next unwritten word
//   R11 0x4e004e00, bias added to each pair of 14-bit symbols
//   R13/R14 output base biased by -4/-8 so that (Rx)(R8<<2) hits words n+1/n
//
// NOTE(review): the prototype above adds up to 80 bytes of arguments plus
// results; the declared argument size 81 looks off by one - confirm with go vet.
// NOTE(review): each loop iteration loads 4 bytes at input offset i+4, i.e.
// one byte beyond the 7-byte group - presumably the caller guarantees that
// byte is readable; verify against the Go wrapper.
TEXT ·_encode(SB), NOSPLIT, $0-81
MOVD ·offset+0(FP), R0
// Load the input base into R2: encrem addresses every tail byte relative to
// R2, so it must be valid even when the main loop is skipped via enctil.
MOVD ·data+16(FP), R2
MOVD ·dlen+24(FP), R3
MOVD ·encd+40(FP), R5
SUBW $6, R3, R3 // R3 = dlen - 6
CMPW $0, R3
BLE enctil // fewer than 7 input bytes: no full group to encode
MOVW $0x4e00, R11
MOVD R2, R9 // R9 = running input pointer, starts at the input base
SUB $8, R5, R14
SUB $4, R5, R13
MOVD $2, R8
MOVW $0, R10 // int32_t i = 0
MOVK $(0x4e00<<16), R11 // R11 = 0x4e004e00
enclop:
MOVW (R9), R4 // bytes i..i+3
ADDW $7, R10, R10 // i += 7
MOVW R8, R12 // R12 = word index following this group
CMPW R3, R10 // flags consumed by BLT at the bottom of the loop
REVW R4, R4 // to big-endian bit order
ADD $7, R9, R9
LSRW $2, R4, R6
UBFX $4, R4, $14, R15 // (w >> 4) & 0x3fff
ANDW $0x3fff0000, R6, R6 // (w >> 2) & 0x3fff0000
UBFIZW $26, R4, $4, R7 // keep the 4 leftover low bits at bits 26..29
ORRW R15, R6, R6
ADDW R11, R6, R6 // add the 0x4e00 bias to both halves
REVW R6, R6
MOVW R6, (R14)(R8<<2) // first output word of the group
MOVW -3(R9), R4 // bytes i+4..i+7 (R9 was already advanced by 7)
REVW R4, R4
LSRW $6, R4, R4
ANDW $0x3fffffc, R4, R4
ORRW R7, R4, R4 // prepend the 4 bits carried over from the first word
ANDW $0x3fff0000, R4, R6
UBFX $2, R4, $14, R4 // (v >> 2) & 0x3fff
ORRW R6, R4, R4
ADDW R11, R4, R4
REVW R4, R4
MOVW R4, (R13)(R8<<2) // second output word of the group
ADDW $2, R8, R8
BLT enclop // loop while the new i < dlen - 6
encrem:
ANDSW $0xff, R0, R0 // offset as an unsigned byte; zero means no tail
BEQ encret
MOVBU (R2)(R10.SXTW), R3 // d0 = input[i]
UXTW R12, R8
CMPW $1, R0
SXTW R10, R10
ADD R8<<2, R5, R7 // R7 = &vals[n]
UBFIZW $14, R3, $2, R4 // (d0 & 3) << 14
ORRW R3>>2, R4, R3 // sum = (d0 >> 2) | ((d0 & 3) << 14)
BEQ encsum // offset == 1
ADD R10, R2, R9 // R9 = &input[i]
CMPW $2, R0
MOVBU 1(R9), R6 // d1
LSLW $6, R6, R4
UBFIZW $20, R6, $2, R6 // (d1 & 3) << 20
ANDW $0x3f00, R4, R4 // (d1 << 6) & 0x3f00
ORRW R3, R4, R3
ORRW R3, R6, R3
BEQ encsum // offset == 2
MOVBU 2(R9), R4 // d2
CMPW $3, R0
LSLW $12, R4, R6
ANDW $0xf0000, R6, R6 // (d2 << 12) & 0xf0000
ORRW R4<<28, R6, R4 // | (d2 << 28)
ORRW R4, R3, R3
BEQ encsum // offset == 3
// offset >= 4: the first tail word is complete after d3 and is stored here;
// a second tail word is started and returned through sum.
ADD $3, R10, R10
ADDW $1, R12, R12
CMPW $4, R0
ADD R12<<2, R5, R7 // R7 = &vals[n+1]
MOVBU (R2)(R10), R4 // d3
LSLW $20, R4, R4
ANDW $0xf000000, R4, R4 // (d3 << 20) & 0x0f000000
ORRW R3, R4, R3
ADDW $0x4e0000, R3, R3
ADDW $0x4e, R3, R3 // sum += 0x004e004e
MOVW R3, (R5)(R8<<2) // vals[n] = sum
MOVBU (R2)(R10), R3 // d3 again, for its low nibble
UBFIZW $2, R3, $4, R3 // sum = (d3 & 0xf) << 2
BEQ encsum // offset == 4
MOVBU 4(R9), R4 // d4
CMPW $5, R0
UBFIZW $10, R4, $6, R2 // (d4 & 0x3f) << 10; R2 is scratch from here on
ORRW R3, R2, R3
ORRW R4>>6, R3, R3 // | (d4 >> 6)
BEQ encsum // offset == 5
MOVBU 5(R9), R4 // d5
LSLW $2, R4, R2
UBFIZW $16, R4, $6, R4 // (d5 & 0x3f) << 16
ANDW $0x300, R2, R2 // (d5 << 2) & 0x300
ORRW R4, R2, R2
ORRW R2, R3, R3
encsum:
ADDW $0x4e0000, R3, R3
ADDW $0x4e, R3, R3 // sum += 0x004e004e
MOVD R3, ·sum+64(FP)
MOVD R7, ·n+72(FP)
encret:
RET
enctil:
// No full 7-byte group: start the tail at input index 0, output word 0.
MOVW $0, R10
MOVW $0, R12
JMP encrem
// func _decode(offset, outlen int, b, decd []byte)
//
// base16384 decoder kernel. declop turns every pair of 32-bit input words
// into 7 output bytes; decrem then emits the trailing `offset` bytes from up
// to two more input words. Nothing is returned.
//
// Register roles:
//   R0  offset (tested as a 32-bit value)
//   R1  outlen-6 (loop bound); reused as scratch in decrem
//   R2  input base pointer (clobbered as scratch for offset >= 4)
//   R5  output base pointer
//   R9  running output pointer in declop
//   R10 output byte index i
//   R8  input word index + 2 in declop, R12 index of the first unread word
//   R11 0xb1ffb200, i.e. -0x4e004e00 modulo 2^32
//   R7  (decrem) 0xffb1ffb2, i.e. -0x004e004e modulo 2^32
//   R13/R14 input base biased by -4/-8 so that (Rx)(R8<<2) hits words n+1/n
//
// NOTE(review): each loop iteration stores 8 bytes but advances the output
// by 7, and decrem always loads a whole 4-byte input word - presumably the
// caller sizes both buffers to allow this; verify against the Go wrapper.
TEXT ·_decode(SB), NOSPLIT, $0-64
MOVD ·offset+0(FP), R0
MOVD ·outlen+8(FP), R1
MOVD ·data+16(FP), R2
MOVD ·decd+40(FP), R5
SUBW $6, R1, R1 // R1 = outlen - 6
CMPW $0, R1
BLE dectil // fewer than 7 output bytes: no full group to decode
MOVW $0xb200, R11
MOVD R5, R9 // R9 = running output pointer
SUB $8, R2, R14
SUB $4, R2, R13
MOVD $2, R8
MOVW $0, R10 // i = 0
MOVK $(0xb1ff<<16), R11 // R11 = 0xb1ffb200
declop:
MOVW (R14)(R8<<2), R4 // first input word of the group
ADDW $7, R10, R10 // i += 7
MOVW (R13)(R8<<2), R3 // second input word of the group
MOVW R8, R12 // R12 = word index following this group
REVW R4, R4 // to big-endian bit order
CMPW R1, R10 // flags consumed by BLT at the bottom of the loop
ADDW R11, R4, R4 // remove the 0x4e00 bias from both halves
REVW R3, R3
ADDW R11, R3, R3
ADD $2, R8, R8
LSLW $2, R4, R7
UBFIZW $4, R4, $14, R4 // (w0 & 0x3fff) << 4
LSLW $6, R3, R6
ANDW $-262144, R7, R7 // (w0 << 2) & 0xfffc0000
ORRW R4, R7, R7
ANDW $-4194304, R6, R4 // (w1 << 6) & 0xffc00000
UBFIZW $8, R3, $14, R6 // (w1 & 0x3fff) << 8
ORRW R3>>26, R7, R3 // first 32 decoded bits, topped up from w1
ORRW R6, R4, R4 // remaining 24 decoded bits; low byte is zero
REVW R3, R3
REVW R4, R4
STPW (R3, R4), (R9) // store both words (8 bytes) at the output pointer
ADD $7, R9, R9 // but only 7 of them are payload
BLT declop // loop while the new i < outlen - 6
decrem:
CBZW R0, decret // offset == 0: no tail
MOVW (R2)(R12.UXTW<<2), R1 // s = first unread input word
CMPW $1, R0
SUBW $0x4e, R1, R3 // remove the bias from the low byte only
UBFX $14, R3, $2, R4 // (s >> 14) & 3
ORRW R3<<2, R4, R3 // low byte = ((s & 0x3f) << 2) | that
MOVB R3, (R5)(R10.SXTW) // out[i]
BEQ decret // offset == 1
MOVW $0xffb2, R7
ADDW $1, R10, R4
MOVK $(0xffb1<<16), R7 // R7 = 0xffb1ffb2
ADDW R7, R1, R1 // s -= 0x004e004e (both halves unbiased)
CMPW $2, R0
UBFX $20, R1, $8, R6 // (s >> 20) & 0xff, also reused for out[i+3]
LSRW $6, R1, R3
ANDW $3, R6, R8
ANDW $-4, R3, R3
ORRW R8, R3, R3 // low byte = ((s >> 6) & 0xfc) | ((s >> 20) & 3)
MOVB R3, (R5)(R4.SXTW) // out[i+1]
BEQ decret // offset == 2
ADDW $2, R10, R3
LSRW $12, R1, R4
ANDW $-16, R4, R4
CMPW $3, R0
ORRW R1>>28, R4, R1 // low byte = ((s >> 12) & 0xf0) | (s >> 28)
MOVB R1, (R5)(R3.SXTW) // out[i+2]
BEQ decret // offset == 3
ADDW $3, R10, R1
ADDW $1, R12, R12 // move on to the following input word
ANDW $0xf0, R6, R6 // high nibble of out[i+3] comes from the first word
CMPW $4, R0
MOVW (R2)(R12<<2), R3 // t = next input word
SUBW $0x4e, R3, R2 // remove the bias from the low byte only; R2 is scratch now
UBFX $2, R2, $4, R4 // (t >> 2) & 0xf
ORRW R6, R4, R4
MOVB R4, (R5)(R1.SXTW) // out[i+3]
BEQ decret // offset == 4
ADDW $4, R10, R1
UBFX $10, R2, $6, R4 // (t >> 10) & 0x3f
ORRW R2<<6, R4, R2 // low byte = ((t & 3) << 6) | that
CMPW $5, R0
MOVB R2, (R5)(R1.SXTW) // out[i+4]
BEQ decret // offset == 5
// Any other non-zero offset falls through to the sixth tail byte.
ADDW R7, R3, R3 // t -= 0x004e004e (both halves unbiased)
ADDW $5, R10, R10
LSRW $2, R3, R0
UBFX $16, R3, $6, R3 // (t >> 16) & 0x3f
ANDW $-64, R0, R0
ORRW R3, R0, R3 // low byte = ((t >> 2) & 0xc0) | that
MOVB R3, (R5)(R10.SXTW) // out[i+5]
decret:
RET
dectil:
// No full group: start the tail at output index 0, input word 0.
MOVW $0, R10
MOVW $0, R12
JMP decrem