diff --git a/README.md b/README.md index 8bace7f..c013092 100644 --- a/README.md +++ b/README.md @@ -82,4 +82,54 @@ func main() { panic("fail!") } } -``` \ No newline at end of file +``` + +# Performance Analysis +The performance is optimized by replacing generic encode/decode functions with assembly code. + +## Encode Speedup by ASM +``` +goos: darwin +goarch: arm64 +pkg: github.com/fumiama/go-base16384 +cpu: Apple M4 Max + │ old.txt │ new.txt │ + │ sec/op │ sec/op vs base │ +EncodeTo/16-16 5.340n ± 1% 5.664n ± 0% +6.08% (p=0.002 n=6) +EncodeTo/256-16 39.04n ± 1% 34.20n ± 1% -12.37% (p=0.002 n=6) +EncodeTo/4K-16 537.4n ± 1% 425.6n ± 0% -20.80% (p=0.002 n=6) +EncodeTo/32K-16 4.228µ ± 1% 3.361µ ± 1% -20.51% (p=0.002 n=6) +geomean 147.5n 129.0n -12.54% + + │ old.txt │ new.txt │ + │ B/s │ B/s vs base │ +EncodeTo/16-16 2.791Gi ± 1% 2.631Gi ± 0% -5.73% (p=0.002 n=6) +EncodeTo/256-16 6.108Gi ± 1% 6.970Gi ± 1% +14.12% (p=0.002 n=6) +EncodeTo/4K-16 7.098Gi ± 1% 8.963Gi ± 0% +26.27% (p=0.002 n=6) +EncodeTo/32K-16 7.218Gi ± 1% 9.079Gi ± 1% +25.79% (p=0.002 n=6) +geomean 5.436Gi 6.215Gi +14.33% + +``` + +## Decode Speedup by ASM +``` +goos: darwin +goarch: arm64 +pkg: github.com/fumiama/go-base16384 +cpu: Apple M4 Max + │ old.txt │ new.txt │ + │ sec/op │ sec/op vs base │ +DecodeTo/16-16 5.302n ± 5% 3.525n ± 0% -33.52% (p=0.002 n=6) +DecodeTo/256-16 46.04n ± 1% 29.91n ± 1% -35.05% (p=0.002 n=6) +DecodeTo/4K-16 585.6n ± 1% 405.8n ± 0% -30.70% (p=0.002 n=6) +DecodeTo/32K-16 4.567µ ± 0% 3.197µ ± 0% -30.00% (p=0.002 n=6) +geomean 159.8n 108.1n -32.35% + + │ old.txt │ new.txt │ + │ B/s │ B/s vs base │ +DecodeTo/16-16 3.864Gi ± 5% 5.812Gi ± 1% +50.40% (p=0.002 n=6) +DecodeTo/256-16 5.987Gi ± 1% 9.219Gi ± 1% +53.99% (p=0.002 n=6) +DecodeTo/4K-16 7.450Gi ± 1% 10.749Gi ± 0% +44.29% (p=0.002 n=6) +DecodeTo/32K-16 7.638Gi ± 0% 10.911Gi ± 0% +42.84% (p=0.002 n=6) +geomean 6.024Gi 8.903Gi +47.81% +``` diff --git a/base14.go b/base14.go index 5956f10..c7c7371 100644 --- a/base14.go +++ 
b/base14.go @@ -24,6 +24,7 @@ func EncodeLen(in int) (out int) { return } +//go:nosplit func Encode(b []byte) (encd []byte) { outlen := len(b) / 7 * 8 offset := len(b) % 7 @@ -44,6 +45,7 @@ func Encode(b []byte) (encd []byte) { return } +//go:nosplit func EncodeTo(b, encd []byte) (int, error) { outlen := len(b) / 7 * 8 offset := len(b) % 7 diff --git a/base14_amd64.go b/base14_amd64.go index b6c8aa5..570ebb6 100644 --- a/base14_amd64.go +++ b/base14_amd64.go @@ -15,6 +15,7 @@ func _encode(offset int, b, encd []byte) (sum uint64, n uint64) //go:nosplit func _decode(offset, outlen int, b, decd []byte) +//go:nosplit func encode(offset, outlen int, b, encd []byte) { if movbe { sum, n := _encode(offset, b, encd) @@ -31,6 +32,7 @@ func encode(offset, outlen int, b, encd []byte) { } } +//go:nosplit func decode(offset, outlen int, b, decd []byte) { if movbe { _decode(offset, outlen, b, decd) diff --git a/base14_arm64.go b/base14_arm64.go index 64c8adf..1dc862e 100644 --- a/base14_arm64.go +++ b/base14_arm64.go @@ -15,6 +15,7 @@ func _encode(offset int, b, encd []byte) (sum uint64, n int) //go:nosplit func _decode(offset, outlen int, b, decd []byte) +//go:nosplit func encode(offset, outlen int, b, encd []byte) { sum, n := _encode(offset, b, encd) if offset == 0 { @@ -27,6 +28,7 @@ func encode(offset, outlen int, b, encd []byte) { encd[outlen-1] = byte(offset) } +//go:nosplit func decode(offset, outlen int, b, decd []byte) { _decode(offset, outlen, b, decd) } diff --git a/base14_test.go b/base14_test.go index 291cb2a..98fee4c 100644 --- a/base14_test.go +++ b/base14_test.go @@ -3,6 +3,7 @@ package base14 import ( "bytes" "encoding/hex" + "errors" "io" "math/rand" "testing" @@ -31,85 +32,69 @@ func TestBase14(t *testing.T) { } func TestEncoder(t *testing.T) { - buf := make([]byte, 1024*1024+1) + buf := make([]byte, 4096+1) _, err := rand.Read(buf) if err != nil { t.Fatal(err) } - w := bytes.NewBuffer(make([]byte, 0, 1024*1024+1)) - for i := 0; i <= 1024*1024; i += 
rand.Intn(128) * 7 { - e := NewEncoder(bytes.NewReader(buf[:i])) - _, err = io.Copy(w, e) + w := bytes.NewBuffer(make([]byte, 0, 4096+1)) + for i := 0; i <= 4096; i++ { + e := NewEncoder(w) + _, err = io.Copy(e, bytes.NewReader(buf[:i])) if err != nil { t.Fatal(err) } + _ = e.Close() if !bytes.Equal(Encode(buf[:i]), w.Bytes()) { - t.Fail() - } - w.Reset() - } -} - -func TestBufferedEncoder(t *testing.T) { - buf := make([]byte, 1024*1024+1) - _, err := rand.Read(buf) - if err != nil { - t.Fatal(err) - } - w := bytes.NewBuffer(make([]byte, 0, 1024*1024+1)) - for i := 0; i <= 1024*1024; i += rand.Intn(128) * 7 { - e := NewBufferedEncoder(buf[:i]) - _, err = io.Copy(w, e) - if err != nil { - t.Fatal(err) - } - if !bytes.Equal(Encode(buf[:i]), w.Bytes()) { - t.Fail() + t.Log("expect", hex.EncodeToString(Encode(buf[:i]))) + t.Log("butgot", hex.EncodeToString(w.Bytes())) + t.Fatal("unexpected at index", i) } w.Reset() } } func TestDecoder(t *testing.T) { - buf := make([]byte, 1024*1024+1) + buf := make([]byte, 4096+1) _, err := rand.Read(buf) if err != nil { t.Fatal(err) } - encd := Encode(buf) - w := bytes.NewBuffer(make([]byte, 0, 1024*1024+1)) - for i := 0; i <= 1024*1024; i += rand.Intn(128) * 8 { - d := NewDecoder(bytes.NewReader(encd[:i])) + w := bytes.NewBuffer(make([]byte, 0, 4096+1)) + for i := 0; i <= 4096; i++ { + d := NewDecoder(bytes.NewReader(Encode(buf[:i]))) _, err = io.Copy(w, d) if err != nil { t.Fatal(err) } - if !bytes.Equal(buf[:i/8*7], w.Bytes()) { + if !bytes.Equal(buf[:i], w.Bytes()) { t.Fail() } w.Reset() } } -func TestBufferedDecoder(t *testing.T) { - buf := make([]byte, 1024*1024+1) - _, err := rand.Read(buf) - if err != nil { - t.Fatal(err) +//go:nosplit +func encodeToGeneric(b, encd []byte) (int, error) { + outlen := len(b) / 7 * 8 + offset := len(b) % 7 + switch offset { //算上偏移标志字符占用的2字节 + case 0: + break + case 1: + outlen += 4 + case 2, 3: + outlen += 6 + case 4, 5: + outlen += 8 + case 6: + outlen += 10 } - encd := Encode(buf) - w := 
bytes.NewBuffer(make([]byte, 0, 1024*1024+1)) - for i := 0; i <= 1024*1024; i += rand.Intn(128) * 8 { - d := NewBufferedDecoder(encd[:i]) - _, err = io.Copy(w, d) - if err != nil { - t.Fatal(err) - } - if !bytes.Equal(buf[:i/8*7], w.Bytes()) { - t.Fail() - } - w.Reset() + if len(encd) < outlen { + return 0, errors.New("encd too small") } + encodeGeneric(offset, outlen, b, encd) + return outlen, nil } func benchEncode(b *testing.B, data []byte) { @@ -125,6 +110,46 @@ func benchEncode(b *testing.B, data []byte) { } } +func benchEncodeGeneric(b *testing.B, data []byte) { + _, err := rand.Read(data) + if err != nil { + panic(err) + } + buf := make([]byte, EncodeLen(len(data))) + b.SetBytes(int64(len(data))) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _ = encodeToGeneric(data, buf) + } +} + +//go:nosplit +func decodeToGeneric(b []byte, decd []byte) (int, error) { + outlen := len(b) + offset := 0 + if b[len(b)-2] == '=' { + offset = int(b[len(b)-1]) + switch offset { //算上偏移标志字符占用的2字节 + case 0: + break + case 1: + outlen -= 4 + case 2, 3: + outlen -= 6 + case 4, 5: + outlen -= 8 + case 6: + outlen -= 10 + } + } + outlen = outlen/8*7 + offset + if len(decd) < outlen { + return 0, errors.New("decd too small") + } + decodeGeneric(offset, outlen, b, decd) + return outlen, nil +} + func benchDecode(b *testing.B, data []byte) { _, err := rand.Read(data) if err != nil { @@ -142,6 +167,23 @@ func benchDecode(b *testing.B, data []byte) { } } +func benchDecodeGeneric(b *testing.B, data []byte) { + _, err := rand.Read(data) + if err != nil { + panic(err) + } + buf := make([]byte, EncodeLen(len(data))) + _, err = encodeToGeneric(data, buf) + if err != nil { + panic(err) + } + b.SetBytes(int64(len(buf))) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _ = decodeToGeneric(buf, data) + } +} + func BenchmarkEncodeTo(b *testing.B) { b.Run("16", func(b *testing.B) { data := make([]byte, 16) @@ -161,6 +203,25 @@ func BenchmarkEncodeTo(b *testing.B) { }) } +func 
BenchmarkEncodeToGeneric(b *testing.B) { + b.Run("16", func(b *testing.B) { + data := make([]byte, 16) + benchEncodeGeneric(b, data) + }) + b.Run("256", func(b *testing.B) { + data := make([]byte, 256) + benchEncodeGeneric(b, data) + }) + b.Run("4K", func(b *testing.B) { + data := make([]byte, 1024*4) + benchEncodeGeneric(b, data) + }) + b.Run("32K", func(b *testing.B) { + data := make([]byte, 1024*32) + benchEncodeGeneric(b, data) + }) +} + func BenchmarkDecodeTo(b *testing.B) { b.Run("16", func(b *testing.B) { data := make([]byte, 16) @@ -180,21 +241,42 @@ func BenchmarkDecodeTo(b *testing.B) { }) } +func BenchmarkDecodeToGeneric(b *testing.B) { + b.Run("16", func(b *testing.B) { + data := make([]byte, 16) + benchDecodeGeneric(b, data) + }) + b.Run("256", func(b *testing.B) { + data := make([]byte, 256) + benchDecodeGeneric(b, data) + }) + b.Run("4K", func(b *testing.B) { + data := make([]byte, 4096) + benchDecodeGeneric(b, data) + }) + b.Run("32K", func(b *testing.B) { + data := make([]byte, 1024*32) + benchDecodeGeneric(b, data) + }) +} + func benchEncoder(b *testing.B, cnt int64) { - enc := NewEncoder(rand.New(rand.NewSource(0))) + s := rand.New(rand.NewSource(0)) buf := bytes.NewBuffer(make([]byte, 0, cnt)) + enc := NewEncoder(buf) b.SetBytes(cnt) b.ResetTimer() for i := 0; i < b.N; i++ { - _, _ = io.CopyN(buf, enc, cnt) + _, _ = io.CopyN(enc, s, cnt) buf.Reset() } } func benchDecoder(b *testing.B, cnt int64) { - enc := NewEncoder(rand.New(rand.NewSource(0))) + s := rand.New(rand.NewSource(0)) buf := bytes.NewBuffer(make([]byte, 0, cnt)) - _, err := io.CopyN(buf, enc, cnt) + enc := NewEncoder(buf) + _, err := io.CopyN(enc, s, cnt) if err != nil { panic(err) } diff --git a/decoder.go b/decoder.go index cb6eb0a..8b51604 100644 --- a/decoder.go +++ b/decoder.go @@ -14,10 +14,6 @@ func NewDecoder(r io.Reader) *Decoder { return &Decoder{r: r} } -func NewBufferedDecoder(b []byte) *Decoder { - return &Decoder{b: b} -} - func (d *Decoder) Read(p []byte) (n int, err 
error) { i := len(d.b) if i == 0 && d.r == nil { diff --git a/encoder.go b/encoder.go index 715d389..cf912f5 100644 --- a/encoder.go +++ b/encoder.go @@ -1,49 +1,68 @@ package base14 import ( + "bytes" "io" ) type Encoder struct { b []byte - r io.Reader - io.Reader + w io.Writer + io.WriteCloser + io.ReaderFrom } -func NewEncoder(r io.Reader) *Encoder { - return &Encoder{r: r} +func NewEncoder(w io.Writer) *Encoder { + return &Encoder{w: w} } -func NewBufferedEncoder(b []byte) *Encoder { - return &Encoder{b: b} -} - -func (e *Encoder) Read(p []byte) (n int, err error) { +func (e *Encoder) ReadFrom(r io.Reader) (int64, error) { + if r == nil { + return 0, nil + } i := len(e.b) - if i == 0 && e.r == nil { - err = io.EOF - return + if i == 0 && e.w == nil { + return 0, io.EOF } - inlen := len(p) / 8 * 7 - if e.r != nil { + n := 0 + iseof := false + for !iseof { + inlen := 1024 / 8 * 7 // batch size e.b = append(e.b, make([]byte, inlen)...) - n, err = e.r.Read(e.b[i:]) - inlen = i + n + cnt, err := r.Read(e.b[i:]) + n += cnt + iseof = err == io.EOF if err != nil { - if len(e.b) > 0 { - n, _ = EncodeTo(e.b[:inlen], p) + if !iseof { + return int64(n), err } - e.b = nil - e.r = nil - return } - n, err = EncodeTo(e.b[:inlen], p) - e.b = e.b[:0] - return - } else if inlen > len(e.b) { - inlen = len(e.b) + e.b = e.b[:i+cnt] + inlen = len(e.b) / 8 * 7 // real batch size + if inlen == 0 { + if iseof { + return int64(n), nil + } + i = len(e.b) + continue + } + _, err = e.w.Write(Encode(e.b[:inlen])) + if err != nil { + return int64(n), err + } + i = copy(e.b, e.b[inlen:]) + e.b = e.b[:i] } - n, err = EncodeTo(e.b[:inlen], p) - e.b = e.b[inlen:] - return + return int64(n), nil +} + +func (e *Encoder) Write(p []byte) (int, error) { + n, err := e.ReadFrom(bytes.NewReader(p)) + return int(n), err +} + +func (e *Encoder) Close() error { + _, err := e.w.Write(Encode(e.b)) + e.b = nil + return err }