Skip to content

Commit

Permalink
perf: improve copyZeroAlloc for File and TCPConn (#1893)
Browse files Browse the repository at this point in the history
Improve performance of `copyZeroAlloc` function

```
goos: linux
goarch: amd64
pkg: github.com/valyala/fasthttp
cpu: QEMU Virtual CPU version 2.5+
                                       │   old6.txt    │              new7.txt               │
                                       │    sec/op     │   sec/op     vs base                │
CopyZeroAllocOSFileToBytesBuffer-8        1.802µ ±  3%   1.303µ ± 2%  -27.69% (p=0.000 n=25)
CopyZeroAllocBytesBufferToOSFile-8        1.066µ ± 17%   1.048µ ± 1%   -1.69% (p=0.043 n=25)
CopyZeroAllocOSFileToStringsBuilder-8     9.477µ ±  0%   1.345µ ± 2%  -85.81% (p=0.000 n=25)
CopyZeroAllocIOLimitedReaderToOSFile-8    1.031µ ±  1%   1.092µ ± 4%   +5.92% (p=0.000 n=25)
CopyZeroAllocOSFileToOSFile-8            12.132µ ±  1%   2.386µ ± 2%  -80.33% (p=0.000 n=25)
CopyZeroAllocOSFileToNetConn-8            2.009µ ±  2%   1.995µ ± 2%        ~ (p=0.733 n=25)
CopyZeroAllocNetConnToOSFile-8            21.86µ ±  2%   20.21µ ± 1%   -7.56% (p=0.000 n=25)
geomean                                   3.728µ         2.121µ       -43.11%

                                       │    old6.txt    │                 new7.txt                  │
                                       │      B/op      │     B/op      vs base                     │
CopyZeroAllocOSFileToBytesBuffer-8         40.00 ± 0%       0.00 ±  0%  -100.00% (p=0.000 n=25)
CopyZeroAllocBytesBufferToOSFile-8         0.000 ± 0%      0.000 ±  0%         ~ (p=1.000 n=25) ¹
CopyZeroAllocOSFileToStringsBuilder-8    32.04Ki ± 0%     0.00Ki ±  0%  -100.00% (p=0.000 n=25)
CopyZeroAllocIOLimitedReaderToOSFile-8     0.000 ± 0%      0.000 ±  0%         ~ (p=1.000 n=25) ¹
CopyZeroAllocOSFileToOSFile-8            32.06Ki ± 0%     0.00Ki ±  0%  -100.00% (p=0.000 n=25)
CopyZeroAllocOSFileToNetConn-8             96.00 ± 0%      96.00 ±  0%         ~ (p=1.000 n=25) ¹
CopyZeroAllocNetConnToOSFile-8            16.000 ± 6%      8.000 ± 12%   -50.00% (p=0.000 n=25)
geomean                                               ²                 ?                       ² ³
¹ all samples are equal
² summaries must be >0 to compute geomean
³ ratios must be >0 to compute geomean

                                       │   old6.txt   │                new7.txt                 │
                                       │  allocs/op   │ allocs/op   vs base                     │
CopyZeroAllocOSFileToBytesBuffer-8       4.000 ± 0%     0.000 ± 0%  -100.00% (p=0.000 n=25)
CopyZeroAllocBytesBufferToOSFile-8       0.000 ± 0%     0.000 ± 0%         ~ (p=1.000 n=25) ¹
CopyZeroAllocOSFileToStringsBuilder-8    5.000 ± 0%     0.000 ± 0%  -100.00% (p=0.000 n=25)
CopyZeroAllocIOLimitedReaderToOSFile-8   0.000 ± 0%     0.000 ± 0%         ~ (p=1.000 n=25) ¹
CopyZeroAllocOSFileToOSFile-8            8.000 ± 0%     0.000 ± 0%  -100.00% (p=0.000 n=25)
CopyZeroAllocOSFileToNetConn-8           6.000 ± 0%     6.000 ± 0%         ~ (p=1.000 n=25) ¹
CopyZeroAllocNetConnToOSFile-8           2.000 ± 0%     1.000 ± 0%   -50.00% (p=0.000 n=25)
geomean                                             ²               ?                       ² ³
¹ all samples are equal
² summaries must be >0 to compute geomean
³ ratios must be >0 to compute geomean
```

```
goos: windows
goarch: amd64
pkg: github.com/valyala/fasthttp
cpu: Intel(R) Core(TM) i5-8250U CPU @ 1.60GHz
                                       │  old_win.txt  │             new_win.txt              │
                                       │    sec/op     │    sec/op     vs base                │
CopyZeroAllocOSFileToBytesBuffer-8        4.347µ ±  7%   4.220µ ± 11%        ~ (p=0.211 n=25)
CopyZeroAllocBytesBufferToOSFile-8        1.408µ ± 12%   1.460µ ±  7%        ~ (p=0.427 n=25)
CopyZeroAllocOSFileToStringsBuilder-8    17.448µ ±  5%   3.613µ ±  9%  -79.29% (p=0.000 n=25)
CopyZeroAllocIOLimitedReaderToOSFile-8    1.324µ ±  8%   1.257µ ±  6%   -5.06% (p=0.024 n=25)
CopyZeroAllocOSFileToOSFile-8            19.953µ ±  8%   4.846µ ±  7%  -75.71% (p=0.000 n=25)
CopyZeroAllocOSFileToNetConn-8            18.18µ ±  8%   18.22µ ±  7%        ~ (p=0.405 n=25)
CopyZeroAllocNetConnToOSFile-8            74.75µ ±  2%   68.10µ ±  3%   -8.90% (p=0.000 n=25)
geomean                                   8.720µ         5.579µ        -36.02%

                                       │  old_win.txt   │                new_win.txt                │
                                       │      B/op      │     B/op      vs base                     │
CopyZeroAllocOSFileToBytesBuffer-8         8.000 ± 0%       0.000 ± 0%  -100.00% (p=0.000 n=25)
CopyZeroAllocBytesBufferToOSFile-8         0.000 ± 0%       0.000 ± 0%         ~ (p=1.000 n=25) ¹
CopyZeroAllocOSFileToStringsBuilder-8    32.01Ki ± 0%      0.00Ki ± 0%  -100.00% (p=0.000 n=25)
CopyZeroAllocIOLimitedReaderToOSFile-8     9.000 ± 0%       0.000 ± 0%  -100.00% (p=0.000 n=25)
CopyZeroAllocOSFileToOSFile-8            32.02Ki ± 0%      0.00Ki ± 0%  -100.00% (p=0.000 n=25)
CopyZeroAllocOSFileToNetConn-8           32.02Ki ± 0%     32.02Ki ± 0%         ~ (p=1.000 n=25) ¹
CopyZeroAllocNetConnToOSFile-8           32.02Ki ± 0%     32.02Ki ± 0%    -0.00% (p=0.012 n=25)
geomean                                               ²                 ?                       ² ³
¹ all samples are equal
² summaries must be >0 to compute geomean
³ ratios must be >0 to compute geomean

                                       │ old_win.txt  │               new_win.txt               │
                                       │  allocs/op   │ allocs/op   vs base                     │
CopyZeroAllocOSFileToBytesBuffer-8       1.000 ± 0%     0.000 ± 0%  -100.00% (p=0.000 n=25)
CopyZeroAllocBytesBufferToOSFile-8       0.000 ± 0%     0.000 ± 0%         ~ (p=1.000 n=25) ¹
CopyZeroAllocOSFileToStringsBuilder-8    2.000 ± 0%     0.000 ± 0%  -100.00% (p=0.000 n=25)
CopyZeroAllocIOLimitedReaderToOSFile-8   2.000 ± 0%     0.000 ± 0%  -100.00% (p=0.000 n=25)
CopyZeroAllocOSFileToOSFile-8            3.000 ± 0%     0.000 ± 0%  -100.00% (p=0.000 n=25)
CopyZeroAllocOSFileToNetConn-8           3.000 ± 0%     3.000 ± 0%         ~ (p=1.000 n=25) ¹
CopyZeroAllocNetConnToOSFile-8           3.000 ± 0%     3.000 ± 0%         ~ (p=1.000 n=25) ¹
geomean                                             ²               ?                       ² ³
¹ all samples are equal
² summaries must be >0 to compute geomean
³ ratios must be >0 to compute geomean
```
  • Loading branch information
ksw2000 authored Nov 8, 2024
1 parent bea47f5 commit f6ba4ab
Show file tree
Hide file tree
Showing 2 changed files with 378 additions and 6 deletions.
101 changes: 95 additions & 6 deletions http.go
Original file line number Diff line number Diff line change
Expand Up @@ -2219,20 +2219,109 @@ func writeBodyFixedSize(w *bufio.Writer, r io.Reader, size int64) error {
return err
}

// copyZeroAlloc optimizes io.Copy by calling ReadFrom or WriteTo only when
// copying between os.File and net.TCPConn. If the reader has a WriteTo
// method, it uses WriteTo for copying; if the writer has a ReadFrom method,
// it uses ReadFrom for copying. If neither method is available, it gets a
// buffer from sync.Pool to perform the copy.
//
// io.CopyBuffer always uses the WriterTo or ReaderFrom interface if one is
// available. However, os.File and net.TCPConn unfortunately have a
// fallback in their WriterTo that calls io.Copy if sendfile isn't possible.
//
// See issue: https://github.com/valyala/fasthttp/issues/1889
//
// sendfile can only be triggered when copying between os.File and net.TCPConn.
// Since the function confirming zero-copy is a private function, we use
// ReadFrom only in this specific scenario. For all other cases, we prioritize
// using our own copyBuffer method.
//
// The table below lists the path chosen for each writer (rows) and reader
// (columns) combination:
//
//	o: our copyBuffer
//	r: readFrom
//	w: writeTo
//
//	write\read   *File   *TCPConn   writeTo   other
//	*File        o       r          w         o
//	*TCPConn     w,r     o          w         o
//	readFrom     r       r          w         r
//	other        o       o          w         o
//
// It returns the byte count and error reported by whichever copy path ran.
//
//nolint:dupword
func copyZeroAlloc(w io.Writer, r io.Reader) (int64, error) {
// NOTE(review): this listing comes from a diff view without +/- markers.
// The next five lines (which leave an `if` unclosed) and the io.CopyBuffer
// call near the end look like the removed, pre-change lines; confirm
// against the committed http.go.
if wt, ok := r.(io.WriterTo); ok {
return wt.WriteTo(w)
}
if rt, ok := w.(io.ReaderFrom); ok {
return rt.ReadFrom(r)
var readerIsFile, readerIsConn bool

// Classify the reader. A reader that is neither *os.File nor *net.TCPConn
// but implements io.WriterTo performs the copy itself.
switch r := r.(type) {
case *os.File:
readerIsFile = true
case *net.TCPConn:
readerIsConn = true
case io.WriterTo:
return r.WriteTo(w)
}

// Pick a writer-side path based on the reader classification above.
switch w := w.(type) {
case *os.File:
// Only TCPConn -> File uses the file's ReadFrom; every other reader
// falls through to the pooled-buffer copy below.
if readerIsConn {
return w.ReadFrom(r)
}
case *net.TCPConn:
// Only File -> TCPConn takes a delegated path; every other reader
// falls through to the pooled-buffer copy below.
if readerIsFile {
// net.WriteTo requires go1.22 or later
// Benchmark tests show that on Windows, WriteTo performs
// significantly better than ReadFrom. On Linux, however,
// ReadFrom slightly outperforms WriteTo. When possible,
// copyZeroAlloc aims to perform better than or as well
// as io.Copy, so we use WriteTo whenever possible for
// optimal performance.
if rt, ok := r.(io.WriterTo); ok {
return rt.WriteTo(w)
}
return w.ReadFrom(r)
}
case io.ReaderFrom:
// Any other writer with ReadFrom pulls from the reader itself.
return w.ReadFrom(r)
}

// No delegated path applies: copy through a buffer borrowed from
// copyBufPool and hand the buffer back afterwards.
vbuf := copyBufPool.Get()
buf := vbuf.([]byte)
// NOTE(review): n and err are declared by both of the next two lines; the
// io.CopyBuffer call looks like the pre-change line of the diff — confirm.
n, err := io.CopyBuffer(w, r, buf)
n, err := copyBuffer(w, r, buf)
copyBufPool.Put(vbuf)
return n, err
}

// copyBuffer is adapted from io.copyBuffer. It deliberately skips the checks
// the standard version performs: it never looks for a WriteTo method on src
// or a ReadFrom method on dst, and it uses buf as given without checking
// whether it is empty.
//
// It reads from src into buf and writes each chunk to dst until src reports
// an error. io.EOF from src ends the copy successfully; any other read
// error, any write error, or a short write ends it with that error. written
// is the total number of bytes dst reported as written.
func copyBuffer(dst io.Writer, src io.Reader, buf []byte) (written int64, err error) {
	for {
		readN, readErr := src.Read(buf)

		if readN > 0 {
			wroteN, writeErr := dst.Write(buf[:readN])

			// A writer claiming a negative count, or more bytes than it
			// was given, is broken: count nothing for this chunk.
			if wroteN < 0 || wroteN > readN {
				wroteN = 0
				if writeErr == nil {
					writeErr = errors.New("invalid write result")
				}
			}
			written += int64(wroteN)

			switch {
			case writeErr != nil:
				return written, writeErr
			case wroteN != readN:
				return written, io.ErrShortWrite
			}
		}

		// The read error is examined only after the chunk that came with
		// it has been written out.
		switch readErr {
		case nil:
			continue
		case io.EOF:
			return written, nil
		default:
			return written, readErr
		}
	}
}

var copyBufPool = sync.Pool{
New: func() any {
return make([]byte, 4096)
Expand Down
283 changes: 283 additions & 0 deletions http_timing_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,283 @@
package fasthttp

import (
"bytes"
"io"
"net"
"os"
"strings"
"testing"
)

// BenchmarkCopyZeroAllocOSFileToBytesBuffer measures copyZeroAlloc copying
// ./README.md from an *os.File into a reused *bytes.Buffer.
func BenchmarkCopyZeroAllocOSFileToBytesBuffer(b *testing.B) {
	r, err := os.Open("./README.md")
	if err != nil {
		b.Fatal(err)
	}
	defer r.Close()

	buf := &bytes.Buffer{}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Rewind the source; otherwise every iteration after the first
		// starts at EOF and copies nothing.
		if _, err := r.Seek(0, io.SeekStart); err != nil {
			b.Fatal(err)
		}
		buf.Reset()
		if _, err := copyZeroAlloc(buf, r); err != nil {
			b.Fatal(err)
		}
	}
}

// BenchmarkCopyZeroAllocBytesBufferToOSFile measures copyZeroAlloc copying
// the contents of ./README.md from a *bytes.Buffer into a temporary
// *os.File opened write-only.
func BenchmarkCopyZeroAllocBytesBufferToOSFile(b *testing.B) {
	data, err := os.ReadFile("./README.md")
	if err != nil {
		b.Fatal(err)
	}

	tmp, err := os.CreateTemp(os.TempDir(), "test_*")
	if err != nil {
		b.Fatal(err)
	}
	defer os.Remove(tmp.Name())
	// Only the path is needed from here on; close the handle CreateTemp
	// returned so it is not left open for the rest of the benchmark.
	if err := tmp.Close(); err != nil {
		b.Fatal(err)
	}

	w, err := os.OpenFile(tmp.Name(), os.O_WRONLY, 0o444)
	if err != nil {
		b.Fatal(err)
	}
	defer w.Close()

	buf := &bytes.Buffer{}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Reading drains a bytes.Buffer, so refill it every iteration;
		// otherwise only the first iteration has anything to copy. The
		// refill stays inside the timed region.
		buf.Reset()
		buf.Write(data)

		if _, err := w.Seek(0, io.SeekStart); err != nil {
			b.Fatal(err)
		}
		if _, err := copyZeroAlloc(w, buf); err != nil {
			b.Fatal(err)
		}
	}
}

// BenchmarkCopyZeroAllocOSFileToStringsBuilder measures copyZeroAlloc
// copying ./README.md from an *os.File into a *strings.Builder.
func BenchmarkCopyZeroAllocOSFileToStringsBuilder(b *testing.B) {
	r, err := os.Open("./README.md")
	if err != nil {
		b.Fatalf("Failed to open testing file: %v", err)
	}
	defer r.Close()

	w := &strings.Builder{}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Rewind the source; otherwise every iteration after the first
		// starts at EOF and copies nothing.
		if _, err := r.Seek(0, io.SeekStart); err != nil {
			b.Fatal(err)
		}
		// Builder.Reset drops the builder's storage, so its regrowth is
		// part of every iteration.
		w.Reset()
		if _, err := copyZeroAlloc(w, r); err != nil {
			b.Fatal(err)
		}
	}
}

// BenchmarkCopyZeroAllocIOLimitedReaderToOSFile measures copyZeroAlloc
// copying the first 1024 bytes of ./README.md through an *io.LimitedReader
// into a temporary *os.File opened write-only.
func BenchmarkCopyZeroAllocIOLimitedReaderToOSFile(b *testing.B) {
	const limit = 1024

	f, err := os.Open("./README.md")
	if err != nil {
		b.Fatal(err)
	}
	defer f.Close()

	// Built directly (this is what io.LimitReader returns) so that the
	// remaining-byte budget can be restored between iterations.
	r := &io.LimitedReader{R: f, N: limit}

	tmp, err := os.CreateTemp(os.TempDir(), "test_*")
	if err != nil {
		b.Fatal(err)
	}
	defer os.Remove(tmp.Name())
	// Only the path is needed from here on; close the handle CreateTemp
	// returned so it is not left open for the rest of the benchmark.
	if err := tmp.Close(); err != nil {
		b.Fatal(err)
	}

	w, err := os.OpenFile(tmp.Name(), os.O_WRONLY, 0o444)
	if err != nil {
		b.Fatal(err)
	}
	defer w.Close()

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Rewind the file and restore the limit; otherwise the reader is
		// exhausted after the first iteration and later ones copy nothing.
		if _, err := f.Seek(0, io.SeekStart); err != nil {
			b.Fatal(err)
		}
		r.N = limit

		if _, err := w.Seek(0, io.SeekStart); err != nil {
			b.Fatal(err)
		}
		if _, err := copyZeroAlloc(w, r); err != nil {
			b.Fatal(err)
		}
	}
}

// BenchmarkCopyZeroAllocOSFileToOSFile measures copyZeroAlloc copying
// ./README.md from one *os.File into a temporary *os.File opened write-only.
func BenchmarkCopyZeroAllocOSFileToOSFile(b *testing.B) {
	r, err := os.Open("./README.md")
	if err != nil {
		b.Fatal(err)
	}
	defer r.Close()

	f, err := os.CreateTemp(os.TempDir(), "test_*")
	if err != nil {
		b.Fatal(err)
	}
	defer os.Remove(f.Name())
	// Only the path is needed from here on; close the handle CreateTemp
	// returned so it is not left open for the rest of the benchmark.
	if err := f.Close(); err != nil {
		b.Fatal(err)
	}

	w, err := os.OpenFile(f.Name(), os.O_WRONLY, 0o444)
	if err != nil {
		b.Fatal(err)
	}
	defer w.Close()

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Rewind both ends: the source so there is data to copy on every
		// iteration, the destination so the file is overwritten in place.
		if _, err := r.Seek(0, io.SeekStart); err != nil {
			b.Fatal(err)
		}
		if _, err := w.Seek(0, io.SeekStart); err != nil {
			b.Fatal(err)
		}
		if _, err := copyZeroAlloc(w, r); err != nil {
			b.Fatal(err)
		}
	}
}

// BenchmarkCopyZeroAllocOSFileToNetConn measures copyZeroAlloc copying
// ./README.md from an *os.File into a loopback TCP connection whose peer
// discards everything it receives.
func BenchmarkCopyZeroAllocOSFileToNetConn(b *testing.B) {
	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		b.Fatal(err)
	}
	defer ln.Close()

	// The server side accepts a single connection and drains it until the
	// client closes. Its result is delivered on a buffered channel so the
	// goroutine can always finish, even if the benchmark bails out early.
	drained := make(chan error, 1)
	go func() {
		conn, err := ln.Accept()
		if err != nil {
			drained <- err
			return
		}
		defer conn.Close()
		_, err = io.Copy(io.Discard, conn)
		drained <- err
	}()

	conn, err := net.Dial("tcp", ln.Addr().String())
	if err != nil {
		b.Fatal(err)
	}
	defer conn.Close()

	file, err := os.Open("./README.md")
	if err != nil {
		b.Fatal(err)
	}
	defer file.Close()

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Rewind the source; otherwise every iteration after the first
		// starts at EOF and sends nothing.
		if _, err := file.Seek(0, io.SeekStart); err != nil {
			b.Fatal(err)
		}
		if _, err := copyZeroAlloc(conn, file); err != nil {
			b.Fatal(err)
		}
	}
	b.StopTimer()

	// Close the client side so the server sees EOF, then wait for it so
	// that any server-side error is reported while b is still valid.
	if err := conn.Close(); err != nil {
		b.Fatal(err)
	}
	if err := <-drained; err != nil {
		b.Error(err)
	}
}

// BenchmarkCopyZeroAllocNetConnToOSFile measures copyZeroAlloc copying the
// contents of ./README.md from a loopback TCP connection into a temporary
// *os.File opened write-only. Every iteration uses a fresh connection that
// the server has already written to and closed; only the copy is timed.
func BenchmarkCopyZeroAllocNetConnToOSFile(b *testing.B) {
	data, err := os.ReadFile("./README.md")
	if err != nil {
		b.Fatal(err)
	}

	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		b.Fatal(err)
	}
	addr := ln.Addr().String()

	done := make(chan struct{})   // closed to tell the server to stop
	served := make(chan error)    // one result per accepted connection
	exited := make(chan struct{}) // closed when the server goroutine returns
	go func() {
		defer close(exited)
		for {
			conn, err := ln.Accept()
			if err == nil {
				_, err = conn.Write(data)
				conn.Close()
			}
			// Hand the result to the benchmark, unless it is shutting
			// down and nobody is listening any more.
			select {
			case served <- err:
			case <-done:
				return
			}
			if err != nil {
				return
			}
		}
	}()
	// Stop the server and wait for it, so no goroutine outlives the
	// benchmark. done is closed before the listener so the Accept error
	// caused by the close is not reported.
	defer func() {
		close(done)
		ln.Close()
		<-exited
	}()

	tmp, err := os.CreateTemp(os.TempDir(), "test_*")
	if err != nil {
		b.Fatal(err)
	}
	defer os.Remove(tmp.Name())
	// Only the path is needed from here on; close the handle CreateTemp
	// returned so it is not left open for the rest of the benchmark.
	if err := tmp.Close(); err != nil {
		b.Fatal(err)
	}

	file, err := os.OpenFile(tmp.Name(), os.O_WRONLY, 0o444)
	if err != nil {
		b.Fatal(err)
	}
	defer file.Close()

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		b.StopTimer()
		conn, err := net.Dial("tcp", addr)
		if err != nil {
			b.Fatal(err)
		}
		// Wait until the server has written the payload and closed its
		// side, then rewind the destination file.
		err = <-served
		if err == nil {
			_, err = file.Seek(0, io.SeekStart)
		}
		if err == nil {
			b.StartTimer()
			_, err = copyZeroAlloc(file, conn)
			b.StopTimer()
		}
		// Close this iteration's connection before reporting any failure
		// so that no connection is left open.
		conn.Close()
		if err != nil {
			b.Fatal(err)
		}
	}
}

0 comments on commit f6ba4ab

Please sign in to comment.