Skip to content

Commit

Permalink
transport: fix logical race in flow control
Browse files Browse the repository at this point in the history
Remove the add and cancel methods of quotaPool. Their use is not required, and
leads to logical races when used concurrently from multiple goroutines. Rename
the reset method to add.

The typical way that a goroutine claims quota is to call the add method and
then to select on the channel returned by the acquire method. If two
goroutines are both trying to claim quota from a single quotaPool, the second
call to the add method can happen before the first attempt to read from the
channel. When that happens the second goroutine to attempt the channel read
will end up waiting for a very long time, in spite of its efforts to prepare
the channel for reading.

The quotaPool will always behave correctly when any positive quota is on the
channel rather than stored in the struct field. In the opposite case, when
positive quota is only in the struct field and not on the channel, users of
the quotaPool can fail to access available quota. Err on the side of storing
any positive quota in the channel.

This includes a reproducer for grpc#632, which fails on many runs with this
package at v1.0.4. The frequency of the test failures depends on how stressed
the server is, since it's now effectively checking for weird interleavings of
goroutines. It passes reliably with these changes to the transport package.

The behavior described in grpc#734 (an RPC with a streaming response hangs
unexpectedly) matches what I've seen in my programs, and what I see in the
test case added here. If it's a logical flow control bug, this change may well
fix it.

Updates grpc#632
Updates grpc#734
  • Loading branch information
rhysh committed Nov 30, 2016
1 parent cc3363f commit 609b15b
Show file tree
Hide file tree
Showing 5 changed files with 184 additions and 38 deletions.
1 change: 1 addition & 0 deletions test/end2end_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3208,6 +3208,7 @@ func interestingGoroutines() (gs []string) {
strings.Contains(stack, "testing.tRunner(") ||
strings.Contains(stack, "runtime.goexit") ||
strings.Contains(stack, "created by runtime.gc") ||
strings.Contains(stack, "created by runtime/trace.Start") ||
strings.Contains(stack, "created by google3/base/go/log.init") ||
strings.Contains(stack, "interestingGoroutines") ||
strings.Contains(stack, "runtime.MHeap_Scavenger") ||
Expand Down
177 changes: 177 additions & 0 deletions test/flow_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
/*
*
* Copyright 2016, Google Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/

package grpc_test

import (
"bytes"
"io"
"net"
"testing"
"time"

"golang.org/x/net/context"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/test/grpc_testing"
)

func TestGRPCGoIssue632(t *testing.T) {
// Test for a regression of https://github.com/grpc/grpc-go/issues/632,
// and other flow control bugs.

defer leakCheck(t)()

const (
itemCount = 100
itemSize = 1 << 10
recvCount = 2
maxFailures = 3

requestCount = 10000
totalTimeout = 30 * time.Second
requestTimeout = time.Second
)

errs := make(chan error, 1)

lis, err := net.Listen("tcp", "127.0.0.1:0")
if err != nil {
t.Fatalf("Listen; err = %q", err)
}
defer lis.Close()

s := grpc.NewServer()
grpc_testing.RegisterTestServiceServer(s, &server{
itemCount: itemCount,
itemSize: itemSize,
})
defer s.Stop()

go func() {
err = s.Serve(lis)
if err != nil {
select {
case errs <- err:
default:
}
}
}()

ctx := context.Background()
ctx, _ = context.WithTimeout(ctx, totalTimeout)
defer func() {
if ctx.Err() == context.DeadlineExceeded {
t.Fatalf("test timed out")
}
}()

cc, err := grpc.Dial(lis.Addr().String(), grpc.WithInsecure())
if err != nil {
t.Fatalf("Dial; err = %q", err)
}
defer cc.Close()
cl := grpc_testing.NewTestServiceClient(cc)

failures := 0
for i := 0; i < requestCount; i++ {
ctx, cancel := context.WithTimeout(ctx, requestTimeout)
output, err := cl.StreamingOutputCall(ctx, &grpc_testing.StreamingOutputCallRequest{})
if err != nil {
t.Fatalf("StreamingOutputCall; err = %q", err)
}

j := 0
loop:
for ; j < recvCount; j++ {
_, err := output.Recv()
if err != nil {
if err == io.EOF {
break loop
}
switch grpc.Code(err) {
case codes.DeadlineExceeded:
break loop
default:
t.Fatalf("Recv; err = %q", err)
}
}
}
cancel()
<-ctx.Done()

if j < recvCount {
t.Errorf("got %d responses to request %d", j, i)
failures++
if failures >= maxFailures {
break
}
}
}

select {
case err = <-errs:
default:
}
if err != nil {
t.Fatalf("err = %q", err)
}
}

type server struct {
grpc_testing.TestServiceServer

itemSize int
itemCount int
}

func (s *server) StreamingOutputCall(req *grpc_testing.StreamingOutputCallRequest, srv grpc_testing.TestService_StreamingOutputCallServer) error {
for i := 0; i < s.itemCount; i++ {
err := srv.Send(&grpc_testing.StreamingOutputCallResponse{
Payload: &grpc_testing.Payload{
// Sending a large stream of data which the client reject
// helps to trigger some types of flow control bugs.
//
// Reallocating memory here is inefficient, but the stress it
// puts on the GC leads to more frequent flow control
// failures. The GC likely causes more variety in the
// goroutine scheduling orders.
Body: bytes.Repeat([]byte("a"), s.itemSize),
},
})
if err != nil {
return err
}
}
return nil
}
30 changes: 2 additions & 28 deletions transport/control.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,35 +111,9 @@ func newQuotaPool(q int) *quotaPool {
return qb
}

// add adds n to the available quota and tries to send it on acquire.
func (qb *quotaPool) add(n int) {
qb.mu.Lock()
defer qb.mu.Unlock()
qb.quota += n
if qb.quota <= 0 {
return
}
select {
case qb.c <- qb.quota:
qb.quota = 0
default:
}
}

// cancel cancels the pending quota sent on acquire, if any.
func (qb *quotaPool) cancel() {
qb.mu.Lock()
defer qb.mu.Unlock()
select {
case n := <-qb.c:
qb.quota += n
default:
}
}

// reset cancels the pending quota sent on acquired, incremented by v and sends
// add cancels the pending quota sent on acquired, incremented by v and sends
// it back on acquire.
func (qb *quotaPool) reset(v int) {
func (qb *quotaPool) add(v int) {
qb.mu.Lock()
defer qb.mu.Unlock()
select {
Expand Down
9 changes: 3 additions & 6 deletions transport/http2_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,7 @@ func (t *http2Client) NewStream(ctx context.Context, callHdr *CallHdr) (_ *Strea
}
t.mu.Unlock()
if reset {
t.streamsQuota.reset(-1)
t.streamsQuota.add(-1)
}

// HPACK encodes various headers. Note that once WriteField(...) is
Expand Down Expand Up @@ -614,9 +614,6 @@ func (t *http2Client) Write(s *Stream, data []byte, opts *Options) error {
// Wait until the transport has some quota to send the data.
tq, err := wait(s.ctx, s.done, s.goAway, t.shutdownChan, t.sendQuotaPool.acquire())
if err != nil {
if _, ok := err.(StreamError); ok || err == io.EOF {
t.sendQuotaPool.cancel()
}
return err
}
if sq < size {
Expand Down Expand Up @@ -1035,13 +1032,13 @@ func (t *http2Client) applySettings(ss []http2.Setting) {
t.maxStreams = int(s.Val)
t.mu.Unlock()
if reset {
t.streamsQuota.reset(int(s.Val) - ms)
t.streamsQuota.add(int(s.Val) - ms)
}
case http2.SettingInitialWindowSize:
t.mu.Lock()
for _, stream := range t.activeStreams {
// Adjust the sending quota for each stream.
stream.sendQuotaPool.reset(int(s.Val - t.streamSendQuota))
stream.sendQuotaPool.add(int(s.Val - t.streamSendQuota))
}
t.streamSendQuota = s.Val
t.mu.Unlock()
Expand Down
5 changes: 1 addition & 4 deletions transport/http2_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -636,9 +636,6 @@ func (t *http2Server) Write(s *Stream, data []byte, opts *Options) error {
// Wait until the transport has some quota to send the data.
tq, err := wait(s.ctx, nil, nil, t.shutdownChan, t.sendQuotaPool.acquire())
if err != nil {
if _, ok := err.(StreamError); ok {
t.sendQuotaPool.cancel()
}
return err
}
if sq < size {
Expand Down Expand Up @@ -706,7 +703,7 @@ func (t *http2Server) applySettings(ss []http2.Setting) {
t.mu.Lock()
defer t.mu.Unlock()
for _, stream := range t.activeStreams {
stream.sendQuotaPool.reset(int(s.Val - t.streamSendQuota))
stream.sendQuotaPool.add(int(s.Val - t.streamSendQuota))
}
t.streamSendQuota = s.Val
}
Expand Down

0 comments on commit 609b15b

Please sign in to comment.