producer: ensure that the management message (fin) is never "leaked" #2182
@@ -840,6 +840,15 @@ func (bp *brokerProducer) run() {
 				continue
 			}

+			if msg.flags&fin == fin {
+				// New broker producer that was caught up by the retry loop
+				bp.parent.retryMessage(msg, ErrShuttingDown)
+				delete(bp.currentRetries[msg.Topic], msg.Partition)
+				Logger.Printf("producer/broker/%d state change to [dying-%d] on %s/%d\n",
+					bp.broker.ID(), msg.retries, msg.Topic, msg.Partition)
+				continue
+			}
+
 			if bp.buffer.wouldOverflow(msg) {
 				Logger.Printf("producer/broker/%d maximum request accumulated, waiting for space\n", bp.broker.ID())
 				if err := bp.waitForSpace(msg, false); err != nil {

Comment (on the fin-handling block): The fix preserves the original retry logic of bubbling up the fin message.
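For readers outside the sarama internals: syn and fin are management flag bits carried on ProducerMessage, used only to coordinate state between the partitionProducer and the brokerProducer. In the async_producer.go of this era they are declared roughly as follows (paraphrased from the sarama source; details may vary):

    type flagSet int8

    const (
    	syn flagSet = 1 << iota // first message from partitionProducer to brokerProducer
    	fin                     // final message from partitionProducer to brokerProducer and back
    	shutdown                // start the shutdown process
    )

A message with fin set must never reach the user-visible Successes()/Errors() channels; the msg.Msg.flags != 0 assertion in the test helpers below checks exactly that.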
@@ -17,31 +17,67 @@ import (

 const TestMessage = "ABC THE MESSAGE"

-func closeProducer(t *testing.T, p AsyncProducer) {
+func closeProducerWithTimeout(t *testing.T, p AsyncProducer, timeout time.Duration) {
 	var wg sync.WaitGroup
 	p.AsyncClose()
+
+	closer := make(chan struct{})
+	timer := time.AfterFunc(timeout, func() {
+		t.Error("timeout")
+		close(closer)
+	})
+	defer timer.Stop()
+
 	wg.Add(2)
 	go func() {
-		for range p.Successes() {
-			t.Error("Unexpected message on Successes()")
+		defer wg.Done()
+		for {
+			select {
+			case <-closer:
+				return
+			case _, ok := <-p.Successes():
+				if !ok {
+					return
+				}
+				t.Error("Unexpected message on Successes()")
+			}
 		}
-		wg.Done()
 	}()
 	go func() {
-		for msg := range p.Errors() {
-			t.Error(msg.Err)
+		defer wg.Done()
+		for {
+			select {
+			case <-closer:
+				return
+			case msg, ok := <-p.Errors():
+				if !ok {
+					return
+				}
+				t.Error(msg.Err)
+			}
 		}
-		wg.Done()
 	}()
 	wg.Wait()
 }

-func expectResults(t *testing.T, p AsyncProducer, successes, errors int) {
+func closeProducer(t *testing.T, p AsyncProducer) {
+	closeProducerWithTimeout(t, p, 5*time.Minute)
+}
+
+func expectResultsWithTimeout(t *testing.T, p AsyncProducer, successes, errors int, timeout time.Duration) {
+	t.Helper()
 	expect := successes + errors
+	defer func() {
+		if successes != 0 || errors != 0 {
+			t.Error("Unexpected successes", successes, "or errors", errors)
+		}
+	}()
+	timer := time.NewTimer(timeout)
+	defer timer.Stop()
 	for expect > 0 {
 		select {
+		case <-timer.C:
+			return
 		case msg := <-p.Errors():
 			if msg.Msg.flags != 0 {
 				t.Error("Message had flags set")

@@ -62,9 +98,10 @@ func expectResults(t *testing.T, p AsyncProducer, successes, errors int) {
 			}
 		}
 	}
-	if successes != 0 || errors != 0 {
-		t.Error("Unexpected successes", successes, "or errors", errors)
-	}
 }

+func expectResults(t *testing.T, p AsyncProducer, successes, errors int) {
+	expectResultsWithTimeout(t, p, successes, errors, 5*time.Minute)
+}
+
 type testPartitioner chan *int32
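The helper above combines a time.AfterFunc timer that fails the test with a closer channel that unblocks the draining goroutines. Distilled into a standalone sketch (drainWithTimeout is an illustrative name, not part of the PR; requires the fmt and time packages):

    // drainWithTimeout consumes ch until it is closed, or gives up after d.
    // It mirrors the closer-channel pattern in closeProducerWithTimeout.
    func drainWithTimeout(ch <-chan int, d time.Duration) error {
    	closer := make(chan struct{})
    	timer := time.AfterFunc(d, func() { close(closer) })
    	defer timer.Stop()
    	for {
    		select {
    		case <-closer:
    			return fmt.Errorf("timed out after %v", d)
    		case _, ok := <-ch:
    			if !ok {
    				return nil // channel closed cleanly
    			}
    		}
    	}
    }

The AfterFunc callback runs on its own goroutine, so closing a channel (rather than flipping a shared bool) is what makes the wake-up race-free.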
@@ -693,6 +730,112 @@ func TestAsyncProducerMultipleRetriesWithConcurrentRequests(t *testing.T) {
 	closeProducer(t, producer)
 }

+func TestAsyncProducerBrokerRestart(t *testing.T) {
+	// Logger = log.New(os.Stdout, "[sarama] ", log.LstdFlags)
+
+	seedBroker := NewMockBroker(t, 1)
+	leader := NewMockBroker(t, 2)
+
+	var leaderLock sync.Mutex
Reviewer: It seems like the leaderLock mutex could be avoided by capturing leaderID := leader.BrokerID() up front, as the same ID is used when the leader is closed and recreated:

    leaderID := leader.BrokerID()
    metadataLeader := new(MetadataResponse)
    ...
    seedBroker.setHandler(func(req *request) (res encoderWithHeader) {
    	return metadataLeader
    })

Author: It's used to guard the leader variable itself, which is reassigned when the broker is restarted.

Reviewer: Got it, I thought that, like some other unit tests, the same address was reused with NewMockBrokerAddr.
+
+	// The seed broker only handles Metadata request
+	seedBroker.setHandler(func(req *request) (res encoderWithHeader) {
+		leaderLock.Lock()
+		defer leaderLock.Unlock()
+		metadataLeader := new(MetadataResponse)
+		metadataLeader.AddBroker(leader.Addr(), leader.BrokerID())
+		metadataLeader.AddTopicPartition("my_topic", 0, leader.BrokerID(), nil, nil, nil, ErrNoError)
+		return metadataLeader
+	})
+
+	emptyValues := 0
+
+	produceRequestTest := func(req *request) {
+		preq := req.body.(*ProduceRequest)
+		if batch := preq.records["my_topic"][0].RecordBatch; batch != nil {
+			for _, record := range batch.Records {
+				if len(record.Value) == 0 {
+					emptyValues++
Reviewer: As the mock broker handlers run on their own goroutines, accessing emptyValues from them and from the test goroutine is racy.

Author: Good point, I will update this.
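A sketch of the kind of fix being agreed to here (illustrative; the actual follow-up commit may differ): count with sync/atomic so the handler goroutines and the test goroutine can share the counter safely.

    // import "sync/atomic"
    var emptyValues int64 // written by handler goroutines, read by the test

    // in produceRequestTest, instead of emptyValues++:
    atomic.AddInt64(&emptyValues, 1)

    // at the end of the test:
    if n := atomic.LoadInt64(&emptyValues); n > 0 {
    	t.Fatalf("%d empty values", n)
    }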
+				}
+			}
+		}
+		if batch := preq.records["my_topic"][0].MsgSet; batch != nil {
+			for _, record := range batch.Messages {
+				if len(record.Msg.Value) == 0 {
+					emptyValues++
+				}
+			}
+		}
+	}
+
+	leader.setHandler(func(req *request) (res encoderWithHeader) {
+		produceRequestTest(req)
Reviewer: A more suitable name for that function might be: …
+
+		time.Sleep(50 * time.Millisecond)
+
+		prodSuccess := new(ProduceResponse)
+		prodSuccess.AddTopicPartition("my_topic", 0, ErrNotLeaderForPartition)
+		return prodSuccess
+	})
+
+	config := NewTestConfig()
+	config.Producer.Retry.Backoff = time.Second
Reviewer: Being a unit test, we can probably speed up the runtime by using the default 250ms backoff.

Author: I didn't play much with this setting, I just set this value reasonably high to ensure that the test is not flaky.
+	config.Producer.Flush.MaxMessages = 1
+	config.Producer.Return.Errors = true
+	config.Producer.Return.Successes = true
+	config.Producer.Retry.Max = 10
+
+	producer, err := NewAsyncProducer([]string{seedBroker.Addr()}, config)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	var wg sync.WaitGroup
+
+	pushMsg := func() {
+		defer wg.Done()
+		for i := 0; i < 10; i++ {
+			producer.Input() <- &ProducerMessage{Topic: "my_topic", Key: nil, Value: StringEncoder(TestMessage)}
+			time.Sleep(50 * time.Millisecond)
+		}
+	}
+
+	wg.Add(1)
+	go pushMsg()
+
+	for i := 0; i < 3; i++ {
+		time.Sleep(100 * time.Millisecond)
Reviewer: I can reproduce the failure even if the other time.Sleep calls are removed 👍 So this seems to be the crux of reproducing the race condition: sending records in bursts while restarting the (mock) broker.

Author: Yes, the whole point is to have different "generations" of the retries.
+
+		wg.Add(1)
+		go pushMsg()
+	}
+
+	leader.Close()
+	leaderLock.Lock()
+	leader = NewMockBroker(t, 2)
+	leaderLock.Unlock()
+	leader.setHandler(func(req *request) (res encoderWithHeader) {
+		produceRequestTest(req)
+
+		prodSuccess := new(ProduceResponse)
+		prodSuccess.AddTopicPartition("my_topic", 0, ErrNoError)
+		return prodSuccess
+	})
+
+	wg.Wait()
+
+	expectResultsWithTimeout(t, producer, 40, 00, 10*time.Second)
Reviewer, suggested change:

    expectResultsWithTimeout(t, producer, 40, 0, 10*time.Second)
+
+	seedBroker.Close()
+	leader.Close()
+
+	closeProducerWithTimeout(t, producer, 5*time.Second)
+
+	if emptyValues > 0 {
+		t.Fatalf("%d empty values", emptyValues)
+	}
+}
+
 func TestAsyncProducerOutOfRetries(t *testing.T) {
 	t.Skip("Enable once bug #294 is fixed.")
Review thread (on the fin-handling block in the first hunk):

Reviewer: I'm not sure those 3 lines are necessary, because bp.currentRetries should be "empty" for that particular topic partition: bp.needsRetry(msg) returned nil above.
https://github.com/Shopify/sarama/blob/bad67e5b089437bc73f8034a95017e77be71e8b0/async_producer.go#L903-L909
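For reference, needsRetry at the linked lines reads roughly like this (paraphrased from the sarama source at that commit):

    func (bp *brokerProducer) needsRetry(msg *ProducerMessage) error {
    	if bp.closing != nil {
    		return bp.closing
    	}
    	// Indexing nil maps is safe in Go for reads, so this yields nil
    	// when no retry state is registered for the topic/partition.
    	return bp.currentRetries[msg.Topic][msg.Partition]
    }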
Author: It's true that bp.currentRetries[msg.Topic] is nil in this case (bp.currentRetries[msg.Topic][msg.Partition] is set to nil in the "syn" case above). I decided to delete the key to ensure that the key does not exist, avoiding the confusion of "key exists but is nil" vs. "key does not exist", and to mimic the other block of code above related to handling the fin message. TBH I'm not entirely sure about the purpose of the following code in handling the syn case:
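The code in question is the syn handling earlier in brokerProducer.run, which at the time read roughly as follows (reconstructed from the sarama source of that era; details may differ):

    if msg.flags&syn == syn {
    	Logger.Printf("producer/broker/%d state change to [open] on %s/%d\n",
    		bp.broker.ID(), msg.Topic, msg.Partition)
    	if bp.currentRetries[msg.Topic] == nil {
    		bp.currentRetries[msg.Topic] = make(map[int32]error)
    	}
    	bp.currentRetries[msg.Topic][msg.Partition] = nil
    	continue
    }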
I don't see where it might be useful (for now).
Reviewer: I believe that when a syn message is seen, the brokerProducer should initialize currentRetries, to prevent a panic when accessing bp.currentRetries[topic][partition] (typically on first use of the brokerProducer).

Author: Thanks @slaunay for the reply! I don't see how this could lead to any panics 🤔 I've given it another thought, and I think it does not really matter (as far as I can tell) whether the retries "metadata" structure is cleared or not (I was trying to imagine what would happen if the code were changed in the future so that this branch executed even if bp.currentRetries[msg.Topic][msg.Partition] were not nil). So I will update the code to minimise the changes.