-
Notifications
You must be signed in to change notification settings - Fork 7.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ZOOKEEPER-2678: Discovery and Sync can take a very long time on large DB (3.4) #157
Changes from all commits
5aa2562
f705293
f57c384
dcbf325
d079617
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,9 @@ | |
package org.apache.zookeeper.server.quorum; | ||
|
||
import static org.junit.Assert.assertEquals; | ||
import static org.mockito.Mockito.never; | ||
import static org.mockito.Mockito.spy; | ||
import static org.mockito.Mockito.verify; | ||
|
||
import java.io.BufferedInputStream; | ||
import java.io.BufferedReader; | ||
|
@@ -645,6 +648,8 @@ public void converseWithFollower(InputArchive ia, OutputArchive oa, | |
tmpDir.mkdir(); | ||
File logDir = f.fzk.getTxnLogFactory().getDataDir().getParentFile(); | ||
File snapDir = f.fzk.getTxnLogFactory().getSnapDir().getParentFile(); | ||
//Spy on ZK so we can check if a snapshot happened or not. | ||
f.zk = spy(f.zk); | ||
try { | ||
Assert.assertEquals(0, f.self.getAcceptedEpoch()); | ||
Assert.assertEquals(0, f.self.getCurrentEpoch()); | ||
|
@@ -687,6 +692,10 @@ public void converseWithFollower(InputArchive ia, OutputArchive oa, | |
oa.writeRecord(qp, null); | ||
zkDb.serializeSnapshot(oa); | ||
oa.writeString("BenWasHere", null); | ||
Thread.sleep(10); //Give it some time to process the snap | ||
//No Snapshot taken yet, the SNAP was applied in memory | ||
verify(f.zk, never()).takeSnapshot(); | ||
|
||
qp.setType(Leader.NEWLEADER); | ||
qp.setZxid(ZxidUtils.makeZxid(1, 0)); | ||
oa.writeRecord(qp, null); | ||
|
@@ -697,7 +706,8 @@ public void converseWithFollower(InputArchive ia, OutputArchive oa, | |
Assert.assertEquals(ZxidUtils.makeZxid(1, 0), qp.getZxid()); | ||
Assert.assertEquals(1, f.self.getAcceptedEpoch()); | ||
Assert.assertEquals(1, f.self.getCurrentEpoch()); | ||
|
||
//Make sure that we did take the snapshot now | ||
verify(f.zk).takeSnapshot(); | ||
Assert.assertEquals(firstZxid, f.fzk.getLastProcessedZxid()); | ||
|
||
// Make sure the data was recorded in the filesystem ok | ||
|
@@ -773,6 +783,8 @@ public void converseWithFollower(InputArchive ia, OutputArchive oa, | |
tmpDir.mkdir(); | ||
File logDir = f.fzk.getTxnLogFactory().getDataDir().getParentFile(); | ||
File snapDir = f.fzk.getTxnLogFactory().getSnapDir().getParentFile(); | ||
//Spy on ZK so we can check if a snapshot happened or not. | ||
f.zk = spy(f.zk); | ||
try { | ||
Assert.assertEquals(0, f.self.getAcceptedEpoch()); | ||
Assert.assertEquals(0, f.self.getCurrentEpoch()); | ||
|
@@ -839,13 +851,28 @@ public void converseWithFollower(InputArchive ia, OutputArchive oa, | |
Assert.assertEquals(1, f.self.getAcceptedEpoch()); | ||
Assert.assertEquals(1, f.self.getCurrentEpoch()); | ||
|
||
//Wait for the transactions to be written out. The thread that writes them out | ||
// does not send anything back when it is done. | ||
long start = System.currentTimeMillis(); | ||
while (createSessionZxid != f.fzk.getLastProcessedZxid() && (System.currentTimeMillis() - start) < 50) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just an idea, not sure if it is worth the effort and it may be outside the scope of this patch. we could play with the test infrastructure here a little bit and do some dependency injection in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It does seem a bit beyond the scope of this. But if you really want me to I can look into it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it would be nice, a good way of actually validating everything is behaving as expected There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK Will see what I can do |
||
Thread.sleep(1); | ||
} | ||
|
||
Assert.assertEquals(createSessionZxid, f.fzk.getLastProcessedZxid()); | ||
|
||
// Make sure the data was recorded in the filesystem ok | ||
ZKDatabase zkDb2 = new ZKDatabase(new FileTxnSnapLog(logDir, snapDir)); | ||
start = System.currentTimeMillis(); | ||
zkDb2.loadDataBase(); | ||
while (zkDb2.getSessionWithTimeOuts().isEmpty() && (System.currentTimeMillis() - start) < 50) { | ||
Thread.sleep(1); | ||
zkDb2.loadDataBase(); | ||
} | ||
LOG.info("zkdb2 sessions:" + zkDb2.getSessions()); | ||
LOG.info("zkdb2 with timeouts:" + zkDb2.getSessionWithTimeOuts()); | ||
Assert.assertNotNull(zkDb2.getSessionWithTimeOuts().get(4L)); | ||
//Snapshot was never taken during very simple sync | ||
verify(f.zk, never()).takeSnapshot(); | ||
} finally { | ||
recursiveDelete(tmpDir); | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is one case we may still want to clear db here - when one of the ZooKeeper critical threads (such as * processors, session trackers) fail, ZooKeeper server will shutdown (see runFromConfig) and consequently invoke ZooKeeper#shutdown. In this case, I don't see a particular reason not to clear the db, though not doing it might be fine (as one could argue the server will be dead anyway), but I tend to lean towards the safe side on cleaning the db. One way to conditionally do that is to add a Boolean parameter to ZooKeeper#shutdown so we can have fine grained control over when to clear db in what code path.