Skip to content

Commit

Permalink
ZOOKEEPER-261: Reinitialized servers should not participate in leader…
Browse files Browse the repository at this point in the history
… election

Author: Brian Nixon <nixon@fb.com>

Reviewers: Michael Han <hanm@apache.org>, Edward Ribeiro <edward.ribeiro@gmail.com>, benjamin reed <breed@apache.org>

Closes apache#120 from enixon/ZOOKEEPER-261
  • Loading branch information
Brian Nixon authored and RokLenarcic committed Sep 3, 2022
1 parent 4903bc8 commit 0838dcf
Show file tree
Hide file tree
Showing 12 changed files with 267 additions and 88 deletions.
2 changes: 2 additions & 0 deletions bin/zkServer-initialize.sh
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ initialize() {
else
echo "No myid provided, be sure to specify it in $ZOO_DATADIR/myid if using non-standalone"
fi

touch "$ZOO_DATADIR/initialize"
}

eval set -- "${OPTS}"
Expand Down
46 changes: 45 additions & 1 deletion src/docs/src/documentation/content/xdocs/zookeeperAdmin.xml
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,16 @@ server.3=zoo3:2888:3888</programlisting>
ensemble and should have a value between 1 and 255.</para>
</listitem>

<listitem>
<para>Create an initialization marker file <filename>initialize</filename>
in the same directory as <filename>myid</filename>. This file indicates
that an empty data directory is expected. When present, an empty database
is created and the marker file is deleted. When not present, an empty data
directory will mean this peer will not have voting rights and it will not
populate the data directory until it communicates with an active leader.
The intended use is to create this file only when bringing up a new
ensemble.</para>
</listitem>
<listitem>
<para>If your configuration file is set up, you can start a
ZooKeeper server:</para>
Expand Down Expand Up @@ -1408,6 +1418,35 @@ server.3=zoo3:2888:3888</programlisting>
</para>
</section>

<section id="sc_db_existence_validation">
<title>Enabling db existence validation</title>

<para><emphasis role="bold">New in 3.6.0:</emphasis> The default
behavior of a ZooKeeper server on startup when no data tree
is found is to set zxid to zero and join the quorum as a
voting member. This can be dangerous if some event (e.g. a
rogue 'rm -rf') has removed the data directory while the
server was down since this server may help elect a leader
that is missing transactions. Enabling db existence validation
will change the behavior on startup when no data tree is
found: the server joins the ensemble as a non-voting participant
until it is able to sync with the leader and acquire an up-to-date
version of the ensemble data. To indicate an empty data tree is
expected (ensemble creation), the user should place a file
'initialize' in the same directory as 'myid'. This file will
be detected and deleted by the server on startup.
</para>

<para> Db existence validation can be enabled when running
ZooKeeper servers directly from class files by setting
<emphasis role="bold">zookeeper.db.autocreate=false</emphasis>
on the java command line, i.e.
<emphasis role="bold">-Dzookeeper.db.autocreate=false</emphasis>.
Running <emphasis role="bold">zkServer-initialize.sh</emphasis>
will create the required initialization file.
</para>
</section>

<section id="sc_performance_options">
<title>Performance Tuning Options</title>

Expand Down Expand Up @@ -1943,14 +1982,19 @@ server.3=zoo3:2888:3888</programlisting>
<section>
<title>The Data Directory</title>

<para>This directory has two files in it:</para>
<para>This directory has two or three files in it:</para>

<itemizedlist>
<listitem>
<para><filename>myid</filename> - contains a single integer in
human readable ASCII text that represents the server id.</para>
</listitem>

<listitem>
<para><filename>initialize</filename> - its presence indicates that the
lack of a data tree is expected. It is cleaned up once the data tree is created.</para>
</listitem>

<listitem>
<para><filename>snapshot.&lt;zxid&gt;</filename> - holds the fuzzy
snapshot of a data tree.</para>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
Expand All @@ -45,14 +46,15 @@
* classes
*/
public class FileTxnSnapLog {
//the direcotry containing the
//the directory containing the
//the transaction logs
private final File dataDir;
//the directory containing the
//the snapshot directory
private final File snapDir;
private TxnLog txnLog;
private SnapShot snapLog;
private final boolean autoCreateDB;
public final static int VERSION = 2;
public final static String version = "version-";

Expand All @@ -63,6 +65,10 @@ public class FileTxnSnapLog {

public static final String ZOOKEEPER_DATADIR_AUTOCREATE_DEFAULT = "true";

static final String ZOOKEEPER_DB_AUTOCREATE = "zookeeper.db.autocreate";

private static final String ZOOKEEPER_DB_AUTOCREATE_DEFAULT = "true";

/**
* This listener helps
* the external apis calling
Expand Down Expand Up @@ -132,6 +138,9 @@ public FileTxnSnapLog(File dataDir, File snapDir) throws IOException {

txnLog = new FileTxnLog(this.dataDir);
snapLog = new FileSnap(this.snapDir);

autoCreateDB = Boolean.parseBoolean(System.getProperty(ZOOKEEPER_DB_AUTOCREATE,
ZOOKEEPER_DB_AUTOCREATE_DEFAULT));
}

/**
Expand Down Expand Up @@ -167,6 +176,14 @@ public long restore(DataTree dt, Map<Long, Integer> sessions,
PlayBackListener listener) throws IOException {
long deserializeResult = snapLog.deserialize(dt, sessions);
FileTxnLog txnLog = new FileTxnLog(dataDir);
boolean trustEmptyDB;
File initFile = new File(dataDir.getParent(), "initialize");
if (Files.deleteIfExists(initFile.toPath())) {
LOG.info("Initialize file found, an empty database will not block voting participation");
trustEmptyDB = true;
} else {
trustEmptyDB = autoCreateDB;
}
if (-1L == deserializeResult) {
/* this means that we couldn't find any snapshot, so we need to
* initialize an empty database (reported in ZOOKEEPER-2325) */
Expand All @@ -175,11 +192,20 @@ public long restore(DataTree dt, Map<Long, Integer> sessions,
"No snapshot found, but there are log entries. " +
"Something is broken!");
}
/* TODO: (br33d) we should either put a ConcurrentHashMap on restore()
* or use Map on save() */
save(dt, (ConcurrentHashMap<Long, Integer>)sessions);
/* return a zxid of zero, since we the database is empty */
return 0;

if (trustEmptyDB) {
/* TODO: (br33d) we should either put a ConcurrentHashMap on restore()
* or use Map on save() */
save(dt, (ConcurrentHashMap<Long, Integer>)sessions);

/* return a zxid of 0, since we know the database is empty */
return 0L;
} else {
/* return a zxid of -1, since we are possibly missing data */
LOG.warn("Unexpected empty data tree, setting zxid to -1");
dt.lastProcessedZxid = -1L;
return -1L;
}
}
TxnIterator itr = txnLog.read(dt.lastProcessedZxid+1);
long highestZxid = dt.lastProcessedZxid;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -934,6 +934,14 @@ else if (self.getCurrentAndNextConfigVoters().contains(n.sid)) {
*/
switch (n.state) {
case LOOKING:
if (getInitLastLoggedZxid() == -1) {
LOG.debug("Ignoring notification as our zxid is -1");
break;
}
if (n.zxid == -1) {
LOG.debug("Ignoring notification from member with -1 zxid" + n.sid);
break;
}
// If notification > current, replace and send messages out
if (n.electionEpoch > logicalclock.get()) {
logicalclock.set(n.electionEpoch);
Expand Down
4 changes: 3 additions & 1 deletion src/java/main/org/apache/zookeeper/server/quorum/Leader.java
Original file line number Diff line number Diff line change
Expand Up @@ -1211,7 +1211,9 @@ public void waitForEpochAck(long id, StateSummary ss) throws IOException, Interr
+ leaderStateSummary.getLastZxid()
+ " (last zxid)");
}
electingFollowers.add(id);
if (ss.getLastZxid() != -1) {
electingFollowers.add(id);
}
}
QuorumVerifier verifier = self.getQuorumVerifier();
if (electingFollowers.contains(self.getId()) && verifier.containsQuorum(electingFollowers)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ public MainThread(int clientPort, boolean preCreateDirs, File tmpDir, String con
if (!logDir.mkdir()) {
throw new IOException("unable to mkdir " + logDir);
}
ClientBase.createInitializeFile(logDir);
}

String normalizedDataDir = PathUtils.normalizeFileSystemPath(dataDir.toString());
Expand Down Expand Up @@ -318,71 +319,6 @@ public void testStandalone() throws Exception {
ClientBase.CONNECTION_TIMEOUT));
}

/**
* Verifies that the server does not come up when the data dir or data log
* dir does not exist and "zookeeper.datadir.autocreate" is set to false.
*
* The system property is set to "false" for the duration of the test and
* restored to its default value in the finally block.
*
* @throws Exception if test setup or the server thread fails unexpectedly
*/
@Test(timeout = 30000)
public void testWithoutAutoCreateDataLogDir() throws Exception {
ClientBase.setupTestEnv();
System.setProperty(FileTxnSnapLog.ZOOKEEPER_DATADIR_AUTOCREATE, "false");
try {
final int CLIENT_PORT = PortAssignment.unique();

// NOTE(review): the second argument is presumably preCreateDirs=false,
// i.e. the directories are deliberately not created up front - confirm
// against the MainThread constructor.
MainThread main = new MainThread(CLIENT_PORT, false, null);
// args is populated here but not read again within this method
String args[] = new String[1];
args[0] = main.confFile.toString();
main.start();

// the server is expected NOT to come up; waits only half the usual timeout
Assert.assertFalse("waiting for server being up", ClientBase
.waitForServerUp("127.0.0.1:" + CLIENT_PORT,
CONNECTION_TIMEOUT / 2));
} finally {
// resets "zookeeper.datadir.autocreate" flag
System.setProperty(FileTxnSnapLog.ZOOKEEPER_DATADIR_AUTOCREATE,
FileTxnSnapLog.ZOOKEEPER_DATADIR_AUTOCREATE_DEFAULT);
}
}

/**
* Verifies the auto creation of the data dir and data log dir when
* "zookeeper.datadir.autocreate" is set to true: the server must come up,
* accept a client connection, and serve a create/read round trip.
*
* NOTE(review): unlike the companion "without auto create" test, this method
* does not reset the system property afterwards.
*
* @throws Exception if the server, the client connection or a ZooKeeper
*         operation fails unexpectedly
*/
@Test(timeout = 30000)
public void testWithAutoCreateDataLogDir() throws Exception {
ClientBase.setupTestEnv();
System.setProperty(FileTxnSnapLog.ZOOKEEPER_DATADIR_AUTOCREATE, "true");
final int CLIENT_PORT = PortAssignment.unique();

// NOTE(review): the second argument is presumably preCreateDirs=false, so
// the server itself has to create the directories - confirm against the
// MainThread constructor.
MainThread main = new MainThread(CLIENT_PORT, false, null);
// args is populated here but not read again within this method
String args[] = new String[1];
args[0] = main.confFile.toString();
main.start();

Assert.assertTrue("waiting for server being up",
ClientBase.waitForServerUp("127.0.0.1:" + CLIENT_PORT,
CONNECTION_TIMEOUT));
// this test instance is passed as the watcher; presumably its callback
// counts down clientConnected once the session is established - verify
clientConnected = new CountDownLatch(1);
ZooKeeper zk = new ZooKeeper("127.0.0.1:" + CLIENT_PORT,
ClientBase.CONNECTION_TIMEOUT, this);
Assert.assertTrue("Failed to establish zkclient connection!",
clientConnected.await(CONNECTION_TIMEOUT, TimeUnit.MILLISECONDS));

// write a znode and read it back to prove the server is usable
zk.create("/foo", "foobar".getBytes(), Ids.OPEN_ACL_UNSAFE,
CreateMode.PERSISTENT);
// NOTE(review): JUnit's assertEquals takes (expected, actual); the
// arguments here are in the opposite order, which only affects the
// wording of a failure message.
Assert.assertEquals(new String(zk.getData("/foo", null, null)),
"foobar");
zk.close();

// stop the server thread and remove its directories
main.shutdown();
main.join();
main.deleteDirs();

Assert.assertTrue("waiting for server down", ClientBase
.waitForServerDown("127.0.0.1:" + CLIENT_PORT,
ClientBase.CONNECTION_TIMEOUT));
}

/**
* Test verifies that the server shouldn't allow minsessiontimeout >
* maxsessiontimeout
Expand All @@ -398,7 +334,7 @@ public void testWithMinSessionTimeoutGreaterThanMaxSessionTimeout()
final int maxSessionTimeout = tickTime * 2 - 100; // max is lower
final String configs = "maxSessionTimeout=" + maxSessionTimeout + "\n"
+ "minSessionTimeout=" + minSessionTimeout + "\n";
MainThread main = new MainThread(CLIENT_PORT, false, configs);
MainThread main = new MainThread(CLIENT_PORT, true, configs);
String args[] = new String[1];
args[0] = main.confFile.toString();
try {
Expand All @@ -423,7 +359,7 @@ public void testWithOnlyMinSessionTimeout() throws Exception {
final int minSessionTimeout = tickTime * 2 - 100;
int maxSessionTimeout = 20 * tickTime;
final String configs = "minSessionTimeout=" + minSessionTimeout + "\n";
MainThread main = new MainThread(CLIENT_PORT, false, configs);
MainThread main = new MainThread(CLIENT_PORT, true, configs);
main.start();

String HOSTPORT = "127.0.0.1:" + CLIENT_PORT;
Expand Down Expand Up @@ -456,7 +392,7 @@ public void testMinMaxSessionTimeOut() throws Exception {
final int maxSessionTimeout = 20 * tickTime + 1000;
final String configs = "maxSessionTimeout=" + maxSessionTimeout + "\n"
+ "minSessionTimeout=" + minSessionTimeout + "\n";
MainThread main = new MainThread(CLIENT_PORT, false, configs);
MainThread main = new MainThread(CLIENT_PORT, true, configs);
main.start();

String HOSTPORT = "127.0.0.1:" + CLIENT_PORT;
Expand Down
Loading

0 comments on commit 0838dcf

Please sign in to comment.