diff --git a/core/src/main/java/org/apache/accumulo/core/conf/Property.java b/core/src/main/java/org/apache/accumulo/core/conf/Property.java index 0fa037366cb..b1a658ea2f6 100644 --- a/core/src/main/java/org/apache/accumulo/core/conf/Property.java +++ b/core/src/main/java/org/apache/accumulo/core/conf/Property.java @@ -329,6 +329,11 @@ public enum Property { + " was changed and it now can accept multiple class names. The metrics spi was introduced in 2.1.3," + " the deprecated factory is org.apache.accumulo.core.metrics.MeterRegistryFactory.", "2.1.0"), + GENERAL_SERVER_LOCK_VERIFICATION_INTERVAL("general.server.lock.verification.interval", "0", + PropertyType.TIMEDURATION, + "Interval at which the Manager and TabletServer should verify their server locks. A value of zero" + + " disables this check.", + "2.1.4"), // properties that are specific to manager server behavior MANAGER_PREFIX("manager.", null, PropertyType.PREFIX, "Properties in this category affect the behavior of the manager server. " diff --git a/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ServiceLock.java b/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ServiceLock.java index 63807d0f374..375c2ef665a 100644 --- a/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ServiceLock.java +++ b/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ServiceLock.java @@ -762,4 +762,27 @@ public static boolean deleteLock(ZooReaderWriter zk, ServiceLockPath path, Strin return false; } + + /** + * Checks that the lock still exists in ZooKeeper. The typical mechanism for determining if a lock + * is lost depends on a Watcher set on the lock node. There exists a case where the Watcher may + * not get called if another Watcher is stuck waiting on I/O or otherwise hung. In the case where + * this method returns false, then the typical action is to exit the server process. + * + * @return true if lock path still exists, false otherwise and on error + */ + public boolean verifyLockAtSource() { + final String lockPath = getLockPath(); + if (lockPath == null) { + // lock not set yet or lock was lost + return false; + } + try { + return null != this.zooKeeper.exists(lockPath, false); + } catch (KeeperException | InterruptedException | RuntimeException e) { + LOG.error("Error verfiying lock at {}", lockPath, e); + return false; + } + } + } diff --git a/server/base/src/main/java/org/apache/accumulo/server/AbstractServer.java b/server/base/src/main/java/org/apache/accumulo/server/AbstractServer.java index edfc567109d..c65314a0f12 100644 --- a/server/base/src/main/java/org/apache/accumulo/server/AbstractServer.java +++ b/server/base/src/main/java/org/apache/accumulo/server/AbstractServer.java @@ -19,6 +19,7 @@ package org.apache.accumulo.server; import java.util.Objects; +import java.util.OptionalInt; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicReference; @@ -26,13 +27,18 @@ import org.apache.accumulo.core.classloader.ClassLoaderUtil; import org.apache.accumulo.core.conf.AccumuloConfiguration; import org.apache.accumulo.core.conf.Property; +import org.apache.accumulo.core.fate.zookeeper.ServiceLock; import org.apache.accumulo.core.metrics.MetricsProducer; import org.apache.accumulo.core.trace.TraceUtil; +import org.apache.accumulo.core.util.Halt; +import org.apache.accumulo.core.util.threads.Threads; import org.apache.accumulo.server.metrics.ProcessMetrics; import org.apache.accumulo.server.security.SecurityUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.base.Preconditions; + import io.micrometer.core.instrument.MeterRegistry; public abstract class AbstractServer implements AutoCloseable, MetricsProducer, Runnable { @@ -44,6 +50,8 @@ public abstract class AbstractServer implements AutoCloseable, MetricsProducer, private final ProcessMetrics processMetrics; protected final long idleReportingPeriodNanos; private volatile long idlePeriodStartNanos = 0L; + private volatile Thread serverThread; + private volatile Thread verificationThread; protected AbstractServer(String appName, ServerOpts opts, String[] args) { this.log = LoggerFactory.getLogger(getClass().getName()); @@ -99,10 +107,14 @@ protected void updateIdleStatus(boolean isIdle) { */ public void runServer() throws Exception { final AtomicReference err = new AtomicReference<>(); - Thread service = new Thread(TraceUtil.wrap(this), applicationName); - service.setUncaughtExceptionHandler((thread, exception) -> err.set(exception)); - service.start(); - service.join(); + serverThread = new Thread(TraceUtil.wrap(this), applicationName); + serverThread.setUncaughtExceptionHandler((thread, exception) -> err.set(exception)); + serverThread.start(); + serverThread.join(); + if (verificationThread != null) { + verificationThread.interrupt(); + verificationThread.join(); + } Throwable thrown = err.get(); if (thrown != null) { if (thrown instanceof Error) { @@ -139,6 +151,53 @@ public String getApplicationName() { return applicationName; } + /** + * Get the ServiceLock for this server process. May return null if called before the lock is + * acquired. + * + * @return lock ServiceLock or null + */ + public abstract ServiceLock getLock(); + + public void startServiceLockVerificationThread() { + Preconditions.checkState(verificationThread == null, + "verification thread not null, startServiceLockVerificationThread likely called twice"); + Preconditions.checkState(serverThread != null, + "server thread is null, no server process is running"); + final long interval = + getConfiguration().getTimeInMillis(Property.GENERAL_SERVER_LOCK_VERIFICATION_INTERVAL); + if (interval > 0) { + verificationThread = Threads.createThread("service-lock-verification-thread", + OptionalInt.of(Thread.NORM_PRIORITY + 1), () -> { + while (serverThread.isAlive()) { + ServiceLock lock = getLock(); + try { + log.trace( + "ServiceLockVerificationThread - checking ServiceLock existence in ZooKeeper"); + if (lock != null && !lock.verifyLockAtSource()) { + Halt.halt("Lock verification thread could not find lock", -1); + } + // Need to sleep, not yield when the thread priority is greater than NORM_PRIORITY + // so that this thread does not get immediately rescheduled. + log.trace( + "ServiceLockVerificationThread - ServiceLock exists in ZooKeeper, sleeping for {}ms", + interval); + Thread.sleep(interval); + } catch (InterruptedException e) { + if (serverThread.isAlive()) { + // throw an Error, which will cause this process to be terminated + throw new Error("Sleep interrupted in ServiceLock verification thread"); + } + } + } + }); + verificationThread.start(); + } else { + log.info("ServiceLockVerificationThread not started as " + + Property.GENERAL_SERVER_LOCK_VERIFICATION_INTERVAL.getKey() + " is zero"); + } + } + @Override public void close() {} diff --git a/server/compaction-coordinator/src/main/java/org/apache/accumulo/coordinator/CompactionCoordinator.java b/server/compaction-coordinator/src/main/java/org/apache/accumulo/coordinator/CompactionCoordinator.java index b6130672e1f..b735d8544dc 100644 --- a/server/compaction-coordinator/src/main/java/org/apache/accumulo/coordinator/CompactionCoordinator.java +++ b/server/compaction-coordinator/src/main/java/org/apache/accumulo/coordinator/CompactionCoordinator.java @@ -768,6 +768,11 @@ private void cleanUpCompactors() { } } + @Override + public ServiceLock getLock() { + return coordinatorLock; + } + public static void main(String[] args) throws Exception { try (CompactionCoordinator compactor = new CompactionCoordinator(new ServerOpts(), args)) { compactor.runServer(); diff --git a/server/compactor/src/main/java/org/apache/accumulo/compactor/Compactor.java b/server/compactor/src/main/java/org/apache/accumulo/compactor/Compactor.java index e03032326f3..b7426ce31bc 100644 --- a/server/compactor/src/main/java/org/apache/accumulo/compactor/Compactor.java +++ b/server/compactor/src/main/java/org/apache/accumulo/compactor/Compactor.java @@ -987,4 +987,10 @@ public String getRunningCompactionId(TInfo tinfo, TCredentials credentials) return eci.canonical(); } } + + @Override + public ServiceLock getLock() { + return compactorLock; + } + } diff --git a/server/gc/src/main/java/org/apache/accumulo/gc/SimpleGarbageCollector.java b/server/gc/src/main/java/org/apache/accumulo/gc/SimpleGarbageCollector.java index ce0e8682bb1..bd78388836e 100644 --- a/server/gc/src/main/java/org/apache/accumulo/gc/SimpleGarbageCollector.java +++ b/server/gc/src/main/java/org/apache/accumulo/gc/SimpleGarbageCollector.java @@ -78,6 +78,7 @@ public class SimpleGarbageCollector extends AbstractServer implements Iface { new GCStatus(new GcCycleStats(), new GcCycleStats(), new GcCycleStats(), new GcCycleStats()); private final GcCycleMetrics gcCycleMetrics = new GcCycleMetrics(); + private ServiceLock gcLock; SimpleGarbageCollector(ServerOpts opts, String[] args) { super("gc", opts, args); @@ -379,10 +380,9 @@ public void unableToMonitorLockNode(final Exception e) { }; UUID zooLockUUID = UUID.randomUUID(); - ServiceLock lock = - new ServiceLock(getContext().getZooReaderWriter().getZooKeeper(), path, zooLockUUID); + gcLock = new ServiceLock(getContext().getZooReaderWriter().getZooKeeper(), path, zooLockUUID); while (true) { - if (lock.tryLock(lockWatcher, + if (gcLock.tryLock(lockWatcher, new ServerServices(addr.toString(), Service.GC_CLIENT).toString().getBytes(UTF_8))) { log.debug("Got GC ZooKeeper lock"); return; @@ -439,4 +439,9 @@ public GcCycleMetrics getGcCycleMetrics() { return gcCycleMetrics; } + @Override + public ServiceLock getLock() { + return gcLock; + } + } diff --git a/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java b/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java index 55255751531..a3540fa062a 100644 --- a/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java +++ b/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java @@ -1714,6 +1714,7 @@ private void getManagerLock(final ServiceLockPath zManagerLoc) managerLockWatcher.waitForChange(); if (managerLockWatcher.acquiredLock) { + startServiceLockVerificationThread(); break; } @@ -2001,4 +2002,10 @@ void getAssignments(SortedMap currentStatus, assignedOut); tabletBalancer.getAssignments(params); } + + @Override + public ServiceLock getLock() { + return managerLock; + } + } diff --git a/server/monitor/src/main/java/org/apache/accumulo/monitor/Monitor.java b/server/monitor/src/main/java/org/apache/accumulo/monitor/Monitor.java index 67fc46015d6..4108282bcb3 100644 --- a/server/monitor/src/main/java/org/apache/accumulo/monitor/Monitor.java +++ b/server/monitor/src/main/java/org/apache/accumulo/monitor/Monitor.java @@ -1048,4 +1048,9 @@ public Optional getCoordinatorHost() { public int getLivePort() { return livePort; } + + @Override + public ServiceLock getLock() { + return monitorLock; + } } diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java index 1df158cf013..54764b86725 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java @@ -483,7 +483,8 @@ private void splitTablet(Tablet tablet) { } } - TreeMap splitTablet(Tablet tablet, byte[] splitPoint) throws IOException { + protected TreeMap splitTablet(Tablet tablet, byte[] splitPoint) + throws IOException { long t1 = System.currentTimeMillis(); TreeMap tabletInfo = tablet.split(splitPoint); @@ -708,6 +709,7 @@ public void unableToMonitorLockNode(final Exception e) { lockSessionId = tabletServerLock.getSessionId(); log.debug("Obtained tablet server lock {} {}", tabletServerLock.getLockPath(), getTabletSession()); + startServiceLockVerificationThread(); return; } log.info("Waiting for tablet server lock"); diff --git a/test/src/main/java/org/apache/accumulo/test/fate/zookeeper/ServiceLockIT.java b/test/src/main/java/org/apache/accumulo/test/fate/zookeeper/ServiceLockIT.java index 343f07e72a0..75ba3d6c36c 100644 --- a/test/src/main/java/org/apache/accumulo/test/fate/zookeeper/ServiceLockIT.java +++ b/test/src/main/java/org/apache/accumulo/test/fate/zookeeper/ServiceLockIT.java @@ -217,6 +217,7 @@ public void testDeleteParent() throws Exception { ServiceLock zl = getZooLock(parent, UUID.randomUUID()); assertFalse(zl.isLocked()); + assertFalse(zl.verifyLockAtSource()); ZooReaderWriter zk = szk.getZooReaderWriter(); @@ -235,10 +236,12 @@ public void testDeleteParent() throws Exception { assertTrue(lw.locked); assertTrue(zl.isLocked()); + assertTrue(zl.verifyLockAtSource()); assertNull(lw.exception); assertNull(lw.reason); zl.unlock(); + assertFalse(zl.verifyLockAtSource()); } @Test @@ -250,6 +253,7 @@ public void testNoParent() throws Exception { ServiceLock zl = getZooLock(parent, UUID.randomUUID()); assertFalse(zl.isLocked()); + assertFalse(zl.verifyLockAtSource()); TestALW lw = new TestALW(); @@ -259,6 +263,7 @@ public void testNoParent() throws Exception { assertFalse(lw.locked); assertFalse(zl.isLocked()); + assertFalse(zl.verifyLockAtSource()); assertNotNull(lw.exception); assertNull(lw.reason); } @@ -275,6 +280,7 @@ public void testDeleteLock() throws Exception { ServiceLock zl = getZooLock(parent, UUID.randomUUID()); assertFalse(zl.isLocked()); + assertFalse(zl.verifyLockAtSource()); TestALW lw = new TestALW(); @@ -284,6 +290,7 @@ public void testDeleteLock() throws Exception { assertTrue(lw.locked); assertTrue(zl.isLocked()); + assertTrue(zl.verifyLockAtSource()); assertNull(lw.exception); assertNull(lw.reason); @@ -293,7 +300,7 @@ public void testDeleteLock() throws Exception { assertEquals(LockLossReason.LOCK_DELETED, lw.reason); assertNull(lw.exception); - + assertFalse(zl.verifyLockAtSource()); } @Test @@ -308,6 +315,7 @@ public void testDeleteWaiting() throws Exception { ServiceLock zl = getZooLock(parent, UUID.randomUUID()); assertFalse(zl.isLocked()); + assertFalse(zl.verifyLockAtSource()); TestALW lw = new TestALW(); @@ -317,6 +325,7 @@ public void testDeleteWaiting() throws Exception { assertTrue(lw.locked); assertTrue(zl.isLocked()); + assertTrue(zl.verifyLockAtSource()); assertNull(lw.exception); assertNull(lw.reason); @@ -328,6 +337,7 @@ public void testDeleteWaiting() throws Exception { assertFalse(lw2.locked); assertFalse(zl2.isLocked()); + assertFalse(zl2.verifyLockAtSource()); ServiceLock zl3 = getZooLock(parent, UUID.randomUUID()); @@ -356,10 +366,12 @@ public void testDeleteWaiting() throws Exception { assertTrue(lw3.locked); assertTrue(zl3.isLocked()); + assertTrue(zl3.verifyLockAtSource()); assertNull(lw3.exception); assertNull(lw3.reason); zl3.unlock(); + assertFalse(zl3.verifyLockAtSource()); } @@ -380,6 +392,7 @@ public void testUnexpectedEvent() throws Exception { ServiceLock zl = getZooLock(parent, UUID.randomUUID()); assertFalse(zl.isLocked()); + assertFalse(zl.verifyLockAtSource()); // would not expect data to be set on this node, but it should not cause problems..... zk.setData(parent.toString(), "foo".getBytes(UTF_8), -1); @@ -392,6 +405,7 @@ public void testUnexpectedEvent() throws Exception { assertTrue(lw.locked); assertTrue(zl.isLocked()); + assertTrue(zl.verifyLockAtSource()); assertNull(lw.exception); assertNull(lw.reason); @@ -404,6 +418,7 @@ public void testUnexpectedEvent() throws Exception { assertEquals(LockLossReason.LOCK_DELETED, lw.reason); assertNull(lw.exception); + assertFalse(zl.verifyLockAtSource()); } } @@ -476,8 +491,13 @@ public void testLockSerial() throws Exception { assertEquals("/zlretryLockSerial/zlock#00000000-0000-0000-0000-aaaaaaaaaaaa#0000000000", zl2.getWatching()); + assertTrue(zl1.verifyLockAtSource()); + assertFalse(zl2.verifyLockAtSource()); + zl1.unlock(); assertFalse(zlw1.isLockHeld()); + assertFalse(zl1.verifyLockAtSource()); + assertFalse(zl2.verifyLockAtSource()); zk1.close(); while (!zlw2.isLockHeld()) { @@ -485,7 +505,9 @@ public void testLockSerial() throws Exception { } assertTrue(zlw2.isLockHeld()); + assertTrue(zl2.verifyLockAtSource()); zl2.unlock(); + assertFalse(zl2.verifyLockAtSource()); } } diff --git a/test/src/main/java/org/apache/accumulo/test/functional/HalfDeadServerWatcherIT.java b/test/src/main/java/org/apache/accumulo/test/functional/HalfDeadServerWatcherIT.java new file mode 100644 index 00000000000..b0b81623484 --- /dev/null +++ b/test/src/main/java/org/apache/accumulo/test/functional/HalfDeadServerWatcherIT.java @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.accumulo.test.functional; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + +import java.io.IOException; +import java.util.List; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.concurrent.atomic.AtomicBoolean; + +import org.apache.accumulo.core.Constants; +import org.apache.accumulo.core.client.Accumulo; +import org.apache.accumulo.core.client.AccumuloClient; +import org.apache.accumulo.core.client.AccumuloException; +import org.apache.accumulo.core.conf.Property; +import org.apache.accumulo.core.data.TableId; +import org.apache.accumulo.core.dataImpl.KeyExtent; +import org.apache.accumulo.core.fate.zookeeper.ZooUtil.NodeMissingPolicy; +import org.apache.accumulo.core.util.UtilWaitThread; +import org.apache.accumulo.harness.AccumuloClusterHarness; +import org.apache.accumulo.minicluster.ServerType; +import org.apache.accumulo.miniclusterImpl.MiniAccumuloConfigImpl; +import org.apache.accumulo.server.ServerContext; +import org.apache.accumulo.server.ServerOpts; +import org.apache.accumulo.test.util.Wait; +import org.apache.accumulo.tserver.TabletServer; +import org.apache.accumulo.tserver.tablet.Tablet; +import org.apache.accumulo.tserver.tablet.TabletData; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.zookeeper.KeeperException; +import org.apache.zookeeper.WatchedEvent; +import org.apache.zookeeper.Watcher; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Test that validates that the TabletServer will be terminated when the lock is removed in + * ZooKeeper, but a Watcher in the TabletServer is preventing the LockWatcher to be invoked. + */ +public class HalfDeadServerWatcherIT extends AccumuloClusterHarness { + + public static class HalfDeadTabletServer extends TabletServer { + + private static final Logger LOG = LoggerFactory.getLogger(HalfDeadTabletServer.class); + + public static void main(String[] args) throws Exception { + try (HalfDeadTabletServer tserver = new HalfDeadTabletServer(new ServerOpts(), args)) { + tserver.runServer(); + } + } + + public static class StuckWatcher implements Watcher { + private static final Logger LOG = LoggerFactory.getLogger(StuckWatcher.class); + + @Override + public void process(WatchedEvent event) { + LOG.info("started sleeping..."); + while (true) { + LOG.info("still sleeping..."); + UtilWaitThread.sleep(2000); + } + } + + } + + protected HalfDeadTabletServer(ServerOpts opts, String[] args) { + super(opts, args); + } + + @Override + protected TreeMap splitTablet(Tablet tablet, byte[] splitPoint) + throws IOException { + LOG.info("In HalfDeadServerWatcherIT::splitTablet"); + TreeMap results = super.splitTablet(tablet, splitPoint); + if (!tablet.getExtent().isMeta()) { + final TableId tid = tablet.getExtent().tableId(); + final String zooRoot = this.getContext().getZooKeeperRoot(); + final String tableZPath = zooRoot + Constants.ZTABLES + "/" + tid.canonical(); + try { + this.getContext().getZooReaderWriter().exists(tableZPath, new StuckWatcher()); + } catch (KeeperException | InterruptedException e) { + LOG.error("Error setting watch at: {}", tableZPath, e); + } + LOG.info("Set StuckWatcher at: {}", tableZPath); + } + return results; + } + } + + private static final AtomicBoolean USE_VERIFICATION_THREAD = new AtomicBoolean(false); + + @Override + public void configureMiniCluster(MiniAccumuloConfigImpl cfg, Configuration hadoopCoreSite) { + if (USE_VERIFICATION_THREAD.get()) { + cfg.setProperty(Property.GENERAL_SERVER_LOCK_VERIFICATION_INTERVAL, "10s"); + } else { + cfg.setProperty(Property.GENERAL_SERVER_LOCK_VERIFICATION_INTERVAL, "0"); + } + cfg.setServerClass(ServerType.TABLET_SERVER, HalfDeadTabletServer.class); + cfg.setNumCompactors(0); + cfg.setNumScanServers(0); + cfg.setNumTservers(1); + } + + @AfterEach + public void afterTest() throws Exception { + getCluster().getClusterControl().stopAllServers(ServerType.TABLET_SERVER); + super.teardownCluster(); + USE_VERIFICATION_THREAD.set(!USE_VERIFICATION_THREAD.get()); + } + + @Test + public void testOne() throws Exception { + if (USE_VERIFICATION_THREAD.get()) { + // This test should use the verification thread, which should + // end the TabletServer, throw an Exception on the ping call, + // and return true + assertTrue(testTabletServerWithStuckWatcherDies()); + } else { + // This test should time out + IllegalStateException e = + assertThrows(IllegalStateException.class, () -> testTabletServerWithStuckWatcherDies()); + assertTrue(e.getMessage().contains("Timeout exceeded")); + } + } + + @Test + public void testTwo() throws Exception { + if (USE_VERIFICATION_THREAD.get()) { + // This test should use the verification thread, which should + // end the TabletServer, throw an Exception on the ping call, + // and return true + assertTrue(testTabletServerWithStuckWatcherDies()); + } else { + // This test should time out + IllegalStateException e = + assertThrows(IllegalStateException.class, () -> testTabletServerWithStuckWatcherDies()); + assertTrue(e.getMessage().contains("Timeout exceeded")); + } + } + + public boolean testTabletServerWithStuckWatcherDies() throws Exception { + try (AccumuloClient client = Accumulo.newClient().from(getClientProps()).build()) { + String tableName = getUniqueNames(1)[0]; + client.tableOperations().create(tableName); + + // add splits to the table, which should set a StuckWatcher on the table node in zookeeper + TreeSet splits = new TreeSet<>(); + splits.add(new Text("j")); + splits.add(new Text("t")); + client.tableOperations().addSplits(tableName, splits); + + // delete the table, which should invoke the watcher + client.tableOperations().delete(tableName); + + final List tservers = client.instanceOperations().getTabletServers(); + assertEquals(1, tservers.size()); + + // Delete the lock for the TabletServer + final ServerContext ctx = getServerContext(); + final String zooRoot = ctx.getZooKeeperRoot(); + ctx.getZooReaderWriter().recursiveDelete( + zooRoot + Constants.ZTSERVERS + "/" + tservers.get(0), NodeMissingPolicy.FAIL); + + Wait.waitFor(() -> pingServer(client, tservers.get(0)) == false, 60_000); + return true; + } + + } + + private boolean pingServer(AccumuloClient client, String server) { + final boolean lockVerificationThreadInUse = USE_VERIFICATION_THREAD.get(); + try { + client.instanceOperations().ping(server); + return true; + } catch (AccumuloException e) { + if (lockVerificationThreadInUse) { + // If the lock verification thread is in use, the the TabletServer + // should shut down and the call to ping will throw an Exception + return false; + } else { + // With the lock verification thread disabled, the StuckWatcher + // should prevent the TabletServer from shutting down during + // this test method. + fail("TabletServer unexpectedly shut down"); + return false; + } + } + + } + +}