Skip to content

Commit

Permalink
Merge pull request #3483 from mapfish/deadlock_healthcheck
Browse files Browse the repository at this point in the history
Add deadlock healthcheck
  • Loading branch information
sebr72 authored Oct 24, 2024
2 parents b75cbed + bb8673c commit 8caf357
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 32 deletions.
40 changes: 26 additions & 14 deletions core/src/main/java/org/mapfish/print/config/WorkingDirectories.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import javax.servlet.ServletContext;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.mapfish.print.metrics.UnhealthyCountersHealthCheck;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
Expand All @@ -27,14 +28,17 @@ public class WorkingDirectories {

@Autowired private ServletContext servletContext;

@Autowired private UnhealthyCountersHealthCheck unhealthyCountersHealthCheck;

public final File getWorking() {
return this.working;
}

/**
* Defines what is the root directory used to store every temporary files.
*
* <p>The given path can contain the pattern "{WEBAPP}" and it will replaced by the webapp name.
* <p>The given path can contain the pattern "{WEBAPP}" and it will be replaced by the webapp
* name.
*
* @param working The path
*/
Expand Down Expand Up @@ -110,7 +114,7 @@ public final void removeDirectory(final File directory) {

private void createIfMissing(final File directory, final String name) {
if (!directory.exists() && !directory.mkdirs()) {
if (!directory.exists()) { // Maybe somebody else created it in the mean time
if (!directory.exists()) { // Maybe somebody else created it in the meantime
throw new AssertionError(
"Unable to create working directory: '"
+ directory
Expand All @@ -135,18 +139,8 @@ public final File getBuildFileFor(
final String extension,
final Logger logger) {
final String configurationAbsolutePath = configuration.getDirectory().getPath();
final int prefixToConfiguration = configurationAbsolutePath.length() + 1;
final String parentDir = jasperFileXml.getAbsoluteFile().getParent();
final String relativePathToFile;
if (configurationAbsolutePath.equals(parentDir)) {
relativePathToFile = FilenameUtils.getBaseName(jasperFileXml.getName());
} else {
final String relativePathToContainingDirectory = parentDir.substring(prefixToConfiguration);
relativePathToFile =
relativePathToContainingDirectory
+ File.separator
+ FilenameUtils.getBaseName(jasperFileXml.getName());
}
final String relativePathToFile =
determineRelativePath(jasperFileXml, configurationAbsolutePath);

final File buildFile =
new File(getJasperCompilation(configuration), relativePathToFile + extension);
Expand All @@ -159,6 +153,22 @@ public final File getBuildFileFor(
return buildFile;
}

private String determineRelativePath(final File jasperXmlFile, final String configAbsolutePath) {
final int prefixToConfiguration = configAbsolutePath.length() + 1;
final String parentDir = jasperXmlFile.getAbsoluteFile().getParent();
final String relativePathToFile;
if (configAbsolutePath.equals(parentDir)) {
relativePathToFile = FilenameUtils.getBaseName(jasperXmlFile.getName());
} else {
final String relativePathToContainingDirectory = parentDir.substring(prefixToConfiguration);
relativePathToFile =
relativePathToContainingDirectory
+ File.separator
+ FilenameUtils.getBaseName(jasperXmlFile.getName());
}
return relativePathToFile;
}

/**
* Get the working directory for the configuration.
*
Expand Down Expand Up @@ -203,6 +213,8 @@ public void run() {
removeOldFiles(new File(System.getProperty("java.io.tmpdir")), "+~JF", this.maxAgeTaskDir);
} catch (Exception e) {
LOGGER.error("error running file clean-up task", e);
unhealthyCountersHealthCheck.recordUnhealthyProblem(
getClass().getSimpleName(), "errorRunningFileCleanUpTask");
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
package org.mapfish.print.metrics;

import com.codahale.metrics.health.jvm.ThreadDeadlockHealthCheck;
import javax.annotation.PostConstruct;
import org.springframework.beans.factory.annotation.Autowired;

public class HealthCheckingRegistry extends com.codahale.metrics.health.HealthCheckRegistry {
@Autowired private JobQueueHealthCheck jobQueueHealthCheck;
@Autowired private UnhealthyCountersHealthCheck unhealthyCountersHealthCheck;
private final ThreadDeadlockHealthCheck threadDeadlockHealthCheck =
new ThreadDeadlockHealthCheck();

@PostConstruct
public void registerHealthCheck() {
register("jobQueueStatus", jobQueueHealthCheck);
register("unhealthyCountersStatus", unhealthyCountersHealthCheck);
register("threadDeadlock", threadDeadlockHealthCheck);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,17 @@ protected HealthCheck.Result check() throws Exception {
public void recordUnhealthyCounter(final String counterName) {
unhealthyCounters.add(counterName);
}

/**
* Records an unhealthy problem by incrementing a counter for the given class and description.
* This method helps in tracking issues which might lead to the server being marked unhealthy.
*
* @param className the name of the class where the problem occurred
* @param description a brief description of the problem
*/
public void recordUnhealthyProblem(final String className, final String description) {
String counterName = MetricRegistry.name(className, description);
metricRegistry.counter(counterName).inc();
recordUnhealthyCounter(counterName);
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.mapfish.print.servlet.job;

import com.codahale.metrics.MetricRegistry;
import javax.annotation.Nonnull;
import org.hibernate.HibernateException;
import org.hibernate.Session;
Expand Down Expand Up @@ -85,11 +84,8 @@ protected void doInTransactionWithoutResult(@Nonnull final TransactionStatus sta
}
});
} catch (HibernateException ex) {
String name =
MetricRegistry.name(
getClass().getSimpleName(), "insertRecordInHibernateFailedThenSkiped");
metricRegistry.counter(name).inc();
unhealthyCountersHealthCheck.recordUnhealthyCounter(name);
unhealthyCountersHealthCheck.recordUnhealthyProblem(
getClass().getSimpleName(), "insertRecordInHibernateFailedThenSkipped");
LOGGER.error("Cannot save accounting information", ex);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import org.mapfish.print.ExceptionUtils;
import org.mapfish.print.PrintException;
import org.mapfish.print.config.WorkingDirectories;
import org.mapfish.print.metrics.UnhealthyCountersHealthCheck;
import org.mapfish.print.servlet.job.JobManager;
import org.mapfish.print.servlet.job.JobQueue;
import org.mapfish.print.servlet.job.NoSuchReferenceException;
Expand Down Expand Up @@ -73,7 +74,7 @@ public class ThreadPoolJobManager implements JobManager {
private int maxNumberOfWaitingJobs = DEFAULT_MAX_WAITING_JOBS;

/** The amount of time to let a thread wait before being shutdown. */
private long maxIdleTime = DEFAULT_THREAD_IDLE_TIME;
private final long maxIdleTime = DEFAULT_THREAD_IDLE_TIME;

/** A print job is canceled, if it is not completed after this amount of time (in seconds). */
private long timeout = DEFAULT_TIMEOUT_IN_SECONDS;
Expand Down Expand Up @@ -118,6 +119,8 @@ public class ThreadPoolJobManager implements JobManager {

@Autowired private MetricRegistry metricRegistry;

@Autowired private UnhealthyCountersHealthCheck unhealthyCountersHealthCheck;

private boolean requestedToStop = false;
private Date lastExecutedJobTimestamp;

Expand Down Expand Up @@ -211,7 +214,7 @@ public final void init() {
}
return 0;
});
/* The ThreadPoolExecutor uses a unbounded queue (though we are enforcing a limit in `submit()`).
/* The ThreadPoolExecutor uses an unbounded queue (though we are enforcing a limit in `submit()`).
* Because of that, the executor creates only `corePoolSize` threads. But to use all threads,
* we set both `corePoolSize` and `maximumPoolSize` to `maxNumberOfRunningPrintJobs`. As a
* consequence, the `maxIdleTime` will be ignored, idle threads will not be terminated.
Expand Down Expand Up @@ -239,11 +242,15 @@ protected void beforeExecute(final Thread t, final Runnable runnable) {
ThreadPoolJobManager.this.jobQueue.start(printJob.getEntry().getReferenceId());
} catch (RuntimeException e) {
LOGGER.error("failed to mark job as running", e);
unhealthyCountersHealthCheck.recordUnhealthyProblem(
ThreadPoolJobManager.class.getSimpleName(), "failedToMarkJobAsRunning");
} catch (NoSuchReferenceException e) {
LOGGER.error(
"tried to mark non-existing job as 'running': {}",
printJob.getEntry().getReferenceId(),
e);
unhealthyCountersHealthCheck.recordUnhealthyProblem(
ThreadPoolJobManager.class.getSimpleName(), "couldNotMarkNonExistingJob");
}
}
}
Expand Down Expand Up @@ -573,7 +580,7 @@ public Callable<V> getCallable() {
}

/**
* This timer task changes the status of finished jobs in the registry. Also it stops jobs that
* This timer task changes the status of finished jobs in the registry. Also, it stops jobs that
* have been running for too long (timeout).
*/
@VisibleForTesting
Expand Down Expand Up @@ -628,8 +635,13 @@ public void run() {
}
} catch (javax.persistence.PessimisticLockException e) {
// Ignore error on pessimistic locking
unhealthyCountersHealthCheck.recordUnhealthyProblem(
getClass().getSimpleName(), "ignoredPessimisticLockIssue");
LOGGER.warn("Ignored Pessimistic Lock Issue", e);
} catch (RuntimeException t) {
LOGGER.error("Error while polling/updating registry", t);
unhealthyCountersHealthCheck.recordUnhealthyProblem(
getClass().getSimpleName(), "pollingRegistryIssue");
}
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,39 +1,48 @@
package org.mapfish.print.servlet.job.impl.hibernate;

import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import javax.annotation.PostConstruct;
import javax.annotation.PreDestroy;
import org.mapfish.print.metrics.UnhealthyCountersHealthCheck;
import org.mapfish.print.servlet.job.JobQueue;
import org.mapfish.print.servlet.job.NoSuchReferenceException;
import org.mapfish.print.servlet.job.PrintJobEntry;
import org.mapfish.print.servlet.job.PrintJobResult;
import org.mapfish.print.servlet.job.PrintJobStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.transaction.PlatformTransactionManager;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.transaction.support.TransactionTemplate;

/** Db Job Manager. */
/** Transfers request to the Job dao. */
@Transactional
public class HibernateJobQueue implements JobQueue {
private static final int DEFAULT_TIME_TO_KEEP_AFTER_ACCESS = 30; /* minutes */

private static final long DEFAULT_CLEAN_UP_INTERVAL = 300; /* seconds */
private static final TimeUnit CLEAN_UP_INTERVAL_TIME_UNIT = TimeUnit.SECONDS;

private static final Logger LOGGER = LoggerFactory.getLogger(HibernateJobQueue.class);

@Autowired private PrintJobDao dao;

@Autowired private PlatformTransactionManager txManager;

@Autowired private MetricRegistry metricRegistry;

@Autowired private UnhealthyCountersHealthCheck unhealthyCountersHealthCheck;

private ScheduledExecutorService cleanUpTimer;

/** The interval at which old records are deleted (in seconds). */
private long cleanupInterval = DEFAULT_CLEAN_UP_INTERVAL;
private final long cleanupInterval = DEFAULT_CLEAN_UP_INTERVAL;

/** The minimum time to keep records after last access. */
private int timeToKeepAfterAccessInMinutes = DEFAULT_TIME_TO_KEEP_AFTER_ACCESS;
Expand Down Expand Up @@ -198,7 +207,7 @@ public final void init() {
return thread;
});
this.cleanUpTimer.scheduleAtFixedRate(
this::cleanup, this.cleanupInterval, this.cleanupInterval, TimeUnit.SECONDS);
this::cleanup, cleanupInterval, cleanupInterval, CLEAN_UP_INTERVAL_TIME_UNIT);
}

/** Called by spring when application context is being destroyed. */
Expand All @@ -209,11 +218,28 @@ public final void shutdown() {

private void cleanup() {
TransactionTemplate tmpl = new TransactionTemplate(this.txManager);
final int nbDeleted =
tmpl.execute(
status ->
HibernateJobQueue.this.dao.deleteOld(
System.currentTimeMillis() - getTimeToKeepAfterAccessInMillis()));
metricRegistry.counter(getClass().getName() + ".deleted_old").inc(nbDeleted);
String name = MetricRegistry.name(getClass().getSimpleName(), "deletedOldPrintJobs");
try (Timer.Context deletion = metricRegistry.timer(name).time()) {
final Integer nbDeleted =
tmpl.execute(
status -> {
try {
return HibernateJobQueue.this.dao.deleteOld(
System.currentTimeMillis() - getTimeToKeepAfterAccessInMillis());
} catch (Exception e) {
unhealthyCountersHealthCheck.recordUnhealthyProblem(name, "issue");
return null;
}
});
if (deletion.stop() > CLEAN_UP_INTERVAL_TIME_UNIT.toNanos(cleanupInterval)) {
unhealthyCountersHealthCheck.recordUnhealthyProblem(name, "tooManyJobsToDeleteInInterval");
LOGGER.warn("Deleting Old Print Jobs took longer than the scheduled interval.");
}
if (nbDeleted != null && nbDeleted > 0) {
// This counter counts the number of deleted old job prints by this server instance.
// But it is not the grand total of deleted jobs in the database.
metricRegistry.counter(MetricRegistry.name(name, "totalCount")).inc(nbDeleted);
}
}
}
}

0 comments on commit 8caf357

Please sign in to comment.