From 0517ee0b670c53eb1813506b001bf4e2ed7dc6f4 Mon Sep 17 00:00:00 2001 From: sebr72 Date: Tue, 15 Oct 2024 11:12:56 +0200 Subject: [PATCH 1/6] Add deadlock healthcheck --- .../org/mapfish/print/metrics/HealthCheckingRegistry.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/core/src/main/java/org/mapfish/print/metrics/HealthCheckingRegistry.java b/core/src/main/java/org/mapfish/print/metrics/HealthCheckingRegistry.java index 468a17736..ed5ac7b1e 100644 --- a/core/src/main/java/org/mapfish/print/metrics/HealthCheckingRegistry.java +++ b/core/src/main/java/org/mapfish/print/metrics/HealthCheckingRegistry.java @@ -1,15 +1,19 @@ package org.mapfish.print.metrics; +import com.codahale.metrics.health.jvm.ThreadDeadlockHealthCheck; import javax.annotation.PostConstruct; import org.springframework.beans.factory.annotation.Autowired; public class HealthCheckingRegistry extends com.codahale.metrics.health.HealthCheckRegistry { @Autowired private JobQueueHealthCheck jobQueueHealthCheck; @Autowired private UnhealthyCountersHealthCheck unhealthyCountersHealthCheck; + private final ThreadDeadlockHealthCheck threadDeadlockHealthCheck = + new ThreadDeadlockHealthCheck(); @PostConstruct public void registerHealthCheck() { register("jobQueueStatus", jobQueueHealthCheck); register("unhealthyCountersStatus", unhealthyCountersHealthCheck); + register("threadDeadlock", threadDeadlockHealthCheck); } } From 9890288de9d7a91c939b8d9e6153eb91c9ce53a2 Mon Sep 17 00:00:00 2001 From: sebr72 Date: Tue, 15 Oct 2024 22:18:19 +0200 Subject: [PATCH 2/6] Record when deleting old print jobs failed with hidden exceptions --- .../job/impl/hibernate/HibernateJobQueue.java | 41 +++++++++++++++---- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/core/src/main/java/org/mapfish/print/servlet/job/impl/hibernate/HibernateJobQueue.java b/core/src/main/java/org/mapfish/print/servlet/job/impl/hibernate/HibernateJobQueue.java index f149cd5c7..fa44ba563 100644 --- a/core/src/main/java/org/mapfish/print/servlet/job/impl/hibernate/HibernateJobQueue.java +++ b/core/src/main/java/org/mapfish/print/servlet/job/impl/hibernate/HibernateJobQueue.java @@ -1,12 +1,14 @@ package org.mapfish.print.servlet.job.impl.hibernate; import com.codahale.metrics.MetricRegistry; +import com.codahale.metrics.Timer; import java.util.List; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import javax.annotation.PostConstruct; import javax.annotation.PreDestroy; +import org.mapfish.print.metrics.UnhealthyCountersHealthCheck; import org.mapfish.print.servlet.job.JobQueue; import org.mapfish.print.servlet.job.NoSuchReferenceException; import org.mapfish.print.servlet.job.PrintJobEntry; @@ -17,12 +19,13 @@ import org.springframework.transaction.annotation.Transactional; import org.springframework.transaction.support.TransactionTemplate; -/** Db Job Manager. */ +/** Transfers request to the Job dao. */ @Transactional public class HibernateJobQueue implements JobQueue { private static final int DEFAULT_TIME_TO_KEEP_AFTER_ACCESS = 30; /* minutes */ private static final long DEFAULT_CLEAN_UP_INTERVAL = 300; /* seconds */ + private static final TimeUnit CLEAN_UP_INTERVAL_TIME_UNIT = TimeUnit.SECONDS; @Autowired private PrintJobDao dao; @@ -30,10 +33,12 @@ public class HibernateJobQueue implements JobQueue { @Autowired private MetricRegistry metricRegistry; + @Autowired private UnhealthyCountersHealthCheck unhealthyCountersHealthCheck; + private ScheduledExecutorService cleanUpTimer; /** The interval at which old records are deleted (in seconds). */ - private long cleanupInterval = DEFAULT_CLEAN_UP_INTERVAL; + private final long cleanupInterval = DEFAULT_CLEAN_UP_INTERVAL; /** The minimum time to keep records after last access. */ private int timeToKeepAfterAccessInMinutes = DEFAULT_TIME_TO_KEEP_AFTER_ACCESS; @@ -198,7 +203,7 @@ public final void init() { return thread; }); this.cleanUpTimer.scheduleAtFixedRate( - this::cleanup, this.cleanupInterval, this.cleanupInterval, TimeUnit.SECONDS); + this::cleanup, cleanupInterval, cleanupInterval, CLEAN_UP_INTERVAL_TIME_UNIT); } /** Called by spring when application context is being destroyed. */ @@ -209,11 +214,29 @@ public final void shutdown() { private void cleanup() { TransactionTemplate tmpl = new TransactionTemplate(this.txManager); - final int nbDeleted = - tmpl.execute( - status -> - HibernateJobQueue.this.dao.deleteOld( - System.currentTimeMillis() - getTimeToKeepAfterAccessInMillis())); - metricRegistry.counter(getClass().getName() + ".deleted_old").inc(nbDeleted); + String name = MetricRegistry.name(getClass().getSimpleName(), "deletedOldPrintJobs"); + try (Timer.Context deletion = metricRegistry.timer(name).time()) { + final Integer nbDeleted = + tmpl.execute( + status -> { + try { + return HibernateJobQueue.this.dao.deleteOld( + System.currentTimeMillis() - getTimeToKeepAfterAccessInMillis()); + } catch (Exception e) { + String deleteFailed = MetricRegistry.name(name, "failedToDeleteOldJobs"); + metricRegistry.counter(deleteFailed).inc(); + unhealthyCountersHealthCheck.recordUnhealthyCounter(deleteFailed); + return null; + } + }); + if (deletion.stop() > CLEAN_UP_INTERVAL_TIME_UNIT.toNanos(cleanupInterval)) { + String serverOverloaded = MetricRegistry.name(name, "tooManyJobsToDeleteInInterval"); + metricRegistry.counter(serverOverloaded).inc(); + unhealthyCountersHealthCheck.recordUnhealthyCounter(serverOverloaded); + } + if (nbDeleted != null && nbDeleted > 0) { + metricRegistry.counter(name + ".totalDeleted").inc(nbDeleted); + } + } } } From 64a85da8ebcd31244581eccfe6d6e0be6e886fe6 Mon Sep 17 00:00:00 2001 From: sebr72 Date: Wed, 16 Oct 2024 00:14:31 +0200 Subject: [PATCH 3/6] Record unhealthy problems, that were only logged --- .../print/config/WorkingDirectories.java | 40 ++++++++++++------- .../metrics/UnhealthyCountersHealthCheck.java | 13 ++++++ .../job/impl/ThreadPoolJobManager.java | 17 ++++++-- 3 files changed, 53 insertions(+), 17 deletions(-) diff --git a/core/src/main/java/org/mapfish/print/config/WorkingDirectories.java b/core/src/main/java/org/mapfish/print/config/WorkingDirectories.java index 162cbacdd..ff001e3f8 100644 --- a/core/src/main/java/org/mapfish/print/config/WorkingDirectories.java +++ b/core/src/main/java/org/mapfish/print/config/WorkingDirectories.java @@ -10,6 +10,7 @@ import javax.servlet.ServletContext; import org.apache.commons.io.FileUtils; import org.apache.commons.io.FilenameUtils; +import org.mapfish.print.metrics.UnhealthyCountersHealthCheck; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; @@ -27,6 +28,8 @@ public class WorkingDirectories { @Autowired private ServletContext servletContext; + @Autowired private UnhealthyCountersHealthCheck unhealthyCountersHealthCheck; + public final File getWorking() { return this.working; } @@ -34,7 +37,8 @@ public final File getWorking() { /** * Defines what is the root directory used to store every temporary files. * - *

The given path can contain the pattern "{WEBAPP}" and it will replaced by the webapp name. + *

The given path can contain the pattern "{WEBAPP}" and it will be replaced by the webapp + * name. * * @param working The path */ @@ -110,7 +114,7 @@ public final void removeDirectory(final File directory) { private void createIfMissing(final File directory, final String name) { if (!directory.exists() && !directory.mkdirs()) { - if (!directory.exists()) { // Maybe somebody else created it in the mean time + if (!directory.exists()) { // Maybe somebody else created it in the meantime throw new AssertionError( "Unable to create working directory: '" + directory @@ -135,18 +139,8 @@ public final File getBuildFileFor( final String extension, final Logger logger) { final String configurationAbsolutePath = configuration.getDirectory().getPath(); - final int prefixToConfiguration = configurationAbsolutePath.length() + 1; - final String parentDir = jasperFileXml.getAbsoluteFile().getParent(); - final String relativePathToFile; - if (configurationAbsolutePath.equals(parentDir)) { - relativePathToFile = FilenameUtils.getBaseName(jasperFileXml.getName()); - } else { - final String relativePathToContainingDirectory = parentDir.substring(prefixToConfiguration); - relativePathToFile = - relativePathToContainingDirectory - + File.separator - + FilenameUtils.getBaseName(jasperFileXml.getName()); - } + final String relativePathToFile = + determineRelativePath(jasperFileXml, configurationAbsolutePath); final File buildFile = new File(getJasperCompilation(configuration), relativePathToFile + extension); @@ -159,6 +153,22 @@ public final File getBuildFileFor( return buildFile; } + private String determineRelativePath(final File jasperXmlFile, final String configAbsolutePath) { + final int prefixToConfiguration = configAbsolutePath.length() + 1; + final String parentDir = jasperXmlFile.getAbsoluteFile().getParent(); + final String relativePathToFile; + if (configAbsolutePath.equals(parentDir)) { + relativePathToFile = FilenameUtils.getBaseName(jasperXmlFile.getName()); + } else { + final String relativePathToContainingDirectory = parentDir.substring(prefixToConfiguration); + relativePathToFile = + relativePathToContainingDirectory + + File.separator + + FilenameUtils.getBaseName(jasperXmlFile.getName()); + } + return relativePathToFile; + } + /** * Get the working directory for the configuration. * @@ -203,6 +213,8 @@ public void run() { removeOldFiles(new File(System.getProperty("java.io.tmpdir")), "+~JF", this.maxAgeTaskDir); } catch (Exception e) { LOGGER.error("error running file clean-up task", e); + unhealthyCountersHealthCheck.recordUnhealthyProblem( + getClass().getSimpleName(), "errorRunningFileCleanUpTask"); } } diff --git a/core/src/main/java/org/mapfish/print/metrics/UnhealthyCountersHealthCheck.java b/core/src/main/java/org/mapfish/print/metrics/UnhealthyCountersHealthCheck.java index d273767b4..cc013f16a 100644 --- a/core/src/main/java/org/mapfish/print/metrics/UnhealthyCountersHealthCheck.java +++ b/core/src/main/java/org/mapfish/print/metrics/UnhealthyCountersHealthCheck.java @@ -46,4 +46,17 @@ protected HealthCheck.Result check() throws Exception { public void recordUnhealthyCounter(final String counterName) { unhealthyCounters.add(counterName); } + + /** + * Records an unhealthy problem by incrementing a counter for the given class and description. + * This method helps in tracking issues which might lead to the server being marked unhealthy. + * + * @param className the name of the class where the problem occurred + * @param description a brief description of the problem + */ + public void recordUnhealthyProblem(final String className, final String description) { + String counterName = MetricRegistry.name(className, description); + metricRegistry.counter(counterName).inc(); + recordUnhealthyCounter(counterName); + } } diff --git a/core/src/main/java/org/mapfish/print/servlet/job/impl/ThreadPoolJobManager.java b/core/src/main/java/org/mapfish/print/servlet/job/impl/ThreadPoolJobManager.java index fee1c3dd2..51cea7328 100644 --- a/core/src/main/java/org/mapfish/print/servlet/job/impl/ThreadPoolJobManager.java +++ b/core/src/main/java/org/mapfish/print/servlet/job/impl/ThreadPoolJobManager.java @@ -26,6 +26,7 @@ import org.mapfish.print.ExceptionUtils; import org.mapfish.print.PrintException; import org.mapfish.print.config.WorkingDirectories; +import org.mapfish.print.metrics.UnhealthyCountersHealthCheck; import org.mapfish.print.servlet.job.JobManager; import org.mapfish.print.servlet.job.JobQueue; import org.mapfish.print.servlet.job.NoSuchReferenceException; @@ -73,7 +74,7 @@ public class ThreadPoolJobManager implements JobManager { private int maxNumberOfWaitingJobs = DEFAULT_MAX_WAITING_JOBS; /** The amount of time to let a thread wait before being shutdown. */ - private long maxIdleTime = DEFAULT_THREAD_IDLE_TIME; + private final long maxIdleTime = DEFAULT_THREAD_IDLE_TIME; /** A print job is canceled, if it is not completed after this amount of time (in seconds). */ private long timeout = DEFAULT_TIMEOUT_IN_SECONDS; @@ -118,6 +119,8 @@ public class ThreadPoolJobManager implements JobManager { @Autowired private MetricRegistry metricRegistry; + @Autowired private UnhealthyCountersHealthCheck unhealthyCountersHealthCheck; + private boolean requestedToStop = false; private Date lastExecutedJobTimestamp; @@ -211,7 +214,7 @@ public final void init() { } return 0; }); - /* The ThreadPoolExecutor uses a unbounded queue (though we are enforcing a limit in `submit()`). + /* The ThreadPoolExecutor uses an unbounded queue (though we are enforcing a limit in `submit()`). * Because of that, the executor creates only `corePoolSize` threads. But to use all threads, * we set both `corePoolSize` and `maximumPoolSize` to `maxNumberOfRunningPrintJobs`. As a * consequence, the `maxIdleTime` will be ignored, idle threads will not be terminated. @@ -239,11 +242,15 @@ protected void beforeExecute(final Thread t, final Runnable runnable) { ThreadPoolJobManager.this.jobQueue.start(printJob.getEntry().getReferenceId()); } catch (RuntimeException e) { LOGGER.error("failed to mark job as running", e); + unhealthyCountersHealthCheck.recordUnhealthyProblem( + ThreadPoolJobManager.class.getSimpleName(), "failedToMarkJobAsRunning"); } catch (NoSuchReferenceException e) { LOGGER.error( "tried to mark non-existing job as 'running': {}", printJob.getEntry().getReferenceId(), e); + unhealthyCountersHealthCheck.recordUnhealthyProblem( + ThreadPoolJobManager.class.getSimpleName(), "couldNotMarkNonExistingJob"); } } } @@ -573,7 +580,7 @@ public Callable getCallable() { } /** - * This timer task changes the status of finished jobs in the registry. Also it stops jobs that + * This timer task changes the status of finished jobs in the registry. Also, it stops jobs that * have been running for too long (timeout). */ @VisibleForTesting @@ -628,8 +635,12 @@ public void run() { } } catch (javax.persistence.PessimisticLockException e) { // Ignore error on pessimistic locking + unhealthyCountersHealthCheck.recordUnhealthyProblem( + getClass().getSimpleName(), "ignorePessimisticLockException"); } catch (RuntimeException t) { LOGGER.error("Error while polling/updating registry", t); + unhealthyCountersHealthCheck.recordUnhealthyProblem( + getClass().getSimpleName(), "exceptionWhilePollingRegistry"); } } } From b2a69c26e7491f042ae2f4cce209a6e51f83cc2c Mon Sep 17 00:00:00 2001 From: sebr72 Date: Wed, 16 Oct 2024 09:05:51 +0200 Subject: [PATCH 4/6] Minor refactorings --- .../mapfish/print/servlet/job/HibernateAccounting.java | 8 ++------ .../servlet/job/impl/hibernate/HibernateJobQueue.java | 9 +++------ 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/core/src/main/java/org/mapfish/print/servlet/job/HibernateAccounting.java b/core/src/main/java/org/mapfish/print/servlet/job/HibernateAccounting.java index 79ff16420..964c57754 100644 --- a/core/src/main/java/org/mapfish/print/servlet/job/HibernateAccounting.java +++ b/core/src/main/java/org/mapfish/print/servlet/job/HibernateAccounting.java @@ -1,6 +1,5 @@ package org.mapfish.print.servlet.job; -import com.codahale.metrics.MetricRegistry; import javax.annotation.Nonnull; import org.hibernate.HibernateException; import org.hibernate.Session; @@ -85,11 +84,8 @@ protected void doInTransactionWithoutResult(@Nonnull final TransactionStatus sta } }); } catch (HibernateException ex) { - String name = - MetricRegistry.name( - getClass().getSimpleName(), "insertRecordInHibernateFailedThenSkiped"); - metricRegistry.counter(name).inc(); - unhealthyCountersHealthCheck.recordUnhealthyCounter(name); + unhealthyCountersHealthCheck.recordUnhealthyProblem( + getClass().getSimpleName(), "insertRecordInHibernateFailedThenSkipped"); LOGGER.error("Cannot save accounting information", ex); } } diff --git a/core/src/main/java/org/mapfish/print/servlet/job/impl/hibernate/HibernateJobQueue.java b/core/src/main/java/org/mapfish/print/servlet/job/impl/hibernate/HibernateJobQueue.java index fa44ba563..1d1102ce4 100644 --- a/core/src/main/java/org/mapfish/print/servlet/job/impl/hibernate/HibernateJobQueue.java +++ b/core/src/main/java/org/mapfish/print/servlet/job/impl/hibernate/HibernateJobQueue.java @@ -223,16 +223,13 @@ private void cleanup() { return HibernateJobQueue.this.dao.deleteOld( System.currentTimeMillis() - getTimeToKeepAfterAccessInMillis()); } catch (Exception e) { - String deleteFailed = MetricRegistry.name(name, "failedToDeleteOldJobs"); - metricRegistry.counter(deleteFailed).inc(); - unhealthyCountersHealthCheck.recordUnhealthyCounter(deleteFailed); + unhealthyCountersHealthCheck.recordUnhealthyProblem( + name, "failedToDeleteOldJobs"); return null; } }); if (deletion.stop() > CLEAN_UP_INTERVAL_TIME_UNIT.toNanos(cleanupInterval)) { - String serverOverloaded = MetricRegistry.name(name, "tooManyJobsToDeleteInInterval"); - metricRegistry.counter(serverOverloaded).inc(); - unhealthyCountersHealthCheck.recordUnhealthyCounter(serverOverloaded); + unhealthyCountersHealthCheck.recordUnhealthyProblem(name, "tooManyJobsToDeleteInInterval"); } if (nbDeleted != null && nbDeleted > 0) { metricRegistry.counter(name + ".totalDeleted").inc(nbDeleted); From 4b76654b4d9cfa74309071996923b79c3ffdf126 Mon Sep 17 00:00:00 2001 From: sebr72 Date: Fri, 18 Oct 2024 12:52:48 +0200 Subject: [PATCH 5/6] Improve counter names and add log --- .../print/servlet/job/impl/ThreadPoolJobManager.java | 5 +++-- .../job/impl/hibernate/HibernateJobQueue.java | 12 +++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/core/src/main/java/org/mapfish/print/servlet/job/impl/ThreadPoolJobManager.java b/core/src/main/java/org/mapfish/print/servlet/job/impl/ThreadPoolJobManager.java index 51cea7328..a6fa026e3 100644 --- a/core/src/main/java/org/mapfish/print/servlet/job/impl/ThreadPoolJobManager.java +++ b/core/src/main/java/org/mapfish/print/servlet/job/impl/ThreadPoolJobManager.java @@ -636,11 +636,12 @@ public void run() { } catch (javax.persistence.PessimisticLockException e) { // Ignore error on pessimistic locking unhealthyCountersHealthCheck.recordUnhealthyProblem( - getClass().getSimpleName(), "ignorePessimisticLockException"); + getClass().getSimpleName(), "ignoredPessimisticLockIssue"); + LOGGER.warn("Ignored Pessimistic Lock Issue", e); } catch (RuntimeException t) { LOGGER.error("Error while polling/updating registry", t); unhealthyCountersHealthCheck.recordUnhealthyProblem( - getClass().getSimpleName(), "exceptionWhilePollingRegistry"); + getClass().getSimpleName(), "pollingRegistryIssue"); } } } diff --git a/core/src/main/java/org/mapfish/print/servlet/job/impl/hibernate/HibernateJobQueue.java b/core/src/main/java/org/mapfish/print/servlet/job/impl/hibernate/HibernateJobQueue.java index 1d1102ce4..ae6908125 100644 --- a/core/src/main/java/org/mapfish/print/servlet/job/impl/hibernate/HibernateJobQueue.java +++ b/core/src/main/java/org/mapfish/print/servlet/job/impl/hibernate/HibernateJobQueue.java @@ -14,6 +14,8 @@ import org.mapfish.print.servlet.job.PrintJobEntry; import org.mapfish.print.servlet.job.PrintJobResult; import org.mapfish.print.servlet.job.PrintJobStatus; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.transaction.PlatformTransactionManager; import org.springframework.transaction.annotation.Transactional; @@ -27,6 +29,8 @@ public class HibernateJobQueue implements JobQueue { private static final long DEFAULT_CLEAN_UP_INTERVAL = 300; /* seconds */ private static final TimeUnit CLEAN_UP_INTERVAL_TIME_UNIT = TimeUnit.SECONDS; + private static final Logger LOGGER = LoggerFactory.getLogger(HibernateJobQueue.class); + @Autowired private PrintJobDao dao; @Autowired private PlatformTransactionManager txManager; @@ -223,16 +227,18 @@ private void cleanup() { return HibernateJobQueue.this.dao.deleteOld( System.currentTimeMillis() - getTimeToKeepAfterAccessInMillis()); } catch (Exception e) { - unhealthyCountersHealthCheck.recordUnhealthyProblem( - name, "failedToDeleteOldJobs"); + unhealthyCountersHealthCheck.recordUnhealthyProblem(name, "issue"); return null; } }); if (deletion.stop() > CLEAN_UP_INTERVAL_TIME_UNIT.toNanos(cleanupInterval)) { unhealthyCountersHealthCheck.recordUnhealthyProblem(name, "tooManyJobsToDeleteInInterval"); + LOGGER.warn("Deleting Old Print Jobs took longer than the scheduled interval."); } if (nbDeleted != null && nbDeleted > 0) { - metricRegistry.counter(name + ".totalDeleted").inc(nbDeleted); + metricRegistry + .counter(MetricRegistry.name(name, "totalCountByThisServerInstance")) + .inc(nbDeleted); } } } From bb8673c09d3d94c785fd735a30651249207f74c7 Mon Sep 17 00:00:00 2001 From: sebr72 Date: Thu, 24 Oct 2024 10:48:30 +0200 Subject: [PATCH 6/6] Update counter name and explain its role --- .../print/servlet/job/impl/hibernate/HibernateJobQueue.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/main/java/org/mapfish/print/servlet/job/impl/hibernate/HibernateJobQueue.java b/core/src/main/java/org/mapfish/print/servlet/job/impl/hibernate/HibernateJobQueue.java index ae6908125..84563a279 100644 --- a/core/src/main/java/org/mapfish/print/servlet/job/impl/hibernate/HibernateJobQueue.java +++ b/core/src/main/java/org/mapfish/print/servlet/job/impl/hibernate/HibernateJobQueue.java @@ -236,9 +236,9 @@ private void cleanup() { LOGGER.warn("Deleting Old Print Jobs took longer than the scheduled interval."); } if (nbDeleted != null && nbDeleted > 0) { - metricRegistry - .counter(MetricRegistry.name(name, "totalCountByThisServerInstance")) - .inc(nbDeleted); + // This counter counts the number of deleted old job prints by this server instance. + // But it is not the grand total of deleted jobs in the database. + metricRegistry.counter(MetricRegistry.name(name, "totalCount")).inc(nbDeleted); } } }