From 92ab49333229af4214a4b362b014f1ed0bfeef69 Mon Sep 17 00:00:00 2001 From: Haoning Sun Date: Tue, 4 Apr 2023 10:25:52 +0800 Subject: [PATCH 01/27] Add usage info for omit-mount-info ### What changes are proposed in this pull request? Add usage info for omit-mount-info. ### Why are the changes needed? Show usage for omit-mount-info parameter. pr-link: Alluxio/alluxio#17165 change-id: cid-4c8c41999a1c307d7a6b2f5209166751526dddb6 --- docs/en/operation/User-CLI.md | 1 + dora/shell/src/main/java/alluxio/cli/fs/command/LsCommand.java | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/en/operation/User-CLI.md b/docs/en/operation/User-CLI.md index 103b23b78a67..80fd1c16d8ab 100644 --- a/docs/en/operation/User-CLI.md +++ b/docs/en/operation/User-CLI.md @@ -1203,6 +1203,7 @@ By default, it loads metadata only at the first time at which a directory is lis * `-r` reverses the sorting order. * `--timestamp` display the timestamp of the given option. Possible values are creationTime, lastModificationTime, and lastAccessTime. The default option is lastModificationTime. +* `-m` option excludes mount point related information. For example, `ls` can be used to browse the file system. 
diff --git a/dora/shell/src/main/java/alluxio/cli/fs/command/LsCommand.java b/dora/shell/src/main/java/alluxio/cli/fs/command/LsCommand.java index 6b6cfc7d228d..ebc542fff2ab 100644 --- a/dora/shell/src/main/java/alluxio/cli/fs/command/LsCommand.java +++ b/dora/shell/src/main/java/alluxio/cli/fs/command/LsCommand.java @@ -332,7 +332,8 @@ public int run(CommandLine cl) throws AlluxioException, IOException { @Override public String getUsage() { - return "ls [-d|-f|-p|-R/--recursive|-h|--sort=option|--timestamp=option|-r] ..."; + return "ls [-d|-f|-p|-R/--recursive|-h|--sort=option|--timestamp=option|-r" + + "|-m/--omit-mount-info] ..."; } @Override From cf0ed22d21528051dbf38033c0d12dbbef523219 Mon Sep 17 00:00:00 2001 From: humengyu Date: Tue, 4 Apr 2023 10:26:20 +0800 Subject: [PATCH 02/27] [SMALLFIX] Delete duplicate words ### What changes are proposed in this pull request? Delete duplicate words. ### Why are the changes needed? Please clarify why the changes are needed. For instance, 1. If you propose a new API, clarify the use case for a new API. 2. If you fix a bug, describe the bug. ### Does this PR introduce any user facing changes? Please list the user-facing changes introduced by your change, including 1. change in user-facing APIs 2. addition or removal of property keys 3. 
webui pr-link: Alluxio/alluxio#17178 change-id: cid-06ee4c54d254030e5cb2f141e81fe23a9fe43326 --- .../src/main/java/alluxio/worker/block/CacheRequestManager.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dora/core/server/worker/src/main/java/alluxio/worker/block/CacheRequestManager.java b/dora/core/server/worker/src/main/java/alluxio/worker/block/CacheRequestManager.java index 911954bd27c5..464feabc2356 100644 --- a/dora/core/server/worker/src/main/java/alluxio/worker/block/CacheRequestManager.java +++ b/dora/core/server/worker/src/main/java/alluxio/worker/block/CacheRequestManager.java @@ -92,7 +92,7 @@ public void submitRequest(CacheRequest request) long blockId = request.getBlockId(); boolean async = request.getAsync(); if (mActiveCacheRequests.putIfAbsent(blockId, request) != null) { - // This block is already planned and just just return. + // This block is already planned and just return. if (async) { LOG.debug("request already planned: {}", request); } else { From 376a9874dcce48546e1f0d376e674d97be7a6677 Mon Sep 17 00:00:00 2001 From: jianghuazhu <740087514@qq.com> Date: Tue, 4 Apr 2023 10:26:53 +0800 Subject: [PATCH 03/27] [DOCFIX]Fix some bugs related to Caching.md ### What changes are proposed in this pull request? The purpose of this PR is to fix some bugs related to Caching.md ### Why are the changes needed? Fixing these errors will make Caching.md look cleaner. ### Does this PR introduce any user facing changes? Please list the user-facing changes introduced by your change, including 1. change in user-facing APIs 2. addition or removal of property keys 3. 
webui pr-link: Alluxio/alluxio#17187 change-id: cid-877b353109edfe228b434f2c594cb4db7be9428e --- docs/cn/core-services/Caching.md | 8 ++++++-- docs/en/core-services/Caching.md | 6 +++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/docs/cn/core-services/Caching.md b/docs/cn/core-services/Caching.md index 74279ffa46b6..643a0f3a5329 100644 --- a/docs/cn/core-services/Caching.md +++ b/docs/cn/core-services/Caching.md @@ -334,7 +334,7 @@ Alluxio支持命名空间中每个文件和目录的"生存时间(TTL)"设置。 则TTL功能可用于明确刷新旧数据,从而为新文件释放缓存空间。 Alluxio具有与每个文件或目录关联的TTL属性。这些属性将保存为 -日志的一部分,所以集群重新后也能持久保持。活跃master节点负责 +日志的一部分,所以集群重启后也能持久保持。活跃master节点负责 当Alluxio提供服务时将元数据保存在内存中。在内部,master运行一个后台 线程,该线程定期检查文件是否已达到其TTL到期时间。 @@ -478,8 +478,12 @@ Alluxio cluster summary: Started: 09-28-2018 12:52:09:486 Uptime: 0 day(s), 0 hour(s), 0 minute(s), and 26 second(s) Version: 2.0.0 - Safe Mode: true + Safe Mode: false Zookeeper Enabled: false + Raft-based Journal: true + Raft Journal Addresses: + localhost:19200 + localhost:19201 Live Workers: 1 Lost Workers: 0 Total Capacity: 10.67GB diff --git a/docs/en/core-services/Caching.md b/docs/en/core-services/Caching.md index 2b008e00c266..b381dae1aa03 100644 --- a/docs/en/core-services/Caching.md +++ b/docs/en/core-services/Caching.md @@ -657,8 +657,12 @@ Alluxio cluster summary: Started: 09-28-2018 12:52:09:486 Uptime: 0 day(s), 0 hour(s), 0 minute(s), and 26 second(s) Version: 2.0.0 - Safe Mode: true + Safe Mode: false Zookeeper Enabled: false + Raft-based Journal: true + Raft Journal Addresses: + localhost:19200 + localhost:19201 Live Workers: 1 Lost Workers: 0 Total Capacity: 10.67GB From 531efefd528f769ef85b5671c7b631f1762b87a5 Mon Sep 17 00:00:00 2001 From: humengyu Date: Wed, 5 Apr 2023 02:08:41 +0800 Subject: [PATCH 04/27] Print exception stack in s3 proxy ### What changes are proposed in this pull request? Print exception stack for s3 proxy. ### Why are the changes needed? 
Sometimes, the message of exception is null, and we need print stack to find where the exception occurred. Before add exception stack: ``` 2023-03-24 14:47:50,164 INFO ProxyWebServer - [ACCESSLOG] ListObjects Request:Request[GET //localhost:39999/api/v1/s3/s3/?prefix=user%2Fhumengyu%2Fword1&encoding-type=url]@54efde7a - Status:500 - ContentLength:None - Elapsed(ms):561 2023-03-24 14:47:50,325 WARN S3RestUtils - Error invoking REST endpoint for s3: null 2023-03-24 14:47:50,328 INFO ProxyWebServer - [ACCESSLOG] ListObjects Request:Request[GET //localhost:39999/api/v1/s3/s3/?prefix=user%2Fhumengyu%2Fword1&encoding-type=url]@54efde7a - Status:500 - ContentLength:None - Elapsed(ms):18 2023-03-24 14:47:50,469 WARN S3RestUtils - Error invoking REST endpoint for s3: null 2023-03-24 14:47:50,472 INFO ProxyWebServer - [ACCESSLOG] ListObjects Request:Request[GET //localhost:39999/api/v1/s3/s3/?prefix=user%2Fhumengyu%2Fword1&encoding-type=url]@54efde7a - Status:500 - ContentLength:None - Elapsed(ms):16 2023-03-24 14:47:50,828 WARN S3RestUtils - Error invoking REST endpoint for s3: null 2023-03-24 14:47:50,830 INFO ProxyWebServer - [ACCESSLOG] ListObjects Request:Request[GET //localhost:39999/api/v1/s3/s3/?prefix=user%2Fhumengyu%2Fword1&encoding-type=url]@54efde7a - Status:500 - ContentLength:None - Elapsed(ms):14 ``` After add exception stack: ``` 2023-03-24 14:54:41,470 WARN S3RestUtils - Error invoking REST endpoint for s3: null alluxio.proxy.s3.S3Exception at alluxio.proxy.s3.S3RestUtils.toBucketS3Exception(S3RestUtils.java:214) at alluxio.proxy.s3.S3BucketTask$ListObjectsTask.lambda$continueTask$1(S3BucketTask.java:335) at alluxio.proxy.s3.S3RestUtils.call(S3RestUtils.java:107) at alluxio.proxy.s3.S3BucketTask$ListObjectsTask.continueTask(S3BucketTask.java:264) at alluxio.proxy.s3.S3RequestServlet.serveRequest(S3RequestServlet.java:124) at alluxio.proxy.s3.S3RequestServlet.lambda$service$0(S3RequestServlet.java:93) at 
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) Caused by: alluxio.exception.AlluxioException at alluxio.exception.status.AlluxioStatusException.toAlluxioException(AlluxioStatusException.java:110) at alluxio.client.file.BaseFileSystem.wrapAndThrowAlluxioStatusException(BaseFileSystem.java:648) at alluxio.client.file.BaseFileSystem.rpc(BaseFileSystem.java:625) at alluxio.client.file.BaseFileSystem.exists(BaseFileSystem.java:207) at alluxio.client.file.DelegatingFileSystem.exists(DelegatingFileSystem.java:99) at alluxio.client.file.FileSystemCache$InstanceCachingFileSystem.exists(FileSystemCache.java:251) at alluxio.client.file.FileSystem.exists(FileSystem.java:302) at alluxio.proxy.s3.S3BucketTask$ListObjectsTask.lambda$continueTask$1(S3BucketTask.java:315) ... 9 more Caused by: alluxio.exception.status.UnknownException at alluxio.exception.status.AlluxioStatusException.from(AlluxioStatusException.java:174) at alluxio.exception.status.AlluxioStatusException.fromStatusRuntimeException(AlluxioStatusException.java:215) at alluxio.AbstractClient.retryRPCInternal(AbstractClient.java:486) at alluxio.AbstractClient.retryRPC(AbstractClient.java:450) at alluxio.AbstractClient.retryRPC(AbstractClient.java:439) at alluxio.client.file.RetryHandlingFileSystemMasterClient.exists(RetryHandlingFileSystemMasterClient.java:192) at alluxio.client.file.BaseFileSystem.lambda$exists$4(BaseFileSystem.java:210) at alluxio.client.file.BaseFileSystem.rpc(BaseFileSystem.java:623) ... 
14 more Caused by: io.grpc.StatusRuntimeException: UNKNOWN at io.grpc.stub.ClientCalls.toStatusRuntimeException(ClientCalls.java:262) at io.grpc.stub.ClientCalls.getUnchecked(ClientCalls.java:243) at io.grpc.stub.ClientCalls.blockingUnaryCall(ClientCalls.java:156) at alluxio.grpc.FileSystemMasterClientServiceGrpc$FileSystemMasterClientServiceBlockingStub.exists(FileSystemMasterClientServiceGrpc.java:2018) at alluxio.client.file.RetryHandlingFileSystemMasterClient.lambda$exists$6(RetryHandlingFileSystemMasterClient.java:192) at alluxio.AbstractClient.retryRPCInternal(AbstractClient.java:484) ... 19 more ``` pr-link: Alluxio/alluxio#17141 change-id: cid-e1b5d218d52120bbf38b4277bbfa0ddd12af1e10 --- .../proxy/src/main/java/alluxio/proxy/s3/S3RestUtils.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3RestUtils.java b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3RestUtils.java index 19c6a07acb55..d10f9beb1aee 100644 --- a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3RestUtils.java +++ b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3RestUtils.java @@ -37,6 +37,7 @@ import alluxio.security.authentication.AuthenticatedClientUser; import alluxio.security.user.ServerUserState; import alluxio.util.SecurityUtils; +import alluxio.util.ThreadUtils; import com.fasterxml.jackson.dataformat.xml.XmlMapper; import com.google.common.annotations.VisibleForTesting; @@ -129,7 +130,11 @@ public static Response call(String resource, S3RestUtils.RestCallable cal XmlMapper mapper = new XmlMapper(); return Response.ok(mapper.writeValueAsString(result)).build(); } catch (Exception e) { - LOG.warn("Error invoking REST endpoint for {}:\n{}", resource, e.getMessage()); + String errOutputMsg = e.getMessage(); + if (StringUtils.isEmpty(errOutputMsg)) { + errOutputMsg = ThreadUtils.formatStackTrace(e); + } + LOG.warn("Error invoking REST endpoint for {}:\n{}", resource, errOutputMsg); 
return S3ErrorResponse.createErrorResponse(e, resource); } } From b12cc0f3cb9c4a1d0fad8a683fe22a82844c2d27 Mon Sep 17 00:00:00 2001 From: Rico Chiu Date: Wed, 5 Apr 2023 23:28:14 -0700 Subject: [PATCH 05/27] Fix double arg parsing in runTests both https://github.com/Alluxio/alluxio/pull/17170/files#diff-c80fd5152cf94b0cc483c26efb94f3946af189626fc4f10c0232bf968432078e and https://github.com/Alluxio/alluxio/pull/17068/files introduced the same change but in different lines so the merge was clean, but resulted in parsing args twice caught by running `bin/alluxio runTests --directory /path/to/dir` with error msg: ``` error: Exception in thread "main" com.beust.jcommander.ParameterException: Can only specify option --directory once. at com.beust.jcommander.ParameterDescription.addValue(ParameterDescription.java:240) at com.beust.jcommander.JCommander.processFixedArity(JCommander.java:913) at com.beust.jcommander.JCommander.processFixedArity(JCommander.java:894) at com.beust.jcommander.JCommander.parseValues(JCommander.java:724) at com.beust.jcommander.JCommander.parse(JCommander.java:356) at com.beust.jcommander.JCommander.parse(JCommander.java:335) at alluxio.cli.TestRunner.main(TestRunner.java:110) ``` pr-link: Alluxio/alluxio#17199 change-id: cid-41ef7b12702f10c1e8b4cc729f99acc9e1b33753 --- dora/shell/src/main/java/alluxio/cli/TestRunner.java | 1 - 1 file changed, 1 deletion(-) diff --git a/dora/shell/src/main/java/alluxio/cli/TestRunner.java b/dora/shell/src/main/java/alluxio/cli/TestRunner.java index 406e99763149..7dfd74ae61c7 100644 --- a/dora/shell/src/main/java/alluxio/cli/TestRunner.java +++ b/dora/shell/src/main/java/alluxio/cli/TestRunner.java @@ -105,7 +105,6 @@ private TestRunner() {} // prevent instantiation public static void main(String[] args) throws Exception { TestRunner runner = new TestRunner(); JCommander jCommander = new JCommander(runner); - jCommander.parse(args); jCommander.setProgramName("TestRunner"); jCommander.parse(args); if (runner.mHelp) 
{ From 9e28d40898dd3ee9d2bec5dcb0f9c43a7950e93a Mon Sep 17 00:00:00 2001 From: lucyge2022 <111789461+lucyge2022@users.noreply.github.com> Date: Mon, 10 Apr 2023 14:26:19 -0700 Subject: [PATCH 06/27] Fix CopyObject writetype and unclosed outstream in InitiateMPUpload ### What changes are proposed in this pull request? 1. when s3 write type is CACHE_THRU, initiateMultipartUpload creates MultipartMetaFile without closing the outstream, causing leak in BlockWorkerClient resource. 2. createFilePOption in CopyObject didn't set any write type (which should respect alluxio.proxy.s3.writetype), causing all objects copied are in the MUST_CACHE write type. ### Why are the changes needed? To fix the above 2 problems. ### Does this PR introduce any user facing changes? No. pr-link: Alluxio/alluxio#17164 change-id: cid-968c2381d8cbcb6152fe8d4af2272cb201776b98 --- .../java/alluxio/proxy/s3/S3ObjectTask.java | 36 +++++-------- .../proxy/s3/S3RestServiceHandler.java | 3 +- .../client/rest/S3ClientRestApiTest.java | 53 ++++++++++++++++++- 3 files changed, 68 insertions(+), 24 deletions(-) diff --git a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3ObjectTask.java b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3ObjectTask.java index 165f6fe75fb3..27d77684b02e 100644 --- a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3ObjectTask.java +++ b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3ObjectTask.java @@ -448,7 +448,6 @@ public Response continueTask() { if (objectPath.endsWith(AlluxioURI.SEPARATOR)) { createDirectory(objectPath, userFs, auditContext); } - AlluxioURI objectUri = new AlluxioURI(objectPath); // Populate the xattr Map with the metadata tags if provided Map xattrMap = new HashMap<>(); @@ -459,19 +458,6 @@ public Response continueTask() { final String contentTypeHeader = mHandler.getHeader(S3Constants.S3_CONTENT_TYPE_HEADER); S3RestUtils.populateContentTypeInXAttr(xattrMap, contentTypeHeader); - CreateFilePOptions filePOptions = - 
CreateFilePOptions.newBuilder() - .setRecursive(true) - .setMode(PMode.newBuilder() - .setOwnerBits(Bits.ALL) - .setGroupBits(Bits.ALL) - .setOtherBits(Bits.NONE).build()) - .setWriteType(S3RestUtils.getS3WriteType()) - .putAllXattr(xattrMap) - .setXattrPropStrat(XAttrPropagationStrategy.LEAF_NODE) - .setOverwrite(true) - .build(); - try { copySource = URLDecoder.decode(copySource, "UTF-8"); } catch (UnsupportedEncodingException ex) { @@ -483,15 +469,19 @@ public Response continueTask() { .setMode(PMode.newBuilder() .setOwnerBits(Bits.ALL) .setGroupBits(Bits.ALL) - .setOtherBits(Bits.NONE).build()); + .setOtherBits(Bits.NONE) + .build()) + .setWriteType(S3RestUtils.getS3WriteType()) + .setXattrPropStrat(XAttrPropagationStrategy.LEAF_NODE) + .setOverwrite(true); // Handle metadata directive final String metadataDirective = mHandler.getHeader( S3Constants.S3_METADATA_DIRECTIVE_HEADER); if (StringUtils.equals(metadataDirective, S3Constants.Directive.REPLACE.name()) - && filePOptions.getXattrMap().containsKey(S3Constants.CONTENT_TYPE_XATTR_KEY)) { + && xattrMap.containsKey(S3Constants.CONTENT_TYPE_XATTR_KEY)) { copyFilePOptionsBuilder.putXattr(S3Constants.CONTENT_TYPE_XATTR_KEY, - filePOptions.getXattrMap().get(S3Constants.CONTENT_TYPE_XATTR_KEY)); + xattrMap.get(S3Constants.CONTENT_TYPE_XATTR_KEY)); } else { // defaults to COPY try { status = userFs.getStatus(new AlluxioURI(copySource)); @@ -510,9 +500,9 @@ public Response continueTask() { final String taggingDirective = mHandler.getHeader( S3Constants.S3_TAGGING_DIRECTIVE_HEADER); if (StringUtils.equals(taggingDirective, S3Constants.Directive.REPLACE.name()) - && filePOptions.getXattrMap().containsKey(S3Constants.TAGGING_XATTR_KEY)) { + && xattrMap.containsKey(S3Constants.TAGGING_XATTR_KEY)) { copyFilePOptionsBuilder.putXattr(S3Constants.TAGGING_XATTR_KEY, - filePOptions.getXattrMap().get(S3Constants.TAGGING_XATTR_KEY)); + xattrMap.get(S3Constants.TAGGING_XATTR_KEY)); } else { // defaults to COPY try { if (status == 
null) { @@ -712,7 +702,6 @@ public Response continueTask() { if (objectPath.endsWith(AlluxioURI.SEPARATOR)) { return createDirectory(objectPath, userFs, auditContext); } - AlluxioURI objectUri = new AlluxioURI(objectPath); // Populate the xattr Map with the metadata tags if provided Map xattrMap = new HashMap<>(); @@ -802,6 +791,7 @@ public Response continueTask() { .setOwnerBits(Bits.ALL) .setGroupBits(Bits.ALL) .setOtherBits(Bits.NONE).build()) + .setWriteType(S3RestUtils.getS3WriteType()) .setOverwrite(true); String entityTag = copyObject(userFs, auditContext, objectPath, copySource, copyFilePOptionsBuilder.build()); @@ -900,7 +890,7 @@ public Response continueTask() { ByteString.copyFrom(mHandler.getObject(), S3Constants.XATTR_STR_CHARSET)); xattrMap.put(S3Constants.UPLOADS_FILE_ID_XATTR_KEY, ByteString.copyFrom( Longs.toByteArray(userFs.getStatus(multipartTemporaryDir).getFileId()))); - mHandler.getMetaFS().createFile( + try (FileOutStream fos = mHandler.getMetaFS().createFile( new AlluxioURI(S3RestUtils.getMultipartMetaFilepathForUploadId(uploadId)), CreateFilePOptions.newBuilder() .setRecursive(true) @@ -912,7 +902,9 @@ public Response continueTask() { .putAllXattr(xattrMap) .setXattrPropStrat(XAttrPropagationStrategy.LEAF_NODE) .build() - ); + )) { + // Empty file creation, nothing to do. 
+ } SetAttributePOptions attrPOptions = SetAttributePOptions.newBuilder() .setOwner(user) .build(); diff --git a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3RestServiceHandler.java b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3RestServiceHandler.java index fb86ee921833..967528c4c01c 100644 --- a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3RestServiceHandler.java +++ b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3RestServiceHandler.java @@ -914,6 +914,7 @@ public Response createObjectOrUploadPart(@HeaderParam("Content-MD5") final Strin .setOwnerBits(Bits.ALL) .setGroupBits(Bits.ALL) .setOtherBits(Bits.NONE).build()) + .setWriteType(S3RestUtils.getS3WriteType()) .setCheckS3BucketPath(true) .setOverwrite(true); // Handle metadata directive @@ -1089,7 +1090,7 @@ public Response initiateMultipartUpload( .putAllXattr(xattrMap) .setXattrPropStrat(XAttrPropagationStrategy.LEAF_NODE) .build() - ); + ).close(); SetAttributePOptions attrPOptions = SetAttributePOptions.newBuilder() .setOwner(user) .build(); diff --git a/dora/tests/src/test/java/alluxio/client/rest/S3ClientRestApiTest.java b/dora/tests/src/test/java/alluxio/client/rest/S3ClientRestApiTest.java index 1ce63d6c5d4f..091ed6918a34 100644 --- a/dora/tests/src/test/java/alluxio/client/rest/S3ClientRestApiTest.java +++ b/dora/tests/src/test/java/alluxio/client/rest/S3ClientRestApiTest.java @@ -1315,6 +1315,17 @@ public void initiateMultipartUpload() throws Exception { String expectedResult = XML_MAPPER.writeValueAsString(expected); Assert.assertEquals(expectedResult, result); + + URIStatus mpMetaFileStatus = mFileSystem.getStatus( + new AlluxioURI(S3RestUtils.getMultipartMetaFilepathForUploadId(uploadId))); + Assert.assertTrue(mpMetaFileStatus.isCompleted()); + + AlluxioURI mpTempDirURI = new AlluxioURI(S3RestUtils.getMultipartTemporaryDirForObject( + S3RestUtils.parsePath(AlluxioURI.SEPARATOR + bucketName), + objectName, uploadId)); + 
Assert.assertTrue(mFileSystem.exists(mpTempDirURI)); + URIStatus mpTempDirStatus = mFileSystem.getStatus(mpTempDirURI); + Assert.assertTrue(mpTempDirStatus.getFileInfo().isFolder()); } @Test @@ -1374,7 +1385,47 @@ public void uploadPartWithoutInitiation() throws Exception { Assert.fail("Upload part of an object without multipart upload initialization should fail"); } - // TODO(czhu): Add test for UploadPartCopy + @Test + public void testUploadPartCopy() throws Exception { + final String bucketName = "bucket"; + createBucketRestCall(bucketName); + + final String objectName = "src-object"; + String srcObjectKey = bucketName + AlluxioURI.SEPARATOR + objectName; + final byte[] srcObjectContent = CommonUtils.randomAlphaNumString(DATA_SIZE).getBytes(); + putObjectTest(bucketName, objectName, srcObjectContent, null, null); + + // UploadPartCopy object + String targetObjectName = "target-MP-object"; + String targetMPObjectKey = bucketName + AlluxioURI.SEPARATOR + targetObjectName; + String result = initiateMultipartUploadRestCall(targetMPObjectKey); + final String uploadId = XML_MAPPER.readValue(result, InitiateMultipartUploadResult.class) + .getUploadId(); + Map params = new HashMap<>(); + params.put("uploadId", uploadId); + params.put("partNumber", "1"); + + new TestCase(mHostname, mPort, mBaseUri, + targetMPObjectKey, + params, HttpMethod.PUT, + getDefaultOptionsWithAuth() + .addHeader(S3Constants.S3_COPY_SOURCE_HEADER, srcObjectKey)).runAndGetResponse(); + + List partList = new ArrayList<>(); + partList.add(new CompleteMultipartUploadRequest.Part("", 1)); + result = completeMultipartUploadRestCall(targetMPObjectKey, uploadId, + new CompleteMultipartUploadRequest(partList)); + + // Verify the object's content. 
+ byte[] downloadTargetMpObj = new byte[DATA_SIZE]; + MessageDigest md5 = MessageDigest.getInstance("MD5"); + try (FileInStream is = mFileSystem + .openFile(new AlluxioURI("/" + targetMPObjectKey))) { + is.read(downloadTargetMpObj, 0, DATA_SIZE); + Assert.assertTrue(is.available() <= 0); + } + Assert.assertArrayEquals(srcObjectContent, downloadTargetMpObj); + } @Test public void listParts() throws Exception { From 79529fda87f2ce1b163dc990d38503fac460bdfa Mon Sep 17 00:00:00 2001 From: secfree Date: Wed, 12 Apr 2023 10:47:06 +0800 Subject: [PATCH 07/27] Skip ssh for localhost ### What changes are proposed in this pull request? Skip ssh connection while running locally. ### Why are the changes needed? I was trying to setup a one-node cluster with the latest version to do some simple tests. However, I encountered the following exception while executing `/bin/alluxio format` ``` $ ./bin/alluxio format Executing the following command on all worker nodes and logging to /home/test/deploy/alluxio/logs/task.log: /home/test/deploy/alluxio/bin/alluxio formatWorker Waiting for tasks to finish... test@localhost's password: test@localhost's password: test@localhost's password: Task on 'localhost' fails, exit code: 255 There are task failures, look at /home/test/deploy/alluxio/logs/task.log for details. ``` In the log, it has ``` [2023-03-24 17:12:20][localhost] Failed to add the host to the list of known hosts (/home/test/.ssh/known_hosts). [2023-03-24 17:13:28][localhost] Permission denied, please try again. [2023-03-24 17:13:29][localhost] Permission denied, please try again. [2023-03-24 17:13:30][localhost] Permission denied (publickey,gssapi-keyex,gssapi-with-mic,password). ``` However, in our cluster, we do not know the password of our accounts in a specific server. So I feel it's better to skip the "ssh" part for a local one-node cluster. ### Does this PR introduce any user facing changes? NO. 
pr-link: Alluxio/alluxio#17167 change-id: cid-0ade98d9517f269668a2f3cb384b8b079fa440be --- bin/alluxio-common.sh | 10 ++++++++++ bin/alluxio-masters.sh | 11 +++-------- bin/alluxio-workers.sh | 9 ++------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/bin/alluxio-common.sh b/bin/alluxio-common.sh index 69023a43482f..d0b8f7926e33 100755 --- a/bin/alluxio-common.sh +++ b/bin/alluxio-common.sh @@ -68,3 +68,13 @@ function get_ramdisk_array() { done IFS=$oldifs } + +# Compose the ssh command according to the hostname +function ssh_command() { + local host=$1 + local command="" + if [[ $host != "localhost" && $host != "127.0.0.1" ]]; then + command="ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no -tt ${host}" + fi + echo "${command}" +} diff --git a/bin/alluxio-masters.sh b/bin/alluxio-masters.sh index 359c3daea348..e72cc693c820 100755 --- a/bin/alluxio-masters.sh +++ b/bin/alluxio-masters.sh @@ -12,12 +12,7 @@ set -o pipefail -LAUNCHER= -# If debugging is enabled propagate that through to sub-shells -if [[ "$-" == *x* ]]; then - LAUNCHER="bash -x" -fi -BIN=$(cd "$( dirname "$( readlink "$0" || echo "$0" )" )"; pwd) +. $(dirname "$0")/alluxio-common.sh USAGE="Usage: alluxio-masters.sh command..." @@ -46,10 +41,10 @@ fi for master in ${HOSTLIST[@]}; do echo "[${master}] Connecting as ${USER}..." >> ${ALLUXIO_TASK_LOG} if [[ ${HA_ENABLED} == "true" || ${N} -eq 0 ]]; then - nohup ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no -tt ${master} ${LAUNCHER} \ + nohup $(ssh_command ${master}) ${LAUNCHER} \ $"${@// /\\ }" 2>&1 | while read line; do echo "[$(date '+%F %T')][${master}] ${line}"; done >> ${ALLUXIO_TASK_LOG} & else - nohup ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no -tt ${master} ${LAUNCHER} \ + nohup $(ssh_command ${master}) ${LAUNCHER} \ $"export ALLUXIO_MASTER_SECONDARY=true; ${@// /\\ }" 2>&1 | while read line; do echo "[$(date '+%F %T')][${master}] ${line}"; done >> ${ALLUXIO_TASK_LOG} & fi pids[${#pids[@]}]=$! 
diff --git a/bin/alluxio-workers.sh b/bin/alluxio-workers.sh index 11dc9c9558ba..79b792706db6 100755 --- a/bin/alluxio-workers.sh +++ b/bin/alluxio-workers.sh @@ -12,12 +12,7 @@ set -o pipefail -LAUNCHER= -# If debugging is enabled propagate that through to sub-shells -if [[ "$-" == *x* ]]; then - LAUNCHER="bash -x" -fi -BIN=$(cd "$( dirname "$( readlink "$0" || echo "$0" )" )"; pwd) +. $(dirname "$0")/alluxio-common.sh USAGE="Usage: alluxio-workers.sh command..." @@ -39,7 +34,7 @@ echo "Executing the following command on all worker nodes and logging to ${ALLUX for worker in ${HOSTLIST[@]}; do echo "[${worker}] Connecting as ${USER}..." >> ${ALLUXIO_TASK_LOG} - nohup ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no -tt ${worker} ${LAUNCHER} \ + nohup $(ssh_command ${worker}) ${LAUNCHER} \ $"${@// /\\ }" 2>&1 | while read line; do echo "[$(date '+%F %T')][${worker}] ${line}"; done >> ${ALLUXIO_TASK_LOG} & pids[${#pids[@]}]=$! done From 83c792a09dbfec40a61be98485c7d7d80beba8df Mon Sep 17 00:00:00 2001 From: Rico Chiu Date: Thu, 13 Apr 2023 17:32:58 -0700 Subject: [PATCH 08/27] cherry-pick empty: Prevent unrelated property keys from generating in docGen pr-link: Alluxio/alluxio#17249 change-id: cid-961a1e2cf4862e04be59cfdf6385ae3e2eb9a90d From a4d2e36b248a947eccf2e0bcce0dddca029ae87b Mon Sep 17 00:00:00 2001 From: maobaolong <307499405@qq.com> Date: Fri, 14 Apr 2023 09:58:01 +0800 Subject: [PATCH 09/27] Show the invalid default value and key name ### What changes are proposed in this pull request? If we develop a new Propertykey and give the inappropriate default value, the master will not start successfully, and we cannot find which Propertykey is bad. So I improve the prompt and show the related Propertykey ### Why are the changes needed? Show the related Propertykey of inappropriate default value ### Does this PR introduce any user facing changes? 
NO pr-link: Alluxio/alluxio#17185 change-id: cid-486e3ab1c1092299d0b8e762fe43715b9dcbd8e7 --- dora/core/common/src/main/java/alluxio/conf/PropertyKey.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dora/core/common/src/main/java/alluxio/conf/PropertyKey.java b/dora/core/common/src/main/java/alluxio/conf/PropertyKey.java index 659c5dfc496c..4b988afaa1ba 100755 --- a/dora/core/common/src/main/java/alluxio/conf/PropertyKey.java +++ b/dora/core/common/src/main/java/alluxio/conf/PropertyKey.java @@ -421,7 +421,8 @@ public Builder setDefaultSupplier(Supplier supplier, String description) * @return the updated builder instance */ public Builder setDefaultValue(Object defaultValue) { - checkArgument(validateValue(defaultValue, mType, mEnumType, mValueValidationFunction)); + checkArgument(validateValue(defaultValue, mType, mEnumType, mValueValidationFunction), + String.format("default value %s of %s validate failed", defaultValue, mName)); mDefaultValue = formatValue(defaultValue, mType, mEnumType, mDelimiter); return this; } From 8e7434397f1a83837c15ddd4b02de3b09fdbf201 Mon Sep 17 00:00:00 2001 From: secfree Date: Mon, 17 Apr 2023 10:43:57 +0800 Subject: [PATCH 10/27] Improve the performance of MountTable.getMountPoint ### What changes are proposed in this pull request? Improve the performance of MountTable.getMountPoint ### Why are the changes needed? Currently the implementation of `MountTable.getMountPoint` needs to iterate through all mount points. In one of our Alluxio clusters, there are more than 300 mount points. The leader alluxio master had very high load and we found `MountTable.getMountPoint` cost most of the cpu time. This PR can improve the performance a lot especially for clusters with lots of mount points. Below is the time cost of calling `MountTable.getMountPoint` 1000 times when there are 300 mount points | version | time cost (ms) | | --- | --- | | master | 142 | | PR | 6 | ### Does this PR introduce any user facing changes? 
NO pr-link: Alluxio/alluxio#17244 change-id: cid-aa35fe2bb9c2f439fc818461cb49c15c9391909e --- .../main/java/alluxio/util/io/PathUtils.java | 24 +++++++++++++++ .../java/alluxio/util/io/PathUtilsTest.java | 30 +++++++++++++++++++ .../alluxio/master/file/meta/MountTable.java | 12 ++++---- 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/dora/core/common/src/main/java/alluxio/util/io/PathUtils.java b/dora/core/common/src/main/java/alluxio/util/io/PathUtils.java index cb731a9d7270..0d3069b69182 100644 --- a/dora/core/common/src/main/java/alluxio/util/io/PathUtils.java +++ b/dora/core/common/src/main/java/alluxio/util/io/PathUtils.java @@ -22,8 +22,10 @@ import com.google.common.base.Preconditions; import org.apache.commons.io.FilenameUtils; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; +import java.util.List; import java.util.UUID; import java.util.regex.Pattern; import javax.annotation.concurrent.ThreadSafe; @@ -444,4 +446,26 @@ public static String normalizePath(String path, String separator) { } private PathUtils() {} // prevent instantiation + + /** + * Returns the list of possible mount points of the given path. 
+ * + * "/a/b/c" => {"/a", "/a/b", "/a/b/c"} + * + * @param path the path to get the mount points of + * @return a list of paths + */ + public static List getPossibleMountPoints(String path) throws InvalidPathException { + String basePath = cleanPath(path); + List paths = new ArrayList<>(); + if ((basePath != null) && !basePath.equals(AlluxioURI.SEPARATOR)) { + paths.add(basePath); + String parent = getParent(path); + while (!parent.equals(AlluxioURI.SEPARATOR)) { + paths.add(0, parent); + parent = getParent(parent); + } + } + return paths; + } } diff --git a/dora/core/common/src/test/java/alluxio/util/io/PathUtilsTest.java b/dora/core/common/src/test/java/alluxio/util/io/PathUtilsTest.java index cd34c8a078b9..81d7e1ac01da 100644 --- a/dora/core/common/src/test/java/alluxio/util/io/PathUtilsTest.java +++ b/dora/core/common/src/test/java/alluxio/util/io/PathUtilsTest.java @@ -507,4 +507,34 @@ public void normalizePath() throws Exception { assertEquals("/foo/bar//", PathUtils.normalizePath("/foo/bar//", "/")); assertEquals("/foo/bar%", PathUtils.normalizePath("/foo/bar", "%")); } + + /** + * Tests the {@link PathUtils#getPossibleMountPoints(String)} method to + * throw an exception in case the path is invalid. + */ + @Test + public void getPossibleMountPointsException() throws InvalidPathException { + mException.expect(InvalidPathException.class); + PathUtils.getPossibleMountPoints(""); + } + + /** + * Tests the {@link PathUtils#getPossibleMountPoints(String)} method. 
+ */ + @Test + public void getPossibleMountPointsNoException() throws InvalidPathException { + ArrayList paths = new ArrayList<>(); + assertEquals(paths, PathUtils.getPossibleMountPoints("/")); + assertEquals(paths, PathUtils.getPossibleMountPoints("//")); + + paths.add("/a"); + assertEquals(paths, PathUtils.getPossibleMountPoints("/a")); + assertEquals(paths, PathUtils.getPossibleMountPoints("/a/")); + paths.add("/a/b"); + assertEquals(paths, PathUtils.getPossibleMountPoints("/a/b")); + assertEquals(paths, PathUtils.getPossibleMountPoints("/a/b/")); + paths.add("/a/b/c"); + assertEquals(paths, PathUtils.getPossibleMountPoints("/a/b/c")); + assertEquals(paths, PathUtils.getPossibleMountPoints("/a/b/c/")); + } } diff --git a/dora/core/server/master/src/main/java/alluxio/master/file/meta/MountTable.java b/dora/core/server/master/src/main/java/alluxio/master/file/meta/MountTable.java index 563a282b9f37..da0ae9c4d871 100644 --- a/dora/core/server/master/src/main/java/alluxio/master/file/meta/MountTable.java +++ b/dora/core/server/master/src/main/java/alluxio/master/file/meta/MountTable.java @@ -323,13 +323,13 @@ public void update(Supplier journalContext, AlluxioURI alluxioUr public String getMountPoint(AlluxioURI uri) throws InvalidPathException { String path = uri.getPath(); String lastMount = ROOT; + List possibleMounts = PathUtils.getPossibleMountPoints(path); try (LockResource r = new LockResource(mReadLock)) { - for (Map.Entry entry : mState.getMountTable().entrySet()) { - String mount = entry.getKey(); - // we choose a new candidate path if the previous candidate path is a prefix - // of the current alluxioPath and the alluxioPath is a prefix of the path - if (!mount.equals(ROOT) && PathUtils.hasPrefix(path, mount) - && lastMount.length() < mount.length()) { + Map mountTable = mState.getMountTable(); + for (String mount: possibleMounts) { + if (mountTable.containsKey(mount)) { + // results in `possibleMounts` are from shortest to longest, so it will get the + // 
longest matching below lastMount = mount; } } From d9f7bb44612498c8021b8b55d9756fcdee983a45 Mon Sep 17 00:00:00 2001 From: Xinran Dong <81548653+007DXR@users.noreply.github.com> Date: Tue, 18 Apr 2023 10:46:21 +0800 Subject: [PATCH 11/27] Enable bucket cache in v2 s3 proxy ### What changes are proposed in this pull request? Create 'BUCKET_CACHE' in v2 s3 proxy to reduce time cost of checking bucket path in the same way as PR #16806. ### Why are the changes needed? To keep the consistency between v2 s3 proxy and v1 s3 proxy. To improve the Alluxio proxy's efficiency in dealing with requests. ### Does this PR introduce any user facing changes? If enabling the cache, Alluxio will cache bucket path statistics for a specified time period (configured in the alluxio-site.properties file). Be careful when using this cache because Alluxio S3 API will behave differently from AWS S3 API when an illegal request arrives. This bucket path cache is switched off by default. pr-link: Alluxio/alluxio#17022 change-id: cid-a6ae6484aab704bc39ce3a02d22d70bd633cc6a6 --- .../main/java/alluxio/conf/PropertyKey.java | 6 ++- .../java/alluxio/proxy/s3/S3BucketTask.java | 26 +++++++++---- .../main/java/alluxio/proxy/s3/S3Handler.java | 14 +++++++ .../java/alluxio/proxy/s3/S3ObjectTask.java | 37 +++++++++++++------ .../proxy/s3/S3RestServiceHandler.java | 12 ++++-- .../java/alluxio/proxy/s3/S3RestUtils.java | 7 ++-- 6 files changed, 74 insertions(+), 28 deletions(-) diff --git a/dora/core/common/src/main/java/alluxio/conf/PropertyKey.java b/dora/core/common/src/main/java/alluxio/conf/PropertyKey.java index 4b988afaa1ba..fa421f9c2973 100755 --- a/dora/core/common/src/main/java/alluxio/conf/PropertyKey.java +++ b/dora/core/common/src/main/java/alluxio/conf/PropertyKey.java @@ -5565,9 +5565,11 @@ public String toString() { public static final PropertyKey PROXY_S3_BUCKETPATHCACHE_TIMEOUT_MS = durationBuilder(Name.PROXY_S3_BUCKETPATHCACHE_TIMEOUT_MS) .setAlias("alluxio.proxy.s3.bucketpathcache.timeout.ms") -
.setDefaultValue("1min") + .setDefaultValue("0min") .setDescription("Expire bucket path statistics in cache for this time period. " - + "Set 0min to disable the cache.") + + "Set 0min to disable the cache. If enabling the cache, " + + "be careful that Alluxio S3 API will behave differently from AWS S3 API" + + " if bucket path cache entries become stale.") .setConsistencyCheckLevel(ConsistencyCheckLevel.IGNORE) .setScope(Scope.NONE) .build(); diff --git a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3BucketTask.java b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3BucketTask.java index 9e13e22bcffc..17d3b5c7eade 100644 --- a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3BucketTask.java +++ b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3BucketTask.java @@ -144,6 +144,8 @@ public Response continueTask() { // debatable (?) potentially breaks backcompat(?) .filter(URIStatus::isFolder) .collect(Collectors.toList()); + buckets.forEach( + (uri) -> mHandler.BUCKET_PATH_CACHE.put(uri.getPath(), true)); return new ListAllMyBucketsResult(buckets); } }); @@ -165,7 +167,8 @@ public Response continueTask() { try (S3AuditContext auditContext = mHandler.createAuditContext( mOPType.name(), user, mHandler.getBucket(), null)) { - S3RestUtils.checkPathIsAlluxioDirectory(userFs, path, auditContext); + S3RestUtils.checkPathIsAlluxioDirectory(userFs, path, auditContext, + mHandler.BUCKET_PATH_CACHE); AlluxioURI uri = new AlluxioURI(path); try { TaggingData tagData = S3RestUtils.deserializeTags(userFs.getStatus(uri).getXAttr()); @@ -196,7 +199,8 @@ public Response continueTask() { try (S3AuditContext auditContext = mHandler.createAuditContext( mOPType.name(), user, mHandler.getBucket(), null)) { - S3RestUtils.checkPathIsAlluxioDirectory(userFs, path, auditContext); + S3RestUtils.checkPathIsAlluxioDirectory(userFs, path, auditContext, + mHandler.BUCKET_PATH_CACHE); try { List children = mHandler.getMetaFS().listStatus(new AlluxioURI( 
S3RestUtils.MULTIPART_UPLOADS_METADATA_DIR)); @@ -256,7 +260,8 @@ public Response continueTask() { try (S3AuditContext auditContext = mHandler.createAuditContext( mOPType.name(), user, mHandler.getBucket(), null)) { - S3RestUtils.checkPathIsAlluxioDirectory(userFs, path, auditContext); + S3RestUtils.checkPathIsAlluxioDirectory(userFs, path, auditContext, + mHandler.BUCKET_PATH_CACHE); String markerParam = mHandler.getQueryParameter("marker"); String maxKeysParam = mHandler.getQueryParameter("max-keys"); String prefixParam = mHandler.getQueryParameter("prefix"); @@ -330,7 +335,8 @@ public Response continueTask() { String bucketPath = S3RestUtils.parsePath(AlluxioURI.SEPARATOR + mHandler.getBucket()); try (S3AuditContext auditContext = mHandler.createAuditContext( mOPType.name(), mHandler.getUser(), mHandler.getBucket(), null)) { - S3RestUtils.checkPathIsAlluxioDirectory(mHandler.getMetaFS(), bucketPath, auditContext); + S3RestUtils.checkPathIsAlluxioDirectory(mHandler.getMetaFS(), bucketPath, auditContext, + mHandler.BUCKET_PATH_CACHE); try { TaggingData tagData = new XmlMapper().readerFor(TaggingData.class) .readValue(mHandler.getInputStream()); @@ -395,6 +401,7 @@ public Response continueTask() { // Silently swallow CreateBucket calls on existing buckets for this user // - S3 clients may prepend PutObject requests with CreateBucket calls instead of // calling HeadBucket to ensure that the bucket exists + mHandler.BUCKET_PATH_CACHE.put(bucketPath, true); return Response.Status.OK; } // Otherwise, this bucket is owned by a different user @@ -428,6 +435,7 @@ public Response continueTask() { } catch (Exception e) { throw S3RestUtils.toBucketS3Exception(e, bucketPath, auditContext); } + mHandler.BUCKET_PATH_CACHE.put(bucketPath, true); return Response.Status.OK; } }); @@ -509,7 +517,8 @@ public Response continueTask() { try (S3AuditContext auditContext = mHandler.createAuditContext( mOPType.name(), user, mHandler.getBucket(), null)) { - 
S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext); + S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext, + mHandler.BUCKET_PATH_CACHE); } return Response.ok().build(); }); @@ -530,7 +539,8 @@ public Response continueTask() { String bucketPath = S3RestUtils.parsePath(AlluxioURI.SEPARATOR + mHandler.getBucket()); try (S3AuditContext auditContext = mHandler.createAuditContext( mOPType.name(), user, mHandler.getBucket(), null)) { - S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext); + S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext, + mHandler.BUCKET_PATH_CACHE); LOG.debug("DeleteBucketTagging bucket={}", bucketPath); Map xattrMap = new HashMap<>(); @@ -565,7 +575,8 @@ public Response continueTask() { try (S3AuditContext auditContext = mHandler.createAuditContext( mOPType.name(), user, mHandler.getBucket(), null)) { - S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext); + S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext, + mHandler.BUCKET_PATH_CACHE); // Delete the bucket. 
DeletePOptions options = DeletePOptions.newBuilder().setAlluxioOnly(Configuration .get(PropertyKey.PROXY_S3_DELETE_TYPE) @@ -573,6 +584,7 @@ public Response continueTask() { .build(); try { userFs.delete(new AlluxioURI(bucketPath), options); + mHandler.BUCKET_PATH_CACHE.put(bucketPath, false); } catch (Exception e) { throw S3RestUtils.toBucketS3Exception(e, bucketPath, auditContext); } diff --git a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3Handler.java b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3Handler.java index 46eb1e82226f..179447ede09e 100644 --- a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3Handler.java +++ b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3Handler.java @@ -25,6 +25,8 @@ import alluxio.web.ProxyWebServer; import com.google.common.base.Stopwatch; +import com.google.common.cache.Cache; +import com.google.common.cache.CacheBuilder; import org.eclipse.jetty.server.Request; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -40,6 +42,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.concurrent.TimeUnit; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.annotation.Nullable; @@ -72,6 +75,17 @@ public class S3Handler { Pattern.compile("^" + S3RequestServlet.S3_V2_SERVICE_PATH_PREFIX + "/[^/]*$"); public static final Pattern OBJECT_PATH_PATTERN = Pattern.compile("^" + S3RequestServlet.S3_V2_SERVICE_PATH_PREFIX + "/[^/]*/.*$"); + public static final int BUCKET_PATH_CACHE_SIZE = 65536; + /* BUCKET_PATH_CACHE caches bucket path during specific period. + BUCKET_PATH_CACHE.put(bucketPath,true) means bucket path exists. + BUCKET_PATH_CACHE.put(bucketPath,false) plays the same effect + as BUCKET_PATH_CACHE.remove(bucketPath). 
*/ + public static final Cache BUCKET_PATH_CACHE = CacheBuilder.newBuilder() + .maximumSize(BUCKET_PATH_CACHE_SIZE) + .expireAfterWrite( + Configuration.global().getMs(PropertyKey.PROXY_S3_BUCKETPATHCACHE_TIMEOUT_MS), + TimeUnit.MILLISECONDS) + .build(); private static final Logger LOG = LoggerFactory.getLogger(S3Handler.class); private static final ThreadLocal TLS_BYTES = ThreadLocal.withInitial(() -> new byte[8 * 1024]); diff --git a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3ObjectTask.java b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3ObjectTask.java index 27d77684b02e..78450c296eb6 100644 --- a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3ObjectTask.java +++ b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3ObjectTask.java @@ -171,7 +171,8 @@ public Response continueTask() { String bucketPath = S3RestUtils.parsePath(AlluxioURI.SEPARATOR + mHandler.getBucket()); try (S3AuditContext auditContext = mHandler.createAuditContext( mOPType.name(), user, mHandler.getBucket(), mHandler.getObject())) { - S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext); + S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext, + mHandler.BUCKET_PATH_CACHE); AlluxioURI tmpDir = new AlluxioURI(S3RestUtils.getMultipartTemporaryDirForObject( bucketPath, mHandler.getObject(), uploadId)); @@ -222,7 +223,8 @@ public Response continueTask() { AlluxioURI uri = new AlluxioURI(objectPath); try (S3AuditContext auditContext = mHandler.createAuditContext( mOPType.name(), user, mHandler.getBucket(), mHandler.getObject())) { - S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext); + S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext, + mHandler.BUCKET_PATH_CACHE); try { TaggingData tagData = S3RestUtils.deserializeTags(userFs.getStatus(uri).getXAttr()); LOG.debug("GetObjectTagging tagData={}", tagData); @@ -249,7 +251,8 @@ public Response continueTask() { String bucketPath = 
S3RestUtils.parsePath(AlluxioURI.SEPARATOR + mHandler.getBucket()); try (S3AuditContext auditContext = mHandler.createAuditContext( mOPType.name(), user, mHandler.getBucket(), mHandler.getObject())) { - S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext); + S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext, + mHandler.BUCKET_PATH_CACHE); String objectPath = bucketPath + AlluxioURI.SEPARATOR + mHandler.getObject(); AlluxioURI objectUri = new AlluxioURI(objectPath); TaggingData tagData = null; @@ -448,6 +451,7 @@ public Response continueTask() { if (objectPath.endsWith(AlluxioURI.SEPARATOR)) { createDirectory(objectPath, userFs, auditContext); } + AlluxioURI objectUri = new AlluxioURI(objectPath); // Populate the xattr Map with the metadata tags if provided Map xattrMap = new HashMap<>(); @@ -473,7 +477,8 @@ public Response continueTask() { .build()) .setWriteType(S3RestUtils.getS3WriteType()) .setXattrPropStrat(XAttrPropagationStrategy.LEAF_NODE) - .setOverwrite(true); + .setOverwrite(true) + .setCheckS3BucketPath(true); // Handle metadata directive final String metadataDirective = mHandler.getHeader( @@ -624,6 +629,7 @@ public Response createDirectory(String objectPath, FileSystem userFs, .setGroupBits(Bits.ALL) .setOtherBits(Bits.NONE).build()) .setAllowExists(true) + .setCheckS3BucketPath(true) .build(); userFs.createDirectory(new AlluxioURI(objectPath), dirOptions); } catch (FileAlreadyExistsException e) { @@ -696,7 +702,8 @@ public Response continueTask() { try (S3AuditContext auditContext = mHandler.createAuditContext(mOPType.name(), user, bucket, object)) { - S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext); + S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext, + mHandler.BUCKET_PATH_CACHE); String objectPath = bucketPath + AlluxioURI.SEPARATOR + object; if (objectPath.endsWith(AlluxioURI.SEPARATOR)) { @@ -722,6 +729,7 @@ public Response continueTask() { 
.setWriteType(S3RestUtils.getS3WriteType()) .putAllXattr(xattrMap).setXattrPropStrat(XAttrPropagationStrategy.LEAF_NODE) .setOverwrite(true) + .setCheckS3BucketPath(true) .build(); return createObject(objectPath, userFs, filePOptions, auditContext); } @@ -839,7 +847,8 @@ public Response continueTask() { final String contentTypeHeader = mHandler.getHeader(S3Constants.S3_CONTENT_TYPE_HEADER); try (S3AuditContext auditContext = mHandler.createAuditContext( "initiateMultipartUpload", user, bucket, object)) { - S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext); + S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext, + mHandler.BUCKET_PATH_CACHE); if (taggingHeader != null) { // Parse the tagging header if it exists try { tagData = S3RestUtils.deserializeTaggingHeader( @@ -877,7 +886,9 @@ public Response continueTask() { .setOwnerBits(Bits.ALL) .setGroupBits(Bits.ALL) .setOtherBits(Bits.NONE).build()) - .setWriteType(S3RestUtils.getS3WriteType()).build()); + .setWriteType(S3RestUtils.getS3WriteType()) + .setCheckS3BucketPath(true) + .build()); // Create the Alluxio multipart upload metadata file if (contentTypeHeader != null) { @@ -1035,7 +1046,8 @@ public Response continueTask() { mUserFs = S3RestUtils.createFileSystemForUser(user, mHandler.getMetaFS()); try { String bucketPath = S3RestUtils.parsePath(AlluxioURI.SEPARATOR + bucket); - S3RestUtils.checkPathIsAlluxioDirectory(mUserFs, bucketPath, null); + S3RestUtils.checkPathIsAlluxioDirectory(mUserFs, bucketPath, null, + mHandler.BUCKET_PATH_CACHE); objectPath = bucketPath + AlluxioURI.SEPARATOR + object; // Check for existing multipart info files and dirs AlluxioURI multipartTemporaryDir = new AlluxioURI( @@ -1304,7 +1316,8 @@ public Response continueTask() { .getMultipartTemporaryDirForObject(bucketPath, mHandler.getObject(), uploadId)); try (S3AuditContext auditContext = mHandler.createAuditContext( "abortMultipartUpload", user, mHandler.getBucket(), mHandler.getObject())) 
{ - S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext); + S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext, + mHandler.BUCKET_PATH_CACHE); try { S3RestUtils.checkStatusesForUploadId(mHandler.getMetaFS(), userFs, multipartTemporaryDir, uploadId); @@ -1360,7 +1373,8 @@ public Response continueTask() { .build(); try (S3AuditContext auditContext = mHandler.createAuditContext( "deleteObjectTags", user, mHandler.getBucket(), mHandler.getObject())) { - S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext); + S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext, + mHandler.BUCKET_PATH_CACHE); try { userFs.setAttribute(new AlluxioURI(objectPath), attrPOptions); } catch (Exception e) { @@ -1396,7 +1410,8 @@ public Response continueTask() { .build(); try (S3AuditContext auditContext = mHandler.createAuditContext( "deleteObject", user, mHandler.getBucket(), mHandler.getObject())) { - S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext); + S3RestUtils.checkPathIsAlluxioDirectory(userFs, bucketPath, auditContext, + mHandler.BUCKET_PATH_CACHE); try { userFs.delete(new AlluxioURI(objectPath), options); } catch (FileDoesNotExistException | DirectoryNotEmptyException e) { diff --git a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3RestServiceHandler.java b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3RestServiceHandler.java index 967528c4c01c..3a4a28d65355 100644 --- a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3RestServiceHandler.java +++ b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3RestServiceHandler.java @@ -111,7 +111,11 @@ public final class S3RestServiceHandler { /* Object is after bucket in the URL path */ public static final String OBJECT_PARAM = "{bucket}/{object:.+}"; public static final int BUCKET_PATH_CACHE_SIZE = 65536; - private static final Cache BUCKET_PATH_CACHE = CacheBuilder.newBuilder() + /* BUCKET_PATH_CACHE 
caches bucket path during specific period. + BUCKET_PATH_CACHE.put(bucketPath,true) means bucket path exists. + BUCKET_PATH_CACHE.put(bucketPath,false) plays the same effect + as BUCKET_PATH_CACHE.remove(bucketPath). */ + private static final Cache BUCKET_PATH_CACHE = CacheBuilder.newBuilder() .maximumSize(BUCKET_PATH_CACHE_SIZE) .expireAfterWrite( Configuration.global().getMs(PropertyKey.PROXY_S3_BUCKETPATHCACHE_TIMEOUT_MS), @@ -225,7 +229,7 @@ public Response listAllMyBuckets() { // debatable (?) potentially breaks backcompat(?) .filter(URIStatus::isFolder) .collect(Collectors.toList()); - buckets.forEach((uri) -> BUCKET_PATH_CACHE.put(new AlluxioURI(uri.getPath()), true)); + buckets.forEach((uri) -> BUCKET_PATH_CACHE.put(uri.getPath(), true)); return new ListAllMyBucketsResult(buckets); } }); @@ -588,7 +592,7 @@ public Response createBucket(@PathParam("bucket") final String bucket, } catch (Exception e) { throw S3RestUtils.toBucketS3Exception(e, bucketPath, auditContext); } - BUCKET_PATH_CACHE.put(new AlluxioURI(bucketPath), true); + BUCKET_PATH_CACHE.put(bucketPath, true); return Response.Status.OK; } }); @@ -649,7 +653,7 @@ public Response deleteBucket(@PathParam("bucket") final String bucket, } catch (Exception e) { throw S3RestUtils.toBucketS3Exception(e, bucketPath, auditContext); } - BUCKET_PATH_CACHE.put(new AlluxioURI(bucketPath), false); + BUCKET_PATH_CACHE.put(bucketPath, false); return Response.Status.NO_CONTENT; } }); diff --git a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3RestUtils.java b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3RestUtils.java index d10f9beb1aee..5013db2a0170 100644 --- a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3RestUtils.java +++ b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3RestUtils.java @@ -317,14 +317,13 @@ public static void checkPathIsAlluxioDirectory(FileSystem fs, String bucketPath, */ public static void checkPathIsAlluxioDirectory(FileSystem fs, String bucketPath, 
@Nullable S3AuditContext auditContext, - Cache bucketPathCache) + Cache bucketPathCache) throws S3Exception { - AlluxioURI uri = new AlluxioURI(bucketPath); - if (Boolean.TRUE.equals(bucketPathCache.getIfPresent(uri))) { + if (Boolean.TRUE.equals(bucketPathCache.getIfPresent(bucketPath))) { return; } checkPathIsAlluxioDirectory(fs, bucketPath, auditContext); - bucketPathCache.put(uri, true); + bucketPathCache.put(bucketPath, true); } /** From 00ee8bbac1bf4bdd9f5fb2f6a20f444906917541 Mon Sep 17 00:00:00 2001 From: Rico Chiu Date: Tue, 18 Apr 2023 22:11:28 -0700 Subject: [PATCH 12/27] [DOCFIX] Update generated tables with docGen pr-link: Alluxio/alluxio#17283 change-id: cid-1814ff8004cffff4a3a09a12a996d423eead3927 --- docs/_data/table/common-configuration.csv | 3 +- docs/_data/table/en/common-configuration.yml | 4 +-- docs/_data/table/en/master-configuration.yml | 10 +++--- docs/_data/table/en/master-metrics.yml | 32 +++++++++++++++++++- docs/_data/table/master-configuration.csv | 7 +++-- docs/_data/table/master-metrics.csv | 15 +++++++++ 6 files changed, 58 insertions(+), 13 deletions(-) diff --git a/docs/_data/table/common-configuration.csv b/docs/_data/table/common-configuration.csv index c867283e17fa..4cefe874f052 100644 --- a/docs/_data/table/common-configuration.csv +++ b/docs/_data/table/common-configuration.csv @@ -83,7 +83,7 @@ alluxio.network.host.resolution.timeout,"5sec" alluxio.network.ip.address.used,"false" alluxio.proxy.audit.logging.enabled,"false" alluxio.proxy.s3.bucket.naming.restrictions.enabled,"false" -alluxio.proxy.s3.bucketpathcache.timeout,"1min" +alluxio.proxy.s3.bucketpathcache.timeout,"0min" alluxio.proxy.s3.complete.multipart.upload.keepalive.enabled,"false" alluxio.proxy.s3.complete.multipart.upload.keepalive.time.interval,"30sec" alluxio.proxy.s3.complete.multipart.upload.min.part.size,"5MB" @@ -118,7 +118,6 @@ alluxio.site.conf.dir,"${alluxio.conf.dir}/,${user.home}/.alluxio/,/etc/alluxio/ alluxio.site.conf.rocks.block.file,"" 
alluxio.site.conf.rocks.inode.file,"" alluxio.standalone.fuse.jvm.monitor.enabled,"false" -alluxio.standby.master.grpc.enabled,"false" alluxio.standby.master.metrics.sink.enabled,"false" alluxio.standby.master.web.enabled,"false" alluxio.table.catalog.path,"/catalog" diff --git a/docs/_data/table/en/common-configuration.yml b/docs/_data/table/en/common-configuration.yml index adc535d25647..201a3df495a5 100644 --- a/docs/_data/table/en/common-configuration.yml +++ b/docs/_data/table/en/common-configuration.yml @@ -167,7 +167,7 @@ alluxio.proxy.audit.logging.enabled: alluxio.proxy.s3.bucket.naming.restrictions.enabled: 'Toggles whether or not the Alluxio S3 API will enforce AWS S3 bucket naming restrictions. See https://docs.aws.amazon.com/AmazonS3/latest/userguide/bucketnamingrules.html.' alluxio.proxy.s3.bucketpathcache.timeout: - 'Expire bucket path statistics in cache for this time period. Set 0min to disable the cache.' + 'Expire bucket path statistics in cache for this time period. Set 0min to disable the cache. If enabling the cache, be careful that Alluxio S3 API will behave differently from AWS S3 API if bucket path cache entries become stale.' alluxio.proxy.s3.complete.multipart.upload.keepalive.enabled: 'Whether or not to enabled sending whitespace characters as a keepalive message during CompleteMultipartUpload. Enabling this will cause any errors to be silently ignored. However, the errors will appear in the Proxy logs.' alluxio.proxy.s3.complete.multipart.upload.keepalive.time.interval: @@ -236,8 +236,6 @@ alluxio.site.conf.rocks.inode.file: 'Path of file containing RocksDB inode store configuration. A template configuration cab be found at ${alluxio.conf.dir}/rocks-inode.ini.template. See https://github.com/facebook/rocksdb/blob/main/examples/rocksdb_option_file_example.ini for more information on RocksDB configuration files. If unset then a default configuration will be used.' 
alluxio.standalone.fuse.jvm.monitor.enabled: 'Whether to enable start JVM monitor thread on the standalone fuse process. This will start a thread to detect JVM-wide pauses induced by GC or other reasons.' -alluxio.standby.master.grpc.enabled: - 'Whether a standby master runs a grpc server' alluxio.standby.master.metrics.sink.enabled: 'Whether a standby master runs the metric sink' alluxio.standby.master.web.enabled: diff --git a/docs/_data/table/en/master-configuration.yml b/docs/_data/table/en/master-configuration.yml index 0a70499defc9..ea9fffca137f 100644 --- a/docs/_data/table/en/master-configuration.yml +++ b/docs/_data/table/en/master-configuration.yml @@ -72,6 +72,10 @@ alluxio.master.embedded.journal.retry.cache.expiry.time: 'The time for embedded journal server retry cache to expire. Setting a bigger value allows embedded journal server to cache the responses for a longer time in case of journal writer retries, but will take up more memory in master.' alluxio.master.embedded.journal.snapshot.replication.chunk.size: 'The stream chunk size used by masters to replicate snapshots.' +alluxio.master.embedded.journal.snapshot.replication.compression.level: + 'The zip compression level of sending a snapshot from one master to another. Only applicable when alluxio.master.embedded.journal.snapshot.replication.compression.type is not NO_COMPRESSION. The zip format defines ten levels of compression, ranging from 0 (no compression, but very fast) to 9 (best compression, but slow). Or -1 for the system default compression level.' +alluxio.master.embedded.journal.snapshot.replication.compression.type: + 'The type of compression to use when transferring a snapshot from one master to another. Options are NO_COMPRESSION, GZIP, TAR_GZIP' alluxio.master.embedded.journal.transport.max.inbound.message.size: 'The maximum size of a message that can be sent to the embedded journal server node.' 
alluxio.master.embedded.journal.transport.request.timeout.ms: @@ -126,8 +130,6 @@ alluxio.master.journal.init.from.backup: 'A uri for a backup to initialize the journal from. When the master becomes primary, if it sees that its journal is freshly formatted, it will restore its state from the backup. When running multiple masters, this property must be configured on all masters since it isn''t known during startup which master will become the first primary.' alluxio.master.journal.local.log.compaction: 'Whether to employ a quorum level log compaction policy or a local (individual) log compaction policy.' -alluxio.master.journal.log.concurrency.max: - 'Max concurrency for notifyTermIndexUpdated method, be sure it''s enough' alluxio.master.journal.log.size.bytes.max: 'If a log file is bigger than this value, it will rotate to next file.' alluxio.master.journal.request.data.timeout: @@ -244,8 +246,8 @@ alluxio.master.metastore.rocks.block.meta.cache.size: 'The capacity in bytes of the RocksDB block metadata table LRU cache. If unset, the RocksDB default will be used. See https://github.com/facebook/rocksdb/wiki/Block-Cache' alluxio.master.metastore.rocks.block.meta.index: 'The index type to be used in the RocksDB block metadata table. If unset, the RocksDB default will be used. See https://github.com/facebook/rocksdb/wiki/Index-Block-Format' -alluxio.master.metastore.rocks.checkpoint.compression.level: - 'The zip compression level of checkpointing rocksdb, the zip format defines ten levels of compression, ranging from 0 (no compression, but very fast) to 9 (best compression, but slow). Or -1 for the system default compression level.' +alluxio.master.metastore.rocks.checkpoint.compression.type: + 'The compression algorithm that RocksDB uses internally. 
One of {NO_COMPRESSION SNAPPY_COMPRESSION ZLIB_COMPRESSION BZLIB2_COMPRESSION LZ4_COMPRESSION LZ4HC_COMPRESSION XPRESS_COMPRESSION ZSTD_COMPRESSION DISABLE_COMPRESSION_OPTION}' alluxio.master.metastore.rocks.edge.block.index: 'The block index type to be used in the RocksDB inode edge table. If unset, the RocksDB default will be used. See https://rocksdb.org/blog/2018/08/23/data-block-hash-index.html' alluxio.master.metastore.rocks.edge.bloom.filter: diff --git a/docs/_data/table/en/master-metrics.yml b/docs/_data/table/en/master-metrics.yml index fed9d6810da1..f468d9db954c 100644 --- a/docs/_data/table/en/master-metrics.yml +++ b/docs/_data/table/en/master-metrics.yml @@ -48,8 +48,32 @@ Master.EdgeCacheSize: 'Total number of edges (inode metadata) cached. The edge cache is responsible for managing the mapping from (parentId, childName) to childId.' Master.EdgeLockPoolSize: 'The size of master edge lock pool' +Master.EmbeddedJournalLastSnapshotDownloadDiskSize: + 'Describes the size on disk of the snapshot downloaded from other masters in the cluster the previous time the download occurred. Only valid when using the embedded journal.' +Master.EmbeddedJournalLastSnapshotDownloadDurationMs: + 'Describes the amount of time taken to download journal snapshots from other masters in the cluster the previous time the download occurred. Only valid when using the embedded journal.' +Master.EmbeddedJournalLastSnapshotDownloadSize: + 'Describes the size of the snapshot downloaded from other masters in the cluster the previous time the download occurred. Only valid when using the embedded journal.' +Master.EmbeddedJournalLastSnapshotDurationMs: + 'Describes the amount of time taken to generate the last local journal snapshots on this master. Only valid when using the embedded journal.' +Master.EmbeddedJournalLastSnapshotEntriesCount: + 'Describes the number of entries in the last local journal snapshots on this master. Only valid when using the embedded journal.' 
+Master.EmbeddedJournalLastSnapshotReplayDurationMs: + 'Represents the time the last restore from checkpoint operation took in milliseconds.' +Master.EmbeddedJournalLastSnapshotReplayEntriesCount: + 'Represents the time the last restore from checkpoint operation took in milliseconds.' +Master.EmbeddedJournalLastSnapshotUploadDiskSize: + 'Describes the size on disk of the snapshot uploaded to other masters in the cluster the previous time the download occurred. Only valid when using the embedded journal.' +Master.EmbeddedJournalLastSnapshotUploadDurationMs: + 'Describes the amount of time taken to upload journal snapshots to another master in the cluster the previous time the upload occurred. Only valid when using the embedded journal.' +Master.EmbeddedJournalLastSnapshotUploadSize: + 'Describes the size of the snapshot uploaded to other masters in the cluster the previous time the download occurred. Only valid when using the embedded journal.' +Master.EmbeddedJournalSnapshotDownloadDiskHistogram: + 'Describes the size on disk of the snapshot downloaded from another master in the cluster. Only valid when using the embedded journal. Long running average.' Master.EmbeddedJournalSnapshotDownloadGenerate: - 'Describes the amount of time taken to download journal snapshots from other masters in the cluster. Only valid when using the embedded journal. Use this metric to determine if there are potential communication bottlenecks between Alluxio masters.' + 'Describes the amount of time taken to download journal snapshots from other masters in the cluster. Only valid when using the embedded journal. Long running average.' +Master.EmbeddedJournalSnapshotDownloadHistogram: + 'Describes the size of the snapshot downloaded from another master in the cluster. Only valid when using the embedded journal. Long running average.' Master.EmbeddedJournalSnapshotGenerateTimer: 'Describes the amount of time taken to generate local journal snapshots on this master. 
Only valid when using the embedded journal. Use this metric to measure the performance of Alluxio''s snapshot generation.' Master.EmbeddedJournalSnapshotInstallTimer: @@ -58,6 +82,12 @@ Master.EmbeddedJournalSnapshotLastIndex: 'Represents the latest journal index that was recorded by this master in the most recent local snapshot or from a snapshot downloaded from another master in the cluster. Only valid when using the embedded journal.' Master.EmbeddedJournalSnapshotReplayTimer: 'Describes the amount of time taken to replay a journal snapshot onto the master''s state machine. Only valid only when using the embedded journal. Use this metric to determine the performance of Alluxio when replaying journal snapshot file. Higher numbers may indicate a slow disk or CPU contention' +Master.EmbeddedJournalSnapshotUploadDiskHistogram: + 'Describes the size on disk of the snapshot uploaded to another master in the cluster. Only valid when using the embedded journal. Long running average.' +Master.EmbeddedJournalSnapshotUploadHistogram: + 'Describes the size of the snapshot uploaded to another master in the cluster. Only valid when using the embedded journal. Long running average.' +Master.EmbeddedJournalSnapshotUploadTimer: + 'Describes the amount of time taken to upload journal snapshots to another master in the cluster. Only valid when using the embedded journal. 
long running average' Master.FileBlockInfosGot: 'Total number of succeed GetFileBlockInfo operations' Master.FileInfosGot: diff --git a/docs/_data/table/master-configuration.csv b/docs/_data/table/master-configuration.csv index c0f0c81f8648..68bfd7c6697e 100644 --- a/docs/_data/table/master-configuration.csv +++ b/docs/_data/table/master-configuration.csv @@ -36,6 +36,8 @@ alluxio.master.embedded.journal.raft.client.request.timeout,"60sec" alluxio.master.embedded.journal.ratis.config,"" alluxio.master.embedded.journal.retry.cache.expiry.time,"60s" alluxio.master.embedded.journal.snapshot.replication.chunk.size,"4MB" +alluxio.master.embedded.journal.snapshot.replication.compression.level,"1" +alluxio.master.embedded.journal.snapshot.replication.compression.type,"NO_COMPRESSION" alluxio.master.embedded.journal.transport.max.inbound.message.size,"100MB" alluxio.master.embedded.journal.transport.request.timeout.ms,"5sec" alluxio.master.embedded.journal.unsafe.flush.enabled,"false" @@ -63,10 +65,9 @@ alluxio.master.journal.gc.period,"2min" alluxio.master.journal.gc.threshold,"5min" alluxio.master.journal.init.from.backup,"" alluxio.master.journal.local.log.compaction,"true" -alluxio.master.journal.log.concurrency.max,"256" alluxio.master.journal.log.size.bytes.max,"10MB" alluxio.master.journal.request.data.timeout,"20000" -alluxio.master.journal.request.info.timeout,"20000" +alluxio.master.journal.request.info.timeout,"10000" alluxio.master.journal.retry.interval,"1sec" alluxio.master.journal.space.monitor.interval,"10min" alluxio.master.journal.space.monitor.percent.free.threshold,"10" @@ -122,7 +123,7 @@ alluxio.master.metastore.rocks.block.meta.block.index,"" alluxio.master.metastore.rocks.block.meta.bloom.filter,"false" alluxio.master.metastore.rocks.block.meta.cache.size,"" alluxio.master.metastore.rocks.block.meta.index,"" -alluxio.master.metastore.rocks.checkpoint.compression.level,"1" +alluxio.master.metastore.rocks.checkpoint.compression.type,"LZ4_COMPRESSION" 
alluxio.master.metastore.rocks.edge.block.index,"" alluxio.master.metastore.rocks.edge.bloom.filter,"false" alluxio.master.metastore.rocks.edge.cache.size,"" diff --git a/docs/_data/table/master-metrics.csv b/docs/_data/table/master-metrics.csv index 4374cce3f9ca..593fc709f3ff 100644 --- a/docs/_data/table/master-metrics.csv +++ b/docs/_data/table/master-metrics.csv @@ -24,11 +24,26 @@ Master.EdgeCacheLoadTimes,GAUGE Master.EdgeCacheMisses,GAUGE Master.EdgeCacheSize,GAUGE Master.EdgeLockPoolSize,GAUGE +Master.EmbeddedJournalLastSnapshotDownloadDiskSize,GAUGE +Master.EmbeddedJournalLastSnapshotDownloadDurationMs,GAUGE +Master.EmbeddedJournalLastSnapshotDownloadSize,GAUGE +Master.EmbeddedJournalLastSnapshotDurationMs,GAUGE +Master.EmbeddedJournalLastSnapshotEntriesCount,GAUGE +Master.EmbeddedJournalLastSnapshotReplayDurationMs,GAUGE +Master.EmbeddedJournalLastSnapshotReplayEntriesCount,GAUGE +Master.EmbeddedJournalLastSnapshotUploadDiskSize,GAUGE +Master.EmbeddedJournalLastSnapshotUploadDurationMs,GAUGE +Master.EmbeddedJournalLastSnapshotUploadSize,GAUGE +Master.EmbeddedJournalSnapshotDownloadDiskHistogram,HISTOGRAM Master.EmbeddedJournalSnapshotDownloadGenerate,TIMER +Master.EmbeddedJournalSnapshotDownloadHistogram,HISTOGRAM Master.EmbeddedJournalSnapshotGenerateTimer,TIMER Master.EmbeddedJournalSnapshotInstallTimer,TIMER Master.EmbeddedJournalSnapshotLastIndex,GAUGE Master.EmbeddedJournalSnapshotReplayTimer,TIMER +Master.EmbeddedJournalSnapshotUploadDiskHistogram,HISTOGRAM +Master.EmbeddedJournalSnapshotUploadHistogram,HISTOGRAM +Master.EmbeddedJournalSnapshotUploadTimer,TIMER Master.FileBlockInfosGot,COUNTER Master.FileInfosGot,COUNTER Master.FileSize,GAUGE From 6750b547708699f92571c78c31397c9e2378c23b Mon Sep 17 00:00:00 2001 From: bingzheng Date: Wed, 19 Apr 2023 14:26:30 +0800 Subject: [PATCH 13/27] Fix the aggregate result problem in UfsIOBench ### What changes are proposed in this pull request? Fix the aggregate result problem in UfsIOBench. 
### Why are the changes needed? The UfsIOBench result summary calculates the total duration by summing each point's result, and computes the average speed based on total duration; actually the average can't represent the system performance. For example: An alluxio cluster with 1 job master and 3 job workers, Run the UfsIOBench with 16 threads: ``` ./bin/alluxio runUfsIOTest --path hdfs://hdfsCluster/tmp/ufsIoBanch --io-size 2G --threads 16 --cluster ``` WeChatWorkScreenshot_14f71ae8-abe3-410f-bc45-5d31e45e1691 the mTotalDurationSeconds is 16 * 3 *(points duration), but actually the 16 points read and write concurrently and finished almost at the same time. ### Does this PR introduce any user facing changes? Please list the user-facing changes introduced by your change, including No pr-link: Alluxio/alluxio#17063 change-id: cid-ff987441a28e7171eef9fdc765ca93aba3096887 --- .../alluxio/stress/worker/IOTaskSummary.java | 26 +++++++++++++------ .../stress/worker/IOTaskSummaryTest.java | 9 +++++-- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/dora/stress/common/src/main/java/alluxio/stress/worker/IOTaskSummary.java b/dora/stress/common/src/main/java/alluxio/stress/worker/IOTaskSummary.java index 1d02f7d9a4d2..27e6cd22a676 100644 --- a/dora/stress/common/src/main/java/alluxio/stress/worker/IOTaskSummary.java +++ b/dora/stress/common/src/main/java/alluxio/stress/worker/IOTaskSummary.java @@ -26,6 +26,7 @@ import org.slf4j.LoggerFactory; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; @@ -149,9 +150,15 @@ public void setWriteSpeedStat(SpeedStat stat) { public static class SpeedStat implements JsonSerializable { public double mTotalDurationSeconds; public long mTotalSizeBytes; + // Max speed among all nodes public double mMaxSpeedMbps; + // Min speed among all nodes public double mMinSpeedMbps; + // Average speed of all nodes public double mAvgSpeedMbps; + // Cluster-wide throughput + public 
double mClusterAvgSpeedMbps; + // Standard deviation of speed reported by each node public double mStdDev; /** @@ -162,9 +169,10 @@ public SpeedStat() {} @Override public String toString() { return String.format("{totalDuration=%ss, totalSize=%s, maxSpeed=%sMB/s, " - + "minSpeed=%sMB/s, " + "avgSpeed=%sMB/s, stdDev=%s}", + + "minSpeed=%sMB/s, " + "avgSpeed=%sMB/s, clusterAvgSpeed=%sMB/s, " + + "stdDev=%s}", mTotalDurationSeconds, FormatUtils.getSizeFromBytes(mTotalSizeBytes), - mMaxSpeedMbps, mMinSpeedMbps, mAvgSpeedMbps, mStdDev); + mMaxSpeedMbps, mMinSpeedMbps, mAvgSpeedMbps, mClusterAvgSpeedMbps, mStdDev); } } @@ -177,31 +185,33 @@ private static SpeedStat calculateStat(List points) { return result; } - double totalDuration = 0.0; long totalSize = 0L; double[] speeds = new double[points.size()]; double maxSpeed = 0.0; double minSpeed = Double.MAX_VALUE; int i = 0; for (IOTaskResult.Point p : points) { - totalDuration += p.mDurationSeconds; + result.mTotalDurationSeconds = Math.max(p.mDurationSeconds, result.mTotalDurationSeconds); totalSize += p.mDataSizeBytes; double speed = p.mDataSizeBytes / (p.mDurationSeconds * 1024 * 1024); // convert B/s to MB/s maxSpeed = Math.max(maxSpeed, speed); minSpeed = Math.min(minSpeed, speed); speeds[i++] = speed; } - double avgSpeed = totalSize / (totalDuration * 1024 * 1024); // convert B/s to MB/s + // calculate the average speed for each point + double avgPointSpeed = Arrays.stream(speeds).sum() / points.size(); + double avgClusterSpeed = totalSize + / (result.mTotalDurationSeconds * 1024 * 1024); // convert B/s to MB/s double var = 0; for (double s : speeds) { - var += (s - avgSpeed) * (s - avgSpeed); + var += (s - avgPointSpeed) * (s - avgPointSpeed); } - result.mTotalDurationSeconds = totalDuration; result.mTotalSizeBytes = totalSize; result.mMaxSpeedMbps = maxSpeed; result.mMinSpeedMbps = Double.compare(minSpeed, Double.MAX_VALUE) == 0 ? 
0.0 : minSpeed; - result.mAvgSpeedMbps = avgSpeed; + result.mAvgSpeedMbps = avgPointSpeed; + result.mClusterAvgSpeedMbps = avgClusterSpeed; result.mStdDev = Math.sqrt(var); return result; diff --git a/dora/stress/common/src/test/java/alluxio/stress/worker/IOTaskSummaryTest.java b/dora/stress/common/src/test/java/alluxio/stress/worker/IOTaskSummaryTest.java index e8cd17549ada..dc936bd63430 100644 --- a/dora/stress/common/src/test/java/alluxio/stress/worker/IOTaskSummaryTest.java +++ b/dora/stress/common/src/test/java/alluxio/stress/worker/IOTaskSummaryTest.java @@ -73,16 +73,19 @@ public void statCalculation() { IOTaskResult result = new IOTaskResult(); double[] durations = new double[]{1.0, 1.5, 2.0, 1.11}; long[] sizes = new long[]{200_000_000, 300_000_000, 500_000_000, 800_000_000}; + double[] speeds = new double[sizes.length]; for (int i = 0; i < sizes.length; i++) { result.addPoint(new IOTaskResult.Point(IOTaskResult.IOMode.READ, durations[i], sizes[i])); result.addPoint(new IOTaskResult.Point(IOTaskResult.IOMode.WRITE, durations[i], sizes[i])); + speeds[i] = sizes[i] / (durations[i] * 1024 * 1024); } IOTaskSummary summary = new IOTaskSummary(result); IOTaskSummary.SpeedStat readStat = summary.getReadSpeedStat(); - double totalDuration = Arrays.stream(durations).sum(); + double totalDuration = Arrays.stream(durations).max().orElse(0L); long totalSize = Arrays.stream(sizes).sum(); - double avgSpeed = totalSize / (totalDuration * 1024 * 1024); + double avgSpeed = Arrays.stream(speeds).sum() / speeds.length; + double clusterAvgSpeed = totalSize / (2.0 * 1024 * 1024); double maxSpeed = 800_000_000 / (1.11 * 1024 * 1024); double minSpeed = 200_000_000 / (1.0 * 1024 * 1024); assertEquals(totalDuration, readStat.mTotalDurationSeconds, 1e-5); @@ -90,6 +93,7 @@ public void statCalculation() { assertEquals(avgSpeed, readStat.mAvgSpeedMbps, 1e-5); assertEquals(maxSpeed, readStat.mMaxSpeedMbps, 1e-5); assertEquals(minSpeed, readStat.mMinSpeedMbps, 1e-5); + 
assertEquals(clusterAvgSpeed, readStat.mClusterAvgSpeedMbps, 1e-5); IOTaskSummary.SpeedStat writeStat = summary.getWriteSpeedStat(); assertEquals(totalDuration, writeStat.mTotalDurationSeconds, 1e-5); @@ -97,6 +101,7 @@ public void statCalculation() { assertEquals(avgSpeed, writeStat.mAvgSpeedMbps, 1e-5); assertEquals(maxSpeed, writeStat.mMaxSpeedMbps, 1e-5); assertEquals(minSpeed, writeStat.mMinSpeedMbps, 1e-5); + assertEquals(clusterAvgSpeed, writeStat.mClusterAvgSpeedMbps, 1e-5); } private void checkEquality(IOTaskSummary.SpeedStat a, IOTaskSummary.SpeedStat b) { From 28c5d83e4821351b07cbf82fb325c8cf4c78401d Mon Sep 17 00:00:00 2001 From: David Zhu Date: Wed, 19 Apr 2023 10:29:14 -0700 Subject: [PATCH 14/27] Use SlidingTimeWindow Moving Average for meter computation ### What changes are proposed in this pull request? The exponential moving average used for meter can produce inaccurate rate when computing transfer rate. The reason is that we call mark(number of bytes) when a transfer is finished to indicate how many bytes have been transfered, but that is inaccurate in calculating an average rate which is really what we want in a Bytes/Sec calculation. ### Why are the changes needed? SlidingTimewindow produces a moving average that is more correct especially when a short window is used https://www.javadoc.io/static/io.dropwizard.metrics/metrics-core/4.1.1/com/codahale/metrics/SlidingTimeWindowMovingAverages.html ### Does this PR introduce any user facing changes? 
no pr-link: Alluxio/alluxio#17218 change-id: cid-d21fdf406c78f7ddcad011c117b77ef6b1bf46c7 --- .../src/main/java/alluxio/metrics/MetricsSystem.java | 11 +++++++---- .../src/main/java/alluxio/underfs/UfsIOManager.java | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/dora/core/common/src/main/java/alluxio/metrics/MetricsSystem.java b/dora/core/common/src/main/java/alluxio/metrics/MetricsSystem.java index 605ec1d2f246..ccfbfadabc00 100644 --- a/dora/core/common/src/main/java/alluxio/metrics/MetricsSystem.java +++ b/dora/core/common/src/main/java/alluxio/metrics/MetricsSystem.java @@ -26,6 +26,7 @@ import com.codahale.metrics.Gauge; import com.codahale.metrics.Meter; import com.codahale.metrics.MetricRegistry; +import com.codahale.metrics.SlidingTimeWindowMovingAverages; import com.codahale.metrics.Timer; import com.codahale.metrics.UniformReservoir; import com.codahale.metrics.jvm.CachedThreadStatesGaugeSet; @@ -92,6 +93,7 @@ public final class MetricsSystem { CommonUtils.memoize(() -> constructSourceName()); private static final Map EXECUTOR_SERVICES = new ConcurrentHashMap<>(); + private static final int SECONDS_IN_A_MINUTE = 60; /** * An enum of supported instance type. @@ -604,7 +606,8 @@ public static Counter counterWithTags(String name, boolean shouldReport, String. * @return a meter object with the qualified metric name */ public static Meter meter(String name) { - return METRIC_REGISTRY.meter(getMetricName(name)); + return METRIC_REGISTRY.meter(getMetricName(name), + () -> new Meter(new SlidingTimeWindowMovingAverages())); } /** @@ -799,7 +802,7 @@ private static synchronized List reportMetrics(InstanceType // that a value marked. For clients, especially short-life clients, // the minute rates will be zero for their whole life. // That's why all throughput meters are not aggregated at cluster level. 
- rpcMetrics.add(Metric.from(entry.getKey(), meter.getOneMinuteRate(), + rpcMetrics.add(Metric.from(entry.getKey(), meter.getOneMinuteRate() / SECONDS_IN_A_MINUTE, MetricType.METER).toProto()); } else if (metric instanceof Timer) { Timer timer = (Timer) metric; @@ -884,7 +887,7 @@ private static Metric getAlluxioMetricFromCodahaleMetric(String name, return Metric.from(name, counter.getCount(), MetricType.COUNTER); } else if (metric instanceof Meter) { Meter meter = (Meter) metric; - return Metric.from(name, meter.getOneMinuteRate(), MetricType.METER); + return Metric.from(name, meter.getOneMinuteRate() / SECONDS_IN_A_MINUTE, MetricType.METER); } else if (metric instanceof Timer) { Timer timer = (Timer) metric; return Metric.from(name, timer.getCount(), MetricType.TIMER); @@ -916,7 +919,7 @@ public static Map allMetrics() { .setDoubleValue(((Counter) metric).getCount()); } else if (metric instanceof Meter) { valueBuilder.setMetricType(MetricType.METER) - .setDoubleValue(((Meter) metric).getOneMinuteRate()); + .setDoubleValue(((Meter) metric).getOneMinuteRate() / SECONDS_IN_A_MINUTE); } else if (metric instanceof Timer) { valueBuilder.setMetricType(MetricType.TIMER) .setDoubleValue(((Timer) metric).getCount()); diff --git a/dora/core/server/worker/src/main/java/alluxio/underfs/UfsIOManager.java b/dora/core/server/worker/src/main/java/alluxio/underfs/UfsIOManager.java index dd637287864f..38578122ee06 100644 --- a/dora/core/server/worker/src/main/java/alluxio/underfs/UfsIOManager.java +++ b/dora/core/server/worker/src/main/java/alluxio/underfs/UfsIOManager.java @@ -123,7 +123,7 @@ private void schedule() { */ @VisibleForTesting public double getUsedThroughput(Meter meter) { - return meter.getOneMinuteRate(); + return meter.getOneMinuteRate() / 60; } /** From 6829312f2501be888f6701a6997b5f78380464e9 Mon Sep 17 00:00:00 2001 From: Rico Chiu Date: Wed, 19 Apr 2023 11:42:51 -0700 Subject: [PATCH 15/27] cherry-pick empty: Skip unrelated property keys from generating in 
docGen pt2 pr-link: Alluxio/alluxio#17284 change-id: cid-b6800a0be10f9ef7b24b28c099a7431e52b98c6d From 8b8fe464872883e32aa21abde72081f5a1c83862 Mon Sep 17 00:00:00 2001 From: Yaolong Liu Date: Thu, 20 Apr 2023 03:16:10 +0800 Subject: [PATCH 16/27] Integrate ratis-shell to the alluxio tarball Fix https://github.com/Alluxio/alluxio/issues/16943 pr-link: Alluxio/alluxio#16980 change-id: cid-ee63db7b1a00a46034bd9e44a35241a45ed98689 --- .../cmd/generate-tarball.go | 2 ++ integration/tools/ratis-shell/README.md | 5 ++++ .../tools/ratis-shell/install-ratis-shell.sh | 29 +++++++++++++++++++ 3 files changed, 36 insertions(+) create mode 100644 integration/tools/ratis-shell/README.md create mode 100755 integration/tools/ratis-shell/install-ratis-shell.sh diff --git a/dev/scripts/src/alluxio.org/build-distribution/cmd/generate-tarball.go b/dev/scripts/src/alluxio.org/build-distribution/cmd/generate-tarball.go index 0e10eb313b0c..68f83a657090 100644 --- a/dev/scripts/src/alluxio.org/build-distribution/cmd/generate-tarball.go +++ b/dev/scripts/src/alluxio.org/build-distribution/cmd/generate-tarball.go @@ -220,6 +220,8 @@ func addAdditionalFiles(srcPath, dstPath string, hadoopVersion version, version "integration/metrics/otel-agent-config-worker.yaml", "integration/metrics/otel-collector-config.yaml", "integration/metrics/prometheus.yaml", + "integration/tools/ratis-shell/install-ratis-shell.sh", + "integration/tools/ratis-shell/README.md", ) } diff --git a/integration/tools/ratis-shell/README.md b/integration/tools/ratis-shell/README.md new file mode 100644 index 000000000000..47df3ac7e96d --- /dev/null +++ b/integration/tools/ratis-shell/README.md @@ -0,0 +1,5 @@ +### Apache Ratis Shell +`ratis-shell` can manage the ha of alluxio master, you can quickly install it by +running `install-ratis-shell.sh`. 
+ +For more doc, please refer to [RATIS SHELL DOC](https://github.com/apache/ratis/blob/master/ratis-docs/src/site/markdown/cli.md) \ No newline at end of file diff --git a/integration/tools/ratis-shell/install-ratis-shell.sh b/integration/tools/ratis-shell/install-ratis-shell.sh new file mode 100755 index 000000000000..0eb0470b1047 --- /dev/null +++ b/integration/tools/ratis-shell/install-ratis-shell.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# +# The Alluxio Open Foundation licenses this work under the Apache License, version 2.0 +# (the "License"). You may not use this work except in compliance with the License, which is +# available at www.apache.org/licenses/LICENSE-2.0 +# +# This software is distributed on an "AS IS" basis, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +# either express or implied, as more fully set forth in the License. +# +# See the NOTICE file distributed with this work for information regarding copyright ownership. +# + +# +# This script is run from inside the Docker container +# +set -ex + +RATIS_SHELL_DIR=$(cd "$( dirname "$( readlink "$0" || echo "$0" )" )"; pwd) +# ratis-shell version +VERSION=$1 +if [ -z "$VERSION" ]; then + VERSION=2.4.1 +fi + +wget -P "$RATIS_SHELL_DIR" "https://dlcdn.apache.org/ratis/$VERSION/apache-ratis-$VERSION-bin.tar.gz" +mkdir ratis-cli +tar -zxvf apache-ratis-$VERSION-bin.tar.gz -C $RATIS_SHELL_DIR/ratis-cli --strip-component 1 +chmod 755 ratis-cli/bin/ratis +rm apache-ratis-$VERSION-bin.tar.gz From fd6d1f64633bd47ef23f1c69e26d23f8b486fde5 Mon Sep 17 00:00:00 2001 From: Shawn Sun <32376495+ssz1997@users.noreply.github.com> Date: Wed, 19 Apr 2023 13:39:21 -0700 Subject: [PATCH 17/27] [DOCFIX] Add newline at EOF One file is missing the newline at end of file. This PR fixes it. 
pr-link: Alluxio/alluxio#17294 change-id: cid-db2278515ec362341914ad7e39ec30c3d4f72ef2 --- integration/tools/ratis-shell/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration/tools/ratis-shell/README.md b/integration/tools/ratis-shell/README.md index 47df3ac7e96d..4e4111181208 100644 --- a/integration/tools/ratis-shell/README.md +++ b/integration/tools/ratis-shell/README.md @@ -2,4 +2,4 @@ `ratis-shell` can manage the ha of alluxio master, you can quickly install it by running `install-ratis-shell.sh`. -For more doc, please refer to [RATIS SHELL DOC](https://github.com/apache/ratis/blob/master/ratis-docs/src/site/markdown/cli.md) \ No newline at end of file +For more doc, please refer to [RATIS SHELL DOC](https://github.com/apache/ratis/blob/master/ratis-docs/src/site/markdown/cli.md) From 66863391afd1cf75570ce0dffa6237bce3be355e Mon Sep 17 00:00:00 2001 From: elega <445092967@qq.com> Date: Thu, 20 Apr 2023 13:18:33 +0800 Subject: [PATCH 18/27] Fix FileSystemMergeJournalContext related ### What changes are proposed in this pull request? 1. Only force flushing journals when a locked inode path is closed for FileSystemMergeJournalContext 2. Fix the journal merger broken logic where updating the fingerprint for a directory will be ignored. ### Why are the changes needed? We used to fix the merge journal context in https://github.com/Alluxio/alluxio/pull/17071, where a regular non-merging journal context is used when listStatus() is called. However, if a listStatus triggers a metadata sync and MASTER_FILE_SYSTEM_MERGE_INODE_JOURNALS is set to true, journals will be flushed on every lockedInodePath close during the metadata sync. This behavior leads to journals being flushed too many times and impairs the metadata sync performance. Also we found a minor issue that when inode directory journals are merged, the fingerprint will be ignored. This is because inode directory journal does not have a fingerprint field. 
### Does this PR introduce any user facing changes? N/A pr-link: Alluxio/alluxio#17251 change-id: cid-23ea3229e6781483ab582c3f4d4c9c4f61cdf634 --- .../file/FileSystemJournalEntryMerger.java | 8 +++++ .../master/file/meta/LockedInodePath.java | 11 ++++-- .../FileSystemJournalEntryMergerTest.java | 35 +++++++++++++++++++ .../master/file/meta/LockedInodePathTest.java | 3 +- 4 files changed, 53 insertions(+), 4 deletions(-) diff --git a/dora/core/server/master/src/main/java/alluxio/master/file/FileSystemJournalEntryMerger.java b/dora/core/server/master/src/main/java/alluxio/master/file/FileSystemJournalEntryMerger.java index 5c8df09dace5..c0910746730a 100644 --- a/dora/core/server/master/src/main/java/alluxio/master/file/FileSystemJournalEntryMerger.java +++ b/dora/core/server/master/src/main/java/alluxio/master/file/FileSystemJournalEntryMerger.java @@ -70,6 +70,14 @@ else if ( MutableInodeDirectory.fromJournalEntry(existingEntry.getInodeDirectory()); if (entry.hasUpdateInode()) { inodeDirectory.updateFromEntry(entry.getUpdateInode()); + // The merged inode directory entry cannot carry a directory fingerprint, + // so we also add the new UpdateInode journal entry to the list to preserve + // the fingerprint update, + // while still merging it into the existing inode directory on a best-effort basis. 
+ if (entry.getUpdateInode().hasUfsFingerprint() + && !entry.getUpdateInode().getUfsFingerprint().equals("")) { + mJournalEntries.add(entry); + } } else if (entry.hasUpdateInodeDirectory()) { inodeDirectory.updateFromEntry(entry.getUpdateInodeDirectory()); } diff --git a/dora/core/server/master/src/main/java/alluxio/master/file/meta/LockedInodePath.java b/dora/core/server/master/src/main/java/alluxio/master/file/meta/LockedInodePath.java index c9aee1e81168..7c66b86a5e1f 100644 --- a/dora/core/server/master/src/main/java/alluxio/master/file/meta/LockedInodePath.java +++ b/dora/core/server/master/src/main/java/alluxio/master/file/meta/LockedInodePath.java @@ -20,6 +20,7 @@ import alluxio.exception.InvalidPathException; import alluxio.exception.status.UnavailableException; import alluxio.master.file.meta.InodeTree.LockPattern; +import alluxio.master.journal.FileSystemMergeJournalContext; import alluxio.master.journal.JournalContext; import alluxio.master.metastore.ReadOnlyInodeStore; import alluxio.resource.AlluxioResourceLeakDetectorFactory; @@ -85,9 +86,7 @@ public class LockedInodePath implements Closeable { @Nullable private final ResourceLeakTracker mTracker; /** To determine if we should flush the journals when lock is released or scope reduced. */ - private final boolean mMergeInodeJournals = Configuration.getBoolean( - PropertyKey.MASTER_FILE_SYSTEM_MERGE_INODE_JOURNALS - ); + private final boolean mMergeInodeJournals; /** * Keeps a reference of JournalContext and flushes it before the lock is released. 
@@ -159,6 +158,9 @@ public LockedInodePath(AlluxioURI uri, ReadOnlyInodeStore inodeStore, mLockList = new SimpleInodeLockList(inodeLockManager, mUseTryLock); mTracker = DETECTOR.track(this); mJournalContext = journalContext; + mMergeInodeJournals = Configuration.getBoolean( + PropertyKey.MASTER_FILE_SYSTEM_MERGE_INODE_JOURNALS + ) && mJournalContext instanceof FileSystemMergeJournalContext; } /** @@ -184,6 +186,9 @@ private LockedInodePath(AlluxioURI uri, LockedInodePath path, String[] pathCompo // So the new created LockInodePath instance must be on the same thread with // the original one and hence they will use the same JournalContext. mJournalContext = path.mJournalContext; + mMergeInodeJournals = Configuration.getBoolean( + PropertyKey.MASTER_FILE_SYSTEM_MERGE_INODE_JOURNALS + ) && mJournalContext instanceof FileSystemMergeJournalContext; } /** diff --git a/dora/core/server/master/src/test/java/alluxio/master/file/FileSystemJournalEntryMergerTest.java b/dora/core/server/master/src/test/java/alluxio/master/file/FileSystemJournalEntryMergerTest.java index d2c1e77bdb80..746404a26b6e 100644 --- a/dora/core/server/master/src/test/java/alluxio/master/file/FileSystemJournalEntryMergerTest.java +++ b/dora/core/server/master/src/test/java/alluxio/master/file/FileSystemJournalEntryMergerTest.java @@ -13,6 +13,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; import alluxio.AlluxioURI; import alluxio.master.block.BlockId; @@ -112,4 +113,38 @@ public void testFileSystemJournalEntryMerger() { merger.clear(); assertEquals(0, merger.getMergedJournalEntries().size()); } + + @Test + public void testMergeDirectoryFingerprint() { + AlluxioURI uri = new AlluxioURI("/dir/test1"); + + FileSystemJournalEntryMerger merger = new FileSystemJournalEntryMerger(); + + merger.add(Journal.JournalEntry.newBuilder().setInodeDirectory( + File.InodeDirectoryEntry.newBuilder().setId(1).setParentId(0) + 
.setPersistenceState(PersistenceState.PERSISTED.name()) + .setName("test_dir").setPath("test_dir").build()).build()); + + merger.add(Journal.JournalEntry.newBuilder().setUpdateInodeDirectory( + File.UpdateInodeDirectoryEntry.newBuilder().setId(1) + .setDirectChildrenLoaded(true).build()).build()); + + merger.add(Journal.JournalEntry.newBuilder().setUpdateInode( + File.UpdateInodeEntry.newBuilder().setId(1) + .setName("test_dir_updated") + .setUfsFingerprint("fingerprint") + .build()).build()); + + List entries = merger.getMergedJournalEntries(); + Journal.JournalEntry entry = entries.get(0); + assertNotNull(entry.getInodeDirectory()); + assertEquals(1, entry.getInodeDirectory().getId()); + assertEquals("test_dir_updated", entry.getInodeDirectory().getName()); + assertEquals("test_dir", entry.getInodeDirectory().getPath()); + assertTrue(entry.getInodeDirectory().getDirectChildrenLoaded()); + + Journal.JournalEntry entry2 = entries.get(1); + assertNotNull(entry2.getUpdateInode()); + assertEquals("fingerprint", entry2.getUpdateInode().getUfsFingerprint()); + } } diff --git a/dora/core/server/master/src/test/java/alluxio/master/file/meta/LockedInodePathTest.java b/dora/core/server/master/src/test/java/alluxio/master/file/meta/LockedInodePathTest.java index b071694b5824..d56512ee55d5 100644 --- a/dora/core/server/master/src/test/java/alluxio/master/file/meta/LockedInodePathTest.java +++ b/dora/core/server/master/src/test/java/alluxio/master/file/meta/LockedInodePathTest.java @@ -24,6 +24,7 @@ import alluxio.exception.InvalidPathException; import alluxio.exception.status.UnavailableException; import alluxio.master.file.meta.InodeTree.LockPattern; +import alluxio.master.journal.FileSystemMergeJournalContext; import alluxio.master.journal.JournalContext; import alluxio.master.journal.NoopJournalContext; @@ -598,7 +599,7 @@ public void lockFinalEdgeWriteAlreadyLocked() throws Exception { @Test public void testFlushJournal() throws InvalidPathException, UnavailableException 
{ AtomicInteger journalFlushCount = new AtomicInteger(); - JournalContext journalContext = mock(JournalContext.class); + JournalContext journalContext = mock(FileSystemMergeJournalContext.class); Mockito.doAnswer( (mock) -> { journalFlushCount.getAndIncrement(); From 2a438a8b2f406b46ff601e2a1d346b7f9a791b60 Mon Sep 17 00:00:00 2001 From: maobaolong <307499405@qq.com> Date: Fri, 21 Apr 2023 16:36:19 +0800 Subject: [PATCH 19/27] Print journal system information to help diagnose stop reason ### What changes are proposed in this pull request? When journal stop, print the reason, so user can find out why it stop. Something like journal is not formatted. ### Why are the changes needed? Show the journal stop reason pr-link: Alluxio/alluxio#17299 change-id: cid-a5938fcd3b664eb873ad5e380f4e047b038621ee --- .../main/java/alluxio/master/journal/AbstractJournalSystem.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dora/core/server/common/src/main/java/alluxio/master/journal/AbstractJournalSystem.java b/dora/core/server/common/src/main/java/alluxio/master/journal/AbstractJournalSystem.java index 31ee1d00a96d..871e3d501642 100644 --- a/dora/core/server/common/src/main/java/alluxio/master/journal/AbstractJournalSystem.java +++ b/dora/core/server/common/src/main/java/alluxio/master/journal/AbstractJournalSystem.java @@ -49,7 +49,7 @@ public synchronized void start() { @Override public synchronized void stop() { - Preconditions.checkState(mRunning, "Journal is not running"); + Preconditions.checkState(mRunning, "Journal is not running : " + this); mAllJournalSinks.forEach(JournalSink::beforeShutdown); mRunning = false; stopInternal(); From 6a87b24c281a2445deccc44aed78439f5eca257e Mon Sep 17 00:00:00 2001 From: maobaolong <307499405@qq.com> Date: Sat, 22 Apr 2023 12:18:45 +0800 Subject: [PATCH 20/27] Do not pass sync interval by default ### What changes are proposed in this pull request? 
Before this PR, if we did not config `alluxio.user.file.metadata.sync.interval` in client side, and config it in master side while we disable the `alluxio.user.conf.cluster.default.enabled` meanwhile, the metadata sync interval will always set to `-1`. After this PR, when `alluxio.user.conf.cluster.default.enabled=false`, master side `alluxio.user.file.metadata.sync.interval` will be accepted if user didn't set it. image ### Does this PR introduce any user facing changes? No pr-link: Alluxio/alluxio#17182 change-id: cid-b54bf51844aba97544237370748d3a2e1c74ec91 --- .../java/alluxio/util/FileSystemOptionsUtils.java | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dora/core/client/fs/src/main/java/alluxio/util/FileSystemOptionsUtils.java b/dora/core/client/fs/src/main/java/alluxio/util/FileSystemOptionsUtils.java index 9480d88ca004..56520f8bfb77 100644 --- a/dora/core/client/fs/src/main/java/alluxio/util/FileSystemOptionsUtils.java +++ b/dora/core/client/fs/src/main/java/alluxio/util/FileSystemOptionsUtils.java @@ -185,12 +185,14 @@ public static FileSystemMasterCommonPOptions commonDefaults(AlluxioConfiguration public static FileSystemMasterCommonPOptions commonDefaults(AlluxioConfiguration conf, boolean withOpId) { FileSystemMasterCommonPOptions.Builder builder = FileSystemMasterCommonPOptions.newBuilder() - .setSyncIntervalMs(conf.getMs(PropertyKey.USER_FILE_METADATA_SYNC_INTERVAL)) .setTtl(conf.getMs(PropertyKey.USER_FILE_CREATE_TTL)) .setTtlAction(conf.getEnum(PropertyKey.USER_FILE_CREATE_TTL_ACTION, TtlAction.class)); if (withOpId && conf.getBoolean(PropertyKey.USER_FILE_INCLUDE_OPERATION_ID)) { builder.setOperationId(new OperationId(UUID.randomUUID()).toFsProto()); } + if (conf.isSetByUser(PropertyKey.USER_FILE_METADATA_SYNC_INTERVAL)) { + builder.setSyncIntervalMs(conf.getMs(PropertyKey.USER_FILE_METADATA_SYNC_INTERVAL)); + } return builder.build(); } @@ -342,10 +344,12 @@ public static SetAttributePOptions 
setAttributeClientDefaults(AlluxioConfigurati // Specifically set and override *only* the metadata sync interval // Setting other attributes by default will make the server think the user is intentionally // setting the values. Most fields withinSetAttributePOptions are set by inclusion + FileSystemMasterCommonPOptions.Builder builder = FileSystemMasterCommonPOptions.newBuilder(); + if (conf.isSetByUser(PropertyKey.USER_FILE_METADATA_SYNC_INTERVAL)) { + builder.setSyncIntervalMs(conf.getMs(PropertyKey.USER_FILE_METADATA_SYNC_INTERVAL)); + } return SetAttributePOptions.newBuilder() - .setCommonOptions(FileSystemMasterCommonPOptions.newBuilder() - .setSyncIntervalMs(conf.getMs(PropertyKey.USER_FILE_METADATA_SYNC_INTERVAL)) - .build()) + .setCommonOptions(builder.build()) .build(); } From 3b4fb9c72cc9ca03ff988c311c7f68f6fed26c5a Mon Sep 17 00:00:00 2001 From: Bowen Ding <6999708+dbw9580@users.noreply.github.com> Date: Tue, 25 Apr 2023 10:55:27 +0800 Subject: [PATCH 21/27] cherry-pick empty: Fix infinite recursion with RefCountedNioByteBuf.setBytes Fix an infinite recursion with `RefCountedNioByteBuf.setBytes` which causes stack overflow error. The current implementation of `RefCountedNioByteBuf.setBytes(int index, ByteBuf src, int srcIndex, int length)` delegates the copying of bytes to the `getBytes(int index, ByteBuf dst, int dstIndex, int length)` method of the source buffer. In some implementations of a direct `ByteBuf`, e.g. 
[`io.netty.buffer.UnpooledUnsafeDirectByteBuf`](https://github.com/netty/netty/blob/d773f37e3422b8bc38429bbde94583173c3b7e4a/buffer/src/main/java/io/netty/buffer/UnpooledUnsafeDirectByteBuf.java), the `getBytes(int index, ByteBuf src, int srcIndex, int length)` method in turn delegates the call back to the `setBytes` method of the destination buffer ([here](https://github.com/netty/netty/blob/d773f37e3422b8bc38429bbde94583173c3b7e4a/buffer/src/main/java/io/netty/buffer/UnpooledUnsafeDirectByteBuf.java#L159) and [here](https://github.com/netty/netty/blob/d773f37e3422b8bc38429bbde94583173c3b7e4a/buffer/src/main/java/io/netty/buffer/UnsafeByteBufUtil.java#L464)). This causes an infinite recursion. Error stack when stack overflow: ``` Exception in thread "main" java.lang.StackOverflowError at io.netty.buffer.AbstractByteBuf.ensureAccessible(AbstractByteBuf.java:1488) at alluxio.network.protocol.databuffer.RefCountedNioByteBuf.ensureIndexInBounds(RefCountedNioByteBuf.java:418) at alluxio.network.protocol.databuffer.RefCountedNioByteBuf.setBytes(RefCountedNioByteBuf.java:283) at alluxio.network.protocol.databuffer.PooledDirectNioByteBuf.setBytes(PooledDirectNioByteBuf.java:22) at io.netty.buffer.UnsafeByteBufUtil.getBytes(UnsafeByteBufUtil.java:476) at io.netty.buffer.PooledUnsafeDirectByteBuf.getBytes(PooledUnsafeDirectByteBuf.java:124) at alluxio.network.protocol.databuffer.RefCountedNioByteBuf.setBytes(RefCountedNioByteBuf.java:284) at alluxio.network.protocol.databuffer.PooledDirectNioByteBuf.setBytes(PooledDirectNioByteBuf.java:22) at io.netty.buffer.UnsafeByteBufUtil.getBytes(UnsafeByteBufUtil.java:476) at io.netty.buffer.PooledUnsafeDirectByteBuf.getBytes(PooledUnsafeDirectByteBuf.java:124) at alluxio.network.protocol.databuffer.RefCountedNioByteBuf.setBytes(RefCountedNioByteBuf.java:284) at alluxio.network.protocol.databuffer.PooledDirectNioByteBuf.setBytes(PooledDirectNioByteBuf.java:22) ... ``` An unit test is added to cover this case. No. 
pr-link: Alluxio/alluxio#17311 change-id: cid-3f9afa7df0b8cddb8c127c40bbfeb240bdc93553 From 915ee8323f027eb9804dd29ea4ea5d91dfbc2fcc Mon Sep 17 00:00:00 2001 From: Haoning Sun Date: Thu, 27 Apr 2023 12:55:23 +0800 Subject: [PATCH 22/27] Avoid null when using BlockMasterClientPool ### What changes are proposed in this pull request? Avoid using mBlockMasterClientPool directly. ### Why are the changes needed? Using mBlockMasterClientPool during reinit may throw an exception. pr-link: Alluxio/alluxio#17326 change-id: cid-1b43697d52fe43065b71208cd5594c248a8c4fe9 --- .../main/java/alluxio/client/file/FileSystemContext.java | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/dora/core/client/fs/src/main/java/alluxio/client/file/FileSystemContext.java b/dora/core/client/fs/src/main/java/alluxio/client/file/FileSystemContext.java index a126428b50cc..605173f0ba0b 100644 --- a/dora/core/client/fs/src/main/java/alluxio/client/file/FileSystemContext.java +++ b/dora/core/client/fs/src/main/java/alluxio/client/file/FileSystemContext.java @@ -885,11 +885,9 @@ private void initializeLocalWorker() throws IOException { */ private List getWorkerAddresses() throws IOException { List infos; - BlockMasterClient blockMasterClient = mBlockMasterClientPool.acquire(); - try { - infos = blockMasterClient.getWorkerInfoList(); - } finally { - mBlockMasterClientPool.release(blockMasterClient); + try (CloseableResource masterClientResource = + acquireBlockMasterClientResource()) { + infos = masterClientResource.get().getWorkerInfoList(); } if (infos.isEmpty()) { throw new UnavailableException(ExceptionMessage.NO_WORKER_AVAILABLE.getMessage()); From 482735a15eee57511b2ea6dcdbf8d5f6bed2ceca Mon Sep 17 00:00:00 2001 From: Xinran Dong <81548653+007DXR@users.noreply.github.com> Date: Fri, 28 Apr 2023 16:49:50 +0800 Subject: [PATCH 23/27] Reduce redundant calls in getObject of S3 API the same modification as #16655 pr-link: Alluxio/alluxio#17356 change-id: 
cid-a410d4d5645b6fd24a89c145b22c33eb22b7b1a8 --- .../proxy/src/main/java/alluxio/proxy/s3/S3ObjectTask.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3ObjectTask.java b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3ObjectTask.java index 78450c296eb6..671a11c49e86 100644 --- a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3ObjectTask.java +++ b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3ObjectTask.java @@ -28,6 +28,7 @@ import alluxio.grpc.CreateDirectoryPOptions; import alluxio.grpc.CreateFilePOptions; import alluxio.grpc.DeletePOptions; +import alluxio.grpc.OpenFilePOptions; import alluxio.grpc.PMode; import alluxio.grpc.RenamePOptions; import alluxio.grpc.S3SyntaxOptions; @@ -311,7 +312,7 @@ public Response continueTask() { mOPType.name(), user, mHandler.getBucket(), mHandler.getObject())) { try { URIStatus status = userFs.getStatus(objectUri); - FileInStream is = userFs.openFile(objectUri); + FileInStream is = userFs.openFile(status, OpenFilePOptions.getDefaultInstance()); S3RangeSpec s3Range = S3RangeSpec.Factory.create(range); RangeFileInStream ris = RangeFileInStream.Factory.create( is, status.getLength(), s3Range); From 8a0616f40ff615dee6cbaab6ae130070cb58abee Mon Sep 17 00:00:00 2001 From: yuyang wang <39869597+Jackson-Wang-7@users.noreply.github.com> Date: Sat, 29 Apr 2023 10:11:15 +0800 Subject: [PATCH 24/27] Fix the Eof error when parse the Complete MPU ### What changes are proposed in this pull request? 
Convert the Eof error to a standard error code when parse the Complete MPU pr-link: Alluxio/alluxio#17351 change-id: cid-565ccb6d766df536cc8a6d4fc22343b59169a41b --- .../s3/CompleteMultipartUploadHandler.java | 7 +++++++ .../java/alluxio/proxy/s3/S3ObjectTask.java | 7 +++++++ .../client/rest/S3ClientRestApiTest.java | 18 ++++++++++++++++++ 3 files changed, 32 insertions(+) diff --git a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/CompleteMultipartUploadHandler.java b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/CompleteMultipartUploadHandler.java index 05f3bcf7edfb..acbc75a6fdeb 100644 --- a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/CompleteMultipartUploadHandler.java +++ b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/CompleteMultipartUploadHandler.java @@ -20,6 +20,7 @@ import alluxio.conf.Configuration; import alluxio.conf.PropertyKey; import alluxio.exception.AlluxioException; +import alluxio.exception.status.InvalidArgumentException; import alluxio.grpc.Bits; import alluxio.grpc.CreateFilePOptions; import alluxio.grpc.DeletePOptions; @@ -33,6 +34,7 @@ import alluxio.web.ProxyWebServer; import com.codahale.metrics.Timer; +import com.fasterxml.jackson.core.JsonParseException; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.dataformat.xml.XmlMapper; import com.google.common.base.Stopwatch; @@ -376,6 +378,11 @@ public CompleteMultipartUploadRequest parseCompleteMultipartUploadRequest(String if (cause instanceof S3Exception) { throw S3RestUtils.toObjectS3Exception((S3Exception) cause, objectPath); } + if (e instanceof JsonParseException) { + throw new S3Exception( + new InvalidArgumentException("Failed parsing CompleteMultipartUploadRequest."), + objectPath, S3ErrorCode.INVALID_ARGUMENT); + } throw S3RestUtils.toObjectS3Exception(e, objectPath); } return request; diff --git a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3ObjectTask.java 
b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3ObjectTask.java index 671a11c49e86..a1006d622d8d 100644 --- a/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3ObjectTask.java +++ b/dora/core/server/proxy/src/main/java/alluxio/proxy/s3/S3ObjectTask.java @@ -24,6 +24,7 @@ import alluxio.exception.DirectoryNotEmptyException; import alluxio.exception.FileAlreadyExistsException; import alluxio.exception.FileDoesNotExistException; +import alluxio.exception.status.InvalidArgumentException; import alluxio.grpc.Bits; import alluxio.grpc.CreateDirectoryPOptions; import alluxio.grpc.CreateFilePOptions; @@ -41,6 +42,7 @@ import alluxio.web.ProxyWebServer; import com.codahale.metrics.Timer; +import com.fasterxml.jackson.core.JsonParseException; import com.fasterxml.jackson.dataformat.xml.XmlMapper; import com.google.common.base.Preconditions; import com.google.common.io.BaseEncoding; @@ -1188,6 +1190,11 @@ public CompleteMultipartUploadRequest parseCompleteMultipartUploadRequest(String if (cause instanceof S3Exception) { throw S3RestUtils.toObjectS3Exception((S3Exception) cause, objectPath); } + if (e instanceof JsonParseException) { + throw new S3Exception( + new InvalidArgumentException("Failed parsing CompleteMultipartUploadRequest."), + objectPath, S3ErrorCode.INVALID_ARGUMENT); + } throw S3RestUtils.toObjectS3Exception(e, objectPath); } return request; diff --git a/dora/tests/src/test/java/alluxio/client/rest/S3ClientRestApiTest.java b/dora/tests/src/test/java/alluxio/client/rest/S3ClientRestApiTest.java index 091ed6918a34..9e74d7e88ef4 100644 --- a/dora/tests/src/test/java/alluxio/client/rest/S3ClientRestApiTest.java +++ b/dora/tests/src/test/java/alluxio/client/rest/S3ClientRestApiTest.java @@ -1727,6 +1727,24 @@ public void duplicateMultipartUpload() throws Exception { Assert.assertEquals(S3ErrorCode.Name.NO_SUCH_UPLOAD, response.getCode()); } + @Test + public void completeMultipartUploadWithInvalidArgument() throws Exception { + final String bucketName 
= "bucket"; + createBucketRestCall(bucketName); + + final String objectName = "object"; + String objectKey = bucketName + AlluxioURI.SEPARATOR + objectName; + + // Initiate the multipart upload. + String result = initiateMultipartUploadRestCall(objectKey); + InitiateMultipartUploadResult multipartUploadResult = + XML_MAPPER.readValue(result, InitiateMultipartUploadResult.class); + final String uploadId = multipartUploadResult.getUploadId(); + TestCase testCase = getCompleteMultipartUploadReadCallTestCase(objectKey, uploadId, null); + HttpURLConnection connection = testCase.execute(); + Assert.assertEquals(Response.Status.BAD_REQUEST.getStatusCode(), connection.getResponseCode()); + } + @Test @Ignore public void completeMultipartUploadSpecifyParts() throws Exception { From 8310070fbf35828ee765fd43385f481154988915 Mon Sep 17 00:00:00 2001 From: elega <445092967@qq.com> Date: Mon, 10 Apr 2023 11:19:39 +0800 Subject: [PATCH 25/27] Show master versions in fsadmin report command ./alluxio fsadmin report summary ![image](https://user-images.githubusercontent.com/14806853/230587552-b44a3aea-fb69-49d8-bd61-6b21950a137e.png) edit: updated screenshot pr-link: Alluxio/alluxio#17177 change-id: cid-d8075ccbbcdaed6c4dd488428d20c59718e125c3 --- .../src/main/proto/grpc/meta_master.proto | 8 +++++ common/transport/src/main/proto/proto.lock | 30 +++++++++++++++++ .../meta/MetaMasterClientServiceHandler.java | 32 ++++++++++++++++++ .../cli/fsadmin/report/SummaryCommand.java | 14 +++++++- .../fsadmin/report/SummaryCommandTest.java | 33 +++++++++++++++++++ 5 files changed, 116 insertions(+), 1 deletion(-) diff --git a/common/transport/src/main/proto/grpc/meta_master.proto b/common/transport/src/main/proto/grpc/meta_master.proto index 5ec40ce402bd..6790b05c3144 100644 --- a/common/transport/src/main/proto/grpc/meta_master.proto +++ b/common/transport/src/main/proto/grpc/meta_master.proto @@ -77,6 +77,13 @@ message MasterInfo { optional string clusterId = 11; optional bool raftJournal = 
12; repeated string raftAddress = 13; + repeated MasterVersion masterVersions = 14; +} + +message MasterVersion { + optional grpc.NetAddress addresses = 1; + optional string version = 2; + optional string state = 3; } enum MasterInfoField { @@ -93,6 +100,7 @@ enum MasterInfoField { CLUSTER_ID = 10; RAFT_JOURNAL = 11; RAFT_ADDRESSES = 12; + MASTER_VERSION = 13; } message GetMasterInfoPOptions { diff --git a/common/transport/src/main/proto/proto.lock b/common/transport/src/main/proto/proto.lock index e4b1e3c70b35..fc202bdeb19e 100644 --- a/common/transport/src/main/proto/proto.lock +++ b/common/transport/src/main/proto/proto.lock @@ -5969,6 +5969,10 @@ { "name": "RAFT_ADDRESSES", "integer": 12 + }, + { + "name": "MASTER_VERSION", + "integer": 13 } ] }, @@ -6235,6 +6239,32 @@ "name": "raftAddress", "type": "string", "is_repeated": true + }, + { + "id": 14, + "name": "masterVersions", + "type": "MasterVersion", + "is_repeated": true + } + ] + }, + { + "name": "MasterVersion", + "fields": [ + { + "id": 1, + "name": "addresses", + "type": "grpc.NetAddress" + }, + { + "id": 2, + "name": "version", + "type": "string" + }, + { + "id": 3, + "name": "state", + "type": "string" } ] }, diff --git a/dora/core/server/master/src/main/java/alluxio/master/meta/MetaMasterClientServiceHandler.java b/dora/core/server/master/src/main/java/alluxio/master/meta/MetaMasterClientServiceHandler.java index cb33f8391d04..2d76019d89d4 100644 --- a/dora/core/server/master/src/main/java/alluxio/master/meta/MetaMasterClientServiceHandler.java +++ b/dora/core/server/master/src/main/java/alluxio/master/meta/MetaMasterClientServiceHandler.java @@ -26,7 +26,9 @@ import alluxio.grpc.GetMasterInfoPResponse; import alluxio.grpc.MasterInfo; import alluxio.grpc.MasterInfoField; +import alluxio.grpc.MasterVersion; import alluxio.grpc.MetaMasterClientServiceGrpc; +import alluxio.grpc.NetAddress; import alluxio.master.StateLockOptions; import alluxio.master.journal.raft.RaftJournalSystem; import 
alluxio.wire.Address; @@ -139,6 +141,36 @@ public void getMasterInfo(GetMasterInfoPOptions options, masterInfo.setRaftJournal(mMetaMaster.getMasterContext().getJournalSystem() instanceof RaftJournalSystem); break; + case MASTER_VERSION: + masterInfo.addMasterVersions( + MasterVersion.newBuilder() + .setAddresses(NetAddress.newBuilder().setHost( + mMetaMaster.getRpcAddress().getHostName()) + .setRpcPort(mMetaMaster.getRpcAddress().getPort()).build()) + .setVersion(RuntimeConstants.VERSION) + .setState("PRIMARY") + .build() + ); + List standbyMasterVersions = + Arrays.stream(mMetaMaster.getStandbyMasterInfos()) + .map(it -> MasterVersion.newBuilder() + .setVersion(it.getVersion()) + .setAddresses(it.getAddress().toProto()) + .setState("STANDBY") + .build()) + .collect(Collectors.toList()); + + masterInfo.addAllMasterVersions(standbyMasterVersions); + List lostMasterVersions = + Arrays.stream(mMetaMaster.getLostMasterInfos()) + .map(it -> MasterVersion.newBuilder() + .setVersion(it.getVersion()) + .setAddresses(it.getAddress().toProto()) + .setState("LOST") + .build()) + .collect(Collectors.toList()); + masterInfo.addAllMasterVersions(lostMasterVersions); + break; default: LOG.warn("Unrecognized meta master info field: " + field); } diff --git a/dora/shell/src/main/java/alluxio/cli/fsadmin/report/SummaryCommand.java b/dora/shell/src/main/java/alluxio/cli/fsadmin/report/SummaryCommand.java index 386803bde1eb..7cf44993e8a7 100644 --- a/dora/shell/src/main/java/alluxio/cli/fsadmin/report/SummaryCommand.java +++ b/dora/shell/src/main/java/alluxio/cli/fsadmin/report/SummaryCommand.java @@ -16,6 +16,8 @@ import alluxio.client.meta.MetaMasterClient; import alluxio.grpc.MasterInfo; import alluxio.grpc.MasterInfoField; +import alluxio.grpc.MasterVersion; +import alluxio.grpc.NetAddress; import alluxio.util.CommonUtils; import alluxio.util.FormatUtils; import alluxio.wire.BlockMasterInfo; @@ -82,7 +84,8 @@ private void printMetaMasterInfo() throws IOException { 
MasterInfoField.RPC_PORT, MasterInfoField.START_TIME_MS, MasterInfoField.UP_TIME_MS, MasterInfoField.VERSION, MasterInfoField.SAFE_MODE, MasterInfoField.ZOOKEEPER_ADDRESSES, - MasterInfoField.RAFT_JOURNAL, MasterInfoField.RAFT_ADDRESSES)); + MasterInfoField.RAFT_JOURNAL, MasterInfoField.RAFT_ADDRESSES, + MasterInfoField.MASTER_VERSION)); MasterInfo masterInfo = mMetaMasterClient.getMasterInfo(masterInfoFilter); print("Master Address: " + masterInfo.getLeaderMasterAddress()); @@ -118,6 +121,15 @@ private void printMetaMasterInfo() throws IOException { } else { print("Raft-based Journal: false"); } + String formatString = "%-32s %-8s %-32s"; + print(String.format(formatString, "Master Address", "State", "Version")); + for (MasterVersion masterVersion: masterInfo.getMasterVersionsList()) { + NetAddress address = masterVersion.getAddresses(); + print(String.format(formatString, + address.getHost() + ":" + address.getRpcPort(), + masterVersion.getState(), + masterVersion.getVersion())); + } } /** diff --git a/dora/shell/src/test/java/alluxio/cli/fsadmin/report/SummaryCommandTest.java b/dora/shell/src/test/java/alluxio/cli/fsadmin/report/SummaryCommandTest.java index 0fe10e4bdf5a..e500e259bdd9 100644 --- a/dora/shell/src/test/java/alluxio/cli/fsadmin/report/SummaryCommandTest.java +++ b/dora/shell/src/test/java/alluxio/cli/fsadmin/report/SummaryCommandTest.java @@ -16,12 +16,15 @@ import static org.mockito.Mockito.when; import alluxio.Constants; +import alluxio.RuntimeConstants; import alluxio.client.block.BlockMasterClient; import alluxio.client.meta.MetaMasterClient; import alluxio.conf.AlluxioConfiguration; import alluxio.conf.Configuration; import alluxio.conf.PropertyKey; import alluxio.grpc.MasterInfo; +import alluxio.grpc.MasterVersion; +import alluxio.grpc.NetAddress; import alluxio.util.CommonUtils; import alluxio.wire.BlockMasterInfo; @@ -93,19 +96,45 @@ public void prepareBaseDependencies() throws IOException { } void prepareZKHADependencies() throws 
IOException { + MasterVersion primaryVersion = MasterVersion.newBuilder() + .setVersion(RuntimeConstants.VERSION).setState("Primary").setAddresses( + NetAddress.newBuilder().setHost("hostname1").setRpcPort(10000).build() + ).build(); + MasterVersion standby1Version = MasterVersion.newBuilder() + .setVersion(RuntimeConstants.VERSION).setState("Standby").setAddresses( + NetAddress.newBuilder().setHost("hostname2").setRpcPort(10001).build() + ).build(); + MasterVersion standby2Version = MasterVersion.newBuilder() + .setVersion(RuntimeConstants.VERSION).setState("Standby").setAddresses( + NetAddress.newBuilder().setHost("hostname3").setRpcPort(10002).build() + ).build(); mMasterInfo = MasterInfo.newBuilder(mMasterInfo) .addAllZookeeperAddresses(Arrays.asList("[zookeeper_hostname1]:2181", "[zookeeper_hostname2]:2181", "[zookeeper_hostname3]:2181")) + .addAllMasterVersions(Arrays.asList(primaryVersion, standby1Version, standby2Version)) .setRaftJournal(false) .build(); when(mMetaMasterClient.getMasterInfo(any())).thenReturn(mMasterInfo); } void prepareRaftHaDependencies() throws IOException { + MasterVersion primaryVersion = MasterVersion.newBuilder() + .setVersion(RuntimeConstants.VERSION).setState("Primary").setAddresses( + NetAddress.newBuilder().setHost("hostname1").setRpcPort(10000).build() + ).build(); + MasterVersion standby1Version = MasterVersion.newBuilder() + .setVersion(RuntimeConstants.VERSION).setState("Standby").setAddresses( + NetAddress.newBuilder().setHost("hostname2").setRpcPort(10001).build() + ).build(); + MasterVersion standby2Version = MasterVersion.newBuilder() + .setVersion(RuntimeConstants.VERSION).setState("Standby").setAddresses( + NetAddress.newBuilder().setHost("hostname3").setRpcPort(10002).build() + ).build(); mMasterInfo = MasterInfo.newBuilder(mMasterInfo) .setRaftJournal(true) .addAllRaftAddress(Arrays.asList("[raftJournal_hostname1]:19200", "[raftJournal_hostname2]:19200", "[raftJournal_hostname3]:19200")) + 
.addAllMasterVersions(Arrays.asList(primaryVersion, standby1Version, standby2Version)) .build(); when(mMetaMasterClient.getMasterInfo(any())).thenReturn(mMasterInfo); } @@ -165,6 +194,10 @@ private void checkIfOutputValid(String dateFormatPattern, List " Safe Mode: false")); expectedOutput.addAll(HAPattern); expectedOutput.addAll(new ArrayList<>(Arrays.asList( + " Master Address State Version ", + " hostname1:10000 Primary 2.10.0-SNAPSHOT ", + " hostname2:10001 Standby 2.10.0-SNAPSHOT ", + " hostname3:10002 Standby 2.10.0-SNAPSHOT ", " Live Workers: 12", " Lost Workers: 4", " Total Capacity: 1309.92KB", From 2d59e6bce905467b4c6045b7ebb48b819e216af0 Mon Sep 17 00:00:00 2001 From: maobaolong <307499405@qq.com> Date: Fri, 14 Apr 2023 09:57:23 +0800 Subject: [PATCH 26/27] cherry pick and resolve conflicts: Support cron timer to arrange the period heartbeat executor invoke time Support cron timer to arrange the period heartbeat executor invoke time. Example ``` cron timer config: * 0-10,20-30,40-56 12-13 * * ? * The heartbeat executor invoke at every minute from 0 through 10, from 20 through 30, from 40 through 56, at past every hour from 12 through 13. 
``` pr-link: Alluxio/alluxio#16900 change-id: cid-9277f30e2159e64863067d14cbcbee526707c5b6 --- .../alluxio/client/file/ConfigHashSync.java | 2 +- .../file/FileSystemContextReinitializer.java | 2 +- .../CronExpressionIntervalSupplier.java | 59 +++++++++ .../heartbeat/FixedIntervalSupplier.java | 63 +++++++++ .../alluxio/heartbeat/HeartbeatExecutor.java | 6 +- .../alluxio/heartbeat/HeartbeatThread.java | 74 ++++++----- .../alluxio/heartbeat/HeartbeatTimer.java | 26 ++-- .../alluxio/heartbeat/ScheduledTimer.java | 11 +- .../heartbeat/SleepIntervalSupplier.java | 34 +++++ .../java/alluxio/heartbeat/SleepingTimer.java | 66 +++++----- .../heartbeat/HeartbeatContextTest.java | 2 +- .../heartbeat/HeartbeatThreadTest.java | 5 +- ...ForCronExpressionIntervalSupplierTest.java | 121 ++++++++++++++++++ .../alluxio/heartbeat/SleepingTimerTest.java | 9 +- .../master/block/DefaultBlockMaster.java | 11 +- .../master/block/meta/MasterWorkerInfo.java | 2 +- .../master/file/BlockIntegrityChecker.java | 2 +- .../master/file/DefaultFileSystemMaster.java | 31 +++-- .../alluxio/master/file/InodeTtlChecker.java | 2 +- .../alluxio/master/file/LostFileDetector.java | 2 +- .../java/alluxio/master/file/UfsCleaner.java | 2 +- .../file/activesync/ActiveSyncManager.java | 4 +- .../master/file/activesync/ActiveSyncer.java | 2 +- .../file/replication/ReplicationChecker.java | 2 +- .../master/meta/DefaultMetaMaster.java | 21 +-- .../master/meta/JournalSpaceMonitor.java | 2 +- .../alluxio/master/meta/MetaMasterSync.java | 2 +- .../alluxio/master/meta/UpdateChecker.java | 2 +- .../master/metrics/DefaultMetricsMaster.java | 6 +- .../throttle/DefaultThrottleMaster.java | 6 +- .../replication/ReplicationCheckerTest.java | 36 +++--- .../master/meta/JournalSpaceMonitorTest.java | 4 +- .../alluxio/worker/block/BlockMasterSync.java | 2 +- .../worker/block/BlockSyncMasterGroup.java | 4 +- .../worker/block/DefaultBlockWorker.java | 12 +- .../alluxio/worker/block/PinListSync.java | 2 +- 
.../worker/block/SpecificMasterBlockSync.java | 2 +- .../alluxio/worker/dora/PagedDoraWorker.java | 6 +- .../underfs/SpecificMasterBlockSyncTest.java | 12 +- .../alluxio/worker/block/PinListSyncTest.java | 4 +- .../main/java/alluxio/fuse/AlluxioFuse.java | 3 +- .../java/alluxio/fuse/meta/UpdateChecker.java | 2 +- .../java/alluxio/master/job/JobMaster.java | 6 +- .../main/java/alluxio/worker/JobWorker.java | 4 +- .../job/command/CommandHandlingExecutor.java | 2 +- .../command/CommandHandlingExecutorTest.java | 2 +- .../table/transform/TransformManager.java | 6 +- ...MasterDeleteLostWorkerIntegrationTest.java | 4 +- ...ileSystemContextReinitIntegrationTest.java | 4 +- ...ckMasterRegisterStreamIntegrationTest.java | 2 +- ...ckWorkerRegisterStreamIntegrationTest.java | 2 +- 51 files changed, 510 insertions(+), 190 deletions(-) create mode 100644 dora/core/common/src/main/java/alluxio/heartbeat/CronExpressionIntervalSupplier.java create mode 100644 dora/core/common/src/main/java/alluxio/heartbeat/FixedIntervalSupplier.java create mode 100644 dora/core/common/src/main/java/alluxio/heartbeat/SleepIntervalSupplier.java create mode 100644 dora/core/common/src/test/java/alluxio/heartbeat/SleepingTimerForCronExpressionIntervalSupplierTest.java diff --git a/dora/core/client/fs/src/main/java/alluxio/client/file/ConfigHashSync.java b/dora/core/client/fs/src/main/java/alluxio/client/file/ConfigHashSync.java index 144be4e7f6f1..b94ffd6d0651 100644 --- a/dora/core/client/fs/src/main/java/alluxio/client/file/ConfigHashSync.java +++ b/dora/core/client/fs/src/main/java/alluxio/client/file/ConfigHashSync.java @@ -71,7 +71,7 @@ public Optional getException() { } @Override - public synchronized void heartbeat() { + public synchronized void heartbeat(long timeLimitMs) { if (!mContext.getClientContext().getClusterConf().clusterDefaultsLoaded()) { // Wait until the initial cluster defaults are loaded. 
return; diff --git a/dora/core/client/fs/src/main/java/alluxio/client/file/FileSystemContextReinitializer.java b/dora/core/client/fs/src/main/java/alluxio/client/file/FileSystemContextReinitializer.java index 78ae526be8e6..ae7e9049e95c 100644 --- a/dora/core/client/fs/src/main/java/alluxio/client/file/FileSystemContextReinitializer.java +++ b/dora/core/client/fs/src/main/java/alluxio/client/file/FileSystemContextReinitializer.java @@ -66,7 +66,7 @@ public FileSystemContextReinitializer(FileSystemContext context) { mExecutor = new ConfigHashSync(context); mFuture = REINIT_EXECUTOR.scheduleAtFixedRate(() -> { try { - mExecutor.heartbeat(); + mExecutor.heartbeat(Long.MAX_VALUE); } catch (Exception e) { LOG.error("Uncaught exception in config heartbeat executor, shutting down", e); } diff --git a/dora/core/common/src/main/java/alluxio/heartbeat/CronExpressionIntervalSupplier.java b/dora/core/common/src/main/java/alluxio/heartbeat/CronExpressionIntervalSupplier.java new file mode 100644 index 000000000000..e632e472dac6 --- /dev/null +++ b/dora/core/common/src/main/java/alluxio/heartbeat/CronExpressionIntervalSupplier.java @@ -0,0 +1,59 @@ +/* + * The Alluxio Open Foundation licenses this work under the Apache License, version 2.0 + * (the "License"). You may not use this work except in compliance with the License, which is + * available at www.apache.org/licenses/LICENSE-2.0 + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied, as more fully set forth in the License. + * + * See the NOTICE file distributed with this work for information regarding copyright ownership. + */ + +package alluxio.heartbeat; + +import org.apache.logging.log4j.core.util.CronExpression; + +import java.time.Duration; +import java.time.Instant; +import java.util.Date; + +/** +* Calculate the next interval by given cron expression. 
+*/ +public class CronExpressionIntervalSupplier implements SleepIntervalSupplier { + private final long mInterval; + private final CronExpression mCron; + + /** + * Constructs a new {@link CronExpressionIntervalSupplier}. + * + * @param cronExpression the cron expression + * @param fixedInterval the fixed interval + */ + public CronExpressionIntervalSupplier(CronExpression cronExpression, long fixedInterval) { + mInterval = fixedInterval; + mCron = cronExpression; + } + + @Override + public long getNextInterval(long mPreviousTickedMs, long nowTimeStampMillis) { + long nextInterval = 0; + long executionTimeMs = nowTimeStampMillis - mPreviousTickedMs; + if (executionTimeMs < mInterval) { + nextInterval = mInterval - executionTimeMs; + } + Date now = Date.from(Instant.ofEpochMilli(nowTimeStampMillis + nextInterval)); + if (mCron.isSatisfiedBy(now)) { + return nextInterval; + } + return nextInterval + Duration.between( + now.toInstant(), mCron.getNextValidTimeAfter(now).toInstant()).toMillis(); + } + + @Override + public long getRunLimit(long mPreviousTickedMs) { + Date now = Date.from(Instant.ofEpochMilli(mPreviousTickedMs)); + return Duration.between(now.toInstant(), + mCron.getNextInvalidTimeAfter(now).toInstant()).toMillis(); + } +} diff --git a/dora/core/common/src/main/java/alluxio/heartbeat/FixedIntervalSupplier.java b/dora/core/common/src/main/java/alluxio/heartbeat/FixedIntervalSupplier.java new file mode 100644 index 000000000000..1269f5996112 --- /dev/null +++ b/dora/core/common/src/main/java/alluxio/heartbeat/FixedIntervalSupplier.java @@ -0,0 +1,63 @@ +/* + * The Alluxio Open Foundation licenses this work under the Apache License, version 2.0 + * (the "License"). 
You may not use this work except in compliance with the License, which is + * available at www.apache.org/licenses/LICENSE-2.0 + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied, as more fully set forth in the License. + * + * See the NOTICE file distributed with this work for information regarding copyright ownership. + */ + +package alluxio.heartbeat; + +import org.slf4j.Logger; +import org.slf4j.helpers.NOPLogger; + +/** + * Fixed interval supplier. + */ +public class FixedIntervalSupplier implements SleepIntervalSupplier { + + private final long mInterval; + protected final Logger mLogger; + + /** + * Constructs a new {@link FixedIntervalSupplier}. + * + * @param fixedInterval the fixed interval + * @param logger the logger + */ + public FixedIntervalSupplier(long fixedInterval, Logger logger) { + mInterval = fixedInterval; + mLogger = logger; + } + + /** + * Constructs a new {@link FixedIntervalSupplier}. + * + * @param fixedInterval the fixed interval + */ + public FixedIntervalSupplier(long fixedInterval) { + this(fixedInterval, NOPLogger.NOP_LOGGER); + } + + @Override + public long getNextInterval(long mPreviousTickedMs, long nowTimeStampMillis) { + if (mPreviousTickedMs == -1) { + return -1; + } + long executionTimeMs = nowTimeStampMillis - mPreviousTickedMs; + if (executionTimeMs > mInterval) { + mLogger.warn("{} last execution took {} ms. 
Longer than the interval {}", + Thread.currentThread().getName(), executionTimeMs, mInterval); + return 0; + } + return mInterval - executionTimeMs; + } + + @Override + public long getRunLimit(long mPreviousTickedMs) { + return mInterval; + } +} diff --git a/dora/core/common/src/main/java/alluxio/heartbeat/HeartbeatExecutor.java b/dora/core/common/src/main/java/alluxio/heartbeat/HeartbeatExecutor.java index a10c4662c5c5..2b8e96ec7532 100644 --- a/dora/core/common/src/main/java/alluxio/heartbeat/HeartbeatExecutor.java +++ b/dora/core/common/src/main/java/alluxio/heartbeat/HeartbeatExecutor.java @@ -15,15 +15,17 @@ /** * An interface for a heartbeat execution. The {@link HeartbeatThread} calls the - * {@link #heartbeat()} method. + * {@link #heartbeat(long)} method. */ public interface HeartbeatExecutor extends Closeable { + /** * Implements the heartbeat logic. * + * @param timeLimitMs time limit in milliseconds this heartbeat should not exceed when running * @throws InterruptedException if the thread is interrupted */ - void heartbeat() throws InterruptedException; + void heartbeat(long timeLimitMs) throws InterruptedException; /** * Cleans up any resources used by the heartbeat executor. 
diff --git a/dora/core/common/src/main/java/alluxio/heartbeat/HeartbeatThread.java b/dora/core/common/src/main/java/alluxio/heartbeat/HeartbeatThread.java index 2bb891d67c19..cc9b200bfe5f 100644 --- a/dora/core/common/src/main/java/alluxio/heartbeat/HeartbeatThread.java +++ b/dora/core/common/src/main/java/alluxio/heartbeat/HeartbeatThread.java @@ -12,7 +12,6 @@ package alluxio.heartbeat; import alluxio.conf.AlluxioConfiguration; -import alluxio.conf.Reconfigurable; import alluxio.conf.ReconfigurableRegistry; import alluxio.security.authentication.AuthenticatedClientUser; import alluxio.security.user.UserState; @@ -21,11 +20,12 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; -import com.google.common.base.Supplier; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.time.Clock; +import java.util.function.Supplier; import javax.annotation.concurrent.NotThreadSafe; /** @@ -33,13 +33,12 @@ * the JVM from exiting. 
*/ @NotThreadSafe -public final class HeartbeatThread implements Runnable, Reconfigurable { +public final class HeartbeatThread implements Runnable { private static final Logger LOG = LoggerFactory.getLogger(HeartbeatThread.class); private final String mThreadName; private final HeartbeatExecutor mExecutor; private final UserState mUserState; - private final Supplier mIntervalSupplier; private HeartbeatTimer mTimer; private AlluxioConfiguration mConfiguration; private Status mStatus; @@ -73,26 +72,28 @@ public static String generateThreadName(String executorName, String threadId) { * @param intervalSupplier Sleep time between different heartbeat supplier * @param conf Alluxio configuration * @param userState the user state for this heartbeat thread + * @param clock the clock used to compute the current time */ public HeartbeatThread(String executorName, String threadId, HeartbeatExecutor executor, - Supplier intervalSupplier, AlluxioConfiguration conf, UserState userState) { + Supplier intervalSupplier, + AlluxioConfiguration conf, UserState userState, Clock clock) { mThreadName = generateThreadName(executorName, threadId); mExecutor = Preconditions.checkNotNull(executor, "executor"); Class timerClass = HeartbeatContext.getTimerClass(executorName); - mTimer = CommonUtils.createNewClassInstance(timerClass, new Class[] {String.class, long.class}, - new Object[] {mThreadName, intervalSupplier.get()}); + mTimer = CommonUtils.createNewClassInstance(timerClass, + new Class[] {String.class, Clock.class, Supplier.class}, + new Object[] {mThreadName, clock, intervalSupplier}); mConfiguration = conf; mUserState = userState; - mIntervalSupplier = intervalSupplier; mStatus = Status.INIT; - ReconfigurableRegistry.register(this); + ReconfigurableRegistry.register(mTimer); } /** * Convenience method for * {@link * #HeartbeatThread(String, String, HeartbeatExecutor, Supplier, AlluxioConfiguration, - * UserState)} where threadId is null. 
+ * UserState, Clock)} where threadId is null. * * @param executorName the executor name that is one of those defined in {@link HeartbeatContext} * @param executor the heartbeat executor @@ -101,12 +102,34 @@ public HeartbeatThread(String executorName, String threadId, HeartbeatExecutor e * @param userState the user state for this heartbeat thread */ public HeartbeatThread(String executorName, HeartbeatExecutor executor, - Supplier intervalSupplier, AlluxioConfiguration conf, UserState userState) { - this(executorName, null, executor, intervalSupplier, conf, userState); + Supplier intervalSupplier, AlluxioConfiguration conf, + UserState userState) { + this(executorName, null, executor, intervalSupplier, conf, userState, Clock.systemUTC()); + } + + /** + * Convenience method for + * {@link + * #HeartbeatThread(String, String, HeartbeatExecutor, Supplier, AlluxioConfiguration, + * UserState, Clock)} where threadId is null. + * + * @param executorName the executor name that is one of those defined in {@link HeartbeatContext} + * @param executor the heartbeat executor + * @param intervalSupplier the interval between heartbeats supplier + * @param conf the Alluxio configuration + * @param userState the user state for this heartbeat thread + * @param clock the clock used to compute the current time + */ + public HeartbeatThread(String executorName, HeartbeatExecutor executor, + Supplier intervalSupplier, + AlluxioConfiguration conf, UserState userState, Clock clock) { + this(executorName, null, executor, intervalSupplier, + conf, userState, clock); } @Override public void run() { + long counter = 0L; try { if (SecurityUtils.isSecurityEnabled(mConfiguration) && AuthenticatedClientUser.get(mConfiguration) == null) { @@ -123,9 +146,10 @@ public void run() { while (!Thread.interrupted()) { // TODO(peis): Fix this. The current implementation consumes one thread even when ticking. 
mStatus = Status.WAITING; - mTimer.tick(); + long limitTime = mTimer.tick(); mStatus = Status.RUNNING; - mExecutor.heartbeat(); + LOG.debug("{} #{} will run limited in {}s", mThreadName, counter++, limitTime / 1000); + mExecutor.heartbeat(limitTime); } } catch (InterruptedException e) { // Allow thread to exit. @@ -133,19 +157,11 @@ public void run() { LOG.error("Uncaught exception in heartbeat executor, Heartbeat Thread shutting down", e); } finally { mStatus = Status.STOPPED; + ReconfigurableRegistry.unregister(mTimer); mExecutor.close(); } } - /** - * Updates the heartbeat interval. - * - * @param intervalMs the heartbeat interval in ms - */ - public void updateIntervalMs(long intervalMs) { - mTimer.setIntervalMs(intervalMs); - } - /** * @return the status of current heartbeat thread */ @@ -153,18 +169,6 @@ public Status getStatus() { return mStatus; } - @Override - public void update() { - if (mStatus == Status.STOPPED) { - ReconfigurableRegistry.unregister(this); - return; - } - long interval = mIntervalSupplier.get(); - if (interval != mTimer.getIntervalMs()) { - updateIntervalMs(interval); - } - } - /** * Enum representing the status of HeartbeatThread. */ diff --git a/dora/core/common/src/main/java/alluxio/heartbeat/HeartbeatTimer.java b/dora/core/common/src/main/java/alluxio/heartbeat/HeartbeatTimer.java index 96e9618af3ea..736037234edd 100644 --- a/dora/core/common/src/main/java/alluxio/heartbeat/HeartbeatTimer.java +++ b/dora/core/common/src/main/java/alluxio/heartbeat/HeartbeatTimer.java @@ -11,33 +11,27 @@ package alluxio.heartbeat; +import alluxio.conf.Reconfigurable; + /** * An interface for heartbeat timers. The {@link HeartbeatThread} calls the {@link #tick()} method. */ -public interface HeartbeatTimer { +public interface HeartbeatTimer extends Reconfigurable { /** - * Sets the heartbeat interval. 
- * - * @param intervalMs the heartbeat interval in ms - */ - default void setIntervalMs(long intervalMs) { - throw new UnsupportedOperationException("Setting interval is not supported"); - } - - /** - * Get the interval of HeartbeatTimer. - * - * @return the interval of this HeartbeatTimer + * When this object needs to be reconfigured + * due to external configuration change etc., + * this function will be invoked. */ - default long getIntervalMs() { - throw new UnsupportedOperationException("Getting interval is not supported"); + default void update() { } /** * Waits until next heartbeat should be executed. * + * @return time limit in milliseconds for this heartbeat action to run for before + * the next heartbeat is due. * @throws InterruptedException if the thread is interrupted while waiting */ - void tick() throws InterruptedException; + long tick() throws InterruptedException; } diff --git a/dora/core/common/src/main/java/alluxio/heartbeat/ScheduledTimer.java b/dora/core/common/src/main/java/alluxio/heartbeat/ScheduledTimer.java index 62b6d5667d83..cff75372105c 100644 --- a/dora/core/common/src/main/java/alluxio/heartbeat/ScheduledTimer.java +++ b/dora/core/common/src/main/java/alluxio/heartbeat/ScheduledTimer.java @@ -15,9 +15,11 @@ import com.google.common.base.Preconditions; +import java.time.Clock; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; +import java.util.function.Supplier; import javax.annotation.concurrent.ThreadSafe; /** @@ -46,9 +48,11 @@ public final class ScheduledTimer implements HeartbeatTimer { * Creates a new instance of {@link ScheduledTimer}. 
* * @param threadName the thread name - * @param intervalMs the heartbeat interval (unused) + * @param clock for telling the current time (unused) + * @param intervalSupplierSupplier Sleep time between different heartbeat supplier */ - public ScheduledTimer(String threadName, long intervalMs) { + public ScheduledTimer(String threadName, Clock clock, + Supplier intervalSupplierSupplier) { mThreadName = threadName; mLock = new ReentrantLock(); mTickCondition = mLock.newCondition(); @@ -77,7 +81,7 @@ protected void schedule() { } @Override - public void tick() throws InterruptedException { + public long tick() throws InterruptedException { try (LockResource r = new LockResource(mLock)) { HeartbeatScheduler.addTimer(this); // Wait in a loop to handle spurious wakeups @@ -87,5 +91,6 @@ public void tick() throws InterruptedException { mScheduled = false; } + return Long.MAX_VALUE; } } diff --git a/dora/core/common/src/main/java/alluxio/heartbeat/SleepIntervalSupplier.java b/dora/core/common/src/main/java/alluxio/heartbeat/SleepIntervalSupplier.java new file mode 100644 index 000000000000..cde2ddd5ff3f --- /dev/null +++ b/dora/core/common/src/main/java/alluxio/heartbeat/SleepIntervalSupplier.java @@ -0,0 +1,34 @@ +/* + * The Alluxio Open Foundation licenses this work under the Apache License, version 2.0 + * (the "License"). You may not use this work except in compliance with the License, which is + * available at www.apache.org/licenses/LICENSE-2.0 + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied, as more fully set forth in the License. + * + * See the NOTICE file distributed with this work for information regarding copyright ownership. + */ + +package alluxio.heartbeat; + +/** + * A policy to calculate the next interval to sleep. + */ +public interface SleepIntervalSupplier { + /** + * Gets the next interval for sleeping. 
+ * + * @param mPreviousTickedMs previous ticked time stamp in millisecond + * @param nowTimeStampMillis current time stamp in millisecond + * @return the interval to sleep starting from now before next time the timer triggers + */ + long getNextInterval(long mPreviousTickedMs, long nowTimeStampMillis); + + /** + * Gets the run limit from previous ticked. + * + * @param mPreviousTickedMs previous ticked time stamp in millisecond + * @return the run limit + */ + long getRunLimit(long mPreviousTickedMs); +} diff --git a/dora/core/common/src/main/java/alluxio/heartbeat/SleepingTimer.java b/dora/core/common/src/main/java/alluxio/heartbeat/SleepingTimer.java index d6d4ad2589ab..2e444de5b892 100644 --- a/dora/core/common/src/main/java/alluxio/heartbeat/SleepingTimer.java +++ b/dora/core/common/src/main/java/alluxio/heartbeat/SleepingTimer.java @@ -11,7 +11,6 @@ package alluxio.heartbeat; -import alluxio.clock.SystemClock; import alluxio.time.Sleeper; import alluxio.time.ThreadSleeper; @@ -20,57 +19,52 @@ import java.time.Clock; import java.time.Duration; +import java.util.function.Supplier; import javax.annotation.concurrent.NotThreadSafe; /** * This class can be used for executing heartbeats periodically. */ @NotThreadSafe -public final class SleepingTimer implements HeartbeatTimer { - private long mIntervalMs; - private long mPreviousTickMs; +public class SleepingTimer implements HeartbeatTimer { + protected long mPreviousTickedMs = -1; private final String mThreadName; - private final Logger mLogger; - private final Clock mClock; - private final Sleeper mSleeper; + protected final Logger mLogger; + protected final Clock mClock; + protected final Sleeper mSleeper; + protected final Supplier mIntervalSupplierSupplier; + protected SleepIntervalSupplier mIntervalSupplier; /** * Creates a new instance of {@link SleepingTimer}. 
* * @param threadName the thread name - * @param intervalMs the heartbeat interval + * @param clock for telling the current time + * @param intervalSupplierSupplier Sleep time between different heartbeat supplier */ - public SleepingTimer(String threadName, long intervalMs) { - this(threadName, intervalMs, LoggerFactory.getLogger(SleepingTimer.class), - new SystemClock(), ThreadSleeper.INSTANCE); + public SleepingTimer(String threadName, Clock clock, + Supplier intervalSupplierSupplier) { + this(threadName, LoggerFactory.getLogger(SleepingTimer.class), + clock, ThreadSleeper.INSTANCE, intervalSupplierSupplier); } /** * Creates a new instance of {@link SleepingTimer}. * * @param threadName the thread name - * @param intervalMs the heartbeat interval * @param logger the logger to log to * @param clock for telling the current time * @param sleeper the utility to use for sleeping + * @param intervalSupplierSupplier Sleep time between different heartbeat supplier */ - public SleepingTimer(String threadName, long intervalMs, Logger logger, Clock clock, - Sleeper sleeper) { - mIntervalMs = intervalMs; + public SleepingTimer(String threadName, Logger logger, Clock clock, Sleeper sleeper, + Supplier intervalSupplierSupplier) { mThreadName = threadName; mLogger = logger; mClock = clock; mSleeper = sleeper; - } - - @Override - public void setIntervalMs(long intervalMs) { - mIntervalMs = intervalMs; - } - - @Override - public long getIntervalMs() { - return mIntervalMs; + mIntervalSupplierSupplier = intervalSupplierSupplier; + mIntervalSupplier = intervalSupplierSupplier.get(); } /** @@ -79,16 +73,18 @@ public long getIntervalMs() { * @throws InterruptedException if the thread is interrupted while waiting */ @Override - public void tick() throws InterruptedException { - if (mPreviousTickMs != 0) { - long executionTimeMs = mClock.millis() - mPreviousTickMs; - if (executionTimeMs > mIntervalMs) { - mLogger.warn("{} last execution took {} ms. 
Longer than the interval {}", mThreadName, - executionTimeMs, mIntervalMs); - } else { - mSleeper.sleep(Duration.ofMillis(mIntervalMs - executionTimeMs)); - } + public long tick() throws InterruptedException { + long nextInterval = mIntervalSupplier.getNextInterval(mPreviousTickedMs, mClock.millis()); + if (nextInterval > 0) { + mSleeper.sleep(Duration.ofMillis(nextInterval)); } - mPreviousTickMs = mClock.millis(); + mPreviousTickedMs = mClock.millis(); + return mIntervalSupplier.getRunLimit(mPreviousTickedMs); + } + + @Override + public void update() { + mIntervalSupplier = mIntervalSupplierSupplier.get(); + mLogger.info("update {} interval supplier.", mThreadName); } } diff --git a/dora/core/common/src/test/java/alluxio/heartbeat/HeartbeatContextTest.java b/dora/core/common/src/test/java/alluxio/heartbeat/HeartbeatContextTest.java index f5c222739dc0..0c972baf44db 100644 --- a/dora/core/common/src/test/java/alluxio/heartbeat/HeartbeatContextTest.java +++ b/dora/core/common/src/test/java/alluxio/heartbeat/HeartbeatContextTest.java @@ -21,7 +21,7 @@ */ public final class HeartbeatContextTest { @Test - public void allThreadsUseSleepingTimer() { + public void allThreadsUseProductionTimer() { for (String threadName : HeartbeatContext.getTimerClasses().keySet()) { Class timerClass = HeartbeatContext.getTimerClass(threadName); assertTrue(timerClass.isAssignableFrom(SleepingTimer.class)); diff --git a/dora/core/common/src/test/java/alluxio/heartbeat/HeartbeatThreadTest.java b/dora/core/common/src/test/java/alluxio/heartbeat/HeartbeatThreadTest.java index 5d09135dc7ea..921e250984da 100644 --- a/dora/core/common/src/test/java/alluxio/heartbeat/HeartbeatThreadTest.java +++ b/dora/core/common/src/test/java/alluxio/heartbeat/HeartbeatThreadTest.java @@ -139,7 +139,8 @@ public Void call() throws Exception { try (ManuallyScheduleHeartbeat.Resource r = new ManuallyScheduleHeartbeat.Resource(Arrays.asList(mThreadName))) { DummyHeartbeatExecutor executor = new 
DummyHeartbeatExecutor(); - HeartbeatThread ht = new HeartbeatThread(mThreadName, executor, () -> 1L, + HeartbeatThread ht = new HeartbeatThread(mThreadName, executor, + () -> new FixedIntervalSupplier(1L), Configuration.global(), UserState.Factory.create(Configuration.global())); // Run the HeartbeatThread. @@ -166,7 +167,7 @@ private class DummyHeartbeatExecutor implements HeartbeatExecutor { private int mCounter = 0; @Override - public void heartbeat() { + public void heartbeat(long timeLimitMs) { mCounter++; } diff --git a/dora/core/common/src/test/java/alluxio/heartbeat/SleepingTimerForCronExpressionIntervalSupplierTest.java b/dora/core/common/src/test/java/alluxio/heartbeat/SleepingTimerForCronExpressionIntervalSupplierTest.java new file mode 100644 index 000000000000..81d9d5e4bc06 --- /dev/null +++ b/dora/core/common/src/test/java/alluxio/heartbeat/SleepingTimerForCronExpressionIntervalSupplierTest.java @@ -0,0 +1,121 @@ +/* + * The Alluxio Open Foundation licenses this work under the Apache License, version 2.0 + * (the "License"). You may not use this work except in compliance with the License, which is + * available at www.apache.org/licenses/LICENSE-2.0 + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied, as more fully set forth in the License. + * + * See the NOTICE file distributed with this work for information regarding copyright ownership. 
+ */ + +package alluxio.heartbeat; + +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.mock; + +import alluxio.Constants; +import alluxio.clock.ManualClock; +import alluxio.time.Sleeper; + +import org.apache.logging.log4j.core.util.CronExpression; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.slf4j.Logger; + +import java.text.DateFormat; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.time.Duration; +import java.util.Date; + +/** + * Unit tests for {@link SleepingTimer}. + */ +public final class SleepingTimerForCronExpressionIntervalSupplierTest { + private static final String THREAD_NAME = "cron-test-thread-name"; + private static final long INTERVAL_MS = 10 * Constants.MINUTE_MS; + private Logger mMockLogger; + private ManualClock mFakeClock; + private Sleeper mMockSleeper; + private long mAllSleepTimeMs; + + @Before + public void before() throws InterruptedException { + mMockLogger = mock(Logger.class); + mFakeClock = new ManualClock(); + mMockSleeper = mock(Sleeper.class); + doAnswer((invocation) -> { + Duration duration = invocation.getArgument(0); + mFakeClock.addTime(duration); + mAllSleepTimeMs += duration.toMillis(); + return null; + }).when(mMockSleeper).sleep(any(Duration.class)); + } + + /** + * Tests that the cron timer will attempt to run at the same interval, independently of how + * long the execution between ticks takes. For example, if the interval is 100ms and execution + * takes 80ms, the timer should sleep for only 20ms to maintain the regular interval of 100ms. + */ + @Test + public void maintainInterval() throws Exception { + SleepingTimer timer = + new SleepingTimer(THREAD_NAME, mMockLogger, mFakeClock, mMockSleeper, + () -> { + try { + return new CronExpressionIntervalSupplier( + new CronExpression("* 30-59 0-1,4-9,13-23 * * ? 
*"), INTERVAL_MS); + } catch (ParseException e) { + throw new RuntimeException(e); + } + }); + DateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + Date startDate = formatter.parse("2022-01-01 00:00:00"); + Assert.assertEquals(-1, timer.mPreviousTickedMs); + mFakeClock.setTimeMs(startDate.getTime()); + long limitMs = timer.tick(); + long lastAllSleepTimeMs = mAllSleepTimeMs; + Assert.assertEquals(30 * Constants.MINUTE_MS, mAllSleepTimeMs); + Assert.assertEquals(30 * Constants.MINUTE_MS, limitMs); + Assert.assertEquals(formatter.parse("2022-01-01 00:30:00"), new Date(timer.mPreviousTickedMs)); + Assert.assertEquals(formatter.parse("2022-01-01 00:30:00"), new Date(mFakeClock.millis())); + // Mock heartbeat 1 minute + mFakeClock.addTime(Duration.ofMinutes(1)); + + limitMs = timer.tick(); + Assert.assertEquals(9 * Constants.MINUTE_MS, mAllSleepTimeMs - lastAllSleepTimeMs); + lastAllSleepTimeMs = mAllSleepTimeMs; + Assert.assertEquals(20 * Constants.MINUTE_MS, limitMs); + Assert.assertEquals(formatter.parse("2022-01-01 00:40:00"), new Date(timer.mPreviousTickedMs)); + Assert.assertEquals(formatter.parse("2022-01-01 00:40:00"), new Date(mFakeClock.millis())); + // Mock heartbeat 5 minute + mFakeClock.addTime(Duration.ofMinutes(5)); + + limitMs = timer.tick(); + Assert.assertEquals(5 * Constants.MINUTE_MS, mAllSleepTimeMs - lastAllSleepTimeMs); + lastAllSleepTimeMs = mAllSleepTimeMs; + Assert.assertEquals(10 * Constants.MINUTE_MS, limitMs); + Assert.assertEquals(formatter.parse("2022-01-01 00:50:00"), new Date(timer.mPreviousTickedMs)); + Assert.assertEquals(formatter.parse("2022-01-01 00:50:00"), new Date(mFakeClock.millis())); + // Mock heartbeat 5 minute + mFakeClock.addTime(Duration.ofMinutes(5)); + + limitMs = timer.tick(); + Assert.assertEquals(35 * Constants.MINUTE_MS, mAllSleepTimeMs - lastAllSleepTimeMs); + lastAllSleepTimeMs = mAllSleepTimeMs; + Assert.assertEquals(30 * Constants.MINUTE_MS, limitMs); + 
Assert.assertEquals(formatter.parse("2022-01-01 01:30:00"), new Date(timer.mPreviousTickedMs)); + Assert.assertEquals(formatter.parse("2022-01-01 01:30:00"), new Date(mFakeClock.millis())); + // Mock heartbeat 30 minute + mFakeClock.addTime(Duration.ofMinutes(30)); + + limitMs = timer.tick(); + Assert.assertEquals(150 * Constants.MINUTE_MS, mAllSleepTimeMs - lastAllSleepTimeMs); + Assert.assertEquals(30 * Constants.MINUTE_MS, limitMs); + Assert.assertEquals(formatter.parse("2022-01-01 04:30:00"), new Date(timer.mPreviousTickedMs)); + Assert.assertEquals(formatter.parse("2022-01-01 04:30:00"), new Date(mFakeClock.millis())); + } +} diff --git a/dora/core/common/src/test/java/alluxio/heartbeat/SleepingTimerTest.java b/dora/core/common/src/test/java/alluxio/heartbeat/SleepingTimerTest.java index ae8ef03d8aea..6a4f79447574 100644 --- a/dora/core/common/src/test/java/alluxio/heartbeat/SleepingTimerTest.java +++ b/dora/core/common/src/test/java/alluxio/heartbeat/SleepingTimerTest.java @@ -47,7 +47,8 @@ public void before() { @Test public void warnWhenExecutionTakesLongerThanInterval() throws Exception { SleepingTimer timer = - new SleepingTimer(THREAD_NAME, INTERVAL_MS, mMockLogger, mFakeClock, mMockSleeper); + new SleepingTimer(THREAD_NAME, mMockLogger, mFakeClock, mMockSleeper, + () -> new FixedIntervalSupplier(INTERVAL_MS, mMockLogger)); timer.tick(); mFakeClock.addTimeMs(5 * INTERVAL_MS); @@ -60,7 +61,8 @@ public void warnWhenExecutionTakesLongerThanInterval() throws Exception { @Test public void sleepForSpecifiedInterval() throws Exception { final SleepingTimer timer = - new SleepingTimer(THREAD_NAME, INTERVAL_MS, mMockLogger, mFakeClock, mMockSleeper); + new SleepingTimer(THREAD_NAME, mMockLogger, mFakeClock, mMockSleeper, + () -> new FixedIntervalSupplier(INTERVAL_MS)); timer.tick(); // first tick won't sleep verify(mMockSleeper, times(0)).sleep(any(Duration.class)); timer.tick(); @@ -75,7 +77,8 @@ public void sleepForSpecifiedInterval() throws Exception { @Test 
public void maintainInterval() throws Exception { SleepingTimer stimer = - new SleepingTimer(THREAD_NAME, INTERVAL_MS, mMockLogger, mFakeClock, mMockSleeper); + new SleepingTimer(THREAD_NAME, mMockLogger, mFakeClock, mMockSleeper, + () -> new FixedIntervalSupplier(INTERVAL_MS)); stimer.tick(); mFakeClock.addTimeMs(INTERVAL_MS / 3); diff --git a/dora/core/server/master/src/main/java/alluxio/master/block/DefaultBlockMaster.java b/dora/core/server/master/src/main/java/alluxio/master/block/DefaultBlockMaster.java index 0af8b818eb4b..9df2b8a17c94 100644 --- a/dora/core/server/master/src/main/java/alluxio/master/block/DefaultBlockMaster.java +++ b/dora/core/server/master/src/main/java/alluxio/master/block/DefaultBlockMaster.java @@ -41,6 +41,7 @@ import alluxio.grpc.ServiceType; import alluxio.grpc.StorageList; import alluxio.grpc.WorkerLostStorageInfo; +import alluxio.heartbeat.FixedIntervalSupplier; import alluxio.heartbeat.HeartbeatContext; import alluxio.heartbeat.HeartbeatExecutor; import alluxio.heartbeat.HeartbeatThread; @@ -477,7 +478,7 @@ public class WorkerRegisterStreamGCExecutor implements HeartbeatExecutor { .getMs(PropertyKey.MASTER_WORKER_REGISTER_STREAM_RESPONSE_TIMEOUT); @Override - public void heartbeat() { + public void heartbeat(long timeLimitMs) { AtomicInteger removedSessions = new AtomicInteger(0); mActiveRegisterContexts.entrySet().removeIf((entry) -> { WorkerRegisterContext context = entry.getValue(); @@ -522,7 +523,8 @@ public void start(Boolean isLeader) throws IOException { if (isLeader || mWorkerRegisterToAllMasters) { getExecutorService().submit(new HeartbeatThread( HeartbeatContext.MASTER_LOST_WORKER_DETECTION, new LostWorkerDetectionHeartbeatExecutor(), - () -> Configuration.getMs(PropertyKey.MASTER_LOST_WORKER_DETECTION_INTERVAL), + () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.MASTER_LOST_WORKER_DETECTION_INTERVAL)), Configuration.global(), mMasterContext.getUserState())); } @@ -530,7 +532,8 @@ 
HeartbeatContext.MASTER_LOST_WORKER_DETECTION, new LostWorkerDetectionHeartbeatE getExecutorService().submit(new HeartbeatThread( HeartbeatContext.MASTER_WORKER_REGISTER_SESSION_CLEANER, new WorkerRegisterStreamGCExecutor(), - () -> Configuration.getMs(PropertyKey.MASTER_WORKER_REGISTER_STREAM_RESPONSE_TIMEOUT), + () -> new FixedIntervalSupplier(Configuration.getMs( + PropertyKey.MASTER_WORKER_REGISTER_STREAM_RESPONSE_TIMEOUT)), Configuration.global(), mMasterContext.getUserState())); } @@ -1730,7 +1733,7 @@ public final class LostWorkerDetectionHeartbeatExecutor implements HeartbeatExec public LostWorkerDetectionHeartbeatExecutor() {} @Override - public void heartbeat() { + public void heartbeat(long timeLimitMs) { long masterWorkerTimeoutMs = Configuration.getMs(PropertyKey.MASTER_WORKER_TIMEOUT_MS); long masterWorkerDeleteTimeoutMs = Configuration.getMs(PropertyKey.MASTER_LOST_WORKER_DELETION_TIMEOUT_MS); diff --git a/dora/core/server/master/src/main/java/alluxio/master/block/meta/MasterWorkerInfo.java b/dora/core/server/master/src/main/java/alluxio/master/block/meta/MasterWorkerInfo.java index 8974ce548176..b2d08a66fcb3 100644 --- a/dora/core/server/master/src/main/java/alluxio/master/block/meta/MasterWorkerInfo.java +++ b/dora/core/server/master/src/main/java/alluxio/master/block/meta/MasterWorkerInfo.java @@ -111,7 +111,7 @@ * and block removal/commit. * 2. In {@link alluxio.master.block.WorkerRegisterContext}, * to write locks are held throughout the lifecycle. - * 3. In {@link DefaultBlockMaster.LostWorkerDetectionHeartbeatExecutor#heartbeat()} + * 3. 
In {@link DefaultBlockMaster.LostWorkerDetectionHeartbeatExecutor#heartbeat(long)} */ @NotThreadSafe public final class MasterWorkerInfo { diff --git a/dora/core/server/master/src/main/java/alluxio/master/file/BlockIntegrityChecker.java b/dora/core/server/master/src/main/java/alluxio/master/file/BlockIntegrityChecker.java index 24334a592eb0..46370d2f91f2 100644 --- a/dora/core/server/master/src/main/java/alluxio/master/file/BlockIntegrityChecker.java +++ b/dora/core/server/master/src/main/java/alluxio/master/file/BlockIntegrityChecker.java @@ -39,7 +39,7 @@ public BlockIntegrityChecker(FileSystemMaster fsm) { } @Override - public void heartbeat() { + public void heartbeat(long timeLimitMs) { try { mFileSystemMaster.validateInodeBlocks(mRepair); } catch (Exception e) { diff --git a/dora/core/server/master/src/main/java/alluxio/master/file/DefaultFileSystemMaster.java b/dora/core/server/master/src/main/java/alluxio/master/file/DefaultFileSystemMaster.java index 6a829a33dc6c..d124eae95dc5 100644 --- a/dora/core/server/master/src/main/java/alluxio/master/file/DefaultFileSystemMaster.java +++ b/dora/core/server/master/src/main/java/alluxio/master/file/DefaultFileSystemMaster.java @@ -60,6 +60,7 @@ import alluxio.grpc.SetAclAction; import alluxio.grpc.SetAttributePOptions; import alluxio.grpc.TtlAction; +import alluxio.heartbeat.FixedIntervalSupplier; import alluxio.heartbeat.HeartbeatContext; import alluxio.heartbeat.HeartbeatThread; import alluxio.job.plan.persist.PersistConfig; @@ -719,30 +720,35 @@ public void start(Boolean isPrimary) throws IOException { getExecutorService().submit( new HeartbeatThread(HeartbeatContext.MASTER_BLOCK_INTEGRITY_CHECK, new BlockIntegrityChecker(this), () -> - Configuration.getMs(PropertyKey.MASTER_PERIODIC_BLOCK_INTEGRITY_CHECK_INTERVAL), + new FixedIntervalSupplier(Configuration.getMs( + PropertyKey.MASTER_PERIODIC_BLOCK_INTEGRITY_CHECK_INTERVAL)), Configuration.global(), mMasterContext.getUserState())); } getExecutorService().submit( 
new HeartbeatThread(HeartbeatContext.MASTER_TTL_CHECK, new InodeTtlChecker(this, mInodeTree), - () -> Configuration.getMs(PropertyKey.MASTER_TTL_CHECKER_INTERVAL_MS), + () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.MASTER_TTL_CHECKER_INTERVAL_MS)), Configuration.global(), mMasterContext.getUserState())); getExecutorService().submit( new HeartbeatThread(HeartbeatContext.MASTER_LOST_FILES_DETECTION, new LostFileDetector(this, mBlockMaster, mInodeTree), - () -> Configuration.getMs(PropertyKey.MASTER_LOST_WORKER_FILE_DETECTION_INTERVAL), + () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.MASTER_LOST_WORKER_FILE_DETECTION_INTERVAL)), Configuration.global(), mMasterContext.getUserState())); mReplicationCheckHeartbeatThread = new HeartbeatThread( HeartbeatContext.MASTER_REPLICATION_CHECK, new alluxio.master.file.replication.ReplicationChecker(mInodeTree, mBlockMaster, mSafeModeManager, mJobMasterClientPool), - () -> Configuration.getMs(PropertyKey.MASTER_REPLICATION_CHECK_INTERVAL_MS), + () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.MASTER_REPLICATION_CHECK_INTERVAL_MS)), Configuration.global(), mMasterContext.getUserState()); getExecutorService().submit(mReplicationCheckHeartbeatThread); getExecutorService().submit( new HeartbeatThread(HeartbeatContext.MASTER_PERSISTENCE_SCHEDULER, new PersistenceScheduler(), - () -> Configuration.getMs(PropertyKey.MASTER_PERSISTENCE_SCHEDULER_INTERVAL_MS), + () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.MASTER_PERSISTENCE_SCHEDULER_INTERVAL_MS)), Configuration.global(), mMasterContext.getUserState())); mPersistCheckerPool = new java.util.concurrent.ThreadPoolExecutor(PERSIST_CHECKER_POOL_THREADS, @@ -753,12 +759,14 @@ public void start(Boolean isPrimary) throws IOException { getExecutorService().submit( new HeartbeatThread(HeartbeatContext.MASTER_PERSISTENCE_CHECKER, new PersistenceChecker(), - () -> 
Configuration.getMs(PropertyKey.MASTER_PERSISTENCE_CHECKER_INTERVAL_MS), + () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.MASTER_PERSISTENCE_CHECKER_INTERVAL_MS)), Configuration.global(), mMasterContext.getUserState())); getExecutorService().submit( new HeartbeatThread(HeartbeatContext.MASTER_METRICS_TIME_SERIES, new TimeSeriesRecorder(), - () -> Configuration.getMs(PropertyKey.MASTER_METRICS_TIME_SERIES_INTERVAL), + () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.MASTER_METRICS_TIME_SERIES_INTERVAL)), Configuration.global(), mMasterContext.getUserState())); if (Configuration.getBoolean(PropertyKey.MASTER_AUDIT_LOGGING_ENABLED)) { mAsyncAuditLogWriter = new AsyncUserAccessAuditLogWriter("AUDIT_LOG"); @@ -771,7 +779,8 @@ public void start(Boolean isPrimary) throws IOException { if (Configuration.getBoolean(PropertyKey.UNDERFS_CLEANUP_ENABLED)) { getExecutorService().submit( new HeartbeatThread(HeartbeatContext.MASTER_UFS_CLEANUP, new UfsCleaner(this), - () -> Configuration.getMs(PropertyKey.UNDERFS_CLEANUP_INTERVAL), + () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.UNDERFS_CLEANUP_INTERVAL)), Configuration.global(), mMasterContext.getUserState())); } if (mAccessTimeUpdater != null) { @@ -4573,7 +4582,7 @@ private void handleReady(long fileId, JournalContext journalContext, AtomicInteg * @throws InterruptedException if the thread is interrupted */ @Override - public void heartbeat() throws InterruptedException { + public void heartbeat(long timeLimitMs) throws InterruptedException { LOG.debug("Async Persist heartbeat start"); java.util.concurrent.TimeUnit.SECONDS.sleep(mQuietPeriodSeconds); AtomicInteger journalCounter = new AtomicInteger(0); @@ -4876,7 +4885,7 @@ private void createParentPath(List inodes, String ufsPath, } @Override - public void heartbeat() throws InterruptedException { + public void heartbeat(long timeLimitMs) throws InterruptedException { boolean queueEmpty = 
mPersistCheckerPool.getQueue().isEmpty(); // Check the progress of persist jobs. for (long fileId : mPersistJobs.keySet()) { @@ -4964,7 +4973,7 @@ public void heartbeat() throws InterruptedException { @NotThreadSafe private final class TimeSeriesRecorder implements alluxio.heartbeat.HeartbeatExecutor { @Override - public void heartbeat() throws InterruptedException { + public void heartbeat(long timeLimitMs) throws InterruptedException { // TODO(calvin): Provide a better way to keep track of metrics collected as time series MetricRegistry registry = MetricsSystem.METRIC_REGISTRY; SortedMap gauges = registry.getGauges(); diff --git a/dora/core/server/master/src/main/java/alluxio/master/file/InodeTtlChecker.java b/dora/core/server/master/src/main/java/alluxio/master/file/InodeTtlChecker.java index 0c9cf4a76ab4..595322679c31 100644 --- a/dora/core/server/master/src/main/java/alluxio/master/file/InodeTtlChecker.java +++ b/dora/core/server/master/src/main/java/alluxio/master/file/InodeTtlChecker.java @@ -61,7 +61,7 @@ public InodeTtlChecker(FileSystemMaster fileSystemMaster, InodeTree inodeTree) { } @Override - public void heartbeat() throws InterruptedException { + public void heartbeat(long timeLimitMs) throws InterruptedException { Set expiredBuckets = mTtlBuckets.pollExpiredBuckets(System.currentTimeMillis()); Map failedInodesToRetryNum = new HashMap<>(); for (TtlBucket bucket : expiredBuckets) { diff --git a/dora/core/server/master/src/main/java/alluxio/master/file/LostFileDetector.java b/dora/core/server/master/src/main/java/alluxio/master/file/LostFileDetector.java index 535bec900ec9..9f25b8d8a857 100644 --- a/dora/core/server/master/src/main/java/alluxio/master/file/LostFileDetector.java +++ b/dora/core/server/master/src/main/java/alluxio/master/file/LostFileDetector.java @@ -59,7 +59,7 @@ public LostFileDetector(FileSystemMaster fileSystemMaster, BlockMaster blockMast } @Override - public void heartbeat() throws InterruptedException { + public void 
heartbeat(long timeLimitMs) throws InterruptedException { Iterator iter = mBlockMaster.getLostBlocksIterator(); Set toMarkFiles = new HashSet<>(); while (iter.hasNext()) { diff --git a/dora/core/server/master/src/main/java/alluxio/master/file/UfsCleaner.java b/dora/core/server/master/src/main/java/alluxio/master/file/UfsCleaner.java index bc9ab0ab6ef4..5d1261bff807 100644 --- a/dora/core/server/master/src/main/java/alluxio/master/file/UfsCleaner.java +++ b/dora/core/server/master/src/main/java/alluxio/master/file/UfsCleaner.java @@ -30,7 +30,7 @@ public UfsCleaner(FileSystemMaster fileSystemMaster) { } @Override - public void heartbeat() { + public void heartbeat(long timeLimitMs) { mFileSystemMaster.cleanupUfs(); } diff --git a/dora/core/server/master/src/main/java/alluxio/master/file/activesync/ActiveSyncManager.java b/dora/core/server/master/src/main/java/alluxio/master/file/activesync/ActiveSyncManager.java index 214c1ec72e67..6993b31027dd 100644 --- a/dora/core/server/master/src/main/java/alluxio/master/file/activesync/ActiveSyncManager.java +++ b/dora/core/server/master/src/main/java/alluxio/master/file/activesync/ActiveSyncManager.java @@ -17,6 +17,7 @@ import alluxio.conf.Configuration; import alluxio.conf.PropertyKey; import alluxio.exception.InvalidPathException; +import alluxio.heartbeat.FixedIntervalSupplier; import alluxio.heartbeat.HeartbeatContext; import alluxio.heartbeat.HeartbeatThread; import alluxio.master.file.FileSystemMaster; @@ -262,7 +263,8 @@ public void launchPollingThread(long mountId, long txId) { ActiveSyncer syncer = new ActiveSyncer(mFileSystemMaster, this, mMountTable, mountId); Future future = getExecutor().submit( new HeartbeatThread(HeartbeatContext.MASTER_ACTIVE_UFS_SYNC, - syncer, () -> Configuration.getMs(PropertyKey.MASTER_UFS_ACTIVE_SYNC_INTERVAL), + syncer, () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.MASTER_UFS_ACTIVE_SYNC_INTERVAL)), Configuration.global(), ServerUserState.global())); 
mPollerMap.put(mountId, future); } diff --git a/dora/core/server/master/src/main/java/alluxio/master/file/activesync/ActiveSyncer.java b/dora/core/server/master/src/main/java/alluxio/master/file/activesync/ActiveSyncer.java index 666da9434682..e9ba8aebec3c 100644 --- a/dora/core/server/master/src/main/java/alluxio/master/file/activesync/ActiveSyncer.java +++ b/dora/core/server/master/src/main/java/alluxio/master/file/activesync/ActiveSyncer.java @@ -74,7 +74,7 @@ public ActiveSyncer(FileSystemMaster fileSystemMaster, ActiveSyncManager syncMan } @Override - public void heartbeat() { + public void heartbeat(long timeLimitMs) { LOG.debug("start sync heartbeat for {} with mount id {}", mMountUri, mMountId); // Remove any previously completed sync tasks mSyncTasks.removeIf(Future::isDone); diff --git a/dora/core/server/master/src/main/java/alluxio/master/file/replication/ReplicationChecker.java b/dora/core/server/master/src/main/java/alluxio/master/file/replication/ReplicationChecker.java index 44e801dc29d9..d669f182bbdb 100644 --- a/dora/core/server/master/src/main/java/alluxio/master/file/replication/ReplicationChecker.java +++ b/dora/core/server/master/src/main/java/alluxio/master/file/replication/ReplicationChecker.java @@ -148,7 +148,7 @@ private boolean shouldRun() { * (2) Is there any blocks over replicated, schedule evict jobs to reduce the replication level. 
*/ @Override - public void heartbeat() throws InterruptedException { + public void heartbeat(long timeLimitMs) throws InterruptedException { if (!shouldRun()) { return; } diff --git a/dora/core/server/master/src/main/java/alluxio/master/meta/DefaultMetaMaster.java b/dora/core/server/master/src/main/java/alluxio/master/meta/DefaultMetaMaster.java index 42f966c60bde..a41f3437f7bf 100644 --- a/dora/core/server/master/src/main/java/alluxio/master/meta/DefaultMetaMaster.java +++ b/dora/core/server/master/src/main/java/alluxio/master/meta/DefaultMetaMaster.java @@ -35,6 +35,7 @@ import alluxio.grpc.RegisterMasterPOptions; import alluxio.grpc.Scope; import alluxio.grpc.ServiceType; +import alluxio.heartbeat.FixedIntervalSupplier; import alluxio.heartbeat.HeartbeatContext; import alluxio.heartbeat.HeartbeatExecutor; import alluxio.heartbeat.HeartbeatThread; @@ -301,13 +302,14 @@ public void start(Boolean isPrimary) throws IOException { getExecutorService().submit(new HeartbeatThread( HeartbeatContext.MASTER_LOST_MASTER_DETECTION, new LostMasterDetectionHeartbeatExecutor(), - () -> Configuration.getMs(PropertyKey.MASTER_STANDBY_HEARTBEAT_INTERVAL), + () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.MASTER_STANDBY_HEARTBEAT_INTERVAL)), Configuration.global(), mMasterContext.getUserState())); getExecutorService().submit( new HeartbeatThread(HeartbeatContext.MASTER_LOG_CONFIG_REPORT_SCHEDULING, new LogConfigReportHeartbeatExecutor(), - () -> Configuration - .getMs(PropertyKey.MASTER_LOG_CONFIG_REPORT_HEARTBEAT_INTERVAL), + () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.MASTER_LOG_CONFIG_REPORT_HEARTBEAT_INTERVAL)), Configuration.global(), mMasterContext.getUserState())); if (Configuration.getBoolean(PropertyKey.MASTER_DAILY_BACKUP_ENABLED)) { @@ -318,7 +320,8 @@ public void start(Boolean isPrimary) throws IOException { if (mJournalSpaceMonitor != null) { getExecutorService().submit(new HeartbeatThread( 
HeartbeatContext.MASTER_JOURNAL_SPACE_MONITOR, mJournalSpaceMonitor, - () -> Configuration.getMs(PropertyKey.MASTER_JOURNAL_SPACE_MONITOR_INTERVAL), + () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.MASTER_JOURNAL_SPACE_MONITOR_INTERVAL)), Configuration.global(), mMasterContext.getUserState())); } if (mState.getClusterID().equals(INVALID_CLUSTER_ID)) { @@ -331,7 +334,8 @@ public void start(Boolean isPrimary) throws IOException { && !Configuration.getBoolean(PropertyKey.TEST_MODE)) { getExecutorService().submit(new HeartbeatThread(HeartbeatContext.MASTER_UPDATE_CHECK, new UpdateChecker(this), - () -> Configuration.getMs(PropertyKey.MASTER_UPDATE_CHECK_INTERVAL), + () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.MASTER_UPDATE_CHECK_INTERVAL)), Configuration.global(), mMasterContext.getUserState())); } } else { @@ -346,7 +350,8 @@ public void start(Boolean isPrimary) throws IOException { .newBuilder(ClientContext.create(Configuration.global())).build()); getExecutorService().submit(new HeartbeatThread(HeartbeatContext.META_MASTER_SYNC, new MetaMasterSync(mMasterAddress, metaMasterClient), - () -> Configuration.getMs(PropertyKey.MASTER_STANDBY_HEARTBEAT_INTERVAL), + () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.MASTER_STANDBY_HEARTBEAT_INTERVAL)), Configuration.global(), mMasterContext.getUserState())); LOG.info("Standby master with address {} starts sending heartbeat to leader master.", mMasterAddress); @@ -708,7 +713,7 @@ public LostMasterDetectionHeartbeatExecutor() { } @Override - public void heartbeat() { + public void heartbeat(long timeLimitMs) { long masterTimeoutMs = Configuration.getMs(PropertyKey.MASTER_HEARTBEAT_TIMEOUT); for (MasterInfo master : mMasters) { synchronized (master) { @@ -737,7 +742,7 @@ private final class LogConfigReportHeartbeatExecutor implements HeartbeatExecuto private volatile boolean mFirst = true; @Override - public void heartbeat() { + public void heartbeat(long timeLimitMs) 
{ // Skip the first heartbeat since it happens before servers have time to register their // configurations. if (mFirst) { diff --git a/dora/core/server/master/src/main/java/alluxio/master/meta/JournalSpaceMonitor.java b/dora/core/server/master/src/main/java/alluxio/master/meta/JournalSpaceMonitor.java index 8b74f695e6a9..d917be9e348f 100644 --- a/dora/core/server/master/src/main/java/alluxio/master/meta/JournalSpaceMonitor.java +++ b/dora/core/server/master/src/main/java/alluxio/master/meta/JournalSpaceMonitor.java @@ -169,7 +169,7 @@ public List getJournalDiskWarnings() { } @Override - public void heartbeat() throws InterruptedException { + public void heartbeat(long timeLimitMs) throws InterruptedException { getJournalDiskWarnings().forEach(LOG::warn); } diff --git a/dora/core/server/master/src/main/java/alluxio/master/meta/MetaMasterSync.java b/dora/core/server/master/src/main/java/alluxio/master/meta/MetaMasterSync.java index f793f2d7fa34..3b246cefae15 100644 --- a/dora/core/server/master/src/main/java/alluxio/master/meta/MetaMasterSync.java +++ b/dora/core/server/master/src/main/java/alluxio/master/meta/MetaMasterSync.java @@ -62,7 +62,7 @@ public MetaMasterSync(Address masterAddress, RetryHandlingMetaMasterMasterClient * Heartbeats to the leader master node. */ @Override - public void heartbeat() { + public void heartbeat(long timeLimitMs) { MetaCommand command = null; try { if (mMasterId.get() == UNINITIALIZED_MASTER_ID) { diff --git a/dora/core/server/master/src/main/java/alluxio/master/meta/UpdateChecker.java b/dora/core/server/master/src/main/java/alluxio/master/meta/UpdateChecker.java index d7d75f837014..7bfdfb6e77c2 100644 --- a/dora/core/server/master/src/main/java/alluxio/master/meta/UpdateChecker.java +++ b/dora/core/server/master/src/main/java/alluxio/master/meta/UpdateChecker.java @@ -45,7 +45,7 @@ public UpdateChecker(DefaultMetaMaster metaMaster) { * Heartbeat for the periodic update check. 
*/ @Override - public void heartbeat() { + public void heartbeat(long timeLimitMs) { try { List additionalInfo = new ArrayList<>(); int clusterSize = mMetaMaster.getWorkerAddresses().size(); diff --git a/dora/core/server/master/src/main/java/alluxio/master/metrics/DefaultMetricsMaster.java b/dora/core/server/master/src/main/java/alluxio/master/metrics/DefaultMetricsMaster.java index bf65ad6d2449..3ccbb8c7aba1 100644 --- a/dora/core/server/master/src/main/java/alluxio/master/metrics/DefaultMetricsMaster.java +++ b/dora/core/server/master/src/main/java/alluxio/master/metrics/DefaultMetricsMaster.java @@ -18,6 +18,7 @@ import alluxio.grpc.GrpcService; import alluxio.grpc.MetricValue; import alluxio.grpc.ServiceType; +import alluxio.heartbeat.FixedIntervalSupplier; import alluxio.heartbeat.HeartbeatContext; import alluxio.heartbeat.HeartbeatExecutor; import alluxio.heartbeat.HeartbeatThread; @@ -180,7 +181,8 @@ public void start(Boolean isLeader) throws IOException { if (isLeader) { getExecutorService().submit(new HeartbeatThread( HeartbeatContext.MASTER_CLUSTER_METRICS_UPDATER, new ClusterMetricsUpdater(), - () -> Configuration.getMs(PropertyKey.MASTER_CLUSTER_METRICS_UPDATE_INTERVAL), + () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.MASTER_CLUSTER_METRICS_UPDATE_INTERVAL)), Configuration.global(), mMasterContext.getUserState())); } } @@ -215,7 +217,7 @@ public Map getMetrics() { */ private class ClusterMetricsUpdater implements HeartbeatExecutor { @Override - public void heartbeat() throws InterruptedException { + public void heartbeat(long timeLimitMs) throws InterruptedException { updateMultiValueMasterMetrics(); } diff --git a/dora/core/server/master/src/main/java/alluxio/master/throttle/DefaultThrottleMaster.java b/dora/core/server/master/src/main/java/alluxio/master/throttle/DefaultThrottleMaster.java index 70ee98d0b85c..ef5eee6f489c 100644 --- a/dora/core/server/master/src/main/java/alluxio/master/throttle/DefaultThrottleMaster.java +++ 
b/dora/core/server/master/src/main/java/alluxio/master/throttle/DefaultThrottleMaster.java @@ -19,6 +19,7 @@ import alluxio.conf.PropertyKey; import alluxio.grpc.GrpcService; import alluxio.grpc.ServiceType; +import alluxio.heartbeat.FixedIntervalSupplier; import alluxio.heartbeat.HeartbeatContext; import alluxio.heartbeat.HeartbeatExecutor; import alluxio.heartbeat.HeartbeatThread; @@ -109,7 +110,8 @@ public void start(Boolean isLeader) throws IOException { LOG.info("Starting {}", getName()); mThrottleService = getExecutorService().submit( new HeartbeatThread(HeartbeatContext.MASTER_THROTTLE, mThrottleExecutor, - () -> Configuration.getMs(PropertyKey.MASTER_THROTTLE_HEARTBEAT_INTERVAL), + () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.MASTER_THROTTLE_HEARTBEAT_INTERVAL)), Configuration.global(), mMasterContext.getUserState())); LOG.info("{} is started", getName()); @@ -141,7 +143,7 @@ public ThrottleExecutor(MasterProcess masterProcess) { } @Override - public void heartbeat() throws InterruptedException { + public void heartbeat(long timeLimitMs) throws InterruptedException { mSystemMonitor.run(); } diff --git a/dora/core/server/master/src/test/java/alluxio/master/file/replication/ReplicationCheckerTest.java b/dora/core/server/master/src/test/java/alluxio/master/file/replication/ReplicationCheckerTest.java index 45c4db8333d1..0cb49ba8de5d 100644 --- a/dora/core/server/master/src/test/java/alluxio/master/file/replication/ReplicationCheckerTest.java +++ b/dora/core/server/master/src/test/java/alluxio/master/file/replication/ReplicationCheckerTest.java @@ -279,7 +279,7 @@ private void heartbeatToAddLocationHelper(long blockId, long workerId) throws Ex @Test public void heartbeatWhenTreeIsEmpty() throws Exception { - mReplicationChecker.heartbeat(); + mReplicationChecker.heartbeat(Long.MAX_VALUE); Assert.assertEquals(EMPTY, mMockReplicationHandler.getSetReplicaRequests()); } @@ -290,17 +290,17 @@ public void heartbeatFileWithinRange() throws 
Exception { createBlockHelper(TEST_FILE_1, mFileContext, ""); // One replica, meeting replication min addBlockLocationHelper(blockId, 1); - mReplicationChecker.heartbeat(); + mReplicationChecker.heartbeat(Long.MAX_VALUE); Assert.assertEquals(EMPTY, mMockReplicationHandler.getSetReplicaRequests()); // Two replicas, good heartbeatToAddLocationHelper(blockId, createWorkerHelper(1)); - mReplicationChecker.heartbeat(); + mReplicationChecker.heartbeat(Long.MAX_VALUE); Assert.assertEquals(EMPTY, mMockReplicationHandler.getSetReplicaRequests()); // Three replicas, meeting replication max, still good heartbeatToAddLocationHelper(blockId, createWorkerHelper(2)); - mReplicationChecker.heartbeat(); + mReplicationChecker.heartbeat(Long.MAX_VALUE); Assert.assertEquals(EMPTY, mMockReplicationHandler.getSetReplicaRequests()); } @@ -309,7 +309,7 @@ public void heartbeatFileUnderReplicatedBy1() throws Exception { mFileContext.getOptions().setReplicationMin(1); long blockId = createBlockHelper(TEST_FILE_1, mFileContext, ""); - mReplicationChecker.heartbeat(); + mReplicationChecker.heartbeat(Long.MAX_VALUE); Map expected = ImmutableMap.of(blockId, 1); Assert.assertEquals(expected, mMockReplicationHandler.getSetReplicaRequests()); } @@ -320,7 +320,7 @@ public void heartbeatFileNeedsMove() throws Exception { long blockId = createBlockHelper(TEST_FILE_1, mFileContext, Constants.MEDIUM_SSD); addBlockLocationHelper(blockId, 1); - mReplicationChecker.heartbeat(); + mReplicationChecker.heartbeat(Long.MAX_VALUE); Map> expected = ImmutableMap.of(blockId, new Pair<>("host0", Constants.MEDIUM_SSD)); Assert.assertEquals(EMPTY, mMockReplicationHandler.getSetReplicaRequests()); @@ -333,7 +333,7 @@ public void heartbeatFileDoesnotNeedMove() throws Exception { long blockId = createBlockHelper(TEST_FILE_1, mFileContext, Constants.MEDIUM_MEM); addBlockLocationHelper(blockId, 1); - mReplicationChecker.heartbeat(); + mReplicationChecker.heartbeat(Long.MAX_VALUE); Assert.assertEquals(EMPTY, 
mMockReplicationHandler.getSetReplicaRequests()); Assert.assertEquals(EMPTY, mMockReplicationHandler.getMigrateRequests()); } @@ -343,7 +343,7 @@ public void heartbeatFileUnderReplicatedBy10() throws Exception { mFileContext.getOptions().setReplicationMin(10); long blockId = createBlockHelper(TEST_FILE_1, mFileContext, ""); - mReplicationChecker.heartbeat(); + mReplicationChecker.heartbeat(Long.MAX_VALUE); Map expected = ImmutableMap.of(blockId, 10); Assert.assertEquals(expected, mMockReplicationHandler.getSetReplicaRequests()); } @@ -355,7 +355,7 @@ public void heartbeatMultipleFilesUnderReplicated() throws Exception { mFileContext.getOptions().setReplicationMin(2); long blockId2 = createBlockHelper(TEST_FILE_2, mFileContext, ""); - mReplicationChecker.heartbeat(); + mReplicationChecker.heartbeat(Long.MAX_VALUE); Map expected = ImmutableMap.of(blockId1, 1, blockId2, 2); Assert.assertEquals(expected, mMockReplicationHandler.getSetReplicaRequests()); } @@ -380,7 +380,7 @@ public void heartbeatFileUnderReplicatedAndLost() throws Exception { ImmutableMap.of(Constants.MEDIUM_MEM, 0L), ImmutableList.of(blockId), NO_BLOCKS_ON_LOCATION, NO_LOST_STORAGE, NO_METRICS); - mReplicationChecker.heartbeat(); + mReplicationChecker.heartbeat(Long.MAX_VALUE); Assert.assertEquals(EMPTY, mMockReplicationHandler.getSetReplicaRequests()); } @@ -390,7 +390,7 @@ public void heartbeatFileOverReplicatedBy1() throws Exception { long blockId = createBlockHelper(TEST_FILE_1, mFileContext, ""); addBlockLocationHelper(blockId, 2); - mReplicationChecker.heartbeat(); + mReplicationChecker.heartbeat(Long.MAX_VALUE); Map expected = ImmutableMap.of(blockId, 1); Assert.assertEquals(expected, mMockReplicationHandler.getSetReplicaRequests()); } @@ -401,7 +401,7 @@ public void heartbeatFileOverReplicatedBy10() throws Exception { long blockId = createBlockHelper(TEST_FILE_1, mFileContext, ""); addBlockLocationHelper(blockId, 11); - mReplicationChecker.heartbeat(); + 
mReplicationChecker.heartbeat(Long.MAX_VALUE); Map expected = ImmutableMap.of(blockId, 1); Assert.assertEquals(expected, mMockReplicationHandler.getSetReplicaRequests()); } @@ -415,7 +415,7 @@ public void heartbeatMultipleFilesOverReplicated() throws Exception { addBlockLocationHelper(blockId1, 2); addBlockLocationHelper(blockId2, 4); - mReplicationChecker.heartbeat(); + mReplicationChecker.heartbeat(Long.MAX_VALUE); Map expected = ImmutableMap.of(blockId1, 1, blockId2, 2); Assert.assertEquals(expected, mMockReplicationHandler.getSetReplicaRequests()); } @@ -429,7 +429,7 @@ public void heartbeatFilesUnderAndOverReplicated() throws Exception { addBlockLocationHelper(blockId1, 1); addBlockLocationHelper(blockId2, 5); - mReplicationChecker.heartbeat(); + mReplicationChecker.heartbeat(Long.MAX_VALUE); Map expected1 = ImmutableMap.of(blockId1, 2, blockId2, 3); Assert.assertEquals(expected1, mMockReplicationHandler.getSetReplicaRequests()); } @@ -447,7 +447,7 @@ public void heartbeatPartial() throws Exception { addBlockLocationHelper(blockId2, 1); addBlockLocationHelper(blockId3, 1); - mReplicationChecker.heartbeat(); + mReplicationChecker.heartbeat(Long.MAX_VALUE); final Map replicateRequests = mMockReplicationHandler.getSetReplicaRequests(); System.out.println(replicateRequests); Assert.assertEquals(2, replicateRequests.size()); @@ -457,11 +457,11 @@ public void heartbeatPartial() throws Exception { mMockReplicationHandler.setJobStatus(1, Status.RUNNING); mMockReplicationHandler.setJobStatus(2, Status.RUNNING); - mReplicationChecker.heartbeat(); + mReplicationChecker.heartbeat(Long.MAX_VALUE); Assert.assertEquals(0, replicateRequests.size()); mMockReplicationHandler.setJobStatus(1, Status.FAILED); - mReplicationChecker.heartbeat(); + mReplicationChecker.heartbeat(Long.MAX_VALUE); Assert.assertEquals(1, replicateRequests.size()); Assert.assertEquals(3, replicateRequests.values().toArray()[0]); @@ -471,7 +471,7 @@ public void heartbeatPartial() throws Exception { 
mMockReplicationHandler.setJobStatus(2, Status.COMPLETED); mMockReplicationHandler.setJobStatus(3, Status.COMPLETED); - mReplicationChecker.heartbeat(); + mReplicationChecker.heartbeat(Long.MAX_VALUE); Assert.assertEquals(1, replicateRequests.size()); Assert.assertTrue(replicateRequests.containsKey(blockId3)); Assert.assertEquals(3, replicateRequests.values().toArray()[0]); diff --git a/dora/core/server/master/src/test/java/alluxio/master/meta/JournalSpaceMonitorTest.java b/dora/core/server/master/src/test/java/alluxio/master/meta/JournalSpaceMonitorTest.java index eb638ae88800..8054599ee0a6 100644 --- a/dora/core/server/master/src/test/java/alluxio/master/meta/JournalSpaceMonitorTest.java +++ b/dora/core/server/master/src/test/java/alluxio/master/meta/JournalSpaceMonitorTest.java @@ -82,7 +82,7 @@ public void testLoggingPositive() throws IOException, InterruptedException { JournalSpaceMonitor monitor = Mockito.spy( new JournalSpaceMonitor(Paths.get(".").toAbsolutePath().toString(), 90)); doReturn(new CommandReturn(0, CMD_RETURN_MOCK)).when(monitor).getRawDiskInfo(); - monitor.heartbeat(); + monitor.heartbeat(Long.MAX_VALUE); assertTrue(mLogger.wasLoggedWithLevel("The journal disk /dev/nvme0n1p2 backing the journal " + "has only .* space left", Level.WARN)); } @@ -92,7 +92,7 @@ public void testLoggingNegative() throws IOException, InterruptedException { JournalSpaceMonitor monitor = Mockito.spy( new JournalSpaceMonitor(Paths.get(".").toAbsolutePath().toString(), 10)); doReturn(new CommandReturn(0, CMD_RETURN_MOCK)).when(monitor).getRawDiskInfo(); - monitor.heartbeat(); + monitor.heartbeat(Long.MAX_VALUE); assertFalse(mLogger.wasLoggedWithLevel("The journal disk /dev/nvme0n1p2 backing the journal " + "has only .* space left", Level.WARN)); } diff --git a/dora/core/server/worker/src/main/java/alluxio/worker/block/BlockMasterSync.java b/dora/core/server/worker/src/main/java/alluxio/worker/block/BlockMasterSync.java index b372c7f84ac8..3ac632238cc2 100644 --- 
a/dora/core/server/worker/src/main/java/alluxio/worker/block/BlockMasterSync.java +++ b/dora/core/server/worker/src/main/java/alluxio/worker/block/BlockMasterSync.java @@ -117,7 +117,7 @@ private void registerWithMaster() throws IOException { * Heartbeats to the master node about the change in the worker's managed space. */ @Override - public void heartbeat() { + public void heartbeat(long timeLimitMs) { boolean success = mBlockMasterSyncHelper.heartbeat( mWorkerId.get(), mBlockWorker.getReport(), mBlockWorker.getStoreMeta(), this::handleMasterCommand); diff --git a/dora/core/server/worker/src/main/java/alluxio/worker/block/BlockSyncMasterGroup.java b/dora/core/server/worker/src/main/java/alluxio/worker/block/BlockSyncMasterGroup.java index 6abc313fc1d1..ba9758da143a 100644 --- a/dora/core/server/worker/src/main/java/alluxio/worker/block/BlockSyncMasterGroup.java +++ b/dora/core/server/worker/src/main/java/alluxio/worker/block/BlockSyncMasterGroup.java @@ -15,6 +15,7 @@ import alluxio.ProcessUtils; import alluxio.conf.Configuration; import alluxio.conf.PropertyKey; +import alluxio.heartbeat.FixedIntervalSupplier; import alluxio.heartbeat.HeartbeatContext; import alluxio.heartbeat.HeartbeatThread; import alluxio.master.MasterClientContext; @@ -91,7 +92,8 @@ public synchronized void start(ExecutorService executorService) { } mMasterSyncOperators.values().forEach(blockMasterSync -> executorService .submit(new HeartbeatThread(HeartbeatContext.WORKER_BLOCK_SYNC, blockMasterSync, - () -> Configuration.getMs(PropertyKey.WORKER_BLOCK_HEARTBEAT_INTERVAL_MS), + () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.WORKER_BLOCK_HEARTBEAT_INTERVAL_MS)), Configuration.global(), ServerUserState.global()))); } diff --git a/dora/core/server/worker/src/main/java/alluxio/worker/block/DefaultBlockWorker.java b/dora/core/server/worker/src/main/java/alluxio/worker/block/DefaultBlockWorker.java index 214e46c2181d..d73ff41dc75d 100644 --- 
a/dora/core/server/worker/src/main/java/alluxio/worker/block/DefaultBlockWorker.java +++ b/dora/core/server/worker/src/main/java/alluxio/worker/block/DefaultBlockWorker.java @@ -42,6 +42,7 @@ import alluxio.grpc.GrpcService; import alluxio.grpc.ServiceType; import alluxio.grpc.UfsReadOptions; +import alluxio.heartbeat.FixedIntervalSupplier; import alluxio.heartbeat.HeartbeatContext; import alluxio.heartbeat.HeartbeatExecutor; import alluxio.heartbeat.HeartbeatThread; @@ -370,7 +371,8 @@ public void start(WorkerNetAddress address) throws IOException { new PinListSync(this, mFileSystemMasterClient)); getExecutorService() .submit(new HeartbeatThread(HeartbeatContext.WORKER_PIN_LIST_SYNC, pinListSync, - () -> Configuration.getMs(PropertyKey.WORKER_BLOCK_HEARTBEAT_INTERVAL_MS), + () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.WORKER_BLOCK_HEARTBEAT_INTERVAL_MS)), Configuration.global(), ServerUserState.global())); // Setup session cleaner @@ -383,7 +385,8 @@ public void start(WorkerNetAddress address) throws IOException { StorageChecker storageChecker = mResourceCloser.register(new StorageChecker()); getExecutorService() .submit(new HeartbeatThread(HeartbeatContext.WORKER_STORAGE_HEALTH, storageChecker, - () -> Configuration.getMs(PropertyKey.WORKER_BLOCK_HEARTBEAT_INTERVAL_MS), + () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.WORKER_BLOCK_HEARTBEAT_INTERVAL_MS)), Configuration.global(), ServerUserState.global())); } // Mounts the embedded Fuse application @@ -397,7 +400,8 @@ protected void setupBlockMasterSync() throws IOException { .register(new BlockMasterSync(this, mWorkerId, mAddress, mBlockMasterClientPool)); getExecutorService() .submit(new HeartbeatThread(HeartbeatContext.WORKER_BLOCK_SYNC, blockMasterSync, - () -> Configuration.getMs(PropertyKey.WORKER_BLOCK_HEARTBEAT_INTERVAL_MS), + () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.WORKER_BLOCK_HEARTBEAT_INTERVAL_MS)), Configuration.global(), 
ServerUserState.global())); } @@ -716,7 +720,7 @@ private Metrics() { public final class StorageChecker implements HeartbeatExecutor { @Override - public void heartbeat() { + public void heartbeat(long timeLimitMs) { try { mBlockStore.removeInaccessibleStorage(); } catch (Exception e) { diff --git a/dora/core/server/worker/src/main/java/alluxio/worker/block/PinListSync.java b/dora/core/server/worker/src/main/java/alluxio/worker/block/PinListSync.java index a85a50092a3c..67ac89a7357d 100644 --- a/dora/core/server/worker/src/main/java/alluxio/worker/block/PinListSync.java +++ b/dora/core/server/worker/src/main/java/alluxio/worker/block/PinListSync.java @@ -47,7 +47,7 @@ public PinListSync(BlockWorker blockWorker, FileSystemMasterClient masterClient) } @Override - public void heartbeat() { + public void heartbeat(long timeLimitMs) { // Send the sync try { Set pinList = mMasterClient.getPinList(); diff --git a/dora/core/server/worker/src/main/java/alluxio/worker/block/SpecificMasterBlockSync.java b/dora/core/server/worker/src/main/java/alluxio/worker/block/SpecificMasterBlockSync.java index 3c9aeea0b491..660e0735c785 100644 --- a/dora/core/server/worker/src/main/java/alluxio/worker/block/SpecificMasterBlockSync.java +++ b/dora/core/server/worker/src/main/java/alluxio/worker/block/SpecificMasterBlockSync.java @@ -182,7 +182,7 @@ private RetryPolicy createEndlessRetry() { } @Override - public synchronized void heartbeat() throws InterruptedException { + public synchronized void heartbeat(long runLimit) throws InterruptedException { if (mWorkerState == WorkerMasterRegistrationState.NOT_REGISTERED) { // Not registered because: // 1. The worker just started, we kick off the 1st registration here. 
diff --git a/dora/core/server/worker/src/main/java/alluxio/worker/dora/PagedDoraWorker.java b/dora/core/server/worker/src/main/java/alluxio/worker/dora/PagedDoraWorker.java index ec2ba9368ce9..d51132402554 100644 --- a/dora/core/server/worker/src/main/java/alluxio/worker/dora/PagedDoraWorker.java +++ b/dora/core/server/worker/src/main/java/alluxio/worker/dora/PagedDoraWorker.java @@ -48,6 +48,7 @@ import alluxio.grpc.ServiceType; import alluxio.grpc.UfsReadOptions; import alluxio.grpc.WriteOptions; +import alluxio.heartbeat.FixedIntervalSupplier; import alluxio.heartbeat.HeartbeatContext; import alluxio.heartbeat.HeartbeatExecutor; import alluxio.heartbeat.HeartbeatThread; @@ -223,7 +224,8 @@ public void start(WorkerNetAddress address) throws IOException { getExecutorService() .submit(new HeartbeatThread(HeartbeatContext.WORKER_BLOCK_SYNC, mResourceCloser.register(new BlockMasterSync()), - () -> Configuration.getMs(PropertyKey.WORKER_BLOCK_HEARTBEAT_INTERVAL_MS), + () -> new FixedIntervalSupplier(Configuration.getMs( + PropertyKey.WORKER_BLOCK_HEARTBEAT_INTERVAL_MS)), mConf, ServerUserState.global())); } @@ -626,7 +628,7 @@ public void cleanupSession(long sessionId) { private class BlockMasterSync implements HeartbeatExecutor { @Override - public void heartbeat() throws InterruptedException { + public void heartbeat(long timeLimitMs) throws InterruptedException { final Command cmdFromMaster; try (PooledResource bmc = mBlockMasterClientPool.acquireCloseable()) { cmdFromMaster = bmc.get().heartbeat(mWorkerId.get(), diff --git a/dora/core/server/worker/src/test/java/alluxio/underfs/SpecificMasterBlockSyncTest.java b/dora/core/server/worker/src/test/java/alluxio/underfs/SpecificMasterBlockSyncTest.java index cf02f215f52a..e88385f2ae56 100644 --- a/dora/core/server/worker/src/test/java/alluxio/underfs/SpecificMasterBlockSyncTest.java +++ b/dora/core/server/worker/src/test/java/alluxio/underfs/SpecificMasterBlockSyncTest.java @@ -63,24 +63,24 @@ public void 
heartbeatThread() throws Exception { assertFalse(sync.isRegistered()); // heartbeat registers the worker if it has not been registered. - sync.heartbeat(); + sync.heartbeat(Long.MAX_VALUE); assertTrue(sync.isRegistered()); // heartbeat returning register command resets the worker state. Configuration.set(PropertyKey.WORKER_REGISTER_STREAM_ENABLED, true); TestBlockMasterClient.INSTANCE.setReturnRegisterCommand(true); - sync.heartbeat(); + sync.heartbeat(Long.MAX_VALUE); TestBlockMasterClient.INSTANCE.setReturnRegisterCommand(false); assertFalse(sync.isRegistered()); Configuration.set(PropertyKey.WORKER_REGISTER_STREAM_ENABLED, false); TestBlockMasterClient.INSTANCE.setReturnRegisterCommand(true); - sync.heartbeat(); + sync.heartbeat(Long.MAX_VALUE); TestBlockMasterClient.INSTANCE.setReturnRegisterCommand(false); assertFalse(sync.isRegistered()); // heartbeat registers the worker if it has not been registered. - sync.heartbeat(); + sync.heartbeat(Long.MAX_VALUE); assertTrue(sync.isRegistered()); // TestBlockHeartbeatReporter generates the report with one more removed block id each time. @@ -88,7 +88,7 @@ public void heartbeatThread() throws Exception { // heartbeatReportCapacityThreshold is 3. TestBlockMasterClient.INSTANCE.mHeartbeatCallCount = 0; TestBlockMasterClient.INSTANCE.setHeartbeatError(true); - sync.heartbeat(); + sync.heartbeat(Long.MAX_VALUE); assertFalse(sync.isRegistered()); assertEquals( heartbeatReportCapacityThreshold, TestBlockMasterClient.INSTANCE.mHeartbeatCallCount); @@ -96,7 +96,7 @@ public void heartbeatThread() throws Exception { // registration should happen on the next heartbeat and the reporter should be cleared, // except the newly generated ones. 
TestBlockMasterClient.INSTANCE.setHeartbeatError(false); - sync.heartbeat(); + sync.heartbeat(Long.MAX_VALUE); assertTrue(sync.isRegistered()); assertEquals(1, blockHeartbeatReporter.generateReportAndClear().getBlockChangeCount()); diff --git a/dora/core/server/worker/src/test/java/alluxio/worker/block/PinListSyncTest.java b/dora/core/server/worker/src/test/java/alluxio/worker/block/PinListSyncTest.java index 2e8b44920ef6..dae0717ffef1 100644 --- a/dora/core/server/worker/src/test/java/alluxio/worker/block/PinListSyncTest.java +++ b/dora/core/server/worker/src/test/java/alluxio/worker/block/PinListSyncTest.java @@ -44,7 +44,7 @@ public Set getPinList() { }; PinListSync sync = new PinListSync(mBlockWorker, client); - sync.heartbeat(); + sync.heartbeat(Long.MAX_VALUE); // should receive the latest pin list assertEquals(testPinLists, mBlockWorker.getPinList()); @@ -62,7 +62,7 @@ public Set getPinList() throws IOException { PinListSync sync = new PinListSync(mBlockWorker, client); // should fail - sync.heartbeat(); + sync.heartbeat(Long.MAX_VALUE); // should not get any pin list update assertEquals(ImmutableSet.of(), mBlockWorker.getPinList()); diff --git a/dora/integration/fuse/src/main/java/alluxio/fuse/AlluxioFuse.java b/dora/integration/fuse/src/main/java/alluxio/fuse/AlluxioFuse.java index 731e2b04fe04..810da273969f 100644 --- a/dora/integration/fuse/src/main/java/alluxio/fuse/AlluxioFuse.java +++ b/dora/integration/fuse/src/main/java/alluxio/fuse/AlluxioFuse.java @@ -27,6 +27,7 @@ import alluxio.exception.runtime.InvalidArgumentRuntimeException; import alluxio.fuse.meta.UpdateChecker; import alluxio.fuse.options.FuseOptions; +import alluxio.heartbeat.FixedIntervalSupplier; import alluxio.heartbeat.HeartbeatContext; import alluxio.heartbeat.HeartbeatThread; import alluxio.jnifuse.LibFuse; @@ -168,7 +169,7 @@ public static void main(String[] args) throws ParseException { if (fuseOptions.updateCheckEnabled()) { executor = Executors.newSingleThreadExecutor(); 
executor.submit(new HeartbeatThread(HeartbeatContext.FUSE_UPDATE_CHECK, - UpdateChecker.create(fuseOptions), () -> Long.valueOf(Constants.DAY_MS), + UpdateChecker.create(fuseOptions), () -> new FixedIntervalSupplier(Constants.DAY_MS), Configuration.global(), UserState.Factory.create(conf))); } try (FileSystem fs = FileSystem.Factory.create(fsContext, fuseOptions.getFileSystemOptions())) { diff --git a/dora/integration/fuse/src/main/java/alluxio/fuse/meta/UpdateChecker.java b/dora/integration/fuse/src/main/java/alluxio/fuse/meta/UpdateChecker.java index 802ebd1ef434..bfcc6ca93f13 100644 --- a/dora/integration/fuse/src/main/java/alluxio/fuse/meta/UpdateChecker.java +++ b/dora/integration/fuse/src/main/java/alluxio/fuse/meta/UpdateChecker.java @@ -79,7 +79,7 @@ private UpdateChecker(List unchangeableFuseInfo, Map fuseO * Heartbeat for the periodic update check. */ @Override - public void heartbeat() { + public void heartbeat(long timeLimitMs) { try { String latestVersion = UpdateCheck.getLatestVersion(mInstanceId, getFuseCheckInfo(), diff --git a/dora/job/server/src/main/java/alluxio/master/job/JobMaster.java b/dora/job/server/src/main/java/alluxio/master/job/JobMaster.java index ae99321ca928..bc2782e01bfb 100644 --- a/dora/job/server/src/main/java/alluxio/master/job/JobMaster.java +++ b/dora/job/server/src/main/java/alluxio/master/job/JobMaster.java @@ -28,6 +28,7 @@ import alluxio.grpc.ListAllPOptions; import alluxio.grpc.RegisterCommand; import alluxio.grpc.ServiceType; +import alluxio.heartbeat.FixedIntervalSupplier; import alluxio.heartbeat.HeartbeatContext; import alluxio.heartbeat.HeartbeatExecutor; import alluxio.heartbeat.HeartbeatThread; @@ -199,7 +200,8 @@ public void start(Boolean isLeader) throws IOException { getExecutorService() .submit(new HeartbeatThread(HeartbeatContext.JOB_MASTER_LOST_WORKER_DETECTION, new LostWorkerDetectionHeartbeatExecutor(), - () -> Configuration.getMs(PropertyKey.JOB_MASTER_LOST_WORKER_INTERVAL), + () -> new 
FixedIntervalSupplier( + Configuration.getMs(PropertyKey.JOB_MASTER_LOST_WORKER_INTERVAL)), Configuration.global(), mMasterContext.getUserState())); if (Configuration.getBoolean(PropertyKey.MASTER_AUDIT_LOGGING_ENABLED)) { mAsyncAuditLogWriter = new AsyncUserAccessAuditLogWriter("JOB_MASTER_AUDIT_LOG"); @@ -694,7 +696,7 @@ private final class LostWorkerDetectionHeartbeatExecutor implements HeartbeatExe public LostWorkerDetectionHeartbeatExecutor() {} @Override - public void heartbeat() { + public void heartbeat(long timeLimitMs) { int masterWorkerTimeoutMs = (int) Configuration .getMs(PropertyKey.JOB_MASTER_WORKER_TIMEOUT); List lostWorkers = new ArrayList<>(); diff --git a/dora/job/server/src/main/java/alluxio/worker/JobWorker.java b/dora/job/server/src/main/java/alluxio/worker/JobWorker.java index aec996509b95..29a6cc054772 100644 --- a/dora/job/server/src/main/java/alluxio/worker/JobWorker.java +++ b/dora/job/server/src/main/java/alluxio/worker/JobWorker.java @@ -21,6 +21,7 @@ import alluxio.exception.ConnectionFailedException; import alluxio.grpc.GrpcService; import alluxio.grpc.ServiceType; +import alluxio.heartbeat.FixedIntervalSupplier; import alluxio.heartbeat.HeartbeatContext; import alluxio.heartbeat.HeartbeatThread; import alluxio.job.JobServerContext; @@ -107,7 +108,8 @@ public void start(WorkerNetAddress address) throws IOException { new HeartbeatThread(HeartbeatContext.JOB_WORKER_COMMAND_HANDLING, new CommandHandlingExecutor(mJobServerContext, taskExecutorManager, mJobMasterClient, address), - () -> Configuration.getMs(PropertyKey.JOB_MASTER_WORKER_HEARTBEAT_INTERVAL), + () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.JOB_MASTER_WORKER_HEARTBEAT_INTERVAL)), Configuration.global(), ServerUserState.global())); } diff --git a/dora/job/server/src/main/java/alluxio/worker/job/command/CommandHandlingExecutor.java b/dora/job/server/src/main/java/alluxio/worker/job/command/CommandHandlingExecutor.java index c52db6c0ff58..4d14e2418532 100644 
--- a/dora/job/server/src/main/java/alluxio/worker/job/command/CommandHandlingExecutor.java +++ b/dora/job/server/src/main/java/alluxio/worker/job/command/CommandHandlingExecutor.java @@ -83,7 +83,7 @@ public CommandHandlingExecutor(JobServerContext jobServerContext, } @Override - public void heartbeat() { + public void heartbeat(long timeLimitMs) { JobWorkerHealthReporter.JobWorkerHealthReport jobWorkerHealthReport = mHealthReporter.getJobWorkerHealthReport(); diff --git a/dora/job/server/src/test/java/alluxio/job/command/CommandHandlingExecutorTest.java b/dora/job/server/src/test/java/alluxio/job/command/CommandHandlingExecutorTest.java index 95310ff7c92b..15c2d804e916 100644 --- a/dora/job/server/src/test/java/alluxio/job/command/CommandHandlingExecutorTest.java +++ b/dora/job/server/src/test/java/alluxio/job/command/CommandHandlingExecutorTest.java @@ -86,7 +86,7 @@ public void heartbeat() throws Exception { Mockito.when(mJobMasterClient.heartbeat(any(JobWorkerHealth.class), eq(Lists.newArrayList()))) .thenReturn(Lists.newArrayList(command.build())); - mCommandHandlingExecutor.heartbeat(); + mCommandHandlingExecutor.heartbeat(Long.MAX_VALUE); ExecutorService executorService = AlluxioMockUtil.getInternalState( mCommandHandlingExecutor, "mCommandHandlingService"); executorService.shutdown(); diff --git a/dora/table/server/master/src/main/java/alluxio/master/table/transform/TransformManager.java b/dora/table/server/master/src/main/java/alluxio/master/table/transform/TransformManager.java index e5a24c5715be..ba7b9bab3a65 100644 --- a/dora/table/server/master/src/main/java/alluxio/master/table/transform/TransformManager.java +++ b/dora/table/server/master/src/main/java/alluxio/master/table/transform/TransformManager.java @@ -18,6 +18,7 @@ import alluxio.exception.ExceptionMessage; import alluxio.exception.status.NotFoundException; import alluxio.exception.status.UnavailableException; +import alluxio.heartbeat.FixedIntervalSupplier; import 
alluxio.heartbeat.HeartbeatContext; import alluxio.heartbeat.HeartbeatExecutor; import alluxio.heartbeat.HeartbeatThread; @@ -135,7 +136,8 @@ public TransformManager( public void start(ExecutorService executorService, UserState userState) { executorService.submit( new HeartbeatThread(HeartbeatContext.MASTER_TABLE_TRANSFORMATION_MONITOR, new JobMonitor(), - () -> Configuration.getMs(PropertyKey.TABLE_TRANSFORM_MANAGER_JOB_MONITOR_INTERVAL), + () -> new FixedIntervalSupplier( + Configuration.getMs(PropertyKey.TABLE_TRANSFORM_MANAGER_JOB_MONITOR_INTERVAL)), Configuration.global(), userState)); } @@ -300,7 +302,7 @@ private void handleJobSuccess(TransformJobInfo job) { } @Override - public void heartbeat() throws InterruptedException { + public void heartbeat(long timeLimitMs) throws InterruptedException { for (TransformJobInfo job : mState.getRunningJobs()) { if (Thread.currentThread().isInterrupted()) { throw new InterruptedException("TransformManager's heartbeat was interrupted"); diff --git a/dora/tests/src/test/java/alluxio/client/fs/BlockMasterDeleteLostWorkerIntegrationTest.java b/dora/tests/src/test/java/alluxio/client/fs/BlockMasterDeleteLostWorkerIntegrationTest.java index 1b620e2537ad..6a2779b4a04d 100644 --- a/dora/tests/src/test/java/alluxio/client/fs/BlockMasterDeleteLostWorkerIntegrationTest.java +++ b/dora/tests/src/test/java/alluxio/client/fs/BlockMasterDeleteLostWorkerIntegrationTest.java @@ -88,14 +88,14 @@ public void lostWorkerDeletedAfterTimeout() throws Exception { // The worker will not be deleted, if the lost time is less than MASTER_WORKER_TIMEOUT_MS long newTimeMs = worker.getLastUpdatedTimeMs() + MASTER_WORKER_TIMEOUT_MS + 1; mClock.setTimeMs(newTimeMs); - lostWorkerDetector.heartbeat(); + lostWorkerDetector.heartbeat(Long.MAX_VALUE); assertEquals(0, mBlockMaster.getWorkerCount()); assertEquals(1, mBlockMaster.getLostWorkerCount()); // The worker will be deleted, if the lost time is greater than MASTER_WORKER_TIMEOUT_MS newTimeMs = newTimeMs 
+ MASTER_WORKER_DELETE_TIMEOUT_MS + 1; mClock.setTimeMs(newTimeMs); - lostWorkerDetector.heartbeat(); + lostWorkerDetector.heartbeat(Long.MAX_VALUE); assertEquals(0, mBlockMaster.getWorkerCount()); assertEquals(0, mBlockMaster.getLostWorkerCount()); } diff --git a/dora/tests/src/test/java/alluxio/client/fs/FileSystemContextReinitIntegrationTest.java b/dora/tests/src/test/java/alluxio/client/fs/FileSystemContextReinitIntegrationTest.java index 22472643ab13..43c92688ec24 100644 --- a/dora/tests/src/test/java/alluxio/client/fs/FileSystemContextReinitIntegrationTest.java +++ b/dora/tests/src/test/java/alluxio/client/fs/FileSystemContextReinitIntegrationTest.java @@ -139,7 +139,7 @@ public void configHashSyncWithOpenStream() throws Exception { ExecutorService service = Executors.newSingleThreadExecutor(); Future future = service.submit(() -> { - mExecutor.heartbeat(); + mExecutor.heartbeat(Long.MAX_VALUE); }); TimeUnit.SECONDS.sleep(1); // Stream is open, so reinitialization should block until the stream is closed. @@ -159,7 +159,7 @@ public void configHashSyncWithOpenStream() throws Exception { * Triggers ConfigHashSync heartbeat and waits for it to finish. 
*/ private void triggerAndWaitSync() throws Exception { - mExecutor.heartbeat(); + mExecutor.heartbeat(Long.MAX_VALUE); } private void restartMasters() throws Exception { diff --git a/dora/tests/src/test/java/alluxio/server/block/BlockMasterRegisterStreamIntegrationTest.java b/dora/tests/src/test/java/alluxio/server/block/BlockMasterRegisterStreamIntegrationTest.java index dab5f3e302ab..6bd74f113f03 100644 --- a/dora/tests/src/test/java/alluxio/server/block/BlockMasterRegisterStreamIntegrationTest.java +++ b/dora/tests/src/test/java/alluxio/server/block/BlockMasterRegisterStreamIntegrationTest.java @@ -211,7 +211,7 @@ public void registerLostWorker() throws Exception { mClock.setTimeMs(newTimeMs); DefaultBlockMaster.LostWorkerDetectionHeartbeatExecutor lostWorkerDetector = ((DefaultBlockMaster) mBlockMaster).new LostWorkerDetectionHeartbeatExecutor(); - lostWorkerDetector.heartbeat(); + lostWorkerDetector.heartbeat(Long.MAX_VALUE); // Verify the worker has been forgotten assertEquals(0, mBlockMaster.getWorkerCount()); diff --git a/dora/tests/src/test/java/alluxio/server/block/BlockWorkerRegisterStreamIntegrationTest.java b/dora/tests/src/test/java/alluxio/server/block/BlockWorkerRegisterStreamIntegrationTest.java index 725b97c7f1f3..6ca95d201475 100644 --- a/dora/tests/src/test/java/alluxio/server/block/BlockWorkerRegisterStreamIntegrationTest.java +++ b/dora/tests/src/test/java/alluxio/server/block/BlockWorkerRegisterStreamIntegrationTest.java @@ -473,7 +473,7 @@ public void deleteDuringRegisterStream() throws Exception { f.get(); assertNull(error.get()); // Validation will happen on the heartbeat - sync.heartbeat(); + sync.heartbeat(Long.MAX_VALUE); } // TODO(jiacheng): an internal block movement happens during register stream From 8cbc4908da1985b5b0ebec3bb807ce9bd3764f4e Mon Sep 17 00:00:00 2001 From: jiacheliu3 Date: Fri, 19 May 2023 21:09:58 +0800 Subject: [PATCH 27/27] fix SummaryCommandTest to accept any version string --- 
.../alluxio/cli/fsadmin/report/SummaryCommandTest.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dora/shell/src/test/java/alluxio/cli/fsadmin/report/SummaryCommandTest.java b/dora/shell/src/test/java/alluxio/cli/fsadmin/report/SummaryCommandTest.java index e500e259bdd9..303fd4add38c 100644 --- a/dora/shell/src/test/java/alluxio/cli/fsadmin/report/SummaryCommandTest.java +++ b/dora/shell/src/test/java/alluxio/cli/fsadmin/report/SummaryCommandTest.java @@ -193,11 +193,12 @@ private void checkIfOutputValid(String dateFormatPattern, List " Version: testVersion", " Safe Mode: false")); expectedOutput.addAll(HAPattern); + String versionStr = String.format("%-32s", RuntimeConstants.VERSION); expectedOutput.addAll(new ArrayList<>(Arrays.asList( " Master Address State Version ", - " hostname1:10000 Primary 2.10.0-SNAPSHOT ", - " hostname2:10001 Standby 2.10.0-SNAPSHOT ", - " hostname3:10002 Standby 2.10.0-SNAPSHOT ", + " hostname1:10000 Primary " + versionStr, + " hostname2:10001 Standby " + versionStr, + " hostname3:10002 Standby " + versionStr, " Live Workers: 12", " Lost Workers: 4", " Total Capacity: 1309.92KB",