Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TSPS-306 getResult returns signed urls for all outputs #120

Merged
merged 12 commits into from
Sep 10, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -12,23 +12,50 @@ private FileUtils() {
private static final String USER_PROVIDED_FILE_INPUT_DIRECTORY = "user-input-files";

/**
* Extract the blob name from the full file path, using the workspaceId as a delimiter.
* Extract the blob name from the full Azure file path, using the workspaceId as a delimiter.
*
* <p>For example, `https://lz123.blob.core.windows.net/sc-{workspaceId}/path/to/file` becomes
* `path/to/file`
* <p>For example, with workspaceId as the workspaceSubstringStart,
* `https://lz123.blob.core.windows.net/sc-{workspaceDelimiter}/path/to/file` becomes
* `path/to/file`.
*
* @param blobHttpUrl
* @param blobUrl
* @param workspaceId
*/
public static String getBlobNameFromTerraWorkspaceStorageUrlAzure(
String blobUrl, String workspaceId) {
return getBlobNameFromTerraWorkspaceStorageUrl(blobUrl, workspaceId);
}

/**
* Extract the blob name from the full GCP file path, using the workspaceStorageContainerName as a
* delimiter.
*
* <p>For example, with workspaceStorageContainerName as the workspaceSubstringStart,
* `gs://{workspaceDelimiter}/path/to/file` becomes `path/to/file`.
*
* @param blobUrl
* @param workspaceStorageContainerName
*/
public static String getBlobNameFromTerraWorkspaceStorageUrlGcp(
String blobUrl, String workspaceStorageContainerName) {
return getBlobNameFromTerraWorkspaceStorageUrl(blobUrl, workspaceStorageContainerName);
}

/**
* Extract the blob name from the full file path, using a defined workspaceSubstringStart.
*
* @param blobUrl
* @param workspaceSubstringStart
* @return blobName
*/
public static String getBlobNameFromTerraWorkspaceStorageHttpUrl(
String blobHttpUrl, UUID workspaceId) {
if (!blobHttpUrl.contains(workspaceId.toString())) {
private static String getBlobNameFromTerraWorkspaceStorageUrl(
String blobUrl, String workspaceSubstringStart) {
if (!blobUrl.contains(workspaceSubstringStart)) {
throw new InternalServerErrorException(
"File path and workspaceId do not match. Cannot extract blob name.");
"File path and workspaceSubstringStart do not match. Cannot extract blob name.");
}
return blobHttpUrl.substring(
blobHttpUrl.indexOf(workspaceId.toString()) + workspaceId.toString().length() + 1);
return blobUrl.substring(
blobUrl.indexOf(workspaceSubstringStart) + workspaceSubstringStart.length() + 1);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@
import com.google.cloud.storage.StorageException;
import java.net.URL;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.retry.support.RetryTemplate;
Expand Down Expand Up @@ -70,11 +72,82 @@ public URL generatePutObjectSignedUrl(String projectId, String bucketName, Strin
Storage.SignUrlOption.withExtHeaders(extensionHeaders),
Storage.SignUrlOption.withV4Signature()));

logger.info("Generated PUT signed URL: {}", url);
String cleanSignedUrlString = cleanSignedUrl(url);
logger.info("Generated PUT signed URL: {}", cleanSignedUrlString);

return url;
}

/**
* Generates and returns a GET (read-only) signed url for a specific object in a bucket. See
* documentation on signed urls <a
* href="https://cloud.google.com/storage/docs/access-control/signed-urls">here</a>.
*
* <p>The output URL can be used with a curl command to download an object: `curl '{url}' >
* {local_file_name}`
*
* @param projectId Google project id
* @param bucketName without a prefix
* @param objectName should include the full path of the object (subdirectories + file name)
* @return url that can be used to download an object to GCS
*/
public URL generateGetObjectSignedUrl(String projectId, String bucketName, String objectName)
throws StorageException {
// define target blob object resource
BlobInfo blobInfo = BlobInfo.newBuilder(BlobId.of(bucketName, objectName)).build();

// generate signed URL
URL url =
executionWithRetryTemplate(
listenerResetRetryTemplate,
() ->
gcsClient
.getStorageService(projectId)
.signUrl(
blobInfo,
gcsConfiguration.signedUrlGetDurationHours(),
TimeUnit.HOURS,
Storage.SignUrlOption.httpMethod(HttpMethod.GET),
Storage.SignUrlOption.withV4Signature()));

String cleanSignedUrlString = cleanSignedUrl(url);
logger.info("Generated GET signed URL: {}", cleanSignedUrlString);

return url;
}

/**
* Redact the X-Google-Signature element's value from the signed url and return the cleaned result
* as a string.
*
* @param signedUrl
* @return
*/
public static String cleanSignedUrl(URL signedUrl) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice this looks great

String signedUrlString = signedUrl.toString();
String[] signedUrlParts = signedUrlString.split("\\?");
String elementDelimiter = "&";
List<String> signedUrlElements = List.of(signedUrlParts[1].split(elementDelimiter));

String signatureKey = "X-Goog-Signature";

String cleanUrl = signedUrlString;
if (signedUrlString.contains(signatureKey)) {
String urlElementsWithoutSignature =
signedUrlElements.stream()
.filter(signedUrlElement -> !signedUrlElement.contains(signatureKey))
.collect(Collectors.joining(elementDelimiter));
cleanUrl =
signedUrlParts[0]
+ "?"
+ urlElementsWithoutSignature
+ elementDelimiter
+ signatureKey
+ "=REDACTED";
}
return cleanUrl;
}

interface GcsAction<T> {
T execute();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,20 @@ public Entity upsertDataTableEntity(
.createEntity(entity, workspaceNamespace, workspaceName));
}

public Entity getDataTableEntity(
String accessToken,
String workspaceNamespace,
String workspaceName,
String entityType,
String entityName) {
return executionWithRetryTemplate(
listenerResetRetryTemplate,
() ->
rawlsClient
.getEntitiesApi(accessToken)
.getEntity(workspaceNamespace, workspaceName, entityType, entityName, null, null));
}

// returns true if submission is in a running state
public static boolean submissionIsRunning(Submission submission) {
return !FINAL_RUN_STATES.contains(submission.getStatus());
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package bio.terra.pipelines.service;

import static bio.terra.pipelines.common.utils.FileUtils.constructDestinationBlobNameForUserInputFile;
import static bio.terra.pipelines.common.utils.FileUtils.getBlobNameFromTerraWorkspaceStorageUrlGcp;
import static java.util.Collections.emptyList;
import static org.springframework.data.domain.PageRequest.ofSize;

Expand Down Expand Up @@ -397,8 +398,8 @@ public PipelineRun markPipelineRunSuccessAndWriteOutputs(
// methods to interact with and format pipeline run outputs

/**
* Extract the pipeline outputs from a pipelineRun object and return an ApiPipelineRunOutputs
* object with the outputs.
* Extract the pipeline outputs from a pipelineRun object, create signed GET (read-only) urls for
* each file, and return an ApiPipelineRunOutputs object with the outputs.
*
* @param pipelineRun object from the pipelineRunsRepository
* @return ApiPipelineRunOutputs
Expand All @@ -408,6 +409,17 @@ public ApiPipelineRunOutputs formatPipelineRunOutputs(PipelineRun pipelineRun) {
pipelineRunOutputsAsMap(
pipelineOutputsRepository.findPipelineOutputsByJobId(pipelineRun.getId()).getOutputs());

// currently all outputs are paths that will need a signed url
String workspaceStorageContainerName = pipelineRun.getWorkspaceStorageContainerName();
outputsMap.replaceAll(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is neat

(k, v) ->
gcsService
.generateGetObjectSignedUrl(
pipelineRun.getWorkspaceGoogleProject(),
workspaceStorageContainerName,
getBlobNameFromTerraWorkspaceStorageUrlGcp(v, workspaceStorageContainerName))
.toString());

ApiPipelineRunOutputs apiPipelineRunOutputs = new ApiPipelineRunOutputs();
apiPipelineRunOutputs.putAll(outputsMap);
return apiPipelineRunOutputs;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,16 @@
import bio.terra.pipelines.common.utils.FlightUtils;
import bio.terra.pipelines.common.utils.PipelinesEnum;
import bio.terra.pipelines.dependencies.stairway.JobMapKeys;
import bio.terra.pipelines.stairway.imputation.steps.CompletePipelineRunStep;
import bio.terra.pipelines.stairway.imputation.steps.PrepareImputationInputsStep;
import bio.terra.pipelines.stairway.imputation.steps.gcp.AddDataTableRowStep;
import bio.terra.pipelines.stairway.imputation.steps.gcp.FetchOutputsFromDataTableStep;
import bio.terra.pipelines.stairway.imputation.steps.gcp.PollCromwellSubmissionStatusStep;
import bio.terra.pipelines.stairway.imputation.steps.gcp.SubmitCromwellSubmissionStep;
import bio.terra.stairway.Flight;
import bio.terra.stairway.FlightMap;
import bio.terra.stairway.RetryRule;
import bio.terra.stairway.RetryRuleExponentialBackoff;
import bio.terra.stairway.RetryRuleFixedInterval;
import bio.terra.stairway.Step;

Expand All @@ -21,6 +24,16 @@ public class RunImputationGcpJobFlight extends Flight {
private final RetryRule dbRetryRule =
new RetryRuleFixedInterval(/*intervalSeconds= */ 1, /* maxCount= */ 5);

/**
* Use for a short exponential backoff retry, for operations that should be completable within a
* few seconds.
*/
private final RetryRule externalServiceRetryRule =
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thank you!

// maxOperationTimeSeconds must be larger than socket timeout (20s), otherwise a socket
// timeout
// won't be retried.
new RetryRuleExponentialBackoff(1, 8, /* maxOperationTimeSeconds */ 30);

// addStep is protected in Flight, so make an override that is public
@Override
public void addStep(Step step, RetryRule retryRule) {
Expand Down Expand Up @@ -56,20 +69,28 @@ public RunImputationGcpJobFlight(FlightMap inputParameters, Object beanBag) {
dbRetryRule);

addStep(
new AddDataTableRowStep(flightBeanBag.getRawlsService(), flightBeanBag.getSamService()));
new AddDataTableRowStep(flightBeanBag.getRawlsService(), flightBeanBag.getSamService()),
externalServiceRetryRule);

addStep(
new SubmitCromwellSubmissionStep(
flightBeanBag.getRawlsService(),
flightBeanBag.getSamService(),
flightBeanBag.getImputationConfiguration()),
dbRetryRule);
externalServiceRetryRule);

addStep(
new PollCromwellSubmissionStatusStep(
flightBeanBag.getSamService(),
flightBeanBag.getRawlsService(),
flightBeanBag.getImputationConfiguration()),
dbRetryRule);
externalServiceRetryRule);

addStep(
new FetchOutputsFromDataTableStep(
flightBeanBag.getRawlsService(), flightBeanBag.getSamService()),
externalServiceRetryRule);

addStep(new CompletePipelineRunStep(flightBeanBag.getPipelineRunsService()), dbRetryRule);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package bio.terra.pipelines.stairway.imputation.steps.gcp;

import bio.terra.pipelines.common.utils.FlightUtils;
import bio.terra.pipelines.common.utils.PipelinesEnum;
import bio.terra.pipelines.db.entities.PipelineOutputDefinition;
import bio.terra.pipelines.dependencies.rawls.RawlsService;
import bio.terra.pipelines.dependencies.rawls.RawlsServiceApiException;
import bio.terra.pipelines.dependencies.sam.SamService;
import bio.terra.pipelines.dependencies.stairway.JobMapKeys;
import bio.terra.pipelines.stairway.imputation.RunImputationJobFlightMapKeys;
import bio.terra.rawls.model.Entity;
import bio.terra.stairway.FlightContext;
import bio.terra.stairway.FlightMap;
import bio.terra.stairway.Step;
import bio.terra.stairway.StepResult;
import bio.terra.stairway.StepStatus;
import com.fasterxml.jackson.core.type.TypeReference;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
* This step calls Rawls to fetch outputs from a data table row for a given job and stores them in
* the flight's working map. These outputs are considered raw in that they are cloud paths and not
* signed urls.
*/
public class FetchOutputsFromDataTableStep implements Step {

private final RawlsService rawlsService;
private final SamService samService;

public FetchOutputsFromDataTableStep(RawlsService rawlsService, SamService samService) {
this.rawlsService = rawlsService;
this.samService = samService;
}

@Override
@SuppressWarnings(
"java:S2259") // suppress warning for possible NPE when calling pipelineName.getValue(),
// since we do validate that pipelineName is not null in `validateRequiredEntries`
public StepResult doStep(FlightContext flightContext) {
String jobId = flightContext.getFlightId();

// validate and extract parameters from input map
var inputParameters = flightContext.getInputParameters();
FlightUtils.validateRequiredEntries(
inputParameters,
JobMapKeys.PIPELINE_NAME.getKeyName(),
RunImputationJobFlightMapKeys.CONTROL_WORKSPACE_BILLING_PROJECT,
RunImputationJobFlightMapKeys.CONTROL_WORKSPACE_NAME,
RunImputationJobFlightMapKeys.PIPELINE_OUTPUT_DEFINITIONS);

String controlWorkspaceBillingProject =
inputParameters.get(
RunImputationJobFlightMapKeys.CONTROL_WORKSPACE_BILLING_PROJECT, String.class);
String controlWorkspaceName =
inputParameters.get(RunImputationJobFlightMapKeys.CONTROL_WORKSPACE_NAME, String.class);
PipelinesEnum pipelineName =
inputParameters.get(JobMapKeys.PIPELINE_NAME.getKeyName(), PipelinesEnum.class);
List<PipelineOutputDefinition> outputDefinitions =
inputParameters.get(
RunImputationJobFlightMapKeys.PIPELINE_OUTPUT_DEFINITIONS, new TypeReference<>() {});

Entity entity;
try {
entity =
rawlsService.getDataTableEntity(
samService.getTeaspoonsServiceAccountToken(),
controlWorkspaceBillingProject,
controlWorkspaceName,
pipelineName.getValue(),
jobId);
} catch (RawlsServiceApiException e) {
return new StepResult(StepStatus.STEP_RESULT_FAILURE_RETRY, e);
}

Map<String, String> outputs = new HashMap<>();
for (PipelineOutputDefinition outputDefinition : outputDefinitions) {
String keyName = outputDefinition.getName();
String wdlVariableName = outputDefinition.getWdlVariableName();
outputs.put(keyName, entity.getAttributes().get(wdlVariableName).toString());
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as we chatted on slack, it may be worth validating the outputs exist and are in fact not empty

}

FlightMap workingMap = flightContext.getWorkingMap();
workingMap.put(RunImputationJobFlightMapKeys.PIPELINE_RUN_OUTPUTS, outputs);

return StepResult.getStepResultSuccess();
}

@Override
public StepResult undoStep(FlightContext flightContext) {
// nothing to undo
return StepResult.getStepResultSuccess();
}
}
Loading
Loading