Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(deadline): add WorkerInstanceConfiguration construct #209

Merged
merged 6 commits into from
Nov 6, 2020

Conversation

ddneilson
Copy link
Contributor

@ddneilson ddneilson commented Nov 4, 2020

Fixes: #208

This is a construct that provides helper methods for configuring a Deadline Worker in the RFDK context.

Testing

This was tested by deploying both a WorkerInstanceFleet and a single instance using the WorkerConfiguration construct. Deployments were in the context of the simple app that's in the RFDK docs here -- https://docs.aws.amazon.com/rfdk/latest/guide/what-is-rfdk.html#why-use-the-rfdk . After deployment, I ensured that the setup was correct via the generated CloudWatch logs.

Python test code for the test was:

import os
import jsii
import aws_cdk.core as core
import aws_cdk.aws_ec2 as ec2
import aws_cdk.aws_iam as iam
import aws_cdk.aws_efs as efs
import aws_rfdk as rfdk_core
import aws_rfdk.deadline as rfdk_deadline

@jsii.implements(rfdk_deadline.IHost)
class WorkerInstance(core.Construct):
  """A single Deadline Worker instance that acts as its own ``IHost``.

  The construct exposes the properties required by ``rfdk_deadline.IHost``
  (``connections``, ``grant_principal``, ``os_type`` and ``user_data``), hands
  itself to ``WorkerInstanceConfiguration`` so that the configuration commands
  are added to its user data, and then declares the EC2 instance with the
  low-level ``ec2.CfnInstance`` resource.
  """

  def __init__(self, scope: core.Construct, id: str, *, vpc: ec2.Vpc, render_queue: rfdk_deadline.RenderQueue) -> None:
    """Create the role, security group, worker configuration and instance.

    :param scope: Parent construct.
    :param id: Construct id of this worker within ``scope``.
    :param vpc: VPC that hosts the security group and supplies the private
      subnet the instance is launched into.
    :param render_queue: Render queue passed to ``WorkerInstanceConfiguration``.
    """
    super().__init__(scope, id)

    deployment_region = core.Stack.of(self).region

    machine_image = ec2.MachineImage.generic_linux({
      deployment_region: "ami-05d4887175201bde8" # Deadline 10.1.10.6 AWSPortal Worker for Linux, us-west-2
    })
    # Resolve the image once; the OS type and the AMI id both come from it.
    image_config = machine_image.get_image(self)
    self.os_type = image_config.os_type
    self.grant_principal = iam.Role(self, 'WorkerRole', assumed_by=iam.ServicePrincipal('ec2.amazonaws.com'))
    self.user_data = ec2.UserData.for_linux()
    security_group = ec2.SecurityGroup(self, 'WorkerSecGrp',
      vpc=vpc,
      description='Security group for Fleet [fleet identifier]'
    )
    self.connections = ec2.Connections(security_groups=[security_group])

    # Note: Must configure userData before we render it
    # The construct is created for its effect on this host; no reference is kept.
    rfdk_deadline.WorkerInstanceConfiguration(self, 'Config',
      worker=self,
      render_queue=render_queue,
      cloudwatch_log_settings=rfdk_core.LogGroupFactoryProps(
        log_group_prefix='/test-worker/'
      ),
      worker_settings=rfdk_deadline.WorkerSettings(
        groups=['g1'],
      )
    )

    # Launch into the first private subnet of the VPC.
    worker_subnet = vpc.select_subnets(subnet_type=ec2.SubnetType.PRIVATE).subnets[0]
    ec2.CfnInstance(self, 'Worker',
      image_id=image_config.image_id,
      instance_type='t3.small',
      security_group_ids=[security_group.security_group_id],
      iam_instance_profile=iam.CfnInstanceProfile(self, 'WorkerProfile',
        roles=[self.grant_principal.role_name],
      ).ref,
      # Rendered only here, after the configuration above has been added.
      user_data=core.Fn.base64(self.user_data.render()),
      subnet_id=worker_subnet.subnet_id,
      availability_zone=worker_subnet.availability_zone,
    )

  # =====================================
  # Properties for the IHost interface = (IConnectable, IGrantable, IScriptHost)

  # --------
  #  ec2.IConnectable
  @property
  def connections(self):
    """Network connections object wrapping the worker's security group."""
    return self._connections

  @connections.setter
  def connections(self, value):
    self._connections = value

  # -------
  #  iam.IGrantable
  @property
  def grant_principal(self):
    """IAM role attached to the instance through its instance profile."""
    return self._grant_principal

  @grant_principal.setter
  def grant_principal(self, value):
    self._grant_principal = value

  # -----
  #  rfdk.IScriptHost
  @property
  def os_type(self):
    """Operating system type reported by the resolved machine image."""
    return self._os_type

  @os_type.setter
  def os_type(self, value):
    self._os_type = value

  @property
  def user_data(self):
    """User data object that is rendered into the instance definition."""
    return self._user_data

  @user_data.setter
  def user_data(self, value):
    self._user_data = value

class InfrastructureStack(core.Stack):
  """Stack with the shared pieces of the sample render farm.

  Exposes the created resources as ``vpc``, ``repository`` and
  ``render_queue`` attributes.
  """

  def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    """Build the VPC, Deadline repository and render queue.

    :param scope: Parent construct.
    :param id: Construct id of this stack.
    :param kwargs: Forwarded unchanged to ``core.Stack``.
    """
    super().__init__(scope, id, **kwargs)

    self.vpc = ec2.Vpc(self, "Vpc", max_azs=2)

    # The staged recipes are read from the 'stage' directory that sits
    # beside the directory containing this file.
    here = os.path.dirname(os.path.abspath(__file__))
    recipe_stage = rfdk_deadline.Stage.from_directory(
      os.path.abspath(os.path.join(here, '..', 'stage'))
    )
    recipes = rfdk_deadline.ThinkboxDockerRecipes(self, 'ServerImages', stage=recipe_stage)
    deadline_version = recipes.version

    # Allow resources to be deleted when we delete the sample
    destroy_with_sample = rfdk_deadline.RepositoryRemovalPolicies(
      database=core.RemovalPolicy.DESTROY,
      filesystem=core.RemovalPolicy.DESTROY,
    )
    self.repository = rfdk_deadline.Repository(self, 'Repository',
      vpc=self.vpc,
      version=deadline_version,
      removal_policy=destroy_with_sample,
    )

    self.render_queue = rfdk_deadline.RenderQueue(self, 'RenderQueue',
      vpc=self.vpc,
      version=deadline_version,
      images=recipes.render_queue_images,
      repository=self.repository,
      # Allow the load-balancer to be deleted when we delete the sample
      deletion_protection=False,
    )



class WorkersStack(core.Stack):
  """Stack with the workers: one ``WorkerInstance`` and two worker fleets."""

  def __init__(self, scope: core.Construct, id: str, *, render_queue, vpc, **kwargs) -> None:
    """Create the single worker, a health monitor and the two fleets.

    :param scope: Parent construct.
    :param id: Construct id of this stack.
    :param render_queue: Render queue handed to the worker and both fleets.
    :param vpc: VPC handed to the worker, the health monitor and both fleets.
    :param kwargs: Forwarded unchanged to ``core.Stack``.
    """
    super().__init__(scope, id, **kwargs)

    WorkerInstance(self, 'Worker', vpc=vpc, render_queue=render_queue)

    # One health monitor is shared by both fleets.
    shared_monitor = rfdk_core.HealthMonitor(self, 'HealthMonitor',
      vpc=vpc,
      deletion_protection=False,
    )

    stack_region = core.Stack.of(self).region

    # (construct id, machine image factory, AMI id, spot price)
    # Fill in your AMI ids here
    fleet_specs = (
      # Deadline 10.1.10.6 AWSPortal Worker for Linux, us-west-2
      ('LinWorkers', ec2.MachineImage.generic_linux, "ami-05d4887175201bde8", 0.15),
      # Deadline 10.1.10.6 AWSPortal Worker for Windows, us-west-2
      ('WinWorkers', ec2.MachineImage.generic_windows, "ami-09c712180218564f2", 0.25),
    )

    for fleet_id, image_factory, ami_id, fleet_spot_price in fleet_specs:
      rfdk_deadline.WorkerInstanceFleet(self, fleet_id,
        vpc=vpc,
        render_queue=render_queue,
        worker_machine_image=image_factory({stack_region: ami_id}),
        min_capacity=5,
        instance_type=ec2.InstanceType("c5.large"),
        spot_price=fleet_spot_price,
        health_monitor=shared_monitor
      )

By submitting this pull request, I confirm that my contribution is made under the terms of the Apache-2.0 license

@jusiskin jusiskin added the contribution/core This is a PR that came from AWS. label Nov 4, 2020
@ddneilson ddneilson requested review from yashda and jusiskin November 5, 2020 00:44
Copy link
Contributor

@jusiskin jusiskin left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fantastic job factoring out some re-usable logic here without changing any public APIs. I just have some suggestions to refine the new APIs we are introducing here - let me know what you think.

* Interface for Deadline clients that can be configured via the ClientConfiguration
* helper class.
*/
export interface IConfigurableWorker extends IScriptHost, IGrantable {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: unless I'm missing something, I believe this is redundant since IScriptHost already extends IGrantable.

Comment on lines 149 to 150
const groups = settings?.groups?.map(val => val.toLowerCase()).join(',') ?? ''; // props.groups ? props.groups.map(val => val.toLowerCase()).join(',') : '';
const pools = settings?.pools?.map(val => val.toLowerCase()).join(',') ?? ''; // props.pools ? props.pools.map(val => val.toLowerCase()).join(',') : '';
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: these appear to be leftover comments from debugging

Comment on lines 76 to 113
export class WorkerConfiguration extends Construct {
constructor(scope: Construct, id: string) {
super(scope, id);
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This construct has no logic in its constructor nor any member variables. All of the functionality is in instance methods that generate construct nodes under this construct sub-tree and the user has to call these in the correct order and provide disambiguating IDs.

I notice that in WorkerInstanceFleet, we have this sequence of calls:

    // Updating the user data with installation logs stream.
    workerConfig.configureCloudWatchLogStream(this.fleet, id, {
      logGroupPrefix: WorkerInstanceFleet.DEFAULT_LOG_GROUP_PREFIX,
      ...props.logGroupProps,
    });

    props.renderQueue.configureClientInstance({
      host: this.fleet,
    });

    // Updating the user data with deadline repository installation commands.
    workerConfig.configureWorkerSettings(this.fleet, id, props);

That configuration order should be preserved so errors configuring the Worker to connect to the render queue are streamed and the render queue connection is configured before trying to configure the worker, but I wonder if it makes sense to configure the Worker to use the render queue as part of this new WorkerConfiguration construct? It feels like an important missing piece of Worker configuration and we want to abstract knowledge of the correct calling order.

I suspect the decision to use methods was chosen because the IRenderQueue interface has two configuration methods that rely on knowing whether the Workers are instances or containers and the intention was to keep WorkerConfiguration agnostic to that. The current implementation is instance-specific anyways since Deadline configuration is done through user data which is not available in containers. You've already provided an example in the PR description of how to implement the IHost interface, so....

Allow me to propose a slight change to bring this all together:

import { IHost } from './host-ref';
import { IRenderQueue } from './render-queue';

interface WorkerInstanceConfigurationProps extends WorkerSettings {
  readonly logGroup?: LogGroupFactoryProps;
  readonly renderQueue?: IRenderQueue;
  readonly workerHost: IHost;
}

export class WorkerInstanceConfiguration extends Construct {
  constructor(scope: Construct, id: string, props: WorkerInstanceConfigurationProps) {
    super(scope, id);

    if (props.logGroup) {
      this.configureCloudWatchLogStream(props.workerHost, props.logGroup);
    }
    if (props.renderQueue) {
      props.renderQueue.configureClientInstance({ host: props.workerHost });
    }
    this.configureWorkerSettings(props.workerHost, props);
  }
}

This makes the usage of this construct 1:1 with a Worker, but it eliminates the need to specify child IDs or to know the best-practice order of the method calls, and it makes the calling code cleaner. It goes from:

const worker: IConfigurableWorker = ...;
const renderQueue: IRenderQueue = ...;
const workerConfiguration = new WorkerConfiguration(this, 'WorkerConfiguration');
workerConfiguration.configureCloudWatchLogStream(worker, 'WorkerCloudWatch', { ... });
renderQueue.configureClientInstance({ host: props.worker });
workerConfiguration.configureWorkerSettings(worker, 'WorkerSettings', { ... });

to:

const workerHost: IHost = ...;
const renderQueue: IRenderQueue = ...;
const workerConfiguration = new WorkerInstanceConfiguration(this, 'WorkerConfiguration', {
  workerHost,
  renderQueue,
  logGroup: ...,
  groups: [...],
  pools: [...],
  region: 'myregion',
});

What do you think?

Comment on lines 391 to 396
{
'Fn::GetAtt': [
'ConfigWorkerLogGroupWrapperDC3AF2E7',
'LogGroupName',
],
},
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a technique to avoid baking these dynamically generated logical IDs into our tests. It involves using stack.resolve(...). In this case, you could use:

const logGroupName = stack.resolve((config.node.findChild('WorkerLogGroup') as ILogGroup).logGroupName);

same feedback applies below.

Comment on lines 55 to 62
# Restart service, if it exists, else restart application
if service --status-all | grep -q 'Deadline 10 Launcher'; then
service deadline10launcher restart
else
DEADLINE_LAUNCHER="$DEADLINE_PATH/deadlinelauncher"
"$DEADLINE_LAUNCHER" -shutdownall
"$DEADLINE_LAUNCHER"
fi
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the WorkerInstanceFleet case, we will now be restarting the launcher and Worker twice at startup. I understand it is necessary to ensure the changes take effect when running this script in isolation, but to avoid this extra restart, we could use OO-inheritance of WorkerConfiguration, like this:

class WorkerConfiguration {
  private readonly applyConfigScript: ScriptAsset;
  private readonly setConfigScript: ScriptAsset;

  constructor(...) {
    super(...);

    // Script to restart the launcher
    this.applyConfigScript = ScriptAsset.fromPathConvention(...);

    this.setConfiguration();
    this.applyConfiguration();
  }

  protected setConfiguration() {
    setConfigScript.executeOn(...);
  }

  protected applyConfiguration() {
    // Calls a separate ScriptAsset to restart the launcher
    applyConfigScript.executeOn(...);
  }
}

and for the worker instance fleet we override the setConfiguration method:

class WorkerInstanceFleetConfiguration extends WorkerConfiguration {
  private readonly configureHealthMonitor: ScriptAsset;

  constructor(...) {
    super(...);
    this.configureHealthMonitor = ScriptAsset.fromPathConvention(...);
  }

  protected setConfiguration() {
    super.setConfiguration();
    this.configureHealthMonitor.executeOn(...);
  }
}

This assumes my other proposed change of applying the configuration in the constructor. It's a bit of rework, but do you think it's worth it?

jusiskin
jusiskin previously approved these changes Nov 5, 2020
Copy link
Contributor

@jusiskin jusiskin left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Approved! Really great job here

Daniel Neilson added 5 commits November 5, 2020 23:05
@ddneilson ddneilson changed the title feat(deadline): Adds WorkerConfiguration construct feat(deadline): Adds WorkerInstanceConfiguration construct Nov 5, 2020
yashda
yashda previously approved these changes Nov 5, 2020
Copy link
Contributor

@yashda yashda left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks good! Thanks Daniel for the changes.

jusiskin
jusiskin previously approved these changes Nov 6, 2020
@jusiskin jusiskin added the feature-request A feature should be added or improved. label Nov 6, 2020
@jusiskin jusiskin changed the title feat(deadline): Adds WorkerInstanceConfiguration construct feat(deadline): add WorkerInstanceConfiguration construct Nov 6, 2020
@jusiskin jusiskin merged commit bbb82b0 into aws:mainline Nov 6, 2020
@ddneilson ddneilson deleted the worker_configure branch March 17, 2021 15:08
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
contribution/core This is a PR that came from AWS. feature-request A feature should be added or improved.
Projects
None yet
Development

Successfully merging this pull request may close these issues.

Construct for configuring Deadline Workers
3 participants