Skip to content

Commit

Permalink
fix(eks): cannot update cluster configuration
Browse files Browse the repository at this point in the history
Our custom resource naively tried to call the UpdateCluster API for updates, but this is in fact not in line with how AWS::EKS::Cluster is implemented. This change modifies the custom resource handler to handle updates based on the same specification as the official CloudFormation resource:

- Changes to the cluster name, VPC or role will cause a replacement (creation of a cluster with a new name and removal of the old cluster).
- Changes to the version will use the UpdateClusterVersion API to update the version in-place.

This fixes #4311.

This commit also fixes #4310 which caused cluster deletions when updates failed. The root cause was that when errors were reported to CFN we always used the log stream name as the physical resource ID, and CFN thought we wanted to replace the resource. Ouch.

This change was tested manually, since we still don't have a good unit test harness for this resource: we exercised all types of updates and observed that the appropriate behaviour was taken (replacement or in-place update).
  • Loading branch information
Elad Ben-Israel committed Oct 26, 2019
1 parent 9b7d2d0 commit da853ce
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 8 deletions.
2 changes: 1 addition & 1 deletion packages/@aws-cdk/aws-eks/lib/cluster-resource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ export class ClusterResource extends Construct {

// since we don't know the cluster name at this point, we must give this role star resource permissions
handler.addToRolePolicy(new PolicyStatement({
actions: [ 'eks:CreateCluster', 'eks:DescribeCluster', 'eks:DeleteCluster' ],
actions: [ 'eks:CreateCluster', 'eks:DescribeCluster', 'eks:DeleteCluster', 'eks:UpdateClusterVersion' ],
resources: [ '*' ]
}));

Expand Down
70 changes: 63 additions & 7 deletions packages/@aws-cdk/aws-eks/lib/cluster-resource/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ def cfn_error(message=None):
old_props = event.get('OldResourceProperties', {})
physical_id = event.get('PhysicalResourceId', None)
config = props['Config']
old_config = old_props.get('Config', {})

def new_cluster_name():
    """Return a generated cluster name of the form ``cluster-<request_id>``.

    ``request_id`` is a free variable that must be defined outside this
    function.
    """
    generated = "cluster-%s" % request_id
    return generated

logger.info(json.dumps(config))

Expand All @@ -49,11 +53,36 @@ def cfn_error(message=None):
cluster_name=config.get('name', None)
if cluster_name is None:
if physical_id: cluster_name = physical_id
elif request_type == 'Create': cluster_name = "cluster-%s" % request_id
elif request_type == 'Create': cluster_name = new_cluster_name()
else: raise Exception("unexpected error. cannot determine cluster name")
config['name'] = cluster_name
logger.info("request: %s" % config)

# extract additional options
resourcesVpcConfig = config.get('resourcesVpcConfig', None)
roleArn = config.get('roleArn', None)
version = config.get('version', None)

def should_replace_cluster():
    """Decide whether the requested change needs a new cluster.

    Compares the previous physical id against the requested cluster name,
    then the previous ``resourcesVpcConfig`` and ``roleArn`` values (read
    from ``old_config``) against the requested ones. The first difference
    found is logged and short-circuits the remaining checks.

    All inputs (``logger``, ``old_config``, ``physical_id``,
    ``cluster_name``, ``resourcesVpcConfig``, ``roleArn``) are free
    variables defined outside this function.

    Returns:
        True if any of the three values differ, False otherwise.
    """
    logger.info("old config: %s" % json.dumps(old_config))

    # The previous name is the physical resource id, not a key in old_config.
    if physical_id != cluster_name:
        logger.info("'name' change requires replacement (old=%s, new=%s)" % (physical_id, cluster_name))
        return True

    # Remaining properties are looked up in the old config by key, in order.
    replacement_checks = (
        ('resourcesVpcConfig', resourcesVpcConfig,
         "'resourcesVpcConfig' change requires replacement (old=%s, new=%s)"),
        ('roleArn', roleArn,
         "'roleArn' change requires replacement (old=%s, new=%s)"),
    )
    for key, requested, message in replacement_checks:
        previous = old_config.get(key, None)
        if previous != requested:
            logger.info(message % (previous, requested))
            return True

    return False

# delete is a special case
if request_type == 'Delete':
logger.info('deleting cluster')
Expand All @@ -70,8 +99,32 @@ def cfn_error(message=None):
logger.info("create response: %s" % resp)
elif request_type == 'Update':
logger.info("updating cluster %s" % cluster_name)
resp = eks.update_cluster_config(**config)
logger.info("update response: %s" % resp)

current_state = eks.describe_cluster(name=cluster_name)['cluster']

# changes to "name", "resourcesVpcConfig" and "roleArn" all require replacement
# according to the cloudformation spec, so if any of these changes, we need to create
# a new cluster with the new configuration (in this case, if "version" has also been
# changed, the new version will be used by the new cluster).
if should_replace_cluster():
cluster_name = new_cluster_name()
config['name'] = cluster_name
logger.info("replacing cluster %s with a new cluster %s" % (physical_id, cluster_name))
resp = eks.create_cluster(**config)
logger.info("create (replacement) response: %s" % resp)
else:
# version change - we can do that without replacement
old_version = old_config.get('version', None)
if (old_version is None) and (version is None):
logger.info("no version change")
else:
old_version_actual = current_state['version']
if version != old_version_actual:
if version is None:
raise Exception("Version cannot be changed from a specific value (%s) to undefined" % old_version)

resp = eks.update_cluster_version(name=cluster_name,version=version)
logger.info("update response: %s" % resp)
else:
raise Exception("Invalid request type %s" % request_type)

Expand All @@ -94,9 +147,8 @@ def cfn_error(message=None):
logger.info("attributes: %s" % attrs)
cfn_send(event, context, CFN_SUCCESS, responseData=attrs, physicalResourceId=cluster_name)

except KeyError as e:
cfn_error("invalid request. Missing '%s'" % str(e))
except Exception as e:
except:
e = sys.exc_info()[1]
logger.exception(e)
cfn_error(str(e))

Expand All @@ -111,10 +163,14 @@ def cfn_send(event, context, responseStatus, responseData={}, physicalResourceId
responseUrl = event['ResponseURL']
logger.info(responseUrl)

# use previous PhysicalResourceId if physical resource ID is not specified, otherwise update failures
# will result in resource replacement
physicalResourceId = physicalResourceId or event.get('PhysicalResourceId', context.log_stream_name)

responseBody = {}
responseBody['Status'] = responseStatus
responseBody['Reason'] = reason or ('See the details in CloudWatch Log Stream: ' + context.log_stream_name)
responseBody['PhysicalResourceId'] = physicalResourceId or context.log_stream_name
responseBody['PhysicalResourceId'] = physicalResourceId
responseBody['StackId'] = event['StackId']
responseBody['RequestId'] = event['RequestId']
responseBody['LogicalResourceId'] = event['LogicalResourceId']
Expand Down

0 comments on commit da853ce

Please sign in to comment.