
Error trying to create a private only EKS cluster #272

Open · 1 task done
miaeyg opened this issue Feb 9, 2024 · 10 comments
Labels: bug (Something isn't working)


miaeyg commented Feb 9, 2024

Terraform Version Details

I am using the latest version of this project and trying to set up a private-only EKS cluster, following BYON Scenario 2.

I manually created the VPC and 2 control-plane subnets (in two separate AZs) + 1 private subnet + 1 public subnet.
The public subnet has its own route table with an additional route "0.0.0.0/0" to the IGW attached to the VPC. The public subnet is used only for hosting the public-facing "SAS deployment machine". I can SSH to this "SAS deployment machine" from the Internet and run the viya4-iac-aws project on it.

The private subnet and the 2 control-plane subnets were assigned the following tags and do not have a route to the IGW:

[screenshot: tags assigned to the private and control-plane subnets]
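For reference, expressed in Terraform the private subnet setup is roughly equivalent to the sketch below. Everything was created by hand in the console, so this is only an illustration; the CIDR, AZ, and exact tag values are assumptions based on my "mia" prefix.

# Sketch only - the subnet was created manually, not with Terraform.
resource "aws_subnet" "private" {
  vpc_id            = "vpc-0a1e0b7ec66f8b337"
  cidr_block        = "192.168.1.0/24"      # assumed CIDR
  availability_zone = "il-central-1a"       # assumed AZ

  tags = {
    "kubernetes.io/role/internal-elb" = "1"       # internal load balancers (assumed tag key)
    "kubernetes.io/cluster/mia-eks"   = "shared"  # <prefix>-eks cluster tag (assumed)
  }
}

# The private and control-plane subnets use a route table with only the local
# VPC route - no 0.0.0.0/0 route to the IGW; that route exists only in the
# public subnet's route table.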

Terraform Variable File Details


# !NOTE! - These are only a subset of the variables in CONFIG-VARS.md provided
# as examples. Customize this file to add any variables from CONFIG-VARS.md whose
# default values you want to change.

# ****************  REQUIRED VARIABLES  ****************
# These required variables' values MUST be provided by the User
prefix   = "mia"
location = "il-central-1" # e.g., "us-east-1"
# ****************  REQUIRED VARIABLES  ****************

# !NOTE! - Without specifying your CIDR block access rules, ingress traffic
#          to your cluster will be blocked by default.

# **************  RECOMMENDED  VARIABLES  ***************
default_private_access_cidrs = ["192.168.129.53/32"]           # public bastion (in my case)
ssh_public_key              = "~/.ssh/id_rsa.pub"
# **************  RECOMMENDED  VARIABLES  ***************

# Tags for all taggable items in your cluster.
tags = {} # e.g., { "key1" = "value1", "key2" = "value2" }

# Postgres config - By having this entry a database server is created. If you do not
#                   need an external database server remove the 'postgres_servers'
#                   block below.
# postgres_servers = {
#   default = {},
# }

#-------------------CHANGED HERE-----------------

cluster_api_mode = "private"
create_jump_public_ip = false
create_nfs_public_ip = false

vpc_id = "vpc-0a1e0b7ec66f8b337"
subnet_ids = {
  "private" : ["subnet-0332da0d666ebc6ea"],
  "control_plane" : ["subnet-0595cabfc6806d7bd", "subnet-08ecb61618a28d758"]
}
#-------------------CHANGED HERE-----------------

## Cluster config
kubernetes_version           = "1.27"
default_nodepool_node_count  = 1
default_nodepool_vm_type     = "m5.large"
default_nodepool_custom_data = ""

## General
efs_performance_mode = "maxIO"
storage_type         = "standard"

## Cluster Node Pools config - minimal
cluster_node_pool_mode = "minimal"
node_pools = {
  cas = {
    "vm_type"      = "r5.xlarge"
    "cpu_type"     = "AL2_x86_64"
    "os_disk_type" = "gp2"
    "os_disk_size" = 200
    "os_disk_iops" = 0
    "min_nodes"    = 1
    "max_nodes"    = 1
    "node_taints"  = ["workload.sas.com/class=cas:NoSchedule"]
    "node_labels" = {
      "workload.sas.com/class" = "cas"
    }
    "custom_data"                          = ""
    "metadata_http_endpoint"               = "enabled"
    "metadata_http_tokens"                 = "required"
    "metadata_http_put_response_hop_limit" = 1
  },
  generic = {
    "vm_type"      = "m5.2xlarge"
    "cpu_type"     = "AL2_x86_64"
    "os_disk_type" = "gp2"
    "os_disk_size" = 200
    "os_disk_iops" = 0
    "min_nodes"    = 1
    "max_nodes"    = 1
    "node_taints"  = []
    "node_labels" = {
      "workload.sas.com/class"        = "compute"
      "launcher.sas.com/prepullImage" = "sas-programming-environment"
    }
    "custom_data"                          = ""
    "metadata_http_endpoint"               = "enabled"
    "metadata_http_tokens"                 = "required"
    "metadata_http_put_response_hop_limit" = 1
  }
}

# Jump Server
create_jump_vm = true
jump_vm_admin  = "jumpuser"
jump_vm_type   = "t3.medium"

# NFS Server
# required ONLY when storage_type is "standard" to create NFS Server VM
nfs_vm_admin         = "nfsuser"
nfs_vm_type          = "m5.xlarge"


Steps to Reproduce

Use the supplied "terraform.tfvars" and adjust the following input variables to your env:

location
default_private_access_cidrs
vpc_id
subnet_ids

Expected Behavior

The EKS cluster is set up as private-only without any problems.

Actual Behavior

Failure happens after 25 minutes: NodeCreationFailure: Instances failed to join the kubernetes cluster
See complete error below

I see that the EKS cluster is Active and I also see the EC2 instances running:

[screenshot: EKS cluster is Active and the EC2 instances are running]
module.eks.module.eks_managed_node_group["default"].aws_eks_node_group.this[0]: Still creating... [24m20s elapsed]
module.eks.module.eks_managed_node_group["default"].aws_eks_node_group.this[0]: Still creating... [24m30s elapsed]
module.eks.module.eks_managed_node_group["generic"].aws_eks_node_group.this[0]: Still creating... [24m30s elapsed]
╷
│ Error: waiting for EKS Node Group (mia-eks:default-20240209131523066800000029) create: unexpected state 'CREATE_FAILED', wanted target 'ACTIVE'. last error: i-0738840a2017eca97: NodeCreationFailure: Instances failed to join the kubernetes cluster
│
│   with module.eks.module.eks_managed_node_group["default"].aws_eks_node_group.this[0],
│   on .terraform/modules/eks/modules/eks-managed-node-group/main.tf line 308, in resource "aws_eks_node_group" "this":
│  308: resource "aws_eks_node_group" "this" {
│
╵
╷
│ Error: waiting for EKS Node Group (mia-eks:cas-20240209131523059200000027) create: unexpected state 'CREATE_FAILED', wanted target 'ACTIVE'. last error: i-0a8ae5eaa2bf41f8b: NodeCreationFailure: Instances failed to join the kubernetes cluster
│
│   with module.eks.module.eks_managed_node_group["cas"].aws_eks_node_group.this[0],
│   on .terraform/modules/eks/modules/eks-managed-node-group/main.tf line 308, in resource "aws_eks_node_group" "this":
│  308: resource "aws_eks_node_group" "this" {
│
╵
╷
│ Error: waiting for EKS Node Group (mia-eks:generic-2024020913152307710000002b) create: unexpected state 'CREATE_FAILED', wanted target 'ACTIVE'. last error: i-076af0c24e1a64460: NodeCreationFailure: Instances failed to join the kubernetes cluster
│
│   with module.eks.module.eks_managed_node_group["generic"].aws_eks_node_group.this[0],
│   on .terraform/modules/eks/modules/eks-managed-node-group/main.tf line 308, in resource "aws_eks_node_group" "this":
│  308: resource "aws_eks_node_group" "this" {
│

Additional Context

No response

References

No response

Code of Conduct

  • I agree to follow this project's Code of Conduct
miaeyg added the bug (Something isn't working) and new (Added to an issue when it's new ;)) labels Feb 9, 2024
dhoucgitter (Member) commented:

Hi @miaeyg, thanks for filling out the new issue form; the additional information requested in it is often quite helpful. I looked over your description this morning, but nothing stood out. Josh reached out to me regarding the issue you are having. I suspect it may be a private-endpoint security group rule issue, given the "instances failed to join the k8s cluster" behavior that you mentioned. I can work with Josh next week to get this issue resolved for you.

dhoucgitter self-assigned this Feb 9, 2024
miaeyg (Author) commented Feb 16, 2024

Hi @dhoucgitter. Any idea what is wrong?

dhoucgitter (Member) commented:

Hi @miaeyg, I touched base with Josh and heard you are meeting soon to discuss.

miaeyg (Author) commented Feb 22, 2024

Hi @dhoucgitter, I'm not sure how the meeting is related to the reported problem. I am interested to understand how this IaC project supports creating a private-only EKS cluster. The way I did it (since I have neither AWS Direct Connect nor AWS Site-to-Site VPN) is by creating a small public subnet with an EC2 instance that has public access and, since it is in the VPC, also has private access to the VPC. I ran Terraform from this public EC2 machine with the terraform.tfvars file I included at the beginning. Should this have worked, or is my public subnet perhaps the root cause of the failure? How did you test the creation of the private-only EKS setup?

miaeyg (Author) commented Feb 27, 2024

Hi @dhoucgitter, after the meeting it was understood that we need to set up a private-only VPC but can use a public K8S API server. So I tested again; this time I used my on-prem machine as the deployment machine, and the VPC does not contain any public subnets, no IGW, and no NAT Gateway. It contains only 1 private subnet + 2 private control-plane subnets. I tagged the private subnet as per the doc and also manually created a security group that allows all inbound traffic from "0.0.0.0/0" (see the sketch below) and assigned it to the three security group input variables, so I am now on BYON scenario #3.

Nevertheless, the Terraform script still fails with the same error: the node groups do not join the EKS cluster. I really need help with this...
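For reference, the security group I created manually is roughly equivalent to this Terraform sketch (I created it in the console, so the name and the egress rule are illustrative assumptions):

# Sketch of the manually created, wide-open security group (testing only).
resource "aws_security_group" "byon_open" {
  name   = "csc-byon-open-sg"            # assumed name
  vpc_id = "vpc-004264c1f5a45e0bf"

  # All inbound traffic allowed from anywhere
  ingress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  # Allow-all egress, as the console creates by default
  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }
}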

Here is the terraform.tfvars file:

prefix   = "csc"
location = "il-central-1" 

default_public_access_cidrs  = ["<my-on-prem-deployment-machines-public-ip-address>/32"]
default_private_access_cidrs = ["192.168.0.0/16"]   # this is the CIDR range of the VPC
ssh_public_key               = "~/.ssh/id_rsa.pub"
tags = {} 

cluster_api_mode = "public"
create_jump_public_ip = false
create_nfs_public_ip = false

security_group_id = "sg-0500dad8fdd42e76d" # security group I created with inbound all traffic from 0.0.0.0/0
cluster_security_group_id = "sg-0500dad8fdd42e76d" # security group I created with inbound all traffic from 0.0.0.0/0
workers_security_group_id = "sg-0500dad8fdd42e76d" # security group I created with inbound all traffic from 0.0.0.0/0

vpc_id = "vpc-004264c1f5a45e0bf"
subnet_ids = {
  "private" : ["subnet-05d24ae24f3cb793e"],
  "control_plane" : ["subnet-0d4737473f00ada0f", "subnet-045008f1d53806643"]  # placed in two AZs where one the AZs is the same AZ as the AZ of the private subnet
}

kubernetes_version           = "1.27"
default_nodepool_node_count  = 1
default_nodepool_vm_type     = "m5.large"
default_nodepool_custom_data = ""

efs_performance_mode = "maxIO"
storage_type         = "standard"

cluster_node_pool_mode = "minimal"
node_pools = {
  cas = {
    "vm_type"      = "r5.xlarge"
    "cpu_type"     = "AL2_x86_64"
    "os_disk_type" = "gp2"
    "os_disk_size" = 200
    "os_disk_iops" = 0
    "min_nodes"    = 1
    "max_nodes"    = 1
    "node_taints"  = ["workload.sas.com/class=cas:NoSchedule"]
    "node_labels" = {
      "workload.sas.com/class" = "cas"
    }
    "custom_data"                          = ""
    "metadata_http_endpoint"               = "enabled"
    "metadata_http_tokens"                 = "required"
    "metadata_http_put_response_hop_limit" = 1
  },
  generic = {
    "vm_type"      = "m5.2xlarge"
    "cpu_type"     = "AL2_x86_64"
    "os_disk_type" = "gp2"
    "os_disk_size" = 200
    "os_disk_iops" = 0
    "min_nodes"    = 3
    "max_nodes"    = 5
    "node_taints"  = []
    "node_labels" = {
      "workload.sas.com/class"        = "compute"
      "launcher.sas.com/prepullImage" = "sas-programming-environment"
    }
    "custom_data"                          = ""
    "metadata_http_endpoint"               = "enabled"
    "metadata_http_tokens"                 = "required"
    "metadata_http_put_response_hop_limit" = 1
  }
}

create_jump_vm = true
jump_vm_admin  = "jumpuser"
jump_vm_type   = "t3.medium"

nfs_vm_admin         = "nfsuser"
nfs_vm_type          = "m5.xlarge"

Here is the error I get:

module.eks.module.eks_managed_node_group["cas"].aws_eks_node_group.this[0]: Still creating... [24m20s elapsed]
module.eks.module.eks_managed_node_group["default"].aws_eks_node_group.this[0]: Still creating... [24m30s elapsed]
module.eks.module.eks_managed_node_group["cas"].aws_eks_node_group.this[0]: Still creating... [24m30s elapsed]
module.eks.module.eks_managed_node_group["default"].aws_eks_node_group.this[0]: Still creating... [24m40s elapsed]

│ Error: waiting for EKS Node Group (csc-eks:default-20240227122039285000000027) create: unexpected state 'CREATE_FAILED', wanted target 'ACTIVE'. last error: i-0081040db31553947: NodeCreationFailure: Instances failed to join the kubernetes cluster

│ with module.eks.module.eks_managed_node_group["default"].aws_eks_node_group.this[0],
│ on .terraform/modules/eks/modules/eks-managed-node-group/main.tf line 308, in resource "aws_eks_node_group" "this":
│ 308: resource "aws_eks_node_group" "this" {



│ Error: waiting for EKS Node Group (csc-eks:cas-2024022712203930370000002b) create: unexpected state 'CREATE_FAILED', wanted target 'ACTIVE'. last error: i-034bd245791fb8d56: NodeCreationFailure: Instances failed to join the kubernetes cluster

│ with module.eks.module.eks_managed_node_group["cas"].aws_eks_node_group.this[0],
│ on .terraform/modules/eks/modules/eks-managed-node-group/main.tf line 308, in resource "aws_eks_node_group" "this":
│ 308: resource "aws_eks_node_group" "this" {



│ Error: waiting for EKS Node Group (csc-eks:generic-20240227122039290600000029) create: unexpected state 'CREATE_FAILED', wanted target 'ACTIVE'. last error: i-0bb67f1ff82ebe1c6, i-0d1df406bdc070a31, i-0d78c49904c7961a1: NodeCreationFailure: Instances failed to join the kubernetes cluster

│ with module.eks.module.eks_managed_node_group["generic"].aws_eks_node_group.this[0],
│ on .terraform/modules/eks/modules/eks-managed-node-group/main.tf line 308, in resource "aws_eks_node_group" "this":

joshcoburn commented Mar 5, 2024

Root Cause:
The root cause was identified as a recent change that switched the S3 VPC private endpoint to a Gateway type without associating it with the private route table. Nodes will fail to join the cluster if no route to S3 is available. The issue only affects air-gapped EKS deployments.

Fix:
After discussion with @dhoucgitter, a decision was made to switch the S3 VPC endpoint back to the Interface type, with the goal of pushing out the fix in the next release. In the meantime, the issue can be overcome by associating the VPC private route table with the S3 Gateway endpoint and re-running terraform apply (note: you may need to manually delete any stale EC2 instances that failed to join the cluster), OR you can set up a BYON VPC with the required VPC endpoints already in place and specify vpc_private_endpoints_enabled = false in your tfvars, as shown below.
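For the BYON workaround, the relevant tfvars setting is:

# BYON with pre-created VPC endpoints: tell IAC not to create its own endpoints
vpc_private_endpoints_enabled = false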

Additional Details:
Gateway-type endpoints require a route table association, and no route table was being associated with the S3 Gateway. EKS nodes require communication with S3 during provisioning, as noted in the eksctl documentation. The nodes will fail to join the cluster when they cannot reach S3.

After further investigation/testing, we ultimately discovered that, functionally, an S3 VPC endpoint of either Gateway type OR Interface type will work in an EKS air-gap scenario. The difference is in implementation and cost: a Gateway endpoint requires a route table association whereas an Interface endpoint does not, and Gateway endpoints are free whereas Interface endpoints have an associated cost.
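Expressed as Terraform, the two variants look roughly like this. This is a sketch for illustration, not the exact resources in the module; the region and the subnet/security group references are assumptions.

# Option A: S3 Gateway endpoint - free, but must be associated with the
# private route table so the nodes get a route to S3.
resource "aws_vpc_endpoint" "s3_gateway" {
  vpc_id            = var.vpc_id
  service_name      = "com.amazonaws.il-central-1.s3"  # region assumed
  vpc_endpoint_type = "Gateway"
  route_table_ids   = [aws_route_table.private.id]     # the association that was missing
}

# Option B: S3 Interface endpoint - no route table association required, but it
# creates ENIs in the private subnet and carries an hourly/data processing cost.
resource "aws_vpc_endpoint" "s3_interface" {
  vpc_id             = var.vpc_id
  service_name       = "com.amazonaws.il-central-1.s3"
  vpc_endpoint_type  = "Interface"
  subnet_ids         = [var.private_subnet_id]          # assumed variable
  security_group_ids = [var.endpoint_security_group_id] # assumed variable
}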

dhoucgitter (Member) commented:

Hi @miaeyg, I have been testing the change Josh describes above re: changing the S3 VPC endpoint type from "gateway" to "interface", with success in a private cluster configuration. I anticipate that it will be part of the upcoming April IAC AWS release, along with help for dark-site deployments.

dhoucgitter (Member) commented:

I didn't intend to close this before. I will add a link to the upcoming dark-site PR that is expected to close this issue once I've created it.

miaeyg (Author) commented Mar 22, 2024

Hi @dhoucgitter , good to hear that you will be making this fix public!

Here is what I did in the meantime to work around the issue, after reading Josh's explanation of the root cause:
To create a private-only EKS cluster I have to use BYON scenario #2 at a minimum, so I also manually created the S3 Gateway endpoint and associated it with the private subnet's route table so that a route to it is created - all this as part of the BYON scenario #2 manual setup. Then I modified the modules/aws_vpc/variables.tf file and removed line 129, which creates the S3 Gateway endpoint. This worked fine for me.
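In Terraform terms, the manual association I made is equivalent to roughly this (a sketch only; the resource names are illustrative, since I did this by hand):

# Associate the manually created S3 Gateway endpoint with the private route
# table so the nodes get a route to S3 during provisioning.
resource "aws_vpc_endpoint_route_table_association" "s3_private" {
  vpc_endpoint_id = aws_vpc_endpoint.s3_gateway.id
  route_table_id  = aws_route_table.private.id
}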

The changes are located here: main...miaeyg:viya4-iac-aws:fix-vpc-private

dhoucgitter removed the new (Added to an issue when it's new ;)) label Apr 16, 2024
miaeyg (Author) commented Dec 16, 2024

Hi @dhoucgitter

I tested the latest version 8.6.0 with Terraform 1.9.6 in my test environment, simulating a dark site.
I removed the S3 Gateway endpoint that I had manually added (and that worked!) and tried the new 8.6.0 version, which creates an S3 Interface endpoint instead of an S3 Gateway endpoint. Terraform failed after 25 minutes while trying to start the EC2 node pools, with this error:

╷
│ Error: creating EKS Access Entry (mia-eks:arn:aws:sts::855334947981:assumed-role/AWSReservedSSO_AdministratorAccess_88b75af5706b77a1/eyal): operation error EKS: CreateAccessEntry, https response error StatusCode: 400, RequestID: 35fff4f4-3c56-423c-9a06-ccd0e38f6ad3, InvalidParameterException: The principalArn parameter format is not valid
│
│   with module.eks.aws_eks_access_entry.this["cluster_creator"],
│   on .terraform/modules/eks/main.tf line 267, in resource "aws_eks_access_entry" "this":
│  267: resource "aws_eks_access_entry" "this" {
│
╵
╷
│ Error: waiting for EKS Node Group (mia-eks:default-2024121608233555950000002d) create: unexpected state 'CREATE_FAILED', wanted target 'ACTIVE'. last error: i-0bf392d36c42446fa: NodeCreationFailure: Unhealthy nodes in the kubernetes cluster
│
│   with module.eks.module.eks_managed_node_group["default"].aws_eks_node_group.this[0],
│   on .terraform/modules/eks/modules/eks-managed-node-group/main.tf line 392, in resource "aws_eks_node_group" "this":
│  392: resource "aws_eks_node_group" "this" {
│
╵
╷
│ Error: waiting for EKS Node Group (mia-eks:compute-20241216082335568500000031) create: unexpected state 'CREATE_FAILED', wanted target 'ACTIVE'. last error: i-0f21485f05f9ce0ae: NodeCreationFailure: Instances failed to join the kubernetes cluster
│
│   with module.eks.module.eks_managed_node_group["compute"].aws_eks_node_group.this[0],
│   on .terraform/modules/eks/modules/eks-managed-node-group/main.tf line 392, in resource "aws_eks_node_group" "this":
│  392: resource "aws_eks_node_group" "this" {
│
╵
╷
│ Error: waiting for EKS Node Group (mia-eks:stateful-20241216082335575500000035) create: unexpected state 'CREATE_FAILED', wanted target 'ACTIVE'. last error: i-0f5d0ec583ec36b48: NodeCreationFailure: Instances failed to join the kubernetes cluster
│
│   with module.eks.module.eks_managed_node_group["stateful"].aws_eks_node_group.this[0],
│   on .terraform/modules/eks/modules/eks-managed-node-group/main.tf line 392, in resource "aws_eks_node_group" "this":
│  392: resource "aws_eks_node_group" "this" {
│
╵
╷
│ Error: waiting for EKS Node Group (mia-eks:cas-2024121608233556780000002f) create: unexpected state 'CREATE_FAILED', wanted target 'ACTIVE'. last error: i-08975f51b2b4ba3ad: NodeCreationFailure: Instances failed to join the kubernetes cluster
│
│   with module.eks.module.eks_managed_node_group["cas"].aws_eks_node_group.this[0],
│   on .terraform/modules/eks/modules/eks-managed-node-group/main.tf line 392, in resource "aws_eks_node_group" "this":
│  392: resource "aws_eks_node_group" "this" {
│
╵
╷
│ Error: waiting for EKS Node Group (mia-eks:stateless-20241216082335574300000033) create: unexpected state 'CREATE_FAILED', wanted target 'ACTIVE'. last error: i-083ff4551575e6479: NodeCreationFailure: Instances failed to join the kubernetes cluster
│
│   with module.eks.module.eks_managed_node_group["stateless"].aws_eks_node_group.this[0],
│   on .terraform/modules/eks/modules/eks-managed-node-group/main.tf line 392, in resource "aws_eks_node_group" "this":
│  392: resource "aws_eks_node_group" "this" {
│
