poseidon · dghubble · Apr 18, 2018 · Apr 15, 2018
diff --git a/CHANGES.md b/CHANGES.md
@@ -4,6 +4,12 @@ Notable changes between versions.
 
 ## Latest
 
+#### Google Cloud
+
+* Add support for multi-controller clusters (i.e. multi-master) ([#54](https://github.com/poseidon/typhoon/issues/54), [#190](https://github.com/poseidon/typhoon/pull/190))
+  * Switch from Google Cloud network load balancer to a TCP proxy load balancer. Avoid a [bug](https://issuetracker.google.com/issues/67366622) in Google network load balancers that limited clusters to only bootstrapping one controller node. 
+  * Add TCP health check for apiserver pods on controllers. Replace kubelet check approximation.
+
 #### Addons
 
 * Update kube-state-metrics from v1.3.0 to v1.3.1

diff --git a/docs/google-cloud.md b/docs/google-cloud.md
@@ -252,7 +252,7 @@ resource "google_dns_managed_zone" "zone-for-clusters" {
 
 | Name | Description | Default | Example |
 |:-----|:------------|:--------|:--------|
-| controller_count | Number of controllers (i.e. masters) | 1 | 1 |
+| controller_count | Number of controllers (i.e. masters) | 1 | 3 |
 | worker_count | Number of workers | 1 | 3 |
 | controller_type | Machine type for controllers | "n1-standard-1" | See below |
 | worker_type | Machine type for workers | "n1-standard-1" | See below |
@@ -268,9 +268,6 @@ resource "google_dns_managed_zone" "zone-for-clusters" {
 
 Check the list of valid [machine types](https://cloud.google.com/compute/docs/machine-types).
 
-!!! warning
-    Set controller_count to 1. A bug in Google Cloud network load balancer health checking prevents multiple controllers from bootstrapping. There are workarounds, but they all involve tradeoffs we're uncomfortable recommending. See [#54](https://github.com/poseidon/typhoon/issues/54).
-
 #### Preemption
 
 Add `worker_preemptible = "true"` to allow worker nodes to be [preempted](https://cloud.google.com/compute/docs/instances/preemptible) at random, but pay [significantly](https://cloud.google.com/compute/pricing) less. Clusters tolerate stopping instances fairly well (reschedules pods, but cannot drain) and preemption provides a nice reward for running fault-tolerant cluster systems.

diff --git a/docs/topics/performance.md b/docs/topics/performance.md
@@ -9,7 +9,7 @@ Provisioning times vary based on the platform. Sampling the time to create (appl
 | AWS           | 6 min | 5 min   |
 | Bare-Metal    | 10-14 min | NA  |
 | Digital Ocean | 3 min 30 sec | 20 sec |
-| Google Cloud  | 4 min | 4 min 30 sec |
+| Google Cloud  | 7 min | 4 min 30 sec |
 
 Notes:
 

diff --git a/google-cloud/container-linux/kubernetes/apiserver.tf b/google-cloud/container-linux/kubernetes/apiserver.tf
@@ -1,10 +1,5 @@
-# Static IPv4 address for the Network Load Balancer
-resource "google_compute_address" "controllers-ip" {
-  name = "${var.cluster_name}-controllers-ip"
-}
-
-# DNS record for the Network Load Balancer
-resource "google_dns_record_set" "controllers" {
+# TCP Proxy load balancer DNS record
+resource "google_dns_record_set" "apiserver" {
   # DNS Zone name where record should be created
   managed_zone = "${var.dns_zone_name}"
 
@@ -13,44 +8,88 @@ resource "google_dns_record_set" "controllers" {
   type = "A"
   ttl  = 300
 
-  # IPv4 address of controllers' network load balancer
-  rrdatas = ["${google_compute_address.controllers-ip.address}"]
+  # IPv4 address of apiserver TCP Proxy load balancer
+  rrdatas = ["${google_compute_global_address.apiserver-ipv4.address}"]
 }
 
-# Network Load Balancer for controllers
-resource "google_compute_forwarding_rule" "controller-https-rule" {
-  name       = "${var.cluster_name}-controller-https-rule"
-  ip_address = "${google_compute_address.controllers-ip.address}"
-  port_range = "443"
-  target     = "${google_compute_target_pool.controllers.self_link}"
+# Reserve a global static IPv4 address for the TCP Proxy Load Balancer. The
+# apiserver DNS record and the global forwarding rule both point at it.
+resource "google_compute_global_address" "apiserver-ipv4" {
+  name       = "${var.cluster_name}-apiserver-ip"
+  ip_version = "IPV4"
 }
 
-# Target pool of instances for the controller(s) Network Load Balancer
-resource "google_compute_target_pool" "controllers" {
-  name = "${var.cluster_name}-controller-pool"
+# Global forwarding rule: TCP traffic arriving on port 443 of the reserved
+# IPv4 address is handed to the apiserver TCP proxy load balancer
+resource "google_compute_global_forwarding_rule" "apiserver" {
+  name        = "${var.cluster_name}-apiserver"
+  ip_address  = "${google_compute_global_address.apiserver-ipv4.address}"
+  ip_protocol = "TCP"
+  port_range  = "443"
+  target      = "${google_compute_target_tcp_proxy.apiserver.self_link}"
+}
 
-  instances = [
-    "${google_compute_instance.controllers.*.self_link}",
-  ]
+# Global TCP Proxy Load Balancer for apiservers. Connections it accepts are
+# passed on to the apiserver backend service.
+resource "google_compute_target_tcp_proxy" "apiserver" {
+  name            = "${var.cluster_name}-apiserver"
+  description     = "Distribute TCP load across ${var.cluster_name} controllers"
+  backend_service = "${google_compute_backend_service.apiserver.self_link}"
+}
 
-  health_checks = [
-    "${google_compute_http_health_check.kubelet.name}",
-  ]
+# Global backend service for the apiserver, backed by the zonal (unmanaged)
+# controller instance groups
+resource "google_compute_backend_service" "apiserver" {
+  name        = "${var.cluster_name}-apiserver"
+  description = "${var.cluster_name} apiserver service"
 
+  protocol         = "TCP"
+  port_name        = "apiserver"
   session_affinity = "NONE"
+  timeout_sec      = "60"
+
+  # One backend per zonal controller instance group. The three fixed entries
+  # correspond to the three zones selected in local.zones.
+  backend {
+    group = "${google_compute_instance_group.controllers.0.self_link}"
+  }
+  backend {
+    group = "${google_compute_instance_group.controllers.1.self_link}"
+  }
+  backend {
+    group = "${google_compute_instance_group.controllers.2.self_link}"
+  }
+
+  # Backends are health checked with the apiserver TCP health check
+  health_checks = ["${google_compute_health_check.apiserver.self_link}"]
+}
+
+# Instance group of heterogeneous (unmanaged) controller instances.
+# One group is created per zone in local.zones.
+resource "google_compute_instance_group" "controllers" {
+  count = "${length(local.zones)}"
+
+  name = "${format("%s-controllers-%s", var.cluster_name, element(local.zones, count.index))}"
+  zone = "${element(local.zones, count.index)}"
+
+  # Named port that the apiserver backend service refers to via port_name
+  named_port {
+    name = "apiserver"
+    port = "443"
+  }
+
+  # Add the instances in this zone to the instance group: matchkeys keeps the
+  # controller self_links whose zone equals this group's zone.
+  instances = [
+    "${matchkeys(google_compute_instance.controllers.*.self_link,
+      google_compute_instance.controllers.*.zone,
+      list(element(local.zones, count.index)))}"
+  ]
 }
 
-# Kubelet HTTP Health Check
-resource "google_compute_http_health_check" "kubelet" {
-  name        = "${var.cluster_name}-kubelet-health"
-  description = "Health check Kubelet health host port"
+# TCP health check for apiserver, used by the apiserver backend service.
+# Replaces the previous Kubelet HTTP /healthz check on port 10255.
+resource "google_compute_health_check" "apiserver" {
+  name = "${var.cluster_name}-apiserver-tcp-health"
+  description = "TCP health check for kube-apiserver"
 
-  timeout_sec        = 5
+  timeout_sec = 5
   check_interval_sec = 5
 
-  healthy_threshold   = 2
-  unhealthy_threshold = 4
+  # Healthy after 1 successful probe, unhealthy after 3 failed probes
+  healthy_threshold = 1
+  unhealthy_threshold = 3
 
-  port         = 10255
-  request_path = "/healthz"
+  # Probe the apiserver port (443) with a TCP check
+  tcp_health_check {
+    port  = "443"
+  }
 }
diff --git a/google-cloud/container-linux/kubernetes/controllers.tf b/google-cloud/container-linux/kubernetes/controllers.tf
@@ -19,12 +19,19 @@ data "google_compute_zones" "all" {
   region = "${var.region}"
 }
 
+locals {
+  # TCP proxy load balancers require a fixed number of zonal backends. Spread
+  # controllers over up to 3 zones, since all GCP regions have at least 3.
+  # The slice takes the first 3 zone names reported for the region.
+  zones = "${slice(data.google_compute_zones.all.names, 0, 3)}"
+  # NAT IPv4 address assigned to the first access config of each controller's
+  # first network interface (one entry per controller instance)
+  controllers_ipv4_public = ["${google_compute_instance.controllers.*.network_interface.0.access_config.0.assigned_nat_ip}"]
+}
+
 # Controller instances
 resource "google_compute_instance" "controllers" {
   count = "${var.controller_count}"
 
   name         = "${var.cluster_name}-controller-${count.index}"
-  zone         = "${element(data.google_compute_zones.all.names, count.index)}"
+  zone         = "${element(local.zones, count.index)}"
   machine_type = "${var.controller_type}"
 
   metadata {
@@ -51,10 +58,6 @@ resource "google_compute_instance" "controllers" {
   tags           = ["${var.cluster_name}-controller"]
 }
 
-locals {
-  controllers_ipv4_public = ["${google_compute_instance.controllers.*.network_interface.0.access_config.0.assigned_nat_ip}"]
-}
-
 # Controller Container Linux Config
 data "template_file" "controller_config" {
   count = "${var.controller_count}"

diff --git a/google-cloud/container-linux/kubernetes/ssh.tf b/google-cloud/container-linux/kubernetes/ssh.tf
@@ -66,7 +66,7 @@ resource "null_resource" "bootkube-start" {
   depends_on = [
     "module.bootkube",
     "module.workers",
-    "google_dns_record_set.controllers",
+    "google_dns_record_set.apiserver",
     "null_resource.copy-controller-secrets",
   ]