openshift · sdodson · Oct 10, 2016 · Oct 7, 2016
diff --git a/roles/openshift_master/tasks/main.yml b/roles/openshift_master/tasks/main.yml
@@ -168,10 +168,21 @@
 - include: set_loopback_context.yml
   when: openshift.common.version_gte_3_2_or_1_2
 
+# TODO: Master startup can fail when ec2 transparently reallocates the block
+# storage, causing etcd writes to temporarily fail. Retry failures blindly just
+# once to allow time for this transient condition to resolve and for systemd
+# to restart the master (which will eventually succeed).
+#
+# https://github.com/coreos/etcd/issues/3864
+# https://github.com/openshift/origin/issues/6065
+# https://github.com/openshift/origin/issues/6447
 - name: Start and enable master
   service: name={{ openshift.common.service_type }}-master enabled=yes state=started
   when: not openshift_master_ha | bool
   register: start_result
+  until: not start_result | failed  # filter binds tighter than `not`: stop once the registered result is not failed
+  retries: 1  # a single blind retry, per the TODO above this task
+  delay: 60  # seconds to wait between attempts, giving systemd time to restart the master
   notify: Verify API Server
 
 - name: Check for non-HA master service presence