From ba776edd44604bd83af2f54a223c68b0f686ce96 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o=20Chamley?=
Date: Sat, 28 Jan 2017 14:03:57 +0100
Subject: [PATCH 1/4] Documentation to go from single to multi-master

This adds lengthy documentation on how to go from a single-master setup
to a multi-master setup.
---
 docs/single-to-multi-master.md | 213 +++++++++++++++++++++++++++++++++
 1 file changed, 213 insertions(+)
 create mode 100644 docs/single-to-multi-master.md

diff --git a/docs/single-to-multi-master.md b/docs/single-to-multi-master.md
new file mode 100644
index 0000000000..2ad60b2dd4
--- /dev/null
+++ b/docs/single-to-multi-master.md
@@ -0,0 +1,213 @@
+# Migrating from single to multi-master
+
+This document describes how to go from a single-master cluster (created by kops)
+to a multi-master cluster.
+
+## Warnings
+
+This is a risky procedure that **can lead to data loss** in the etcd cluster.
+Please follow all the backup steps before attempting it. Please read the
+[etcd admin guide](https://github.com/coreos/etcd/blob/v2.2.1/Documentation/admin_guide.md)
+before proceeding.
+
+During this procedure, you will experience **downtime** on the API server, but
+not on the end user services.
+
+## 1 - Backups
+
+### a - Back up the main etcd cluster
+
+```bash
+$ kubectl --namespace=kube-system get pods | grep etcd
+etcd-server-events-ip-172-20-36-161.ec2.internal    1/1    Running    4    2h
+etcd-server-ip-172-20-36-161.ec2.internal           1/1    Running    4    2h
+$ kubectl --namespace=kube-system exec etcd-server-ip-172-20-36-161.ec2.internal -it -- sh
+/ # etcdctl backup --data-dir /var/etcd/data --backup-dir /var/etcd/backup
+/ # mv /var/etcd/backup/ /var/etcd/data/
+/ # exit
+$ kubectl --namespace=kube-system get pod etcd-server-ip-172-20-36-161.ec2.internal -o json | jq '.spec.volumes[] | select(.name | contains("varetcddata")) | .hostPath.path'
+"/mnt/master-vol-0ea119c15602cbb57/var/etcd/data"
+$ ssh admin@
+admin@ip-172-20-36-161:~$ sudo -i
+root@ip-172-20-36-161:~# mv /mnt/master-vol-0ea119c15602cbb57/var/etcd/data/backup /home/admin/
+root@ip-172-20-36-161:~# chown -R admin: /home/admin/backup/
+root@ip-172-20-36-161:~# exit
+admin@ip-172-20-36-161:~$ exit
+$ scp -r admin@:backup/ .
+```
+
+### b - Back up the event etcd cluster
+
+```bash
+$ kubectl --namespace=kube-system exec etcd-server-events-ip-172-20-36-161.ec2.internal -it -- sh
+/ # etcdctl backup --data-dir /var/etcd/data-events --backup-dir /var/etcd/backup
+/ # mv /var/etcd/backup/ /var/etcd/data-events/
+/ # exit
+$ kubectl --namespace=kube-system get pod etcd-server-events-ip-172-20-36-161.ec2.internal -o json | jq '.spec.volumes[] | select(.name | contains("varetcddata")) | .hostPath.path'
+"/mnt/master-vol-0bb5ad222911c6777/var/etcd/data-events"
+$ ssh admin@
+admin@ip-172-20-36-161:~$ sudo -i
+root@ip-172-20-36-161:~# mv /mnt/master-vol-0bb5ad222911c6777/var/etcd/data-events/backup/ /home/admin/backup-events
+root@ip-172-20-36-161:~# chown -R admin: /home/admin/backup-events/
+root@ip-172-20-36-161:~# exit
+admin@ip-172-20-36-161:~$ exit
+$ scp -r admin@:backup-events/ .
+```
+
+## 2 - Add a new master
+
+### a - Create the instance group
+
+Create 1 kops instance group for the first of your new masters, in
+a different AZ from the existing one.
+
+```bash
+$ kops create instancegroup master-
+```
+
+ * ``maxSize`` and ``minSize`` should be 1,
+ * ``role`` should be ``Master``,
+ * only one zone should be listed (see the example spec below).
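+
+For reference, ``kops create instancegroup`` opens an editor on the new instance
+group spec, which should end up looking roughly like the sketch below. The name,
+zone and machine type here are only placeholders, and exact field names can vary
+slightly between kops versions:
+
+```yaml
+apiVersion: kops/v1alpha2
+kind: InstanceGroup
+metadata:
+  name: master-us-east-1c
+spec:
+  role: Master
+  minSize: 1
+  maxSize: 1
+  machineType: m3.medium
+  zones:
+  - us-east-1c
+```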
+
+### b - Reference the new masters in your cluster configuration
+
+*kops will refuse to run etcd clusters with only 2 members, so we have to
+reference a third one, even if we have not created it yet.*
+
+```bash
+$ kops edit cluster myclusterdomain.com
+```
+ * In ``.spec.etcdClusters``, add 2 new members in each cluster, one for each new
+   availability zone.
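+
+Once edited, the etcd cluster definitions should look roughly like the sketch
+below. Member names and zones are illustrative, and depending on your kops
+version each member is referenced by ``zone`` or by ``instanceGroup``:
+
+```yaml
+etcdClusters:
+- etcdMembers:
+  - name: us-east-1a
+    zone: us-east-1a   # newer kops versions use instanceGroup: master-us-east-1a
+  - name: us-east-1b
+    zone: us-east-1b
+  - name: us-east-1c
+    zone: us-east-1c
+  name: main
+- etcdMembers:
+  - name: us-east-1a
+    zone: us-east-1a
+  - name: us-east-1b
+    zone: us-east-1b
+  - name: us-east-1c
+    zone: us-east-1c
+  name: events
+```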
+
+### c - Add a new member to the etcd clusters
+
+**The clusters will stop working until the new member is started**.
+
+```bash
+$ kubectl --namespace=kube-system exec etcd-server-ip-172-20-36-161.ec2.internal -- etcdctl member add etcd- http://etcd-.internal.myclusterdomain.com:2380
+$ kubectl --namespace=kube-system exec etcd-server-events-ip-172-20-36-161.ec2.internal -- etcdctl --endpoint http://127.0.0.1:4002 member add etcd-events- http://etcd-events-.internal.myclusterdomain.com:2381
+```
+
+### d - Launch the new master
+
+```bash
+$ kops update cluster myclusterdomain.com --yes
+# wait for the new master to boot and initialize
+$ ssh admin@
+admin@ip-172-20-116-230:~$ sudo -i
+root@ip-172-20-116-230:~# systemctl stop kubelet
+root@ip-172-20-116-230:~# systemctl stop protokube
+```
+
+Reinitialize the etcd instances:
+* In both ``/etc/kubernetes/manifests/etcd-events.manifest`` and
+``/etc/kubernetes/manifests/etcd.manifest``, edit the
+``ETCD_INITIAL_CLUSTER_STATE`` variable to ``existing``.
+* In the same files, remove the third (not yet created) member from
+``ETCD_INITIAL_CLUSTER`` (see the sketch after the commands below).
+* Delete the containers and the data directories:
+
+```bash
+root@ip-172-20-116-230:~# docker stop $(docker ps | grep "etcd:2.2.1" | awk '{print $1}')
+root@ip-172-20-116-230:~# rm -r /mnt/master-vol-03b97b1249caf379a/var/etcd/data-events/member/
+root@ip-172-20-116-230:~# rm -r /mnt/master-vol-0dbfd1f3c60b8c509/var/etcd/data/member/
+```
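+
+For reference, after the manifest edits described above, the relevant environment
+entries in ``etcd.manifest`` should end up looking roughly like this (member names
+and the domain are illustrative; ``etcd-events.manifest`` is analogous, with the
+events member names and port 2381):
+
+```yaml
+- name: ETCD_INITIAL_CLUSTER_STATE
+  value: existing
+- name: ETCD_INITIAL_CLUSTER
+  value: etcd-us-east-1a=http://etcd-us-east-1a.internal.myclusterdomain.com:2380,etcd-us-east-1b=http://etcd-us-east-1b.internal.myclusterdomain.com:2380
+```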
+
+Launch them again:
+
+```bash
+root@ip-172-20-116-230:~# systemctl start kubelet
+```
+
+At this point, both etcd clusters should be healthy with two members:
+
+```bash
+$ kubectl --namespace=kube-system exec etcd-server-ip-172-20-36-161.ec2.internal -- etcdctl member list
+$ kubectl --namespace=kube-system exec etcd-server-ip-172-20-36-161.ec2.internal -- etcdctl cluster-health
+$ kubectl --namespace=kube-system exec etcd-server-events-ip-172-20-36-161.ec2.internal -- etcdctl --endpoint http://127.0.0.1:4002 member list
+$ kubectl --namespace=kube-system exec etcd-server-events-ip-172-20-36-161.ec2.internal -- etcdctl --endpoint http://127.0.0.1:4002 cluster-health
+```
+
+If not, check ``/var/log/etcd.log`` for problems.
+
+Restart protokube on the new master:
+
+```bash
+root@ip-172-20-116-230:~# systemctl start protokube
+```
+
+## 3 - Add the third master
+
+### a - Create the instance group
+
+Create 1 kops instance group for the third master, in
+a different AZ from the existing ones.
+
+```bash
+$ kops create instancegroup master-
+```
+
+ * ``maxSize`` and ``minSize`` should be 1,
+ * ``role`` should be ``Master``,
+ * only one zone should be listed.
+
+### b - Add a new member to the etcd clusters
+
+ ```bash
+ $ kubectl --namespace=kube-system exec etcd-server-ip-172-20-36-161.ec2.internal -- etcdctl member add etcd- http://etcd-.internal.myclusterdomain.com:2380
+ $ kubectl --namespace=kube-system exec etcd-server-events-ip-172-20-36-161.ec2.internal -- etcdctl --endpoint http://127.0.0.1:4002 member add etcd-events- http://etcd-events-.internal.myclusterdomain.com:2381
+ ```
+
+### c - Launch the third master
+
+ ```bash
+ $ kops update cluster myclusterdomain.com --yes
+ # wait for the third master to boot and initialize
+ $ ssh admin@
+ admin@ip-172-20-139-130:~$ sudo -i
+ root@ip-172-20-139-130:~# systemctl stop kubelet
+ root@ip-172-20-139-130:~# systemctl stop protokube
+ ```
+
+ Reinitialize the etcd instances:
+ * In both ``/etc/kubernetes/manifests/etcd-events.manifest`` and
+ ``/etc/kubernetes/manifests/etcd.manifest``, edit the
+ ``ETCD_INITIAL_CLUSTER_STATE`` variable to ``existing``.
+ * Delete the containers and the data directories:
+
+ ```bash
+ root@ip-172-20-139-130:~# docker stop $(docker ps | grep "etcd:2.2.1" | awk '{print $1}')
+ root@ip-172-20-139-130:~# rm -r /mnt/master-vol-019796c3511a91b4f/var/etcd/data-events/member/
+ root@ip-172-20-139-130:~# rm -r /mnt/master-vol-0c89fd6f6a256b686/var/etcd/data/member/
+ ```
+
+ Launch them again:
+
+ ```bash
+ root@ip-172-20-139-130:~# systemctl start kubelet
+ ```
+
+ At this point, both etcd clusters should be healthy with three members:
+
+ ```bash
+ $ kubectl --namespace=kube-system exec etcd-server-ip-172-20-36-161.ec2.internal -- etcdctl member list
+ $ kubectl --namespace=kube-system exec etcd-server-ip-172-20-36-161.ec2.internal -- etcdctl cluster-health
+ $ kubectl --namespace=kube-system exec etcd-server-events-ip-172-20-36-161.ec2.internal -- etcdctl --endpoint http://127.0.0.1:4002 member list
+ $ kubectl --namespace=kube-system exec etcd-server-events-ip-172-20-36-161.ec2.internal -- etcdctl --endpoint http://127.0.0.1:4002 cluster-health
+ ```
+
+ If not, check ``/var/log/etcd.log`` for problems.
+
+ Restart protokube on the third master:
+
+ ```bash
+ root@ip-172-20-139-130:~# systemctl start protokube
+ ```
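+
+Before moving on to the cleanup, it is worth double-checking that all three
+masters have registered with the API server and are ``Ready``, for example
+(label names can vary between kops and Kubernetes versions):
+
+```bash
+$ kubectl get nodes --show-labels | grep master
+```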
+
+## 4 - Cleanup
+
+To be sure that everything runs smoothly and is set up correctly, it is advised
+to terminate the masters one after the other (always keeping 2 of them up and
+running). They will be restarted with a clean config and should join the others
+without any problems.

From 959c2eec916d78aeb44049057af89790a71dedee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o=20Chamley?=
Date: Thu, 16 Mar 2017 10:18:55 +0100
Subject: [PATCH 2/4] Fix jq expression + link in HA doc

---
 docs/high_availability.md      | 5 +++--
 docs/single-to-multi-master.md | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/docs/high_availability.md b/docs/high_availability.md
index 68d89259c6..c12422648e 100644
--- a/docs/high_availability.md
+++ b/docs/high_availability.md
@@ -29,8 +29,9 @@ In short:
 
 ## Using Kops HA
 
-We can create HA clusters using kops, but only it's important to note that you must plan for this at time of cluster creation. Currently it is not possible to change
-the etcd cluster size (i.e. we cannot change an HA cluster to be non-HA, or a non-HA cluster to be HA.) [Issue #1512](https://github.com/kubernetes/kops/issues/1512)
+We can create HA clusters using kops, but it's important to note that migrating from a single-master
+cluster to a multi-master cluster is a complicated operation (described [here](./single-to-multi-master.md)).
+If possible, try to plan this at time of cluster creation.
 
 When you first call `kops create cluster`, you specify the `--master-zones` flag listing the zones you want your masters to run in, for example:
 
diff --git a/docs/single-to-multi-master.md b/docs/single-to-multi-master.md
index 2ad60b2dd4..7884b9a333 100644
--- a/docs/single-to-multi-master.md
+++ b/docs/single-to-multi-master.md
@@ -25,7 +25,7 @@ $ kubectl --namespace=kube-system exec etcd-server-ip-172-20-36-161.ec2.internal
 / # etcdctl backup --data-dir /var/etcd/data --backup-dir /var/etcd/backup
 / # mv /var/etcd/backup/ /var/etcd/data/
 / # exit
-$ kubectl --namespace=kube-system get pod etcd-server-ip-172-20-36-161.ec2.internal -o json | jq '.spec.volumes[] | select(.name | contains("varetcddata")) | .hostPath.path'
+$ kubectl --namespace=kube-system get pod etcd-server-ip-172-20-36-161.ec2.internal -o json | jq '.spec.volumes[] | select(.name | contains("varetcdata")) | .hostPath.path'
 "/mnt/master-vol-0ea119c15602cbb57/var/etcd/data"
 $ ssh admin@
 admin@ip-172-20-36-161:~$ sudo -i
@@ -43,7 +43,7 @@ $ kubectl --namespace=kube-system exec etcd-server-events-ip-172-20-36-161.ec2.i
 / # etcdctl backup --data-dir /var/etcd/data-events --backup-dir /var/etcd/backup
 / # mv /var/etcd/backup/ /var/etcd/data-events/
 / # exit
-$ kubectl --namespace=kube-system get pod etcd-server-events-ip-172-20-36-161.ec2.internal -o json | jq '.spec.volumes[] | select(.name | contains("varetcddata")) | .hostPath.path'
+$ kubectl --namespace=kube-system get pod etcd-server-events-ip-172-20-36-161.ec2.internal -o json | jq '.spec.volumes[] | select(.name | contains("varetcdata")) | .hostPath.path'
 "/mnt/master-vol-0bb5ad222911c6777/var/etcd/data-events"
 $ ssh admin@
 admin@ip-172-20-36-161:~$ sudo -i

From acb27cbbb2d3bf128a196af8fa665a044194156c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o=20Chamley?=
Date: Wed, 22 Mar 2017 15:30:12 +0100
Subject: [PATCH 3/4] Change example domain and describe downtime

Changes requested by yissachar during review.
---
 docs/single-to-multi-master.md | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/docs/single-to-multi-master.md b/docs/single-to-multi-master.md
index 7884b9a333..e454460a97 100644
--- a/docs/single-to-multi-master.md
+++ b/docs/single-to-multi-master.md
@@ -11,7 +11,9 @@ Please follow all the backup steps before attempting it. Please read the
 [etcd admin guide](https://github.com/coreos/etcd/blob/v2.2.1/Documentation/admin_guide.md)
 before proceeding.
 
 During this procedure, you will experience **downtime** on the API server, but
-not on the end user services.
+not on the end user services. During this downtime, existing pods will continue
+to work, but you will not be able to create new pods, and any existing pod that
+dies will not be restarted.
 
 ## 1 - Backups
 
@@ -75,7 +77,7 @@ $ kops create instancegroup master-
 reference a third one, even if we have not created it yet.*
 
 ```bash
-$ kops edit cluster myclusterdomain.com
+$ kops edit cluster example.com
 ```
  * In ``.spec.etcdClusters``, add 2 new members in each cluster, one for each new
    availability zone.
@@ -85,14 +87,14 @@ $ kops edit cluster example.com
 
 ### c - Add a new member to the etcd clusters
 
 **The clusters will stop working until the new member is started**.
 
 ```bash
-$ kubectl --namespace=kube-system exec etcd-server-ip-172-20-36-161.ec2.internal -- etcdctl member add etcd- http://etcd-.internal.myclusterdomain.com:2380
-$ kubectl --namespace=kube-system exec etcd-server-events-ip-172-20-36-161.ec2.internal -- etcdctl --endpoint http://127.0.0.1:4002 member add etcd-events- http://etcd-events-.internal.myclusterdomain.com:2381
+$ kubectl --namespace=kube-system exec etcd-server-ip-172-20-36-161.ec2.internal -- etcdctl member add etcd- http://etcd-.internal.example.com:2380
+$ kubectl --namespace=kube-system exec etcd-server-events-ip-172-20-36-161.ec2.internal -- etcdctl --endpoint http://127.0.0.1:4002 member add etcd-events- http://etcd-events-.internal.example.com:2381
 ```
 
 ### d - Launch the new master
 
 ```bash
-$ kops update cluster myclusterdomain.com --yes
+$ kops update cluster example.com --yes
 # wait for the new master to boot and initialize
 $ ssh admin@
 admin@ip-172-20-116-230:~$ sudo -i
@@ -155,14 +157,14 @@ $ kops create instancegroup master-
 
 ### b - Add a new member to the etcd clusters
 
 ```bash
- $ kubectl --namespace=kube-system exec etcd-server-ip-172-20-36-161.ec2.internal -- etcdctl member add etcd- http://etcd-.internal.myclusterdomain.com:2380
- $ kubectl --namespace=kube-system exec etcd-server-events-ip-172-20-36-161.ec2.internal -- etcdctl --endpoint http://127.0.0.1:4002 member add etcd-events- http://etcd-events-.internal.myclusterdomain.com:2381
+ $ kubectl --namespace=kube-system exec etcd-server-ip-172-20-36-161.ec2.internal -- etcdctl member add etcd- http://etcd-.internal.example.com:2380
+ $ kubectl --namespace=kube-system exec etcd-server-events-ip-172-20-36-161.ec2.internal -- etcdctl --endpoint http://127.0.0.1:4002 member add etcd-events- http://etcd-events-.internal.example.com:2381
 ```
 
 ### c - Launch the third master
 
 ```bash
- $ kops update cluster myclusterdomain.com --yes
+ $ kops update cluster example.com --yes
 # wait for the third master to boot and initialize
 $ ssh admin@
 admin@ip-172-20-139-130:~$ sudo -i

From fab376eef54073c4fb1d9b2e2cc3fd99481dad75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o=20Chamley?=
Date: Wed, 22 Mar 2017 17:56:33 +0100
Subject: [PATCH 4/4] Explaining why we need the cleanup step

---
 docs/single-to-multi-master.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/single-to-multi-master.md b/docs/single-to-multi-master.md
index e454460a97..76d8b97fb1 100644
--- a/docs/single-to-multi-master.md
+++ b/docs/single-to-multi-master.md
@@ -213,3 +213,8 @@ To be sure that everything runs smoothly and is set up correctly, it is advised
 to terminate the masters one after the other (always keeping 2 of them up and
 running). They will be restarted with a clean config and should join the others
 without any problems.
+
+While optional, this last step allows you to be sure that your masters are
+fully configured by Kops and that there is no residual manual configuration.
+If there are any configuration problems, they will be detected during this step
+and not during a future upgrade or, worse, during a master failure.
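+
+On AWS, one way to do this is to terminate one master instance at a time and let
+its Auto Scaling group replace it, waiting until the new master is ``Ready`` and
+etcd reports a healthy three-member cluster (using the checks above) before
+moving on to the next one. The instance ID below is a placeholder:
+
+```bash
+$ kubectl get nodes
+$ aws ec2 terminate-instances --instance-ids <instance-id-of-one-master>
+```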