Don't use port open check to determine if reboot completed. Fixes #856.
The old approach, waiting for the machine's SSH port to no longer be
open and then waiting for it to be open again, was insufficient: the
machine could reboot so quickly that the port was open again before
nixops ever noticed it going down. I experienced this on a Hetzner
cloud server.

The new approach waits for the output of `last reboot` on the remote
side to change, which is not racy.
nh2 committed May 26, 2018

1 parent d4b2fc5 commit 2fff961
Showing 5 changed files with 50 additions and 17 deletions.
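
To illustrate the race described in the commit message, here is a minimal standalone sketch of the old pattern (plain Python 3 sockets, hypothetical host and port arguments, not the nixops `wait_for_tcp_port` helper). If the machine reboots faster than the polling interval, the port is never observed closed, so the first loop never exits, which mirrors the failure mode described above.

import socket
import time

def port_open(host, port, timeout=1.0):
    # True if a TCP connection to host:port can be established.
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False

def wait_for_reboot_racy(host, port=22):
    # Old pattern: wait for the port to go down, then come back up.
    # If the reboot is quick, the "down" phase can be missed entirely
    # and the first loop polls forever.
    while port_open(host, port):
        time.sleep(1)
    while not port_open(host, port):
        time.sleep(1)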
47 changes: 39 additions & 8 deletions nixops/backends/__init__.py
@@ -3,6 +3,7 @@
 import os
 import re
 import subprocess
+import time
 
 import nixops.util
 import nixops.resources
@@ -170,7 +171,7 @@ def backup(self, defn, backup_id):
         """Make backup of persistent disks, if possible."""
         self.warn("don't know how to make backup of disks for machine ‘{0}’".format(self.name))
 
-    def reboot(self, hard=False):
+    def reboot(self, hard=False, reset=True):
         """Reboot this machine."""
         self.log("rebooting...")
         if self.state == self.RESCUE:
@@ -182,16 +183,46 @@ def reboot(self, hard=False):
         reboot_command = "systemctl reboot"
         self.run_command(reboot_command, check=False)
         self.state = self.STARTING
-        self.ssh.reset()
+        if reset:
+            self.ssh.reset()
 
     def reboot_sync(self, hard=False):
         """Reboot this machine and wait until it's up again."""
-        self.reboot(hard=hard)
-        self.log_start("waiting for the machine to finish rebooting...")
-        nixops.util.wait_for_tcp_port(self.get_ssh_name(), self.ssh_port, open=False, callback=lambda: self.log_continue("."))
-        self.log_continue("[down]")
-        nixops.util.wait_for_tcp_port(self.get_ssh_name(), self.ssh_port, callback=lambda: self.log_continue("."))
-        self.log_end("[up]")
+
+        # To check when the machine has finished rebooting in a race-free
+        # manner, we compare the output of `last reboot` before and after
+        # the reboot. Once the output has changed, the reboot is done.
+        def get_last_reboot_output():
+            # Note `last reboot` does not exist on older OSs like
+            # the Hetzner rescue system, but that doesn't matter much
+            # because all we care about is when the output of the
+            # command invocation changes.
+            # We use timeout=10 so that the user gets some sense
+            # of progress, as reboots can take a long time.
+            return self.run_command('last reboot --time-format iso | head -n1', capture_stdout=True, timeout=10).rstrip()
+
+        pre_reboot_last_reboot_output = get_last_reboot_output()
+
+        # We set reset=False so that we can continue running `last reboot`
+        # remotely; when the reboot happens, our SSH connection will be reset
+        # by the remote side instead.
+        self.reboot(hard=hard, reset=False)
+
+        self.log_start("waiting for reboot to complete...")
+        while True:
+            last_reboot_output = None
+            try:
+                last_reboot_output = get_last_reboot_output()
+            except (nixops.ssh_util.SSHConnectionFailed, nixops.ssh_util.SSHCommandFailed):
+                # We accept this because the machine might be down,
+                # and show an 'x' as progress indicator in that case.
+                self.log_continue("x")
+            if last_reboot_output is not None and last_reboot_output != pre_reboot_last_reboot_output:
+                break
+            self.log_continue(".")
+            time.sleep(1)
+        self.log_end("done.")
+
         self.state = self.UP
         self.ssh_pinged = True
         self._ssh_pinged_this_time = True
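
For context, the same race-free check as a standalone sketch outside of nixops, assuming Python 3 and non-interactive `ssh` access to the target host; `last_reboot_marker` and `reboot_and_wait` are illustrative names, not nixops APIs.

import subprocess
import time

def last_reboot_marker(host):
    # The first line of `last reboot` identifies the most recent boot;
    # it only changes once the machine has come back up from a new boot.
    return subprocess.check_output(
        ["ssh", host, "last reboot --time-format iso | head -n1"],
        timeout=10).strip()

def reboot_and_wait(host):
    before = last_reboot_marker(host)
    # Trigger the reboot; the remote side tears down the SSH connection.
    subprocess.call(["ssh", host, "systemctl reboot"])
    while True:
        try:
            if last_reboot_marker(host) != before:
                return  # marker changed: a new boot has completed
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
            pass  # machine still down or sshd not accepting connections yet
        time.sleep(1)

Because the check compares a persistent boot record rather than probing whether the SSH port happens to be reachable, there is no transient "down" state that must be observed: either the marker still matches the pre-reboot boot, or the reboot has completed.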
7 changes: 4 additions & 3 deletions nixops/backends/azure_vm.py
@@ -948,14 +948,15 @@ def after_activation(self, defn):
             self._delete_encryption_key(d_id)
 
 
-    def reboot(self, hard=False):
+    def reboot(self, hard=False, reset=True):
         if hard:
             self.log("sending hard reset to Azure machine...")
             self.cmc().virtual_machines.restart(self.resource_group, self.machine_name)
             self.state = self.STARTING
-            self.ssh.reset()
+            if reset:
+                self.ssh.reset()
         else:
-            MachineState.reboot(self, hard=hard)
+            MachineState.reboot(self, hard=hard, reset=reset)
         self.ssh_pinged = False
 
     def start(self):
2 changes: 1 addition & 1 deletion nixops/backends/ec2.py
@@ -1407,7 +1407,7 @@ def _check(self, res):
                     res.messages.append(" * {0} - {1}".format(e.not_before, e.not_after))
 
 
-    def reboot(self, hard=False):
+    def reboot(self, hard=False, reset=True):
         self.log("rebooting EC2 machine...")
         instance = self._get_instance()
         instance.reboot()
4 changes: 2 additions & 2 deletions nixops/backends/gce.py
@@ -530,13 +530,13 @@ def create_node(self, defn):
         self.on_host_maintenance = defn.on_host_maintenance
 
 
-    def reboot(self, hard=False):
+    def reboot(self, hard=False, reset=True):
         if hard:
             self.log("sending hard reset to GCE machine...")
             self.node().reboot()
             self.state = self.STARTING
         else:
-            MachineState.reboot(self, hard=hard)
+            MachineState.reboot(self, hard=hard, reset=reset)
 
 
     def start(self):
7 changes: 4 additions & 3 deletions nixops/backends/hetzner.py
@@ -311,16 +311,17 @@ def _bootstrap_rescue(self, install, partitions):
         self.run_command(cmd)
         self.log_end("done.")
 
-    def reboot(self, hard=False):
+    def reboot(self, hard=False, reset=True):
         if hard:
             self.log_start("sending hard reset to robot... ")
             server = self._get_server_by_ip(self.main_ipv4)
             server.reboot('hard')
             self.log_end("done.")
             self.state = self.STARTING
-            self.ssh.reset()
+            if reset:
+                self.ssh.reset()
         else:
-            MachineState.reboot(self, hard=hard)
+            MachineState.reboot(self, hard=hard, reset=reset)
 
     def reboot_rescue(self, install=False, partitions=None, bootstrap=True,
                       hard=False):
