From b79250fe9b97afe9092ab17f232d6808e223a392 Mon Sep 17 00:00:00 2001 From: Erik Moeller Date: Mon, 13 Apr 2020 18:24:40 -0700 Subject: [PATCH 1/6] Use poweroff + poll instead of qvm-kill for forced shutdowns Towards #498 --- launcher/sdw_updater_gui/Updater.py | 94 +++++++++++++++++++++++++---- launcher/tests/test_updater.py | 69 +++------------------ 2 files changed, 92 insertions(+), 71 deletions(-) diff --git a/launcher/sdw_updater_gui/Updater.py b/launcher/sdw_updater_gui/Updater.py index 03fa2bf3..63806181 100644 --- a/launcher/sdw_updater_gui/Updater.py +++ b/launcher/sdw_updater_gui/Updater.py @@ -11,9 +11,19 @@ import logging import os import subprocess +import time from datetime import datetime, timedelta from enum import Enum +# Used for VM checks when run in Qubes dom0 +try: + import qubesadmin + + qubes = qubesadmin.Qubes() +except ImportError: + qubes = None + + DATE_FORMAT = "%Y-%m-%d %H:%M:%S" DEFAULT_HOME = ".securedrop_launcher" FLAG_FILE_STATUS_SD_APP = "/home/user/.securedrop_client/sdw-update-status" @@ -403,8 +413,8 @@ def shutdown_and_start_vms(): on, for example. All system AppVMs (sys-net, sys-firewall and sys-usb) need to be restarted. - We use qvm-kill for sys-firewall and sys-net, because a shutdown may fail - if they are currently in use as NetVMs by any of the user's other VMs. + We use a forced shutdown for sys-net and sys-firewall because a regular + shutdown command will return an error if they are in use as NetVMs. """ sdw_vms_in_order = [ @@ -433,17 +443,10 @@ def shutdown_and_start_vms(): sdlog.info("Safely shutting down system VM: {}".format(vm)) _safely_shutdown_vm(vm) - # TODO: Use of qvm-kill should be considered unsafe and may have unexpected - # side effects. We should aim for a more graceful shutdown strategy. unsafe_sys_vms_in_order = ["sys-firewall", "sys-net"] for vm in unsafe_sys_vms_in_order: - sdlog.info("Killing system VM: {}".format(vm)) - try: - subprocess.check_output(["qvm-kill", vm], stderr=subprocess.PIPE) - except subprocess.CalledProcessError as e: - sdlog.error("Error while killing system VM: {}".format(vm)) - sdlog.error(str(e)) - sdlog.error(str(e.stderr)) + sdlog.info("Forcing shutdown of system VM: {}".format(vm)) + _force_shutdown_vm(vm) all_sys_vms_in_order = safe_sys_vms_in_order + unsafe_sys_vms_in_order sdlog.info("Starting fedora-based system VMs after updates") @@ -465,6 +468,75 @@ def _safely_shutdown_vm(vm): return UpdateStatus.UPDATES_FAILED +def _force_shutdown_vm(vm): + """ + Qubes does not yet provide an option to force shutdown for VMs that act as + NetVMs. Issuing a poweroff command and polling for the VM to stop running + is the recommended workaround until then. + + Return value: + - True - if VM was successfully shut down or is not running + - False - if there was a problem shutting down the VM (logged as error) + """ + if vm not in qubes.domains: + sdlog.error( + "Error shutting down VM '{}'. No VM with this name found.".format(vm) + ) + return False + + if qubes.domains[vm].is_running() is False: + sdlog.info("VM '{}' is not running. No shutdown necessary.".format(vm)) + return True + + if qubes.domains[vm].is_paused(): + sdlog.info("VM '{}' is paused. Unpausing before shutdown.".format(vm)) + qubes.domains[vm].unpause() + + try: + qubes.domains[vm].run("poweroff", user="root") + except subprocess.CalledProcessError as e: + # Exit codes 1 and 143 may occur with successful shutdown; log others + if e.returncode != 1 and e.returncode != 143: + sdlog.error( + "Error shutting down VM '{}'. " + "poweroff command returned unexpected exit code {}.".format( + vm, e.returncode + ) + ) + return False + + return _wait_for_is_running(vm, False) + + +def _wait_for_is_running(vm, expected, timeout=60, interval=0.2): + """ + Poll for a VM to enter the given is_running state, and give up after a + timeout is reached. + + Return value: + - True if the VM reached the expeted state + - False if it did not + """ + start_time = time.time() + stop_time = start_time + timeout + while time.time() < stop_time: + state = qubes.domains[vm].is_running() + elapsed = time.time() - start_time + if state == expected: + sdlog.info( + "VM '{}' entered expected state (is_running() is {}) " + "after {:.2f} seconds of polling.".format(vm, expected, elapsed) + ) + return True + time.sleep(interval) + + sdlog.error( + "VM '{}' did not enter expected state (is_running() is {}) " + "in the provided timeout of {} seconds.".format(vm, expected, timeout) + ) + return False + + def _safely_start_vm(vm): try: running_vms = subprocess.check_output( diff --git a/launcher/tests/test_updater.py b/launcher/tests/test_updater.py index f886a546..afd75208 100644 --- a/launcher/tests/test_updater.py +++ b/launcher/tests/test_updater.py @@ -668,15 +668,18 @@ def test_safely_shutdown_fails(mocked_info, mocked_error, mocked_call, vm): @mock.patch("subprocess.check_output") @mock.patch("Updater._safely_start_vm") @mock.patch("Updater._safely_shutdown_vm") +@mock.patch("Updater._force_shutdown_vm") @mock.patch("Updater.sdlog.error") @mock.patch("Updater.sdlog.info") def test_shutdown_and_start_vms( - mocked_info, mocked_error, mocked_shutdown, mocked_start, mocked_output + mocked_info, + mocked_error, + mocked_force_shutdown, + mocked_shutdown, + mocked_start, + mocked_output, ): - sys_vm_kill_calls = [ - call(["qvm-kill", "sys-firewall"], stderr=-1), - call(["qvm-kill", "sys-net"], stderr=-1), - ] + force_shutdown_calls = [call("sys-firewall"), call("sys-net")] sys_vm_shutdown_calls = [ call("sys-usb"), call("sys-whonix"), @@ -705,7 +708,7 @@ def test_shutdown_and_start_vms( call("sd-log"), ] updater.shutdown_and_start_vms() - mocked_output.assert_has_calls(sys_vm_kill_calls) + mocked_force_shutdown.assert_has_calls(force_shutdown_calls) mocked_shutdown.assert_has_calls( template_vm_calls + app_vm_calls + sys_vm_shutdown_calls ) @@ -714,60 +717,6 @@ def test_shutdown_and_start_vms( assert not mocked_error.called -@mock.patch( - "subprocess.check_output", - side_effect=subprocess.CalledProcessError(1, "check_output"), -) -@mock.patch("Updater._safely_start_vm") -@mock.patch("Updater._safely_shutdown_vm") -@mock.patch("Updater.sdlog.error") -@mock.patch("Updater.sdlog.info") -def test_shutdown_and_start_vms_sysvm_fail( - mocked_info, mocked_error, mocked_shutdown, mocked_start, mocked_output -): - sys_vm_kill_calls = [ - call(["qvm-kill", "sys-firewall"], stderr=-1), - call(["qvm-kill", "sys-net"], stderr=-1), - ] - sys_vm_start_calls = [ - call("sys-net"), - call("sys-firewall"), - call("sys-whonix"), - call("sys-usb"), - ] - app_vm_calls = [ - call("sd-app"), - call("sd-proxy"), - call("sd-whonix"), - call("sd-gpg"), - call("sd-log"), - ] - template_vm_calls = [ - call("fedora-30"), - call("sd-viewer-buster-template"), - call("sd-app-buster-template"), - call("sd-log-buster-template"), - call("sd-devices-buster-template"), - call("sd-proxy-buster-template"), - call("whonix-gw-15"), - call("securedrop-workstation-buster"), - ] - error_calls = [ - call("Error while killing system VM: sys-firewall"), - call("Command 'check_output' returned non-zero exit status 1."), - call("None"), - call("Error while killing system VM: sys-net"), - call("Command 'check_output' returned non-zero exit status 1."), - call("None"), - ] - updater.shutdown_and_start_vms() - mocked_output.assert_has_calls(sys_vm_kill_calls) - mocked_shutdown.assert_has_calls(template_vm_calls + app_vm_calls) - app_vm_calls_reversed = list(reversed(app_vm_calls)) - mocked_start.assert_has_calls(sys_vm_start_calls + app_vm_calls_reversed) - mocked_error.assert_has_calls(error_calls) - - @pytest.mark.parametrize("status", UpdateStatus) @mock.patch("subprocess.check_call") @mock.patch("os.path.expanduser", return_value=temp_dir) From 47c6a3c77bdac5e7a0c5b1f6406bbec85da11183 Mon Sep 17 00:00:00 2001 From: Erik Moeller Date: Tue, 14 Apr 2020 12:21:49 -0700 Subject: [PATCH 2/6] Fix typo in comment --- launcher/sdw_updater_gui/Updater.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher/sdw_updater_gui/Updater.py b/launcher/sdw_updater_gui/Updater.py index 63806181..4dcb51fe 100644 --- a/launcher/sdw_updater_gui/Updater.py +++ b/launcher/sdw_updater_gui/Updater.py @@ -514,7 +514,7 @@ def _wait_for_is_running(vm, expected, timeout=60, interval=0.2): timeout is reached. Return value: - - True if the VM reached the expeted state + - True if the VM reached the expected state - False if it did not """ start_time = time.time() From 3de355f9087b9d7f8c35b5a4f1447e07dbc8e997 Mon Sep 17 00:00:00 2001 From: Erik Moeller Date: Wed, 15 Apr 2020 00:23:23 -0700 Subject: [PATCH 3/6] Add first test for VM polling --- launcher/tests/test_updater.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/launcher/tests/test_updater.py b/launcher/tests/test_updater.py index afd75208..8e805ce3 100644 --- a/launcher/tests/test_updater.py +++ b/launcher/tests/test_updater.py @@ -1,6 +1,7 @@ import json import os import pytest +import time import subprocess from importlib.machinery import SourceFileLoader from datetime import datetime, timedelta @@ -504,6 +505,24 @@ def test_apply_updates_dom0_failure(mocked_info, mocked_error, mocked_call): mocked_error.assert_has_calls(error_log) +@mock.patch("Updater.sdlog.error") +@mock.patch("Updater.sdlog.info") +def test_vm_polling(mocked_info, mocked_error): + def mock_api(results): + for r in results: + yield r + time.sleep(0.1) + + with mock.patch("Updater.qubes") as mocked_qubes: + mocked_qubes.domains = {"sys-net": mock.MagicMock()} + mocked_qubes.domains["sys-net"].is_running = mock.MagicMock( + side_effect=mock_api((True, True, False)) + ) + assert updater._wait_for_is_running("sys-net", False, timeout=1) is True + assert mocked_info.called + assert not mocked_error.called + + @pytest.mark.parametrize("vm", current_templates.keys()) @mock.patch("subprocess.check_call", side_effect="0") @mock.patch("Updater.sdlog.error") From 111ccb039fe66b2b6cc56fde7f821bdf9c441908 Mon Sep 17 00:00:00 2001 From: Erik Moeller Date: Wed, 15 Apr 2020 12:24:51 -0700 Subject: [PATCH 4/6] Make polling code more generic, add tests --- launcher/sdw_updater_gui/Updater.py | 24 +++++++------- launcher/tests/test_updater.py | 49 ++++++++++++++++++++--------- 2 files changed, 48 insertions(+), 25 deletions(-) diff --git a/launcher/sdw_updater_gui/Updater.py b/launcher/sdw_updater_gui/Updater.py index 4dcb51fe..c53c5320 100644 --- a/launcher/sdw_updater_gui/Updater.py +++ b/launcher/sdw_updater_gui/Updater.py @@ -19,7 +19,7 @@ try: import qubesadmin - qubes = qubesadmin.Qubes() + qubes = qubesadmin.Qubes() # pragma: no cover except ImportError: qubes = None @@ -505,13 +505,13 @@ def _force_shutdown_vm(vm): ) return False - return _wait_for_is_running(vm, False) + return _wait_for(vm, (lambda vm: qubes.domains[vm].is_running() is False)) -def _wait_for_is_running(vm, expected, timeout=60, interval=0.2): +def _wait_for(vm, condition, timeout=60, interval=0.2): """ - Poll for a VM to enter the given is_running state, and give up after a - timeout is reached. + Poll for a VM to enter the state using the function condition() + that must return True when the state is reached Return value: - True if the VM reached the expected state @@ -520,19 +520,21 @@ def _wait_for_is_running(vm, expected, timeout=60, interval=0.2): start_time = time.time() stop_time = start_time + timeout while time.time() < stop_time: - state = qubes.domains[vm].is_running() + # Evaluate condition before time measurement to include its runtime + condition_reached = condition(vm) elapsed = time.time() - start_time - if state == expected: + if condition_reached: sdlog.info( - "VM '{}' entered expected state (is_running() is {}) " - "after {:.2f} seconds of polling.".format(vm, expected, elapsed) + "VM '{}' entered expected state after {:.2f} seconds of " + "polling.".format(vm, elapsed) ) + return True time.sleep(interval) sdlog.error( - "VM '{}' did not enter expected state (is_running() is {}) " - "in the provided timeout of {} seconds.".format(vm, expected, timeout) + "VM '{}' did not enter expected state in the provided timeout of " + "{} seconds.".format(vm, timeout) ) return False diff --git a/launcher/tests/test_updater.py b/launcher/tests/test_updater.py index 8e805ce3..f7d23407 100644 --- a/launcher/tests/test_updater.py +++ b/launcher/tests/test_updater.py @@ -1,8 +1,8 @@ import json import os import pytest -import time import subprocess +import re from importlib.machinery import SourceFileLoader from datetime import datetime, timedelta from tempfile import TemporaryDirectory @@ -57,6 +57,13 @@ "sd-viewer": UpdateStatus.UPDATES_REQUIRED, } +VM_POLLING_REGEX_SUCCESS = ( + r"VM '.*' entered expected state after (.*) seconds of polling." +) +VM_POLLING_REGEX_FAILURE = ( + r"VM '.*' did not enter expected state in the provided timeout of (.*) seconds." +) + def test_updater_vms_present(): assert len(updater.current_templates) == 9 @@ -507,20 +514,34 @@ def test_apply_updates_dom0_failure(mocked_info, mocked_error, mocked_call): @mock.patch("Updater.sdlog.error") @mock.patch("Updater.sdlog.info") -def test_vm_polling(mocked_info, mocked_error): - def mock_api(results): - for r in results: - yield r - time.sleep(0.1) +def test_vm_polling_success(mocked_info, mocked_error): + poll_results = mock.MagicMock(side_effect=(False, False, True)) + assert updater._wait_for("sys-net", poll_results, interval=0.1, timeout=1) is True + assert mocked_info.called + info_string = mocked_info.call_args[0][0] + match = re.search(VM_POLLING_REGEX_SUCCESS, info_string) + assert match is not None + elapsed = float(match.group(1)) + # With a sleep interval of 0.1, at least 0.2 seconds should pass before we + # get the expected result. + assert elapsed >= 0.20 + assert not mocked_error.called - with mock.patch("Updater.qubes") as mocked_qubes: - mocked_qubes.domains = {"sys-net": mock.MagicMock()} - mocked_qubes.domains["sys-net"].is_running = mock.MagicMock( - side_effect=mock_api((True, True, False)) - ) - assert updater._wait_for_is_running("sys-net", False, timeout=1) is True - assert mocked_info.called - assert not mocked_error.called + +@mock.patch("Updater.sdlog.error") +@mock.patch("Updater.sdlog.info") +def test_vm_polling_failure(mocked_info, mocked_error): + poll_results = mock.MagicMock(side_effect=(False, False, False)) + assert ( + updater._wait_for("sys-net", poll_results, interval=0.1, timeout=0.3) is False + ) + assert not mocked_info.called + assert mocked_error.called + error_string = mocked_error.call_args[0][0] + match = re.search(VM_POLLING_REGEX_FAILURE, error_string) + assert match is not None + timeout = float(match.group(1)) + assert timeout == 0.30 @pytest.mark.parametrize("vm", current_templates.keys()) From dc372ce6ae11eca57868ba6be8aca7bf76394a3a Mon Sep 17 00:00:00 2001 From: Erik Moeller Date: Wed, 15 Apr 2020 17:07:24 -0700 Subject: [PATCH 5/6] Complete test coverage --- launcher/tests/test_updater.py | 107 +++++++++++++++++++++++++++++++-- 1 file changed, 103 insertions(+), 4 deletions(-) diff --git a/launcher/tests/test_updater.py b/launcher/tests/test_updater.py index f7d23407..b8e50606 100644 --- a/launcher/tests/test_updater.py +++ b/launcher/tests/test_updater.py @@ -57,10 +57,14 @@ "sd-viewer": UpdateStatus.UPDATES_REQUIRED, } -VM_POLLING_REGEX_SUCCESS = ( +VM_SHUTDOWN_FAILURE_REGEX = ( + r"Error shutting down VM '.*'. poweroff command returned unexpected exit code (.*)." +) + +VM_POLLING_SUCCESS_REGEX = ( r"VM '.*' entered expected state after (.*) seconds of polling." ) -VM_POLLING_REGEX_FAILURE = ( +VM_POLLING_FAILURE_REGEX = ( r"VM '.*' did not enter expected state in the provided timeout of (.*) seconds." ) @@ -512,6 +516,101 @@ def test_apply_updates_dom0_failure(mocked_info, mocked_error, mocked_call): mocked_error.assert_has_calls(error_log) +@mock.patch("Updater._wait_for") +@mock.patch("Updater.qubes") +@mock.patch("Updater.sdlog.error") +@mock.patch("Updater.sdlog.info") +def test_force_shutdown_vm_not_found( + mocked_info, mocked_error, mocked_qubes, mocked_wait +): + mocked_qubes.domains = {} + assert updater._force_shutdown_vm("sys-net") is False + assert mocked_error.called + assert not mocked_wait.called + + +@mock.patch("Updater._wait_for") +@mock.patch("Updater.qubes") +@mock.patch("Updater.sdlog.error") +@mock.patch("Updater.sdlog.info") +def test_force_shutdown_vm_not_running( + mocked_info, mocked_error, mocked_qubes, mocked_wait +): + mocked_qubes.domains = {"sys-net": mock.MagicMock()} + mocked_qubes.domains["sys-net"].is_running = mock.MagicMock(return_value=False) + assert updater._force_shutdown_vm("sys-net") is True + assert not mocked_error.called + assert mocked_info.called + assert not mocked_wait.called + + +@mock.patch("Updater._wait_for", return_value=True) +@mock.patch("Updater.qubes") +@mock.patch("Updater.sdlog.error") +@mock.patch("Updater.sdlog.info") +def test_force_shutdown_unpauses_and_shuts_down( + mocked_info, mocked_error, mocked_qubes, mocked_wait +): + mocked_qubes.domains = {"sys-net": mock.MagicMock()} + mocked_qubes.domains["sys-net"].is_running = mock.MagicMock(return_value=True) + mocked_qubes.domains["sys-net"].is_paused = mock.MagicMock(return_value=True) + + assert updater._force_shutdown_vm("sys-net") is True + assert mocked_qubes.domains["sys-net"].unpause.called + assert mocked_qubes.domains["sys-net"].run.called + + assert not mocked_error.called + assert mocked_info.called + assert mocked_wait.called + + +@pytest.mark.parametrize("good_returncode", (1, 143)) +@mock.patch("Updater._wait_for", return_value=True) +@mock.patch("Updater.qubes") +@mock.patch("Updater.sdlog.error") +@mock.patch("Updater.sdlog.info") +def test_force_shutdown_expected_return_code_is_handled( + mocked_info, mocked_error, mocked_qubes, mocked_wait, good_returncode +): + mocked_qubes.domains = {"sys-net": mock.MagicMock()} + mocked_qubes.domains["sys-net"].is_running = mock.MagicMock(return_value=True) + mocked_qubes.domains["sys-net"].run = mock.MagicMock( + side_effect=subprocess.CalledProcessError(returncode=good_returncode, cmd="") + ) + assert updater._force_shutdown_vm("sys-net") is True + assert mocked_qubes.domains["sys-net"].run.called + assert mocked_wait.called + + +@mock.patch("Updater._wait_for") +@mock.patch("Updater.qubes") +@mock.patch("Updater.sdlog.error") +@mock.patch("Updater.sdlog.info") +def test_force_shutdown_unexpected_return_code_is_reported( + mocked_info, mocked_error, mocked_qubes, mocked_wait +): + mocked_qubes.domains = {"sys-net": mock.MagicMock()} + mocked_qubes.domains["sys-net"].is_running = mock.MagicMock(return_value=True) + mocked_qubes.domains["sys-net"].run = mock.MagicMock( + side_effect=subprocess.CalledProcessError(returncode=666, cmd="") + ) + assert updater._force_shutdown_vm("sys-net") is False + assert mocked_qubes.domains["sys-net"].run.called + assert mocked_error.called + error_string = mocked_error.call_args[0][0] + match = re.search(VM_SHUTDOWN_FAILURE_REGEX, error_string) + assert match is not None + found_code = int(match.group(1)) + assert found_code == 666 + assert not mocked_wait.called + + +@mock.patch("Updater.sdlog.error") +@mock.patch("Updater.sdlog.info") +def test_force_shutdown_failure(mocked_info, mocked_error): + assert True + + @mock.patch("Updater.sdlog.error") @mock.patch("Updater.sdlog.info") def test_vm_polling_success(mocked_info, mocked_error): @@ -519,7 +618,7 @@ def test_vm_polling_success(mocked_info, mocked_error): assert updater._wait_for("sys-net", poll_results, interval=0.1, timeout=1) is True assert mocked_info.called info_string = mocked_info.call_args[0][0] - match = re.search(VM_POLLING_REGEX_SUCCESS, info_string) + match = re.search(VM_POLLING_SUCCESS_REGEX, info_string) assert match is not None elapsed = float(match.group(1)) # With a sleep interval of 0.1, at least 0.2 seconds should pass before we @@ -538,7 +637,7 @@ def test_vm_polling_failure(mocked_info, mocked_error): assert not mocked_info.called assert mocked_error.called error_string = mocked_error.call_args[0][0] - match = re.search(VM_POLLING_REGEX_FAILURE, error_string) + match = re.search(VM_POLLING_FAILURE_REGEX, error_string) assert match is not None timeout = float(match.group(1)) assert timeout == 0.30 From b0154ba78cbc21eb4b78b6136ece4f5ee53dc583 Mon Sep 17 00:00:00 2001 From: Erik Moeller Date: Wed, 15 Apr 2020 18:02:34 -0700 Subject: [PATCH 6/6] Explain why we accept return codes 1 and 143 --- launcher/sdw_updater_gui/Updater.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/launcher/sdw_updater_gui/Updater.py b/launcher/sdw_updater_gui/Updater.py index c53c5320..01c84c14 100644 --- a/launcher/sdw_updater_gui/Updater.py +++ b/launcher/sdw_updater_gui/Updater.py @@ -495,7 +495,10 @@ def _force_shutdown_vm(vm): try: qubes.domains[vm].run("poweroff", user="root") except subprocess.CalledProcessError as e: - # Exit codes 1 and 143 may occur with successful shutdown; log others + # Exit codes 1 and 143 occur with successful shutdown (signifying + # the termination of the qrexec connection or a system process). + # poweroff does not provide a return code until the system is fully + # shut down; we never see exit code 0 from dom0. if e.returncode != 1 and e.returncode != 143: sdlog.error( "Error shutting down VM '{}'. "