From 02dd132e0883b1c2c778e0f548c2c6934829b2ad Mon Sep 17 00:00:00 2001 From: DavHau Date: Fri, 9 Feb 2024 19:46:32 +0700 Subject: [PATCH] vms: init graceful shutdown for GUI - add python modules for qemu protocols: QMP (hardware interactions) and QGA (guest service interaction) - refactor state directory: remove name from path (already contains url) - add impure vm test for basic qmp interaction - simplify existing vm persistance test (factor out shared code) - integrate graceful shutdown into GUI the GUI integration still needs to be improved later: - add fallback in case system doesn't react to powerdown button - shutdown GUI switch fails if VM hasn't been started yet, and then remains in a wrong position --- nixosModules/clanCore/vm.nix | 7 + pkgs/clan-cli/clan_cli/dirs.py | 15 +- pkgs/clan-cli/clan_cli/flakes/inspect.py | 7 +- pkgs/clan-cli/clan_cli/machines/machines.py | 35 ++++ pkgs/clan-cli/clan_cli/vms/run.py | 5 +- pkgs/clan-cli/qemu/__init__.py | 0 pkgs/clan-cli/{clan_cli => }/qemu/qga.py | 2 +- pkgs/clan-cli/{clan_cli => }/qemu/qmp.py | 0 pkgs/clan-cli/tests/fixtures_flakes.py | 5 +- pkgs/clan-cli/tests/test_dirs.py | 10 +- pkgs/clan-cli/tests/test_vms_cli.py | 171 +++++++++++------- .../clan_vm_manager/models/use_vms.py | 22 ++- 12 files changed, 186 insertions(+), 93 deletions(-) create mode 100644 pkgs/clan-cli/qemu/__init__.py rename pkgs/clan-cli/{clan_cli => }/qemu/qga.py (97%) rename pkgs/clan-cli/{clan_cli => }/qemu/qmp.py (100%) diff --git a/nixosModules/clanCore/vm.nix b/nixosModules/clanCore/vm.nix index 2dcd3ddd..11e82708 100644 --- a/nixosModules/clanCore/vm.nix +++ b/nixosModules/clanCore/vm.nix @@ -30,6 +30,13 @@ let # required for issuing shell commands via qga services.qemuGuest.enable = true; + # required to react to system_powerdown qmp command + # Some desktop managers like xfce override the poweroff signal and therefore + # make it impossible to handle it via 'logind' diretly. + services.acpid.enable = true; + services.acpid.handlers.power.event = "button/power.*"; + services.acpid.handlers.power.action = "poweroff"; + boot.initrd.systemd.enable = true; # currently needed for system.etc.overlay.enable diff --git a/pkgs/clan-cli/clan_cli/dirs.py b/pkgs/clan-cli/clan_cli/dirs.py index a1614807..abcf8325 100644 --- a/pkgs/clan-cli/clan_cli/dirs.py +++ b/pkgs/clan-cli/clan_cli/dirs.py @@ -15,9 +15,12 @@ def find_git_repo_root() -> Path | None: return find_toplevel([".git"]) -def clan_key_safe(clan_name: str, flake_url: str) -> str: +def clan_key_safe(flake_url: str) -> str: + """ + only embed the url in the path, not the clan name, as it would involve eval. + """ quoted_url = urllib.parse.quote_plus(flake_url) - return f"{clan_name}-{quoted_url}" + return f"{quoted_url}" def find_toplevel(top_level_files: list[str]) -> Path | None: @@ -69,10 +72,10 @@ def user_gcroot_dir() -> Path: return p -def machine_gcroot(*, clan_name: str, flake_url: str) -> Path: +def machine_gcroot(flake_url: str) -> Path: # Always build icon so that we can symlink it to the gcroot gcroot_dir = user_gcroot_dir() - clan_gcroot = gcroot_dir / clan_key_safe(clan_name, flake_url) + clan_gcroot = gcroot_dir / clan_key_safe(flake_url) clan_gcroot.mkdir(parents=True, exist_ok=True) return clan_gcroot @@ -81,8 +84,8 @@ def user_history_file() -> Path: return user_config_dir() / "clan" / "history" -def vm_state_dir(clan_name: str, flake_url: str, vm_name: str) -> Path: - clan_key = clan_key_safe(clan_name, flake_url) +def vm_state_dir(flake_url: str, vm_name: str) -> Path: + clan_key = clan_key_safe(flake_url) return user_data_dir() / "clan" / "vmstate" / clan_key / vm_name diff --git a/pkgs/clan-cli/clan_cli/flakes/inspect.py b/pkgs/clan-cli/clan_cli/flakes/inspect.py index 3e2c45ea..9897ed82 100644 --- a/pkgs/clan-cli/clan_cli/flakes/inspect.py +++ b/pkgs/clan-cli/clan_cli/flakes/inspect.py @@ -50,10 +50,7 @@ def inspect_flake(flake_url: str | Path, machine_name: str) -> FlakeConfig: # Make symlink to gcroots from vm.machine_icon if vm.machine_icon: - gcroot_icon: Path = ( - machine_gcroot(clan_name=vm.clan_name, flake_url=str(flake_url)) - / vm.machine_name - ) + gcroot_icon: Path = machine_gcroot(flake_url=str(flake_url)) / vm.machine_name nix_add_to_gcroots(vm.machine_icon, gcroot_icon) # Get the cLAN name @@ -83,7 +80,7 @@ def inspect_flake(flake_url: str | Path, machine_name: str) -> FlakeConfig: [ f'{flake_url}#clanInternals.machines."{system}"."{machine_name}".config.clanCore.clanIcon' ], - machine_gcroot(clan_name=clan_name, flake_url=str(flake_url)) / "clanIcon", + machine_gcroot(flake_url=str(flake_url)) / "clanIcon", ) run_cmd(cmd) diff --git a/pkgs/clan-cli/clan_cli/machines/machines.py b/pkgs/clan-cli/clan_cli/machines/machines.py index 07874dc1..a09b1dca 100644 --- a/pkgs/clan-cli/clan_cli/machines/machines.py +++ b/pkgs/clan-cli/clan_cli/machines/machines.py @@ -1,6 +1,11 @@ import json import logging +from os import path from pathlib import Path +from time import sleep + +from clan_cli.dirs import vm_state_dir +from qemu.qmp import QEMUMonitorProtocol from ..cmd import run from ..errors import ClanError @@ -30,6 +35,14 @@ class Machine: self.build_cache: dict[str, Path] = {} self._deployment_info: None | dict[str, str] = deployment_info + state_dir = vm_state_dir(flake_url=str(self.flake), vm_name=self.name) + + self.qmp_socket: Path = state_dir / "qmp.sock" + self.qga_socket: Path = state_dir / "qga.sock" + + print(f"qmp_socket: {self.qmp_socket}") + self._qmp = QEMUMonitorProtocol(path.realpath(self.qmp_socket)) + self._qmp_connected = False def __str__(self) -> str: return f"Machine(name={self.name}, flake={self.flake})" @@ -46,6 +59,28 @@ class Machine: ) return self._deployment_info + def qmp_connect(self) -> None: + if not self._qmp_connected: + tries = 100 + for num in range(tries): + try: + # the socket file link might be outdated, therefore re-init the qmp object + self._qmp = QEMUMonitorProtocol(path.realpath(self.qmp_socket)) + self._qmp.connect() + self._qmp_connected = True + log.debug("QMP Connected") + return + except FileNotFoundError: + if num < 99: + sleep(0.1) + continue + else: + raise + + def qmp_command(self, command: str) -> dict: + self.qmp_connect() + return self._qmp.command(command) + @property def target_host_address(self) -> str: # deploymentAddress is deprecated. diff --git a/pkgs/clan-cli/clan_cli/vms/run.py b/pkgs/clan-cli/clan_cli/vms/run.py index 4caf5305..067a2c55 100644 --- a/pkgs/clan-cli/clan_cli/vms/run.py +++ b/pkgs/clan-cli/clan_cli/vms/run.py @@ -159,8 +159,7 @@ def get_vm_create_info( f'{clan_dir}#clanInternals.machines."{system}"."{machine.name}".config.system.clan.vm.create', *nix_options, ], - machine_gcroot(clan_name=vm.clan_name, flake_url=str(vm.flake_url)) - / f"vm-{machine.name}", + machine_gcroot(flake_url=str(vm.flake_url)) / f"vm-{machine.name}", ) proc = run( cmd, log=Log.BOTH, error_msg=f"Could not build vm config for {machine.name}" @@ -298,7 +297,7 @@ def run_vm( secrets_dir = get_secrets(machine, tmpdir) - state_dir = vm_state_dir(vm.clan_name, str(vm.flake_url), machine.name) + state_dir = vm_state_dir(str(vm.flake_url), machine.name) state_dir.mkdir(parents=True, exist_ok=True) # specify socket files for qmp and qga diff --git a/pkgs/clan-cli/qemu/__init__.py b/pkgs/clan-cli/qemu/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pkgs/clan-cli/clan_cli/qemu/qga.py b/pkgs/clan-cli/qemu/qga.py similarity index 97% rename from pkgs/clan-cli/clan_cli/qemu/qga.py rename to pkgs/clan-cli/qemu/qga.py index ff409466..316d9767 100644 --- a/pkgs/clan-cli/clan_cli/qemu/qga.py +++ b/pkgs/clan-cli/qemu/qga.py @@ -12,7 +12,7 @@ from time import sleep class QgaSession: def __init__(self, socket_file: Path | str) -> None: self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - # try to reconnect a couple of times if connetion refused + # try to reconnect a couple of times if connection refused for _ in range(100): try: self.sock.connect(str(socket_file)) diff --git a/pkgs/clan-cli/clan_cli/qemu/qmp.py b/pkgs/clan-cli/qemu/qmp.py similarity index 100% rename from pkgs/clan-cli/clan_cli/qemu/qmp.py rename to pkgs/clan-cli/qemu/qmp.py diff --git a/pkgs/clan-cli/tests/fixtures_flakes.py b/pkgs/clan-cli/tests/fixtures_flakes.py index b29868ad..241fc30e 100644 --- a/pkgs/clan-cli/tests/fixtures_flakes.py +++ b/pkgs/clan-cli/tests/fixtures_flakes.py @@ -41,7 +41,10 @@ class FlakeForTest(NamedTuple): def generate_flake( temporary_home: Path, flake_template: Path, - substitutions: dict[str, str] = {}, + substitutions: dict[str, str] = { + "__CHANGE_ME__": "_test_vm_persistence", + "git+https://git.clan.lol/clan/clan-core": "path://" + str(CLAN_CORE), + }, # define the machines directly including their config machine_configs: dict[str, dict] = {}, ) -> FlakeForTest: diff --git a/pkgs/clan-cli/tests/test_dirs.py b/pkgs/clan-cli/tests/test_dirs.py index 75b01d6a..ac262683 100644 --- a/pkgs/clan-cli/tests/test_dirs.py +++ b/pkgs/clan-cli/tests/test_dirs.py @@ -20,16 +20,16 @@ from clan_cli.dirs import clan_key_safe, vm_state_dir def test_clan_key_safe() -> None: - assert clan_key_safe("clan1", "/foo/bar") == "clan1-%2Ffoo%2Fbar" + assert clan_key_safe("/foo/bar") == "%2Ffoo%2Fbar" def test_vm_state_dir_identity() -> None: - dir1 = vm_state_dir("clan1", "https://some.clan", "vm1") - dir2 = vm_state_dir("clan1", "https://some.clan", "vm1") + dir1 = vm_state_dir("https://some.clan", "vm1") + dir2 = vm_state_dir("https://some.clan", "vm1") assert str(dir1) == str(dir2) def test_vm_state_dir_no_collision() -> None: - dir1 = vm_state_dir("clan1", "/foo/bar", "vm1") - dir2 = vm_state_dir("clan1", "https://some.clan", "vm1") + dir1 = vm_state_dir("/foo/bar", "vm1") + dir2 = vm_state_dir("https://some.clan", "vm1") assert str(dir1) != str(dir2) diff --git a/pkgs/clan-cli/tests/test_vms_cli.py b/pkgs/clan-cli/tests/test_vms_cli.py index 74e3c21f..4345ca89 100644 --- a/pkgs/clan-cli/tests/test_vms_cli.py +++ b/pkgs/clan-cli/tests/test_vms_cli.py @@ -12,8 +12,8 @@ from fixtures_flakes import FlakeForTest, generate_flake from root import CLAN_CORE from clan_cli.dirs import vm_state_dir -from clan_cli.qemu.qga import QgaSession -from clan_cli.qemu.qmp import QEMUMonitorProtocol +from qemu.qga import QgaSession +from qemu.qmp import QEMUMonitorProtocol if TYPE_CHECKING: from age_keys import KeyPair @@ -21,6 +21,54 @@ if TYPE_CHECKING: no_kvm = not os.path.exists("/dev/kvm") +def run_vm_in_thread(machine_name: str) -> None: + # runs machine and prints exceptions + def run() -> None: + try: + Cli().run(["vms", "run", machine_name]) + except Exception: + # print exception details + print(traceback.format_exc(), file=sys.stderr) + print(sys.exc_info()[2], file=sys.stderr) + + # run the machine in a separate thread + t = threading.Thread(target=run, name="run") + t.daemon = True + t.start() + + +# wait for qmp socket to exist +def wait_vm_up(state_dir: Path) -> None: + socket_file = state_dir / "qga.sock" + while True: + if socket_file.exists(): + break + sleep(0.1) + + +# wait for vm to be down by checking if qga socket is down +def wait_vm_down(state_dir: Path) -> None: + socket_file = state_dir / "qga.sock" + while socket_file.exists(): + sleep(0.1) + + +# wait for vm to be up then connect and return qmp instance +def qmp_connect(state_dir: Path) -> QEMUMonitorProtocol: + wait_vm_up(state_dir) + qmp = QEMUMonitorProtocol( + address=str(os.path.realpath(state_dir / "qmp.sock")), + ) + qmp.connect() + return qmp + + +# wait for vm to be up then connect and return qga instance +def qga_connect(state_dir: Path) -> QgaSession: + wait_vm_up(state_dir) + return QgaSession(os.path.realpath(state_dir / "qga.sock")) + + @pytest.mark.impure def test_inspect( test_flake_with_core: FlakeForTest, capsys: pytest.CaptureFixture @@ -55,19 +103,56 @@ def test_run( @pytest.mark.skipif(no_kvm, reason="Requires KVM") @pytest.mark.impure -def test_vm_persistence( +def test_vm_qmp( monkeypatch: pytest.MonkeyPatch, temporary_home: Path, - age_keys: list["KeyPair"], ) -> None: - monkeypatch.setenv("SOPS_AGE_KEY", age_keys[0].privkey) + # set up a simple clan flake + flake = generate_flake( + temporary_home, + flake_template=CLAN_CORE / "templates" / "new-clan", + machine_configs=dict( + my_machine=dict( + clan=dict( + virtualisation=dict(graphics=False), + networking=dict(targetHost="client"), + ), + services=dict(getty=dict(autologinUser="root")), + ) + ), + ) + + # 'clan vms run' must be executed from within the flake + monkeypatch.chdir(flake.path) + + # the state dir is a point of reference for qemu interactions as it links to the qga/qmp sockets + state_dir = vm_state_dir(str(flake.path), "my_machine") + + # start the VM + run_vm_in_thread("my_machine") + + # connect with qmp + qmp = qmp_connect(state_dir) + + # verify that issuing a command works + # result = qmp.cmd_obj({"execute": "query-status"}) + result = qmp.command("query-status") + assert result["status"] == "running", result + + # shutdown machine (prevent zombie qemu processes) + qmp.command("system_powerdown") + + +@pytest.mark.skipif(no_kvm, reason="Requires KVM") +@pytest.mark.impure +def test_vm_persistence( + monkeypatch: pytest.MonkeyPatch, + temporary_home: Path, +) -> None: + # set up a clan flake with some systemd services to test persistence flake = generate_flake( temporary_home, flake_template=CLAN_CORE / "templates" / "new-clan", - substitutions={ - "__CHANGE_ME__": "_test_vm_persistence", - "git+https://git.clan.lol/clan/clan-core": "path://" + str(CLAN_CORE), - }, machine_configs=dict( my_machine=dict( services=dict(getty=dict(autologinUser="root")), @@ -83,8 +168,7 @@ def test_vm_persistence( ) ) ), - # create test user - # TODO: test persisting files via that user + # create test user to test if state can be owned by user users=dict( users=dict( test=dict( @@ -94,6 +178,8 @@ def test_vm_persistence( root=dict(password="root"), ) ), + # create a systemd service to create a file in the state folder + # and another to read it after reboot systemd=dict( services=dict( create_state=dict( @@ -163,59 +249,22 @@ def test_vm_persistence( ) monkeypatch.chdir(flake.path) - state_dir = vm_state_dir("_test_vm_persistence", str(flake.path), "my_machine") - socket_file = state_dir / "qga.sock" + # the state dir is a point of reference for qemu interactions as it links to the qga/qmp sockets + state_dir = vm_state_dir(str(flake.path), "my_machine") - # wait until socket file exists - def connect() -> QgaSession: - while True: - if (state_dir / "qga.sock").exists(): - break - sleep(0.1) - return QgaSession(os.path.realpath(socket_file)) + run_vm_in_thread("my_machine") - # runs machine and prints exceptions - def run() -> None: - try: - Cli().run(["vms", "run", "my_machine"]) - except Exception: - # print exception details - print(traceback.format_exc()) - print(sys.exc_info()[2]) - - # run the machine in a separate thread - t = threading.Thread(target=run, name="run") - t.daemon = True - t.start() - - # wait for socket to be up - Path("/tmp/log").write_text(f"wait for socket to be up: {socket_file!s}") - while True: - if socket_file.exists(): - break - sleep(0.1) + # wait for the VM to start + wait_vm_up(state_dir) # wait for socket to be down (systemd service 'poweroff' rebooting machine) - Path("/tmp/log").write_text("wait for socket to be down") - while socket_file.exists(): - sleep(0.1) - Path("/tmp/log").write_text("socket is down") + wait_vm_down(state_dir) # start vm again - t = threading.Thread(target=run, name="run") - t.daemon = True - t.start() - - # wait for the socket to be up - Path("/tmp/log").write_text("wait for socket to be up second time") - while True: - if socket_file.exists(): - break - sleep(0.1) + run_vm_in_thread("my_machine") # connect second time - Path("/tmp/log").write_text("connecting") - qga = connect() + qga = qga_connect(state_dir) # ensure that either /var/lib/nixos or /etc gets persisted # (depending on if system.etc.overlay.enable is set or not) @@ -224,6 +273,7 @@ def test_vm_persistence( ) assert exitcode == 0, err + # ensure that the file created by the service is still there and has the expected content exitcode, out, err = qga.run("cat /var/my-state/test") assert exitcode == 0, err assert out == "dream2nix\n", out @@ -236,11 +286,8 @@ def test_vm_persistence( exitcode, out, err = qga.run( "systemctl --failed | tee /tmp/yolo | grep -q '0 loaded units listed' || ( cat /tmp/yolo && false )" ) - print(out) assert exitcode == 0, out - qmp = QEMUMonitorProtocol( - address=str(os.path.realpath(state_dir / "qmp.sock")), - ) - qmp.connect() - qmp.cmd_obj({"execute": "system_powerdown"}) + # use qmp to shutdown the machine (prevent zombie qemu processes) + qmp = qmp_connect(state_dir) + qmp.command("system_powerdown") diff --git a/pkgs/clan-vm-manager/clan_vm_manager/models/use_vms.py b/pkgs/clan-vm-manager/clan_vm_manager/models/use_vms.py index b03315af..ec6d2931 100644 --- a/pkgs/clan-vm-manager/clan_vm_manager/models/use_vms.py +++ b/pkgs/clan-vm-manager/clan_vm_manager/models/use_vms.py @@ -120,33 +120,34 @@ class VM(GObject.Object): self._finalizer = weakref.finalize(self, self.stop) self.connect("vm_status_changed", self._start_logs_task) - def __start(self) -> None: - if self.is_running(): - log.warn("VM is already running") - return - uri = ClanURI.from_str( url=self.data.flake.flake_url, flake_attr=self.data.flake.flake_attr ) - match uri.scheme: case ClanScheme.LOCAL.value(path): - machine = Machine( + self.machine = Machine( name=self.data.flake.flake_attr, flake=path, # type: ignore ) case ClanScheme.REMOTE.value(url): - machine = Machine( + self.machine = Machine( name=self.data.flake.flake_attr, flake=url, # type: ignore ) - vm = vms.run.inspect_vm(machine) + + def __start(self) -> None: + if self.is_running(): + log.warn("VM is already running") + return + vm = vms.run.inspect_vm(self.machine) self.process = spawn( on_except=None, log_dir=Path(str(self.log_dir.name)), func=vms.run.run_vm, vm=vm, ) + log.debug("Starting VM") + self.machine.qmp_connect() def start(self) -> None: if self.is_running(): @@ -212,7 +213,8 @@ class VM(GObject.Object): if not self.is_running(): return log.info(f"Stopping VM {self.get_id()}") - self.process.kill_group() + # TODO: add fallback to kill the process if the QMP command fails + self.machine.qmp_command("system_powerdown") def read_whole_log(self) -> str: if not self.process.out_file.exists():