1
0
forked from clan/clan-core

vms: init graceful shutdown for GUI

- add python modules for qemu protocols: QMP (hardware interactions) and QGA (guest service interaction)
- refactor state directory: remove name from path (already contains url)
- add impure vm test for basic qmp interaction
- simplify existing vm persistence test (factor out shared code)
- integrate graceful shutdown into GUI

the GUI integration still needs to be improved later:
- add fallback in case system doesn't react to powerdown button
- shutdown GUI switch fails if VM hasn't been started yet, and then remains in a wrong position
This commit is contained in:
DavHau 2024-02-09 19:46:32 +07:00
parent 6af8423f1e
commit 02dd132e08
12 changed files with 186 additions and 93 deletions

View File

@ -30,6 +30,13 @@ let
# required for issuing shell commands via qga # required for issuing shell commands via qga
services.qemuGuest.enable = true; services.qemuGuest.enable = true;
# required to react to system_powerdown qmp command
# Some desktop managers like xfce override the poweroff signal and therefore
# make it impossible to handle it via 'logind' directly.
services.acpid.enable = true;
services.acpid.handlers.power.event = "button/power.*";
services.acpid.handlers.power.action = "poweroff";
boot.initrd.systemd.enable = true; boot.initrd.systemd.enable = true;
# currently needed for system.etc.overlay.enable # currently needed for system.etc.overlay.enable

View File

@ -15,9 +15,12 @@ def find_git_repo_root() -> Path | None:
return find_toplevel([".git"]) return find_toplevel([".git"])
def clan_key_safe(clan_name: str, flake_url: str) -> str: def clan_key_safe(flake_url: str) -> str:
"""
only embed the url in the path, not the clan name, as it would involve eval.
"""
quoted_url = urllib.parse.quote_plus(flake_url) quoted_url = urllib.parse.quote_plus(flake_url)
return f"{clan_name}-{quoted_url}" return f"{quoted_url}"
def find_toplevel(top_level_files: list[str]) -> Path | None: def find_toplevel(top_level_files: list[str]) -> Path | None:
@ -69,10 +72,10 @@ def user_gcroot_dir() -> Path:
return p return p
def machine_gcroot(*, clan_name: str, flake_url: str) -> Path: def machine_gcroot(flake_url: str) -> Path:
# Always build icon so that we can symlink it to the gcroot # Always build icon so that we can symlink it to the gcroot
gcroot_dir = user_gcroot_dir() gcroot_dir = user_gcroot_dir()
clan_gcroot = gcroot_dir / clan_key_safe(clan_name, flake_url) clan_gcroot = gcroot_dir / clan_key_safe(flake_url)
clan_gcroot.mkdir(parents=True, exist_ok=True) clan_gcroot.mkdir(parents=True, exist_ok=True)
return clan_gcroot return clan_gcroot
@ -81,8 +84,8 @@ def user_history_file() -> Path:
return user_config_dir() / "clan" / "history" return user_config_dir() / "clan" / "history"
def vm_state_dir(clan_name: str, flake_url: str, vm_name: str) -> Path: def vm_state_dir(flake_url: str, vm_name: str) -> Path:
clan_key = clan_key_safe(clan_name, flake_url) clan_key = clan_key_safe(flake_url)
return user_data_dir() / "clan" / "vmstate" / clan_key / vm_name return user_data_dir() / "clan" / "vmstate" / clan_key / vm_name

View File

@ -50,10 +50,7 @@ def inspect_flake(flake_url: str | Path, machine_name: str) -> FlakeConfig:
# Make symlink to gcroots from vm.machine_icon # Make symlink to gcroots from vm.machine_icon
if vm.machine_icon: if vm.machine_icon:
gcroot_icon: Path = ( gcroot_icon: Path = machine_gcroot(flake_url=str(flake_url)) / vm.machine_name
machine_gcroot(clan_name=vm.clan_name, flake_url=str(flake_url))
/ vm.machine_name
)
nix_add_to_gcroots(vm.machine_icon, gcroot_icon) nix_add_to_gcroots(vm.machine_icon, gcroot_icon)
# Get the cLAN name # Get the cLAN name
@ -83,7 +80,7 @@ def inspect_flake(flake_url: str | Path, machine_name: str) -> FlakeConfig:
[ [
f'{flake_url}#clanInternals.machines."{system}"."{machine_name}".config.clanCore.clanIcon' f'{flake_url}#clanInternals.machines."{system}"."{machine_name}".config.clanCore.clanIcon'
], ],
machine_gcroot(clan_name=clan_name, flake_url=str(flake_url)) / "clanIcon", machine_gcroot(flake_url=str(flake_url)) / "clanIcon",
) )
run_cmd(cmd) run_cmd(cmd)

View File

@ -1,6 +1,11 @@
import json import json
import logging import logging
from os import path
from pathlib import Path from pathlib import Path
from time import sleep
from clan_cli.dirs import vm_state_dir
from qemu.qmp import QEMUMonitorProtocol
from ..cmd import run from ..cmd import run
from ..errors import ClanError from ..errors import ClanError
@ -30,6 +35,14 @@ class Machine:
self.build_cache: dict[str, Path] = {} self.build_cache: dict[str, Path] = {}
self._deployment_info: None | dict[str, str] = deployment_info self._deployment_info: None | dict[str, str] = deployment_info
state_dir = vm_state_dir(flake_url=str(self.flake), vm_name=self.name)
self.qmp_socket: Path = state_dir / "qmp.sock"
self.qga_socket: Path = state_dir / "qga.sock"
print(f"qmp_socket: {self.qmp_socket}")
self._qmp = QEMUMonitorProtocol(path.realpath(self.qmp_socket))
self._qmp_connected = False
def __str__(self) -> str: def __str__(self) -> str:
return f"Machine(name={self.name}, flake={self.flake})" return f"Machine(name={self.name}, flake={self.flake})"
@ -46,6 +59,28 @@ class Machine:
) )
return self._deployment_info return self._deployment_info
def qmp_connect(self) -> None:
    """Connect to the QMP socket, retrying while the socket file is missing.

    Does nothing if a connection has already been established. Otherwise
    the QMP client is re-created from the resolved socket path on every
    attempt, with a 0.1 second pause between attempts.

    Raises:
        FileNotFoundError: if the socket is still missing on the last attempt.
    """
    if self._qmp_connected:
        return

    tries = 100
    for num in range(tries):
        try:
            # the socket file link might be outdated, therefore re-init the qmp object
            self._qmp = QEMUMonitorProtocol(path.realpath(self.qmp_socket))
            self._qmp.connect()
        except FileNotFoundError:
            # derive the last attempt from `tries` so both stay in sync
            if num == tries - 1:
                raise
            sleep(0.1)
        else:
            self._qmp_connected = True
            log.debug("QMP Connected")
            return
def qmp_command(self, command: str) -> dict:
    """Send *command* over QMP and return the response.

    A connection is established first via ``qmp_connect`` if needed.
    """
    # make sure the monitor connection exists before issuing the command
    self.qmp_connect()
    response = self._qmp.command(command)
    return response
@property @property
def target_host_address(self) -> str: def target_host_address(self) -> str:
# deploymentAddress is deprecated. # deploymentAddress is deprecated.

View File

@ -159,8 +159,7 @@ def get_vm_create_info(
f'{clan_dir}#clanInternals.machines."{system}"."{machine.name}".config.system.clan.vm.create', f'{clan_dir}#clanInternals.machines."{system}"."{machine.name}".config.system.clan.vm.create',
*nix_options, *nix_options,
], ],
machine_gcroot(clan_name=vm.clan_name, flake_url=str(vm.flake_url)) machine_gcroot(flake_url=str(vm.flake_url)) / f"vm-{machine.name}",
/ f"vm-{machine.name}",
) )
proc = run( proc = run(
cmd, log=Log.BOTH, error_msg=f"Could not build vm config for {machine.name}" cmd, log=Log.BOTH, error_msg=f"Could not build vm config for {machine.name}"
@ -298,7 +297,7 @@ def run_vm(
secrets_dir = get_secrets(machine, tmpdir) secrets_dir = get_secrets(machine, tmpdir)
state_dir = vm_state_dir(vm.clan_name, str(vm.flake_url), machine.name) state_dir = vm_state_dir(str(vm.flake_url), machine.name)
state_dir.mkdir(parents=True, exist_ok=True) state_dir.mkdir(parents=True, exist_ok=True)
# specify socket files for qmp and qga # specify socket files for qmp and qga

View File

View File

@ -12,7 +12,7 @@ from time import sleep
class QgaSession: class QgaSession:
def __init__(self, socket_file: Path | str) -> None: def __init__(self, socket_file: Path | str) -> None:
self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
# try to reconnect a couple of times if connetion refused # try to reconnect a couple of times if connection refused
for _ in range(100): for _ in range(100):
try: try:
self.sock.connect(str(socket_file)) self.sock.connect(str(socket_file))

View File

@ -41,7 +41,10 @@ class FlakeForTest(NamedTuple):
def generate_flake( def generate_flake(
temporary_home: Path, temporary_home: Path,
flake_template: Path, flake_template: Path,
substitutions: dict[str, str] = {}, substitutions: dict[str, str] = {
"__CHANGE_ME__": "_test_vm_persistence",
"git+https://git.clan.lol/clan/clan-core": "path://" + str(CLAN_CORE),
},
# define the machines directly including their config # define the machines directly including their config
machine_configs: dict[str, dict] = {}, machine_configs: dict[str, dict] = {},
) -> FlakeForTest: ) -> FlakeForTest:

View File

@ -20,16 +20,16 @@ from clan_cli.dirs import clan_key_safe, vm_state_dir
def test_clan_key_safe() -> None: def test_clan_key_safe() -> None:
assert clan_key_safe("clan1", "/foo/bar") == "clan1-%2Ffoo%2Fbar" assert clan_key_safe("/foo/bar") == "%2Ffoo%2Fbar"
def test_vm_state_dir_identity() -> None: def test_vm_state_dir_identity() -> None:
dir1 = vm_state_dir("clan1", "https://some.clan", "vm1") dir1 = vm_state_dir("https://some.clan", "vm1")
dir2 = vm_state_dir("clan1", "https://some.clan", "vm1") dir2 = vm_state_dir("https://some.clan", "vm1")
assert str(dir1) == str(dir2) assert str(dir1) == str(dir2)
def test_vm_state_dir_no_collision() -> None: def test_vm_state_dir_no_collision() -> None:
dir1 = vm_state_dir("clan1", "/foo/bar", "vm1") dir1 = vm_state_dir("/foo/bar", "vm1")
dir2 = vm_state_dir("clan1", "https://some.clan", "vm1") dir2 = vm_state_dir("https://some.clan", "vm1")
assert str(dir1) != str(dir2) assert str(dir1) != str(dir2)

View File

@ -12,8 +12,8 @@ from fixtures_flakes import FlakeForTest, generate_flake
from root import CLAN_CORE from root import CLAN_CORE
from clan_cli.dirs import vm_state_dir from clan_cli.dirs import vm_state_dir
from clan_cli.qemu.qga import QgaSession from qemu.qga import QgaSession
from clan_cli.qemu.qmp import QEMUMonitorProtocol from qemu.qmp import QEMUMonitorProtocol
if TYPE_CHECKING: if TYPE_CHECKING:
from age_keys import KeyPair from age_keys import KeyPair
@ -21,6 +21,54 @@ if TYPE_CHECKING:
no_kvm = not os.path.exists("/dev/kvm") no_kvm = not os.path.exists("/dev/kvm")
def run_vm_in_thread(machine_name: str) -> None:
    """Run the CLI command ``vms run <machine_name>`` on a daemon thread.

    Exceptions raised by the CLI call are not propagated to the caller;
    their details are printed to stderr instead.
    """

    def run() -> None:
        try:
            Cli().run(["vms", "run", machine_name])
        except Exception:
            # report the failure on stderr, the thread itself ends quietly
            print(traceback.format_exc(), file=sys.stderr)
            print(sys.exc_info()[2], file=sys.stderr)

    # daemon thread: it must not keep the interpreter alive on exit
    threading.Thread(target=run, name="run", daemon=True).start()
# wait for qmp socket to exist
def wait_vm_up(state_dir: Path) -> None:
socket_file = state_dir / "qga.sock"
while True:
if socket_file.exists():
break
sleep(0.1)
# wait for vm to be down by checking if qga socket is down
def wait_vm_down(state_dir: Path) -> None:
socket_file = state_dir / "qga.sock"
while socket_file.exists():
sleep(0.1)
# block until the vm is up, then hand back a connected qmp instance
def qmp_connect(state_dir: Path) -> QEMUMonitorProtocol:
    """Wait for the VM to be up, then connect to ``qmp.sock`` in *state_dir*."""
    wait_vm_up(state_dir)
    # resolve the path in the state dir to the real socket location
    socket_path = str(os.path.realpath(state_dir / "qmp.sock"))
    monitor = QEMUMonitorProtocol(address=socket_path)
    monitor.connect()
    return monitor
# block until the vm is up, then hand back a qga session
def qga_connect(state_dir: Path) -> QgaSession:
    """Wait for the VM to be up, then open a session on ``qga.sock`` in *state_dir*."""
    wait_vm_up(state_dir)
    # resolve the path in the state dir to the real socket location
    socket_path = os.path.realpath(state_dir / "qga.sock")
    return QgaSession(socket_path)
@pytest.mark.impure @pytest.mark.impure
def test_inspect( def test_inspect(
test_flake_with_core: FlakeForTest, capsys: pytest.CaptureFixture test_flake_with_core: FlakeForTest, capsys: pytest.CaptureFixture
@ -55,19 +103,56 @@ def test_run(
@pytest.mark.skipif(no_kvm, reason="Requires KVM") @pytest.mark.skipif(no_kvm, reason="Requires KVM")
@pytest.mark.impure @pytest.mark.impure
def test_vm_persistence( def test_vm_qmp(
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,
temporary_home: Path, temporary_home: Path,
age_keys: list["KeyPair"],
) -> None: ) -> None:
monkeypatch.setenv("SOPS_AGE_KEY", age_keys[0].privkey) # set up a simple clan flake
flake = generate_flake(
temporary_home,
flake_template=CLAN_CORE / "templates" / "new-clan",
machine_configs=dict(
my_machine=dict(
clan=dict(
virtualisation=dict(graphics=False),
networking=dict(targetHost="client"),
),
services=dict(getty=dict(autologinUser="root")),
)
),
)
# 'clan vms run' must be executed from within the flake
monkeypatch.chdir(flake.path)
# the state dir is a point of reference for qemu interactions as it links to the qga/qmp sockets
state_dir = vm_state_dir(str(flake.path), "my_machine")
# start the VM
run_vm_in_thread("my_machine")
# connect with qmp
qmp = qmp_connect(state_dir)
# verify that issuing a command works
# result = qmp.cmd_obj({"execute": "query-status"})
result = qmp.command("query-status")
assert result["status"] == "running", result
# shutdown machine (prevent zombie qemu processes)
qmp.command("system_powerdown")
@pytest.mark.skipif(no_kvm, reason="Requires KVM")
@pytest.mark.impure
def test_vm_persistence(
monkeypatch: pytest.MonkeyPatch,
temporary_home: Path,
) -> None:
# set up a clan flake with some systemd services to test persistence
flake = generate_flake( flake = generate_flake(
temporary_home, temporary_home,
flake_template=CLAN_CORE / "templates" / "new-clan", flake_template=CLAN_CORE / "templates" / "new-clan",
substitutions={
"__CHANGE_ME__": "_test_vm_persistence",
"git+https://git.clan.lol/clan/clan-core": "path://" + str(CLAN_CORE),
},
machine_configs=dict( machine_configs=dict(
my_machine=dict( my_machine=dict(
services=dict(getty=dict(autologinUser="root")), services=dict(getty=dict(autologinUser="root")),
@ -83,8 +168,7 @@ def test_vm_persistence(
) )
) )
), ),
# create test user # create test user to test if state can be owned by user
# TODO: test persisting files via that user
users=dict( users=dict(
users=dict( users=dict(
test=dict( test=dict(
@ -94,6 +178,8 @@ def test_vm_persistence(
root=dict(password="root"), root=dict(password="root"),
) )
), ),
# create a systemd service to create a file in the state folder
# and another to read it after reboot
systemd=dict( systemd=dict(
services=dict( services=dict(
create_state=dict( create_state=dict(
@ -163,59 +249,22 @@ def test_vm_persistence(
) )
monkeypatch.chdir(flake.path) monkeypatch.chdir(flake.path)
state_dir = vm_state_dir("_test_vm_persistence", str(flake.path), "my_machine") # the state dir is a point of reference for qemu interactions as it links to the qga/qmp sockets
socket_file = state_dir / "qga.sock" state_dir = vm_state_dir(str(flake.path), "my_machine")
# wait until socket file exists run_vm_in_thread("my_machine")
def connect() -> QgaSession:
while True:
if (state_dir / "qga.sock").exists():
break
sleep(0.1)
return QgaSession(os.path.realpath(socket_file))
# runs machine and prints exceptions # wait for the VM to start
def run() -> None: wait_vm_up(state_dir)
try:
Cli().run(["vms", "run", "my_machine"])
except Exception:
# print exception details
print(traceback.format_exc())
print(sys.exc_info()[2])
# run the machine in a separate thread
t = threading.Thread(target=run, name="run")
t.daemon = True
t.start()
# wait for socket to be up
Path("/tmp/log").write_text(f"wait for socket to be up: {socket_file!s}")
while True:
if socket_file.exists():
break
sleep(0.1)
# wait for socket to be down (systemd service 'poweroff' rebooting machine) # wait for socket to be down (systemd service 'poweroff' rebooting machine)
Path("/tmp/log").write_text("wait for socket to be down") wait_vm_down(state_dir)
while socket_file.exists():
sleep(0.1)
Path("/tmp/log").write_text("socket is down")
# start vm again # start vm again
t = threading.Thread(target=run, name="run") run_vm_in_thread("my_machine")
t.daemon = True
t.start()
# wait for the socket to be up
Path("/tmp/log").write_text("wait for socket to be up second time")
while True:
if socket_file.exists():
break
sleep(0.1)
# connect second time # connect second time
Path("/tmp/log").write_text("connecting") qga = qga_connect(state_dir)
qga = connect()
# ensure that either /var/lib/nixos or /etc gets persisted # ensure that either /var/lib/nixos or /etc gets persisted
# (depending on if system.etc.overlay.enable is set or not) # (depending on if system.etc.overlay.enable is set or not)
@ -224,6 +273,7 @@ def test_vm_persistence(
) )
assert exitcode == 0, err assert exitcode == 0, err
# ensure that the file created by the service is still there and has the expected content
exitcode, out, err = qga.run("cat /var/my-state/test") exitcode, out, err = qga.run("cat /var/my-state/test")
assert exitcode == 0, err assert exitcode == 0, err
assert out == "dream2nix\n", out assert out == "dream2nix\n", out
@ -236,11 +286,8 @@ def test_vm_persistence(
exitcode, out, err = qga.run( exitcode, out, err = qga.run(
"systemctl --failed | tee /tmp/yolo | grep -q '0 loaded units listed' || ( cat /tmp/yolo && false )" "systemctl --failed | tee /tmp/yolo | grep -q '0 loaded units listed' || ( cat /tmp/yolo && false )"
) )
print(out)
assert exitcode == 0, out assert exitcode == 0, out
qmp = QEMUMonitorProtocol( # use qmp to shutdown the machine (prevent zombie qemu processes)
address=str(os.path.realpath(state_dir / "qmp.sock")), qmp = qmp_connect(state_dir)
) qmp.command("system_powerdown")
qmp.connect()
qmp.cmd_obj({"execute": "system_powerdown"})

View File

@ -120,33 +120,34 @@ class VM(GObject.Object):
self._finalizer = weakref.finalize(self, self.stop) self._finalizer = weakref.finalize(self, self.stop)
self.connect("vm_status_changed", self._start_logs_task) self.connect("vm_status_changed", self._start_logs_task)
def __start(self) -> None:
if self.is_running():
log.warn("VM is already running")
return
uri = ClanURI.from_str( uri = ClanURI.from_str(
url=self.data.flake.flake_url, flake_attr=self.data.flake.flake_attr url=self.data.flake.flake_url, flake_attr=self.data.flake.flake_attr
) )
match uri.scheme: match uri.scheme:
case ClanScheme.LOCAL.value(path): case ClanScheme.LOCAL.value(path):
machine = Machine( self.machine = Machine(
name=self.data.flake.flake_attr, name=self.data.flake.flake_attr,
flake=path, # type: ignore flake=path, # type: ignore
) )
case ClanScheme.REMOTE.value(url): case ClanScheme.REMOTE.value(url):
machine = Machine( self.machine = Machine(
name=self.data.flake.flake_attr, name=self.data.flake.flake_attr,
flake=url, # type: ignore flake=url, # type: ignore
) )
vm = vms.run.inspect_vm(machine)
def __start(self) -> None:
if self.is_running():
log.warn("VM is already running")
return
vm = vms.run.inspect_vm(self.machine)
self.process = spawn( self.process = spawn(
on_except=None, on_except=None,
log_dir=Path(str(self.log_dir.name)), log_dir=Path(str(self.log_dir.name)),
func=vms.run.run_vm, func=vms.run.run_vm,
vm=vm, vm=vm,
) )
log.debug("Starting VM")
self.machine.qmp_connect()
def start(self) -> None: def start(self) -> None:
if self.is_running(): if self.is_running():
@ -212,7 +213,8 @@ class VM(GObject.Object):
if not self.is_running(): if not self.is_running():
return return
log.info(f"Stopping VM {self.get_id()}") log.info(f"Stopping VM {self.get_id()}")
self.process.kill_group() # TODO: add fallback to kill the process if the QMP command fails
self.machine.qmp_command("system_powerdown")
def read_whole_log(self) -> str: def read_whole_log(self) -> str:
if not self.process.out_file.exists(): if not self.process.out_file.exists():