Merge pull request 'vms: init graceful shutdown for GUI' (#833) from DavHau-dave into main
All checks were successful
checks / test (push) Successful in 31s
checks-impure / test (push) Successful in 1m40s

This commit is contained in:
clan-bot 2024-02-09 12:58:52 +00:00
commit cc8d6b281b
12 changed files with 186 additions and 93 deletions

View File

@ -30,6 +30,13 @@ let
# required for issuing shell commands via qga
services.qemuGuest.enable = true;
# required to react to system_powerdown qmp command
# Some desktop managers like xfce override the poweroff signal and therefore
# make it impossible to handle it via 'logind' diretly.
services.acpid.enable = true;
services.acpid.handlers.power.event = "button/power.*";
services.acpid.handlers.power.action = "poweroff";
boot.initrd.systemd.enable = true;
# currently needed for system.etc.overlay.enable

View File

@ -15,9 +15,12 @@ def find_git_repo_root() -> Path | None:
return find_toplevel([".git"])
def clan_key_safe(clan_name: str, flake_url: str) -> str:
def clan_key_safe(flake_url: str) -> str:
"""
only embed the url in the path, not the clan name, as it would involve eval.
"""
quoted_url = urllib.parse.quote_plus(flake_url)
return f"{clan_name}-{quoted_url}"
return f"{quoted_url}"
def find_toplevel(top_level_files: list[str]) -> Path | None:
@ -69,10 +72,10 @@ def user_gcroot_dir() -> Path:
return p
def machine_gcroot(*, clan_name: str, flake_url: str) -> Path:
def machine_gcroot(flake_url: str) -> Path:
# Always build icon so that we can symlink it to the gcroot
gcroot_dir = user_gcroot_dir()
clan_gcroot = gcroot_dir / clan_key_safe(clan_name, flake_url)
clan_gcroot = gcroot_dir / clan_key_safe(flake_url)
clan_gcroot.mkdir(parents=True, exist_ok=True)
return clan_gcroot
@ -81,8 +84,8 @@ def user_history_file() -> Path:
return user_config_dir() / "clan" / "history"
def vm_state_dir(clan_name: str, flake_url: str, vm_name: str) -> Path:
clan_key = clan_key_safe(clan_name, flake_url)
def vm_state_dir(flake_url: str, vm_name: str) -> Path:
clan_key = clan_key_safe(flake_url)
return user_data_dir() / "clan" / "vmstate" / clan_key / vm_name

View File

@ -50,10 +50,7 @@ def inspect_flake(flake_url: str | Path, machine_name: str) -> FlakeConfig:
# Make symlink to gcroots from vm.machine_icon
if vm.machine_icon:
gcroot_icon: Path = (
machine_gcroot(clan_name=vm.clan_name, flake_url=str(flake_url))
/ vm.machine_name
)
gcroot_icon: Path = machine_gcroot(flake_url=str(flake_url)) / vm.machine_name
nix_add_to_gcroots(vm.machine_icon, gcroot_icon)
# Get the cLAN name
@ -83,7 +80,7 @@ def inspect_flake(flake_url: str | Path, machine_name: str) -> FlakeConfig:
[
f'{flake_url}#clanInternals.machines."{system}"."{machine_name}".config.clanCore.clanIcon'
],
machine_gcroot(clan_name=clan_name, flake_url=str(flake_url)) / "clanIcon",
machine_gcroot(flake_url=str(flake_url)) / "clanIcon",
)
run_cmd(cmd)

View File

@ -1,6 +1,11 @@
import json
import logging
from os import path
from pathlib import Path
from time import sleep
from clan_cli.dirs import vm_state_dir
from qemu.qmp import QEMUMonitorProtocol
from ..cmd import run
from ..errors import ClanError
@ -30,6 +35,14 @@ class Machine:
self.build_cache: dict[str, Path] = {}
self._deployment_info: None | dict[str, str] = deployment_info
state_dir = vm_state_dir(flake_url=str(self.flake), vm_name=self.name)
self.qmp_socket: Path = state_dir / "qmp.sock"
self.qga_socket: Path = state_dir / "qga.sock"
print(f"qmp_socket: {self.qmp_socket}")
self._qmp = QEMUMonitorProtocol(path.realpath(self.qmp_socket))
self._qmp_connected = False
def __str__(self) -> str:
return f"Machine(name={self.name}, flake={self.flake})"
@ -46,6 +59,28 @@ class Machine:
)
return self._deployment_info
def qmp_connect(self) -> None:
if not self._qmp_connected:
tries = 100
for num in range(tries):
try:
# the socket file link might be outdated, therefore re-init the qmp object
self._qmp = QEMUMonitorProtocol(path.realpath(self.qmp_socket))
self._qmp.connect()
self._qmp_connected = True
log.debug("QMP Connected")
return
except FileNotFoundError:
if num < 99:
sleep(0.1)
continue
else:
raise
def qmp_command(self, command: str) -> dict:
self.qmp_connect()
return self._qmp.command(command)
@property
def target_host_address(self) -> str:
# deploymentAddress is deprecated.

View File

@ -159,8 +159,7 @@ def get_vm_create_info(
f'{clan_dir}#clanInternals.machines."{system}"."{machine.name}".config.system.clan.vm.create',
*nix_options,
],
machine_gcroot(clan_name=vm.clan_name, flake_url=str(vm.flake_url))
/ f"vm-{machine.name}",
machine_gcroot(flake_url=str(vm.flake_url)) / f"vm-{machine.name}",
)
proc = run(
cmd, log=Log.BOTH, error_msg=f"Could not build vm config for {machine.name}"
@ -298,7 +297,7 @@ def run_vm(
secrets_dir = get_secrets(machine, tmpdir)
state_dir = vm_state_dir(vm.clan_name, str(vm.flake_url), machine.name)
state_dir = vm_state_dir(str(vm.flake_url), machine.name)
state_dir.mkdir(parents=True, exist_ok=True)
# specify socket files for qmp and qga

View File

View File

@ -12,7 +12,7 @@ from time import sleep
class QgaSession:
def __init__(self, socket_file: Path | str) -> None:
self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
# try to reconnect a couple of times if connetion refused
# try to reconnect a couple of times if connection refused
for _ in range(100):
try:
self.sock.connect(str(socket_file))

View File

@ -41,7 +41,10 @@ class FlakeForTest(NamedTuple):
def generate_flake(
temporary_home: Path,
flake_template: Path,
substitutions: dict[str, str] = {},
substitutions: dict[str, str] = {
"__CHANGE_ME__": "_test_vm_persistence",
"git+https://git.clan.lol/clan/clan-core": "path://" + str(CLAN_CORE),
},
# define the machines directly including their config
machine_configs: dict[str, dict] = {},
) -> FlakeForTest:

View File

@ -20,16 +20,16 @@ from clan_cli.dirs import clan_key_safe, vm_state_dir
def test_clan_key_safe() -> None:
assert clan_key_safe("clan1", "/foo/bar") == "clan1-%2Ffoo%2Fbar"
assert clan_key_safe("/foo/bar") == "%2Ffoo%2Fbar"
def test_vm_state_dir_identity() -> None:
dir1 = vm_state_dir("clan1", "https://some.clan", "vm1")
dir2 = vm_state_dir("clan1", "https://some.clan", "vm1")
dir1 = vm_state_dir("https://some.clan", "vm1")
dir2 = vm_state_dir("https://some.clan", "vm1")
assert str(dir1) == str(dir2)
def test_vm_state_dir_no_collision() -> None:
dir1 = vm_state_dir("clan1", "/foo/bar", "vm1")
dir2 = vm_state_dir("clan1", "https://some.clan", "vm1")
dir1 = vm_state_dir("/foo/bar", "vm1")
dir2 = vm_state_dir("https://some.clan", "vm1")
assert str(dir1) != str(dir2)

View File

@ -12,8 +12,8 @@ from fixtures_flakes import FlakeForTest, generate_flake
from root import CLAN_CORE
from clan_cli.dirs import vm_state_dir
from clan_cli.qemu.qga import QgaSession
from clan_cli.qemu.qmp import QEMUMonitorProtocol
from qemu.qga import QgaSession
from qemu.qmp import QEMUMonitorProtocol
if TYPE_CHECKING:
from age_keys import KeyPair
@ -21,6 +21,54 @@ if TYPE_CHECKING:
no_kvm = not os.path.exists("/dev/kvm")
def run_vm_in_thread(machine_name: str) -> None:
# runs machine and prints exceptions
def run() -> None:
try:
Cli().run(["vms", "run", machine_name])
except Exception:
# print exception details
print(traceback.format_exc(), file=sys.stderr)
print(sys.exc_info()[2], file=sys.stderr)
# run the machine in a separate thread
t = threading.Thread(target=run, name="run")
t.daemon = True
t.start()
# wait for qmp socket to exist
def wait_vm_up(state_dir: Path) -> None:
socket_file = state_dir / "qga.sock"
while True:
if socket_file.exists():
break
sleep(0.1)
# wait for vm to be down by checking if qga socket is down
def wait_vm_down(state_dir: Path) -> None:
socket_file = state_dir / "qga.sock"
while socket_file.exists():
sleep(0.1)
# wait for vm to be up then connect and return qmp instance
def qmp_connect(state_dir: Path) -> QEMUMonitorProtocol:
wait_vm_up(state_dir)
qmp = QEMUMonitorProtocol(
address=str(os.path.realpath(state_dir / "qmp.sock")),
)
qmp.connect()
return qmp
# wait for vm to be up then connect and return qga instance
def qga_connect(state_dir: Path) -> QgaSession:
wait_vm_up(state_dir)
return QgaSession(os.path.realpath(state_dir / "qga.sock"))
@pytest.mark.impure
def test_inspect(
test_flake_with_core: FlakeForTest, capsys: pytest.CaptureFixture
@ -55,19 +103,56 @@ def test_run(
@pytest.mark.skipif(no_kvm, reason="Requires KVM")
@pytest.mark.impure
def test_vm_persistence(
def test_vm_qmp(
monkeypatch: pytest.MonkeyPatch,
temporary_home: Path,
age_keys: list["KeyPair"],
) -> None:
monkeypatch.setenv("SOPS_AGE_KEY", age_keys[0].privkey)
# set up a simple clan flake
flake = generate_flake(
temporary_home,
flake_template=CLAN_CORE / "templates" / "new-clan",
machine_configs=dict(
my_machine=dict(
clan=dict(
virtualisation=dict(graphics=False),
networking=dict(targetHost="client"),
),
services=dict(getty=dict(autologinUser="root")),
)
),
)
# 'clan vms run' must be executed from within the flake
monkeypatch.chdir(flake.path)
# the state dir is a point of reference for qemu interactions as it links to the qga/qmp sockets
state_dir = vm_state_dir(str(flake.path), "my_machine")
# start the VM
run_vm_in_thread("my_machine")
# connect with qmp
qmp = qmp_connect(state_dir)
# verify that issuing a command works
# result = qmp.cmd_obj({"execute": "query-status"})
result = qmp.command("query-status")
assert result["status"] == "running", result
# shutdown machine (prevent zombie qemu processes)
qmp.command("system_powerdown")
@pytest.mark.skipif(no_kvm, reason="Requires KVM")
@pytest.mark.impure
def test_vm_persistence(
monkeypatch: pytest.MonkeyPatch,
temporary_home: Path,
) -> None:
# set up a clan flake with some systemd services to test persistence
flake = generate_flake(
temporary_home,
flake_template=CLAN_CORE / "templates" / "new-clan",
substitutions={
"__CHANGE_ME__": "_test_vm_persistence",
"git+https://git.clan.lol/clan/clan-core": "path://" + str(CLAN_CORE),
},
machine_configs=dict(
my_machine=dict(
services=dict(getty=dict(autologinUser="root")),
@ -83,8 +168,7 @@ def test_vm_persistence(
)
)
),
# create test user
# TODO: test persisting files via that user
# create test user to test if state can be owned by user
users=dict(
users=dict(
test=dict(
@ -94,6 +178,8 @@ def test_vm_persistence(
root=dict(password="root"),
)
),
# create a systemd service to create a file in the state folder
# and another to read it after reboot
systemd=dict(
services=dict(
create_state=dict(
@ -163,59 +249,22 @@ def test_vm_persistence(
)
monkeypatch.chdir(flake.path)
state_dir = vm_state_dir("_test_vm_persistence", str(flake.path), "my_machine")
socket_file = state_dir / "qga.sock"
# the state dir is a point of reference for qemu interactions as it links to the qga/qmp sockets
state_dir = vm_state_dir(str(flake.path), "my_machine")
# wait until socket file exists
def connect() -> QgaSession:
while True:
if (state_dir / "qga.sock").exists():
break
sleep(0.1)
return QgaSession(os.path.realpath(socket_file))
run_vm_in_thread("my_machine")
# runs machine and prints exceptions
def run() -> None:
try:
Cli().run(["vms", "run", "my_machine"])
except Exception:
# print exception details
print(traceback.format_exc())
print(sys.exc_info()[2])
# run the machine in a separate thread
t = threading.Thread(target=run, name="run")
t.daemon = True
t.start()
# wait for socket to be up
Path("/tmp/log").write_text(f"wait for socket to be up: {socket_file!s}")
while True:
if socket_file.exists():
break
sleep(0.1)
# wait for the VM to start
wait_vm_up(state_dir)
# wait for socket to be down (systemd service 'poweroff' rebooting machine)
Path("/tmp/log").write_text("wait for socket to be down")
while socket_file.exists():
sleep(0.1)
Path("/tmp/log").write_text("socket is down")
wait_vm_down(state_dir)
# start vm again
t = threading.Thread(target=run, name="run")
t.daemon = True
t.start()
# wait for the socket to be up
Path("/tmp/log").write_text("wait for socket to be up second time")
while True:
if socket_file.exists():
break
sleep(0.1)
run_vm_in_thread("my_machine")
# connect second time
Path("/tmp/log").write_text("connecting")
qga = connect()
qga = qga_connect(state_dir)
# ensure that either /var/lib/nixos or /etc gets persisted
# (depending on if system.etc.overlay.enable is set or not)
@ -224,6 +273,7 @@ def test_vm_persistence(
)
assert exitcode == 0, err
# ensure that the file created by the service is still there and has the expected content
exitcode, out, err = qga.run("cat /var/my-state/test")
assert exitcode == 0, err
assert out == "dream2nix\n", out
@ -236,11 +286,8 @@ def test_vm_persistence(
exitcode, out, err = qga.run(
"systemctl --failed | tee /tmp/yolo | grep -q '0 loaded units listed' || ( cat /tmp/yolo && false )"
)
print(out)
assert exitcode == 0, out
qmp = QEMUMonitorProtocol(
address=str(os.path.realpath(state_dir / "qmp.sock")),
)
qmp.connect()
qmp.cmd_obj({"execute": "system_powerdown"})
# use qmp to shutdown the machine (prevent zombie qemu processes)
qmp = qmp_connect(state_dir)
qmp.command("system_powerdown")

View File

@ -120,33 +120,34 @@ class VM(GObject.Object):
self._finalizer = weakref.finalize(self, self.stop)
self.connect("vm_status_changed", self._start_logs_task)
def __start(self) -> None:
if self.is_running():
log.warn("VM is already running")
return
uri = ClanURI.from_str(
url=self.data.flake.flake_url, flake_attr=self.data.flake.flake_attr
)
match uri.scheme:
case ClanScheme.LOCAL.value(path):
machine = Machine(
self.machine = Machine(
name=self.data.flake.flake_attr,
flake=path, # type: ignore
)
case ClanScheme.REMOTE.value(url):
machine = Machine(
self.machine = Machine(
name=self.data.flake.flake_attr,
flake=url, # type: ignore
)
vm = vms.run.inspect_vm(machine)
def __start(self) -> None:
if self.is_running():
log.warn("VM is already running")
return
vm = vms.run.inspect_vm(self.machine)
self.process = spawn(
on_except=None,
log_dir=Path(str(self.log_dir.name)),
func=vms.run.run_vm,
vm=vm,
)
log.debug("Starting VM")
self.machine.qmp_connect()
def start(self) -> None:
if self.is_running():
@ -212,7 +213,8 @@ class VM(GObject.Object):
if not self.is_running():
return
log.info(f"Stopping VM {self.get_id()}")
self.process.kill_group()
# TODO: add fallback to kill the process if the QMP command fails
self.machine.qmp_command("system_powerdown")
def read_whole_log(self) -> str:
if not self.process.out_file.exists():