vms: init graceful shutdown for GUI
- add python modules for qemu protocols: QMP (hardware interactions) and QGA (guest service interaction) - refactor state directory: remove name from path (already contains url) - add impure vm test for basic qmp interaction - simplify existing vm persistance test (factor out shared code) - integrate graceful shutdown into GUI the GUI integration still needs to be improved later: - add fallback in case system doesn't react to powerdown button - shutdown GUI switch fails if VM hasn't been started yet, and then remains in a wrong position
This commit is contained in:
parent
6af8423f1e
commit
02dd132e08
|
@ -30,6 +30,13 @@ let
|
|||
# required for issuing shell commands via qga
|
||||
services.qemuGuest.enable = true;
|
||||
|
||||
# required to react to system_powerdown qmp command
|
||||
# Some desktop managers like xfce override the poweroff signal and therefore
|
||||
# make it impossible to handle it via 'logind' diretly.
|
||||
services.acpid.enable = true;
|
||||
services.acpid.handlers.power.event = "button/power.*";
|
||||
services.acpid.handlers.power.action = "poweroff";
|
||||
|
||||
boot.initrd.systemd.enable = true;
|
||||
|
||||
# currently needed for system.etc.overlay.enable
|
||||
|
|
|
@ -15,9 +15,12 @@ def find_git_repo_root() -> Path | None:
|
|||
return find_toplevel([".git"])
|
||||
|
||||
|
||||
def clan_key_safe(clan_name: str, flake_url: str) -> str:
|
||||
def clan_key_safe(flake_url: str) -> str:
|
||||
"""
|
||||
only embed the url in the path, not the clan name, as it would involve eval.
|
||||
"""
|
||||
quoted_url = urllib.parse.quote_plus(flake_url)
|
||||
return f"{clan_name}-{quoted_url}"
|
||||
return f"{quoted_url}"
|
||||
|
||||
|
||||
def find_toplevel(top_level_files: list[str]) -> Path | None:
|
||||
|
@ -69,10 +72,10 @@ def user_gcroot_dir() -> Path:
|
|||
return p
|
||||
|
||||
|
||||
def machine_gcroot(*, clan_name: str, flake_url: str) -> Path:
|
||||
def machine_gcroot(flake_url: str) -> Path:
|
||||
# Always build icon so that we can symlink it to the gcroot
|
||||
gcroot_dir = user_gcroot_dir()
|
||||
clan_gcroot = gcroot_dir / clan_key_safe(clan_name, flake_url)
|
||||
clan_gcroot = gcroot_dir / clan_key_safe(flake_url)
|
||||
clan_gcroot.mkdir(parents=True, exist_ok=True)
|
||||
return clan_gcroot
|
||||
|
||||
|
@ -81,8 +84,8 @@ def user_history_file() -> Path:
|
|||
return user_config_dir() / "clan" / "history"
|
||||
|
||||
|
||||
def vm_state_dir(clan_name: str, flake_url: str, vm_name: str) -> Path:
|
||||
clan_key = clan_key_safe(clan_name, flake_url)
|
||||
def vm_state_dir(flake_url: str, vm_name: str) -> Path:
|
||||
clan_key = clan_key_safe(flake_url)
|
||||
return user_data_dir() / "clan" / "vmstate" / clan_key / vm_name
|
||||
|
||||
|
||||
|
|
|
@ -50,10 +50,7 @@ def inspect_flake(flake_url: str | Path, machine_name: str) -> FlakeConfig:
|
|||
|
||||
# Make symlink to gcroots from vm.machine_icon
|
||||
if vm.machine_icon:
|
||||
gcroot_icon: Path = (
|
||||
machine_gcroot(clan_name=vm.clan_name, flake_url=str(flake_url))
|
||||
/ vm.machine_name
|
||||
)
|
||||
gcroot_icon: Path = machine_gcroot(flake_url=str(flake_url)) / vm.machine_name
|
||||
nix_add_to_gcroots(vm.machine_icon, gcroot_icon)
|
||||
|
||||
# Get the cLAN name
|
||||
|
@ -83,7 +80,7 @@ def inspect_flake(flake_url: str | Path, machine_name: str) -> FlakeConfig:
|
|||
[
|
||||
f'{flake_url}#clanInternals.machines."{system}"."{machine_name}".config.clanCore.clanIcon'
|
||||
],
|
||||
machine_gcroot(clan_name=clan_name, flake_url=str(flake_url)) / "clanIcon",
|
||||
machine_gcroot(flake_url=str(flake_url)) / "clanIcon",
|
||||
)
|
||||
run_cmd(cmd)
|
||||
|
||||
|
|
|
@ -1,6 +1,11 @@
|
|||
import json
|
||||
import logging
|
||||
from os import path
|
||||
from pathlib import Path
|
||||
from time import sleep
|
||||
|
||||
from clan_cli.dirs import vm_state_dir
|
||||
from qemu.qmp import QEMUMonitorProtocol
|
||||
|
||||
from ..cmd import run
|
||||
from ..errors import ClanError
|
||||
|
@ -30,6 +35,14 @@ class Machine:
|
|||
self.build_cache: dict[str, Path] = {}
|
||||
|
||||
self._deployment_info: None | dict[str, str] = deployment_info
|
||||
state_dir = vm_state_dir(flake_url=str(self.flake), vm_name=self.name)
|
||||
|
||||
self.qmp_socket: Path = state_dir / "qmp.sock"
|
||||
self.qga_socket: Path = state_dir / "qga.sock"
|
||||
|
||||
print(f"qmp_socket: {self.qmp_socket}")
|
||||
self._qmp = QEMUMonitorProtocol(path.realpath(self.qmp_socket))
|
||||
self._qmp_connected = False
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f"Machine(name={self.name}, flake={self.flake})"
|
||||
|
@ -46,6 +59,28 @@ class Machine:
|
|||
)
|
||||
return self._deployment_info
|
||||
|
||||
def qmp_connect(self) -> None:
|
||||
if not self._qmp_connected:
|
||||
tries = 100
|
||||
for num in range(tries):
|
||||
try:
|
||||
# the socket file link might be outdated, therefore re-init the qmp object
|
||||
self._qmp = QEMUMonitorProtocol(path.realpath(self.qmp_socket))
|
||||
self._qmp.connect()
|
||||
self._qmp_connected = True
|
||||
log.debug("QMP Connected")
|
||||
return
|
||||
except FileNotFoundError:
|
||||
if num < 99:
|
||||
sleep(0.1)
|
||||
continue
|
||||
else:
|
||||
raise
|
||||
|
||||
def qmp_command(self, command: str) -> dict:
|
||||
self.qmp_connect()
|
||||
return self._qmp.command(command)
|
||||
|
||||
@property
|
||||
def target_host_address(self) -> str:
|
||||
# deploymentAddress is deprecated.
|
||||
|
|
|
@ -159,8 +159,7 @@ def get_vm_create_info(
|
|||
f'{clan_dir}#clanInternals.machines."{system}"."{machine.name}".config.system.clan.vm.create',
|
||||
*nix_options,
|
||||
],
|
||||
machine_gcroot(clan_name=vm.clan_name, flake_url=str(vm.flake_url))
|
||||
/ f"vm-{machine.name}",
|
||||
machine_gcroot(flake_url=str(vm.flake_url)) / f"vm-{machine.name}",
|
||||
)
|
||||
proc = run(
|
||||
cmd, log=Log.BOTH, error_msg=f"Could not build vm config for {machine.name}"
|
||||
|
@ -298,7 +297,7 @@ def run_vm(
|
|||
|
||||
secrets_dir = get_secrets(machine, tmpdir)
|
||||
|
||||
state_dir = vm_state_dir(vm.clan_name, str(vm.flake_url), machine.name)
|
||||
state_dir = vm_state_dir(str(vm.flake_url), machine.name)
|
||||
state_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# specify socket files for qmp and qga
|
||||
|
|
0
pkgs/clan-cli/qemu/__init__.py
Normal file
0
pkgs/clan-cli/qemu/__init__.py
Normal file
|
@ -12,7 +12,7 @@ from time import sleep
|
|||
class QgaSession:
|
||||
def __init__(self, socket_file: Path | str) -> None:
|
||||
self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
||||
# try to reconnect a couple of times if connetion refused
|
||||
# try to reconnect a couple of times if connection refused
|
||||
for _ in range(100):
|
||||
try:
|
||||
self.sock.connect(str(socket_file))
|
|
@ -41,7 +41,10 @@ class FlakeForTest(NamedTuple):
|
|||
def generate_flake(
|
||||
temporary_home: Path,
|
||||
flake_template: Path,
|
||||
substitutions: dict[str, str] = {},
|
||||
substitutions: dict[str, str] = {
|
||||
"__CHANGE_ME__": "_test_vm_persistence",
|
||||
"git+https://git.clan.lol/clan/clan-core": "path://" + str(CLAN_CORE),
|
||||
},
|
||||
# define the machines directly including their config
|
||||
machine_configs: dict[str, dict] = {},
|
||||
) -> FlakeForTest:
|
||||
|
|
|
@ -20,16 +20,16 @@ from clan_cli.dirs import clan_key_safe, vm_state_dir
|
|||
|
||||
|
||||
def test_clan_key_safe() -> None:
|
||||
assert clan_key_safe("clan1", "/foo/bar") == "clan1-%2Ffoo%2Fbar"
|
||||
assert clan_key_safe("/foo/bar") == "%2Ffoo%2Fbar"
|
||||
|
||||
|
||||
def test_vm_state_dir_identity() -> None:
|
||||
dir1 = vm_state_dir("clan1", "https://some.clan", "vm1")
|
||||
dir2 = vm_state_dir("clan1", "https://some.clan", "vm1")
|
||||
dir1 = vm_state_dir("https://some.clan", "vm1")
|
||||
dir2 = vm_state_dir("https://some.clan", "vm1")
|
||||
assert str(dir1) == str(dir2)
|
||||
|
||||
|
||||
def test_vm_state_dir_no_collision() -> None:
|
||||
dir1 = vm_state_dir("clan1", "/foo/bar", "vm1")
|
||||
dir2 = vm_state_dir("clan1", "https://some.clan", "vm1")
|
||||
dir1 = vm_state_dir("/foo/bar", "vm1")
|
||||
dir2 = vm_state_dir("https://some.clan", "vm1")
|
||||
assert str(dir1) != str(dir2)
|
||||
|
|
|
@ -12,8 +12,8 @@ from fixtures_flakes import FlakeForTest, generate_flake
|
|||
from root import CLAN_CORE
|
||||
|
||||
from clan_cli.dirs import vm_state_dir
|
||||
from clan_cli.qemu.qga import QgaSession
|
||||
from clan_cli.qemu.qmp import QEMUMonitorProtocol
|
||||
from qemu.qga import QgaSession
|
||||
from qemu.qmp import QEMUMonitorProtocol
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from age_keys import KeyPair
|
||||
|
@ -21,6 +21,54 @@ if TYPE_CHECKING:
|
|||
no_kvm = not os.path.exists("/dev/kvm")
|
||||
|
||||
|
||||
def run_vm_in_thread(machine_name: str) -> None:
|
||||
# runs machine and prints exceptions
|
||||
def run() -> None:
|
||||
try:
|
||||
Cli().run(["vms", "run", machine_name])
|
||||
except Exception:
|
||||
# print exception details
|
||||
print(traceback.format_exc(), file=sys.stderr)
|
||||
print(sys.exc_info()[2], file=sys.stderr)
|
||||
|
||||
# run the machine in a separate thread
|
||||
t = threading.Thread(target=run, name="run")
|
||||
t.daemon = True
|
||||
t.start()
|
||||
|
||||
|
||||
# wait for qmp socket to exist
|
||||
def wait_vm_up(state_dir: Path) -> None:
|
||||
socket_file = state_dir / "qga.sock"
|
||||
while True:
|
||||
if socket_file.exists():
|
||||
break
|
||||
sleep(0.1)
|
||||
|
||||
|
||||
# wait for vm to be down by checking if qga socket is down
|
||||
def wait_vm_down(state_dir: Path) -> None:
|
||||
socket_file = state_dir / "qga.sock"
|
||||
while socket_file.exists():
|
||||
sleep(0.1)
|
||||
|
||||
|
||||
# wait for vm to be up then connect and return qmp instance
|
||||
def qmp_connect(state_dir: Path) -> QEMUMonitorProtocol:
|
||||
wait_vm_up(state_dir)
|
||||
qmp = QEMUMonitorProtocol(
|
||||
address=str(os.path.realpath(state_dir / "qmp.sock")),
|
||||
)
|
||||
qmp.connect()
|
||||
return qmp
|
||||
|
||||
|
||||
# wait for vm to be up then connect and return qga instance
|
||||
def qga_connect(state_dir: Path) -> QgaSession:
|
||||
wait_vm_up(state_dir)
|
||||
return QgaSession(os.path.realpath(state_dir / "qga.sock"))
|
||||
|
||||
|
||||
@pytest.mark.impure
|
||||
def test_inspect(
|
||||
test_flake_with_core: FlakeForTest, capsys: pytest.CaptureFixture
|
||||
|
@ -55,19 +103,56 @@ def test_run(
|
|||
|
||||
@pytest.mark.skipif(no_kvm, reason="Requires KVM")
|
||||
@pytest.mark.impure
|
||||
def test_vm_persistence(
|
||||
def test_vm_qmp(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
temporary_home: Path,
|
||||
age_keys: list["KeyPair"],
|
||||
) -> None:
|
||||
monkeypatch.setenv("SOPS_AGE_KEY", age_keys[0].privkey)
|
||||
# set up a simple clan flake
|
||||
flake = generate_flake(
|
||||
temporary_home,
|
||||
flake_template=CLAN_CORE / "templates" / "new-clan",
|
||||
machine_configs=dict(
|
||||
my_machine=dict(
|
||||
clan=dict(
|
||||
virtualisation=dict(graphics=False),
|
||||
networking=dict(targetHost="client"),
|
||||
),
|
||||
services=dict(getty=dict(autologinUser="root")),
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
# 'clan vms run' must be executed from within the flake
|
||||
monkeypatch.chdir(flake.path)
|
||||
|
||||
# the state dir is a point of reference for qemu interactions as it links to the qga/qmp sockets
|
||||
state_dir = vm_state_dir(str(flake.path), "my_machine")
|
||||
|
||||
# start the VM
|
||||
run_vm_in_thread("my_machine")
|
||||
|
||||
# connect with qmp
|
||||
qmp = qmp_connect(state_dir)
|
||||
|
||||
# verify that issuing a command works
|
||||
# result = qmp.cmd_obj({"execute": "query-status"})
|
||||
result = qmp.command("query-status")
|
||||
assert result["status"] == "running", result
|
||||
|
||||
# shutdown machine (prevent zombie qemu processes)
|
||||
qmp.command("system_powerdown")
|
||||
|
||||
|
||||
@pytest.mark.skipif(no_kvm, reason="Requires KVM")
|
||||
@pytest.mark.impure
|
||||
def test_vm_persistence(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
temporary_home: Path,
|
||||
) -> None:
|
||||
# set up a clan flake with some systemd services to test persistence
|
||||
flake = generate_flake(
|
||||
temporary_home,
|
||||
flake_template=CLAN_CORE / "templates" / "new-clan",
|
||||
substitutions={
|
||||
"__CHANGE_ME__": "_test_vm_persistence",
|
||||
"git+https://git.clan.lol/clan/clan-core": "path://" + str(CLAN_CORE),
|
||||
},
|
||||
machine_configs=dict(
|
||||
my_machine=dict(
|
||||
services=dict(getty=dict(autologinUser="root")),
|
||||
|
@ -83,8 +168,7 @@ def test_vm_persistence(
|
|||
)
|
||||
)
|
||||
),
|
||||
# create test user
|
||||
# TODO: test persisting files via that user
|
||||
# create test user to test if state can be owned by user
|
||||
users=dict(
|
||||
users=dict(
|
||||
test=dict(
|
||||
|
@ -94,6 +178,8 @@ def test_vm_persistence(
|
|||
root=dict(password="root"),
|
||||
)
|
||||
),
|
||||
# create a systemd service to create a file in the state folder
|
||||
# and another to read it after reboot
|
||||
systemd=dict(
|
||||
services=dict(
|
||||
create_state=dict(
|
||||
|
@ -163,59 +249,22 @@ def test_vm_persistence(
|
|||
)
|
||||
monkeypatch.chdir(flake.path)
|
||||
|
||||
state_dir = vm_state_dir("_test_vm_persistence", str(flake.path), "my_machine")
|
||||
socket_file = state_dir / "qga.sock"
|
||||
# the state dir is a point of reference for qemu interactions as it links to the qga/qmp sockets
|
||||
state_dir = vm_state_dir(str(flake.path), "my_machine")
|
||||
|
||||
# wait until socket file exists
|
||||
def connect() -> QgaSession:
|
||||
while True:
|
||||
if (state_dir / "qga.sock").exists():
|
||||
break
|
||||
sleep(0.1)
|
||||
return QgaSession(os.path.realpath(socket_file))
|
||||
run_vm_in_thread("my_machine")
|
||||
|
||||
# runs machine and prints exceptions
|
||||
def run() -> None:
|
||||
try:
|
||||
Cli().run(["vms", "run", "my_machine"])
|
||||
except Exception:
|
||||
# print exception details
|
||||
print(traceback.format_exc())
|
||||
print(sys.exc_info()[2])
|
||||
|
||||
# run the machine in a separate thread
|
||||
t = threading.Thread(target=run, name="run")
|
||||
t.daemon = True
|
||||
t.start()
|
||||
|
||||
# wait for socket to be up
|
||||
Path("/tmp/log").write_text(f"wait for socket to be up: {socket_file!s}")
|
||||
while True:
|
||||
if socket_file.exists():
|
||||
break
|
||||
sleep(0.1)
|
||||
# wait for the VM to start
|
||||
wait_vm_up(state_dir)
|
||||
|
||||
# wait for socket to be down (systemd service 'poweroff' rebooting machine)
|
||||
Path("/tmp/log").write_text("wait for socket to be down")
|
||||
while socket_file.exists():
|
||||
sleep(0.1)
|
||||
Path("/tmp/log").write_text("socket is down")
|
||||
wait_vm_down(state_dir)
|
||||
|
||||
# start vm again
|
||||
t = threading.Thread(target=run, name="run")
|
||||
t.daemon = True
|
||||
t.start()
|
||||
|
||||
# wait for the socket to be up
|
||||
Path("/tmp/log").write_text("wait for socket to be up second time")
|
||||
while True:
|
||||
if socket_file.exists():
|
||||
break
|
||||
sleep(0.1)
|
||||
run_vm_in_thread("my_machine")
|
||||
|
||||
# connect second time
|
||||
Path("/tmp/log").write_text("connecting")
|
||||
qga = connect()
|
||||
qga = qga_connect(state_dir)
|
||||
|
||||
# ensure that either /var/lib/nixos or /etc gets persisted
|
||||
# (depending on if system.etc.overlay.enable is set or not)
|
||||
|
@ -224,6 +273,7 @@ def test_vm_persistence(
|
|||
)
|
||||
assert exitcode == 0, err
|
||||
|
||||
# ensure that the file created by the service is still there and has the expected content
|
||||
exitcode, out, err = qga.run("cat /var/my-state/test")
|
||||
assert exitcode == 0, err
|
||||
assert out == "dream2nix\n", out
|
||||
|
@ -236,11 +286,8 @@ def test_vm_persistence(
|
|||
exitcode, out, err = qga.run(
|
||||
"systemctl --failed | tee /tmp/yolo | grep -q '0 loaded units listed' || ( cat /tmp/yolo && false )"
|
||||
)
|
||||
print(out)
|
||||
assert exitcode == 0, out
|
||||
|
||||
qmp = QEMUMonitorProtocol(
|
||||
address=str(os.path.realpath(state_dir / "qmp.sock")),
|
||||
)
|
||||
qmp.connect()
|
||||
qmp.cmd_obj({"execute": "system_powerdown"})
|
||||
# use qmp to shutdown the machine (prevent zombie qemu processes)
|
||||
qmp = qmp_connect(state_dir)
|
||||
qmp.command("system_powerdown")
|
||||
|
|
|
@ -120,33 +120,34 @@ class VM(GObject.Object):
|
|||
self._finalizer = weakref.finalize(self, self.stop)
|
||||
self.connect("vm_status_changed", self._start_logs_task)
|
||||
|
||||
def __start(self) -> None:
|
||||
if self.is_running():
|
||||
log.warn("VM is already running")
|
||||
return
|
||||
|
||||
uri = ClanURI.from_str(
|
||||
url=self.data.flake.flake_url, flake_attr=self.data.flake.flake_attr
|
||||
)
|
||||
|
||||
match uri.scheme:
|
||||
case ClanScheme.LOCAL.value(path):
|
||||
machine = Machine(
|
||||
self.machine = Machine(
|
||||
name=self.data.flake.flake_attr,
|
||||
flake=path, # type: ignore
|
||||
)
|
||||
case ClanScheme.REMOTE.value(url):
|
||||
machine = Machine(
|
||||
self.machine = Machine(
|
||||
name=self.data.flake.flake_attr,
|
||||
flake=url, # type: ignore
|
||||
)
|
||||
vm = vms.run.inspect_vm(machine)
|
||||
|
||||
def __start(self) -> None:
|
||||
if self.is_running():
|
||||
log.warn("VM is already running")
|
||||
return
|
||||
vm = vms.run.inspect_vm(self.machine)
|
||||
self.process = spawn(
|
||||
on_except=None,
|
||||
log_dir=Path(str(self.log_dir.name)),
|
||||
func=vms.run.run_vm,
|
||||
vm=vm,
|
||||
)
|
||||
log.debug("Starting VM")
|
||||
self.machine.qmp_connect()
|
||||
|
||||
def start(self) -> None:
|
||||
if self.is_running():
|
||||
|
@ -212,7 +213,8 @@ class VM(GObject.Object):
|
|||
if not self.is_running():
|
||||
return
|
||||
log.info(f"Stopping VM {self.get_id()}")
|
||||
self.process.kill_group()
|
||||
# TODO: add fallback to kill the process if the QMP command fails
|
||||
self.machine.qmp_command("system_powerdown")
|
||||
|
||||
def read_whole_log(self) -> str:
|
||||
if not self.process.out_file.exists():
|
||||
|
|
Loading…
Reference in New Issue
Block a user