testvm: Improved CLI usability

Makes the testvm CLI more intuitive, so we can start using the x86vm as a default for running crosvm integration tests. Modernizes the CLI output with colors and spinners for waiting. Adds status and logs commands to aid debugging. Improved `up` command to kill VMs if they do not become responsive. Also merges the wait command into `up --wait`. BUG=b:275717759 TEST=aarch64vm up && aarch64vm up --wait TEST=aarch64vm stop && aarch64vm up --wait TEST=testvm TEST=aarch64vm logs Change-Id: I82cf14cff19a0b01cda64718a24d170d564fdcd7 Reviewed-on: https://chromium-review.googlesource.com/c/crosvm/crosvm/+/4390731 Reviewed-by: Zihan Chen <zihanchen@google.com>
2024-11-24 12:34:31 +00:00 · 2023-03-31 16:00:39 -07:00 · 2023-03-31 16:00:39 -07:00 · 1b0c937ce4
commit 1b0c937ce4
parent 2f4504e176
5 changed files with 227 additions and 81 deletions
--- a/tools/impl/common.py
+++ b/tools/impl/common.py
@ -18,7 +18,6 @@ from . import preamble  # type: ignore

 import argparse
 import contextlib
-import csv
 import datetime
 import functools
 import getpass
@ -34,7 +33,6 @@ import urllib
 import urllib.request
 import urllib.error
 from copy import deepcopy
-from io import StringIO
 from math import ceil
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
@ -111,6 +109,9 @@ assert 'name = "crosvm"' in CROSVM_TOML.read_text()
 # List of times recorded by `record_time` which will be printed if --timing-info is provided.
 global_time_records: List[Tuple[str, datetime.timedelta]] = []

+# Regex that matches ANSI escape sequences
+ANSI_ESCAPE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
+

 def crosvm_target_dir():
    crosvm_target = os.environ.get("CROSVM_TARGET_DIR")
@ -1166,6 +1167,10 @@ def download_file(url: str, filename: Path, attempts: int = 3):
                console.print("Download failed:", e)


+def strip_ansi_escape_sequences(line: str) -> str:
+    return ANSI_ESCAPE.sub("", line)
+
+
 console = rich.console.Console()

 if __name__ == "__main__":
--- a/tools/impl/presubmit.py
+++ b/tools/impl/presubmit.py
@ -4,29 +4,29 @@
 # found in the LICENSE file.

 import os
-import re
 import subprocess
+import sys
+import traceback
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
+from datetime import datetime, timedelta
 from fnmatch import fnmatch
 from pathlib import Path
-import sys
 from time import sleep
-from typing import Callable, List, NamedTuple, Optional, Set, Union
-from datetime import datetime, timedelta
+from typing import Callable, List, NamedTuple, Optional, Union

-from impl.common import Command, all_tracked_files, cmd, console, verbose
-
-import rich
-import rich.console
-import rich.live
-import rich.spinner
-import rich.text
+from impl.common import (
+    Command,
+    all_tracked_files,
+    cmd,
+    console,
+    rich,
+    strip_ansi_escape_sequences,
+    verbose,
+)

 git = cmd("git")

-ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
-

@dataclass
 class CheckContext(object):
@ -204,7 +204,7 @@ class Task(object):
            *(
                # Print last log lines without it's original colors
                rich.text.Text(
-                    "│ " + ansi_escape.sub("", log_line),
+                    "│ " + strip_ansi_escape_sequences(log_line),
                    style="light_slate_grey",
                    overflow="ellipsis",
                    no_wrap=True,
--- a/tools/impl/testvm.py
+++ b/tools/impl/testvm.py
@ -2,6 +2,7 @@
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

+from enum import Enum
 import json
 import os
 import socket
@ -11,14 +12,15 @@ import time
 import typing
 from contextlib import closing
 from pathlib import Path
-from typing import Dict, List, Literal, Optional
+from typing import Dict, List, Literal, Optional, Tuple

-from .common import CACHE_DIR, download_file, cmd
+from .common import CACHE_DIR, download_file, cmd, rich, console

 KVM_SUPPORT = os.access("/dev/kvm", os.W_OK)

 Arch = Literal["x86_64", "aarch64"]
-ARCH_OPTIONS = typing.get_args(Arch)
+ARCH_OPTIONS = typing.cast(Tuple[Arch], typing.get_args(Arch))
+

 SCRIPT_DIR = Path(__file__).parent.resolve()
 SRC_DIR = SCRIPT_DIR.joinpath("testvm")
@ -53,6 +55,10 @@ def pid_path(arch: Arch):
    return data_dir(arch).joinpath("pid")


+def log_path(arch: Arch):
+    return data_dir(arch).joinpath("vm_log")
+
+
 def base_img_name(arch: Arch):
    return f"base-{arch}-{BASE_IMG_VERSION}.qcow2"

@ -170,10 +176,36 @@ def run_qemu(
    else:
        serial = "stdio"

-    process = qemu.with_args(f"-hda {hda}", f"-serial {serial}").popen(start_new_session=background)
-    write_pid_file(arch, process.pid)
-    if not background:
-        process.wait()
+    console.print(f"Booting {arch} VM with disk", hda)
+    command = qemu.with_args(
+        f"-hda {hda}",
+        f"-serial {serial}",
+        f"-netdev user,id=net0,hostfwd=tcp::{SSH_PORTS[arch]}-:22",
+    )
+    if background:
+        # Start qemu in a new session so it can outlive this process.
+        process = command.popen(
+            start_new_session=background, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
+        )
+
+        # Wait for 1s to see if the qemu is staying alive.
+        assert process.stdout
+        for _ in range(10):
+            if process.poll() is not None:
+                sys.stdout.write(process.stdout.read())
+                print(f"'{command}' exited with code {process.returncode}")
+                sys.exit(process.returncode)
+            time.sleep(0.1)
+
+        # Print any warnings qemu might produce.
+        sys.stdout.write(process.stdout.read(0))
+        sys.stdout.flush()
+        process.stdout.close()
+
+        # Save pid so we can manage the process later.
+        write_pid_file(arch, process.pid)
+    else:
+        command.fg()


 def run_vm(arch: Arch, background: bool = False):
@ -200,12 +232,18 @@ def is_running(arch: Arch):
 def kill_vm(arch: Arch):
    pid = read_pid_file(arch)
    if pid:
-        os.kill(pid, 9)
+        try:
+            os.kill(pid, 9)
+            # Ping with signal 0 until we get an OSError indicating the process has shutdown.
+            while True:
+                os.kill(pid, 0)
+        except OSError:
+            return


 def build_if_needed(arch: Arch, reset: bool = False):
    if reset and is_running(arch):
-        print("Killing existing VM...")
+        print(f"Killing existing {arch} VM to perform reset...")
        kill_vm(arch)
        time.sleep(1)

@ -213,14 +251,14 @@ def build_if_needed(arch: Arch, reset: bool = False):

    base_img = base_img_path(arch)
    if not base_img.exists():
-        print(f"Downloading base image ({base_img_url(arch)})...")
+        print(f"Downloading {arch} base image ({base_img_url(arch)})...")
        download_file(base_img_url(arch), base_img_path(arch))

    rootfs_img = rootfs_img_path(arch)
    if not rootfs_img.exists() or reset:
        # The rootfs is backed by the base image generated above. So we can
        # easily reset to a clean VM by rebuilding an empty rootfs image.
-        print("Creating rootfs overlay...")
+        print(f"Creating {arch} rootfs overlay...")
        qemu_img.with_args(
            "create",
            "-f qcow2",
@ -236,33 +274,68 @@ def is_ssh_port_available(arch: Arch):
        return sock.connect_ex(("127.0.0.1", SSH_PORTS[arch])) != 0


-def up(arch: Arch, reset: bool = False):
-    "Start the VM if it's not already running."
+def up(arch: Arch, reset: bool = False, wait: bool = False, timeout: int = 120):
+    "Starts the test vm if it's not already running. Optionally wait for it to be reachable."
+
+    # Try waiting for the running VM, if it does not become reachable, kill it.
    if is_running(arch):
-        return
+        if not wait:
+            console.print(f"{arch} VM is running on port {SSH_PORTS[arch]}")
+            return
+        if not wait_until_reachable(arch, timeout):
+            if is_running(arch):
+                print(f"{arch} VM is not reachable. Restarting it.")
+                kill_vm(arch)
+            else:
+                print(f"{arch} VM stopped. Starting it again.")
+        else:
+            console.print(f"{arch} VM is running on port {SSH_PORTS[arch]}")
+            return

    build_if_needed(arch, reset)
-    print("Booting VM...")
    run_qemu(
        arch,
        rootfs_img_path(arch),
        background=True,
    )

+    if wait:
+        if wait_until_reachable(arch, timeout):
+            console.print(f"{arch} VM is running on port {SSH_PORTS[arch]}")
+        else:
+            raise Exception(f"Waiting for {arch} VM timed out.")

-def wait(arch: Arch, timeout: int = 120):
+
+def wait_until_reachable(arch: Arch, timeout: int = 120):
    "Blocks until the VM is ready to use."
-    up(arch)
+    if not is_running(arch):
+        return False
    if ping_vm(arch):
-        return
+        return True

-    print("Waiting for VM")
-    start_time = time.time()
-    while (time.time() - start_time) < timeout:
-        time.sleep(1)
-        sys.stdout.write(".")
-        sys.stdout.flush()
+    with rich.live.Live(
+        rich.spinner.Spinner("point", f"Waiting for {arch} VM to become reachable...")
+    ):
+        start_time = time.time()
+        while (time.time() - start_time) < timeout:
+            if not is_running(arch):
+                return False
+            if ping_vm(arch):
+                return True
+    return False
+
+
+class VmState(Enum):
+    REACHABLE = "Reachable"
+    RUNNING_NOT_REACHABLE = "Running, but not reachable"
+    STOPPED = "Stopped"
+
+
+def state(arch: Arch):
+    if is_running(arch):
        if ping_vm(arch):
-            print()
-            return
-    raise Exception("Timeout while waiting for VM")
+            return VmState.REACHABLE
+        else:
+            return VmState.RUNNING_NOT_REACHABLE
+    else:
+        return VmState.STOPPED
--- a/tools/run_tests
+++ b/tools/run_tests
@ -287,6 +287,7 @@ def main(
    if dut == "host":
        check_host_prerequisites(run_root_tests)
    if dut == "vm":
+        # Start VM ahead of time but don't wait for it to boot.
        testvm.up(get_vm_arch(triple))

    nextest_args = [
@ -353,7 +354,7 @@ def main(
        if dut == "host":
            target = HostTarget(package_dir)
        elif dut == "vm":
-            testvm.wait(get_vm_arch(triple))
+            testvm.up(get_vm_arch(triple), wait=True)
            remote = Remote("localhost", testvm.ssh_opts(get_vm_arch(triple)))
            target = SshTarget(package_archive, remote)

--- a/tools/testvm
+++ b/tools/testvm
@ -4,14 +4,16 @@
 # found in the LICENSE file.

 import shutil
+from typing import Iterable, Optional

-from impl.common import run_commands, argh
+from impl.common import run_commands, argh, console, strip_ansi_escape_sequences
 from impl import testvm
+from impl.testvm import Arch, VmState

 USAGE = """Manages VMs for testing crosvm.

 Can run an x86_64 and an aarch64 vm via `./tools/x86vm` and `./tools/aarch64vm`.
-The VM image will be downloaded and initialized on first use.
+Both are a wrapper around `./tools/testvm --arch=x86_64/aarch64`.

 The easiest way to use the VM is:

@ -20,21 +22,43 @@ The easiest way to use the VM is:
 Which will initialize and boot the VM, then wait for SSH to be available and
 opens an SSH session. The VM will stay alive between calls.

-Alternatively, you can set up an SSH config to connect to the VM. First ensure
-the VM ready:
+Available commands are:
+    - up: Start the VM if it is not already running.
+    - stop: Gracefully stop the VM
+    - kill: Send SIGKILL to the VM
+    - clean: Stop the VM and delete all images
+    - logs: Print logs of the VM console

-  $ ./tools/aarch64vm wait
+All of these can be called on `./tools/x86vm` or `./tools/aarch64vm`, but also on
+`tools/testvm` to apply to both VMs.
 """


-@argh.arg("--arch", required=True, choices=testvm.ARCH_OPTIONS)
-def up(arch: testvm.Arch = "x86_64", reset: bool = False):
+def cli_shorthand(arch: Arch):
+    if arch == "x86_64":
+        return "tools/x86vm"
+    elif arch == "aarch64":
+        return "tools/aarch64vm"
+    else:
+        raise Exception(f"Unknown architecture: {arch}")
+
+
+def arch_or_all(arch: Optional[Arch]):
+    return (arch,) if arch else testvm.ARCH_OPTIONS
+
+
+ARCHS = testvm.ARCH_OPTIONS
+
+
+@argh.arg("--arch-list", "--arch", nargs="*", type=str, default=ARCHS, choices=ARCHS)
+def up(arch_list: Iterable[Arch] = [], reset: bool = False, wait: bool = False, timeout: int = 120):
    "Start the VM if it's not already running."
-    testvm.up(arch, reset)
+    for arch in arch_list:
+        testvm.up(arch, reset, wait, timeout)


@argh.arg("--arch", required=True, choices=testvm.ARCH_OPTIONS)
-def run(arch: testvm.Arch = "x86_64", reset: bool = False):
+def run(arch: Arch = "x86_64", reset: bool = False):
    "Run the VM in foreground for debugging purposes."
    if testvm.is_running(arch):
        raise Exception("VM is already running")
@ -47,45 +71,88 @@ def run(arch: testvm.Arch = "x86_64", reset: bool = False):


@argh.arg("--arch", required=True, choices=testvm.ARCH_OPTIONS)
-def wait(arch: testvm.Arch = "x86_64", timeout: int = 120):
-    "Blocks until the VM is ready to use."
-    testvm.wait(arch, timeout)
-
-
-@argh.arg("--arch", required=True, choices=testvm.ARCH_OPTIONS)
-def ssh(arch: testvm.Arch = "x86_64", timeout: int = 120):
+def shell(arch: Arch = "x86_64", timeout: int = 120):
    "Starts an interactive shell via SSH, will ensure the VM is running."
-    testvm.up(arch)
-    wait(arch, timeout)
+    testvm.up(arch, wait=True, timeout=timeout)
    testvm.ssh_exec(arch)


-@argh.arg("--arch", required=True, choices=testvm.ARCH_OPTIONS)
-def stop(arch: testvm.Arch = "x86_64"):
+@argh.arg("--arch-list", "--arch", nargs="*", type=str, default=ARCHS, choices=ARCHS)
+def stop(arch_list: Iterable[Arch] = []):
    "Gracefully stops the running VM."
-    if not testvm.is_running(arch):
-        print("VM is not running.")
-        return
-    testvm.ssh_exec(arch, "sudo poweroff")
+    for arch in arch_list:
+        if not testvm.is_running(arch):
+            print(f"{arch} VM is not running")
+            break
+        console.print(f"Stopping {arch} VM")
+        testvm.ssh_exec(arch, "sudo poweroff")


-@argh.arg("--arch", required=True, choices=testvm.ARCH_OPTIONS)
-def kill(arch: testvm.Arch = "x86_64"):
+@argh.arg("--arch-list", "--arch", nargs="*", type=str, default=ARCHS, choices=ARCHS)
+def kill(arch_list: Iterable[Arch] = []):
    "Kills the running VM with SIGKILL."
-    if not testvm.is_running(arch):
-        print("VM is not running.")
-        return
-    testvm.kill_vm(arch)
+    for arch in arch_list:
+        if not testvm.is_running(arch):
+            console.print(f"{arch} VM is not running")
+            break
+        console.print(f"Killing {arch} VM process")
+        testvm.kill_vm(arch)
+        print()
+
+
+@argh.arg("--arch-list", "--arch", nargs="*", type=str, default=ARCHS, choices=ARCHS)
+def clean(arch_list: Iterable[Arch] = []):
+    "Stops the VM or VMs and deletes all data."
+    for arch in arch_list:
+        if testvm.is_running(arch):
+            kill(arch)
+        if testvm.data_dir(arch).exists():
+            console.print("Cleaning data directory", testvm.data_dir(arch))
+            shutil.rmtree(testvm.data_dir(arch))
+        print()
+
+
+def vm_status(arch: Arch):
+    def cli_tip(*args: str):
+        return f"[green][bold]{cli_shorthand(arch)} {' '.join(args)}[/bold][/green]"
+
+    vm = f"{arch} VM"
+    port = f"[blue]{testvm.SSH_PORTS[arch]}[/blue]"
+
+    state = testvm.state(arch)
+    if state == VmState.REACHABLE:
+        console.print(f"{vm} is [green]reachable[/green] on port {port}")
+        console.print(f"Start a shell with {cli_tip('shell')}")
+    elif state == VmState.STOPPED:
+        console.print(f"{vm} is [red]stopped[/red]")
+        console.print(f"Start the VM with {cli_tip('up')}")
+    else:
+        console.print(f"{vm} is running but [red]not reachable[/red] on port {port}")
+        console.print(f"Recent logs:")
+        logs(arch, 10, style="light_slate_grey")
+        console.print(f"See all logs with {cli_tip('logs')}")
+
+
+@argh.arg("--arch-list", "--arch", nargs="*", type=str, default=ARCHS, choices=ARCHS)
+def status(arch_list: Iterable[Arch] = []):
+    for arch in arch_list:
+        vm_status(arch)
+        print()


@argh.arg("--arch", required=True, choices=testvm.ARCH_OPTIONS)
-def clean(arch: testvm.Arch = "x86_64"):
-    "Stops the VM and deletes all data."
-    if testvm.is_running(arch):
-        kill(arch)
-    if testvm.data_dir(arch).exists():
-        shutil.rmtree(testvm.data_dir(arch))
+def logs(arch: Arch = "x86_64", n: int = 0, style: Optional[str] = None):
+    log_lines = testvm.log_path(arch).read_text().splitlines()
+    if n > 0 and len(log_lines) > n:
+        log_lines = log_lines[-n:]
+    for line in log_lines:
+        if style:
+            console.print(
+                strip_ansi_escape_sequences(line), style=style, markup=False, highlight=False
+            )
+        else:
+            print(line)


 if __name__ == "__main__":
-    run_commands(up, run, wait, ssh, stop, kill, clean, usage=USAGE)
+    run_commands(up, run, shell, stop, kill, clean, status, logs, usage=USAGE, default_fn=status)