From 1b0c937ce474b7d34f226c6d47e73292134d0263 Mon Sep 17 00:00:00 2001
From: Dennis Kempin <denniskempin@google.com>
Date: Fri, 31 Mar 2023 16:00:39 -0700
Subject: [PATCH] testvm: Improved CLI usability

Makes the testvm CLI more intuitive, so we can start using the
x86vm as a default for running crosvm integration tests.

Modernizes the CLI output with colors and spinners for waiting.
Adds status and logs commands to aid debugging.
Improved `up` command to kill VMs if they do not become responsive.

Also merges the wait command into `up --wait`.

BUG=b:275717759
TEST=aarch64vm up && aarch64vm up --wait
TEST=aarch64vm stop && aarch64vm up --wait
TEST=testvm
TEST=aarch64vm logs

Change-Id: I82cf14cff19a0b01cda64718a24d170d564fdcd7
Reviewed-on: https://chromium-review.googlesource.com/c/crosvm/crosvm/+/4390731
Reviewed-by: Zihan Chen <zihanchen@google.com>
---
 tools/impl/common.py    |   9 ++-
 tools/impl/presubmit.py |  28 ++++----
 tools/impl/testvm.py    | 127 ++++++++++++++++++++++++++++--------
 tools/run_tests         |   3 +-
 tools/testvm            | 141 +++++++++++++++++++++++++++++-----------
 5 files changed, 227 insertions(+), 81 deletions(-)

diff --git a/tools/impl/common.py b/tools/impl/common.py
index f6e4f5fc45..9e153657e8 100644
--- a/tools/impl/common.py
+++ b/tools/impl/common.py
@@ -18,7 +18,6 @@ from . import preamble  # type: ignore
 
 import argparse
 import contextlib
-import csv
 import datetime
 import functools
 import getpass
@@ -34,7 +33,6 @@ import urllib
 import urllib.request
 import urllib.error
 from copy import deepcopy
-from io import StringIO
 from math import ceil
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
@@ -111,6 +109,9 @@ assert 'name = "crosvm"' in CROSVM_TOML.read_text()
 # List of times recorded by `record_time` which will be printed if --timing-info is provided.
 global_time_records: List[Tuple[str, datetime.timedelta]] = []
 
+# Regex that matches ANSI escape sequences
+ANSI_ESCAPE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
+
 
 def crosvm_target_dir():
     crosvm_target = os.environ.get("CROSVM_TARGET_DIR")
@@ -1166,6 +1167,10 @@ def download_file(url: str, filename: Path, attempts: int = 3):
                 console.print("Download failed:", e)
 
 
+def strip_ansi_escape_sequences(line: str) -> str:
+    return ANSI_ESCAPE.sub("", line)
+
+
 console = rich.console.Console()
 
 if __name__ == "__main__":
diff --git a/tools/impl/presubmit.py b/tools/impl/presubmit.py
index 3699d00683..72b331c8e3 100644
--- a/tools/impl/presubmit.py
+++ b/tools/impl/presubmit.py
@@ -4,29 +4,29 @@
 # found in the LICENSE file.
 
 import os
-import re
 import subprocess
+import sys
+import traceback
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
+from datetime import datetime, timedelta
 from fnmatch import fnmatch
 from pathlib import Path
-import sys
 from time import sleep
-from typing import Callable, List, NamedTuple, Optional, Set, Union
-from datetime import datetime, timedelta
+from typing import Callable, List, NamedTuple, Optional, Union
 
-from impl.common import Command, all_tracked_files, cmd, console, verbose
-
-import rich
-import rich.console
-import rich.live
-import rich.spinner
-import rich.text
+from impl.common import (
+    Command,
+    all_tracked_files,
+    cmd,
+    console,
+    rich,
+    strip_ansi_escape_sequences,
+    verbose,
+)
 
 git = cmd("git")
 
-ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
-
 
 @dataclass
 class CheckContext(object):
@@ -204,7 +204,7 @@ class Task(object):
             *(
                 # Print last log lines without it's original colors
                 rich.text.Text(
-                    "│ " + ansi_escape.sub("", log_line),
+                    "│ " + strip_ansi_escape_sequences(log_line),
                     style="light_slate_grey",
                     overflow="ellipsis",
                     no_wrap=True,
diff --git a/tools/impl/testvm.py b/tools/impl/testvm.py
index 3959bdc574..cf1a3edf34 100755
--- a/tools/impl/testvm.py
+++ b/tools/impl/testvm.py
@@ -2,6 +2,7 @@
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 
+from enum import Enum
 import json
 import os
 import socket
@@ -11,14 +12,15 @@ import time
 import typing
 from contextlib import closing
 from pathlib import Path
-from typing import Dict, List, Literal, Optional
+from typing import Dict, List, Literal, Optional, Tuple
 
-from .common import CACHE_DIR, download_file, cmd
+from .common import CACHE_DIR, download_file, cmd, rich, console
 
 KVM_SUPPORT = os.access("/dev/kvm", os.W_OK)
 
 Arch = Literal["x86_64", "aarch64"]
-ARCH_OPTIONS = typing.get_args(Arch)
+ARCH_OPTIONS = typing.cast(Tuple[Arch], typing.get_args(Arch))
+
 
 SCRIPT_DIR = Path(__file__).parent.resolve()
 SRC_DIR = SCRIPT_DIR.joinpath("testvm")
@@ -53,6 +55,10 @@ def pid_path(arch: Arch):
     return data_dir(arch).joinpath("pid")
 
 
+def log_path(arch: Arch):
+    return data_dir(arch).joinpath("vm_log")
+
+
 def base_img_name(arch: Arch):
     return f"base-{arch}-{BASE_IMG_VERSION}.qcow2"
 
@@ -170,10 +176,36 @@ def run_qemu(
     else:
         serial = "stdio"
 
-    process = qemu.with_args(f"-hda {hda}", f"-serial {serial}").popen(start_new_session=background)
-    write_pid_file(arch, process.pid)
-    if not background:
-        process.wait()
+    console.print(f"Booting {arch} VM with disk", hda)
+    command = qemu.with_args(
+        f"-hda {hda}",
+        f"-serial {serial}",
+        f"-netdev user,id=net0,hostfwd=tcp::{SSH_PORTS[arch]}-:22",
+    )
+    if background:
+        # Start qemu in a new session so it can outlive this process.
+        process = command.popen(
+            start_new_session=background, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
+        )
+
+        # Wait for 1s to see if the qemu is staying alive.
+        assert process.stdout
+        for _ in range(10):
+            if process.poll() is not None:
+                sys.stdout.write(process.stdout.read())
+                print(f"'{command}' exited with code {process.returncode}")
+                sys.exit(process.returncode)
+            time.sleep(0.1)
+
+        # Print any warnings qemu might produce.
+        sys.stdout.write(process.stdout.read(0))
+        sys.stdout.flush()
+        process.stdout.close()
+
+        # Save pid so we can manage the process later.
+        write_pid_file(arch, process.pid)
+    else:
+        command.fg()
 
 
 def run_vm(arch: Arch, background: bool = False):
@@ -200,12 +232,18 @@ def is_running(arch: Arch):
 def kill_vm(arch: Arch):
     pid = read_pid_file(arch)
     if pid:
-        os.kill(pid, 9)
+        try:
+            os.kill(pid, 9)
+            # Ping with signal 0 until we get an OSError indicating the process has shutdown.
+            while True:
+                os.kill(pid, 0)
+        except OSError:
+            return
 
 
 def build_if_needed(arch: Arch, reset: bool = False):
     if reset and is_running(arch):
-        print("Killing existing VM...")
+        print(f"Killing existing {arch} VM to perform reset...")
         kill_vm(arch)
         time.sleep(1)
 
@@ -213,14 +251,14 @@ def build_if_needed(arch: Arch, reset: bool = False):
 
     base_img = base_img_path(arch)
     if not base_img.exists():
-        print(f"Downloading base image ({base_img_url(arch)})...")
+        print(f"Downloading {arch} base image ({base_img_url(arch)})...")
         download_file(base_img_url(arch), base_img_path(arch))
 
     rootfs_img = rootfs_img_path(arch)
     if not rootfs_img.exists() or reset:
         # The rootfs is backed by the base image generated above. So we can
         # easily reset to a clean VM by rebuilding an empty rootfs image.
-        print("Creating rootfs overlay...")
+        print(f"Creating {arch} rootfs overlay...")
         qemu_img.with_args(
             "create",
             "-f qcow2",
@@ -236,33 +274,68 @@ def is_ssh_port_available(arch: Arch):
         return sock.connect_ex(("127.0.0.1", SSH_PORTS[arch])) != 0
 
 
-def up(arch: Arch, reset: bool = False):
-    "Start the VM if it's not already running."
+def up(arch: Arch, reset: bool = False, wait: bool = False, timeout: int = 120):
+    "Starts the test vm if it's not already running. Optionally wait for it to be reachable."
+
+    # Try waiting for the running VM, if it does not become reachable, kill it.
     if is_running(arch):
-        return
+        if not wait:
+            console.print(f"{arch} VM is running on port {SSH_PORTS[arch]}")
+            return
+        if not wait_until_reachable(arch, timeout):
+            if is_running(arch):
+                print(f"{arch} VM is not reachable. Restarting it.")
+                kill_vm(arch)
+            else:
+                print(f"{arch} VM stopped. Starting it again.")
+        else:
+            console.print(f"{arch} VM is running on port {SSH_PORTS[arch]}")
+            return
 
     build_if_needed(arch, reset)
-    print("Booting VM...")
     run_qemu(
         arch,
         rootfs_img_path(arch),
         background=True,
     )
 
+    if wait:
+        if wait_until_reachable(arch, timeout):
+            console.print(f"{arch} VM is running on port {SSH_PORTS[arch]}")
+        else:
+            raise Exception(f"Waiting for {arch} VM timed out.")
 
-def wait(arch: Arch, timeout: int = 120):
+
+def wait_until_reachable(arch: Arch, timeout: int = 120):
     "Blocks until the VM is ready to use."
-    up(arch)
+    if not is_running(arch):
+        return False
     if ping_vm(arch):
-        return
+        return True
 
-    print("Waiting for VM")
-    start_time = time.time()
-    while (time.time() - start_time) < timeout:
-        time.sleep(1)
-        sys.stdout.write(".")
-        sys.stdout.flush()
+    with rich.live.Live(
+        rich.spinner.Spinner("point", f"Waiting for {arch} VM to become reachable...")
+    ):
+        start_time = time.time()
+        while (time.time() - start_time) < timeout:
+            if not is_running(arch):
+                return False
+            if ping_vm(arch):
+                return True
+    return False
+
+
+class VmState(Enum):
+    REACHABLE = "Reachable"
+    RUNNING_NOT_REACHABLE = "Running, but not reachable"
+    STOPPED = "Stopped"
+
+
+def state(arch: Arch):
+    if is_running(arch):
         if ping_vm(arch):
-            print()
-            return
-    raise Exception("Timeout while waiting for VM")
+            return VmState.REACHABLE
+        else:
+            return VmState.RUNNING_NOT_REACHABLE
+    else:
+        return VmState.STOPPED
diff --git a/tools/run_tests b/tools/run_tests
index c2fccb5d54..eba3fe9ab1 100755
--- a/tools/run_tests
+++ b/tools/run_tests
@@ -287,6 +287,7 @@ def main(
     if dut == "host":
         check_host_prerequisites(run_root_tests)
     if dut == "vm":
+        # Start VM ahead of time but don't wait for it to boot.
         testvm.up(get_vm_arch(triple))
 
     nextest_args = [
@@ -353,7 +354,7 @@ def main(
         if dut == "host":
             target = HostTarget(package_dir)
         elif dut == "vm":
-            testvm.wait(get_vm_arch(triple))
+            testvm.up(get_vm_arch(triple), wait=True)
             remote = Remote("localhost", testvm.ssh_opts(get_vm_arch(triple)))
             target = SshTarget(package_archive, remote)
 
diff --git a/tools/testvm b/tools/testvm
index 39a14dc4b2..b527e16c55 100755
--- a/tools/testvm
+++ b/tools/testvm
@@ -4,14 +4,16 @@
 # found in the LICENSE file.
 
 import shutil
+from typing import Iterable, Optional
 
-from impl.common import run_commands, argh
+from impl.common import run_commands, argh, console, strip_ansi_escape_sequences
 from impl import testvm
+from impl.testvm import Arch, VmState
 
 USAGE = """Manages VMs for testing crosvm.
 
 Can run an x86_64 and an aarch64 vm via `./tools/x86vm` and `./tools/aarch64vm`.
-The VM image will be downloaded and initialized on first use.
+Both are a wrapper around `./tools/testvm --arch=x86_64/aarch64`.
 
 The easiest way to use the VM is:
 
@@ -20,21 +22,43 @@ The easiest way to use the VM is:
 Which will initialize and boot the VM, then wait for SSH to be available and
 opens an SSH session. The VM will stay alive between calls.
 
-Alternatively, you can set up an SSH config to connect to the VM. First ensure
-the VM ready:
+Available commands are:
+    - up: Start the VM if it is not already running.
+    - stop: Gracefully stop the VM
+    - kill: Send SIGKILL to the VM
+    - clean: Stop the VM and delete all images
+    - logs: Print logs of the VM console
 
-  $ ./tools/aarch64vm wait
+All of these can be called on `./tools/x86vm` or `./tools/aarch64vm`, but also on
+`tools/testvm` to apply to both VMs.
 """
 
 
-@argh.arg("--arch", required=True, choices=testvm.ARCH_OPTIONS)
-def up(arch: testvm.Arch = "x86_64", reset: bool = False):
+def cli_shorthand(arch: Arch):
+    if arch == "x86_64":
+        return "tools/x86vm"
+    elif arch == "aarch64":
+        return "tools/aarch64vm"
+    else:
+        raise Exception(f"Unknown architecture: {arch}")
+
+
+def arch_or_all(arch: Optional[Arch]):
+    return (arch,) if arch else testvm.ARCH_OPTIONS
+
+
+ARCHS = testvm.ARCH_OPTIONS
+
+
+@argh.arg("--arch-list", "--arch", nargs="*", type=str, default=ARCHS, choices=ARCHS)
+def up(arch_list: Iterable[Arch] = [], reset: bool = False, wait: bool = False, timeout: int = 120):
     "Start the VM if it's not already running."
-    testvm.up(arch, reset)
+    for arch in arch_list:
+        testvm.up(arch, reset, wait, timeout)
 
 
 @argh.arg("--arch", required=True, choices=testvm.ARCH_OPTIONS)
-def run(arch: testvm.Arch = "x86_64", reset: bool = False):
+def run(arch: Arch = "x86_64", reset: bool = False):
     "Run the VM in foreground for debugging purposes."
     if testvm.is_running(arch):
         raise Exception("VM is already running")
@@ -47,45 +71,88 @@ def run(arch: testvm.Arch = "x86_64", reset: bool = False):
 
 
 @argh.arg("--arch", required=True, choices=testvm.ARCH_OPTIONS)
-def wait(arch: testvm.Arch = "x86_64", timeout: int = 120):
-    "Blocks until the VM is ready to use."
-    testvm.wait(arch, timeout)
-
-
-@argh.arg("--arch", required=True, choices=testvm.ARCH_OPTIONS)
-def ssh(arch: testvm.Arch = "x86_64", timeout: int = 120):
+def shell(arch: Arch = "x86_64", timeout: int = 120):
     "Starts an interactive shell via SSH, will ensure the VM is running."
-    testvm.up(arch)
-    wait(arch, timeout)
+    testvm.up(arch, wait=True, timeout=timeout)
     testvm.ssh_exec(arch)
 
 
-@argh.arg("--arch", required=True, choices=testvm.ARCH_OPTIONS)
-def stop(arch: testvm.Arch = "x86_64"):
+@argh.arg("--arch-list", "--arch", nargs="*", type=str, default=ARCHS, choices=ARCHS)
+def stop(arch_list: Iterable[Arch] = []):
     "Gracefully stops the running VM."
-    if not testvm.is_running(arch):
-        print("VM is not running.")
-        return
-    testvm.ssh_exec(arch, "sudo poweroff")
+    for arch in arch_list:
+        if not testvm.is_running(arch):
+            print(f"{arch} VM is not running")
+            break
+        console.print(f"Stopping {arch} VM")
+        testvm.ssh_exec(arch, "sudo poweroff")
 
 
-@argh.arg("--arch", required=True, choices=testvm.ARCH_OPTIONS)
-def kill(arch: testvm.Arch = "x86_64"):
+@argh.arg("--arch-list", "--arch", nargs="*", type=str, default=ARCHS, choices=ARCHS)
+def kill(arch_list: Iterable[Arch] = []):
     "Kills the running VM with SIGKILL."
-    if not testvm.is_running(arch):
-        print("VM is not running.")
-        return
-    testvm.kill_vm(arch)
+    for arch in arch_list:
+        if not testvm.is_running(arch):
+            console.print(f"{arch} VM is not running")
+            break
+        console.print(f"Killing {arch} VM process")
+        testvm.kill_vm(arch)
+        print()
+
+
+@argh.arg("--arch-list", "--arch", nargs="*", type=str, default=ARCHS, choices=ARCHS)
+def clean(arch_list: Iterable[Arch] = []):
+    "Stops the VM or VMs and deletes all data."
+    for arch in arch_list:
+        if testvm.is_running(arch):
+            kill(arch)
+        if testvm.data_dir(arch).exists():
+            console.print("Cleaning data directory", testvm.data_dir(arch))
+            shutil.rmtree(testvm.data_dir(arch))
+        print()
+
+
+def vm_status(arch: Arch):
+    def cli_tip(*args: str):
+        return f"[green][bold]{cli_shorthand(arch)} {' '.join(args)}[/bold][/green]"
+
+    vm = f"{arch} VM"
+    port = f"[blue]{testvm.SSH_PORTS[arch]}[/blue]"
+
+    state = testvm.state(arch)
+    if state == VmState.REACHABLE:
+        console.print(f"{vm} is [green]reachable[/green] on port {port}")
+        console.print(f"Start a shell with {cli_tip('shell')}")
+    elif state == VmState.STOPPED:
+        console.print(f"{vm} is [red]stopped[/red]")
+        console.print(f"Start the VM with {cli_tip('up')}")
+    else:
+        console.print(f"{vm} is running but [red]not reachable[/red] on port {port}")
+        console.print(f"Recent logs:")
+        logs(arch, 10, style="light_slate_grey")
+        console.print(f"See all logs with {cli_tip('logs')}")
+
+
+@argh.arg("--arch-list", "--arch", nargs="*", type=str, default=ARCHS, choices=ARCHS)
+def status(arch_list: Iterable[Arch] = []):
+    for arch in arch_list:
+        vm_status(arch)
+        print()
 
 
 @argh.arg("--arch", required=True, choices=testvm.ARCH_OPTIONS)
-def clean(arch: testvm.Arch = "x86_64"):
-    "Stops the VM and deletes all data."
-    if testvm.is_running(arch):
-        kill(arch)
-    if testvm.data_dir(arch).exists():
-        shutil.rmtree(testvm.data_dir(arch))
+def logs(arch: Arch = "x86_64", n: int = 0, style: Optional[str] = None):
+    log_lines = testvm.log_path(arch).read_text().splitlines()
+    if n > 0 and len(log_lines) > n:
+        log_lines = log_lines[-n:]
+    for line in log_lines:
+        if style:
+            console.print(
+                strip_ansi_escape_sequences(line), style=style, markup=False, highlight=False
+            )
+        else:
+            print(line)
 
 
 if __name__ == "__main__":
-    run_commands(up, run, wait, ssh, stop, kill, clean, usage=USAGE)
+    run_commands(up, run, shell, stop, kill, clean, status, logs, usage=USAGE, default_fn=status)