From 15d2f6141137177e8fd292936dd01f63f3535a7f Mon Sep 17 00:00:00 2001 From: facebook-github-bot Date: Wed, 29 Dec 2021 16:14:23 -0800 Subject: [PATCH] Initial commit fbshipit-source-id: c440d991296c92bdc5e109a11d269049e8840e94 --- .github/workflows/ci.yml | 91 + .gitignore | 2 + CHANGELOG.md | 5 + CODE_OF_CONDUCT.md | 80 + CONTRIBUTING.md | 46 + Cargo.toml | 9 + LICENSE | 31 + README.md | 110 + assets/architecture-diagram.svg | 4 + reverie-examples/Cargo.toml | 61 + reverie-examples/README.md | 63 + reverie-examples/chaos.rs | 179 + reverie-examples/chrome-trace/event.rs | 169 + reverie-examples/chrome-trace/global_state.rs | 71 + reverie-examples/chrome-trace/main.rs | 61 + reverie-examples/chrome-trace/tool.rs | 154 + reverie-examples/chunky_print.rs | 258 ++ reverie-examples/counter1.rs | 78 + reverie-examples/counter2.rs | 157 + reverie-examples/debug.rs | 50 + reverie-examples/noop.rs | 38 + reverie-examples/pedigree.rs | 90 + reverie-examples/strace/config.rs | 17 + reverie-examples/strace/filter.rs | 79 + reverie-examples/strace/global_state.rs | 23 + reverie-examples/strace/main.rs | 57 + reverie-examples/strace/tool.rs | 115 + reverie-examples/strace_minimal.rs | 46 + reverie-process/Cargo.toml | 26 + reverie-process/src/builder.rs | 760 ++++ reverie-process/src/child.rs | 259 ++ reverie-process/src/clone.rs | 47 + reverie-process/src/container.rs | 978 +++++ reverie-process/src/env.rs | 108 + reverie-process/src/error.rs | 189 + reverie-process/src/exit_status.rs | 331 ++ reverie-process/src/fd.rs | 531 +++ reverie-process/src/id_map.rs | 17 + reverie-process/src/lib.rs | 610 +++ reverie-process/src/mount.rs | 556 +++ reverie-process/src/namespace.rs | 74 + reverie-process/src/net.rs | 220 ++ reverie-process/src/pid.rs | 132 + reverie-process/src/pty.rs | 203 + reverie-process/src/seccomp/bpf.rs | 440 +++ reverie-process/src/seccomp/mod.rs | 338 ++ reverie-process/src/spawn.rs | 142 + reverie-process/src/stdio.rs | 240 ++ reverie-process/src/util.rs | 81 
+ reverie-ptrace/Cargo.toml | 38 + reverie-ptrace/src/children.rs | 100 + reverie-ptrace/src/cp/consts.rs | 22 + reverie-ptrace/src/cp/mmap.rs | 40 + reverie-ptrace/src/cp/mod.rs | 14 + reverie-ptrace/src/debug.rs | 252 ++ reverie-ptrace/src/error.rs | 27 + reverie-ptrace/src/gdbstub/breakpoint.rs | 45 + .../gdbstub/commands/base/_QStartNoAckMode.rs | 24 + .../gdbstub/commands/base/_QThreadEvents.rs | 32 + .../gdbstub/commands/base/_QuestionMark.rs | 24 + .../src/gdbstub/commands/base/_c.rs | 16 + .../src/gdbstub/commands/base/_d_upper.rs | 33 + .../src/gdbstub/commands/base/_g.rs | 20 + .../src/gdbstub/commands/base/_g_upper.rs | 27 + .../src/gdbstub/commands/base/_h_upper.rs | 49 + .../src/gdbstub/commands/base/_k.rs | 12 + .../src/gdbstub/commands/base/_m.rs | 31 + .../src/gdbstub/commands/base/_m_upper.rs | 36 + .../src/gdbstub/commands/base/_p.rs | 14 + .../src/gdbstub/commands/base/_p_upper.rs | 16 + .../src/gdbstub/commands/base/_qAttached.rs | 30 + .../src/gdbstub/commands/base/_qC.rs | 20 + .../src/gdbstub/commands/base/_qSupported.rs | 28 + .../src/gdbstub/commands/base/_qXfer.rs | 51 + .../gdbstub/commands/base/_qfThreadInfo.rs | 24 + .../gdbstub/commands/base/_qsThreadInfo.rs | 24 + .../src/gdbstub/commands/base/_s.rs | 15 + .../src/gdbstub/commands/base/_t_upper.rs | 15 + .../src/gdbstub/commands/base/_vCont.rs | 76 + .../src/gdbstub/commands/base/_vFile.rs | 149 + .../src/gdbstub/commands/base/_vKill.rs | 31 + .../src/gdbstub/commands/base/_x_upper.rs | 72 + .../src/gdbstub/commands/base/_z.rs | 33 + .../src/gdbstub/commands/base/_z_upper.rs | 33 + .../src/gdbstub/commands/base/mod.rs | 64 + .../extended_mode/_ExclamationMark.rs | 24 + .../extended_mode/_QDisableRandomization.rs | 27 + .../extended_mode/_QEnvironmentHexEncoded.rs | 16 + .../extended_mode/_QEnvironmentReset.rs | 12 + .../extended_mode/_QEnvironmentUnset.rs | 15 + .../commands/extended_mode/_QSetWorkingDir.rs | 15 + .../extended_mode/_QStartupWithShell.rs | 15 + 
.../commands/extended_mode/_r_upper.rs | 13 + .../commands/extended_mode/_vAttach.rs | 16 + .../gdbstub/commands/extended_mode/_vRun.rs | 16 + .../src/gdbstub/commands/extended_mode/mod.rs | 30 + reverie-ptrace/src/gdbstub/commands/mod.rs | 589 +++ .../gdbstub/commands/monitor_cmd/_qRcmd.rs | 28 + .../src/gdbstub/commands/monitor_cmd/mod.rs | 12 + .../commands/section_offsets/_qOffsets.rs | 24 + .../gdbstub/commands/section_offsets/mod.rs | 12 + reverie-ptrace/src/gdbstub/error.rs | 85 + reverie-ptrace/src/gdbstub/hex.rs | 401 ++ reverie-ptrace/src/gdbstub/inferior.rs | 129 + reverie-ptrace/src/gdbstub/logger.rs | 81 + reverie-ptrace/src/gdbstub/mod.rs | 35 + reverie-ptrace/src/gdbstub/packet.rs | 132 + reverie-ptrace/src/gdbstub/regs.rs | 557 +++ reverie-ptrace/src/gdbstub/request.rs | 32 + reverie-ptrace/src/gdbstub/response.rs | 263 ++ reverie-ptrace/src/gdbstub/server.rs | 265 ++ reverie-ptrace/src/gdbstub/session.rs | 850 ++++ reverie-ptrace/src/lib.rs | 65 + reverie-ptrace/src/perf.rs | 690 ++++ reverie-ptrace/src/stack.rs | 227 ++ reverie-ptrace/src/task.rs | 2146 +++++++++++ reverie-ptrace/src/testing.rs | 143 + reverie-ptrace/src/timer.rs | 624 +++ reverie-ptrace/src/trace/memory.rs | 360 ++ reverie-ptrace/src/trace/mod.rs | 1458 +++++++ reverie-ptrace/src/trace/notifier.rs | 362 ++ reverie-ptrace/src/trace/waitid.rs | 346 ++ reverie-ptrace/src/tracer.rs | 640 ++++ reverie-ptrace/src/vdso.rs | 216 ++ reverie-syscalls/Cargo.toml | 17 + reverie-syscalls/src/args/fcntl.rs | 77 + reverie-syscalls/src/args/ioctl.rs | 221 ++ reverie-syscalls/src/args/mod.rs | 525 +++ reverie-syscalls/src/args/poll.rs | 104 + reverie-syscalls/src/args/stat.rs | 155 + reverie-syscalls/src/args/time.rs | 97 + reverie-syscalls/src/display.rs | 200 + reverie-syscalls/src/lib.rs | 40 + reverie-syscalls/src/macros.rs | 821 ++++ reverie-syscalls/src/memory/addr.rs | 495 +++ reverie-syscalls/src/memory/local.rs | 88 + reverie-syscalls/src/memory/mod.rs | 342 ++ 
reverie-syscalls/src/raw.rs | 205 + reverie-syscalls/src/syscalls/family.rs | 133 + reverie-syscalls/src/syscalls/mod.rs | 3406 +++++++++++++++++ reverie-util/Cargo.toml | 20 + reverie-util/src/commandline.rs | 164 + reverie-util/src/lib.rs | 13 + reverie-util/src/pedigree.rs | 248 ++ reverie/Cargo.toml | 23 + reverie/src/auxv.rs | 119 + reverie/src/backtrace.rs | 153 + reverie/src/error.rs | 49 + reverie/src/guest.rs | 364 ++ reverie/src/lib.rs | 74 + reverie/src/rdtsc.rs | 56 + reverie/src/stack.rs | 45 + reverie/src/subscription.rs | 257 ++ reverie/src/timer.rs | 23 + reverie/src/tool.rs | 324 ++ rust-toolchain.toml | 3 + rustfmt.toml | 5 + tests/backtrace.rs | 69 + tests/basics.rs | 264 ++ tests/busywait.rs | 224 ++ tests/c_tests/cc_no_shlib.sh | 29 + tests/c_tests/clock-nanosleep.c | 30 + tests/c_tests/forkExec.c | 43 + tests/c_tests/forkMany-blockSigchld.c | 71 + tests/c_tests/forkMany.c | 67 + tests/c_tests/forkNoWait.c | 64 + tests/c_tests/getpid-pie.c | 42 + tests/c_tests/getpid.c | 23 + tests/c_tests/nanosleep.c | 58 + tests/c_tests/open-many.c | 32 + tests/c_tests/openat1.c | 53 + tests/c_tests/signal1.c | 92 + tests/c_tests/signal2.c | 64 + tests/c_tests/signal3.c | 81 + tests/c_tests/sigprocmask1.c | 71 + tests/c_tests/thread8-cond-wait.c | 77 + tests/c_tests/thread9-cond-bcast.c | 81 + tests/c_tests/threads1.c | 70 + tests/c_tests/threads2.c | 121 + tests/c_tests/threads3.c | 185 + tests/c_tests/threads4.c | 57 + tests/c_tests/threads5.c | 156 + tests/c_tests/threads6.c | 168 + tests/c_tests/threads_dual_exit.c | 53 + tests/c_tests/threads_exit_group.c | 80 + tests/c_tests/threads_exit_mixed.c | 148 + tests/c_tests/threads_group_exit_blocking.c | 73 + tests/c_tests/threads_group_exit_stress.c | 202 + tests/c_tests/vforkExec.c | 43 + tests/c_tests/write-many.c | 24 + tests/convert.rs | 113 + tests/cpuid.rs | 148 + tests/delay_signal.rs | 551 +++ tests/disabled/clobbered.S | 108 + tests/disabled/openat2.S | 29 + tests/disabled/segfault.c | 17 + 
tests/disabled/signal4.c | 68 + tests/disabled/threads7.c | 168 + tests/disabled/x64-save-return-address.c | 58 + tests/exit.rs | 85 + .../gdbserver-helper/src/client.rs | 84 + .../gdbserver-helper/src/main.rs | 272 ++ .../gdbserver-helper/src/server.rs | 78 + .../gdbserver-integration/test-src/forkExec.c | 43 + .../test-src/manyThreads.c | 74 + tests/gdbserver-integration/test-src/nested.c | 33 + .../gdbserver-integration/test-src/openat1.c | 53 + .../gdbserver-integration/test-src/threads1.c | 70 + .../gdbserver-integration/test-src/threads2.c | 58 + tests/parallelism.rs | 159 + tests/rdtsc.rs | 102 + tests/shell_tests/build-musl.sh | 36 + tests/signal.rs | 113 + tests/signalfd.rs | 127 + tests/spinlock.rs | 53 + tests/stack.rs | 198 + tests/standalone/README.md | 5 + tests/standalone/at_random.rs | 78 + tests/standalone/inject_then_tail_inject.rs | 162 + tests/standalone/parallel_tasks.rs | 106 + tests/stat.rs | 119 + tests/state.rs | 169 + tests/thread_start.rs | 51 + tests/timer_semantics.rs | 712 ++++ tests/vdso.rs | 88 + tests/vfork.rs | 178 + 226 files changed, 37281 insertions(+) create mode 100644 .github/workflows/ci.yml create mode 100644 .gitignore create mode 100644 CHANGELOG.md create mode 100644 CODE_OF_CONDUCT.md create mode 100644 CONTRIBUTING.md create mode 100644 Cargo.toml create mode 100644 LICENSE create mode 100644 README.md create mode 100644 assets/architecture-diagram.svg create mode 100644 reverie-examples/Cargo.toml create mode 100644 reverie-examples/README.md create mode 100644 reverie-examples/chaos.rs create mode 100644 reverie-examples/chrome-trace/event.rs create mode 100644 reverie-examples/chrome-trace/global_state.rs create mode 100644 reverie-examples/chrome-trace/main.rs create mode 100644 reverie-examples/chrome-trace/tool.rs create mode 100644 reverie-examples/chunky_print.rs create mode 100644 reverie-examples/counter1.rs create mode 100644 reverie-examples/counter2.rs create mode 100644 reverie-examples/debug.rs create 
mode 100644 reverie-examples/noop.rs create mode 100644 reverie-examples/pedigree.rs create mode 100644 reverie-examples/strace/config.rs create mode 100644 reverie-examples/strace/filter.rs create mode 100644 reverie-examples/strace/global_state.rs create mode 100644 reverie-examples/strace/main.rs create mode 100644 reverie-examples/strace/tool.rs create mode 100644 reverie-examples/strace_minimal.rs create mode 100644 reverie-process/Cargo.toml create mode 100644 reverie-process/src/builder.rs create mode 100644 reverie-process/src/child.rs create mode 100644 reverie-process/src/clone.rs create mode 100644 reverie-process/src/container.rs create mode 100644 reverie-process/src/env.rs create mode 100644 reverie-process/src/error.rs create mode 100644 reverie-process/src/exit_status.rs create mode 100644 reverie-process/src/fd.rs create mode 100644 reverie-process/src/id_map.rs create mode 100644 reverie-process/src/lib.rs create mode 100644 reverie-process/src/mount.rs create mode 100644 reverie-process/src/namespace.rs create mode 100644 reverie-process/src/net.rs create mode 100644 reverie-process/src/pid.rs create mode 100644 reverie-process/src/pty.rs create mode 100644 reverie-process/src/seccomp/bpf.rs create mode 100644 reverie-process/src/seccomp/mod.rs create mode 100644 reverie-process/src/spawn.rs create mode 100644 reverie-process/src/stdio.rs create mode 100644 reverie-process/src/util.rs create mode 100644 reverie-ptrace/Cargo.toml create mode 100644 reverie-ptrace/src/children.rs create mode 100644 reverie-ptrace/src/cp/consts.rs create mode 100644 reverie-ptrace/src/cp/mmap.rs create mode 100644 reverie-ptrace/src/cp/mod.rs create mode 100644 reverie-ptrace/src/debug.rs create mode 100644 reverie-ptrace/src/error.rs create mode 100644 reverie-ptrace/src/gdbstub/breakpoint.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_QStartNoAckMode.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_QThreadEvents.rs create mode 
100644 reverie-ptrace/src/gdbstub/commands/base/_QuestionMark.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_c.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_d_upper.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_g.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_g_upper.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_h_upper.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_k.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_m.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_m_upper.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_p.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_p_upper.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_qAttached.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_qC.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_qSupported.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_qXfer.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_qfThreadInfo.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_qsThreadInfo.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_s.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_t_upper.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_vCont.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_vFile.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_vKill.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_x_upper.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_z.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/_z_upper.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/base/mod.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/extended_mode/_ExclamationMark.rs create mode 100644 
reverie-ptrace/src/gdbstub/commands/extended_mode/_QDisableRandomization.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/extended_mode/_QEnvironmentHexEncoded.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/extended_mode/_QEnvironmentReset.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/extended_mode/_QEnvironmentUnset.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/extended_mode/_QSetWorkingDir.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/extended_mode/_QStartupWithShell.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/extended_mode/_r_upper.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/extended_mode/_vAttach.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/extended_mode/_vRun.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/extended_mode/mod.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/mod.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/monitor_cmd/_qRcmd.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/monitor_cmd/mod.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/section_offsets/_qOffsets.rs create mode 100644 reverie-ptrace/src/gdbstub/commands/section_offsets/mod.rs create mode 100644 reverie-ptrace/src/gdbstub/error.rs create mode 100644 reverie-ptrace/src/gdbstub/hex.rs create mode 100644 reverie-ptrace/src/gdbstub/inferior.rs create mode 100644 reverie-ptrace/src/gdbstub/logger.rs create mode 100644 reverie-ptrace/src/gdbstub/mod.rs create mode 100644 reverie-ptrace/src/gdbstub/packet.rs create mode 100644 reverie-ptrace/src/gdbstub/regs.rs create mode 100644 reverie-ptrace/src/gdbstub/request.rs create mode 100644 reverie-ptrace/src/gdbstub/response.rs create mode 100644 reverie-ptrace/src/gdbstub/server.rs create mode 100644 reverie-ptrace/src/gdbstub/session.rs create mode 100644 reverie-ptrace/src/lib.rs create mode 100644 reverie-ptrace/src/perf.rs create mode 100644 reverie-ptrace/src/stack.rs create mode 100644 
reverie-ptrace/src/task.rs create mode 100644 reverie-ptrace/src/testing.rs create mode 100644 reverie-ptrace/src/timer.rs create mode 100644 reverie-ptrace/src/trace/memory.rs create mode 100644 reverie-ptrace/src/trace/mod.rs create mode 100644 reverie-ptrace/src/trace/notifier.rs create mode 100644 reverie-ptrace/src/trace/waitid.rs create mode 100644 reverie-ptrace/src/tracer.rs create mode 100644 reverie-ptrace/src/vdso.rs create mode 100644 reverie-syscalls/Cargo.toml create mode 100644 reverie-syscalls/src/args/fcntl.rs create mode 100644 reverie-syscalls/src/args/ioctl.rs create mode 100644 reverie-syscalls/src/args/mod.rs create mode 100644 reverie-syscalls/src/args/poll.rs create mode 100644 reverie-syscalls/src/args/stat.rs create mode 100644 reverie-syscalls/src/args/time.rs create mode 100644 reverie-syscalls/src/display.rs create mode 100644 reverie-syscalls/src/lib.rs create mode 100644 reverie-syscalls/src/macros.rs create mode 100644 reverie-syscalls/src/memory/addr.rs create mode 100644 reverie-syscalls/src/memory/local.rs create mode 100644 reverie-syscalls/src/memory/mod.rs create mode 100644 reverie-syscalls/src/raw.rs create mode 100644 reverie-syscalls/src/syscalls/family.rs create mode 100644 reverie-syscalls/src/syscalls/mod.rs create mode 100644 reverie-util/Cargo.toml create mode 100644 reverie-util/src/commandline.rs create mode 100644 reverie-util/src/lib.rs create mode 100644 reverie-util/src/pedigree.rs create mode 100644 reverie/Cargo.toml create mode 100644 reverie/src/auxv.rs create mode 100644 reverie/src/backtrace.rs create mode 100644 reverie/src/error.rs create mode 100644 reverie/src/guest.rs create mode 100644 reverie/src/lib.rs create mode 100644 reverie/src/rdtsc.rs create mode 100644 reverie/src/stack.rs create mode 100644 reverie/src/subscription.rs create mode 100644 reverie/src/timer.rs create mode 100644 reverie/src/tool.rs create mode 100644 rust-toolchain.toml create mode 100644 rustfmt.toml create mode 100644 
tests/backtrace.rs create mode 100644 tests/basics.rs create mode 100644 tests/busywait.rs create mode 100755 tests/c_tests/cc_no_shlib.sh create mode 100644 tests/c_tests/clock-nanosleep.c create mode 100644 tests/c_tests/forkExec.c create mode 100644 tests/c_tests/forkMany-blockSigchld.c create mode 100644 tests/c_tests/forkMany.c create mode 100644 tests/c_tests/forkNoWait.c create mode 100644 tests/c_tests/getpid-pie.c create mode 100644 tests/c_tests/getpid.c create mode 100644 tests/c_tests/nanosleep.c create mode 100644 tests/c_tests/open-many.c create mode 100644 tests/c_tests/openat1.c create mode 100644 tests/c_tests/signal1.c create mode 100644 tests/c_tests/signal2.c create mode 100644 tests/c_tests/signal3.c create mode 100644 tests/c_tests/sigprocmask1.c create mode 100644 tests/c_tests/thread8-cond-wait.c create mode 100644 tests/c_tests/thread9-cond-bcast.c create mode 100644 tests/c_tests/threads1.c create mode 100644 tests/c_tests/threads2.c create mode 100644 tests/c_tests/threads3.c create mode 100644 tests/c_tests/threads4.c create mode 100644 tests/c_tests/threads5.c create mode 100644 tests/c_tests/threads6.c create mode 100644 tests/c_tests/threads_dual_exit.c create mode 100644 tests/c_tests/threads_exit_group.c create mode 100644 tests/c_tests/threads_exit_mixed.c create mode 100644 tests/c_tests/threads_group_exit_blocking.c create mode 100644 tests/c_tests/threads_group_exit_stress.c create mode 100644 tests/c_tests/vforkExec.c create mode 100644 tests/c_tests/write-many.c create mode 100644 tests/convert.rs create mode 100644 tests/cpuid.rs create mode 100644 tests/delay_signal.rs create mode 100644 tests/disabled/clobbered.S create mode 100644 tests/disabled/openat2.S create mode 100644 tests/disabled/segfault.c create mode 100644 tests/disabled/signal4.c create mode 100644 tests/disabled/threads7.c create mode 100644 tests/disabled/x64-save-return-address.c create mode 100644 tests/exit.rs create mode 100644 
tests/gdbserver-integration/gdbserver-helper/src/client.rs create mode 100644 tests/gdbserver-integration/gdbserver-helper/src/main.rs create mode 100644 tests/gdbserver-integration/gdbserver-helper/src/server.rs create mode 100644 tests/gdbserver-integration/test-src/forkExec.c create mode 100644 tests/gdbserver-integration/test-src/manyThreads.c create mode 100644 tests/gdbserver-integration/test-src/nested.c create mode 100644 tests/gdbserver-integration/test-src/openat1.c create mode 100644 tests/gdbserver-integration/test-src/threads1.c create mode 100644 tests/gdbserver-integration/test-src/threads2.c create mode 100644 tests/parallelism.rs create mode 100644 tests/rdtsc.rs create mode 100755 tests/shell_tests/build-musl.sh create mode 100644 tests/signal.rs create mode 100644 tests/signalfd.rs create mode 100644 tests/spinlock.rs create mode 100644 tests/stack.rs create mode 100644 tests/standalone/README.md create mode 100644 tests/standalone/at_random.rs create mode 100644 tests/standalone/inject_then_tail_inject.rs create mode 100644 tests/standalone/parallel_tasks.rs create mode 100644 tests/stat.rs create mode 100644 tests/state.rs create mode 100644 tests/thread_start.rs create mode 100644 tests/timer_semantics.rs create mode 100644 tests/vdso.rs create mode 100644 tests/vfork.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..3cf336d --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,91 @@ +name: ci + +on: + push: + pull_request: + +jobs: + check: + name: Check + runs-on: ubuntu-latest + steps: + - name: Install libunwind-dev + run: sudo apt-get install -y libunwind-dev + + - name: Checkout sources + uses: actions/checkout@v2 + + - name: Install nightly toolchain + uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: nightly + override: true + + - name: Run cargo check + uses: actions-rs/cargo@v1 + with: + command: check + + test: + name: Test Suite + runs-on: ubuntu-latest + 
steps: + - name: Install libunwind-dev + run: sudo apt-get install -y libunwind-dev + + - name: Checkout sources + uses: actions/checkout@v2 + + - name: Install nightly toolchain + uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: nightly + override: true + + - name: Run cargo test + uses: actions-rs/cargo@v1 + with: + command: test + args: -- --test-threads=1 + +## Currently disabled because internal version of rustfmt produces different +## formatting. +# rustfmt: +# name: Check format +# runs-on: ubuntu-latest +# steps: +# - name: Checkout sources +# uses: actions/checkout@v2 +# +# - name: Install nightly toolchain +# uses: actions-rs/toolchain@v1 +# with: +# profile: minimal +# toolchain: nightly +# override: true +# components: rustfmt +# +# - name: Run cargo fmt +# uses: actions-rs/cargo@v1 +# with: +# command: fmt +# args: --all -- --check + + clippy: + name: Clippy + runs-on: ubuntu-latest + steps: + - name: Install libunwind-dev + run: sudo apt-get install -y libunwind-dev + + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + toolchain: nightly + components: clippy + override: true + - uses: actions-rs/clippy-check@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2c96eb1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +target/ +Cargo.lock diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..9ee178f --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,5 @@ +# Reverie + +## 0.1.0 (December 1, 2021) + + - Initial release diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..83f431e --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,80 @@ +# Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to make participation in our project and +our community a harassment-free experience for everyone, regardless of age, 
body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or +advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic +address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a +professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all project spaces, and it also applies when +an individual is representing the project or its community in public spaces. 
+Examples of representing a project or community include using an official +project e-mail address, posting via an official social media account, or acting +as an appointed representative at an online or offline event. Representation of +a project may be further defined and clarified by project maintainers. + +This Code of Conduct also applies outside the project spaces when there is a +reasonable belief that an individual's behavior may have a negative impact on +the project or its community. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at . All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..854d94a --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,46 @@ +# Contributing to Reverie + +We want to make contributing to this project as easy and transparent as +possible. + +## Our Development Process + +Reverie is currently developed in Meta's internal repositories and then +exported out to GitHub by a Meta team member; however, we invite you to +submit pull requests as described below. 
+ +## Pull Requests + +We actively welcome your pull requests. + +1. Fork the repo and create your branch from `main`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. If you haven't already, complete the Contributor License Agreement ("CLA"). + +## Contributor License Agreement ("CLA") + +In order to accept your pull request, we need you to submit a CLA. You only +need to do this once to work on any of Meta's open source projects. + +Complete your CLA here: + +## Issues + +We use GitHub issues to track public bugs. Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. + +## Coding Style + +Follow the automatic `rustfmt` configuration. + +## License + +By contributing to Reverie, you agree that your contributions will be +licensed under the LICENSE file in the root directory of this source tree. diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..182e189 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,9 @@ +[workspace] +members = [ + "reverie", + "reverie-examples", + "reverie-process", + "reverie-ptrace", + "reverie-syscalls", + "reverie-util", +] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..36fe71e --- /dev/null +++ b/LICENSE @@ -0,0 +1,31 @@ +Copyright notices are included in each source file. For other files, +the below copyright applies: + + Copyright (c) 2018-2019, Trustees of Indiana University + ("University Works" via Baojun Wang) + Copyright (c) 2018-2019, Ryan Newton + ("Traditional Works of Scholarship") + Copyright (c) 2020-, Facebook, Inc. and its affiliates.
+ +BSD 2-Clause License + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..807eb12 --- /dev/null +++ b/README.md @@ -0,0 +1,110 @@ +# Reverie + +Reverie is a user space system-call interception framework for x86-64 Linux. +It can be used to intercept, modify, or elide a syscall before the kernel +executes it. In essence, Reverie sits at the boundary between user space and +kernel space. + +Some potential use cases include: + +* Observability tools, like `strace`. +* Failure injection to test error handling logic. +* Manipulating scheduling decisions to expose concurrency bugs. + +See the [`reverie-examples`](reverie-examples) directory for examples of +tools that can be built with this library. 
+ +## Features + + * Ergonomic syscall handling. It is easy to modify syscall arguments or return + values, inject multiple syscalls, or suppress the syscall entirely. + * Async-await usage allows blocking syscalls to be handled without blocking + other guest threads. + * Can intercept CPUID and RDTSC instructions. + * Typed syscalls. Every syscall has a wrapper to make it easier to access + pointer values. This also enables strace-like pretty-printing for free. + * Avoid intercepting syscalls we don't care about. For example, if we only care + about `sys_open`, we can avoid paying the cost of intercepting other + syscalls. + * Can act as a GDB server. This allows connection via the GDB client where you + can step through the process that is being traced by Reverie. + +## Terminology and Background + +Clients of the Reverie library write ***tools***. A tool runs a shell command +creating a ***guest*** process tree, comprised of multiple guest threads and +processes, in an instrumented manner. Each Reverie tool is written as a set +of callbacks (i.e. ***handlers***), which are invoked each time a guest +thread encounters a trappable event such as a system call or inbound signal. +The tool can stipulate exactly which event streams it ***subscribes*** to. +The tool itself is stateful, maintaining state between consecutive +invocations. + +## Usage + +Currently, there is only the `reverie-ptrace` backend which uses `ptrace` to +intercept syscalls. Copy one of the example tools to a new Rust project (e.g. +`cargo init`). You’ll see that it depends both on the general `reverie` crate +for the API and on the specific backend implementation crate, +`reverie_ptrace`. + +## Performance + +Since `ptrace` adds significant overhead when the guest has a syscall-heavy +workload, Reverie will add similarly-significant overhead. The slowdown depends +on how many syscalls are being performed and are intercepted by the tool.
+ +The primary way you can improve performance with the current implementation is +to implement the `subscriptions` callback, specifying a minimal set of syscalls +that are actually required by your tool. + +## Overall architecture + +When implementing a Reverie tool, there are three main components of the tool to +consider: + +* The process-level state, +* the thread-level state, and +* the global state (which is shared among all processes and threads in the + traced process tree). + +This separation of process-, thread-, and global-state is meant to provide an +abstraction that allows future Reverie backends to be used without requiring the +tool to be rewritten. + +![architecture](./assets/architecture-diagram.svg "Architecture Diagram") + +### Process State + +Whenever a new process is spawned (i.e., when `fork` or `clone` is called by the +guest), a new instance of the process state struct is created and managed by the +Reverie backend. + +### Thread State + +When a syscall is intercepted, it is always associated with the thread that +called it. + +### Global State + +The global state is accessed via RPC messages. Since a future Reverie backend +may use in-guest syscall interception, the syscall handler code may not be +running in the same address space. Thus, all shared state is communicated via +RPC messages. (There is, however, currently only a single ptrace-based backend +where all tracer code is in the same address space.) + +## Future Plans + + * Add a more performant backend. The rough goal is to have handlers executing in + the guest with close to regular function call overhead. Global state and its + methods will still be centralized, but the RPC/IPC mechanism between guest & + the centralized tool process will become much more efficient. + +## Contributing + +Contributions are welcome! Please see the [CONTRIBUTING.md](CONTRIBUTING.md) +file for guidance. + +## License + +Reverie is BSD 2-Clause licensed as found in the [LICENSE](LICENSE) file. 
diff --git a/assets/architecture-diagram.svg b/assets/architecture-diagram.svg new file mode 100644 index 0000000..d527ded --- /dev/null +++ b/assets/architecture-diagram.svg @@ -0,0 +1,4 @@ + + + +
Global State
Global State
Request
Request
Response
Response
Process
Process
Thread Group Leader
Thread Group L...
Thread 1
Thread 1
Thread 2
Thread 2
Thread 3
Thread 3
Child Process
Child Process
Thread Group Leader
Thread Group L...
Process/Thread Tree
Process/Thread Tree
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/reverie-examples/Cargo.toml b/reverie-examples/Cargo.toml new file mode 100644 index 0000000..48fd1d7 --- /dev/null +++ b/reverie-examples/Cargo.toml @@ -0,0 +1,61 @@ +# @generated by autocargo + +[package] +name = "reverie-examples" +version = "0.1.0" +authors = ["Facebook"] +edition = "2021" +license = "BSD-2-Clause" +publish = false + +[[bin]] +name = "chaos" +path = "chaos.rs" + +[[bin]] +name = "chrome_trace" +path = "chrome-trace/main.rs" + +[[bin]] +name = "chunky_print" +path = "chunky_print.rs" + +[[bin]] +name = "counter1" +path = "counter1.rs" + +[[bin]] +name = "counter2" +path = "counter2.rs" + +[[bin]] +name = "debug" +path = "debug.rs" + +[[bin]] +name = "noop" +path = "noop.rs" + +[[bin]] +name = "pedigree" +path = "pedigree.rs" + +[[bin]] +name = "strace" +path = "strace/main.rs" + +[[bin]] +name = "strace_minimal" +path = "strace_minimal.rs" + +[dependencies] +anyhow = "1.0.51" +nix = "0.22" +reverie = { version = "0.1.0", path = "../reverie" } +reverie-ptrace = { version = "0.1.0", path = "../reverie-ptrace" } +reverie-util = { version = "0.1.0", path = "../reverie-util" } +serde = { version = "1.0.126", features = ["derive", "rc"] } +serde_json = { version = "1.0.64", features = ["float_roundtrip", "unbounded_depth"] } +structopt = "0.3.23" +tokio = { version = "1.10", features = ["full", "test-util", "tracing"] } +tracing = "0.1.29" diff --git a/reverie-examples/README.md b/reverie-examples/README.md new file mode 100644 index 0000000..a2b418e --- /dev/null +++ b/reverie-examples/README.md @@ -0,0 +1,63 @@ +# Examples + +Example tools built on top of Reverie. + +Copying one of these examples is the recommended way to get started using +Reverie. + +# chrome-trace: Generates a chrome trace file + +This tool is like `strace`, but generates a trace file that can be loaded in +`chrome://tracing/`. + +# counter1: Reverie Counter Tool (1) + +This is a basic example of event counting. 
It counts the number of system +calls and reports that single integer at exit. + +This version of the tool uses a single, centralized piece of global state. + +# counter2: Reverie Counter Tool (2) + +This is a basic example of event counting. This tool counts the number of +system calls and reports that single integer at exit. + +This implementation of the tool uses a *distributed* notion of state, +maintaining a per-thread, per-process, and global state. Basically, this is +an example of "MapReduce" style tracing of a process tree. + +# noop: Identity Function Tool + +This instrumentation tool intercepts events but does nothing with them. It is +useful for observing the overhead of interception, and as a starting point. + +# chunky_print: Print-gating Tool + +This example tool intercepts write events on stdout and stderr and +manipulates either when those outputs are released, or the scheduling order +that determines the order of printed output. + +# pedigree: Deterministic virtual process IDs + +This tool monitors the spawning of new processes and maps each new PID to a +deterministic virtual PID. The new virtual PID is reported after each +process-spawning syscall. + +This tool is a work-in-progress and is not yet functioning. + +`pedigree.rs` is an implementation of pedigree / virtual PID generation using local state. +`virtual_process_tree.rs` is an implementation which uses global state. + +# strace: Reverie Echo Tool + +This instrumentation tool simply echoes intercepted events, like strace. + +# chaos: Chaos Tool + +This tool is meant to emulate a pathological kernel where: + + 1. `read` and `recvfrom` calls return only one byte at a time. This is + intended to catch errors in parsers that assume multiple bytes will be + returned at a time. + 2. `EINTR` is returned instead of running the real syscall for every other + read. 
diff --git a/reverie-examples/chaos.rs b/reverie-examples/chaos.rs new file mode 100644 index 0000000..41d6c0a --- /dev/null +++ b/reverie-examples/chaos.rs @@ -0,0 +1,179 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +use serde::{Deserialize, Serialize}; +use std::sync::atomic::{AtomicU64, Ordering}; +use structopt::StructOpt; + +use reverie::{ + syscalls::{Displayable, Errno, Syscall}, + Error, GlobalTool, Guest, Pid, Tool, +}; +use reverie_util::CommonToolArguments; + +/// A tool to inject "chaos" into a running process. A pathological +/// kernel is simulated by forcing reads to only return one byte at a time. +#[derive(Debug, StructOpt)] +struct Args { + #[structopt(flatten)] + common_opts: CommonToolArguments, + + #[structopt(flatten)] + chaos_opts: ChaosOpts, +} + +#[derive(StructOpt, Debug, Serialize, Deserialize, Clone, Default)] +struct ChaosOpts { + /// Skips the first N syscalls of a process before doing any intervention. + /// This is useful when you need to skip past an error caused by the tool. + #[structopt(long, value_name = "N", default_value = "0")] + skip: u64, + + /// If set, does not intercept `read`-like system calls and modify them. + #[structopt(long)] + no_read: bool, + + /// If set, does not intercept `recv`-like system calls and modify them. + #[structopt(long)] + no_recv: bool, + + /// If set, does not inject random `EINTR` errors. 
+ #[structopt(long)] + no_interrupt: bool, +} + +#[derive(Debug, Serialize, Deserialize, Default)] +struct ChaosTool { + count: AtomicU64, +} + +impl Clone for ChaosTool { + fn clone(&self) -> Self { + ChaosTool { + count: AtomicU64::new(self.count.load(Ordering::SeqCst)), + } + } +} + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct ChaosToolGlobal {} + +#[reverie::global_tool] +impl GlobalTool for ChaosToolGlobal { + type Config = ChaosOpts; + + async fn receive_rpc(&self, _from: Pid, _request: ()) {} +} + +#[reverie::tool] +impl Tool for ChaosTool { + type ThreadState = bool; + type GlobalState = ChaosToolGlobal; + + fn new(_pid: Pid, _cfg: &ChaosOpts) -> Self { + Self { + count: AtomicU64::new(0), + } + } + + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + let count = self.count.fetch_add(1, Ordering::SeqCst); + + let config = guest.config().clone(); + let memory = guest.memory(); + + // This provides a way to wait until the dynamic linker has done its job + // before we start trying to create chaos. glibc's dynamic linker has a + // bug where it doesn't retry `read` calls that don't return the + // expected amount of data. + if count < config.skip { + eprintln!( + "SKIPPED [pid={}, n={}] {}", + guest.pid(), + count, + syscall.display(&memory), + ); + + return guest.tail_inject(syscall).await; + } + + // Transform the syscall arguments. + let syscall = match syscall { + Syscall::Read(read) => { + if !config.no_interrupt && !*guest.thread_state() { + // Return an EINTR instead of running the syscall. + // Programs should always retry the read in this case. + *guest.thread_state_mut() = true; + + // XXX: inject a signal like SIGINT? 
+ let ret = Err(Errno::ERESTARTSYS); + + eprintln!( + "[pid={}, n={}] {} = {}", + guest.pid(), + count, + syscall.display(&memory), + ret.unwrap_or_else(|errno| -errno.into_raw() as i64) + ); + + return Ok(ret?); + } else if !config.no_read { + // Reduce read length to 1 byte at most. + Syscall::Read(read.with_len(1.min(read.len()))) + } else { + // Return syscall unmodified. + Syscall::Read(read) + } + } + Syscall::Recvfrom(recv) if !config.no_recv => { + // Reduce recv length to 1 byte at most. + Syscall::Recvfrom(recv.with_len(1.min(recv.len()))) + } + x => { + eprintln!( + "[pid={}, n={}] {}", + guest.pid(), + count, + syscall.display(&memory), + ); + return guest.tail_inject(x).await; + } + }; + + *guest.thread_state_mut() = false; + + let ret = guest.inject(syscall).await; + + eprintln!( + "[pid={}, n={}] {} = {}", + guest.pid(), + count, + syscall.display_with_outputs(&memory), + ret.unwrap_or_else(|errno| -errno.into_raw() as i64) + ); + + Ok(ret?) + } +} + +#[tokio::main] +async fn main() -> Result<(), Error> { + let args = Args::from_args(); + let log_guard = args.common_opts.init_tracing(); + let tracer = reverie_ptrace::TracerBuilder::::new(args.common_opts.into()) + .config(args.chaos_opts) + .spawn() + .await?; + let (status, _) = tracer.wait().await?; + drop(log_guard); // Flush logs before exiting. + status.raise_or_exit() +} diff --git a/reverie-examples/chrome-trace/event.rs b/reverie-examples/chrome-trace/event.rs new file mode 100644 index 0000000..7915f78 --- /dev/null +++ b/reverie-examples/chrome-trace/event.rs @@ -0,0 +1,169 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +use std::path::PathBuf; +use std::time::SystemTime; + +use reverie::syscalls::Sysno; +use reverie::Errno; +use reverie::ExitStatus; +use reverie::Pid; +use reverie::Tid; +use serde::{Deserialize, Serialize}; +use serde_json::json; + +/// A message sent to the global state whenever a thread shuts down. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ThreadExit { + /// Process ID. + pub pid: Pid, + + /// Thread ID. + pub tid: Tid, + + /// The start time of the thread. + pub start: SystemTime, + + /// The end time of the thread. + pub end: SystemTime, + + /// The series of events from this thread. + pub events: Vec, + + /// The final exit status of this thread. + pub exit_status: ExitStatus, +} + +// TODO: Handle signal, rdtsc, and cpuid events. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum Event { + /// A syscall event. Happens whenever a syscall happens. + Syscall { + /// The time at which the syscall started. + start: SystemTime, + + /// The time at which the syscall completed. + end: SystemTime, + + /// The syscall number. + sysno: Sysno, + + /// The formatted syscall with all of its arguments. + pretty: String, + + /// The result of the syscall. + result: Result, + }, + + /// A successful execve event. + Exec { + /// The time at which the execve syscall was executed. + timestamp: SystemTime, + + /// The program being executed. + program: Program, + }, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Program { + /// The path to the program. + pub name: PathBuf, + + /// The program arguments. + pub args: Vec, +} + +impl Program { + pub fn new(name: PathBuf, args: Vec) -> Self { + Self { name, args } + } +} + +impl ThreadExit { + pub fn trace_event(&self, epoch: SystemTime, events: &mut Vec) { + let thread_name = format!("TID {}", self.tid); + + // Record the thread/process start. 
+ { + let ts = self.start.duration_since(epoch).unwrap().as_micros() as u64; + + events.push(json!({ + "name": thread_name, + "cat": "process", + "ph": "B", + "ts": ts, + "pid": self.pid, + "tid": self.tid, + })); + } + + for event in &self.events { + match event { + Event::Syscall { + start, + end, + sysno, + pretty, + result, + } => { + let ts = start.duration_since(epoch).unwrap().as_micros() as u64; + let duration = end.duration_since(*start).unwrap().as_micros() as u64; + + events.push(json!({ + "name": sysno.to_string(), + "cat": "syscall", + "ph": "X", + "ts": ts, + "dur": duration, + "pid": self.pid, + "tid": self.tid, + "args": { + "pretty": pretty, + "result": format!("{:?}", result), + }, + })); + } + Event::Exec { timestamp, program } => { + let ts = timestamp.duration_since(epoch).unwrap().as_micros() as u64; + + // FIXME: This shouldn't be an "instant" event. We should be + // able to determine the duration of the execve call. + events.push(json!({ + "name": "execve", + "cat": "syscall", + "ph": "i", + "ts": ts, + "pid": self.pid, + "tid": self.tid, + "args": { + "program": program, + } + })); + } + } + } + + // Record the thread/process exit. + { + let ts = self.end.duration_since(epoch).unwrap().as_micros() as u64; + + events.push(json!({ + "name": thread_name, + "cat": "process", + "ph": "E", + "ts": ts, + "pid": self.pid, + "tid": self.tid, + "args": { + "exit_status": self.exit_status, + } + })); + } + } +} diff --git a/reverie-examples/chrome-trace/global_state.rs b/reverie-examples/chrome-trace/global_state.rs new file mode 100644 index 0000000..f566473 --- /dev/null +++ b/reverie-examples/chrome-trace/global_state.rs @@ -0,0 +1,71 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +use reverie::GlobalTool; +use reverie::Pid; + +use serde::Deserialize; +use serde::Serialize; + +use crate::event::ThreadExit; + +use std::io; +use std::path::PathBuf; +use std::sync::Mutex; +use std::time::SystemTime; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Program { + /// The path to the program. + name: PathBuf, + + /// The program arguments. + args: Vec, +} + +#[derive(Debug)] +pub struct GlobalState { + epoch: SystemTime, + events: Mutex>, +} + +impl Default for GlobalState { + fn default() -> Self { + Self { + epoch: SystemTime::now(), + events: Default::default(), + } + } +} + +#[reverie::global_tool] +impl GlobalTool for GlobalState { + type Request = ThreadExit; + type Response = (); + + async fn receive_rpc(&self, _pid: Pid, event: ThreadExit) { + let mut events = self.events.lock().unwrap(); + events.push(event); + } +} + +impl GlobalState { + /// Writes out a chrome trace file to the given writer. + pub fn chrome_trace(&self, writer: &mut W) -> serde_json::Result<()> { + let events = self.events.lock().unwrap(); + let mut json: Vec = Vec::new(); + + for event in events.iter() { + event.trace_event(self.epoch, &mut json); + } + + let json = serde_json::Value::Array(json); + + serde_json::to_writer(writer, &json) + } +} diff --git a/reverie-examples/chrome-trace/main.rs b/reverie-examples/chrome-trace/main.rs new file mode 100644 index 0000000..df681d0 --- /dev/null +++ b/reverie-examples/chrome-trace/main.rs @@ -0,0 +1,61 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Runs a process, gathering metadata about all of the processes that were ran +//! and displays it as a tree using Graphviz. 
+ +mod event; +mod global_state; +mod tool; + +use tool::ChromeTrace; + +use structopt::StructOpt; + +use anyhow::Context; +use reverie::Error; +use reverie_util::CommonToolArguments; + +use std::fs; +use std::io; +use std::path::PathBuf; + +/// A tool to render a summary of the process tree. +#[derive(Debug, StructOpt)] +struct Args { + #[structopt(flatten)] + common: CommonToolArguments, + + /// The path to write out Chrome trace file. This can be loaded with + /// `chrome://tracing`. + #[structopt(long)] + out: Option, +} + +#[tokio::main] +async fn main() -> Result<(), Error> { + let args = Args::from_args(); + + let log_guard = args.common.init_tracing(); + let tracer = reverie_ptrace::TracerBuilder::::new(args.common.into()) + .spawn() + .await?; + let (status, global_state) = tracer.wait().await?; + + if let Some(path) = args.out { + let mut f = io::BufWriter::new(fs::File::create(path)?); + global_state + .chrome_trace(&mut f) + .context("failed to generate Chrome trace")?; + } + + // Flush logs before exiting. + drop(log_guard); + status.raise_or_exit() +} diff --git a/reverie-examples/chrome-trace/tool.rs b/reverie-examples/chrome-trace/tool.rs new file mode 100644 index 0000000..301879d --- /dev/null +++ b/reverie-examples/chrome-trace/tool.rs @@ -0,0 +1,154 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +use crate::event::Event; +use crate::event::Program; +use crate::event::ThreadExit; +use crate::global_state::GlobalState; + +use reverie::syscalls::SyscallInfo; +use reverie::{ + syscalls::{Displayable, Syscall}, + Errno, Error, ExitStatus, GlobalRPC, GlobalTool, Guest, Pid, Subscription, Tid, Tool, +}; +use serde::{Deserialize, Serialize}; + +use std::borrow::Cow; +use std::fs; +use std::str; +use std::time::SystemTime; + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ChromeTrace(Pid); + +impl Default for ChromeTrace { + fn default() -> Self { + unreachable!("never used") + } +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ThreadState { + /// Time stamp when this thread was spawned. + start: SystemTime, + + /// The events that have occurred on this thread. These will be sent to the + /// global state upon thread exit. + events: Vec, +} + +impl Default for ThreadState { + fn default() -> Self { + Self { + start: SystemTime::now(), + events: Vec::new(), + } + } +} + +impl ThreadState { + pub fn push(&mut self, event: Event) { + self.events.push(event) + } +} + +#[reverie::tool] +impl Tool for ChromeTrace { + type GlobalState = GlobalState; + type ThreadState = ThreadState; + + fn new(pid: Pid, _cfg: &::Config) -> Self { + Self(pid) + } + + fn subscriptions(_cfg: &::Config) -> Subscription { + Subscription::all_syscalls() + } + + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + match syscall { + Syscall::Exit(_) | Syscall::ExitGroup(_) => { + // TODO: Record exits + guest.tail_inject(syscall).await + } + Syscall::Execve(_) | Syscall::Execveat(_) => { + // TODO: Record failed execs + guest.tail_inject(syscall).await + } + _ => { + let start = SystemTime::now(); + + let result = guest.inject(syscall).await; + + let end = SystemTime::now(); + + let sysno = syscall.number(); + let pretty = syscall.display_with_outputs(&guest.memory()).to_string(); + + 
guest.thread_state_mut().push(Event::Syscall { + start, + end, + sysno, + pretty, + result, + }); + + Ok(result?) + } + } + } + + async fn handle_post_exec>(&self, guest: &mut T) -> Result<(), Errno> { + let program = fs::read_link(format!("/proc/{}/exe", guest.pid())).unwrap(); + + let mut cmdline = fs::read(format!("/proc/{}/cmdline", guest.pid())).unwrap(); + + // Shave off the extra NUL terminator at the end so we don't end up with + // an empty arg at the end. + assert_eq!(cmdline.pop(), Some(b'\0')); + + let args: Vec<_> = cmdline + .split(|byte| *byte == 0) + .map(String::from_utf8_lossy) + .map(Cow::into_owned) + .collect(); + + guest.thread_state_mut().push(Event::Exec { + timestamp: SystemTime::now(), + program: Program::new(program, args), + }); + + Ok(()) + } + + async fn on_exit_thread>( + &self, + tid: Tid, + global_state: &G, + thread_state: Self::ThreadState, + exit_status: ExitStatus, + ) -> Result<(), Error> { + global_state + .send_rpc(ThreadExit { + pid: self.0, + tid, + start: thread_state.start, + end: SystemTime::now(), + events: thread_state.events, + exit_status, + }) + .await?; + + Ok(()) + } +} diff --git a/reverie-examples/chunky_print.rs b/reverie-examples/chunky_print.rs new file mode 100644 index 0000000..48b7990 --- /dev/null +++ b/reverie-examples/chunky_print.rs @@ -0,0 +1,258 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +use reverie::{ + syscalls::{Addr, MemoryAccess, Syscall}, + Error, GlobalTool, Guest, Tid, Tool, +}; +use reverie_util::CommonToolArguments; +use serde::{Deserialize, Serialize}; +use std::{ + collections::HashMap, + fmt::Write, + io, + sync::{ + atomic::{AtomicBool, Ordering}, + Mutex, + }, + vec::Vec, +}; +use structopt::StructOpt; +use tracing::{debug, info, trace}; + +/// This tool will chunk together printed output from each thread, over fixed time intervals. + +/// How many system calls (in each thread) define an epoch? +const EPOCH: u64 = 10; + +#[derive(PartialEq, Debug, Eq, Hash, Clone, Serialize, Deserialize, Copy)] +pub enum Which { + Stderr, + Stdout, +} + +/// Send individual print attempts (write calls) to the global object: +#[derive(PartialEq, Debug, Eq, Hash, Clone, Serialize, Deserialize)] +pub enum Msg { + /// Route a print over to the tracer to issue. + Print(Which, Vec), + /// Tick the logical clock. + Tick, + /// Print all buffered messages, cutting off the epoch early + Flush, +} + +type LogicalTime = u64; + +#[derive(Debug, Default)] +struct ChunkyPrintGlobal(Mutex); + +#[derive(Debug, Default)] +struct Inner { + times: HashMap, + printbuf: HashMap)>>, + epoch_num: u64, +} + +#[reverie::global_tool] +impl GlobalTool for ChunkyPrintGlobal { + type Request = Msg; + type Response = (); + async fn receive_rpc(&self, from: Tid, m: Msg) { + let mut mg = self.0.lock().unwrap(); + match m { + Msg::Print(w, s) => { + let v = mg.printbuf.entry(from).or_insert_with(Vec::new); + v.push((w, s)); + } + Msg::Tick => { + let ticks = mg.times.entry(from).or_insert(0); + *ticks += 1; + mg.check_epoch(); + } + Msg::Flush => { + let _ = mg.flush_messages(); + } + } + } +} + +impl Inner { + /// Check if the epoch has expired and flush the buffer. 
+ fn check_epoch(&mut self) { + if self.times.iter().all(|(_p, t)| (*t > EPOCH)) { + let _ = self.flush_messages(); + self.times.iter_mut().for_each(|(_, t)| *t -= EPOCH); + self.epoch_num += 1; + } + } + + fn flush_messages(&mut self) -> io::Result<()> { + let non_empty = self + .printbuf + .iter() + .fold(0, |acc, (_, v)| if v.is_empty() { acc } else { acc + 1 }); + if non_empty > 1 { + let mut strbuf = String::new(); + for (tid, v) in self.printbuf.iter() { + let _ = write!(&mut strbuf, "tid {}:{{", tid); + let mut iter = v.iter(); + if let Some((_, b)) = iter.next() { + let _ = write!(&mut strbuf, "{}", b.len()); + for (_, b) in iter { + let _ = write!(&mut strbuf, ", {}", b.len()); + } + } + let _ = write!(&mut strbuf, "}} "); + } + info!( + " [chunky_print] {} threads concurrent output in epoch {}, sizes: {}", + non_empty, self.epoch_num, strbuf + ); + } else { + debug!( + " [chunky_print] output from {} thread(s) in epoch {}: {} bytes", + non_empty, + self.epoch_num, + self.printbuf + .iter() + .fold(0, |acc, (_, v)| v.iter().fold(acc, |a, (_, b)| a + b.len())) + ); + } + for (tid, v) in self.printbuf.iter_mut() { + for (w, b) in v.iter() { + match w { + Which::Stdout => { + trace!( + " [chunky_print] writing {} bytes to stdout from tid {}", + b.len(), + tid + ); + io::Write::write_all(&mut io::stdout(), b)?; + } + Which::Stderr => { + trace!( + " [chunky_print] writing {} bytes to stderr from tid {}", + b.len(), + tid + ); + io::Write::write_all(&mut io::stderr(), b)?; + } + } + } + v.clear(); + } + io::Write::flush(&mut io::stdout())?; + io::Write::flush(&mut io::stderr())?; + Ok(()) + } +} + +#[derive(Debug, Serialize, Deserialize, Default)] +struct ChunkyPrintLocal { + stdout_disconnected: AtomicBool, + stderr_disconnected: AtomicBool, +} + +impl Clone for ChunkyPrintLocal { + fn clone(&self) -> Self { + ChunkyPrintLocal { + stdout_disconnected: AtomicBool::new(self.stdout_disconnected.load(Ordering::SeqCst)), + stderr_disconnected: 
AtomicBool::new(self.stderr_disconnected.load(Ordering::SeqCst)), + } + } +} + +fn read_tracee_memory>( + guest: &T, + addr: Addr, + len: usize, +) -> Result, Error> { + let mut buf = vec![0; len]; + guest.memory().read_exact(addr, &mut buf)?; + Ok(buf) +} + +#[reverie::tool] +impl Tool for ChunkyPrintLocal { + type GlobalState = ChunkyPrintGlobal; + type ThreadState = (); + + async fn handle_syscall_event>( + &self, + guest: &mut T, + call: Syscall, + ) -> Result { + let _ = guest.send_rpc(Msg::Tick).await; + match call { + // Here we make some attempt to catch redirections: + Syscall::Dup2(d) => { + let newfd = d.newfd(); + if newfd == 1 { + self.stdout_disconnected.store(true, Ordering::SeqCst); + } + if newfd == 2 { + self.stderr_disconnected.store(true, Ordering::SeqCst); + } + + guest.tail_inject(call).await + } + Syscall::Write(w) => { + match w.fd() { + 1 | 2 => { + let which = if w.fd() == 1 { + if self.stdout_disconnected.load(Ordering::SeqCst) { + debug!( + " [chunky_print] letting through write on redirected stdout, {} bytes.", + w.len() + ); + return guest.tail_inject(call).await; + } + Which::Stdout + } else { + if self.stderr_disconnected.load(Ordering::SeqCst) { + debug!( + " [chunky_print] letting through write on redirected stderr, {} bytes.", + w.len() + ); + return guest.tail_inject(call).await; + } + Which::Stderr + }; + + let buf = read_tracee_memory(guest, w.buf().unwrap(), w.len())?; + let _ = guest.send_rpc(Msg::Print(which, buf)).await; + info!( + " [chunky_print] suppressed write of {} bytes to fd {}", + w.len(), + w.fd() + ); + // Suppress the original system call: + Ok(w.len() as i64) + } + _ => guest.tail_inject(call).await, + } + } + _ => guest.tail_inject(call).await, + } + } +} + +#[tokio::main] +async fn main() -> Result<(), Error> { + let args = CommonToolArguments::from_args(); + let log_guard = args.init_tracing(); + let tracer = reverie_ptrace::TracerBuilder::::new(args.into()) + .spawn() + .await?; + let (status, global_state) 
= tracer.wait().await?; + trace!(" [chunky_print] global exit, flushing last messages."); + let _ = global_state.0.lock().unwrap().flush_messages(); + drop(log_guard); // Flush logs before exiting. + status.raise_or_exit() +} diff --git a/reverie-examples/counter1.rs b/reverie-examples/counter1.rs new file mode 100644 index 0000000..d19abba --- /dev/null +++ b/reverie-examples/counter1.rs @@ -0,0 +1,78 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! An example that counts system calls using a simple, global state. + +use reverie::{ + syscalls::{Syscall, SyscallInfo, Sysno}, + Error, GlobalTool, Guest, Pid, Tool, +}; +use reverie_util::CommonToolArguments; +use serde::{Deserialize, Serialize}; +use std::sync::atomic::{AtomicU64, Ordering}; +use structopt::StructOpt; + +#[derive(Debug, Serialize, Deserialize, Default)] +struct CounterGlobal { + num_syscalls: AtomicU64, +} + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct CounterLocal {} + +/// The message sent to the global state method. +/// This contains the syscall number. 
+#[derive(PartialEq, Debug, Eq, Clone, Copy, Serialize, Deserialize)] +pub struct IncrMsg(Sysno); + +#[reverie::global_tool] +impl GlobalTool for CounterGlobal { + type Request = IncrMsg; + type Response = (); + async fn init_global_state(_: &Self::Config) -> Self { + CounterGlobal { + num_syscalls: AtomicU64::new(0), + } + } + async fn receive_rpc(&self, _from: Pid, IncrMsg(sysno): IncrMsg) -> Self::Response { + AtomicU64::fetch_add(&self.num_syscalls, 1, Ordering::SeqCst); + tracing::info!("count at syscall ({:?}): {:?}", sysno, self.num_syscalls); + } +} + +#[reverie::tool] +impl Tool for CounterLocal { + type GlobalState = CounterGlobal; + + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + let sysno = syscall.number(); + let _ = guest.send_rpc(IncrMsg(sysno)).await?; + guest.tail_inject(syscall).await + } +} + +#[tokio::main] +async fn main() -> Result<(), Error> { + let args = CommonToolArguments::from_args(); + let log_guard = args.init_tracing(); + let tracer = reverie_ptrace::TracerBuilder::::new(args.into()) + .spawn() + .await?; + let (status, global_state) = tracer.wait().await?; + eprintln!( + " [counter tool] Total system calls in process tree: {}", + AtomicU64::load(&global_state.num_syscalls, Ordering::SeqCst) + ); + drop(log_guard); // Flush logs before exiting. + status.raise_or_exit() +} diff --git a/reverie-examples/counter2.rs b/reverie-examples/counter2.rs new file mode 100644 index 0000000..aade0bf --- /dev/null +++ b/reverie-examples/counter2.rs @@ -0,0 +1,157 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! An example that counts system calls using a simple, global state. 
+ +use reverie::{ + syscalls::{Syscall, SyscallInfo}, + Error, ExitStatus, GlobalRPC, GlobalTool, Guest, Pid, Tid, Tool, +}; +use reverie_util::CommonToolArguments; +use structopt::StructOpt; + +use core::sync::atomic::{AtomicU64, Ordering}; +use serde::{Deserialize, Serialize}; +use std::sync::Mutex; +use tracing::debug; + +/// Global state for the tool. +#[derive(Debug, Serialize, Deserialize, Default)] +pub struct GlobalInner { + pub total_syscalls: u64, + pub exited_procs: u64, + pub exited_threads: u64, +} + +#[derive(Debug, Serialize, Deserialize, Default)] +pub struct CounterGlobal { + pub inner: Mutex, +} + +/// Local, per-process state for the tool. +#[derive(Debug, Serialize, Deserialize, Default)] +pub struct CounterLocal { + proc_syscalls: AtomicU64, + exited_threads: AtomicU64, +} + +impl Clone for CounterLocal { + fn clone(&self) -> Self { + CounterLocal { + proc_syscalls: AtomicU64::new(self.proc_syscalls.load(Ordering::SeqCst)), + exited_threads: AtomicU64::new(self.exited_threads.load(Ordering::SeqCst)), + } + } +} + +/// The message sent to the global state method. 
+#[derive(PartialEq, Debug, Eq, Hash, Clone, Serialize, Deserialize, Copy)] +pub struct IncrMsg(u64, u64); + +#[reverie::global_tool] +impl GlobalTool for CounterGlobal { + type Request = IncrMsg; + type Response = (); + async fn init_global_state(_: &Self::Config) -> Self { + CounterGlobal { + inner: Mutex::new(GlobalInner { + total_syscalls: 0, + exited_procs: 0, + exited_threads: 0, + }), + } + } + async fn receive_rpc(&self, _from: Pid, IncrMsg(n, t): IncrMsg) -> Self::Response { + let mut mg = self.inner.lock().unwrap(); + mg.total_syscalls += n; + mg.exited_threads += t; + mg.exited_procs += 1; + } +} + +#[reverie::tool] +impl Tool for CounterLocal { + type GlobalState = CounterGlobal; + /// Yet another level of counters per-thread: + type ThreadState = u64; + + fn new(pid: Pid, _cfg: &()) -> Self { + debug!(" [counter] initialize counter for pid {}", pid); + CounterLocal { + proc_syscalls: AtomicU64::new(0), + exited_threads: AtomicU64::new(0), + } + } + + async fn handle_syscall_event>( + &self, + guest: &mut T, + call: Syscall, + ) -> Result { + *guest.thread_state_mut() += 1; + debug!( + "thread count at syscall ({:?}): {}, process count: {}", + call.number(), + guest.thread_state(), + self.proc_syscalls.load(Ordering::SeqCst) + ); + guest.tail_inject(call).await + } + + async fn on_exit_thread>( + &self, + tid: Tid, + _global_state: &G, + ts: u64, + _exit_status: ExitStatus, + ) -> Result<(), Error> { + debug!("count at exit thread {} = {}", tid, &ts); + self.proc_syscalls.fetch_add(ts, Ordering::SeqCst); + self.exited_threads.fetch_add(1, Ordering::SeqCst); + debug!( + " contributed to process-level count: {}", + self.proc_syscalls.load(Ordering::Relaxed) + ); + Ok(()) + } + + async fn on_exit_process>( + self, + pid: Pid, + global_state: &G, + _exit_status: ExitStatus, + ) -> Result<(), Error> { + let count = self.proc_syscalls.load(Ordering::SeqCst); + let threads = self.exited_threads.load(Ordering::SeqCst); + drop(self); + debug!( + "At ExitProc 
(pid {}), contributing {} to global count.", + pid, count + ); + let _ = global_state.send_rpc(IncrMsg(count, threads)).await?; + Ok(()) + } +} + +#[tokio::main] +async fn main() -> Result<(), Error> { + let args = CommonToolArguments::from_args(); + let log_guard = args.init_tracing(); + let tracer = reverie_ptrace::TracerBuilder::::new(args.into()) + .spawn() + .await?; + let (status, global_state) = tracer.wait().await?; + let mg = global_state.inner.lock().unwrap(); + eprintln!( + " [counter tool] Total system calls in process tree: {}, from {} processes, {} thread(s).", + mg.total_syscalls, mg.exited_procs, mg.exited_threads + ); + drop(log_guard); // Flush logs before exiting. + status.raise_or_exit() +} diff --git a/reverie-examples/debug.rs b/reverie-examples/debug.rs new file mode 100644 index 0000000..05455a5 --- /dev/null +++ b/reverie-examples/debug.rs @@ -0,0 +1,50 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! This instrumentation tool intercepts events but does nothing with them, +//! except acting as a gdbserver. + +use reverie::{Error, Subscription, Tool}; +use reverie_util::CommonToolArguments; +use serde::{Deserialize, Serialize}; +use structopt::StructOpt; + +#[derive(Debug, Default, Serialize, Deserialize)] +struct DebugTool; +impl Tool for DebugTool { + fn subscriptions(_cfg: &()) -> Subscription { + Subscription::none() + } +} + +/// A tool to introduce inject "chaos" into a running process. A pathological +/// kernel is simulated by forcing reads to only return one byte a time. 
+#[derive(Debug, StructOpt)] +struct Args { + #[structopt(flatten)] + common_opts: CommonToolArguments, + + #[structopt(long, default_value = "1234", help = "launch gdbserver on given port")] + port: u16, +} + +#[tokio::main] +async fn main() -> Result<(), Error> { + let args = Args::from_args(); + let port = args.port; + let log_guard = args.common_opts.init_tracing(); + eprintln!("Listening on port {}", port); + let tracer = reverie_ptrace::TracerBuilder::::new(args.common_opts.into()) + .gdbserver(port) + .spawn() + .await?; + let (status, _global_state) = tracer.wait().await?; + drop(log_guard); // Flush logs before exiting. + status.raise_or_exit() +} diff --git a/reverie-examples/noop.rs b/reverie-examples/noop.rs new file mode 100644 index 0000000..b570b1b --- /dev/null +++ b/reverie-examples/noop.rs @@ -0,0 +1,38 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! This instrumentation tool intercepts events but does nothing with them. It is +//! useful for observing the overhead of interception, and as a starting point. + +use reverie::{Error, Subscription, Tool}; +use reverie_util::CommonToolArguments; +use serde::{Deserialize, Serialize}; +use structopt::StructOpt; + +#[derive(Debug, Default, Serialize, Deserialize)] +struct NoopTool; + +#[reverie::tool] +impl Tool for NoopTool { + fn subscriptions(_cfg: &()) -> Subscription { + Subscription::none() + } +} + +#[tokio::main] +async fn main() -> Result<(), Error> { + let args = CommonToolArguments::from_args(); + let log_guard = args.init_tracing(); + let tracer = reverie_ptrace::TracerBuilder::::new(args.into()) + .spawn() + .await?; + let (status, _global_state) = tracer.wait().await?; + drop(log_guard); // Flush logs before exiting. 
+ status.raise_or_exit() +} diff --git a/reverie-examples/pedigree.rs b/reverie-examples/pedigree.rs new file mode 100644 index 0000000..3191d2d --- /dev/null +++ b/reverie-examples/pedigree.rs @@ -0,0 +1,90 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +//! An example that tracks thread pedigree using local state +use reverie::{syscalls::Syscall, Error, Guest, Pid, Tool}; +use reverie_util::{pedigree::Pedigree, CommonToolArguments}; +use serde::{Deserialize, Serialize}; +use structopt::StructOpt; +use tracing::{debug, trace}; + +// TODO: Add handle pedigree forking, initialization, etc. to tool. +// This tool is NOT FUNCTIONAL in its current state. + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct PedigreeLocal(Pedigree); + +#[reverie::tool] +impl Tool for PedigreeLocal { + type ThreadState = PedigreeLocal; + + fn new(pid: Pid, _cfg: &()) -> Self { + debug!("[pedigree] initialize pedigree for pid {}", pid); + PedigreeLocal(Pedigree::new()) + } + + fn init_thread_state( + &self, + _tid: Pid, + parent: Option<(Pid, &Self::ThreadState)>, + ) -> Self::ThreadState { + if let Some((_, state)) = parent { + let mut parent = state.clone(); + let child = parent.0.fork_mut(); + trace!("child pedigree: {:?}", child); + PedigreeLocal(child) + } else { + PedigreeLocal(Pedigree::new()) + } + } + + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + match syscall { + Syscall::Fork(_) | Syscall::Vfork(_) | Syscall::Clone(_) => { + let retval = guest.inject(syscall).await?; + let pedigree = guest.thread_state_mut().0.fork_mut(); + trace!( + "got new pedigree: {:?} => {:x?}", + pedigree, + nix::unistd::Pid::try_from(&pedigree) + ); + Ok(retval) + } + Syscall::Getpid(_) + | Syscall::Getppid(_) + | Syscall::Gettid(_) + | Syscall::Getpgid(_) 
+ | Syscall::Getpgrp(_) => { + let pid = guest.inject(syscall).await?; + let vpid = nix::unistd::Pid::try_from(&self.0).unwrap(); + trace!("getpid returned {:?} vpid: {:?}", pid, vpid); + Ok(pid) + } + Syscall::Setpgid(_) => { + panic!("[pedigree] setpgid is not allowed."); + } + _ => guest.tail_inject(syscall).await, + } + } +} + +#[tokio::main] +async fn main() -> Result<(), Error> { + let args = CommonToolArguments::from_args(); + let log_guard = args.init_tracing(); + let tracer = reverie_ptrace::TracerBuilder::::new(args.into()) + .spawn() + .await?; + let (status, _global_state) = tracer.wait().await?; + drop(log_guard); // Flush logs before exiting. + status.raise_or_exit() +} diff --git a/reverie-examples/strace/config.rs b/reverie-examples/strace/config.rs new file mode 100644 index 0000000..a54308a --- /dev/null +++ b/reverie-examples/strace/config.rs @@ -0,0 +1,17 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use crate::filter::Filter; + +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Default, Serialize, Deserialize)] +pub struct Config { + pub filters: Vec, +} diff --git a/reverie-examples/strace/filter.rs b/reverie-examples/strace/filter.rs new file mode 100644 index 0000000..405d199 --- /dev/null +++ b/reverie-examples/strace/filter.rs @@ -0,0 +1,79 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +use reverie::syscalls::Sysno; + +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Deserialize, Serialize, Eq, PartialEq)] +pub struct Filter { + /// Inverses the match. + pub inverse: bool, + + /// The set of syscalls to match. 
+ pub syscalls: Vec, +} + +impl std::str::FromStr for Filter { + type Err = String; + + // Must parse this: [!][?]value1[,[?]value2]... + fn from_str(s: &str) -> Result { + let (inverse, s) = match s.strip_prefix('!') { + Some(s) => (true, s), + None => (false, s), + }; + + let mut syscalls = Vec::new(); + + for value in s.split(',') { + // FIXME: Handle syscall sets, so we can use '%stat` to trace all + // stat calls, for example. + if value.strip_prefix('%').is_some() { + return Err("filtering sets of syscall is not yet supported".into()); + } + + let syscall: Sysno = value + .parse() + .map_err(|()| format!("invalid syscall name '{}'", value))?; + + syscalls.push(syscall); + } + + Ok(Self { inverse, syscalls }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_filter() { + assert_eq!( + "open,mmap".parse(), + Ok(Filter { + inverse: false, + syscalls: vec![Sysno::open, Sysno::mmap] + }) + ); + + assert_eq!( + "open,foobar".parse::(), + Err("invalid syscall name 'foobar'".into()) + ); + + assert_eq!( + "!read,write".parse(), + Ok(Filter { + inverse: true, + syscalls: vec![Sysno::read, Sysno::write] + }) + ); + } +} diff --git a/reverie-examples/strace/global_state.rs b/reverie-examples/strace/global_state.rs new file mode 100644 index 0000000..688c855 --- /dev/null +++ b/reverie-examples/strace/global_state.rs @@ -0,0 +1,23 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +use reverie::{GlobalTool, Pid}; + +use crate::config::Config; + +#[derive(Debug, Default)] +pub struct GlobalState; + +#[reverie::global_tool] +impl GlobalTool for GlobalState { + type Request = (); + type Response = (); + type Config = Config; + + async fn receive_rpc(&self, _pid: Pid, _req: Self::Request) {} +} diff --git a/reverie-examples/strace/main.rs b/reverie-examples/strace/main.rs new file mode 100644 index 0000000..418c51c --- /dev/null +++ b/reverie-examples/strace/main.rs @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2018-2019, Trustees of Indiana University + * ("University Works" via Baojun Wang) + * Copyright (c) 2018-2019, Ryan Newton + * ("Traditional Works of Scholarship") + * Copyright (c) 2020-, Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +mod config; +mod filter; +mod global_state; +mod tool; + +use config::Config; +use filter::Filter; +use tool::Strace; + +use structopt::StructOpt; + +use reverie::Error; +use reverie_util::CommonToolArguments; + +/// A tool to trace system calls. +#[derive(StructOpt, Debug)] +struct Opts { + #[structopt(flatten)] + common: CommonToolArguments, + + /// The set of syscalls to trace. By default, all syscalls are traced. If + /// this is used, then only the specified syscalls are traced. By limiting + /// the set of traced syscalls, we can reduce the overhead of the tracer. + #[structopt(long)] + trace: Vec, +} + +#[tokio::main] +async fn main() -> Result<(), Error> { + let args = Opts::from_args(); + + let config = Config { + filters: args.trace, + }; + + let log_guard = args.common.init_tracing(); + let tracer = reverie_ptrace::TracerBuilder::::new(args.common.into()) + .config(config) + .spawn() + .await?; + let (status, _) = tracer.wait().await?; + drop(log_guard); // Flush logs before exiting. 
+ status.raise_or_exit() +} diff --git a/reverie-examples/strace/tool.rs b/reverie-examples/strace/tool.rs new file mode 100644 index 0000000..81b8a96 --- /dev/null +++ b/reverie-examples/strace/tool.rs @@ -0,0 +1,115 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use crate::config::Config; +use crate::global_state::GlobalState; + +use reverie::syscalls::{Displayable, Errno, Syscall, SyscallInfo}; +use reverie::{Error, Guest, Signal, Subscription, Tool}; + +use serde::{Deserialize, Serialize}; + +// Strace has no need for process-level state, so this is a unit struct. +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +pub struct Strace; + +/// Here we use the same dummy type for both our local and global trait +/// implementations. +#[reverie::tool] +impl Tool for Strace { + type GlobalState = GlobalState; + + fn subscriptions(cfg: &Config) -> Subscription { + // Check if we're only excluding things. + let exclude_only = cfg.filters.iter().all(|f| f.inverse); + + let mut subs = if exclude_only { + // Only excluding syscalls. + Subscription::all_syscalls() + } else { + // Only including syscalls. 
+ Subscription::none() + }; + + for filter in &cfg.filters { + let syscalls = filter.syscalls.iter().copied(); + if filter.inverse { + subs.disable_syscalls(syscalls); + } else { + subs.syscalls(syscalls); + } + } + + subs + } + + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + match syscall { + Syscall::Exit(_) | Syscall::ExitGroup(_) => { + eprintln!( + "[pid {}] {} = ?", + guest.tid().colored(), + syscall.display_with_outputs(&guest.memory()), + ); + guest.tail_inject(syscall).await + } + Syscall::Execve(_) | Syscall::Execveat(_) => { + let tid = guest.tid(); + + // must be pre-formatted, otherwise the memory references become + // invalid when execve/execveat returns success because the original + // program got wiped out. + eprintln!( + "[pid {}] {}", + tid.colored(), + syscall.display_with_outputs(&guest.memory()) + ); + + let errno = guest.inject(syscall).await.unwrap_err(); + + eprintln!( + "[pid {}] ({}) = {:?}", + tid.colored(), + syscall.number(), + errno + ); + + Err(errno.into()) + } + _otherwise => { + let syscall_ret = guest.inject(syscall).await; + eprintln!( + "[pid {}] {} = {}", + guest.tid().colored(), + syscall.display_with_outputs(&guest.memory()), + // TODO: Pretty print the return value according to its type. + syscall_ret.unwrap_or_else(|errno| -errno.into_raw() as i64) + ); + Ok(syscall_ret?) + } + } + } + + async fn handle_signal_event>( + &self, + guest: &mut G, + signal: Signal, + ) -> Result, Errno> { + eprintln!( + "[pid {}] Received signal: {}", + guest.tid().colored(), + signal + ); + Ok(Some(signal)) + } +} diff --git a/reverie-examples/strace_minimal.rs b/reverie-examples/strace_minimal.rs new file mode 100644 index 0000000..b5937f6 --- /dev/null +++ b/reverie-examples/strace_minimal.rs @@ -0,0 +1,46 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +use reverie::{ + syscalls::{Displayable, Syscall}, + Error, Guest, Tool, +}; +use reverie_util::CommonToolArguments; +use serde::{Deserialize, Serialize}; +use structopt::StructOpt; + +#[derive(Serialize, Deserialize, Default)] +struct StraceTool {} + +#[reverie::tool] +impl Tool for StraceTool { + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + eprintln!( + "[pid {}] {} = ?", + guest.tid(), + syscall.display_with_outputs(&guest.memory()), + ); + guest.tail_inject(syscall).await + } +} + +#[tokio::main] +async fn main() -> Result<(), Error> { + let args = CommonToolArguments::from_args(); + let log_guard = args.init_tracing(); + let tracer = reverie_ptrace::TracerBuilder::::new(args.into()) + .spawn() + .await?; + let (status, _) = tracer.wait().await?; + drop(log_guard); // Flush logs before exiting. 
+ status.raise_or_exit() +} diff --git a/reverie-process/Cargo.toml b/reverie-process/Cargo.toml new file mode 100644 index 0000000..d1cb1bd --- /dev/null +++ b/reverie-process/Cargo.toml @@ -0,0 +1,26 @@ +# @generated by autocargo + +[package] +name = "reverie-process" +version = "0.1.0" +authors = ["Facebook"] +edition = "2021" +license = "BSD-2-Clause" + +[dependencies] +bincode = "1.3.3" +bitflags = "1.3" +colored = "1.9" +futures = { version = "0.3.13", features = ["async-await", "compat"] } +libc = "0.2.98" +nix = "0.22" +serde = { version = "1.0.126", features = ["derive", "rc"] } +syscalls = { version = "0.4.2", features = ["with-serde"] } +thiserror = "1.0.29" +tokio = { version = "1.10", features = ["full", "test-util", "tracing"] } + +[dev-dependencies] +const-cstr = "0.3.0" +num_cpus = "1.11" +raw-cpuid = "9.0" +tempfile = "3.2" diff --git a/reverie-process/src/builder.rs b/reverie-process/src/builder.rs new file mode 100644 index 0000000..b13387f --- /dev/null +++ b/reverie-process/src/builder.rs @@ -0,0 +1,760 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +use std::borrow::Cow; +use std::collections::BTreeMap; +use std::ffi::{OsStr, OsString}; +use std::io; +use std::os::unix::ffi::OsStrExt; +use std::os::unix::fs::PermissionsExt; +use std::path::{Path, PathBuf}; + +use syscalls::Errno; + +use super::seccomp; +use super::util::to_cstring; +use super::util::CStringArray; +use super::Command; +use super::Container; +use super::Mount; +use super::Namespace; +use super::PtyChild; +use super::Stdio; + +impl Command { + /// Constructs a new `Command` for launching the program at path `program`, + /// with the following default configuration: + /// + /// * No arguments to the program + /// * Inherit the current process's environment + /// * Inherit the current process's working directory + /// * Inherit stdin/stdout/stderr for `spawn` or `status`, but create pipes + /// for `output` + /// + /// Builder methods are provided to change these defaults and + /// otherwise configure the process. + /// + /// If `program` is not an absolute path, the `PATH` will be searched in an + /// OS-defined way. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```no_run + /// use reverie_process::Command; + /// let command = Command::new("sh"); + /// ``` + pub fn new>(program: S) -> Self { + let program = to_cstring(program); + + let mut args = CStringArray::with_capacity(1); + args.push(program.clone()); + + Self { + program, + args, + pre_exec: Vec::new(), + container: Container::new(), + } + } + + /// Sets the path to the program. This can be used to override what was + /// already set in [`Command::new`]. + /// + /// NOTE: This also changes argument 0 to match `program`. + pub fn program>(&mut self, program: S) -> &mut Self { + let cstring = to_cstring(program); + self.program = cstring.clone(); + self.args.set(0, cstring); + self + } + + /// Explicitly sets the first argument. By default, this is the same as the + /// program path and is what you want in most cases. 
+ pub fn arg0>(&mut self, arg0: S) -> &mut Self { + self.args.set(0, to_cstring(arg0)); + self + } + + /// Gets the first argument. Unless [`Command::arg0`] was used, this returns + /// the same string as [`Command::get_program`]. + pub fn get_arg0(&self) -> &OsStr { + OsStr::from_bytes(self.args.get(0).to_bytes()) + } + + /// Adds an argument to pass to the program. + /// + /// Only one argument can be passed per use. So instead of: + /// + /// ```no_run + /// reverie_process::Command::new("sh") + /// .arg("-C /path/to/repo"); + /// ``` + /// + /// usage would be: + /// + /// ```no_run + /// reverie_process::Command::new("sh") + /// .arg("-C") + /// .arg("/path/to/repo"); + /// ``` + /// + /// To pass multiple arguments see [`args`]. + /// + /// [`args`]: method@Self::args + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```no_run + /// use reverie_process::Command; + /// + /// let command = Command::new("ls") + /// .arg("-l") + /// .arg("-a"); + /// ``` + pub fn arg>(&mut self, arg: S) -> &mut Self { + self.args.push(to_cstring(arg)); + self + } + + /// Adds multiple arguments to pass to the program. + /// + /// To pass a single argument see [`arg`]. + /// + /// [`arg`]: method@Self::arg + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```no_run + /// use reverie_process::Command; + /// + /// let command = Command::new("ls") + /// .args(&["-l", "-a"]); + /// ``` + pub fn args(&mut self, args: I) -> &mut Command + where + I: IntoIterator, + S: AsRef, + { + for arg in args { + self.arg(arg); + } + self + } + + /// Returns an iterator of the arguments that will be passed to the program. + /// + /// This does not include the program name itself. It only includes the + /// arguments specified with [`Command::arg`] and [`Command::args`]. + pub fn get_args(&self) -> impl Iterator { + self.args + .iter() + .skip(1) + .map(|arg| OsStr::from_bytes(arg.to_bytes())) + } + + /// Inserts or updates an environment variable mapping. 
+ /// + /// Note that environment variable names are case-insensitive (but + /// case-preserving) on Windows, and case-sensitive on all other platforms. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```no_run + /// use reverie_process::Command; + /// + /// let command = Command::new("ls") + /// .env("PATH", "/bin"); + /// ``` + pub fn env(&mut self, key: K, val: V) -> &mut Self + where + K: AsRef, + V: AsRef, + { + self.container.env(key, val); + self + } + + /// Adds or updates multiple environment variable mappings. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```no_run + /// use reverie_process::{Command, Stdio}; + /// use std::env; + /// use std::collections::HashMap; + /// + /// let filtered_env : HashMap = + /// env::vars().filter(|&(ref k, _)| + /// k == "TERM" || k == "TZ" || k == "LANG" || k == "PATH" + /// ).collect(); + /// + /// let command = Command::new("printenv") + /// .stdin(Stdio::null()) + /// .stdout(Stdio::inherit()) + /// .env_clear() + /// .envs(&filtered_env); + /// ``` + pub fn envs(&mut self, vars: I) -> &mut Self + where + I: IntoIterator, + K: AsRef, + V: AsRef, + { + self.container.envs(vars); + self + } + + /// Removes an environment variable mapping. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```no_run + /// use reverie_process::Command; + /// + /// let command = Command::new("ls") + /// .env_remove("PATH"); + /// ``` + pub fn env_remove>(&mut self, key: K) -> &mut Self { + self.container.env_remove(key); + self + } + + /// Clears the entire environment map for the child process. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```no_run + /// use reverie_process::Command; + /// + /// let command = Command::new("ls") + /// .env_clear(); + /// ``` + pub fn env_clear(&mut self) -> &mut Self { + self.container.env_clear(); + self + } + + /// Sets the working directory for the child process. 
+ /// + /// # Interaction with `chroot` + /// + /// The working directory is set *after* the chroot is performed (if a chroot + /// directory is specified). Thus, the path given is relative to the chroot + /// directory. Otherwise, if no chroot directory is specified, the working + /// directory is relative to the current working directory of the parent + /// process at the time the child process is spawned. + /// + /// # Platform-specific behavior + /// + /// If the program path is relative (e.g., `"./script.sh"`), it's ambiguous + /// whether it should be interpreted relative to the parent's working + /// directory or relative to `current_dir`. The behavior in this case is + /// platform specific and unstable, and it's recommended to use + /// [`canonicalize`] to get an absolute program path instead. + /// + /// [`canonicalize`]: std::fs::canonicalize() + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```no_run + /// use reverie_process::Command; + /// + /// let command = Command::new("ls") + /// .current_dir("/bin"); + /// ``` + pub fn current_dir>(&mut self, dir: P) -> &mut Self { + self.container.current_dir(dir); + self + } + + /// Sets configuration for the child process's standard input (stdin) handle. + /// + /// Defaults to [`Stdio::inherit`] when used with `spawn` or `status`, and + /// defaults to [`Stdio::piped`] when used with `output`. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```no_run + /// use reverie_process::{Command, Stdio}; + /// + /// let command = Command::new("ls") + /// .stdin(Stdio::null()); + /// ``` + pub fn stdin>(&mut self, cfg: T) -> &mut Self { + self.container.stdin(cfg); + self + } + + /// Sets configuration for the child process's standard output (stdout) + /// handle. + /// + /// Defaults to [`Stdio::inherit`] when used with `spawn` or `status`, and + /// defaults to [`Stdio::piped`] when used with `output`. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ```no_run + /// use reverie_process::{Command, Stdio}; + /// + /// let command = Command::new("ls") + /// .stdout(Stdio::null()); + /// ``` + pub fn stdout>(&mut self, cfg: T) -> &mut Self { + self.container.stdout(cfg); + self + } + + /// Sets configuration for the child process's standard error (stderr) + /// handle. + /// + /// Defaults to [`Stdio::inherit`] when used with `spawn` or `status`, and + /// defaults to [`Stdio::piped`] when used with `output`. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```no_run + /// use reverie_process::{Command, Stdio}; + /// + /// let command = Command::new("ls") + /// .stderr(Stdio::null()); + /// ``` + pub fn stderr>(&mut self, cfg: T) -> &mut Self { + self.container.stderr(cfg); + self + } + + /// Changes the root directory of the calling process to the specified path. + /// This directory will be inherited by all child processes of the calling + /// process. + /// + /// Note that changing the root directory may cause the program to not be + /// found. As such, the program path should be relative to this directory. + pub fn chroot>(&mut self, chroot: P) -> &mut Self { + self.container.chroot(chroot); + self + } + + /// Unshares parts of the process execution context that are normally shared + /// with the parent process. This is useful for executing the child process + /// in a new namespace. + pub fn unshare(&mut self, namespace: Namespace) -> &mut Self { + self.container.unshare(namespace); + self + } + + /// Schedules a closure to be run just before the `exec` function is invoked. + /// + /// The closure is allowed to return an I/O error whose OS error code will be + /// communicated back to the parent and returned as an error from when the + /// spawn was requested. + /// + /// Multiple closures can be registered and they will be called in order of + /// their registration. 
If a closure returns `Err` then no further closures + /// will be called and the spawn operation will immediately return with a + /// failure. + /// + /// # Safety + /// + /// This closure will be run in the context of the child process after a + /// `fork`. This primarily means that any modifications made to memory on + /// behalf of this closure will **not** be visible to the parent process. + /// This is often a very constrained environment where normal operations like + /// `malloc` or acquiring a mutex are not guaranteed to work (due to other + /// threads perhaps still running when the `fork` was run). + /// + /// This also means that all resources such as file descriptors and + /// memory-mapped regions got duplicated. It is your responsibility to make + /// sure that the closure does not violate library invariants by making + /// invalid use of these duplicates. + /// + /// When this closure is run, aspects such as the stdio file descriptors and + /// working directory have successfully been changed, so output to these + /// locations may not appear where intended. + pub unsafe fn pre_exec(&mut self, f: F) -> &mut Self + where + F: FnMut() -> Result<(), Errno> + Send + Sync + 'static, + { + self.pre_exec.push(Box::new(f)); + self + } + + /// Returns the path to the program that was given to [`Command::new`]. + /// + /// # Examples + /// + /// ``` + /// use reverie_process::Command; + /// + /// let cmd = Command::new("echo"); + /// assert_eq!(cmd.get_program(), "echo"); + /// ``` + pub fn get_program(&self) -> &OsStr { + OsStr::from_bytes(self.program.to_bytes()) + } + + /// Returns the working directory for the child process. + /// + /// This returns None if the working directory will not be changed. + pub fn get_current_dir(&self) -> Option<&Path> { + self.container.get_current_dir() + } + + /// Returns an iterator of the environment variables that will be set when + /// the process is spawned. 
Note that this does not include any environment + /// variables inherited from the parent process. + pub fn get_envs(&self) -> impl Iterator)> { + self.container.get_envs() + } + + /// Returns a mapping of all environment variables that the new child process + /// will inherit. + pub fn get_captured_envs(&self) -> BTreeMap { + self.container.get_captured_envs() + } + + /// Gets an environment variable. If the child process is to inherit this + /// environment variable from the current process, then this returns the + /// current process's environment variable unless it is to be overridden. + pub fn get_env>(&self, env: K) -> Option> { + self.container.get_env(env) + } + + /// Maps one user ID to another. + /// + /// Implies `Namespace::USER`. + /// + /// # Example + /// + /// This can be used to gain `CAP_SYS_ADMIN` privileges in the user + /// namespace by mapping the root user inside the container to the current + /// user outside of the container. + /// + /// ```no_run + /// use reverie_process::Command; + /// + /// let command = Command::new("ls") + /// .map_uid(0, unsafe { libc::getuid() }); + /// ``` + /// + /// # Implementation + /// + /// This modifies `/proc/{pid}/uid_map` where `{pid}` is the PID of the child + /// process. See [`user_namespaces(7)`] for more details. 
+ /// + /// [`user_namespaces(7)`]: https://man7.org/linux/man-pages/man7/user_namespaces.7.html + pub fn map_uid_range( + &mut self, + starting_inside_uid: libc::uid_t, + starting_outside_uid: libc::uid_t, + count: u32, + ) -> &mut Self { + self.container + .map_uid_range(starting_inside_uid, starting_outside_uid, count); + self + } + + /// Convience function for mapping root (inside the container) to the current + /// user ID (outside the container). This is useful for gaining new + /// capabilities inside the container, such as being able to mount file + /// systems. + /// + /// Implies `Namespace::USER`. + /// + /// This is the same as: + /// ```no_run + /// use reverie_process::Command; + /// + /// let command = Command::new("ls") + /// .map_uid(0, unsafe { libc::geteuid() }) + /// .map_gid(0, unsafe { libc::getegid() }); + /// ``` + pub fn map_root(&mut self) -> &mut Self { + self.container.map_root(); + self + } + + /// Maps one group ID to another. + /// + /// Implies `Namespace::USER`. + /// + /// # Implementation + /// + /// This modifies `/proc/{pid}/gid_map` where `{pid}` is the PID of the child + /// process. See [`user_namespaces(7)`] for more details. + /// + /// [`user_namespaces(7)`]: https://man7.org/linux/man-pages/man7/user_namespaces.7.html + pub fn map_gid(&mut self, inside_gid: libc::gid_t, outside_gid: libc::gid_t) -> &mut Self { + self.container.map_gid(inside_gid, outside_gid); + self + } + + /// Maps potentially many group IDs inside the new user namespace to group + /// IDs outside of the user namespace. + /// + /// Implies `Namespace::USER`. + /// + /// # Implementation + /// + /// This modifies `/proc/{pid}/gid_map` where `{pid}` is the PID of the child + /// process. See [`user_namespaces(7)`] for more details. 
+ /// + /// [`user_namespaces(7)`]: https://man7.org/linux/man-pages/man7/user_namespaces.7.html + pub fn map_gid_range( + &mut self, + starting_inside_gid: libc::gid_t, + starting_outside_gid: libc::gid_t, + count: u32, + ) -> &mut Self { + self.container + .map_gid_range(starting_inside_gid, starting_outside_gid, count); + self + } + + /// Sets the hostname of the container. + /// + /// Implies `Namespace::UTS`, which requires `CAP_SYS_ADMIN`. + /// + /// ```no_run + /// use reverie_process::Command; + /// + /// let command = Command::new("cat") + /// .arg("/proc/sys/kernel/hostname") + /// .map_root() + /// .hostname("foobar.local"); + /// ``` + pub fn hostname>(&mut self, hostname: S) -> &mut Self { + self.container.hostname(hostname); + self + } + + /// Sets the domain name of the container. + /// + /// Implies `Namespace::UTS`, which requires `CAP_SYS_ADMIN`. + /// + /// # Example + /// + /// ```no_run + /// use reverie_process::Command; + /// + /// let command = Command::new("cat") + /// .arg("/proc/sys/kernel/domainname") + /// .map_root() + /// .domainname("foobar"); + /// ``` + pub fn domainname>(&mut self, domainname: S) -> &mut Self { + self.container.domainname(domainname); + self + } + + /// Gets the hostname of the container. + pub fn get_hostname(&self) -> Option<&OsStr> { + self.container.get_hostname() + } + + /// Gets the domainname of the container. + pub fn get_domainname(&self) -> Option<&OsStr> { + self.container.get_domainname() + } + + /// Adds a file system to be mounted. Note that these are mounted in the same + /// order as given. + /// + /// Implies `Namespace::MOUNT`. Note that `Namespace::USER` should also have + /// been set and `map_uid` should have been called in order to gain the + /// privileges required to mount. + pub fn mount(&mut self, mount: Mount) -> &mut Self { + self.container.mount(mount); + self + } + + /// Adds multiple mounts. 
+ pub fn mounts(&mut self, mounts: I) -> &mut Self + where + I: IntoIterator, + { + self.container.mounts(mounts); + self + } + + /// Sets up the container to have local networking only. This will prevent + /// any network communication to the outside world. + /// + /// Implies `Namespace::NETWORK` and `Namespace::MOUNT`. + /// + /// This also causes a fresh `/sys` to be mounted to avoid seeing the host + /// network interfaces in `/sys/class/net`. + pub fn local_networking_only(&mut self) -> &mut Self { + self.container.local_networking_only(); + self + } + + /// Sets the seccomp filter. The filter is loaded immediately before `execve` + /// and *after* all `pre_exec` callbacks have been executed. Thus, you will + /// still be able to call filtered syscalls from `pre_exec` callbacks. + pub fn seccomp(&mut self, filter: seccomp::Filter) -> &mut Self { + self.container.seccomp(filter); + self + } + + /// Sets the controlling pseudoterminal for the child process). + /// + /// In the child process, this has the effect of: + /// 1. Creating a new session (with `setsid()`). + /// 2. Using an `ioctl` to set the controlling terminal. + /// 3. Setting this file descriptor as the stdio streams. + /// + /// NOTE: Since this modifies the stdio streams, calling this will reset + /// [`Self::stdin`], [`Self::stdout`], and [`Self::stderr`] back to + /// [`Stdio::inherit()`]. + pub fn pty(&mut self, child: PtyChild) -> &mut Self { + self.container.pty(child); + self + } + + /// Finds the path to the program. + pub fn find_program(&self) -> io::Result { + let program = Path::new(self.get_program()); + + if program.is_absolute() { + // Note: We shouldn't canonicalize here since that will follow + // symlinks. Instead, just make sure the file exists and is + // executable. 
/// Searches a list of directories for an executable named `program`.
///
/// Each directory yielded by `iter` is joined with `program`, in order, and
/// the first candidate that is a regular file with at least one execute
/// permission bit set is returned. Candidates whose metadata cannot be read
/// (for example because they do not exist) are skipped.
///
/// Returns `None` if no directory contains a matching file.
fn find_program_in_paths<I, S>(program: &Path, iter: I) -> Option<PathBuf>
where
    I: IntoIterator<Item = S>,
    S: AsRef<Path>,
{
    iter.into_iter()
        .map(|dir| dir.as_ref().join(program))
        .find(|candidate| {
            // A candidate only counts if it is a regular file and any of the
            // user/group/other execute bits is set.
            candidate.metadata().map_or(false, |meta| {
                meta.is_file() && meta.permissions().mode() & 0o111 != 0
            })
        })
}
file mode 100644 index 0000000..8ba4f04 --- /dev/null +++ b/reverie-process/src/child.rs @@ -0,0 +1,259 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +use super::ExitStatus; +use super::Pid; + +use super::stdio::{ChildStderr, ChildStdin, ChildStdout, Stdio}; +use super::Command; + +use core::fmt; +use core::future::Future; +use core::pin::Pin; +use core::task::{Context, Poll}; + +use nix::sys::signal::Signal; +use serde::{Deserialize, Serialize}; +use std::io; +use syscalls::Errno; + +/// Represents a child process. +/// +/// NOTE: The child process is not killed or waited on when `Child` is dropped. +/// If `Child` is not waited on before dropped, the child will continue to run in +/// the background and may become a "zombie" after the parent exits. It is +/// therefore best practice to always wait on child processes. +#[derive(Debug)] +pub struct Child { + /// The child's process ID. + pub(super) pid: Pid, + + /// The child's exit status. `Some` if the child has exited already, `None` + /// otherwise. + pub(super) exit_status: Option, + + /// The handle for writing to the child's standard input (stdin), if it has + /// been captured. + pub stdin: Option, + + /// The handle for reading from the child's standard output (stdout), if it + /// has been captured. + pub stdout: Option, + + /// The handle for reading from the child's standard error (stderr), if it + /// has been captured. + pub stderr: Option, +} + +/// The output of a finished process. +#[derive(PartialEq, Eq, Clone, Serialize, Deserialize)] +pub struct Output { + /// The exit status of the process. + pub status: ExitStatus, + /// The bytes that the process wrote to stdout. + pub stdout: Vec, + /// The bytes that the process wrote to stderr. 
+ pub stderr: Vec, +} + +impl fmt::Debug for Output { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let stdout = core::str::from_utf8(&self.stdout); + let stdout: &dyn fmt::Debug = match stdout { + Ok(ref s) => s, + Err(_) => &self.stdout, + }; + + let stderr = core::str::from_utf8(&self.stderr); + let stderr: &dyn fmt::Debug = match stderr { + Ok(ref s) => s, + Err(_) => &self.stderr, + }; + + f.debug_struct("Output") + .field("status", &self.status) + .field("stdout", stdout) + .field("stderr", stderr) + .finish() + } +} + +impl Child { + /// Returns the PID of the child. + pub fn id(&self) -> Pid { + self.pid + } + + /// Attempts to collect the exit status of the child if it has already + /// exited. + pub fn try_wait(&mut self) -> io::Result> { + match self.exit_status { + Some(exit_status) => Ok(Some(exit_status)), + None => { + let mut status = 0; + let ret = Errno::result(unsafe { + libc::waitpid(self.pid.as_raw(), &mut status, libc::WNOHANG) + })?; + + if ret == 0 { + Ok(None) + } else { + let exit_status = ExitStatus::from_raw(status); + self.exit_status = Some(exit_status); + Ok(Some(exit_status)) + } + } + } + } + + /// Waits for the child to exit completely, returning its exit status. This + /// function will continue to return the same exit status after the child + /// process has fully exited. + /// + /// To avoid deadlocks, the child's stdin handle, if any, will be closed + /// before waiting. Otherwise, the child could block waiting for input from + /// the parent while the parent is waiting for the child. To keep the stdin + /// handle open and control it explicitly, the caller can `.take()` it before + /// calling `.wait()`. + pub async fn wait(&mut self) -> io::Result { + // Ensure stdin is closed. + drop(self.stdin.take()); + + WaitForChild::new(self)?.await + } + + /// Blocks until the child process exits. 
+ pub fn wait_blocking(&mut self) -> io::Result { + drop(self.stdin.take()); + + let mut status = 0; + + let ret = loop { + match Errno::result(unsafe { libc::waitpid(self.pid.as_raw(), &mut status, 0) }) { + Ok(ret) => break ret, + Err(Errno::EINTR) => continue, + Err(err) => return Err(err.into()), + } + }; + + debug_assert_ne!(ret, 0); + + Ok(ExitStatus::from_raw(status)) + } + + /// Simultaneously waits for the child to exit and collect all remaining + /// output on the stdout/stderr handles, returning an `Output` instance. + /// + /// To avoid deadlocks, the child's stdin handle, if any, will be closed + /// before waiting. Otherwise, the child could block waiting for input from + /// the parent while the parent is waiting for the child. + /// + /// By default, stdin, stdout and stderr are inherited from the parent. In + /// order to capture the output into this `Result` it is necessary to + /// create new pipes between parent and child. Use `stdout(Stdio::piped())` + /// or `stderr(Stdio::piped())`, respectively. + pub async fn wait_with_output(mut self) -> io::Result { + use futures::future::try_join3; + use tokio::io::{AsyncRead, AsyncReadExt}; + + async fn read_to_end(io: Option) -> io::Result> { + let mut vec = Vec::new(); + if let Some(mut io) = io { + io.read_to_end(&mut vec).await?; + } + Ok(vec) + } + + let stdout_fut = read_to_end(self.stdout.take()); + let stderr_fut = read_to_end(self.stderr.take()); + + let (status, stdout, stderr) = try_join3(self.wait(), stdout_fut, stderr_fut).await?; + + Ok(Output { + status, + stdout, + stderr, + }) + } + + /// Sends a signal to the child. If the child has already been waited on, + /// this does nothing and returns success. 
+ pub fn signal(&self, sig: Signal) -> io::Result<()> { + if self.exit_status.is_none() { + Errno::result(unsafe { libc::kill(self.pid.as_raw(), sig as i32) })?; + } + + Ok(()) + } +} + +impl Command { + /// Executes the command, waiting for it to finish and collecting its exit + /// status. + pub async fn status(&mut self) -> io::Result { + let mut child = self.spawn()?; + + // Ensure we close any stdio handles so we can't deadlock waiting on the + // child which may be waiting to read/write to a pipe we're holding. + drop(child.stdin.take()); + drop(child.stdout.take()); + drop(child.stderr.take()); + + child.wait().await + } + + /// Executes the command, waiting for it to finish while collecting its + /// stdout and stderr into buffers. + pub async fn output(&mut self) -> io::Result { + self.stdout(Stdio::piped()); + self.stderr(Stdio::piped()); + + let child = self.spawn(); + + child?.wait_with_output().await + } +} + +struct WaitForChild<'a> { + /// Signal future. Used to get notified asynchronously of a child exiting. + signal: tokio::signal::unix::Signal, + child: &'a mut Child, +} + +impl<'a> WaitForChild<'a> { + fn new(child: &'a mut Child) -> io::Result { + use tokio::signal::unix::{signal, SignalKind}; + + Ok(Self { + signal: signal(SignalKind::child())?, + child, + }) + } +} + +impl<'a> Future for WaitForChild<'a> { + type Output = io::Result; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll { + loop { + // Register an interest in SIGCHLD signals. We can't just call + // `try_wait` right away. We might miss a signal event if the child + // hasn't exited yet. Thus, we poll the signal stream to tell Tokio + // we're interested in signal events. + let sig = self.signal.poll_recv(cx); + + if let Some(status) = self.child.try_wait()? 
{ + return Poll::Ready(Ok(status)); + } + + if sig.is_pending() { + return Poll::Pending; + } + } + } +} diff --git a/reverie-process/src/clone.rs b/reverie-process/src/clone.rs new file mode 100644 index 0000000..73d6437 --- /dev/null +++ b/reverie-process/src/clone.rs @@ -0,0 +1,47 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +use syscalls::Errno; + +use super::Pid; + +pub fn clone(cb: F, flags: libc::c_int) -> Result +where + F: FnMut() -> i32, +{ + let mut stack = [0u8; 4096]; + clone_with_stack(cb, flags, &mut stack) +} + +pub fn clone_with_stack(cb: F, flags: libc::c_int, stack: &mut [u8]) -> Result +where + F: FnMut() -> i32, +{ + type CloneCb<'a> = Box i32 + 'a>; + + extern "C" fn callback(data: *mut CloneCb) -> libc::c_int { + let cb: &mut CloneCb = unsafe { &mut *data }; + (*cb)() as libc::c_int + } + + let mut cb: CloneCb = Box::new(cb); + + let res = unsafe { + let stack = stack.as_mut_ptr().add(stack.len()); + let stack = stack.sub(stack as usize % 16); + + libc::clone( + core::mem::transmute(callback as extern "C" fn(*mut Box i32>) -> i32), + stack as *mut libc::c_void, + flags, + &mut cb as *mut _ as *mut libc::c_void, + ) + }; + + Errno::result(res).map(Pid::from_raw) +} diff --git a/reverie-process/src/container.rs b/reverie-process/src/container.rs new file mode 100644 index 0000000..65f2ae1 --- /dev/null +++ b/reverie-process/src/container.rs @@ -0,0 +1,978 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +use super::clone::clone_with_stack; +use super::env::Env; +use super::error::{AddContext, Context, Error}; +use super::exit_status::ExitStatus; +use super::fd::{pipe, write_bytes, Fd}; +use super::id_map::make_id_map; +use super::mount::Mount; +use super::namespace::Namespace; +use super::net::IfName; +use super::pid::Pid; +use super::pty::PtyChild; +use super::seccomp; +use super::stdio::Stdio; +use super::util::reset_signal_handling; +use super::util::to_cstring; + +use nix::sched::{sched_setaffinity, CpuSet}; +use serde::de::DeserializeOwned; +use serde::Serialize; +use syscalls::Errno; + +use std::borrow::Cow; +use std::collections::BTreeMap; +use std::ffi::CString; +use std::ffi::OsStr; +use std::ffi::OsString; +use std::io::Read; +use std::os::unix::ffi::OsStrExt; +use std::os::unix::io::AsRawFd; +use std::path::Path; + +/// A `Container` is a configuration of how a process shall be spawned. It can, +/// but doesn't have to, include Linux namespace configuration. +/// +/// NOTE: Configuring resource limits via cgroups is not yet supported. +pub struct Container { + pub(super) env: Env, + current_dir: Option, + chroot: Option, + pub(super) namespace: Namespace, + pub(super) stdin: Stdio, + pub(super) stdout: Stdio, + pub(super) stderr: Stdio, + pub(super) uid_map: Vec<(libc::uid_t, libc::uid_t, u32)>, + pub(super) gid_map: Vec<(libc::uid_t, libc::uid_t, u32)>, + mounts: Vec, + local_networking_only: bool, + hostname: Option, + domainname: Option, + seccomp: Option, + pub(super) pty: Option, + /// The core number to which the new process, and descendents, will be + /// pinned. 
+ affinity: Option, +} + +impl Default for Container { + fn default() -> Self { + Self { + env: Default::default(), + current_dir: None, + chroot: None, + namespace: Default::default(), + stdin: Stdio::inherit(), + stdout: Stdio::inherit(), + stderr: Stdio::inherit(), + uid_map: Vec::new(), + gid_map: Vec::new(), + mounts: Vec::new(), + local_networking_only: false, + hostname: None, + domainname: None, + seccomp: None, + pty: None, + affinity: None, + } + } +} + +impl Container { + /// Creates a new `Container` that inherits everything from the parent + /// process. + pub fn new() -> Self { + Self::default() + } + + /// Inserts or updates an environment variable mapping. + /// + /// Note that environment variable names are case-insensitive (but + /// case-preserving) on Windows, and case-sensitive on all other platforms. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```no_run + /// use reverie_process::Container; + /// + /// let container = Container::new() + /// .env("PATH", "/bin"); + /// ``` + pub fn env(&mut self, key: K, val: V) -> &mut Self + where + K: AsRef, + V: AsRef, + { + self.env.set(key.as_ref(), val.as_ref()); + self + } + + /// Adds or updates multiple environment variable mappings. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```no_run + /// use reverie_process::{Container, Stdio}; + /// use std::env; + /// use std::collections::HashMap; + /// + /// let filtered_env : HashMap = + /// env::vars().filter(|&(ref k, _)| + /// k == "TERM" || k == "TZ" || k == "LANG" || k == "PATH" + /// ).collect(); + /// + /// let container = Container::new() + /// .stdin(Stdio::null()) + /// .stdout(Stdio::inherit()) + /// .env_clear() + /// .envs(&filtered_env); + /// ``` + pub fn envs(&mut self, vars: I) -> &mut Self + where + I: IntoIterator, + K: AsRef, + V: AsRef, + { + for (k, v) in vars.into_iter() { + self.env(k, v); + } + self + } + + /// Removes an environment variable mapping. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ```no_run + /// use reverie_process::Container; + /// + /// let container = Container::new() + /// .env_remove("PATH"); + /// ``` + pub fn env_remove>(&mut self, key: K) -> &mut Self { + self.env.remove(key.as_ref()); + self + } + + /// Clears the entire environment map for the child process. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```no_run + /// use reverie_process::Container; + /// + /// let container = Container::new() + /// .env_clear(); + /// ``` + pub fn env_clear(&mut self) -> &mut Self { + self.env.clear(); + self + } + + /// Sets the working directory for the child process. + /// + /// # Interaction with `chroot` + /// + /// The working directory is set *after* the chroot is performed (if a chroot + /// directory is specified). Thus, the path given is relative to the chroot + /// directory. Otherwise, if no chroot directory is specified, the working + /// directory is relative to the current working directory of the parent + /// process at the time the child process is spawned. + /// + /// # Platform-specific behavior + /// + /// If the program path is relative (e.g., `"./script.sh"`), it's ambiguous + /// whether it should be interpreted relative to the parent's working + /// directory or relative to `current_dir`. The behavior in this case is + /// platform specific and unstable, and it's recommended to use + /// [`canonicalize`] to get an absolute program path instead. + /// + /// [`canonicalize`]: std::fs::canonicalize() + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```no_run + /// use reverie_process::Container; + /// + /// let container = Container::new() + /// .current_dir("/bin"); + /// ``` + pub fn current_dir>(&mut self, dir: P) -> &mut Self { + self.current_dir = Some(to_cstring(dir.as_ref())); + self + } + + /// Sets configuration for the child process's standard input (stdin) handle. 
+ /// + /// Defaults to [`Stdio::inherit`] when used with `spawn` or `status`, and + /// defaults to [`Stdio::piped`] when used with `output`. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```no_run + /// use reverie_process::{Container, Stdio}; + /// + /// let container = Container::new() + /// .stdin(Stdio::null()); + /// ``` + pub fn stdin>(&mut self, cfg: T) -> &mut Self { + self.stdin = cfg.into(); + self + } + + /// Sets configuration for the child process's standard output (stdout) + /// handle. + /// + /// Defaults to [`Stdio::inherit`] when used with `spawn` or `status`, and + /// defaults to [`Stdio::piped`] when used with `output`. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```no_run + /// use reverie_process::{Container, Stdio}; + /// + /// let container = Container::new() + /// .stdout(Stdio::null()); + /// ``` + pub fn stdout>(&mut self, cfg: T) -> &mut Self { + self.stdout = cfg.into(); + self + } + + /// Sets configuration for the child process's standard error (stderr) + /// handle. + /// + /// Defaults to [`Stdio::inherit`] when used with `spawn` or `status`, and + /// defaults to [`Stdio::piped`] when used with `output`. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```no_run + /// use reverie_process::{Container, Stdio}; + /// + /// let container = Container::new() + /// .stderr(Stdio::null()); + /// ``` + pub fn stderr>(&mut self, cfg: T) -> &mut Self { + self.stderr = cfg.into(); + self + } + + /// Changes the root directory of the calling process to the specified path. + /// This directory will be inherited by all child processes of the calling + /// process. + /// + /// Note that changing the root directory may cause the program to not be + /// found. As such, the program path should be relative to this directory. 
+ pub fn chroot>(&mut self, chroot: P) -> &mut Self { + self.chroot = Some(to_cstring(chroot.as_ref())); + self + } + + /// Unshares parts of the process execution context that are normally shared + /// with the parent process. This is useful for executing the child process + /// in a new namespace. + pub fn unshare(&mut self, namespace: Namespace) -> &mut Self { + self.namespace |= namespace; + self + } + + /// Returns the working directory for the child process. + /// + /// This returns None if the working directory will not be changed. + pub fn get_current_dir(&self) -> Option<&Path> { + if let Some(dir) = &self.current_dir { + Some(Path::new(OsStr::from_bytes(dir.to_bytes()))) + } else { + None + } + } + + /// Returns an iterator of the environment variables that will be set when + /// the process is spawned. Note that this does not include any environment + /// variables inherited from the parent process. + pub fn get_envs(&self) -> impl Iterator)> { + self.env.iter() + } + + /// Returns a mapping of all environment variables that the new child process + /// will inherit. + pub fn get_captured_envs(&self) -> BTreeMap { + self.env.capture() + } + + /// Gets an environment variable. If the child process is to inherit this + /// environment variable from the current process, then this returns the + /// current process's environment variable unless it is to be overridden. + pub fn get_env>(&self, env: K) -> Option> { + self.env.get_captured(env) + } + + /// Maps one user ID to another. + /// + /// Implies `Namespace::USER`. + /// + /// # Example + /// + /// This is can be used to gain `CAP_SYS_ADMIN` privileges in the user + /// namespace by mapping the root user inside the container to the current + /// user outside of the container. 
+ /// + /// ```no_run + /// use reverie_process::Container; + /// + /// let container = Container::new() + /// .map_uid(1, unsafe { libc::getuid() }); + /// ``` + /// + /// # Implementation + /// + /// This modifies `/proc/{pid}/uid_map` where `{pid}` is the PID of the child + /// process. See [`user_namespaces(7)`] for more details. + /// + /// [`user_namespaces(7)`]: https://man7.org/linux/man-pages/man7/user_namespaces.7.html + pub fn map_uid(&mut self, inside_uid: libc::uid_t, outside_uid: libc::uid_t) -> &mut Self { + self.map_uid_range(inside_uid, outside_uid, 1) + } + + /// Maps potentially many user IDs inside the new user namespace to user IDs + /// outside of the user namespace. + /// + /// Implies `Namespace::USER`. + /// + /// # Implementation + /// + /// This modifies `/proc/{pid}/uid_map` where `{pid}` is the PID of the child + /// process. See [`user_namespaces(7)`] for more details. + /// + /// [`user_namespaces(7)`]: https://man7.org/linux/man-pages/man7/user_namespaces.7.html + pub fn map_uid_range( + &mut self, + starting_inside_uid: libc::uid_t, + starting_outside_uid: libc::uid_t, + count: u32, + ) -> &mut Self { + self.uid_map + .push((starting_inside_uid, starting_outside_uid, count)); + self.namespace |= Namespace::USER; + self + } + + /// Convience function for mapping root (inside the container) to the current + /// user ID (outside the container). This is useful for gaining new + /// capabilities inside the container, such as being able to mount file + /// systems. + /// + /// Implies `Namespace::USER`. + /// + /// This is the same as: + /// ```no_run + /// use reverie_process::Container; + /// + /// let container = Container::new() + /// .map_uid(0, unsafe { libc::geteuid() }) + /// .map_gid(0, unsafe { libc::getegid() }); + /// ``` + pub fn map_root(&mut self) -> &mut Self { + self.map_uid(0, unsafe { libc::geteuid() }); + self.map_gid(0, unsafe { libc::getegid() }) + } + + /// Maps one group ID to another. 
+ /// + /// Implies `Namespace::USER`. + /// + /// # Implementation + /// + /// This modifies `/proc/{pid}/gid_map` where `{pid}` is the PID of the child + /// process. See [`user_namespaces(7)`] for more details. + /// + /// [`user_namespaces(7)`]: https://man7.org/linux/man-pages/man7/user_namespaces.7.html + pub fn map_gid(&mut self, inside_gid: libc::gid_t, outside_gid: libc::gid_t) -> &mut Self { + self.map_gid_range(inside_gid, outside_gid, 1) + } + + /// Maps potentially many group IDs inside the new user namespace to group + /// IDs outside of the user namespace. + /// + /// Implies `Namespace::USER`. + /// + /// # Implementation + /// + /// This modifies `/proc/{pid}/gid_map` where `{pid}` is the PID of the child + /// process. See [`user_namespaces(7)`] for more details. + /// + /// [`user_namespaces(7)`]: https://man7.org/linux/man-pages/man7/user_namespaces.7.html + pub fn map_gid_range( + &mut self, + starting_inside_gid: libc::gid_t, + starting_outside_gid: libc::gid_t, + count: u32, + ) -> &mut Self { + self.namespace |= Namespace::USER; + self.gid_map + .push((starting_inside_gid, starting_outside_gid, count)); + self + } + + /// Sets the hostname of the container. + /// + /// Implies `Namespace::UTS`, which requires `CAP_SYS_ADMIN`. + /// + /// ```no_run + /// use reverie_process::Container; + /// + /// let container = Container::new() + /// .map_root() + /// .hostname("foobar.local"); + /// ``` + pub fn hostname>(&mut self, hostname: S) -> &mut Self { + self.namespace |= Namespace::UTS; + self.hostname = Some(hostname.into()); + self + } + + /// Sets the domain name of the container. + /// + /// Implies `Namespace::UTS`, which requires `CAP_SYS_ADMIN`. 
+ /// + /// # Example + /// + /// ```no_run + /// use reverie_process::Container; + /// + /// let container = Container::new() + /// .map_root() + /// .domainname("foobar"); + /// ``` + pub fn domainname>(&mut self, domainname: S) -> &mut Self { + self.namespace |= Namespace::UTS; + self.domainname = Some(domainname.into()); + self + } + + /// Gets the hostname of the container. + pub fn get_hostname(&self) -> Option<&OsStr> { + self.hostname.as_ref().map(AsRef::as_ref) + } + + /// Gets the domainname of the container. + pub fn get_domainname(&self) -> Option<&OsStr> { + self.domainname.as_ref().map(AsRef::as_ref) + } + + /// Adds a file system to be mounted. Note that these are mounted in the same + /// order as given. + /// + /// Implies `Namespace::MOUNT`. Note that `Namespace::USER` should also have + /// been set and `map_uid` should have been called in order to gain the + /// privileges required to mount. + pub fn mount(&mut self, mount: Mount) -> &mut Self { + self.namespace |= Namespace::MOUNT; + self.mounts.push(mount); + self + } + + /// Adds multiple mounts. + pub fn mounts(&mut self, mounts: I) -> &mut Self + where + I: IntoIterator, + { + self.namespace |= Namespace::MOUNT; + self.mounts.extend(mounts); + self + } + + /// Sets up the container to have local networking only. This will prevent + /// any network communication to the outside world. + /// + /// Implies `Namespace::NETWORK` and `Namespace::MOUNT`. + /// + /// This also causes a fresh `/sys` to be mounted to avoid seeing the host + /// network interfaces in `/sys/class/net`. + pub fn local_networking_only(&mut self) -> &mut Self { + if !self.local_networking_only { + self.local_networking_only = true; + self.namespace |= Namespace::NETWORK; + self.mount(Mount::sysfs("/sys")); + } + self + } + + /// Sets the seccomp filter. The filter is loaded immediately before `execve` + /// and *after* all `pre_exec` callbacks have been executed. 
Thus, you will + /// still be able to call filtered syscalls from `pre_exec` callbacks. + pub fn seccomp(&mut self, filter: seccomp::Filter) -> &mut Self { + self.seccomp = Some(filter); + self + } + + /// Sets the controlling pseudoterminal for the child process). + /// + /// In the child process, this has the effect of: + /// 1. Creating a new session (with `setsid()`). + /// 2. Using an `ioctl` to set the controlling terminal. + /// 3. Setting this file descriptor as the stdio streams. + /// + /// NOTE: Since this modifies the stdio streams, calling this will reset + /// [`Self::stdin`], [`Self::stdout`], and [`Self::stderr`] back to + /// [`Stdio::inherit()`]. + pub fn pty(&mut self, child: PtyChild) -> &mut Self { + self.pty = Some(child); + self.stdin = Stdio::inherit(); + self.stdout = Stdio::inherit(); + self.stderr = Stdio::inherit(); + self + } + + /// Sets the CPU to which the child threads/processes will be pinned. + pub fn affinity(&mut self, affinity: usize) -> &mut Self { + self.affinity = Some(affinity); + self + } + + /// Called by the child process after `clone` to get itself set up for either + /// `execve` or running an arbitrary function. + /// + /// NOTE: Although this function takes `&mut self`, it is only called in the + /// context of the child process (which has a copy-on-write view of the + /// parent's virtual memory). Thus, the parent's version isn't actually + /// modified. + pub(super) fn setup( + &mut self, + context: &ChildContext, + pre_exec: &mut [Box Result<(), Errno> + Send + Sync>], + ) -> Result<(), Error> { + // NOTE: This function MUST NOT allocate or deallocate any memory! Doing + // so can cause random, difficult to diagnose deadlocks. + + if let Some(pty) = self.pty.take() { + // NOTE: This is done *before* setting the stdio streams so that the + // user can still override individual streams if they only want them + // to be partially attached to the tty. 
+ pty.login().context(Context::Tty)?; + } + + if let Some(fd) = context.stdin { + fd.dup2(libc::STDIN_FILENO) + .context(Context::Stdio)? + .leave_open(); + } + if let Some(fd) = context.stdout { + fd.dup2(libc::STDOUT_FILENO) + .context(Context::Stdio)? + .leave_open(); + } + if let Some(fd) = context.stderr { + fd.dup2(libc::STDERR_FILENO) + .context(Context::Stdio)? + .leave_open(); + } + + unsafe { reset_signal_handling() }.context(Context::ResetSignals)?; + + // Set up UID and GID maps. + if !context.uid_map.is_empty() { + context.map_uid().context(Context::MapUid)?; + } + + if !context.gid_map.is_empty() { + context.setgroups(false).context(Context::MapGid)?; + context.map_gid().context(Context::MapGid)?; + } + + // Set host name, if any. + if let Some(name) = &self.hostname { + Error::result( + unsafe { libc::sethostname(name.as_bytes().as_ptr() as *const _, name.len()) }, + Context::Hostname, + )?; + } + + // Set domain name, if any. + if let Some(name) = &self.domainname { + Error::result( + unsafe { libc::setdomainname(name.as_bytes().as_ptr() as *const _, name.len()) }, + Context::Domainname, + )?; + } + + // Mount all the things. + for mount in &mut self.mounts { + mount.mount().context(Context::Mount)?; + } + + // Change root directory. Note that we do this *after* mounting anything + // so that bind mounts sources that live outside of the chroot directory + // can work. + if let Some(chroot) = &self.chroot { + Error::result(unsafe { libc::chroot(chroot.as_ptr()) }, Context::Chroot)?; + } + + // Set working directory, if any. + if let Some(current_dir) = &self.current_dir { + Error::result(unsafe { libc::chdir(current_dir.as_ptr()) }, Context::Chdir)?; + } + + // Configure networking. + // TODO: Generalize this a bit to allow more complex configuration. + if self.local_networking_only { + // Need a socket to access the network interface. 
+ let sock = Fd::socket(libc::AF_INET, libc::SOCK_DGRAM, libc::IPPROTO_IP) + .context(Context::Network)?; + + let loopback = IfName::LOOPBACK; + + // Bring up the loopback interface in the newly mounted sysfs. + let flags = loopback.get_flags(&sock).context(Context::Network)?; + let flags = flags | libc::IFF_UP as i16; + loopback.set_flags(&sock, flags).context(Context::Network)?; + } + + if let Some(cpu) = self.affinity { + let mut cpu_set = CpuSet::new(); + cpu_set.set(cpu).context(Context::Affinity)?; + sched_setaffinity(nix::unistd::Pid::from_raw(0), &cpu_set) + .context(Context::Affinity)?; + } + + // NOTE: We must call our pre_exec callbacks BEFORE installing the + // seccomp filter because our callbacks could be calling syscalls that + // our seccomp filter may be intending to block. + for f in pre_exec { + f().context(Context::PreExec)?; + } + + // Set up the seccomp filter, if any. + if let Some(filter) = &self.seccomp { + filter.load().context(Context::Seccomp)?; + } + + Ok(()) + } + + /// Runs a function in a new process with the specified namespaces unshared. This + /// blocks until the function itself returns and the process has exited. + /// + /// # Safety + /// + /// - This should be called early on in the life of a process, before any + /// other threads are created. This reduces the chance that any global + /// resources (like the Tokio runtime) have been created yet. + /// + /// - Memory allocated in the parent must not be freed in the child, + /// especially if using jemalloc where a separate thread does deallocations. + pub fn run(&mut self, mut f: F) -> Result + where + F: FnMut() -> T, + T: Serialize + DeserializeOwned, + { + let clone_flags = self.namespace.bits() | libc::SIGCHLD; + + let uid_map = &make_id_map(&self.uid_map); + let gid_map = &make_id_map(&self.gid_map); + + let context = ChildContext { + // TODO: Honor stdio options. For now, always inherit from the + // parent process. 
+ stdin: None, + stdout: None, + stderr: None, + uid_map, + gid_map, + }; + + // Use a pipe for getting the result of the function out of the child + // process. + let (mut reader, writer) = pipe()?; + + let writer_fd = writer.as_raw_fd(); + + // NOTE: Must use a dynamically allocated stack here. Programs expect to + // have at least 2 MB of stack space and if we've already used up some + // stack space before this is called we could overflow the stack. + let mut stack = vec![0u8; 1024 * 1024 * 2]; + + // Disable io redirection just before forking. We want the child process to + // be able to call `println!()` and have that output go to stdout. + // + // See: https://github.com/rust-lang/rust/issues/35136 + let output_capture = std::io::set_output_capture(None); + + let result = clone_with_stack( + || { + let value = self.setup(&context, &mut []).map(|()| f()); + + let writer = std::io::BufWriter::new(Fd::new(writer_fd)); + + // Serialize this result with bincode and send it to the parent + // process via a pipe. + // + // TODO: Handle serialization errors(?) + bincode::serialize_into(writer, &value).expect("Failed to serialize return value"); + + 0 + }, + clone_flags, + &mut stack, + ); + + std::io::set_output_capture(output_capture); + + let child = WaitGuard::new(result?); + + // The writer end must be dropped first so that our reader doesn't block + // forever. + drop(writer); + + // Read the return value. Note that we do this *before* waiting on the + // process to exit. Otherwise, for return values that exceed the pipe + // capacity, we would deadlock. + let mut buf = Vec::new(); + match reader.read_to_end(&mut buf) { + Ok(0) => { + // The writer end was closed before anything could be written. + // This indicates that the process exited before the return + // value could be serialized. The only thing we can do in this + // case is collect the exit status of the process. 
+ // + // NOTE: Since we always send `Result` through the pipe, + // we can guarantee that a successful serialization will never + // be 0 bytes (since it always takes more than 0 bytes to encode + // that type). + // + // NOTE: Since `WaitGuard` is used, we guarantee that the + // process will be waited on in the other cases. + Err(RunError::ExitStatus(child.wait()?)) + } + Ok(n) => { + // FIXME: Handle errors + let value: Result = bincode::deserialize(&buf[0..n]).unwrap(); + Ok(value.unwrap()) + } + Err(err) => { + // FIXME: Handle this error + panic!("Got unexpected error: {}", err) + } + } + } +} + +pub(super) struct ChildContext<'a> { + pub stdin: Option<&'a Fd>, + pub stdout: Option<&'a Fd>, + pub stderr: Option<&'a Fd>, + pub uid_map: &'a [u8], + pub gid_map: &'a [u8], +} + +impl<'a> ChildContext<'a> { + fn map_uid(&self) -> Result<(), Errno> { + write_bytes(b"/proc/self/uid_map\0", self.uid_map) + } + + fn map_gid(&self) -> Result<(), Errno> { + write_bytes(b"/proc/self/gid_map\0", self.gid_map) + } + + fn setgroups(&self, allow: bool) -> Result<(), Errno> { + write_bytes( + b"/proc/self/setgroups\0", + if allow { b"allow\0" } else { b"deny\0" }, + ) + } +} + +/// An error that ocurred while running a containerized function. +#[derive(thiserror::Error, Debug, Eq, PartialEq)] +pub enum RunError { + /// An error that occurred while spawning the container. + #[error("Process failed to spawn: {0}")] + Spawn(#[from] Error), + + /// The function exited prematurely. This can happen if the function called + /// `std::process::exit(0)`, preventing the return value from being sent to + /// the parent. It can also happen if the process panics. + #[error("Process exited with code: {0:?}")] + ExitStatus(ExitStatus), +} + +impl From for RunError { + fn from(errno: Errno) -> Self { + Self::Spawn(Error::from(errno)) + } +} + +// Helper guard for making sure that the process gets waited on even if an error +// is encountered. 
+struct WaitGuard(Option); + +impl WaitGuard { + pub fn new(pid: Pid) -> Self { + Self(Some(pid)) + } + + /// Eagerly waits for the pid. Otherwise, it'll get waited on upon drop. + pub fn wait(mut self) -> Result { + let pid = self.0.take().unwrap(); + + let mut status = 0; + let ret = Errno::result(unsafe { libc::waitpid(pid.as_raw(), &mut status, 0) })?; + assert_ne!(ret, 0); + + Ok(ExitStatus::from_raw(status)) + } +} + +impl Drop for WaitGuard { + fn drop(&mut self) { + if let Some(pid) = self.0.take() { + let mut status = 0; + unsafe { + libc::waitpid(pid.as_raw(), &mut status, 0); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use nix::sys::signal::Signal; + + #[test] + fn can_panic() { + assert_eq!( + Container::new().run(|| panic!()), + Err(RunError::ExitStatus(ExitStatus::Signaled( + Signal::SIGABRT, + true + ))) + ); + } + + #[test] + fn is_new_process() { + let my_pid = unsafe { libc::getpid() }; + + assert_eq!( + Container::new().run(|| { + assert_ne!(unsafe { libc::getpid() }, 1); + assert_ne!(unsafe { libc::getpid() }, my_pid); + assert_eq!(unsafe { libc::getppid() }, my_pid); + }), + Ok(()) + ); + } + + #[test] + fn pid_namespace() { + assert_eq!( + Container::new() + .unshare(Namespace::USER | Namespace::PID) + .run(|| { + // New PID namespace, so this should be the init process. + assert_eq!(unsafe { libc::getpid() }, 1); + }), + Ok(()) + ); + } + + #[test] + fn return_value() { + assert_eq!(Container::new().run(|| 42), Ok(42)); + + assert_eq!( + Container::new().run(|| String::from("foobar")), + Ok("foobar".into()) + ); + } + + #[test] + fn huge_return_value() { + assert_eq!( + Container::new().run(|| { + // Need something larger than /proc/sys/fs/pipe-max-size, which + // is typically 1MB. 
+ vec![42; 10 * 1024 * 1024 /* 10 MB */] + }), + Ok(vec![42; 10 * 1024 * 1024]) + ); + } + + #[test] + pub fn bind_to_low_port() { + use std::net::Ipv4Addr; + use std::net::SocketAddrV4; + use std::net::TcpListener; + + let addr = Container::new() + .map_root() + .local_networking_only() + .run(|| { + let listener = TcpListener::bind("127.0.0.1:80").unwrap(); + listener.local_addr().unwrap() + }) + .unwrap(); + + assert_eq!( + addr, + SocketAddrV4::new(Ipv4Addr::new(127, 0, 0, 1), 80).into() + ); + } + + #[test] + pub fn pin_affinity_to_all_cores() -> Result<(), Error> { + use raw_cpuid::CpuId; + use std::collections::HashMap; + + let cpus = num_cpus::get(); + println!("Total cpus {}", cpus); + + // Map the apic_id to the number of times we observed it: + let mut results: HashMap = HashMap::new(); + for core in 0..cpus { + println!(" Launching guest with affinity set to {}", core); + let mut container = Container::new(); + container.affinity(core); + let which_core = container + .run(|| { + let cpuid = CpuId::new(); + cpuid + .get_feature_info() + .expect("cpuid failed") + .initial_local_apic_id() + }) + .unwrap(); + println!(" Guest sees its on APIC id {}", which_core); + *results.entry(which_core).or_default() += 1; + } + + println!("Final table size {:?}", results.len()); + assert_eq!(results.values().fold(0, |n, v| std::cmp::max(n, *v)), 1); + Ok(()) + } +} diff --git a/reverie-process/src/env.rs b/reverie-process/src/env.rs new file mode 100644 index 0000000..5bc0d08 --- /dev/null +++ b/reverie-process/src/env.rs @@ -0,0 +1,108 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +use std::borrow::Cow; +use std::collections::BTreeMap; +use std::ffi::{CString, OsStr, OsString}; + +use super::util::CStringArray; + +/// A mapping of environment variables. 
+#[derive(Default, Clone, Debug)] +pub struct Env { + clear: bool, + vars: BTreeMap>, +} + +impl Env { + /// Clear out all environment variables, including the ones inherited from + /// the parent process. Any variables set after this are completely new + /// variables. + pub fn clear(&mut self) { + self.clear = true; + self.vars.clear(); + } + + pub fn is_cleared(&self) -> bool { + self.clear + } + + pub fn set(&mut self, key: &OsStr, value: &OsStr) { + self.vars.insert(key.to_owned(), Some(value.to_owned())); + } + + pub fn get>(&self, key: K) -> Option<&OsStr> { + self.vars + .get(key.as_ref()) + .and_then(|v| v.as_ref().map(|v| v.as_os_str())) + } + + pub fn get_captured>(&self, key: K) -> Option> { + let key = key.as_ref(); + + if !self.clear { + if let Some(var) = std::env::var_os(key) { + return Some(Cow::Owned(var)); + } + } + + self.get(key).map(Cow::Borrowed) + } + + pub fn remove(&mut self, key: &OsStr) { + if self.clear { + self.vars.remove(key); + } else { + self.vars.insert(key.to_owned(), None); + } + } + + /// Capture the current environment and merge it with the changes we've + /// applied. + pub fn capture(&self) -> BTreeMap { + let mut env = if self.clear { + BTreeMap::new() + } else { + // Capture from the current environment. 
+ std::env::vars_os().collect() + }; + + for (k, v) in &self.vars { + if let Some(ref v) = v { + env.insert(k.clone(), v.clone()); + } else { + env.remove(k); + } + } + + env + } + + pub fn array(&self) -> CStringArray { + use std::os::unix::ffi::OsStringExt; + + let env = self.capture(); + + let mut result = CStringArray::with_capacity(env.len()); + for (mut k, v) in env { + // Reserve additional space for '=' and null terminator + k.reserve_exact(v.len() + 2); + k.push("="); + k.push(&v); + + // Add the new entry into the array + result.push(CString::new(k.into_vec()).unwrap()); + } + + result + } + + pub fn iter(&self) -> impl Iterator)> { + self.vars.iter().map(|(k, v)| (k.as_ref(), v.as_deref())) + } +} diff --git a/reverie-process/src/error.rs b/reverie-process/src/error.rs new file mode 100644 index 0000000..7581d09 --- /dev/null +++ b/reverie-process/src/error.rs @@ -0,0 +1,189 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +use core::fmt; + +use serde::{Deserialize, Serialize}; +use syscalls::Errno; + +/// Context associated with [`Error`]. Useful for knowing which particular part +/// of [`super::Command::spawn`] failed. +#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize)] +#[repr(u32)] +pub enum Context { + /// No context provided. + Unknown, + /// Setting CPU affinity failed. + Affinity, + /// The clone syscall failed. + Clone, + /// Setting up the tty failed. + Tty, + /// Setting up stdio failed. + Stdio, + /// Resetting signals failed. + ResetSignals, + /// Changing `/proc/{pid}/uid_map` failed. + MapUid, + /// Changing `/proc/{pid}/setgroups` or `/proc/{pid}/gid_map` failed. + MapGid, + /// Setting the hostname failed. + Hostname, + /// Setting the domainname failed. + Domainname, + /// Chroot failed. + Chroot, + /// Chdir failed. 
+ Chdir, + /// Mounting failed. + Mount, + /// Network configuration failed. + Network, + /// The pre_exec callback(s) failed. + PreExec, + /// Setting the seccomp filter failed. + Seccomp, + /// Exec failed. + Exec, +} + +impl Context { + /// Returns a string representation of the context. + pub fn as_str(&self) -> &'static str { + match self { + Self::Unknown => "Unknown failure", + Self::Affinity => "setting cpu affinity failed", + Self::Clone => "clone failed", + Self::Tty => "Setting the controlling tty failed", + Self::Stdio => "Setting up stdio file descriptors failed", + Self::ResetSignals => "Reseting signal handlers failed", + Self::MapUid => "Setting UID map failed", + Self::MapGid => "Setting GID map failed", + Self::Hostname => "Setting hostname failed", + Self::Domainname => "Setting domainname failed", + Self::Chroot => "chroot failed", + Self::Chdir => "chdir failed", + Self::Mount => "mount failed", + Self::Network => "network configuration failed", + Self::PreExec => "pre_exec callback(s) failed", + Self::Seccomp => "failed to install seccomp filter", + Self::Exec => "execvp failed", + } + } +} + +impl fmt::Display for Context { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Write::write_str(f, self.as_str()) + } +} + +/// An error from spawning a process. This is a thin wrapper around +/// [`crate::Errno`], but with more context about what went wrong. +#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize)] +pub struct Error { + errno: Errno, + context: Context, +} + +impl Error { + /// Creates a new `Error`. + pub fn new(errno: Errno, context: Context) -> Self { + Self { errno, context } + } + + /// Converts a value `S` into an `Error`. Useful for turning `libc` function + /// return types into a `Result`. + pub fn result(value: S, context: Context) -> Result + where + S: syscalls::ErrnoSentinel + PartialEq, + { + Errno::result(value).map_err(|err| Self::new(err, context)) + } + + /// Gets the errno. 
+ pub fn errno(&self) -> Errno { + self.errno + } + + /// Gets the error context. + pub fn context(&self) -> Context { + self.context + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + writeln!(f, "{}: {}", self.context, self.errno) + } +} + +impl std::error::Error for Error {} + +impl From for Error { + fn from(err: Errno) -> Self { + Self::new(err, Context::Unknown) + } +} + +impl From for Errno { + fn from(err: Error) -> Errno { + err.errno + } +} + +impl From for std::io::Error { + fn from(err: Error) -> Self { + std::io::Error::from(err.errno) + } +} + +impl From<[u8; 8]> for Error { + /// Deserializes an `Error` from bytes. Useful for receiving the error + /// through a pipe from the child process. + fn from(bytes: [u8; 8]) -> Self { + debug_assert_eq!(core::mem::size_of::(), 8); + unsafe { core::mem::transmute(bytes) } + } +} + +impl From for [u8; 8] { + /// Serializes an `Error` into bytes. Useful for sending the error through a + /// pipe to the parent process. 
+ fn from(error: Error) -> Self { + debug_assert_eq!(core::mem::size_of::(), 8); + unsafe { core::mem::transmute(error) } + } +} + +pub(super) trait AddContext { + fn context(self, context: Context) -> Result; +} + +impl AddContext for Result { + fn context(self, context: Context) -> Result { + self.map_err(move |errno| Error::new(errno, context)) + } +} + +impl AddContext for Result { + fn context(self, context: Context) -> Result { + self.map_err(move |errno| Error::new(Errno::new(errno as i32), context)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn to_bytes() { + let bytes: [u8; 8] = Error::new(Errno::ENOENT, Context::Exec).into(); + assert_eq!(Error::from(bytes), Error::new(Errno::ENOENT, Context::Exec)); + } +} diff --git a/reverie-process/src/exit_status.rs b/reverie-process/src/exit_status.rs new file mode 100644 index 0000000..8fcfb59 --- /dev/null +++ b/reverie-process/src/exit_status.rs @@ -0,0 +1,331 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +use std::os::unix::process::ExitStatusExt; + +use nix::sys::signal::{self, SigHandler, SigSet, SigmaskHow, Signal}; + +/// Describes the result of a process after it has exited. +/// +/// This is similar to `std::process::ExitStatus`, but is easier to match +/// against and provides additional functionality like `raise_or_exit` that +/// helps with propagating an exit status. +#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)] +pub enum ExitStatus { + /// Program exited with an exit code. + Exited(i32), + /// Program killed by signal, with or without a coredump. + Signaled(Signal, bool), +} + +impl ExitStatus { + /// A successful exit status. + pub const SUCCESS: Self = ExitStatus::Exited(0); + + /// Construct an `ExitStatus` from a raw exit code. 
+ pub fn from_raw(code: i32) -> Self { + if libc::WIFEXITED(code) { + ExitStatus::Exited(libc::WEXITSTATUS(code)) + } else { + ExitStatus::Signaled( + Signal::try_from(libc::WTERMSIG(code)).unwrap(), + libc::WCOREDUMP(code), + ) + } + } + + /// Converts the exit status into a raw number. + pub fn into_raw(self) -> i32 { + match self { + ExitStatus::Exited(code) => code << 8, + ExitStatus::Signaled(sig, coredump) => { + if coredump { + (sig as i32 | 0x80) & 0xff + } else { + sig as i32 & 0x7f + } + } + } + } + + /// If the process was terminated by a signal, returns that signal. + pub fn signal(&self) -> Option { + match self { + ExitStatus::Exited(_) => None, + ExitStatus::Signaled(sig, _) => Some(*sig as i32 & 0x7f), + } + } + + /// Was termination successful? Signal termination is not considered a + /// success, and success is defined as a zero exit status. + pub fn success(&self) -> bool { + self == &ExitStatus::Exited(0) + } + + /// Returns the exit code of the process, if any. If the process was + /// terminated by a signal, this will return `None`. + pub fn code(&self) -> Option { + if let ExitStatus::Exited(code) = *self { + Some(code) + } else { + None + } + } + + /// Propagate the exit status such that the current process exits in the same + /// way that the child process exited. + pub fn raise_or_exit(self) -> ! { + match self { + ExitStatus::Signaled(signal, core_dump) => { + if core_dump { + // Prevent the current process from producing a core dump as + // well when the signal is propagated. + let limit = libc::rlimit { + rlim_cur: 0, + rlim_max: 0, + }; + unsafe { + libc::setrlimit(libc::RLIMIT_CORE, &limit) + }; + } + + // Raise the same signal, which may or may not be fatal. + let _ = unsafe { signal::signal(signal, SigHandler::SigDfl) }; + let _ = signal::raise(signal); + + // Unblock the signal. 
+ let mut mask = SigSet::empty(); + mask.add(signal); + let _ = signal::sigprocmask(SigmaskHow::SIG_UNBLOCK, Some(&mask), None); + + // Incase the signal is not fatal: + std::process::exit(signal as i32 + 128); + } + ExitStatus::Exited(code) => std::process::exit(code), + } + } +} + +impl From for std::process::ExitStatus { + fn from(status: ExitStatus) -> Self { + Self::from_raw(status.into_raw()) + } +} + +impl From for ExitStatus { + fn from(status: std::process::ExitStatus) -> Self { + if let Some(sig) = status.signal() { + ExitStatus::Signaled(Signal::try_from(sig).unwrap(), true) + } else { + ExitStatus::Exited(status.code().unwrap_or(255)) + } + } +} + +impl serde::Serialize for ExitStatus { + fn serialize(&self, serializer: S) -> Result + where + S: serde::ser::Serializer, + { + serializer.serialize_i32(self.into_raw()) + } +} + +impl<'de> serde::Deserialize<'de> for ExitStatus { + fn deserialize(deserializer: D) -> Result + where + D: serde::de::Deserializer<'de>, + { + let value = i32::deserialize(deserializer)?; + Ok(ExitStatus::from_raw(value)) + } +} + +#[cfg(all(test, not(sanitized)))] +mod tests_non_sanitized { + use super::*; + use nix::{ + sys::{ + signal::{self, Signal}, + wait::{waitpid, WaitStatus}, + }, + unistd::{fork, ForkResult}, + }; + + // Runs a closure in a forked process and reports the exit status. + fn run_forked(f: F) -> nix::Result + where + F: FnOnce() -> nix::Result<()>, + { + match unsafe { fork() }? { + ForkResult::Parent { child, .. } => { + // Simply wait for the child to exit. + match waitpid(child, None)? { + WaitStatus::Exited(_, code) => Ok(ExitStatus::Exited(code)), + WaitStatus::Signaled(_, sig, coredump) => { + Ok(ExitStatus::Signaled(sig, coredump)) + } + wait_status => unreachable!("Got unexpected wait status: {:?}", wait_status), + } + } + ForkResult::Child => { + // Suppress core dumps for testing purposes. 
+ let limit = libc::rlimit { + rlim_cur: 0, + rlim_max: 0, + }; + unsafe { + // restore some sighandlers to default + for &sig in &[libc::SIGALRM, libc::SIGINT, libc::SIGVTALRM] { + libc::signal(sig, libc::SIG_DFL); + } + // disable coredump + libc::setrlimit(libc::RLIMIT_CORE, &limit) + }; + + // Run the child. + let code = match f() { + Ok(()) => 0, + Err(err) => { + eprintln!("{}", err); + 1 + } + }; + + // The closure should have called `exit` by this point, but just + // in case it didn't, call it ourselves. + // + // Note: We also can't use the normal exit function here because we + // don't want to call atexit handlers since `execve` was never + // called. + unsafe { + ::libc::_exit(code) + }; + } + } + } + + #[test] + fn normal_exit() { + assert_eq!( + run_forked(|| { + unsafe { libc::_exit(0) } + }), + Ok(ExitStatus::Exited(0)) + ); + + assert_eq!( + run_forked(|| { + unsafe { libc::_exit(42) } + }), + Ok(ExitStatus::Exited(42)) + ); + + // Thread exit + assert_eq!( + run_forked(|| { + unsafe { + libc::syscall(libc::SYS_exit, 42) + }; + unreachable!(); + }), + Ok(ExitStatus::Exited(42)) + ); + + // exit_group. Should be identical to `libc::_exit`. + assert_eq!( + run_forked(|| { + unsafe { + libc::syscall(libc::SYS_exit_group, 42) + }; + unreachable!(); + }), + Ok(ExitStatus::Exited(42)) + ); + } + + #[test] + fn exit_by_signal() { + assert_eq!( + run_forked(|| { + signal::raise(Signal::SIGALRM)?; + unreachable!(); + }), + Ok(ExitStatus::Signaled(Signal::SIGALRM, false)) + ); + + assert_eq!( + run_forked(|| { + signal::raise(Signal::SIGILL)?; + unreachable!(); + }), + Ok(ExitStatus::Signaled(Signal::SIGILL, true)) + ); + } + + #[test] + fn propagate_exit() { + // NOTE: These tests fail under a sanitized build. ASAN leak detection + // must be disabled for this to run correctly. To disable ASAN leak + // detection, set the `ASAN_OPTIONS=detect_leaks=0` environment variable + // *before* the test starts up. (This is currently done in the TARGETS + // file.) 
Alternatively, we *could* bypass the atexit handler that ASAN + // sets up by calling `libc::_exit`, but that may have unintended + // consequences for real code. + assert_eq!( + run_forked(|| { ExitStatus::Exited(0).raise_or_exit() }), + Ok(ExitStatus::Exited(0)) + ); + assert_eq!( + run_forked(|| { ExitStatus::Exited(42).raise_or_exit() }), + Ok(ExitStatus::Exited(42)) + ); + } + + #[test] + fn propagate_signal() { + assert_eq!( + run_forked(|| { ExitStatus::Signaled(Signal::SIGILL, true).raise_or_exit() }), + Ok(ExitStatus::Signaled(Signal::SIGILL, true)) + ); + assert_eq!( + run_forked(|| { ExitStatus::Signaled(Signal::SIGALRM, false).raise_or_exit() }), + Ok(ExitStatus::Signaled(Signal::SIGALRM, false)) + ); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn exit_code_into_raw() { + assert_eq!(ExitStatus::Exited(1).into_raw(), 0x1 << 8); + assert_eq!( + ExitStatus::Signaled(Signal::SIGINT, false).into_raw(), + Signal::SIGINT as i32 + ); + assert_eq!( + ExitStatus::Signaled(Signal::SIGILL, true).into_raw(), + 0x80 | Signal::SIGILL as i32 + ); + assert_ne!( + ExitStatus::Exited(2).into_raw(), + ExitStatus::Signaled(Signal::SIGINT, false).into_raw() + ); + } + + #[test] + fn exit_status_from_raw() { + assert_eq!(ExitStatus::from_raw(0x100).code(), Some(1)); + assert_eq!(ExitStatus::from_raw(0x100).signal(), None); + assert_eq!(ExitStatus::from_raw(0x84).code(), None); + assert_eq!(ExitStatus::from_raw(0x84).signal(), Some(4)); + } +} diff --git a/reverie-process/src/fd.rs b/reverie-process/src/fd.rs new file mode 100644 index 0000000..9e22165 --- /dev/null +++ b/reverie-process/src/fd.rs @@ -0,0 +1,531 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +use super::util; + +use core::pin::Pin; +use core::task::{Context, Poll}; + +use std::ffi::{CStr, CString}; +use std::io::{self, Read, Write}; +use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd, RawFd}; +use std::path::Path; + +use syscalls::Errno; +use tokio::io::unix::AsyncFd as TokioAsyncFd; +use tokio::io::{AsyncRead, AsyncWrite, Interest, ReadBuf}; + +#[derive(Debug)] +// From `std/src/sys/unix/fd.rs`. Mark `-1` as an invalid file descriptor so it +// can be reused to in `Option`. +#[rustc_layout_scalar_valid_range_start(0)] +#[rustc_layout_scalar_valid_range_end(0xFF_FF_FF_FE)] +pub struct Fd(i32); + +/// An asynchronous file descriptor. The file descriptor is guaranteed to be in +/// non-blocking mode and implements `AsyncRead` and `AsyncWrite`. +#[derive(Debug)] +pub struct AsyncFd(TokioAsyncFd); + +impl Fd { + pub fn new(fd: i32) -> Self { + assert_ne!(fd, -1); + unsafe { Self(fd) } + } + + #[allow(dead_code)] + pub fn open>(path: P, flags: i32) -> Result { + let path = util::to_cstring(path.as_ref()); + Self::open_c(path.as_ptr(), flags) + } + + /// Opens a file from a NUL terminated string. This function does not + /// allocate. + pub fn open_c(path: *const libc::c_char, flags: i32) -> Result { + let fd = Errno::result(unsafe { libc::open(path, flags) })?; + Ok(unsafe { Self(fd) }) + } + + /// Creates a file from a NUL terminated string. This function does not allocate. + pub fn create_c( + path: *const libc::c_char, + flags: i32, + mode: libc::mode_t, + ) -> Result { + let fd = Errno::result(unsafe { libc::open(path, flags | libc::O_CREAT, mode) })?; + Ok(unsafe { Self(fd) }) + } + + pub fn null(readable: bool) -> Result { + let path = unsafe { CStr::from_bytes_with_nul_unchecked(b"/dev/null\0") }; + Self::open_c( + path.as_ptr(), + if readable { + libc::O_RDONLY + } else { + libc::O_WRONLY + }, + ) + } + + /// Creates an endpoint for communications and returns a file descriptor that + /// refers to that endpoint. 
+ pub fn socket(domain: i32, ty: i32, protocol: i32) -> Result { + Errno::result(unsafe { libc::socket(domain, ty, protocol) }).map(Self::new) + } + + fn set_nonblocking(&self) -> Result<(), Errno> { + let fd = self.as_raw_fd(); + let flags = Errno::result(unsafe { libc::fcntl(fd, libc::F_GETFL) })?; + Errno::result(unsafe { libc::fcntl(fd, libc::F_SETFL, flags | libc::O_NONBLOCK) })?; + Ok(()) + } + + /// Returns true if the file descriptor is nonblocking. + #[allow(unused)] + pub fn is_nonblocking(&self) -> Result { + let fd = self.as_raw_fd(); + let flags = Errno::result(unsafe { libc::fcntl(fd, libc::F_GETFL) })?; + Ok(flags & libc::O_NONBLOCK == libc::O_NONBLOCK) + } + + pub fn dup(&self) -> Result { + let fd = Errno::result(unsafe { libc::dup(self.0) })?; + Ok(unsafe { Fd(fd) }) + } + + pub fn dup2(&self, newfd: RawFd) -> Result { + let fd = Errno::result(unsafe { libc::dup2(self.0, newfd) })?; + Ok(unsafe { Fd(fd) }) + } + + #[allow(unused)] + pub fn close(self) -> Result<(), Errno> { + let fd = self.0; + core::mem::forget(self); + Errno::result(unsafe { libc::close(fd) })?; + Ok(()) + } + + /// Discards the file descriptor without closing it. 
+ pub fn leave_open(self) { + core::mem::forget(self); + } +} + +impl IntoRawFd for Fd { + fn into_raw_fd(self) -> RawFd { + let fd = self.as_raw_fd(); + core::mem::forget(self); + fd + } +} + +impl Drop for Fd { + fn drop(&mut self) { + let _ = unsafe { libc::close(self.0) }; + } +} + +impl Read for Fd { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + let res = Errno::result(unsafe { + libc::read( + self.0, + buf.as_mut_ptr() as *mut libc::c_void, + buf.len() as libc::size_t, + ) + })?; + + Ok(res as usize) + } +} + +impl Write for Fd { + fn write(&mut self, buf: &[u8]) -> io::Result { + let res = Errno::result(unsafe { + libc::write( + self.0, + buf.as_ptr() as *const libc::c_void, + buf.len() as libc::size_t, + ) + })?; + + Ok(res as usize) + } + + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } +} + +impl AsRawFd for Fd { + fn as_raw_fd(&self) -> RawFd { + self.0.as_raw_fd() + } +} + +impl FromRawFd for Fd { + unsafe fn from_raw_fd(fd: i32) -> Self { + Self::new(fd) + } +} + +impl From for std::fs::File { + fn from(fd: Fd) -> Self { + unsafe { std::fs::File::from_raw_fd(fd.into_raw_fd()) } + } +} + +impl AsyncFd { + pub fn new(fd: Fd) -> Result { + fd.set_nonblocking()?; + Ok(Self( + TokioAsyncFd::with_interest(fd, Interest::READABLE | Interest::WRITABLE).unwrap(), + )) + } + + pub fn readable(fd: Fd) -> Result { + fd.set_nonblocking()?; + Ok(Self( + TokioAsyncFd::with_interest(fd, Interest::READABLE).unwrap(), + )) + } + + pub fn writable(fd: Fd) -> Result { + fd.set_nonblocking()?; + Ok(Self( + TokioAsyncFd::with_interest(fd, Interest::WRITABLE).unwrap(), + )) + } +} + +impl AsRawFd for AsyncFd { + fn as_raw_fd(&self) -> RawFd { + self.0.as_raw_fd() + } +} + +impl AsyncRead for AsyncFd { + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + loop { + let mut guard = futures::ready!(self.0.poll_read_ready_mut(cx))?; + + match guard.try_io(|inner| { + let n = 
inner.get_mut().read(buf.initialize_unfilled())?; + buf.advance(n); + + Ok(()) + }) { + Ok(result) => return Poll::Ready(result), + Err(_would_block) => continue, + } + } + } +} + +impl AsyncWrite for AsyncFd { + fn poll_write( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + loop { + let mut guard = futures::ready!(self.0.poll_write_ready_mut(cx))?; + + match guard.try_io(|inner| inner.get_mut().write(buf)) { + Ok(result) => return Poll::Ready(result), + Err(_would_block) => continue, + } + } + } + + fn poll_flush(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn poll_shutdown(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } +} + +/// Creates a unidirectional pipe. The first item of the returned tuple is the +/// readable end and the second item is the writable end. +pub fn pipe() -> Result<(Fd, Fd), Errno> { + let mut fds = [0; 2]; + + // We use O_CLOEXEC because we don't want the pipe file descriptor to be + // inherited by child processes directly. Instead, we use `dup2` to assign + // it to one of the stdio file descriptors. Then, the duplicated file + // descriptor won't be closed upon exec. + Errno::result(unsafe { libc::pipe2(fds.as_mut_ptr(), libc::O_CLOEXEC) })?; + + Ok((unsafe { Fd(fds[0]) }, unsafe { Fd(fds[1]) })) +} + +/// Writes bytes to a file. The file path must be null terminated. +/// +/// `path` is converted with `CStr::from_bytes_with_nul_unchecked`, so the +/// terminator is not verified here. The file is opened with `O_WRONLY` and all +/// of `bytes` is written with `write_all`. +/// +/// Returns the `Errno` of the failing `open` or `write`. A write failure that +/// carries no OS error code is reported as `EIO`. +pub fn write_bytes(path: &'static [u8], bytes: &[u8]) -> Result<(), Errno> { + let path = unsafe { CStr::from_bytes_with_nul_unchecked(path) }; + Fd::open_c(path.as_ptr(), libc::O_WRONLY)? + .write_all(bytes) + // `write_all` can fail with an error that has no OS error code (e.g. + // `WriteZero`), so fall back to EIO instead of panicking on `None`. + .map_err(|err| Errno::new(err.raw_os_error().unwrap_or(libc::EIO))) +} + +/// Creates a file if it does not exist. 
+pub fn touch(path: *const libc::c_char, mode: libc::mode_t) -> Result<(), Errno> { + Fd::create_c(path, libc::O_CLOEXEC, mode).map(drop) +} + +pub fn lstat(path: *const libc::c_char) -> Result { + let mut buf: libc::stat64 = unsafe { core::mem::zeroed() }; + Errno::result(unsafe { libc::lstat64(path, &mut buf) })?; + Ok(buf) +} + +#[derive(Copy, Clone, Eq, PartialEq)] +pub struct FileType(libc::mode_t); + +impl FileType { + pub fn new(path: *const libc::c_char) -> Result { + Ok(Self::from(lstat(path)?)) + } + + pub fn is_dir(&self) -> bool { + self.0 & libc::S_IFMT == libc::S_IFDIR + } + + #[allow(unused)] + pub fn is_file(&self) -> bool { + self.0 & libc::S_IFMT == libc::S_IFREG + } +} + +impl From for FileType { + fn from(stat: libc::stat64) -> Self { + Self(stat.st_mode) + } +} + +/// Returns true if `path` is a directory. Returns `false` in all other cases. +/// +/// NOTE: The `path` may exist and may be a directory, but this will still return +/// false if there is a permissions error. Use `FileType` to distinguish these +/// cases. +pub fn is_dir(path: *const libc::c_char) -> bool { + match FileType::new(path) { + Ok(ft) => ft.is_dir(), + Err(_) => false, + } +} + +fn cstring_as_slice(s: &mut CString) -> &mut [libc::c_char] { + let bytes = s.as_bytes_with_nul(); + unsafe { + // This is safe because we are already provided a mutable `CString` and + // we don't alias the two mutable references. + core::slice::from_raw_parts_mut(bytes.as_ptr() as *mut libc::c_char, bytes.len()) + } +} + +/// Creates every path component in `path` without allocating. This is done by +/// replacing each `/` with a NUL terminator as needed (and then changing the +/// `\0` back to `/` afterwards). +pub fn create_dir_all(path: &mut CString, mode: libc::mode_t) -> Result<(), Errno> { + create_dir_all_(cstring_as_slice(path), mode) +} + +/// Helper function. The last character in the path is always `\0`. 
+fn create_dir_all_(path: &mut [libc::c_char], mode: libc::mode_t) -> Result<(), Errno> { + if path.len() == 1 { + return Ok(()); + } + + // Try creating this directory + match Errno::result(unsafe { libc::mkdir(path.as_ptr(), mode) }) { + Ok(_) => return Ok(()), + Err(Errno::ENOENT) => {} + Err(_) if is_dir(path.as_ptr()) => return Ok(()), + Err(e) => return Err(e), + } + + // If it doesn't exist, try creating the parent directory. + with_parent(path, |parent| { + match parent { + Some(p) => create_dir_all_(p, mode), + None => { + // Got all the way to the root without successfully creating any + // child directories. Most likely a permissions error. + Err(Errno::EPERM) + } + } + })?; + + // Finally, try creating the directory again after the parent directories + // now exist. + match Errno::result(unsafe { libc::mkdir(path.as_ptr(), mode) }) { + Ok(_) => Ok(()), + Err(_) if is_dir(path.as_ptr()) => Ok(()), + Err(e) => Err(e), + } +} + +/// Creates an empty file at `path` without allocating. +pub fn touch_path( + path: &mut CString, + file_mode: libc::mode_t, + dir_mode: libc::mode_t, +) -> Result<(), Errno> { + touch_path_(cstring_as_slice(path), file_mode, dir_mode) +} + +/// Helper function. The last character in the path is always `\0`. +fn touch_path_( + path: &mut [libc::c_char], + file_mode: libc::mode_t, + dir_mode: libc::mode_t, +) -> Result<(), Errno> { + // Try to create the file. This may fail if the parent directories do not exist. + match touch(path.as_ptr(), file_mode) { + Ok(_) => return Ok(()), + Err(Errno::ENOENT) => {} + Err(e) => return Err(e), + } + + // Got ENOENT. Try to create the parent directories. + with_parent(path, |parent| match parent { + Some(p) => create_dir_all_(p, dir_mode), + None => Err(Errno::ENOENT), + })?; + + // Try creating the file again after the parent directories now exist. + touch(path.as_ptr(), file_mode) +} + +/// Helper function for chopping off the last path component, leaving only the +/// parent directory. 
To do this without allocating, the last path separator is +/// replaced with NUL before calling the closure. After the closure is done, the +/// NUL byte is replaced by the path separator again. Thus, the path is only +/// mutated for the duration of the closure. +/// +/// The closure receives `None` when `path` contains no separator. +fn with_parent(path: &mut [libc::c_char], mut f: F) -> T +where + F: FnMut(Option<&mut [libc::c_char]>) -> T, +{ + // The signedness of `c_char` depends on the target, so cast through + // `libc::c_char` instead of hard-coding `i8`. + const SEPARATOR: libc::c_char = b'/' as libc::c_char; + + // Find the index of one past the last path separator. + if let Some(parent_index) = path + .iter() + .rev() + .position(|c| *c == SEPARATOR) + .map(|i| path.len() - i) + { + // NB: the index is guaranteed to be >0. + path[parent_index - 1] = 0; + + let result = f(Some(&mut path[..parent_index])); + + // Restore the path to its former glory. + path[parent_index - 1] = SEPARATOR; + + result + } else { + f(None) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use const_cstr::const_cstr; + use std::os::unix::ffi::OsStrExt; + + #[test] + fn test_is_dir() { + assert!(is_dir(const_cstr!("/").as_ptr())); + assert!(is_dir(const_cstr!("/dev").as_ptr())); + assert!(!is_dir(const_cstr!("/dev/null").as_ptr())); + } + + #[test] + fn test_file_type() { + assert!(FileType::new(const_cstr!("/").as_ptr()).unwrap().is_dir()); + assert!( + FileType::new(const_cstr!("/dev").as_ptr()) + .unwrap() + .is_dir() + ); + assert!( + !FileType::new(const_cstr!("/dev/null").as_ptr()) + .unwrap() + .is_file() + ); + } + + #[test] + fn test_create_dir_all() { + let tempdir = tempfile::TempDir::new().unwrap(); + let mut path = CString::new( + tempdir + .path() + .join("some/path/to/a/dir") + .into_os_string() + .as_bytes(), + ) + .unwrap(); + let path2 = path.clone(); + + create_dir_all(&mut path, 0o777).unwrap(); + + assert_eq!(path, path2); + + assert!(is_dir(path.as_ptr())); + } + + #[test] + fn test_touch_path() { + let tempdir = tempfile::TempDir::new().unwrap(); + let mut path = CString::new( + tempdir + .path() + .join("some/path/to/a/file") + .into_os_string() + .as_bytes(), + ) + .unwrap(); + let path2 = 
path.clone(); + + touch_path(&mut path, 0o666, 0o777).unwrap(); + + assert_eq!(path, path2); + + assert!(FileType::new(path.as_ptr()).unwrap().is_file()); + } + + #[test] + fn test_nonblocking() -> Result<(), Errno> { + let (r, w) = pipe()?; + + assert!(!r.is_nonblocking()?); + assert!(!w.is_nonblocking()?); + + let f = w.dup()?; + + assert!(!f.is_nonblocking()?); + + w.set_nonblocking()?; + + assert!(!r.is_nonblocking()?); + assert!(w.is_nonblocking()?); + assert!(f.is_nonblocking()?); + + Ok(()) + } +} diff --git a/reverie-process/src/id_map.rs b/reverie-process/src/id_map.rs new file mode 100644 index 0000000..00dc274 --- /dev/null +++ b/reverie-process/src/id_map.rs @@ -0,0 +1,17 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +use std::io::Write; + +pub fn make_id_map(map: &[(libc::uid_t, libc::uid_t, u32)]) -> Vec { + let mut v = Vec::new(); + for (inside, outside, count) in map { + writeln!(v, "{} {} {}", inside, outside, count).unwrap(); + } + v +} diff --git a/reverie-process/src/lib.rs b/reverie-process/src/lib.rs new file mode 100644 index 0000000..bb278ed --- /dev/null +++ b/reverie-process/src/lib.rs @@ -0,0 +1,610 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! A drop-in replacement for `std::process::Command` that provides the ability +//! to set up namespaces, a seccomp filter, and more. 
+ +#![deny(missing_docs)] +#![deny(rustdoc::broken_intra_doc_links)] +#![feature(internal_output_capture)] +#![feature(never_type)] +#![feature(rustc_attrs)] + +mod builder; +mod child; +mod clone; +mod container; +mod env; +mod error; +mod exit_status; +mod fd; +mod id_map; +mod mount; +mod namespace; +mod net; +mod pid; +mod pty; +pub mod seccomp; +mod spawn; +mod stdio; +mod util; + +pub use child::Child; +pub use child::Output; +pub use container::Container; +pub use container::RunError; +pub use error::Context; +pub use error::Error; +pub use exit_status::ExitStatus; +pub use mount::Bind; +pub use mount::Mount; +pub use mount::MountFlags; +pub use mount::MountParseError; +pub use namespace::Namespace; +pub use pid::Pid; +pub use pty::Pty; +pub use pty::PtyChild; +pub use stdio::ChildStderr; +pub use stdio::ChildStdin; +pub use stdio::ChildStdout; +pub use stdio::Stdio; + +// Re-export Signal since it is used by `Child::signal`. +pub use nix::sys::signal::Signal; + +use std::ffi::CString; + +use syscalls::Errno; + +/// A builder for spawning a process. +// See the builder.rs for documentation of each field. +pub struct Command { + program: CString, + args: util::CStringArray, + pre_exec: Vec Result<(), Errno> + Send + Sync>>, + container: Container, +} + +impl Command { + /// Converts [`std::process::Command`] into [`Command`]. Note that this is a + /// very basic and *lossy* conversion. + /// + /// This only preserves the + /// - program path, + /// - arguments, + /// - environment variables, + /// - and working directory. + /// + /// # Caveats + /// + /// Since [`std::process::Command`] is rather opaque and doesn't provide + /// access to all fields, this will *not* preserve: + /// - stdio handles, + /// - `env_clear`, + /// - any `pre_exec` callbacks, + /// - `arg0` (if not the same as `program`), + /// - `uid`, `gid`, or `groups`. 
+ pub fn from_std_lossy(cmd: &std::process::Command) -> Command { + let mut result = Command::new(cmd.get_program()); + result.args(cmd.get_args()); + + for (key, value) in cmd.get_envs() { + match value { + Some(value) => result.env(key, value), + None => result.env_remove(key), + }; + } + + if let Some(dir) = cmd.get_current_dir() { + result.current_dir(dir); + } + + result + } + + /// This provides a *lossy* conversion to [`std::process::Command`]. The + /// features that are not supported by [`std::process::Command`] but *are* + /// supported by [`Command`] cannot be converted. For example, namespace and + /// mount configurations cannot be converted since they are not supported by + /// [`std::process::Command`]. + pub fn into_std_lossy(self) -> std::process::Command { + let mut result = std::process::Command::new(self.get_program()); + result.args(self.get_args()); + + if self.container.env.is_cleared() { + result.env_clear(); + } + + for (key, value) in self.get_envs() { + match value { + Some(value) => result.env(key, value), + None => result.env_remove(key), + }; + } + + if let Some(dir) = self.get_current_dir() { + result.current_dir(dir); + } + + #[cfg(unix)] + { + use std::os::unix::process::CommandExt; + + result.arg0(self.get_arg0()); + + for mut f in self.pre_exec { + unsafe { + result.pre_exec(move || f().map_err(Into::into)); + } + } + } + + result.stdin(self.container.stdin); + result.stdout(self.container.stdout); + result.stderr(self.container.stderr); + + result + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::ExitStatus; + + use std::collections::BTreeMap; + use std::fs; + use std::path::Path; + use std::str::from_utf8; + + #[tokio::test] + async fn spawn() { + assert_eq!( + Command::new("true").spawn().unwrap().wait().await.unwrap(), + ExitStatus::Exited(0) + ); + + assert_eq!( + Command::new("false").spawn().unwrap().wait().await.unwrap(), + ExitStatus::Exited(1) + ); + } + + #[test] + fn wait_blocking() { + assert_eq!( + 
Command::new("true") + .spawn() + .unwrap() + .wait_blocking() + .unwrap(), + ExitStatus::Exited(0) + ); + + assert_eq!( + Command::new("false") + .spawn() + .unwrap() + .wait_blocking() + .unwrap(), + ExitStatus::Exited(1) + ); + } + + #[tokio::test] + async fn spawn_fail() { + assert_eq!( + Command::new("/iprobablydonotexist").spawn().unwrap_err(), + Error::new(Errno::ENOENT, Context::Exec) + ); + } + + #[tokio::test] + async fn double_wait() { + let mut child = Command::new("true").spawn().unwrap(); + assert_eq!(child.wait().await.unwrap(), ExitStatus::Exited(0)); + assert_eq!(child.wait().await.unwrap(), ExitStatus::Exited(0)); + } + + #[tokio::test] + async fn output() { + let output = Command::new("echo") + .arg("foo") + .arg("bar") + .output() + .await + .unwrap(); + assert_eq!(output.stdout, b"foo bar\n"); + assert_eq!(output.stderr, b""); + assert_eq!(output.status, ExitStatus::Exited(0)); + } + + fn parse_proc_status(stdout: &[u8]) -> BTreeMap<&str, &str> { + from_utf8(stdout) + .unwrap() + .trim_end() + .split('\n') + .map(|line| { + let mut items = line.splitn(2, ':'); + let first = items.next().unwrap(); + let second = items.next().unwrap(); + (first, second.trim()) + }) + .collect() + } + + #[tokio::test] + async fn uid_namespace() { + let output = Command::new("cat") + .arg("/proc/self/status") + .map_root() + .output() + .await + .unwrap(); + assert_eq!(output.status, ExitStatus::Exited(0)); + + let proc_status = parse_proc_status(&output.stdout); + + // We should be root user inside of the container. 
+ assert_eq!(proc_status["Uid"], "0\t0\t0\t0"); + } + + #[tokio::test] + async fn pid_namespace() { + let output = Command::new("cat") + .arg("/proc/self/status") + .map_root() + .unshare(Namespace::PID) + .output() + .await + .unwrap(); + assert_eq!(output.status, ExitStatus::Exited(0)); + + let proc_status = parse_proc_status(&output.stdout); + + assert_eq!(proc_status["NSpid"].split('\t').nth(1), Some("1"),); + + // Note that, since we haven't mounted a fresh /proc into the container, + // the child still sees what the parent sees and so the PID will *not* + // be 1. + assert_ne!(proc_status["Pid"], "1"); + } + + #[tokio::test] + async fn mount_proc() { + let output = Command::new("cat") + .arg("/proc/self/status") + .map_root() + .unshare(Namespace::PID) + .mount(Mount::proc()) + .output() + .await + .unwrap(); + assert_eq!(output.status, ExitStatus::Exited(0)); + + let proc_status = parse_proc_status(&output.stdout); + + // With /proc mounted, the child really believes it is the root process. 
+ assert_eq!(proc_status["NSpid"], "1"); + assert_eq!(proc_status["Pid"], "1"); + } + + #[tokio::test] + async fn hostname() { + let output = Command::new("cat") + .arg("/proc/sys/kernel/hostname") + .map_root() + .hostname("foobar.local") + .output() + .await + .unwrap(); + assert_eq!(output.status, ExitStatus::Exited(0)); + + let hostname = from_utf8(&output.stdout).unwrap().trim(); + + assert_eq!(hostname, "foobar.local"); + } + + #[tokio::test] + async fn domainname() { + let output = Command::new("cat") + .arg("/proc/sys/kernel/domainname") + .map_root() + .domainname("foobar") + .output() + .await + .unwrap(); + + assert_eq!(output.status, ExitStatus::Exited(0)); + + let domainname = from_utf8(&output.stdout).unwrap().trim(); + + assert_eq!(domainname, "foobar"); + } + + #[tokio::test] + async fn pty() { + use tokio::io::AsyncReadExt; + + let mut pty = Pty::open().unwrap(); + let pty_child = pty.child().unwrap(); + + let mut tty = pty_child.terminal_params().unwrap(); + // Prevent post-processing of output so `\n` isn't translated to `\r\n`. + tty.c_oflag &= !libc::OPOST; + pty_child.set_terminal_params(&tty).unwrap(); + + pty_child.set_window_size(40, 80).unwrap(); + + // stty is in coreutils and should be available on most systems. + let mut child = Command::new("stty") + .arg("size") + .pty(pty_child) + .spawn() + .unwrap(); + + // NOTE: read_to_end returns an EIO error once the child has exited. + let mut buf = Vec::new(); + assert!(pty.read_to_end(&mut buf).await.is_err()); + + assert_eq!(from_utf8(&buf).unwrap(), "40 80\n"); + + assert_eq!(child.wait().await.unwrap(), ExitStatus::SUCCESS); + } + + #[tokio::test] + async fn mount_devpts_basic() { + let output = Command::new("ls") + .arg("/dev/pts") + .map_root() + .mount(Mount::devpts("/dev/pts")) + .output() + .await + .unwrap(); + + assert_eq!(output.status, ExitStatus::Exited(0)); + + // Should be totally empty except for `/dev/pts/ptmx` since we mounted a + // new devpts. 
+ assert_eq!(output.stderr, b""); + assert_eq!(output.stdout, b"ptmx\n"); + } + + #[tokio::test] + async fn mount_devpts_isolated() { + let output = Command::new("ls") + .arg("/dev/pts") + .map_root() + .mount(Mount::devpts("/dev/pts").data("newinstance,ptmxmode=0666")) + .mount(Mount::bind("/dev/pts/ptmx", "/dev/ptmx")) + .output() + .await + .unwrap(); + + assert_eq!(output.status, ExitStatus::Exited(0)); + + // Should be totally empty except for `/dev/pts/ptmx` since we mounted a + // new devpts. + assert_eq!(output.stderr, b""); + assert_eq!(output.stdout, b"ptmx\n"); + } + + #[tokio::test] + async fn mount_tmpfs() { + let output = Command::new("ls") + .arg("/tmp") + .map_root() + .mount(Mount::tmpfs("/tmp")) + .output() + .await + .unwrap(); + + assert_eq!(output.status, ExitStatus::Exited(0)); + + // Should be totally empty since we mounted a new tmpfs. + assert_eq!(output.stderr, b""); + assert_eq!(output.stdout, b""); + } + + #[tokio::test] + async fn mount_and_move_tmpfs() { + let tmpfs = tempfile::tempdir().unwrap(); + + // Create a temporary directory that will be the only thing to remain in + // the `/tmp` mount. + let persistent = tempfile::tempdir().unwrap(); + fs::write(persistent.path().join("foobar"), b"").unwrap(); + + let output = Command::new("ls") + .arg("/tmp") + .map_root() + .mount(Mount::tmpfs(tmpfs.path())) + // Bind-mount a directory from our upper /tmp to our new /tmp. + .mount(Mount::bind(persistent.path(), &tmpfs.path().join("my-dir")).touch_target()) + // Move our newly-created tmpfs to hide the upper /tmp folder. + .mount(Mount::rename(tmpfs.path(), Path::new("/tmp"))) + .output() + .await + .unwrap(); + + assert_eq!(output.status, ExitStatus::Exited(0)); + + // The only thing there should be our bind-mounted directory. 
+ assert_eq!(output.stderr, b""); + assert_eq!(output.stdout, b"my-dir\n"); + } + + #[tokio::test] + async fn mount_bind() { + let temp = tempfile::tempdir().unwrap(); + let a = temp.path().join("a"); + let b = temp.path().join("b"); + + fs::create_dir(&a).unwrap(); + fs::create_dir(&b).unwrap(); + + fs::write(a.join("foobar"), "im a test").unwrap(); + + let output = Command::new("ls") + .arg(&b) + .map_root() + .mount(Mount::bind(&a, &b)) + .output() + .await + .unwrap(); + + assert_eq!(output.status, ExitStatus::Exited(0)); + assert_eq!(output.stdout, b"foobar\n"); + assert_eq!(output.stderr, b""); + } + + #[tokio::test] + async fn local_networking_ping() { + let output = Command::new("ping") + .arg("-c1") + .arg("::1") + .map_root() + .local_networking_only() + .output() + .await + .unwrap(); + + assert_eq!(output.status, ExitStatus::Exited(0), "{:?}", output); + } + + #[tokio::test] + async fn local_networking_loopback_flags() { + let output = Command::new("cat") + .arg("/sys/class/net/lo/flags") + .map_root() + .local_networking_only() + .output() + .await + .unwrap(); + + assert_eq!(output.status, ExitStatus::Exited(0), "{:?}", output); + assert_eq!(output.stdout, b"0x9\n", "{:?}", output); + } + + /// Show that processes in two separate network namespaces can bind to the + /// same port. + #[tokio::test] + async fn port_isolation() { + use std::thread::sleep; + use std::time::Duration; + + let mut command = Command::new("nc"); + command + .arg("-l") + .arg("127.0.0.1") + // Can bind to a low port without real root inside the namespace. + .arg("80") + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .map_root() + .local_networking_only(); + + let server1 = match command.spawn() { + // If netcat is not installed just exit successfully. + Err(error) if error.errno() == Errno::ENOENT => return, + other => other, + } + .unwrap(); + + let server2 = command.spawn().unwrap(); + + // Give them both time to start up. 
+ sleep(Duration::from_millis(100)); + + // Signal them to shut down. Otherwise, they will wait forever for a + // connection that will never come. + server1.signal(Signal::SIGINT).unwrap(); + server2.signal(Signal::SIGINT).unwrap(); + + let output1 = server1.wait_with_output().await.unwrap(); + let output2 = server2.wait_with_output().await.unwrap(); + + // Without network isolation, one of the servers would exit with an + // "Address already in use" (exit status 2) error. + assert_eq!( + output1.status, + ExitStatus::Signaled(Signal::SIGINT, false), + "{:?}", + output1 + ); + assert_eq!( + output2.status, + ExitStatus::Signaled(Signal::SIGINT, false), + "{:?}", + output2 + ); + } + + /// Make sure we can call `.local_networking_only` more than once. + #[tokio::test] + async fn local_networking_there_can_be_only_one() { + let output = Command::new("true") + .map_root() + .local_networking_only() + // If calling this twice mounted /sys twice, then we'd get a "Device + // or resource busy" error. 
+ .local_networking_only() + .output() + .await + .unwrap(); + assert_eq!(output.status, ExitStatus::Exited(0), "{:?}", output); + assert_eq!(output.stdout, b"", "{:?}", output); + assert_eq!(output.stderr, b"", "{:?}", output); + } + + #[test] + fn from_std_lossy() { + let mut stdcmd = std::process::Command::new("echo"); + stdcmd.args(["arg1", "arg2"]); + stdcmd.current_dir("/foo/bar"); + stdcmd.env_clear(); + stdcmd.env("FOO", "1"); + stdcmd.env("BAR", "2"); + + let cmd = Command::from_std_lossy(&stdcmd); + + assert_eq!(cmd.get_program(), "echo"); + assert_eq!(cmd.get_arg0(), "echo"); + assert_eq!(cmd.get_args().collect::>(), ["arg1", "arg2"]); + + let envs = cmd + .get_envs() + .filter_map(|(k, v)| Some((k.to_str()?, v.and_then(|v| v.to_str())))) + .collect::>(); + assert_eq!(envs, [("BAR", Some("2")), ("FOO", Some("1"))]); + } + + #[test] + fn into_std_lossy() { + let mut cmd = Command::new("env"); + cmd.args(["-0"]); + cmd.current_dir("/foo/bar"); + cmd.env_clear(); + cmd.env("FOO", "1"); + cmd.env("BAR", "2"); + + let stdcmd = cmd.into_std_lossy(); + + assert_eq!(stdcmd.get_program(), "env"); + assert_eq!(stdcmd.get_args().collect::>(), ["-0"]); + + let envs = stdcmd + .get_envs() + .filter_map(|(k, v)| Some((k.to_str()?, v.and_then(|v| v.to_str())))) + .collect::>(); + + assert_eq!(envs, [("BAR", Some("2")), ("FOO", Some("1"))]); + } +} diff --git a/reverie-process/src/mount.rs b/reverie-process/src/mount.rs new file mode 100644 index 0000000..663d3da --- /dev/null +++ b/reverie-process/src/mount.rs @@ -0,0 +1,556 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +use super::fd::{create_dir_all, touch_path, FileType}; +use super::util; + +use core::convert::Infallible; +use core::fmt; +use core::ptr; +use core::str::FromStr; + +use std::collections::HashMap; +use std::ffi::{CString, OsStr}; +use std::os::unix::ffi::OsStrExt; +use std::path::Path; + +use syscalls::Errno; + +pub use nix::mount::MsFlags as MountFlags; + +/// A mount. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Mount { + source: Option, + target: CString, + fstype: Option, + flags: MountFlags, + data: Option, + touch_target: bool, +} + +/// Represents a bind mount. Can be converted into a [`Mount`]. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Bind { + /// The source path of the bind mount. This path must exist. It can be either + /// a file or directory. + source: CString, + + /// The target of the bind mount. This does not need to exist and can be + /// created when performing the bind mount. + target: CString, +} + +impl Mount { + /// Creates a new mount at the path `target`. + pub fn new>(target: S) -> Self { + Self { + source: None, + target: util::to_cstring(target), + fstype: None, + flags: MountFlags::empty(), + data: None, + touch_target: false, + } + } + + /// Creates a bind mount. This effectively creates hardlink of a directory, + /// making the contents accessible at both places. + /// + /// By default, none of the mounts in the `source` directory are visible in + /// `destination`. To make all mounts recursively visible, combine this with + /// [`Self::recursive`]. Can also be used with [`Self::readonly`] to make the + /// contents of `destination` read-only. + pub fn bind, D: AsRef>(source: S, destination: D) -> Self { + Self::new(destination) + .source(source) + .flags(MountFlags::MS_BIND) + } + + /// Move/rename a mount. + pub fn rename, D: AsRef>(source: S, destination: D) -> Self { + Self::new(destination) + .source(source) + .flags(MountFlags::MS_MOVE) + } + + /// Mount a fresh devpts file system. 
The target is usually `/dev/pts`. + /// + /// In order for this devpts to be private and independent of other devpts + /// (i.e., for containers), use: + /// ```no_compile + /// Mount::devpts("/dev/pts").data("newinstance,ptmxmode=0666") + /// ``` + /// And either make `/dev/ptmx` a symlink pointing to `/dev/pts/ptmx` or + /// bind-mount it. + /// + /// See also: + pub fn devpts>(target: S) -> Self { + Self::new(target).fstype("devpts") + } + + /// Mount a fresh proc file system at `/proc`. + pub fn proc() -> Self { + Self::new("/proc").fstype("proc") + } + + /// Mount an overlay file system. + /// + /// NOTE: This only works in Linux 5.11 or newer when mounted from a user + /// namespace. Otherwise, you need real root privileges to mount an + /// overlayfs. + /// + /// An overlay filesystem combines two filesystems - an upper filesystem and + /// a lower filesystem. When a name exists in both filesystems, the object + /// in the upper filesystem is visible while the object in the lower + /// filesystem is either hidden or, in the case of directories, merged with + /// the upper object. + /// + /// In other words, the `lowerdir` and `upperdir` are combined into a + /// directory `merged` using `workdir` as a temporary work area. + /// + /// The lower filesystem can be any filesystem supported by Linux and does + /// not need to be writable. The lower filesystem can even be another + /// overlayfs. The upper filesystem should be writable. + /// + /// See for + /// more information. + /// + /// # Arguments + /// + /// * `lowerdir` - The lower directory of the overlay. Can be any filesystem + /// and does not need to be writable. This directory is never + /// modified by writes to `merged`. + /// * `upperdir` - The upper directory of the overlay. This is where all + /// changes to `merged` are collected. Does not need to be + /// empty, but should be when starting a new overlay from + /// scratch. + /// * `workdir` - The work directory. This should always be empty. 
+ /// * `merged` - The combination of `lowerdir` and `upperdir`. + pub fn overlay(lowerdir: &Path, upperdir: &Path, workdir: &Path, merged: &Path) -> Self { + // TODO: Since there can actually be multiple lowerdirs, it might be + // more ergonomic to return an `OverlayBuilder` instead. + let options = format!( + "lowerdir={},upperdir={},workdir={}", + lowerdir.display(), + upperdir.display(), + workdir.display() + ); + + Self::new(merged) + .fstype("overlay") + .source("overlay") + .data(options) + } + + /// Creates a temporary file system at the location specified. + pub fn tmpfs>(target: S) -> Self { + Self::new(target).fstype("tmpfs") + } + + /// Creates a sys file system at the location specified. The target directory + /// is usually `/sys`. This is useful when creating a network namespace. + pub fn sysfs>(target: S) -> Self { + Self::new(target).fstype("sysfs") + } + + /// Sets the mount point target. + pub fn target>(mut self, target: S) -> Self { + self.target = util::to_cstring(target); + self + } + + /// Returns the mount point target path. + pub fn get_target(&self) -> &Path { + Path::new(OsStr::from_bytes(self.target.to_bytes())) + } + + /// Sets the source of the mount. + pub fn source>(mut self, path: S) -> Self { + self.source = Some(util::to_cstring(path)); + self + } + + /// Returns the mount point source path (if any). + pub fn get_source(&self) -> Option<&Path> { + self.source + .as_ref() + .map(|s| Path::new(OsStr::from_bytes(s.to_bytes()))) + } + + /// Indicates that the target of a bind mount should be created + /// automatically. + pub fn touch_target(mut self) -> Self { + self.touch_target = true; + self + } + + /// Adds mount flags. + pub fn flags(mut self, flags: MountFlags) -> Self { + self.flags |= flags; + self + } + + /// Make the file system read-only. + pub fn readonly(mut self) -> Self { + self.flags |= MountFlags::MS_RDONLY; + self + } + + /// Makes a bind mount recursive. 
+ pub fn recursive(mut self) -> Self { + self.flags |= MountFlags::MS_REC; + self + } + + /// Makes this mount point private. Mount and unmount events do not propagate + /// into or out of this mount point. + pub fn private(mut self) -> Self { + self.flags |= MountFlags::MS_PRIVATE; + self + } + + /// Make this mount point shared. Mount and unmount events immediately under + /// this mount point will propagate to the other mount points that are + /// members of this mount's peer group. Propagation here means that the same + /// mount or unmount will automatically occur under all of the other mount + /// points in the peer group. Conversely, mount and unmount events that take + /// place under peer mount points will propagate to this mount point. + pub fn shared(mut self) -> Self { + self.flags |= MountFlags::MS_SHARED; + self + } + + /// Same as specifying both [`Self::recursive`] and [`Self::private`]. + pub fn rprivate(mut self) -> Self { + self.flags |= MountFlags::MS_REC | MountFlags::MS_PRIVATE; + self + } + + /// Same as specifying both [`Self::recursive`] and [`Self::shared`]. + pub fn rshared(mut self) -> Self { + self.flags |= MountFlags::MS_REC | MountFlags::MS_SHARED; + self + } + + /// Sets the filesystem type. + pub fn fstype>(mut self, fstype: S) -> Self { + self.fstype = Some(util::to_cstring(fstype)); + self + } + + /// Sets any additional data required by the mount. + pub fn data>(mut self, data: S) -> Self { + self.data = Some(util::to_cstring(data)); + self + } + + /// Returns a pointer to the source C string, or null if no source is set. + fn source_ptr(&self) -> *const libc::c_char { + self.source.as_ref().map_or(ptr::null(), |s| s.as_ptr()) + } + + /// Returns a pointer to the target C string. + fn target_ptr(&self) -> *const libc::c_char { + self.target.as_ptr() + } + + /// Returns a pointer to the filesystem type C string, or null if none is set. + fn fstype_ptr(&self) -> *const libc::c_char { + self.fstype.as_ref().map_or(ptr::null(), |s| s.as_ptr()) + } + + /// Returns a pointer to the mount data C string, or null if none is set. + fn data_ptr(&self) -> *const libc::c_void { + self.data + .as_ref() + .map_or(ptr::null(), |s| s.as_ptr() as *const libc::c_void) + } + + /// Performs the mount. 
For bind-mount operations, the target directory or + /// file is created if [`touch_target`] was used. + /// + /// NOTE: This function *must* not allocate since it is called after `fork` + /// (or `clone`) and before `execve`. Any allocations could cause deadlocks + /// (which are hard to track down). + pub(super) fn mount(&mut self) -> Result<(), Errno> { + // NOTE: Although we can't allocate here, we can safely *modify* `self`. + // When this function is called, we have forked virtual memory and any + // modifications we make are copy-on-write and lost when `execve` is + // called. Thus, this function takes `self` by mutable reference. + if self.flags.contains(MountFlags::MS_BIND) && self.touch_target { + // Bind mounts will fail unless the destination path exists, so it + // is convenient to create it automatically. + // + // One reason for doing this here instead of the parent process is + // because the target may not yet exist until we mount it. For + // example, if we want to create a `/tmp` (tmpfs) folder and then + // bind-mount some files or directories into it, pre-creating the + // destination directories won't work because they'll get created in + // a different tmpfs. + if let Some(src) = &self.source { + if FileType::new(src.as_ptr())?.is_dir() { + create_dir_all(&mut self.target, 0o777)?; + } else { + touch_path(&mut self.target, 0o666, 0o777)?; + } + } + } + + Errno::result(unsafe { + libc::mount( + self.source_ptr(), + self.target_ptr(), + self.fstype_ptr(), + self.flags.bits(), + self.data_ptr(), + ) + })?; + + Ok(()) + } +} + +impl Bind { + /// Creates a new bind mount. The `target` is optional because it is often + /// convenient to use an identical `source` and `target` directory. If + /// `target` is `None`, then it is interpreted as being the same as + /// `source`. 
+ pub fn new(source: S, target: T) -> Self + where + S: AsRef, + T: AsRef, + { + Self { + source: util::to_cstring(source), + target: util::to_cstring(target), + } + } +} + +impl From for Mount { + fn from(b: Bind) -> Self { + Self { + source: Some(b.source), + target: b.target, + fstype: None, + flags: MountFlags::MS_BIND, + data: None, + touch_target: false, + } + } +} + +impl From<&str> for Bind { + fn from(s: &str) -> Self { + if let Some((source, target)) = s.split_once(':') { + Self { + source: util::to_cstring(source), + target: util::to_cstring(target), + } + } else { + let source = util::to_cstring(s); + let target = source.clone(); + Self { source, target } + } + } +} + +impl FromStr for Bind { + type Err = Infallible; + + /// Parses bind mounts of the following forms: + /// 1. "path/to/source" + /// 2. "path/to/source:path/to/dest" + fn from_str(s: &str) -> Result { + Ok(Self::from(s)) + } +} + +/// An error from parsing a mount. +#[derive(thiserror::Error, Debug, Eq, PartialEq)] +pub enum MountParseError { + /// The `target` key is missing. This is always required. + MissingTarget, + + /// An invalid mount option was specified. + Invalid(String, Option), +} + +impl fmt::Display for MountParseError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::MissingTarget => write!(f, "missing mount target"), + Self::Invalid(k, v) => match v { + Some(v) => write!(f, "invalid mount option '{}={}'", k, v), + None => write!(f, "invalid mount option '{}'", k), + }, + } + } +} + +impl FromStr for Mount { + type Err = MountParseError; + + /// Parses a [`Mount`]. This accepts the same syntax as Docker mounts where + /// each mount consists of a comma-separated key-value list. + /// + /// See https://docs.docker.com/storage/bind-mounts/ for more information. 
+ fn from_str(s: &str) -> Result { + let mut map: HashMap<&str, Option<&str>> = HashMap::new(); + + for item in s.split(',') { + let item = item.trim(); + + if item.is_empty() { + continue; + } + + let (key, value) = match item.split_once('=') { + Some((key, value)) => (key, Some(value)), + None => (item, None), + }; + + map.insert(key, value); + } + + // The mount target is always required. + let mut mount = match map + .remove("target") + .or_else(|| map.remove("destination")) + .or_else(|| map.remove("dest")) + .or_else(|| map.remove("dst")) + .flatten() + { + Some(target) => Mount::new(target), + None => { + return Err(MountParseError::MissingTarget); + } + }; + + if let Some(source) = map.remove("source").or_else(|| map.remove("src")).flatten() { + mount = mount.source(source); + } + + let is_bind_mount = if let Some(fstype) = map.remove("type").flatten() { + if fstype == "bind" { + true + } else { + mount = mount.fstype(fstype); + false + } + } else { + true + }; + + if is_bind_mount { + mount = mount.flags(MountFlags::MS_BIND); + } + + if let Some((key, value)) = map.remove_entry("readonly") { + if let Some(value) = value { + // No value should have been specified. + return Err(MountParseError::Invalid(key.into(), Some(value.to_owned()))); + } + + mount = mount.readonly(); + } + + if let Some(propagation) = map.remove("bind-propagation").flatten() { + let flags = match propagation { + "shared" => MountFlags::MS_SHARED, + "slave" => MountFlags::MS_SLAVE, + "private" => MountFlags::MS_PRIVATE, + "rshared" => MountFlags::MS_REC | MountFlags::MS_SHARED, + "rslave" => MountFlags::MS_REC | MountFlags::MS_SLAVE, + "rprivate" => MountFlags::MS_REC | MountFlags::MS_PRIVATE, + _ => { + return Err(MountParseError::Invalid( + "bind-propagation".into(), + Some(propagation.into()), + )); + } + }; + + mount = mount.flags(flags); + } else { + // All mounts get these flags by default. 
+ mount = mount.flags(MountFlags::MS_REC | MountFlags::MS_PRIVATE); + } + + // Any left over keys are invalid. + if let Some((k, v)) = map.into_iter().next() { + return Err(MountParseError::Invalid(k.into(), v.map(ToOwned::to_owned))); + } + + Ok(mount) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn getters_and_setters() { + let m = Mount::bind("/foo", "/bar"); + assert_eq!(m.get_target(), Path::new("/bar")); + assert_eq!(m.get_source(), Some(Path::new("/foo"))); + + let m = m.target("/baz"); + assert_eq!(m.get_target(), Path::new("/baz")); + } + + #[test] + fn parse_mount() { + assert_eq!( + Mount::from_str("type=bind,source=/foo,target=/bar,readonly"), + Ok(Mount::bind("/foo", "/bar").readonly().rprivate()) + ); + assert_eq!( + Mount::from_str("src=/foo,target=/bar,readonly"), + Ok(Mount::bind("/foo", "/bar").readonly().rprivate()) + ); + assert_eq!( + Mount::from_str("src=/foo,target=/bar,bind-propagation=rshared"), + Ok(Mount::bind("/foo", "/bar").rshared()) + ); + assert_eq!( + Mount::from_str("type=tmpfs,target=/tmp"), + Ok(Mount::tmpfs("/tmp").rprivate()) + ); + assert_eq!( + Mount::from_str("target=foo, ,,,"), + Ok(Mount::new("foo").flags(MountFlags::MS_BIND).rprivate()) + ); + + assert_eq!(Mount::from_str(""), Err(MountParseError::MissingTarget)); + assert_eq!( + Mount::from_str("type=bind,source=/foo,readonly"), + Err(MountParseError::MissingTarget) + ); + assert_eq!( + Mount::from_str("type=tmpfs,target=/foo,wat"), + Err(MountParseError::Invalid("wat".into(), None)) + ); + assert_eq!( + Mount::from_str("type=tmpfs,target=/foo,readonly=wat"), + Err(MountParseError::Invalid( + "readonly".into(), + Some("wat".into()) + )) + ); + } + + #[test] + fn parse_bind() { + assert_eq!(Bind::from("source:target"), Bind::new("source", "target")); + assert_eq!(Bind::from("source"), Bind::new("source", "source")); + + assert_eq!( + Mount::from(Bind::from("source:target")), + Mount::bind("source", "target") + ); + } +} diff --git 
a/reverie-process/src/namespace.rs b/reverie-process/src/namespace.rs new file mode 100644 index 0000000..ef5da29 --- /dev/null +++ b/reverie-process/src/namespace.rs @@ -0,0 +1,74 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use serde::{Deserialize, Serialize}; +use std::str::FromStr; + +bitflags::bitflags! { + /// A namespace that may be unshared with [`Command::unshare`]. + /// + /// [`Command::unshare`]: super::Command::unshare + #[derive(Deserialize, Serialize)] + pub struct Namespace: i32 { + /// Cgroup namespace. + const CGROUP = libc::CLONE_NEWCGROUP; + /// IPC namespace. + const IPC = libc::CLONE_NEWIPC; + /// Network namespace. + const NETWORK = libc::CLONE_NEWNET; + /// Mount namespace. + const MOUNT = libc::CLONE_NEWNS; + /// PID namespace. + const PID = libc::CLONE_NEWPID; + /// User and group namespace. + const USER = libc::CLONE_NEWUSER; + /// UTS namespace. 
+ const UTS = libc::CLONE_NEWUTS; + } +} + +impl Default for Namespace { + fn default() -> Self { + Self::empty() + } +} + +#[derive(Debug, Clone)] +pub enum ParseNamespaceError { + InvalidNamespace(String), +} + +impl std::error::Error for ParseNamespaceError {} + +impl core::fmt::Display for ParseNamespaceError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + match self { + ParseNamespaceError::InvalidNamespace(ns) => { + write!(f, "Invalid namespace: {}", ns) + } + } + } +} + +impl FromStr for Namespace { + type Err = ParseNamespaceError; + fn from_str(s: &str) -> Result { + s.split(',').try_fold(Namespace::empty(), |ns, s| match s { + "cgroup" => Ok(ns | Namespace::CGROUP), + "ipc" => Ok(ns | Namespace::IPC), + "network" => Ok(ns | Namespace::NETWORK), + "pid" => Ok(ns | Namespace::PID), + "mount" => Ok(ns | Namespace::MOUNT), + "user" => Ok(ns | Namespace::USER), + "uts" => Ok(ns | Namespace::UTS), + "" | "none" => Ok(ns), + invalid_ns => Err(ParseNamespaceError::InvalidNamespace(invalid_ns.to_owned())), + }) + } +} diff --git a/reverie-process/src/net.rs b/reverie-process/src/net.rs new file mode 100644 index 0000000..df2e956 --- /dev/null +++ b/reverie-process/src/net.rs @@ -0,0 +1,220 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use super::fd::Fd; + +use std::ffi::CStr; +use std::ffi::OsStr; +use std::mem::MaybeUninit; +use std::os::unix::io::AsRawFd; + +use syscalls::Errno; + +/// Interface name. +#[derive(Default, Debug, Copy, Clone, Eq, PartialEq)] +#[repr(C)] +pub struct IfName([u8; libc::IFNAMSIZ]); + +/// A network interface request. +#[derive(Debug, Copy, Clone)] +#[repr(C)] +struct IfReq { + /// The interface name. + name: IfName, + + /// The request type. 
+ /// + /// NOTE: The kernel's `if_req` struct is made up of a `union` of all + /// possible request types. Thus, the size of `if_req` is not necessarily the + /// same as the size of `IfReq`. However, there is no danger of a buffer + /// overrun, since the kernel does not write to the unused parts of the union + /// when handling the associated ioctls. + req: T, +} + +#[derive(Debug, Copy, Clone)] +#[repr(C)] +pub struct ifmap { + pub mem_start: usize, + pub mem_end: usize, + pub base_addr: u16, + pub irq: u8, + pub dma: u8, + pub port: u8, + /* 3 bytes spare */ +} + +impl IfName { + // Many unused functions here. The full set of ioctl's are implemented for + // `if_req`, but we don't need them yet. + #![allow(unused)] + + /// The name of the loopback interface. + pub const LOOPBACK: Self = Self(*b"lo\0\0\0\0\0\0\0\0\0\0\0\0\0\0"); + + pub fn new>(name: S) -> Result { + use std::os::unix::ffi::OsStrExt; + + let name = name.as_ref().as_bytes(); + + if name.len() + 1 > libc::IFNAMSIZ { + Err(InterfaceNameTooLong) + } else { + let mut arr = [0u8; libc::IFNAMSIZ]; + arr[..name.len()].copy_from_slice(name); + arr[name.len()] = 0; + Ok(Self(arr)) + } + } + + fn ioctl_get(self, ioctl: libc::c_ulong, socket: &Fd) -> Result { + let mut req = IfReq::new(self, MaybeUninit::uninit()); + Errno::result(unsafe { libc::ioctl(socket.as_raw_fd(), ioctl, &mut req as *mut _) })?; + Ok(unsafe { req.into_req().assume_init() }) + } + + fn ioctl_set(self, ioctl: libc::c_ulong, socket: &Fd, value: T) -> Result<(), Errno> { + let req = IfReq::new(self, value); + Errno::result(unsafe { libc::ioctl(socket.as_raw_fd(), ioctl, &req as *const _) })?; + Ok(()) + } + + pub fn get_addr(&self, socket: &Fd) -> Result { + self.ioctl_get(libc::SIOCGIFADDR, socket) + } + + pub fn set_addr(&self, socket: &Fd, addr: libc::sockaddr) -> Result<(), Errno> { + self.ioctl_set(libc::SIOCSIFADDR, socket, addr) + } + + pub fn get_dest_addr(&self, socket: &Fd) -> Result { + self.ioctl_get(libc::SIOCGIFDSTADDR, 
socket) + } + + pub fn set_dest_addr(&self, socket: &Fd, addr: libc::sockaddr) -> Result<(), Errno> { + self.ioctl_set(libc::SIOCSIFDSTADDR, socket, addr) + } + + pub fn get_broadcast_addr(&self, socket: &Fd) -> Result { + self.ioctl_get(libc::SIOCGIFBRDADDR, socket) + } + + pub fn set_broadcast_addr(&self, socket: &Fd, addr: libc::sockaddr) -> Result<(), Errno> { + self.ioctl_set(libc::SIOCSIFBRDADDR, socket, addr) + } + + pub fn get_netmask(&self, socket: &Fd) -> Result { + self.ioctl_get(libc::SIOCGIFNETMASK, socket) + } + + pub fn set_netmask(&self, socket: &Fd, addr: libc::sockaddr) -> Result<(), Errno> { + self.ioctl_set(libc::SIOCSIFNETMASK, socket, addr) + } + + pub fn get_hw_addr(&self, socket: &Fd) -> Result { + self.ioctl_get(libc::SIOCGIFHWADDR, socket) + } + + pub fn set_hw_addr(&self, socket: &Fd, addr: libc::sockaddr) -> Result<(), Errno> { + self.ioctl_set(libc::SIOCSIFHWADDR, socket, addr) + } + + pub fn get_flags(&self, socket: &Fd) -> Result { + self.ioctl_get(libc::SIOCGIFFLAGS, socket) + } + + pub fn set_flags(&self, socket: &Fd, flags: i16) -> Result<(), Errno> { + self.ioctl_set(libc::SIOCSIFFLAGS, socket, flags) + } + + pub fn get_metric(&self, socket: &Fd) -> Result { + self.ioctl_get(libc::SIOCGIFMETRIC, socket) + } + + pub fn set_metric(&self, socket: &Fd, value: i32) -> Result<(), Errno> { + self.ioctl_set(libc::SIOCSIFMETRIC, socket, value) + } + + pub fn get_mtu(&self, socket: &Fd) -> Result { + self.ioctl_get(libc::SIOCGIFMTU, socket) + } + + pub fn set_mtu(&self, socket: &Fd, value: i32) -> Result<(), Errno> { + self.ioctl_set(libc::SIOCSIFMTU, socket, value) + } + + /// Gets the device map. + pub fn get_map(&self, socket: &Fd) -> Result { + self.ioctl_get(libc::SIOCGIFMAP, socket) + } + + /// Sets the device map. + pub fn set_map(&self, socket: &Fd, map: ifmap) -> Result<(), Errno> { + self.ioctl_set(libc::SIOCSIFMAP, socket, map) + } + + /// Gets the slave device. 
+ pub fn get_slave(&self, socket: &Fd) -> Result { + self.ioctl_get(libc::SIOCGIFSLAVE, socket) + } + + /// Sets the slave device. + pub fn set_slave(&self, socket: &Fd, name: Self) -> Result<(), Errno> { + self.ioctl_set(libc::SIOCSIFSLAVE, socket, name) + } +} + +impl AsRef for IfName { + fn as_ref(&self) -> &CStr { + unsafe { CStr::from_ptr(self.0.as_ptr() as *const _) } + } +} + +/// An error indicating that the interface name is too long. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct InterfaceNameTooLong; + +impl IfReq { + /// Creates a new interface request. + pub fn new(name: IfName, req: T) -> Self { + Self { name, req } + } + + pub fn into_req(self) -> T { + self.req + } +} + +#[cfg(test)] +mod tests { + use super::*; + use nix::net::if_::InterfaceFlags; + + #[test] + fn ifname() { + assert_eq!(IfName::new("lo"), Ok(IfName::LOOPBACK)); + assert_eq!( + IfName::new("too loooooooooooooooong"), + Err(InterfaceNameTooLong) + ); + } + + #[test] + fn smoke_tests() { + let sock = Fd::socket(libc::AF_INET, libc::SOCK_DGRAM, libc::IPPROTO_IP).unwrap(); + + let lo = IfName::LOOPBACK; + + let addr = lo.get_addr(&sock).unwrap(); + assert_eq!(addr.sa_family as i32, libc::AF_INET); + + let flags = InterfaceFlags::from_bits_truncate(lo.get_flags(&sock).unwrap() as i32); + assert!(flags.contains(InterfaceFlags::IFF_LOOPBACK)); + } +} diff --git a/reverie-process/src/pid.rs b/reverie-process/src/pid.rs new file mode 100644 index 0000000..6a02bcd --- /dev/null +++ b/reverie-process/src/pid.rs @@ -0,0 +1,132 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +use core::fmt; +use core::hash::Hash; +use serde::{Deserialize, Serialize}; + +/// A process ID (PID). 
+#[derive( + Copy, + Clone, + Debug, + Eq, + PartialEq, + Ord, + PartialOrd, + Hash, + Serialize, + Deserialize +)] +pub struct Pid(libc::pid_t); + +impl Pid { + /// Creates `Pid` from a raw `pid_t`. + pub fn from_raw(pid: libc::pid_t) -> Self { + Self(pid) + } + + /// Returns the PID of the calling process. + pub fn this() -> Self { + nix::unistd::Pid::this().into() + } + + /// Returns the PID of the parent of the calling process. + pub fn parent() -> Self { + nix::unistd::Pid::parent().into() + } + + /// Gets the raw `pid_t` from this `Pid`. + pub fn as_raw(self) -> libc::pid_t { + self.0 + } + + /// Returns a `Display`able that is color-coded. That is, the same PID will + /// get the same color. This makes it easy to visually recognize PIDs when + /// looking through logs. + /// + /// Note that while the same PIDs always have the same color, different PIDs + /// may also have the same color if they fall into the same color bucket. + pub fn colored(self) -> ColoredPid { + ColoredPid(self) + } +} + +impl From for Pid { + fn from(pid: nix::unistd::Pid) -> Pid { + Self(pid.as_raw()) + } +} + +impl From for nix::unistd::Pid { + fn from(pid: Pid) -> nix::unistd::Pid { + nix::unistd::Pid::from_raw(pid.as_raw()) + } +} + +impl From for libc::pid_t { + fn from(pid: Pid) -> libc::pid_t { + pid.as_raw() + } +} + +impl From for Pid { + fn from(pid: libc::pid_t) -> Pid { + Pid::from_raw(pid) + } +} + +impl fmt::Display for Pid { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(&self.0, f) + } +} + +/// A colored pid. +pub struct ColoredPid(Pid); + +impl ColoredPid { + /// Gets the ansi color code for the current PID. Returns `None` if not + /// writing to a terminal. + fn ansi_code(&self) -> Option<&'static str> { + if colored::control::SHOULD_COLORIZE.should_colorize() { + // Why not just use `colored::Colorize` you ask? It allocates a + // string in order to create the color code. 
Since we may log a lot + // of output that may contain a lot of PIDs, we don't want that to + // slow us down. + Some(match self.0.as_raw() % 14 { + 0 => "\x1b[0;31m", // Red + 1 => "\x1b[0;32m", // Green + 2 => "\x1b[0;33m", // Yellow + 3 => "\x1b[0;34m", // Blue + 4 => "\x1b[0;35m", // Magenta + 5 => "\x1b[0;36m", // Cyan + 6 => "\x1b[0;37m", // White + 7 => "\x1b[1;31m", // Bright red + 8 => "\x1b[1;32m", // Bright green + 9 => "\x1b[01;33m", // Bright yellow + 10 => "\x1b[1;34m", // Bright blue + 11 => "\x1b[1;35m", // Bright magenta + 12 => "\x1b[1;36m", // Bright cyan + _ => "\x1b[1;37m", // Bright white + }) + } else { + None + } + } +} + +impl fmt::Display for ColoredPid { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if let Some(color) = self.ansi_code() { + write!(f, "{}{}\x1b[0m", color, self.0) + } else { + fmt::Display::fmt(&self.0, f) + } + } +} diff --git a/reverie-process/src/pty.rs b/reverie-process/src/pty.rs new file mode 100644 index 0000000..3cefb34 --- /dev/null +++ b/reverie-process/src/pty.rs @@ -0,0 +1,203 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +use super::fd::{AsyncFd, Fd}; + +use syscalls::Errno; +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; + +use core::mem::MaybeUninit; +use core::pin::Pin; +use core::task::{Context, Poll}; + +use std::io; +use std::os::unix::io::AsRawFd; +use std::os::unix::io::IntoRawFd; +use std::os::unix::io::RawFd; + +/// Represents a pseudo-TTY "master". +#[derive(Debug)] +pub struct Pty { + fd: AsyncFd, +} + +impl Pty { + /// Opens a new pseudo-TTY master. + /// + /// NOTE: As long as there is a handle open to at least one child pty, reads + /// will not reach EOF and will continue to return `EWOULDBLOCK`. 
+ pub fn open() -> Result { + let fd = Fd::new(Errno::result(unsafe { + libc::posix_openpt(libc::O_RDWR | libc::O_NOCTTY) + })?); + + Errno::result(unsafe { libc::grantpt(fd.as_raw_fd()) })?; + Errno::result(unsafe { libc::unlockpt(fd.as_raw_fd()) })?; + + let fd = AsyncFd::new(fd)?; + + Ok(Self { fd }) + } + + /// Opens a pseudo-TTY slave that is connected to this master. + pub fn child(&self) -> Result { + const TIOCGPTPEER: libc::c_ulong = 0x5441; + + let parent = self.fd.as_raw_fd(); + + let fd = Errno::result(unsafe { + // NOTE: This ioctl isn't supported until Linux v4.13 (see + // `ioctl_tty(2)`), so we may fallback to path-based slave fd + // allocation. + libc::ioctl(parent, TIOCGPTPEER, libc::O_RDWR | libc::O_NOCTTY) + }) + .map(Fd::new) + .or_else(|_err| { + let mut path: [libc::c_char; libc::PATH_MAX as usize] = [0; libc::PATH_MAX as usize]; + + Errno::result(unsafe { libc::ptsname_r(parent, path.as_mut_ptr(), path.len()) })?; + + Fd::open_c(path.as_ptr(), libc::O_RDWR | libc::O_NOCTTY) + })?; + + Ok(PtyChild { fd }) + } +} + +/// A pseudo-TTY child (or "slave" in TTY parlance). This is passed to child +/// processes. +#[derive(Debug)] +pub struct PtyChild { + fd: Fd, +} + +impl PtyChild { + /// Sets the pseudo-TTY child as the controlling terminal for the current + /// process. + /// + /// Specifically, this does several things: + /// 1. Calls setsid to create a new session. + /// 2. Makes this fd the controlling terminal of this process by running the + /// correct ioctl. + /// 3. Calls `dup2` to set each stdio stream to redirect to this fd. + /// 4. Closes the fd. + pub fn login(self) -> Result<(), Errno> { + Errno::result(unsafe { libc::login_tty(self.fd.into_raw_fd()) })?; + Ok(()) + } + + /// Sets the window size in rows and columns. 
+ pub fn set_window_size(&self, rows: u16, cols: u16) -> Result<(), Errno> { + let fd = self.fd.as_raw_fd(); + + let winsize = libc::winsize { + ws_row: rows, + ws_col: cols, + ws_xpixel: 0, + ws_ypixel: 0, + }; + + Errno::result(unsafe { libc::ioctl(fd, libc::TIOCSWINSZ, &winsize as *const _) })?; + + Ok(()) + } + + /// Returns the window size in terms of rows and columns. + pub fn window_size(&self) -> Result<(u16, u16), Errno> { + let fd = self.fd.as_raw_fd(); + + let mut winsize = MaybeUninit::::uninit(); + + Errno::result(unsafe { libc::ioctl(fd, libc::TIOCGWINSZ, winsize.as_mut_ptr()) })?; + + let winsize = unsafe { winsize.assume_init() }; + + Ok((winsize.ws_row, winsize.ws_col)) + } + + /// Sets the terminal parameters. + pub fn set_terminal_params(&self, params: &libc::termios) -> Result<(), Errno> { + let fd = self.fd.as_raw_fd(); + Errno::result(unsafe { libc::tcsetattr(fd, libc::TCSAFLUSH, params as *const _) })?; + Ok(()) + } + + /// Gets the terminal parameters. + pub fn terminal_params(&self) -> Result { + let fd = self.fd.as_raw_fd(); + + let mut term = MaybeUninit::::uninit(); + + Errno::result(unsafe { libc::tcgetattr(fd, term.as_mut_ptr()) })?; + + Ok(unsafe { term.assume_init() }) + } +} + +impl AsRawFd for Pty { + fn as_raw_fd(&self) -> RawFd { + self.fd.as_raw_fd() + } +} + +impl AsRawFd for PtyChild { + fn as_raw_fd(&self) -> RawFd { + self.fd.as_raw_fd() + } +} + +impl AsyncWrite for Pty { + fn poll_write( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + Pin::new(&mut self.fd).poll_write(cx, buf) + } + + fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.fd).poll_flush(cx) + } + + fn poll_shutdown(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.fd).poll_shutdown(cx) + } +} + +impl AsyncRead for Pty { + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut Context, + buf: &mut ReadBuf, + ) -> Poll> { + Pin::new(&mut 
self.fd).poll_read(cx, buf) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_open() { + let pty = Pty::open().unwrap(); + + let child1 = pty.child().unwrap(); + child1.set_window_size(20, 40).unwrap(); + assert_eq!(child1.window_size().unwrap(), (20, 40)); + + let child2 = pty.child().unwrap(); + child2.set_window_size(40, 80).unwrap(); + + assert_eq!(child2.window_size().unwrap(), (40, 80)); + + // Since they're both connected to the same master, changing the window + // size of one child affects both of them. + assert_eq!(child1.window_size().unwrap(), (40, 80)); + } +} diff --git a/reverie-process/src/seccomp/bpf.rs b/reverie-process/src/seccomp/bpf.rs new file mode 100644 index 0000000..3933330 --- /dev/null +++ b/reverie-process/src/seccomp/bpf.rs @@ -0,0 +1,440 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#![allow(non_snake_case)] + +use syscalls::Errno; +use syscalls::Sysno; + +pub use libc::sock_filter; + +// See: /include/uapi/linux/bpf_common.h + +// Instruction classes +pub const BPF_LD: u16 = 0x00; +pub const BPF_ST: u16 = 0x02; +pub const BPF_JMP: u16 = 0x05; +pub const BPF_RET: u16 = 0x06; + +// ld/ldx fields +pub const BPF_W: u16 = 0x00; + +pub const BPF_ABS: u16 = 0x20; +pub const BPF_MEM: u16 = 0x60; + +pub const BPF_JEQ: u16 = 0x10; +pub const BPF_JGT: u16 = 0x20; +pub const BPF_JGE: u16 = 0x30; +pub const BPF_K: u16 = 0x00; + +/// Maximum number of instructions. +pub const BPF_MAXINSNS: usize = 4096; + +/// Defined in `/include/uapi/linux/seccomp.h`. +const SECCOMP_SET_MODE_FILTER: u32 = 1; + +/// Offset of `seccomp_data::nr` in bytes. +const SECCOMP_DATA_OFFSET_NR: u32 = 0; + +/// Offset of `seccomp_data::arch` in bytes. 
+const SECCOMP_DATA_OFFSET_ARCH: u32 = 4; + +/// Offset of `seccomp_data::instruction_pointer` in bytes. +const SECCOMP_DATA_OFFSET_IP: u32 = 8; + +/// Offset of `seccomp_data::args` in bytes. +#[allow(unused)] +const SECCOMP_DATA_OFFSET_ARGS: u32 = 16; + +#[cfg(target_endian = "little")] +const SECCOMP_DATA_OFFSET_IP_HI: u32 = SECCOMP_DATA_OFFSET_IP + 4; +#[cfg(target_endian = "little")] +const SECCOMP_DATA_OFFSET_IP_LO: u32 = SECCOMP_DATA_OFFSET_IP; + +#[cfg(target_endian = "big")] +const SECCOMP_DATA_OFFSET_IP_HI: u32 = SECCOMP_DATA_OFFSET_IP; +#[cfg(target_endian = "big")] +const SECCOMP_DATA_OFFSET_IP_LO: u32 = SECCOMP_DATA_OFFSET_IP + 4; + +// These are defined in `/include/uapi/linux/elf-em.h`. +const EM_386: u32 = 3; +const EM_MIPS: u32 = 8; +const EM_PPC: u32 = 20; +const EM_PPC64: u32 = 21; +const EM_ARM: u32 = 40; +const EM_X86_64: u32 = 62; +const EM_AARCH64: u32 = 183; + +// These are defined in `/include/uapi/linux/audit.h`. +const __AUDIT_ARCH_64BIT: u32 = 0x8000_0000; +const __AUDIT_ARCH_LE: u32 = 0x4000_0000; + +// These are defined in `/include/uapi/linux/audit.h`. +pub const AUDIT_ARCH_X86: u32 = EM_386 | __AUDIT_ARCH_LE; +pub const AUDIT_ARCH_X86_64: u32 = EM_X86_64 | __AUDIT_ARCH_64BIT | __AUDIT_ARCH_LE; +pub const AUDIT_ARCH_ARM: u32 = EM_ARM | __AUDIT_ARCH_LE; +pub const AUDIT_ARCH_AARCH64: u32 = EM_AARCH64 | __AUDIT_ARCH_64BIT | __AUDIT_ARCH_LE; +pub const AUDIT_ARCH_MIPS: u32 = EM_MIPS; +pub const AUDIT_ARCH_PPC: u32 = EM_PPC; +pub const AUDIT_ARCH_PPC64: u32 = EM_PPC64 | __AUDIT_ARCH_64BIT; + +/// Seccomp-BPF program byte code. +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct Filter { + // Since the limit is 4096 instructions, we *could* use a static array here + // instead. However, that would require bounds checks each time an + // instruction is appended and complicate the interface with `Result` types + // and error handling logic. It's cleaner to just check the size when the + // program is loaded. 
+ filter: Vec, +} + +impl Filter { + /// Creates a new, empty seccomp program. Note that empty BPF programs are not + /// valid and will fail to load. + pub const fn new() -> Self { + Self { filter: Vec::new() } + } + + /// Appends a single instruction to the seccomp-BPF program. + pub fn push(&mut self, instruction: sock_filter) { + self.filter.push(instruction); + } + + /// Returns the number of instructions in the BPF program. + pub fn len(&self) -> usize { + self.filter.len() + } + + /// Returns true if the program is empty. Empty seccomp filters will result + /// in an error when loaded. + pub fn is_empty(&self) -> bool { + self.filter.is_empty() + } + + /// Loads the program via seccomp into the current process. + /// + /// Once loaded, the seccomp filter can never be removed. Additional seccomp + /// filters can be loaded, however, and they will chain together and be + /// executed in reverse order. + /// + /// NOTE: The maximum size of any single seccomp-bpf filter is 4096 + /// instructions. The overall limit is 32768 instructions across all loaded + /// filters. + /// + /// See [`seccomp(2)`](https://man7.org/linux/man-pages/man2/seccomp.2.html) + /// for more details. + pub fn load(&self) -> Result<(), Errno> { + let len = self.filter.len(); + + if len == 0 || len > BPF_MAXINSNS { + return Err(Errno::EINVAL); + } + + let prog = libc::sock_fprog { + // Note: length is guaranteed to be less than `u16::MAX` because of + // the above check. + len: len as u16, + filter: self.filter.as_ptr() as *mut _, + }; + + let ptr = &prog as *const libc::sock_fprog; + + Errno::result(unsafe { + libc::syscall(libc::SYS_seccomp, SECCOMP_SET_MODE_FILTER, 0, ptr) + })?; + + Ok(()) + } +} + +impl Extend for Filter { + fn extend>(&mut self, iter: T) { + self.filter.extend(iter) + } +} + +/// Trait for types that can emit BPF byte code. +pub trait ByteCode { + /// Accumulates BPF instructions into the given filter. 
+ fn into_bpf(self, filter: &mut Filter); +} + +impl ByteCode for F +where + F: FnOnce(&mut Filter), +{ + fn into_bpf(self, filter: &mut Filter) { + self(filter) + } +} + +impl ByteCode for sock_filter { + fn into_bpf(self, filter: &mut Filter) { + filter.push(self) + } +} + +/// Returns a seccomp-bpf filter containing the given list of instructions. +/// +/// This can be concatenated with other seccomp-BPF programs. +/// +/// Note that this is not a true BPF program. Seccomp-bpf is a subset of BPF and +/// so many instructions are not available. +/// +/// When executing instructions, the BPF program operates on the syscall +/// information made available as a (read-only) buffer of the following form: +/// +/// ```no_compile +/// struct seccomp_data { +/// // The syscall number. +/// nr: u32, +/// // `AUDIT_ARCH_*` value (see ` { + { + let mut filter = Filter::new(); + $( + $inst.into_bpf(&mut filter); + )+ + filter + } + }; +} + +// See: /include/uapi/linux/filter.h +pub const fn BPF_STMT(code: u16, k: u32) -> sock_filter { + sock_filter { + code, + jt: 0, + jf: 0, + k, + } +} + +/// A BPF jump instruction. +/// +/// # Arguments +/// +/// * `code` is the operation code. +/// * `k` is the value operated on for comparisons. +/// * `jt` is the relative offset to jump to if the comparison is true. +/// * `jf` is the relative offset to jump to if the comparison is false. +/// +/// # Example +/// +/// ```no_compile +/// // Jump to the next instruction if the loaded value is equal to 42. +/// BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 42, 1, 0); +/// ``` +pub const fn BPF_JUMP(code: u16, k: u32, jt: u8, jf: u8) -> sock_filter { + sock_filter { code, jt, jf, k } +} + +/// Loads the syscall number into `seccomp_data.nr`. +pub const LOAD_SYSCALL_NR: sock_filter = BPF_STMT(BPF_LD + BPF_W + BPF_ABS, SECCOMP_DATA_OFFSET_NR); + +/// Returns from the seccomp filter, allowing the syscall to pass through. 
+#[allow(unused)] +pub const ALLOW: sock_filter = BPF_STMT(BPF_RET + BPF_K, libc::SECCOMP_RET_ALLOW); + +/// Returns from the seccomp filter, instructing the kernel to kill the calling +/// thread with `SIGSYS` before executing the syscall. +#[allow(unused)] +pub const DENY: sock_filter = BPF_STMT(BPF_RET + BPF_K, libc::SECCOMP_RET_KILL_THREAD); + +/// Returns from the seccomp filter, causing a `SIGSYS` to be sent to the calling +/// thread skipping over the syscall without executing it. Unlike [`DENY`], this +/// signal can be caught. +#[allow(unused)] +pub const TRAP: sock_filter = BPF_STMT(BPF_RET + BPF_K, libc::SECCOMP_RET_TRAP); + +/// Returns from the seccomp filter, causing `PTRACE_EVENT_SECCOMP` to be +/// generated for this syscall (if `PTRACE_O_TRACESECCOMP` is enabled). If no +/// tracer is present, the syscall will not be executed and returns a `ENOSYS` +/// instead. +/// +/// `data` is made available to the tracer via `PTRACE_GETEVENTMSG`. +#[allow(unused)] +pub fn TRACE(data: u16) -> sock_filter { + BPF_STMT( + BPF_RET + BPF_K, + libc::SECCOMP_RET_TRACE | (data as u32 & libc::SECCOMP_RET_DATA), + ) +} + +/// Returns from the seccomp filter, returning the given error instead of +/// executing the syscall. +#[allow(unused)] +pub fn ERRNO(err: Errno) -> sock_filter { + BPF_STMT( + BPF_RET + BPF_K, + libc::SECCOMP_RET_ERRNO | (err.into_raw() as u32 & libc::SECCOMP_RET_DATA), + ) +} + +macro_rules! instruction { + ( + $( + $(#[$attrs:meta])* + $vis:vis fn $name:ident($($args:tt)*) { + $($instruction:expr;)* + } + )* + ) => { + $( + $vis fn $name($($args)*) -> impl ByteCode { + move |filter: &mut Filter| { + $( + $instruction.into_bpf(filter); + )* + } + } + )* + }; +} + +instruction! { + /// Checks that architecture matches our target architecture. If it does not + /// match, kills the current process. This should be the first step for every + /// seccomp filter to ensure we're working with the syscall table we're + /// expecting. 
Each architecture has a slightly different syscall table and + /// we need to make sure the syscall numbers we're using are the right ones + /// for the architecture. + pub fn VALIDATE_ARCH(target_arch: u32) { + // Load `seccomp_data.arch` + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, SECCOMP_DATA_OFFSET_ARCH); + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, target_arch, 1, 0); + BPF_STMT(BPF_RET + BPF_K, libc::SECCOMP_RET_KILL_PROCESS); + } + + pub fn LOAD_SYSCALL_IP() { + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, SECCOMP_DATA_OFFSET_IP_LO); + // M[0] = lo + BPF_STMT(BPF_ST, 0); + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, SECCOMP_DATA_OFFSET_IP_HI); + // M[1] = hi + BPF_STMT(BPF_ST, 1); + } + + /// Checks if `seccomp_data.nr` matches the given syscall. If so, then jumps + /// to `action`. + /// + /// # Example + /// ```no_compile + /// SYSCALL(Sysno::socket, DENY); + /// ``` + pub fn SYSCALL(nr: Sysno, action: sock_filter) { + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, nr as i32 as u32, 0, 1); + action; + } + + fn IP_RANGE64(blo: u32, bhi: u32, elo: u32, ehi: u32, action: sock_filter) { + // Most of the complexity below is caused by seccomp-bpf only being able + // to operate on `u32` values. We also can't reuse `JGE64` and `JLE64` + // because the jump offsets would be incorrect. + + // STEP1: if !(begin > arg) goto NOMATCH; + + // if (begin_hi > arg.hi) goto Step2; */ + BPF_JUMP(BPF_JMP + BPF_JGT + BPF_K, bhi, 4 /* goto STEP2 */, 0); + // if (begin_hi != arg.hi) goto NOMATCH; + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, bhi, 0, 9 /* goto NOMATCH */); + // Load M[0] to operate on the low bits of the IP. + BPF_STMT(BPF_LD + BPF_MEM, 0); + // if (begin_lo >= arg.lo) goto MATCH; + BPF_JUMP(BPF_JMP + BPF_JGE + BPF_K, blo, 0, 7 /* goto NOMATCH */); + // Load M[1] because the next instruction expects the high bits of the + // IP. 
+ BPF_STMT(BPF_LD + BPF_MEM, 1); + + // STEP2: if !(arg > end) goto NOMATCH; + + // if (end_hi < arg.hi) goto MATCH; + BPF_JUMP(BPF_JMP + BPF_JGT + BPF_K, ehi, 0, 4 /* goto MATCH */); + // if (end_hi != arg.hi) goto NOMATCH; + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ehi, 0, 5 /* goto NOMATCH */); + BPF_STMT(BPF_LD + BPF_MEM, 0); + // if (end_lo < arg.lo) goto MATCH; + BPF_JUMP(BPF_JMP + BPF_JGE + BPF_K, elo, 2 /* goto NOMATCH */, 0); + BPF_STMT(BPF_LD + BPF_MEM, 1); + + // MATCH: Take the action. + action; + + // NOMATCH: Load M[1] again after we loaded M[0]. + BPF_STMT(BPF_LD + BPF_MEM, 1); + } +} + +/// Checks if the instruction pointer is between a certain range. If so, executes +/// `action`. Otherwise, fall through. +/// +/// Note that if `ip == end`, this will not match. That is, the interval closed +/// at the end. +/// +/// Precondition: The instruction pointer must be loaded with [`LOAD_SYSCALL_IP`] +/// first. +pub fn IP_RANGE(begin: u64, end: u64, action: sock_filter) -> impl ByteCode { + let begin_lo = begin as u32; + let begin_hi = (begin >> 32) as u32; + let end_lo = end as u32; + let end_hi = (end >> 32) as u32; + + IP_RANGE64(begin_lo, begin_hi, end_lo, end_hi, action) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn smoke() { + let filter = seccomp_bpf![ + VALIDATE_ARCH(AUDIT_ARCH_X86_64), + LOAD_SYSCALL_NR, + SYSCALL(Sysno::open, DENY), + SYSCALL(Sysno::close, DENY), + SYSCALL(Sysno::write, DENY), + SYSCALL(Sysno::read, DENY), + ALLOW, + ]; + + assert_eq!(filter.len(), 13); + } +} diff --git a/reverie-process/src/seccomp/mod.rs b/reverie-process/src/seccomp/mod.rs new file mode 100644 index 0000000..3f2bf60 --- /dev/null +++ b/reverie-process/src/seccomp/mod.rs @@ -0,0 +1,338 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! 
Provides helpers for constructing a [`seccomp`][seccomp] filter. This is a +//! pure Rust implementation and does not require libseccomp. +//! +//! # Seccomp Background +//! +//! [`seccomp(2)`][seccomp] is a powerful tool for changing how a process tree +//! behaves when a syscall happens. Seccomp can be used to install a filter that +//! applies to every child process in a process tree. Since filters cannot be +//! removed, they can only get more restrictive. The language used for filters is +//! called `seccomp-bpf`. It is a subset of the BPF byte code language. +//! +//! Some of the restrictions include: +//! - Only being able to JMP forward and never backward. This prevents loops and +//! ensures seccomp-bpf filters always terminate. This is also true of BPF. +//! - Cannot call libbpf functions. +//! - Cannot operate on 64-bit integers, only 32-bit integers. +//! +//! [seccomp]: https://man7.org/linux/man-pages/man2/seccomp.2.html +//! +//! You can think of a seccomp-bpf program as a little function that gets +//! executed for every syscall: +//! +//! ```no_compile +//! // NOTE: seccomp-bpf programs are actually written in byte code, but if a +//! // high-level language could be compiled to BPF byte code, this is what it'd +//! // look like. +//! fn my_program(data: seccomp_data) -> Action { +//! if data.nr == 2 { +//! return Action::Trace; +//! } +//! +//! if data.nr == 3 { +//! return Action::KillProcess; +//! } +//! +//! // Allow the syscall by default. +//! Action::Allow +//! } +//! ``` +//! +//! where `seccomp_data` is defined as: +//! +//! ```no_compile +//! struct seccomp_data { +//! // The syscall number. +//! nr: u32, +//! // The architecture. +//! arch: u32, +//! // Instruction pointer. +//! ip: u64, +//! // The 6 syscall arguments. +//! args: [u64; 6], +//! } +//! ``` +//! +//! This is the only input available to the seccomp filter and is the only bit of +//! data available to make a decision about a syscall (i.e., an "action"). An +//! 
/// The action to take if the conditions of a rule all match.
#[derive(Debug, Copy, Clone)]
pub enum Action {
    /// Allows the syscall to be executed.
    Allow,

    /// Returns the specified error instead of executing the syscall.
    Errno(Errno),

    /// Prevents the syscall from being executed and the kernel will kill the
    /// calling thread with `SIGSYS`.
    KillThread,

    /// Prevents the syscall from being executed and the kernel will kill the
    /// calling process with `SIGSYS`.
    KillProcess,

    /// Same as [`Action::Allow`] but logs the call.
    Log,

    /// If the thread is being ptraced and the tracing process specified
    /// `PTRACE_O_TRACESECCOMP`, the tracing process will be notified via
    /// `PTRACE_EVENT_SECCOMP` and the value provided can be retrieved using
    /// `PTRACE_GETEVENTMSG`.
    Trace(u16),

    /// Disallows the syscall and raises a `SIGSYS` in the calling thread.
    /// Unlike [`Action::KillThread`], this signal can be caught.
    Trap,
}
Each architecture has a slightly different syscall table and we + /// need to make sure the syscall numbers we're using are the right ones for + /// the architecture. + /// + /// By default, the target architecture is set to the architecture of the + /// current program (i.e., `TargetArch::CURRENT`). + pub fn target_arch(&mut self, target_arch: TargetArch) -> &mut Self { + self.target_arch = target_arch; + self + } + + /// The default action to take if there are no matches. By default, the + /// default action is to kill the current thread (i.e., the filter becomes an + /// allowlist). + /// + /// When using an allowlist of syscalls, this should be set to + /// `Action::KillThread` or `Action::KillProcess`. + /// + /// When using a blocklist of syscalls, this should be set to + /// `Action::Allow`. + pub fn default_action(&mut self, action: Action) -> &mut Self { + self.default_action = action; + self + } + + /// Sets the action to take for the given syscall. + pub fn syscall(&mut self, syscall: Sysno, action: Action) -> &mut Self { + self.syscalls.insert(syscall, action); + self + } + + /// Sets the action to take for a set of syscalls. + pub fn syscalls(&mut self, table: I) -> &mut Self + where + I: IntoIterator, + { + self.syscalls.extend(table); + self + } + + /// Take an action if the instruction pointer `ip >= begin && ip < end`. + /// + /// This is useful in conjunction with `mmap`. For example, we can use this + /// to deny any syscalls made outside of `ld.so` or `libc.so`. It can also be + /// used to avoid tracing syscalls injected with ptrace. + /// + /// Multiple ranges can be added and are checked in sequence. + pub fn ip_range(&mut self, begin: u64, end: u64, action: Action) -> &mut Self { + self.ip_ranges.push((begin, end, action)); + self + } + + /// Adds multiple IP ranges. This is equivalent to calling + /// [`FilterBuilder::ip_range`] multiple times. 
+ pub fn ip_ranges(&mut self, ranges: I) -> &mut Self + where + I: IntoIterator, + { + self.ip_ranges.extend(ranges); + self + } + + /// Generates the byte code for the filter. + pub fn build(&self) -> Filter { + let mut filter = Filter::new(); + + // This should be the first step for every seccomp-bpf filter. + VALIDATE_ARCH(self.target_arch as u32).into_bpf(&mut filter); + + if !self.ip_ranges.is_empty() { + LOAD_SYSCALL_IP().into_bpf(&mut filter); + + for (begin, end, action) in &self.ip_ranges { + IP_RANGE(*begin, *end, (*action).into()).into_bpf(&mut filter); + } + } + + if !self.syscalls.is_empty() { + // Load the syscall number. + LOAD_SYSCALL_NR.into_bpf(&mut filter); + + for (syscall, action) in &self.syscalls { + SYSCALL(*syscall, (*action).into()).into_bpf(&mut filter); + } + } + + // The default action is always performed last. + sock_filter::from(self.default_action).into_bpf(&mut filter); + + filter + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn smoke() { + assert_eq!( + FilterBuilder::new() + .default_action(Action::Allow) + .target_arch(TargetArch::x86_64) + .syscalls([ + (Sysno::read, Action::KillThread), + (Sysno::write, Action::KillThread), + (Sysno::open, Action::KillThread), + (Sysno::close, Action::KillThread), + (Sysno::write, Action::KillThread), + ]) + .build(), + seccomp_bpf![ + VALIDATE_ARCH(AUDIT_ARCH_X86_64), + LOAD_SYSCALL_NR, + SYSCALL(Sysno::read, DENY), + SYSCALL(Sysno::write, DENY), + SYSCALL(Sysno::open, DENY), + SYSCALL(Sysno::close, DENY), + ALLOW, + ] + ); + } +} diff --git a/reverie-process/src/spawn.rs b/reverie-process/src/spawn.rs new file mode 100644 index 0000000..2d28db2 --- /dev/null +++ b/reverie-process/src/spawn.rs @@ -0,0 +1,142 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +use super::clone::clone; +use super::error::{Context, Error}; +use super::fd::{pipe, Fd}; +use super::id_map::make_id_map; +use super::stdio::{ChildStderr, ChildStdin, ChildStdout}; +use super::util::CStringArray; +use super::Child; +use super::Command; + +use super::container::ChildContext; + +use std::io; +use std::io::Write; + +impl Command { + /// Executes the command as a child process, returning a handle to it. + /// + /// By default, stdin, stdout and stderr are inherited from the parent. + pub fn spawn(&mut self) -> Result { + // Create a pipe to send back errors to the parent process if `execve` + // fails. + let (reader, mut writer) = pipe()?; + + let child = self.spawn_with(|err| { + send_error(&mut writer, err); + 1 + })?; + + // Close the writer end. Otherwise, the following read will hang + // forever. + drop(writer); + + recv_error(reader)?; + + Ok(child) + } + + /// Spawn the child with helper functions. The `onfail` callback runs in the + /// child process if an error occurs during execution of the process. The + /// `wait` function can be used to wait for the child to fully start up and + /// to transform it into another type. 
+ pub fn spawn_with(&mut self, mut onfail: F) -> Result + where + F: FnMut(Error) -> i32, + { + let env = self.container.env.array(); + + // Set up IO pipes + let (stdin, child_stdin) = self.container.stdin.pipes(true)?; + let (stdout, child_stdout) = self.container.stdout.pipes(false)?; + let (stderr, child_stderr) = self.container.stdout.pipes(false)?; + + let clone_flags = self.container.namespace.bits() | libc::SIGCHLD; + + let uid_map = &make_id_map(&self.container.uid_map); + let gid_map = &make_id_map(&self.container.gid_map); + + let context = ChildContext { + stdin: child_stdin.as_ref(), + stdout: child_stdout.as_ref(), + stderr: child_stderr.as_ref(), + uid_map, + gid_map, + }; + + let pid = clone( + || { + let code = onfail(self.do_exec(&context, &env).unwrap_err()); + unsafe { libc::_exit(code) } + }, + clone_flags, + )?; + + drop(child_stdin); + drop(child_stdout); + drop(child_stderr); + drop(self.container.pty.take()); + + let stdin = stdin.map(ChildStdin::new).transpose()?; + let stdout = stdout.map(ChildStdout::new).transpose()?; + let stderr = stderr.map(ChildStderr::new).transpose()?; + + Ok(Child { + pid, + exit_status: None, + stdin, + stdout, + stderr, + }) + } + + /// Note: This function MUST NOT allocate or deallocate any memory. Doing so + /// can cause deadlocks. + fn do_exec(&mut self, context: &ChildContext, env: &CStringArray) -> Result { + self.container.setup(context, &mut self.pre_exec)?; + + let err = Error::result( + unsafe { libc::execvpe(self.program.as_ptr(), self.args.as_ptr(), env.as_ptr()) }, + Context::Exec, + ) + .unwrap_err(); + + Err(err) + } +} + +/// Sends an error and closes the pipe. Ignore any errors if this fails. +pub fn send_error(fd: &mut Fd, err: Error) { + // Writes up to PIPE_BUF (4096) should be atomic. There's also nothing we + // can do with an error if this fails. + let bytes: [u8; 8] = err.into(); + let _ = fd.write(&bytes); +} + +/// Tries to receive an error code from the pipe. 
If the other end of the +/// pipe is closed before sending an error, then `Ok(())` is returned. +pub fn recv_error(mut fd: Fd) -> Result<(), Error> { + use std::io::Read; + let mut err = [0u8; 8]; + loop { + match fd.read(&mut err) { + Ok(0) => return Ok(()), + Ok(8) => return Err(Error::from(err)), + Ok(n) => { + // Sends up to PIPE_BUF (4096) should be atomic. + panic!("execve pipe: got unexpected number of bytes {}", n); + } + Err(err) if err.kind() == io::ErrorKind::Interrupted => {} + Err(err) => { + panic!("execve pipe: read returned unexpected error {}", err); + } + } + } +} diff --git a/reverie-process/src/stdio.rs b/reverie-process/src/stdio.rs new file mode 100644 index 0000000..7f81331 --- /dev/null +++ b/reverie-process/src/stdio.rs @@ -0,0 +1,240 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +use super::fd::{pipe, AsyncFd, Fd}; + +use core::pin::Pin; +use core::task::{Context, Poll}; +use std::io; +use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd}; + +use syscalls::Errno; +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; + +/// Describes what to do with a standard I/O stream for a child process when +/// passed to the [`stdin`], [`stdout`], and [`stderr`] methods of [`Command`]. +/// +/// [`stdin`]: super::Command::stdin +/// [`stdout`]: super::Command::stdout +/// [`stderr`]: super::Command::stderr +/// [`Command`]: super::Command +#[derive(Debug)] +pub struct Stdio(InnerStdio); + +/// A handle to a child process's standard input (stdin). +/// +/// This struct is used in the [`stdin`] field on [`Child`]. +/// +/// When an instance of `ChildStdin` is [dropped], the `ChildStdin`'s underlying +/// file handle will be closed. If the child process was blocked on input prior +/// to being dropped, it will become unblocked after dropping. 
+/// +/// [`stdin`]: super::Child::stdin +/// [`Child`]: super::Child +/// [dropped]: Drop +#[derive(Debug)] +pub struct ChildStdin(AsyncFd); + +/// A handle to a child process's standard output (stdout). +/// +/// This struct is used in the [`stdout`] field on [`Child`]. +/// +/// When an instance of `ChildStdout` is [dropped], the `ChildStdout`'s +/// underlying file handle will be closed. +/// +/// [`stdout`]: super::Child::stdout +/// [`Child`]: super::Child +/// [dropped]: Drop +#[derive(Debug)] +pub struct ChildStdout(AsyncFd); + +/// A handle to a child process's stderr. +/// +/// This struct is used in the [`stderr`] field on [`Child`]. +/// +/// When an instance of `ChildStderr` is [dropped], the `ChildStderr`'s +/// underlying file handle will be closed. +/// +/// [`stderr`]: super::Child::stderr +/// [`Child`]: super::Child +/// [dropped]: Drop +#[derive(Debug)] +pub struct ChildStderr(AsyncFd); + +#[derive(Debug)] +enum InnerStdio { + Inherit, + Null, + Piped, + File(Fd), +} + +impl Default for Stdio { + fn default() -> Self { + Self(InnerStdio::Inherit) + } +} + +impl Stdio { + /// A new pipe should be arranged to connect the parent and child processes. + pub fn piped() -> Self { + Self(InnerStdio::Piped) + } + + /// The child inherits from the corresponding parent descriptor. This is the default mode. + pub fn inherit() -> Self { + Self(InnerStdio::Inherit) + } + + /// This stream will be ignored. This is the equivalent of attaching the + /// stream to `/dev/null`. + pub fn null() -> Self { + Self(InnerStdio::Null) + } + + /// Returns a pair of file descriptors, one for the parent and one for the + /// child. If the child's file descriptor is `None`, then it shall be + /// inherited from the parent. If the parent's file descriptor is `None`, + /// then there is no link to the child and the child owns the other half of + /// the file descriptor (if any). Both file descriptors will be `None` if + /// stdio is being inherited. 
+ pub(super) fn pipes(&self, readable: bool) -> Result<(Option, Option), Errno> { + match &self.0 { + InnerStdio::Inherit => Ok((None, None)), + InnerStdio::Null => Ok((None, Some(Fd::null(readable)?))), + InnerStdio::Piped => { + let (reader, writer) = pipe()?; + let (parent, child) = if readable { + (writer, reader) + } else { + (reader, writer) + }; + Ok((Some(parent), Some(child))) + } + InnerStdio::File(file) => Ok((None, Some(file.dup()?))), + } + } +} + +impl From for Stdio { + fn from(f: T) -> Self { + Self(InnerStdio::File(Fd::new(f.into_raw_fd()))) + } +} + +impl From for std::process::Stdio { + fn from(stdio: Stdio) -> Self { + match stdio.0 { + InnerStdio::Inherit => Self::inherit(), + InnerStdio::Null => Self::null(), + InnerStdio::Piped => Self::piped(), + InnerStdio::File(fd) => Self::from(std::fs::File::from(fd)), + } + } +} + +impl ChildStdin { + pub(super) fn new(fd: Fd) -> Result { + AsyncFd::writable(fd).map(Self) + } +} + +impl ChildStdout { + pub(super) fn new(fd: Fd) -> Result { + AsyncFd::readable(fd).map(Self) + } +} + +impl ChildStderr { + pub(super) fn new(fd: Fd) -> Result { + AsyncFd::readable(fd).map(Self) + } +} + +impl AsyncWrite for ChildStdin { + fn poll_write( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + Pin::new(&mut self.0).poll_write(cx, buf) + } + + fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.0).poll_flush(cx) + } + + fn poll_shutdown(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.0).poll_shutdown(cx) + } +} + +impl AsyncRead for ChildStdout { + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut Context, + buf: &mut ReadBuf, + ) -> Poll> { + Pin::new(&mut self.0).poll_read(cx, buf) + } +} + +impl AsyncRead for ChildStderr { + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut Context, + buf: &mut ReadBuf, + ) -> Poll> { + Pin::new(&mut self.0).poll_read(cx, buf) + } +} + +impl FromRawFd for ChildStdin { 
+ unsafe fn from_raw_fd(fd: i32) -> Self { + Self::new(Fd::new(fd)).unwrap() + } +} + +impl FromRawFd for ChildStdout { + unsafe fn from_raw_fd(fd: i32) -> Self { + Self::new(Fd::new(fd)).unwrap() + } +} + +impl FromRawFd for ChildStderr { + unsafe fn from_raw_fd(fd: i32) -> Self { + Self::new(Fd::new(fd)).unwrap() + } +} + +impl From for ChildStdin { + fn from(io: tokio::process::ChildStdin) -> Self { + let fd = io.as_raw_fd(); + let fd = unsafe { libc::dup(fd) }; + drop(io); + unsafe { Self::from_raw_fd(fd) } + } +} + +impl From for ChildStdout { + fn from(io: tokio::process::ChildStdout) -> Self { + let fd = io.as_raw_fd(); + let fd = unsafe { libc::dup(fd) }; + drop(io); + unsafe { Self::from_raw_fd(fd) } + } +} + +impl From for ChildStderr { + fn from(io: tokio::process::ChildStderr) -> Self { + let fd = io.as_raw_fd(); + let fd = unsafe { libc::dup(fd) }; + drop(io); + unsafe { Self::from_raw_fd(fd) } + } +} diff --git a/reverie-process/src/util.rs b/reverie-process/src/util.rs new file mode 100644 index 0000000..d5976bb --- /dev/null +++ b/reverie-process/src/util.rs @@ -0,0 +1,81 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
/// An owned list of C strings together with a NULL-terminated array of
/// pointers to them, in the layout expected for `argv`/`envp`-style
/// arguments.
///
/// Invariant maintained by every method: `ptrs` has exactly one more element
/// than `items`, `ptrs[i]` points at the contents of `items[i]`, and the last
/// element of `ptrs` is NULL.
pub struct CStringArray {
    // Owns the string data that `ptrs` points into.
    items: Vec<CString>,
    // Pointers into `items`, followed by a terminating NULL.
    ptrs: Vec<*const libc::c_char>,
}

impl CStringArray {
    /// Creates an empty array with room for `capacity` strings (plus the
    /// terminating NULL pointer).
    pub fn with_capacity(capacity: usize) -> Self {
        let mut result = CStringArray {
            items: Vec::with_capacity(capacity),
            ptrs: Vec::with_capacity(capacity + 1),
        };
        // The pointer array is NULL-terminated even when it is empty.
        result.ptrs.push(core::ptr::null());
        result
    }

    /// Appends `item` to the end of the array.
    pub fn push(&mut self, item: CString) {
        let l = self.ptrs.len();
        // Overwrite the current terminator with the new string, then append
        // a fresh terminator.
        self.ptrs[l - 1] = item.as_ptr();
        self.ptrs.push(core::ptr::null());
        // NOTE(review): this relies on the pointer staying valid after `item`
        // is moved into `items`, i.e. on the string data living on the heap.
        self.items.push(item);
    }

    /// Returns a pointer to the NULL-terminated pointer array. The pointer is
    /// only valid while `self` is alive and not modified.
    pub fn as_ptr(&self) -> *const *const libc::c_char {
        self.ptrs.as_ptr()
    }

    /// Replaces the string at index `i` with `item`.
    ///
    /// # Panics
    ///
    /// Panics if `i` is out of bounds.
    pub fn set(&mut self, i: usize, item: CString) {
        self.ptrs[i] = item.as_ptr();
        self.items[i] = item;
    }

    /// Returns the string at index `i`.
    ///
    /// # Panics
    ///
    /// Panics if `i` is out of bounds.
    pub fn get(&self, i: usize) -> &CStr {
        self.items[i].as_ref()
    }

    /// Iterates over the stored strings in order.
    pub fn iter(&self) -> impl Iterator<Item = &CStr> {
        self.items.iter().map(|x| x.as_ref())
    }
}
+ let mut set = MaybeUninit::::uninit(); + Errno::result(libc::sigemptyset(set.as_mut_ptr()))?; + Errno::result(libc::pthread_sigmask( + libc::SIG_SETMASK, + set.as_ptr(), + core::ptr::null_mut(), + ))?; + + let ret = libc::signal(libc::SIGPIPE, libc::SIG_DFL); + if ret == libc::SIG_ERR { + return Err(Errno::last()); + } + + Ok(()) +} diff --git a/reverie-ptrace/Cargo.toml b/reverie-ptrace/Cargo.toml new file mode 100644 index 0000000..d1893f5 --- /dev/null +++ b/reverie-ptrace/Cargo.toml @@ -0,0 +1,38 @@ +# @generated by autocargo + +[package] +name = "reverie-ptrace" +version = "0.1.0" +authors = ["Facebook"] +edition = "2021" +license = "BSD-2-Clause" + +[dependencies] +anyhow = "1.0.51" +async-trait = "0.1.51" +bincode = "1.3.3" +bitflags = "1.3" +bytes = { version = "1.1", features = ["serde"] } +futures = { version = "0.3.13", features = ["async-await", "compat"] } +goblin = "0.3" +lazy_static = "1.0" +libc = "0.2.98" +nix = "0.22" +num-traits = "0.2" +parking_lot = { version = "0.11.2", features = ["send_guard"] } +paste = "1.0" +perf-event-open-sys = "1.0" +procfs = "0.9" +raw-cpuid = "9.0" +reverie = { version = "0.1.0", path = "../reverie" } +serde = { version = "1.0.126", features = ["derive", "rc"] } +thiserror = "1.0.29" +tokio = { version = "1.10", features = ["full", "test-util", "tracing"] } +tokio-stream = { version = "0.1.4", features = ["fs", "io-util", "net", "signal", "sync", "time"] } +tracing = "0.1.29" +tracing-subscriber = { version = "0.3.3", features = ["ansi", "env-filter", "fmt", "json", "parking_lot", "registry"] } +unwind = { version = "0.4", features = ["ptrace"] } + +[dev-dependencies] +quickcheck = "1.0" +quickcheck_macros = "1.0" diff --git a/reverie-ptrace/src/children.rs b/reverie-ptrace/src/children.rs new file mode 100644 index 0000000..01f787e --- /dev/null +++ b/reverie-ptrace/src/children.rs @@ -0,0 +1,100 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +use std::future::Future; +use std::marker::Unpin; +use std::mem; +use std::pin::Pin; +use std::slice; +use std::task::{Context, Poll}; + +use futures::future::FutureExt; + +/// Represents a set of children. +#[derive(Clone, Default)] +pub struct Children { + inner: Vec, +} + +impl<'a, T> IntoIterator for &'a Children { + type Item = &'a T; + type IntoIter = slice::Iter<'a, T>; + + fn into_iter(self) -> slice::Iter<'a, T> { + self.inner.iter() + } +} + +impl<'a, T> IntoIterator for &'a mut Children { + type Item = &'a mut T; + type IntoIter = slice::IterMut<'a, T>; + + fn into_iter(self) -> slice::IterMut<'a, T> { + self.inner.iter_mut() + } +} + +#[allow(unused)] +impl Children { + pub fn new() -> Self { + Children { inner: Vec::new() } + } + + pub fn push(&mut self, item: T) { + self.inner.push(item); + } + + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + pub fn len(&self) -> usize { + self.inner.len() + } + + pub fn into_inner(self) -> Vec { + self.inner + } + + pub fn take_inner(&mut self) -> Vec { + mem::take(&mut self.inner) + } + + pub fn retain(&mut self, f: F) + where + F: FnMut(&T) -> bool, + { + self.inner.retain(f) + } +} + +impl Future for Children +where + T: Future + Unpin, +{ + // (Orphans, Finished) + type Output = (Self, Vec); + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll { + let mut inner = mem::take(&mut self.inner); + let mut ready = Vec::new(); + + // Iterate backwards through the vec. If an item is ready, swap_remove + // it. It is important to iterate backwards so that swap_remove doesn't + // perturb the ordering on the part of the vec we haven't yet iterated + // over. 
+ for i in (0..self.inner.len()).rev() { + if let Poll::Ready(x) = inner[i].poll_unpin(cx) { + inner.swap_remove(i); + ready.push(x); + } + } + + Poll::Ready((Children { inner }, ready)) + } +} diff --git a/reverie-ptrace/src/cp/consts.rs b/reverie-ptrace/src/cp/consts.rs new file mode 100644 index 0000000..3dbaba8 --- /dev/null +++ b/reverie-ptrace/src/cp/consts.rs @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2018-2019, Trustees of Indiana University + * ("University Works" via Baojun Wang) + * Copyright (c) 2018-2019, Ryan Newton + * ("Traditional Works of Scholarship") + * Copyright (c) 2020-, Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/// A page that is reserved by Reverie in every guest process. +pub const PRIVATE_PAGE_OFFSET: u64 = 0x7000_0000; + +/// trampoline data from private pages +pub const TRAMPOLINE_BASE: u64 = PRIVATE_PAGE_OFFSET; +pub const TRAMPOLINE_SIZE: usize = 0x1000; + +/// total private page size +pub const PRIVATE_PAGE_SIZE: usize = TRAMPOLINE_SIZE; diff --git a/reverie-ptrace/src/cp/mmap.rs b/reverie-ptrace/src/cp/mmap.rs new file mode 100644 index 0000000..759a5fc --- /dev/null +++ b/reverie-ptrace/src/cp/mmap.rs @@ -0,0 +1,40 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use super::consts::*; +use nix::{ + sys::uio::{self, IoVec, RemoteIoVec}, + unistd::Pid, +}; + +/// generate syscall instructions at injected page +/// the page address should be 0x7000_0000 (PRIVATE_PAGE_OFFSET) +/// the byte code can be confirmed by running objcopy +/// x86_64-linux-gnu-objcopy -I binary /tmp/1.bin -O elf64-x86-64 -B i386:x86-64 /tmp/1.elf +/// then objdump -d 1.elf must match the instructions listed below. 
+pub fn populate_mmap_page(pid: Pid, page_address: u64) -> nix::Result<()> { + /* the syscall sequences used here: + * 0: 0f 05 syscall // untraced syscall + * 2: 0f 0b ud2 + * 4: 0f 05 syscall // traced syscall + * 6: 0f 0b ud2 + */ + let mut syscall_stubs: Vec = vec![0x0f, 0x05, 0x0f, 0x0b, 0x0f, 0x05, 0x0f, 0x0b]; + syscall_stubs.resize_with(TRAMPOLINE_SIZE, || 0xcc); + let local_iov = &[IoVec::from_slice(syscall_stubs.as_slice())]; + let remote_iov = &[RemoteIoVec { + base: page_address as usize, + len: TRAMPOLINE_SIZE, + }]; + + // initialize the whole page with int3 to prevent unintended + // execution in our injected page. + uio::process_vm_writev(pid, local_iov, remote_iov)?; + Ok(()) +} diff --git a/reverie-ptrace/src/cp/mod.rs b/reverie-ptrace/src/cp/mod.rs new file mode 100644 index 0000000..bdffe8f --- /dev/null +++ b/reverie-ptrace/src/cp/mod.rs @@ -0,0 +1,14 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +mod consts; +mod mmap; + +pub use consts::*; +pub use mmap::populate_mmap_page; diff --git a/reverie-ptrace/src/debug.rs b/reverie-ptrace/src/debug.rs new file mode 100644 index 0000000..a1367df --- /dev/null +++ b/reverie-ptrace/src/debug.rs @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2018-2019, Trustees of Indiana University + * ("University Works" via Baojun Wang) + * Copyright (c) 2018-2019, Ryan Newton + * ("Traditional Works of Scholarship") + * Copyright (c) 2020-, Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! 
convenient functions for debugging tracees + +use core::fmt; + +use nix::sys::{ptrace, signal}; +use reverie::syscalls::{Addr, MemoryAccess}; +use reverie::Pid; +use tracing::debug; + +use crate::trace::Stopped; + +// TODO: could check whether or not stack is valid +fn show_stackframe(tid: Pid, stack: u64, top_size: usize, bot_size: usize) -> String { + let mut text = String::new(); + if stack < top_size as u64 { + return text; + } + let sp_top = stack - top_size as u64; + let sp_bot = stack + bot_size as u64; + let mut sp = sp_top; + + while sp <= sp_bot { + match ptrace::read(tid.into(), sp as ptrace::AddressType) { + Err(_) => break, + Ok(x) => { + if sp == stack { + text += &format!(" => {:12x}: {:16x}\n", sp, x); + } else { + text += &format!(" {:12x}: {:16x}\n", sp, x); + } + } + } + sp += 8; + } + text +} + +fn show_user_regs(regs: &libc::user_regs_struct) -> String { + let mut res = String::new(); + + res += &format!( + "rax {:16x} rbx {:16x} rcx {:16x} rdx {:16x}\n", + regs.rax, regs.rbx, regs.rcx, regs.rdx + ); + res += &format!( + "rsi {:16x} rdi {:16x} rbp {:16x} rsp {:16x}\n", + regs.rsi, regs.rdi, regs.rbp, regs.rsp + ); + res += &format!( + " r8 {:16x} r9 {:16x} r10 {:16x} r11 {:16x}\n", + regs.r8, regs.r9, regs.r10, regs.r11 + ); + res += &format!( + "r12 {:16x} r13 {:16x} r14 {:16x} r15 {:16x}\n", + regs.r12, regs.r13, regs.r14, regs.r15 + ); + res += &format!("rip {:16x} eflags {:16x}\n", regs.rip, regs.eflags); + res += &format!( + "cs {:x} ss {:x} ds {:x} es {:x}\nfs {:x} gs {:x}", + regs.cs, regs.ss, regs.ds, regs.es, regs.fs, regs.gs + ); + res +} + +fn show_proc_maps(maps: &procfs::process::MemoryMap) -> String { + use procfs::process::MMapPath; + let mut res = String::new(); + let fp = match &maps.pathname { + MMapPath::Path(path) => String::from(path.to_str().unwrap_or("")), + MMapPath::Vdso => String::from("[vdso]"), + MMapPath::Vvar => String::from("[vvar]"), + MMapPath::Vsyscall => String::from("[vsyscall]"), + MMapPath::Stack => 
String::from("[stack]"), + MMapPath::Other(s) => s.clone(), + _ => String::from(""), + }; + let s = format!( + "{:x}-{:x} {} {:08x} {:02x}:{:02x} {}", + maps.address.0, maps.address.1, maps.perms, maps.offset, maps.dev.0, maps.dev.1, maps.inode + ); + res.push_str(&s); + (0..=72 - s.len()).for_each(|_| res.push(' ')); + res.push_str(&fp); + res +} + +fn task_rip_is_valid(pid: Pid, rip: u64) -> bool { + let mut has_valid_rip = None; + if let Ok(mapping) = procfs::process::Process::new(pid.as_raw()).and_then(|p| p.maps()) { + has_valid_rip = mapping + .iter() + .find(|e| e.perms.contains('x') && e.address.0 <= rip && e.address.1 > rip + 0x10) + .cloned(); + } + has_valid_rip.is_some() +} + +// XXX: should limit nb calls to procfs. +/// show task fault context +pub fn show_fault_context(task: &Stopped, sig: signal::Signal) { + let regs = task.getregs().unwrap(); + let siginfo = task.getsiginfo().unwrap(); + debug!( + "{:?} got {:?} si_errno: {}, si_code: {}, regs\n{}", + task, + sig, + siginfo.si_errno, + siginfo.si_code, + show_user_regs(®s) + ); + + debug!( + "stackframe from rsp@{:x}\n{}", + regs.rsp, + show_stackframe(task.pid(), regs.rsp, 0x40, 0x80) + ); + + if task_rip_is_valid(task.pid(), regs.rip) { + if let Some(addr) = Addr::from_raw(regs.rip as usize) { + let mut buf: [u8; 16] = [0; 16]; + if task.read_exact(addr, &mut buf).is_ok() { + debug!("insn @{:x?} = {:02x?}", addr, buf); + } + } + } else { + debug!("insn @{:x?} = ", regs.rip); + } + + procfs::process::Process::new(task.pid().as_raw()) + .and_then(|p| p.maps()) + .unwrap_or_else(|_| Vec::new()) + .iter() + .for_each(|e| { + debug!("{}", show_proc_maps(e)); + }); +} + +/// As a debugging aid, dump the current state of the guest in a readbale format. +/// If an optional snapshot of an earlier register state is provided, the results +/// will be printed a DIFF from that previous state. 
+pub fn log_guest_state(context_msg: &str, tid: Pid, old_regs: &Option) { + // TODO: could certainly derive this "diffing" functionality as a macro if + // there is a library for that. + let hdr = format!("{}: guest state (tid {}) has ...", context_msg, tid); + let cur = ptrace::getregs(tid.into()).unwrap(); + match old_regs { + None => debug!("{} regs = {:?}", hdr, cur), + Some(old) => { + let mut msg = String::from(" DIFF in regs from prev (new/old): "); + let len1 = msg.len(); + if cur.r15 != old.r15 { + msg.push_str(&format!("r15: {}/{} ", cur.r15, old.r15)); + } + if cur.r14 != old.r14 { + msg.push_str(&format!("r14: {}/{} ", cur.r14, old.r14)); + } + if cur.r13 != old.r13 { + msg.push_str(&format!("r13: {}/{} ", cur.r13, old.r13)); + } + if cur.r12 != old.r12 { + msg.push_str(&format!("r12: {}/{} ", cur.r12, old.r12)); + } + if cur.rbp != old.rbp { + msg.push_str(&format!("rbp: {}/{} ", cur.rbp, old.rbp)); + } + if cur.rbx != old.rbx { + msg.push_str(&format!("rbx: {}/{} ", cur.rbx, old.rbx)); + } + if cur.r11 != old.r11 { + msg.push_str(&format!("r11: {}/{} ", cur.r11, old.r11)); + } + if cur.r10 != old.r10 { + msg.push_str(&format!("r10: {}/{} ", cur.r10, old.r10)); + } + if cur.r9 != old.r9 { + msg.push_str(&format!("r9: {}/{} ", cur.r9, old.r9)); + } + if cur.r8 != old.r8 { + msg.push_str(&format!("r8: {}/{} ", cur.r8, old.r8)); + } + if cur.rax != old.rax { + msg.push_str(&format!("rax: {}/{} ", cur.rax, old.rax)); + } + if cur.rcx != old.rcx { + msg.push_str(&format!("rcx: {}/{} ", cur.rcx, old.rcx)); + } + if cur.rdx != old.rdx { + msg.push_str(&format!("rdx: {}/{} ", cur.rdx, old.rdx)); + } + if cur.rsi != old.rsi { + msg.push_str(&format!("rsi: {}/{} ", cur.rsi, old.rsi)); + } + if cur.rdi != old.rdi { + msg.push_str(&format!("rdi: {}/{} ", cur.rdi, old.rdi)); + } + if cur.orig_rax != old.orig_rax { + msg.push_str(&format!("orig_rax: {}/{} ", cur.orig_rax, old.orig_rax)); + } + if cur.rip != old.rip { + msg.push_str(&format!("rip: {}/{} ", cur.rip, 
old.rip)); + } + if cur.cs != old.cs { + msg.push_str(&format!("cs: {}/{} ", cur.cs, old.cs)); + } + if cur.eflags != old.eflags { + msg.push_str(&format!("eflags: {}/{} ", cur.eflags, old.eflags)); + } + if cur.rsp != old.rsp { + msg.push_str(&format!("rsp: {}/{} ", cur.rsp, old.rsp)); + } + if cur.ss != old.ss { + msg.push_str(&format!("ss: {}/{} ", cur.ss, old.ss)); + } + if cur.fs_base != old.fs_base { + msg.push_str(&format!("fs_base: {}/{} ", cur.fs_base, old.fs_base)); + } + if cur.gs_base != old.gs_base { + msg.push_str(&format!("gs_base: {}/{} ", cur.gs_base, old.gs_base)); + } + if cur.ds != old.ds { + msg.push_str(&format!("ds: {}/{} ", cur.ds, old.ds)); + } + if cur.es != old.es { + msg.push_str(&format!("es: {}/{} ", cur.es, old.es)); + } + if cur.fs != old.fs { + msg.push_str(&format!("fs: {}/{} ", cur.fs, old.fs)); + } + if cur.gs != old.gs { + msg.push_str(&format!("gs: {}/{} ", cur.gs, old.gs)); + } + if msg.len() == len1 { + debug!("{} NO differences from prev register state.", hdr) + } else { + debug!("{} {}", hdr, msg); + } + } + } +} diff --git a/reverie-ptrace/src/error.rs b/reverie-ptrace/src/error.rs new file mode 100644 index 0000000..f583a9c --- /dev/null +++ b/reverie-ptrace/src/error.rs @@ -0,0 +1,27 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use thiserror::Error; + +use crate::trace; + +/// A reverie-ptrace error. This error type isn't meant to be exposed to the +/// user. +#[derive(Error, Debug)] +pub enum Error { + /// An internal error that is only ever meant to be used as a reverie-ptrace + /// implementation detail. None of these errors should make it through to the + /// user. + #[error(transparent)] + Internal(#[from] trace::Error), + + /// A public error. 
/// Breakpoint type
#[derive(PartialEq, Debug)]
pub enum BreakpointType {
    /// Software breakpoint
    Software,
    /// Hardware breakpoint
    Hardware,
    /// Read watchpoint
    ReadWatch,
    /// Write watchpoint
    WriteWatch,
}

impl BreakpointType {
    /// Maps the numeric type field of a GDB `z`/`Z` packet to a breakpoint
    /// type.
    ///
    /// The remote protocol numbers the types `0` software breakpoint, `1`
    /// hardware breakpoint, `2` write watchpoint and `3` read watchpoint.
    /// Every other value, including `4` (access watchpoint), returns `None`.
    pub fn new(ty: i32) -> Option<Self> {
        match ty {
            0 => Some(BreakpointType::Software),
            1 => Some(BreakpointType::Hardware),
            2 => Some(BreakpointType::WriteWatch),
            3 => Some(BreakpointType::ReadWatch),
            _ => None,
        }
    }
}
+ */ + +use crate::gdbstub::{commands::*, hex::*}; +use bytes::BytesMut; + +#[derive(PartialEq, Debug)] +pub struct QStartNoAckMode; + +impl ParseCommand for QStartNoAckMode { + fn parse(bytes: BytesMut) -> Option { + if bytes.is_empty() { + Some(QStartNoAckMode) + } else { + None + } + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_QThreadEvents.rs b/reverie-ptrace/src/gdbstub/commands/base/_QThreadEvents.rs new file mode 100644 index 0000000..27cc4c1 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_QThreadEvents.rs @@ -0,0 +1,32 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use crate::gdbstub::{commands::*, hex::*}; +use bytes::BytesMut; + +#[derive(PartialEq, Debug)] +pub struct QThreadEvents { + pub enable: bool, +} + +impl ParseCommand for QThreadEvents { + fn parse(bytes: BytesMut) -> Option { + if !bytes.starts_with(b":") { + None + } else { + let value: u32 = decode_hex(&bytes[1..]).ok()?; + if value != 0 && value != 1 { + None + } else { + let enable = value == 1; + Some(QThreadEvents { enable }) + } + } + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_QuestionMark.rs b/reverie-ptrace/src/gdbstub/commands/base/_QuestionMark.rs new file mode 100644 index 0000000..c3e381b --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_QuestionMark.rs @@ -0,0 +1,24 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +use crate::gdbstub::{commands::*, hex::*}; +use bytes::BytesMut; + +#[derive(PartialEq, Debug)] +pub struct QuestionMark; + +impl ParseCommand for QuestionMark { + fn parse(bytes: BytesMut) -> Option { + if bytes.is_empty() { + Some(QuestionMark) + } else { + None + } + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_c.rs b/reverie-ptrace/src/gdbstub/commands/base/_c.rs new file mode 100644 index 0000000..c2eb251 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_c.rs @@ -0,0 +1,16 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use bytes::BytesMut; + +use crate::gdbstub::commands::ParseCommand; + +pub struct c { + pub addr: Option, +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_d_upper.rs b/reverie-ptrace/src/gdbstub/commands/base/_d_upper.rs new file mode 100644 index 0000000..76683c4 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_d_upper.rs @@ -0,0 +1,33 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +use bytes::BytesMut; +use reverie::Pid; + +use crate::gdbstub::{commands::*, hex::*}; + +#[derive(PartialEq, Debug)] +pub struct D { + pub pid: Option, +} + +impl ParseCommand for D { + fn parse(bytes: BytesMut) -> Option { + if bytes.is_empty() { + Some(D { pid: None }) + } else if !bytes.starts_with(b";") { + None + } else { + let pid = decode_hex(&bytes[1..]).ok()?; + Some(D { + pid: Some(Pid::from_raw(pid)), + }) + } + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_g.rs b/reverie-ptrace/src/gdbstub/commands/base/_g.rs new file mode 100644 index 0000000..1954ee4 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_g.rs @@ -0,0 +1,20 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use crate::gdbstub::{commands::*, hex::*}; +use bytes::BytesMut; + +#[derive(PartialEq, Debug)] +pub struct g; + +impl ParseCommand for g { + fn parse(bytes: BytesMut) -> Option { + if bytes.is_empty() { Some(g) } else { None } + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_g_upper.rs b/reverie-ptrace/src/gdbstub/commands/base/_g_upper.rs new file mode 100644 index 0000000..7c85797 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_g_upper.rs @@ -0,0 +1,27 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +use crate::gdbstub::{commands::*, hex::*}; +use bytes::BytesMut; + +#[derive(PartialEq, Debug)] +pub struct G { + pub vals: Vec, +} + +impl ParseCommand for G { + fn parse(bytes: BytesMut) -> Option { + if bytes.is_empty() { + None + } else { + let vals = decode_hex_string(&bytes).ok()?; + Some(G { vals }) + } + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_h_upper.rs b/reverie-ptrace/src/gdbstub/commands/base/_h_upper.rs new file mode 100644 index 0000000..784a045 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_h_upper.rs @@ -0,0 +1,49 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use crate::gdbstub::{commands::*, hex::*}; +use bytes::BytesMut; + +#[derive(PartialEq, Debug)] +pub struct H { + pub op: ThreadOp, + pub id: ThreadId, +} + +impl ParseCommand for H { + fn parse(mut bytes: BytesMut) -> Option { + if bytes.is_empty() { + None + } else { + let (ch, bytes) = bytes.split_first_mut()?; + let op = match *ch { + b'c' => Some(ThreadOp::c), + b'g' => Some(ThreadOp::g), + b'G' => Some(ThreadOp::G), + b'm' => Some(ThreadOp::m), + b'M' => Some(ThreadOp::M), + _ => None, + }?; + if bytes == &b"-1"[..] { + Some(H { + op, + id: ThreadId::all(), + }) + } else if bytes == &b"0"[..] { + Some(H { + op, + id: ThreadId::any(), + }) + } else { + let thread_id = ThreadId::decode(bytes)?; + Some(H { op, id: thread_id }) + } + } + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_k.rs b/reverie-ptrace/src/gdbstub/commands/base/_k.rs new file mode 100644 index 0000000..c4ca938 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_k.rs @@ -0,0 +1,12 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use crate::gdbstub::{commands::*, hex::*}; + +pub struct k; diff --git a/reverie-ptrace/src/gdbstub/commands/base/_m.rs b/reverie-ptrace/src/gdbstub/commands/base/_m.rs new file mode 100644 index 0000000..8927cfa --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_m.rs @@ -0,0 +1,31 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use bytes::BytesMut; + +use crate::gdbstub::{commands::*, hex::*}; + +#[derive(PartialEq, Debug)] +pub struct m { + pub addr: u64, + pub length: usize, +} + +impl ParseCommand for m { + fn parse(mut bytes: BytesMut) -> Option { + if bytes.is_empty() { + None + } else { + let mut iter = bytes.split_mut(|c| *c == b','); + let addr = iter.next().and_then(|x| decode_hex(x).ok())?; + let length = iter.next().and_then(|x| decode_hex(x).ok())?; + Some(m { addr, length }) + } + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_m_upper.rs b/reverie-ptrace/src/gdbstub/commands/base/_m_upper.rs new file mode 100644 index 0000000..7a31e4f --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_m_upper.rs @@ -0,0 +1,36 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +use bytes::BytesMut; + +use crate::gdbstub::{commands::*, hex::*}; + +#[derive(PartialEq, Debug)] +pub struct M { + pub addr: u64, + pub length: usize, + pub vals: Vec, +} + +impl ParseCommand for M { + fn parse(mut bytes: BytesMut) -> Option { + let mut iter = bytes.split_mut(|c| *c == b',' || *c == b':'); + let addr = iter.next()?; + let len = iter.next()?; + let j = 2 + addr.len() + len.len(); + let addr = decode_hex(addr).ok()?; + let len = decode_hex(len).ok()?; + let vals = bytes.split_off(j); + Some(M { + addr, + length: len, + vals: decode_hex_string(&vals).ok()?, + }) + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_p.rs b/reverie-ptrace/src/gdbstub/commands/base/_p.rs new file mode 100644 index 0000000..69056b9 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_p.rs @@ -0,0 +1,14 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use crate::gdbstub::{commands::*, hex::*}; + +pub struct p { + pub reg_id: usize, +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_p_upper.rs b/reverie-ptrace/src/gdbstub/commands/base/_p_upper.rs new file mode 100644 index 0000000..7f509ec --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_p_upper.rs @@ -0,0 +1,16 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use crate::gdbstub::{commands::*, hex::*}; +use bytes::Bytes; + +pub struct P { + pub reg_id: usize, + pub val: Bytes, // could val size >= sizeof(usize)? SSE3? 
+} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_qAttached.rs b/reverie-ptrace/src/gdbstub/commands/base/_qAttached.rs new file mode 100644 index 0000000..9543694 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_qAttached.rs @@ -0,0 +1,30 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use crate::gdbstub::{commands::*, hex::*}; +use bytes::BytesMut; + +#[derive(PartialEq, Debug)] +pub struct qAttached { + pub pid: Option, +} + +impl ParseCommand for qAttached { + fn parse(mut bytes: BytesMut) -> Option { + if !bytes.starts_with(b":") { + None + } else { + let mut iter = bytes.split_mut(|c| *c == b':'); + let _ = iter.next()?; + Some(qAttached { + pid: iter.next().and_then(|x| decode_hex(x).ok()), + }) + } + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_qC.rs b/reverie-ptrace/src/gdbstub/commands/base/_qC.rs new file mode 100644 index 0000000..871d39c --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_qC.rs @@ -0,0 +1,20 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use crate::gdbstub::commands::*; +use bytes::BytesMut; + +#[derive(PartialEq, Debug)] +pub struct qC {} + +impl ParseCommand for qC { + fn parse(bytes: BytesMut) -> Option { + if !bytes.is_empty() { None } else { Some(qC {}) } + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_qSupported.rs b/reverie-ptrace/src/gdbstub/commands/base/_qSupported.rs new file mode 100644 index 0000000..d6f42d3 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_qSupported.rs @@ -0,0 +1,28 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use crate::gdbstub::{commands::*, hex::*}; +use bytes::{Bytes, BytesMut}; + +#[derive(PartialEq, Debug)] +pub struct qSupported { + pub features: Bytes, // use Features type here! +} + +impl ParseCommand for qSupported { + fn parse(bytes: BytesMut) -> Option { + if bytes.is_empty() { + None + } else { + Some(qSupported { + features: bytes.freeze(), + }) + } + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_qXfer.rs b/reverie-ptrace/src/gdbstub/commands/base/_qXfer.rs new file mode 100644 index 0000000..f170256 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_qXfer.rs @@ -0,0 +1,51 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use bytes::BytesMut; + +use crate::gdbstub::{commands::*, hex::*}; + +#[derive(PartialEq, Debug)] +pub enum qXfer { + FeaturesRead { offset: usize, len: usize }, + AuxvRead { offset: usize, len: usize }, +} + +impl ParseCommand for qXfer { + fn parse(mut bytes: BytesMut) -> Option { + if bytes.starts_with(b":features:read:") { + let mut iter = + bytes[b":features:read:".len()..].split_mut(|c| *c == b':' || *c == b','); + let annex = iter.next()?; + if annex != b"target.xml" { + return None; + } + let offset = iter.next()?; + let len = iter.next()?; + Some(qXfer::FeaturesRead { + offset: decode_hex(offset).ok()?, + len: decode_hex(len).ok()?, + }) + } else if bytes.starts_with(b":auxv:read:") { + let mut iter = bytes[b":auxv:read:".len()..].split_mut(|c| *c == b':' || *c == b','); + let annex = iter.next()?; + if annex != b"" { + return None; + } + let offset = iter.next()?; + let len = iter.next()?; + Some(qXfer::AuxvRead { + offset: decode_hex(offset).ok()?, + len: decode_hex(len).ok()?, + 
}) + } else { + None + } + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_qfThreadInfo.rs b/reverie-ptrace/src/gdbstub/commands/base/_qfThreadInfo.rs new file mode 100644 index 0000000..5b8cada --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_qfThreadInfo.rs @@ -0,0 +1,24 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use crate::gdbstub::{commands::*, hex::*}; +use bytes::BytesMut; + +#[derive(PartialEq, Debug)] +pub struct qfThreadInfo; + +impl ParseCommand for qfThreadInfo { + fn parse(bytes: BytesMut) -> Option { + if bytes.is_empty() { + Some(qfThreadInfo) + } else { + None + } + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_qsThreadInfo.rs b/reverie-ptrace/src/gdbstub/commands/base/_qsThreadInfo.rs new file mode 100644 index 0000000..166d8e8 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_qsThreadInfo.rs @@ -0,0 +1,24 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use crate::gdbstub::{commands::*, hex::*}; +use bytes::BytesMut; + +#[derive(PartialEq, Debug)] +pub struct qsThreadInfo; + +impl ParseCommand for qsThreadInfo { + fn parse(bytes: BytesMut) -> Option { + if bytes.is_empty() { + Some(qsThreadInfo) + } else { + None + } + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_s.rs b/reverie-ptrace/src/gdbstub/commands/base/_s.rs new file mode 100644 index 0000000..96f2bef --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_s.rs @@ -0,0 +1,15 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use crate::gdbstub::{commands::*, hex::*}; +use bytes::BytesMut; + +pub struct s { + pub addr: Option, +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_t_upper.rs b/reverie-ptrace/src/gdbstub/commands/base/_t_upper.rs new file mode 100644 index 0000000..a80b601 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_t_upper.rs @@ -0,0 +1,15 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use crate::gdbstub::{commands::*, hex::*}; +use bytes::BytesMut; + +pub struct T { + pub thread: ThreadId, +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_vCont.rs b/reverie-ptrace/src/gdbstub/commands/base/_vCont.rs new file mode 100644 index 0000000..ad6eac2 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_vCont.rs @@ -0,0 +1,76 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use bytes::BytesMut; + +use nix::sys::signal::Signal; + +use crate::gdbstub::{commands::*, hex::*}; + +#[derive(PartialEq, Debug)] +pub enum vCont { + Query, + Actions(Vec<(ResumeAction, ThreadId)>), +} + +impl ParseCommand for vCont { + fn parse(mut bytes: BytesMut) -> Option { + if bytes == b"?"[..] { + Some(vCont::Query) + } else if bytes.is_empty() { + None + } else { + let mut bytes = bytes.split_off(1); + // example packet: $vCont;s:p3e86d3.3e86d3;c:p3e86d3.-1#3b + // with prefix (`$vCont`) and checksum stripped. 
+ let actions: Vec<(ResumeAction, ThreadId)> = bytes + .split_mut(|c| *c == b';') + .filter_map(|act| { + let mut iter = act.split_mut(|c| *c == b':'); + let action = iter.next()?; + let thread_id = iter.next().and_then(|tid| ThreadId::decode(tid))?; + let action = if action.is_empty() { + None + } else { + match action[0] { + b'c' => Some(ResumeAction::Continue(None)), + b'C' => { + let sig = decode_hex::(&action[1..]) + .ok() + .and_then(|s| Signal::try_from(s).ok())?; + Some(ResumeAction::Continue(Some(sig))) + } + b's' => Some(ResumeAction::Step(None)), + b'S' => { + let sig = decode_hex::(&action[1..]) + .ok() + .and_then(|s| Signal::try_from(s).ok())?; + Some(ResumeAction::Step(Some(sig))) + } + b't' => Some(ResumeAction::Stop), + b'r' => { + let mut iter = action[1..].split_mut(|c| *c == b','); + let start: u64 = iter.next().and_then(|x| decode_hex(x).ok())?; + let end: u64 = iter.next().and_then(|x| decode_hex(x).ok())?; + Some(ResumeAction::StepUntil(start, end)) + } + _ => None, + } + }?; + Some((action, thread_id)) + }) + .collect(); + if actions.is_empty() { + None + } else { + Some(vCont::Actions(actions)) + } + } + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_vFile.rs b/reverie-ptrace/src/gdbstub/commands/base/_vFile.rs new file mode 100644 index 0000000..278535b --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_vFile.rs @@ -0,0 +1,149 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSDstyle license found in the + * LICENSE file in the root directory of this source tree. + */ + +use bytes::BytesMut; +use std::os::unix::ffi::OsStringExt; +use std::{ffi::OsString, path::PathBuf}; + +use nix::sys::stat::FileStat; + +use reverie::Pid; + +use crate::gdbstub::{commands::*, hex::*}; + +/// struct stat defined by gdb host i/o packet. This is *not* the same as +/// libc::stat or nix's FileStat (which is just libc::stat). 
+// NB: packed is needed to force size_of:: == 0x40. Otherwise +// gdb (client) would complain. +#[repr(packed(4))] +pub struct HostioStat { + st_dev: u32, + st_ino: u32, + st_mode: u32, + st_nlink: u32, + st_uid: u32, + st_gid: u32, + st_rdev: u32, + st_size: u64, + st_blksize: u64, + st_blocks: u64, + st_atime: u32, + st_mtime: u32, + st_ctime: u32, +} + +impl From for HostioStat { + fn from(stat: FileStat) -> HostioStat { + HostioStat { + st_dev: stat.st_dev as u32, + st_ino: stat.st_ino as u32, + st_nlink: stat.st_nlink as u32, + st_mode: stat.st_mode as u32, + st_uid: stat.st_uid, + st_gid: stat.st_gid, + st_rdev: stat.st_rdev as u32, + st_size: stat.st_size as u64, + st_blksize: stat.st_blksize as u64, + st_blocks: stat.st_blocks as u64, + st_atime: stat.st_atime as u32, + st_mtime: stat.st_mtime as u32, + st_ctime: stat.st_ctime as u32, + } + } +} + +#[derive(PartialEq, Debug)] +pub enum vFile { + Setfs(Option), + Open(PathBuf, i32, u32), + Close(i32), + Pread(i32, isize, isize), + Pwrite(i32, isize, Vec), + Fstat(i32), + Unlink(PathBuf), + Readlink(PathBuf), +} + +impl ParseCommand for vFile { + fn parse(mut bytes: BytesMut) -> Option { + if bytes.starts_with(b":setfs:") { + let pid: i32 = decode_hex(&bytes[b":setfs:".len()..]).ok()?; + Some(vFile::Setfs(if pid == 0 { None } else { Some(pid) })) + } else if bytes.starts_with(b":open:") { + let mut iter = bytes[b":open:".len()..].split_mut(|c| *c == b','); + let fname = iter.next().and_then(|s| decode_hex_string(s).ok())?; + let fname = PathBuf::from(OsString::from_vec(fname)); + let flags = iter.next().and_then(|s| decode_hex(s).ok())?; + let mode = iter.next().and_then(|s| decode_hex(s).ok())?; + Some(vFile::Open(fname, flags, mode)) + } else if bytes.starts_with(b":close:") { + let fd: i32 = decode_hex(&bytes[b":close:".len()..]).ok()?; + Some(vFile::Close(fd)) + } else if bytes.starts_with(b":pread:") { + let mut iter = bytes[b":pread:".len()..].split_mut(|c| *c == b','); + let fd = 
iter.next().and_then(|s| decode_hex(s).ok())?; + let count = iter.next().and_then(|s| decode_hex(s).ok())?; + let offset = iter.next().and_then(|s| decode_hex(s).ok())?; + Some(vFile::Pread(fd, count, offset)) + } else if bytes.starts_with(b":pwrite:") { + let mut iter = bytes[b":pwrite:".len()..].split_mut(|c| *c == b','); + let fd = iter.next().and_then(|s| decode_hex(s).ok())?; + let offset = iter.next().and_then(|s| decode_hex(s).ok())?; + let bytes = iter.next().and_then(|s| decode_binary_string(s).ok())?; + Some(vFile::Pwrite(fd, offset, bytes)) + } else if bytes.starts_with(b":fstat:") { + let fd: i32 = decode_hex(&bytes[b":fstat:".len()..]).ok()?; + Some(vFile::Fstat(fd)) + } else if bytes.starts_with(b":unlink:") { + let fname = bytes.split_off(b":unlink:".len()); + let fname = decode_hex_string(&fname).ok()?; + let fname = PathBuf::from(OsString::from_vec(fname)); + Some(vFile::Unlink(fname)) + } else if bytes.starts_with(b":readlink:") { + let fname = bytes.split_off(b":readlink:".len()); + let fname = decode_hex_string(&fname).ok()?; + let fname = PathBuf::from(OsString::from_vec(fname)); + Some(vFile::Readlink(fname)) + } else { + None + } + } +} + +#[cfg(test)] +mod test { + use super::*; + use std::mem; + + #[test] + fn hostio_stat_size_check() { + assert_eq!(mem::size_of::(), 0x40); + } + + #[test] + fn hostio_sanity() { + // NB: `vFile` prefix is stripped prior. 
+ assert_eq!( + vFile::parse(BytesMut::from(&b":open:6a7573742070726f62696e67,0,1c0"[..])), + Some(vFile::Open(PathBuf::from("just probing"), 0x0, 0x1c0)) + ); + assert_eq!( + vFile::parse(BytesMut::from(&b":pread:b,1000,0"[..])), + Some(vFile::Pread(0xb, 0x1000, 0x0)) + ); + assert_eq!( + vFile::parse(BytesMut::from(&b":unlink:6a7573742070726f62696e67"[..])), + Some(vFile::Unlink(PathBuf::from("just probing"))) + ); + assert_eq!( + vFile::parse(BytesMut::from(&b":readlink:6a7573742070726f62696e67"[..])), + Some(vFile::Readlink(PathBuf::from("just probing"))) + ); + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_vKill.rs b/reverie-ptrace/src/gdbstub/commands/base/_vKill.rs new file mode 100644 index 0000000..ef1261e --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_vKill.rs @@ -0,0 +1,31 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use bytes::BytesMut; + +use crate::gdbstub::{commands::*, hex::*}; +use reverie::Pid; + +#[derive(PartialEq, Debug)] +pub struct vKill { + pub pid: Pid, +} + +impl ParseCommand for vKill { + fn parse(bytes: BytesMut) -> Option { + if !bytes.starts_with(b";") { + None + } else { + let pid = decode_hex(&bytes[1..]).ok()?; + Some(vKill { + pid: Pid::from_raw(pid), + }) + } + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_x_upper.rs b/reverie-ptrace/src/gdbstub/commands/base/_x_upper.rs new file mode 100644 index 0000000..108dac2 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_x_upper.rs @@ -0,0 +1,72 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +use bytes::BytesMut; + +use crate::gdbstub::{commands::*, hex::*}; + +#[derive(PartialEq, Debug)] +pub struct X { + pub addr: u64, + pub length: usize, + pub vals: Vec, +} + +impl ParseCommand for X { + fn parse(mut bytes: BytesMut) -> Option { + let mut first_colon = None; + let mut index = 0; + for &b in &bytes { + if b == b':' { + first_colon = Some(index); + break; + } else { + index += 1; + } + } + + let (addr_len, vals) = bytes.split_at_mut(first_colon?); + let mut iter = addr_len.split_mut(|c| *c == b','); + let addr = iter.next().and_then(|s| decode_hex(s).ok())?; + let len = iter.next().and_then(|s| decode_hex(s).ok())?; + Some(X { + addr, + length: len, + vals: decode_binary_string(&vals[1..]).ok()?, + }) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn can_parse_X_special() { + // Sending packet: $X216eb0,4:,\000\000\000#ae...Packet received: OK + assert_eq!( + X::parse(BytesMut::from("216eb0,4:,\0\0\0")), + Some(X { + addr: 0x216eb0, + length: 4, + vals: vec![0x2c, 0x0, 0x0, 0x0], + }) + ); + + // Sending packet: $X216eb0,4::\000\000\000#bc...Packet received: OK + assert_eq!( + X::parse(BytesMut::from("216eb0,4::\0\0\0")), + Some(X { + addr: 0x216eb0, + length: 4, + vals: vec![0x3a, 0x0, 0x0, 0x0], + }) + ); + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_z.rs b/reverie-ptrace/src/gdbstub/commands/base/_z.rs new file mode 100644 index 0000000..6654809 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_z.rs @@ -0,0 +1,33 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +use bytes::BytesMut; + +use crate::gdbstub::{commands::*, hex::*}; + +#[derive(PartialEq, Debug)] +pub struct z { + pub ty: BreakpointType, + pub addr: u64, + pub kind: u8, +} + +impl ParseCommand for z { + fn parse(mut bytes: BytesMut) -> Option { + let mut iter = bytes.split_mut(|c| *c == b','); + let ty = iter + .next() + .and_then(|s| decode_hex(s).ok()) + .and_then(BreakpointType::new)?; + let addr = iter.next().and_then(|s| decode_hex(s).ok())?; + let kind = iter.next().and_then(|s| decode_hex(s).ok())?; + + Some(z { ty, addr, kind }) + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/_z_upper.rs b/reverie-ptrace/src/gdbstub/commands/base/_z_upper.rs new file mode 100644 index 0000000..138a473 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/_z_upper.rs @@ -0,0 +1,33 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use bytes::BytesMut; + +use crate::gdbstub::{commands::*, hex::*}; + +#[derive(PartialEq, Debug)] +pub struct Z { + pub ty: BreakpointType, + pub addr: u64, + pub kind: u8, + // NB: conditional bkpt here? +} +impl ParseCommand for Z { + fn parse(mut bytes: BytesMut) -> Option { + let mut iter = bytes.split_mut(|c| *c == b','); + let ty = iter + .next() + .and_then(|s| decode_hex(s).ok()) + .and_then(BreakpointType::new)?; + let addr = iter.next().and_then(|s| decode_hex(s).ok())?; + let kind = iter.next().and_then(|s| decode_hex(s).ok())?; + + Some(Z { ty, addr, kind }) + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/base/mod.rs b/reverie-ptrace/src/gdbstub/commands/base/mod.rs new file mode 100644 index 0000000..0e6b248 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/base/mod.rs @@ -0,0 +1,64 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +mod _QuestionMark; +//mod _c; +mod _d_upper; +mod _g; +mod _g_upper; +mod _h_upper; +//mod _k; +mod _m; +mod _m_upper; +//mod _p; +//mod _p_upper; +mod _QStartNoAckMode; +mod _QThreadEvents; +mod _qAttached; +mod _qC; +mod _qSupported; +mod _qXfer; +mod _qfThreadInfo; +mod _qsThreadInfo; +//mod _s; +//mod _t_upper; +mod _vCont; +mod _vFile; +mod _vKill; +mod _x_upper; +mod _z; +mod _z_upper; + +pub use _QuestionMark::*; +//pub use _c::*; +pub use _d_upper::*; +pub use _g::*; +pub use _g_upper::*; +pub use _h_upper::*; +//pub use _k::*; +pub use _m::*; +pub use _m_upper::*; +//pub use _p::*; +//pub use _p_upper::*; +pub use _QStartNoAckMode::*; +pub use _QThreadEvents::*; +pub use _qAttached::*; +pub use _qC::*; +pub use _qSupported::*; +pub use _qXfer::*; +pub use _qfThreadInfo::*; +pub use _qsThreadInfo::*; +//pub use _s::*; +//pub use _t_upper::*; +pub use _vCont::*; +pub use _vFile::*; +pub use _vKill::*; +pub use _x_upper::*; +pub use _z::*; +pub use _z_upper::*; diff --git a/reverie-ptrace/src/gdbstub/commands/extended_mode/_ExclamationMark.rs b/reverie-ptrace/src/gdbstub/commands/extended_mode/_ExclamationMark.rs new file mode 100644 index 0000000..0e51499 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/extended_mode/_ExclamationMark.rs @@ -0,0 +1,24 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+
+use crate::gdbstub::{commands::*, hex::*};
+use bytes::BytesMut;
+
+/// The `!` request. It carries no payload.
+#[derive(PartialEq, Debug)]
+pub struct ExclamationMark;
+
+impl ParseCommand for ExclamationMark {
+    /// Accepts only an empty remainder after the `!` prefix.
+    fn parse(bytes: BytesMut) -> Option<Self> {
+        bytes.is_empty().then(|| ExclamationMark)
+    }
+}
diff --git a/reverie-ptrace/src/gdbstub/commands/extended_mode/_QDisableRandomization.rs b/reverie-ptrace/src/gdbstub/commands/extended_mode/_QDisableRandomization.rs
new file mode 100644
index 0000000..87062b7
--- /dev/null
+++ b/reverie-ptrace/src/gdbstub/commands/extended_mode/_QDisableRandomization.rs
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+use crate::gdbstub::{commands::*, hex::*};
+
+/// A parsed `QDisableRandomization` request carrying its boolean argument.
+#[derive(Debug, PartialEq)]
+pub struct QDisableRandomization {
+    pub val: bool,
+}
+
+impl ParseCommand for QDisableRandomization {
+    /// Accepts exactly `:0` (false) or `:1` (true); anything else is `None`.
+    fn parse(bytes: BytesMut) -> Option<Self> {
+        let val = match &bytes[..] {
+            b":0" => false,
+            b":1" => true,
+            _ => return None,
+        };
+        Some(QDisableRandomization { val })
+    }
+}
diff --git a/reverie-ptrace/src/gdbstub/commands/extended_mode/_QEnvironmentHexEncoded.rs b/reverie-ptrace/src/gdbstub/commands/extended_mode/_QEnvironmentHexEncoded.rs
new file mode 100644
index 0000000..2559ca8
--- /dev/null
+++ b/reverie-ptrace/src/gdbstub/commands/extended_mode/_QEnvironmentHexEncoded.rs
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+use crate::gdbstub::{commands::*, hex::*};
+use bytes::Bytes;
+
+/// Payload type for a `QEnvironmentHexEncoded` request: a key and an
+/// optional value.
+///
+/// No `ParseCommand` impl is defined in this file.
+// NOTE(review): the type argument of `Option` appears to have been lost
+// (presumably `Bytes`, like `key`) -- confirm against upstream.
+pub struct QEnvironmentHexEncoded {
+    pub key: Bytes,
+    pub value: Option,
+}
diff --git a/reverie-ptrace/src/gdbstub/commands/extended_mode/_QEnvironmentReset.rs b/reverie-ptrace/src/gdbstub/commands/extended_mode/_QEnvironmentReset.rs
new file mode 100644
index 0000000..b6556e4
--- /dev/null
+++ b/reverie-ptrace/src/gdbstub/commands/extended_mode/_QEnvironmentReset.rs
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+use crate::gdbstub::{commands::*, hex::*};
+
+/// Marker type for a `QEnvironmentReset` request; it has no fields and no
+/// `ParseCommand` impl in this file.
+pub struct QEnvironmentReset;
diff --git a/reverie-ptrace/src/gdbstub/commands/extended_mode/_QEnvironmentUnset.rs b/reverie-ptrace/src/gdbstub/commands/extended_mode/_QEnvironmentUnset.rs
new file mode 100644
index 0000000..eef9eae
--- /dev/null
+++ b/reverie-ptrace/src/gdbstub/commands/extended_mode/_QEnvironmentUnset.rs
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+use crate::gdbstub::{commands::*, hex::*};
+use bytes::Bytes;
+
+/// Payload type for a `QEnvironmentUnset` request: the key to unset.
+/// No `ParseCommand` impl is defined in this file.
+pub struct QEnvironmentUnset {
+    pub key: Bytes,
+}
diff --git a/reverie-ptrace/src/gdbstub/commands/extended_mode/_QSetWorkingDir.rs b/reverie-ptrace/src/gdbstub/commands/extended_mode/_QSetWorkingDir.rs
new file mode 100644
index 0000000..7b30c06
--- /dev/null
+++ b/reverie-ptrace/src/gdbstub/commands/extended_mode/_QSetWorkingDir.rs
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+use crate::gdbstub::{commands::*, hex::*};
+use bytes::Bytes;
+
+/// Payload type for a `QSetWorkingDir` request: an optional directory.
+/// No `ParseCommand` impl is defined in this file.
+// NOTE(review): the type argument of `Option` appears to have been lost
+// (presumably `Bytes`, the only type imported here) -- confirm against upstream.
+pub struct QSetWorkingDir {
+    pub dir: Option,
+}
diff --git a/reverie-ptrace/src/gdbstub/commands/extended_mode/_QStartupWithShell.rs b/reverie-ptrace/src/gdbstub/commands/extended_mode/_QStartupWithShell.rs
new file mode 100644
index 0000000..0c922d2
--- /dev/null
+++ b/reverie-ptrace/src/gdbstub/commands/extended_mode/_QStartupWithShell.rs
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+use crate::gdbstub::{commands::*, hex::*};
+use bytes::BytesMut;
+
+/// Payload type for a `QStartupWithShell` request: a single boolean.
+/// No `ParseCommand` impl is defined in this file.
+pub struct QStartupWithShell {
+    pub val: bool,
+}
diff --git a/reverie-ptrace/src/gdbstub/commands/extended_mode/_r_upper.rs b/reverie-ptrace/src/gdbstub/commands/extended_mode/_r_upper.rs
new file mode 100644
index 0000000..0d08bd6
--- /dev/null
+++ b/reverie-ptrace/src/gdbstub/commands/extended_mode/_r_upper.rs
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+use crate::gdbstub::{commands::*, hex::*};
+use bytes::BytesMut;
+
+/// Marker type for an `R` request; it has no fields and no `ParseCommand`
+/// impl in this file.
+pub struct R;
diff --git a/reverie-ptrace/src/gdbstub/commands/extended_mode/_vAttach.rs b/reverie-ptrace/src/gdbstub/commands/extended_mode/_vAttach.rs
new file mode 100644
index 0000000..f5748c0
--- /dev/null
+++ b/reverie-ptrace/src/gdbstub/commands/extended_mode/_vAttach.rs
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+use crate::gdbstub::{commands::*, hex::*};
+use bytes::BytesMut;
+use reverie::Pid;
+
+/// Payload type for a `vAttach` request: the pid to attach to.
+/// No `ParseCommand` impl is defined in this file.
+pub struct vAttach {
+    pub pid: Pid,
+}
diff --git a/reverie-ptrace/src/gdbstub/commands/extended_mode/_vRun.rs b/reverie-ptrace/src/gdbstub/commands/extended_mode/_vRun.rs
new file mode 100644
index 0000000..5afc7be
--- /dev/null
+++ b/reverie-ptrace/src/gdbstub/commands/extended_mode/_vRun.rs
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+use crate::gdbstub::{commands::*, hex::*};
+use bytes::Bytes;
+
+/// Payload type for a `vRun` request: an optional filename plus raw
+/// argument bytes. No `ParseCommand` impl is defined in this file.
+// NOTE(review): the type argument of `Option` appears to have been lost
+// (presumably `Bytes`) -- confirm against upstream.
+pub struct vRun {
+    pub filename: Option,
+    pub args: Bytes, // use Args type here!
+}
diff --git a/reverie-ptrace/src/gdbstub/commands/extended_mode/mod.rs b/reverie-ptrace/src/gdbstub/commands/extended_mode/mod.rs
new file mode 100644
index 0000000..2088a9f
--- /dev/null
+++ b/reverie-ptrace/src/gdbstub/commands/extended_mode/mod.rs
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Only the two modules below are compiled; the commented-out ones are not
+// part of the build.
+mod _ExclamationMark;
+mod _QDisableRandomization;
+//mod _QEnvironmentHexEncoded;
+//mod _QEnvironmentReset;
+//mod _QEnvironmentUnset;
+//mod _QSetWorkingDir;
+//mod _QStartupWithShell;
+//mod _r_upper;
+//mod _vAttach;
+//mod _vRun;
+
+pub use _ExclamationMark::*;
+pub use _QDisableRandomization::*;
+//pub use _QEnvironmentHexEncoded::*;
+//pub use _QEnvironmentReset::*;
+//pub use _QEnvironmentUnset::*;
+//pub use _QSetWorkingDir::*;
+//pub use _QStartupWithShell::*;
+//pub use _r_upper::*;
+//pub use _vAttach::*;
+//pub use _vRun::*;
diff --git a/reverie-ptrace/src/gdbstub/commands/mod.rs b/reverie-ptrace/src/gdbstub/commands/mod.rs
new file mode 100644
index 0000000..977c0bf
--- /dev/null
+++ b/reverie-ptrace/src/gdbstub/commands/mod.rs
@@ -0,0 +1,589 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#![allow(non_snake_case, non_camel_case_types, dead_code, unused_imports)]
+
+use crate::gdbstub::{
+    hex::*, request::*, response::*, BreakpointType, Inferior, InferiorThreadId, ResumeInferior,
+    StoppedInferior,
+};
+use crate::trace::{ChildOp, Stopped};
+use bytes::{Bytes, BytesMut};
+use paste::paste;
+use std::{collections::BTreeMap, path::PathBuf};
+use thiserror::Error;
+use tokio::sync::{broadcast, mpsc, oneshot};
+
+use reverie::{ExitStatus, Pid, Signal};
+
+mod base;
+mod extended_mode;
+mod monitor_cmd;
+mod section_offsets;
+
+pub use base::*;
+pub use extended_mode::*;
+pub use monitor_cmd::*;
+pub use section_offsets::*;
+
+/// Implemented by every command type: parse the bytes that follow the
+/// command's name prefix, returning `None` when they are malformed.
+// NOTE(review): the return type reads `Option` with no type argument; the
+// argument (presumably `Self`) appears to have been lost -- confirm against
+// upstream.
+trait ParseCommand: Sized {
+    fn parse(buff: BytesMut) -> Option;
+}
+
+/// One component (process id or thread id) of a gdb thread-id.
+#[derive(PartialEq, Debug, Clone, Copy)]
+pub enum IdKind {
+    // all threads: `-1'.
+    All,
+    // any thread: `0'.
+    Any,
+    Id(Pid),
+}
+
+impl IdKind {
+    /// Maps `-1` to `All`, `0` to `Any`, and every other value to `Id`.
+    pub fn from_raw(pid: i32) -> Self {
+        match pid {
+            -1 => IdKind::All,
+            0 => IdKind::Any,
+            _ => IdKind::Id(Pid::from_raw(pid)),
+        }
+    }
+
+    /// Inverse of [`IdKind::from_raw`]: `All` is `-1`, `Any` is `0`, `Id` is
+    /// the raw pid.
+    #[allow(clippy::wrong_self_convention)]
+    pub fn into_raw(&self) -> i32 {
+        match self {
+            IdKind::All => -1,
+            IdKind::Any => 0,
+            IdKind::Id(pid) => pid.as_raw(),
+        }
+    }
+
+    /// Returns whether `self` and `other` are compatible: `All`/`Any` on
+    /// either side match anything, and two `Id`s match only when equal.
+    pub fn matches(&self, other: &IdKind) -> bool {
+        match (self, &other) {
+            (IdKind::All, _) => true,
+            (IdKind::Any, _) => true,
+            (IdKind::Id(pid1), IdKind::Id(pid2)) => pid1 == pid2,
+            // `other` is `All` or `Any` here, so the swapped call hits one of
+            // the first two arms and does not recurse further.
+            (IdKind::Id(_), _) => other.matches(self),
+        }
+    }
+}
+
+impl WriteResponse for IdKind {
+    /// Writes `-1` for `All`, `0` for `Any`, and the raw pid via `put_num`
+    /// otherwise.
+    fn write_response(&self, writer: &mut ResponseWriter) {
+        match self {
+            IdKind::All => writer.put_str("-1"),
+            IdKind::Any => writer.put_str("0"),
+            IdKind::Id(pid) => writer.put_num(pid.as_raw()),
+        }
+    }
+}
+
+#[derive(PartialEq, Debug)]
+pub enum ThreadOp {
+    c, // step and continue, deprecated because of `vCont'
+    g, // Other operations
+    G,
+    m,
+    M,
+}
+
+/// Gdb ThreadId. See https://sourceware.org/gdb/onlinedocs/gdb/Packets.html#thread_002did-syntax
+/// for more details.
+#[derive(PartialEq, Debug, Clone, Copy)]
+pub struct ThreadId {
+    pub pid: IdKind,
+    pub tid: IdKind,
+}
+
+impl ThreadId {
+    /// Both components set to `All`.
+    pub fn all() -> Self {
+        ThreadId {
+            tid: IdKind::All,
+            pid: IdKind::All,
+        }
+    }
+
+    /// `pid` is `Any`, `tid` is `All`.
+    // NOTE(review): `tid` is `All` rather than `Any` here -- confirm this is
+    // intentional.
+    pub fn any() -> Self {
+        ThreadId {
+            tid: IdKind::All,
+            pid: IdKind::Any,
+        }
+    }
+
+    /// The given process (via [`IdKind::from_raw`]) with `tid` set to `All`.
+    pub fn pid(pid: i32) -> Self {
+        ThreadId {
+            tid: IdKind::All,
+            pid: IdKind::from_raw(pid),
+        }
+    }
+
+    /// Builds both components via [`IdKind::from_raw`].
+    pub fn pid_tid(pid: i32, tid: i32) -> Self {
+        ThreadId {
+            pid: IdKind::from_raw(pid),
+            tid: IdKind::from_raw(tid),
+        }
+    }
+
+    /// Decodes `p<pid>[.<tid>]`, where `pid` and `tid` are hex and `tid` may
+    /// also be the literal `-1`.
+    ///
+    /// Returns `None` when the leading `p` is missing or `pid` fails to
+    /// decode. A missing *or undecodable* `tid` yields `ThreadId::pid(pid)`.
+    // NB: Specifying just a process, as ‘ppid’, is equivalent to ‘ppid.-1’.
+    pub fn decode(bytes: &[u8]) -> Option {
+        if !bytes.starts_with(b"p") {
+            return None;
+        }
+        let mut iter = bytes[1..].split(|c| *c == b'.');
+        let p = iter.next().and_then(|x| decode_hex(x).ok())?;
+        Some(
+            match iter.next().and_then(|x| {
+                if x == &b"-1"[..] {
+                    Some(-1)
+                } else {
+                    decode_hex(x).ok()
+                }
+            }) {
+                Some(t) => ThreadId::pid_tid(p, t),
+                None => ThreadId::pid(p),
+            },
+        )
+    }
+
+    /// Check if `tid` matches `ThreadId`.
+    pub fn matches(&self, other: &ThreadId) -> bool {
+        self.pid.matches(&other.pid) && self.tid.matches(&other.tid)
+    }
+
+    /// The process id, or `None` when the raw value is not positive
+    /// (i.e. `All` or `Any`).
+    pub fn getpid(&self) -> Option {
+        let id = self.pid.into_raw();
+        if id > 0 {
+            Some(Pid::from_raw(id))
+        } else {
+            None
+        }
+    }
+
+    /// The thread id, or `None` when the raw value is not positive
+    /// (i.e. `All` or `Any`).
+    pub fn gettid(&self) -> Option {
+        let id = self.tid.into_raw();
+        if id > 0 {
+            Some(Pid::from_raw(id))
+        } else {
+            None
+        }
+    }
+}
+
+impl WriteResponse for ThreadId {
+    /// Writes `p<pid>.<tid>`.
+    fn write_response(&self, writer: &mut ResponseWriter) {
+        writer.put_str("p");
+        self.pid.write_response(writer);
+        writer.put_str(".");
+        self.tid.write_response(writer);
+    }
+}
+
+// Generates, from a table of `"prefix" => Type` entries grouped by module:
+//
+// * one enum per group, named after the group in CamelCase, with one variant
+//   per command type (`$command(self::$ext::$command)`);
+// * the top-level enum `$Name`, with one variant per group plus
+//   `Unknown(Bytes)`;
+// * `Command::try_parse`, which walks the entries in declaration order and
+//   hands the bytes after the first matching prefix to that type's `parse`.
+//
+// Because matching is by prefix and the first hit returns, the order of the
+// entries in the invocation matters. An input that matches no prefix becomes
+// `Command::Unknown`; a matching prefix whose `parse` returns `None` becomes
+// `CommandParseError::MalformedCommand(<prefix>)`.
+//
+// NOTE(review): the `impl` is written against the literal name `Command`
+// rather than `$Name`, so the macro only works for an enum called `Command`.
+macro_rules! commands {
+    (
+        $(#[$attrs:meta])*
+        $vis:vis enum $Name:ident {
+            $(
+                $(#[$ext_attrs:meta])*
+                $ext:ident {
+                    $($name:literal => $command:ident,)*
+                }
+            )*
+        }
+    ) => {paste! {
+        $(
+            #[allow(non_camel_case_types)]
+            #[derive(PartialEq, Debug)]
+            $(#[$ext_attrs])*
+            $vis enum [<$ext:camel>] {
+                $($command(self::$ext::$command),)*
+            }
+        )*
+
+        /// GDB commands
+        $(#[$attrs])*
+        $vis enum $Name {
+            $(
+                [<$ext:camel>]([<$ext:camel>]),
+            )*
+            Unknown(Bytes),
+        }
+
+        impl Command {
+            pub fn try_parse(
+                mut buf: BytesMut
+            ) -> Result {
+                if buf.is_empty() {
+                    return Err(CommandParseError::Empty);
+                }
+
+                let body = buf.as_ref();
+
+                $(
+                    match body {
+                        $(_ if body.starts_with($name.as_bytes()) => {
+                            let nb = $name.as_bytes().len();
+                            let cmd = self::$ext::$command::parse(buf.split_off(nb))
+                                .ok_or(CommandParseError::MalformedCommand(String::from(concat!($name))))?;
+
+                            return Ok(
+                                Command::[<$ext:camel>](
+                                    [<$ext:camel>]::$command(cmd)
+                                )
+                            )
+                        })*
+                        _ => {},
+                    }
+                )*
+
+                Ok(Command::Unknown(buf.freeze()))
+            }
+        }
+    }};
+}
+
+/// Command parse error
+#[derive(Debug, PartialEq, Error)]
+pub enum CommandParseError {
+    /// Command is empty
+    #[error("Command is empty")]
+    Empty,
+
+    /// Malformed command
+    #[error("Malformed command: {}", .0)]
+    MalformedCommand(String),
+
+    /// Malformed registers in a g/G packet
+    #[error("Malformed registers found from g/G packet")]
+    MalformedRegisters,
+}
+
+// The command table. Each entry maps a packet-name prefix to the type whose
+// `parse` receives the remaining bytes; groups correspond to the sub-modules
+// of the same name.
+commands! {
+    #[derive(PartialEq, Debug)]
+    pub enum Command {
+        base {
+            "?" => QuestionMark,
+            "D" => D,
+            "g" => g,
+            "G" => G,
+            "H" => H,
+            "m" => m,
+            "M" => M,
+            "qAttached" => qAttached,
+            "QThreadEvents" => QThreadEvents,
+            "qC" => qC,
+            "qfThreadInfo" => qfThreadInfo,
+            "QStartNoAckMode" => QStartNoAckMode,
+            "qsThreadInfo" => qsThreadInfo,
+            "qSupported" => qSupported,
+            "qXfer" => qXfer,
+            "vCont" => vCont,
+            "vKill" => vKill,
+            "z" => z,
+            "Z" => Z,
+            "X" => X,
+            /* host i/o commands */
+            "vFile" => vFile,
+        }
+
+        extended_mode {
+            "!" => ExclamationMark,
+            "QDisableRandomization" => QDisableRandomization,
+        }
+
+        monitor_cmd {
+            "qRcmd" => qRcmd,
+        }
+
+        section_offsets {
+            "qOffsets" => qOffsets,
+        }
+    }
+}
+
+/// Resume actions set by vCont.
+#[derive(PartialEq, Clone, Copy, Debug)]
+pub enum ResumeAction {
+    /// Single step, with optional signal.
+    Step(Option),
+    /// Continue, with optional signal.
+    Continue(Option),
+    /// Stop, not sure what it means exactly.
+    Stop,
+    /// Keep stepping until rip doesn't belong to start..=end.
+    StepUntil(u64, u64),
+}
+
+/// Replay log used by reverse debugging.
+#[derive(PartialEq, Clone, Copy, Debug)]
+pub enum ReplayLog {
+    /// Replay log reached the beginning.
+    Begin,
+    /// Replay log reached the end.
+    End,
+}
+
+/// Expediated registers. Stop reply packets (as to vCont) can have extra
+/// registers, so that gdb doesn't have to read registers unless necessary.
+/// On amd64, they're %rbp, %rsp and %rip.
+#[derive(PartialEq, Clone, Debug)]
+pub struct ExpediatedRegs(BTreeMap);
+
+impl From for ExpediatedRegs {
+    /// Picks `rbp`, `rsp` and `rip` out of a full register set and stores
+    /// them under the keys 6, 7 and 0x10 respectively.
+    // NOTE(review): the keys look like gdb register numbers for amd64 --
+    // confirm against the target description in use.
+    fn from(regs: libc::user_regs_struct) -> Self {
+        let mut exp_regs = BTreeMap::new();
+        exp_regs.insert(6, regs.rbp);
+        exp_regs.insert(7, regs.rsp);
+        exp_regs.insert(0x10, regs.rip);
+        ExpediatedRegs(exp_regs)
+    }
+}
+
+/// Why a task stopped, as carried by [`StoppedTask`].
+#[derive(PartialEq, Clone, Debug)]
+pub enum StopEvent {
+    /// Stopped by signal.
+    Signal(Signal),
+    /// Stopped by software breakpoint.
+    SwBreak,
+    /// Stopped due to vforkdone event.
+    Vforkdone,
+    /// Replay reached either begin or end.
+    ReplayDone(ReplayLog),
+    /// Stopped due to exec event.
+    Exec(PathBuf),
+}
+
+/// A stopped task together with the reason and its expedited registers.
+#[derive(Debug, Clone)]
+pub struct StoppedTask {
+    /// Pid of the event (SYS_gettid)
+    pub pid: Pid,
+    /// Thread Group id of the event (SYS_getpid)
+    pub tgid: Pid,
+    /// Stop event
+    pub event: StopEvent,
+    /// Expediated registers specified by gdb remote protocol
+    pub regs: ExpediatedRegs,
+}
+
+/// A fork/vfork/clone event: the parent task, the new child, and the
+/// optional channels associated with it.
+// NOTE(review): the element types of the three `Option` channel fields below
+// appear to have been lost (`Option>`) -- confirm against upstream.
+#[derive(Debug)]
+pub struct NewTask {
+    /// Pid of the event (SYS_gettid)
+    pub pid: Pid,
+    /// Thread Group id of the event (SYS_getpid)
+    pub tgid: Pid,
+    /// New child Pid
+    pub child: Pid,
+    /// Expediated registers specified by gdb remote protocol
+    pub regs: ExpediatedRegs,
+    /// Clone type
+    pub op: ChildOp,
+    /// channel to send gdb request
+    pub request_tx: Option>,
+    /// channel to send gdb resume request
+    pub resume_tx: Option>,
+    /// channel to receive new gdb stop event
+    pub stop_rx: Option>,
+}
+
+/// Reasons why inferior has stopped, reported to gdb (client).
+/// See section ["Stop Reply Packets"]
+/// (https://sourceware.org/gdb/onlinedocs/gdb/Stop-Reply-Packets.html#Stop-Reply-Packets)
+/// for more details.
+#[derive(Debug)]
+pub enum StopReason {
+    /// A task stopped (signal, breakpoint, exec, ...).
+    Stopped(StoppedTask),
+    /// A task forked, vforked or cloned.
+    NewTask(NewTask),
+    /// A process exited: its pid and exit status.
+    Exited(Pid, ExitStatus),
+    /// A thread exited: `(pid, tgid, exit_status)`, in the argument order of
+    /// [`StopReason::thread_exited`].
+    ThreadExited(Pid, Pid, ExitStatus),
+}
+
+impl StopReason {
+    /// Builds a [`StopReason::Stopped`] from its parts.
+    pub fn stopped(pid: Pid, tgid: Pid, event: StopEvent, regs: ExpediatedRegs) -> Self {
+        StopReason::Stopped(StoppedTask {
+            pid,
+            tgid,
+            event,
+            regs,
+        })
+    }
+
+    /// Builds a [`StopReason::NewTask`] from its parts.
+    // FIXME: Reduce number of arguments.
+    #[allow(clippy::too_many_arguments)]
+    pub fn new_task(
+        pid: Pid,
+        tgid: Pid,
+        child: Pid,
+        regs: ExpediatedRegs,
+        op: ChildOp,
+        request_tx: Option>,
+        resume_tx: Option>,
+        stop_rx: Option>,
+    ) -> Self {
+        StopReason::NewTask(NewTask {
+            pid,
+            tgid,
+            child,
+            regs,
+            op,
+            request_tx,
+            resume_tx,
+            stop_rx,
+        })
+    }
+
+    /// Builds a [`StopReason::ThreadExited`].
+    pub fn thread_exited(pid: Pid, tgid: Pid, exit_status: ExitStatus) -> Self {
+        StopReason::ThreadExited(pid, tgid, exit_status)
+    }
+
+    /// Builds a [`StopReason::Exited`].
+    pub fn exit(pid: Pid, exit_status: ExitStatus) -> Self {
+        StopReason::Exited(pid, exit_status)
+    }
+}
+
+impl WriteResponse for StopReason {
+    /// Serializes the stop reason as a stop-reply packet body.
+    ///
+    /// * `NewTask` / `Stopped`: `T05`, an optional `name:value;` describing
+    ///   the event, one `regno:hexbytes;` per expedited register, then
+    ///   `thread:p<tgid>.<pid>;`.
+    /// * `Exited`: `W<code>` or `X<signal>` followed by `;process:<pid>`.
+    /// * `ThreadExited`: `w<code>;p<tgid>.<pid>`.
+    fn write_response(&self, writer: &mut ResponseWriter) {
+        match self {
+            StopReason::NewTask(new_task) => {
+                writer.put_str("T05");
+                match new_task.op {
+                    ChildOp::Fork => {
+                        // T05fork:p21feb6.21feb6;06:30dcffffff7f0000;07:10dcffffff7f0000;10:37c2ecf7ff7f0000;thread:p21f994.21f994;core:10;
+                        // A forked child is reported with pid == tid == child.
+                        let thread_id = ThreadId {
+                            pid: IdKind::from_raw(new_task.child.as_raw()),
+                            tid: IdKind::from_raw(new_task.child.as_raw()),
+                        };
+                        writer.put_str("fork:");
+                        thread_id.write_response(writer);
+                        writer.put_str(";");
+                    }
+                    ChildOp::Vfork => {
+                        // Same id layout as `Fork`, different event name.
+                        let thread_id = ThreadId {
+                            pid: IdKind::from_raw(new_task.child.as_raw()),
+                            tid: IdKind::from_raw(new_task.child.as_raw()),
+                        };
+                        writer.put_str("vfork:");
+                        thread_id.write_response(writer);
+                        writer.put_str(";");
+                    }
+                    ChildOp::Clone => {
+                        // A cloned thread is reported under the parent's
+                        // thread group id.
+                        let thread_id = ThreadId {
+                            pid: IdKind::from_raw(new_task.tgid.as_raw()),
+                            tid: IdKind::from_raw(new_task.child.as_raw()),
+                        };
+                        writer.put_str("create:");
+                        thread_id.write_response(writer);
+                        writer.put_str(";");
+                    }
+                }
+                // NOTE(review): `®val` below looks like a mangled `&regval`
+                // -- confirm against upstream.
+                for (regno, regval) in &new_task.regs.0 {
+                    writer.put_num(*regno);
+                    writer.put_str(":");
+                    writer.put_hex_encoded(®val.to_ne_bytes());
+                    writer.put_str(";");
+                }
+                let thread_id = ThreadId::pid_tid(new_task.tgid.as_raw(), new_task.pid.as_raw());
+                writer.put_str("thread:");
+                thread_id.write_response(writer);
+                writer.put_str(";");
+            }
+            StopReason::Stopped(stopped) => {
+                writer.put_str("T05");
+                match &stopped.event {
+                    // NOTE(review): the signal carried by `Signal(_)` is not
+                    // written; the reply always starts with `T05` -- confirm
+                    // this is intended.
+                    StopEvent::Signal(_) => {}
+                    StopEvent::SwBreak => {
+                        writer.put_str("swbreak:;");
+                    }
+                    StopEvent::Vforkdone => {
+                        writer.put_str("vforkdone:;");
+                    }
+                    StopEvent::Exec(p) => {
+                        // T05exec:2f746d702f6631;06:0000000000000000;07:80ddffffff7f0000;10:9030fdf7ff7f0000;thread:p350ad8.350ad8;core:9;
+                        writer.put_str("exec:");
+                        // A path that is not valid UTF-8 (`to_str()` returns
+                        // `None`) is written as an empty value.
+                        if let Some(p) = p.to_str() {
+                            writer.put_hex_encoded(p.as_bytes());
+                        }
+                        writer.put_str(";");
+                    }
+                    StopEvent::ReplayDone(log) => match log {
+                        ReplayLog::Begin => writer.put_str("replaylog:begin;"),
+                        ReplayLog::End => writer.put_str("replaylog:end;"),
+                    },
+                }
+                // NOTE(review): `®val` below looks like a mangled `&regval`
+                // -- confirm against upstream.
+                for (regno, regval) in &stopped.regs.0 {
+                    writer.put_num(*regno);
+                    writer.put_str(":");
+                    writer.put_hex_encoded(®val.to_ne_bytes());
+                    writer.put_str(";");
+                }
+                let thread_id = ThreadId::pid_tid(stopped.tgid.as_raw(), stopped.pid.as_raw());
+                writer.put_str("thread:");
+                thread_id.write_response(writer);
+                writer.put_str(";");
+            }
+            StopReason::Exited(pid, exit_status) => {
+                match exit_status {
+                    ExitStatus::Exited(code) => {
+                        writer.put_str("W");
+                        // Only the low byte of the exit code is reported.
+                        writer.put_hex_encoded(&[*code as u8]);
+                    }
+                    ExitStatus::Signaled(sig, _) => {
+                        writer.put_str("X");
+                        // The signal number is sent with bit 7 set.
+                        writer.put_hex_encoded(&[(*sig as u8) | 0x80]);
+                    }
+                }
+                writer.put_str(";process:");
+                writer.put_num(pid.as_raw());
+            }
+            StopReason::ThreadExited(pid, tgid, exit_status) => {
+                match exit_status {
+                    ExitStatus::Exited(code) => {
+                        writer.put_str("w");
+                        writer.put_hex_encoded(&[*code as u8]);
+                    }
+                    // Panics if a thread-exit is ever built with a
+                    // `Signaled` status.
+                    ExitStatus::Signaled(_, _) => unreachable!(),
+                }
+                writer.put_str(";");
+                let threadid = ThreadId::pid_tid(tgid.as_raw(), pid.as_raw());
+                threadid.write_response(writer);
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn decode_vcont_test() {
+        let mut packet = BytesMut::from("$vCont;s:p3e86d3.3e86d3;c:p3e86d3.-1#3b");
+        let vcont = vCont::parse(packet.split());
+        assert!(vcont.is_some());
+
+        let vcont = vCont::parse(BytesMut::from("$vCont;c:p2.-1#10"));
+        assert!(vcont.is_some());
+    }
+
+    #[test]
+    fn unknown_command() {
+        // No table entry is a prefix of this input, so it parses as `Unknown`.
+        let mut packet = BytesMut::from("just,an,unknown,command#3b");
+        let cmd = Command::try_parse(packet.split());
+        assert!(cmd.is_ok());
+        assert!(matches!(cmd.unwrap(), Command::Unknown(_)));
+    }
+
+    #[test]
+    fn malformed_command() {
+        // The `vCont` prefix matches but the remainder does not parse.
+        let mut packet = BytesMut::from("vCont,Just a bad command;c:1.-1#fe");
+        let cmd = Command::try_parse(packet.split());
+        assert_eq!(
+            cmd,
+            Err::(CommandParseError::MalformedCommand(String::from(
+                "vCont"
+            )))
+        );
+    }
+}
diff --git a/reverie-ptrace/src/gdbstub/commands/monitor_cmd/_qRcmd.rs b/reverie-ptrace/src/gdbstub/commands/monitor_cmd/_qRcmd.rs
new file mode 100644
index 0000000..2a3e09d
--- /dev/null
+++ b/reverie-ptrace/src/gdbstub/commands/monitor_cmd/_qRcmd.rs
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */ + +use crate::gdbstub::{commands::*, hex::*}; +use bytes::{Bytes, BytesMut}; + +#[derive(PartialEq, Debug)] +pub struct qRcmd { + pub cmd: Bytes, +} + +impl ParseCommand for qRcmd { + fn parse(bytes: BytesMut) -> Option { + if bytes.is_empty() { + None + } else { + Some(qRcmd { + cmd: bytes.freeze(), + }) + } + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/monitor_cmd/mod.rs b/reverie-ptrace/src/gdbstub/commands/monitor_cmd/mod.rs new file mode 100644 index 0000000..3663885 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/monitor_cmd/mod.rs @@ -0,0 +1,12 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +mod _qRcmd; + +pub use _qRcmd::*; diff --git a/reverie-ptrace/src/gdbstub/commands/section_offsets/_qOffsets.rs b/reverie-ptrace/src/gdbstub/commands/section_offsets/_qOffsets.rs new file mode 100644 index 0000000..821e936 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/section_offsets/_qOffsets.rs @@ -0,0 +1,24 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use crate::gdbstub::{commands::*, hex::*}; +use bytes::BytesMut; + +#[derive(PartialEq, Debug)] +pub struct qOffsets; + +impl ParseCommand for qOffsets { + fn parse(bytes: BytesMut) -> Option { + if bytes.is_empty() { + Some(qOffsets) + } else { + None + } + } +} diff --git a/reverie-ptrace/src/gdbstub/commands/section_offsets/mod.rs b/reverie-ptrace/src/gdbstub/commands/section_offsets/mod.rs new file mode 100644 index 0000000..f2334fa --- /dev/null +++ b/reverie-ptrace/src/gdbstub/commands/section_offsets/mod.rs @@ -0,0 +1,12 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +mod _qOffsets; + +pub use _qOffsets::*; diff --git a/reverie-ptrace/src/gdbstub/error.rs b/reverie-ptrace/src/gdbstub/error.rs new file mode 100644 index 0000000..64300b7 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/error.rs @@ -0,0 +1,85 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use thiserror::Error; + +use super::commands::CommandParseError; +use super::hex::GdbHexError; +use super::packet::PacketParseError; + +use crate::trace::Error as TraceError; + +use reverie::Pid; +use std::io; + +#[derive(Error, Debug)] +#[allow(clippy::enum_variant_names)] +pub enum Error { + #[error("gdb server not started yet")] + GdbServerNotStarted, + #[error("Failed waiting for gdb client to connect")] + WaitForGdbConnect { + #[source] + source: io::Error, + }, + #[error("Connection reset")] + ConnReset, + #[error("gdb session not started")] + SessionNotStarted, + #[error(transparent)] + PacketError(PacketParseError), + #[error("No inferior attached")] + Detached, + #[error(transparent)] + TraceError(TraceError), + #[error("Failed to send gdb request over tx channel")] + GdbRequestSendError, + #[error("Failed to receive reply from gdb request")] + GdbRequestRecvError, + #[error("gdbserver failed to resume/step")] + GdbResumeError, + #[error("gdbserver failed to forward gdb packet")] + GdbServerSendPacketError, + #[error("No threadid is being specified")] + ThreadIdNotSpecified, + #[error("Unknown thread {0}")] + UnknownThread(Pid), + #[error("gdbserver failed to receive stop event")] + GdbServerStopEventRecvError, +} + +impl From for PacketParseError { + fn from(err: CommandParseError) -> Self { + PacketParseError::CommandError(err) + } +} + +impl 
From for Error { + fn from(err: PacketParseError) -> Self { + Error::PacketError(err) + } +} + +impl From for Error { + fn from(err: CommandParseError) -> Self { + Error::PacketError(err.into()) + } +} + +impl From for Error { + fn from(err: GdbHexError) -> Self { + Error::PacketError(err.into()) + } +} + +impl From for Error { + fn from(err: TraceError) -> Self { + Error::TraceError(err) + } +} diff --git a/reverie-ptrace/src/gdbstub/hex.rs b/reverie-ptrace/src/gdbstub/hex.rs new file mode 100644 index 0000000..1c2b474 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/hex.rs @@ -0,0 +1,401 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use bytes::{Bytes, BytesMut}; +use num_traits::{CheckedAdd, CheckedMul, FromPrimitive, Zero}; +use serde::{ + de::{self, Visitor}, + ser::{self, SerializeSeq}, + Deserialize, Deserializer, Serialize, Serializer, +}; +use thiserror::Error; + +/// Decode gdb hex error code +#[derive(Debug, Error, PartialEq)] +pub enum GdbHexError { + /// Invalid hex digit + #[error("Input contains non-ASCII chars")] + NotAscii, + /// Input is empty + #[error("Input is empty")] + Empty, + /// Output is too small: overflowed + #[error("Output is too small/overflowed")] + Overflow, + /// Invalid Hex input + #[error("Gdb hex is malformed")] + InvalidGdbHex, + /// Invalid binary inpput + #[error("Gdb binary is malformed")] + InvalidGdbBinary, + /// Invalid Output (num) type. 
+ #[error("Invalid output num type")] + InvalidOutput, +} + +#[derive(PartialEq, Debug)] +pub struct GdbHexString { + bytes: Bytes, +} + +impl From for GdbHexString { + fn from(bytes: Bytes) -> Self { + GdbHexString { bytes } + } +} + +impl From for GdbHexString { + fn from(bytes: BytesMut) -> Self { + GdbHexString { + bytes: bytes.freeze(), + } + } +} + +impl GdbHexString { + /// decode gdb hex encoded binary data into a slice. + #[cfg(test)] + pub fn decode(&self) -> Result, GdbHexError> { + let serialized: Vec = + bincode::serialize(self).map_err(|_| GdbHexError::InvalidGdbHex)?; + bincode::deserialize(&serialized).map_err(|_| GdbHexError::InvalidGdbHex) + } + + /// encode slice into gdb hex encoded data + #[cfg(test)] + pub fn encode(bytes: &[u8]) -> Result { + let serialized: Vec = + bincode::serialize(bytes).map_err(|_| GdbHexError::InvalidGdbHex)?; + bincode::deserialize(&serialized).map_err(|_| GdbHexError::InvalidGdbHex) + } +} + +impl Serialize for GdbHexString { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + if self.bytes.is_empty() || self.bytes.len() % 2 != 0 { + return Err(ser::Error::custom(GdbHexError::InvalidGdbHex)); + } + let mut seq = serializer.serialize_seq(Some(self.bytes.len() / 2))?; + let mut j = 0; + while j < self.bytes.len() { + let val: u8 = from_hex(self.bytes[j]) + .ok_or_else(|| ser::Error::custom(GdbHexError::NotAscii))? 
+ * 16 + + from_hex(self.bytes[j + 1]) + .ok_or_else(|| ser::Error::custom(GdbHexError::NotAscii))?; + seq.serialize_element(&val)?; + j += 2; + } + seq.end() + } +} + +struct HexStringVisitor; +impl<'de> Visitor<'de> for HexStringVisitor { + type Value = Vec; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(formatter, "a &[u8] slice") + } + + fn visit_bytes(self, v: &[u8]) -> Result + where + E: de::Error, + { + let mut res: Vec = Vec::new(); + + for ch in v { + let hi = to_hex(*ch >> 4).unwrap(); + let lo = to_hex(*ch & 0xf).unwrap(); + res.push(hi); + res.push(lo); + } + Ok(res) + } +} + +impl<'de> Deserialize<'de> for GdbHexString { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let bytes = deserializer.deserialize_bytes(HexStringVisitor)?; + Ok(GdbHexString { + bytes: Bytes::from(bytes), + }) + } +} + +#[derive(PartialEq, Debug)] +pub struct GdbBinaryString { + bytes: Bytes, +} + +impl From for GdbBinaryString { + fn from(bytes: Bytes) -> Self { + GdbBinaryString { bytes } + } +} + +impl From for GdbBinaryString { + fn from(bytes: BytesMut) -> Self { + GdbBinaryString { + bytes: bytes.freeze(), + } + } +} + +impl GdbBinaryString { + /// decode gdb binary encoded binary data into a slice. 
+ #[cfg(test)] + pub fn decode(&self) -> Result, GdbHexError> { + let serialized: Vec = + bincode::serialize(self).map_err(|_| GdbHexError::InvalidGdbHex)?; + bincode::deserialize(&serialized).map_err(|_| GdbHexError::InvalidGdbHex) + } + + /// encode slice into gdb binary encoded data + #[cfg(test)] + pub fn encode(bytes: &[u8]) -> Result { + let serialized: Vec = + bincode::serialize(bytes).map_err(|_| GdbHexError::InvalidGdbHex)?; + bincode::deserialize(&serialized).map_err(|_| GdbHexError::InvalidGdbHex) + } +} + +impl Serialize for GdbBinaryString { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let mut res: Vec = Vec::new(); + let mut j = 0; + while j < self.bytes.len() { + let ch = self.bytes[j]; + match ch { + b'}' => { + if j == self.bytes.len() - 1 { + return Err(ser::Error::custom(GdbHexError::InvalidGdbBinary)); + } + res.push(self.bytes[1 + j] ^ 0x20); + j += 2; + } + _ => { + res.push(ch); + j += 1; + } + } + } + serializer.serialize_bytes(&res) + } +} + +struct BinaryStringisitor; +impl<'de> Visitor<'de> for BinaryStringisitor { + type Value = Vec; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(formatter, "a &[u8] slice") + } + + fn visit_bytes(self, v: &[u8]) -> Result + where + E: de::Error, + { + let mut res: Vec = Vec::new(); + + for &ch in v { + match ch { + b'#' | b'$' | b'}' | b'*' => { + res.push(b'}'); + res.push(ch ^ 0x20) + } + _ => res.push(ch), + } + } + Ok(res) + } +} + +impl<'de> Deserialize<'de> for GdbBinaryString { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let bytes = deserializer.deserialize_bytes(BinaryStringisitor)?; + Ok(GdbBinaryString { + bytes: Bytes::from(bytes), + }) + } +} + +fn from_hex(c: u8) -> Option { + if b"0123456789".contains(&c) { + Some(c - b'0') + } else if b"abcdef".contains(&c) { + Some(c - b'a' + 10) + } else if b"ABCDEF".contains(&c) { + Some(c - b'A' + 10) + } else if b"xX".contains(&c) { + 
Some(0) + } else { + None + } +} + +fn to_hex(c: u8) -> Option { + if c > 15 { + None + } else if c < 10 { + Some(c + b'0') + } else { + Some(c + b'A') + } +} + +/// Decode a GDB dex string into the specified integer. +/// +/// GDB hex strings may include "xx", which represent "missing" data. This +/// method simply treats "xx" as 00. +pub fn decode_hex(buf: &[u8]) -> Result +where + I: FromPrimitive + Zero + CheckedAdd + CheckedMul, +{ + if buf.is_empty() { + return Err(GdbHexError::Empty); + } + + let radix = I::from_u8(16).ok_or(GdbHexError::InvalidOutput)?; + let mut result = I::zero(); + + for &digit in buf { + let x = I::from_u8(from_hex(digit).ok_or(GdbHexError::NotAscii)?) + .ok_or(GdbHexError::InvalidOutput)?; + result = result.checked_mul(&radix).ok_or(GdbHexError::Overflow)?; + result = result.checked_add(&x).ok_or(GdbHexError::Overflow)? + } + + Ok(result) +} + +/// Decode a GDB hex string into a u8 Vector. +/// +/// GDB hex strings may include "xx", which represent "missing" data. This +/// method simply treats "xx" as 00. +pub fn decode_hex_string(buf: &[u8]) -> Result, GdbHexError> { + let mut res = Vec::new(); + let mut i = 0; + + if buf.len() % 2 != 0 { + return Err(GdbHexError::InvalidGdbHex); + } + + while i < buf.len() - 1 { + let x = from_hex(buf[i]).ok_or(GdbHexError::NotAscii)?; + let x = 16 * x + from_hex(buf[i + 1]).ok_or(GdbHexError::NotAscii)?; + res.push(x); + i += 2; + } + + Ok(res) +} + +/// Decode a GDB binary string into a u8 Vector. +/// +/// GDB hex strings may include "xx", which represent "missing" data. This +/// method simply treats "xx" as 00. 
+pub fn decode_binary_string(buf: &[u8]) -> Result, GdbHexError> { + let mut res = Vec::new(); + let mut i = 0; + + while i < buf.len() { + match buf[i] { + b'}' => { + if i >= buf.len() - 1 { + return Err(GdbHexError::InvalidGdbBinary); + } + res.push(buf[i + 1] ^ 0x20); + i += 2; + } + _ => { + res.push(buf[i]); + i += 1; + } + } + } + Ok(res) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn serde_sanity() { + let test1 = Bytes::from(&[4, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4][..]); + let test2 = GdbHexString { + bytes: Bytes::from("01020304"), + }; + let bytes: Vec = bincode::serialize(&test2).unwrap(); + assert_eq!(bytes, test1); + let encoded: GdbHexString = bincode::deserialize(&bytes).unwrap(); + assert_eq!(encoded, test2); + + let bytes: Vec = bincode::deserialize(&bytes).unwrap(); + assert_eq!(bytes, vec![1, 2, 3, 4]); + } + + #[test] + fn encode_decode_sanity() { + let test1 = vec![1, 2, 3, 4]; + let hex = GdbHexString { + bytes: Bytes::from("01020304"), + }; + let test2 = vec![b'1', b'2', b'$', b'{']; + let bin = GdbBinaryString { + bytes: Bytes::from(&b"12}\x04{"[..]), + }; + assert_eq!(GdbHexString::encode(&test1).unwrap(), hex); + assert_eq!(GdbHexString::decode(&hex).unwrap(), test1); + + assert_eq!(GdbBinaryString::encode(&test2).unwrap(), bin); + assert_eq!(GdbBinaryString::decode(&bin).unwrap(), test2); + } + + #[test] + fn decode_gdb_hex_test() { + assert_eq!( + decode_hex_string(b"31323334"), + Ok::<_, GdbHexError>(b"1234".to_vec()) + ); + assert_eq!( + decode_hex_string(b"12345"), + Err::, _>(GdbHexError::InvalidGdbHex) + ); + } + + #[test] + fn decode_gdb_binary_test() { + assert_eq!( + decode_binary_string(b"12345"), + Ok::<_, GdbHexError>(b"12345".to_vec()) + ); + assert_eq!( + decode_binary_string(b"1234}"), + Err::, _>(GdbHexError::InvalidGdbBinary) + ); + assert_eq!( + decode_binary_string(b"1234}A"), + Ok::<_, GdbHexError>(b"1234a".to_vec()) + ); + } +} diff --git a/reverie-ptrace/src/gdbstub/inferior.rs 
b/reverie-ptrace/src/gdbstub/inferior.rs new file mode 100644 index 0000000..00162b8 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/inferior.rs @@ -0,0 +1,129 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use super::{commands::*, Error, GdbRequest}; + +use reverie::Pid; +use tokio::sync::mpsc; + +/// Thread id and Pid use to uniquely indentify an inferior. +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub struct InferiorThreadId { + pub tid: Pid, + pub pid: Pid, +} + +impl InferiorThreadId { + pub fn new(tid: Pid, pid: Pid) -> Self { + InferiorThreadId { tid, pid } + } +} +impl From for ThreadId { + fn from(id: InferiorThreadId) -> Self { + ThreadId::pid_tid(id.pid.as_raw(), id.tid.as_raw()) + } +} + +impl TryFrom for InferiorThreadId { + type Error = Error; + + fn try_from(threadid: ThreadId) -> Result { + let pid = threadid.getpid().ok_or(Error::ThreadIdNotSpecified)?; + let tid = threadid.gettid().ok_or(Error::ThreadIdNotSpecified)?; + Ok(InferiorThreadId::new(tid, pid)) + } +} + +/// Inferior controlled by gdbstub +pub struct Inferior { + /// Inferior id + pub id: InferiorThreadId, + + /// Resume attached tracee + pub resume_tx: Option>, + + /// Send request to reverie + pub request_tx: Option>, + + /// Channel to receive new gdb stop event + pub stop_rx: Option>, + + /// Has a pending resume + pub resume_pending: bool, +} + +impl Inferior { + pub fn new(id: InferiorThreadId) -> Self { + Inferior { + id, + resume_tx: None, + request_tx: None, + stop_rx: None, + resume_pending: false, + } + } + + pub fn gettid(&self) -> Pid { + self.id.tid + } + + pub fn getpid(&self) -> Pid { + self.id.pid + } + + pub fn matches(&self, threadid: &ThreadId) -> bool { + let this_threadid: ThreadId = self.id.into(); + this_threadid.matches(threadid) + } + + /// Notify target to resume given 
`Inferior`. + // NB: The inferior could have been resumed previously, meaning there could + // be a pending stop state from last resume. This is possible when + // `vCont;p:-1` is called while there are multiple threads in the same + // process group. The pending flag is cleared when a stop event is reported + // by the target (reverie). + pub async fn notify_resume(&mut self, resume: ResumeInferior) -> Result<(), Error> { + if !self.resume_pending { + let tx = self.resume_tx.as_ref().ok_or(Error::Detached)?; + tx.send(resume).await.map_err(|_| Error::GdbResumeError)?; + self.resume_pending = true; + } + Ok(()) + } + + /// Wait for stop event reported by the target. + pub async fn wait_for_stop(&mut self) -> Result { + let rx = self.stop_rx.as_mut().ok_or(Error::Detached)?; + let stopped = rx.recv().await.ok_or(Error::GdbServerStopEventRecvError)?; + // clear `resume_pending` flag as we got a new stop event, implying + // a new resume *is* to be expected. + self.resume_pending = false; + Ok(stopped.reason) + } +} + +/// Inferior is in stopped state. sent by reverie. +#[derive(Debug)] +pub struct StoppedInferior { + /// Reason why inferior has stopped. + pub reason: StopReason, + /// tx channel to send gdb request (by gdb) + pub request_tx: mpsc::Sender, + /// tx channel to send gdb resume/step (by gdb) + pub resume_tx: mpsc::Sender, +} + +/// Inferior is in stopped state. send to reverie. +#[derive(Debug, Clone, Copy)] +pub struct ResumeInferior { + /// Resume action, step, continue, until, ... + pub action: ResumeAction, + /// Detach (from gdb) after this resume. + pub detach: bool, +} diff --git a/reverie-ptrace/src/gdbstub/logger.rs b/reverie-ptrace/src/gdbstub/logger.rs new file mode 100644 index 0000000..ad63812 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/logger.rs @@ -0,0 +1,81 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use std::fmt::{self, Debug}; + +enum Direction { + In, + Out, +} + +const MAX_BYTES_DISPLAY: usize = 64; + +pub struct PacketLogger<'a> { + direction: Direction, + body: &'a [u8], + checksum: u8, +} + +impl<'a> PacketLogger<'a> { + pub fn incoming + ?Sized>(body: &'a T, checksum: u8) -> Self { + Self { + direction: Direction::In, + body: body.as_ref(), + checksum, + } + } + + pub fn outgoing + ?Sized>(body: &'a T, checksum: u8) -> Self { + Self { + direction: Direction::Out, + body: body.as_ref(), + checksum, + } + } +} + +impl<'a> Debug for PacketLogger<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.direction { + Direction::In => write!(f, "<-- ")?, + Direction::Out => write!(f, "--> ")?, + } + + let nb_left = if self.body.len() > MAX_BYTES_DISPLAY { + Some(self.body.len() - MAX_BYTES_DISPLAY) + } else { + None + }; + write!(f, "b\"")?; + for &b in self.body.iter().take(MAX_BYTES_DISPLAY) { + if b == b'\n' { + write!(f, "\\n")?; + } else if b == b'\r' { + write!(f, "\\r")?; + } else if b == b'\t' { + write!(f, "\\t")?; + } else if b == b'\\' || b == b'"' { + write!(f, "\\{}", b as char)?; + } else if b == b'\0' { + write!(f, "\\0")?; + // ASCII printable + } else if (0x20..0x7f).contains(&b) { + write!(f, "{}", b as char)?; + } else { + write!(f, "\\x{:02x}", b)?; + } + } + if let Some(nb) = nb_left { + write!(f, "[{} bytes omitted]", nb)?; + } + write!(f, "#{:02x}", self.checksum)?; + write!(f, "\"")?; + Ok(()) + } +} diff --git a/reverie-ptrace/src/gdbstub/mod.rs b/reverie-ptrace/src/gdbstub/mod.rs new file mode 100644 index 0000000..ea07969 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/mod.rs @@ -0,0 +1,35 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +mod breakpoint; +mod commands; +mod error; +mod hex; +mod inferior; +mod logger; +mod packet; +mod request; +mod server; +mod session; + +mod regs; + +use logger::PacketLogger; +use packet::Packet; + +pub mod response; + +pub use breakpoint::{Breakpoint, BreakpointType}; +pub use commands::{ResumeAction, StopEvent, StopReason}; +pub use error::Error; +pub use inferior::{Inferior, InferiorThreadId, ResumeInferior, StoppedInferior}; +pub use regs::{Amd64CoreRegs, Amd64ExtraRegs}; +pub use request::GdbRequest; +pub use server::GdbServer; +pub use session::Session; diff --git a/reverie-ptrace/src/gdbstub/packet.rs b/reverie-ptrace/src/gdbstub/packet.rs new file mode 100644 index 0000000..c7083cf --- /dev/null +++ b/reverie-ptrace/src/gdbstub/packet.rs @@ -0,0 +1,132 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use super::{ + commands::{Command, CommandParseError}, + hex::*, + PacketLogger, +}; +use bytes::BytesMut; +use thiserror::Error; + +/// Packet parse error. +#[derive(PartialEq, Debug, Error)] +pub enum PacketParseError { + #[error("Checksum mismatch, expected: {checksum}, got: {calculated}")] + ChecksumMismatched { checksum: u8, calculated: u8 }, + #[error("empty packet buffer")] + EmptyBuf, + #[error("missing checksum")] + MissingChecksum, + #[error("mulformed checksum")] + MalformedChecksum, + #[error(transparent)] + CommandError(CommandParseError), + #[error("unexpected header {0}")] + UnexpectedHeader(u8), + #[error(transparent)] + DecodeHexError(GdbHexError), +} + +impl From for PacketParseError { + fn from(err: GdbHexError) -> Self { + PacketParseError::DecodeHexError(err) + } +} + +/// Packet send/recv from gdb stream. 
+#[derive(Debug)] +pub enum Packet { + Ack, + Nack, + Interrupt, + Command(Command), +} + +// Remove leading `$' and trailing `#[xx]`, and validate checksum. +fn decode_packet(mut bytes: BytesMut) -> Result { + let end_of_body = bytes + .iter() + .position(|b| *b == b'#') + .ok_or(PacketParseError::MissingChecksum)?; + + // Split buffer into body and checksum, note the packet + // starts with a `$'. + let (body, checksum) = bytes.split_at_mut(end_of_body); + let checksum = &checksum[1..][..2]; // skip the '#' + + // Validate checksum without leading `$'. + let checksum = decode_hex(checksum).map_err(|_| PacketParseError::MalformedChecksum)?; + let calculated = body.iter().skip(1).fold(0u8, |a, x| a.wrapping_add(*x)); + if calculated != checksum { + return Err(PacketParseError::ChecksumMismatched { + checksum, + calculated, + }); + } + + tracing::trace!("{:?}", PacketLogger::incoming(body, checksum)); + + Ok(bytes.split_to(end_of_body).split_off(1)) +} + +impl TryFrom for Packet { + type Error = PacketParseError; + fn try_from(buf: BytesMut) -> Result { + if buf.is_empty() { + return Err(PacketParseError::EmptyBuf); + } + let prefix = buf[0]; + match prefix { + b'$' => { + let body = decode_packet(buf)?; + Ok(Packet::Command(Command::try_parse(body)?)) + } + b'+' => Ok(Packet::Ack), + b'-' => Ok(Packet::Nack), + 0x03 => Ok(Packet::Interrupt), + _ => Err(PacketParseError::UnexpectedHeader(buf[0])), + } + } +} + +impl Packet { + /// Create a new `Packet` from `buf`. 
+ pub fn new(buf: BytesMut) -> Result { + Self::try_from(buf) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn can_decode_packet() { + let cmd = BytesMut::from("$qC#b4"); + assert_eq!( + decode_packet(cmd), + Ok::<_, PacketParseError>(BytesMut::from("qC")) + ); + + assert_eq!( + // Contains non-ascii bytes + decode_packet(BytesMut::from(&b"$X7fffffffdbac,4:\x8a\x02\0\0#09"[..])), + Ok::<_, PacketParseError>(BytesMut::from(&b"X7fffffffdbac,4:\x8a\x02\0\0"[..])) + ); + + let cmd = BytesMut::from("$QPassSignals:e;10;14;17;1a;1b;1c;21;24;25;2c;4c;97;#0a"); + assert_eq!( + decode_packet(cmd), + Ok::<_, PacketParseError>(BytesMut::from( + "QPassSignals:e;10;14;17;1a;1b;1c;21;24;25;2c;4c;97;" + )) + ); + } +} diff --git a/reverie-ptrace/src/gdbstub/regs.rs b/reverie-ptrace/src/gdbstub/regs.rs new file mode 100644 index 0000000..b86294c --- /dev/null +++ b/reverie-ptrace/src/gdbstub/regs.rs @@ -0,0 +1,557 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use serde::{Deserialize, Serialize}; +use std::fmt; + +use super::response::*; + +#[repr(transparent)] +#[derive(Debug, Default, PartialEq, Clone, Deserialize, Serialize)] +/// 80-bit FPU register, see gdb/64bit-core.xml +pub struct Fp80([u8; 10]); + +// NB: st from `libc::user_fpregs_struct' has a different representation. 
+fn from_u32s(st: &[u32]) -> Fp80 { + Fp80([ + (st[0] & 0xff) as u8, + ((st[0] >> 8) & 0xff) as u8, + ((st[0] >> 16) & 0xff) as u8, + ((st[0] >> 24) & 0xff) as u8, + (st[1] & 0xff) as u8, + ((st[1] >> 8) & 0xff) as u8, + ((st[1] >> 16) & 0xff) as u8, + ((st[1] >> 24) & 0xff) as u8, + (st[2] & 0xff) as u8, + ((st[2] >> 8) & 0xff) as u8, + ]) +} + +impl From<[u32; 4]> for Fp80 { + fn from(v: [u32; 4]) -> Fp80 { + from_u32s(&v) + } +} + +fn from_fp80(fp: &Fp80, u32s: &mut [u32]) { + u32s[0] = + fp.0[0] as u32 | (fp.0[1] as u32) << 8 | (fp.0[2] as u32) << 16 | (fp.0[3] as u32) << 24; + u32s[1] = + fp.0[4] as u32 | (fp.0[5] as u32) << 8 | (fp.0[6] as u32) << 16 | (fp.0[7] as u32) << 24; + u32s[2] = fp.0[8] as u32 | (fp.0[9] as u32) << 8; + u32s[3] = 0; +} + +impl From for [u32; 4] { + fn from(fp: Fp80) -> [u32; 4] { + let mut res: [u32; 4] = [0; 4]; + from_fp80(&fp, &mut res); + res + } +} + +#[repr(transparent)] +struct St([Fp80; 8]); + +impl From<[u32; 32]> for St { + fn from(st: [u32; 32]) -> Self { + St([ + from_u32s(&st[0..]), + from_u32s(&st[4..]), + from_u32s(&st[8..]), + from_u32s(&st[12..]), + from_u32s(&st[16..]), + from_u32s(&st[20..]), + from_u32s(&st[24..]), + from_u32s(&st[28..]), + ]) + } +} + +impl From for [u32; 32] { + fn from(st: St) -> [u32; 32] { + let mut res: [u32; 32] = [0; 32]; + from_fp80(&st.0[0], &mut res[0..]); + from_fp80(&st.0[1], &mut res[4..]); + from_fp80(&st.0[2], &mut res[8..]); + from_fp80(&st.0[3], &mut res[12..]); + from_fp80(&st.0[4], &mut res[16..]); + from_fp80(&st.0[5], &mut res[20..]); + from_fp80(&st.0[6], &mut res[24..]); + from_fp80(&st.0[7], &mut res[28..]); + res + } +} + +#[repr(transparent)] +struct Xmm([u128; 16]); + +impl From<[u32; 64]> for Xmm { + fn from(xmm: [u32; 64]) -> Self { + Xmm([ + (xmm[3] as u128) << 96 + | (xmm[2] as u128) << 64 + | (xmm[1] as u128) << 32 + | (xmm[0] as u128), + (xmm[7] as u128) << 96 + | (xmm[6] as u128) << 64 + | (xmm[5] as u128) << 32 + | (xmm[4] as u128), + (xmm[11] as u128) << 96 + | 
(xmm[10] as u128) << 64 + | (xmm[9] as u128) << 32 + | (xmm[8] as u128), + (xmm[15] as u128) << 96 + | (xmm[14] as u128) << 64 + | (xmm[13] as u128) << 32 + | (xmm[12] as u128), + (xmm[19] as u128) << 96 + | (xmm[18] as u128) << 64 + | (xmm[17] as u128) << 32 + | (xmm[16] as u128), + (xmm[23] as u128) << 96 + | (xmm[22] as u128) << 64 + | (xmm[21] as u128) << 32 + | (xmm[20] as u128), + (xmm[27] as u128) << 96 + | (xmm[26] as u128) << 64 + | (xmm[25] as u128) << 32 + | (xmm[24] as u128), + (xmm[31] as u128) << 96 + | (xmm[30] as u128) << 64 + | (xmm[29] as u128) << 32 + | (xmm[28] as u128), + (xmm[35] as u128) << 96 + | (xmm[34] as u128) << 64 + | (xmm[33] as u128) << 32 + | (xmm[32] as u128), + (xmm[39] as u128) << 96 + | (xmm[38] as u128) << 64 + | (xmm[37] as u128) << 32 + | (xmm[36] as u128), + (xmm[43] as u128) << 96 + | (xmm[42] as u128) << 64 + | (xmm[41] as u128) << 32 + | (xmm[40] as u128), + (xmm[47] as u128) << 96 + | (xmm[46] as u128) << 64 + | (xmm[45] as u128) << 32 + | (xmm[44] as u128), + (xmm[51] as u128) << 96 + | (xmm[50] as u128) << 64 + | (xmm[49] as u128) << 32 + | (xmm[48] as u128), + (xmm[55] as u128) << 96 + | (xmm[54] as u128) << 64 + | (xmm[53] as u128) << 32 + | (xmm[52] as u128), + (xmm[59] as u128) << 96 + | (xmm[58] as u128) << 64 + | (xmm[57] as u128) << 32 + | (xmm[56] as u128), + (xmm[63] as u128) << 96 + | (xmm[62] as u128) << 64 + | (xmm[61] as u128) << 32 + | (xmm[60] as u128), + ]) + } +} + +fn u128_to_u32s(u: u128, u32s: &mut [u32]) { + u32s[0] = u as u32; + u32s[1] = (u >> 32) as u32; + u32s[2] = (u >> 64) as u32; + u32s[3] = (u >> 96) as u32; +} + +impl From for [u32; 64] { + fn from(xmm: Xmm) -> [u32; 64] { + let mut res: [u32; 64] = [0; 64]; + u128_to_u32s(xmm.0[0], &mut res[0..]); + u128_to_u32s(xmm.0[1], &mut res[4..]); + u128_to_u32s(xmm.0[2], &mut res[8..]); + u128_to_u32s(xmm.0[3], &mut res[12..]); + u128_to_u32s(xmm.0[4], &mut res[16..]); + u128_to_u32s(xmm.0[5], &mut res[20..]); + u128_to_u32s(xmm.0[6], &mut 
res[24..]); + u128_to_u32s(xmm.0[7], &mut res[28..]); + u128_to_u32s(xmm.0[8], &mut res[32..]); + u128_to_u32s(xmm.0[9], &mut res[36..]); + u128_to_u32s(xmm.0[10], &mut res[40..]); + u128_to_u32s(xmm.0[11], &mut res[44..]); + u128_to_u32s(xmm.0[12], &mut res[48..]); + u128_to_u32s(xmm.0[13], &mut res[52..]); + u128_to_u32s(xmm.0[14], &mut res[56..]); + u128_to_u32s(xmm.0[15], &mut res[60..]); + res + } +} + +/// i387 regs, gdb layout. +#[derive(Debug, Default, PartialEq, Clone, Deserialize, Serialize)] +pub struct X87Regs { + /// fctrl + pub fctrl: u32, + /// fstat + pub fstat: u32, + /// ftag + pub ftag: u32, + /// fiseg + pub fiseg: u32, + /// fioff + pub fioff: u32, + /// foseg + pub foseg: u32, + /// fooff + pub fooff: u32, + /// fop + pub fop: u32, +} + +/// arm64 core/sse regs, see gdb/64bit-{core,sse}-linux.xml. +/// This is the same as: 64bit-core+64bit-sse+64bit-linux. +#[derive(Default, PartialEq, Clone, Deserialize, Serialize)] +pub struct Amd64CoreRegs { + /// general purpose regsiters + /// rax/rbx/rcx/rdx/rsi/rdi/rbp/rsp/r8..r15 + pub regs: [u64; 16], + /// rip aka instruction pointer + pub rip: u64, + /// eflags + pub eflags: u32, + /// cs, ss, ds, es, fs, gs + pub segments: [u32; 6], + /// 80-bit fpu regs + pub st: [Fp80; 8], + /// fpu control regs + pub x87: X87Regs, + /// SSE registers + pub xmm: [u128; 16], + /// Sse status/control + pub mxcsr: u32, + pub orig_rax: u64, + pub fs_base: u64, + pub gs_base: u64, +} + +impl fmt::Debug for Amd64CoreRegs { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Amd64CoreRegs") + .field("rax", &self.regs[0]) + .field("rbx", &self.regs[1]) + .field("rcx", &self.regs[2]) + .field("rdx", &self.regs[3]) + .field("rsi", &self.regs[4]) + .field("rdi", &self.regs[5]) + .field("rbp", &self.regs[6]) + .field("rsp", &self.regs[7]) + .field("r8", &self.regs[8]) + .field("r9", &self.regs[9]) + .field("r10", &self.regs[10]) + .field("r11", &self.regs[11]) + .field("r12", &self.regs[12]) + 
.field("r13", &self.regs[13]) + .field("r14", &self.regs[14]) + .field("r15", &self.regs[15]) + .field("rip", &self.rip) + .field("eflags", &self.eflags) + .field("cs", &self.segments[0]) + .field("ss", &self.segments[1]) + .field("ds", &self.segments[2]) + .field("es", &self.segments[3]) + .field("fs", &self.segments[4]) + .field("gs", &self.segments[5]) + .field("st", &self.st) + .field("x87", &self.x87) + .field("xmm", &self.xmm) + .field("mxcsr", &self.mxcsr) + .field("orig_rax", &self.orig_rax) + .field("fs_base", &self.fs_base) + .field("gs_base", &self.gs_base) + .finish() + } +} + +impl Amd64CoreRegs { + /// Create `Amd64CoreRegs` from user regs and fp regs; `segments` is filled in cs, ss, ds, es, fs, gs order. + pub fn from(regs: libc::user_regs_struct, i387: libc::user_fpregs_struct) -> Self { + Amd64CoreRegs { + regs: [ + regs.rax, regs.rbx, regs.rcx, regs.rdx, regs.rsi, regs.rdi, regs.rbp, regs.rsp, + regs.r8, regs.r9, regs.r10, regs.r11, regs.r12, regs.r13, regs.r14, regs.r15, + ], + rip: regs.rip, + eflags: regs.eflags as u32, + segments: [ + regs.cs as u32, + regs.ss as u32, + regs.ds as u32, + regs.es as u32, + regs.fs as u32, + regs.gs as u32, // gs selector; `into_parts` reads it back from `segments[5]` + ], + st: St::from(i387.st_space).0, + // NB: fpu/fxsave layout, see https://github.com/rr-debugger/rr/blob/master/src/ExtraRegisters.cc and + // https://elixir.bootlin.com/linux/latest/source/arch/x86/include/asm/user_64.h#L51 + x87: X87Regs { + fctrl: i387.cwd as u32, // 0, short + fstat: i387.swd as u32, // 2, short + ftag: i387.ftw as u32, // 4, short + fiseg: (i387.rip >> 32) as u32, // 12, + fioff: (i387.rip & 0xffffffff) as u32, // 8, + foseg: (i387.rdp >> 32) as u32, // 20, + fooff: (i387.rdp & 0xffffffff) as u32, // 16, + fop: i387.fop as u32, // 6, short + }, + xmm: Xmm::from(i387.xmm_space).0, + mxcsr: i387.mxcsr, + orig_rax: regs.orig_rax, + fs_base: regs.fs_base, + gs_base: regs.gs_base, + } + } + + pub fn into_parts(self) -> (libc::user_regs_struct, libc::user_fpregs_struct) { + // NB: `padding` is private so we cannot use struct literal syntax.
+ let mut fpregs_intializer: libc::user_fpregs_struct = + unsafe { std::mem::MaybeUninit::zeroed().assume_init() }; + fpregs_intializer.cwd = self.x87.fctrl as u16; + fpregs_intializer.swd = self.x87.fstat as u16; + fpregs_intializer.ftw = self.x87.ftag as u16; + fpregs_intializer.fop = self.x87.fop as u16; + fpregs_intializer.rip = self.x87.fioff as u64 | ((self.x87.fiseg as u64) << 32); + fpregs_intializer.rdp = self.x87.fooff as u64 | ((self.x87.foseg as u64) << 32); + fpregs_intializer.mxcsr = self.mxcsr; + fpregs_intializer.mxcr_mask = 0xffff; // only bit 0-15 are valid. + fpregs_intializer.st_space = St(self.st).into(); + fpregs_intializer.xmm_space = Xmm(self.xmm).into(); + ( + libc::user_regs_struct { + rax: self.regs[0], + rbx: self.regs[1], + rcx: self.regs[2], + rdx: self.regs[3], + rsi: self.regs[4], + rdi: self.regs[5], + rbp: self.regs[6], + rsp: self.regs[7], + r8: self.regs[8], + r9: self.regs[9], + r10: self.regs[10], + r11: self.regs[11], + r12: self.regs[12], + r13: self.regs[13], + r14: self.regs[14], + r15: self.regs[15], + orig_rax: self.orig_rax, + rip: self.rip, + cs: self.segments[0] as u64, + ss: self.segments[1] as u64, + ds: self.segments[2] as u64, + es: self.segments[3] as u64, + fs: self.segments[4] as u64, + gs: self.segments[5] as u64, + eflags: self.eflags as u64, + fs_base: self.fs_base, + gs_base: self.gs_base, + }, + fpregs_intializer, + ) + } +} + +impl WriteResponse for ResponseAsHex { + fn write_response(&self, f: &mut ResponseWriter) { + let encoded: Vec = bincode::serialize(&self.0).unwrap(); + ResponseAsHex(encoded.as_slice()).write_response(f) + } +} + +impl WriteResponse for ResponseAsBinary { + fn write_response(&self, f: &mut ResponseWriter) { + let encoded: Vec = bincode::serialize(&self.0).unwrap(); + ResponseAsBinary(encoded.as_slice()).write_response(f) + } +} + +/// amd64 avx regs +#[derive(Debug, Default, PartialEq, Clone, Deserialize, Serialize)] +pub struct Amd64ExtraRegs { + /// avx registers + pub ymm: [u128; 
32], + /// avx512 registers + pub ymmh: [u128; 32], +} + +#[cfg(test)] +mod test { + use super::*; + use std::mem; + + #[test] + fn fp80_sanity() { + assert_eq!(mem::size_of::(), 10); + let u32s: [u32; 4] = [0x12345678, 0x87654321, 0xabcd, 0]; + let fp80: Fp80 = Fp80::from(u32s); + let u32s_1: [u32; 4] = fp80.into(); + assert_eq!(u32s, u32s_1); + } + + #[test] + fn st_sanity() { + let u32s: [u32; 32] = [ + 0x12345678, 0x87654321, 0xabcd, 0, 0x34127856, 0x56781234, 0xcdab, 0, 0x11223344, + 0x44332211, 0xcadb, 0, 0x55667788, 0xaabbccdd, 0x1423, 0, 0x44332211, 0x11223344, + 0x5678, 0, 0xaabbccdd, 0xddccbbaa, 0x1234, 0, 0xabcdabcd, 0xdeadbeef, 0x9876, 0, + 0xdeadbeef, 0xdcbadcba, 0xac12, 0, + ]; + let st: St = St::from(u32s); + let u32s_1: [u32; 32] = st.into(); + assert_eq!(u32s, u32s_1); + } + + #[test] + fn xmm_sanity() { + let u32s: [u32; 64] = [ + 0x12345678, 0x87654321, 0xaabbccdd, 0xddccbbaa, 0x34127856, 0x65872143, 0xbbaaddcc, + 0xccddaabb, 0xccddaabb, 0xbbaaddcc, 0x65872143, 0x34127856, 0xddccbbaa, 0xaabbccdd, + 0x87654321, 0x12345678, 0x12345678, 0x87654321, 0xaabbccdd, 0xddccbbaa, 0x34127856, + 0x65872143, 0xbbaaddcc, 0xccddaabb, 0xccddaabb, 0xbbaaddcc, 0x65872143, 0x34127856, + 0xddccbbaa, 0xaabbccdd, 0x87654321, 0x12345678, 0x12345678, 0x87654321, 0xaabbccdd, + 0xddccbbaa, 0x34127856, 0x65872143, 0xbbaaddcc, 0xccddaabb, 0xccddaabb, 0xbbaaddcc, + 0x65872143, 0x34127856, 0xddccbbaa, 0xaabbccdd, 0x87654321, 0x12345678, 0x12345678, + 0x87654321, 0xaabbccdd, 0xddccbbaa, 0x34127856, 0x65872143, 0xbbaaddcc, 0xccddaabb, + 0xccddaabb, 0xbbaaddcc, 0x65872143, 0x34127856, 0xddccbbaa, 0xaabbccdd, 0x87654321, + 0x12345678, + ]; + let xmm: Xmm = Xmm::from(u32s); + let u32s_1: [u32; 64] = xmm.into(); + assert_eq!(u32s, u32s_1); + } + + #[test] + fn amd64_core_regs_sanity() { + const EXPECTED_SIZE: usize = 16 * 8 + 8 + 4 + 4 * 6 + 10 * 8 + 8 * 4 + 16 * 16 + 4 + 8 * 3; // 560. 
+ assert_eq!(mem::size_of::(), EXPECTED_SIZE); + let core_regs: Amd64CoreRegs = Default::default(); + let encoded: Vec = bincode::serialize(&core_regs).unwrap(); + assert_eq!(encoded.len(), EXPECTED_SIZE); + } + + #[test] + fn amd64_core_regs_serde() { + let core_regs: Amd64CoreRegs = Amd64CoreRegs { + regs: [ + 0x1c, + 0, + 0, + 0x7ffff7fe2f80, + 0x7ffff7ffe6c8, + 0x7ffff7ffe130, + 0, + 0x7fffffffdd20, + 0x4d, + 0x7ffff7f91860, + 0xc2, + 0, + 0x401040, + 0x7fffffffdd20, + 0, + 0, + ], + rip: 0x401040, + eflags: 0x206, + segments: [0x33, 0x2b, 0, 0, 0, 0], + st: [ + Fp80([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), + Fp80([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), + Fp80([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), + Fp80([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), + Fp80([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), + Fp80([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), + Fp80([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), + Fp80([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), + ], + x87: X87Regs { + fctrl: 0x37f, + fstat: 0, + ftag: 0, + fiseg: 0, + fioff: 0, + foseg: 0, + fooff: 0, + fop: 0, + }, + xmm: [ + 0xff000000, + 0x2f2f2f2f2f2f2f2f2f2f2f2f2f2f2f2f, + 0xff000000000000, + 0xff0000000000000000ff000000ff0000, + 0, + 0, + 6, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ], + mxcsr: 0x1f80, + orig_rax: 0xffffffffffffffff, + fs_base: 0x7ffff7fcd540, + gs_base: 0, + }; + let encoded: Vec = bincode::serialize(&core_regs).unwrap(); + // NB: keep this so that we can *visualize* how core regs are + // serialized. 
+ let expected: Vec = vec![ + 0x1c, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x80, 0x2f, 0xfe, 0xf7, 0xff, 0x7f, 0x0, 0x0, 0xc8, + 0xe6, 0xff, 0xf7, 0xff, 0x7f, 0x0, 0x0, 0x30, 0xe1, 0xff, 0xf7, 0xff, 0x7f, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x20, 0xdd, 0xff, 0xff, 0xff, 0x7f, 0x0, 0x0, + 0x4d, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x60, 0x18, 0xf9, 0xf7, 0xff, 0x7f, 0x0, 0x0, + 0xc2, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x40, + 0x10, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x20, 0xdd, 0xff, 0xff, 0xff, 0x7f, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x40, 0x10, + 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x6, 0x2, 0x0, 0x0, 0x33, 0x0, 0x0, 0x0, 0x2b, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x7f, 0x3, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xff, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, + 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0xff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xff, 0x0, 0x0, 0x0, 0xff, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
0x0, 0x6, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x80, 0x1f, 0x0, + 0x0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x40, 0xd5, 0xfc, 0xf7, 0xff, + 0x7f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + ]; + assert_eq!(encoded, expected); + let core_regs2 = bincode::deserialize(&encoded).unwrap(); + assert_eq!(core_regs, core_regs2); + } +} diff --git a/reverie-ptrace/src/gdbstub/request.rs b/reverie-ptrace/src/gdbstub/request.rs new file mode 100644 index 0000000..277c7fc --- /dev/null +++ b/reverie-ptrace/src/gdbstub/request.rs @@ -0,0 +1,32 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use super::{Amd64CoreRegs, Breakpoint}; +use crate::trace::Error as TraceError; +use tokio::sync::oneshot; + +/// gdb request send to reverie. +#[derive(Debug)] +#[allow(clippy::large_enum_variant)] +pub enum GdbRequest { + /// Set a breakpoint. + SetBreakpoint(Breakpoint, oneshot::Sender>), + /// Remove a breakpoint. + RemoveBreakpoint(Breakpoint, oneshot::Sender>), + /// Read inferior memory. 
Note the memory requested could contain a + /// software breakpoint; in that case, `ReadInferiorMemory` should + /// return the original contents (excluding the breakpoint insn). + ReadInferiorMemory(u64, usize, oneshot::Sender, TraceError>>), + /// Write inferior memory + WriteInferiorMemory(u64, usize, Vec, oneshot::Sender>), + /// Read registers + ReadRegisters(oneshot::Sender>), + /// Write registers + WriteRegisters(Amd64CoreRegs, oneshot::Sender>), +} diff --git a/reverie-ptrace/src/gdbstub/response.rs b/reverie-ptrace/src/gdbstub/response.rs new file mode 100644 index 0000000..cbfde74 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/response.rs @@ -0,0 +1,263 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use bytes::{BufMut, Bytes, BytesMut}; +use num_traits::{AsPrimitive, PrimInt}; + +use reverie::Errno; + +use super::{Error, PacketLogger}; +use crate::trace::Error as TraceError; + +/// Trait to write a gdb reply. This is different from `fmt::Display`, +/// as the response must be a valid gdb packet reply, which does not +/// necessarily match the `Display` rendering. +pub trait WriteResponse { + /// Write the value into `f` incrementally. The value written + /// to `f` must be a valid gdb reply packet.
+ fn write_response(&self, f: &mut ResponseWriter); +} + +/// Writes nothing into the reply (empty response body) +pub struct ResponseNone; + +/// Sends "OK" as the response +pub struct ResponseOk; + +/// Response with serialized `T` as plain data +pub struct ResponseAsPlain(pub T); +/// Response with serialized `T` as GDB hex +pub struct ResponseAsHex(pub T); +/// Response with serialized `T` as GDB binary +pub struct ResponseAsBinary(pub T); + +impl WriteResponse for ResponseNone { + fn write_response(&self, _f: &mut ResponseWriter) {} +} + +impl WriteResponse for ResponseOk { + fn write_response(&self, f: &mut ResponseWriter) { + f.put_str("OK") + } +} + +impl WriteResponse for ! { + fn write_response(&self, f: &mut ResponseWriter) { + ResponseNone.write_response(f) + } +} + +impl WriteResponse for ResponseAsPlain +where + T: AsRef<[u8]>, +{ + fn write_response(&self, f: &mut ResponseWriter) { + f.put_slice(self.0.as_ref()) + } +} + +impl WriteResponse for ResponseAsHex +where + T: AsRef<[u8]>, +{ + fn write_response(&self, f: &mut ResponseWriter) { + f.put_hex_encoded(self.0.as_ref()) + } +} + +impl WriteResponse for ResponseAsBinary +where + T: AsRef<[u8]>, +{ + fn write_response(&self, f: &mut ResponseWriter) { + f.put_binary_encoded(self.0.as_ref()) + } +} + +impl WriteResponse for Result +where + T: WriteResponse, +{ + fn write_response(&self, f: &mut ResponseWriter) { + match self { + Ok(resp) => { + resp.write_response(f); + } + Err(errno) => { + f.put_str("E"); + f.put_num(errno.into_raw()); + } + } + } +} + +impl WriteResponse for Result +where + T: WriteResponse, +{ + fn write_response(&self, f: &mut ResponseWriter) { + match self { + Ok(resp) => { + resp.write_response(f); + } + Err(err) => match err { + TraceError::Errno(errno) => { + f.put_str("E"); + f.put_num(errno.into_raw()); + } + TraceError::Died(_) => f.put_str("E03"), + }, + } + } +} + +impl WriteResponse for Result +where + T: WriteResponse, +{ + fn write_response(&self, f: &mut ResponseWriter) { + match self { + Ok(resp) =>
{ + resp.write_response(f); + } + Err(err) => match err { + Error::TraceError(TraceError::Errno(errno)) => { + f.put_str("E"); + f.put_num(errno.into_raw()); + } + _ => f.put_str("E22"), + }, + } + } +} + +#[derive(PartialEq, Eq, Debug)] +/// Response writer to be sent to remote client +pub struct ResponseWriter { + started: bool, + checksum: u8, + buf: BytesMut, +} + +impl ResponseWriter { + /// Creates a new ResponseWriter + pub fn new(mut tx_buf: BytesMut, no_ack_mode: bool) -> Self { + let mut buf = tx_buf.split(); + if !no_ack_mode { + buf.put_u8(b'+'); + } + Self { + started: false, + checksum: 0, + buf, + } + } + + fn put_u8(&mut self, byte: u8) { + if !self.started { + self.started = true; + self.buf.put_u8(b'$'); + } + + self.checksum = self.checksum.wrapping_add(byte); + self.buf.put_u8(byte); + } + + /// encode u8 as gdb hex + fn put_u8_hex(&mut self, byte: u8) { + for digit in [(byte & 0xf0) >> 4, byte & 0x0f] { + let c = match digit { + 0..=9 => b'0' + digit, + 10..=15 => b'a' + digit - 10, + _ => unreachable!(), + }; + self.put_u8(c); + } + } + + /// Write a slice over the connection. + pub fn put_slice(&mut self, s: &[u8]) { + s.iter().for_each(|c| self.put_u8(*c)) + } + + /// Write an entire string over the connection. + pub fn put_str(&mut self, s: &str) { + self.put_slice(s.as_bytes()) + } + + /// Write data as (gdb) hex string. + pub fn put_hex_encoded(&mut self, data: &[u8]) { + data.iter().for_each(|c| self.put_u8_hex(*c)); + } + + /// Write data using the binary protocol. + pub fn put_binary_encoded(&mut self, data: &[u8]) { + for &b in data.iter() { + match b { + b'#' | b'$' | b'}' | b'*' => { + self.put_u8(b'}'); + self.put_u8(b ^ 0x20) + } + _ => self.put_u8(b), + } + } + } + + /// Write a number as a big-endian hex string using the most compact + /// representation possible (i.e: trimming leading zeros). 
+ pub fn put_num + PrimInt>(&mut self, digit: I) { + if digit.is_zero() { + return self.put_u8_hex(0); + } + + let mut buf = [0; 16]; + let mut k = 16; // one past the last slot; decremented before each store so a 16-byte value cannot underflow `k` + let mut x = digit; + + while !x.is_zero() { + k -= 1; + buf[k] = (x.as_() & 0xffu64) as u8; + x = x.unsigned_shr(8); + } + + self.put_hex_encoded(&buf[k..]); + } + + /// Consumes self, writing out buffer and the final '#' and checksum + pub fn finish(mut self) -> Bytes { + // don't include the '#' in checksum calculation + let checksum = self.checksum; + + // empty response + if !self.started { + self.started = true; + self.buf.put_u8(b'$'); + } + + tracing::trace!("{:?}", PacketLogger::outgoing(&self.buf, checksum)); + + self.buf.put_u8(b'#'); + self.put_u8_hex(checksum); + self.buf.freeze() + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn response_plain_string() { + let mut left = ResponseWriter::new(BytesMut::new(), true); + let mut right = ResponseWriter::new(BytesMut::new(), true); + left.put_str("just a test"); + ResponseAsPlain("just a test").write_response(&mut right); + assert_eq!(left, right); + } +} diff --git a/reverie-ptrace/src/gdbstub/server.rs b/reverie-ptrace/src/gdbstub/server.rs new file mode 100644 index 0000000..5b54b2d --- /dev/null +++ b/reverie-ptrace/src/gdbstub/server.rs @@ -0,0 +1,265 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree.
+ */ + +use bytes::BytesMut; +use futures::future; +use std::io; +use std::net::SocketAddr; +use std::path::Path; +use tokio::io::AsyncRead; +use tokio::io::AsyncReadExt; +use tokio::net::TcpListener; +use tokio::net::TcpStream; +use tokio::net::UnixListener; +use tokio::net::UnixStream; +use tokio::sync::mpsc; +use tokio::sync::oneshot; + +use super::error::Error; +use super::inferior::StoppedInferior; +use super::packet::Packet; +use super::session::Session; + +/// GdbServer controller +pub struct GdbServer { + /// Signal gdbserver to start. + pub server_tx: Option>, + /// Signal gdbserver the very first tracee is ready. + pub inferior_attached_tx: Option>, + /// FIXME: the tracees are serialized already, tell gdbserver not to + /// serialize by its own. + pub sequentialized_guest: bool, +} + +impl GdbServer { + /// Creates a GDB server and binds to the given address. + /// + /// NOTE: The canonical GDB server port is `1234`. + pub async fn from_addr(addr: SocketAddr) -> Result { + let (inferior_attached_tx, inferior_attached_rx) = mpsc::channel(1); + let (server_tx, server_rx) = oneshot::channel(); + + let server = GdbServerImpl::from_addr(addr, server_rx, inferior_attached_rx).await?; + tokio::task::spawn(async move { + if let Err(err) = server.run().await { + tracing::error!("Failed to run gdbserver: {:?}", err); + } + }); + Ok(Self { + server_tx: Some(server_tx), + inferior_attached_tx: Some(inferior_attached_tx), + sequentialized_guest: false, + }) + } + + /// Creates a GDB server from the given unix domain socket. This is useful + /// when we know there will only be one client and want to avoid binding to a + /// port. 
+ pub async fn from_path(path: &Path) -> Result { + let (inferior_attached_tx, inferior_attached_rx) = mpsc::channel(1); + let (server_tx, server_rx) = oneshot::channel(); + + let server = GdbServerImpl::from_path(path, server_rx, inferior_attached_rx).await?; + tokio::task::spawn(async move { + if let Err(err) = server.run().await { + tracing::error!("Failed to run gdbserver: {:?}", err); + } + }); + Ok(Self { + server_tx: Some(server_tx), + inferior_attached_tx: Some(inferior_attached_tx), + sequentialized_guest: false, + }) + } + + pub fn sequentialized_guest(&mut self) -> &mut Self { + self.sequentialized_guest = true; + self + } + + #[allow(unused)] + pub async fn notify_start(&mut self) -> Result<(), Error> { + if let Some(tx) = self.server_tx.take() { + tx.send(()).map_err(|_| Error::GdbServerNotStarted) + } else { + Ok(()) + } + } + + #[allow(unused)] + pub async fn notify_gdb_stop(&mut self, stopped: StoppedInferior) -> Result<(), Error> { + if let Some(tx) = self.inferior_attached_tx.take() { + tx.send(stopped) + .await + .map_err(|_| Error::GdbServerSendPacketError) + } else { + Ok(()) + } + } +} + +struct GdbServerImpl { + reader: Box, + pkt_tx: mpsc::Sender, + server_rx: Option>, + session: Option, +} + +/// Binds to the given address and waits for an incoming connection. +async fn wait_for_tcp_connection(addr: SocketAddr) -> io::Result { + // NOTE: `tokio::net::TcpListener::bind` is not used here on purpose. It + // spawns an additional tokio worker thread. We want to avoid an extra + // thread here since it could perturb the deterministic allocation of PIDs. + // Using `std::net::TcpListener::bind` appears to avoid spawning an extra + // tokio worker thread. 
+ let listener = std::net::TcpListener::bind(addr)?; + listener.set_nonblocking(true)?; + let listener = TcpListener::from_std(listener)?; + + let (stream, client_addr) = listener.accept().await?; + + tracing::info!("Accepting client connection: {:?}", client_addr); + + Ok(stream) +} + +/// Binds to the given socket path and waits for an incoming connection. +async fn wait_for_unix_connection(path: &Path) -> io::Result { + let listener = UnixListener::bind(path)?; + + let (stream, client_addr) = listener.accept().await?; + + tracing::info!("Accepting client connection: {:?}", client_addr); + + Ok(stream) +} + +// NB: during handshake, gdb may send packet prefixed with `+' (Ack), or send +// `+' then the actual packet (send two times). Since Ack is also a valid packet +// This may cause confusion to Packet::try_from(), since it tries to decode one +// packet at a time. +enum PacketWithAck { + // Just a packet, note `+' only is considered to be `JustPacket'. + JustPacket(Packet), + // `+' (Ack) followed by a packet, such as `+StartNoAckMode'. + WithAck(Packet), +} + +const PACKET_BUFFER_CAPACITY: usize = 0x8000; + +impl GdbServerImpl { + /// Creates a new gdbserver, by accepting remote connection at `addr`. + async fn from_addr( + addr: SocketAddr, + server_rx: oneshot::Receiver<()>, + inferior_attached_rx: mpsc::Receiver, + ) -> Result { + let stream = wait_for_tcp_connection(addr) + .await + .map_err(|source| Error::WaitForGdbConnect { source })?; + let (reader, writer) = stream.into_split(); + + let (tx, rx) = mpsc::channel(1); + // create a gdb session. + let session = Session::new(Box::new(writer), rx, inferior_attached_rx); + + Ok(GdbServerImpl { + reader: Box::new(reader), + pkt_tx: tx, + server_rx: Some(server_rx), + session: Some(session), + }) + } + + /// Creates a GDB server and listens on the given unix domain socket. 
+ async fn from_path( + path: &Path, + server_rx: oneshot::Receiver<()>, + inferior_attached_rx: mpsc::Receiver, + ) -> Result { + let stream = wait_for_unix_connection(path) + .await + .map_err(|source| Error::WaitForGdbConnect { source })?; + + let (reader, writer) = stream.into_split(); + let (tx, rx) = mpsc::channel(1); + + // Create a gdb session. + let session = Session::new(Box::new(writer), rx, inferior_attached_rx); + + Ok(GdbServerImpl { + reader: Box::new(reader), + pkt_tx: tx, + server_rx: Some(server_rx), + session: Some(session), + }) + } + + async fn recv_packet(&mut self) -> Result { + let mut rx_buf = BytesMut::with_capacity(PACKET_BUFFER_CAPACITY); + self.reader + .read_buf(&mut rx_buf) + .await + .map_err(|_| Error::ConnReset)?; + + // packet to follow, such as `+StartNoAckMode`. + Ok(if rx_buf.starts_with(b"+") && rx_buf.len() > 1 { + PacketWithAck::WithAck(Packet::new(rx_buf.split_off(1))?) + } else { + PacketWithAck::JustPacket(Packet::new(rx_buf.split())?) + }) + } + + async fn send_packet(&mut self, packet: Packet) -> Result<(), Error> { + self.pkt_tx + .send(packet) + .await + .map_err(|_| Error::GdbServerSendPacketError) + } + + async fn relay_gdb_packets(&mut self) -> Result<(), Error> { + while let Ok(pkt) = self.recv_packet().await { + match pkt { + PacketWithAck::JustPacket(pkt) => { + self.send_packet(Packet::Ack).await?; + self.send_packet(pkt).await?; + } + PacketWithAck::WithAck(pkt) => self.send_packet(pkt).await?, + } + } + + // remote client closed connection. + Ok(()) + } + + /// Run gdbserver. + /// + /// The gdbserver can run in a separate tokio thread pool. + /// + /// ```no_compile + /// let gdbserver = GdbServer::new(..).await?; + /// let handle = tokio::task::spawn(gdbserver.run()); + /// // snip + /// handle.await?? + /// ``` + async fn run(mut self) -> Result<(), Error> { + // NB: waiting for initial request to start gdb server. 
This is + // required because if gdbserver is started too soon, gdb (client) + // could get timeout. Some requests such as `g' needs IPC with a + // gdb session, which only becomes ready later. + if let Some(server_rx) = self.server_rx.take() { + let _ = server_rx.await.map_err(|_| Error::GdbServerNotStarted)?; + let mut session = self.session.take().ok_or(Error::SessionNotStarted)?; + let run_session = session.run(); + let run_loop = self.relay_gdb_packets(); + future::try_join(run_session, run_loop).await?; + } + Ok(()) + } +} diff --git a/reverie-ptrace/src/gdbstub/session.rs b/reverie-ptrace/src/gdbstub/session.rs new file mode 100644 index 0000000..f7de5a2 --- /dev/null +++ b/reverie-ptrace/src/gdbstub/session.rs @@ -0,0 +1,850 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use bytes::{Bytes, BytesMut}; +use futures::future::Future; +use futures::stream::{FuturesUnordered, StreamExt}; +use nix::fcntl; +use nix::fcntl::OFlag; +use nix::sys::signal::Signal; +use nix::sys::stat::{self, Mode}; +use nix::sys::uio; +use nix::unistd; +use reverie::Pid; +use std::sync::Arc; +use tokio::io::AsyncWrite; +use tokio::io::AsyncWriteExt; +use tokio::sync::mpsc; +use tokio::sync::oneshot; +use tokio::sync::MappedMutexGuard; +use tokio::sync::Mutex; +use tokio::sync::MutexGuard; + +use crate::trace::ChildOp; + +use super::commands::{self, *}; +use super::regs::Amd64CoreRegs; +use super::response::*; +use super::Breakpoint; +use super::BreakpointType; +use super::Error; +use super::GdbRequest; +use super::Inferior; +use super::InferiorThreadId; +use super::Packet; +use super::ResumeInferior; +use super::StoppedInferior; + +use std::collections::BTreeMap; + +type BoxWriter = Box; + +/// Gdb session manager. 
+/// recv commands over tcp stream +/// recv request from Tracee (new task, reap orphans, etc..) +/// Session ends when client disconnects from tcp stream. +/// (gdb) detach semantics? +pub struct Session { + /// No-ACK mode, set by gdb client. + pub no_ack_mode: bool, + + /// Stream to send reply to. + pub stream_tx: BoxWriter, + + /// buffer used to send data over to tcp stream + pub tx_buf: BytesMut, + + /// Gdb remote protocol command notifier. + pub pkt_rx: Option>, + + /// buffer size used by hostio. + pub bufsiz: usize, + + /// Current pid used by vFile (hostio). + pub hostio_pid: Option, + + /// Inferiors managed by this session. + pub inferiors: Arc>>, + + /// Current thread + pub current: Option, + + /// Channel to report stop event. + // NB: even though we could use a single `gdb_stop_rx` to receive all + // stop events (mpsc), we use a `stop_rx` channel for each `inferior` + // instead. This is because `vCont;p:-1` could resume multiple + // threads, hence there could be multiple threads reporting a stop + // event at the same time, causing de-sync issues. This can be mitigated + // if each inferior has its own `stop_rx` channel. As a result, + // `gdb_stop_rx` is moved after initial gdb attach, once we can create + // the first inferior. + pub gdb_stop_rx: Option>, +} + +struct VcontResumeResult { + /// stop reason + reason: StopReason, + /// A new inferior was created. + new_inferior: Option, + /// ptids that must be removed because their tasks have exited. + ptid_to_remove: Vec, + /// Switch to a new task + // + // NB: This is possible when supporting multi-threaded programs. See + // the examples below.
+ // + // Sending packet: $vCont;c:p2.-1#10...Packet received: T05create:p02.12;06:70d9ffffff7f0000;07:28d9ffffff7f0000;10:1138eef7ff7f0000;thread:p02.02; + // Sending packet: $vCont;c:p2.-1#10...Packet received: T05swbreak:;06:201e5cf5ff7f0000;07:f01d5cf5ff7f0000;10:0e14400000000000;thread:p02.10; + // Sending packet: $qfThreadInfo#bb...Packet received: mp02.02,p02.06,p02.08,p02.0a,p02.0c,p02.0e,p02.10,p02.12, + // Sending packet: $qsThreadInfo#c8...Packet received: l + // [New Thread 2.18] + // [Switching to Thread 2.16] + // Sending packet: $z0,40140e,1#91...Packet received: OK + // Sending packet: $z0,7ffff7fe3340,1#ce...Packet received: OK + // + // Even though gdb (client) said `[Switching to Thread 2.16]`, No packets + // was sent to the server side (such as `Hgp2.16`) hence the server side + // was completely unaware of the switching. Presumably gdb (client) + // assumed *any* thread in the same process group and read/write memory, + // but it is not necessarily true for us because we use different + // channels to communicate between gdbstub <-> reverie. As a result + // we switch current to thread `switch_to`, to simulate gdb's (client) + // (mis)behavior. + switch_to: Option, +} + +enum HandleVcontResume { + /// vCont resume not handled, this is possible because vCont can encode + /// multiple actions, only the left-most action is used if it matches + /// a given ptid. + NotHandled, + /// vCont matches a `ptid`. + Handled(VcontResumeResult), +} + +impl Session { + /// Create a new session from root task. + pub fn new( + stream_tx: BoxWriter, + pkt_rx: mpsc::Receiver, + gdb_stop_rx: mpsc::Receiver, + ) -> Self { + Session { + no_ack_mode: false, + stream_tx, + tx_buf: BytesMut::with_capacity(0x8000), + pkt_rx: Some(pkt_rx), + hostio_pid: None, + bufsiz: 0x8000, + inferiors: Arc::new(Mutex::new(BTreeMap::new())), + current: None, + gdb_stop_rx: Some(gdb_stop_rx), + } + } + + /// Get current inferior. GDB can select current inferior by `Hg`. 
+ async fn with_inferior<'a, F, Fut>(&'a self, threadid: ThreadId, f: F) -> Fut::Output + where + F: FnOnce(MappedMutexGuard<'a, Inferior>) -> Fut + 'a, + Fut: Future + 'a, + { + let tid = threadid + .gettid() + .unwrap_or_else(|| threadid.getpid().unwrap()); + let inferiors = self.inferiors.lock().await; + let inferior = MutexGuard::map(inferiors, |inferiors| inferiors.get_mut(&tid).unwrap()); + f(inferior).await + } + + /// Get current inferior. GDB can select current inferior by `Hg`. + async fn with_current_inferior<'a, F, Fut>(&'a self, f: F) -> Fut::Output + where + F: FnOnce(MappedMutexGuard<'a, Inferior>) -> Fut + 'a, + Fut: Future + 'a, + { + let threadid: ThreadId = self.current.unwrap().into(); + self.with_inferior(threadid, f).await + } + + /// create a new response writer + fn response(&self, mut tx: BytesMut) -> ResponseWriter { + ResponseWriter::new(tx.split(), self.no_ack_mode) + } + + /// Detach or Kill all threads matching `threadid`. + async fn detach_or_kill(&self, threadid: ThreadId, kill: bool) -> Result<(), Error> { + let mut inferiors = self.inferiors.lock().await; + let resume = ResumeInferior { + action: if kill { + ResumeAction::Continue(Some(Signal::SIGKILL)) + } else { + ResumeAction::Continue(None) + }, + detach: true, + }; + for (_, inferior) in inferiors.iter_mut() { + if inferior.matches(&threadid) { + inferior.notify_resume(resume).await?; + } + } + inferiors.retain(|_, inferior| !inferior.matches(&threadid)); + Ok(()) + } + + /// handle vCont resume + async fn vcont_resume( + &self, + threadid: ThreadId, + resume: ResumeInferior, + ) -> Result { + let mut inferiors_to_resume: Vec<&mut Inferior> = Vec::new(); + let mut inferiors = self.inferiors.lock().await; + + match threadid.tid { + // vCont a specific ptid, such as $vCont;c:p2.2#.. 
+ IdKind::Id(tid) => { + let inferior = inferiors.get_mut(&tid).ok_or(Error::UnknownThread(tid))?; + inferiors_to_resume.push(inferior); + } + // Invalid vCont + IdKind::Any => { + return Err(Error::ThreadIdNotSpecified); + } + // vCont all threads, such as $vCont;c:p2.-1#10 + IdKind::All => match threadid.pid { + IdKind::Id(pid) => { + for (_, inferior) in inferiors.iter_mut() { + if inferior.getpid() == pid { + inferiors_to_resume.push(inferior); + } + } + } + _ => return Err(Error::ThreadIdNotSpecified), + }, + } + + if inferiors_to_resume.is_empty() { + return Ok(HandleVcontResume::NotHandled); + } + + let mut new_inferior: Option = None; + let mut ptid_to_remove: Vec = Vec::new(); + let mut switch_to: Option = None; + let mut inferiors_to_wait = FuturesUnordered::new(); + + for inferior in inferiors_to_resume { + inferior.notify_resume(resume).await?; + inferiors_to_wait.push(inferior.wait_for_stop()); + } + + let mut reason: Option = None; + while let Some(stop_reason) = inferiors_to_wait.next().await { + let mut stop_reason = stop_reason?; + match &mut stop_reason { + StopReason::ThreadExited(pid, tgid, _exit_status) => { + ptid_to_remove.push(ThreadId::pid_tid(tgid.as_raw(), pid.as_raw())); + // The thread exit event `w XX; ptid is not reported + continue; + } + StopReason::Exited(pid, _exit_staus) => { + ptid_to_remove.push(ThreadId::pid(pid.as_raw())); + } + StopReason::NewTask(new_task) => { + new_inferior = Some(match new_task.op { + ChildOp::Fork => Inferior { + id: InferiorThreadId::new(new_task.child, new_task.child), + resume_tx: new_task.resume_tx.take(), + request_tx: new_task.request_tx.take(), + stop_rx: new_task.stop_rx.take(), + resume_pending: false, + }, + ChildOp::Vfork => Inferior { + id: InferiorThreadId::new(new_task.child, new_task.child), + resume_tx: new_task.resume_tx.take(), + request_tx: new_task.request_tx.take(), + stop_rx: new_task.stop_rx.take(), + resume_pending: false, + }, + ChildOp::Clone => Inferior { + id: 
InferiorThreadId::new(new_task.child, new_task.tgid),
                        resume_tx: new_task.resume_tx.take(),
                        request_tx: new_task.request_tx.take(),
                        stop_rx: new_task.stop_rx.take(),
                        resume_pending: false,
                    },
                });
            }

            StopReason::Stopped(stopped) => {
                switch_to = Some(InferiorThreadId::new(stopped.pid, stopped.tgid));
            }
        }
        reason = Some(stop_reason);
        break;
    }
    Ok(HandleVcontResume::Handled(VcontResumeResult {
        reason: reason.unwrap(),
        new_inferior,
        ptid_to_remove,
        switch_to,
    }))
}

/// Handle a gdb remote base command.
///
/// Writes the reply for `cmd` into `writer`. Arms that write nothing produce
/// an empty reply.
async fn handle_base(&mut self, cmd: Base, writer: &mut ResponseWriter) -> Result<(), Error> {
    match cmd {
        Base::QuestionMark(_) => {
            writer.put_str("S05");
        }
        Base::QStartNoAckMode(_) => {
            self.no_ack_mode = true;
            writer.put_str("OK");
        }
        Base::qSupported(_) => {
            writer.put_str("PacketSize=8000;vContSupported+;multiprocess+;exec-events+;fork-events+;vfork-events+;QThreadEvents+;QStartNoAckMode+;swbreak+;qXfer:features:read+;qXfer:auxv:read+;");
        }
        Base::qXfer(request) => match request {
            qXfer::FeaturesRead { offset: _, len: _ } => {
                // gdb/64bit-sse.xml
                // The reply is `l` (last chunk) followed by the target
                // description XML naming the architecture.
                writer.put_str(
                    "l<target version=\"1.0\"><architecture>i386:x86-64</architecture></target>",
                );
            }
            qXfer::AuxvRead { offset, len } => {
                if let Some(id) = self.current {
                    let buffer_size = std::cmp::min(self.bufsiz, len);
                    let mut auxv: Vec<u8> = vec![0; buffer_size];
                    if let Ok(nb) = fcntl::open(
                        format!("/proc/{}/auxv", id.pid).as_str(),
                        OFlag::O_RDONLY,
                        Mode::from_bits_truncate(0o644),
                    )
                    .and_then(|fd| {
                        // Close the fd whether or not the read succeeded, so
                        // a failed `pread` does not leak the descriptor.
                        let nb = uio::pread(fd, &mut auxv, offset as libc::off_t);
                        let _ = unistd::close(fd);
                        nb
                    }) {
                        writer.put_str("l");
                        writer.put_binary_encoded(&auxv[..nb]);
                    }
                }
            }
        },
        Base::qfThreadInfo(_) => {
            writer.put_str("m");
            for task in self.inferiors.lock().await.values() {
                let threadid: ThreadId = task.id.into();
                threadid.write_response(writer);
                writer.put_str(",");
            }
        }
        Base::qsThreadInfo(_) => {
            writer.put_str("l");
        }
        Base::qAttached(_pid) => {
writer.put_str("0"); + } + Base::QThreadEvents(_thread_events) => { + // NB: This should toggle reporting thread event, such as + // `T05Create`, but I couldn't find any examples even with + // vanilla gdbserver debugging threaded programs. gdb client + // never send this command, even after I tried to run + // `set remote thread-events on`, as described in gdb remote + // protocol doc. + writer.put_str("OK"); + } + Base::qC(_) => { + if let Some(id) = self.current { + let thread_id: ThreadId = id.into(); + writer.put_str("QC"); + thread_id.write_response(writer); + } + } + Base::H(h) => { + match h.op { + ThreadOp::g => { + // qeury or set current threadid. + if h.id.pid == IdKind::Any && h.id.tid == IdKind::Any { + ResponseOk.write_response(writer); + } else { + h.id.try_into() + .map(|id| { + self.current = Some(id); + ResponseOk + }) + .write_response(writer) + } + } + _ => { + // Hc is deprecated, others not supported. + writer.put_str("E01"); + } + } + } + Base::g(_) => self + .read_registers() + .await + .map(ResponseAsHex) + .write_response(writer), + Base::G(regs) => self + .write_registers(regs.vals) + .await + .map(|_| ResponseOk) + .write_response(writer), + Base::m(m) => self + .read_inferior_memory(m.addr, m.length) + .await + .map(ResponseAsHex) + .write_response(writer), + Base::M(mem) => self + .write_inferior_memory(mem.addr, mem.length, mem.vals) + .await + .map(|_| ResponseOk) + .write_response(writer), + Base::X(mem) => self + .write_inferior_memory(mem.addr, mem.length, mem.vals) + .await + .map(|_| ResponseOk) + .write_response(writer), + // NB: detach is a resume, but we don't care about receiving + // further (gdb) stop events. 
+ Base::D(pid) => { + let pid = pid.pid; + let threadid = pid.map_or_else(ThreadId::all, |pid| ThreadId::pid(pid.as_raw())); + self.detach_or_kill(threadid, false) + .await + .map(|_| ResponseOk) + .write_response(writer); + } + Base::z(bkpt) => { + if bkpt.ty == BreakpointType::Software { + let bkpt = Breakpoint { + ty: BreakpointType::Software, + addr: bkpt.addr, + bytecode: None, + }; + self.remove_breakpoint(bkpt) + .await + .map(|_| ResponseOk) + .write_response(writer); + } + } + Base::Z(bkpt) => { + if bkpt.ty == BreakpointType::Software { + let bkpt = Breakpoint { + ty: BreakpointType::Software, + addr: bkpt.addr, + bytecode: None, + }; + self.set_breakpoint(bkpt) + .await + .map(|_| ResponseOk) + .write_response(writer); + } + } + // NB: kill is a resume(SIGKILL), but we don't care about + // receiving further (gdb) stop events. + Base::vKill(pid) => { + let threadid = ThreadId::pid(pid.pid.as_raw()); + self.detach_or_kill(threadid, true) + .await + .map(|_| ResponseOk) + .write_response(writer); + } + Base::vCont(vcont) => match vcont { + vCont::Query => { + writer.put_str("vCont;c;C;s;S"); + } + vCont::Actions(actions) => { + // `vCont` can encode multiple actions, but we should + // resume only one matching ptid only (left-most). + while let Some((action, threadid)) = actions.first() { + let resume = match action { + ResumeAction::Step(step) => ResumeInferior { + action: ResumeAction::Step(*step), + detach: false, + }, + ResumeAction::Continue(cont) => ResumeInferior { + action: ResumeAction::Continue(*cont), + detach: false, + }, + not_supported => { + // Shouldn't reach here because only `c;C;s:S` are advertised. + panic!("Unsupported vCont command: {:?}", not_supported); + } + }; + match self.vcont_resume(*threadid, resume).await? 
{ + HandleVcontResume::NotHandled => {} + HandleVcontResume::Handled(VcontResumeResult { + reason, + new_inferior, + ptid_to_remove, + switch_to, + }) => { + let mut inferiors = self.inferiors.lock().await; + for ptid in ptid_to_remove { + if let Some(tid) = ptid.gettid() { + let _ = inferiors.remove(&tid); + } else { + inferiors.retain(|_, inferior| !inferior.matches(&ptid)); + } + } + if let Some(new_inferior) = new_inferior { + inferiors.insert(new_inferior.gettid(), new_inferior); + } + if let Some(switch_to) = switch_to { + self.current = Some(switch_to); + } + reason.write_response(writer); + break; + } + } + } + } + }, + // TODO T92309086: implement ACL for hostio. + Base::vFile(hostio) => match hostio { + vFile::Setfs(pid) => { + match pid { + Some(pid) => { + self.hostio_pid = Some(Pid::from_raw(pid)); + } + None => { + self.hostio_pid = self.current.as_ref().map(|x| x.pid); + } + } + writer.put_str("F0"); + } + vFile::Open(fname, flags, mode) => { + let oflag = OFlag::from_bits_truncate(flags); + let mode = Mode::from_bits_truncate(mode); + writer.put_str("F"); + match fcntl::open(&fname, oflag, mode) { + Ok(fd) => writer.put_num(fd), + Err(_) => writer.put_str("-1"), + } + } + vFile::Close(fd) => { + writer.put_str(unistd::close(fd).map_or("F-1", |_| "F0")); + } + vFile::Pread(fd, count, offset) => { + let count = std::cmp::min(count as usize, self.bufsiz); + let mut buf: Vec = vec![0; count]; + match uio::pread(fd, &mut buf, offset as i64) { + Ok(nb) => { + writer.put_str("F"); + writer.put_num(nb); + writer.put_str(";"); + writer.put_binary_encoded(&buf[..nb]); + } + Err(_) => { + writer.put_str("F-1"); + } + } + } + vFile::Pwrite(fd, offset, data) => match uio::pwrite(fd, &data, offset as i64) { + Ok(nb) => { + writer.put_str("F"); + writer.put_num(nb); + } + Err(_) => { + writer.put_str("F-1"); + } + }, + vFile::Unlink(fname) => { + writer.put_str(unistd::unlink(&fname).map_or("F-1", |_| "F0")); + } + vFile::Readlink(fname) => { + match 
fcntl::readlink(&fname) + .ok() + .and_then(|s| s.to_str().map(|s| s.as_bytes().to_vec())) + { + Some(bytes) => { + writer.put_str("F"); + writer.put_num(bytes.len()); + writer.put_str(";"); + writer.put_binary_encoded(&bytes) + } + None => { + writer.put_str("F-1"); + } + } + } + vFile::Fstat(fd) => { + // NB: HostioStat is not the same as FileStat. + const STAT_SIZE: usize = std::mem::size_of::(); + match stat::fstat(fd).ok().map(|st| { + let st: HostioStat = st.into(); + let bytes: [u8; STAT_SIZE] = unsafe { std::mem::transmute(st) }; + bytes + }) { + Some(bytes) => { + writer.put_str("F"); + writer.put_num(STAT_SIZE); + writer.put_str(";"); + writer.put_binary_encoded(&bytes); + } + None => { + writer.put_str("F-1"); + } + } + } + }, + } + Ok(()) + } + + /// handle gdb remote extended mode command + async fn handle_extended_mode( + &mut self, + cmd: ExtendedMode, + writer: &mut ResponseWriter, + ) -> Result<(), Error> { + match cmd { + ExtendedMode::ExclamationMark(_) => { + writer.put_str("OK"); + } + ExtendedMode::QDisableRandomization(disable_aslr) => { + // ASLR is always disabled by reverie. 
+ if disable_aslr.val { + writer.put_str("OK"); + } else { + writer.put_str("E22"); + } + } + } + Ok(()) + } + + /// handle gdb remote monitor command + async fn handle_monitor_cmd( + &mut self, + cmd: MonitorCmd, + _writer: &mut ResponseWriter, + ) -> Result<(), Error> { + match cmd { + MonitorCmd::qRcmd(_) => { + unimplemented!() + } + } + } + + /// handle gdb remote section offset command + async fn handle_section_offsets( + &mut self, + cmd: SectionOffsets, + writer: &mut ResponseWriter, + ) -> Result<(), Error> { + match cmd { + // should use libraries-svr4:read instead + SectionOffsets::qOffsets(_) => { + writer.put_str(""); + } + } + Ok(()) + } + + /// handle gdb remote command + async fn handle_command( + &mut self, + cmd: commands::Command, + resp: BytesMut, + ) -> Result { + let mut writer = self.response(resp); + match cmd { + Command::Unknown(cmd) => { + tracing::info!("Unknown command: {:?}", cmd); + } + Command::Base(cmd) => self.handle_base(cmd, &mut writer).await?, + Command::ExtendedMode(cmd) => self.handle_extended_mode(cmd, &mut writer).await?, + Command::MonitorCmd(cmd) => self.handle_monitor_cmd(cmd, &mut writer).await?, + Command::SectionOffsets(cmd) => self.handle_section_offsets(cmd, &mut writer).await?, + }; + Ok(writer.finish()) + } + + /// Handle incoming request sent over tcp stream + pub async fn run(&mut self) -> Result<(), Error> { + let cmd_rx = self.pkt_rx.take().unwrap(); + + let mut gdb_stop_rx = self.gdb_stop_rx.take().ok_or(Error::Detached)?; + let stop_reason = gdb_stop_rx.recv().await.ok_or(Error::Detached)?; + + // set initial task as current attached task. 
+ match stop_reason.reason { + StopReason::Stopped(stopped) => { + let id = InferiorThreadId::new(stopped.pid, stopped.tgid); + self.current = Some(id); + let mut inferior = Inferior::new(id); + inferior.request_tx = Some(stop_reason.request_tx); + inferior.resume_tx = Some(stop_reason.resume_tx); + inferior.stop_rx = Some(gdb_stop_rx); + self.inferiors.lock().await.insert(id.tid, inferior); + } + _ => unreachable!(), + } + + self.handle_gdb_commands(cmd_rx).await + } + + async fn handle_gdb_commands( + &mut self, + mut cmd_rx: mpsc::Receiver, + ) -> Result<(), Error> { + let mut tx_buf = BytesMut::with_capacity(0x8000); + + while let Some(pkt) = cmd_rx.recv().await { + match pkt { + Packet::Ack => {} + Packet::Nack => { + panic!("client send Nack") + } + // handle interrupt + Packet::Interrupt => {} + Packet::Command(cmd) => { + tx_buf.clear(); + let resp = self.handle_command(cmd, tx_buf.clone()).await?; + self.stream_tx.write_all(&resp).await.unwrap(); + } + } + } + Ok(()) + } + + /// Set a breakpoint. must have an active inferior. 
+ async fn set_breakpoint(&self, bkpt: Breakpoint) -> Result<(), Error> { + self.with_current_inferior(async move |inferior| { + let request_tx = inferior + .request_tx + .as_ref() + .ok_or(Error::SessionNotStarted)?; + let (reply_tx, reply_rx) = oneshot::channel(); + let request = GdbRequest::SetBreakpoint( + Breakpoint { + ty: BreakpointType::Software, + addr: bkpt.addr, + bytecode: None, + }, + reply_tx, + ); + let _ = request_tx + .send(request) + .await + .map_err(|_| Error::GdbRequestSendError)?; + let reply = reply_rx.await.map_err(|_| Error::GdbRequestRecvError)??; + Ok(reply) + }) + .await + } + + async fn remove_breakpoint(&self, bkpt: Breakpoint) -> Result<(), Error> { + self.with_current_inferior(async move |inferior| { + let request_tx = inferior + .request_tx + .as_ref() + .ok_or(Error::SessionNotStarted)?; + let (reply_tx, reply_rx) = oneshot::channel(); + let request = GdbRequest::RemoveBreakpoint( + Breakpoint { + ty: BreakpointType::Software, + addr: bkpt.addr, + bytecode: None, + }, + reply_tx, + ); + request_tx + .send(request) + .await + .map_err(|_| Error::GdbRequestSendError)?; + let reply = reply_rx.await.map_err(|_| Error::GdbRequestRecvError)??; + + Ok(reply) + }) + .await + } + + async fn read_inferior_memory(&self, addr: u64, size: usize) -> Result, Error> { + self.with_current_inferior(async move |inferior| { + let request_tx = inferior + .request_tx + .as_ref() + .ok_or(Error::SessionNotStarted)?; + let (reply_tx, reply_rx) = oneshot::channel(); + let request = GdbRequest::ReadInferiorMemory(addr, size, reply_tx); + let _ = request_tx + .send(request) + .await + .map_err(|_| Error::GdbRequestSendError)?; + let reply = reply_rx.await.map_err(|_| Error::GdbRequestRecvError)??; + Ok(reply) + }) + .await + } + + async fn write_inferior_memory( + &self, + addr: u64, + size: usize, + data: Vec, + ) -> Result<(), Error> { + let data = data.clone(); + self.with_current_inferior(async move |inferior| { + let request_tx = inferior + .request_tx 
+ .as_ref() + .ok_or(Error::SessionNotStarted)?; + let (reply_tx, reply_rx) = oneshot::channel(); + let request = GdbRequest::WriteInferiorMemory(addr, size, data, reply_tx); + let _ = request_tx + .send(request) + .await + .map_err(|_| Error::GdbRequestSendError)?; + let reply = reply_rx.await.map_err(|_| Error::GdbRequestRecvError)??; + Ok(reply) + }) + .await + } + + async fn read_registers(&self) -> Result { + self.with_current_inferior(async move |inferior| { + let request_tx = inferior + .request_tx + .as_ref() + .ok_or(Error::SessionNotStarted)?; + let (reply_tx, reply_rx) = oneshot::channel(); + let request = GdbRequest::ReadRegisters(reply_tx); + let _ = request_tx + .send(request) + .await + .map_err(|_| Error::GdbRequestSendError)?; + let reply = reply_rx.await.map_err(|_| Error::GdbRequestRecvError)??; + Ok(reply) + }) + .await + } + + async fn write_registers(&self, regs: Vec) -> Result<(), Error> { + self.with_current_inferior(async move |inferior| { + let regs = regs.as_slice(); + let request_tx = inferior + .request_tx + .as_ref() + .ok_or(Error::SessionNotStarted)?; + let (reply_tx, reply_rx) = oneshot::channel(); + let core_regs: Amd64CoreRegs = + bincode::deserialize(regs).map_err(|_| CommandParseError::MalformedRegisters)?; + let request = GdbRequest::WriteRegisters(core_regs, reply_tx); + let _ = request_tx + .send(request) + .await + .map_err(|_| Error::GdbRequestSendError)?; + let reply = reply_rx.await.map_err(|_| Error::GdbRequestRecvError)??; + Ok(reply) + }) + .await + } +} diff --git a/reverie-ptrace/src/lib.rs b/reverie-ptrace/src/lib.rs new file mode 100644 index 0000000..532ebe6 --- /dev/null +++ b/reverie-ptrace/src/lib.rs @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2018-2019, Trustees of Indiana University + * ("University Works" via Baojun Wang) + * Copyright (c) 2018-2019, Ryan Newton + * ("Traditional Works of Scholarship") + * Copyright (c) 2020-, Facebook, Inc. and its affiliates. + * + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Reverie ptrace backend. +//! +//! ptraced task implements `Guest` trait. +//! +//! `TracedTask` implements handlers for ptrace events including +//! seccomp. Notable ptrace events include: +//! +//! `PTRACE_EVENT_EXEC`: `execvpe` is about to return, tracee stopped +//! at entry point. +//! +//! `PTRACE_EVENT_FORK/VFORK/CLONE`: when `fork`/`vfork`/`clone` is about +//! to return +//! +//! `PTRACE_EVENT_SECCOMP`: seccomp stop caused by `RET_TRACE` +//! NB: we patch syscall in seccomp ptrace stop. +//! +//! `PTRACE_EVENT_EXIT`: process is about to exit +//! +//! signals: tracee's pending signal stop. +//! +#![deny(missing_docs)] +#![deny(rustdoc::broken_intra_doc_links)] +#![feature(async_closure)] +#![feature(internal_output_capture)] +#![feature(never_type)] +#![feature(llvm_asm)] +#![feature(map_first_last)] + +mod children; +mod cp; +#[allow(unused)] +mod debug; +mod error; +mod gdbstub; +mod perf; +mod stack; +mod task; +pub mod testing; +mod timer; +pub mod trace; +mod tracer; +mod vdso; + +pub use perf::is_perf_supported; +pub use tracer::spawn_fn; +pub use tracer::spawn_fn_with_config; +pub use tracer::GdbConnection; +pub use tracer::Tracer; +pub use tracer::TracerBuilder; + +#[macro_use] +extern crate bitflags; diff --git a/reverie-ptrace/src/perf.rs b/reverie-ptrace/src/perf.rs new file mode 100644 index 0000000..0c14eb8 --- /dev/null +++ b/reverie-ptrace/src/perf.rs @@ -0,0 +1,690 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Provides a more rustic interface to a minimal set of `perf` functionality. +//! +//! Explicitly missing (because they are unnecessary) perf features include: +//! * Grouping +//! 
* Sample type flags +//! * Reading any kind of sample events +//! * BPF +//! * Hardware breakpoints +//! +//! The arguments and behaviors in this module generally correspond exactly to +//! those of `perf_event_open(2)`. No attempts are made to paper over the +//! non-determinism/weirndess of `perf`. For example, counter increments are +//! dropped whenever an event fires on a running thread. +//! [`PerfCounter::DISABLE_SAMPLE_PERIOD`] can be used to avoid this for sampling. +//! events. + +use core::ptr::NonNull; +use lazy_static::lazy_static; +use nix::{ + sys::signal::Signal, + unistd::{sysconf, SysconfVar}, +}; +use perf_event_open_sys::{bindings as perf, ioctls}; +use reverie::Errno; +use reverie::Tid; +use tracing::info; + +#[allow(unused_imports)] // only used if we have an error +use std::compile_error; + +pub use perf::perf_event_header; + +// Not available in the libc crate +const F_SETOWN_EX: libc::c_int = 15; +const F_SETSIG: libc::c_int = 10; +const F_OWNER_TID: libc::c_int = 0; +#[repr(C)] +struct f_owner_ex { + pub type_: libc::c_int, + pub pid: libc::pid_t, +} + +/// An incomplete enumeration of events perf can monitor +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum Event { + #[allow(dead_code)] // used in tests + /// A perf-supported hardware event. + Hardware(HardwareEvent), + /// A perf-supported software event. + Software(SoftwareEvent), + /// A raw CPU event. The inner value will have a CPU-specific meaning. + Raw(u64), +} + +/// An incomplete enumeration of hardware events perf can monitor. +#[allow(dead_code)] // used in tests +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum HardwareEvent { + /// Count retired instructions. Can be affected by hardware interrupt counts. + Instructions, + /// Count retired branch instructions. + BranchInstructions, +} + +/// An incomplete enumeration of software events perf can monitor. 
+#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum SoftwareEvent { + /// A placeholder event that counts nothing. + Dummy, +} + +/// A perf counter with a very limited range of configurability. +/// Construct via [`Builder`]. +#[derive(Debug)] +pub struct PerfCounter { + fd: libc::c_int, + mmap: Option>, +} + +impl Event { + fn attr_type(self) -> u32 { + match self { + Event::Hardware(_) => perf::perf_type_id_PERF_TYPE_HARDWARE, + Event::Software(_) => perf::perf_type_id_PERF_TYPE_SOFTWARE, + Event::Raw(_) => perf::perf_type_id_PERF_TYPE_RAW, + } + } + + fn attr_config(self) -> u64 { + match self { + Event::Raw(x) => x, + Event::Hardware(HardwareEvent::Instructions) => { + perf::perf_hw_id_PERF_COUNT_HW_INSTRUCTIONS.into() + } + Event::Hardware(HardwareEvent::BranchInstructions) => { + perf::perf_hw_id_PERF_COUNT_HW_BRANCH_INSTRUCTIONS.into() + } + Event::Software(SoftwareEvent::Dummy) => perf::perf_sw_ids_PERF_COUNT_SW_DUMMY.into(), + } + } +} + +/// Builder for a PerfCounter. Contains only the subset of the attributes that +/// this API allows manipulating set to non-defaults. +#[derive(Debug, Clone)] +pub struct Builder { + pid: libc::pid_t, + cpu: libc::c_int, + evt: Event, + sample_period: u64, + precise_ip: u32, + fast_reads: bool, +} + +impl Builder { + /// Initialize the builder. The initial configuration is for a software + /// counting event that never increments. + /// + /// `pid` accepts a *TID* from `gettid(2)`. Passing `getpid(2)` will + /// monitor the main thread of the calling thread group. Passing `0` + /// monitors the calling thread. Passing `-1` monitors all threads on + /// the specified CPU. + /// + /// `cpu` should almost always be `-1`, which tracks the specified `pid` + /// across all CPUs. Non-negative integers track only the specified `pid` + /// on that CPU. + /// + /// Passing `-1` for both `pid` and `cpu` will result in an error. 
+ pub fn new(pid: libc::pid_t, cpu: libc::c_int) -> Self { + Self { + pid, + cpu, + evt: Event::Software(SoftwareEvent::Dummy), + sample_period: 0, + precise_ip: 0, + fast_reads: false, + } + } + + /// Select the event to monitor. + pub fn event(&mut self, evt: Event) -> &mut Self { + self.evt = evt; + self + } + + /// Set the period for sample collection. Default is 0, which creates a + /// counting event. + /// + /// Because this module always sets `wakeup_events` to 1, this also + /// specifies after how many events an overflow notification should be + /// raised. If a signal has been setup with + /// `PerfCounter::set_signal_delivery`], this corresponds to one sent + /// signal. Overflow notifications are sent whenever the counter reaches a + /// multiple of `sample_period`. + /// + /// If you only want accurate counts, pass + /// `DISABLE_SAMPLE_PERIOD`. Passing `0` will also work, but will create a + /// _counting_ event that cannot become a _sampling event_ via the + /// `PERF_EVENT_IOC_PERIOD` ioctl. + pub fn sample_period(&mut self, period: u64) -> &mut Self { + self.sample_period = period; + self + } + + /// Set `precise_ip` on the underlying perf attribute structure. Valid + /// values are 0-3; the underlying field is 2 bits. + /// + /// Non-zero values will cause perf to attempt to lower the skid of *samples* + /// (but not necessarily notifications), usually via hardware features like + /// Intel PEBS. + /// + /// Use with caution: experiments have shown that counters with non-zero + /// `precise_ip` can drop events under certain circumstances. See + /// `experiments/test_consistency.c` for more information. + pub fn precise_ip(&mut self, precise_ip: u32) -> &mut Self { + self.precise_ip = precise_ip; + self + } + + /// Enable fast reads via shared memory with the kernel for the latest + /// counter value. + pub fn fast_reads(&mut self, enable: bool) -> &mut Self { + self.fast_reads = enable; + self + } + + /// Render the builder into a `PerfCounter`. 
Created counters begin in a + /// disabled state. Additional initialization steps should be performed, + /// followed by a call to [`PerfCounter::enable`]. + pub fn create(&self) -> Result { + let mut attr = perf::perf_event_attr::default(); + attr.size = core::mem::size_of_val(&attr) as u32; + attr.type_ = self.evt.attr_type(); + attr.config = self.evt.attr_config(); + attr.__bindgen_anon_1.sample_period = self.sample_period; + attr.set_disabled(1); // user must enable later + attr.set_exclude_kernel(1); // we only care about user code + attr.set_exclude_guest(1); + attr.set_exclude_hv(1); // unlikely this is supported, but it doesn't hurt + attr.set_pinned(1); // error state if we are descheduled from the PMU + attr.set_precise_ip(self.precise_ip.into()); + attr.__bindgen_anon_2.wakeup_events = 1; // generate a wakeup (overflow) after one sample event + + let pid = self.pid; + let cpu = self.cpu; + let group_fd: libc::c_int = -1; // always create a new group + let flags = perf::PERF_FLAG_FD_CLOEXEC; // marginally more safe if we fork+exec + + let fd = Errno::result(unsafe { + libc::syscall(libc::SYS_perf_event_open, &attr, pid, cpu, group_fd, flags) + })?; + let fd = fd as libc::c_int; + + let mmap = if self.fast_reads { + let res = Errno::result(unsafe { + libc::mmap( + core::ptr::null_mut(), + get_mmap_size(), + libc::PROT_READ, // leaving PROT_WRITE unset lets us passively read + libc::MAP_SHARED, + fd, + 0, + ) + }); + match res { + Ok(ptr) => Some(NonNull::new(ptr as *mut _).unwrap()), + Err(e) => { + close_perf_fd(fd); + return Err(e); + } + } + } else { + None + }; + + Ok(PerfCounter { fd, mmap }) + } +} + +impl PerfCounter { + /// Perf counters cannot be switched from sampling to non-sampling, so + /// setting their period to this large value effectively disables overflows + /// and sampling. + pub const DISABLE_SAMPLE_PERIOD: u64 = 1 << 60; + + /// Call the `PERF_EVENT_IOC_ENABLE` ioctl. Enables increments of the + /// counter and event generation. 
+ pub fn enable(&self) -> Result<(), Errno> { + Errno::result(unsafe { ioctls::ENABLE(self.fd, 0) }).and(Ok(())) + } + + /// Call the `PERF_EVENT_IOC_ENABLE` ioctl. Disables increments of the + /// counter and event generation. + pub fn disable(&self) -> Result<(), Errno> { + Errno::result(unsafe { ioctls::DISABLE(self.fd, 0) }).and(Ok(())) + } + + /// Corresponds exactly to the `PERF_EVENT_IOC_REFRESH` ioctl. + #[allow(dead_code)] + pub fn refresh(&self, count: libc::c_int) -> Result<(), Errno> { + assert!(count != 0); // 0 is undefined behavior + Errno::result(unsafe { ioctls::REFRESH(self.fd, 0) }).and(Ok(())) + } + + /// Call the `PERF_EVENT_IOC_RESET` ioctl. Resets the counter value to 0, + /// which results in delayed overflow events. + pub fn reset(&self) -> Result<(), Errno> { + Errno::result(unsafe { ioctls::RESET(self.fd, 0) }).and(Ok(())) + } + + /// Call the `PERF_EVENT_IOC_PERIOD` ioctl. This causes the counter to + /// behave as if `ticks` was the original argument to `sample_period` in + /// the builder. + pub fn set_period(&self, ticks: u64) -> Result<(), Errno> { + // The bindings are wrong for this ioctl. The method signature takes a + // u64, but the actual ioctl expects a pointer to a u64. Thus, we use + // the constant manually. + + // This ioctl shouldn't mutate it's argument per its API. But in case it + // does, create a mutable copy to avoid Rust UB. + let mut ticks = ticks; + Errno::result(unsafe { + libc::ioctl( + self.fd, + perf::perf_event_ioctls_PERIOD as _, + &mut ticks as *mut u64, + ) + }) + .and(Ok(())) + } + + /// Call the `PERF_EVENT_IOC_ID` ioctl. Returns a unique identifier for this + /// perf counter. + #[allow(dead_code)] + pub fn id(&self) -> Result { + let mut res = 0u64; + Errno::result(unsafe { ioctls::ID(self.fd, &mut res as *mut u64) })?; + Ok(res) + } + + /// Sets up overflow events to deliver a `SIGPOLL`-style signal, with the + /// signal number specified in `signal`, to the specified `thread`. 
+ /// + /// There is no reason this couldn't be called at any point, but typial use + /// cases will set up signal delivery once or not at all. + pub fn set_signal_delivery(&self, thread: Tid, signal: Signal) -> Result<(), Errno> { + let owner = f_owner_ex { + type_: F_OWNER_TID, + pid: thread.as_raw(), + }; + Errno::result(unsafe { libc::fcntl(self.fd, F_SETOWN_EX, &owner as *const _) })?; + Errno::result(unsafe { libc::fcntl(self.fd, libc::F_SETFL, libc::O_ASYNC) })?; + Errno::result(unsafe { libc::fcntl(self.fd, F_SETSIG, signal as i32) })?; + Ok(()) + } + + /// Read the current value of the counter. + pub fn ctr_value(&self) -> Result { + let mut value = 0u64; + let expected_bytes = std::mem::size_of_val(&value); + loop { + let res = + unsafe { libc::read(self.fd, &mut value as *mut u64 as *mut _, expected_bytes) }; + if res == -1 { + let errno = Errno::last(); + if errno != Errno::EINTR { + return Err(errno); + } + } + if res == 0 { + // EOF: this only occurs when attr.pinned = 1 and our event was descheduled. + // This unrecoverably gives us innacurate counts. + panic!("pinned perf event descheduled!") + } + if res == expected_bytes as isize { + break; + } + } + Ok(value) + } + + /// Perform a fast read, which doesn't involve a syscall in the fast path. + /// This falls back to a slow syscall read where necessary, including if + /// fast reads weren't enabled in the `Builder`. + pub fn ctr_value_fast(&self) -> Result { + match self.mmap { + Some(ptr) => { + // SAFETY: self.mmap is constructed as the correct page or not at all + let res = unsafe { self.ctr_value_fast_loop(ptr) }; + // TODO: remove this assertion after we're confident in correctness + debug_assert_eq!(res, self.ctr_value_fallback()); + res + } + None => self.ctr_value_fallback(), + } + } + + #[cold] + fn ctr_value_fallback(&self) -> Result { + self.ctr_value() + } + + /// Safety: `ptr` must refer to the metadata page corresponding to self.fd. 
+ #[deny(unsafe_op_in_unsafe_fn)] + #[inline(always)] + unsafe fn ctr_value_fast_loop( + &self, + ptr: NonNull, + ) -> Result { + // This implements synchronization with the kernel via a seqlock, + // see https://www.kernel.org/doc/html/latest/locking/seqlock.html. + // Also see experiments/perf_fast_reads.c for more details on fast reads. + use std::ptr::addr_of_mut; + let ptr = ptr.as_ptr(); + let mut seq; + let mut running; + let mut enabled; + let mut count; + loop { + // Acquire a lease on the seqlock -- even values are outside of + // writers' critical sections. + loop { + // SAFETY: ptr->lock is valid and aligned + seq = unsafe { read_once(addr_of_mut!((*ptr).lock)) }; + if seq & 1 == 0 { + break; + } + } + smp_rmb(); // force re-reads of other data + let index; + // SAFETY: these reads are synchronized by the correct reads of the + // seqlock. We don't do anything with them until after the outer + // loop finishing has guaranteed our read was serialized. + unsafe { + running = (*ptr).time_running; + enabled = (*ptr).time_enabled; + count = (*ptr).offset; + index = (*ptr).index; + } + if index != 0 { + // `index` being non-zero indicates we need to read from the + // hardware counter and add it to our count. Instead, we + // fallback to the slow path for a few reasons: + // 1. This only works if we're on the same core, which is basically + // never true for our usecase. + // 2. Reads of an active PMU are racy. + // 3. The PMU should almost never be active, because we should + // generally only read from stopped processes. + return self.ctr_value_fallback(); + } + smp_rmb(); + // SAFETY: ptr->lock is valid and aligned + if seq == unsafe { read_once(addr_of_mut!((*ptr).lock)) } { + // if seq is unchanged, we didn't race with writer + break; + } + } + // This check must be outside the loop to ensure our reads were actually + // serialized with any writes. 
+ if running != enabled { + // Non-equal running/enabled time indicates the event was + // descheduled at some point, meaning our counts are inaccurate. + // This is not recoverable. The slow-read equivalent is getting EOF + // when attr.pinned = 1. + panic!("fast-read perf event was probably descheduled!") + } + Ok(count as u64) + } + + /// Return the underlying perf fd. + pub fn raw_fd(&self) -> libc::c_int { + self.fd + } +} + +fn close_perf_fd(fd: libc::c_int) { + Errno::result(unsafe { libc::close(fd) }).expect("Could not close perf fd"); +} +fn close_mmap(ptr: *mut perf::perf_event_mmap_page) { + Errno::result(unsafe { libc::munmap(ptr as *mut _, get_mmap_size()) }) + .expect("Could not munmap ring buffer"); +} + +impl Drop for PerfCounter { + fn drop(&mut self) { + if let Some(ptr) = self.mmap { + close_mmap(ptr.as_ptr()); + } + close_perf_fd(self.fd); + } +} + +// Safety: +// The mmap region is never written to. Multiple readers then race with the +// kernel as any single thread would. Though the reads are racy, that is the +// intended behavior of the perf api. +unsafe impl std::marker::Send for PerfCounter {} +unsafe impl std::marker::Sync for PerfCounter {} + +fn get_mmap_size() -> usize { + // Use a single page; we only want the perf metadata + sysconf(SysconfVar::PAGE_SIZE) + .unwrap() + .unwrap() + .try_into() + .unwrap() +} + +/// Force a relaxed atomic load. Like Linux's READ_ONCE. 
/// Force a relaxed atomic load of `*v`, in the spirit of Linux's `READ_ONCE`.
///
/// # Safety
///
/// The caller must ensure `v` points to valid data and is aligned.
#[inline(always)]
#[deny(unsafe_op_in_unsafe_fn)]
unsafe fn read_once(v: *mut u32) -> u32 {
    use std::sync::atomic::{AtomicU32, Ordering};
    // SAFETY: AtomicU32 is guaranteed to have the same in-memory
    // representation as u32, its inner UnsafeCell allows aliasing with the
    // `*mut`, and the reference created here never leaves this function, so
    // any lifetime is fine.
    let atomic = unsafe { &*v.cast::<AtomicU32>() };
    atomic.load(Ordering::Relaxed)
}
/// Concisely return if `is_perf_supported` is `false`. Useful for guarding
/// tests.
///
/// * `ret_without_perf!()` expands to a bare `return;`.
/// * `ret_without_perf!(value)` returns `value` instead, for use in
///   functions that do not return `()`.
#[macro_export]
macro_rules! ret_without_perf {
    () => {
        if !$crate::is_perf_supported() {
            return;
        }
    };
    // The fragment must be bound as `$expr`; a bare `expr:expr` matcher would
    // only match the literal tokens `expr : expr`.
    ($expr:expr) => {
        if !$crate::is_perf_supported() {
            return $expr;
        }
    };
}
guest spinn + + const ITERS: u64 = 100000; + + let handle = std::thread::spawn(move || { + tx1.send(gettid()).unwrap(); + rx2.recv().unwrap(); + do_branches(ITERS); + }); + + let pc = Builder::new(rx1.recv().unwrap().as_raw(), -1) + .sample_period(PerfCounter::DISABLE_SAMPLE_PERIOD) + .event(Event::Hardware(HardwareEvent::BranchInstructions)) + .create() + .unwrap(); + + pc.enable().unwrap(); + tx2.send(()).unwrap(); // tell thread to start + handle.join().unwrap(); + let ctr = pc.ctr_value().unwrap(); + assert!(ctr >= ITERS); + assert!(ctr <= ITERS + 5000, "{}", ctr); // overhead from channel operations + } + + #[test] + fn deliver_signal() { + ret_without_perf!(); + use std::mem::MaybeUninit; + use std::sync::mpsc::sync_channel; + let (tx1, rx1) = sync_channel(0); // send TID + let (tx2, rx2) = sync_channel(0); // start guest spinn + + // SIGSTKFLT defaults to TERM, so if any thread but the traced one + // receives the signal, the test will fail due to process exit. + const MARKER_SIGNAL: Signal = Signal::SIGSTKFLT; + const SPIN_BRANCHES: u64 = 50000; // big enough to "absorb" noise from debug/release + const SPINS_PER_EVENT: u64 = 10; + const SAMPLE_PERIOD: u64 = SPINS_PER_EVENT * SPIN_BRANCHES + (SPINS_PER_EVENT / 4); + + fn signal_is_pending() -> bool { + unsafe { + let mut mask = MaybeUninit::::zeroed(); + libc::sigemptyset(mask.as_mut_ptr()); + libc::sigpending(mask.as_mut_ptr()); + libc::sigismember(mask.as_ptr(), MARKER_SIGNAL as _) == 1 + } + } + + let handle = std::thread::spawn(move || { + unsafe { + let mut mask = MaybeUninit::::zeroed(); + libc::sigemptyset(mask.as_mut_ptr()); + libc::sigaddset(mask.as_mut_ptr(), MARKER_SIGNAL as _); + libc::sigprocmask(libc::SIG_BLOCK, mask.as_ptr(), std::ptr::null_mut()); + } + + tx1.send(gettid()).unwrap(); + rx2.recv().unwrap(); + + let mut count = 0; + loop { + count += 1; + do_branches(SPIN_BRANCHES); + if signal_is_pending() { + break; + } + } + assert_eq!(count, SPINS_PER_EVENT); + }); + + let tid = 
rx1.recv().unwrap(); + let pc = Builder::new(tid.as_raw(), -1) + .sample_period(SAMPLE_PERIOD) + .event(Event::Hardware(HardwareEvent::BranchInstructions)) + .create() + .unwrap(); + pc.set_signal_delivery(tid.into(), MARKER_SIGNAL).unwrap(); + pc.enable().unwrap(); + + tx2.send(()).unwrap(); // tell thread to start + handle.join().unwrap(); // propagate panics + } +} diff --git a/reverie-ptrace/src/stack.rs b/reverie-ptrace/src/stack.rs new file mode 100644 index 0000000..64dbd0f --- /dev/null +++ b/reverie-ptrace/src/stack.rs @@ -0,0 +1,227 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +use crate::trace::Error as TraceError; +use crate::trace::Stopped; + +use reverie::syscalls::MemoryAccess; +use reverie::syscalls::{Addr, AddrMut}; +use reverie::Errno; +use reverie::Pid; +use reverie::Stack; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; + +// NB: leaf function can use redzone without explicit stack allocation, as +// a result it is not safe to just adjust stack pointer. 128B of stack +// space is mostly wasted -- to avoid the corner case when redzone is used. +const REDZONE_SIZE: usize = 128; + +// TODO: track actual guest stack size complexity. +// Right now this just uses a conservatively low number and we assume the +// guest stack is bigger than that. +const STACK_CAPACITY: usize = 1024 - REDZONE_SIZE; + +// keep in mind stack grows towards lower address, at least on major +// platforms. +pub struct GuestStack { + top: usize, + sp: usize, + capacity: usize, + buf: Vec, + task: Stopped, + flag: Arc, +} + +impl GuestStack { + pub fn new(pid: Pid, flag: Arc) -> Result { + let old = flag.swap(true, Ordering::SeqCst); + if old { + panic!( + "Invariant violation, cannot retrieve handle on guest Stack when there is already a StackGuard still alive." 
+ ); + } + let task = Stopped::new_unchecked(pid); + let rsp = task.getregs()?.rsp as usize; + let top = rsp - REDZONE_SIZE as usize; + Ok(GuestStack { + top, + sp: top, + capacity: STACK_CAPACITY, + buf: Vec::new(), + task, + flag, + }) + } + + fn allocate<'stack, T>(&mut self, value: T) -> AddrMut<'stack, T> { + let mut buf = unsafe { transmute_u64s(value) }; + let buf_size = buf.len() * core::mem::size_of::(); + if self.size() + buf_size > self.capacity() { + panic!( + "guest(pid={}) stack overflow, capacity = {}", + self.task.pid(), + self.capacity() + ); + } else { + self.sp -= buf_size; + buf.reverse(); + self.buf.extend_from_slice(buf.as_slice()); + AddrMut::from_raw(self.sp).unwrap() + } + } +} + +// We need to use the StackGuard to prevent REENTRANCY. That is, you cannot call +// `Stack::new` while there is still an outstanding guard. We don't have any way to keep +// them from colliding at the moment. +#[derive(Debug)] +// TODO: Ideally we would have some way to connect the actual `Addr` references into the +// guest heap to the lifetime of the StackGuard (like the ST monad in Haskell). +pub struct StackGuard { + flag: Arc, +} + +impl Drop for StackGuard { + fn drop(&mut self) { + let old = self.flag.swap(false, Ordering::SeqCst); + if !old { + panic!( + "Invariant violation, when dropping StackGuard, the internal flag was not set as expected." 
/// Copy the bytes of `value` into a vector of `u64` words.
///
/// Words are filled in memory order using the host's native byte order. When
/// the size of `T` is not a multiple of 8, the final word holds the
/// remaining bytes and its unused bytes are zero. A zero-sized `T` yields an
/// empty vector.
///
/// # Safety
///
/// Every byte of `value` is read, so `T` must not contain uninitialized
/// bytes (for example padding). `value` itself is dropped normally when this
/// function returns; the returned words are only a bitwise copy of it.
#[inline]
#[deny(unsafe_op_in_unsafe_fn)]
pub unsafe fn transmute_u64s<T>(value: T) -> Vec<u64> {
    const WORD: usize = core::mem::size_of::<u64>();

    let size = core::mem::size_of::<T>();
    let base = &value as *const T as *const u8;
    let full_words = size / WORD;
    let tail_bytes = size % WORD;

    let mut result: Vec<u64> = Vec::with_capacity((size + WORD - 1) / WORD);

    for i in 0..full_words {
        // SAFETY: `i * WORD + WORD <= size`, so the read stays inside
        // `value`. `T` may be less aligned than `u64` (e.g. `[u8; 9]`), hence
        // the unaligned read.
        let word = unsafe { base.add(i * WORD).cast::<u64>().read_unaligned() };
        result.push(word);
    }

    if tail_bytes != 0 {
        let mut last: u64 = 0;
        // SAFETY: the source range is the final `tail_bytes` bytes of
        // `value`, the destination is a local `u64` with room for
        // `tail_bytes < 8` bytes, and the two cannot overlap.
        unsafe {
            core::ptr::copy_nonoverlapping(
                base.add(full_words * WORD),
                &mut last as *mut u64 as *mut u8,
                tail_bytes,
            );
        }
        result.push(last);
    }

    result
}
+ + let tp: libc::timespec = libc::timespec { + tv_sec: 0x12, + tv_nsec: 0x3456789a, + }; + + assert_eq!(unsafe { transmute_u64s(tp) }, vec![0x12, 0x3456789a]); + } +} diff --git a/reverie-ptrace/src/task.rs b/reverie-ptrace/src/task.rs new file mode 100644 index 0000000..459a0b6 --- /dev/null +++ b/reverie-ptrace/src/task.rs @@ -0,0 +1,2146 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! `TracedTask` and its methods. + +use crate::children; +use crate::cp; +use crate::error::Error; +use crate::gdbstub::{ + Amd64CoreRegs, BreakpointType, GdbRequest, GdbServer, ResumeAction, ResumeInferior, StopEvent, + StopReason, StoppedInferior, +}; +use crate::stack::GuestStack; +use crate::timer::{HandleFailure, Timer, TimerEventRequest}; +use crate::trace::{ChildOp, Error as TraceError, Event, Running, Stopped, Wait}; +use crate::vdso; + +use async_trait::async_trait; +use futures::future::{self, Either, Future, FutureExt, TryFutureExt}; +use nix::sys::{mman::ProtFlags, signal::Signal}; +use reverie::{ + syscalls::{ + Addr, AddrMut, ArchPrctl, ArchPrctlCmd, MemoryAccess, Mprotect, Syscall, SyscallArgs, + SyscallInfo, Sysno, + }, + Backtrace, Errno, ExitStatus, Frame, GlobalRPC, GlobalTool, Guest, Pid, Rdtsc, Subscription, + Symbol, Tid, TimerSchedule, Tool, +}; +use std::{ + collections::{BTreeMap, HashMap}, + fmt, + ops::DerefMut, + pin::Pin, + sync::{ + atomic::{AtomicBool, AtomicUsize, Ordering}, + Arc, + }, + task::{Context, Poll}, +}; +use tracing::{debug, info, trace, warn}; + +use libc::user_regs_struct; +use tokio::{ + sync::{broadcast, mpsc, oneshot, Mutex, Notify}, + task::{JoinError, JoinHandle}, +}; + +#[derive(Debug)] +struct Suspended { + waker: Option>, + suspended: Arc, +} + +/// Expected resume action sent by gdb client, when the task is in a gdb stop. 
#[derive(Debug, Clone, Copy, PartialEq)]
enum ExpectedGdbResume {
    /// Expecting a normal gdb resume: single step, until, or continue.
    Resume,
    /// Expecting a gdb step-over. This happens when the underlying task hits
    /// a software breakpoint: gdb then needs to restore the original
    /// instruction, which means deleting the breakpoint, single-stepping,
    /// and then restoring the breakpoint. This is a special case because the
    /// whole operation must be serialized; otherwise, when a different thread
    /// in the same process group shares the breakpoint, removing it can
    /// cause that second thread to miss the breakpoint.
    StepOver,
    /// Force a single step, even if Resume(continue) is requested. This is a
    /// workaround for when a fork/vfork/clone event is reported to gdb: gdb
    /// could then issue a `vCont;p:-1` to resume all threads in the thread
    /// group, which could cause the main thread to miss events.
    StepOnly,
}
+ SignalSuppressed(Wait), + /// signal needs to be delivered. + SignalToDeliver(Stopped, Signal), +} + +/// All the info needed to be able to interact with the global state. +struct GlobalState { + /// The tool's static configuration data. + cfg: G::Config, + + /// Reference to the tool's global state. This is used to send it "rpc" messages. + gs_ref: Arc, + + /// Events the tool is subscripted (like interception) + subscriptions: Arc, + + /// guests are sequentialized already (by detcore for example), gdbserver + /// should avoid sequentialize threads. + sequentialized_guest: Arc, +} + +impl Clone for GlobalState { + fn clone(&self) -> Self { + Self { + cfg: self.cfg.clone(), + gs_ref: self.gs_ref.clone(), + subscriptions: self.subscriptions.clone(), + sequentialized_guest: self.sequentialized_guest.clone(), + } + } +} + +/// Our runtime representation of what Reverie knows about a guest thread. Its +/// lifetime matches the lifetime of the thread. +pub struct TracedTask { + /// Thread ID. + tid: Pid, + + /// Process ID. + pid: Pid, + + /// Parent process ID. + ppid: Option, + + /// State associated with the thread. Unique for each thread. + thread_state: L::ThreadState, + + /// State associated with the process. This is shared among threads in the + /// same thread group. + process_state: Arc, + + /// Global state. This is shared among all threads in a process tree. + global_state: GlobalState, + + /// Set to `Some` if the syscall has not been injected yet. `None` if it has. + pending_syscall: Option<(Sysno, SyscallArgs)>, + + /// pending signal to deliver. This can happen when + /// syscall got interrupted (by signal) + pending_signal: Option, + + /// A channel to allow short-circuiting the next state to main run loop. This + /// is useful inside of `inject` or `tail_inject` where we might need to + /// cancel a future early. + next_state: mpsc::Sender>, + + /// The receiving end of the next_state channel. 
+ next_state_rx: Option>>, + + /// The timer tracking this task. Used to trigger RCB-based `timeouts`. + timer: Timer, + + /// A notifier used to cancel `handle_syscall_event` futures. For example, + /// `tail_inject` should never return to the handler. + notifier: Arc, + + /// Child processes to wait on. When one of the children exits, it should be + /// removed from this list. + child_procs: Arc>, + + /// Child threads to wait on. When one of the child threads exits, it should + /// be removed from this list. + child_threads: Arc>, + + /// Channel to send child processes to that are left over by the time this + /// task exits. + orphanage: mpsc::Sender, + + /// broadcast to kill all daemons + daemon_kill_switch: broadcast::Sender<()>, + + /// Channel to damonize a process + daemonizer: mpsc::Sender>, + + /// The rx end of `daemonizer`. + daemonizer_rx: Option>>, + + /// Total number of tasks + ntasks: Arc, + + /// Total number of daemons + ndaemons: Arc, + + /// Task is a daemon + is_a_daemon: bool, + + /// Software breakpoints. + // NB: For multi-threaded programs, sw breakpoints apply to all threads + // because they're in the same address space. Hence removing sw + // breakpoint in one thread also remove it for the rest of the threads + // in the same process group. *However*, our model is slightly different + // because we use different tx/rx channels even the threads are in the + // same process group, hence each threads owns `breakpoints: HashMap` + // instead of `Arc>`. + breakpoints: HashMap, + + /// Notify gdbserver start accepting incoming packets. + gdbserver_start_tx: Option>, + + /// task is suspended (received SIGSTOP) + suspended: Arc, + + /// Notify gdbserver there's a new stop event. + gdb_stop_tx: Option>, + + /// Task is attached by gdb. + // NB: gdb doesn't always attach everything, when fork/clone is called. + // gdb also allows detach from a task, and re-attach again. + attached_by_gdb: bool, + + /// Task is resumed by gdb. 
+ // NB: gdb doesn't always attach everything, when fork/clone is called. + // gdb also allows detach from a task, and re-attach again. + resumed_by_gdb: Option, + + /// GDB resume request, gdbstub is the sender + gdb_resume_tx: Option>, + + /// GDB resume request, reverie is the receiver + gdb_resume_rx: Option>, + + /// Request sent by gdb. the tx channel is used by gdb instead of + /// `TracedTask`. + gdb_request_tx: Option>, + + /// Receiver to receive gdb request. + gdb_request_rx: Option>, + + /// Wait to be resumed when in sigstop due to all stop mode. + exit_suspend_tx: Option>, + + /// Wait to be resumed when in sigstop due to all stop mode. + exit_suspend_rx: Option>, + + /// Suspended task when hitting swbp. This is used to implement gdb's + /// all stop mode. + suspended_tasks: BTreeMap, + + /// Task needs (single) step over the swbp instruciton when a swbp is + /// hit. unless this is done, if is not safe for other threads running + /// in parallel to report breakpoint, otherwise there're could be an + /// interleaved step-over, which might remove the breakpoint, hence + /// causing others to miss the breakpoint. + needs_step_over: Arc>, + + /// Whether or not the tool is currently holding a handle on the guest Stack (and thus + /// potentially using actual stack memory within the guest). + stack_checked_out: Arc, +} + +impl fmt::Debug for TracedTask { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("TracedTask") + .field("tid", &self.tid) + .field("pid", &self.pid) + .field("ppid", &self.ppid) + .finish() + } +} + +impl TracedTask { + /// Create a new TracedTask. 
+ pub fn new( + tid: Pid, + cfg: ::Config, + gs_ref: Arc, + events: &Subscription, + orphanage: mpsc::Sender, + daemon_kill_switch: broadcast::Sender<()>, + mut gdbserver: Option, + ) -> Self { + let process_state = Arc::new(L::new(tid, &cfg)); + let global_state = GlobalState { + gs_ref, + cfg, + subscriptions: Arc::new(events.clone()), + sequentialized_guest: Arc::new( + gdbserver + .as_ref() + .map(|s| s.sequentialized_guest) + .unwrap_or(false), + ), + }; + let thread_state = process_state.init_thread_state(tid, None); + let (next_state, next_state_rx) = mpsc::channel(1); + let (daemonizer, daemonizer_rx) = mpsc::channel(1); + let (gdb_resume_tx, gdb_resume_rx) = mpsc::channel(1); + let (gdb_request_tx, gdb_request_rx) = mpsc::channel(1); + let (exit_suspend_tx, exit_suspend_rx) = mpsc::channel(16); + Self { + tid, + pid: tid, + ppid: None, + thread_state, + process_state, + global_state, + pending_syscall: None, + next_state, + next_state_rx: Some(next_state_rx), + timer: Timer::new(tid, tid), + notifier: Arc::new(Notify::new()), + pending_signal: None, + child_procs: Arc::new(Mutex::new(Children::new())), + child_threads: Arc::new(Mutex::new(Children::new())), + orphanage, + daemon_kill_switch, + daemonizer, + daemonizer_rx: Some(daemonizer_rx), + ntasks: Arc::new(AtomicUsize::new(1)), + ndaemons: Arc::new(AtomicUsize::new(0)), + is_a_daemon: false, + gdbserver_start_tx: gdbserver.as_mut().and_then(|s| s.server_tx.take()), + gdb_stop_tx: gdbserver + .as_mut() + .and_then(|s| s.inferior_attached_tx.take()), + attached_by_gdb: false, + resumed_by_gdb: None, + gdb_resume_tx: Some(gdb_resume_tx), + gdb_resume_rx: Some(gdb_resume_rx), + breakpoints: HashMap::new(), + suspended: Arc::new(AtomicBool::new(false)), + gdb_request_tx: Some(gdb_request_tx), + gdb_request_rx: Some(gdb_request_rx), + exit_suspend_tx: Some(exit_suspend_tx), + exit_suspend_rx: Some(exit_suspend_rx), + needs_step_over: Arc::new(Mutex::new(())), + suspended_tasks: BTreeMap::new(), + 
    /// Create a child TracedTask corresponding to a clone()
    ///
    /// The new task gets `child` as its thread ID but keeps this task's
    /// `pid` and `ppid`. It shares this task's process state, global state,
    /// child lists, task/daemon counters and `needs_step_over` lock, and
    /// receives its own thread state, channels, notifier, timer and flags.
    ///
    /// Side effect: increments the shared `ntasks` counter.
    fn cloned(&self, child: Pid) -> Self {
        let global_state = self.global_state.clone();
        // Same `Arc` as the parent: process state is shared, not re-created.
        let process_state = self.process_state.clone();
        // The tool derives the child's thread state from the parent's.
        let thread_state =
            process_state.init_thread_state(child, Some((self.tid, &self.thread_state)));
        // Channels are per task; both ends are created fresh here.
        let (next_state, next_state_rx) = mpsc::channel(1);
        let (daemonizer, daemonizer_rx) = mpsc::channel(1);
        let (gdb_resume_tx, gdb_resume_rx) = mpsc::channel(1);
        let (gdb_request_tx, gdb_request_rx) = mpsc::channel(1);
        let (exit_suspend_tx, exit_suspend_rx) = mpsc::channel(16);
        // Account for the new task in the shared counter.
        self.ntasks.fetch_add(1, Ordering::SeqCst);
        Self {
            tid: child,
            pid: self.pid,
            ppid: self.ppid,
            thread_state,
            process_state,
            global_state,
            pending_syscall: None,
            next_state,
            next_state_rx: Some(next_state_rx),
            timer: Timer::new(self.pid, child),
            notifier: Arc::new(Notify::new()),
            pending_signal: None,
            // Shared with the parent (same `Arc`s).
            child_procs: self.child_procs.clone(),
            child_threads: self.child_threads.clone(),
            orphanage: self.orphanage.clone(),
            daemon_kill_switch: self.daemon_kill_switch.clone(),
            daemonizer,
            daemonizer_rx: Some(daemonizer_rx),
            ntasks: self.ntasks.clone(),
            ndaemons: self.ndaemons.clone(),
            is_a_daemon: self.is_a_daemon,
            // The gdbserver start/stop notifiers are not handed to the child.
            gdbserver_start_tx: None,
            gdb_stop_tx: None,
            attached_by_gdb: self.attached_by_gdb,
            resumed_by_gdb: self.resumed_by_gdb,
            gdb_resume_tx: Some(gdb_resume_tx),
            gdb_resume_rx: Some(gdb_resume_rx),
            // Each task owns its own copy of the breakpoint map.
            breakpoints: self.breakpoints.clone(),
            suspended: Arc::new(AtomicBool::new(false)),
            gdb_request_tx: Some(gdb_request_tx),
            gdb_request_rx: Some(gdb_request_rx),
            exit_suspend_tx: Some(exit_suspend_tx),
            exit_suspend_rx: Some(exit_suspend_rx),
            // Shared with the parent, unlike in `forked`.
            needs_step_over: self.needs_step_over.clone(),
            suspended_tasks: BTreeMap::new(),
            stack_checked_out: Arc::new(AtomicBool::new(false)),
        }
    }
Pid) -> Self { + let process_state = Arc::new(L::new(child, &self.global_state.cfg)); + let thread_state = + process_state.init_thread_state(child, Some((self.tid, &self.thread_state))); + let (next_state, next_state_rx) = mpsc::channel(1); + let (daemonizer, daemonizer_rx) = mpsc::channel(1); + let (gdb_resume_tx, gdb_resume_rx) = mpsc::channel(1); + let (gdb_request_tx, gdb_request_rx) = mpsc::channel(1); + let (exit_suspend_tx, exit_suspend_rx) = mpsc::channel(16); + self.ntasks.fetch_add(1, Ordering::SeqCst); + Self { + tid: child, + pid: child, + ppid: Some(self.pid), + thread_state, + process_state, + global_state: self.global_state.clone(), + pending_syscall: None, + next_state, + next_state_rx: Some(next_state_rx), + timer: Timer::new(child, child), + notifier: Arc::new(Notify::new()), + pending_signal: None, + child_procs: Arc::new(Mutex::new(Children::new())), + child_threads: Arc::new(Mutex::new(Children::new())), + orphanage: self.orphanage.clone(), + daemon_kill_switch: self.daemon_kill_switch.clone(), + daemonizer, + daemonizer_rx: Some(daemonizer_rx), + ntasks: self.ntasks.clone(), + ndaemons: self.ndaemons.clone(), + // NB: if daemon forks, then its child's parent pid is no longer 1. 
+ is_a_daemon: false, + gdbserver_start_tx: None, + gdb_stop_tx: None, + attached_by_gdb: self.attached_by_gdb, + resumed_by_gdb: None, + gdb_resume_tx: Some(gdb_resume_tx), + gdb_resume_rx: Some(gdb_resume_rx), + breakpoints: self.breakpoints.clone(), + suspended: Arc::new(AtomicBool::new(false)), + gdb_request_tx: Some(gdb_request_tx), + gdb_request_rx: Some(gdb_request_rx), + exit_suspend_tx: Some(exit_suspend_tx), + exit_suspend_rx: Some(exit_suspend_rx), + needs_step_over: Arc::new(Mutex::new(())), + suspended_tasks: BTreeMap::new(), + stack_checked_out: Arc::new(AtomicBool::new(false)), + } + } + + fn get_syscall(&self, task: &Stopped) -> Result { + let regs = task.getregs()?; + let nr = Sysno::from(regs.orig_rax as i32); + let args = SyscallArgs::from(&[regs.rdi, regs.rsi, regs.rdx, regs.r10, regs.r8, regs.r9]); + trace!( + "[retrieve_task_state] translating ptrace event SECCOMP into syscall {}", + nr + ); + Ok(Syscall::from_raw(nr, args)) + } +} + +fn set_rax(task: &Stopped, rax: u64) -> Result { + let mut regs = task.getregs()?; + let old = regs.rax; + regs.rax = rax; + task.setregs(regs)?; + Ok(old) +} + +/// Handles a potentially internal error, converting it to an exit status. +async fn handle_internal_error(err: Error) -> Result { + match err { + Error::Internal(TraceError::Died(zombie)) => Ok(zombie.reap().await), + Error::Internal(TraceError::Errno(errno)) => Err(errno.into()), + Error::External(err) => Err(err), + } +} + +/// Helper for canceling handlers. +async fn cancellable(notifier: Arc, f: F) -> Option +where + F: Future, +{ + futures::select! 
{ + () = notifier.notified().fuse() => None, + result = f.fuse() => Some(result), + } +} + +#[derive(PartialEq, Eq, Clone, Copy, Debug)] +enum SegfaultTrapInfo { + Cpuid, + Rdtscs(Rdtsc), +} + +// check if segfault is called by cpuid/rdtsc trap +fn decode_segfault(insn_at_rip: u64) -> Option { + if insn_at_rip & 0xffffu64 == 0xa20fu64 { + Some(SegfaultTrapInfo::Cpuid) + } else if insn_at_rip & 0xffffu64 == 0x310fu64 { + Some(SegfaultTrapInfo::Rdtscs(Rdtsc::Tsc)) + } else if insn_at_rip & 0xffffffu64 == 0xf9010fu64 { + Some(SegfaultTrapInfo::Rdtscs(Rdtsc::Tscp)) + } else { + None + } +} + +// restore syscall context when it returns. This is needed because we might +// have injected a different syscall (or arguments) in handle_seccomp. +fn restore_context( + task: &Stopped, + context: libc::user_regs_struct, + rax: Option, +) -> Result<(), TraceError> { + let mut regs = task.getregs()?; + + if let Some(rax) = rax { + regs.rax = rax; + } + + regs.rip = context.rip; + + regs.rdi = context.rdi; + regs.rsi = context.rsi; + regs.rdx = context.rdx; + regs.r10 = context.r10; + regs.r8 = context.r8; + regs.r9 = context.r9; + + // This is needed when syscall is interrupted by a signal (ERESTARTSYS) + // we need restore the original syscall number as well because it is + // possible syscall is reinjected as a different variant, like vfork -> + // clone, which accepts different arguments. + regs.orig_rax = context.orig_rax; + + // NB: syscall also clobbers %rcx/%r11, but we're not required to restore + // them, because the syscall is finished and they're supposed to change. + // TL&DR: do not restore %rcx/%r11 here. + + task.setregs(regs) +} + +impl TracedTask { + async fn intercept_cpuid(&mut self) -> bool { + // FIXME: This is almost certainly broken! 
+ let ret = self + .inject(ArchPrctl::new().with_cmd(ArchPrctlCmd::ARCH_SET_CPUID(0))) + .await; + ret == Ok(0) + } + + /// Perform the very first setup of a fresh tracee process: + /// + /// (1) Set up the special reverie/guest shared page in the tracee. + /// + /// (2) Also disables vdso within the guest + /// + /// Warning: this function MUTATES guest code to accomplish the modifications, even though this + /// mutation is undone before it returns. As a result, it has an extra precondition. + /// + /// Precondition: all threads in the guest process are stopped. Otherwise a guest state may be + /// executing the instructions that are mutated and may crash (due to problems with incoherent + /// instruction fetch resulting in non-atomic writes to instructions that straddle cache line + /// boundaries). + /// + /// Precondition: the caller is entitled to execute (blocking, destructive) waitpids against the + /// target tracee. This must not race with concurrent asynchronous tasks operating on the same + /// TID. + /// + /// Postcondition: the guest registers and code memory are restored to their original state, + /// including RIP, but the vdso page and special shared page are modified accordingly. + pub async fn tracee_preinit(&mut self, task: Stopped) -> Result { + /// Helper function for tracee_preinit that does the core work. + async fn setup_special_mmap_page(task: Stopped) -> Result { + // NB: This point in the code assumes that a specific instruction sequence "INT3; + // SYSCALL; INT3", has been patched into the guest, and that RIP points to the syscall. + // (I.e. we're already past the first breakpoint.) 
+ let mut regs = task.getregs()?; + let mut saved_regs = regs; + + let page_addr = cp::PRIVATE_PAGE_OFFSET; + + regs.orig_rax = Sysno::mmap as u64; + regs.rax = regs.orig_rax; + regs.rdi = page_addr; + regs.rsi = cp::PRIVATE_PAGE_SIZE as u64; + regs.rdx = (libc::PROT_READ | libc::PROT_WRITE | libc::PROT_EXEC) as u64; + regs.r10 = (libc::MAP_PRIVATE | libc::MAP_FIXED | libc::MAP_ANONYMOUS) as u64; + regs.r8 = -1i64 as u64; + regs.r9 = 0u64; + + task.setregs(regs)?; + // Execute the injected mmap call. + let mut running = task.step(None)?; + + // loop until second breakpoint hit after injected syscall. + let task = loop { + let (task, event) = running.next_state().await?.assume_stopped(); + match event { + Event::Signal(Signal::SIGTRAP) => break task, + Event::Signal(sig) => { + // We can catch spurious signals here, such as SIGWINCH. + // All we can do is skip over them. + debug!( + "[{}] Skipping {:?} during initialization", + task.pid(), + event + ); + running = task.resume(sig)?; + } + Event::Seccomp => { + // Injected mmap trapped. + running = task.resume(None)?; + } + unknown => { + panic!("task {} returned unknown event {:?}", task.pid(), unknown); + } + } + }; + + // Make sure we got our desired address. + assert_eq!( + Errno::from_ret(task.getregs()?.rax as i64)? as u64, + page_addr, + "Could not mmap address {}", + page_addr + ); + + cp::populate_mmap_page(task.pid().into(), page_addr).map_err(|err| err)?; + + saved_regs.rip -= 1; // bp size + task.setregs(saved_regs)?; + Ok(task) + } + + /// Put the guest into the weird state where it has an "INT3;SYSCALL;INT3" patched into the + /// code wherever RIP happens to be pointing. It leaves RIP pointing at the syscall + /// instruction. This allows forcible injection of syscalls into the guest. 
+ async fn establish_injection_state( + mut task: Stopped, + ) -> Result<(Stopped, user_regs_struct, u64), TraceError> { + // A syscall instruction flanked by INT3 breakpoints (1+2+1 bytes): + let bp_syscall_bp: u64 = 0xcc050fcc; + let regs = task.getregs()?; + + // Saved instruction memory + let rip = AddrMut::from_raw(regs.rip as usize).unwrap(); + let saved: u64 = task.read_value(rip)?; + // Patch the tracee at the current instruction pointer. + task.write_value(rip, &((saved & !(0xffffffff_u64)) | bp_syscall_bp))?; + + // When resumed, the tracee will hit the first breakpoint. Then we + // wait for it to reach that breakpoint and trap/stop. + let (task, event) = task + .resume(None)? + .wait_for_signal(Signal::SIGTRAP) + .await? + .assume_stopped(); + assert_eq!(event, Event::Signal(Signal::SIGTRAP)); + Ok((task, regs, saved)) + } + + /// Undo the effects of `establish_injection_state` and put the program code memory back to + /// normal. + fn remove_injection_state( + mut task: Stopped, + regs: user_regs_struct, + saved: u64, + ) -> Result { + // Restore what we dirtied: + task.write_value(AddrMut::from_raw(regs.rip as usize).unwrap(), &saved)?; + task.setregs(regs)?; + Ok(task) + } + + let (task, regs, saved) = establish_injection_state(task).await?; + let task = setup_special_mmap_page(task).await?; + + vdso::vdso_patch(self).await.expect("unable to patch vdso"); + + let mprotect = Mprotect::new() + .with_addr(AddrMut::from_raw(cp::TRAMPOLINE_BASE as usize)) + .with_len(cp::TRAMPOLINE_SIZE) + .with_protection(ProtFlags::PROT_READ | ProtFlags::PROT_EXEC); + self.inject(mprotect).await?; + + if self.global_state.subscriptions.has_cpuid() && !self.intercept_cpuid().await { + warn!("unable to intercept cpuid"); + } + + // Registers are restored from establish_injection_state. 
+ remove_injection_state(task, regs, saved) + } + + async fn handle_cpuid( + &mut self, + mut regs: libc::user_regs_struct, + ) -> Result { + let eax = regs.rax as u32; + let ecx = regs.rcx as u32; + let cpuid = self + .process_state + .clone() + .handle_cpuid_event(self, eax, ecx) + .await?; + regs.rax = cpuid.eax as u64; + regs.rbx = cpuid.ebx as u64; + regs.rcx = cpuid.ecx as u64; + regs.rdx = cpuid.edx as u64; + regs.rip += 2; + self.timer.finalize_requests(); + Ok(regs) + } + + async fn handle_rdtscs( + &mut self, + mut regs: libc::user_regs_struct, + request: Rdtsc, + ) -> Result { + let retval = self + .process_state + .clone() + .handle_rdtsc_event(self, request) + .await?; + regs.rax = retval.tsc & 0xffff_ffffu64; + regs.rdx = retval.tsc >> 32; + match request { + Rdtsc::Tsc => { + regs.rip += 2; + } + Rdtsc::Tscp => { + regs.rip += 3; + regs.rcx = retval.aux.unwrap_or(0) as u64; + } + } + self.timer.finalize_requests(); + Ok(regs) + } + + /// Returns `true` if the signal was actually meant for the timer, and + /// therefore should not be forwarded to the tool / guest. + async fn handle_timer(&mut self, task: Stopped) -> Result<(bool, Stopped), TraceError> { + let task = match self.timer.handle_signal(task).await { + Err(HandleFailure::ImproperSignal(task)) => return Ok((false, task)), + Err(HandleFailure::Cancelled(task)) => return Ok((true, task)), + Err(HandleFailure::TraceError(e)) => return Err(e), + Err(HandleFailure::Event(wait)) => self.abort(Ok(wait)).await, + Ok(task) => task, + }; + self.process_state.clone().handle_timer_event(self).await; + self.timer.finalize_requests(); + Ok((true, task)) + } + + /// Handle a state change in the guest, and leave it in a stopped state. + /// Return the signal that the process would be resumed with, if any. 
    ///
    /// Preconditions:
    /// * running on the ptracer pthread
    ///
    /// Postconditions:
    /// * guest thread may or may not be stopped, depending on value of GuestNext
    ///
    async fn handle_stop_event(&mut self, stopped: Stopped, event: Event) -> Result {
        self.timer.observe_event();
        // A task is processed by this loop on any state change, so we must
        // handle all possibilities here:
        Ok(match event {
            Event::Signal(sig) => self.handle_signal(stopped, sig).await?,
            // A state we reach in the middle, between the prehook (before exec
            // syscall) and the exec completing (posthook).
            Event::Exec(_new_pid) => self.handle_exec_event(stopped).await?,
            // A regular old system call.
            Event::Seccomp => self.handle_seccomp(stopped).await?,
            Event::NewChild(op, child) => self.handle_new_task(op, stopped, child, None).await?,
            Event::VforkDone => self.handle_vfork_done_event(stopped).await?,
            task_state => panic!("unknown task state: {:?}", task_state),
        })
    }

    /// Looks up this task's own entry in `child_threads` (matched by tid) and
    /// returns its `suspended` flag together with its `wait_all_stop_tx`
    /// sender, which is taken out of the entry.
    ///
    /// Returns `None` if no entry matches or the sender was already taken.
    async fn get_stop_tx(&self) -> Option<(Arc, mpsc::Sender<(Pid, Suspended)>)> {
        for child in self.child_threads.lock().await.deref_mut().into_iter() {
            if child.id() == self.tid() {
                return Some((child.suspended.clone(), child.wait_all_stop_tx.take()?));
            }
        }
        None
    }

    /// Handles a SIGTRAP stop. The signal is suppressed in every case:
    ///
    /// * if `rip - 1` is a registered breakpoint address, `rip` is rewound
    ///   onto it and the task continues via `resume_from_swbreak`;
    /// * if the last gdb resume action was a single step, the stop is
    ///   reported to gdb and the task waits for gdb to resume it;
    /// * otherwise the task is simply resumed.
    async fn handle_sigtrap(&mut self, task: Stopped) -> Result {
        let resumed_by_gdb_step = self
            .resumed_by_gdb
            .map_or(false, |action| matches!(action, ResumeAction::Step(_)));
        let mut regs = task.getregs()?;
        let rip_minus_one = regs.rip - 1;

        Ok(if self.breakpoints.contains_key(&rip_minus_one) {
            regs.rip = rip_minus_one;
            let next_state = self.resume_from_swbreak(task, regs).await?;
            HandleSignalResult::SignalSuppressed(next_state)
        } else if resumed_by_gdb_step {
            self.notify_gdb_stop(StopReason::stopped(
                task.pid(),
                self.pid(),
                StopEvent::Signal(Signal::SIGTRAP),
                regs.into(),
            ))
            .await?;
            let running = self
                .await_gdb_resume(task, ExpectedGdbResume::Resume)
                .await?;
            HandleSignalResult::SignalSuppressed(running.next_state().await?)
        } else {
            let running = task.resume(None)?;
            HandleSignalResult::SignalSuppressed(running.next_state().await?)
        })
    }

    /// Handles a SIGSTOP stop. If `get_stop_tx` yields a sender, this task's
    /// pid and a `Suspended` handle are sent through it and, when the send
    /// succeeds, the task waits on `exit_suspend_rx` before continuing. The
    /// SIGSTOP itself is always suppressed and the task resumed.
    async fn handle_sigstop(&mut self, task: Stopped) -> Result {
        let resumed_by_gdb_step = self
            .resumed_by_gdb
            .map_or(false, |action| matches!(action, ResumeAction::Step(_)));
        debug_assert!(!resumed_by_gdb_step);
        if let Some((suspended_flag, stop_tx)) = self.get_stop_tx().await {
            let notify_stop_tx = stop_tx
                .send((
                    task.pid(),
                    Suspended {
                        waker: self.exit_suspend_tx.clone(),
                        suspended: suspended_flag,
                    },
                ))
                .await;
            drop(stop_tx);
            if notify_stop_tx.is_ok() {
                if let Some(rx) = self.exit_suspend_rx.as_mut() {
                    let _resumed_by = rx.recv().await.unwrap();
                }
            }
        }
        Ok(HandleSignalResult::SignalSuppressed(
            task.resume(None)?.next_state().await?,
        ))
    }

    /// Handles a SIGSEGV stop. Reads the value at the tracee's `rip` and, if
    /// `decode_segfault` recognizes a cpuid/rdtsc trap, emulates the
    /// instruction, writes the resulting registers back and resumes with the
    /// signal suppressed. Otherwise the SIGSEGV is handed back for delivery.
    async fn handle_sigsegv(&mut self, task: Stopped) -> Result {
        let regs = task.getregs()?;
        let trap_info = Addr::from_raw(regs.rip as usize)
            .and_then(|addr| task.read_value(addr).ok())
            .and_then(decode_segfault);
        Ok(match trap_info {
            Some(SegfaultTrapInfo::Cpuid) => {
                let regs = self.handle_cpuid(regs).await?;
                task.setregs(regs)?;
                HandleSignalResult::SignalSuppressed(task.resume(None)?.next_state().await?)
            }
            Some(SegfaultTrapInfo::Rdtscs(req)) => {
                let regs = self.handle_rdtscs(regs, req).await?;
                task.setregs(regs)?;
                HandleSignalResult::SignalSuppressed(task.resume(None)?.next_state().await?)
            }
            None => HandleSignalResult::SignalToDeliver(task, Signal::SIGSEGV),
        })
    }

    // handle ptrace signal delivery stop
    //
    // SIGSEGV, SIGSTOP, SIGTRAP and the timer signal get dedicated handling
    // first. Any signal that is still to be delivered afterwards is passed to
    // the tool's `handle_signal_event`, and the task is resumed with whatever
    // signal that callback returns.
    async fn handle_signal(&mut self, task: Stopped, sig: Signal) -> Result {
        debug!("[{}] handle_signal: received signal {}", task.pid(), sig);
        let result = match sig {
            Signal::SIGSEGV => self.handle_sigsegv(task).await?,
            Signal::SIGSTOP => self.handle_sigstop(task).await?,
            Signal::SIGTRAP => self.handle_sigtrap(task).await?,
            sig if sig == Timer::signal_type() => {
                let (was_timer, task) = self.handle_timer(task).await?;
                if was_timer {
                    HandleSignalResult::SignalSuppressed(task.resume(None)?.next_state().await?)
                } else {
                    HandleSignalResult::SignalToDeliver(task, sig)
                }
            }
            sig => HandleSignalResult::SignalToDeliver(task, sig),
        };

        match result {
            HandleSignalResult::SignalSuppressed(wait) => Ok(wait),
            HandleSignalResult::SignalToDeliver(task, sig) => {
                let sig = self
                    .process_state
                    .clone()
                    .handle_signal_event(self, sig)
                    .await?;
                self.timer.finalize_requests();
                Ok(task.resume(sig)?.next_state().await?)
            }
        }
    }

    // handle ptrace exec event
    //
    // Re-runs `tracee_preinit` on the task, calls the tool's
    // `handle_post_exec`, and then either reports the exec stop to gdb and
    // waits for it to resume the task (when attached) or single-steps the task.
    async fn handle_exec_event(&mut self, task: Stopped) -> Result {
        // execve/execveat are tail injected, however, after exec, the new
        // program start as a clean slate, hence it is actually ok to do either
        // inject or tail inject after execve succeeded.
        self.pending_syscall = None;

        // TODO: Update thread ID? Need to write a test checking this.

        let task = self.tracee_preinit(task).await?;

        self.process_state.clone().handle_post_exec(self).await?;
        self.timer.finalize_requests();

        if self.attached_by_gdb {
            let request_tx = self.gdb_request_tx.clone();
            let resume_tx = self.gdb_resume_tx.clone();

            // NOTE(review): panics if the /proc link cannot be read.
            let proc_exe = format!("/proc/{}/exe", task.pid());
            let exe = std::fs::read_link(&proc_exe[..]).unwrap();

            let stopped = StoppedInferior {
                reason: StopReason::stopped(
                    task.pid(),
                    self.pid(),
                    StopEvent::Exec(exe),
                    task.getregs()?.into(),
                ),
                request_tx: request_tx.unwrap(),
                resume_tx: resume_tx.unwrap(),
            };

            // NB: notify initial gdb stop, this is the first time we can
            // tell gdb tracee is ready, because a new memory map has been
            // loaded (due to execve). Otherwise gdb may try to manipulate
            // old process' address space.
            if let Some(attach_tx) = self.gdb_stop_tx.as_ref() {
                let _ = attach_tx.send(stopped).await.unwrap();
            }
            let running = self
                .await_gdb_resume(task, ExpectedGdbResume::Resume)
                .await?;
            Ok(running.next_state().await?)
        } else {
            Ok(task.step(None)?.next_state().await?)
        }
    }

    /// Handles a seccomp stop: decodes the syscall, records it as pending and
    /// runs the tool's `handle_syscall_event` (cancellable through
    /// `self.notifier`). If the syscall is still pending afterwards it is
    /// skipped in the tracee. When the handler produced a result, it is
    /// written to `rax` (errors as a negated errno). Finally the task is
    /// resumed with any pending signal.
    async fn handle_seccomp(&mut self, mut task: Stopped) -> Result {
        let syscall = self.get_syscall(&task)?;
        let (nr, args) = syscall.into_parts();

        self.pending_syscall = Some((nr, args));

        let retval = cancellable(self.notifier.clone(), async {
            self.process_state
                .clone()
                .handle_syscall_event(self, syscall)
                .await
        })
        .await;

        // If no syscall was injected, then we need to suppress the implicit
        // syscall.
        if self.pending_syscall.is_some() {
            task = self.skip_seccomp_syscall(task).await?;
        }

        // Finalize timer requests after `skip_seccomp_syscall`, which may step
        self.timer.finalize_requests();

        if let Some(retval) = retval {
            let ret = match retval {
                Ok(x) => x as u64,
                Err(err) => (-(err.into_errno()?.into_raw() as i64)) as u64,
            };

            set_rax(&task, ret)?;
        }

        // Finally, resume the guest.
        let sig = self.pending_signal.take();
        Ok(task.resume(sig)?.next_state().await?)
    }

    /// Handles a fork/vfork/clone event.
    ///
    /// Creates the tracking state for the child (`cloned` for `Clone`,
    /// `forked` otherwise), spawns a local task that drives the child to
    /// completion, and records the child in `child_threads` (for `Clone`) or
    /// `child_procs` (otherwise). The parent is then stepped, or handed to gdb
    /// when attached.
    ///
    /// When `context` is `Some`, it is restored in the parent with `rax` set
    /// to the child's pid, and in the child once it reaches its initial
    /// SIGSTOP.
    async fn handle_new_task(
        &mut self,
        op: ChildOp,
        parent: Stopped,
        child: Running,
        context: Option,
    ) -> Result {
        debug!(
            "[scheduler] handling fork from parent {} to child {}: {:?}",
            parent.pid(),
            child.pid(),
            op
        );

        let mut child_task = match op {
            ChildOp::Clone => self.cloned(child.pid()),
            ChildOp::Fork => self.forked(child.pid()),
            ChildOp::Vfork => self.forked(child.pid()),
        };

        let (child_stop_tx, child_stop_rx) = mpsc::channel(1);
        child_task.gdb_stop_tx = Some(child_stop_tx);

        let daemonizer_rx = child_task.daemonizer_rx.take();
        let child_resume_tx = child_task.gdb_resume_tx.clone();
        let child_request_tx = child_task.gdb_request_tx.clone();
        let suspended = child_task.suspended.clone();

        if let Some(context) = context {
            restore_context(&parent, context, Some(child.pid().as_raw() as u64))?;
        }

        let id = child.pid();

        let task = tokio::task::spawn_local(async move {
            // The child could potentially exit here. In most cases the first
            // event we get here should be `Event::Signal(Signal::SIGSTOP)`, but
            // we can also receive `Event::Exit` if a thread is created via
            // `clone`, but immediately killed via an `exit_group`. We have to
            // handle that rare case here.
            //
            // NOTE: It is okay to call `wait` instead of the async `next_state`
            // here because the notifier is not yet aware of the new process.
            let (child, event) = child.wait().unwrap().assume_stopped();

            assert!(
                event == Event::Signal(Signal::SIGSTOP) || event == Event::Exit,
                "Got unexpected event {:?}",
                event
            );

            if let Some(context) = context {
                // Restore context, but only if the child hasn't arrived at
                // `Event::Exit`.
                if event == Event::Signal(Signal::SIGSTOP) {
                    restore_context(&child, context, None).unwrap();
                }
            }

            if child_task.is_a_daemon {
                child_task.ndaemons.fetch_add(1, Ordering::SeqCst);
            }

            let tid = child.pid();
            match child_task.run(child).await {
                Err(err) => {
                    tracing::error!("Error in tracee tid {}: {}", tid, err);

                    // We assume the tracee is stopped since this error likely
                    // originated from the tool itself when the tracee is
                    // already stopped. If the tracee is not in a stopped state,
                    // that's fine too and ignore the detach error.
                    let running = match Stopped::new_unchecked(tid).detach(None) {
                        Err(err) => {
                            // If we get an error here, the child process may
                            // not be in a ptrace stop.
                            tracing::error!("Failed to detach from {}: {}", tid, err);
                            return ExitStatus::Exited(1);
                        }
                        Ok(running) => running,
                    };

                    // Reap the process and get its exit status.
                    let (_pid, exit_status) = running.next_state().await.unwrap().assume_exited();
                    exit_status
                }
                Ok(exit_status) => exit_status,
            }
        });

        if op == ChildOp::Clone {
            let mut child_threads = self.child_threads.lock().await;
            child_threads.push(Child {
                id,
                suspended,
                wait_all_stop_tx: None,
                daemonizer_rx,
                handle: task,
            });
        } else {
            let mut child_procs = self.child_procs.lock().await;
            child_procs.push(Child {
                id,
                suspended,
                wait_all_stop_tx: None,
                daemonizer_rx,
                handle: task,
            });
        }

        let parent_regs = parent.getregs()?;
        if self.attached_by_gdb {
            // NB: We report T05;create event (for clone). However gdbserver
            // from binutils-gdb doesn't report it, even after toggling
            // QThreadEvents, as mentioned in https://sourceware.org/gdb/onlinedocs/gdb/General-Query-Packets.html#QThreadEvents
            // We report `create` event anyway.
            self.notify_gdb_stop(StopReason::new_task(
                self.tid(),
                self.pid(),
                id,
                parent_regs.into(),
                op,
                child_request_tx,
                child_resume_tx,
                Some(child_stop_rx),
            ))
            .await?;
            // We just reported a new event, wait for gdb resume.
            let running = self
                .await_gdb_resume(parent, ExpectedGdbResume::StepOnly)
                .await?;
            // NB: We could potentially hit a breakpoint after above resume,
            // make sure we don't miss the breakpoint and await for gdb
            // resume (once again). This is possible because result of
            // handle_new_task in from_task_state is ignored, while it could
            // be a valid state like SIGTRAP, which could be a breakpoint is
            // hit.
            running
                .next_state()
                .and_then(|wait| self.check_swbreak(wait))
                .await
        } else {
            Ok(parent.step(None)?.next_state().await?)
        }
    }

    /// Handles a vfork-done event by resuming the task and waiting for its
    /// next state.
    async fn handle_vfork_done_event(&mut self, stopped: Stopped) -> Result {
        Ok(stopped.resume(None)?.next_state().await?)
    }

    /// Resumes a task that is stopped at its exit event and returns its final
    /// exit status.
    async fn handle_exit_event(task: Stopped) -> Result {
        // Nothing to do but resume and wait for the final exit status.
        let wait = task.resume(None)?.next_state().await?;
        let (_pid, exit_status) = wait.assume_exited();
        Ok(exit_status)
    }

    /// Aborts the current handler. This just sends a result through a channel to
    /// the `run_loop`, which should cause the current future to be dropped and
    /// canceled. Thus, this function will never return so that execution of the
    /// current future doesn't proceed any further.
    async fn abort(&mut self, result: Result) -> ! {
        self.next_state.send(result).await.unwrap();

        // Wait on a future that will never complete. This pending future will
        // be dropped when the channel receives the event just sent.
        future::pending::().await
    }

    /// Marks the current task as exited via a channel. The receiver end of the
    /// channel should cause the current future to be dropped and canceled. Thus,
    /// this function will never return so that execution doesn't proceed any
    /// further.
+ async fn exit(&mut self, exit_status: ExitStatus) -> ! { + self.abort(Ok(Wait::Exited(self.tid(), exit_status))).await + } + + /// Marks the current task as having successfully called `execve` and so it + /// should never return. + async fn execve(&mut self, next_state: Wait) -> ! { + self.abort(Ok(next_state)).await + } + + /// Triggers the tool exit callbacks. + async fn tool_exit(self, exit_status: ExitStatus) -> Result<(), reverie::Error> { + if self.is_main_thread() { + // Wait for all child threads to fully exit. This *must* happen before + // the main thread can exit. + // TODO: Use FuturesUnordered instead of `join_all` for better + // performance. + { + let children = self.child_threads.lock().await.take_inner(); + future::join_all(children).await; + } + + // Check if there are any children who's futures are still pending. If + // this is the case, then they shall be considered "orphans" and are + // "adopted" by the tracer process who shall then wait for them to exit + // and get their final exit code. Normally, when not running under + // ptrace, orphans are adopted by the init process who should + // automatically reap them by waiting for the final exit status. + let (orphans, _) = { + let mut child_procs = self.child_procs.lock().await; + child_procs.deref_mut().await + }; + + for orphan in orphans.into_inner() { + // Bon voyage. + self.orphanage.send(orphan).await.unwrap(); + } + + let _ = self + .notify_gdb_stop(StopReason::Exited(self.pid(), exit_status)) + .await; + + let wrapped = WrappedFrom(self.tid, &self.global_state); + + // Thread exit + self.process_state + .on_exit_thread(self.tid, &wrapped, self.thread_state, exit_status) + .await?; + + // The try_unwrap and subsequent unwrap are safe to do. ptrace + // guarantees that all threads in the thread group have exited + // before the main thread. 
+ let process_state = Arc::try_unwrap(self.process_state).unwrap_or_else(|_| { + // If you end up seeing this panic, make sure that all clones of + // `process_state` are dropped before reaching this point. + panic!("Reverie internal invariant broken. try_unwrap on process state failed") + }); + let wrapped = WrappedFrom(self.tid, &self.global_state); + process_state + .on_exit_process(self.tid, &wrapped, exit_status) + .await?; + + let ntasks_remaining = self.ntasks.fetch_sub(1, Ordering::SeqCst); + let ndaemons = self.ndaemons.load(Ordering::SeqCst); + + if self.is_a_daemon { + self.ndaemons.fetch_sub(1, Ordering::SeqCst); + } + + if ntasks_remaining == 1 + ndaemons { + // daemonize() might not get called, this is not an error. + let _ = self.daemon_kill_switch.send(()); + } + } else { + let _ = self + .notify_gdb_stop(StopReason::ThreadExited( + self.tid(), + self.pid(), + exit_status, + )) + .await; + let wrapped = WrappedFrom(self.tid, &self.global_state); + + self.child_threads + .lock() + .await + .retain(|child| child.id() != self.tid); + + // Thread exit + self.process_state + .on_exit_thread(self.tid, &wrapped, self.thread_state, exit_status) + .await?; + + self.ntasks.fetch_sub(1, Ordering::SeqCst); + if self.is_a_daemon { + self.ndaemons.fetch_sub(1, Ordering::SeqCst); + } + } + + Ok(()) + } + + async fn run_loop(&mut self, task: Stopped) -> Result { + match self.run_loop_internal(task).await { + Ok(exit_status) => Ok(exit_status), + Err(err) => { + // Note: Calling handle_internal_error cannot happen in the + // `select!()` of the `run` function because then the exit + // events that get generated in here cannot be caught by the + // `select!()`. + handle_internal_error(err).await + } + } + } + + async fn run_loop_internal(&mut self, task: Stopped) -> Result { + // This is the beginning of the life of the guest. Allow the tool to + // inject syscalls as soon as the thread starts. 
+ if let Some(Err(err)) = cancellable(self.notifier.clone(), async { + self.process_state.clone().handle_thread_start(self).await + }) + .await + { + // Propagate user errors. Don't care about the result of syscall injections. + err.into_errno()?; + } + self.timer.finalize_requests(); + + // Resume the guest for the first time. Note that the root task and + // child tasks start out in a stopped state for different reasons: The + // root task is stopped because of the SIGSTOP raised inside of `fork()` + // after calling `traceme`. Child tasks start out in a running state, + // but we wait for them to stop in `Event::NewChild`. + // + // NB: await_gdb_resume == resume if not attached_by_gdb. + let running = self + .await_gdb_resume(task, ExpectedGdbResume::Resume) + .await?; + + // Notify gdb server (if any) that tracee is ready. + if let Some(server_tx) = self.gdbserver_start_tx.take() { + self.attached_by_gdb = true; + server_tx.send(()).unwrap(); + } + + let mut task_state = running.next_state().await?; + let mut next_state_rx = self.next_state_rx.take().unwrap(); + + loop { + match task_state { + Wait::Stopped(stopped, event) => { + // Allow short-circuiting of the event stream. This makes it + // easier to send exit and execve events directly to the run + // loop from within `inject` or `tail_inject`. + let fut1 = next_state_rx.recv().fuse(); + let fut2 = self.handle_stop_event(stopped, event).fuse(); + + futures::pin_mut!(fut1, fut2); + + task_state = futures::select_biased! { + next_state = fut1 => { + if let Some(next_state) = next_state { + next_state.map_err(Error::Internal) + } else { + panic!() + } + } + next_state = fut2 => next_state, + }?; + } + Wait::Exited(pid, exit_status) => { + self.notify_gdb_stop(StopReason::Exited(pid, exit_status)) + .await?; + break Ok(exit_status); + } + } + } + } + + /// Drive a single guest thread to completion. Returns the final exit code + /// when that guest thread exits. 
+ pub async fn run(mut self, child: Stopped) -> Result { + let exit_status = { + let exit_event = child.exit_event().fuse(); + let run_loop = self.run_loop(child).fuse(); + futures::pin_mut!(exit_event, run_loop); + + futures::select_biased! { + task = exit_event => match Self::handle_exit_event(task).await { + Ok(exit_status) => exit_status, + Err(err) => handle_internal_error(err.into()).await?, + }, + exit_status = run_loop => exit_status?, + } + }; + + self.tool_exit(exit_status).await?; + + Ok(exit_status) + } + + /// Skip the syscall which is about to happen in the tracee, switching the tracee + /// from Seccomp() state to Stopped(SIGTRAP) state. + /// + /// This uses the convention that setting the syscall number to -1 causes the + /// kernel to skip it. This function takes as argument the current register state + /// and restores it after stepping over the skipped syscall instruction. + /// + /// Preconditions: + /// Ptrace tracee is in a (seccomp) stopped state. + /// The tracee was stopped with the RIP pointing just after a syscall instruction (+2). + /// + /// Postconditions: + /// Set tracee state to Stopped/SIGTRP. + /// Restore the registers to the state specified by the regs arg. + async fn skip_seccomp_syscall(&mut self, task: Stopped) -> Result { + let regs = task.getregs()?; + + // So here we are, at ptrace seccomp stop, if we simply resume, the kernel + // would do the syscall, without our patch. we change to syscall number to + // -1, so that kernel would simply skip the syscall, so that we can jump to + // our patched syscall on the first run. Please note after calling this + // function, the task state will no longer be in ptrace event seccomp. + let mut new_regs = regs; + new_regs.orig_rax = -1i64 as u64; + task.setregs(new_regs)?; + let mut running = task.step(None)?; + + // After the step, wait for the next transition. Note that this can return + // an exited state if there is a group exit while some thread is blocked on + // a syscall. 
+ loop { + match running.next_state().await? { + Wait::Stopped(task, Event::Signal(Signal::SIGTRAP)) => { + task.setregs(regs)?; + break Ok(task); + } + Wait::Stopped(task, Event::Signal(sig)) => { + // We can get a spurious signal here, such as SIGWINCH. Skip + // past them until the tracee eventually arrives at SIGTRAP. + running = task.step(sig)?; + } + Wait::Stopped(task, event) => { + panic!( + "skip_seccomp_syscall: PID {} got unexpected event: {:?}", + task.pid(), + event + ); + } + Wait::Exited(_pid, exit_status) => { + break self.exit(exit_status).await; + } + } + } + } + + /// inject syscall for given tracee + /// + /// NB: limitations: + /// - tracee must be in stopped state. + /// - the tracee must have returned from PTRACE_EXEC_EVENT + /// - must be called on the ptracer thread + /// + /// Side effects: + /// - mutates contexts + async fn untraced_syscall( + &mut self, + task: Stopped, + nr: Sysno, + args: SyscallArgs, + ) -> Result, TraceError> { + trace!( + "[scheduler/tool] (pid = {}) untraced syscall: {:?}", + task.pid(), + nr + ); + let mut regs = task.getregs()?; + + let oldregs = regs; + + let no = nr as u64; + regs.orig_rax = no; + regs.rax = no; + regs.rdi = args.arg0; + regs.rsi = args.arg1; + regs.rdx = args.arg2; + regs.r10 = args.arg3; + regs.r8 = args.arg4; + regs.r9 = args.arg5; + + // instruction at PRIVATE_PAGE_OFFSET, see `populate_mmap_page`. 
+ // 7000_0000: 0f 05 syscall + // 7000_0002: 0f 0b ud2 + regs.rip = cp::PRIVATE_PAGE_OFFSET; + + task.setregs(regs)?; + + let wait = task.step(None)?.next_state().await?; + + self.from_task_state(wait, Some(oldregs)).await + } + + // Helper function + async fn private_inject( + &mut self, + task: Stopped, + nr: Sysno, + args: SyscallArgs, + ) -> Result, TraceError> { + let task = self.skip_seccomp_syscall(task).await?; + + self.untraced_syscall(task, nr, args).await + } + + async fn from_task_state( + &mut self, + wait_status: Wait, + context: Option, + ) -> Result, TraceError> { + match wait_status { + Wait::Stopped(stopped, event) => match event { + Event::Signal(_sig) if context.is_none() => { + let regs = stopped.getregs()?; + Ok(Ok(regs.rax as i64)) + } + Event::Signal(sig) => { + let mut regs = stopped.getregs()?; + // NB: it is possible to get interrupted by signal (such as + // SIGCHLD) before single step finishes (in that case rip == + // 0x7000_0000u64). + debug_assert!( + regs.rip == cp::PRIVATE_PAGE_OFFSET + 0x2 + || regs.rip == cp::PRIVATE_PAGE_OFFSET + ); + // interrupted by signal, return -ERESTARTSYS so that tracee can do a + // restart_syscall. + if sig != Signal::SIGTRAP { + regs.rax = (-(Errno::ERESTARTSYS.into_raw()) as i64) as u64; + self.pending_signal = Some(sig); + } + if let Some(context) = context { + // Restore syscall args to original values. This is + // needed when we convert syscalls like SYS_open -> + // SYS_openat, syscall args are modified need to restore + // it back. + restore_context(&stopped, context, None)?; + } + Ok(Errno::from_ret(regs.rax as i64)) + } + Event::NewChild(op, child) => { + let ret = child.pid().as_raw() as i64; + let _ = self.handle_new_task(op, stopped, child, context).await?; + Ok(Ok(ret)) + } + Event::Exec(_new_pid) => { + // This should never return. 
+ let next_state = self.handle_exec_event(stopped).await?; + self.execve(next_state).await + } + Event::Syscall => { + let regs = stopped.getregs()?; + Ok(Errno::from_ret(regs.rax as i64)) + } + st => panic!("untraced_syscall returned unknown state: {:?}", st), + }, + Wait::Exited(_pid, exit_status) => self.exit(exit_status).await, + } + } + + async fn do_inject(&mut self, nr: Sysno, args: SyscallArgs) -> Result { + match self.inner_inject(nr, args).await { + Ok(ret) => ret, + Err(err) => self.abort(Err(err)).await, + } + } + + async fn inner_inject( + &mut self, + nr: Sysno, + args: SyscallArgs, + ) -> Result, TraceError> { + let task = self.assume_stopped(); + + info!( + "[tool] (tid {}) beginning inject of syscall: {}", + self.tid(), + nr, + ); + + if self.pending_syscall.take() == Some((nr, args)) { + // If we're reinjecting the same syscall with the same arguments, + // then we can just let the tracee continue and stop at sysexit. + let wait = task.syscall(None)?.next_state().await?; + self.from_task_state(wait, None).await + } else { + self.private_inject(task, nr, args).await + } + } + + async fn do_tail_inject(&mut self, nr: Sysno, args: SyscallArgs) -> ! { + match self.inner_tail_inject(nr, args).await { + Ok(_) => { + // Drop the handle_syscall_event future. + self.notifier.notify_one(); + future::pending::().await + } + Err(err) => self.abort(Err(err)).await, + } + } + + async fn inner_tail_inject( + &mut self, + nr: Sysno, + args: SyscallArgs, + ) -> Result, TraceError> { + let tid = self.tid(); + + info!( + "[tool] (tid {}) beginning tail_inject of syscall: {}", + &tid, nr, + ); + + let task = self.assume_stopped(); + + if self.pending_syscall.take() == Some((nr, args)) { + // We're reinjecting the same syscall with the same arguments. + // Nothing to actually do but let the tracee resume. + + // The return value here doesn't matter. + Ok(Ok(0)) + } else { + // Syscall has already been injected. Can't do the optimization. 
+ self.private_inject(task, nr, args).await + } + } + + /// Get a ptrace stub which can do ptrace operations + // Assumption: Task is in stopped state as long as we have a valid + // reference to `TracedTask`. + fn assume_stopped(&self) -> Stopped { + Stopped::new_unchecked(self.tid()) + } + + async fn notify_gdb_stop(&self, reason: StopReason) -> Result<(), TraceError> { + if !self.attached_by_gdb { + return Ok(()); + } + + if let Some(stop_tx) = self.gdb_stop_tx.as_ref() { + let request_tx = self.gdb_request_tx.clone(); + let resume_tx = self.gdb_resume_tx.clone(); + let stop = StoppedInferior { + reason, + request_tx: request_tx.unwrap(), + resume_tx: resume_tx.unwrap(), + }; + let _ = stop_tx.send(stop).await.unwrap(); + } + Ok(()) + } + + async fn handle_gdb_request(&mut self, request: Option) { + if let Some(request) = request { + match request { + GdbRequest::SetBreakpoint(bkpt, reply_tx) => { + if bkpt.ty == BreakpointType::Software { + let result = self.add_breakpoint(bkpt.addr).await; + reply_tx.send(result).unwrap(); + } + } + GdbRequest::RemoveBreakpoint(bkpt, reply_tx) => { + if bkpt.ty == BreakpointType::Software { + let result = self.remove_breakpoint(bkpt.addr).await; + reply_tx.send(result).unwrap(); + } + } + GdbRequest::ReadInferiorMemory(addr, length, reply_tx) => { + let result = self.read_inferior_memory(addr, length); + reply_tx.send(result).unwrap(); + } + GdbRequest::WriteInferiorMemory(addr, length, data, reply_tx) => { + let result = self.write_inferior_memory(addr, length, data); + reply_tx.send(result).unwrap(); + } + GdbRequest::ReadRegisters(reply_tx) => { + let result = self.read_registers(); + reply_tx.send(result).unwrap(); + } + GdbRequest::WriteRegisters(core_regs, reply_tx) => { + let result = self.write_registers(core_regs); + reply_tx.send(result).unwrap(); + } + } + } + } + + async fn handle_gdb_resume( + resume: Option, + task: Stopped, + resume_action: ExpectedGdbResume, + ) -> Result<(Running, Option), TraceError> { + 
match resume { + None => Ok((task.resume(None)?, None)), + Some(resume) => { + let is_resume = resume_action == ExpectedGdbResume::Resume || resume.detach; + let is_step_only = resume_action == ExpectedGdbResume::StepOnly; + let running = match resume.action { + ResumeAction::Step(sig) => task.step(sig)?, + ResumeAction::Continue(sig) if is_resume => task.resume(sig)?, + ResumeAction::Continue(sig) if is_step_only => task.step(sig)?, + action => panic!( + "[pid = {}] unexpected resume action {:?}, expecting: {:?}", + task.pid(), + action, + resume_action, + ), + }; + Ok((running, Some(resume))) + } + } + } + + async fn await_gdb_resume( + &mut self, + task: Stopped, + resume_action: ExpectedGdbResume, + ) -> Result { + if !self.attached_by_gdb { + return task.resume(None); + } + + let mut resume_rx = self.gdb_resume_rx.take().unwrap(); + let mut gdb_request_rx = self.gdb_request_rx.take().unwrap(); + + let mut resume_future = Box::pin(resume_rx.recv()); + + let (running, resumed) = loop { + let request_future = Box::pin(gdb_request_rx.recv()); + + match future::select(request_future, resume_future).await { + Either::Left((gdb_request, pending_resume_future)) => { + self.handle_gdb_request(gdb_request).await; + resume_future = pending_resume_future; + } + Either::Right((resume_request, _)) => { + break Self::handle_gdb_resume(resume_request, task, resume_action).await?; + } + } + }; + + self.gdb_request_rx = Some(gdb_request_rx); + self.gdb_resume_rx = Some(resume_rx); + + if let Some(resumed) = resumed { + if resumed.detach { + // no longer report stop event to gdb + // self.gdb_stop_tx = None; + self.attached_by_gdb = false; + } + + self.resumed_by_gdb = Some(resumed.action); + } + + Ok(running) + } + + /// Resume from a software breakpoint set by gdb. The resume action is + /// initiated from gdb (client). + // NB: caller to %rip accordingly prior to hitting breakpoint. 
+ async fn resume_from_swbreak( + &mut self, + task: Stopped, + regs: libc::user_regs_struct, + ) -> Result { + task.setregs(regs)?; + + // Task could be hitting a breakpoint, after previously suspended by + // a different task, need to notify this task is fully stopped. + self.suspended.store(true, Ordering::SeqCst); + if let Some((suspended_flag, stop_tx)) = self.get_stop_tx().await { + let _ = stop_tx + .send(( + self.tid(), + Suspended { + waker: None, + suspended: suspended_flag, + }, + )) + .await + .unwrap(); + } + + // When resuming from breakpoint, gdb (client) needs to remove the + // breakpoint (implying restore the original instruction), do a + // single-step (step-over), and re-insert the breakpoint. + // Because removing (sw) breakpoint modifies the instructions, other + // thread might miss the breakpoint after the breakpoint is removed + // and before the breakpoint is (re-)inserted. Hence we must make + // serialize this sequence. + let needs_step_over = self.needs_step_over.clone(); + let _guard = needs_step_over.lock().await; + + self.notify_gdb_stop(StopReason::stopped( + task.pid(), + self.pid(), + StopEvent::SwBreak, + regs.into(), + )) + .await?; + + self.freeze_all().await?; + + let running = self + .await_gdb_resume(task, ExpectedGdbResume::StepOver) + .await?; + let wait = running.next_state().await?.assume_stopped(); + let mut task = wait.0; + let mut event = wait.1; + + // Detached by client. + if !self.attached_by_gdb { + self.thaw_all().await?; + return Ok(Wait::Stopped(task, event)); + } + + task = loop { + match event { + Event::Signal(Signal::SIGTRAP) => break task, + Event::Signal(Signal::SIGSTOP) => { + let running = task.step(None)?; + let wait = running.next_state().await?.assume_stopped(); + task = wait.0; + event = wait.1; + } + // TODO: combine with handle_signal! 
+ Event::Signal(Signal::SIGCHLD) => { + let running = task.step(Signal::SIGCHLD)?; + let wait = running.next_state().await?.assume_stopped(); + task = wait.0; + event = wait.1; + } + unknown => panic!("[pid = {}] got unexpected event {:?}", self.tid(), unknown), + } + }; + self.notify_gdb_stop(StopReason::stopped( + task.pid(), + self.pid(), + StopEvent::Signal(Signal::SIGTRAP), + task.getregs()?.into(), + )) + .await?; + + let running = self + .await_gdb_resume(task, ExpectedGdbResume::Resume) + .await?; + let wait = running.next_state().await?; + self.thaw_all().await?; + Ok(wait) + } + + /// check if the stop is caused by sw breakpoint. + async fn check_swbreak(&mut self, wait: Wait) -> Result { + match wait { + Wait::Stopped(task, event) if event == Event::Signal(Signal::SIGTRAP) => { + let mut regs = task.getregs()?; + let rip_minus_one = regs.rip - 1; + if self.breakpoints.contains_key(&rip_minus_one) { + regs.rip = rip_minus_one; + self.resume_from_swbreak(task, regs).await + } else { + Ok(Wait::Stopped(task, event)) + } + } + other => Ok(other), + } + } + + async fn add_breakpoint(&mut self, addr: u64) -> Result<(), TraceError> { + if let Some(bkpt_addr) = AddrMut::from_raw(addr as usize) { + let mut task = self.assume_stopped(); + let saved_insn: u64 = task.read_value(bkpt_addr)?; + let insn = (saved_insn & !0xffu64) | 0xccu64; + task.write_value(bkpt_addr, &insn)?; + self.breakpoints.insert(addr, saved_insn); + } + Ok(()) + } + + /// thaw all threads. + async fn thaw_all(&mut self) -> Result<(), TraceError> { + while let Some((_pid, suspended_task)) = self.suspended_tasks.pop_first() { + if let Some(tx) = suspended_task.waker.as_ref() { + suspended_task.suspended.store(false, Ordering::SeqCst); + let _sent = tx.try_send(self.tid()); + } + } + Ok(()) + } + + /// freeze all threads, except the caller. 
+ async fn freeze_all(&mut self) -> Result<(), TraceError> { + // The tool have chosen to sequentialize thread execution, gdbserver + // should avoid doing its own thread serialization, otherwise this + // could lead to deadlock. + if *self.global_state.sequentialized_guest { + return Ok(()); + } + let (stop_tx, mut stop_rx) = mpsc::channel(1); + for child in self.child_threads.lock().await.deref_mut().into_iter() { + if child.id() != self.tid() && !child.suspended.load(Ordering::SeqCst) { + let killed = Errno::result(unsafe { + libc::syscall(libc::SYS_tgkill, self.pid(), child.id(), Signal::SIGSTOP) + }); + if killed.is_ok() { + child.suspended.store(true, Ordering::SeqCst); + child.wait_all_stop_tx = Some(stop_tx.clone()); + } + } + } + drop(stop_tx); + while let Some((pid, suspended_task)) = stop_rx.recv().await { + self.suspended_tasks.insert(pid, suspended_task); + } + Ok(()) + } + + async fn remove_breakpoint(&mut self, addr: u64) -> Result<(), TraceError> { + let insn = self.breakpoints.remove(&addr).ok_or(Errno::ENOENT)?; + let mut task = self.assume_stopped(); + if let Some(bkpt_addr) = AddrMut::from_raw(addr as usize) { + task.write_value(bkpt_addr, &insn)?; + } + Ok(()) + } + + fn read_inferior_memory(&self, addr: u64, mut size: usize) -> Result, TraceError> { + let task = self.assume_stopped(); + + // NB: dont' trust size to be sane blindly. + if size > 0x8000 { + size = 0x8000; + } + + let mut res = vec![0; size]; + if let Some(addr) = Addr::from_raw(addr as usize) { + let nb = task.read(addr, &mut res)?; + res.resize(nb, 0); + } + + // There could be a software breakpoint within the address requested, + // we should return the orignal contents without the breakpoint insn. + // This is *not* documented in gdb remote protocol, however, both + // gdbserver and rr does this. 
see: + // rr: https://github.com/rr-debugger/rr/blob/master/src/GdbServer.cc#L561 + // gdbserver: https://github.com/bminor/binutils-gdb/blob/master/gdbserver/mem-break.cc#L1914 + for (bkpt, saved_insn) in self.breakpoints.iter() { + if (addr..addr + res.len() as u64).contains(bkpt) { + // This abuses bkpt insn 0xcc is single byte. + res[*bkpt as usize - addr as usize] = *saved_insn as u8; + } + } + + Ok(res) + } + + fn write_inferior_memory( + &self, + addr: u64, + size: usize, + data: Vec, + ) -> Result<(), TraceError> { + let mut task = self.assume_stopped(); + let size = std::cmp::min(size, data.len()); + let addr = AddrMut::from_raw(addr as usize).ok_or(Errno::EFAULT)?; + task.write(addr, &data[..size])?; + Ok(()) + } + + fn read_registers(&self) -> Result { + let task = self.assume_stopped(); + let regs = task.getregs()?; + let fpregs = task.getfpregs()?; + let core_regs = Amd64CoreRegs::from(regs, fpregs); + Ok(core_regs) + } + + fn write_registers(&self, core_regs: Amd64CoreRegs) -> Result<(), TraceError> { + let task = self.assume_stopped(); + let (regs, fpregs) = core_regs.into_parts(); + task.setregs(regs)?; + task.setfpregs(fpregs)?; + Ok(()) + } +} + +#[async_trait] +impl Guest for TracedTask { + type Memory = Stopped; + type Stack = GuestStack; + + #[inline] + fn tid(&self) -> Pid { + self.tid + } + + #[inline] + fn pid(&self) -> Pid { + self.pid + } + + #[inline] + fn ppid(&self) -> Option { + self.ppid + } + + fn memory(&self) -> Self::Memory { + self.assume_stopped() + } + + async fn stack(&mut self) -> Self::Stack { + match GuestStack::new(self.tid, self.stack_checked_out.clone()) { + Ok(ret) => ret, + Err(err) => self.abort(Err(err)).await, + } + } + + fn thread_state_mut(&mut self) -> &mut L::ThreadState { + &mut self.thread_state + } + + fn thread_state(&self) -> &L::ThreadState { + &self.thread_state + } + + async fn daemonize(&mut self) { + let pid = self.pid(); + self.ndaemons.fetch_add(1, Ordering::SeqCst); + self.is_a_daemon = true; + + 
info!("[reverie] daemonizing pid {} ..", pid); + self.daemonizer + .send(self.daemon_kill_switch.subscribe()) + .await + .unwrap(); + + if self.ndaemons.load(Ordering::SeqCst) == self.ntasks.load(Ordering::SeqCst) { + self.daemon_kill_switch.send(()).unwrap(); + } + } + + async fn inject(&mut self, syscall: S) -> Result { + // Call a non-templatized function to reduce code bloat. + let (nr, args) = syscall.into_parts(); + self.do_inject(nr, args).await + } + + #[allow(unreachable_code)] + async fn tail_inject(&mut self, syscall: S) -> ! { + // Call a non-templatized function to reduce code bloat. + let (nr, args) = syscall.into_parts(); + self.do_tail_inject(nr, args).await + } + + fn set_timer(&mut self, sched: TimerSchedule) -> Result<(), reverie::Error> { + let rcbs = match sched { + TimerSchedule::Rcbs(r) => r, + TimerSchedule::Time(dur) => Timer::as_ticks(dur), + }; + self.timer + .request_event(TimerEventRequest::Imprecise(rcbs))?; + Ok(()) + } + + fn set_timer_precise(&mut self, sched: TimerSchedule) -> Result<(), reverie::Error> { + let rcbs = match sched { + TimerSchedule::Rcbs(r) => r, + TimerSchedule::Time(dur) => Timer::as_ticks(dur), + }; + self.timer.request_event(TimerEventRequest::Precise(rcbs))?; + Ok(()) + } + + fn read_clock(&mut self) -> Result { + Ok(self.timer.read_clock()) + } + + fn backtrace(&mut self) -> Option { + use unwind::{Accessors, AddressSpace, Byteorder, Cursor, PTraceState, RegNum}; + + let mut frames = Vec::new(); + + let space = AddressSpace::new(Accessors::ptrace(), Byteorder::DEFAULT).ok()?; + let state = PTraceState::new(self.tid.as_raw() as u32).ok()?; + let mut cursor = Cursor::remote(&space, &state).ok()?; + + loop { + let ip = cursor.register(RegNum::IP).ok()?; + let is_signal = cursor.is_signal_frame().ok()?; + + // Try to resolve the symbol. 
+ let mut symbol = None; + if let Ok(name) = cursor.procedure_name() { + if let Ok(info) = cursor.procedure_info() { + if info.start_ip() + name.offset() == ip { + symbol = Some(Symbol { + name: name.name().to_string(), + offset: name.offset(), + address: info.start_ip(), + size: info.end_ip() - info.start_ip(), + }); + } + } + } + + frames.push(Frame { + ip, + is_signal, + symbol, + }); + + if !cursor.step().ok()? { + break; + } + } + + Some(Backtrace::new(self.tid(), frames)) + } +} + +#[async_trait] +impl GlobalRPC for TracedTask { + async fn send_rpc<'a>( + &'a self, + args: ::Request, + ) -> Result<::Response, reverie::Error> { + let wrapped = WrappedFrom(self.tid(), &self.global_state); + wrapped.send_rpc(args).await + } + + fn config(&self) -> &::Config { + &self.global_state.cfg + } +} + +/// Wrap a GlobalState with a Tid from which the messages originate. This enables the +/// GlobalRPC instance below. +struct WrappedFrom<'a, G: GlobalTool>(Tid, &'a GlobalState); + +#[async_trait] +impl<'a, G: GlobalTool> GlobalRPC for WrappedFrom<'a, G> { + async fn send_rpc(&self, args: G::Request) -> Result { + // In debugging mode we round-trip through a serialized representation + // to make sure it works. + let deserial = if cfg!(debug_assertions) { + let serial = bincode::serialize(&args).unwrap(); + bincode::deserialize(&serial).unwrap() + } else { + args + }; + let x = self.1.gs_ref.receive_rpc(self.0, deserial).await; + Ok(x) + } + fn config(&self) -> &G::Config { + &self.1.cfg + } +} diff --git a/reverie-ptrace/src/testing.rs b/reverie-ptrace/src/testing.rs new file mode 100644 index 0000000..7f8c887 --- /dev/null +++ b/reverie-ptrace/src/testing.rs @@ -0,0 +1,143 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Utilities that support constructing tests for Reverie Tools. 
+ +use crate::{spawn_fn_with_config, TracerBuilder}; +use futures::Future; +use reverie::process::{Command, Output, Stdio}; +use reverie::{Error, ExitStatus, GlobalTool, Tool}; + +pub use crate::perf::do_branches; + +/// For some tests, its nice to show what was printed. +pub fn print_tracee_output(output: &Output) { + println!( + " >>> Tracee completed, {:?}, stdout len {}, stderr len {}", + output.status, + output.stdout.len(), + output.stderr.len(), + ); + if !output.stdout.is_empty() { + println!( + " >>> stdout:\n{}", + &std::str::from_utf8(&output.stdout).unwrap() + ); + } + if !output.stderr.is_empty() { + println!( + " >>> stderr:\n{}", + &std::str::from_utf8(&output.stderr).unwrap() + ); + } +} + +/// Configure tokio and tracing in the way that we like, and run the future. +pub fn run_tokio_test(fut: F) -> F::Output { + let collector = tracing_subscriber::fmt() + .with_max_level(tracing::Level::TRACE) + .finish(); + + // For reentrancy during testing we need to set up logging early because mio + // will actually do some log chatter. + + // Here we ignore errors, because tests may be running in parallel, and we don't care who "wins". + tracing::subscriber::set_global_default(collector).unwrap_or(()); + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_io() + .enable_time() + .worker_threads(2) + .build() + .unwrap(); + rt.block_on(async move { + let local_set = tokio::task::LocalSet::new(); + local_set.run_until(fut).await + }) +} + +/// Runs a command as a guest and returns its collected output and global state. 
+pub fn test_cmd_with_config( + program: &str, + args: &[&str], + config: ::Config, +) -> Result<(Output, T::GlobalState), Error> +where + T: Tool + 'static, +{ + let mut cmd = Command::new(program); + cmd.args(args).stdout(Stdio::piped()).stderr(Stdio::piped()); + run_tokio_test(async move { + let tracer = TracerBuilder::::new(cmd).config(config).spawn().await?; + tracer.wait_with_output().await + }) +} + +/// Runs a command as a guest and returns its collected output and global state. +pub fn test_cmd(program: &str, args: &[&str]) -> Result<(Output, T::GlobalState), Error> +where + T: Tool + 'static, +{ + test_cmd_with_config::(program, args, Default::default()) +} + +/// Runs a function as a guest and returns its collected (stdout/err) output and global state. +pub fn test_fn_with_config( + f: F, + config: ::Config, + capture_output: bool, +) -> Result<(Output, T::GlobalState), Error> +where + T: Tool + 'static, + F: FnOnce(), +{ + run_tokio_test(async move { + let tracee = spawn_fn_with_config::(f, config, capture_output).await?; + tracee.wait_with_output().await + }) +} + +/// Runs a function as a guest and returns its collected output and global state. +pub fn test_fn(f: F) -> Result<(Output, T::GlobalState), Error> +where + T: Tool + 'static, + F: FnOnce(), +{ + test_fn_with_config::(f, Default::default(), true) +} + +/// Runs a function as a guest and returns its global state. Also checks that the +/// tracee exit code is 0. +pub fn check_fn_with_config( + f: F, + config: ::Config, + capture_output: bool, +) -> T::GlobalState +where + T: Tool + 'static, + F: FnOnce(), +{ + let (output, state) = test_fn_with_config::(f, config, capture_output).unwrap(); + + if output.status != ExitStatus::Exited(0) { + print_tracee_output(&output); + panic!("Got exit status {:?}", output.status); + } + + state +} + +/// Runs a function as a guest and returns its global state. Also checks that the +/// tracee exit code is 0. 
+pub fn check_fn(f: F) -> T::GlobalState +where + T: Tool + 'static, + F: FnOnce(), +{ + check_fn_with_config::(f, Default::default(), true) +} diff --git a/reverie-ptrace/src/timer.rs b/reverie-ptrace/src/timer.rs new file mode 100644 index 0000000..4a92695 --- /dev/null +++ b/reverie-ptrace/src/timer.rs @@ -0,0 +1,624 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Timers monitor a specified thread using the PMU and deliver a signal +//! after a specified number of events occur. The signal is then identified +//! and transformed into a reverie timer event. This is intended to allow +//! tools to break busywaits or other spins in a reliable manner. Timers +//! are ideally deterministic so that `detcore` can use them. +//! +//! Due to PMU skid, precise timer events must be driven to completion via +//! single stepping. This means the PMI is scheduled early, and events with very +//! short timeouts require immediate single stepping. Immediate stepping is +//! acheived by artificially generating a signal that will then be delivered +//! immediately upon resumption of the guest. +//! +//! Proper use of timers requires that all delivered signals of type +//! `Timer::signal_type()` be passed through `Timer::handle_signal`, and that +//! `Timer::observe_event()` be called whenever a Tool-observable reverie event +//! occurs. Additionally, `Timer::finalize_requests()` must be called +//! - after the end of the tool callback in which the user could have +//! requested a timer event, i.e. those with `&mut guest` access. +//! - after any reverie-critical single-stepping occurs (e.g. in syscall +//! injections), +//! - before resumption of the guest, +//! which _usually_ means immediately after the tool callback returns. 
+ +use crate::perf::*; +use crate::trace::{Error as TraceError, Event as TraceEvent, Stopped, Wait}; +use raw_cpuid::{CpuId, FeatureInfo}; +use reverie::{Errno, Pid, Signal, Tid}; +use thiserror::Error; +use tracing::{debug, warn}; + +// This signal is unused, in that the kernel will never send it to a process. +const MARKER_SIGNAL: Signal = reverie::PERF_EVENT_SIGNAL; + +const AMD_VENDOR: &str = "AuthenticAMD"; +const INTEL_VENDOR: &str = "GenuineIntel"; + +// This done in .(model|family)_id() directly in raw_cpuid v10. This can +// then be replaced with those calls, but that means this will also break! +fn full_family_model(vendor: &str, fi: &FeatureInfo) -> (u8, u8) { + let base_family_id = fi.family_id(); + let base_model_id = fi.model_id(); + let extended_model_id = fi.extended_model_id(); + let extended_family_id = fi.extended_family_id(); + let family_id = { + let just_use_base = (vendor == AMD_VENDOR && base_family_id < 0xf) + || (vendor == INTEL_VENDOR && base_family_id != 0xf); + if just_use_base { + base_family_id + } else { + base_family_id + extended_family_id + } + }; + let model_id = { + let just_use_base = (vendor == AMD_VENDOR && base_family_id < 0xf) + || (vendor == INTEL_VENDOR && base_family_id != 0xf && base_family_id != 0x6); + if just_use_base { + base_model_id + } else { + (extended_model_id << 4) | base_model_id + } + }; + (family_id, model_id) +} + +fn get_rcb_perf_config() -> u64 { + let c = CpuId::new(); + let vendor = c.get_vendor_info().unwrap(); + let vendor_str = vendor.as_string(); + match vendor_str { + AMD_VENDOR | INTEL_VENDOR => {} + s => panic!("Unknown CPU vendor: {}", s), + }; + let fi = c.get_feature_info().unwrap(); + // based on rr's PerfCounters_x86.h and PerfCounters.cc + match full_family_model(vendor_str, &fi) { + (0x06, 0x1A) | (0x06, 0x1E) | (0x06, 0x2E) => 0x5101c4, // Intel Nehalem + (0x06, 0x25) | (0x06, 0x2C) | (0x06, 0x2F) => 0x5101c4, // Intel Westmere + (0x06, 0x2A) | (0x06, 0x2D) | (0x06, 0x3E) => 0x5101c4, 
// Intel SanyBridge + (0x06, 0x3A) => 0x5101c4, // Intel IvyBridge + (0x06, 0x3C) | (0x06, 0x3F) | (0x06, 0x45) | (0x06, 0x46) => 0x5101c4, // Intel Haswell + (0x06, 0x3D) | (0x06, 0x47) | (0x06, 0x4F) | (0x06, 0x56) => 0x5101c4, // Intel Broadwell + (0x06, 0x4E) | (0x06, 0x55) | (0x06, 0x5E) => 0x5101c4, // Intel Skylake + (0x06, 0x8E) | (0x06, 0x9E) => 0x5101c4, // Intel Kabylake + (0x06, 0xA5) | (0x06, 0xA6) => 0x5101c4, // Intel Cometlake + _ => panic!("Unsupported processor with feature info: {:?}", fi), + } +} + +/// A timer monitoring a single thread. The underlying implementation is eagerly +/// initialized, but left empty if perf is not supported. In that case, any +/// methods with semantics that require a functioning clock or timer will panic. +#[derive(Debug)] +pub struct Timer { + inner: Option, +} + +/// Data requires to request a timer event +#[derive(Debug, Copy, Clone)] +pub enum TimerEventRequest { + /// Event should fire after precisely this many RCBs. + Precise(u64), + + /// Event should fire after at least this many RCBs. + Imprecise(u64), +} + +/// The possible results of handling a timer signal. +#[derive(Error, Debug, Eq, PartialEq)] +pub enum HandleFailure { + #[error(transparent)] + TraceError(#[from] TraceError), + + #[error("Unexpected event while single stepping")] + Event(Wait), + + /// The timer signal was for a timer event that was otherwise cancelled. The + /// task is returned unchanged. + #[error("Timer event was cancelled and should not fire")] + Cancelled(Stopped), + + /// The signal causing the signal-delivery stop was not actually meant for + /// this timer. The task is returned unchanged. + #[error("Pending signal was not for this timer")] + ImproperSignal(Stopped), +} + +impl Timer { + /// Create a new timer monitoring the specified thread. 
+ pub fn new(guest_pid: Pid, guest_tid: Tid) -> Self { + // No errors are exposed here, as the construction should be + // bullet-proof, and if it wasn't, consumers wouldn't be able to + // meaningfully handle the error anyway. + Self { + inner: if is_perf_supported() { + Some(TimerImpl::new(guest_pid, guest_tid).unwrap()) + } else { + None + }, + } + } + + fn inner(&self) -> &TimerImpl { + self.inner.as_ref().expect("Perf support required") + } + + fn inner_noinit(&self) -> Option<&TimerImpl> { + self.inner.as_ref() + } + + fn inner_mut_noinit(&mut self) -> Option<&mut TimerImpl> { + self.inner.as_mut() + } + + /// Read the thread-local deterministic clock. Represents total elapsed RCBs + /// on this thread since the timer was constructed, which should be at or + /// near thread creation time. + pub fn read_clock(&self) -> u64 { + self.inner().read_clock() + } + + /// Approximately convert a duration to the internal notion of timer ticks. + pub fn as_ticks(dur: core::time::Duration) -> u64 { + // assumptions: 10% conditional branches, 3 GHz, avg 2 IPC + // this gives: 0.6B branch / sec = 0.6 branch / ns + (dur.as_secs() * 600_000_000) + (u64::from(dur.subsec_nanos()) * 6 / 10) + } + + /// Return the signal type sent by the timer. This is intended to allow + /// pre-filtering signals without the full overhead of gathering signal info + /// to pass to ['Timer::generated_signal`]. + pub fn signal_type() -> Signal { + MARKER_SIGNAL + } + + /// Request a timer event to occur in the future at a time specified by + /// `evt`. + /// + /// This is *not* idempotent and will replace the outstanding request. If it + /// is called repeatedly no events will be delivered. + pub fn request_event(&mut self, evt: TimerEventRequest) -> Result<(), Errno> { + self.inner_mut_noinit() + .ok_or(Errno::ENODEV)? + .request_event(evt) + } + + /// Must be called whenever a Tool-observable reverie event occurs. This + /// ensures proper cancellation semantics are observed. 
See the internal + /// `timer::EventStatus` type for details. + pub fn observe_event(&mut self) { + if let Some(t) = self.inner_mut_noinit() { + t.observe_event(); + } + } + + /// Cancel pending timer notifications. This is idempotent. + /// + /// If there was a previous call to [`Timer::enable_interval'], this + /// will prevent the delivery of that notification. This also has the effect + /// of reseting the "elapsed ticks." That is, if the current notification + /// duration is `N` ticks, then a full `N` ticks must elapse after the next + /// call to [`enable_interval`](Timer::enable_interval) before a + /// notification is delivered. + /// + /// While [`Timer::cancel`] actually disables the counting of RCBs, this + /// method simply sets a flag to subsequent delivered signals until + /// [`Timer::request_event`] is called again. Thus, this method is lighter + /// if called multiple times, but still results in a signal delivery, while + /// [`Timer::cancel`] must perform a syscall, but will actually cancel the + /// signal. + #[allow(dead_code)] + pub fn schedule_cancellation(&mut self) { + if let Some(t) = self.inner_mut_noinit() { + t.schedule_cancellation(); + } + } + + /// Cancel pending timer notifications. This is idempotent. + /// + /// If there was a previous call to [`Timer::enable_interval'], this + /// will prevent the delivery of that notification. This also has the effect + /// of reseting the "elapsed ticks." That is, if the current notification + /// duration is `N` ticks, then a full `N` ticks must elapse after the next + /// call to [`enable_interval`](Timer::enable_interval) before a + /// notification is delivered. + /// + /// See [`Timer::schedule_cancellation`] for a comparison with this + /// method. + #[allow(dead_code)] + pub fn cancel(&self) -> Result<(), Errno> { + self.inner_noinit().map(|t| t.cancel()).unwrap_or(Ok(())) + } + + /// Perform finalization actions on requests for timer events before guest + /// resumption. 
See the module-level documentation for rules about when this can and + /// should be called. + /// + /// Currently, this will, if necessary, `tgkill` a timer signal to the guest + /// thread. + pub fn finalize_requests(&self) { + if let Some(t) = self.inner_noinit() { + t.finalize_requests(); + } + } + + /// When a signal is received, this method drives the timer event to + /// completion via single stepping, after checking that the signal was meant + /// for this specific timer. This *must* be called when a timer signal is + /// received for correctness. + /// + /// Preconditions: task is in signal-delivery-stop. + /// Postconditions: if a signal meant for this timer was the cause of the + /// stop, the tracee will be at the precise instruction the timer event + /// should fire at. + pub async fn handle_signal(&mut self, task: Stopped) -> Result { + match self.inner_mut_noinit() { + Some(t) => t.handle_signal(task).await, + None => { + warn!("Stray SIGSTKFLT indicates a bug!"); + Err(HandleFailure::ImproperSignal(task)) + } + } + } +} + +/// The lazy-initialized part of a `Timer` that holds the functionality. +#[derive(Debug)] +struct TimerImpl { + /// A non-resetting counter functioning as a thread-local clock. + clock: PerfCounter, + + /// A separate counter used to generate signals for timer events + timer: PerfCounter, + + /// Information about the active timer event, including expected counter + /// values. + event: ActiveEvent, + + /// The cancellation status of the active timer event. + timer_status: EventStatus, + + /// Whether or not the active timer event requires an artificial signal + send_artificial_signal: bool, + + /// Pid (tgid) of the monitored thread + guest_pid: Pid, + + /// Tid of the monitored thread + guest_tid: Tid, +} + +/// Tracks cancellation status of a timer event in response to other reverie +/// events. +/// +/// Whenever a reverie event occurs, this should tick "forward" once. 
If the +/// timer signal is first to occur, then the cancellation will be pending, and +/// the event will fire. If instead some other event occured, the tick will +/// result in `Cancelled` and the event will not fire. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +enum EventStatus { + Scheduled, + Armed, + Cancelled, +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +enum ActiveEvent { + Precise { + /// Expected clock value when event fires. + clock_target: u64, + }, + Imprecise { + /// Expected minimum clock value when event fires. + clock_min: u64, + }, +} + +impl EventStatus { + pub fn next(self) -> Self { + match self { + EventStatus::Scheduled => EventStatus::Armed, + EventStatus::Armed => EventStatus::Cancelled, + EventStatus::Cancelled => EventStatus::Cancelled, + } + } + + pub fn tick(&mut self) { + *self = self.next() + } +} + +/// This is the experimentally determined maximum number of RCBs an overflow +/// interrupt is delivered after the originating RCB. +/// +/// If this is number is too small, timer event delivery will be delayed and +/// non-deterministic, which, if observed, will result in a panic. +/// If this number is too big, we degrade performance from excessive single +/// stepping. +/// +/// `rr` uses a value of 100 for almost all platforms, but with precise_ip = 0. +/// Enabling Intel PEBS via precise_ip > 0 seems to reduce observed skid by 1/2, +/// in synthetic benchmarks, though it makes counter _values_ incorrect. As a +/// result, we choose 50. +const SKID_MARGIN_RCBS: u64 = 50; + +/// We refuse to schedule a "perf timeout" for this or fewer RCBs, instead +/// choosing to directly single step. This is because I am somewhat paranoid +/// about perf event throttling, which isn't well-documented. 
+const SINGLESTEP_TIMEOUT_RCBS: u64 = 5; + +impl TimerImpl { + pub fn new(guest_pid: Pid, guest_tid: Tid) -> Result { + let has_debug_store = CpuId::new() + .get_feature_info() + .map_or(false, |info| info.has_ds()); + + let evt = Event::Raw(get_rcb_perf_config()); + + // measure the target tid irrespective of CPU + let mut builder = Builder::new(guest_tid.as_raw(), -1); + builder + .sample_period(PerfCounter::DISABLE_SAMPLE_PERIOD) + .event(evt); + + // Check if we can set precise_ip = 1 by checking if debug store is enabled. + if has_debug_store { + // set precise_ip to lowest value to enable PEBS (TODO: AMD?) + builder.precise_ip(1); + } + + let timer = builder.create()?; + timer.set_signal_delivery(guest_tid, MARKER_SIGNAL)?; + timer.reset()?; + // measure the target tid irrespective of CPU + let clock = Builder::new(guest_tid.as_raw(), -1) + // counting event + .sample_period(0) + .event(evt) + .fast_reads(true) + .create()?; + clock.reset()?; + clock.enable()?; + + Ok(Self { + timer, + clock, + event: ActiveEvent::Precise { clock_target: 0 }, + timer_status: EventStatus::Cancelled, + send_artificial_signal: false, + guest_pid, + guest_tid, + }) + } + + pub fn request_event(&mut self, evt: TimerEventRequest) -> Result<(), Errno> { + let (delivery, notification) = match evt { + TimerEventRequest::Precise(ticks) => (ticks, ticks.saturating_sub(SKID_MARGIN_RCBS)), + TimerEventRequest::Imprecise(ticks) => (ticks, ticks), + }; + if delivery == 0 { + return Err(Errno::EINVAL); // bail before setting timer + } + self.send_artificial_signal = if notification <= SINGLESTEP_TIMEOUT_RCBS { + // If there's an existing event making use of the timer counter, + // we need to "overwrite" it the same way setting an actual RCB + // notification does. 
+ self.timer.disable()?; + true + } else { + self.timer.reset()?; + self.timer.set_period(notification)?; + self.timer.enable()?; + false + }; + let clock = self.read_clock() + delivery; + self.event = match evt { + TimerEventRequest::Precise(_) => ActiveEvent::Precise { + clock_target: clock, + }, + TimerEventRequest::Imprecise(_) => ActiveEvent::Imprecise { clock_min: clock }, + }; + self.timer_status = EventStatus::Scheduled; + Ok(()) + } + + pub fn observe_event(&mut self) { + self.timer_status.tick() + } + + pub fn schedule_cancellation(&mut self) { + self.timer_status = EventStatus::Cancelled; + } + + pub fn cancel(&self) -> Result<(), Errno> { + self.timer.disable() + } + + fn is_timer_generated_signal(signal: &libc::siginfo_t) -> bool { + // The signal that gets sent is SIGPOLL. We reconfigured the signal + // number, but the struct info is the same. Per the perf manpage, signal + // notifications will come indicating either POLL_IN or POLL_HUP. + signal.si_signo == MARKER_SIGNAL as i32 + && (signal.si_code == i32::from(libc::POLLIN) + || signal.si_code == i32::from(libc::POLLHUP)) + } + + fn generated_signal(&self, signal: &libc::siginfo_t) -> bool { + signal.si_signo == MARKER_SIGNAL as i32 + // If we sent an artificial signal, it doesn't have any siginfo + && (self.send_artificial_signal + // If not, the fd should match. This could possibly lead to a + // collision, because an fd comparing-equal to this one in another + // process could also send a signal. However, that it would also do so + // as SIGSTKFLT is effectively not going to happen. + || (Self::is_timer_generated_signal(signal) + && get_si_fd(signal) == self.timer.raw_fd())) + } + + pub fn read_clock(&self) -> u64 { + self.clock.ctr_value_fast().expect("Failed to read clock") + } + + pub fn finalize_requests(&self) { + if self.send_artificial_signal { + // Give the guest a kick via an "artificial signal". This gives us something + // to handle in `handle_signal` and thus drives single-stepping. 
+ Errno::result(unsafe { + libc::syscall( + libc::SYS_tgkill, + self.guest_pid.as_raw(), + self.guest_tid.as_raw(), + MARKER_SIGNAL as i32, + ) + }) + .expect("Timer tgkill error indicates a bug"); + } + } + + pub async fn handle_signal(&mut self, task: Stopped) -> Result { + let signal = task.getsiginfo()?; + if !self.generated_signal(&signal) { + warn!( + ?signal, + "Passed a signal that wasn't for this timer, likely indicating a bug!", + ); + return Err(HandleFailure::ImproperSignal(task)); + } + + match self.timer_status { + EventStatus::Scheduled => panic!( + "Timer event status should tick at least once before the signal \ + is handled. This is a bug!" + ), + EventStatus::Armed => {} + EventStatus::Cancelled => { + debug!("Delivered timer signal cancelled due to status"); + self.disable_timer_before_stepping(); + return Err(HandleFailure::Cancelled(task)); + } + }; + + // At this point, we've decided that a timer event is to be delivered. + + // Before we drive the event to completion, clear `send_artificial_signal` flag so that: + // - another signal isn't generated anytime Timer::finalize_requests() is called + // - spurious SIGSTKFLTs aren't errantly let through + // Cancellations should prevent spurious timer events in any case. + self.send_artificial_signal = false; + // Ensure any new timer signals don't mess with us while single-stepping + self.disable_timer_before_stepping(); + + let ctr = self.read_clock(); + match self.event { + ActiveEvent::Precise { clock_target } => { + self.attempt_single_step(task, ctr, clock_target).await + } + ActiveEvent::Imprecise { clock_min } => { + debug!( + "Imprecise timer event delivered.
Ctr val: {}, min val: {}", + ctr, clock_min + ); + assert!(ctr >= clock_min); + Ok(task) + } + } + } + + async fn attempt_single_step( + &self, + task: Stopped, + ctr_initial: u64, + target: u64, + ) -> Result { + let mut ctr = ctr_initial; + assert!( + ctr <= target, + "Clock perf counter exceeds target value at start of attempted single-step: \ + {} > {}. Consider increasing SKID_MARGIN_RCBS.", + ctr, + target + ); + debug!("Timer will single-step from ctr {} to {}", ctr, target); + let mut task = task; + loop { + if ctr >= target { + break; + } + task = match task.step(None)?.next_state().await? { + // a successful single step results in SIGTRAP stop + Wait::Stopped(new_task, TraceEvent::Signal(Signal::SIGTRAP)) => new_task, + wait => return Err(HandleFailure::Event(wait)), + }; + ctr = self.read_clock(); + } + Ok(task) + } + + /// Imagine our skid margin is 50 RCBs, and we set the timer for 5 RCBs. + /// Since we step for 50, the timer will trigger multiple times unless we + /// disable it before stepping. This would count as a state machine + /// transition and errantly cancel the delivery of the timer event. + fn disable_timer_before_stepping(&self) { + self.timer + .disable() + .expect("Must be able to disable timer before stepping"); + } +} + +#[cfg(target_os = "linux")] +fn get_si_fd(signal: &libc::siginfo_t) -> libc::c_int { + // This is almost certainly broken for anything other than linux (glibc?). + // + // The `libc` crate doesn't expose these fields properly, because the + // current version was released before union support, and `siginfo_t` is a + // messy enum/union, making this super fragile. + // + // `libc` has an accessor system in place, but only for a few particular + // signal types as of right now. We could submit a PR for SIGPOLL/SIGIO, but + // until then, this copies the currently used accessor idea.
+ + #[repr(C)] + #[derive(Copy, Clone)] + struct sifields_sigpoll { + si_band: libc::c_long, + si_fd: libc::c_int, + } + #[repr(C)] + union sifields { + _align_pointer: *mut libc::c_void, + sigpoll: sifields_sigpoll, + } + #[repr(C)] + struct siginfo_f { + _siginfo_base: [libc::c_int; 3], + sifields: sifields, + padding: [libc::c_int; 24], + } + + // These compile to no-op or unconditional runtime panic, which is good, + // because code not using timers continues to work. + assert_eq!( + core::mem::size_of::(), + core::mem::size_of_val(signal), + ); + assert_eq!( + core::mem::align_of::(), + core::mem::align_of_val(signal), + ); + + unsafe { + (*(signal as *const _ as *const siginfo_f)) + .sifields + .sigpoll + .si_fd + } +} diff --git a/reverie-ptrace/src/trace/memory.rs b/reverie-ptrace/src/trace/memory.rs new file mode 100644 index 0000000..fc291af --- /dev/null +++ b/reverie-ptrace/src/trace/memory.rs @@ -0,0 +1,360 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +use core::mem; + +use nix::sys::ptrace; + +use std::io; + +use super::Stopped; +use reverie::syscalls::{Addr, AddrMut, AddrSlice, AddrSliceMut, Errno, MemoryAccess}; + +impl Stopped { + /// Does a read that is already page-aligned. + fn read_aligned(&self, addr: Addr, buf: &mut [u8]) -> Result { + let slice = unsafe { AddrSlice::from_raw_parts(addr, buf.len()) }; + let from = [unsafe { slice.as_ioslice() }]; + let mut to = [io::IoSliceMut::new(buf)]; + self.read_vectored(&from, &mut to) + } + + /// Does a write that is already page-aligned. 
+ fn write_aligned(&mut self, addr: AddrMut, buf: &[u8]) -> Result { + let mut slice = unsafe { AddrSliceMut::from_raw_parts(addr, buf.len()) }; + let from = [io::IoSlice::new(buf)]; + let mut to = [unsafe { slice.as_ioslice_mut() }]; + self.write_vectored(&from, &mut to) + } + + /// Reads a single u64. + fn read_u64(&self, addr: Addr) -> Result { + ptrace::read(self.0.into(), unsafe { + addr.as_ptr() as *mut ::core::ffi::c_void + }) + .map_err(|err| Errno::new(err as i32)) + .map(|x| x as u64) + } + + /// Writes a single u64. + fn write_u64(&mut self, addr: AddrMut, value: u64) -> Result<(), Errno> { + unsafe { + ptrace::write( + self.0.into(), + addr.as_mut_ptr() as *mut ::core::ffi::c_void, + value as *mut ::core::ffi::c_void, + ) + } + .map_err(|err| Errno::new(err as i32)) + } +} + +impl MemoryAccess for Stopped { + /// Does a vectored read from the remote address space. Returns the number of + /// bytes read. + /// + /// Note that there is no guarantee that all of the requested buffers will be + /// filled. See `man 2 process_vm_readv` for more information on specific + /// behavior. + fn read_vectored( + &self, + remote: &[io::IoSlice], + local: &mut [io::IoSliceMut], + ) -> Result { + Errno::result(unsafe { + libc::process_vm_readv( + self.0.as_raw(), + local.as_ptr() as *const libc::iovec, + local.len() as libc::c_ulong, + remote.as_ptr() as *const libc::iovec, + remote.len() as libc::c_ulong, + 0, + ) + }) + .map(|x| x as usize) + .or_else(|err| { + if err == Errno::EFAULT { + // Treat page faults as an EOF. + Ok(0) + } else { + Err(err) + } + }) + } + + /// Does a vectored write to the remote address space. Returns the number of bytes + /// written. + /// + /// Note that there is no guarantee that all of the requested buffers will + /// be written. See `man 2 process_vm_writev` for more information on + /// specific behavior.
+ fn write_vectored( + &mut self, + local: &[io::IoSlice], + remote: &mut [io::IoSliceMut], + ) -> Result { + Errno::result(unsafe { + libc::process_vm_writev( + self.0.as_raw(), + local.as_ptr() as *const libc::iovec, + local.len() as libc::c_ulong, + remote.as_ptr() as *const libc::iovec, + remote.len() as libc::c_ulong, + 0, + ) + }) + .map(|x| x as usize) + .or_else(|err| { + if err == Errno::EFAULT { + // Treat page faults as an EOF. + Ok(0) + } else { + Err(err) + } + }) + } + + /// Performs a read starting at the given address. The number of bytes read + /// is returned. The buffer is not guaranteed to be completely filled. + fn read<'a, A>(&self, addr: A, buf: &mut [u8]) -> Result + where + A: Into>, + { + let addr = addr.into(); + let size = buf.len(); + if size == 0 { + return Ok(0); + } else if size <= mem::size_of::() { + // This needs to be benchmarked, but according to @wangbj + // PTRACE_PEEKDATA is faster than `process_vm_readv` for small + // reads. + let value = self.read_u64(addr.cast::())?; + let bytes = value.to_ne_bytes(); + buf.copy_from_slice(&bytes[0..size]); + return Ok(size); + } + + let addr_slice = unsafe { AddrSlice::from_raw_parts(addr, buf.len()) }; + + // Since process_vm_readv partial transfers apply at the granularity of + // the iovec elements, we need to know if the address range spans a page + // boundary and split the remote read if it does. This helps ensure that + // we get a read length >0 while there is still more data to read. + if let Some((first, second)) = addr_slice.split_at_page_boundary() { + let remote = unsafe { [first.as_ioslice(), second.as_ioslice()] }; + + // The two remote reads are merged into a single local buffer. + let mut local = [io::IoSliceMut::new(buf)]; + + self.read_vectored(&remote, &mut local) + } else { + // The address range fits into one page. Nothing special to do. 
+ self.read_aligned(addr, buf) + } + } + + fn write(&mut self, addr: AddrMut, buf: &[u8]) -> Result { + let size = buf.len(); + if size == 0 { + return Ok(0); + } else if size == mem::size_of::() { + #[allow(clippy::cast_ptr_alignment)] + let value = unsafe { *(buf.as_ptr() as *const u64) }; + self.write_u64(addr.cast::(), value)?; + return Ok(size); + } + + let mut addr_slice = unsafe { AddrSliceMut::from_raw_parts(addr, buf.len()) }; + + // Since process_vm_writev partial transfers apply at the granularity of + // the iovec elements, we need to know if the address range spans a page + // boundary and split the remote write if it does. This helps ensure that + // we get a write length >0 before we hit a protected page. + if let Some((mut first, mut second)) = addr_slice.split_at_page_boundary() { + let mut remote = unsafe { [first.as_ioslice_mut(), second.as_ioslice_mut()] }; + + // The two remote writes come from a single local buffer. + let local = [io::IoSlice::new(buf)]; + + self.write_vectored(&local, &mut remote) + } else { + // The address range fits into one page. Nothing special to do. + self.write_aligned(addr, buf) + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + use reverie::Pid; + + use std::ffi::CString; + + use nix::{ + sys::{ + ptrace, + signal::{raise, Signal}, + wait::{waitpid, WaitStatus}, + }, + unistd::{fork, ForkResult}, + }; + use quickcheck::QuickCheck; + use quickcheck_macros::quickcheck; + + // Helper function for spawning a child process in a stopped state. The + // value `T` will be in the child's address space allowing us to read or + // modify it from the parent. + fn fork_helper(mut value: T, parent: P, child: C) -> bool + where + P: FnOnce(Pid, T) -> bool, + C: FnOnce(&mut T), + { + match unsafe { fork() }.unwrap() { + ForkResult::Parent { child, .. 
} => { + assert_eq!( + waitpid(child, None).unwrap(), + WaitStatus::Stopped(child, Signal::SIGTRAP) + ); + + let result = parent(child.into(), value); + + // Allow child to exit. + ptrace::cont(child, None).unwrap(); + assert_eq!(waitpid(child, None).unwrap(), WaitStatus::Exited(child, 0)); + + result + } + ForkResult::Child => { + ptrace::traceme().unwrap(); + + // Give us a chance to modify if needed. + child(&mut value); + + // Allow parent to control when we exit. While stopped here, the + // parent can mess with the child's memory. + raise(Signal::SIGTRAP).unwrap(); + + // Can't use the normal exit function here because we don't want + // to call atexit handlers since `execve` was never called. + unsafe { + ::libc::_exit(0); + } + } + } + } + + fn prop_remote_read_exact(buf: Vec) -> bool { + fork_helper( + buf, + move |child, mut buf| { + let copied = buf.clone(); + + let memory = Stopped::new_unchecked(child); + let addr = Addr::from_ptr(buf.as_ptr()).unwrap(); + + // Zero out the buffer just to show that we are really reading from + // the child process and not our own process. + for byte in buf.iter_mut() { + *byte = 0; + } + + memory.read_exact(addr, &mut buf).unwrap(); + + buf == copied + }, + |_| {}, + ) + } + + fn prop_remote_write_exact(buf: Vec) -> bool { + fork_helper( + buf, + move |child, mut buf| { + let copied = buf.clone(); + + let mut memory = Stopped::new_unchecked(child); + let addr = AddrMut::from_ptr(buf.as_ptr()).unwrap(); + + memory.write_exact(addr, &copied).unwrap(); + memory.read_exact(addr, &mut buf).unwrap(); + + buf == copied + }, + |buf| { + // Zero out the buffer before the parent gets a chance to write + // to it to demonstrate that writes by the parent are actually + // working. + for byte in buf.iter_mut() { + *byte = 0; + } + }, + ) + } + + #[test] + fn test_remote_memory() { + // We need our generator to produce vectors that are at least one page + // in size, ideally larger. 
By default, quickcheck uses a max size of + // 100 which is far too small. Here, we use 4 pages in size. + // + // FIXME: Because of the issue [1], u8::arbitrary() only ever generates + // zeros when size % u8::max_value() == 0. + // + // [1] https://github.com/BurntSushi/quickcheck/issues/119 + let mut qc = QuickCheck::new().gen(quickcheck::Gen::new(0x4000 + u8::max_value() as usize)); + + qc.quickcheck(prop_remote_read_exact as fn(Vec) -> bool); + + // Check with some known small reads. Quickcheck probably won't always + // cover these cases due to random chance. + assert!(prop_remote_read_exact(vec![])); + assert!(prop_remote_read_exact(vec![1])); + assert!(prop_remote_read_exact(vec![1, 2])); + assert!(prop_remote_read_exact(vec![1, 2, 3])); + assert!(prop_remote_read_exact(vec![1, 2, 3, 4])); + assert!(prop_remote_read_exact(vec![1, 2, 3, 4, 5, 6, 7, 8])); + + qc.quickcheck(prop_remote_write_exact as fn(Vec) -> bool); + + // Check with some known small writes. Quickcheck probably won't always + // cover these cases due to random chance.
+ assert!(prop_remote_write_exact(vec![])); + assert!(prop_remote_write_exact(vec![1])); + assert!(prop_remote_write_exact(vec![1, 2])); + assert!(prop_remote_write_exact(vec![1, 2, 3])); + assert!(prop_remote_write_exact(vec![1, 2, 3, 4])); + assert!(prop_remote_write_exact(vec![1, 2, 3, 4, 5, 6, 7, 8])); + } + + #[quickcheck] + fn prop_remote_read_cstring(s: String) -> bool { + // quickcheck doesn't support CString :-( + let s = CString::new( + s.into_bytes() + .into_iter() + .filter(|&x| x != 0) + .collect::>(), + ) + .unwrap(); + + fork_helper( + s, + move |child, s| { + let memory = Stopped::new_unchecked(child); + let addr = Addr::from_ptr(s.as_bytes().as_ptr()).unwrap(); + + let remote_string = memory.read_cstring(addr).unwrap(); + + remote_string == s + }, + |_| {}, + ) + } +} diff --git a/reverie-ptrace/src/trace/mod.rs b/reverie-ptrace/src/trace/mod.rs new file mode 100644 index 0000000..efc22e7 --- /dev/null +++ b/reverie-ptrace/src/trace/mod.rs @@ -0,0 +1,1458 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! A safe ptrace API. This API forces correct usage of ptrace in that it is +//! not possible to call ptrace on a process not in a stopped state. +mod memory; +#[allow(unused)] +mod notifier; +mod waitid; + +use std::{fmt, mem, ptr}; + +use nix::sys::{ + ptrace, + wait::{WaitPidFlag, WaitStatus}, +}; +use thiserror::Error; + +pub use reverie::{Errno, ExitStatus, Pid}; +use waitid::{waitid, IdType}; + +// Re-exports so that nothing else needs to depend on `nix`. +pub use nix::sys::ptrace::Options; +pub use nix::sys::signal::Signal; + +/// An error that occurred during tracing. +#[derive(Error, Debug, Eq, PartialEq)] +pub enum Error { + /// A low-level errno. + #[error(transparent)] + Errno(#[from] Errno), + + /// The tracee died unexpectedly. 
This should be handled gracefully by + /// reaping the zombie. + #[error("tracee {0} is a zombie")] + Died(Zombie), +} + +impl From for Error { + fn from(err: nix::errno::Errno) -> Self { + Self::Errno(Errno::new(err as i32)) + } +} + +// Helper function for removing the nix::Error type. +fn from_nix_err(pid: Pid, err: nix::Error) -> Error { + // The `nix` error is cast directly to its raw errno value here, so no + // fallible conversion (and no `unwrap`) is involved. + let err = Errno::new(err as i32); + + // According to ptrace(2), any ptrace operation may return ESRCH + // ("No such process") for one of three reasons: + // 1. The process was observed to be in a stopped state and died + // unexpectedly. + // 2. The process is not currently being traced by the caller. + // 3. The process is not in a stopped state. + // + // Since we know that reasons (2) and (3) only occur due to + // programmer errors that this API is designed to prevent, we can + // safely assume that this ESRCH means the tracee has died + // unexpectedly while in a stopped state. + // + // For more information, please see the "Death under ptrace" section + // in `man 2 ptrace`. + if err == Errno::ESRCH { + Error::Died(Zombie::new(pid)) + } else { + Error::Errno(err) + } +} + +/// Represents an invalid state. Useful for errors. +#[derive(Debug, Eq, PartialEq)] +struct InvalidState(pub TryWait); + +impl From for TryWait { + fn from(error: InvalidState) -> TryWait { + error.0 + } +} + +impl fmt::Display for InvalidState { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "got unexpected status {}", self.0) + } +} + +impl std::error::Error for InvalidState {} + +/// Indicates how a child was created (i.e., via `fork`, `vfork`, or `clone`). +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum ChildOp { + /// Stop before return from `fork(2)` or `clone(2)` with the exit signal set + /// to `SIGCHLD`.
+ Fork, + + /// Stop before return from `vfork(2)` or `clone(2)` with the `CLONE_VFORK` + /// flag. When the tracee is continued after this stop, it will wait for + /// child to exit/exec before continuing its execution (in other words, the + /// usual behavior on `vfork(2)`). + Vfork, + + /// Stop before return from `clone(2)`. + Clone, +} + +/// A stop event. Documentation is from `ptrace(2)`. +#[derive(Debug, Eq, PartialEq)] +pub enum Event { + /// Stop event after a new child has been created (i.e., via `fork`, `vfork`, + /// or `clone`). + NewChild(ChildOp, Running), + + /// Stop before return from `execve(2)`. Since Linux 3.0, + /// `PTRACE_GETEVENTMSG` returns the former thread ID. + Exec(Pid), + + /// Stop before return from `vfork(2)` or `clone(2)` with the `CLONE_VFORK` + /// flag, but after the child unblocked this tracee by exiting or execing. + VforkDone, + + /// Stop before exit (including death from `exit_group(2)`), signal death, or + /// exit caused by `execve(2)` in a multithreaded process. + /// `PTRACE_GETEVENTMSG` returns the exit status. Registers can be examined + /// (unlike when "real" exit happens). The tracee is still alive; it needs to + /// be `PTRACE_CONT`ed or `PTRACE_DETACH`ed to finish exiting. + Exit, + + /// Stop triggered by a `seccomp(2)` rule on tracee syscall entry when + /// `PTRACE_O_TRACESECCOMP` has been set by the tracer. The seccomp event + /// message data (from the `SECCOMP_RET_DATA` portion of the seccomp filter + /// rule) can be retrieved with `PTRACE_GETEVENTMSG`. The semantics of this + /// stop are described in detail in a separate section below. + Seccomp, + + /// Stop induced by PTRACE_INTERRUPT command, or group-stop, or initial + /// ptrace-stop when a new child is attached (only if attached using + /// PTRACE_SEIZE). + Stop, + + /// The tracee was stopped by execution of a system call. + Syscall, + + /// The tracee was stopped by delivery of a signal. 
+ Signal(Signal), +} + +impl Event { + /// Converts a raw i32 to a ptrace event and gets any associated data. + fn from_ptrace_event(task: &Stopped, event: i32) -> Result { + // Note that there is no danger in calling ptrace here because the + // process is guaranteed to be in a ptrace-stop state when this function + // is called. + match event { + libc::PTRACE_EVENT_FORK => { + // Get the pid of the child immediately since we almost always + // want that. + let child_pid = Pid::from_raw(task.getevent()? as i32); + Ok(Self::NewChild(ChildOp::Fork, Running(child_pid))) + } + libc::PTRACE_EVENT_VFORK => { + // Get the pid of the child immediately since we almost always + // want that. + let child_pid = Pid::from_raw(task.getevent()? as i32); + Ok(Self::NewChild(ChildOp::Vfork, Running(child_pid))) + } + libc::PTRACE_EVENT_CLONE => { + // Get the pid of the child immediately since we almost always + // want that. + let child_pid = Pid::from_raw(task.getevent()? as i32); + Ok(Self::NewChild(ChildOp::Clone, Running(child_pid))) + } + libc::PTRACE_EVENT_EXEC => { + // Get the pid of the thread group leader that this call to exec + // is replacing. This is not necessarily equal to `pid` since + // another thread besides the main thread can call `exec`. This + // information is necessary to track the "death" of a process. + let new_pid = Pid::from_raw(task.getevent()? as i32); + Ok(Self::Exec(new_pid)) + } + libc::PTRACE_EVENT_VFORK_DONE => Ok(Self::VforkDone), + libc::PTRACE_EVENT_EXIT => { + // Note that we can get the exit status here using `getevent`, + // but that's almost never what we want to do. It is better to + // get that during the final exit event. + Ok(Self::Exit) + } + libc::PTRACE_EVENT_SECCOMP => Ok(Self::Seccomp), + libc::PTRACE_EVENT_STOP => Ok(Self::Stop), + _ => unreachable!("unknown ptrace event {:#x}", event), + } + } +} + +/// Helper function for waiting on one or more processes. 
Returns `None` if +/// `WaitPidFlag::WNOHANG` was specified and the process is still running. +fn wait(id: IdType, flags: WaitPidFlag) -> Result, Errno> { + loop { + let result = waitid(id, flags).map(|status| { + if status == WaitStatus::StillAlive { + None + } else { + Some(status) + } + }); + + if result == Err(Errno::EINTR) { + continue; + } + + return result; + } +} + +/// The result of a non-blocking wait. A process can be in one of three main +/// states: running, ptrace-stopped, or exited. +/// +/// Both `Clone` and `Copy` are intentionally not implemented. This is to enforce +/// type safety. +#[derive(Debug, Eq, PartialEq)] +pub enum TryWait { + /// The process is in either a stopped state or an exited state. + Wait(Wait), + + /// The process is in a running state and thus can only be waited on. + /// + /// When the process is successfully waited on, it transitions to a waited + /// state. + Running(Running), +} + +impl TryWait { + /// Returns the PID for this attempted wait. + pub fn pid(&self) -> Pid { + match self { + Self::Wait(wait) => wait.pid(), + Self::Running(running) => running.pid(), + } + } + + /// Returns true if we're in a running state. Note that this may not reflect + /// the real *current* state that we may not yet have observed. + pub fn is_running(&self) -> bool { + matches!(self, Self::Running(_)) + } + + /// Returns true if we're in a stopped state. Note that this may not reflect + /// the real *current* state that we may not yet have observed. + pub fn is_stopped(&self) -> bool { + matches!(self, Self::Wait(Wait::Stopped(_, _))) + } + + /// Assumes the process is in a stopped state. Panics if it isn't. + pub fn assume_stopped(self) -> (Stopped, Event) { + match self { + Self::Wait(Wait::Stopped(stopped, event)) => (stopped, event), + status => Err(InvalidState(status)).unwrap(), + } + } + + /// Assumes the process is in a running state. Panics if it isn't. 
+ pub fn assume_running(self) -> Running { + match self { + Self::Running(running) => running, + status => Err(InvalidState(status)).unwrap(), + } + } + + /// Assumes the process is in an exited state. Panics if it isn't. + pub fn assume_exited(self) -> (Pid, ExitStatus) { + match self { + Self::Wait(Wait::Exited(pid, exit_status)) => (pid, exit_status), + status => Err(InvalidState(status)).unwrap(), + } + } +} + +impl fmt::Display for TryWait { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Wait(wait) => write!(f, "{}", wait), + Self::Running(Running(pid)) => write!(f, "pid {} is running", pid), + } + } +} + +impl From for TryWait { + fn from(status: Running) -> Self { + Self::Running(status) + } +} + +impl From for TryWait { + fn from(wait: Wait) -> Self { + Self::Wait(wait) + } +} + +/// The result of a blocking wait. A process in this state is guaranteed to not +/// be in a running state. +/// +/// Both `Clone` and `Copy` are intentionally not implemented. This is to enforce +/// type safety. +#[derive(Debug, Eq, PartialEq)] +pub enum Wait { + /// The process is in a stopped state and thus only operations that can be + /// done during a stopped state are allowed (i.e., ptrace operations). + /// + /// When the process is resumed, it transitions to a running state. + Stopped(Stopped, Event), + + /// The process has exited with an exit status. + Exited(Pid, ExitStatus), +} + +impl Wait { + /// Returns the PID for this state. + pub fn pid(&self) -> Pid { + match self { + Self::Stopped(Stopped(pid), _) => *pid, + Self::Exited(pid, _exit_status) => *pid, + } + } + + /// Assumes the process is in a stopped state. Panics if it isn't. + pub fn assume_stopped(self) -> (Stopped, Event) { + match self { + Self::Stopped(stopped, event) => (stopped, event), + state => Err(InvalidState(state.into())).unwrap(), + } + } + + /// Assumes the process is in an exited state. Panics if it isn't. 
+ pub fn assume_exited(self) -> (Pid, ExitStatus) { + match self { + Self::Exited(pid, exit_status) => (pid, exit_status), + state => Err(InvalidState(state.into())).unwrap(), + } + } + + /// Converts a raw `i32` status to this type. + /// + /// Preconditions: + /// The process must not be in a running state. + pub fn from_raw(pid: Pid, status: i32) -> Result { + Ok(if libc::WIFEXITED(status) { + Wait::Exited(pid, ExitStatus::Exited(libc::WEXITSTATUS(status))) + } else if libc::WIFSIGNALED(status) { + let sig = Signal::try_from(libc::WTERMSIG(status)).map_err(|_| Errno::EINVAL)?; + Wait::Exited(pid, ExitStatus::Signaled(sig, libc::WCOREDUMP(status))) + } else if libc::WIFSTOPPED(status) { + let task = Stopped(pid); + + let event = if libc::WSTOPSIG(status) == libc::SIGTRAP | 0x80 { + Event::Syscall + } else if (status >> 16) == 0 { + let sig = Signal::try_from(libc::WSTOPSIG(status)).map_err(|_| Errno::EINVAL)?; + Event::Signal(sig) + } else { + let sig = Signal::try_from(libc::WSTOPSIG(status)).map_err(|_| Errno::EINVAL)?; + + let event = status >> 16; + + // PTRACE_EVENT_STOP is not guaranteed to return the correct + // signal, so we ignore it here. + debug_assert!(event == libc::PTRACE_EVENT_STOP || sig == Signal::SIGTRAP); + + Event::from_ptrace_event(&task, event)? + }; + + Wait::Stopped(task, event) + } else if libc::WIFCONTINUED(status) { + // TODO: Handle continued status. + unimplemented!("Continued status not yet handled") + } else { + panic!("PID {} got unexpected status: {:#x}", pid, status) + }) + } +} + +impl TryFrom for Wait { + type Error = Error; + + /// Converts a `WaitStatus` to this type. + /// + /// Preconditions: + /// The process must not be in a `StillAlive` state. 
+ fn try_from(wait_status: WaitStatus) -> Result { + Ok(match wait_status { + WaitStatus::Exited(pid, code) => Self::Exited(pid.into(), ExitStatus::Exited(code)), + WaitStatus::Signaled(pid, sig, coredump) => { + Self::Exited(pid.into(), ExitStatus::Signaled(sig, coredump)) + } + WaitStatus::Stopped(pid, sig) => { + let event = Event::Signal(sig); + Self::Stopped(Stopped(pid.into()), event) + } + WaitStatus::PtraceEvent(pid, sig, event) => { + // PTRACE_EVENT_STOP is not guaranteed to return the correct + // signal, so we ignore it here. + debug_assert!(event == libc::PTRACE_EVENT_STOP || sig == Signal::SIGTRAP); + let task = Stopped(pid.into()); + let event = Event::from_ptrace_event(&task, event)?; + Self::Stopped(task, event) + } + WaitStatus::PtraceSyscall(pid) => { + let event = Event::Syscall; + Self::Stopped(Stopped(pid.into()), event) + } + WaitStatus::Continued(_pid) => { + // Not possible because we aren't using WaitPidFlag::WCONTINUED + // anywhere. + unreachable!("unexpected WaitStatus::Continued"); + } + WaitStatus::StillAlive => { + // The precondition of this function forbids this. + unreachable!("precondition violated with WaitStatus::StillAlive"); + } + }) + } +} + +/// Temporary conversion back to a `WaitStatus`. Used for refactoring +/// scaffolding. 
+impl From for WaitStatus { + fn from(wait: Wait) -> WaitStatus { + match wait { + Wait::Stopped(stopped, event) => { + let pid = stopped.pid(); + match event { + Event::NewChild(op, _child) => match op { + ChildOp::Fork => WaitStatus::PtraceEvent( + pid.into(), + Signal::SIGTRAP, + libc::PTRACE_EVENT_FORK, + ), + ChildOp::Vfork => WaitStatus::PtraceEvent( + pid.into(), + Signal::SIGTRAP, + libc::PTRACE_EVENT_VFORK, + ), + ChildOp::Clone => WaitStatus::PtraceEvent( + pid.into(), + Signal::SIGTRAP, + libc::PTRACE_EVENT_CLONE, + ), + }, + Event::Exec(_) => WaitStatus::PtraceEvent( + pid.into(), + Signal::SIGTRAP, + libc::PTRACE_EVENT_EXEC, + ), + Event::VforkDone => WaitStatus::PtraceEvent( + pid.into(), + Signal::SIGTRAP, + libc::PTRACE_EVENT_VFORK_DONE, + ), + Event::Exit => WaitStatus::PtraceEvent( + pid.into(), + Signal::SIGTRAP, + libc::PTRACE_EVENT_EXIT, + ), + Event::Seccomp => WaitStatus::PtraceEvent( + pid.into(), + Signal::SIGTRAP, + libc::PTRACE_EVENT_SECCOMP, + ), + Event::Stop => WaitStatus::PtraceEvent( + pid.into(), + Signal::SIGSTOP, + libc::PTRACE_EVENT_STOP, + ), + Event::Syscall => WaitStatus::PtraceSyscall(pid.into()), + Event::Signal(sig) => WaitStatus::Stopped(pid.into(), sig), + } + } + Wait::Exited(pid, ExitStatus::Exited(code)) => WaitStatus::Exited(pid.into(), code), + Wait::Exited(pid, ExitStatus::Signaled(signal, core_dump)) => { + WaitStatus::Signaled(pid.into(), signal, core_dump) + } + } + } +} + +impl fmt::Display for Wait { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Stopped(stopped, event) => { + write!(f, "pid {} stopped ({:?})", stopped.pid(), event) + } + Self::Exited(pid, exit_status) => write!(f, "pid {} exited ({:?})", pid, exit_status), + } + } +} + +// The libc crate doesn't provide this struct; it is the argument block passed to PTRACE_PEEKSIGINFO. +#[repr(C)] +struct ptrace_peeksiginfo_args { + off: u64, + flags: u32, + nr: u32, +} + +bitflags! 
{ + /// Flags for `PTRACE_PEEKSIGINFO`. + pub struct PeekSigInfoFlags: u32 { + /// Dump signals from the process-wide signal queue. Signals are + /// read from the per-thread queue of the specified thread if this + /// flag is not set. + const SHARED = 1; + } +} + +/// A process that is in a stopped state and allows ptrace operations to be +/// performed. +#[derive(Debug, Hash, Eq, PartialEq)] +pub struct Stopped(Pid); + +// TODO: Since we can guarantee that a process is in a stopped state, we should +// impl `MemoryAccess` *only* for the `Stopped` type. +impl Stopped { + /// Waits for the next exit stop to occur. This is received asynchronously + /// regardless of what the process was doing at the time. This is useful for + /// canceling futures when a process enters a `PTRACE_EVENT_EXIT` (such as + /// when one thread calls `exit_group` and causes all other threads to + /// suddenly exit). + pub fn exit_event(&self) -> notifier::ExitFuture { + notifier::ExitFuture(self.0) + } + + /// Creates a new stopped state. This is useful when we know the process is + /// in a stopped state already. + /// + /// Using this method is unsound because there is no check to verify that the + /// pid really is in a stopped state. It is better to arrive at a stopped + /// state via other methods such as `Running::wait`. + pub fn new_unchecked(pid: Pid) -> Self { + // FIXME: Remove this method. + Stopped(pid) + } + + /// Returns the process ID of the tracee. + pub fn pid(&self) -> Pid { + self.0 + } + + /// Sets the ptrace options. + pub fn setoptions(&self, options: ptrace::Options) -> Result<(), Error> { + ptrace::setoptions(self.0.into(), options).map_err(|err| from_nix_err(self.0, err)) + } + + /// Gets the state of the registers. + pub fn getregs(&self) -> Result { + ptrace::getregs(self.0.into()).map_err(|err| from_nix_err(self.0, err)) + } + + /// Sets the registers. 
+ pub fn setregs(&self, regs: libc::user_regs_struct) -> Result<(), Error> { + ptrace::setregs(self.0.into(), regs).map_err(|err| from_nix_err(self.0, err)) + } + + /// Gets the state of the floating-point registers. + pub fn getfpregs(&self) -> Result { + let mut data = mem::MaybeUninit::::uninit(); + nix::errno::Errno::result(unsafe { + libc::ptrace( + libc::PTRACE_GETFPREGS, + self.0.as_raw(), + ptr::null() as *const libc::c_void, + data.as_mut_ptr() as *const _ as *const libc::c_void, + ) + }) + .map_err(|err| from_nix_err(self.0, err))?; + Ok(unsafe { data.assume_init() }) + } + + /// Sets the floating-point registers. + pub fn setfpregs(&self, regs: libc::user_fpregs_struct) -> Result<(), Error> { + nix::errno::Errno::result(unsafe { + libc::ptrace( + libc::PTRACE_SETFPREGS, + self.0.as_raw(), + ptr::null() as *const libc::c_void, + ®s as *const _ as *const libc::c_void, + ) + }) + .map_err(|err| from_nix_err(self.0, err))?; + Ok(()) + } + + /// Resumes the process and transitions it back to a running state. + pub fn resume>>(self, sig: T) -> Result { + ptrace::cont(self.0.into(), sig).map_err(|err| from_nix_err(self.0, err))?; + Ok(Running::new(self.0)) + } + + /// Advances the execution of the process by a single step optionally + /// delivering a signal specified by `sig`. + pub fn step>>(self, sig: T) -> Result { + ptrace::step(self.0.into(), sig).map_err(|err| from_nix_err(self.0, err))?; + Ok(Running::new(self.0)) + } + + /// Like `resume`, but arranges for the tracee to be stopped at the next + /// entry to or exit from a system call. + pub fn syscall>>(self, sig: T) -> Result { + ptrace::syscall(self.0.into(), sig).map_err(|err| from_nix_err(self.0, err))?; + Ok(Running::new(self.0)) + } + + /// Gets info about the signal that caused the process to be stopped. + pub fn getsiginfo(&self) -> Result { + ptrace::getsiginfo(self.0.into()).map_err(|err| from_nix_err(self.0, err)) + } + + /// Sets info about the signal that caused the process to be stopped. 
+ pub fn setsiginfo(&self, siginfo: &libc::siginfo_t) -> Result<(), Error> { + ptrace::setsiginfo(self.0.into(), siginfo).map_err(|err| from_nix_err(self.0, err)) + } + + /// Like `getsiginfo`, but does not remove the signal info from the + /// queue. Returns the siginfo entries read, starting at queue offset 0. + pub fn peeksiginfo>>( + &self, + flags: T, + ) -> Result, Error> { + const SIGNAL_MAX: usize = 8 * std::mem::size_of::(); + let mut data = core::mem::MaybeUninit::<[libc::siginfo_t; SIGNAL_MAX]>::zeroed(); + let mut siginfo_args = ptrace_peeksiginfo_args { + off: 0, + flags: flags.into().map_or(0, |x| x.bits()), + nr: SIGNAL_MAX as u32, + }; + let count = nix::errno::Errno::result(unsafe { + libc::ptrace( + libc::PTRACE_PEEKSIGINFO, + self.0.as_raw(), + &mut siginfo_args as *mut _, + data.as_mut_ptr() as *const _ as *const libc::c_void, + ) + }) + .map_err(|err| from_nix_err(self.0, err))?; + Ok(unsafe { data.assume_init() }[0..count as usize].to_vec()) + } + + /// Retrieve a message about the ptrace event that just happened. + /// + /// It shouldn't be necessary to call this in most cases because `Event` + /// provides the necessary context for certain ptrace events. + pub fn getevent(&self) -> Result { + ptrace::getevent(self.0.into()).map_err(|err| from_nix_err(self.0, err)) + } + + /// Detaches from and then resumes the stopped tracee. + pub fn detach>>(self, sig: T) -> Result { + ptrace::detach(self.0.into(), sig).map_err(|err| from_nix_err(self.0, err))?; + Ok(Running::new(self.0)) + } +} + +/// Waits for any child processes to change state, blocking until the next event. +/// This is equivalent to `waitpid(-1)`. Returns `Ok(None)` when there are no children left to wait for. +pub fn wait_all() -> Result, Error> { + let result = wait(IdType::All, WaitPidFlag::WEXITED | WaitPidFlag::WSTOPPED) + .map_err(Error::from) + .and_then(|status| { + // Unwrap is OK because the process cannot be left in a running + // state without WNOHANG. 
+ Wait::try_from(status.unwrap()) + }); + + match result { + Ok(state) => Ok(Some(state)), + Err(Error::Errno(Errno::ECHILD)) => { + // waitpid(-1) only returns ECHILD when there are no more children + // to wait for. Returning `None` here makes it easy to write a while + // loop that terminates when there are no more children left. + Ok(None) + } + Err(err) => Err(err), + } +} + +/// Like `wait_all`, but immediately returns `Ok(None)` if no state transition +/// is ready yet. +/// +/// This is the non-blocking version of `wait_all`. Unlike `wait_all`, an `ECHILD` error is not mapped to `Ok(None)` here. +pub fn try_wait_all() -> Result, Error> { + wait( + IdType::All, + WaitPidFlag::WEXITED | WaitPidFlag::WSTOPPED | WaitPidFlag::WNOHANG, + )? + .map(Wait::try_from) + .transpose() +} + +/// Waits for any child in a process group to change state, blocking until the +/// next event. Returns `Ok(None)` when the group has no children left to wait for. +pub fn wait_group(pid: Pid) -> Result, Error> { + let result = wait( + IdType::Pgid(pid.into()), + WaitPidFlag::WEXITED | WaitPidFlag::WSTOPPED, + ) + .map_err(Error::from) + .and_then(|status| { + // Unwrap is OK because the process cannot be left in a running + // state without WNOHANG. + Wait::try_from(status.unwrap()) + }); + + match result { + Ok(state) => Ok(Some(state)), + Err(Error::Errno(Errno::ECHILD)) => { + // This only returns ECHILD when there are no more children to wait + // for. Returning `None` here makes it easy to write a while loop + // that terminates when there are no more children left. + Ok(None) + } + Err(err) => Err(err), + } +} + +/// Blocks until a state change is ready to consume, but does not consume it. +/// Returns the process (as a `Running`) that has the pending state change. Returns `Ok(None)` if +/// there are no child processes to wait on. +/// +/// This is useful for deciding which processes to consume events for. +/// +/// # Examples +/// +/// ```ignore +/// while let Some(process) = peek_all()? { +/// match process.wait()? 
{ +/// Wait::Stopped(tracee, _event) => { +/// tracee.resume(None)?; +/// } +/// Wait::Exited(pid, exit_status) => { +/// println!("pid {} exited ({})", pid, exit_status); +/// } +/// } +/// } +/// ``` +pub fn peek_all() -> Result, Errno> { + let result = wait( + IdType::All, + WaitPidFlag::WEXITED | WaitPidFlag::WSTOPPED | WaitPidFlag::WNOWAIT, + ) + .map(|state| { + // Unwrap is OK because the process cannot be in a running state without + // WNOHANG. + state.unwrap() + }); + + match result { + Ok(status) => Ok(status.pid().map(|pid| Running(pid.into()))), + Err(Errno::ECHILD) => { + // waitpid(-1) only returns ECHILD when there are no more children + // to wait for. Returning `None` here makes it easy to write a while + // loop that terminates when there are no more children left. + Ok(None) + } + Err(err) => Err(err), + } +} + +/// Returns a process that is ready to change state. If there are no child +/// processes ready to change, returns `Ok(None)` immediately. +/// +/// This is the non-blocking version of `peek_all`. +pub fn try_peek_all() -> Result, Errno> { + let next = wait( + IdType::All, + WaitPidFlag::WEXITED | WaitPidFlag::WSTOPPED | WaitPidFlag::WNOHANG | WaitPidFlag::WNOWAIT, + )?; + + Ok(next.and_then(|state| state.pid().map(|pid| Running(pid.into())))) +} + +/// A running child. +#[derive(Debug, Hash, Eq, PartialEq)] +pub struct Running(Pid); + +impl Running { + /// Creates a new `Running` handle for `pid`; no process is spawned. This is generally the entry point for a + /// new process as soon as it is created. + pub fn new(pid: Pid) -> Self { + Running(pid) + } + + /// Attaches to a running process. The process becomes a tracee and a SIGSTOP + /// is sent to it. By the time this function ends, the tracee may not yet + /// have actually stopped. Thus, the tracee is still considered to be in a + /// running state and needs to be waited upon to observe the SIGSTOP. 
+ pub fn attach(pid: Pid) -> Result { + ptrace::attach(pid.into()).map_err(|err| Errno::new(err as i32))?; + Ok(Running(pid)) + } + + /// Similar to `attach`, but does not stop the process. This also affects the + /// events that are later delivered. Upon clone, fork, or vfork, an + /// `Event::Stop` is delivered instead of `Event::Signal(Signal::SIGSTOP)`. + /// + /// Unlike other modes, a seized process can also accept interrupts. + pub fn seize(pid: Pid, options: Options) -> Result { + ptrace::seize(pid.into(), options).map_err(|err| Errno::new(err as i32))?; + Ok(Running(pid)) + } + + /// Interrupts the running process, even if it is in the middle of a syscall. + /// The next time the process is waited on, the process transitions to a + /// stopped state and `Event::Stop` is returned. + /// + /// # Limitations + /// + /// This only works for processes being traced via `Running::seize`. + pub fn interrupt(&self) -> Result<(), Errno> { + // nix doesn't provide `ptrace::interrupt` yet, so we need to roll our + // own. + Errno::result(unsafe { + libc::ptrace( + libc::PTRACE_INTERRUPT, + self.0.as_raw(), + std::ptr::null_mut::(), + std::ptr::null_mut::(), + ) + }) + .map(drop) + } + + /// Returns the pid of the running process. + pub fn pid(&self) -> Pid { + self.0 + } + + /// Blocks until a state change occurs. This may transition the process to + /// either a stopped state or exited state, but never a running state. + pub fn wait(self) -> Result { + wait( + IdType::Pid(self.0.into()), + WaitPidFlag::WEXITED | WaitPidFlag::WSTOPPED, + ) + .map_err(Error::from) + .and_then(|status| { + // Unwrap is OK because the process cannot be in a running state without + // WNOHANG. + Wait::try_from(status.unwrap()) + }) + } + + /// Like `wait`, but filters out events we don't care about by resuming the + /// tracee when encountering them. This is useful for skipping past spurious + /// events until a point we know the tracee must stop. Skipped signal stops are resumed with that same signal. 
+ pub async fn wait_until(mut self, mut pred: F) -> Result + where + F: FnMut(&Event) -> bool, + { + loop { + match self.next_state().await? { + Wait::Stopped(stopped, event) => { + if pred(&event) { + break Ok(Wait::Stopped(stopped, event)); + } else if let Event::Signal(sig) = event { + self = stopped.resume(Some(sig))?; + } else { + self = stopped.resume(None)?; + } + } + task => break Ok(task), + } + } + } + + /// Waits until we receive a specific stop signal. Useful for skipping past + /// spurious signals. Returns early if the process exits first. + pub async fn wait_for_signal(self, sig: Signal) -> Result { + self.wait_until(|event| event == &Event::Signal(sig)).await + } + + /// Waits for the next exit stop to occur. This is received asynchronously + /// regardless of what the process was doing at the time. This is useful for + /// canceling futures when a process enters a `PTRACE_EVENT_EXIT` (such as + /// when one thread calls `exit_group` and causes all other threads to + /// suddenly exit). + pub fn exit_event(&self) -> notifier::ExitFuture { + notifier::ExitFuture(self.0) + } + + /// Like `wait`, but waits asynchronously for the next state change. + pub async fn next_state(self) -> Result { + notifier::WaitFuture(self).await + } +} + +/// A process that is no longer running, but hasn't yet fully exited. The only +/// thing a zombie can do is exit. +#[derive(Debug, Hash, Eq, PartialEq)] +pub struct Zombie(Running); + +impl Zombie { + /// Creates a new instance. + fn new(pid: Pid) -> Self { + Zombie(Running(pid)) + } + + /// Returns the PID of the zombie. + pub fn pid(&self) -> Pid { + self.0.pid() + } + + /// Reaps the zombie by waiting for it to fully exit. Reports `ExitStatus::Exited(1)` on `ECHILD`; panics on any other unexpected result. + pub async fn reap(self) -> ExitStatus { + // The tracee may not be fully dead yet. It is possible for it to + // still enter an `Event::Exit` state by waiting on it. For more info, + // see the "BUGS" section in `man 2 ptrace`. 
+ let mut next_state = self.0.next_state().await; + + loop { + match next_state { + Ok(wait) => match wait { + Wait::Stopped(stopped, event) => { + if let Event::Exit = event { + next_state = match stopped.resume(None) { + Ok(task) => task.next_state().await, + Err(err) => Err(err), + }; + } else { + panic!("Task {:?} unexpected stop event {:?}", stopped, event) + } + } + Wait::Exited(_pid, exit_status) => break exit_status, + }, + Err(Error::Died(zombie)) => next_state = zombie.0.next_state().await, + Err(Error::Errno(Errno::ECHILD)) => break ExitStatus::Exited(1), + other => panic!( + "Got unexpected result when awaiting final death {:?}", + other + ), + } + } + } +} + +impl fmt::Display for Zombie { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.pid()) + } +} + +/// Sets up the calling process to be traced by its parent and then raises SIGSTOP on itself. +pub fn traceme_and_stop() -> Result<(), Errno> { + ptrace::traceme() + .and_then(|()| nix::sys::signal::raise(Signal::SIGSTOP)) + .map_err(|e| Errno::new(e as i32))?; + Ok(()) +} + +/// These tests are meant to test this API but also to show how ptrace works. +#[cfg(test)] +mod test { + use super::*; + + use nix::{ + sys::signal::{self, Signal}, + unistd::{fork, ForkResult}, + }; + + use std::{io, mem, thread}; + + // Traces a closure in a forked process. The child is seized with `options` and + // returned in a stopped state, before the closure has run, so that ptrace options may be set. + fn trace(f: F, options: Options) -> Result<(Pid, Stopped), Error> + where + F: FnOnce() -> i32, + { + match unsafe { fork() }.map_err(|err| err)? { + ForkResult::Parent { child, .. } => { + let mut running = Running::seize(child.into(), options)?; + + // Keep consuming events until we reach a SIGSTOP or group stop. + let stopped = loop { + match running.wait()? 
{ + Wait::Stopped(stopped, event) => { + if event == Event::Signal(Signal::SIGSTOP) || event == Event::Stop { + break stopped; + } else if let Event::Signal(sig) = event { + running = stopped.resume(Some(sig))?; + } else { + running = stopped.resume(None)?; + } + } + task => panic!("Got unexpected exit: {:?}", task), + } + }; + + Ok((stopped.pid(), stopped)) + } + ForkResult::Child => { + // Create a new process group so we can wait on this process and + // every child more efficiently. + let _ = unsafe { libc::setpgid(0, 0) }; + + // Suppress core dumps for testing purposes. + let limit = libc::rlimit { + rlim_cur: 0, + rlim_max: 0, + }; + let _ = unsafe { libc::setrlimit(libc::RLIMIT_CORE, &limit) }; + + // PTRACE_SEIZE is inherently racy, so we stop the child + // process here. + signal::raise(Signal::SIGSTOP).unwrap(); + + // Run the closure once the process is resumed. + let exit_code = f(); + + // Note: We can't use the normal exit function here because we + // don't want to call atexit handlers since `execve` was never + // called. + let _ = unsafe { ::libc::_exit(exit_code) }; + } + } + } + + #[test] + fn basic() -> Result<(), Box> { + // Do nothing but exit. + let (pid, tracee) = trace(|| 42, Options::empty())?; + assert_eq!( + tracee.resume(None)?.wait()?, + Wait::Exited(pid, ExitStatus::Exited(42)) + ); + + Ok(()) + } + + #[test] + fn stop_on_exit() -> Result<(), Box> { + let (pid, tracee) = trace( + || 42, + Options::PTRACE_O_EXITKILL | Options::PTRACE_O_TRACEEXIT, + )?; + + let running = tracee.resume(None)?; + let (stopped, event) = running.wait()?.assume_stopped(); + + // The tracee has stopped just before exiting. Resuming or detaching now + // will let the process exit. 
+ assert_eq!(event, Event::Exit); + + assert_eq!( + stopped.resume(None)?.wait()?, + Wait::Exited(pid, ExitStatus::Exited(42)) + ); + + Ok(()) + } + + #[test] + #[cfg(not(sanitized))] + fn serialized_threads() -> Result<(), Box> { + const THREAD_COUNT: usize = 8; + + let (pid, tracee) = trace( + move || { + // Create a handful of threads that do nothing but exit. + let threads = (0..THREAD_COUNT) + .map(|i| thread::spawn(move || i)) + .collect::>(); + + for t in threads { + t.join().unwrap(); + } + + 42 + }, + Options::PTRACE_O_EXITKILL + | Options::PTRACE_O_TRACEEXIT + | ptrace::Options::PTRACE_O_TRACECLONE, + )?; + + let mut parent = tracee.resume(None)?; + + // We should observe each thread getting created; it is run to completion before the parent resumes. + for _ in 0..THREAD_COUNT { + let (stopped, event) = parent.wait()?.assume_stopped(); + + let child = match event { + Event::NewChild(ChildOp::Clone, child) => child, + e => panic!("Expected clone event, got {:?}", e), + }; + + // Should be at a group stop. + let (child, event) = child.wait()?.assume_stopped(); + assert_eq!(event, Event::Stop); + + // Resume the child. + let child = child.resume(None)?; + + // Wait for its exit stop. + let (child, event) = child.wait()?.assume_stopped(); + assert_eq!(event, Event::Exit); + + // Resume one last time to let it fully exit. + let (_child_pid, exit_status) = child.resume(None)?.wait()?.assume_exited(); + assert_eq!(exit_status, ExitStatus::Exited(0)); + + // Resume the parent. + parent = stopped.resume(None)?; + } + + // ptrace stop just before fully exiting. + let (parent, event) = parent.wait()?.assume_stopped(); + assert_eq!(event, Event::Exit); + + // Fully exited. 
+ let parent = parent.resume(None)?; + assert_eq!(parent.wait()?, Wait::Exited(pid, ExitStatus::Exited(42))); + + Ok(()) + } + + #[cfg(not(sanitized))] + fn group_exit(thread_count: usize) -> Result<(), Box> { + use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }; + use std::time::Duration; + + let (parent_pid, tracee) = trace( + move || { + let counter = Arc::new(AtomicUsize::new(0)); + + // Create `thread_count` threads that just sleep for 60 seconds. + let _threads = (0..thread_count) + .map(|_i| { + let counter = counter.clone(); + + thread::spawn(move || { + counter.fetch_add(1, Ordering::Relaxed); + thread::sleep(Duration::from_secs(60)); + }) + }) + .collect::>(); + + // Wait for each of the threads to actually get initialized. + while counter.load(Ordering::Relaxed) != thread_count { + thread::yield_now(); + } + + // All threads should be alive at this point. SYS_exit_group + // should force all threads to exit. + let _ = unsafe { libc::syscall(libc::SYS_exit_group, 42) }; + + unreachable!() + }, + Options::PTRACE_O_EXITKILL + | Options::PTRACE_O_TRACEEXIT + | ptrace::Options::PTRACE_O_TRACECLONE, + )?; + + tracee.resume(None)?; + + let mut exited = Vec::new(); + + // Keep consuming events until everything has exited. + while let Some(wait) = wait_group(parent_pid)? { + match wait { + Wait::Stopped(tracee, _event) => { + tracee.resume(None)?; + } + Wait::Exited(pid, exit_status) => { + exited.push((pid, exit_status)); + } + } + } + + // The parent should have exited last. + assert_eq!(exited.pop(), Some((parent_pid, ExitStatus::Exited(42)))); + + // The only things left should be the threads that were spawned. + assert_eq!(exited.len(), thread_count); + + // All others should have exited with the same exit status. + for (_pid, exit_status) in exited { + assert_eq!(exit_status, ExitStatus::Exited(42)); + } + + Ok(()) + } + + /// Tests that we receive an exit for all threads in the right order even + /// when the main thread calls `exit_group`. 
+ #[test] + #[cfg(not(sanitized))] + fn group_exit_stress() { + // Test a variety of thread counts (0 through 49, each twice). Super-high thread counts make + // ptrace very slow, so we keep this to a relatively low number. + for i in 0..100 { + group_exit(i / 2).unwrap(); + } + } + + /// Tests that trying to trace from another thread does not work. + #[test] + fn trace_from_another_thread() -> Result<(), Box> { + let (pid, tracee) = trace(|| 42, Options::empty()).unwrap(); + + assert_eq!( + // Try resuming from another thread, which should fail. + thread::spawn(move || tracee.resume(None)).join().unwrap(), + // The process didn't actually die, this is just how ESRCH was + // interpreted. + Err(Error::Died(Zombie::new(pid))) + ); + + assert_eq!( + Stopped(pid).resume(None)?.wait()?, + Wait::Exited(pid, ExitStatus::Exited(42)) + ); + + Ok(()) + } + + #[test] + fn trace_killed_by_signal() -> Result<(), Box> { + let (pid, tracee) = trace( + || { + signal::raise(Signal::SIGILL).unwrap(); + unreachable!() + }, + Options::PTRACE_O_EXITKILL, + )?; + + let running = tracee.resume(None)?; + + let (stopped, event) = running.wait()?.assume_stopped(); + + // The tracee has stopped on delivery of SIGILL. Resuming with that signal + // will let it terminate the process. 
+ assert_eq!(event, Event::Signal(Signal::SIGILL)); + + assert_eq!( + stopped.resume(Some(Signal::SIGILL))?.wait()?, + Wait::Exited(pid, ExitStatus::Signaled(Signal::SIGILL, true)) + ); + + Ok(()) + } + + #[cfg(not(sanitized))] + #[tokio::test] + async fn notifier_basic() -> Result<(), Box> { + let (pid, tracee) = trace(|| 42, Options::empty())?; + assert_eq!( + tracee.resume(None)?.next_state().await?, + Wait::Exited(pid, ExitStatus::Exited(42)) + ); + + Ok(()) + } + + // 64-bit kernel sigset (bit `n - 1` is set for signal `n`) used by the raw `rt_sigprocmask` syscall + #[derive(Clone, Copy, PartialEq, Eq, Debug)] + struct KernelSigset(u64); + + impl From<&[Signal]> for KernelSigset { + fn from(signals: &[Signal]) -> Self { + let mut set: u64 = 0; + for &sig in signals { + set |= 1u64 << (sig as usize - 1); + } + KernelSigset(set) + } + } + + #[no_mangle] + extern "C" fn sigalrm_handler( + _sig: i32, + _siginfo: *mut libc::siginfo_t, + _ucontext: *const libc::c_void, + ) { + nix::unistd::write(2, b"caught SIGALRM!").unwrap(); + } + + #[allow(dead_code)] + unsafe fn install_sigalrm_handler() -> i32 { + let mut sa: libc::sigaction = mem::MaybeUninit::zeroed().assume_init(); + sa.sa_flags = libc::SA_RESTART | libc::SA_SIGINFO | libc::SA_NODEFER; + sa.sa_sigaction = sigalrm_handler as _; + + libc::sigaction(libc::SIGALRM, &sa as *const _, std::ptr::null_mut()) + } + + #[allow(dead_code)] + // Unblocks the given signal(s) and returns the previous mask; signal handlers are left untouched. + unsafe fn unblock_signals(signals: &[Signal]) -> io::Result { + let set = KernelSigset::from(signals); + let mut oldset: mem::MaybeUninit = mem::MaybeUninit::uninit(); + + if libc::syscall( + libc::SYS_rt_sigprocmask, + libc::SIG_UNBLOCK, + &set as *const _, + oldset.as_mut_ptr(), + 8, + ) != 0 + { + Err(io::Error::last_os_error()) + } else { + Ok(KernelSigset(oldset.assume_init())) + } + } + + #[allow(dead_code)] + unsafe fn block_signals(signals: &[Signal]) -> io::Result { + let set = KernelSigset::from(signals); + let mut oldset: mem::MaybeUninit = mem::MaybeUninit::uninit(); + + if 
libc::syscall( + libc::SYS_rt_sigprocmask, + libc::SIG_BLOCK, + &set as *const _, + oldset.as_mut_ptr(), + 8, + ) != 0 + { + Err(io::Error::last_os_error()) + } else { + Ok(KernelSigset(oldset.assume_init())) + } + } + + #[cfg(not(sanitized))] + #[test] + fn peeksiginfo_returns_pending_siginfo() -> Result<(), Box> { + let (parent_pid, tracee) = trace( + move || { + let _ = unsafe { + block_signals(&[Signal::SIGALRM, Signal::SIGVTALRM, Signal::SIGPROF]) + }; + assert!(signal::raise(Signal::SIGALRM).is_ok()); + assert!(signal::raise(Signal::SIGVTALRM).is_ok()); + assert!(signal::raise(Signal::SIGPROF).is_ok()); + + // The three blocked signals are now pending. Exit via SYS_exit_group + // so the tracer can peek at them at the exit stop. + let _ = unsafe { libc::syscall(libc::SYS_exit_group, 0) }; + + unreachable!() + }, + Options::PTRACE_O_EXITKILL + | Options::PTRACE_O_TRACEEXIT + | ptrace::Options::PTRACE_O_TRACECLONE, + )?; + + tracee.resume(None)?; + + let mut exited = Vec::new(); + + // Keep consuming events until everything has exited. + while let Some(wait) = wait_group(parent_pid)? { + match wait { + Wait::Stopped(tracee, Event::Exit) => { + let pending: Vec<_> = tracee + .peeksiginfo(None)? + .iter() + .map(|&si| Signal::try_from(si.si_signo).unwrap()) + .collect(); + assert_eq!( + pending, + [Signal::SIGALRM, Signal::SIGVTALRM, Signal::SIGPROF] + ); + // Do a second peek here to demonstrate that peek doesn't + // *pop* pending signals. + let pending: Vec<_> = tracee + .peeksiginfo(None)? 
+ .iter() + .map(|&si| Signal::try_from(si.si_signo).unwrap()) + .collect(); + assert_eq!( + pending, + [Signal::SIGALRM, Signal::SIGVTALRM, Signal::SIGPROF] + ); + tracee.resume(None)?; + } + Wait::Stopped(tracee, _event) => { + tracee.resume(None)?; + } + Wait::Exited(pid, exit_status) => { + exited.push((pid, exit_status)); + } + } + } + + // The parent should have exited last, with status 0. + assert_eq!(exited.pop(), Some((parent_pid, ExitStatus::Exited(0)))); + + Ok(()) + } + + #[cfg(not(sanitized))] + #[test] + fn getsiginfo_should_success() -> Result<(), Box> { + let (parent_pid, tracee) = trace( + move || { + let _ = unsafe { unblock_signals(&[Signal::SIGALRM]) }; + let _ = unsafe { block_signals(&[Signal::SIGVTALRM, Signal::SIGPROF]) }; + assert_eq!(unsafe { install_sigalrm_handler() }, 0); + assert!(signal::raise(Signal::SIGALRM).is_ok()); + + // Nothing else to do after raising SIGALRM. SYS_exit_group + // exits the process with status 0. + let _ = unsafe { libc::syscall(libc::SYS_exit_group, 0) }; + + unreachable!() + }, + Options::PTRACE_O_EXITKILL + | Options::PTRACE_O_TRACEEXIT + | ptrace::Options::PTRACE_O_TRACECLONE, + )?; + + tracee.resume(None)?; + + let mut exited = Vec::new(); + + // Keep consuming events until everything has exited. + while let Some(wait) = wait_group(parent_pid)? 
{ + match wait { + Wait::Stopped(tracee, Event::Signal(Signal::SIGALRM)) => { + let siginfo = tracee.getsiginfo()?; + assert_eq!(siginfo.si_signo, Signal::SIGALRM as i32); + tracee.resume(Signal::SIGALRM)?; + } + Wait::Stopped(tracee, Event::Signal(other_signal)) => { + tracee.resume(other_signal)?; + } + Wait::Stopped(tracee, _event) => { + tracee.resume(None)?; + } + Wait::Exited(pid, exit_status) => { + exited.push((pid, exit_status)); + } + } + } + + // The parent should have exited last, with status 0. + assert_eq!(exited.pop(), Some((parent_pid, ExitStatus::Exited(0)))); + + Ok(()) + } +} diff --git a/reverie-ptrace/src/trace/notifier.rs b/reverie-ptrace/src/trace/notifier.rs new file mode 100644 index 0000000..7117590 --- /dev/null +++ b/reverie-ptrace/src/trace/notifier.rs @@ -0,0 +1,362 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! # Making `ptrace` async +//! +//! Getting asynchronous notifications for a tree of child processes is tricky. +//! The common way is to just call `waitpid(-1)` in the tracer process and let +//! that scoop up every event for every child of the current process. This is +//! what `strace` and `rr` do to receive `ptrace` stop events. The problem is +//! that we shouldn't do something like that in a library like Reverie since we +//! don't know what other (untraced) processes the user has spawned. Calling +//! `waitpid(-1)` will consume and "steal" exit events from processes we aren't +//! actively tracing. +//! +//! The best solution would be one where we can wait on all child processes of a +//! specific subtree. +//! +//! ## Failed ideas +//! +//! 1. As an initial dumb implementation, we simply called `waitid` on all child +//! processes one by one in a round-robin fashion until an event was finally +//! received. 
While it worked, this wasn't the best solution for two reasons: +//! (1) it used a lot of CPU which starved the guest of CPU resources and +//! slowed everything down to a crawl, and (2) it didn't allow us to receive +//! `PTRACE_EVENT_EXIT` events out-of-band which is necessary for canceling +//! pending futures in the event a guest process is suddenly killed. +//! 2. Using `pidfd_open(2)` to receive events over file descriptors would be +//! great, but `ptrace` events are not receivable with `pidfd`. This might +//! change in the future, but there is currently no motivation among Linux +//! devs to implement support for that. (Folks hate the complexity of ptrace +//! and are fearful of introducing new security vulnerabilities.) +//! 3. Using `tokio::task::spawn_blocking` to simply call `waitid()` on the +//! process we're interested in works, but is about twice as slow as (1) +//! because of the overhead of locking a mutex and shuffling bits of data +//! in/out of the Tokio thread pool. +//! 4. Process groups sound like the ideal solution, but it is possible for a +//! process to escape a process group by simply calling `setpgid(2)`. Thus, +//! such a solution would need to be aware of all calls to `setpgid` and +//! `setsid` to perform proper bookkeeping and maintain an internal set of +//! process groups. +//! 5. We could fork off a child process that calls `waitpid(-1)`, which then +//! sends events back to the tracer process via a pipe. The forked process +//! would need to call `prctl` with `PR_SET_CHILD_SUBREAPER` so that orphaned +//! processes don't escape the process tree. This is similar to [what Bazel +//! does](https://jmmv.dev/2019/11/bazel-process-wrapper.html) to keep track +//! of the process tree of a build rule. Unfortunately, this won't work +//! because `ptrace` must only be called by the *thread* that spawned the +//! initial process. +//! +//! ## Current implementation +//! +//! 
Currently, we spawn one thread per guest thread who each call `waitid` in a +//! loop on an individual thread/process ID. The nice thing about this is that we +//! can receive `PTRACE_EVENT_EXIT` events "out-of-band" and use that to cancel +//! any futures that may be pending in a tool's `handle_syscall_event`. This +//! approach also avoids the overhead of `tokio::task::spawn_blocking` by not +//! locking a `Mutex` each time an event is received. (An `AtomicI32` plus an +//! `AtomicWaker` can be used instead.) The downside of this approach is that we +//! can end up spawning a lot of guest threads. + +use std::collections::{hash_map::Entry, HashMap}; +use std::fmt; +use std::future::Future; +use std::pin::Pin; +use std::sync::atomic::{AtomicI32, Ordering}; +use std::sync::Arc; +use std::task::{Context, Poll, Waker}; +use std::thread::{self, JoinHandle}; + +use futures::task::AtomicWaker; +use lazy_static::lazy_static; +use nix::sys::wait::{WaitPidFlag, WaitStatus}; +use parking_lot::Mutex; + +use super::waitid; +use crate::trace::{self, peek_all, Errno, Error, Pid, Running, Stopped, TryWait, Wait}; + +lazy_static! { + static ref NOTIFIER: Notifier = Notifier::new(); +} + +/// A place-holder status used to indicate that no status has been set. +const INVALID_STATUS: i32 = -1; + +/// The number we get when in a PTRACE_EVENT_EXIT stop. +const PTRACE_EVENT_EXIT_STOP: i32 = (libc::PTRACE_EVENT_EXIT << 16) | (libc::SIGTRAP << 8) | 0x7f; + +#[derive(Debug)] +struct Event { + /// Waker for exit events. + exit_waker: AtomicWaker, + + /// Waker for regular status events. + status_waker: AtomicWaker, + + /// The raw status. A status of `-1` indicates that no status has been set + /// yet. 
+ status: AtomicI32, +} + +impl Event { + pub fn new() -> Self { + Self { + exit_waker: AtomicWaker::new(), + status_waker: AtomicWaker::new(), + status: AtomicI32::new(INVALID_STATUS), + } + } + + pub fn from_exit_waker(waker: &Waker) -> Self { + let me = Self::new(); + me.exit_waker.register(waker); + me + } + + pub fn from_status_waker(waker: &Waker) -> Self { + let me = Self::new(); + me.status_waker.register(waker); + me + } + + /// Replaces the status and notifies the notifier of the change. Returns the + /// old status if there was one. + pub fn update(&self, status: i32) -> Option { + let previous = self.status.swap(status, Ordering::SeqCst); + + if status == PTRACE_EVENT_EXIT_STOP { + self.exit_waker.wake(); + } else { + self.status_waker.wake(); + } + + if previous == INVALID_STATUS { + None + } else { + Some(previous) + } + } + + /// Polls the event to check if there is a new status ready to be consumed. + pub fn poll_status(&self, waker: &Waker) -> Poll { + // Register the waker *before* checking the status to avoid a race condition. + self.status_waker.register(waker); + + // Only modify the status if we're *not* in a PTRACE_EVENT_EXIT stop. + // TODO: Think really hard and relax the ordering. + let status = self + .status + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |prev| { + if prev == INVALID_STATUS || prev == PTRACE_EVENT_EXIT_STOP { + // Don't update if we're exiting or if there is no status to + // be consumed. + None + } else { + // Reset the value to indicate it has been consumed. + Some(INVALID_STATUS) + } + }); + + match status { + Ok(status) => Poll::Ready(status), + Err(_) => { + // There is either no status available or the guest is exiting. + Poll::Pending + } + } + } + + /// Polls the event to check if there is a new status ready to be consumed. + pub fn poll_exit(&self, waker: &Waker) -> Poll<()> { + // Register the waker *before* checking the status to avoid a race condition. 
+ self.exit_waker.register(waker); + + // Only reset the status if we're in a PTRACE_EVENT_EXIT. + // TODO: Think really hard and relax the ordering. + let status = self.status.compare_exchange( + PTRACE_EVENT_EXIT_STOP, + INVALID_STATUS, + Ordering::SeqCst, + Ordering::SeqCst, + ); + + match status { + Ok(_) => Poll::Ready(()), + Err(_) => Poll::Pending, + } + } +} + +fn spawn_worker(pid: Pid, event: Arc) -> JoinHandle<()> { + thread::Builder::new() + .name(format!("guest-{}", pid)) + .spawn(move || worker_thread(pid, event)) + .expect("failed to spawn thread") +} + +/// Waits on a process and returns the raw status. Returns `None` if the process +/// does not exist. +fn wait(pid: Pid) -> Option { + loop { + let result = waitid::waitpid(pid.into(), WaitPidFlag::WEXITED | WaitPidFlag::WSTOPPED); + + return match result { + Ok(status) => Some(status.unwrap()), + Err(Errno::EINTR) => continue, + Err(Errno::ECHILD) => None, + Err(err) => { + // No other errors should be possible because we handled EINTR + // and ECHILD. EINVAL only happens when using the API + // incorrectly. + panic!("waitid::waitpid({}) failed unexpectedly: {}", pid, err) + } + }; + } +} + +/// A worker thread that simply wakes a future when a process changes state. +fn worker_thread(pid: Pid, event: Arc) { + while let Some(status) = wait(pid) { + if let Some(old_status) = event.update(status) { + if status != PTRACE_EVENT_EXIT_STOP && !libc::WIFEXITED(status) { + panic!( + "Got unexpected event: Event {:?} replaced {:?}", + WaitStatus::from_raw(pid.into(), status), + WaitStatus::from_raw(pid.into(), old_status), + ); + } + } + + // Try to avoid reaching an ECHILD error by terminating the loop on the + // last event. + if libc::WIFEXITED(status) || libc::WIFSIGNALED(status) { + break; + } + } +} + +struct Notifier { + /// Mapping of pids to wakers. + pids: Mutex>>, +} + +impl Notifier { + /// Creates the notifier. 
+ pub fn new() -> Self { + let pids = Mutex::new(HashMap::new()); + Notifier { pids } + } + + /// Polls for a state change on the given PID. + pub fn poll_status(&self, pid: Pid, cx: &mut Context) -> Poll> { + // Check if there is a worker thread associated with this PID and create + // one if there isn't. + let mut pids = self.pids.lock(); + match pids.entry(pid) { + Entry::Occupied(mut occupied) => { + let status = futures::ready!(occupied.get_mut().poll_status(cx.waker())); + + // This should be the last event. We need to remove the PID from + // the map so the thread can be spawned again if the PID is ever + // reused. + if libc::WIFEXITED(status) || libc::WIFSIGNALED(status) { + occupied.remove(); + } + + Poll::Ready(Wait::from_raw(pid, status)) + } + Entry::Vacant(vacant) => { + // No thread exists for this yet. Create it. + // TODO: A potential optimization here is that we could call + // `try_wait` instead of spawning a new thread. + let event = Arc::new(Event::from_status_waker(cx.waker())); + vacant.insert(event.clone()); + spawn_worker(pid, event); + Poll::Pending + } + } + } + + /// Polls for an exit event on the given PID. + pub fn poll_exit(&self, pid: Pid, cx: &mut Context) -> Poll { + let mut pids = self.pids.lock(); + match pids.entry(pid) { + Entry::Occupied(mut occupied) => { + futures::ready!(occupied.get_mut().poll_exit(cx.waker())); + Poll::Ready(Stopped::new_unchecked(pid)) + } + Entry::Vacant(vacant) => { + // No thread exists for this yet. Create it. + let event = Arc::new(Event::from_exit_waker(cx.waker())); + vacant.insert(event.clone()); + spawn_worker(pid, event); + Poll::Pending + } + } + } +} + +impl Drop for Notifier { + fn drop(&mut self) { + // All guests should have exited by now. + let pids = self.pids.lock(); + assert_eq!( + pids.len(), + 0, + "Some tracees have not exited yet:\n{:#?}", + pids + ); + } +} + +/// A future representing a process state change. 
+pub struct WaitFuture(pub(super) Running); + +impl Future for WaitFuture { + type Output = Result; + + fn poll(self: Pin<&mut Self>, cx: &mut Context) -> Poll { + NOTIFIER.poll_status(self.0.pid(), cx) + } +} + +/// A future representing PTRACE_EVENT_EXIT. The future resolves when the process +/// receives a PTRACE_EVENT_EXIT. A process can receive this event at any time, +/// even when in another ptrace stop state. +/// +/// The next state after this should be the final exit status. +pub struct ExitFuture(pub(super) Pid); + +impl Future for ExitFuture { + type Output = Stopped; + + fn poll(self: Pin<&mut Self>, cx: &mut Context) -> Poll { + NOTIFIER.poll_exit(self.0, cx) + } +} + +#[cfg(test)] +mod test { + use super::*; + use nix::sys::signal::Signal; + use nix::sys::wait::WaitStatus; + use nix::unistd::Pid; + + #[test] + fn exit_event_code() { + assert_eq!( + WaitStatus::from_raw(Pid::from_raw(42), PTRACE_EVENT_EXIT_STOP), + Ok(WaitStatus::PtraceEvent( + Pid::from_raw(42), + Signal::SIGTRAP, + libc::PTRACE_EVENT_EXIT + )) + ); + } +} diff --git a/reverie-ptrace/src/trace/waitid.rs b/reverie-ptrace/src/trace/waitid.rs new file mode 100644 index 0000000..6b880fd --- /dev/null +++ b/reverie-ptrace/src/trace/waitid.rs @@ -0,0 +1,346 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Provide `waitid` which is based on `SYS_waitid` syscall. +//! `SYS_waitid` provide `WNOWAIT` flag which is absent in `SYS_waitpid`. +//! compare to `waitpid`, flags *must* be explicitly provided. +//! which could be a combination (bitwise-or) of `WEXITED`, `WSTOPPED`, +//! `WCONTINUED`, `WNOHANG` and `WNOWAIT`. see `waitid(2)` for more details. +//! NB: `waitid` here provide a similar interface as `nix`'s `waitpid`. 
+ +use super::Errno; +use nix::sys::{ + signal::Signal, + wait::{WaitPidFlag, WaitStatus}, +}; +use nix::unistd::Pid; + +use std::mem::MaybeUninit; +use std::os::unix::io::RawFd; + +#[derive(Clone, Copy, PartialEq, Eq)] +pub enum IdType { + Pid(Pid), + Pgid(Pid), + #[allow(unused)] + Pidfd(RawFd), // this requires linux kernel >= 5.4 + All, +} + +#[inline] +fn si_status_signal(info: &libc::siginfo_t) -> Signal { + let status = unsafe { info.si_status() }; + // The status can sometimes be 0 when using PTRACE_SEIZE, so we report a + // bogus SIGSTOP instead. + Signal::try_from(status & 0xff).unwrap_or(Signal::SIGSTOP) +} + +#[inline] +fn si_status_event(info: &libc::siginfo_t) -> i32 { + (unsafe { info.si_status() } >> 8) as i32 +} + +/// Returns the raw siginfo from a waitid call. +fn waitid_si(waitid_type: IdType, flags: WaitPidFlag) -> Result { + let mut siginfo = MaybeUninit::::zeroed(); + let siginfo_ptr: *mut libc::siginfo_t = siginfo.as_mut_ptr(); + + let (id_type, pid_or_pidfd) = match waitid_type { + IdType::Pid(pid) => (libc::P_PID, pid.as_raw()), + IdType::Pgid(pid) => (libc::P_PGID, pid.as_raw()), + IdType::Pidfd(raw_fd) => (libc::P_PIDFD, raw_fd), + IdType::All => (libc::P_ALL, -1), + }; + + Errno::result(unsafe { + libc::waitid( + id_type, + pid_or_pidfd as libc::id_t, + siginfo_ptr, + flags.bits(), + ) + })?; + + Ok(unsafe { siginfo.assume_init() }) +} + +/// `waitpid` implemented with `waitid`. `waitid` has fewer limitations than `waitpid`. +pub fn waitpid(pid: Pid, flags: WaitPidFlag) -> Result, Errno> { + let si = waitid_si(IdType::Pid(pid), flags)?; + + if unsafe { si.si_pid() } == 0 { + // Still alive. + return Ok(None); + } + + Ok(Some(siginfo_to_status(si))) +} + +// Converts a siginfo to a more compact status code. 
+fn siginfo_to_status(si: libc::siginfo_t) -> i32 { + let si_status = unsafe { si.si_status() }; + + let status = match si.si_code { + libc::CLD_EXITED => si_status << 8, + libc::CLD_KILLED => si_status & 0x7f, + libc::CLD_DUMPED => (si_status | 0x80) & 0xff, + libc::CLD_TRAPPED => (si_status << 8) | 0x7f, + libc::CLD_STOPPED => si_status << 8, + libc::CLD_CONTINUED => 0xffff, + other => panic!("unexpected si_code: {}", other), + }; + + debug_assert_eq!( + siginfo_to_waitstatus(si), + WaitStatus::from_raw(Pid::from_raw(unsafe { si.si_pid() }), status).unwrap() + ); + + status +} + +fn siginfo_to_waitstatus(si: libc::siginfo_t) -> WaitStatus { + let pid = Pid::from_raw(unsafe { si.si_pid() }); + match si.si_code { + libc::CLD_EXITED => WaitStatus::Exited(pid, unsafe { si.si_status() }), + libc::CLD_KILLED => WaitStatus::Signaled(pid, si_status_signal(&si), false), + libc::CLD_DUMPED => WaitStatus::Signaled(pid, si_status_signal(&si), true), + libc::CLD_STOPPED => WaitStatus::Stopped(pid, si_status_signal(&si)), + libc::CLD_TRAPPED if unsafe { si.si_status() } == 0x80 | Signal::SIGTRAP as i32 => { + WaitStatus::PtraceSyscall(pid) + } + libc::CLD_TRAPPED => { + let trap_sig = si_status_signal(&si); + let event = si_status_event(&si); + if event == 0 { + // could return SIGSTOP here for initial ptrace stop + // right after clone/fork/vfork event. + WaitStatus::Stopped(pid, trap_sig) + } else { + WaitStatus::PtraceEvent(pid, trap_sig, event) + } + } + libc::CLD_CONTINUED => WaitStatus::Continued(pid), + bad_si_code => panic!("unexpected si_code {} from siginfo_t", bad_si_code), + } +} + +/// waitid as to SYS_waitid. +/// return +/// - Err when syscall returns -1. +/// - OK(WaitStatus::StillAlive) when no state change +/// - OK(WaitStatus::...) when state has changed. 
+pub fn waitid(waitid_type: IdType, flags: WaitPidFlag) -> Result { + let siginfo = waitid_si(waitid_type, flags)?; + + if unsafe { siginfo.si_pid() } == 0 { + Ok(WaitStatus::StillAlive) + } else { + Ok(siginfo_to_waitstatus(siginfo)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use nix::{ + sys::signal::Signal, + sys::wait::WaitPidFlag, + unistd::{self, ForkResult}, + }; + + #[test] + fn waitid_w_exited_0() { + let fork_result = unsafe { unistd::fork() }; + assert!(fork_result.is_ok()); + match fork_result.unwrap() { + ForkResult::Parent { child, .. } => { + assert_eq!( + waitid(IdType::Pid(child), WaitPidFlag::WEXITED), + Ok(WaitStatus::Exited(child, 0)) + ); + } + ForkResult::Child => { + let hundred_millies = std::time::Duration::from_millis(100); + std::thread::sleep(hundred_millies); + unsafe { + libc::syscall(libc::SYS_exit_group, 0) + }; + } + } + } + + #[test] + fn waitid_w_exited_1() { + let fork_result = unsafe { unistd::fork() }; + assert!(fork_result.is_ok()); + match fork_result.unwrap() { + ForkResult::Parent { child, .. } => { + assert_eq!( + waitid(IdType::Pid(child), WaitPidFlag::WEXITED), + Ok(WaitStatus::Exited(child, 1)) + ); + } + ForkResult::Child => { + let hundred_millies = std::time::Duration::from_millis(100); + std::thread::sleep(hundred_millies); + unsafe { + libc::syscall(libc::SYS_exit_group, 1) + }; + } + } + } + + #[test] + fn waitid_w_killed_by_signal() { + let fork_result = unsafe { unistd::fork() }; + assert!(fork_result.is_ok()); + match fork_result.unwrap() { + ForkResult::Parent { child, .. 
} => { + assert!(nix::sys::signal::kill(child, Signal::SIGINT).is_ok()); + assert_eq!( + waitid(IdType::Pid(child), WaitPidFlag::WEXITED), + Ok(WaitStatus::Signaled(child, Signal::SIGINT, false)) + ); + } + ForkResult::Child => { + let one_sec = std::time::Duration::from_millis(1000); + loop { + std::thread::sleep(one_sec); + } + } + } + } + + #[test] + fn waitid_w_exited_no_wait_then_wait() { + let fork_result = unsafe { unistd::fork() }; + assert!(fork_result.is_ok()); + match fork_result.unwrap() { + ForkResult::Parent { child, .. } => { + assert_eq!( + waitid( + IdType::Pid(child), + WaitPidFlag::WEXITED | WaitPidFlag::WNOWAIT + ), + Ok(WaitStatus::Exited(child, 0)) + ); + assert_eq!( + waitid(IdType::Pid(child), WaitPidFlag::WEXITED), + Ok(WaitStatus::Exited(child, 0)) + ); + } + ForkResult::Child => { + let hundred_millies = std::time::Duration::from_millis(100); + std::thread::sleep(hundred_millies); + unsafe { + libc::syscall(libc::SYS_exit_group, 0) + }; + } + } + } + + #[test] + fn waitid_w_exited_then_echild() { + let fork_result = unsafe { unistd::fork() }; + assert!(fork_result.is_ok()); + match fork_result.unwrap() { + ForkResult::Parent { child, .. } => { + assert_eq!( + waitid(IdType::Pid(child), WaitPidFlag::WEXITED), + Ok(WaitStatus::Exited(child, 0)) + ); + assert_eq!( + waitid(IdType::Pid(child), WaitPidFlag::WEXITED), + Err(Errno::ECHILD) + ); + } + ForkResult::Child => { + let hundred_millies = std::time::Duration::from_millis(100); + std::thread::sleep(hundred_millies); + unsafe { + libc::syscall(libc::SYS_exit_group, 0) + }; + } + } + } + + #[test] + fn waitid_w_nohang_then_kill() { + let fork_result = unsafe { unistd::fork() }; + assert!(fork_result.is_ok()); + match fork_result.unwrap() { + ForkResult::Parent { child, .. 
} => { + assert_eq!( + waitid( + IdType::Pid(child), + WaitPidFlag::WEXITED | WaitPidFlag::WNOHANG + ), + Ok(WaitStatus::StillAlive), + ); + assert!(nix::sys::signal::kill(child, Signal::SIGINT).is_ok()); + assert_eq!( + waitid(IdType::Pid(child), WaitPidFlag::WEXITED), + Ok(WaitStatus::Signaled(child, Signal::SIGINT, false)) + ); + } + ForkResult::Child => { + let one_sec = std::time::Duration::from_millis(100); + loop { + std::thread::sleep(one_sec); + } + } + } + } + + #[test] + fn waitid_w_nohang_kill_nohang_nowait_wait() { + let fork_result = unsafe { unistd::fork() }; + assert!(fork_result.is_ok()); + match fork_result.unwrap() { + ForkResult::Parent { child, .. } => { + assert_eq!( + waitid( + IdType::Pid(child), + WaitPidFlag::WEXITED | WaitPidFlag::WNOHANG + ), + Ok(WaitStatus::StillAlive), + ); + assert!(nix::sys::signal::kill(child, Signal::SIGINT).is_ok()); + loop { + // this is not very ideal, the loops generally runs 1K - 10K times.. + let status = waitid( + IdType::Pid(child), + WaitPidFlag::WEXITED | WaitPidFlag::WNOHANG | WaitPidFlag::WNOWAIT, + ); + assert!(status.is_ok()); + match status.unwrap() { + WaitStatus::StillAlive => {} + waitid_nohang_nowait => { + assert_eq!( + waitid_nohang_nowait, + WaitStatus::Signaled(child, Signal::SIGINT, false) + ); + break; + } + } + } + assert_eq!( + waitid(IdType::Pid(child), WaitPidFlag::WEXITED), + Ok(WaitStatus::Signaled(child, Signal::SIGINT, false)) + ); + } + ForkResult::Child => { + let one_sec = std::time::Duration::from_millis(100); + loop { + std::thread::sleep(one_sec); + } + } + } + } +} diff --git a/reverie-ptrace/src/tracer.rs b/reverie-ptrace/src/tracer.rs new file mode 100644 index 0000000..bdcc66b --- /dev/null +++ b/reverie-ptrace/src/tracer.rs @@ -0,0 +1,640 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +//! `Tracer` type, plus ways to spawn it and retrieve its output. + +use crate::cp; +use crate::gdbstub::GdbServer; +use crate::task::{Child, TracedTask}; +use crate::trace::{self, Error as TraceError, Event, Running, Stopped}; + +use anyhow::Context; +use futures::{ + future::{self, BoxFuture, Either}, + stream::StreamExt, +}; +use nix::{ + sys::{ + ptrace, + signal::{self, Signal}, + }, + unistd::{self, ForkResult}, +}; +use tokio::sync::{broadcast, mpsc}; + +use reverie::process::seccomp; +use reverie::process::{ChildStderr, ChildStdin, ChildStdout, Command, Output}; +use reverie::syscalls::Sysno; +use reverie::Pid; +use reverie::{Errno, Error, ExitStatus, GlobalTool, Subscription, Tool}; + +use std::io::Write; +use std::net::SocketAddr; +use std::path::PathBuf; +use std::sync::Arc; + +/// Represents the tracer. +/// +/// We need to simultaneously capture stderr/stdout while handling events. These +/// can be two separate futures. The stderr/stdout future will finish when the +/// pipes are closed. +/// +/// The stderr/stdout capture can be a `Stream>` +/// where each item is either a chunk of stderr bytes or stdout bytes. Zipping +/// together the two streams like this preserves ordering. +pub struct Tracer { + /// PID of the root guest process. + guest_pid: Pid, + + // Future of the running handler. + tracer: BoxFuture<'static, Result>, + + // A reference to the global state. + gref: Arc, + + stdin: Option, + stdout: Option, + stderr: Option, +} + +impl Tracer { + /// Returns the PID of the root guest process. + pub fn guest_pid(&self) -> Pid { + self.guest_pid + } + + /// Simultaneously waits for the tracee to exit and collect all remaining + /// output on the stdout/stderr handles, returning an `Output` instance. + /// + /// The stdin handle to the child process, if any, will be closed before + /// waiting. 
This helps avoid deadlock: it ensures that the child does not + /// block waiting for input from the parent, while the parent waits for the + /// child to exit. + /// + /// By default, stdin, stdout and stderr are inherited from the parent. In + /// order to capture the output it is necessary to create new pipes between + /// parent and child. Use `stdout(Stdio::piped())` or + /// `stderr(Stdio::piped())`, respectively. + pub async fn wait_with_output(mut self) -> Result<(Output, G), Error> { + use tokio::io::{AsyncRead, AsyncReadExt}; + + async fn read_to_end(io: Option) -> Result, Error> { + let mut vec = Vec::new(); + if let Some(mut io) = io { + io.read_to_end(&mut vec).await?; + } + Ok(vec) + } + + drop(self.stdin.take()); + + let stdout = read_to_end(self.stdout.take()); + let stderr = read_to_end(self.stderr.take()); + + let ((status, state), stdout, stderr) = + future::try_join3(self.wait(), stdout, stderr).await?; + + Ok(( + Output { + status, + stdout, + stderr, + }, + state, + )) + } + + /// Waits for the tracee to exit and returns its exit status and global + /// state. + pub async fn wait(self) -> Result<(ExitStatus, G), Error> { + // Note: The usage of LocalSet is *very* important here. Once polled, + // the `tracer` future drives all tracees to completion. The `fork` for + // the root tracee and all subsequent ptrace operations *MUST* be done + // on the same thread. Thus, we use `LocalSet` in combination with + // `tokio::task::spawn_local` to ensure that everything happens on the + // same thread. Otherwise, ptrace operations will start returning + // `ESRCH` errors and they will be (incorrectly) interpreted to mean + // that the tracee has died unexpectedly. + let local_set = tokio::task::LocalSet::new(); + let exit_status = local_set.run_until(self.tracer).await?; + + let g = Arc::try_unwrap(self.gref).unwrap_or_else(|_| { + panic!("Reverie internal invariant broken. 
Arc::try_unwrap on global state failed.") + }); + + Ok((exit_status, g)) + } +} + +fn from_nix_error(err: nix::Error) -> Errno { + Errno::new(err as i32) +} + +/// Sets up the child process for ptracing right before execve is called. +fn init_tracee(intercept_rdtsc: bool) -> Result<(), Errno> { + // NOTE: There should be *NO* allocations along the happy path here. + // Allocating between a fork() and execve() can cause deadlocks in glibc + // when using jemalloc. + + // hardcoded because `libc` does not export these. + const PER_LINUX: u64 = 0x0; + const ADDR_NO_RANDOMIZE: u64 = 0x0004_0000; + + if intercept_rdtsc { + unsafe { + assert_eq!( + libc::prctl(libc::PR_SET_TSC, libc::PR_TSC_SIGSEGV, 0, 0, 0), + 0 + ) + }; + } + + unsafe { + assert!(libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == 0); + assert!(libc::personality(PER_LINUX | ADDR_NO_RANDOMIZE) != -1); + } + + // FIXME: This is a hacky workaround for `std::process::Command::spawn` + // getting stuck in a deadlock because of the SIGSTOP below. + // `Command::spawn` uses a pipe to communicate the error code to the parent + // process if the `execve` fails. The idea is that the write end of the pipe + // will be closed upon a successful call to `execve` and the parent will + // abort the blocking read on the read end of the pipe. We don't know + // exactly which file descriptor the pipe uses, so we attempt to close the + // first N file descriptors hoping it is among those. Unfortunately, in + // doing so, we lose the ability to capture `execve` failures. + // + // There are a couple options for a better implementation: + // 1. Recreate the entire `std::process` module to provide better ptrace + // support. (A lot of work!) + // 2. Don't raise a SIGSTOP, but instead let the ptracer stop on the call to + // `execve` and have the parent set the ptrace options at that point. 
+ for i in 3..256 { + unsafe { + libc::close(i); + } + } + + trace::traceme_and_stop()?; + + unsafe { + signal::sigaction( + signal::SIGTTIN, + &signal::SigAction::new( + signal::SigHandler::SigIgn, + signal::SaFlags::SA_RESTART, + signal::SigSet::empty(), + ), + ) + .map_err(from_nix_error)?; + + signal::sigaction( + signal::SIGTTOU, + &signal::SigAction::new( + signal::SigHandler::SigIgn, + signal::SaFlags::SA_RESTART, + signal::SigSet::empty(), + ), + ) + .map_err(from_nix_error)?; + } + + Ok(()) +} + +async fn run_orphaned(orphans: mpsc::Receiver) { + tokio_stream::wrappers::ReceiverStream::new(orphans) + .for_each_concurrent(None, |orphan| async { + let pid = orphan.id(); + let mut daemonizer = orphan.daemonizer_rx.unwrap(); + + let daemonizer = daemonizer.recv(); + futures::pin_mut!(daemonizer); + + match future::select(Box::pin(orphan.handle), daemonizer).await { + Either::Left((exit_status, _)) => { + tracing::debug!( + "[reverie] Orphan {} exited with status {:?}", + pid, + exit_status + ); + } + Either::Right((kill_switch, handle)) => { + tracing::debug!("[reverie] pid {} daemonized", pid); + if let Some(mut kill_switch) = kill_switch { + let kill_switch = kill_switch.recv(); + futures::pin_mut!(kill_switch); + match future::select(Box::pin(handle), kill_switch).await { + Either::Left((exit_status, _)) => { + tracing::debug!( + "[reverie] Daemon {} exited with status {:?}", + pid, + exit_status + ); + } + Either::Right((_, handle)) => { + tracing::debug!("sending sigkill {}", pid); + unsafe { + libc::kill(pid.as_raw(), libc::SIGKILL); + } + let status = handle.await; + tracing::debug!( + "[reverie] Daemon {} exited with status {:?}", + pid, + status + ); + } + } + } + } + } + }) + .await; +} + +/// Runs the task tree to completion and returns the exit status of the root +/// task. 
+async fn run_task_tree( + root: TracedTask, + child: Stopped, + orphanage: mpsc::Receiver, +) -> Result { + future::join( + // Run the root task to completion + root.run(child), + // ...and wait for all orphans simultaneously. + run_orphaned(orphanage), + ) + .await + .0 +} + +/// Helper function for everything after the child is spawned. +async fn postspawn( + child: Running, + gref: Arc, + config: ::Config, + events: &Subscription, + gdbserver: Option, +) -> Result>, TraceError> { + let pid = child.pid(); + + // Wait for the child to enter a stopped state. The child will enter a + // stopped state immediately after ptrace::traceme is called. + // + // NOTE: We may rarely get spurious signals here, like SIGWINCH, so we must + // skip past them. + let (mut child, event) = child + .wait_for_signal(Signal::SIGSTOP) + .await? + .assume_stopped(); + assert_eq!(event, Event::Signal(Signal::SIGSTOP)); + + child.setoptions( + ptrace::Options::PTRACE_O_TRACEEXEC + | ptrace::Options::PTRACE_O_EXITKILL + | ptrace::Options::PTRACE_O_TRACECLONE + | ptrace::Options::PTRACE_O_TRACEFORK + | ptrace::Options::PTRACE_O_TRACEVFORK + | ptrace::Options::PTRACE_O_TRACEVFORKDONE + | ptrace::Options::PTRACE_O_TRACEEXIT + | ptrace::Options::PTRACE_O_TRACESECCOMP + | ptrace::Options::PTRACE_O_TRACESYSGOOD, + )?; + + let (orphan_sender, orphan_receiver) = mpsc::channel(1); + let (daemon_kill, _) = broadcast::channel(1); + + // This is the root task, so there's no reason to make run its init routine + // asynchronously, as there isn't any other work to do. + let mut tracer = TracedTask::::new( + pid, + config, + gref, + events, + orphan_sender, + daemon_kill, + gdbserver, + ); + + child = tracer.tracee_preinit(child).await?; + + let tracer = Box::pin(run_task_tree(tracer, child, orphan_receiver)); + Ok(tracer) +} + +/// Creates the seccomp filter. This lets us control which syscalls are traced +/// and which ones are allowed through. 
+fn seccomp_filter(events: &Subscription) -> seccomp::Filter { + use reverie::process::seccomp::Action; + + seccomp::FilterBuilder::new() + // By default, all syscalls are allowed through untraced. Then, we can + // intercept only the syscalls we are interested in. + .default_action(Action::Allow) + .syscalls( + events + .iter_syscalls() + .map(|syscall| (syscall, Action::Trace(0))), + ) + // Always allow these syscalls to pass through untraced. + .syscall(Sysno::restart_syscall, Action::Allow) + .syscall(Sysno::rt_sigreturn, Action::Allow) + // Allow untraced syscalls through without tracing them. + // NOTE: 2 is the length of a syscall instruction (0x0f 0x05) and we + // want to allow the ud2 instruction immediately following it. + .ip_range( + cp::TRAMPOLINE_BASE + 2, + cp::TRAMPOLINE_BASE + 3, + Action::Allow, + ) + .build() +} + +/// Specifies *how* the GDB server should listen for incoming connections. +pub enum GdbConnection { + /// The server shall bind to and listen on the given socket address. + Addr(SocketAddr), + + /// The server shall bind to and listen on the given unix domain socket. This + /// path must not exist, otherwise the bind will fail with `EADDRINUSE`. + Path(PathBuf), +} + +impl From for GdbConnection { + fn from(addr: SocketAddr) -> Self { + Self::Addr(addr) + } +} + +impl From for GdbConnection { + fn from(path: PathBuf) -> Self { + Self::Path(path) + } +} + +impl From for GdbConnection { + fn from(port: u16) -> Self { + Self::Addr(([127, 0, 0, 1], port).into()) + } +} + +/// A builder for creating a tracer. +pub struct TracerBuilder { + /// The program to execute that will be traced. + command: Command, + + /// The global state static config. + config: Option<::Config>, + + /// Set to `Some` if we should spawn a GDB server. + gdbserver: Option, + + /// Indicates that the guest's scheduling will be serialized by the Reverie + /// tool. This is only relevant for the GDB server. 
+ sequentialized_guest: bool, +} + +impl TracerBuilder { + /// Creates the builder with the given command. + pub fn new(command: Command) -> Self { + Self { + command, + config: None, + gdbserver: None, + sequentialized_guest: false, + } + } + + /// Sets the static configuration that will be made available to the tool. + pub fn config(mut self, config: ::Config) -> Self { + self.config = Some(config); + self + } + + /// Configures the tracer to create a GDB server and listen for incoming + /// connections. The tracer will start in a stopped state and will not + /// proceed until a connection is made. This allows the GDB client to observe + /// the full execution of the guest. + pub fn gdbserver>(mut self, connection: C) -> Self { + self.gdbserver = Some(connection.into()); + self + } + + /// Make the GDB server aware that guest threads are sequentialized. This is + /// needed when the Reverie tool has full control of scheduling and already + /// sequentializes thread execution. This helps avoid deadlocks. + pub fn sequentialized_guest(mut self) -> Self { + self.sequentialized_guest = true; + self + } + + /// Spawns the tracer. + pub async fn spawn(self) -> Result, Error> { + let mut command = self.command; + let config = self.config.unwrap_or_default(); + + // Because this ptrace backend is CENTRALIZED, it can keep all the + // tool's state here in a single address space. + let global_state = ::init_global_state(&config).await; + let events = T::subscriptions(&config); + let gref = Arc::new(global_state); + + // Get the full path to the program and change the command to use it. This + // also checks that the path exists and provides an early exit just in case + // it doesn't. + // + // Normally, we'd rely upon the `exit(1)` following a failed call to + // `execve`, but that is tricky when ptracing the `execve` call. 
+ let program = command + .find_program() + .with_context(|| format!("Could not execute {:?}", command.get_program()))?; + command.program(program); + + // Disable sanitizers that use ptrace from running on tracer. + command.env("LSAN_OPTIONS", "detect_leaks=0"); + command.env("ASAN_OPTIONS", "detect_leaks=0"); + + let intercept_rdtsc = events.has_rdtsc(); + unsafe { + command.pre_exec(move || init_tracee(intercept_rdtsc)); + } + + command.seccomp(seccomp_filter(&events)); + + let mut child = command.spawn().context("Failed to spawn tracee")?; + let guest_pid = child.id(); + let running_child = Running::new(guest_pid); + + // Configure the gdb server (if any). + let gdbserver = match self.gdbserver { + None => None, + Some(connection) => { + let server = match connection { + GdbConnection::Addr(addr) => GdbServer::from_addr(addr).await, + GdbConnection::Path(path) => GdbServer::from_path(&path).await, + }; + + // FIXME: Don't panic. Return an error here instead. + let mut server = server.unwrap(); + + if self.sequentialized_guest { + server.sequentialized_guest(); + } + + Some(server) + } + }; + + let tracer = + match postspawn::(running_child, gref.clone(), config, &events, gdbserver).await { + Ok(tracer) => tracer, + Err(TraceError::Errno(err)) => return Err(Error::Errno(err)), + Err(TraceError::Died(zombie)) => panic!( + "tracee {} died unexpectedly during initialization", + zombie.pid() + ), + }; + + let stdin = child.stdin.take(); + let stdout = child.stdout.take(); + let stderr = child.stderr.take(); + + // Don't let the drop logic run for the child. Tokio will add the child to a + // "orphan queue" that will try to call `waitpid` on the process when a + // `SIGCHLD` signal is received. This interferes with our own process + // handling where we need full control over the lifetime of the child + // process. 
+ core::mem::forget(child); + + Ok(Tracer { + guest_pid, + tracer, + gref, + stdin, + stdout, + stderr, + }) + } +} + +/// Spawn a *function* to be executed under instrumentation +/// (rather than a subprocess indicated with a Command). +/// +/// This still creates a fresh child process and runs it under ptrace. However, +/// the child process is a fork of the current process, and is used to run the +/// indicated function. +pub async fn spawn_fn(fun: F) -> Result, Error> +where + L: Tool + 'static, + F: FnOnce(), +{ + spawn_fn_with_config::(fun, Default::default(), true).await +} + +/// Spawn a function with instrumentation rather than a subprocess indicated with +/// a Command. This still creates a fresh child process and runs it under ptrace. +/// However, the child process is a fork of the current process, and is used to +/// run the indicated function. +/// +/// The main use case for this entrypoint into the library is testing. +pub async fn spawn_fn_with_config( + fun: F, + config: ::Config, + capture_output: bool, +) -> Result, Error> +where + L: Tool + 'static, + F: FnOnce(), +{ + use std::os::unix::io::FromRawFd; + + // Because this ptrace backend is CENTRALIZED, it can keep all the + // tool's state here in a single address space. + let global_state = ::init_global_state(&config).await; + let events = L::subscriptions(&config); + let gref = Arc::new(global_state); + + let seccomp_filter = seccomp_filter(&events); + + let (read1, write1) = unistd::pipe().map_err(from_nix_error)?; + let (read2, write2) = unistd::pipe().map_err(from_nix_error)?; + + // Disable io redirection just before forking. We want the child process to + // be able to call `println!()` and have that output go to stdout. + // + // See: https://github.com/rust-lang/rust/issues/35136 + let output_capture = std::io::set_output_capture(None); + + // Warning: fork is wildly unsafe in Rust because of runtime issues (printing, + // panicking, etc).
We make a best-effort attempt to solve some of these issues. + match unsafe { unistd::fork() }.expect("unistd::fork failed") { + ForkResult::Child => { + unistd::close(read1) + .and_then(|_| unistd::close(read2)) + .map_err(from_nix_error)?; + if capture_output { + unistd::dup2(write1, 1) + .and_then(|_| unistd::dup2(write2, 2)) + .and_then(|_| unistd::close(write1)) + .and_then(|_| unistd::close(write2)) + .map_err(from_nix_error)?; + } + + init_tracee(events.has_rdtsc()).expect("init_tracee failed"); + + seccomp_filter.load().expect("Failed to set seccomp filter"); + + match std::panic::catch_unwind(std::panic::AssertUnwindSafe(fun)) { + Ok(()) => { + std::io::stdout().flush()?; + std::process::exit(0); + } + Err(e) => { + std::io::stdout().flush()?; + let _ = nix::unistd::write( + 2, + format!("Forked Rust process panicked, cause: {:?}", e).as_ref(), + ); + std::process::exit(1); + } + }; + } + ForkResult::Parent { child } => { + std::io::set_output_capture(output_capture); + + let guest_pid = Pid::from(child); + let child = Running::new(guest_pid); + unistd::close(write1) + .and_then(|_| unistd::close(write2)) + .map_err(from_nix_error) + .unwrap(); + + let stdout = unsafe { ChildStdout::from_raw_fd(read1) }; + let stderr = unsafe { ChildStderr::from_raw_fd(read2) }; + let tracer = match postspawn::(child, gref.clone(), config, &events, None).await { + Ok(tracer) => tracer, + Err(TraceError::Errno(err)) => return Err(Error::Errno(err)), + Err(TraceError::Died(zombie)) => panic!( + "tracee {} died unexpectedly during initialization", + zombie.pid() + ), + }; + + Ok(Tracer { + guest_pid, + tracer, + gref, + stdin: None, + stdout: Some(stdout), + stderr: Some(stderr), + }) + } + } +} diff --git a/reverie-ptrace/src/vdso.rs b/reverie-ptrace/src/vdso.rs new file mode 100644 index 0000000..16598a2 --- /dev/null +++ b/reverie-ptrace/src/vdso.rs @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2018-2019, Trustees of Indiana University + * ("University Works" via Baojun Wang) + 
* Copyright (c) 2018-2019, Ryan Newton + * ("Traditional Works of Scholarship") + * Copyright (c) 2020-, Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! provide APIs to disable VDSOs at runtime. +use goblin::elf::Elf; + +use lazy_static::lazy_static; +use nix::{sys::mman::ProtFlags, unistd}; +use reverie::{ + syscalls::{AddrMut, MemoryAccess, Mprotect}, + Error, Guest, Tool, +}; +use std::collections::HashMap; +use tracing::debug; + +/* + * byte code for the new pseudo vdso functions + * which do the actual syscalls. + * NB: the byte code must be 8 bytes + * aligned + */ + +#[allow(non_upper_case_globals)] +const __vdso_time: &[u8] = &[ + 0xb8, 0xc9, 0x0, 0x0, 0x0, // mov %SYS_time, %eax + 0x0f, 0x05, // syscall + 0xc3, // retq + 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, // nopl 0x0(%rax, %rax, 1) + 0x00, +]; + +#[allow(non_upper_case_globals)] +const __vdso_clock_gettime: &[u8] = &[ + 0xb8, 0xe4, 0x00, 0x00, 0x00, // mov SYS_clock_gettime, %eax + 0x0f, 0x05, // syscall + 0xc3, // retq + 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, // nopl 0x0(%rax, %rax, 1) + 0x00, +]; + +#[allow(non_upper_case_globals)] +const __vdso_getcpu: &[u8] = &[ + 0x48, 0x85, 0xff, // test %rdi, %rdi + 0x74, 0x06, // je .. + 0xc7, 0x07, 0x00, 0x00, 0x00, 0x00, // movl $0x0, (%rdi) + 0x48, 0x85, 0xf6, // test %rsi, %rsi + 0x74, 0x06, // je ..
+ 0xc7, 0x06, 0x00, 0x00, 0x00, 0x00, // movl $0x0, (%rsi) + 0x31, 0xc0, // xor %eax, %eax + 0xc3, // retq + 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00, +]; // nopl 0x0(%rax) + +#[allow(non_upper_case_globals)] +const __vdso_gettimeofday: &[u8] = &[ + 0xb8, 0x60, 0x00, 0x00, 0x00, // mov SYS_gettimeofday, %eax + 0x0f, 0x05, // syscall + 0xc3, // retq + 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, // nopl 0x0(%rax, %rax, 1) + 0x00, +]; + +const VDSO_SYMBOLS: &[&str] = &[ + "__vdso_time", + "__vdso_clock_gettime", + "__vdso_getcpu", + "__vdso_gettimeofday", +]; + +lazy_static! { + static ref VDSO_PATCH_INFO: HashMap = { + let info = vdso_get_symbols_info(); + let mut res: HashMap = HashMap::new(); + let funcs = &[ + __vdso_time, + __vdso_clock_gettime, + __vdso_getcpu, + __vdso_gettimeofday, + ]; + VDSO_SYMBOLS.iter().zip(funcs).for_each(|(k, v)| { + let name = String::from(*k); + if let Some(&(base, size)) = info.get(&name) { + assert!(v.len() <= size); + res.insert(String::from(*k), (base, size, v)); + } + }); + res + }; +} + +// get vdso symbols offset/size from current process +// assuming vdso binary is the same for all processes +// so that we don't have to decode vdso for each process +fn vdso_get_symbols_info() -> HashMap { + let mut res: HashMap = HashMap::new(); + procfs::process::Process::new(unistd::getpid().as_raw()) + .and_then(|p| p.maps()) + .unwrap_or_else(|_| Vec::new()) + .iter() + .find(|e| e.pathname == procfs::process::MMapPath::Vdso) + .and_then(|vdso| { + let slice = unsafe { + std::slice::from_raw_parts( + vdso.address.0 as *mut u8, + (vdso.address.1 - vdso.address.0) as usize, + ) + }; + Elf::parse(slice) + .map(|elf| { + let strtab = elf.dynstrtab; + elf.dynsyms.iter().for_each(|sym| { + let sym_name = &strtab[sym.st_name]; + if VDSO_SYMBOLS.contains(&sym_name) { + debug_assert!(sym.is_function()); + res.insert( + String::from(sym_name), + (sym.st_value, sym.st_size as usize), + ); + } + }); + }) + .ok() + }); + res +} + +/// patch VDSOs when 
enabled +/// +/// `guest` must be in one of ptrace's stopped states. +pub async fn vdso_patch(guest: &mut G) -> Result<(), Error> +where + G: Guest, + T: Tool, +{ + if let Some(vdso) = procfs::process::Process::new(guest.pid().as_raw()) + .and_then(|p| p.maps()) + .unwrap_or_else(|_| Vec::new()) + .iter() + .find(|e| e.pathname == procfs::process::MMapPath::Vdso) + { + let mut memory = guest.memory(); + + // Allow write access to the vdso memory page. + guest + .inject_with_retry( + Mprotect::new() + .with_addr(AddrMut::from_raw(vdso.address.0 as usize)) + .with_len((vdso.address.1 - vdso.address.0) as usize) + .with_protection( + ProtFlags::PROT_READ | ProtFlags::PROT_WRITE | ProtFlags::PROT_EXEC, + ), + ) + .await?; + + for (name, (offset, size, bytes)) in VDSO_PATCH_INFO.iter() { + let start = vdso.address.0 + offset; + assert!(bytes.len() <= *size); + let rptr = AddrMut::from_raw(start as usize).unwrap(); + memory.write_exact(rptr, bytes)?; + assert!(*size >= bytes.len()); + if *size > bytes.len() { + let fill: Vec = std::iter::repeat(0x90u8).take(size - bytes.len()).collect(); + memory.write_exact(unsafe { rptr.add(bytes.len()) }, &fill)?; + } + debug!("{} patched {}@{:x}", guest.pid(), name, start); + } + + guest + .inject_with_retry( + Mprotect::new() + .with_addr(AddrMut::from_raw(vdso.address.0 as usize)) + .with_len((vdso.address.1 - vdso.address.0) as usize) + .with_protection(ProtFlags::PROT_READ | ProtFlags::PROT_EXEC), + ) + .await?; + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn can_find_vdso() { + assert!( + procfs::process::Process::new(unistd::getpid().as_raw()) + .and_then(|p| p.maps()) + .unwrap_or_else(|_| Vec::new()) + .iter() + .any(|e| e.pathname == procfs::process::MMapPath::Vdso) + ); + } + + #[test] + fn vdso_can_find_symbols_info() { + assert!(!vdso_get_symbols_info().is_empty()); + } + + #[test] + fn vdso_patch_info_is_valid() { + let info = &VDSO_PATCH_INFO; + info.iter().for_each(|i| println!("info: 
{:x?}", i)); + assert!(!info.is_empty()); + } +} diff --git a/reverie-syscalls/Cargo.toml b/reverie-syscalls/Cargo.toml new file mode 100644 index 0000000..4e622cc --- /dev/null +++ b/reverie-syscalls/Cargo.toml @@ -0,0 +1,17 @@ +# @generated by autocargo + +[package] +name = "reverie-syscalls" +version = "0.1.0" +authors = ["Facebook"] +edition = "2021" +license = "BSD-2-Clause" + +[dependencies] +bitflags = "1.3" +derive_more = "0.99.3" +libc = "0.2.98" +nix = "0.22" +paste = "1.0" +serde = { version = "1.0.126", features = ["derive", "rc"] } +syscalls = { version = "0.4.2", features = ["with-serde"] } diff --git a/reverie-syscalls/src/args/fcntl.rs b/reverie-syscalls/src/args/fcntl.rs new file mode 100644 index 0000000..8bd8228 --- /dev/null +++ b/reverie-syscalls/src/args/fcntl.rs @@ -0,0 +1,77 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use std::os::unix::io::RawFd; + +use super::{Addr, Pid}; +use crate::FromToRaw; + +// TODO: Upstream this struct to libc crate. +#[repr(C)] +#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd)] +pub struct f_owner_ex { + typ: libc::c_int, + pid: libc::pid_t, +} + +command_enum! { + /// A `fcntl` command paired with its argument. 
+ pub enum FcntlCmd<'a>: libc::c_int { + F_DUPFD(RawFd) = 0, + F_GETFD = 1, + F_SETFD(RawFd) = 2, + F_GETFL = 3, + F_SETFL(i32) = 4, + F_GETLK(Option>) = 5, + F_SETLK(Option>) = 6, + F_SETLKW(Option>) = 7, + F_SETOWN = 8, + F_GETOWN(Pid) = 9, + F_SETSIG(i32) = 10, + F_GETSIG = 11, + F_GETLK64(Option>) = 12, + F_SETLK64(Option>) = 13, + F_SETLKW64(Option>) = 14, + F_SETOWN_EX(Option>) = 15, + F_GETOWN_EX(Option>) = 16, + F_GETOWNER_UIDS = 17, + + F_OFD_GETLK(Option>) = 36, + F_OFD_SETLK(Option>) = 37, + F_OFD_SETLKW(Option>) = 38, + + F_SETLEASE(i32) = 1024, + F_GETLEASE = 1025, + F_NOTIFY(i32) = 1026, + F_DUPFD_CLOEXEC(i32) = 1030, + F_SETPIPE_SZ(i32) = 1031, + F_GETPIPE_SZ = 1032, + F_ADD_SEALS(i32) = 1033, + F_GET_SEALS = 1034, + + F_GET_RW_HINT(Option>) = 1035, + F_SET_RW_HINT(Option>) = 1036, + F_GET_FILE_RW_HINT(Option>) = 1037, + F_SET_FILE_RW_HINT(Option>) = 1038, + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_fcntl() { + assert_eq!(format!("{:?}", FcntlCmd::F_DUPFD(2)), "F_DUPFD(2)"); + assert_eq!(format!("{}", FcntlCmd::F_DUPFD(2)), "F_DUPFD(2)"); + assert_eq!(FcntlCmd::from_raw(libc::F_DUPFD, 42), FcntlCmd::F_DUPFD(42)); + assert_eq!(FcntlCmd::from_raw(1337, 42), FcntlCmd::Other(1337, 42)); + assert_eq!(FcntlCmd::F_DUPFD(42).into_raw(), (libc::F_DUPFD, 42)); + } +} diff --git a/reverie-syscalls/src/args/ioctl.rs b/reverie-syscalls/src/args/ioctl.rs new file mode 100644 index 0000000..753bd15 --- /dev/null +++ b/reverie-syscalls/src/args/ioctl.rs @@ -0,0 +1,221 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Everything related to ioctl arguments. + +use crate::{Addr, AddrMut, Errno, FromToRaw, MemoryAccess}; + +use serde::{Deserialize, Serialize}; + +/// The type of ioctl from the perspective of userspace. 
That is, whether +/// userspace is reading, writing, or doing nothing. +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub enum Direction { + /// Userspace is reading. + Read, + /// Userspace is writing. + Write, + /// There is neither reading nor writing. This is the case for ioctls that + /// have value parameters instead of pointer parameters. + None, +} + +command_enum! { + /// An `ioctl` request. + /// + /// The `ioctl` syscall is a dumping ground for sending a request to a file + /// descriptor. This is not a complete list of all the possible requests, but + /// we try to have the most-commonly used requests listed. + /// + /// See [`ioctl_list(2)`][ioctl_list] for a more complete list. + /// + /// [ioctl_list]: http://man7.org/linux/man-pages/man2/ioctl_list.2.html + pub enum Request<'a>: libc::c_ulong { + // + FIOSETOWN(Option>) = 0x00008901, + SIOCSPGRP(Option>) = 0x00008902, + FIOGETOWN(Option>) = 0x00008903, + SIOCGPGRP(Option>) = 0x00008904, + SIOCATMAR(Option>) = 0x00008905, + SIOCGSTAMP(Option>) = 0x00008906, + + // + TCGETS(Option>) = 0x00005401, + TCSETS(Option>) = 0x00005402, + TCSETSW(Option>) = 0x00005403, + TCSETSF(Option>) = 0x00005404, + TCGETA(Option>) = 0x00005405, + TCSETA(Option>) = 0x00005406, + TCSETAW(Option>) = 0x00005407, + TCSETAF(Option>) = 0x00005408, + TCSBRK(libc::c_int) = 0x00005409, + TCXONC(libc::c_int) = 0x0000540A, + TCFLSH(libc::c_int) = 0x0000540B, + TIOCEXCL = 0x0000540C, + TIOCNXCL = 0x0000540D, + TIOCSCTTY(libc::c_int) = 0x0000540E, + TIOCGPGRP(Option>) = 0x0000540F, + TIOCSPGRP(Option>) = 0x00005410, + TIOCOUTQ(Option>) = 0x00005411, + TIOCSTI(Option>) = 0x00005412, + TIOCGWINSZ(Option>) = 0x00005413, + TIOCSWINSZ(Option>) = 0x00005414, + TIOCMGET(Option>) = 0x00005415, + TIOCMBIS(Option>) = 0x00005416, + TIOCMBIC(Option>) = 0x00005417, + TIOCMSET(Option>) = 0x00005418, + TIOCGSOFTCAR(Option>) = 0x00005419, + TIOCSSOFTCAR(Option>) = 0x0000541A, + FIONREAD(Option>) = 0x0000541B, + // Duplicate of FIONREAD; can't properly 
match the ID. + #[cfg(none)] + TIOCINQ(Option>) = 0x0000541B, + TIOCLINUX(Option>) = 0x0000541C, + TIOCCONS = 0x0000541D, + // Disabled because `libc::serial_struct` isn't defined. + #[cfg(none)] + TIOCGSERIAL(Option>) = 0x0000541E, + // Disabled because `libc::serial_struct` isn't defined. + #[cfg(none)] + TIOCSSERIAL(Option>) = 0x0000541F, + TIOCPKT(Option>) = 0x00005420, + FIONBIO(Option>) = 0x00005421, + TIOCNOTTY = 0x00005422, + TIOCSETD(Option>) = 0x00005423, + TIOCGETD(Option>) = 0x00005424, + TCSBRKP(libc::c_int) = 0x00005425, + // Disabled because `libc::tty_struct` isn't defined. + #[cfg(none)] + TIOCTTYGSTRUCT(Option>) = 0x00005426, + FIONCLEX = 0x00005450, + FIOCLEX = 0x00005451, + FIOASYNC(Option>) = 0x00005452, + TIOCSERCONFIG = 0x00005453, + TIOCSERGWILD(Option>) = 0x00005454, + TIOCSERSWILD(Option>) = 0x00005455, + TIOCGLCKTRMIOS(Option>) = 0x00005456, + TIOCSLCKTRMIOS(Option>) = 0x00005457, + // Disabled because `libc::async_struct` isn't defined. + #[cfg(none)] + TIOCSERGSTRUCT(Option>) = 0x00005458, + TIOCSERGETLSR(Option>) = 0x00005459, + } +} + +impl<'a> Request<'a> { + /// Returns the direction of the request. That is, whether it is a read or + /// write request. + pub fn direction(&self) -> Direction { + // TODO: Generate this with a macro. + match self { + Self::TCGETS(_) => Direction::Read, + Self::TCSETS(_) => Direction::Write, + Self::TIOCGWINSZ(_) => Direction::Read, + Self::TIOCSWINSZ(_) => Direction::Write, + Self::TIOCSPGRP(_) => Direction::Write, + Self::TIOCGPGRP(_) => Direction::Read, + Self::FIONREAD(_) => Direction::Read, + other => { + panic!("ioctl: unsupported request: {:?}", other) + } + } + } + + /// Reads the output associated with this request. If the request has no + /// outputs, returns `Ok(None)`. + /// + /// Panics if this request is unsupported. + pub fn read_output(&self, m: &M) -> Result, Errno> { + // TODO: Generate this with a macro. 
+ Ok(Some(match self { + Self::TCGETS(p) => Output::TCGETS(m.read_value(p.ok_or(Errno::EFAULT)?)?), + Self::TCSETS(_) => return Ok(None), + Self::TIOCGWINSZ(p) => Output::TIOCGWINSZ(m.read_value(p.ok_or(Errno::EFAULT)?)?), + Self::TIOCSWINSZ(_) => return Ok(None), + Self::TIOCGPGRP(p) => Output::TIOCGPGRP(m.read_value(p.ok_or(Errno::EFAULT)?)?), + Self::TIOCSPGRP(_) => return Ok(None), + Self::FIONREAD(p) => Output::FIONREAD(m.read_value(p.ok_or(Errno::EFAULT)?)?), + other => { + panic!("ioctl: unsupported request: {:?}", other); + } + })) + } + + /// Writes the output associated with this request to the provided address + /// (if any). If the request has no outputs, returns `Ok(())`. + pub fn write_output(&self, m: &mut M, output: &Output) -> Result<(), Errno> { + match (self, output) { + (Self::TCGETS(p), Output::TCGETS(output)) => { + m.write_value(p.ok_or(Errno::EFAULT)?, output) + } + (Self::TCSETS(_), _) => Ok(()), + (Self::TIOCGWINSZ(p), Output::TIOCGWINSZ(output)) => { + m.write_value(p.ok_or(Errno::EFAULT)?, output) + } + (Self::TIOCSWINSZ(_), _) => Ok(()), + (Self::TIOCGPGRP(p), Output::TIOCGPGRP(output)) => { + m.write_value(p.ok_or(Errno::EFAULT)?, output) + } + (Self::TIOCSPGRP(_), _) => Ok(()), + (Self::FIONREAD(p), Output::FIONREAD(output)) => { + m.write_value(p.ok_or(Errno::EFAULT)?, output) + } + (other, output) => { + panic!( + "ioctl: unsupported request/output pair: {:?}, {:?}", + other, output + ); + } + } + } +} + +/// The output after a successful call to `ioctl`. This is only relevant for +/// requests with outputs. +/// +/// Note that this is a `union`. The discriminator is the [`Request`] type. +#[allow(missing_docs, non_camel_case_types, clippy::upper_case_acronyms)] +#[derive(Copy, Clone, Debug, Eq, PartialEq, Serialize, Deserialize)] +pub enum Output { + TCGETS(Termios), + TIOCGWINSZ(Winsize), + TIOCGPGRP(libc::pid_t), + FIONREAD(libc::c_int), +} + +/// Terminal I/O.
This is the same as `termios` as defined in +/// `include/uapi/asm-generic/termbits.h`. Note that this is *different* from the +/// struct exposed by libc (which maps the smaller kernel-defined struct onto a +/// larger libc-defined struct). +#[repr(C)] +#[derive(Copy, Clone, Debug, Eq, PartialEq, Serialize, Deserialize)] +pub struct Termios { + /// input mode flags + pub c_iflag: libc::tcflag_t, + /// output mode flags + pub c_oflag: libc::tcflag_t, + /// control mode flags + pub c_cflag: libc::tcflag_t, + /// local mode flags + pub c_lflag: libc::tcflag_t, + /// line discipline + pub c_line: libc::cc_t, + /// control characters + pub c_cc: [libc::cc_t; 19], +} + +#[allow(missing_docs)] +#[repr(C)] +#[derive(Copy, Clone, Debug, Eq, PartialEq, Serialize, Deserialize)] +pub struct Winsize { + pub ws_row: libc::c_ushort, + pub ws_col: libc::c_ushort, + pub ws_xpixel: libc::c_ushort, + pub ws_ypixel: libc::c_ushort, +} diff --git a/reverie-syscalls/src/args/mod.rs b/reverie-syscalls/src/args/mod.rs new file mode 100644 index 0000000..56bdc05 --- /dev/null +++ b/reverie-syscalls/src/args/mod.rs @@ -0,0 +1,525 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Collection of type-safe syscall arguments. These are shared among +//! potentially many syscalls. + +use core::fmt; + +use std::{ + ffi::{CString, OsString}, + os::unix::ffi::OsStringExt, + path::PathBuf, +}; + +mod fcntl; +pub mod ioctl; +mod poll; +mod stat; +mod time; + +use nix::{ + sys::stat::{Mode, SFlag}, + unistd::Pid, +}; + +use serde::{Deserialize, Serialize}; + +pub use fcntl::FcntlCmd; +pub use poll::*; +pub use stat::*; +pub use time::*; + +use crate::{Addr, AddrMut, Displayable, Errno, FromToRaw, MemoryAccess}; + +/// Helper trait for reading a specific value from an address. 
+pub trait ReadAddr { + /// The type of value returned by `read`. + type Target: Sized; + + /// The error type returned by `read`. + type Error; + + /// Reads the contents of the address and returns it. + fn read(&self, memory: &M) -> Result; +} + +impl<'a, T> ReadAddr for Addr<'a, T> +where + T: Copy + Sized, +{ + type Target = T; + type Error = Errno; + + fn read(&self, memory: &M) -> Result { + memory.read_value(*self) + } +} + +impl<'a, T> ReadAddr for AddrMut<'a, T> +where + T: Copy + Sized, +{ + type Target = T; + type Error = Errno; + + fn read(&self, memory: &M) -> Result { + memory.read_value(*self) + } +} + +/// An array of pointers. +#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)] +pub struct CArrayPtr<'a, T>(Addr<'a, Option>); + +impl<'a, T> ReadAddr for CArrayPtr<'a, T> +where + T: Copy, +{ + type Target = Vec; + type Error = Errno; + + fn read(&self, memory: &M) -> Result { + let mut v = Vec::new(); + + let mut r = memory.reader(self.0); + + while let Some(addr) = r.read_value()? { + v.push(addr); + } + + Ok(v) + } +} + +impl<'a, T> FromToRaw for Option> { + fn from_raw(raw: u64) -> Self { + Option::>>::from_raw(raw).map(CArrayPtr) + } + + fn into_raw(self) -> u64 { + self.map(|p| p.0).into_raw() + } +} + +impl<'a, T> Displayable for Option> +where + T: Copy + Displayable, +{ + fn fmt( + &self, + memory: &M, + outputs: bool, + f: &mut fmt::Formatter, + ) -> fmt::Result { + match self { + None => f.write_str("NULL"), + Some(array) => match array.read(memory) { + Ok(v) => { + write!(f, "{} -> [", array.0)?; + + let mut count = 0; + + let mut iter = v.into_iter(); + + if let Some(item) = iter.next() { + item.fmt(memory, outputs, f)?; + count += 1; + } + + for item in iter { + f.write_str(", ")?; + + // Only print the first 32 arguments like strace does. 
+ if count > 32 { + f.write_str("...")?; + break; + } + + item.fmt(memory, outputs, f)?; + count += 1; + } + + f.write_str("]") + } + Err(e) => write!(f, "{} -> <{}>", array.0, e), + }, + } + } +} + +/// A pointer to a `CString` that resides in the target address space. +#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)] +pub struct CStrPtr<'a>(Addr<'a, u8>); + +impl<'a> CStrPtr<'a> { + /// Creates the `CStrPtr` from a raw pointer. Returns `None` if the given + /// pointer is NULL. + pub fn from_ptr(r: *const libc::c_char) -> Option { + Addr::from_ptr(r as *const u8).map(CStrPtr) + } +} + +impl<'a> ReadAddr for CStrPtr<'a> { + type Target = CString; + type Error = Errno; + + fn read(&self, memory: &M) -> Result { + memory.read_cstring(self.0) + } +} + +impl<'a> FromToRaw for Option> { + fn from_raw(raw: u64) -> Self { + Option::>::from_raw(raw).map(CStrPtr) + } + + fn into_raw(self) -> u64 { + self.map(|p| p.0).into_raw() + } +} + +impl<'a> Displayable for CStrPtr<'a> { + fn fmt( + &self, + memory: &M, + _outputs: bool, + f: &mut fmt::Formatter, + ) -> fmt::Result { + match self.read(memory) { + Ok(s) => { + // Only display the first 64 bytes. + if s.as_bytes().len() > 64 { + let mut bytes = s.into_bytes(); + bytes.truncate(64); + let s = unsafe { CString::from_vec_unchecked(bytes) }; + write!(f, "{} -> {:?}...", self.0, s) + } else { + write!(f, "{} -> {:?}", self.0, s) + } + } + Err(e) => write!(f, "{} -> <{}>", self.0, e), + } + } +} + +impl<'a> Displayable for Option> { + fn fmt( + &self, + memory: &M, + outputs: bool, + f: &mut fmt::Formatter, + ) -> fmt::Result { + match self { + None => f.write_str("NULL"), + Some(addr) => Displayable::fmt(addr, memory, outputs, f), + } + } +} + +/// A pointer to a `Path` that resides in the target address space. +#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)] +pub struct PathPtr<'a>(CStrPtr<'a>); + +impl<'a> PathPtr<'a> { + /// Creates the `PathPtr` from a raw pointer. 
Returns `None` if the given + /// pointer is NULL. + pub fn from_ptr(r: *const libc::c_char) -> Option { + CStrPtr::from_ptr(r).map(PathPtr) + } +} + +impl<'a> ReadAddr for PathPtr<'a> { + type Target = PathBuf; + type Error = Errno; + + fn read(&self, memory: &M) -> Result { + let path = PathBuf::from(OsString::from_vec(self.0.read(memory)?.into_bytes())); + + Ok(path) + } +} + +impl<'a> FromToRaw for Option> { + fn from_raw(raw: u64) -> Self { + Option::>::from_raw(raw).map(PathPtr) + } + + fn into_raw(self) -> u64 { + self.map(|p| p.0).into_raw() + } +} + +impl<'a> Displayable for Option> { + fn fmt( + &self, + memory: &M, + _outputs: bool, + f: &mut fmt::Formatter, + ) -> fmt::Result { + match self { + None => f.write_str("NULL"), + Some(addr) => match addr.read(memory) { + Ok(s) => write!(f, "{} -> {:?}", addr.0.0, s), + Err(e) => write!(f, "{} -> <{}>", addr.0.0, e), + }, + } + } +} + +/// A pointer to a `stat` buffer. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct StatPtr<'a>(pub AddrMut<'a, libc::stat>); + +impl<'a> StatPtr<'a> { + /// Creates the `StatPtr` from a raw pointer. Returns `None` if the given + /// pointer is NULL. + pub fn from_ptr(r: *const libc::stat) -> Option { + AddrMut::from_ptr(r as *const libc::stat).map(StatPtr) + } +} + +impl<'a> ReadAddr for StatPtr<'a> { + type Target = libc::stat; + type Error = Errno; + + fn read(&self, memory: &M) -> Result { + memory.read_value(self.0) + } +} + +impl<'a> FromToRaw for Option> { + fn from_raw(raw: u64) -> Self { + Option::>::from_raw(raw).map(StatPtr) + } + + fn into_raw(self) -> u64 { + self.map(|p| p.0).into_raw() + } +} + +impl<'a> Displayable for Option> { + fn fmt( + &self, + memory: &M, + outputs: bool, + f: &mut fmt::Formatter, + ) -> fmt::Result { + match self { + None => f.write_str("NULL"), + Some(addr) => { + if outputs { + match addr.read(memory) { + Ok(stat) => { + // Print st_mode the same way strace does. 
+ let sflag = SFlag::from_bits_truncate(stat.st_mode); + let mode = Mode::from_bits_truncate(stat.st_mode); + write!( + f, + "{} -> {{st_mode={:?} | 0{:o}, st_size={}, ...}}", + addr.0, sflag, mode, stat.st_size + ) + } + Err(e) => write!(f, "{} -> <{}>", addr.0, e), + } + } else { + // Just print the address when not displaying outputs. + fmt::Display::fmt(&addr.0, f) + } + } + } + } +} + +/// A pointer to a `statx` buffer. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct StatxPtr<'a>(pub AddrMut<'a, libc::statx>); + +impl<'a> StatxPtr<'a> { + /// Creates the `StatxPtr` from a raw pointer. Returns `None` if the given + /// pointer is NULL. + pub fn from_ptr(r: *const libc::statx) -> Option { + AddrMut::from_ptr(r as *const libc::statx).map(StatxPtr) + } +} + +impl<'a> ReadAddr for StatxPtr<'a> { + type Target = libc::statx; + type Error = Errno; + + fn read(&self, memory: &M) -> Result { + memory.read_value(self.0) + } +} + +impl<'a> FromToRaw for Option> { + fn from_raw(raw: u64) -> Self { + Option::>::from_raw(raw).map(StatxPtr) + } + + fn into_raw(self) -> u64 { + self.map(|p| p.0).into_raw() + } +} + +impl<'a> Displayable for Option> { + fn fmt( + &self, + memory: &M, + outputs: bool, + f: &mut fmt::Formatter, + ) -> fmt::Result { + match self { + None => f.write_str("NULL"), + Some(addr) => { + if outputs { + match addr.read(memory) { + Ok(stat) => { + // Print mode the same way strace does. + let sflag = SFlag::from_bits_truncate(stat.stx_mode.into()); + let mode = Mode::from_bits_truncate(stat.stx_mode.into()); + write!( + f, + "{} -> {{st_mode={:?} | 0{:o}, st_size={}, ...}}", + addr.0, sflag, mode, stat.stx_size + ) + } + Err(e) => write!(f, "{} -> <{}>", addr.0, e), + } + } else { + // Just print the address when not displaying outputs. + fmt::Display::fmt(&addr.0, f) + } + } + } + } +} + +bitflags! 
{ + /// stx_mask from statx, see linux/stat.h + #[derive(Serialize, Deserialize)] + pub struct StatxMask: u32 { + /// has stx_type + const STATX_TYPE = 0x1; + /// has stx_mode + const STATX_MODE = 0x2; + /// has stx_nlink + const STATX_NLINK = 0x4; + /// has stx_uid + const STATX_UID = 0x8; + /// has stx_gid + const STATX_GID = 0x10; + /// has stx_atime + const STATX_ATIME = 0x20; + /// has stx_mtime + const STATX_MTIME = 0x40; + /// has stx_ctime + const STATX_CTIME = 0x80; + /// has stx_ino + const STATX_INO = 0x100; + /// has stx_size + const STATX_SIZE = 0x200; + /// has stx_blocks + const STATX_BLOCKS = 0x400; + /// compatible with `stat'. + const STATX_BASIC_STATS = 0x7ff; + /// has stx_btime + const STATX_BTIME = 0x800; + /// has stx_mnt_id + const STATX_MNT_ID = 0x1000; + /// reserved + const STATX_RESERVED = 0x80000000; + } +} + +impl Default for StatxMask { + fn default() -> Self { + StatxMask::STATX_BASIC_STATS + } +} + +impl FromToRaw for StatxMask { + fn from_raw(raw: u64) -> Self { + StatxMask::from_bits_truncate(raw as u32) + } + + fn into_raw(self) -> u64 { + self.bits() as u64 + } +} + +impl Displayable for StatxMask { + fn fmt( + &self, + _memory: &M, + _outputs: bool, + f: &mut fmt::Formatter, + ) -> fmt::Result { + fmt::Display::fmt(&self.bits(), f) + } +} + +command_enum! { + /// The argument pairs of `arch_prctl(2)`. + #[allow(missing_docs)] + pub enum ArchPrctlCmd<'a>: libc::c_int { + ARCH_SET_GS(u64) = 0x1001, + ARCH_SET_FS(u64) = 0x1002, + ARCH_GET_FS(Option>) = 0x1003, + ARCH_GET_GS(Option>) = 0x1004, + + ARCH_GET_CPUID(Option>) = 0x1011, + ARCH_SET_CPUID(u64) = 0x1012, + } +} + +const_enum! { + /// Directives that tell `lseek` and `lseek64` what the offset is relative + /// to. + #[derive(Debug, Copy, Clone, Eq, PartialEq)] + pub enum Whence: i32 { + /// Specifies an offset relative to the start of the file. + SEEK_SET, + + /// Specifies an offset relative to the current file location. 
+ SEEK_CUR, + + /// Specifies an offset relative to the end of the file. + SEEK_END, + + /// Specifies an offset relative to the next location in the file + /// greater than or equal to offset that contains some data. If offset + /// points to some data, then the file offset is set to offset. + SEEK_DATA, + + /// Specify an offset relative to the next hole in the file greater than + /// or equal to offset. If offset points into the middle of a hole, then + /// the file offset should be set to offset. If there is no hole past + /// offset, then the file offset should be adjusted to the end of the + /// file (i.e., there is an implicit hole at the end of any file). + SEEK_HOLE, + } +} + +const_enum! { + /// A clock ID. See the definitions in `kernel/include/uapi/linux/time.h`. + #[derive(Debug, Copy, Clone, Eq, PartialEq)] + pub enum ClockId: i32 { + CLOCK_REALTIME, + CLOCK_MONOTONIC, + CLOCK_PROCESS_CPUTIME_ID, + CLOCK_THREAD_CPUTIME_ID, + CLOCK_MONOTONIC_RAW, + CLOCK_REALTIME_COARSE, + CLOCK_MONOTONIC_COARSE, + CLOCK_BOOTTIME, + CLOCK_REALTIME_ALARM, + CLOCK_BOOTTIME_ALARM, + } +} diff --git a/reverie-syscalls/src/args/poll.rs b/reverie-syscalls/src/args/poll.rs new file mode 100644 index 0000000..fa55905 --- /dev/null +++ b/reverie-syscalls/src/args/poll.rs @@ -0,0 +1,104 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Serialization support for poll-related enums and structs. + +use crate::{Displayable, FromToRaw, MemoryAccess}; + +use core::fmt; +use serde::{Deserialize, Serialize}; + +/// A serializable version of `libc::pollfd`. 
+#[derive(Serialize, Deserialize, Copy, Clone, Eq, PartialEq, Debug, Default)] +#[repr(C)] +#[allow(missing_docs)] +pub struct PollFd { + pub fd: libc::c_int, + pub events: PollFlags, + pub revents: PollFlags, +} + +impl From for libc::pollfd { + fn from(pollfd: PollFd) -> libc::pollfd { + libc::pollfd { + fd: pollfd.fd, + events: pollfd.events.bits(), + revents: pollfd.revents.bits(), + } + } +} + +impl From for PollFd { + fn from(pollfd: libc::pollfd) -> Self { + Self { + fd: pollfd.fd, + events: unsafe { PollFlags::from_bits_unchecked(pollfd.events) }, + revents: unsafe { PollFlags::from_bits_unchecked(pollfd.revents) }, + } + } +} + +impl Displayable for PollFd { + fn fmt( + &self, + _memory: &M, + _outputs: bool, + f: &mut fmt::Formatter, + ) -> fmt::Result { + fmt::Debug::fmt(self, f) + } +} + +bitflags! { + /// Flags for [`PollFd`]. + #[derive(Default, Serialize, Deserialize)] + pub struct PollFlags: libc::c_short { + /// There is data to read. + const POLLIN = libc::POLLIN; + /// There is some exceptional condition on the file descriptor. + const POLLPRI = libc::POLLPRI; + /// Writing is now possible. + const POLLOUT = libc::POLLOUT; + /// Equivalent to [`POLLIN`]. + const POLLRDNORM = libc::POLLRDNORM; + /// Equivalent to [`POLLOUT`]. + const POLLWRNORM = libc::POLLWRNORM; + /// Priority band can be read (generally unused on Linux). + const POLLRDBAND = libc::POLLRDBAND; + /// Priority data may be written. + const POLLWRBAND = libc::POLLWRBAND; + /// Error condition. + const POLLERR = libc::POLLERR; + /// Hang up. + const POLLHUP = libc::POLLHUP; + /// Invalid request. 
+ const POLLNVAL = libc::POLLNVAL; + } +} + +impl FromToRaw for PollFlags { + fn from_raw(raw: u64) -> Self { + Self::from_bits_truncate(raw as libc::c_short) + } + + fn into_raw(self) -> u64 { + self.bits() as u64 + } +} + +impl Displayable for PollFlags { + fn fmt( + &self, + _memory: &M, + _outputs: bool, + f: &mut fmt::Formatter, + ) -> fmt::Result { + fmt::Display::fmt(&self.bits(), f) + } +} diff --git a/reverie-syscalls/src/args/stat.rs b/reverie-syscalls/src/args/stat.rs new file mode 100644 index 0000000..e41fc65 --- /dev/null +++ b/reverie-syscalls/src/args/stat.rs @@ -0,0 +1,155 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Serialization support for stat structs. + +use serde::{Deserialize, Serialize}; + +/// A serializable version of `libc::stat`. +#[derive(Serialize, Deserialize, Copy, Clone, Eq, PartialEq, Debug)] +#[serde(remote = "libc::stat")] +#[repr(C)] +#[allow(missing_docs)] +pub struct StatBuf { + pub st_dev: libc::dev_t, + pub st_ino: libc::ino64_t, + pub st_nlink: libc::nlink_t, + pub st_mode: libc::mode_t, + pub st_uid: libc::uid_t, + pub st_gid: libc::gid_t, + #[serde(getter = "unused")] + __pad0: libc::c_int, + pub st_rdev: libc::dev_t, + pub st_size: libc::off_t, + pub st_blksize: libc::blksize_t, + pub st_blocks: libc::blkcnt64_t, + pub st_atime: libc::time_t, + pub st_atime_nsec: i64, + pub st_mtime: libc::time_t, + pub st_mtime_nsec: i64, + pub st_ctime: libc::time_t, + pub st_ctime_nsec: i64, + #[serde(getter = "unused")] + __unused: [i64; 3], +} + +fn unused(_stat: &libc::stat) -> T { + T::default() +} + +impl From for libc::stat { + fn from(buf: StatBuf) -> libc::stat { + // The layout and size is exactly the same, so this transmute is safe to + // do. 
+ unsafe { core::mem::transmute(buf) } + } +} + +/// A serializable version of `libc::statx_timestamp`. +#[derive(Serialize, Deserialize, Copy, Clone, Eq, PartialEq, Debug)] +#[repr(C)] +#[allow(missing_docs)] +pub struct StatxTimestamp { + pub tv_sec: i64, + pub tv_nsec: u32, + #[serde(skip)] + __statx_timestamp_pad1: [i32; 1], +} + +impl From for libc::statx_timestamp { + fn from(buf: StatxTimestamp) -> libc::statx_timestamp { + // The layout and size is exactly the same, so this transmute is safe to + // do. + unsafe { core::mem::transmute(buf) } + } +} + +impl From for StatxTimestamp { + fn from(buf: libc::statx_timestamp) -> StatxTimestamp { + // The layout and size is exactly the same, so this transmute is safe to + // do. + unsafe { core::mem::transmute(buf) } + } +} + +/// A serializable version of `libc::statx`. +#[derive(Serialize, Deserialize, Copy, Clone, Eq, PartialEq, Debug)] +#[repr(C)] +#[allow(missing_docs)] +pub struct StatxBuf { + pub stx_mask: u32, + pub stx_blksize: u32, + pub stx_attributes: u64, + pub stx_nlink: u32, + pub stx_uid: u32, + pub stx_gid: u32, + pub stx_mode: u16, + #[serde(skip)] + __statx_pad1: [u16; 1], + pub stx_ino: u64, + pub stx_size: u64, + pub stx_blocks: u64, + pub stx_attributes_mask: u64, + pub stx_atime: StatxTimestamp, + pub stx_btime: StatxTimestamp, + pub stx_ctime: StatxTimestamp, + pub stx_mtime: StatxTimestamp, + pub stx_rdev_major: u32, + pub stx_rdev_minor: u32, + pub stx_dev_major: u32, + pub stx_dev_minor: u32, + pub stx_mnt_id: u64, + #[serde(skip)] + __statx_pad2: u64, + #[serde(skip)] + __statx_pad3: [u64; 12], +} + +impl From for libc::statx { + fn from(buf: StatxBuf) -> libc::statx { + // The layout and size is exactly the same, so this transmute is safe to + // do.
+ unsafe { core::mem::transmute(buf) } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use core::mem::align_of; + use core::mem::size_of; + + #[test] + fn sizes() { + assert_eq!(size_of::(), size_of::()); + assert_eq!(size_of::(), size_of::()); + assert_eq!( + size_of::(), + size_of::() + ); + } + + #[test] + fn alignment() { + assert_eq!(align_of::(), align_of::()); + assert_eq!(align_of::(), align_of::()); + assert_eq!( + align_of::(), + align_of::() + ); + } +} diff --git a/reverie-syscalls/src/args/time.rs b/reverie-syscalls/src/args/time.rs new file mode 100644 index 0000000..ed79936 --- /dev/null +++ b/reverie-syscalls/src/args/time.rs @@ -0,0 +1,97 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Serialization support for timespec structs. + +use serde::{Deserialize, Serialize}; + +/// A serializable version of `libc::timespec`. 
+#[derive(Serialize, Deserialize, Copy, Clone, Eq, PartialEq, Debug, Hash)] +#[repr(C)] +pub struct Timespec { + /// Seconds + pub tv_sec: libc::time_t, + /// Nanoseconds + pub tv_nsec: libc::c_long, +} + +impl From for libc::timespec { + fn from(ts: Timespec) -> libc::timespec { + libc::timespec { + tv_sec: ts.tv_sec, + tv_nsec: ts.tv_nsec, + } + } +} + +impl From for Timespec { + fn from(ts: libc::timespec) -> Self { + Self { + tv_sec: ts.tv_sec, + tv_nsec: ts.tv_nsec, + } + } +} + +impl From for Timespec { + fn from(tp: libc::statx_timestamp) -> Self { + Timespec { + tv_sec: tp.tv_sec as _, + tv_nsec: tp.tv_nsec as _, + } + } +} + +impl From for libc::statx_timestamp { + fn from(tp: Timespec) -> Self { + libc::statx_timestamp { + tv_sec: tp.tv_sec as _, + tv_nsec: tp.tv_nsec as _, + __statx_timestamp_pad1: [0], + } + } +} + +impl From for Timespec { + fn from(tv: libc::timeval) -> Self { + Timespec { + tv_sec: tv.tv_sec as _, + tv_nsec: (1000 * tv.tv_usec) as _, + } + } +} + +impl From for libc::timeval { + fn from(ts: Timespec) -> Self { + libc::timeval { + tv_sec: ts.tv_sec as _, + tv_usec: (ts.tv_nsec / 1000) as _, + } + } +} + +/// A serializable version of `libc::timeval`. +#[derive(Serialize, Deserialize)] +#[derive(Default, Copy, Clone, Eq, PartialEq, Debug, Hash)] +#[repr(C)] +#[allow(missing_docs)] +pub struct Timeval { + pub tv_sec: libc::time_t, + pub tv_usec: libc::suseconds_t, +} + +/// A serializable version of `libc::timezone`. +#[derive(Serialize, Deserialize)] +#[derive(Default, Copy, Clone, Eq, PartialEq, Debug, Hash)] +#[repr(C)] +#[allow(missing_docs)] +pub struct Timezone { + tz_minuteswest: libc::c_int, + tz_dsttime: libc::c_int, +} diff --git a/reverie-syscalls/src/display.rs b/reverie-syscalls/src/display.rs new file mode 100644 index 0000000..d1154ae --- /dev/null +++ b/reverie-syscalls/src/display.rs @@ -0,0 +1,200 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +use core::fmt; + +use crate::memory::{Addr, AddrMut, MemoryAccess}; +use crate::Errno; + +use nix::{ + fcntl::{AtFlags, OFlag}, + sched::CloneFlags, + sys::{ + epoll::EpollCreateFlags, + eventfd::EfdFlags, + inotify::InitFlags, + mman::{MapFlags, ProtFlags}, + signalfd::SfdFlags, + socket::{AddressFamily, SockFlag, SockProtocol}, + stat::{Mode, SFlag}, + timerfd::TimerFlags, + wait::WaitPidFlag, + }, + unistd::Pid, +}; + +/// A wrapper that combines an address space and a syscall. This is useful for +/// displaying the contents of syscall pointer inputs. +pub struct Display<'a, M, T> { + /// How we access memory. + memory: &'a M, + + /// The syscall arguments we need to display. + syscall: &'a T, + + /// Whether or not to display output arguments. + outputs: bool, +} + +impl<'a, M, T> Display<'a, M, T> { + /// Allocate a new display struct from a memory and a syscall whose + /// arguments read from that memory. + pub fn new(memory: &'a M, syscall: &'a T, outputs: bool) -> Self { + Display { + memory, + syscall, + outputs, + } + } +} + +impl<'a, M, T> fmt::Display for Display<'a, M, T> +where + M: MemoryAccess, + T: Displayable, +{ + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.syscall.fmt(self.memory, self.outputs, f) + } +} + +/// Trait that all syscalls and their arguments need to implement in order to be +/// printed out. +pub trait Displayable { + /// Displays a syscall with all of its arguments. + fn fmt( + &self, + memory: &M, + outputs: bool, + f: &mut fmt::Formatter, + ) -> fmt::Result; + + /// Returns an object that implements `std::fmt::Display` and displays only + /// syscall inputs. 
+ fn display<'a, M>(&'a self, memory: &'a M) -> Display<'a, M, Self> + where + M: MemoryAccess, + Self: Sized, + { + Display::new(memory, self, false) + } + + /// Returns an object that implements `std::fmt::Display` and displays + /// syscall inputs as well as outputs. Useful for displaying pointer + /// arguments that are only valid after a syscall has been executed. + fn display_with_outputs<'a, M>(&'a self, memory: &'a M) -> Display<'a, M, Self> + where + M: MemoryAccess, + Self: Sized, + { + Display::new(memory, self, true) + } +} + +impl<'a, T> Displayable for Option> { + fn fmt( + &self, + _memory: &M, + _outputs: bool, + f: &mut fmt::Formatter, + ) -> fmt::Result { + match self { + None => write!(f, "NULL"), + Some(addr) => write!(f, "{:?}", addr), + } + } +} + +impl<'a, T> Displayable for Option> { + fn fmt( + &self, + _memory: &M, + _outputs: bool, + f: &mut fmt::Formatter, + ) -> fmt::Result { + match self { + None => write!(f, "NULL"), + Some(addr) => write!(f, "{:?}", addr), + } + } +} + +impl Displayable for OFlag { + fn fmt( + &self, + _memory: &M, + _outputs: bool, + f: &mut fmt::Formatter, + ) -> fmt::Result { + if self.is_empty() { + // Without this special case, the default Debug implementation will + // print "O_LARGEFILE | O_RDONLY" because both of those flags are + // zeros. + f.write_str("0") + } else { + fmt::Debug::fmt(self, f) + } + } +} + +impl Displayable for Result +where + T: fmt::Display, +{ + fn fmt( + &self, + _memory: &M, + _outputs: bool, + f: &mut fmt::Formatter, + ) -> fmt::Result { + match self { + Ok(x) => fmt::Display::fmt(x, f), + Err(err) => fmt::Display::fmt(err, f), + } + } +} + +macro_rules! 
impl_displayable { + ($fmt:ident $t:ty) => { + impl $crate::Displayable for $t { + fn fmt( + &self, + _memory: &M, + _outputs: bool, + f: &mut ::core::fmt::Formatter, + ) -> ::core::fmt::Result { + ::core::fmt::$fmt::fmt(self, f) + } + } + }; +} + +impl_displayable!(Debug AtFlags); +impl_displayable!(Debug CloneFlags); +impl_displayable!(Debug Mode); +impl_displayable!(Debug SFlag); +impl_displayable!(Debug WaitPidFlag); +impl_displayable!(Debug MapFlags); +impl_displayable!(Debug ProtFlags); +impl_displayable!(Debug EpollCreateFlags); +impl_displayable!(Debug EfdFlags); +impl_displayable!(Debug SfdFlags); +impl_displayable!(Debug InitFlags); +impl_displayable!(Debug SockFlag); +impl_displayable!(Debug AddressFamily); +impl_displayable!(Debug SockProtocol); +impl_displayable!(Debug Option); +impl_displayable!(Debug TimerFlags); + +impl_displayable!(Display Pid); +impl_displayable!(Display i32); +impl_displayable!(Display u32); +impl_displayable!(Display i64); +impl_displayable!(Display u64); +impl_displayable!(Display isize); +impl_displayable!(Display usize); diff --git a/reverie-syscalls/src/lib.rs b/reverie-syscalls/src/lib.rs new file mode 100644 index 0000000..688c48c --- /dev/null +++ b/reverie-syscalls/src/lib.rs @@ -0,0 +1,40 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! This crate wraps raw `u64` syscall arguments in stronger Rust types. This +//! has a number of useful side effects: +//! 1. Syscalls and their arguments can be easily displayed for debugging +//! purposes. +//! 2. When intercepting syscalls, the Rust type can be accessed more safely. +//! 3. When injecting syscalls, it is easier and clearer to set the arguments +//! using the `with_*` builder methods. 
+ +#![deny(missing_docs)] +#![deny(rustdoc::broken_intra_doc_links)] + +#[macro_use] +mod macros; + +#[macro_use] +extern crate bitflags; + +mod args; +mod display; +mod memory; +mod raw; +mod syscalls; + +pub use crate::args::*; +pub use crate::display::*; +pub use crate::memory::*; +pub use crate::raw::*; +pub use crate::syscalls::*; + +// Re-export the only things that might be needed from the syscalls crate +pub use ::syscalls::{Errno, SyscallArgs, Sysno}; diff --git a/reverie-syscalls/src/macros.rs b/reverie-syscalls/src/macros.rs new file mode 100644 index 0000000..3c4441e --- /dev/null +++ b/reverie-syscalls/src/macros.rs @@ -0,0 +1,821 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/// Creates a type-safe syscall by generating much of the boilerplate needed to +/// do conversion from the raw syscall registers into Rust types. +macro_rules! typed_syscall { + // This macro is pretty big, but most of the rules are for matching and + // collecting potentially different types of syscall arguments. The easiest + // way to read this macro is bottom-up. + // + // There are four "types" of syscall arguments that we need to match on: + // 1. Non-optional, uncustomized entries: `my_arg: MyType,` + // 2. Non-optional, customized entries: `my_arg: fn(&self) -> MyType {...},` + // 3. Optional, uncustomized entries: `my_arg?: MyType,` + // 4. Optional, customized entries: `my_arg?: fn(&self) -> MyType {...},` + // + // Once a rule matches one of these, we separate the optional from the + // non-optional arguments. We do this so that the optional arguments are + // always at the end of the argument list and so that during display, we can + // avoid printing it out if it is `None`. 
+ + // Exit rule + (@make_syscall + { + vis: $vis:vis, + name: $Name:ident, + attrs: [$(#[$attrs:meta])*], + ret: $ret:ty, + doc: $doc:expr, + // Required arguments + required: [$({ + $req:ident, + + // The 'getter' function. + $(#[$req_get_meta:meta])* + ($($req_get_args:tt)*) -> $req_type:ty { + $($req_get_impl:tt)* + } + + // The 'setter' function. + $(#[$req_set_meta:meta])* + ($($req_set_args:tt)*) -> $req_set_type:ty { + $($req_set_impl:tt)* + } + },)*], + // Optional arguments + optional: [$({ + $opt:ident, + + // The 'getter' function. + $(#[$opt_get_meta:meta])* + ($($opt_get_args:tt)*) -> $opt_type:ty { + $($opt_get_impl:tt)* + } + + // The 'setter' function. + $(#[$opt_set_meta:meta])* + ($($opt_set_args:tt)*) -> $opt_set_type:ty { + $($opt_set_impl:tt)* + } + },)*], + } + ) => { + #[derive(Debug, Copy, Clone, Eq, PartialEq)] + $(#[$attrs])* + #[doc = $doc] + $vis struct $Name { + raw: ::syscalls::SyscallArgs, + } + + impl Default for $Name { + fn default() -> Self { + Self { + raw: ::syscalls::SyscallArgs::new(0, 0, 0, 0, 0, 0), + } + } + } + + impl $crate::SyscallInfo for $Name { + type Return = Result<$ret, $crate::Errno>; + + #[inline] + fn name(&self) -> &'static str { + Self::NAME + } + + #[inline] + fn number(&self) -> ::syscalls::Sysno { + Self::NUMBER + } + + fn into_parts(self) -> (::syscalls::Sysno, ::syscalls::SyscallArgs) { + (Self::NUMBER, self.raw) + } + } + + impl $Name { + ::paste::paste! { + /// The name of the syscall. + pub const NAME: &'static str = stringify!([<$Name:snake>]); + + /// The syscall number. + pub const NUMBER: ::syscalls::Sysno = ::syscalls::[]; + } + + /// Creates the syscall. Use the `with_*` functions to build up the + /// arguments to this syscall. + pub fn new() -> Self { + Self::default() + } + + // Generate getter functions. + $( + /// Gets this argument's value. 
+ $(#[$req_get_meta])* + #[allow(clippy::len_without_is_empty)] + pub fn $req($($req_get_args)*) -> $req_type { + $($req_get_impl)* + } + )* + + $( + /// Gets this optional argument's value. Returns `None` if it is not set. + $(#[$opt_get_meta])* + pub fn $opt($($opt_get_args)*) -> $opt_type { + $($opt_get_impl)* + } + )* + + // Generate setter functions + ::paste::paste! { + $( + /// Sets this argument to the given value. + $(#[$req_set_meta])* + pub fn []($($req_set_args)*) -> $req_set_type { + $($req_set_impl)* + } + )* + + $( + /// Sets this optional argument to the given value. + $(#[$opt_set_meta])* + pub fn []($($opt_set_args)*) -> $opt_set_type { + $($opt_set_impl)* + } + )* + } + } + + impl From<::syscalls::SyscallArgs> for $Name { + fn from(raw: ::syscalls::SyscallArgs) -> Self { + $Name { raw } + } + } + + impl From<$Name> for ::syscalls::SyscallArgs { + fn from(syscall: $Name) -> Self { + syscall.raw + } + } + + typed_syscall! { + @impl_display + $Name, + [$($req,)*], + [$($opt,)*], + } + }; + + // Display zero args + (@impl_display + $Name:ident, + [], + [], + ) => { + impl $crate::Displayable for $Name { + fn fmt( + &self, + _memory: &M, + _outputs: bool, + f: &mut ::core::fmt::Formatter, + ) -> ::core::fmt::Result { + write!(f, "{}()", Self::NAME) + } + } + }; + + // Display zero required args, but some optional args + (@impl_display + $Name:ident, + [], + [$optional:ident, $($optional_tail:ident,)*], + ) => { + impl $crate::Displayable for $Name { + fn fmt( + &self, + memory: &M, + outputs: bool, + f: &mut ::core::fmt::Formatter, + ) -> ::core::fmt::Result { + write!(f, "{}(", Self::NAME)?; + + self.$optional().fmt(memory, outputs, f)?; + + $( + if let Some(arg) = self.$optional_tail() { + f.write_str(", ")?; + arg.fmt(memory, outputs, f)?; + } + )* + + f.write_str(")") + } + } + }; + + // Display one or more required arguments. 
+ (@impl_display + $Name:ident, + [$req:ident, $($req_tail:ident,)*], + [$($optional_tail:ident,)*], + ) => { + impl $crate::Displayable for $Name { + fn fmt( + &self, + memory: &M, + outputs: bool, + f: &mut ::core::fmt::Formatter, + ) -> ::core::fmt::Result { + write!(f, "{}(", Self::NAME)?; + + self.$req().fmt(memory, outputs, f)?; + + $( + f.write_str(", ")?; + $crate::Displayable::fmt(&self.$req_tail(), memory, outputs, f)?; + )* + + $( + // Display all optional arguments at the end. + if let Some(arg) = self.$optional_tail() { + f.write_str(", ")?; + $crate::Displayable::fmt(&arg, memory, outputs, f)?; + } + )* + + f.write_str(")") + } + } + }; + + // Done accumulating entries + (@accumulate_entries + { + vis: $vis:vis, + name: $Name:ident, + attrs: [$(#[$attrs:meta])*], + ret: $ret:ty, + }, + [$($req_entries:tt)*], + [$($optional_entries:tt)*], + [$($raw:ident,)*], + ) => { + ::paste::paste! { + typed_syscall! { + @make_syscall + { + vis: $vis, + name: $Name, + attrs: [$(#[$attrs])*], + ret: $ret, + // Generate a handy link to the syscall in the doc comment. + doc: concat!( + "See [", stringify!([<$Name:snake>]), "(2)]", + "(http://man7.org/linux/man-pages/man2/", stringify!([<$Name:snake>]), + ".2.html) for info on this syscall." + ), + required: [$($req_entries)*], + optional: [$($optional_entries)*], + } + } + } + }; + + // Munch a required entry + (@accumulate_entries + $prefix:tt, + [$($req_entries:tt)*], + [$($optional_entries:tt)*], + [$raw:ident, $($rawtail:ident,)*], + $(#[$meta:meta])* + $entry:ident: $t:ty, + $($tail:tt)* + ) => { + typed_syscall! 
{ + @accumulate_entries + $prefix, + [ + $($req_entries)* + // Append the munched entry + { + $entry, + + $(#[$meta])* + (&self) -> $t { + $crate::FromToRaw::from_raw((self.raw).$raw) + } + + $(#[$meta])* + (mut self, v: $t) -> Self { + (self.raw).$raw = $crate::FromToRaw::into_raw(v); + self + } + }, + ], + [$($optional_entries)*], + [$($rawtail,)*], + $($tail)* + } + }; + + // Munch a required function entry + (@accumulate_entries + $prefix:tt, + [$($req_entries:tt)*], + [$($optional_entries:tt)*], + [$raw:ident, $($rawtail:ident,)*], + $(#[$meta:meta])* + $entry:ident: { + $(#[$get_meta:meta])* + fn get($($get_args:tt)*) -> $get_type:ty { $($get_impl:tt)* } + + $(#[$set_meta:meta])* + fn set($($set_args:tt)*) -> $set_type:ty { $($set_impl:tt)* } + }, + $($tail:tt)* + ) => { + typed_syscall! { + @accumulate_entries + $prefix, + [ + $($req_entries)* + // Append the munched entry + { + $entry, + + $(#[$meta])* + $(#[$get_meta])* + ($($get_args)*) -> $get_type { $($get_impl)* } + + $(#[$meta])* + $(#[$set_meta])* + ($($set_args)*) -> $set_type { $($set_impl)* } + }, + ], + [$($optional_entries)*], + [$($rawtail,)*], + $($tail)* + } + }; + + // Munch an optional entry + (@accumulate_entries + $prefix:tt, + [$($req_entries:tt)*], + [$($optional_entries:tt)*], + [$raw:ident, $($rawtail:ident,)*], + $(#[$meta:meta])* + $entry:ident?: $t:ty, + $($tail:tt)* + ) => { + typed_syscall! 
{ + @accumulate_entries + $prefix, + [$($req_entries)*], + [ + $($optional_entries)* + // Append the munched entry + { + $entry, + + $(#[$meta])* + (&self) -> $t { + $crate::FromToRaw::from_raw((self.raw).$raw) + } + + $(#[$meta])* + (mut self, v: $t) -> Self { + (self.raw).$raw = $crate::FromToRaw::into_raw(v); + self + } + }, + ], + [$($rawtail,)*], + $($tail)* + } + }; + + // Munch an optional function entry + (@accumulate_entries + $prefix:tt, + [$($req_entries:tt)*], + [$($optional_entries:tt)*], + [$raw:ident, $($rawtail:ident,)*], + $(#[$meta:meta])* + $entry:ident?: { + $(#[$get_meta:meta])* + fn get($($get_args:tt)*) -> $get_type:ty { $($get_impl:tt)* } + + $(#[$set_meta:meta])* + fn set($($set_args:tt)*) -> $set_type:ty { $($set_impl:tt)* } + }, + $($tail:tt)* + ) => { + typed_syscall! { + @accumulate_entries + $prefix, + [$($req_entries)*], + [ + $($optional_entries)* + // Append the munched entry + { + $entry, + + $(#[$meta])* + $(#[$get_meta])* + ($($get_args)*) -> $get_type { $($get_impl)* } + + $(#[$meta])* + $(#[$set_meta])* + ($($set_args)*) -> $set_type { $($set_impl)* } + }, + ], + [$($rawtail,)*], + $($tail)* + } + }; + + // Entry rule. + ( + $(#[$attrs:meta])* + $vis:vis struct $Name:ident -> $ret:ty { + $($vals:tt)* + } + ) => { + typed_syscall! { + @accumulate_entries + // Meta data that is passed through the munching pipeline until we + // are ready to generate all of the code. + { + vis: $vis, + name: $Name, + attrs: [$(#[$attrs])*], + ret: $ret, + }, + // List of required entries accumulated thus far. + [], + // List of optional entries accumulated thus far. + [], + // Queue of raw args. These are popped off in sequence and used for + // the default implementation of getting and setting raw registers. + [arg0, arg1, arg2, arg3, arg4, arg5,], + // The unprocessed entries, including their metadata. + $($vals)* + } + }; + + // Entry rule (with a default return type).
+ ( + $(#[$attrs:meta])* + $vis:vis struct $Name:ident { + $($vals:tt)* + } + ) => { + typed_syscall! { + $(#[$attrs])* + $vis struct $Name -> u64 { + $($vals)* + } + } + }; +} + +macro_rules! syscall_list { + ( + $(#[$outer:meta])* + $vis:vis enum $name:ident { + $( + $(#[$inner:meta])* + $num:ident => $item:ident, + )* + } + ) => { + $(#[$outer])* + $vis enum $name { + $( + $(#[$inner])* + $item($item), + )* + + /// Catch-all for syscalls that are not yet type-safe. + Other(::syscalls::Sysno, ::syscalls::SyscallArgs), + } + + impl $name { + /// Creates the enum from raw arguments. If the specified syscall + /// is not supported, the `Other` variant will be created. + pub fn from_raw(syscall: ::syscalls::Sysno, args: ::syscalls::SyscallArgs) -> Self { + match syscall { + $( + ::syscalls::$num => $name::$item(args.into()), + )* + num => $name::Other(num, args), + } + } + } + + impl $crate::SyscallInfo for $name { + type Return = Result; + + fn name(&self) -> &'static str { + match self { + $( + $name::$item(_) => $item::NAME, + )* + $name::Other(syscall, _) => syscall.name(), + } + } + + fn number(&self) -> ::syscalls::Sysno { + match self { + $( + $name::$item(_) => ::syscalls::$num, + )* + $name::Other(num, _) => *num, + } + } + + fn into_parts(self) -> (::syscalls::Sysno, ::syscalls::SyscallArgs) { + match self { + $( + $name::$item(x) => (::syscalls::$num, x.into()), + )* + $name::Other(num, args) => (num, args), + } + } + } + + $( + impl From<$item> for $name { + fn from(x: $item) -> Self { + $name::$item(x) + } + } + )* + + impl $crate::Displayable for $name { + fn fmt( + &self, + memory: &M, + outputs: bool, + f: &mut ::core::fmt::Formatter, + ) -> ::core::fmt::Result { + match self { + $( + $name::$item(x) => $crate::Displayable::fmt(x, memory, outputs, f), + )* + $name::Other(num, args) => { + // Write out the raw arguments.
+ write!( + f, + "{:?}({}, {}, {}, {}, {}, {})", + num, + args.arg0, + args.arg1, + args.arg2, + args.arg3, + args.arg4, + args.arg5 + ) + } + } + } + } + }; +} + +/// Generate code for fcntl-like enums where there is a code specifying the +/// command and a value associated with the command. +macro_rules! command_enum { + // Exit rule + (@emit_enum + { + vis: $vis:vis, + name: $name:ident, + lifetimes: [$($lt:lifetime,)*], + attrs: [$(#[$attrs:meta])*], + type: $type:ty, + entries: [$({ + meta: [$(#[$meta:meta])*], + id: $id:expr, + flag: $flag:ident, + into: $(($arg:ident: $t:ty))? => $raw:expr, + },)*], + } + ) => { + $(#[$attrs])* + #[allow(non_camel_case_types, clippy::upper_case_acronyms)] + #[derive(Debug, Copy, Clone, Eq, PartialEq)] + $vis enum $name<$($lt,)*> { + $( + #[allow(missing_docs)] + $(#[$meta])* + $flag$(($t))?, + )* + + /// Catch-all case when we don't know the command and its argument. + Other($type, u64), + } + + impl<$($lt,)*> ::core::fmt::Display for $name<$($lt,)*> { + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + // Same as derived debug + ::fmt(self, f) + } + } + + impl<$($lt,)*> $crate::Displayable for $name<$($lt,)*> { + fn fmt( + &self, + memory: &M, + outputs: bool, + f: &mut ::core::fmt::Formatter, + ) -> ::core::fmt::Result { + match self { + $( + $(#[$meta])* + $name::$flag$(($arg))? => { + f.write_str(stringify!($flag))?; + $( + f.write_str(", ")?; + $crate::Displayable::fmt($arg, memory, outputs, f)?; + )? + Ok(()) + } + )* + $name::Other(cmd, arg) => write!(f, "{}, {:#x}", cmd, arg), + } + } + } + + impl<$($lt,)*> $name<$($lt,)*> { + /// Creates the enum from raw arguments. + pub fn from_raw(cmd: $type, arg: u64) -> Self { + match cmd { + $( + $(#[$meta])* + $id => $name::$flag$((<$t as $crate::FromToRaw>::from_raw(arg)))?, + )* + _ => $name::Other(cmd, arg), + } + } + + /// Converts the enum into raw arguments. 
+ pub fn into_raw(self) -> ($type, u64) { + match self { + $( + $(#[$meta])* + $name::$flag$(($arg))? => ($id, $raw), + )* + $name::Other(cmd, arg) => (cmd, arg), + } + } + } + }; + + // Done collecting entries + (@collect_entries + { + $($prefix:tt)* + }, + [$($entries:tt)*], + ) => { + command_enum! { + @emit_enum + { + $($prefix)* + entries: [$($entries)*], + } + } + }; + + // Collect an entry with a single argument. + (@collect_entries + $prefix:tt, + [$($entries:tt)*], + $(#[$meta:meta])* + $flag:ident($t:ty) = $id:expr, + $($tail:tt)* + ) => { + command_enum! { + @collect_entries + $prefix, + [ + $($entries)* + { + meta: [$(#[$meta])*], + id: $id, + flag: $flag, + into: (arg: $t) => arg.into_raw(), + }, + ], + $($tail)* + } + }; + + // Collect an entry with zero arguments. + (@collect_entries + $prefix:tt, + [$($entries:tt)*], + $(#[$meta:meta])* + $flag:ident = $id:expr, + $($tail:tt)* + ) => { + command_enum! { + @collect_entries + $prefix, + [ + $($entries)* + { + meta: [$(#[$meta])*], + id: $id, + flag: $flag, + into: => 0, + }, + ], + $($tail)* + } + }; + + // Entry rule + ( + $(#[$attrs:meta])* + $vis:vis enum $name:ident$(<$($lt:lifetime),*>)?: $t:ty { + $($entries:tt)* + } + ) => { + command_enum! { + @collect_entries + { + vis: $vis, + name: $name, + lifetimes: [$($($lt,)*)?], + attrs: [$(#[$attrs])*], + type: $t, + }, + [], + $($entries)* + } + }; +} + +// Helper for generating an enum-like struct. This helps to avoid casting raw +// integers to an `enum`, which may result in undefined behavior if the integer +// does not match any of the enum variants. +// +// Note that the item must have a matching definition in `libc`. +macro_rules! 
const_enum { + ( + $(#[$meta:meta])* + $vis:vis enum $Name:ident : $inner:ident { + $( + $(#[$attrs:meta])* + $item:ident, + )* + } + ) => { + $(#[$meta])* + $vis struct $Name($inner); + + impl $Name { + $( + $(#[$attrs])* + #[allow(missing_docs)] + pub const $item: $Name = Self(libc::$item); + )* + } + + impl $crate::FromToRaw for $Name { + fn from_raw(raw: u64) -> Self { + Self(raw as $inner) + } + + fn into_raw(self) -> u64 { + self.0 as u64 + } + } + + impl $crate::Displayable for $Name { + fn fmt( + &self, + _memory: &M, + _outputs: bool, + f: &mut ::core::fmt::Formatter, + ) -> ::core::fmt::Result { + match *self { + $( + Self::$item => f.write_str(stringify!($item)), + )* + Self(x) => write!(f, "{:#x}", x), + } + } + } + }; +} diff --git a/reverie-syscalls/src/memory/addr.rs b/reverie-syscalls/src/memory/addr.rs new file mode 100644 index 0000000..f810abe --- /dev/null +++ b/reverie-syscalls/src/memory/addr.rs @@ -0,0 +1,495 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +use core::fmt; +use core::marker::PhantomData; +use core::ptr::NonNull; + +// Only used for `IoSlice`. To be fully no_std, this should get replaced with a +// custom `IoSlice` type. +use std::io; + +/// An address to some immutable memory. We don't know where the memory lives; +/// it can be either in the current process or another process. +#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd)] +#[repr(transparent)] +pub struct Addr<'a, T> { + // This is our non-null pointer. Since this may not point to memory in the + // same process, we need to be very careful to never dereference this.
+ inner: NonNull, + _p: PhantomData<&'a T>, +} + +impl<'a, T> From for Addr<'a, T> { + fn from(raw: usize) -> Self { + Self::from_raw(raw).unwrap() + } +} + +impl<'a, T> Addr<'a, T> { + /// Construct an address pointing to immutable data from a raw `usize`. + /// Useful for converting a syscall register to a pointer. + pub fn from_raw(raw: usize) -> Option { + if raw == 0 { + None + } else { + Some(unsafe { Self::from_raw_unchecked(raw) }) + } + } + + /// Creates an address pointing to immutable data from a raw pointer. If the + /// pointer is null, then `None` will be returned. + pub fn from_ptr(r: *const T) -> Option { + NonNull::new(r as *mut T).map(|p| Self { + inner: p, + _p: PhantomData, + }) + } + + /// Construct an address from a raw `usize` without checking if it is null. + /// + /// # Safety + /// + /// `raw` must be non-zero. + pub unsafe fn from_raw_unchecked(raw: usize) -> Self { + Self { + inner: NonNull::new_unchecked(raw as *mut T), + _p: PhantomData, + } + } + + /// Casts this pointer to a mutable pointer. + /// + /// # Safety + /// + /// This method is unsafe for numerous reasons. + pub unsafe fn into_mut(self) -> AddrMut<'a, T> { + AddrMut { + inner: self.inner, + _p: PhantomData, + } + } + + /// Returns a raw pointer. + /// + /// # Safety + /// + /// This method is unsafe because the pointer returned by this function + /// should never be dereferenced as it could point to memory outside of the + /// current address space. + #[allow(clippy::wrong_self_convention)] + pub unsafe fn as_ptr(self) -> *const T { + self.inner.as_ptr() + } + + /// Returns the raw integer value of the address. + #[allow(clippy::wrong_self_convention)] + pub fn as_raw(self) -> usize { + self.inner.as_ptr() as usize + } + + /// Casts the address into an address of another type. + pub fn cast(self) -> Addr<'a, U> { + Addr { + inner: self.inner.cast(), + _p: PhantomData, + } + } + + /// Returns a new address relative to the current address + `count * + /// size_of::()`.
+ /// + /// # Safety + /// + /// This method is unsafe because the new address may not point to valid + /// memory. + pub unsafe fn offset(self, count: isize) -> Self { + Self { + inner: NonNull::new_unchecked(self.inner.as_ptr().offset(count)), + _p: PhantomData, + } + } + + /// Returns a new address plus `count * size_of::()`. + /// + /// # Safety + /// + /// This method is unsafe because the new address may not point to valid + /// memory. + #[allow(clippy::should_implement_trait)] + pub unsafe fn add(self, count: usize) -> Self { + self.offset(count as isize) + } +} + +impl<'a, T> From<&'a T> for Addr<'a, T> { + fn from(inner: &'a T) -> Self { + Self { + inner: NonNull::from(inner), + _p: PhantomData, + } + } +} + +impl<'a, T> AsRef for Addr<'a, T> { + fn as_ref(&self) -> &T { + unsafe { self.inner.as_ref() } + } +} + +impl<'a, T> fmt::Debug for Addr<'a, T> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Pointer::fmt(&self.inner, f) + } +} + +impl<'a, T> fmt::Display for Addr<'a, T> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Pointer::fmt(&self.inner, f) + } +} + +impl<'a, T> fmt::Pointer for Addr<'a, T> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Pointer::fmt(&self.inner, f) + } +} + +/// An address to some mutable memory. We don't know where the memory lives; it +/// can be either in the current process or another process. +#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd)] +#[repr(transparent)] +pub struct AddrMut<'a, T> { + // This is our non-null pointer. Since this may not point to memory in the + // same process, we need to be very careful to never dereference this. + inner: NonNull, + _p: PhantomData<&'a mut T>, +} + +impl<'a, T> AddrMut<'a, T> { + /// Construct an address from a raw `usize`. Useful for converting a syscall + /// register to a pointer. If the raw value is 0, then `None` is returned.
+ pub fn from_raw(raw: usize) -> Option { + if raw == 0 { + None + } else { + Some(unsafe { Self::from_raw_unchecked(raw) }) + } + } + + /// Creates an address from a raw pointer. If the pointer is null, then + /// `None` will be returned. + pub fn from_ptr(r: *const T) -> Option { + NonNull::new(r as *mut T).map(|p| Self { + inner: p, + _p: PhantomData, + }) + } + + /// Construct an address from a raw u64 without checking if it is null. + /// + /// # Safety + /// + /// `raw` must be non-zero. + pub unsafe fn from_raw_unchecked(raw: usize) -> Self { + Self { + inner: NonNull::new_unchecked(raw as *mut T), + _p: PhantomData, + } + } + + /// Returns a raw mutable pointer. + /// + /// # Safety + /// + /// This method is unsafe because the pointer returned by this function + /// should never be dereferenced as it could point to memory outside of the + /// current address space. + #[allow(clippy::wrong_self_convention)] + pub unsafe fn as_mut_ptr(self) -> *mut T { + self.inner.as_ptr() + } + + /// Returns the raw integer value of the address. + #[allow(clippy::wrong_self_convention)] + pub fn as_raw(self) -> usize { + self.inner.as_ptr() as usize + } + + /// Casts the address into an address of another type. + pub fn cast(self) -> AddrMut<'a, U> { + AddrMut { + inner: self.inner.cast(), + _p: PhantomData, + } + } + + /// Returns a new address relative to the current address + `count * + /// size_of::()`. + /// + /// # Safety + /// + /// This method is unsafe because the new address may not point to valid + /// memory. + pub unsafe fn offset(self, count: isize) -> Self { + Self { + inner: NonNull::new_unchecked(self.inner.as_ptr().offset(count)), + _p: PhantomData, + } + } + + /// Returns a new address plus `count * size_of::()`. + /// + /// # Safety + /// + /// This method is unsafe because the new address may not point to valid + /// memory. 
+ #[allow(clippy::should_implement_trait)] + pub unsafe fn add(self, count: usize) -> Self { + self.offset(count as isize) + } +} + +impl<'a, T> From> for Addr<'a, T> { + fn from(addr: AddrMut<'a, T>) -> Self { + Self { + inner: addr.inner, + _p: PhantomData, + } + } +} + +impl<'a, T> From<&'a T> for AddrMut<'a, T> { + fn from(inner: &'a T) -> Self { + Self { + inner: NonNull::from(inner), + _p: PhantomData, + } + } +} + +impl<'a, T> AsRef for AddrMut<'a, T> { + fn as_ref(&self) -> &T { + unsafe { self.inner.as_ref() } + } +} + +impl<'a, T> fmt::Debug for AddrMut<'a, T> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Pointer::fmt(&self.inner, f) + } +} + +impl<'a, T> fmt::Display for AddrMut<'a, T> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Pointer::fmt(&self.inner, f) + } +} + +impl<'a, T> fmt::Pointer for AddrMut<'a, T> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Pointer::fmt(&self.inner, f) + } +} + +unsafe impl<'a, T> Send for Addr<'a, T> where T: Send {} +unsafe impl<'a, T> Send for AddrMut<'a, T> where T: Send {} + +/// A slice of some read-only memory. The memory can be in this process or in +/// another process. +#[derive(Copy, Clone)] +pub struct AddrSlice<'a, T> { + inner: &'a [T], +} + +impl<'a, T> AddrSlice<'a, T> { + /// Creates the slice from its raw parts. + /// + /// # Safety + /// + /// This method is unsafe for the same reasons that + /// [`std::slice::from_raw_parts`] is unsafe. + pub unsafe fn from_raw_parts(addr: Addr<'a, T>, len: usize) -> Self { + Self { + inner: ::core::slice::from_raw_parts(addr.as_ptr(), len), + } + } + + /// Divides one slice into two at an index. Panics if `mid > len`. + pub fn split_at(&self, mid: usize) -> (Self, Self) { + let (a, b) = self.inner.split_at(mid); + (Self { inner: a }, Self { inner: b }) + } + + /// Returns the number of elements in the slice. + pub fn len(&self) -> usize { + self.inner.len() + } + + /// Returns true if the slice is empty. 
+ pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + /// Splits the slice at the next page boundary if the slice spans two pages. + /// Returns `None` if the slice does not span two pages. Thus, both slices + /// are guaranteed to be non-empty. + pub fn split_at_page_boundary(&self) -> Option<(Self, Self)> { + let addr = self.inner.as_ptr() as usize; + + // Get the offset to the next page. If this is larger than (or equal to) + // the length of the slice, then it's not possible to split the slice. + let offset = next_page(addr) - addr; + + if offset < self.len() { + Some(self.split_at(offset)) + } else { + None + } + } +} + +impl<'a> AddrSlice<'a, u8> { + /// Returns an `IoSlice` representing this `AddrSlice`. + /// + /// # Safety + /// This function is unsafe because it gives access to raw pointers, which + /// may not be valid for the current address space. + pub unsafe fn as_ioslice(&self) -> io::IoSlice { + io::IoSlice::new(self.inner) + } +} + +/// A slice of some writable memory. The memory can be in this process or in +/// another process. +pub struct AddrSliceMut<'a, T> { + inner: &'a mut [T], +} + +impl<'a, T> AddrSliceMut<'a, T> { + /// Creates the slice from its raw parts. + /// + /// # Safety + /// + /// This method is unsafe for the same reasons that + /// [`std::slice::from_raw_parts`] is unsafe. + pub unsafe fn from_raw_parts(addr: AddrMut<'a, T>, len: usize) -> Self { + Self { + inner: ::core::slice::from_raw_parts_mut(addr.as_mut_ptr(), len), + } + } + + /// Divides one slice into two at an index. + pub fn split_at(&'a mut self, mid: usize) -> (Self, Self) { + let (a, b) = self.inner.split_at_mut(mid); + (Self { inner: a }, Self { inner: b }) + } + + /// Returns the number of elements in the slice. + pub fn len(&self) -> usize { + self.inner.len() + } + + /// Returns true if the slice is empty. 
+ pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + /// Splits the slice at the next page boundary if the slice spans two pages. + /// Returns `None` if the slice does not span two pages. Thus, both slices + /// are guaranteed to be non-empty. + pub fn split_at_page_boundary(&'a mut self) -> Option<(Self, Self)> { + let addr = self.inner.as_ptr() as usize; + + // Get the offset to the next page. If this is larger than (or equal to) + // the length of the slice, then it's not possible to split the slice. + let offset = next_page(addr) - addr; + + if offset < self.len() { + Some(self.split_at(offset)) + } else { + None + } + } +} + +impl<'a> AddrSliceMut<'a, u8> { + /// Returns an `IoSliceMut` representing this `AddrSliceMut`. + /// + /// # Safety + /// This function is unsafe because it gives access to raw pointers, which + /// may not be valid for the current address space. + pub unsafe fn as_ioslice_mut(&mut self) -> io::IoSliceMut { + io::IoSliceMut::new(self.inner) + } +} + +/// Finds the boundary for the next page. Note that this is different than simply +/// aligning an address on a page boundary. +fn next_page(addr: usize) -> usize { + const PAGE_SIZE: usize = 0x1000; + (addr + PAGE_SIZE) & (!PAGE_SIZE + 1) +} + +#[cfg(test)] +mod test { + use super::*; + use core::mem::{align_of, size_of}; + + #[test] + fn test_next_page() { + assert_eq!(next_page(0x1000), 0x2000); + assert_eq!(next_page(0x1), 0x1000); + assert_eq!(next_page(0x0), 0x1000); + assert_eq!(next_page(0x1234), 0x2000); + } + + #[test] + fn test_addr() { + // Ensure that we haven't perturbed the size or alignment of the + // address. We rely on the fact that it is the same size as a regular + // pointer. 
+ assert_eq!(size_of::>(), size_of::<*const u8>()); + assert_eq!(size_of::>(), size_of::<&u8>()); + assert_eq!(size_of::>>(), size_of::<*const u8>()); + assert_eq!(size_of::>>(), size_of::<&u8>()); + assert_eq!(align_of::>>(), align_of::<*const u8>()); + assert_eq!(align_of::>>(), align_of::<&u8>()); + + assert_eq!(Addr::::from_raw(0), None); + + // Test comparison operators. + assert_eq!( + Addr::::from_raw(0xdeadbeef), + Addr::::from_raw(0xdeadbeef) + ); + assert_ne!( + Addr::::from_raw(0xdeadbeef), + Addr::::from_raw(0xbaadf00d) + ); + assert!(Addr::::from_raw(0x1000).unwrap() < Addr::::from_raw(0x1001).unwrap()); + + assert_eq!( + format!("{:p}", Addr::::from_raw(0x1000).unwrap()), + "0x1000" + ); + } + + #[test] + fn test_addr_slice_size() { + // Ensure that we haven't perturbed the size. We rely on the fact that + // it is the same size as a regular slice. + assert_eq!(size_of::>(), size_of::<&[u8]>()); + assert_eq!(size_of::>>(), size_of::<&[u8]>()); + assert_eq!(size_of::>(), size_of::<&mut [u8]>()); + assert_eq!( + size_of::>>(), + size_of::<&mut [u8]>() + ); + } +} diff --git a/reverie-syscalls/src/memory/local.rs b/reverie-syscalls/src/memory/local.rs new file mode 100644 index 0000000..03fce44 --- /dev/null +++ b/reverie-syscalls/src/memory/local.rs @@ -0,0 +1,88 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use std::ffi::CString; +use std::io; + +use super::{Addr, AddrMut, Errno, MemoryAccess}; + +/// A local address space. +#[derive(Default, Debug)] +pub struct LocalMemory {} + +impl LocalMemory { + /// Creates a new representation of memory in the current address space. + /// Accessing memory this way is highly unsafe. This interface is subject to + /// change in the future to reduce the unsafeness of it.
+ /// + /// # Example + /// ``` + /// # use reverie_syscalls::LocalMemory; + /// let memory = LocalMemory::new(); + /// ``` + pub fn new() -> Self { + // TODO: Make LocalMemory just act as a `&mut [u8]`. Then, the "address + // space" will simply be pointers within that range. This would enable + // restriction of the accessible address space on a per-syscall basis. + Self::default() + } +} + +impl MemoryAccess for LocalMemory { + fn read_vectored( + &self, + _read_from: &[io::IoSlice], + _write_to: &mut [io::IoSliceMut], + ) -> Result { + // TODO: Just write to the first non-empty buffer + todo!("Implement local memory access") + } + + fn write_vectored( + &mut self, + _read_from: &[io::IoSlice], + _write_to: &mut [io::IoSliceMut], + ) -> Result { + todo!("Implement local memory access") + } + + fn read<'a, A>(&self, addr: A, buf: &mut [u8]) -> Result + where + A: Into>, + { + let addr = addr.into(); + // Simply copy the memory starting at the address into the buffer. This + // is very unsafe. We need a better way to do this. + unsafe { + ::core::intrinsics::copy_nonoverlapping(addr.as_ptr(), buf.as_mut_ptr(), buf.len()) + }; + + Ok(buf.len()) + } + + fn write(&mut self, addr: AddrMut, buf: &[u8]) -> Result { + // Simply copy the buffer into the memory starting at the address. This + // is very unsafe. We need a better way to do this.
+ unsafe { + ::core::intrinsics::copy_nonoverlapping(buf.as_ptr(), addr.as_mut_ptr(), buf.len()) + }; + + Ok(buf.len()) + } + + fn read_cstring(&self, addr: Addr) -> Result { + let ptr = unsafe { addr.as_ptr() }; + let len = unsafe { libc::strlen(ptr as *const libc::c_char) }; + let slice = unsafe { ::core::slice::from_raw_parts(ptr, len) }; + + let buf = Vec::from(slice); + + Ok(unsafe { CString::from_vec_unchecked(buf) }) + } +} diff --git a/reverie-syscalls/src/memory/mod.rs b/reverie-syscalls/src/memory/mod.rs new file mode 100644 index 0000000..1ac2d44 --- /dev/null +++ b/reverie-syscalls/src/memory/mod.rs @@ -0,0 +1,342 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +mod addr; +mod local; + +use core::mem; + +use std::ffi::CString; +use std::io; + +pub use addr::{Addr, AddrMut, AddrSlice, AddrSliceMut}; +pub use local::LocalMemory; + +use crate::Errno; + +/// Trait for accessing potentially remote memory. +pub trait MemoryAccess { + /// Reads bytes from the address space. Returns the number of bytes read. + /// + /// Note that there is no guarantee that all of the requested buffers will be + /// filled. + fn read_vectored( + &self, + read_from: &[io::IoSlice], + write_to: &mut [io::IoSliceMut], + ) -> Result; + + /// Writes bytes to the address space. Returns the number of bytes written. + /// + /// Note that there is no guarantee that all of the requested buffers will + /// be written. + fn write_vectored( + &mut self, + read_from: &[io::IoSlice], + write_to: &mut [io::IoSliceMut], + ) -> Result; + + /// Performs a read starting at the given address. The number of bytes read + /// is returned. The buffer is not guaranteed to be completely filled. 
+ fn read<'a, A>(&self, addr: A, buf: &mut [u8]) -> Result + where + A: Into>, + { + let slice = unsafe { AddrSlice::from_raw_parts(addr.into(), buf.len()) }; + let from = [unsafe { slice.as_ioslice() }]; + let mut to = [io::IoSliceMut::new(buf)]; + self.read_vectored(&from, &mut to) + } + + /// Performs a write starting at the given address. The number of bytes + /// written is returned. There is no guarantee that the given buffer will be + /// fully written. + fn write(&mut self, addr: AddrMut, buf: &[u8]) -> Result { + let mut slice = unsafe { AddrSliceMut::from_raw_parts(addr, buf.len()) }; + let from = [io::IoSlice::new(buf)]; + let mut to = [unsafe { slice.as_ioslice_mut() }]; + self.write_vectored(&from, &mut to) + } + + /// Reads exactly the number of bytes wanted by `buf`. + fn read_exact<'a, A>(&self, addr: A, mut buf: &mut [u8]) -> Result<(), Errno> + where + A: Into>, + { + let mut addr = addr.into(); + + while !buf.is_empty() { + match self.read(addr, buf)? { + 0 => break, + n => { + addr = unsafe { addr.add(n) }; + buf = &mut buf[n..]; + } + } + } + + if !buf.is_empty() { + // Failed to fill the whole buffer. + Err(Errno::EFAULT) + } else { + Ok(()) + } + } + + /// Writes exactly the number of bytes contained in `buf`. + fn write_exact(&mut self, mut addr: AddrMut, mut buf: &[u8]) -> Result<(), Errno> { + while !buf.is_empty() { + match self.write(addr, buf)? { + 0 => break, + n => { + addr = unsafe { addr.add(n) }; + buf = &buf[n..]; + } + } + } + + if !buf.is_empty() { + // Failed to write the whole buffer. + Err(Errno::EFAULT) + } else { + Ok(()) + } + } + + /// Reads a value at the given address.
+ fn read_value<'a, A, T>(&self, addr: A) -> Result + where + A: Into>, + T: Sized + 'a, + { + let addr = addr.into(); + let mut value = mem::MaybeUninit::uninit(); + + let value_buf = unsafe { + ::core::slice::from_raw_parts_mut(value.as_mut_ptr() as *mut u8, mem::size_of::()) + }; + + self.read_exact(addr.cast::(), value_buf)?; + + Ok(unsafe { value.assume_init() }) + } + + /// Writes a value to the given address. + fn write_value<'a, A, T>(&mut self, addr: A, value: &T) -> Result<(), Errno> + where + A: Into>, + T: Sized + 'a, + { + let addr = addr.into(); + + let value_buf = unsafe { + ::core::slice::from_raw_parts(value as *const _ as *const u8, mem::size_of::()) + }; + + self.write_exact(addr.cast::(), value_buf)?; + + Ok(()) + } + + /// Reads a slice of values. Returns an error if the buffer fails to get + /// fully filled. + fn read_values(&self, addr: Addr, buf: &mut [T]) -> Result<(), Errno> + where + T: Sized, + { + let buf = unsafe { + ::core::slice::from_raw_parts_mut( + buf.as_mut_ptr() as *mut u8, + buf.len() * mem::size_of::(), + ) + }; + + self.read_exact(addr.cast::(), buf) + } + + /// Writes a slice of values. Returns an error if the buffer fails to get + /// fully written. + fn write_values(&mut self, addr: AddrMut, buf: &[T]) -> Result<(), Errno> + where + T: Sized, + { + let buf = unsafe { + ::core::slice::from_raw_parts( + buf.as_ptr() as *const u8, + buf.len() * mem::size_of::(), + ) + }; + + self.write_exact(addr.cast::(), buf) + } + + /// Reads memory at the given starting address while the boolean returned by + /// the predicate `pred` is true. + fn read_while(&self, mut addr: Addr, buf: &mut [u8], mut pred: F) -> Result + where + F: FnMut(&[u8]) -> Option, + { + let mut count = 0; + + loop { + let read = self.read(addr, buf)?; + if read == 0 { + // We hit an "EOF" (an EFAULT) and the predicate never matched. + // The predicate should *eventually* return true, so this is + // always an error. 
+ return Err(Errno::EFAULT); + } + + addr = unsafe { addr.add(read) }; + + if let Some(used) = pred(&buf[..read]) { + return Ok(count + used); + } + + count += read; + } + } + + /// Reads a NUL terminated string using the provided buffer to read it in + /// chunks. Change the size of the buffer to adjust how many bytes are read + /// at one time. Increasing the buffer size can be more efficient when + /// reading a remote C string because it reduces the number of syscalls that + /// are made. + fn read_cstring_with_buf(&self, addr: Addr, buf: &mut [u8]) -> Result { + let mut accumulator = Vec::new(); + + self.read_while(addr, buf, |slice| { + if let Some(nul) = slice.iter().position(|&b| b == 0) { + // Stop once we find a NUL terminator. + accumulator.extend(&slice[..nul]); + Some(nul) + } else { + accumulator.extend(slice); + None + } + })?; + + // unsafe is okay here; the vector is guaranteed to not contain a nul + // byte. + Ok(unsafe { CString::from_vec_unchecked(accumulator) }) + } + + /// Reads a null-terminated string starting at the given address. + fn read_cstring(&self, addr: Addr) -> Result { + // Assume most strings are smallish. We need to balance the overhead of + // copying data vs the average length of C-strings. + let mut buf: [u8; 512] = [0; 512]; + + self.read_cstring_with_buf(addr, &mut buf) + } + + /// Returns a struct that implements `std::io::Read`. This is useful when + /// reading memory sequentially. + fn reader<'a, T>(&'a self, addr: Addr<'a, T>) -> MemoryReader<'a, Self, T> + where + Self: Sized, + { + MemoryReader::new(self, addr) + } + + /// Returns a struct that implements `std::io::Write`. This is useful when + /// writing memory sequentially. + fn writer<'a, T>(&'a mut self, addr: AddrMut<'a, T>) -> MemoryWriter<'a, Self, T> + where + Self: Sized, + { + MemoryWriter::new(self, addr) + } +} + +/// A wrapper around both an address space and a pointer for sequential reads. 
+pub struct MemoryReader<'a, M, T> { + memory: &'a M, + + addr: Addr<'a, T>, +} + +impl<'a, M, T> MemoryReader<'a, M, T> { + /// Creates a new `MemoryReader`. All reads will start at `addr`. It is the + /// caller's job to avoid buffer overruns. + pub fn new(memory: &'a M, addr: Addr<'a, T>) -> Self { + MemoryReader { memory, addr } + } +} + +impl<'a, M, T> MemoryReader<'a, M, T> +where + M: MemoryAccess, + T: Sized + Copy, +{ + /// Reads a single typed value from the buffer. + pub fn read_value(&mut self) -> Result { + let value = self.memory.read_value(self.addr)?; + self.addr = unsafe { self.addr.add(1) }; + Ok(value) + } +} + +impl<'a, M> io::Read for MemoryReader<'a, M, u8> +where + M: MemoryAccess, +{ + fn read(&mut self, buf: &mut [u8]) -> io::Result { + let count = self.memory.read(self.addr, buf)?; + + self.addr = unsafe { self.addr.add(count) }; + + Ok(count) + } +} + +/// A wrapper around both an address space and a pointer for sequential writes. +pub struct MemoryWriter<'a, M, T> { + memory: &'a mut M, + + addr: AddrMut<'a, T>, +} + +impl<'a, M, T> MemoryWriter<'a, M, T> { + /// Creates a new `MemoryWriter`. All writes will start at `addr`. It is the + /// caller's job to avoid buffer overruns. + pub fn new(memory: &'a mut M, addr: AddrMut<'a, T>) -> Self { + MemoryWriter { memory, addr } + } +} + +impl<'a, M, T> MemoryWriter<'a, M, T> +where + M: MemoryAccess, + T: Sized + Copy, +{ + /// Writes a single typed value to the buffer. + pub fn write_value(&mut self, value: &T) -> Result<(), Errno> { + self.memory.write_value(self.addr, value)?; + self.addr = unsafe { self.addr.add(1) }; + Ok(()) + } +} + +impl<'a, M> io::Write for MemoryWriter<'a, M, u8> +where + M: MemoryAccess, +{ + fn write(&mut self, buf: &[u8]) -> io::Result { + let count = self.memory.write(self.addr, buf)?; + + self.addr = unsafe { self.addr.add(count) }; + + Ok(count) + } + + fn flush(&mut self) -> io::Result<()> { + // Flush doesn't make any sense when writing to memory.
+ Ok(()) + } +} diff --git a/reverie-syscalls/src/raw.rs b/reverie-syscalls/src/raw.rs new file mode 100644 index 0000000..7d707cc --- /dev/null +++ b/reverie-syscalls/src/raw.rs @@ -0,0 +1,205 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use nix::{ + fcntl::{AtFlags, OFlag}, + sched::CloneFlags, + sys::{ + epoll::EpollCreateFlags, + eventfd::EfdFlags, + inotify::InitFlags, + mman::{MapFlags, ProtFlags}, + signalfd::SfdFlags, + socket::SockFlag, + stat::Mode, + timerfd::TimerFlags, + wait::WaitPidFlag, + }, + unistd::Pid, +}; + +use crate::{Addr, AddrMut, Errno}; + +/// Trait representing a raw value. Note that the assertion +/// `assert_eq!(T::from_raw(x).into_raw(), x)` should hold true for every +/// possible value of `T` and `x`. In other words, there should be no loss of +/// information and conversions should never fail. This ensures that adding type +/// information to a value will always be forward compatible. +/// +/// This trait is very similar to `From` and `Into`. Instead of reusing +/// those existing traits, this separate trait is necessary such that it can be +/// implemented for foreign types. +pub trait FromToRaw: Sized { + /// Converts a raw value into this type. + fn from_raw(value: u64) -> Self; + + /// Converts this type into a raw value. 
+ fn into_raw(self) -> u64; +} + +impl FromToRaw for u32 { + fn from_raw(raw: u64) -> Self { + raw as Self + } + + fn into_raw(self) -> u64 { + self as u64 + } +} + +impl FromToRaw for i32 { + fn from_raw(raw: u64) -> Self { + raw as Self + } + + fn into_raw(self) -> u64 { + self as u64 + } +} + +impl FromToRaw for u64 { + fn from_raw(raw: u64) -> Self { + raw + } + + fn into_raw(self) -> u64 { + self + } +} + +impl FromToRaw for usize { + fn from_raw(raw: u64) -> Self { + raw as Self + } + + fn into_raw(self) -> u64 { + self as u64 + } +} + +impl FromToRaw for i64 { + fn from_raw(raw: u64) -> Self { + raw as Self + } + + fn into_raw(self) -> u64 { + self as u64 + } +} + +impl<'a, T> FromToRaw for Option> { + fn from_raw(raw: u64) -> Self { + Addr::from_raw(raw as usize) + } + + fn into_raw(self) -> u64 { + self.map_or(0, |addr| addr.as_raw() as u64) + } +} + +impl<'a, T> FromToRaw for Option> { + fn from_raw(raw: u64) -> Self { + AddrMut::from_raw(raw as usize) + } + + fn into_raw(self) -> u64 { + self.map_or(0, |addr| addr.as_raw() as u64) + } +} + +macro_rules! 
impl_raw_bits { + ($t:ty : $inner:ty) => { + impl $crate::FromToRaw for $t { + fn from_raw(raw: u64) -> Self { + unsafe { Self::from_bits_unchecked(raw as $inner) } + } + + fn into_raw(self) -> u64 { + self.bits() as u64 + } + } + }; + + ($t:ty) => { + impl_raw_bits!($t: i32); + }; +} + +impl_raw_bits!(AtFlags); +impl_raw_bits!(OFlag); +impl_raw_bits!(CloneFlags); +impl_raw_bits!(Mode: libc::mode_t); +impl_raw_bits!(WaitPidFlag); +impl_raw_bits!(MapFlags); +impl_raw_bits!(ProtFlags); +impl_raw_bits!(EpollCreateFlags); +impl_raw_bits!(EfdFlags); +impl_raw_bits!(InitFlags); +impl_raw_bits!(SockFlag); +impl_raw_bits!(SfdFlags); +impl_raw_bits!(TimerFlags); + +impl FromToRaw for Option { + fn from_raw(raw: u64) -> Self { + if raw == 0 { + None + } else { + Some(Mode::from_raw(raw)) + } + } + + fn into_raw(self) -> u64 { + match self { + None => 0, + Some(mode) => mode.into_raw(), + } + } +} + +impl FromToRaw for Pid { + fn from_raw(raw: u64) -> Self { + Pid::from_raw(raw as i32) + } + + fn into_raw(self) -> u64 { + self.as_raw() as u64 + } +} + +impl FromToRaw for Result +where + T: FromToRaw, +{ + fn from_raw(raw: u64) -> Self { + Errno::from_ret(raw as i64).map(|x| T::from_raw(x as u64)) + } + + fn into_raw(self) -> u64 { + match self { + Ok(x) => x.into_raw(), + Err(err) => -err.into_raw() as u64, + } + } +} + +#[cfg(test)] +mod test { + use super::*; + use nix::unistd::Pid; + + #[test] + fn test_results() { + assert_eq!( + Result::::from_raw(-2i64 as u64), + Err(Errno::ENOENT) + ); + + assert_eq!(Result::::from_raw(42), Ok(Pid::from_raw(42))); + } +} diff --git a/reverie-syscalls/src/syscalls/family.rs b/reverie-syscalls/src/syscalls/family.rs new file mode 100644 index 0000000..d13cf9c --- /dev/null +++ b/reverie-syscalls/src/syscalls/family.rs @@ -0,0 +1,133 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! This module provides groupings of closely related syscalls (i.e., +//! "families"). These are useful when needing to handle families in very similar +//! ways. + +use super::Syscall; + +use crate::args::StatPtr; +use crate::memory::AddrMut; + +use derive_more::From; + +/// Represents the `[p]write{64,v,v2}` family of syscalls. All of these syscalls +/// have an associated file descriptor. +#[derive(From, Debug, Copy, Clone, Eq, PartialEq)] +#[allow(missing_docs)] +pub enum WriteFamily { + Write(super::Write), + Pwrite64(super::Pwrite64), + Writev(super::Writev), + Pwritev(super::Pwritev), + Pwritev2(super::Pwritev2), +} + +impl WriteFamily { + /// Get the file descriptor associated with the write. + pub fn fd(&self) -> i32 { + match self { + Self::Write(s) => s.fd(), + Self::Pwrite64(s) => s.fd(), + Self::Writev(s) => s.fd(), + Self::Pwritev(s) => s.fd(), + Self::Pwritev2(s) => s.fd(), + } + } +} + +impl From for Syscall { + fn from(family: WriteFamily) -> Syscall { + match family { + WriteFamily::Write(syscall) => Syscall::Write(syscall), + WriteFamily::Pwrite64(syscall) => Syscall::Pwrite64(syscall), + WriteFamily::Writev(syscall) => Syscall::Writev(syscall), + WriteFamily::Pwritev(syscall) => Syscall::Pwritev(syscall), + WriteFamily::Pwritev2(syscall) => Syscall::Pwritev2(syscall), + } + } +} + +/// Represents the stat family of syscalls. All of these have an associated stat +/// buffer. +#[derive(From, Debug, Copy, Clone, Eq, PartialEq)] +#[allow(missing_docs)] +pub enum StatFamily { + Stat(super::Stat), + Fstat(super::Fstat), + Lstat(super::Lstat), + Newfstatat(super::Newfstatat), +} + +impl StatFamily { + /// Get address of the stat buffer. Returns `None` if a NULL pointer was + /// specified. 
+ pub fn stat(&self) -> Option { + match self { + Self::Stat(s) => s.stat(), + Self::Fstat(s) => s.stat(), + Self::Lstat(s) => s.stat(), + Self::Newfstatat(s) => s.stat(), + } + } +} + +impl From for Syscall { + fn from(family: StatFamily) -> Syscall { + match family { + StatFamily::Stat(syscall) => Syscall::Stat(syscall), + StatFamily::Fstat(syscall) => Syscall::Fstat(syscall), + StatFamily::Lstat(syscall) => Syscall::Lstat(syscall), + StatFamily::Newfstatat(syscall) => Syscall::Newfstatat(syscall), + } + } +} + +/// Represents the family of syscalls that get information about a socket. All of +/// these have some buffer and a length pointer. +#[derive(From, Debug, Copy, Clone, Eq, PartialEq)] +#[allow(missing_docs)] +pub enum SockOptFamily { + Getsockopt(super::Getsockopt), + Getpeername(super::Getpeername), + Getsockname(super::Getsockname), +} + +impl SockOptFamily { + /// Get address of the value. Returns `None` if a NULL pointer was + /// specified. + pub fn value(&self) -> Option> { + match self { + Self::Getsockopt(s) => s.optval().map(AddrMut::cast), + Self::Getpeername(s) => s.usockaddr().map(AddrMut::cast), + Self::Getsockname(s) => s.usockaddr().map(AddrMut::cast), + } + } + + /// Get address of the buffer length. Returns `None` if a NULL pointer was + /// specified. 
+ pub fn value_len(&self) -> Option> { + match self { + Self::Getsockopt(s) => s.optlen(), + Self::Getpeername(s) => s.usockaddr_len(), + Self::Getsockname(s) => s.usockaddr_len(), + } + } +} + +impl From for Syscall { + fn from(family: SockOptFamily) -> Syscall { + match family { + SockOptFamily::Getsockopt(syscall) => Syscall::Getsockopt(syscall), + SockOptFamily::Getpeername(syscall) => Syscall::Getpeername(syscall), + SockOptFamily::Getsockname(syscall) => Syscall::Getsockname(syscall), + } + } +} diff --git a/reverie-syscalls/src/syscalls/mod.rs b/reverie-syscalls/src/syscalls/mod.rs new file mode 100644 index 0000000..1094846 --- /dev/null +++ b/reverie-syscalls/src/syscalls/mod.rs @@ -0,0 +1,3406 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +pub mod family; + +use crate::args::{ + ioctl, ArchPrctlCmd, CArrayPtr, CStrPtr, ClockId, FcntlCmd, PathPtr, PollFd, StatPtr, + StatxMask, StatxPtr, Timespec, Timeval, Timezone, Whence, +}; +use crate::display::Displayable; +use crate::memory::{Addr, AddrMut}; +use crate::raw::FromToRaw; +use ::syscalls::{SyscallArgs, Sysno}; + +// Re-export flags that are used by syscalls from the `nix` crate so downstream +// projects don't need to add another dependency on it. +pub use nix::{ + fcntl::{AtFlags, OFlag}, + sched::CloneFlags, + sys::{ + epoll::EpollCreateFlags, + eventfd::EfdFlags, + inotify::InitFlags, + mman::{MapFlags, ProtFlags}, + signalfd::SfdFlags, + socket::SockFlag, + stat::Mode, + timerfd::TimerFlags, + wait::WaitPidFlag, + }, +}; + +/// A trait that all syscalls implement. +pub trait SyscallInfo: Displayable + Copy + Send { + /// The return type of the syscall. + type Return: Displayable + FromToRaw; + + /// Returns the syscall name. + fn name(&self) -> &'static str; + + /// Returns the syscall number.
+ fn number(&self) -> Sysno; + + /// Converts the syscall into its constituent parts. + fn into_parts(self) -> (Sysno, SyscallArgs); +} + +// After adding a new type-safe syscall, uncomment the corresponding line below. +// The syscalls lower ranks and higher probabilities should be implemented +// first. +syscall_list! { + /// Full list of type-safe syscalls. + #[derive(Debug, Copy, Clone, Eq, PartialEq)] + #[allow(missing_docs)] + #[non_exhaustive] + pub enum Syscall { + SYS_read => Read, + SYS_write => Write, + SYS_open => Open, + SYS_close => Close, + SYS_stat => Stat, + SYS_fstat => Fstat, + SYS_lstat => Lstat, + SYS_poll => Poll, + SYS_lseek => Lseek, + SYS_mmap => Mmap, + SYS_mprotect => Mprotect, + SYS_munmap => Munmap, + SYS_brk => Brk, + SYS_rt_sigaction => RtSigaction, + SYS_rt_sigprocmask => RtSigprocmask, + SYS_rt_sigreturn => RtSigreturn, + SYS_ioctl => Ioctl, + SYS_pread64 => Pread64, + SYS_pwrite64 => Pwrite64, + SYS_readv => Readv, + SYS_writev => Writev, + SYS_access => Access, + SYS_pipe => Pipe, + SYS_select => Select, + SYS_sched_yield => SchedYield, + SYS_mremap => Mremap, + SYS_msync => Msync, + SYS_mincore => Mincore, + SYS_madvise => Madvise, + SYS_shmget => Shmget, + SYS_shmat => Shmat, + SYS_shmctl => Shmctl, + SYS_dup => Dup, + SYS_dup2 => Dup2, + SYS_pause => Pause, + SYS_nanosleep => Nanosleep, + SYS_getitimer => Getitimer, + SYS_alarm => Alarm, + SYS_setitimer => Setitimer, + SYS_getpid => Getpid, + SYS_sendfile => Sendfile, + SYS_socket => Socket, + SYS_connect => Connect, + SYS_accept => Accept, + SYS_sendto => Sendto, + SYS_recvfrom => Recvfrom, + SYS_sendmsg => Sendmsg, + SYS_recvmsg => Recvmsg, + SYS_shutdown => Shutdown, + SYS_bind => Bind, + SYS_listen => Listen, + SYS_getsockname => Getsockname, + SYS_getpeername => Getpeername, + SYS_socketpair => Socketpair, + SYS_setsockopt => Setsockopt, + SYS_getsockopt => Getsockopt, + SYS_clone => Clone, + SYS_fork => Fork, + SYS_vfork => Vfork, + SYS_execve => Execve, + SYS_exit => 
Exit, + SYS_wait4 => Wait4, + SYS_kill => Kill, + SYS_uname => Uname, + SYS_semget => Semget, + SYS_semop => Semop, + SYS_semctl => Semctl, + SYS_shmdt => Shmdt, + SYS_msgget => Msgget, + SYS_msgsnd => Msgsnd, + SYS_msgrcv => Msgrcv, + SYS_msgctl => Msgctl, + SYS_fcntl => Fcntl, + SYS_flock => Flock, + SYS_fsync => Fsync, + SYS_fdatasync => Fdatasync, + SYS_truncate => Truncate, + SYS_ftruncate => Ftruncate, + SYS_getdents => Getdents, + SYS_getcwd => Getcwd, + SYS_chdir => Chdir, + SYS_fchdir => Fchdir, + SYS_rename => Rename, + SYS_mkdir => Mkdir, + SYS_rmdir => Rmdir, + SYS_creat => Creat, + SYS_link => Link, + SYS_unlink => Unlink, + SYS_symlink => Symlink, + SYS_readlink => Readlink, + SYS_chmod => Chmod, + SYS_fchmod => Fchmod, + SYS_chown => Chown, + SYS_fchown => Fchown, + SYS_lchown => Lchown, + SYS_umask => Umask, + SYS_gettimeofday => Gettimeofday, + SYS_getrlimit => Getrlimit, + SYS_getrusage => Getrusage, + SYS_sysinfo => Sysinfo, + SYS_times => Times, + SYS_ptrace => Ptrace, + SYS_getuid => Getuid, + SYS_syslog => Syslog, + SYS_getgid => Getgid, + SYS_setuid => Setuid, + SYS_setgid => Setgid, + SYS_geteuid => Geteuid, + SYS_getegid => Getegid, + SYS_setpgid => Setpgid, + SYS_getppid => Getppid, + SYS_getpgrp => Getpgrp, + SYS_setsid => Setsid, + SYS_setreuid => Setreuid, + SYS_setregid => Setregid, + SYS_getgroups => Getgroups, + SYS_setgroups => Setgroups, + SYS_setresuid => Setresuid, + SYS_getresuid => Getresuid, + SYS_setresgid => Setresgid, + SYS_getresgid => Getresgid, + SYS_getpgid => Getpgid, + SYS_setfsuid => Setfsuid, + SYS_setfsgid => Setfsgid, + SYS_getsid => Getsid, + SYS_capget => Capget, + SYS_capset => Capset, + SYS_rt_sigpending => RtSigpending, + SYS_rt_sigtimedwait => RtSigtimedwait, + SYS_rt_sigqueueinfo => RtSigqueueinfo, + SYS_rt_sigsuspend => RtSigsuspend, + SYS_sigaltstack => Sigaltstack, + SYS_utime => Utime, + SYS_mknod => Mknod, + SYS_uselib => Uselib, + SYS_personality => Personality, + SYS_ustat => Ustat, + SYS_statfs => 
Statfs, + SYS_fstatfs => Fstatfs, + SYS_sysfs => Sysfs, + SYS_getpriority => Getpriority, + SYS_setpriority => Setpriority, + SYS_sched_setparam => SchedSetparam, + SYS_sched_getparam => SchedGetparam, + SYS_sched_setscheduler => SchedSetscheduler, + SYS_sched_getscheduler => SchedGetscheduler, + SYS_sched_get_priority_max => SchedGetPriorityMax, + SYS_sched_get_priority_min => SchedGetPriorityMin, + SYS_sched_rr_get_interval => SchedRrGetInterval, + SYS_mlock => Mlock, + SYS_munlock => Munlock, + SYS_mlockall => Mlockall, + SYS_munlockall => Munlockall, + SYS_vhangup => Vhangup, + SYS_modify_ldt => ModifyLdt, + SYS_pivot_root => PivotRoot, + #[allow(non_camel_case_types)] + SYS__sysctl => _sysctl, + SYS_prctl => Prctl, + SYS_arch_prctl => ArchPrctl, + SYS_adjtimex => Adjtimex, + SYS_setrlimit => Setrlimit, + SYS_chroot => Chroot, + SYS_sync => Sync, + SYS_acct => Acct, + SYS_settimeofday => Settimeofday, + SYS_mount => Mount, + SYS_umount2 => Umount2, + SYS_swapon => Swapon, + SYS_swapoff => Swapoff, + SYS_reboot => Reboot, + SYS_sethostname => Sethostname, + SYS_setdomainname => Setdomainname, + SYS_iopl => Iopl, + SYS_ioperm => Ioperm, + SYS_create_module => CreateModule, + SYS_init_module => InitModule, + SYS_delete_module => DeleteModule, + SYS_get_kernel_syms => GetKernelSyms, + SYS_query_module => QueryModule, + SYS_quotactl => Quotactl, + SYS_nfsservctl => Nfsservctl, + SYS_getpmsg => Getpmsg, + SYS_putpmsg => Putpmsg, + SYS_afs_syscall => AfsSyscall, + SYS_tuxcall => Tuxcall, + SYS_security => Security, + SYS_gettid => Gettid, + SYS_readahead => Readahead, + SYS_setxattr => Setxattr, + SYS_lsetxattr => Lsetxattr, + SYS_fsetxattr => Fsetxattr, + SYS_getxattr => Getxattr, + SYS_lgetxattr => Lgetxattr, + SYS_fgetxattr => Fgetxattr, + SYS_listxattr => Listxattr, + SYS_llistxattr => Llistxattr, + SYS_flistxattr => Flistxattr, + SYS_removexattr => Removexattr, + SYS_lremovexattr => Lremovexattr, + SYS_fremovexattr => Fremovexattr, + SYS_tkill => Tkill, + 
SYS_time => Time, + SYS_futex => Futex, + SYS_sched_setaffinity => SchedSetaffinity, + SYS_sched_getaffinity => SchedGetaffinity, + SYS_set_thread_area => SetThreadArea, + SYS_io_setup => IoSetup, + SYS_io_destroy => IoDestroy, + SYS_io_getevents => IoGetevents, + SYS_io_submit => IoSubmit, + SYS_io_cancel => IoCancel, + SYS_get_thread_area => GetThreadArea, + SYS_lookup_dcookie => LookupDcookie, + SYS_epoll_create => EpollCreate, + SYS_epoll_ctl_old => EpollCtlOld, + SYS_epoll_wait_old => EpollWaitOld, + SYS_remap_file_pages => RemapFilePages, + SYS_getdents64 => Getdents64, + SYS_set_tid_address => SetTidAddress, + SYS_restart_syscall => RestartSyscall, + SYS_semtimedop => Semtimedop, + SYS_fadvise64 => Fadvise64, + SYS_timer_create => TimerCreate, + SYS_timer_settime => TimerSettime, + SYS_timer_gettime => TimerGettime, + SYS_timer_getoverrun => TimerGetoverrun, + SYS_timer_delete => TimerDelete, + SYS_clock_settime => ClockSettime, + SYS_clock_gettime => ClockGettime, + SYS_clock_getres => ClockGetres, + SYS_clock_nanosleep => ClockNanosleep, + SYS_exit_group => ExitGroup, + SYS_epoll_wait => EpollWait, + SYS_epoll_ctl => EpollCtl, + SYS_tgkill => Tgkill, + SYS_utimes => Utimes, + SYS_vserver => Vserver, + SYS_mbind => Mbind, + SYS_set_mempolicy => SetMempolicy, + SYS_get_mempolicy => GetMempolicy, + SYS_mq_open => MqOpen, + SYS_mq_unlink => MqUnlink, + SYS_mq_timedsend => MqTimedsend, + SYS_mq_timedreceive => MqTimedreceive, + SYS_mq_notify => MqNotify, + SYS_mq_getsetattr => MqGetsetattr, + SYS_kexec_load => KexecLoad, + SYS_waitid => Waitid, + SYS_add_key => AddKey, + SYS_request_key => RequestKey, + SYS_keyctl => Keyctl, + SYS_ioprio_set => IoprioSet, + SYS_ioprio_get => IoprioGet, + SYS_inotify_init => InotifyInit, + SYS_inotify_add_watch => InotifyAddWatch, + SYS_inotify_rm_watch => InotifyRmWatch, + SYS_migrate_pages => MigratePages, + SYS_openat => Openat, + SYS_mkdirat => Mkdirat, + SYS_mknodat => Mknodat, + SYS_fchownat => Fchownat, + SYS_futimesat => 
Futimesat, + SYS_newfstatat => Newfstatat, + SYS_unlinkat => Unlinkat, + SYS_renameat => Renameat, + SYS_linkat => Linkat, + SYS_symlinkat => Symlinkat, + SYS_readlinkat => Readlinkat, + SYS_fchmodat => Fchmodat, + SYS_faccessat => Faccessat, + SYS_pselect6 => Pselect6, + SYS_ppoll => Ppoll, + SYS_unshare => Unshare, + SYS_set_robust_list => SetRobustList, + SYS_get_robust_list => GetRobustList, + SYS_splice => Splice, + SYS_tee => Tee, + SYS_sync_file_range => SyncFileRange, + SYS_vmsplice => Vmsplice, + SYS_move_pages => MovePages, + SYS_utimensat => Utimensat, + SYS_epoll_pwait => EpollPwait, + SYS_signalfd => Signalfd, + SYS_timerfd_create => TimerfdCreate, + SYS_eventfd => Eventfd, + SYS_fallocate => Fallocate, + SYS_timerfd_settime => TimerfdSettime, + SYS_timerfd_gettime => TimerfdGettime, + SYS_accept4 => Accept4, + SYS_signalfd4 => Signalfd4, + SYS_eventfd2 => Eventfd2, + SYS_epoll_create1 => EpollCreate1, + SYS_dup3 => Dup3, + SYS_pipe2 => Pipe2, + SYS_inotify_init1 => InotifyInit1, + SYS_preadv => Preadv, + SYS_pwritev => Pwritev, + SYS_rt_tgsigqueueinfo => RtTgsigqueueinfo, + SYS_perf_event_open => PerfEventOpen, + SYS_recvmmsg => Recvmmsg, + SYS_fanotify_init => FanotifyInit, + SYS_fanotify_mark => FanotifyMark, + SYS_prlimit64 => Prlimit64, + SYS_name_to_handle_at => NameToHandleAt, + SYS_open_by_handle_at => OpenByHandleAt, + SYS_clock_adjtime => ClockAdjtime, + SYS_syncfs => Syncfs, + SYS_sendmmsg => Sendmmsg, + SYS_setns => Setns, + SYS_getcpu => Getcpu, + SYS_process_vm_readv => ProcessVmReadv, + SYS_process_vm_writev => ProcessVmWritev, + SYS_kcmp => Kcmp, + SYS_finit_module => FinitModule, + SYS_sched_setattr => SchedSetattr, + SYS_sched_getattr => SchedGetattr, + SYS_renameat2 => Renameat2, + SYS_seccomp => Seccomp, + SYS_getrandom => Getrandom, + SYS_memfd_create => MemfdCreate, + SYS_kexec_file_load => KexecFileLoad, + SYS_bpf => Bpf, + SYS_execveat => Execveat, + SYS_userfaultfd => Userfaultfd, + SYS_membarrier => Membarrier, + SYS_mlock2 => 
Mlock2, + SYS_copy_file_range => CopyFileRange, + SYS_preadv2 => Preadv2, + SYS_pwritev2 => Pwritev2, + SYS_pkey_mprotect => PkeyMprotect, + SYS_pkey_alloc => PkeyAlloc, + SYS_pkey_free => PkeyFree, + SYS_statx => Statx, + } +} + +typed_syscall! { + pub struct Read { + fd: i32, + // TODO: Change this to a slice and print out part of the contents of + // the read. + buf: Option>, + len: usize, + } +} + +typed_syscall! { + pub struct Write { + fd: i32, + // TODO: Change this to a slice and print out part of the contents of + // the write (after the syscall has been executed). + buf: Option>, + len: usize, + } +} + +fn get_mode(flags: OFlag, mode: u64) -> Option { + if flags.intersects(OFlag::O_CREAT | OFlag::O_TMPFILE) { + Some(FromToRaw::from_raw(mode)) + } else { + None + } +} + +typed_syscall! { + pub struct Open -> i32 { + path: Option, + flags: OFlag, + + /// The mode is only present when `O_CREAT` or `O_TMPFILE` is specified + /// in the flags. It is ignored otherwise. + mode?: { + fn get(&self) -> Option { + get_mode(self.flags(), self.raw.arg2) + } + + fn set(mut self, v: Option) -> Self { + self.raw.arg2 = v.into_raw(); + self + } + }, + } +} + +impl From for Open { + /// A call to creat() is equivalent to calling open() with flags equal to + /// O_CREAT|O_WRONLY|O_TRUNC + fn from(creat: Creat) -> Self { + let Creat { mut raw } = creat; + raw.arg2 = raw.arg1; + raw.arg1 = (libc::O_CREAT | libc::O_WRONLY | libc::O_TRUNC) as u64; + Open { raw } + } +} + +typed_syscall! { + pub struct Close { + fd: i32, + } +} + +typed_syscall! { + pub struct Stat { + path: Option, + stat: Option, + } +} + +typed_syscall! { + pub struct Fstat { + fd: i32, + stat: Option, + } +} + +typed_syscall! { + pub struct Lstat { + path: Option, + stat: Option, + } +} + +typed_syscall! { + pub struct Poll { + fds: Option>, + nfds: libc::nfds_t, + timeout: libc::c_int, + } +} + +typed_syscall! 
{ + pub struct Mmap { + addr: Option>, + len: usize, + prot: ProtFlags, + flags: MapFlags, + fd: i32, + offset: libc::off_t, + } +} + +typed_syscall! { + pub struct Lseek { + fd: i32, + offset: libc::off_t, + whence: Whence, + } +} + +typed_syscall! { + pub struct Mprotect { + addr: Option>, + len: usize, + protection: ProtFlags, + } +} + +typed_syscall! { + pub struct Munmap { + addr: Option>, + len: usize, + } +} + +typed_syscall! { + pub struct Brk { + addr: Option>, + } +} + +typed_syscall! { + pub struct RtSigaction { + signum: i32, + action: Option>, + old_action: Option>, + /// Should always be 8 (`size_of::()`). + sigsetsize: usize, + } +} + +typed_syscall! { + pub struct RtSigprocmask { + how: i32, + set: Option>, + oldset: Option>, + /// Should always be 8 (`size_of::()`). + sigsetsize: usize, + } +} + +typed_syscall! { + pub struct RtSigreturn { + } +} + +typed_syscall! { + pub struct Ioctl { + fd: i32, + request: { + fn get(&self) -> ioctl::Request { + ioctl::Request::from_raw(self.raw.arg1, self.raw.arg2) + } + + fn set(mut self, v: ioctl::Request) -> Self { + let (request, arg) = v.into_raw(); + self.raw.arg1 = request; + self.raw.arg2 = arg; + self + } + }, + } +} + +typed_syscall! { + pub struct Pread64 { + fd: i32, + // TODO: Change this to a slice and print out part of the contents of + // the read. + buf: Option>, + len: usize, + offset: libc::off_t, + } +} + +typed_syscall! { + pub struct Pwrite64 { + fd: i32, + // TODO: Change this to a slice and print out part of the contents of + // the write. + buf: Option>, + len: usize, + offset: libc::off_t, + } +} + +typed_syscall! { + pub struct Readv { + fd: i32, + iov: Option>, + len: usize, + } +} + +typed_syscall! { + pub struct Writev { + fd: i32, + iov: Option>, + len: usize, + } +} + +typed_syscall! { + pub struct Access { + path: Option, + mode: Mode, + } +} + +typed_syscall! { + pub struct Pipe { + pipefd: Option>, + } +} + +typed_syscall! 
{ + pub struct Select { + nfds: i32, + readfds: Option>, + writefds: Option>, + exceptfds: Option>, + timeout: Option>, + } +} + +typed_syscall! { + pub struct SchedYield {} +} + +typed_syscall! { + pub struct Mremap { + addr: Option>, + old_len: usize, + new_len: usize, + flags: usize, + new_addr: Option>, + } +} + +typed_syscall! { + pub struct Msync { + addr: Option>, + len: usize, + flags: i32, + } +} + +typed_syscall! { + pub struct Mincore { + addr: Option>, + len: usize, + vec: Option>, + } +} + +typed_syscall! { + pub struct Madvise { + addr: Option>, + len: usize, + advice: i32, + } +} + +typed_syscall! { + pub struct Shmget { + key: libc::key_t, + size: usize, + shmflg: i32, + } +} + +typed_syscall! { + pub struct Shmat { + shmid: i32, + shmaddr: Option>, + shmflg: i32, + } +} + +typed_syscall! { + pub struct Shmctl { + shmid: i32, + cmd: i32, + buf: Option>, + } +} + +typed_syscall! { + pub struct Dup { + oldfd: i32, + } +} + +typed_syscall! { + pub struct Dup2 { + oldfd: i32, + newfd: i32, + } +} + +typed_syscall! { pub struct Pause {} } + +typed_syscall! { + pub struct Nanosleep { + req: Option>, + rem: Option>, + } +} + +typed_syscall! { + pub struct Getitimer { + which: i32, + value: Option>, + } +} + +typed_syscall! { + pub struct Alarm { + seconds: u32, + } +} + +typed_syscall! { + pub struct Setitimer { + which: i32, + value: Option>, + ovalue: Option>, + } +} + +typed_syscall! { + pub struct Getpid {} +} + +typed_syscall! { + pub struct Sendfile { + out_fd: i32, + in_fd: i32, + offset: Option>, + count: usize, + } +} + +typed_syscall! { + // TODO: Give more meaningful types to these arguments. + pub struct Socket { + family: i32, + r#type: i32, + protocol: i32, + } +} + +typed_syscall! { + pub struct Connect { + fd: i32, + uservaddr: Option>, + addrlen: i32, + } +} + +typed_syscall! { + pub struct Accept { + sockfd: i32, + sockaddr: Option>, + addrlen: Option>, + } +} + +typed_syscall! 
{ + pub struct Sendto { + fd: i32, + buf: Option>, + size: usize, + flags: u32, + addr: Option>, + addr_len: i32, + } +} + +typed_syscall! { + pub struct Recvfrom { + fd: i32, + buf: Option>, + len: usize, + flags: i32, + addr: Option>, + addr_len: Option>, + } +} + +typed_syscall! { + pub struct Sendmsg { + fd: i32, + msg: Option>, + flags: i32, + } +} + +typed_syscall! { + pub struct Recvmsg { + sockfd: i32, + msg: Option>, + flags: i32, + } +} + +typed_syscall! { + pub struct Shutdown { + fd: i32, + how: i32, + } +} + +typed_syscall! { + pub struct Bind { + fd: i32, + umyaddr: Option>, + addrlen: i32, + } +} + +typed_syscall! { + pub struct Listen { + fd: i32, + backlog: i32, + } +} + +typed_syscall! { + pub struct Getsockname { + fd: i32, + usockaddr: Option>, + usockaddr_len: Option>, + } +} + +typed_syscall! { + pub struct Getpeername { + fd: i32, + usockaddr: Option>, + usockaddr_len: Option>, + } +} + +typed_syscall! { + // TODO: Give more meaningful types to these arguments. + pub struct Socketpair { + family: i32, + r#type: i32, + protocol: i32, + usockvec: Option>, + } +} + +typed_syscall! { + pub struct Setsockopt { + fd: i32, + level: i32, + optname: i32, + optval: Option>, + optlen: libc::socklen_t, + } +} + +typed_syscall! { + pub struct Getsockopt { + fd: i32, + level: i32, + optname: i32, + optval: Option>, + optlen: Option>, + } +} + +#[cfg(any(target_arch = "x86_64"))] +typed_syscall! { + pub struct Clone { + flags: CloneFlags, + child_stack: Option>, + ptid: Option>, + ctid: Option>, + newtls: u64, + } +} + +#[cfg(any(target_arch = "arm", target_arch = "aarch64", target_arch = "x86"))] +typed_syscall! { + pub struct Clone { + flags: CloneFlags, + child_stack: Option>, + ptid: Option>, + newtls: u64, + ctid: Option>, + } +} + +impl From for Clone { + /// Since `clone` offers a superset of functionality over `vfork`, a `vfork` + /// syscall can be transformed into a `clone` syscall by passing in the + /// right flags. 
In fact, this is how the Linux kernel implements `vfork`. + /// See kernel/fork.c for more details. + fn from(_: Vfork) -> Self { + let raw = SyscallArgs { + arg0: (libc::CLONE_VFORK | libc::CLONE_VM | libc::SIGCHLD) as u64, + arg1: 0, + arg2: 0, + arg3: 0, + arg4: 0, + arg5: 0, + }; + Self { raw } + } +} + +impl From for Clone { + /// Since `clone` offers a superset of functionality over `fork`, a `fork` + /// syscall can be transformed into a `clone` syscall by passing in the + /// right flags. In fact, this is how the Linux kernel implements `fork`. + /// See kernel/fork.c for more details. + fn from(_: Fork) -> Self { + let raw = SyscallArgs { + arg0: libc::SIGCHLD as u64, + arg1: 0, + arg2: 0, + arg3: 0, + arg4: 0, + arg5: 0, + }; + Self { raw } + } +} + +typed_syscall! { + pub struct Fork {} +} + +typed_syscall! { + pub struct Vfork {} +} + +typed_syscall! { + pub struct Execve { + path: Option, + argv: Option>, + envp: Option>, + } +} + +typed_syscall! { + pub struct Exit { + status: libc::c_int, + } +} + +typed_syscall! { + pub struct Wait4 { + pid: libc::pid_t, + wstatus: Option>, + options: WaitPidFlag, + rusage: Option>, + } +} + +typed_syscall! { + pub struct Kill { + pid: libc::pid_t, + // TODO: Change the signal to a type that prints out the signal passed + // to it. + sig: libc::c_int, + } +} + +typed_syscall! { + pub struct Uname { + buf: Option>, + } +} + +typed_syscall! { + pub struct Semget { + key: libc::key_t, + nsems: i32, + semflg: i32, + } +} + +typed_syscall! { + pub struct Semop { + semid: i32, + tsops: Option>, + nsops: usize, + } +} + +typed_syscall! { + pub struct Semctl { + semid: i32, + semnum: i32, + cmd: i32, + arg: u64, + } +} + +typed_syscall! { + pub struct Shmdt { + shmaddr: Option>, + } +} + +typed_syscall! { + pub struct Msgget { + key: libc::key_t, + msgflg: i32, + } +} + +typed_syscall! { + pub struct Msgsnd { + msqid: i32, + msgp: Option>, + msgsz: usize, + msgflg: i32, + } +} + +typed_syscall! 
{ + pub struct Msgrcv { + msqid: i32, + msgp: Option>, + msgsz: usize, + msgtyp: libc::c_long, + msgflg: i32, + } +} + +typed_syscall! { + pub struct Msgctl { + msqid: i32, + cmd: i32, + buf: Option>, + } +} + +typed_syscall! { + pub struct Fcntl { + /// The file descriptor to perform the operation on. + fd: i32, + + cmd: { + fn get(&self) -> FcntlCmd { + FcntlCmd::from_raw(self.raw.arg1 as libc::c_int, self.raw.arg2) + } + + fn set(mut self, v: FcntlCmd) -> Self { + let (cmd, arg) = v.into_raw(); + self.raw.arg1 = cmd as u64; + self.raw.arg2 = arg; + self + } + }, + } +} + +typed_syscall! { + pub struct Flock { + fd: i32, + // TODO: Give this a more restricted type. + operation: i32, + } +} + +typed_syscall! { + pub struct Fsync { + fd: i32, + } +} + +typed_syscall! { + pub struct Fdatasync { + fd: i32, + } +} + +typed_syscall! { + pub struct Truncate { + path: Option, + length: libc::off_t, + } +} + +typed_syscall! { + pub struct Ftruncate { + fd: i32, + length: libc::off_t, + } +} + +typed_syscall! { + pub struct Getdents { + fd: u32, + dirent: Option>, + count: u32, + } +} + +typed_syscall! { + pub struct Getcwd { + // TODO: Replace this with a PathPtrMut. + buf: Option>, + size: usize, + } +} + +typed_syscall! { + pub struct Chdir { + path: Option, + } +} + +typed_syscall! { + pub struct Fchdir { + fd: i32, + } +} + +typed_syscall! { + pub struct Rename { + oldpath: Option, + newpath: Option, + } +} + +typed_syscall! { + pub struct Mkdir { + path: Option, + mode: Mode, + } +} + +typed_syscall! { + pub struct Rmdir { + path: Option, + } +} + +typed_syscall! { + pub struct Creat { + path: Option, + mode: Mode, + } +} + +typed_syscall! { + pub struct Link { + oldpath: Option, + newpath: Option, + } +} + +typed_syscall! { + pub struct Unlink { + path: Option, + } +} + +typed_syscall! { + pub struct Symlink { + target: Option, + linkpath: Option, + } +} + +typed_syscall! 
{ + pub struct Readlink { + path: Option, + // TODO: Replace this with a PathPtrMut + buf: Option>, + bufsize: usize, + } +} + +typed_syscall! { + pub struct Chmod { + path: Option, + mode: Mode, + } +} + +typed_syscall! { + pub struct Fchmod { + fd: i32, + mode: Mode, + } +} + +typed_syscall! { + pub struct Chown { + path: Option, + owner: libc::uid_t, + group: libc::gid_t, + } +} + +typed_syscall! { + pub struct Fchown { + fd: i32, + owner: libc::uid_t, + group: libc::gid_t, + } +} + +typed_syscall! { + pub struct Lchown { + path: Option, + owner: libc::uid_t, + group: libc::gid_t, + } +} + +typed_syscall! { + pub struct Umask { + mask: Mode, + } +} + +typed_syscall! { + pub struct Gettimeofday { + tv: Option>, + tz: Option>, + } +} + +typed_syscall! { + pub struct Getrlimit { + resource: i32, + rlim: Option>, + } +} + +typed_syscall! { + pub struct Getrusage { + who: i32, + usage: Option>, + } +} + +typed_syscall! { + pub struct Sysinfo { + info: Option>, + } +} + +typed_syscall! { + pub struct Times -> libc::clock_t { + buf: Option>, + } +} + +typed_syscall! { + pub struct Ptrace { + request: u32, + pid: libc::pid_t, + addr: Option>, + data: Option>, + } +} + +typed_syscall! { pub struct Getuid {} } + +typed_syscall! { + pub struct Syslog { + priority: i32, + buf: Option>, + len: usize, + } +} + +typed_syscall! { pub struct Getgid {} } +typed_syscall! { pub struct Setuid { uid: libc::uid_t, } } +typed_syscall! { pub struct Setgid { uid: libc::gid_t, } } +typed_syscall! { pub struct Geteuid {} } +typed_syscall! { pub struct Getegid {} } +typed_syscall! { pub struct Setpgid { pid: libc::pid_t, pgid: libc::pid_t, } } +typed_syscall! { pub struct Getppid {} } +typed_syscall! { pub struct Getpgrp {} } + +typed_syscall! { pub struct Setsid {} } +typed_syscall! { pub struct Setreuid { ruid: libc::uid_t, euid: libc::uid_t, } } +typed_syscall! { pub struct Setregid { rgid: libc::gid_t, egid: libc::gid_t, } } + +typed_syscall! 
{ + pub struct Getgroups { + // TODO: Make this a slice. + size: i32, + list: Option>, + } +} + +typed_syscall! { + pub struct Setgroups { + // TODO: Make this a slice. + size: usize, + list: Option>, + } +} + +typed_syscall! { + pub struct Setresuid { + ruid: libc::uid_t, + euid: libc::uid_t, + suid: libc::uid_t, + } +} +typed_syscall! { + pub struct Getresuid { + ruid: Option>, + euid: Option>, + suid: Option>, + } +} +typed_syscall! { + pub struct Setresgid { + rgid: libc::gid_t, + egid: libc::gid_t, + sgid: libc::gid_t, + } +} +typed_syscall! { + pub struct Getresgid { + rgid: Option>, + egid: Option>, + sgid: Option>, + } +} + +typed_syscall! { + /// `getpgid(2)`: looks up the process group ID of the process `pid`. + pub struct Getpgid { + /// The process to query. Per `getpgid(2)`, 0 means the calling process. + pid: libc::pid_t, + } +} + +typed_syscall! { + /// `setfsuid(2)`: sets the user ID used for filesystem access checks. + pub struct Setfsuid { + /// The new filesystem user ID. + fsuid: libc::uid_t, + } +} + +typed_syscall! { + /// `setfsgid(2)`: sets the group ID used for filesystem access checks. + pub struct Setfsgid { + /// The new filesystem group ID. + fsgid: libc::gid_t, + } +} + +typed_syscall! { + /// `getsid(2)`: looks up the session ID of the process `pid`. + pub struct Getsid { + /// The process to query. Per `getsid(2)`, 0 means the calling process. + pid: libc::pid_t, + } +} + +typed_syscall! { + pub struct Capget { + header: Option>, + data: Option>, + } +} + +typed_syscall! { + pub struct Capset { + header: Option>, + data: Option>, + } +} + +typed_syscall! { + pub struct RtSigpending { + set: Option>, + /// Should always be 8 (`size_of::()`). + sigsetsize: usize, + } +} + +typed_syscall! { + pub struct RtSigtimedwait { + set: Option>, + info: Option>, + timeout: Option>, + /// Should always be 8 (`size_of::()`). + sigsetsize: usize, + } +} + +typed_syscall! { + pub struct RtSigqueueinfo { + tgid: libc::pid_t, + sig: i32, + siginfo: Option>, + } +} + +typed_syscall! { + pub struct RtSigsuspend { + mask: Option>, + /// Should always be 8 (`size_of::()`). + sigsetsize: usize, + } +} + +typed_syscall! { + pub struct Sigaltstack { + ss: Option>, + old_ss: Option>, + } +} + +typed_syscall! { + pub struct Utime { + path: Option, + times: Option>, + } +} + +typed_syscall! { + pub struct Mknod { + path: Option, + mode: Mode, + dev: libc::dev_t, + } +} + +typed_syscall! { + pub struct Uselib { + library: Option, + } +} + +typed_syscall! { + pub struct Personality { + persona: u64, + } +} + +typed_syscall!
{ + pub struct Ustat { + dev: libc::dev_t, + // TODO: Change this to libc::ustat if/when it exists. + ubuf: Option>, + } +} + +typed_syscall! { + pub struct Statfs { + path: Option, + buf: Option>, + } +} + +typed_syscall! { + pub struct Fstatfs { + fd: i32, + buf: Option>, + } +} + +typed_syscall! { + pub struct Sysfs { + option: i32, + arg1: u64, + arg2: u64, + } +} + +typed_syscall! { + pub struct Getpriority { + which: i32, + who: libc::id_t, + } +} + +typed_syscall! { + pub struct Setpriority { + which: i32, + who: libc::id_t, + prio: i32, + } +} + +typed_syscall! { + pub struct SchedSetparam { + pid: libc::pid_t, + param: Option>, + } +} + +typed_syscall! { + pub struct SchedGetparam { + pid: libc::pid_t, + param: Option>, + } +} + +typed_syscall! { + pub struct SchedSetscheduler { + pid: libc::pid_t, + policy: i32, + param: Option>, + } +} + +typed_syscall! { + pub struct SchedGetscheduler { + pid: libc::pid_t, + } +} + +typed_syscall! { + pub struct SchedGetPriorityMax { + policy: i32, + } +} + +typed_syscall! { + pub struct SchedGetPriorityMin { + policy: i32, + } +} + +typed_syscall! { + pub struct SchedRrGetInterval { + pid: libc::pid_t, + tp: Option>, + } +} + +typed_syscall! { + pub struct Mlock { + addr: Option>, + len: usize, + } +} + +typed_syscall! { + pub struct Munlock { + addr: Option>, + len: usize, + } +} + +typed_syscall! { + pub struct Mlockall { + flags: i32, + } +} + +typed_syscall! { pub struct Munlockall {} } + +typed_syscall! { pub struct Vhangup {} } + +typed_syscall! { + pub struct ModifyLdt { + func: i32, + ptr: Option>, + bytecount: u64, + } +} + +typed_syscall! { + pub struct PivotRoot { + new_root: Option, + put_old: Option, + } +} + +typed_syscall! { + #[allow(non_camel_case_types)] + pub struct _sysctl { + // TODO: Use _sysctl_args struct. + args: Option>, + } +} + +typed_syscall! { + pub struct Prctl { + option: i32, + arg2: u64, + arg3: u64, + arg4: u64, + arg5: u64, + } +} + +typed_syscall! 
{ + pub struct ArchPrctl { + cmd: { + fn get(&self) -> ArchPrctlCmd { + ArchPrctlCmd::from_raw(self.raw.arg0 as i32, self.raw.arg1) + } + + fn set(mut self, v: ArchPrctlCmd) -> Self { + let (cmd, arg) = v.into_raw(); + self.raw.arg0 = cmd as u64; + self.raw.arg1 = arg; + self + } + }, + } +} + +typed_syscall! { + pub struct Adjtimex { + buf: Option>, + } +} + +typed_syscall! { + pub struct Setrlimit { + resource: i32, + rlim: Option>, + } +} + +typed_syscall! { + pub struct Chroot { + path: Option, + } +} + +typed_syscall! { pub struct Sync {} } + +typed_syscall! { + pub struct Acct { + filename: Option, + } +} + +typed_syscall! { + pub struct Settimeofday { + tv: Option>, + tz: Option>, + } +} + +typed_syscall! { + pub struct Mount { + source: Option, + target: Option, + filesystemtype: Option, + flags: u64, + data: Option>, + } +} + +typed_syscall! { + pub struct Umount2 { + target: Option, + flags: i32, + } +} + +typed_syscall! { + pub struct Swapon { + path: Option, + swapflags: i32, + } +} + +typed_syscall! { + pub struct Swapoff { + path: Option, + } +} + +typed_syscall! { + pub struct Reboot { + magic1: i32, + magic2: i32, + cmd: u32, + arg: Option>, + } +} + +typed_syscall! { + pub struct Sethostname { + name: Option>, + len: usize, + } +} + +typed_syscall! { + pub struct Setdomainname { + name: Option>, + len: usize, + } +} + +typed_syscall! { + pub struct Iopl { + level: u32, + } +} + +typed_syscall! { + pub struct Ioperm { + from: u64, + num: u64, + turn_on: i32, + } +} + +typed_syscall! { + /// Note: This system call is present only in kernels before Linux 2.6. + pub struct CreateModule { + name: Option>, + len: usize, + } +} + +typed_syscall! { + pub struct InitModule { + module_image: Option>, + len: usize, + param_values: Option, + } +} + +typed_syscall! { + pub struct DeleteModule { + name: Option, + flags: i32, + } +} + +typed_syscall! { + /// Note: This system call is present only in kernels before Linux 2.6. 
+ pub struct GetKernelSyms { + table: Option>, + } +} + +typed_syscall! { + pub struct QueryModule { + name: Option, + which: i32, + buf: Option>, + bufsize: usize, + ret: Option>, + } +} + +typed_syscall! { + pub struct Quotactl { + cmd: i32, + special: Option, + id: i32, + addr: Option>, + } +} + +typed_syscall! { + /// Note: Since Linux 3.1, this system call no longer exists. It has been + /// replaced by a set of files in the nfsd filesystem; see `nfsd(7)`. + pub struct Nfsservctl { + cmd: i32, + argp: Option>, + resp: Option>, + } +} + +typed_syscall! { + /// Unimplemented in the kernel. + pub struct Getpmsg {} +} + +typed_syscall! { + /// Unimplemented in the kernel. + pub struct Putpmsg {} +} + +typed_syscall! { + /// Unimplemented in the kernel. + pub struct AfsSyscall {} +} + +typed_syscall! { + /// Unimplemented in the kernel. + pub struct Tuxcall {} +} + +typed_syscall! { + /// Unimplemented in the kernel. + pub struct Security {} +} + +typed_syscall! { pub struct Gettid {} } + +typed_syscall! { + pub struct Readahead { + fd: i32, + offset: libc::loff_t, + count: usize, + } +} + +typed_syscall! { + pub struct Setxattr { + path: Option, + name: Option, + value: Option>, + size: usize, + flags: i32, + } +} + +typed_syscall! { + pub struct Lsetxattr { + path: Option, + name: Option, + value: Option>, + size: usize, + flags: i32, + } +} + +typed_syscall! { + pub struct Fsetxattr { + fd: i32, + name: Option, + value: Option>, + size: usize, + flags: i32, + } +} + +typed_syscall! { + pub struct Getxattr { + path: Option, + name: Option, + value: Option>, + size: usize, + } +} + +typed_syscall! { + pub struct Lgetxattr { + path: Option, + name: Option, + value: Option>, + size: usize, + } +} + +typed_syscall! { + pub struct Fgetxattr { + fd: i32, + name: Option, + value: Option>, + size: usize, + } +} + +typed_syscall! { + pub struct Listxattr { + path: Option, + list: Option>, + size: usize, + } +} + +typed_syscall! 
{ + pub struct Llistxattr { + path: Option, + list: Option>, + size: usize, + } +} + +typed_syscall! { + pub struct Flistxattr { + fd: i32, + list: Option>, + size: usize, + } +} + +typed_syscall! { + pub struct Removexattr { + path: Option, + name: Option, + } +} + +typed_syscall! { + pub struct Lremovexattr { + path: Option, + name: Option, + } +} + +typed_syscall! { + pub struct Fremovexattr { + fd: i32, + name: Option, + } +} + +typed_syscall! { + pub struct Tkill { + tid: libc::pid_t, + sig: libc::c_int, + } +} + +typed_syscall! { + pub struct Time { + tloc: Option>, + } +} + +typed_syscall! { + // TODO: Wrap each futex operation in a type, similar to fcntl and ioctl. + pub struct Futex { + uaddr: Option>, + futex_op: libc::c_int, + val: libc::c_int, + timeout: Option>, + uaddr2: Option>, + val3: libc::c_int, + } +} + +typed_syscall! { + pub struct SchedSetaffinity { + pid: libc::pid_t, + len: u32, + mask: Option>, + } +} + +typed_syscall! { + pub struct SchedGetaffinity { + pid: libc::pid_t, + len: u32, + mask: Option>, + } +} + +typed_syscall! { + pub struct SetThreadArea { + addr: Option>, + } +} + +typed_syscall! { + pub struct IoSetup { + nr_events: u32, + context: Option>, + } +} + +typed_syscall! { + pub struct IoDestroy { + context: libc::c_ulong, + } +} + +typed_syscall! { + pub struct IoGetevents { + context: libc::c_ulong, + min_nr: libc::c_long, + nr: libc::c_long, + // FIXME: This should be a pointer to an `io_event`. + events: Option>, + timeout: Option>, + } +} + +typed_syscall! { + pub struct IoSubmit { + context: libc::c_ulong, + nr: libc::c_long, + // FIXME: This should be a pointer to a pointer of `iocb`. + iocb: Option>>, + } +} + +typed_syscall! { + pub struct IoCancel { + context: libc::c_ulong, + iocb: Option>, + // FIXME: This should be a pointer to an `io_event`. + result: Option>, + } +} + +typed_syscall! { + pub struct GetThreadArea { + addr: Option>, + } +} + +typed_syscall! 
{ + pub struct LookupDcookie { + cookie: u64, + buf: Option>, + len: usize, + } +} + +typed_syscall! { + pub struct EpollCreate { + /// The kernel doesn't actually use this parameter, but it must be + /// greater than 0. (It was used as a size hint at one point in time.) + size: i32, + } +} + +typed_syscall! { + /// Undocumented. + pub struct EpollCtlOld {} +} + +typed_syscall! { + /// Undocumented. + pub struct EpollWaitOld {} +} + +typed_syscall! { + pub struct RemapFilePages { + addr: Option>, + size: u64, + prot: i32, + pgoff: usize, + flags: i32, + } +} + +typed_syscall! { + pub struct Getdents64 { + fd: u32, + dirent: Option>, + count: u32, + } +} + +typed_syscall! { + pub struct SetTidAddress { + tidptr: Option>, + } +} + +typed_syscall! { pub struct RestartSyscall { } } + +typed_syscall! { + pub struct Semtimedop { + semid: i32, + tsops: Option>, + nsops: u32, + timeout: Option>, + } +} + +typed_syscall! { + pub struct Fadvise64 { + fd: i32, + offset: libc::loff_t, + len: usize, + advice: i32, + } +} + +typed_syscall! { + pub struct TimerCreate { + clockid: ClockId, + sevp: Option>, + timerid: Option>, + } +} + +typed_syscall! { + pub struct TimerSettime { + timerid: libc::c_int, + flags: i32, + new_value: Option>, + old_value: Option>, + } +} + +typed_syscall! { + pub struct TimerGettime { + timerid: libc::c_int, + value: Option>, + } +} + +typed_syscall! { + pub struct TimerGetoverrun { + timerid: libc::c_int, + } +} + +typed_syscall! { + pub struct TimerDelete { + timerid: libc::c_int, + } +} + +typed_syscall! { + pub struct ClockSettime { + clockid: ClockId, + tp: Option>, + } +} + +typed_syscall! { + pub struct ClockGettime { + clockid: ClockId, + tp: Option>, + } +} + +typed_syscall! { + pub struct ClockGetres { + clockid: ClockId, + res: Option>, + } +} + +typed_syscall! { + pub struct ClockNanosleep { + clockid: ClockId, + flags: i32, + req: Option>, + rem: Option>, + } +} + +typed_syscall! 
{ + pub struct ExitGroup { + status: libc::c_int, + } +} + +typed_syscall! { + pub struct EpollWait { + epfd: i32, + events: Option>, + maxevents: i32, + timeout: i32, // Milliseconds. + } +} + +typed_syscall! { + pub struct EpollCtl { + epfd: i32, + op: i32, + fd: i32, + event: Option>, + } +} + +typed_syscall! { + pub struct Tgkill { + tgid: libc::pid_t, + tid: libc::pid_t, + sig: libc::c_int, + } +} + +typed_syscall! { + pub struct Utimes { + filename: Option, + times: Option>, + } +} + +typed_syscall! { + /// Unimplemented in the kernel. + pub struct Vserver { } +} + +typed_syscall! { + pub struct Mbind { + addr: Option>, + len: u64, + mode: i32, + nodemask: Option>, + maxnode: u64, + flags: u32, + } +} + +typed_syscall! { + pub struct SetMempolicy { + mode: i32, + nodemask: Option>, + maxnode: u64, + } +} + +typed_syscall! { + pub struct GetMempolicy { + policy: Option>, + nodemask: Option>, + maxnode: u64, + addr: Option>, + flags: u32, + } +} + +typed_syscall! { + pub struct MqOpen { + name: Option, + oflag: i32, + mode: libc::mode_t, + attr: Option>, + } +} + +typed_syscall! { + pub struct MqUnlink { + name: Option, + } +} + +typed_syscall! { + pub struct MqTimedsend { + mqdes: libc::mqd_t, + msg: Option>, + msg_len: usize, + priority: u32, + timeout: Option>, + } +} + +typed_syscall! { + pub struct MqTimedreceive { + mqdes: libc::mqd_t, + msg: Option>, + msg_len: usize, + priority: u32, + timeout: Option>, + } +} + +typed_syscall! { + pub struct MqNotify { + mqdes: libc::mqd_t, + sevp: Option>, + } +} + +typed_syscall! { + pub struct MqGetsetattr { + mqdes: libc::mqd_t, + newattr: Option>, + oldattr: Option>, + } +} + +typed_syscall! { + pub struct KexecLoad { + entry: u64, + nr_segments: u64, + // FIXME: This should be a pointer to `kexec_segment`. + segments: Option>, + flags: u64, + } +} + +typed_syscall! { + pub struct Waitid { + which: i32, + pid: libc::pid_t, + info: Option>, + options: i32, + rusage: Option>, + } +} + +typed_syscall! 
{ + pub struct AddKey { + key_type: Option, + description: Option, + payload: Option>, + payload_len: usize, + keyring: libc::c_int, + } +} + +typed_syscall! { + pub struct RequestKey { + key_type: Option, + description: Option, + callout_info: Option, + dest_keyring: libc::c_int, + } +} + +typed_syscall! { + pub struct Keyctl { + option: i32, + arg2: u64, + arg3: u64, + arg4: u64, + arg6: u64, + } +} + +typed_syscall! { + pub struct IoprioSet { + which: i32, + who: i32, + priority: i32, + } +} + +typed_syscall! { + pub struct IoprioGet { + which: i32, + who: i32, + } +} + +typed_syscall! { pub struct InotifyInit {} } + +typed_syscall! { + pub struct InotifyAddWatch { + fd: i32, + path: Option, + mask: u32, + } +} + +typed_syscall! { + pub struct InotifyRmWatch { + fd: i32, + wd: i32, + } +} + +typed_syscall! { + pub struct MigratePages { + pid: libc::pid_t, + maxnode: u64, + old_nodes: Option>, + new_nodes: Option>, + } +} + +typed_syscall! { + pub struct Openat { + dirfd: i32, + path: Option, + flags: OFlag, + + /// The mode is only present when `O_CREAT` or `O_TMPFILE` is specified + /// in the flags. It is ignored otherwise. + mode?: { + fn get(&self) -> Option { + get_mode(self.flags(), self.raw.arg3) + } + + fn set(mut self, v: Mode) -> Self { + self.raw.arg3 = v.into_raw(); + self + } + }, + } +} + +impl From for Openat { + /// An `open` syscall can be trivially transformed into an `openat` + /// syscall by shifting all the arguments to the right and setting the first + /// argument to `AT_FDCWD` (the current working directory). 
+ fn from(open: Open) -> Self { + let Open { mut raw } = open; + raw.arg3 = raw.arg2; + raw.arg2 = raw.arg1; + raw.arg1 = raw.arg0; + raw.arg0 = libc::AT_FDCWD as u64; + Openat { raw } + } +} + +impl From for Openat { + /// A call to creat() is equivalent to calling open() with flags equal to + /// O_CREAT|O_WRONLY|O_TRUNC + fn from(creat: Creat) -> Self { + let Creat { mut raw } = creat; + raw.arg3 = raw.arg1; + raw.arg2 = (libc::O_CREAT | libc::O_WRONLY | libc::O_TRUNC) as u64; + raw.arg1 = raw.arg0; + raw.arg0 = libc::AT_FDCWD as u64; + Openat { raw } + } +} + +typed_syscall! { + pub struct Mkdirat { + dirfd: i32, + path: Option, + mode: Mode, + } +} + +impl From for Mkdirat { + /// An `mkdir` syscall can be trivially transformed into a `mkdirat` syscall + /// by shifting all the arguments to the right and setting the first argument + /// to `AT_FDCWD` (the current working directory). + fn from(syscall: Mkdir) -> Self { + let Mkdir { mut raw } = syscall; + raw.arg2 = raw.arg1; + raw.arg1 = raw.arg0; + raw.arg0 = libc::AT_FDCWD as u64; + Mkdirat { raw } + } +} + +typed_syscall! { + pub struct Mknodat { + dirfd: i32, + path: Option, + mode: Mode, + dev: libc::dev_t, + } +} + +impl From for Mknodat { + /// An `mknod` syscall can be trivially transformed into an `mknodat` syscall + /// by shifting all the arguments to the right and setting the first argument + /// to `AT_FDCWD` (the current working directory). + fn from(syscall: Mknod) -> Self { + let Mknod { mut raw } = syscall; + raw.arg3 = raw.arg2; + raw.arg2 = raw.arg1; + raw.arg1 = raw.arg0; + raw.arg0 = libc::AT_FDCWD as u64; + Mknodat { raw } + } +} + +typed_syscall! { + pub struct Fchownat { + dirfd: i32, + path: Option, + owner: libc::uid_t, + group: libc::gid_t, + flags: AtFlags, + } +} + +typed_syscall! { + pub struct Futimesat { + dirfd: i32, + path: Option, + utimes: Option>, + } +} + +typed_syscall! 
{ + pub struct Newfstatat { + dirfd: i32, + path: Option, + stat: Option, + flags: AtFlags, + } +} + +impl From for Newfstatat { + fn from(stat: Stat) -> Self { + let Stat { mut raw } = stat; + raw.arg3 = 0; + raw.arg2 = raw.arg1; + raw.arg1 = raw.arg0; + raw.arg0 = libc::AT_FDCWD as u64; + Newfstatat { raw } + } +} + +impl From for Newfstatat { + fn from(lstat: Lstat) -> Self { + let Lstat { mut raw } = lstat; + raw.arg3 = AtFlags::AT_SYMLINK_NOFOLLOW.bits() as u64; + raw.arg2 = raw.arg1; + raw.arg1 = raw.arg0; + raw.arg0 = libc::AT_FDCWD as u64; + Newfstatat { raw } + } +} + +typed_syscall! { + pub struct Unlinkat { + dirfd: i32, + path: Option, + flags: AtFlags, + } +} + +impl From for Unlinkat { + fn from(unlink: Unlink) -> Self { + let Unlink { mut raw } = unlink; + raw.arg2 = 0; + raw.arg1 = raw.arg0; + raw.arg0 = libc::AT_FDCWD as u64; + Unlinkat { raw } + } +} + +impl From for Unlinkat { + fn from(rmdir: Rmdir) -> Self { + let Rmdir { mut raw } = rmdir; + raw.arg2 = libc::AT_REMOVEDIR as u64; + raw.arg1 = raw.arg0; + raw.arg0 = libc::AT_FDCWD as u64; + Unlinkat { raw } + } +} + +typed_syscall! { + pub struct Renameat { + olddirfd: i32, + oldpath: Option, + newdirfd: i32, + newpath: Option, + } +} + +typed_syscall! { + pub struct Linkat { + olddirfd: i32, + oldpath: Option, + newdirfd: i32, + newpath: Option, + flags: AtFlags, + } +} + +impl From for Linkat { + /// A `link` syscall can be trivially transformed into a `linkat` syscall + /// by rearranging the `oldpath` and `newpath` arguments, + /// setting both old and new directory file descriptors to the special value + /// `AT_FDCWD` (indicating the current working directory), + /// and clearing the flags. + fn from(link: Link) -> Self { + let Link { mut raw } = link; + raw.arg3 = raw.arg1; + raw.arg1 = raw.arg0; + raw.arg0 = libc::AT_FDCWD as u64; + raw.arg2 = libc::AT_FDCWD as u64; + raw.arg4 = 0; + Linkat { raw } + } +} + +typed_syscall! 
{ + pub struct Symlinkat { + target: Option, + newdirfd: i32, + linkpath: Option, + } +} + +typed_syscall! { + pub struct Readlinkat { + dirfd: i32, + path: Option, + buf: Option>, + buf_len: usize, + } +} + +typed_syscall! { + pub struct Fchmodat { + dirfd: i32, + path: Option, + mode: Mode, + flags: AtFlags, + } +} + +typed_syscall! { + pub struct Faccessat { + dirfd: i32, + path: Option, + mode: Mode, + flags: AtFlags, + } +} + +typed_syscall! { + pub struct Pselect6 { + nfds: i32, + readfds: Option>, + writefds: Option>, + exceptfds: Option>, + timeout: Option>, + sigmask: Option>, + } +} + +typed_syscall! { + pub struct Ppoll { + fds: Option>, + nfds: libc::nfds_t, + timeout: Option>, + sigmask: Option>, + sigsetsize: usize, + } +} + +typed_syscall! { + pub struct Unshare { + flags: CloneFlags, + } +} + +typed_syscall! { + pub struct SetRobustList { + // FIXME: This should be pointer to `robust_list_head`. + head: Option>, + len: usize, + } +} + +typed_syscall! { + pub struct GetRobustList { + pid: libc::pid_t, + // FIXME: This should be pointer to `robust_list_head`. + head_ptr: Option>>, + len_ptr: Option>, + } +} + +typed_syscall! { + pub struct Splice { + fd_in: i32, + off_in: Option>, + fd_out: i32, + off_out: Option>, + len: usize, + flags: u32, + } +} + +typed_syscall! { + pub struct Tee { + fd_in: i32, + fd_out: i32, + len: usize, + flags: u32, + } +} + +typed_syscall! { + pub struct SyncFileRange { + fd: i32, + offset: libc::loff_t, + nbytes: libc::loff_t, + flags: u32, + } +} + +typed_syscall! { + pub struct Vmsplice { + fd: i32, + iov: Option>, + nr_segs: u64, + flags: u32, + } +} + +typed_syscall! { + pub struct MovePages { + pid: libc::pid_t, + nr_pages: u64, + pages: Option>>, + nodes: Option>>, + status: Option>, + flags: i32, + } +} + +typed_syscall! { + pub struct Utimensat { + dirfd: i32, + path: Option, + times: Option>, + flags: i32, + } +} + +typed_syscall! 
{ + pub struct EpollPwait { + epfd: i32, + events: Option>, + maxevents: i32, + timeout: i32, + sigmask: Option>, + sigsetsize: usize, + } +} + +typed_syscall! { + /// Naked signalfd(2) is not the same as the glibc wrapper. + /// See kernel fs/signalfd.c for more details. + /// NB: kernel_sigset_t is 8 bytes (sizeof usize), we still use + /// libc::sigset_t here because the kernel accesses only the 1st 8 bytes. + /// NB2: the glibc wrapper will call signalfd4(2) instead, so this + /// syscall is only possible when the user calls libc::syscall directly. + pub struct Signalfd { + fd: i32, + mask: Option>, + size: usize, + } +} + +typed_syscall! { + pub struct TimerfdCreate { + clockid: ClockId, + flags: TimerFlags, + } +} + +typed_syscall! { + pub struct Eventfd { + count: u32, + } +} + +typed_syscall! { + pub struct Fallocate { + fd: i32, + mode: i32, + offset: libc::loff_t, + len: libc::loff_t, + } +} + +typed_syscall! { + pub struct TimerfdSettime { + fd: i32, + flags: i32, + new_value: Option>, + old_value: Option>, + } +} + +typed_syscall! { + pub struct TimerfdGettime { + fd: i32, + value: Option>, + } +} + +typed_syscall! { + pub struct Accept4 { + sockfd: i32, + sockaddr: Option>, + addrlen: Option>, + flags: SockFlag, + } +} + +impl From for Accept4 { + /// If flags is 0, then accept4() is the same as accept(). + fn from(accept: Accept) -> Self { + let Accept { mut raw } = accept; + raw.arg3 = 0; + Accept4 { raw } + } +} + +typed_syscall! { + /// Naked signalfd4(2) is not the same as the glibc wrapper. + /// See kernel fs/signalfd.c for more details. + /// NB: kernel_sigset_t is 8 bytes (sizeof usize), we still use + /// libc::sigset_t here because the kernel accesses only the 1st 8 bytes. + pub struct Signalfd4 { + fd: i32, + mask: Option>, + size: usize, + flags: SfdFlags, + } +} + +impl From for Signalfd4 { + fn from(signalfd: Signalfd) -> Self { + let Signalfd { mut raw } = signalfd; + raw.arg3 = 0; + Signalfd4 { raw } + } +} + +typed_syscall! 
{ + pub struct Eventfd2 { + count: u32, + flags: EfdFlags, + } +} + +impl From for Eventfd2 { + /// eventfd2 provides an extra `flags` argument, so it's safe + /// to convert eventfd(2) to eventfd2(2) by setting `flags` to 0. + /// glibc should have wrapped all eventfd syscalls into eventfd2. + fn from(eventfd: Eventfd) -> Self { + let Eventfd { mut raw } = eventfd; + raw.arg1 = 0; + Eventfd2 { raw } + } +} + +typed_syscall! { + pub struct EpollCreate1 { + flags: EpollCreateFlags, + } +} + +impl From for EpollCreate1 { + /// `size` in epoll_create(2) is ignored but must be > 0 (since Linux 2.6.8). + /// We still allow converting `epoll_create` to `epoll_create1` by forcing + /// `flags` to 0. This changes behavior for invalid sizes such as + /// `epoll_create(-1)`, but that shouldn't be a real concern in practice. + fn from(epoll_create: EpollCreate) -> Self { + let EpollCreate { mut raw } = epoll_create; + raw.arg0 = 0; + EpollCreate1 { raw } + } +} + +typed_syscall! { + pub struct Dup3 { + oldfd: i32, + newfd: i32, + flags: OFlag, + } +} + +typed_syscall! { + pub struct Pipe2 { + pipefd: Option>, + flags: OFlag, + } +} + +impl From for Pipe2 { + /// If flags is 0, then pipe2() is the same as pipe(). + fn from(pipe: Pipe) -> Self { + let Pipe { mut raw } = pipe; + raw.arg1 = 0; + Pipe2 { raw } + } +} + +typed_syscall! { + pub struct InotifyInit1 { + flags: InitFlags, + } +} + +impl From for InotifyInit1 { + /// If flags is 0, then inotify_init1 is the same as inotify_init. + /// Note that inotify_init was introduced in 2.6.13 and inotify_init1 + /// was added in 2.6.27. + fn from(inotify_init: InotifyInit) -> Self { + let InotifyInit { mut raw } = inotify_init; + raw.arg0 = 0; + InotifyInit1 { raw } + } +} + +typed_syscall! { + pub struct Preadv { + fd: i32, + iov: Option>, + iov_len: usize, + pos_l: u64, + pos_h: u64, + } +} + +typed_syscall! { + pub struct Pwritev { + fd: i32, + iov: Option>, + iov_len: usize, + pos_l: u64, + pos_h: u64, + } +} + +typed_syscall! 
{ + pub struct RtTgsigqueueinfo { + tgid: libc::pid_t, + tid: libc::pid_t, + sig: i32, + siginfo: Option>, + } +} + +typed_syscall! { + pub struct PerfEventOpen { + // FIXME: This should be a pointer to `perf_event_attr`. + attr: Option>, + pid: libc::pid_t, + cpu: i32, + group_fd: i32, + flags: u64, + } +} + +typed_syscall! { + pub struct Recvmmsg { + fd: i32, + mmsg: Option>, + vlen: u32, + flags: u32, + timeout: Option>, + } +} + +typed_syscall! { + pub struct FanotifyInit { + flags: u32, + event_f_flags: u32, + } +} + +typed_syscall! { + pub struct FanotifyMark { + fanotify_fd: i32, + flags: u32, + mask: u64, + dirfd: i32, + pathname: Option, + } +} + +typed_syscall! { + pub struct Prlimit64 { + pid: libc::pid_t, + resource: u32, + new_rlim: Option>, + old_rlim: Option>, + } +} + +typed_syscall! { + pub struct NameToHandleAt { + dirfd: i32, + pathname: Option, + // FIXME: This should be a pointer to `file_handle`. + handle: Option>, + mount_id: Option>, + flags: i32, + } +} + +typed_syscall! { + pub struct OpenByHandleAt { + mount_fd: i32, + // FIXME: This should be a pointer to `file_handle`. + handle: Option>, + flags: i32, + } +} + +typed_syscall! { + pub struct ClockAdjtime { + clockid: ClockId, + buf: Option>, + } +} + +typed_syscall! { + pub struct Syncfs { + fd: i32, + } +} + +typed_syscall! { + pub struct Sendmmsg { + sockfd: i32, + msgvec: Option>, + vlen: u32, + flags: i32, + } +} + +typed_syscall! { + pub struct Setns { + fd: i32, + nstype: CloneFlags, + } +} + +// NB: getcpu_cache (third argument) is unused in kernel >= 2.6.23 should be +// always NULL. +typed_syscall! { + pub struct Getcpu { + cpu: Option>, + node: Option>, + } +} + +typed_syscall! { + pub struct ProcessVmReadv { + pid: libc::pid_t, + local_iov: Option>, + local_iov_count: u64, + remote_iov: Option>, + remote_iov_count: u64, + flags: u64, + } +} + +typed_syscall! 
{ + pub struct ProcessVmWritev { + pid: libc::pid_t, + local_iov: Option>, + local_iov_count: u64, + remote_iov: Option>, + remote_iov_count: u64, + flags: u64, + } +} + +typed_syscall! { + pub struct Kcmp { + pid1: libc::pid_t, + pid2: libc::pid_t, + typ: i32, + idx1: u64, + idx2: u64, + } +} + +typed_syscall! { + pub struct FinitModule { + fd: i32, + param_values: Option, + flags: i32, + } +} + +typed_syscall! { + pub struct SchedSetattr { + pid: libc::pid_t, + attr: Option>, + flags: u32, + } +} + +typed_syscall! { + pub struct SchedGetattr { + pid: libc::pid_t, + // FIXME: This should be a pointer to a `sched_attr`. + attr: Option>, + size: u32, + flags: u32, + } +} + +typed_syscall! { + pub struct Renameat2 { + olddirfd: i32, + oldpath: Option, + newdirfd: i32, + newpath: Option, + // TODO: Make some `RENAME_*` bitflags to cover this. + flags: libc::c_uint, + } +} + +impl From for Renameat2 { + fn from(rename: Rename) -> Self { + let Rename { mut raw } = rename; + raw.arg4 = 0; + raw.arg3 = raw.arg1; + raw.arg2 = libc::AT_FDCWD as u64; + raw.arg1 = raw.arg0; + raw.arg0 = libc::AT_FDCWD as u64; + Renameat2 { raw } + } +} + +impl From for Renameat2 { + fn from(renameat: Renameat) -> Self { + let Renameat { mut raw } = renameat; + raw.arg4 = 0; + Renameat2 { raw } + } +} + +typed_syscall! { + pub struct Seccomp { + op: u32, + flags: u32, + args: Option>, + } +} + +typed_syscall! { + pub struct Getrandom { + /// The buffer should never be NULL (None), or this represents an invalid call when passed + /// to the kernel. Nevertheless, we retain the ability here to represent that invalid call. + buf: Option>, + buflen: usize, + flags: usize, + } +} + +typed_syscall! { + pub struct MemfdCreate { + name: Option, + flags: u32, + } +} + +typed_syscall! { + pub struct KexecFileLoad { + kernel_fd: i32, + initrd_fd: i32, + cmdline_len: u64, + cmdline: Option>, + flags: u64, + } +} + +typed_syscall! 
{ + pub struct Bpf { + cmd: i32, + attr: Option>, + size: u32, + } +} + +typed_syscall! { + pub struct Execveat { + dirfd: i32, + path: Option, + argv: Option>, + envp: Option>, + flags: i32, + } +} + +impl From for Execveat { + /// An `execve` syscall can be trivially transformed into an `execveat` + /// syscall by shifting all the arguments to the right and setting the first + /// argument to `AT_FDCWD` (the current working directory). + fn from(execve: Execve) -> Self { + let Execve { mut raw } = execve; + raw.arg4 = 0; // flags + raw.arg3 = raw.arg2; + raw.arg2 = raw.arg1; + raw.arg1 = raw.arg0; + raw.arg0 = libc::AT_FDCWD as u64; + Execveat { raw } + } +} + +typed_syscall! { + pub struct Userfaultfd { + flags: i32, + } +} + +typed_syscall! { + pub struct Membarrier { + cmd: i32, + flags: i32, + } +} + +typed_syscall! { + pub struct Mlock2 { + addr: Option>, + len: usize, + flags: i32, + } +} + +typed_syscall! { + pub struct CopyFileRange { + fd_in: i32, + off_in: Option>, + fd_out: i32, + off_out: Option>, + len: usize, + flags: u32, + } +} + +typed_syscall! { + pub struct Preadv2 { + fd: i32, + iov: Option>, + iov_len: u64, + pos_l: u64, + pos_h: u64, + flags: i32, + } +} + +typed_syscall! { + pub struct Pwritev2 { + fd: i32, + iov: Option>, + iov_len: u64, + pos_l: u64, + pos_h: u64, + flags: i32, + } +} + +typed_syscall! { + pub struct PkeyMprotect { + addr: Option>, + len: usize, + prot: i32, + pkey: i32, + } +} + +typed_syscall! { + pub struct PkeyAlloc { + flags: u64, + access_rights: u64, + } +} + +typed_syscall! { + pub struct PkeyFree { + pkey: i32, + } +} + +typed_syscall! 
{ + pub struct Statx { + dirfd: i32, + path: Option, + flags: AtFlags, + mask: StatxMask, + statx: Option, + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::{Displayable, LocalMemory, ReadAddr}; + + use std::{ffi::CString, path::Path}; + + use syscalls::{SyscallArgs, Sysno}; + + #[test] + fn test_syscall_open() { + assert_eq!(Open::NAME, "open"); + assert_eq!(Open::NUMBER, Sysno::open); + + let name = CString::new("/some/file/path").unwrap(); + + let syscall = Open::new() + .with_path(PathPtr::from_ptr(name.as_ptr())) + .with_flags(OFlag::O_RDONLY | OFlag::O_APPEND) + .with_mode(Some(Mode::from_bits_truncate(0o644))); + + assert_eq!(Open::from(SyscallArgs::from(syscall)), syscall); + + let memory = LocalMemory::new(); + + assert_eq!( + syscall.path().unwrap().read(&memory).unwrap(), + Path::new("/some/file/path") + ); + + assert_eq!( + format!("{}", syscall.display(&memory)), + format!("open({:p} -> \"/some/file/path\", O_APPEND)", name.as_ptr()) + ); + } + + #[test] + fn test_syscall_openat() { + assert_eq!(Openat::NAME, "openat"); + assert_eq!(Openat::NUMBER, Sysno::openat); + + let memory = LocalMemory::new(); + + assert_eq!( + format!( + "{}", + Openat::new() + .with_dirfd(-100) + .with_path(None) + .with_flags(OFlag::O_APPEND) + .display(&memory) + ), + "openat(-100, NULL, O_APPEND)" + ); + + assert_eq!( + format!( + "{}", + Openat::new() + .with_dirfd(-100) + .with_path(None) + .with_flags(OFlag::O_CREAT) + .with_mode(Mode::from_bits_truncate(0o644)) + .display(&memory) + ), + "openat(-100, NULL, O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)" + ); + + assert_eq!( + format!( + "{}", + Openat::new() + .with_dirfd(-100) + .with_path(None) + .with_flags(OFlag::O_TMPFILE) + .with_mode(Mode::from_bits_truncate(0o600)) + .display(&memory) + ), + "openat(-100, NULL, O_DIRECTORY | O_TMPFILE, S_IRUSR | S_IWUSR)" + ); + + assert_eq!( + Openat::new() + .with_dirfd(libc::AT_FDCWD) + .with_path(None) + .with_flags(OFlag::O_CREAT | OFlag::O_WRONLY | 
OFlag::O_TRUNC) + .with_mode(Mode::from_bits_truncate(0o600)), + Creat::new() + .with_path(None) + .with_mode(Mode::from_bits_truncate(0o600)) + .into() + ); + } + + #[test] + fn test_syscall_stat() { + let name = CString::new("/dev/null").unwrap(); + + let stat = nix::sys::stat::stat("/dev/null").unwrap(); + + let syscall = Stat::new() + .with_path(PathPtr::from_ptr(name.as_ptr())) + .with_stat(StatPtr::from_ptr(&stat as *const libc::stat)); + + let memory = LocalMemory::new(); + + assert_eq!( + format!("{}", syscall.display_with_outputs(&memory)), + format!( + "stat({:p} -> \"/dev/null\", {:p} -> {{st_mode=S_IFCHR | 0666, st_size=0, ...}})", + name.as_ptr(), + &stat as *const _ + ) + ); + } + + #[test] + fn test_syscall_fcntl() { + let memory = LocalMemory::new(); + + assert_eq!( + format!( + "{}", + Fcntl::new() + .with_fd(1) + .with_cmd(FcntlCmd::F_DUPFD(2)) + .display(&memory) + ), + "fcntl(1, F_DUPFD, 2)" + ); + } + + #[test] + fn test_syscall_pipe2() { + let memory: Option> = AddrMut::from_raw(0x1245); + + assert_eq!( + Pipe2::new().with_pipefd(memory), + Pipe::new().with_pipefd(memory).into() + ); + + assert_ne!( + Pipe2::new() + .with_pipefd(memory) + .with_flags(OFlag::O_CLOEXEC), + Pipe::new().with_pipefd(memory).into() + ); + } + + #[test] + fn test_syscall_linkat() { + let foo = CString::new("foo").unwrap(); + let bar = CString::new("bar").unwrap(); + + assert_eq!( + Linkat::new() + .with_olddirfd(libc::AT_FDCWD) + .with_oldpath(PathPtr::from_ptr(foo.as_ptr())) + .with_newdirfd(libc::AT_FDCWD) + .with_newpath(PathPtr::from_ptr(bar.as_ptr())) + .with_flags(AtFlags::empty()), + Link::new() + .with_oldpath(PathPtr::from_ptr(foo.as_ptr())) + .with_newpath(PathPtr::from_ptr(bar.as_ptr())) + .into(), + ); + } +} diff --git a/reverie-util/Cargo.toml b/reverie-util/Cargo.toml new file mode 100644 index 0000000..e6ff66c --- /dev/null +++ b/reverie-util/Cargo.toml @@ -0,0 +1,20 @@ +# @generated by autocargo + +[package] +name = "reverie-util" +version = "0.1.0" 
+authors = ["Facebook"]
+edition = "2021"
+license = "BSD-2-Clause"
+
+[dependencies]
+bitvec = { version = "0.17", features = ["serde"] }
+chrono = { version = "0.4", features = ["clock", "serde", "std"], default-features = false }
+libc = "0.2.98"
+nix = "0.22"
+reverie = { version = "0.1.0", path = "../reverie" }
+serde = { version = "1.0.126", features = ["derive", "rc"] }
+structopt = "0.3.23"
+tracing = "0.1.29"
+tracing-appender = "0.2.0"
+tracing-subscriber = { version = "0.3.3", features = ["ansi", "env-filter", "fmt", "json", "parking_lot", "registry"] }
diff --git a/reverie-util/src/commandline.rs b/reverie-util/src/commandline.rs
new file mode 100644
index 0000000..4b27bba
--- /dev/null
+++ b/reverie-util/src/commandline.rs
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+//! Common support for building the CLI interface to each Reverie tool.
+//! Each tool with this backend is a standalone executable, and thus
+//! needs its own CLI.
+
+use chrono::Local;
+use std::path::Path;
+use std::{error::Error, ffi::OsStr, fmt::Display, io, path::PathBuf, str::FromStr};
+use structopt::StructOpt;
+use tracing_appender::non_blocking::WorkerGuard;
+use tracing_subscriber::fmt::MakeWriter;
+use tracing_subscriber::EnvFilter;
+
+use reverie::process::Command;
+
+/// Parses an environment variable command-line argument.
+///
+/// Accepts either `KEY=VALUE` or a bare `KEY`. For a bare `KEY`, the value is
+/// read from the environment of the current process.
+///
+/// # Errors
+///
+/// Returns an error if the key is empty, if a bare `KEY` cannot be read from
+/// the environment, or if the key or value fails to parse.
+pub fn parse_env<T, U>(s: &str) -> Result<(T, U), Box<dyn Error>>
+where
+    T: FromStr,
+    T::Err: Error + 'static,
+    U: FromStr,
+    U::Err: Error + 'static,
+{
+    // Split on the first '=' only, so the value itself may contain '='.
+    let (key, value) = match s.split_once('=') {
+        Some((key, value)) => (key, Some(value)),
+        None => (s, None),
+    };
+
+    // An empty variable name is not meaningful; reject it up front.
+    if key.is_empty() {
+        return Err("Invalid KEY=VALUE: key is empty".into());
+    }
+
+    let value = match value {
+        Some(value) => value.parse()?,
+        // A bare KEY inherits its value from the current environment.
+        None => std::env::var(key)?.parse()?,
+    };
+
+    Ok((key.parse()?, value))
+}
+
+// Arguments that are shared by most Reverie tools, including which program to
+// run and how to run it. Using StructOpt, this is designed to be set from CLI
+// args, or to be extended by the tool to form CLI args.
+//
+// NOTE: Do not change this to a doc comment due to this bug:
+// https://github.com/TeXitoi/structopt/issues/333
+#[allow(missing_docs)]
+#[derive(Debug, Clone, StructOpt)]
+pub struct CommonToolArguments {
+    /// Direct logging to a file. This can also be set with the RUST_LOG_FILE environment
+    /// variable, but the CLI flag takes precedence.
+    #[structopt(long = "log-file", value_name = "PATH", env = "RUST_LOG_FILE")]
+    pub log_file: Option<PathBuf>,
+
+    /// Do not pass-through host's environment variables, instead providing a
+    /// minimal PATH only (/bin:/usr/bin). The default is to pass through the
+    /// host environment.
+    #[structopt(long = "no-host-envs")]
+    pub no_host_envs: bool,
+
+    /// Sets an environment variable. Can be used multiple times.
+    #[structopt(
+        long = "env",
+        short = "e",
+        value_name = "ENV[=VALUE]",
+        parse(try_from_str = parse_env),
+        number_of_values = 1
+    )]
+    pub envs: Vec<(String, String)>,
+
+    /// Path of the program to trace.
+    #[structopt(value_name = "PROGRAM")]
+    pub program: String,
+
+    /// Arguments to the program to trace.
+    #[structopt(value_name = "ARGS")]
+    pub program_args: Vec<String>,
+}
+
+impl CommonToolArguments {
+    /// Create a new configuration to run the given program.
+    ///
+    /// # Panics
+    ///
+    /// Panics if StructOpt fails to build an instance from `prog` alone.
+    pub fn new<S: AsRef<OsStr> + Clone>(prog: S) -> CommonToolArguments {
+        // Dirty, dirty hack. The first argument is ignored in this process:
+        CommonToolArguments::from_iter_safe(&[prog.clone(), prog])
+            .expect("CommonToolArguments::new has an internal error that prevented it from constructing an instance.")
+    }
+
+    /// Add an argument, similar to Command::arg. Mutates `self` in place and
+    /// returns it so that calls can be chained.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `s` is not valid UTF-8.
+    pub fn arg<S: AsRef<OsStr> + Display>(&mut self, s: S) -> &mut CommonToolArguments {
+        self.program_args
+            .push(s.as_ref().to_str().expect("CommonToolArguments::arg internal error. This OsStr to str conversion should have worked.").to_string());
+        self
+    }
+
+    /// Installs the global `tracing` subscriber, filtered by the default
+    /// `EnvFilter` environment configuration.
+    ///
+    /// Events go to `log_file` when it is set and usable, otherwise to stderr.
+    /// An existing log file is never reused: a timestamp suffix is appended to
+    /// the file name instead. When logging to a file, the returned guard should
+    /// be held for as long as logging is needed (see `tracing_appender`).
+    ///
+    /// # Panics
+    ///
+    /// Panics if a global default subscriber has already been set.
+    pub fn init_tracing(&self) -> Option<WorkerGuard> {
+        fn set_subscriber_with_writer<
+            T: for<'writer> MakeWriter<'writer> + Send + Sync + 'static,
+        >(
+            writer: T,
+        ) {
+            // TODO: There is currently no support for async tracing.
+            let subscriber = tracing_subscriber::fmt()
+                .with_env_filter(EnvFilter::from_default_env())
+                .with_writer(writer)
+                .finish();
+            tracing::subscriber::set_global_default(subscriber)
+                .expect("Unable to set global default subscriber");
+        }
+
+        self.log_file
+            .as_ref()
+            .and_then(|lf| {
+                let parent = lf.parent()?;
+                let orig_filename = lf.file_name()?.to_os_string();
+                let mut filename = orig_filename.clone();
+
+                // On a name collision, retry with a fresh timestamp suffix. The
+                // number of attempts is bounded so this cannot loop forever.
+                for _ in 0..100 {
+                    if Path::new(parent).join(&filename).exists() {
+                        filename = orig_filename.clone();
+                        filename.push(Local::now().format(".%Y%m%d.%H%M%S.%f").to_string());
+                    } else {
+                        break;
+                    }
+                }
+
+                if Path::new(parent).join(&filename).exists() {
+                    eprintln!(
+                        " [reverie] WARNING: could not open log file, falling back to stderr"
+                    );
+                    None
+                } else {
+                    let file_writer = tracing_appender::rolling::never(parent, &filename);
+                    // TODO: Is this async logging?
+                    let (file_writer, guard) = tracing_appender::non_blocking(file_writer);
+
+                    eprintln!(" [reverie] Logging to file at {:?}", parent.join(&filename));
+                    set_subscriber_with_writer(file_writer);
+                    Some(guard)
+                }
+            })
+            .or_else(|| {
+                // No log file was requested, or it could not be used.
+                set_subscriber_with_writer(io::stderr);
+                None
+            })
+    }
+}
+
+impl From<CommonToolArguments> for Command {
+    /// Builds the command that runs the traced program with the requested
+    /// arguments and environment.
+    fn from(args: CommonToolArguments) -> Self {
+        let mut cmd = Command::new(args.program);
+        cmd.args(args.program_args);
+
+        if args.no_host_envs {
+            cmd.env_clear();
+            cmd.env("PATH", "/bin/:/usr/bin");
+        }
+
+        // Applied last, after the optional reset above.
+        cmd.envs(args.envs);
+        cmd
+    }
+}
diff --git a/reverie-util/src/lib.rs b/reverie-util/src/lib.rs
new file mode 100644
index 0000000..e9aea52
--- /dev/null
+++ b/reverie-util/src/lib.rs
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+mod commandline;
+pub mod pedigree;
+
+pub use commandline::CommonToolArguments;
diff --git a/reverie-util/src/pedigree.rs b/reverie-util/src/pedigree.rs
new file mode 100644
index 0000000..4d9f1a0
--- /dev/null
+++ b/reverie-util/src/pedigree.rs
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+//! An example that tracks thread pedigree using local state.
+
+use bitvec::{bitvec, order::Msb0, vec::BitVec};
+use libc::pid_t;
+use nix::unistd::Pid;
+use serde::{Deserialize, Serialize};
+use std::{io, mem};
+
+/// Finds the first longest run of equal bits; returns `(start, length)`, `(0, 0)` if empty.
+fn longest_run(sequence: &BitVec) -> (usize, usize) {
+    let mut prev_bit = false;
+    let mut prev_count = 1;
+
+    let mut max_count = 0;
+    let mut max_start = 0;
+
+    for (index, bit) in sequence.iter().enumerate() {
+        let count = if index > 0 && prev_bit == *bit {
+            prev_count + 1
+        } else {
+            1
+        };
+        if count > max_count {
+            max_count = count;
+            max_start = index + 1 - count;
+        }
+
+        prev_count = count;
+        prev_bit = *bit;
+    }
+
+    (max_start, max_count)
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+/// Unbounded bitstring representation of process pedigree (i.e. tree path or tree index)
+/// which can be forked and converted to a deterministic virtual PID.
+///
+/// As a binary tree-index, a Pedigree can be viewed as a series of "left"/"right"
+/// directions for how to navigate the tree. Therefore a zero-length Pedigree refers to
+/// the root. For convenience, we refer to "parent/child" rather than "left/right",
+/// following the normal conventions of process or thread forking. Note that this
+/// pedigree datatype does not represent "joins" within a set of running processes, nor
+/// does it otherwise represent dependencies or "happens before" edges.
+///
+/// TODO: Add serialization / deserialization
+pub struct Pedigree {
+    pedigree: BitVec,
+}
+
+impl Pedigree {
+    /// Create a new root pedigree representing the top of a tree of processes or threads.
+    pub fn new() -> Self {
+        Pedigree {
+            pedigree: bitvec![0],
+        }
+    }
+
+    /// Split a pedigree into a pedigree for the two execution points
+    /// after the fork: `(parent,child)`. I.e. both tree-paths
+    /// returned are one level deeper from the root than the input was.
+    pub fn fork(&self) -> (Self, Self) {
+        let mut parent = self.clone();
+        let child = parent.fork_mut();
+        (parent, child)
+    }
+
+    /// Fork a pedigree, destructively.
+    /// Mutates parent pedigree, returns new child pedigree.
+    ///
+    /// Since parent pedigree is being copied to the child, this function will
+    /// have O(n) complexity with respect to pedigree length.
+    pub fn fork_mut(&mut self) -> Self {
+        let mut child_pedigree = self.pedigree.clone();
+        child_pedigree.push(true);
+        self.pedigree.push(false);
+        Pedigree {
+            pedigree: child_pedigree,
+        }
+    }
+
+    /// Get pedigree's inner BitVec representation
+    pub fn raw(&self) -> BitVec {
+        self.pedigree.clone()
+    }
+}
+
+/// Attempts to convert the pedigree bitstring into a deterministic virtual PID
+impl TryFrom<&Pedigree> for Pid {
+    type Error = io::Error;
+    fn try_from(pedigree: &Pedigree) -> Result<Self, Self::Error> {
+        // Define mapping of pedigree bits -> PID bits
+        const MSB_ZERO_BITS: usize = 1;
+        const TREE_BITS: usize = 16;
+        const RUN_INDEX_BITS: usize = 4;
+        const RUN_TYPE_BITS: usize = 1;
+        const RUN_LENGTH_BITS: usize = 10;
+        debug_assert!(
+            MSB_ZERO_BITS + TREE_BITS + RUN_INDEX_BITS + RUN_TYPE_BITS + RUN_LENGTH_BITS
+                == mem::size_of::<pid_t>() * 8
+        );
+
+        // Trim off any trailing P's from pedigree, i.e. viewing it as
+        // a sequence of 'P' (parent) and 'C' (child) directions.
+        let mut sequence = pedigree.raw();
+        while sequence.len() > 1 && sequence.last() == Some(&false) {
+            sequence.pop();
+        }
+
+        // Find longest run in pedigree sequence
+        let (index, len) = longest_run(&sequence);
+
+        // Reject an empty pedigree (len == 0) and anything that won't fit the bit encoding
+        if index >= 2_usize.pow(RUN_INDEX_BITS as u32)
+            || !(1..2_usize.pow(RUN_LENGTH_BITS as u32)).contains(&len)
+            || sequence.len() - len > TREE_BITS
+        {
+            Err(Self::Error::new(
+                io::ErrorKind::Other,
+                "Pedigree is empty, too large, or too complex to be deterministically converted into virtual PID.",
+            ))
+        } else {
+            // Extract the longest run of bits from pedigree
+            let mut lower_tree = sequence.split_off(index + len);
+            let run = sequence.split_off(index);
+            let mut tree = sequence;
+            tree.append(&mut lower_tree);
+
+            // Construct a BitVec which will be interpreted as a pid_t
+            let mut vpid_bits: BitVec<Msb0, u32> =
+                BitVec::with_capacity(mem::size_of::<pid_t>() * 8);
+
+            // pid_t is signed, so MSB must always be zero or it will be interpreted as error
+            // when returned from fork, clone, etc.
+            vpid_bits.push(false);
+
+            // Pack the rest of the bits, using asserts to make sure the bitfield sizing
+            // is correct. Any errors here are fatal bugs, so assert seems acceptable.
+
+            let mut tree_bits: BitVec<Msb0, u32> = BitVec::repeat(false, TREE_BITS - tree.len());
+            tree_bits.append(&mut tree);
+            debug_assert!(tree_bits.len() == TREE_BITS);
+            vpid_bits.append(&mut tree_bits);
+
+            let mut run_index_bits = BitVec::<Msb0, u32>::from_element(index as u32);
+            run_index_bits = run_index_bits.split_off(run_index_bits.len() - RUN_INDEX_BITS);
+            debug_assert!(run_index_bits.len() == RUN_INDEX_BITS);
+            vpid_bits.append(&mut run_index_bits);
+
+            let mut run_type_bits: BitVec<Msb0, u32> = BitVec::new();
+            run_type_bits.push(run[0]);
+            debug_assert!(run_type_bits.len() == RUN_TYPE_BITS);
+            vpid_bits.append(&mut run_type_bits);
+
+            let mut run_length_bits = BitVec::<Msb0, u32>::from_element(len as u32);
+            run_length_bits = run_length_bits.split_off(run_length_bits.len() - RUN_LENGTH_BITS);
+            debug_assert!(run_length_bits.len() == RUN_LENGTH_BITS);
+            vpid_bits.append(&mut run_length_bits);
+
+            debug_assert!(vpid_bits.len() == mem::size_of::<pid_t>() * 8);
+
+            Ok(Pid::from_raw(vpid_bits.into_vec()[0] as i32))
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_longest_run() {
+        let sequence = bitvec![1, 1, 0, 0, 0];
+        let (index, len) = longest_run(&sequence);
+        assert_eq!((index, len), (2, 3));
+
+        let sequence = bitvec![1, 1, 1, 0, 0];
+        let (index, len) = longest_run(&sequence);
+        assert_eq!((index, len), (0, 3));
+
+        let sequence = bitvec![1, 1, 0, 0, 1, 1];
+        let (index, len) = longest_run(&sequence);
+        assert_eq!((index, len), (0, 2));
+
+        let sequence = bitvec![1, 0, 0, 0, 0, 1];
+        let (index, len) = longest_run(&sequence);
+        assert_eq!((index, len), (1, 4));
+
+        let sequence = bitvec![1, 0, 1, 0, 1, 0];
+        let (index, len) = longest_run(&sequence);
+        assert_eq!((index, len), (0, 1));
+    }
+
+    #[test]
+    fn test_pedigree_basic() {
+        // FIXME: These tests are dependent on the bit widths used to convert
+        // pedigree into PID, but the tests below assume that these values
+        // do not change.
+
+        let mut parent = Pedigree::new();
+        // Root pedigree = P
+        assert_eq!(Pid::try_from(&parent).unwrap(), Pid::from_raw(0x1));
+
+        let child = parent.fork_mut();
+        // Parent pedigree = PP
+        assert_eq!(Pid::try_from(&parent).unwrap(), Pid::from_raw(0x1));
+        // Child pedigree == PC
+        assert_eq!(Pid::try_from(&child).unwrap(), Pid::from_raw(0x00008001));
+
+        let child2 = parent.fork_mut();
+        // Parent pedigree == PPP
+        assert_eq!(Pid::try_from(&parent).unwrap(), Pid::from_raw(0x1));
+        // Child pedigree == PPC
+        assert_eq!(Pid::try_from(&child2).unwrap(), Pid::from_raw(0x00008002));
+    }
+
+    #[test]
+    fn test_pedigree_many_forks() {
+        let mut many_forks_bitstring = BitVec::repeat(false, 1023);
+        many_forks_bitstring.push(true);
+        let many_forks_pedigree = Pedigree {
+            pedigree: many_forks_bitstring,
+        };
+        assert_eq!(
+            Pid::try_from(&many_forks_pedigree).unwrap(),
+            Pid::from_raw(0x000083FF)
+        );
+    }
+
+    #[test]
+    fn test_pedigree_overflow() {
+        let mut many_forks_bitstring = BitVec::repeat(false, 1024);
+        many_forks_bitstring.push(true);
+        let many_forks_pedigree = Pedigree {
+            pedigree: many_forks_bitstring,
+        };
+        assert!(Pid::try_from(&many_forks_pedigree).is_err());
+    }
+}
diff --git a/reverie/Cargo.toml b/reverie/Cargo.toml
new file mode 100644
index 0000000..fa558ef
--- /dev/null
+++ b/reverie/Cargo.toml
@@ -0,0 +1,23 @@
+# @generated by autocargo
+
+[package]
+name = "reverie"
+version = "0.1.0"
+authors = ["Facebook"]
+edition = "2021"
+license = "BSD-2-Clause"
+
+[dependencies]
+addr2line = "0.14"
+anyhow = "1.0.51"
+async-trait = "0.1.51"
+bitflags = "1.3"
+bitvec = { version = "0.17", features = ["serde"] }
+byteorder = "1.3"
+libc = "0.2.98"
+nix = "0.22"
+raw-cpuid = "9.0"
+reverie-process = { version = "0.1.0", path = "../reverie-process" }
+reverie-syscalls = { version = "0.1.0", path = "../reverie-syscalls" }
+serde = { version = "1.0.126", features = ["derive", "rc"] }
+thiserror = "1.0.29"
diff --git a/reverie/src/auxv.rs b/reverie/src/auxv.rs new
file mode 100644 index 0000000..7872438 --- /dev/null +++ b/reverie/src/auxv.rs @@ -0,0 +1,119 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +use crate::syscalls::Addr; +use crate::Pid; + +use byteorder::{NativeEndian, ReadBytesExt}; + +use std::collections::BTreeMap; +use std::fs; +use std::io; + +/// Represents the auxv table of a process. +/// +/// NOTE: This is not necessarily the same table as the one used by +/// [`libc::getauxval`]. For dynamically linked programs, glibc will copy this +/// table early on in the start up of the program and may modify it. Thus, it is +/// really only safe to modify this immediately after `execve` runs. +pub struct Auxv { + map: BTreeMap, +} + +impl Auxv { + /// Reads the auxiliary values from `/proc/{pid}/auxv`. + pub(crate) fn new(pid: Pid) -> io::Result { + let mut map = BTreeMap::new(); + let buf = fs::read(format!("/proc/{}/auxv", pid))?; + + // The file size should be a multiple of `size_of::() * 2`. + debug_assert_eq!( + buf.len() % 16, + 0, + "got invalid size of auxv file: {} bytes", + buf.len() + ); + + let mut file = io::Cursor::new(buf); + + loop { + let key = file.read_u64::()?; + let value = file.read_u64::()?; + + if key == 0 && value == 0 { + break; + } + + map.insert(key, value); + } + + Ok(Self { map }) + } + + /// The number of entries in the auxv table. + pub fn len(&self) -> usize { + self.map.len() + } + + /// Returns true if the table is empty. + pub fn is_empty(&self) -> bool { + self.map.is_empty() + } + + /// The address of sixteen bytes containing a random value. + /// + /// Returns `None` if the address is NULL or if `AT_RANDOM` does not exist in + /// the auxv table. 
+ pub fn at_random(&self) -> Option> { + self.map + .get(&libc::AT_RANDOM) + .and_then(|val| Addr::from_raw(*val as usize)) + } + + /// The user ID of the thread. + /// + /// Returns `None` if the `AT_UID` does not exist in the auxv table. + pub fn at_uid(&self) -> Option { + self.map.get(&libc::AT_UID).map(|val| *val as libc::uid_t) + } + + /// The effective user ID of the thread. + /// + /// Returns `None` if the `AT_EUID` does not exist in the auxv table. + pub fn at_euid(&self) -> Option { + self.map.get(&libc::AT_EUID).map(|val| *val as libc::uid_t) + } + + /// The group ID of the process. + /// + /// Returns `None` if the `AT_GID` does not exist in the auxv table. + pub fn at_gid(&self) -> Option { + self.map.get(&libc::AT_GID).map(|val| *val as libc::gid_t) + } + + /// The effective group ID of the process. + /// + /// Returns `None` if the `AT_EGID` does not exist in the auxv table. + pub fn at_egid(&self) -> Option { + self.map.get(&libc::AT_EGID).map(|val| *val as libc::gid_t) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn smoke() { + let map = Auxv::new(Pid::this()).unwrap(); + assert_eq!(map.is_empty(), false); + assert_eq!(map.at_uid(), Some(unsafe { libc::getuid() })); + assert_eq!(map.at_gid(), Some(unsafe { libc::getgid() })); + assert!(map.at_random().is_some()); + } +} diff --git a/reverie/src/backtrace.rs b/reverie/src/backtrace.rs new file mode 100644 index 0000000..89a2c61 --- /dev/null +++ b/reverie/src/backtrace.rs @@ -0,0 +1,153 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +use core::fmt; +use serde::{Deserialize, Serialize}; +use std::borrow::Cow; +use std::fs::File; +use std::io; +use std::io::Read; + +use super::Pid; + +/// A backtrace is a list of stack frames. These stack frames may have originated +/// from a remote process. 
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)] +pub struct Backtrace { + /// Thread ID where the backtrace originated. This can be used to get the + /// name of the thread and the process it came from. + thread_id: Pid, + + /// The stack frames in the backtrace. + frames: Vec, +} + +/// A stack frame. +#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)] +pub struct Frame { + /// The value of the instruction pointer. + pub ip: u64, + /// True if this frame is inside of a signal handler. + pub is_signal: bool, + /// The symbol associated with this frame (if known). + pub symbol: Option, +} + +/// A symbol from a frame. +#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)] +pub struct Symbol { + /// Name of the (mangled) symbol. + pub name: String, + /// Offset of the symbol. + pub offset: u64, + /// Address of the symbol. + pub address: u64, + /// Size of the symbol. + pub size: u64, +} + +impl Symbol { + /// Returns the demangled name of the symbol. This makes a best-effort guess + /// about demangling. If the symbol could not be demangled, returns the raw, + /// original name of the symbol. + pub fn demangled(&self) -> Cow { + addr2line::demangle_auto(Cow::from(&self.name), None) + } +} + +impl fmt::Display for Frame { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match &self.symbol { + Some(symbol) => write!(f, "{:#016x}: {:#}", self.ip, symbol)?, + None => write!(f, "{:#016x}: ???", self.ip)?, + } + + if self.is_signal { + write!(f, " (in signal handler)")?; + } + + Ok(()) + } +} + +impl fmt::Display for Symbol { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if f.alternate() { + write!(f, "{} + {:#x}", self.demangled(), self.offset) + } else { + write!(f, "{} + {:#x}", self.name, self.offset) + } + } +} + +impl Backtrace { + /// Creates a backtrace from a thread ID and frames. 
+ pub fn new(thread_id: Pid, frames: Vec) -> Self { + Self { thread_id, frames } + } + + /// Returns an iterator over the frames in the backtrace. + pub fn iter(&self) -> impl Iterator { + self.frames.iter() + } + + /// Returns the thread ID where the backtrace originated. + pub fn thread_id(&self) -> Pid { + self.thread_id + } + + /// Retrieves the name of the thread for this backtrace. This will fail if + /// the thread has already exited since the thread ID is used to look up the + /// thread name. + pub fn thread_name(&self) -> io::Result { + let mut name = String::new(); + + let mut f = File::open(format!("/proc/{}/comm", self.thread_id))?; + f.read_to_string(&mut name)?; + + // Remove trailing newline character + assert_eq!(name.pop(), Some('\n')); + + Ok(name) + } +} + +impl IntoIterator for Backtrace { + type Item = Frame; + type IntoIter = std::vec::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.frames.into_iter() + } +} + +impl From for Vec { + fn from(bt: Backtrace) -> Self { + bt.frames + } +} + +impl fmt::Display for Backtrace { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let thread_name = self.thread_name(); + let thread_name = thread_name.as_ref().map(String::as_str); + let thread_name = thread_name.unwrap_or(""); + writeln!( + f, + "Stack trace for thread {} ({:?}):", + self.thread_id, thread_name + )?; + + // Ugly formatting with no symbol resolution. + for frame in &self.frames { + writeln!(f, "{}", frame)?; + } + + Ok(()) + } +} diff --git a/reverie/src/error.rs b/reverie/src/error.rs new file mode 100644 index 0000000..f00b317 --- /dev/null +++ b/reverie/src/error.rs @@ -0,0 +1,49 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Error handling. + +use thiserror::Error; + +pub use reverie_syscalls::Errno; + +/// A general error. 
+#[derive(Error, Debug)] +pub enum Error { + /// A low-level errno. + #[error(transparent)] + Errno(#[from] Errno), + + /// A generic error that may be produced by the tool. + #[error(transparent)] + Tool(#[from] anyhow::Error), + + /// An I/O error. + #[error(transparent)] + Io(#[from] std::io::Error), +} + +impl Error { + /// Extracts the errno from the error. If this is not an `Error::Errno`, then + /// returns `Err(Error)`. This is useful for capturing syscall errors and + /// propagating all other types of errors. + pub fn into_errno(self) -> Result { + if let Self::Errno(err) = self { + Ok(err) + } else { + Err(self) + } + } +} + +impl From for Error { + fn from(err: nix::errno::Errno) -> Self { + Self::Errno(Errno::new(err as i32)) + } +} diff --git a/reverie/src/guest.rs b/reverie/src/guest.rs new file mode 100644 index 0000000..8fc5e32 --- /dev/null +++ b/reverie/src/guest.rs @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2018-2019, Trustees of Indiana University + * ("University Works" via Baojun Wang) + * Copyright (c) 2018-2019, Ryan Newton + * ("Traditional Works of Scholarship") + * Copyright (c) 2020-, Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Guest (i.e. thread) structure and traits + +use async_trait::async_trait; +use reverie_syscalls::{Errno, MemoryAccess, SyscallInfo}; + +use crate::auxv::Auxv; +use crate::backtrace::Backtrace; +use crate::error::Error; +use crate::stack::Stack; +use crate::timer::TimerSchedule; +use crate::tool::{GlobalRPC, GlobalTool, Tool}; +use crate::Pid; + +/// A representation of a guest task (thread). +#[async_trait] +pub trait Guest: Send + GlobalRPC { + /// Access to guest memory + type Memory: MemoryAccess + Send; + + /// Access to guest stack + type Stack: Send + Stack; + + /// Thread ID of the guest task. 
+ fn tid(&self) -> Pid; + + /// Process ID of the process containing the guest task. + fn pid(&self) -> Pid; + + /// Process ID of the parent process. Returns `None` if this is the root of + /// the traced process tree. A return value of `None` does not necessarily + /// mean it is the root process in the system. + fn ppid(&self) -> Option; + + /// Returns true if this thread is the thread group leader (i.e., the main + /// thread). + fn is_main_thread(&self) -> bool { + self.tid() == self.pid() + } + + /// Returns true if this is considered the root process of the traced task + /// tree (i.e., if `getppid()` returns `None`). + fn is_root_process(&self) -> bool { + self.ppid().is_none() + } + + /// Returns true if this is considered the root thread of the traced task + /// tree (i.e., if `getppid()` returns `None` and `is_main_thread` returns + /// true). + fn is_root_thread(&self) -> bool { + self.is_root_process() && self.is_main_thread() + } + + /// Reads and returns the auxv table for this process. + fn auxv(&self) -> Auxv { + Auxv::new(self.pid()).expect("failed to read auxv table") + } + + /// Returns a representation of the address space associated with this guest + /// thread. + fn memory(&self) -> Self::Memory; + + /// Returns a mutable reference to thread state. + fn thread_state_mut(&mut self) -> &mut T::ThreadState; + + /// Returns an immutable reference to thread state. + fn thread_state(&self) -> &T::ThreadState; + + /// Returns the current stack pointer with this guest thread. + async fn stack(&mut self) -> Self::Stack; + + /// Task is trying to become a daemon. The tracer may choose to kill all + /// remaining tasks when daemons are the only ones left. + async fn daemonize(&mut self); + + /// Inject a system call into the guest and wait for the return value. This + /// function dirties the register file while it's executing, but restores it at + /// the end. 
+ /// + /// Preconditions: the guest is in a stopped state and Reverie is currently + /// running a handler on that guest thread's behalf. + /// + /// Postconditions: the register file is the same as before the call to this + /// function. However, any side effects, including to guest memory, persist + /// after the injected call. + /// + /// # Caveats + /// + /// A few syscalls are special and behave differently from the rest: + /// - `exit` or `exit_group` will never return when injected. Since these + /// syscalls will cause the current thread or process to exit, no code that + /// comes after can be executed. + /// - `execve` will never return when *successfully* injected. If you wish to + /// handle successful calls to `execve`, use [`Tool::handle_post_exec`]. + /// Failed calls to `execve` will still return, however. Thus, it is safe to + /// use [`Result::unwrap_err`] on the result of the `inject`. + async fn inject(&mut self, syscall: S) -> Result; + + /// Similar to [`Guest::inject`], except that it never returns. Since it does + /// not return to the caller, the syscall return value cannot be altered or + /// inspected. This method exists as an optimization for the `ptrace` + /// backend, so that we can avoid interrupting the guest if we don't care + /// about the syscall return value. + /// + /// # Caveats + /// + /// This method comes with a major footgun. Any code written after + /// `tail_inject` will never be executed: + /// + /// ```no_run + /// use reverie::*; + /// use reverie::syscalls::*; + /// use serde::{Deserialize, Serialize}; + /// + /// #[derive(Debug, Serialize, Deserialize, Default, Clone)] + /// struct MyTool; + /// + /// #[reverie::tool] + /// impl Tool for MyTool { + /// /// Count of successful syscalls. 
+ /// type ThreadState = u64; + /// + /// async fn handle_syscall_event>( + /// &self, + /// guest: &mut T, + /// syscall: Syscall, + /// ) -> Result { + /// let ret = match syscall { + /// Syscall::Open(syscall) => guest.tail_inject(syscall).await, + /// _ => guest.inject(syscall).await?, + /// }; + /// + /// // This is never called if we got the `open` syscall above!! + /// *guest.thread_state_mut() += 1; + /// + /// Ok(ret) + /// } + /// } + /// ``` + async fn tail_inject(&mut self, syscall: S) -> !; + + /// Like [`Guest::inject`], but will retry the syscall if `EINTR` or + /// `ERESTARTSYS` are returned. + /// + /// This is useful if we need to inject a syscall other than the one + /// currently being handled in `handle_syscall_event`. If we don't retry + /// interrupted syscalls, we could end up running the real syscall more than + /// once. + async fn inject_with_retry(&mut self, syscall: S) -> Result { + loop { + match self.inject(syscall).await { + Ok(x) => return Ok(x), + Err(Errno::EINTR) | Err(Errno::ERESTARTSYS) => continue, + Err(other) => return Err(other), + } + } + } + + /// Converts this `Guest` such that it implements `Guest`. This is + /// useful when forwarding callbacks to a "child" tool. + #[allow(clippy::wrong_self_convention)] + fn into_guest(&mut self) -> IntoGuest { + IntoGuest::new(self) + } + + /// Request that a single timer event occur in the future according to + /// `sched`. + /// + /// There is only a single timer, so repeatedly setting a timer event delays + /// the delivery of the single timer event that will eventually fire. + /// + /// Timer events are cancelled by the delivery of other event types. If + /// receiving timer events is critical, your tool must override all event + /// listeners and reschedule your timer within them. + /// + /// This requests a non-deterministic timer event, which will occur after _at + /// least_ `sched` has elapsed, but no guarantees are made for delivery. 
As a + /// result, the event will likely have much less overhead than one set with + /// [`Guest::set_timer_precise`]. + fn set_timer(&mut self, sched: TimerSchedule) -> Result<(), Error>; + + /// Request that a single timer event occur in the future according to + /// `sched`. + /// + /// Functions identically to [`Guest::set_timer`], except that the resulting + /// event will be delivered _exactly_ when `sched` has elapsed. This results + /// in a far higher overhead to deliver an event. + fn set_timer_precise(&mut self, sched: TimerSchedule) -> Result<(), Error>; + + /// Read a thread-local monotonic clock which is never reset. The starting + /// value, resolution, and semantics of the ticks are + /// implementation-specific. + fn read_clock(&mut self) -> Result; + + /// Returns a stack trace starting at the current location of the guest + /// thread. If a backtrace is not available, returns `None`. + /// + /// # Example + /// + /// ``` + /// use reverie::*; + /// use reverie::syscalls::*; + /// use serde::{Deserialize, Serialize}; + /// + /// #[derive(Debug, Serialize, Deserialize, Default, Clone)] + /// struct MyTool; + /// + /// #[reverie::tool] + /// impl Tool for MyTool { + /// async fn handle_syscall_event>( + /// &self, + /// guest: &mut T, + /// syscall: Syscall, + /// ) -> Result { + /// // Generate a backtrace whenever we receive a call to getpid(). + /// if let Syscall::Getpid(_) = &syscall { + /// if let Some(frames) = guest.backtrace() { + /// println!("Backtrace for getpid():"); + /// for frame in frames { + /// println!(" {}", frame); + /// } + /// } + /// } + /// + /// Ok(guest.inject(syscall).await?) + /// } + /// } + /// ``` + fn backtrace(&mut self) -> Option { + None + } +} + +/// Wraps a `Guest` such that it implements `Guest`. +/// +/// # Limitations +/// +/// `T` and `U` must have the same global state. This limitation may be removed +/// in the future. 
+pub struct IntoGuest<'a, G: ?Sized, U> { + inner: &'a mut G, + _phantom: core::marker::PhantomData, +} + +impl<'a, G: ?Sized, U> IntoGuest<'a, G, U> { + /// Creates a new `IntoGuest`. + pub fn new(guest: &'a mut G) -> Self { + Self { + inner: guest, + _phantom: core::marker::PhantomData, + } + } +} + +#[async_trait] +impl<'a, G, U> GlobalRPC for IntoGuest<'a, G, U> +where + G: Guest + ?Sized, + U: Tool, +{ + async fn send_rpc( + &self, + message: ::Request, + ) -> Result<::Response, Error> { + self.inner.send_rpc(message).await + } + + fn config(&self) -> &::Config { + self.inner.config() + } +} + +#[async_trait] +impl<'a, G, U, L> Guest for IntoGuest<'a, G, U> +where + G: Guest + ?Sized, + L: Tool, + U: Tool + AsMut, + U::ThreadState: AsRef + AsMut, +{ + type Memory = G::Memory; + type Stack = G::Stack; + + fn tid(&self) -> Pid { + self.inner.tid() + } + + fn pid(&self) -> Pid { + self.inner.pid() + } + + fn ppid(&self) -> Option { + self.inner.ppid() + } + + fn is_main_thread(&self) -> bool { + self.inner.is_main_thread() + } + + fn is_root_process(&self) -> bool { + self.inner.is_root_process() + } + + fn is_root_thread(&self) -> bool { + self.inner.is_root_thread() + } + + fn memory(&self) -> Self::Memory { + self.inner.memory() + } + + fn thread_state_mut(&mut self) -> &mut L::ThreadState { + self.inner.thread_state_mut().as_mut() + } + + fn thread_state(&self) -> &L::ThreadState { + self.inner.thread_state().as_ref() + } + + async fn stack(&mut self) -> Self::Stack { + self.inner.stack().await + } + + async fn daemonize(&mut self) { + self.inner.daemonize().await + } + + async fn inject(&mut self, syscall: S) -> Result { + self.inner.inject(syscall).await + } + + async fn tail_inject(&mut self, syscall: S) -> ! 
{ + #![allow(unreachable_code)] + self.inner.tail_inject(syscall).await + } + + fn set_timer(&mut self, sched: TimerSchedule) -> Result<(), Error> { + self.inner.set_timer(sched) + } + + fn set_timer_precise(&mut self, sched: TimerSchedule) -> Result<(), Error> { + self.inner.set_timer_precise(sched) + } + + fn read_clock(&mut self) -> Result { + self.inner.read_clock() + } + + fn backtrace(&mut self) -> Option { + self.inner.backtrace() + } +} diff --git a/reverie/src/lib.rs b/reverie/src/lib.rs new file mode 100644 index 0000000..fcea314 --- /dev/null +++ b/reverie/src/lib.rs @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018-2019, Trustees of Indiana University + * ("University Works" via Baojun Wang) + * Copyright (c) 2018-2019, Ryan Newton + * ("Traditional Works of Scholarship") + * Copyright (c) 2020-, Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#![doc = include_str!("../../README.md")] +#![deny(missing_docs)] +#![deny(rustdoc::broken_intra_doc_links)] +#![feature(associated_type_defaults)] +#![feature(never_type)] + +mod auxv; +mod backtrace; +mod error; +mod guest; +mod rdtsc; +mod stack; +mod subscription; +mod timer; +mod tool; + +pub use auxv::*; +pub use backtrace::*; +pub use error::*; +pub use guest::*; +pub use rdtsc::*; +pub use stack::*; +pub use subscription::*; +pub use timer::*; +pub use tool::*; + +pub use reverie_process as process; + +pub use process::ExitStatus; +pub use process::Pid; + +/// The identifier for a specific thread, corresponding to the output of gettid. +/// In many cases, Linux blurs the Pid/Tid distinction, but Reverie should +/// consistently use TIDs when referring to threads, and Pids when referring to +/// shared address spaces that (typically) correspond to processes. 
+/// +/// This type is currently equivalent to [`Pid`], but relying on that equivalence +/// is deprecated. `Tid` may be a distinct newtype in the future. +pub type Tid = Pid; + +/// typed syscalls. +pub use reverie_syscalls as syscalls; + +/// CPUID result. +pub use raw_cpuid::CpuIdResult; + +// Reexport nix Signal type. +pub use nix::sys::signal::Signal; + +/// Required for `impl Tool for MyTool` blocks. +/// +/// NOTE: This is just an alias for `async_trait` for now, but may be extended in +/// the future to do more things (like derive syscall subscriptions). +pub use async_trait::async_trait as tool; + +/// Required for `impl GlobalTool for MyGlobalTool` blocks. +/// +/// NOTE: This is just an alias for `async_trait` for now, but may be extended in +/// the future to do more things (like deriving Request/Response types from +/// method names). +pub use async_trait::async_trait as global_tool; diff --git a/reverie/src/rdtsc.rs b/reverie/src/rdtsc.rs new file mode 100644 index 0000000..1705232 --- /dev/null +++ b/reverie/src/rdtsc.rs @@ -0,0 +1,56 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! rdtsc/rdtscp helpers + +use core::arch::x86_64::__rdtscp; +use core::arch::x86_64::_rdtsc; +use core::mem::MaybeUninit; +use serde::{Deserialize, Serialize}; + +/// Rdtsc/Rdtscp request. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum Rdtsc { + /// Rdtsc + Tsc, + /// Rdtscp + Tscp, +} + +/// Result returned by [`Tool::handle_rdtsc_event`]. +/// +/// [`Tool::handle_rdtsc_event`]: crate::Tool::handle_rdtsc_event +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub struct RdtscResult { + /// tsc counter returned from rdtsc/rdtscp + pub tsc: u64, + /// aux (TSC_AUX) returned from rdtscp + /// for rdtsc this should be None. 
+ pub aux: Option, +} + +impl RdtscResult { + /// read current tsc/tscp value + pub fn new(request: Rdtsc) -> RdtscResult { + match request { + Rdtsc::Tsc => RdtscResult { + tsc: unsafe { _rdtsc() }, + aux: None, + }, + Rdtsc::Tscp => { + let mut aux_val = MaybeUninit::uninit(); + let tsc = unsafe { __rdtscp(aux_val.as_mut_ptr()) }; + RdtscResult { + tsc, + aux: Some(unsafe { aux_val.assume_init() }), + } + } + } + } +} diff --git a/reverie/src/stack.rs b/reverie/src/stack.rs new file mode 100644 index 0000000..4dbfc3e --- /dev/null +++ b/reverie/src/stack.rs @@ -0,0 +1,45 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! A simple arena allocator using tracee's stack +//! +//! Allocation is done on stack, all allocated memory will (1) become usable after +//! `commit`, and (2) be released when `StackGuard` is subsequently dropped. + +use reverie_syscalls::{Addr, AddrMut, Errno}; + +/// A low-level stack which stores untyped (but Sized) values +pub trait Stack { + /// A guard which should be kept alive while accessing the memory allocated with this interface. + type StackGuard: Drop + Send; + + /// Get the current stack size allocated by `push` and `reserve` operations (initially zero). + fn size(&self) -> usize; + + /// Get stack capacity, i.e. the maximum that can be allocated. + fn capacity(&self) -> usize; + + /// Allocate from stack with given `size`, return an `Addr` + /// that points to the stack, panics if no space is available. + /// Copies the raw bits of the provided value into the allocated space. + /// + /// This returns results as pointers into guest memory. *However*, the data is not + /// guaranteed to be written through to the guest until flushed by a `commit`. Thus + /// the returned `Addr` is implicitly invalid until the `commit`. 
+ fn push<'stack, T>(&mut self, value: T) -> Addr<'stack, T>; + + /// Allocates like `push` but fills the allocated area with zeroes instead. + /// Like `push`, the results are not available until the next `commit`. + fn reserve<'stack, T>(&mut self) -> AddrMut<'stack, T>; + + /// Commit all allocations, writing data through to the guest. This allows certain + /// optimizations, and forces the `Stack` value to be released, returning a `StackGuard` + /// instead. + fn commit(self) -> Result; +} diff --git a/reverie/src/subscription.rs b/reverie/src/subscription.rs new file mode 100644 index 0000000..6c2c549 --- /dev/null +++ b/reverie/src/subscription.rs @@ -0,0 +1,257 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +use reverie_syscalls::Sysno; + +use bitflags::bitflags; +use bitvec::bitvec; +use bitvec::vec::BitVec; + +// The maximum number of syscalls to hold in our bitvec. +// +// FIXME: This should come from the `syscall` crate instead. +const MAX_SYSCALLS: usize = 512; + +bitflags! { + #[derive(Default)] + struct Instructions: u32 { + const CPUID = 1; + const RDTSC = 2; + } +} + +/// A set of events to subscribe to. +#[derive(Default, Clone, Eq, PartialEq)] +pub struct Subscription { + instructions: Instructions, + // TODO: Use a BitArray with bitvec >=0.18 + syscalls: BitVec, +} + +impl Subscription { + /// Don't receive any events. + pub fn none() -> Self { + Subscription { + instructions: Instructions::empty(), + syscalls: bitvec![0; MAX_SYSCALLS], + } + } + + /// Subscribe to all events. + pub fn all() -> Self { + Subscription { + instructions: Instructions::CPUID | Instructions::RDTSC, + syscalls: bitvec![1; MAX_SYSCALLS], + } + } + + /// Subscribe to all syscall events (but not instruction events). 
+ pub fn all_syscalls() -> Self { + Subscription { + instructions: Instructions::empty(), + syscalls: bitvec![1; MAX_SYSCALLS], + } + } + + /// Enable interception of the `rdtsc` instruction. + #[inline] + pub fn rdtsc(&mut self) -> &mut Self { + self.instructions.insert(Instructions::RDTSC); + self + } + + /// Enable interception of the `cpuid` instruction. + #[inline] + pub fn cpuid(&mut self) -> &mut Self { + self.instructions.insert(Instructions::CPUID); + self + } + + /// Returns true if we're subscribed to RDTSC events. + #[inline] + pub fn has_rdtsc(&self) -> bool { + self.instructions.contains(Instructions::RDTSC) + } + + /// Returns true if we're subscribed to CPUID events. + #[inline] + pub fn has_cpuid(&self) -> bool { + self.instructions.contains(Instructions::CPUID) + } + + /// Enables or disables a single syscall. + #[inline] + pub fn set(&mut self, syscall: Sysno, enabled: bool) -> &mut Self { + self.syscalls.set(syscall as i32 as usize, enabled); + self + } + + /// Enables a single syscall. + #[inline] + pub fn syscall(&mut self, syscall: Sysno) -> &mut Self { + self.set(syscall, true) + } + + /// Enables multiple syscalls. + pub fn syscalls(&mut self, syscalls: I) -> &mut Self + where + I: IntoIterator, + { + for syscall in syscalls { + self.syscall(syscall); + } + + self + } + + /// Disables a single syscall. + #[inline] + pub fn disable_syscall(&mut self, syscall: Sysno) -> &mut Self { + self.set(syscall, false) + } + + /// Disables multiple syscalls. + pub fn disable_syscalls(&mut self, syscalls: I) -> &mut Self + where + I: IntoIterator, + { + for syscall in syscalls { + self.disable_syscall(syscall); + } + + self + } + + /// Iterates over the set of syscalls that are enabled. 
+ pub fn iter_syscalls(&self) -> impl Iterator + '_ { + // With bitvec >=0.20, this becomes a lot simpler: + //self.syscalls.iter_ones().filter_map(Sysno::new) + + self.syscalls.iter().enumerate().filter_map( + |(i, is_set)| { + if *is_set { Sysno::new(i) } else { None } + }, + ) + } + + /// Iterates over the set of syscalls that are disabled. + pub fn iter_disabled_syscalls(&self) -> impl Iterator + '_ { + // With bitvec >=0.20, this becomes a lot simpler: + //self.syscalls.iter_zeros().filter_map(Sysno::new) + + self.syscalls.iter().enumerate().filter_map( + |(i, is_set)| { + if !*is_set { Sysno::new(i) } else { None } + }, + ) + } +} + +impl core::ops::BitOr for Subscription { + type Output = Self; + + fn bitor(mut self, rhs: Self) -> Self::Output { + self |= rhs; + self + } +} + +impl core::ops::BitOrAssign for Subscription { + fn bitor_assign(&mut self, rhs: Self) { + self.instructions |= rhs.instructions; + self.syscalls |= rhs.syscalls; + } +} + +impl core::ops::BitOrAssign for Subscription { + fn bitor_assign(&mut self, syscall: Sysno) { + self.syscalls.set(syscall as i32 as usize, true); + } +} + +impl Extend for Subscription { + fn extend>(&mut self, iter: I) { + for syscall in iter { + *self |= syscall; + } + } +} + +impl FromIterator for Subscription { + fn from_iter>(iter: I) -> Self { + let mut s = Self::none(); + s.extend(iter); + s + } +} + +impl core::fmt::Debug for Subscription { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let syscalls: Vec<_> = self.iter_syscalls().collect(); + + f.debug_struct("Subscription") + .field("instructions", &self.instructions) + .field("syscalls", &syscalls) + .finish() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn smoke() { + let mut s1: Subscription = [Sysno::open, Sysno::read, Sysno::write] + .iter() + .copied() + .collect(); + s1 |= Sysno::open; + + let mut s2 = Subscription::none(); + s2 |= s1.clone(); + s2 |= Sysno::open; + + assert_eq!( + 
s1.iter_syscalls().collect::>(), + [Sysno::read, Sysno::write, Sysno::open,] + ); + + assert_eq!( + s2.iter_syscalls().collect::>(), + [Sysno::read, Sysno::write, Sysno::open,] + ); + } + + #[test] + fn disabled_syscalls() { + let mut sub = Subscription::all(); + + assert!(sub.iter_disabled_syscalls().next().is_none()); + + sub.set(Sysno::open, false); + sub.set(Sysno::read, false); + + assert_eq!( + sub.iter_disabled_syscalls().collect::>(), + [Sysno::read, Sysno::open] + ); + } + + #[test] + fn compose() { + let a = Subscription::from_iter([Sysno::open, Sysno::read]); + let b = Subscription::from_iter([Sysno::read, Sysno::close]); + let c = a | b; + + assert_eq!( + c.iter_syscalls().collect::>(), + [Sysno::read, Sysno::open, Sysno::close] + ); + } +} diff --git a/reverie/src/timer.rs b/reverie/src/timer.rs new file mode 100644 index 0000000..4c0c2be --- /dev/null +++ b/reverie/src/timer.rs @@ -0,0 +1,23 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use nix::sys::signal::Signal; + +/// Options for scheduling a timer event. +pub enum TimerSchedule { + /// Request that a timer event occur after approximately this duration. + /// Conversion to real time is best-effort only. + Time(core::time::Duration), + /// Request that a timer event occur after exactly this many retired + /// conditional branches (RCBs). + Rcbs(u64), +} + +/// signal used by reverie perf counter timer. +pub const PERF_EVENT_SIGNAL: Signal = Signal::SIGSTKFLT; diff --git a/reverie/src/tool.rs b/reverie/src/tool.rs new file mode 100644 index 0000000..654cdf0 --- /dev/null +++ b/reverie/src/tool.rs @@ -0,0 +1,324 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved.
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! The API that a Reverie tool (client) should implement. +//! +//! Reverie tools consist of two portions: the global and local (per-guest +//! thread) instrumentation, though in some backends these will execute in the +//! same process. + +use crate::{ + error::{Errno, Error}, + guest::Guest, + rdtsc::{Rdtsc, RdtscResult}, + ExitStatus, Pid, Signal, Subscription, Tid, +}; +use async_trait::async_trait; +use raw_cpuid::{cpuid, CpuIdResult}; +use reverie_syscalls::Syscall; +use serde::{de::DeserializeOwned, Serialize}; + +/// The global half of a complete Reverie tool. +/// +/// One global instance of this type will exist at runtime (singleton). This +/// global state is shared by the tool across the whole process tree being +/// instrumented. +#[async_trait] +pub trait GlobalTool: Send + Sync + Default { + /// The message to send to the global tool. + type Request: Serialize + DeserializeOwned + Send = (); + + /// The result of sending the message. + type Response: Serialize + DeserializeOwned + Send = (); + + /// Static, read-only configuration data that is available everywhere the + /// tool runs code. + type Config: Serialize + DeserializeOwned + Send + Sync + Clone + Default = (); + + /// Initialize the tool, allocating the global state. + async fn init_global_state(_cfg: &Self::Config) -> Self { + Default::default() + } + + /// Receive a (potentially) inter-process upcall on the global state object. + /// This is intended to be IPC, inter-process communication, in some backends, + /// and a local method call in others, but never truly a communication + /// between different machines. + /// + /// It receives a shared reference to the global state object, which must + /// manage its own synchronization.
+ async fn receive_rpc(&self, _from: Tid, _message: Self::Request) -> Self::Response; +} + +#[async_trait] +impl GlobalTool for () { + type Request = (); + type Response = (); + + async fn receive_rpc(&self, _from: Tid, _message: ()) {} +} + +/// A trait that every Reverie *tool* must implement. The primary function of the +/// tool specifies how syscalls and signals are handled. +/// +/// The type that a `Tool` is implemented for represents the process-level state. +/// That is, one runtime instance of this type will be created for each guest +/// process. This type is in turn a factory for *thread level states*, which are +/// allocated dynamically upon guest thread creation. Instances of the thread +/// state are also managed by Reverie. +/// +/// # Example +/// +/// Here is an example of a tool that simply counts the number of syscalls +/// intercepted for each thread: +/// ``` +/// use reverie::syscalls::*; +/// use reverie::*; +/// use serde::{Deserialize, Serialize}; +/// +/// /// Our process-level state. +/// #[derive(Debug, Serialize, Deserialize, Default, Clone)] +/// struct MyTool; +/// +/// #[reverie::tool] +/// impl Tool for MyTool { +/// /// Count of syscalls. +/// type ThreadState = u64; +/// +/// async fn handle_syscall_event>( +/// &self, +/// guest: &mut T, +/// syscall: Syscall, +/// ) -> Result { +/// *guest.thread_state_mut() += 1; +/// +/// // Inject the syscall. If we don't do this, the syscall will be +/// // suppressed. +/// let ret = guest.inject(syscall).await?; +/// +/// Ok(ret) +/// } +/// } +/// ``` +#[async_trait] +pub trait Tool: Serialize + DeserializeOwned + Send + Sync + Default { + /// The type of the global half that goes along with this Local tool. By + /// including this type, the Tool is actually a complete specification for an + /// instrumentation tool. + type GlobalState: GlobalTool = (); + + /// Tool-state specific to each guest thread.
If unset, this defaults to the + /// unit type `()`, indicating that the tool does not have thread-level + /// state. + /// + /// Both thread-local and process-local state may have to be migrated between + /// address spaces by a Reverie backend. Hence the `ThreadState` type must + /// implement [`Serialize`] and [`DeserializeOwned`]. + /// + /// The thread-local storage must be in a good, consistent state when each + /// handler returns, and also when handlers yield. + /// + /// [`Serialize`]: serde::Serialize + /// [`DeserializeOwned`]: serde::de::DeserializeOwned + type ThreadState: Serialize + DeserializeOwned + Default + Send + Sync = (); + + /// A common constructor that initializes state when a process is created, + /// including the guest's initial, root process. Of course, every process + /// includes at least one thread, but the process level state is allocated + /// before thread level-state for the process's main thread is allocated. + /// + /// For now this method assumes access to the global state, but that may + /// change. + fn new(_pid: Pid, _cfg: &::Config) -> Self { + Default::default() + } + + /// Events the tool subscribes to. This is only called *once* for the entire + /// tree. By default, all syscalls are traced (but CPUID/RDTSC instructions + /// are not). + fn subscriptions(_cfg: &::Config) -> Subscription { + Subscription::all_syscalls() + } + + /// A guest process creates additional threads, which need their tool state + /// initialized. This method returns a newly-allocated thread state. This + /// method necessarily runs before the first instruction of a newly created + /// guest thread. + /// + /// If the parent thread is running a handler which injects a fork, this + /// callback executes on behalf of the child and may observe the parent's + /// thread-local state just this one time. It is important to know WHEN that + /// view into the parent's thread-state occurs. 
We currently guarantee that + /// this is *immediately* upon the `.inject()` call that creates the child + /// thread. Any later point of execution for `init_thread_state` could delay + /// the creation of the child arbitrarily long, waiting for the parent to + /// relinquish its hold on its own thread-local state. + /// + /// The parent Tid always refers to the thread-ID that called + /// fork/clone/vfork in order to create the new guest thread. Access to the + /// parent's state allows the child state to be defined in terms of modifying + /// the parent's, such as tracking the depth in a tree of threads. + /// + /// # Arguments + /// + /// * `&self`: a handle on the process-level state. + /// * `child`: the new child thread's ID. + /// * `parent`: A tuple of the parent thread ID and a snapshot of the + /// parent's thread-local state. This is `None` if the current thread is the + /// root of the guest process tree. + fn init_thread_state( + &self, + _child: Tid, + _parent: Option<(Tid, &Self::ThreadState)>, + ) -> Self::ThreadState { + Default::default() + } + + /// Similar to `handle_syscall_event`, except this traps the first + /// instruction executed by a new thread. Typical uses of this method include + /// delaying thread execution or running initialization actions (injections + /// or rpcs). + /// + /// Both this callback and `init_thread_state` run once for every newly + /// created thread. The important difference is that this callback is + /// guaranteed to run independently from the parent. It does not view the + /// parent's state, and this handler runs in its own asynchronous task. + /// Blocking this task on an `.await` will not interfere with the progress of + /// the parent thread. + /// + /// # Arguments + /// + /// * `&self`: The process-level state for this thread. + /// * `guest`: A handle to the guest thread. + async fn handle_thread_start>(&self, _guest: &mut T) -> Result<(), Error> { + Ok(()) + } + + /// Called upon a *successful* execve.
In `handle_syscall_event`, after + /// injecting `execve`, it is not possible to run code after a successful + /// `execve` because it never returns. + /// + /// NOTE: Thread and process state are unchanged across this execve boundary. + /// Thus, this can be useful for doing something like counting the number of + /// times a process successfully calls `execve`. + async fn handle_post_exec>(&self, _guest: &mut T) -> Result<(), Errno> { + Ok(()) + } + + /// The tool receives an event from the guest, via the Reverie program + /// instrumentation. A Reverie syscall handler fires in the moment *before* a + /// guest syscall executes (like a "prehook"). + /// + /// After the event is trapped, control transfers to `handle_syscall_event` + /// which is put in temporary control of the guest thread. Via `guest`, we + /// can directly access the thread/process local state, and we can also + /// remotely access (1) the global state and (2) the memory/registers of the + /// guest thread it controls. + /// + /// NOTE: Only syscalls we have subscribed to [`Tool::subscriptions`] will + /// have this handler invoked. + async fn handle_syscall_event>( + &self, + guest: &mut T, + c: Syscall, + ) -> Result { + guest.tail_inject(c).await + } + + /// CPUID is trapped, the tool should implement this function to return [eax, + /// ebx, ecx, edx] + /// + /// NOTE: This is never called by default unless cpuid events are subscribed + /// to. + async fn handle_cpuid_event>( + &self, + _guest: &mut T, + eax: u32, + ecx: u32, + ) -> Result { + Ok(cpuid!(eax, ecx)) + } + + /// rdtsc/rdtscp is trapped, the tool should implement this function to + /// return the counter. + /// + /// NOTE: This is never called by default unless rdtsc events are subscribed + /// to. + async fn handle_rdtsc_event>( + &self, + _guest: &mut T, + request: Rdtsc, + ) -> Result { + Ok(RdtscResult::new(request)) + } + + /// Handles a guest's signal before it is delivered to guest. 
+ /// + /// # Return value + /// - `Some(sig)`: The signal `sig` will be delivered to guest. + /// - `None`: The signal is suppressed and never delivered to the guest. + async fn handle_signal_event>( + &self, + _guest: &mut T, + signal: Signal, + ) -> Result, Errno> { + Ok(Some(signal)) + } + + /// Handles a timer event generated by a call to `Guest::set_timer` + async fn handle_timer_event>(&self, _guest: &mut T) {} + + /// Called when a thread will exit shortly or has exited. That means there + /// will be no more intercepted events on this thread. + /// + /// Serves as a "destructor" for the thread state, and thus takes it by move. + async fn on_exit_thread>( + &self, + _tid: Tid, + _global_state: &G, + _thread_state: Self::ThreadState, + _exit_status: ExitStatus, + ) -> Result<(), Error> { + Ok(()) + } + + /// Called when a process will exit shortly or has exited. That means there + /// will be no more intercepted events from any thread within this + /// process. + /// + /// Serves as a "destructor" for the process state (`self`), and thus takes + /// it by move. + async fn on_exit_process>( + self, + _pid: Pid, + _global_state: &G, + _exit_status: ExitStatus, + ) -> Result<(), Error> { + Ok(()) + } +} + +/// A "noop" tool that doesn't do anything. +impl Tool for () { + fn subscriptions(_cfg: &()) -> Subscription { + Subscription::none() + } +} + +/// A handle to send messages to the global state (potentially a remote, +/// inter-process communication). +#[async_trait] +pub trait GlobalRPC: Sync { + /// Send an RPC message to wherever the global state is stored, synchronously + /// blocks the current thread until a response is received.
+ async fn send_rpc(&self, message: G::Request) -> Result; + + /// Return the read-only tool configuration + fn config(&self) -> &G::Config; +} diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000..b7c9100 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,3 @@ +[toolchain] +# @fb-only: path = "../../third-party-buck/platform009/build/rust" +channel = "nightly" # @oss-only diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..4ecef32 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,5 @@ +# Get help on options with `rustfmt --help=config` +# Please keep these in alphabetical order. +edition = "2018" +merge_derives = false +use_field_init_shorthand = true diff --git a/tests/backtrace.rs b/tests/backtrace.rs new file mode 100644 index 0000000..8bdebb0 --- /dev/null +++ b/tests/backtrace.rs @@ -0,0 +1,69 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Tests for getting backtraces from the guest. + +#![cfg(not(sanitized))] + +use reverie::syscalls::Syscall; +use reverie::Error; +use reverie::ExitStatus; +use reverie::Guest; +use reverie::Tool; + +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct TestTool; + +#[reverie::tool] +impl Tool for TestTool { + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + if let Syscall::Getpid(_) = &syscall { + let backtrace = guest + .backtrace() + .expect("failed to get backtrace from guest"); + + // There's no guarantee our function is at the top of the stack, so + // we simply assert that it is *somewhere* in the stack. + assert!( + backtrace.iter().any(|frame| { + if let Some(symbol) = &frame.symbol { + // Due to name mangling, there won't be an exact match. 
+ symbol.name.contains("funky_function") + } else { + false + } + }), + "guest backtrace did not contain our expected function:\n{}", + backtrace + ); + } + + Ok(guest.inject(syscall).await?) + } +} + +#[inline(never)] +fn funky_function() { + let _ = unsafe { libc::getpid() }; +} + +#[test] +fn smoke() { + use reverie_ptrace::testing::test_fn; + + let (output, _) = test_fn::(funky_function).unwrap(); + + assert_eq!(output.status, ExitStatus::Exited(0)); +} diff --git a/tests/basics.rs b/tests/basics.rs new file mode 100644 index 0000000..c3fc41f --- /dev/null +++ b/tests/basics.rs @@ -0,0 +1,264 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#![feature(llvm_asm)] + +//! Basic tests that don't fall into some other category. + +#[allow(unused_imports)] +use nix::{ + sys::wait::{self, WaitStatus}, + unistd::{self, ForkResult}, +}; +use reverie::{ + syscalls::{Syscall, SyscallInfo, Sysno}, + Error, ExitStatus, GlobalTool, Guest, Pid, Tool, +}; +#[allow(unused_imports)] +use reverie_ptrace::testing::{check_fn, test_cmd, test_fn}; +use serde::{Deserialize, Serialize}; +#[allow(unused_imports)] +use std::ffi::CString; +#[allow(unused_imports)] +use std::io::Write; +use std::sync::atomic::{AtomicU64, Ordering}; + +#[derive(Debug, serde::Serialize, serde::Deserialize, Default)] +struct NoopTool; +impl Tool for NoopTool {} + +#[test] +fn noop_tool_test() { + let (output, _) = test_cmd::("/bin/pwd", &[]).unwrap(); + // pwd should succeed & print some characters: + assert_eq!(output.status, ExitStatus::Exited(0)); + assert!(!output.stdout.is_empty()); + assert!(output.stderr.is_empty()); +} + +#[derive(Debug, Serialize, Deserialize, Default)] +struct CounterGlobal { + num_syscalls: AtomicU64, +} + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct CounterLocal {} + +/// The 
message sent to the global state method. +/// This contains the syscall number. +#[derive(PartialEq, Debug, Eq, Clone, Copy, Serialize, Deserialize)] +pub struct IncrMsg(Sysno); + +#[reverie::global_tool] +impl GlobalTool for CounterGlobal { + type Request = IncrMsg; + type Response = (); + async fn receive_rpc(&self, _from: Pid, _: IncrMsg) -> Self::Response { + AtomicU64::fetch_add(&self.num_syscalls, 1, Ordering::SeqCst); + } +} + +#[reverie::tool] +impl Tool for CounterLocal { + type GlobalState = CounterGlobal; + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + let sysno = syscall.number(); + let _ = guest.send_rpc(IncrMsg(sysno)).await?; + guest.tail_inject(syscall).await + } +} + +#[test] +fn counter_tool_test() { + let (output, state) = test_cmd::("ls", &[]).unwrap(); + assert_eq!(output.status, ExitStatus::Exited(0)); + // ls should print some characters and perform some syscalls: + assert!(!output.stdout.is_empty()); + assert!(AtomicU64::load(&state.num_syscalls, Ordering::SeqCst) > 30); +} + +#[test] +fn error_exit_test() { + let (output, state) = test_cmd::("/bin/bash", &["-c", "exit 42"]).unwrap(); + assert_eq!(output.status, ExitStatus::Exited(42)); + assert_eq!(output.stdout.len(), 0); + assert!(AtomicU64::load(&state.num_syscalls, Ordering::SeqCst) > 0); +} + +#[allow(dead_code)] +fn fn_test() { + let (output, state) = test_fn::(|| { + let pid = nix::unistd::getpid(); + println!("Hello world1! 
Pid = {:?}", pid); + unsafe { + libc::syscall(libc::SYS_write, 1, "Hello world2!\n", 14); + } + let _gid = nix::unistd::getgid(); + }) + .unwrap(); + + println!( + " >>> Command complete, stdout len {}, stderr len {}", + output.stdout.len(), + output.stderr.len(), + ); + assert_eq!(output.status, ExitStatus::Exited(0)); + assert_eq!(output.stderr.len(), 0); + assert!(AtomicU64::load(&state.num_syscalls, Ordering::SeqCst) > 1); +} + +#[cfg(not(sanitized))] +#[test] +fn run_fn_test() { + fn_test(); +} + +#[cfg(not(sanitized))] +#[test] +fn run_guest_command_test() { + let (output, _state) = test_cmd::("/bin/echo", &["-n", "abcd"]).unwrap(); + assert_eq!(output.status, ExitStatus::Exited(0)); + assert_eq!(output.stdout.as_slice(), b"abcd"); +} + +#[cfg(not(sanitized))] +#[test] +fn run_guest_command_test_closure() { + let msg = "abcd"; + let (output, _state) = test_cmd::("/bin/echo", &["-n", msg]).unwrap(); + assert_eq!(output.status, ExitStatus::Exited(0)); + assert_eq!(output.stdout.as_slice(), msg.as_bytes()); +} + +#[cfg(not(sanitized))] +#[test] +fn run_guest_func_write_test() { + let msg = "abcd"; + let (output, _state) = test_fn::(move || { + std::io::stdout().write_all(msg.as_bytes()).unwrap(); + std::io::stdout().flush().unwrap(); + }) + .unwrap(); + assert_eq!(output.status, ExitStatus::Exited(0)); + assert_eq!(output.stdout.as_slice(), msg.as_bytes()); +} + +#[cfg(not(sanitized))] +#[test] +fn run_guest_func_print_test() { + let msg = "abcd"; + let (output, _state) = test_fn::(move || { + println!("{}", msg); + }) + .unwrap(); + + assert_eq!(output.status, ExitStatus::Exited(0)); + assert_eq!(output.stdout.as_slice(), b"abcd\n"); +} + +#[cfg(not(sanitized))] +#[test] +fn orphans() { + use nix::unistd::{fork, ForkResult}; + use std::{thread, time::Duration}; + + let (output, _state) = test_fn::(|| { + // Spawn a child process and make sure the parent exits before the child + // process. 
+ match unsafe { fork() }.unwrap() { + ForkResult::Parent { child: _child } => { + // Don't wait on the child. Just exit. + } + ForkResult::Child => { + // Sleep for a little while so the parent has time to exit. + thread::sleep(Duration::from_secs(1)); + } + } + }) + .unwrap(); + + assert_eq!(output.status, ExitStatus::Exited(0)); +} + +#[cfg(not(sanitized))] +#[test] +fn rust_execve_noexist_test() { + use reverie_ptrace::testing::check_fn; + check_fn::(|| { + let program = CString::new("I do not exist").unwrap(); + let env = CString::new("foo=bar").unwrap(); + let res = nix::unistd::execve(&program, &[&program], &[&env]); + assert!(res.is_err()); + }); +} + +#[cfg(not(sanitized))] +#[test] +fn i_should_segfault() { + use nix::sys::signal::Signal::SIGSEGV; + use reverie_ptrace::testing::test_fn; + let (output, _) = test_fn::(|| { + unsafe { + let invalid_pointer = 0x5u64 as *mut u64; + std::ptr::write(invalid_pointer, 0xdeadbeefu64); + }; + }) + .unwrap(); + assert_eq!(output.status, ExitStatus::Signaled(SIGSEGV, true),); +} + +#[cfg(not(sanitized))] +#[test] +fn i_should_segfault_2() { + use nix::sys::signal::Signal::SIGSEGV; + use reverie_ptrace::testing::test_fn; + let (output, _) = test_fn::(|| unsafe { + llvm_asm!(r#"mov $$0, %rax + jmpq *%rax + "#:::"rax") + }) + .unwrap(); + assert_eq!(output.status, ExitStatus::Signaled(SIGSEGV, true),); +} + +#[cfg(not(sanitized))] +#[test] +fn child_should_inherit_fds() { + check_fn::(move || { + let (fdread, fdwrite) = unistd::pipe().unwrap(); + let msg: [u8; 8] = [0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8]; + match unsafe { unistd::fork() } { + Ok(ForkResult::Parent { child, .. 
}) => { + assert!(unistd::close(fdwrite).is_ok()); + let mut buf: [u8; 8] = [0; 8]; + assert_eq!(unistd::read(fdread, &mut buf), Ok(8)); + assert_eq!(buf, msg); + assert_eq!(wait::waitpid(child, None), Ok(WaitStatus::Exited(child, 0))); + unsafe { + libc::syscall(libc::SYS_exit_group, 0) + }; + unreachable!(); + } + Ok(ForkResult::Child) => { + assert!(unistd::close(fdread).is_ok()); + assert_eq!(unistd::write(fdwrite, &msg), Ok(8)); + unsafe { + libc::syscall(libc::SYS_exit_group, 0) + }; + unreachable!(); + } + Err(err) => { + panic!("fork failed: {:?}", err); + } + } + }); +} diff --git a/tests/busywait.rs b/tests/busywait.rs new file mode 100644 index 0000000..234b9c8 --- /dev/null +++ b/tests/busywait.rs @@ -0,0 +1,224 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Counts and verifies the number of reverie events received while the guest is busywaiting +//! or otherwise CPU spinning. The beginning and end of the busywait are marked by `clock_getres` +//! syscalls to avoid errantly counting end-of-process syscalls/events. +//! +//! This verifies that timer events, if requested, are delivered during busywaits and are not delivered +//! if not requested. 
+ +use libc; +use raw_cpuid::cpuid; +use reverie::{ + syscalls::Syscall, CpuIdResult, Errno, Error, ExitStatus, GlobalRPC, GlobalTool, Guest, Pid, + Rdtsc, RdtscResult, Signal, Tid, TimerSchedule, Tool, +}; +use serde::{Deserialize, Serialize}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::time::{Duration, Instant}; + +#[derive(Debug, Serialize, Deserialize, Default)] +struct GlobalState { + num_evts: AtomicU64, + num_timer_evts: AtomicU64, + collect: AtomicBool, +} + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct LocalState; + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct Config { + set_timer: bool, +} + +#[derive(PartialEq, Debug, Eq, Clone, Copy, Serialize, Deserialize)] +pub enum IncrMsg { + Increment, + ToggleCollection, + TimerEvent, +} + +#[reverie::global_tool] +impl GlobalTool for GlobalState { + type Request = IncrMsg; + type Response = (); + type Config = Config; + + async fn init_global_state(_: &Self::Config) -> Self { + GlobalState { + num_evts: AtomicU64::new(0), + num_timer_evts: AtomicU64::new(0), + collect: AtomicBool::new(false), + } + } + + async fn receive_rpc(&self, _from: Pid, msg: IncrMsg) -> Self::Response { + match msg { + IncrMsg::ToggleCollection => { + self.collect.fetch_xor(true, Ordering::SeqCst); + } + IncrMsg::Increment if self.collect.load(Ordering::SeqCst) => { + self.num_evts.fetch_add(1, Ordering::SeqCst); + } + IncrMsg::Increment => {} + IncrMsg::TimerEvent => { + self.num_timer_evts.fetch_add(1, Ordering::SeqCst); + } + } + } +} + +// Use RCBs directly to ensure determinism tests are robust to changes in +// conversion from realtime to RCBs. +const TIMEOUT: TimerSchedule = TimerSchedule::Rcbs(120_000_000); + +/// Should implement _all_ reverie callbacks. 
+#[reverie::tool] +impl Tool for LocalState { + type GlobalState = GlobalState; + + async fn handle_thread_start>(&self, guest: &mut T) -> Result<(), Error> { + guest.send_rpc(IncrMsg::Increment).await.unwrap(); + Ok(()) + } + + async fn handle_post_exec>(&self, guest: &mut T) -> Result<(), Errno> { + guest.send_rpc(IncrMsg::Increment).await.unwrap(); + Ok(()) + } + + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + if let Syscall::ClockGetres(_) = syscall { + // clock_getres denotes the start/end of the busywait + guest.send_rpc(IncrMsg::ToggleCollection).await?; + if guest.config().set_timer { + guest.set_timer_precise(TIMEOUT).unwrap(); + } + } else { + guest.send_rpc(IncrMsg::Increment).await?; + } + guest.tail_inject(syscall).await + } + + async fn handle_cpuid_event>( + &self, + guest: &mut T, + eax: u32, + ecx: u32, + ) -> Result { + guest.send_rpc(IncrMsg::Increment).await.unwrap(); + Ok(cpuid!(eax, ecx)) + } + + async fn handle_rdtsc_event>( + &self, + guest: &mut T, + request: Rdtsc, + ) -> Result { + guest.send_rpc(IncrMsg::Increment).await.unwrap(); + Ok(RdtscResult::new(request)) + } + + async fn handle_signal_event>( + &self, + guest: &mut T, + signal: Signal, + ) -> Result, Errno> { + guest.send_rpc(IncrMsg::Increment).await.unwrap(); + Ok(Some(signal)) + } + + async fn handle_timer_event>(&self, guest: &mut T) { + guest.send_rpc(IncrMsg::TimerEvent).await.unwrap(); + guest.set_timer_precise(TIMEOUT).unwrap(); + } + + async fn on_exit_thread>( + &self, + _tid: Tid, + global_state: &G, + _thread_state: Self::ThreadState, + _exit_status: ExitStatus, + ) -> Result<(), Error> { + global_state.send_rpc(IncrMsg::Increment).await?; + Ok(()) + } + + async fn on_exit_process>( + self, + _pid: Pid, + global_state: &G, + _exit_status: ExitStatus, + ) -> Result<(), Error> { + global_state.send_rpc(IncrMsg::Increment).await?; + Ok(()) + } +} + +/// Inform the Tool to begin counting events via a specific syscall +fn 
do_marker_syscall() { + unsafe { + libc::clock_getres(libc::CLOCK_MONOTONIC, std::ptr::null_mut()); + } +} + +#[cfg(all(not(sanitized), test))] +mod tests { + use super::*; + use reverie_ptrace::testing::{check_fn_with_config, do_branches}; + + #[test] + fn guest_busywait_no_timer() { + let start = Instant::now(); + let gs = check_fn_with_config::( + move || { + // Signal start/end of busywait via marker syscall + do_marker_syscall(); + do_branches(10_000_000_000); + do_marker_syscall(); + }, + Config { set_timer: false }, + true, + ); + // Spin outlasts any reasonable scheduling interval + assert!(start.elapsed() > Duration::from_millis(2700)); + // No events received during busywait + assert_eq!(gs.num_evts.into_inner(), 0); + assert_eq!(gs.num_timer_evts.into_inner(), 0); + } + + #[test] + fn guest_busywait_timer() { + use reverie_ptrace::ret_without_perf; + ret_without_perf!(); + let start = Instant::now(); + let gs = check_fn_with_config::( + move || { + // Signal start/end of busywait via marker syscall + do_marker_syscall(); + do_branches(10_000_000_000); + do_marker_syscall(); + }, + Config { set_timer: true }, + true, + ); + // Spin outlasts any reasonable scheduling interval + assert!(start.elapsed() > Duration::from_millis(2700)); + // Events received only from timer + assert_eq!(gs.num_evts.into_inner(), 0); + // Soft test of determinism: assert exact number of timer events + assert_eq!(gs.num_timer_evts.into_inner(), 83); + } +} diff --git a/tests/c_tests/cc_no_shlib.sh b/tests/c_tests/cc_no_shlib.sh new file mode 100755 index 0000000..377c689 --- /dev/null +++ b/tests/c_tests/cc_no_shlib.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# fbmake will pass --install-dir and --fbcode--dir while +# buck won't pass it. 
So drop all arguments except for the last two +while [[ $# -gt 1 ]]; do + shift; +done + +src=$1 + +shift + +path=$INSTALL_DIR +[[ ! -d $path ]] && mkdir -p "$path"; + +output_file=$(basename "$src") +output=$INSTALL_DIR/${output_file%.*} + +echo "compiling $output from $src" + +cc=clang.par + +${cc} -nostdlib -o "$output" "$src" "$*" diff --git a/tests/c_tests/clock-nanosleep.c b/tests/c_tests/clock-nanosleep.c new file mode 100644 index 0000000..f95fc80 --- /dev/null +++ b/tests/c_tests/clock-nanosleep.c @@ -0,0 +1,30 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) { + struct timespec req = { + .tv_sec = 0, + .tv_nsec = 100000000, + }; + struct timespec rem; + int ret; + + do { + ret = clock_nanosleep(CLOCK_REALTIME, 0, &req, &rem); + memcpy(&req, &rem, sizeof(req)); + } while (ret != 0 && errno == EINTR); + + return 0; +} diff --git a/tests/c_tests/forkExec.c b/tests/c_tests/forkExec.c new file mode 100644 index 0000000..43fcc5f --- /dev/null +++ b/tests/c_tests/forkExec.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char* argv[], char* envp[]) { + if (argc == 2 && strcmp(argv[1], "child") == 0) { + printf("exec pid: %u\n", getpid()); + _exit(0); + } + + pid_t pid = fork(); + + if (pid < 0) { + perror("fork failed: "); + exit(1); + } else if (pid == 0) { + char* prog = argv[0]; + char* const newArgv[] = {prog, "child", NULL}; + printf("child pid: %u\n", getpid()); + execve(prog, newArgv, envp); + printf("exec failed: %s\n", strerror(errno)); + } else { + int status; + printf("parent pid: %u\n", getpid()); + waitpid(pid, &status, 0); + if (WIFSIGNALED(status)) { + printf("%u terminated by signal: %u\n", pid, WTERMSIG(status)); + } + } +} diff --git a/tests/c_tests/forkMany-blockSigchld.c b/tests/c_tests/forkMany-blockSigchld.c new file mode 100644 index 0000000..49134ef --- /dev/null +++ b/tests/c_tests/forkMany-blockSigchld.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+/* Local assert replacement: aborts when the condition is false. */
+#define assert(b) \
+  if (!(b))       \
+    abort();
+
+#define TESTS_NLOOPS 100
+
+static _Atomic unsigned long* counter;
+/* Forks TESTS_NLOOPS children with SIGCHLD blocked; parent and child each bump the counter. */
+int main(int argc, char* argv[]) {
+  sigset_t oldset, set;
+  pid_t pid;
+  unsigned long c;
+  int status;
+  /* The counter lives in a MAP_SHARED page so forked children update the same value. */
+  counter = mmap(
+      0, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+  assert((unsigned long)counter != -1UL);
+  /* This variant always adds SIGCHLD to the blocked set. */
+  sigprocmask(SIG_BLOCK, NULL, &set);
+  sigaddset(&set, SIGCHLD);
+  sigprocmask(SIG_BLOCK, &set, &oldset);
+  /* The same blocking sequence runs again when --block-sigchld is given. */
+  if (argc == 2 && strcmp(argv[1], "--block-sigchld") == 0) {
+    sigprocmask(SIG_BLOCK, NULL, &set);
+    sigaddset(&set, SIGCHLD);
+    sigprocmask(SIG_BLOCK, &set, &oldset);
+  }
+
+  for (int i = 0; i < TESTS_NLOOPS; i++) {
+    kill(getpid(), SIGCHLD); /* raise SIGCHLD against ourselves before each fork */
+    pid = fork();
+    // Child
+    if (pid == 0) {
+      c = atomic_fetch_add(counter, 1);
+      exit(0);
+    } else if (pid > 0) {
+      c = atomic_fetch_add(counter, 1);
+    } else {
+      perror("fork: ");
+      exit(1);
+    }
+  }
+  /* Keep reaping until wait() no longer returns a child pid. */
+  while ((pid = wait(&status)) > 0)
+    ;
+  /* Every iteration contributes one increment from the parent and one from the child. */
+  unsigned long expected = 2 * TESTS_NLOOPS;
+  unsigned long got = atomic_load(counter);
+
+  printf("counter: expected: %lu got: %lu\n", expected, got);
+
+  return 0;
+}
diff --git a/tests/c_tests/forkMany.c b/tests/c_tests/forkMany.c
new file mode 100644
index 0000000..bcd1519
--- /dev/null
+++ b/tests/c_tests/forkMany.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+/* Local assert replacement: aborts when the condition is false. */
+#define assert(b) \
+  if (!(b))       \
+    abort();
+
+#define TESTS_NLOOPS 100
+
+static _Atomic unsigned long* counter;
+/* Forks TESTS_NLOOPS children; parent and child each bump a shared counter. */
+int main(int argc, char* argv[]) {
+  sigset_t oldset, set;
+  pid_t pid;
+  unsigned long c;
+  int status;
+  /* The counter lives in a MAP_SHARED page so forked children update the same value. */
+  counter = mmap(
+      0, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+  assert((unsigned long)counter != -1UL);
+  /* SIGCHLD is blocked only when --block-sigchld is given. */
+  if (argc == 2 && strcmp(argv[1], "--block-sigchld") == 0) {
+    sigprocmask(SIG_BLOCK, NULL, &set);
+    sigaddset(&set, SIGCHLD);
+    sigprocmask(SIG_BLOCK, &set, &oldset);
+  }
+
+  for (int i = 0; i < TESTS_NLOOPS; i++) {
+    kill(getpid(), SIGCHLD); /* raise SIGCHLD against ourselves before each fork */
+    pid = fork();
+    // Child
+    if (pid == 0) {
+      c = atomic_fetch_add(counter, 1);
+      exit(0);
+    } else if (pid > 0) {
+      c = atomic_fetch_add(counter, 1);
+    } else {
+      perror("fork: ");
+      exit(1);
+    }
+  }
+  /* Keep reaping until wait() no longer returns a child pid. */
+  while ((pid = wait(&status)) > 0)
+    ;
+  /* Every iteration contributes one increment from the parent and one from the child. */
+  unsigned long expected = 2 * TESTS_NLOOPS;
+  unsigned long got = atomic_load(counter);
+
+  printf("counter: expected: %lu got: %lu\n", expected, got);
+
+  return 0;
+}
diff --git a/tests/c_tests/forkNoWait.c b/tests/c_tests/forkNoWait.c
new file mode 100644
index 0000000..5daa363
--- /dev/null
+++ b/tests/c_tests/forkNoWait.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+/* SIGCHLD handler: prints the pid and terminates the process with status 0. */
+void sigchld_handler(int sig, siginfo_t* _info, void* _context) {
+  if (sig != SIGCHLD) {
+    fprintf(stderr, "unexpected signal %d != SIGCHLD\n", sig);
+    abort();
+  }
+  char buf[256];
+  int n = snprintf(buf, 256, "%d caught SIGCHLD\n", getpid());
+  write(1, buf, n);
+  _exit(0);
+}
+/* Forks and re-executes itself; the parent never calls wait and instead sleeps. */
+int main(int argc, char* argv[], char* envp[]) {
+  struct sigaction sa;
+
+  memset(&sa, 0, sizeof(sa));
+  sa.sa_sigaction = sigchld_handler;
+  sa.sa_flags = SA_RESTART | SA_SIGINFO;
+  /* Installed before the "child" check, so the re-executed image installs it too. */
+  if (sigaction(SIGCHLD, &sa, NULL) != 0) {
+    fprintf(stderr, "sigaction failed: %s\n", strerror(errno));
+    exit(1);
+  }
+
+  if (argc == 2 && strcmp(argv[1], "child") == 0) { /* the re-executed image */
+    printf("exec pid: %u\n", getpid());
+    _exit(0);
+  }
+
+  pid_t pid = fork();
+
+  if (pid < 0) {
+    perror("fork failed: ");
+    exit(1);
+  } else if (pid == 0) {
+    char* prog = argv[0];
+    char* const newArgv[] = {prog, "child", NULL};
+    printf("child pid: %u\n", getpid());
+    execve(prog, newArgv, envp);
+    printf("exec failed: %s\n", strerror(errno)); /* only reached if execve returned */
+  } else {
+    printf("parent pid: %u\n", getpid());
+    struct timespec tp = {900, 0}; /* 900 s upper bound; sigchld_handler() exits first */
+    nanosleep(&tp, NULL);
+  }
+}
diff --git a/tests/c_tests/getpid-pie.c b/tests/c_tests/getpid-pie.c
new file mode 100644
index 0000000..a8992d7
--- /dev/null
+++ b/tests/c_tests/getpid-pie.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */ + +/* must build with flags -pie -fPIE -O */ + +#include +#include +#include +#include + +/** + * 0000000000000740 : + * 740: b8 27 00 00 00 mov $0x27,%eax + * 745: 0f 05 syscall + * 747: c3 retq + * 748: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1) + * 74f: 00 + */ +__attribute__((noinline)) static int sys_getpid(void) { + int ret; + asm volatile( + "mov $0x27, %%eax\n\t" + "syscall\n\t" + : "=r"(ret)); + return ret; +} + +int main(int argc, char* argv[]) { + int pid0 = getpid(); + int pid = sys_getpid(); + printf("pid = %d\n", pid); + if (pid0 != pid) + abort(); + + return 0; +} diff --git a/tests/c_tests/getpid.c b/tests/c_tests/getpid.c new file mode 100644 index 0000000..b811afa --- /dev/null +++ b/tests/c_tests/getpid.c @@ -0,0 +1,23 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include +#include +#include + +int main(int argc, char* argv[]) { + pid_t pid = getpid(); + printf(" my pid = %d\n", pid); + printf(" my ppid = %d\n", getppid()); + printf(" my uid = %d\n", getuid()); + printf(" my gid = %d\n", getgid()); + printf(" my sid = %d\n", getsid(pid)); + + exit(0); // Since glibc 2.3 this calls SYS_exit_group +} diff --git a/tests/c_tests/nanosleep.c b/tests/c_tests/nanosleep.c new file mode 100644 index 0000000..f2cfda7 --- /dev/null +++ b/tests/c_tests/nanosleep.c @@ -0,0 +1,58 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#include +#include +#include +#include +#include +#include + +#define NITERATIONS 1000 + +static long long diff_time( + const struct timespec* begin, + const struct timespec* end) { + long long r = 0; + r = (end->tv_sec - begin->tv_sec) * 1000000000 + + (end->tv_nsec - begin->tv_nsec); + return r / 1000; +} + +int main(int argc, char* argv[]) { + struct timespec req = { + .tv_sec = 0, + .tv_nsec = 1000000, + }; + struct timespec begin, end; + int ntests = NITERATIONS; + + // ignore first nanosleep + nanosleep(&req, NULL); + + // ignore first clock_gettime + clock_gettime(CLOCK_REALTIME, &end); + + clock_gettime(CLOCK_REALTIME, &begin); + + for (int i = 0; i < 1000; i++) { + printf("nanosleep, iteration: %u\n", i); + nanosleep(&req, NULL); + } + clock_gettime(CLOCK_REALTIME, &end); + + long long elapsed = diff_time(&begin, &end); + + printf( + "time elapsed %lluus for %u iterations, mean: %.3lfus\n", + elapsed, + ntests, + (double)elapsed / ntests); + + return 0; +} diff --git a/tests/c_tests/open-many.c b/tests/c_tests/open-many.c new file mode 100644 index 0000000..5b39627 --- /dev/null +++ b/tests/c_tests/open-many.c @@ -0,0 +1,32 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include +#include +#include +#include +#include + +#define assert(b) \ + if (!(b)) \ + abort(); + +int main(int argc, char* argv[]) { + const char* file = "/etc/passwd"; + int fd; + + for (int i = 0; i < 1000; i++) { + fd = open(file, O_RDONLY); + assert(access(file, O_RDONLY) == 0); + assert(fd >= 0); + close(fd); + } + + return 0; +} diff --git a/tests/c_tests/openat1.c b/tests/c_tests/openat1.c new file mode 100644 index 0000000..ec3a13f --- /dev/null +++ b/tests/c_tests/openat1.c @@ -0,0 +1,53 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +int segv(int sig, siginfo_t* info, void* u) { + unsigned char* ip = info->si_addr; + printf("received signal: %d, si_addr: %p\n", sig, ip); + + for (int i = 0; i < 8; i++) { + printf("%02x ", (int)ip[i] & 0xff); + } + printf("\n"); + + return 0; +} + +int main(int argc, char* argv[]) { + struct sigaction sa, old_sa; + const char* file = "/dev/urandom"; + int fd; + + memset(&sa, 0, sizeof(sa)); + sa.sa_flags = SA_RESETHAND | SA_SIGINFO; + + sigaction(SIGSEGV, &sa, &old_sa); + + fd = open(file, 0); + fprintf(stderr, "openat1: %d\n", fd); + if (fd < 0) { + fprintf(stderr, "open %s, error: %s\n", file, strerror(errno)); + } + + fd = open(file, 0); + fprintf(stderr, "openat1: %d\n", fd); + if (fd < 0) { + fprintf(stderr, "open %s, error: %s\n", file, strerror(errno)); + } + + return 0; +} diff --git a/tests/c_tests/signal1.c b/tests/c_tests/signal1.c new file mode 100644 index 0000000..09da822 --- /dev/null +++ b/tests/c_tests/signal1.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define assert(b) \ + if (!(b)) \ + abort(); + +#ifndef SA_RESTORER +#define SA_RESTORER 0x04000000 +#endif + +struct kernel_sigaction { + unsigned long sa__; + unsigned long flags; + unsigned long restorer; + unsigned long masks; +}; + +static int rt_sigreturn(void* regs) { + return (int)syscall(SYS_rt_sigreturn, regs); +} + +static int rt_sigaction( + int signum, + const struct kernel_sigaction* new, + struct kernel_sigaction* old) { + unsigned long r = + (unsigned long)syscall(SYS_rt_sigaction, signum, new, old, 8); + if (r >= ~0xfffUL) { + return (int)r; + } else { + return (int)r; + } +} + +static volatile int quit = 0; + +static void handler(int sig, siginfo_t* info, void* ucontext) { + static char msg[64]; + quit = 1; + size_t n = snprintf(msg, 64, "[OK] received signal %u\n", info->si_signo); + write(STDOUT_FILENO, msg, n); +} + +extern int __restore_rt(void); + +int main(int argc, char* argv[]) { + int ret; + struct kernel_sigaction old, new; + + memset(&old, 0, sizeof(old)); + memset(&new, 0, sizeof(new)); + + new.sa__ = (unsigned long)handler; + new.flags = SA_RESTART | SA_RESTORER | SA_SIGINFO; + new.restorer = (unsigned long)rt_sigreturn; + + ret = rt_sigaction(SIGALRM, &new, NULL); + if (ret < 0) { + perror("rt_sigaction"); + exit(1); + } + + ret = rt_sigaction(SIGALRM, NULL, &old); + if (ret < 0) { + perror("rt_sigaction"); + exit(1); + } + + assert(old.sa__ == (unsigned long)handler); + + alarm(1); + + while (!quit) + ; + + return 0; +} diff --git a/tests/c_tests/signal2.c b/tests/c_tests/signal2.c new file mode 100644 index 0000000..e325a60 --- /dev/null +++ b/tests/c_tests/signal2.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+/* Local assert replacement: aborts when the condition is false. */
+#define assert(b) \
+  if (!(b))       \
+    abort();
+
+static volatile int quit = 0; /* set by handler(), polled by main() */
+/* SIGALRM handler: records delivery in quit and writes a confirmation line. */
+static void handler(int sig, siginfo_t* info, void* ucontext) {
+  static char msg[64];
+  quit = 1;
+  size_t n = snprintf(msg, 64, "[OK] received signal %u\n", info->si_signo);
+  write(STDOUT_FILENO, msg, n);
+}
+/* Installs a SIGALRM handler with sigaction(), reads it back, waits for the alarm. */
+int main(int argc, char* argv[]) {
+  int ret;
+  sigset_t sigset;
+  struct sigaction old, new;
+
+  memset(&old, 0, sizeof(old));
+  memset(&new, 0, sizeof(new));
+
+  sigemptyset(&sigset);
+  new.sa_sigaction = handler;
+  new.sa_mask = sigset; /* empty set */
+  new.sa_flags = SA_RESTART | SA_SIGINFO;
+
+  ret = sigaction(SIGALRM, &new, NULL);
+  if (ret < 0) {
+    perror("rt_sigaction");
+    exit(1);
+  }
+  /* Query only: fetch the action that was just installed. */
+  ret = sigaction(SIGALRM, NULL, &old);
+  if (ret < 0) {
+    perror("rt_sigaction");
+    exit(1);
+  }
+
+  assert((unsigned long)old.sa_sigaction == (unsigned long)handler);
+
+  alarm(1);
+  /* Spin until handler() sets quit. */
+  while (!quit)
+    ;
+
+  return 0;
+}
diff --git a/tests/c_tests/signal3.c b/tests/c_tests/signal3.c
new file mode 100644
index 0000000..1be0a98
--- /dev/null
+++ b/tests/c_tests/signal3.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+/* Local assert replacement: aborts when the condition is false. */
+#define assert(b) \
+  if (!(b))       \
+    abort();
+
+static volatile int quit = 0; /* set by handler(), polled by main() */
+/* SIGALRM handler: records delivery in quit and writes a confirmation line. */
+static void handler(int sig, siginfo_t* info, void* ucontext) {
+  static char msg[64];
+  quit = 1;
+  size_t n = snprintf(msg, 64, "[OK] received signal %u\n", info->si_signo);
+  write(STDOUT_FILENO, msg, n);
+}
+/* Prints every field of *sa, including each word of its signal mask. */
+static void dump_sa(const struct sigaction* sa) {
+  printf(" struct sigaction @%p\n ", sa);
+  printf(
+      "handler = %p, sigaction = %p, flags = %x, restorer = %p, sigset: \n ",
+      sa->sa_handler,
+      sa->sa_sigaction,
+      sa->sa_flags,
+      sa->sa_restorer);
+  for (int i = 0; i < sizeof(sigset_t) / sizeof(long); i++) {
+    printf(" %016lx", sa->sa_mask.__val[i]);
+  }
+  printf("\n");
+}
+/* Installs a SIGALRM handler, dumps the new and read-back actions, waits for the alarm. */
+int main(int argc, char* argv[]) {
+  int ret;
+  sigset_t sigset;
+  struct sigaction old, new;
+
+  memset(&old, 0, sizeof(old));
+  memset(&new, 0, sizeof(new));
+
+  sigemptyset(&sigset);
+  new.sa_sigaction = handler;
+  new.sa_mask = sigset; /* empty set */
+  new.sa_flags = SA_RESTART | SA_SIGINFO;
+
+  ret = sigaction(SIGALRM, &new, &old);
+  if (ret < 0) {
+    perror("rt_sigaction");
+    exit(1);
+  }
+
+  dump_sa(&new);
+  /* Query only: fetch the action that was just installed. */
+  ret = sigaction(SIGALRM, NULL, &old);
+  if (ret < 0) {
+    perror("rt_sigaction");
+    exit(1);
+  }
+
+  dump_sa(&old);
+  assert((unsigned long)old.sa_sigaction == (unsigned long)handler);
+
+  alarm(1);
+  /* Spin until handler() sets quit. */
+  while (!quit)
+    ;
+
+  return 0;
+}
diff --git a/tests/c_tests/sigprocmask1.c b/tests/c_tests/sigprocmask1.c
new file mode 100644
index 0000000..536ed37
--- /dev/null
+++ b/tests/c_tests/sigprocmask1.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+/* Local assert replacement: aborts when the condition is false. */
+#define assert(b) \
+  if (!(b))       \
+    abort();
+
+static volatile int quit = 0; /* set by handler(), polled by main() */
+/* SIGALRM handler: records delivery in quit and writes a confirmation line. */
+static void handler(int sig, siginfo_t* info, void* ucontext) {
+  static char msg[64];
+  quit = 1;
+  size_t n = snprintf(msg, 64, "[OK] received signal %u\n", info->si_signo);
+  write(STDOUT_FILENO, msg, n);
+}
+/* Installs a SIGALRM handler, unblocks SIGALRM/SIGVTALRM, then waits for the alarm. */
+int main(int argc, char* argv[]) {
+  int ret;
+  sigset_t sigset, sigset_old;
+  struct sigaction old, new;
+
+  memset(&old, 0, sizeof(old));
+  memset(&new, 0, sizeof(new));
+
+  sigemptyset(&sigset);
+  sigemptyset(&sigset_old);
+  sigaddset(&sigset, SIGALRM);
+  sigaddset(&sigset, SIGVTALRM);
+  /* The same two-signal set serves as the handler mask and as the set to unblock. */
+  new.sa_sigaction = handler;
+  new.sa_mask = sigset;
+  new.sa_flags = SA_RESTART | SA_SIGINFO;
+
+  ret = sigaction(SIGALRM, &new, &old);
+  if (ret < 0) {
+    perror("rt_sigaction");
+    exit(1);
+  }
+
+  ret = sigprocmask(SIG_UNBLOCK, &sigset, &sigset_old);
+  assert(ret >= 0);
+  /* Query only: fetch the action that was just installed. */
+  ret = sigaction(SIGALRM, NULL, &old);
+  if (ret < 0) {
+    perror("rt_sigaction");
+    exit(1);
+  }
+
+  assert((unsigned long)old.sa_sigaction == (unsigned long)handler);
+
+  alarm(1);
+  /* Spin until handler() sets quit. */
+  while (!quit)
+    ;
+
+  return 0;
+}
diff --git a/tests/c_tests/thread8-cond-wait.c b/tests/c_tests/thread8-cond-wait.c
new file mode 100644
index 0000000..3bc818f
--- /dev/null
+++ b/tests/c_tests/thread8-cond-wait.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) ((sizeof(x)) / sizeof((x)[0]))
+#endif
+
+#define NR_THREADS 5
+
+static _Atomic unsigned int threads_started; /* threads that reached the wait */
+/* One condition variable per thread, indexed by thread id. */
+static pthread_cond_t conds[NR_THREADS] = {
+    PTHREAD_COND_INITIALIZER,
+    PTHREAD_COND_INITIALIZER,
+    PTHREAD_COND_INITIALIZER,
+    PTHREAD_COND_INITIALIZER,
+    PTHREAD_COND_INITIALIZER,
+};
+
+static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+/* Waits on this thread's own condition, then signals the next thread's (wrapping). */
+void* thread_entry(void* param) {
+  long id = (long)param;
+
+  printf("this is thread #%lu\n", id);
+
+  pthread_mutex_lock(&mutex);
+  atomic_fetch_add(&threads_started, 1);
+  pthread_cond_wait(&conds[id], &mutex);
+  pthread_cond_signal(&conds[(1 + id) % NR_THREADS]);
+
+  pthread_mutex_unlock(&mutex);
+
+  printf("%lu exited.\n", id);
+
+  return 0;
+}
+/* Starts the threads, signals thread 3 once all have started, then joins them. */
+int main(int argc, char* argv[]) {
+  pthread_t ids[NR_THREADS];
+  struct timespec tp = {0, 100000000};
+
+  for (long i = 0; i < NR_THREADS; i++) {
+    pthread_create(&ids[i], NULL, thread_entry, (void*)i);
+  }
+  /* Spin until every thread has incremented threads_started. */
+  while (atomic_load(&threads_started) != NR_THREADS)
+    ;
+
+  nanosleep(&tp, NULL); /* extra 100ms before the first signal */
+
+  int k = 3;
+  printf("signaling thread #%u\n", k);
+  pthread_cond_signal(&conds[k]);
+
+  for (int i = 0; i < NR_THREADS; i++) {
+    pthread_join(ids[i], NULL);
+  }
+
+  for (int i = 0; i < NR_THREADS; i++) {
+    pthread_cond_destroy(&conds[i]);
+  }
+
+  return 0;
+}
diff --git a/tests/c_tests/thread9-cond-bcast.c b/tests/c_tests/thread9-cond-bcast.c
new file mode 100644
index 0000000..d5b5e31
--- /dev/null
+++ b/tests/c_tests/thread9-cond-bcast.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) ((sizeof(x)) / sizeof((x)[0]))
+#endif
+
+#define NR_THREADS 5
+
+static _Atomic unsigned int threads_started; /* threads that reached the wait */
+/* Only conds[0] and mutexes[0] are used by the code below. */
+static pthread_cond_t conds[NR_THREADS] = {
+    PTHREAD_COND_INITIALIZER,
+    PTHREAD_COND_INITIALIZER,
+    PTHREAD_COND_INITIALIZER,
+    PTHREAD_COND_INITIALIZER,
+    PTHREAD_COND_INITIALIZER,
+};
+
+static pthread_mutex_t mutexes[NR_THREADS] = {
+    PTHREAD_MUTEX_INITIALIZER,
+    PTHREAD_MUTEX_INITIALIZER,
+    PTHREAD_MUTEX_INITIALIZER,
+    PTHREAD_MUTEX_INITIALIZER,
+    PTHREAD_MUTEX_INITIALIZER,
+};
+/* Every thread blocks on conds[0] under mutexes[0] until main() broadcasts. */
+void* thread_entry(void* param) {
+  long id = (long)param;
+
+  printf("this is thread #%lu\n", id);
+
+  pthread_mutex_lock(&mutexes[0]);
+
+  atomic_fetch_add(&threads_started, 1);
+  pthread_cond_wait(&conds[0], &mutexes[0]);
+
+  pthread_mutex_unlock(&mutexes[0]);
+
+  printf("%lu exited.\n", id);
+
+  return 0;
+}
+/* Starts the threads, broadcasts conds[0] once all have started, then joins them. */
+int main(int argc, char* argv[]) {
+  pthread_t ids[NR_THREADS];
+  struct timespec tp = {0, 100000000};
+
+  for (long i = 0; i < NR_THREADS; i++) {
+    pthread_create(&ids[i], NULL, thread_entry, (void*)i);
+  }
+  /* Spin until every thread has incremented threads_started. */
+  while (atomic_load(&threads_started) != NR_THREADS)
+    ;
+
+  nanosleep(&tp, NULL); /* extra 100ms before the broadcast */
+
+  pthread_cond_broadcast(&conds[0]);
+
+  for (int i = 0; i < NR_THREADS; i++) {
+    pthread_join(ids[i], NULL);
+  }
+
+  for (int i = 0; i < NR_THREADS; i++) {
+    pthread_cond_destroy(&conds[i]);
+  }
+
+  return 0;
+}
diff --git a/tests/c_tests/threads1.c b/tests/c_tests/threads1.c
new file mode 100644
index 0000000..11a132d
--- /dev/null
+++ b/tests/c_tests/threads1.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */ +#include +#include +#include +#include +#include +#include +#include + +#define assert(b) \ + if (!(b)) \ + abort(); + +#define NR_THREADS 2L +#define TIME_100MS 100000000UL + +static void test_clock_nanosleep(unsigned long ns) { + struct timespec req = { + .tv_sec = 0, + .tv_nsec = ns, + }; + struct timespec rem; + int ret; + + do { + ret = clock_nanosleep(CLOCK_REALTIME, 0, &req, &rem); + memcpy(&req, &rem, sizeof(req)); + } while (ret != 0 && errno == EINTR); +} + +static void* threaded(void* param) { + long k = (long)param; + + printf("thread %ld enter.\n", k); + + test_clock_nanosleep(TIME_100MS); + + printf("thread %ld exit.\n", k); + + return 0; +} + +int main(int argc, char* argv[]) { + // sleep in a non-threpaded context + test_clock_nanosleep(TIME_100MS); + + pthread_attr_t attr; + pthread_t threadid[NR_THREADS]; + + assert(pthread_attr_init(&attr) == 0); + + for (long i = 0; i < NR_THREADS; i++) { + assert(pthread_create(&threadid[i], &attr, threaded, (void*)i) == 0); + } + + for (long i = 0; i < NR_THREADS; i++) { + assert(pthread_join(threadid[i], NULL) == 0); + } + + assert(pthread_attr_destroy(&attr) == 0); + + return 0; +} diff --git a/tests/c_tests/threads2.c b/tests/c_tests/threads2.c new file mode 100644 index 0000000..e07a0c8 --- /dev/null +++ b/tests/c_tests/threads2.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define assert(b) \ + if (!(b)) \ + abort(); + +#define NR_THREADS 8L +#define TIME_100MS 100000000UL + +#define THREAD_SHARED_HEAP 0x67000000L + +static void test_clock_nanosleep(unsigned long ns) { + struct timespec req = { + .tv_sec = 0, + .tv_nsec = ns, + }; + struct timespec rem; + int ret; + + do { + ret = clock_nanosleep(CLOCK_REALTIME, 0, &req, &rem); + memcpy(&req, &rem, sizeof(req)); + } while (ret != 0 && errno == EINTR); +} + +static void* threaded_0(void* param) { + long k = (long)param; + char buf[128]; + int n; + + n = snprintf(buf, 128, "thread %ld enter.\n", k); + write(STDOUT_FILENO, buf, n); + + long* p = mmap( + (void*)THREAD_SHARED_HEAP, + 0x2000, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + -1, + 0); + + assert((void*)p == (void*)THREAD_SHARED_HEAP); + + p[k] = pthread_self(); + + test_clock_nanosleep(TIME_100MS); + + n = snprintf(buf, 128, "thread %ld exit.\n", k); + write(STDOUT_FILENO, buf, n); + + return 0; +} + +static void* threaded(void* param) { + long k = (long)param; + long* ptr = (long*)THREAD_SHARED_HEAP; + char buf[128]; + int n; + + n = snprintf(buf, 128, "thread %ld enter.\n", k); + write(STDOUT_FILENO, buf, n); + + ptr[k] = pthread_self(); + + test_clock_nanosleep(TIME_100MS); + + n = snprintf(buf, 128, "thread %ld exit.\n", k); + write(STDOUT_FILENO, buf, n); + + return 0; +} + +int main(int argc, char* argv[]) { + char buf[128]; + int n; + // sleep in a non-threpaded context + test_clock_nanosleep(TIME_100MS); + + pthread_attr_t attr; + pthread_t threadid[NR_THREADS]; + long* ptr = (long*)THREAD_SHARED_HEAP; + + assert(pthread_attr_init(&attr) == 0); + + long i = 0; + assert(pthread_create(&threadid[i], &attr, threaded_0, (void*)i) == 0); + pthread_join(threadid[i], NULL); + + for (i = 1; i < NR_THREADS; i++) { + assert(pthread_create(&threadid[i], &attr, threaded, (void*)i) == 0); + } + + for (i = 1; i < 
NR_THREADS; i++) { + assert(pthread_join(threadid[i], NULL) == 0); + } + + assert(pthread_attr_destroy(&attr) == 0); + + for (i = 0; i < NR_THREADS; i++) { + n = snprintf(buf, 128, "threads data: %lx\n", ptr[i]); + write(STDOUT_FILENO, buf, n); + } + + return 0; +} diff --git a/tests/c_tests/threads3.c b/tests/c_tests/threads3.c new file mode 100644 index 0000000..bfedf4a --- /dev/null +++ b/tests/c_tests/threads3.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define assert(b) \ + if (!(b)) \ + abort(); + +#define NR_THREADS 5L +#define TIME_100MS 100000000UL + +#define THREAD_SHARED_HEAP 0x67000000L + +static void test_clock_nanosleep(unsigned long ns) { + struct timespec req = { + .tv_sec = 0, + .tv_nsec = ns, + }; + struct timespec rem; + int ret; + + do { + ret = clock_nanosleep(CLOCK_REALTIME, 0, &req, &rem); + memcpy(&req, &rem, sizeof(req)); + } while (ret != 0 && errno == EINTR); +} + +static void* threaded_0(void* param) { + long k = (long)param; + + printf("thread %ld enter. 
allocating with mmap\n", k); + + long* p = mmap( + (void*)THREAD_SHARED_HEAP, + 0x2000, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + -1, + 0); + + assert((void*)p == (void*)THREAD_SHARED_HEAP); + + p[k] = pthread_self(); + + test_clock_nanosleep(TIME_100MS); + + printf("thread %ld exit.\n", k); + + return 0; +} + +static void* threaded(void* param) { + long k = (long)param; + long* ptr = (long*)THREAD_SHARED_HEAP; + + pid_t tid = syscall(SYS_gettid); + pid_t pid = getpid(); + pid_t ppid = getppid(); + pid_t pgid = getpgid(0); + + printf( + "self: %lx thread %05ld (tid=%u, pid=%u, ppid=%u, pgid=%u) enter.\n", + pthread_self(), + k, + tid, + pid, + ppid, + pgid); + + ptr[k] = pthread_self(); + + test_clock_nanosleep(TIME_100MS); + + printf( + "self: %lx thread %05ld (tid=%u, pid=%u, ppid=%u, pgid=%u) exit.\n", + pthread_self(), + k, + tid, + pid, + ppid, + pgid); + + return 0; +} + +static void thread_test_0(void) { + // sleep in a non-threpaded context + test_clock_nanosleep(TIME_100MS); + + pthread_attr_t attr; + pthread_t threadid[NR_THREADS]; + long* ptr = (long*)THREAD_SHARED_HEAP; + + assert(pthread_attr_init(&attr) == 0); + + long i = 0; + assert(pthread_create(&threadid[i], &attr, threaded_0, (void*)i) == 0); + pthread_join(threadid[i], NULL); + + for (i = 1; i < NR_THREADS; i++) { + assert(pthread_create(&threadid[i], &attr, threaded, (void*)i) == 0); + } + + for (i = 1; i < NR_THREADS; i++) { + assert(pthread_join(threadid[i], NULL) == 0); + } + + assert(pthread_attr_destroy(&attr) == 0); + + for (i = 0; i < NR_THREADS; i++) { + printf("%lu threads data: %lx\n", i, ptr[i]); + } + assert(ptr[100] == 0); +} + +static void thread_test_1(void) { + // sleep in a non-threpaded context + test_clock_nanosleep(TIME_100MS); + + pthread_attr_t attr; + pthread_t threadid[NR_THREADS]; + long* ptr = (long*)THREAD_SHARED_HEAP; + + assert(pthread_attr_init(&attr) == 0); + + long i = 0; + assert( + pthread_create(&threadid[i], &attr, threaded_0, (void*)(100 + 
i)) == 0); + pthread_join(threadid[i], NULL); + + for (i = 1; i < NR_THREADS; i++) { + assert( + pthread_create(&threadid[i], &attr, threaded, (void*)(100 + i)) == 0); + } + + for (i = 1; i < NR_THREADS; i++) { + assert(pthread_join(threadid[i], NULL) == 0); + } + + assert(pthread_attr_destroy(&attr) == 0); + + for (i = 0; i < NR_THREADS; i++) { + printf("%lu threads data: %lx\n", 100 + i, ptr[100 + i]); + } + assert(ptr[0] == 0); +} + +int main(int argc, char* argv[]) { + pid_t pid; + + pid = fork(); + + if (pid < 0) { + perror("fork"); + exit(1); + } else if (pid == 0) { /* child */ + printf("child pid: %u, parent: %u\n", getpid(), getppid()); + thread_test_0(); + } else { + int status; + printf("parent pid: %u, parent: %u\n", getpid(), getppid()); + thread_test_1(); + + // wait for SIGCHLD + waitpid(pid, &status, 0); + } + + return 0; +} diff --git a/tests/c_tests/threads4.c b/tests/c_tests/threads4.c new file mode 100644 index 0000000..1006696 --- /dev/null +++ b/tests/c_tests/threads4.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define assert(b) \ + if (!(b)) \ + abort(); + +#define THREAD_LOOP_COUNT 1000 +#define NR_THREADS 4L +#define TIME_100MS 100000000UL + +static void* threaded(void* param) { + long k = (long)param; + char buf[32]; + int n; + + n = snprintf(buf, 32, "%lu", k); + + for (int i = 0; i < THREAD_LOOP_COUNT; i++) { + write(STDERR_FILENO, buf, n); + } + + return 0; +} + +int main(int argc, char* argv[]) { + pthread_attr_t attr; + pthread_t threadid[NR_THREADS]; + + assert(pthread_attr_init(&attr) == 0); + + for (long i = 0; i < NR_THREADS; i++) { + assert(pthread_create(&threadid[i], &attr, threaded, (void*)i) == 0); + } + + for (long i = 0; i < NR_THREADS; i++) { + assert(pthread_join(threadid[i], NULL) == 0); + } + + assert(pthread_attr_destroy(&attr) == 0); + + return 0; +} diff --git a/tests/c_tests/threads5.c b/tests/c_tests/threads5.c new file mode 100644 index 0000000..470e7d1 --- /dev/null +++ b/tests/c_tests/threads5.c @@ -0,0 +1,156 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define assert(b) \ + if (!(b)) \ + abort(); + +#define NR_THREADS 8L + +#define NSECS_PER_SEC 1000000000L +#define FIVE_SECONDS (5UL * NSECS_PER_SEC) + +static int ts_printf(const char* fmt, ...) 
{ + va_list ap; + struct timespec ts; + char buf[8192]; + int n; + + clock_gettime(CLOCK_REALTIME, &ts); + + va_start(ap, fmt); + n = snprintf(buf, 8192, "%lu.%06lu|", ts.tv_sec, ts.tv_nsec / 1000); + if (n < 8192) { + n += vsnprintf(buf + n, 8192 - n, fmt, ap); + } + va_end(ap); + + fputs(buf, stdout); + + return n; +} + +__attribute__((weak)) pid_t gettid(void) { + return syscall(SYS_gettid); +} + +static void thread_delay(unsigned long ns) { + struct timespec req = { + .tv_sec = ns / NSECS_PER_SEC, + .tv_nsec = ns % NSECS_PER_SEC, + }; + struct timespec rem; + int ret; + + do { + ret = nanosleep(&req, &rem); + memcpy(&req, &rem, sizeof(req)); + } while (ret != 0 && errno == EINTR); +} + +static void* threaded(void* param) { + long k = (long)param; + unsigned long delay = FIVE_SECONDS; + + if (k == 5) { + delay = NSECS_PER_SEC; + } + + ts_printf("thread %lu enter. pid=%u, tid=%u\n", k, getpid(), gettid()); + if (k == 5) { + thread_delay(2 * NSECS_PER_SEC); + ts_printf("thread %lu call fork.\n", k); + pid_t pid = fork(); + + assert(pid >= 0); + + if (pid > 0) { + int status; + ts_printf( + "after fork, I'm parent pid = %u, child pid = %u, tid = %u\n", + getpid(), + pid, + gettid()); + waitpid(pid, &status, 0); + ts_printf("parent pid = %u exit\n", getpid()); + } else { + ts_printf( + "after fork, I'm child pid = %u, parent = %u, tid = %u\n", + getpid(), + getppid(), + gettid()); + thread_delay(NSECS_PER_SEC / 2); + ts_printf("child pid = %u exit\n", getpid()); + } + ts_printf("thread %lu exit. pid=%u, tid=%u\n", k, getpid(), gettid()); + } else { + unsigned long loops = 10; + unsigned long delay_per_loop = delay / loops; + for (unsigned long i = 0; i < loops; i++) { + ts_printf( + "thread# %lu reporting pid: %u, tid: %u\n", k, getpid(), gettid()); + thread_delay(delay_per_loop); + } + ts_printf("thread %lu exit. 
pid=%u, tid=%u\n", k, getpid(), gettid()); + } + + return 0; +} + +static void atfork_prepare(void) { + ts_printf("pthread_atfork prepare.\n"); +} + +static void atfork_parent(void) { + ts_printf( + "pthread_atfork parent pid = %u, ppid = %u, tid = %u.\n", + getpid(), + getppid(), + gettid()); +} + +static void atfork_child(void) { + ts_printf( + "pthread_atfork child pid = %u, ppid = %u, tid = %u.\n", + getpid(), + getppid(), + gettid()); +} + +int main(int argc, char* argv[]) { + pthread_attr_t attr; + pthread_t threadid[NR_THREADS]; + + assert(pthread_attr_init(&attr) == 0); + + pthread_atfork(atfork_prepare, atfork_parent, atfork_child); + + for (long i = 0; i < NR_THREADS; i++) { + assert(pthread_create(&threadid[i], &attr, threaded, (void*)i) == 0); + } + + for (long i = 0; i < NR_THREADS; i++) { + assert(pthread_join(threadid[i], NULL) == 0); + } + + assert(pthread_attr_destroy(&attr) == 0); + + return 0; +} diff --git a/tests/c_tests/threads6.c b/tests/c_tests/threads6.c new file mode 100644 index 0000000..262c067 --- /dev/null +++ b/tests/c_tests/threads6.c @@ -0,0 +1,168 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define assert(b) \ + if (!(b)) \ + abort(); + +#define NR_THREADS 8L + +#define NSECS_PER_SEC 1000000000L +#define FIVE_SECONDS (5UL * NSECS_PER_SEC) + +static int ts_printf(const char* fmt, ...) 
{ + va_list ap; + struct timespec ts; + char buf[8192]; + int n; + + clock_gettime(CLOCK_REALTIME, &ts); + + va_start(ap, fmt); + n = snprintf(buf, 8192, "%lu.%06lu|", ts.tv_sec, ts.tv_nsec / 1000); + if (n < 8192) { + n += vsnprintf(buf + n, 8192 - n, fmt, ap); + } + va_end(ap); + + fputs(buf, stdout); + + return n; +} + +__attribute__((weak)) pid_t gettid(void) { + return syscall(SYS_gettid); +} + +static void thread_delay(unsigned long ns) { + struct timespec req = { + .tv_sec = ns / NSECS_PER_SEC, + .tv_nsec = ns % NSECS_PER_SEC, + }; + struct timespec rem; + int ret; + + do { + ret = nanosleep(&req, &rem); + memcpy(&req, &rem, sizeof(req)); + } while (ret != 0 && errno == EINTR); +} + +static void run_exec(long k) { + ts_printf( + "thread %lu pid %lu tid %lu ready to run exec.\n", k, getpid(), gettid()); + char* const args[] = { + (char* const)"cat", + (char* const)"/proc/self/stat", + (char* const)NULL, + }; + char* const envp[] = { + (char* const)"PATH=/bin;/usr/bin", + (char* const)"SHELL=/bin/bash", + (char* const)NULL, + }; + execvpe(args[0], args, envp); + perror("exec"); + exit(1); +} + +static void* threaded(void* param) { + long k = (long)param; + unsigned long delay = FIVE_SECONDS; + + if (k == 5) { + delay = NSECS_PER_SEC; + } + + ts_printf("thread %lu enter. pid=%u, tid=%u\n", k, getpid(), gettid()); + thread_delay(delay); + if (k == 5) { + ts_printf("thread %lu call fork.\n", k); + + pid_t pid = fork(); + + assert(pid >= 0); + + if (pid > 0) { + int status; + ts_printf( + "after fork, I'm parent pid = %u, child pid = %u, tid = %u\n", + getpid(), + pid, + gettid()); + waitpid(pid, &status, 0); + ts_printf("parent pid = %u exit\n", getpid()); + } else { + ts_printf( + "after fork, I'm child pid = %u, parent = %u, tid = %u\n", + getpid(), + getppid(), + gettid()); + thread_delay(NSECS_PER_SEC); + run_exec(k); + ts_printf("child pid = %u exit\n", getpid()); + } + } + + ts_printf("thread %lu exit. 
pid=%u, tid=%u\n", k, getpid(), gettid()); + + return 0; +} + +static void atfork_prepare(void) { + ts_printf("pthread_atfork prepare.\n"); +} + +static void atfork_parent(void) { + ts_printf( + "pthread_atfork parent pid = %u, ppid = %u, tid = %u.\n", + getpid(), + getppid(), + gettid()); +} + +static void atfork_child(void) { + ts_printf( + "pthread_atfork child pid = %u, ppid = %u, tid = %u.\n", + getpid(), + getppid(), + gettid()); +} + +int main(int argc, char* argv[]) { + pthread_attr_t attr; + pthread_t threadid[NR_THREADS]; + + assert(pthread_attr_init(&attr) == 0); + + pthread_atfork(atfork_prepare, atfork_parent, atfork_child); + + for (long i = 0; i < NR_THREADS; i++) { + assert(pthread_create(&threadid[i], &attr, threaded, (void*)i) == 0); + } + + for (long i = 0; i < NR_THREADS; i++) { + assert(pthread_join(threadid[i], NULL) == 0); + } + + assert(pthread_attr_destroy(&attr) == 0); + + return 0; +} diff --git a/tests/c_tests/threads_dual_exit.c b/tests/c_tests/threads_dual_exit.c new file mode 100644 index 0000000..09f8fe6 --- /dev/null +++ b/tests/c_tests/threads_dual_exit.c @@ -0,0 +1,53 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Two threads in the same process both call `SYS_exit` (not group exit) with +// the main thread calling it first. + +#include +// #include +#include +// #include +#include +#include +#include +// #include +#include +#include +// #include +#include + +#define NTHREADS 8 + +// do `exit` syscall directly, avoid libc doing sth smart by replacing +// `_exit` with `exit_group`. +static void sys_exit(int code) { + (void)syscall(SYS_exit, code); +} + +// do futex wait with timeout 600s. 600s is to make sure it can timeout +// on sandcastle default configuration. 
+void* thread_fn(void* _param) { + printf("Child thread, sleeping...\n"); + struct timespec tp = {0, 500000000}; + clock_nanosleep(CLOCK_MONOTONIC, 0, &tp, NULL); + printf("Child thread, exiting...\n"); + sys_exit(0); + return NULL; +} + +int main(int argc, char* argv[], char* envp[]) { + pthread_t child; + if (pthread_create(&child, NULL, thread_fn, NULL) != 0) { + fprintf(stderr, "pthread_create failed: %s\n", strerror(errno)); + abort(); + } + printf("Parent thread, exiting...\n"); + sys_exit(0); +} diff --git a/tests/c_tests/threads_exit_group.c b/tests/c_tests/threads_exit_group.c new file mode 100644 index 0000000..e2d2d97 --- /dev/null +++ b/tests/c_tests/threads_exit_group.c @@ -0,0 +1,80 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Force blocked background threads to exit via exit_group(). + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NTHREADS 8 + +struct thread_param { + int* sem; + long thread_id; +}; + +static int futex( + int* uaddr, + int futex_op, + int val, + const struct timespec* timeout, + int* uaddr2, + int val3) { + return syscall(SYS_futex, uaddr, futex_op, val, timeout, uaddr, val3); +} + +// do futex wait with timeout 600s. 600s is to make sure it can timeout +// on sandcastle default configuration. 
+void* thread_pfn(void* param) { + struct thread_param* tp = (struct thread_param*)param; + struct timespec ts = {600, 0}; + + futex(tp->sem, FUTEX_PRIVATE_FLAG | FUTEX_WAIT, 0, &ts, NULL, 0); + _exit(0); +} + +int main(int argc, char* argv[]) { + void* page = mmap( + NULL, 0x1000, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (page == (void*)-1) { + fprintf(stderr, "mmap failed: %s\n", strerror(errno)); + exit(1); + } + struct thread_param params[NTHREADS]; + pthread_t threads[NTHREADS]; + for (int i = 0; i < NTHREADS; i++) { + params[i].sem = (int*)page; + params[i].thread_id = (long)i; + + if (pthread_create(&threads[i], NULL, thread_pfn, (void*)&params[i]) != 0) { + fprintf( + stderr, + "pthread_create to create thread #%d failed: %s\n", + i, + strerror(errno)); + abort(); + } + } + + struct timespec tp = {1, 0}; + clock_nanosleep(CLOCK_MONOTONIC, 0, &tp, NULL); + + // do SYS_exit_group. All threads should be still blocked by mutex. + // SYS_exit_group should force all threads begin to exit. + syscall(SYS_exit_group, 0); +} diff --git a/tests/c_tests/threads_exit_mixed.c b/tests/c_tests/threads_exit_mixed.c new file mode 100644 index 0000000..1d59c16 --- /dev/null +++ b/tests/c_tests/threads_exit_mixed.c @@ -0,0 +1,148 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// create `NTHREADS`, half doing blocking futex, half doing `SYS_exit`, while +// the thread group leader doing `SYS_exit_group`. 
+// This is to test `SYS_exit` and `SYS_exit_group` have below behavior: +// +// - `SYS_exit` should exit the call thread *only* +// - `SYS_exit_group` should exit all the threads in the same thread group +// +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NTHREADS 8 + +struct thread_param { + int* sem; + long thread_id; +}; + +static int futex( + int* uaddr, + int futex_op, + int val, + const struct timespec* timeout, + int* uaddr2, + int val3) { + return syscall(SYS_futex, uaddr, futex_op, val, timeout, uaddr, val3); +} + +// do `exit` syscall directly, avoid libc doing sth smart by replacing +// `_exit` with `exit_group`. +static void sys_exit(int code) { + (void)syscall(SYS_exit, code); +} + +static _Atomic unsigned long counter; + +// do futex wait with timeout 600s. 600s is to make sure it can timeout +// on sandcastle default configuration. +void* thread_pfn(void* param) { + struct thread_param* tp = (struct thread_param*)param; + struct timespec ts = {600, 0}; + + atomic_fetch_add(&counter, 1); + if (tp->thread_id % 2 == 0) { + futex(tp->sem, FUTEX_PRIVATE_FLAG | FUTEX_WAIT, 0, &ts, NULL, 0); + } else { + sys_exit(0); + } + return NULL; +} + +#define SECRET_PARAM "__my_secret_param" +int main(int argc, char* argv[], char* envp[]) { + if (argc == 2 && strcmp(argv[1], SECRET_PARAM) == 0) { + // Guest mode: do the test. 
+ void* page = mmap( + NULL, + 0x1000, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + -1, + 0); + if (page == (void*)-1) { + fprintf(stderr, "mmap failed: %s\n", strerror(errno)); + exit(1); + } + struct thread_param params[NTHREADS]; + pthread_t threads[NTHREADS]; + for (int i = 0; i < NTHREADS; i++) { + params[i].sem = (int*)page; + params[i].thread_id = (long)i; + + if (pthread_create(&threads[i], NULL, thread_pfn, (void*)&params[i]) != + 0) { + fprintf( + stderr, + "pthread_create to create thread #%d failed: %s\n", + i, + strerror(errno)); + abort(); + } + } + + struct timespec tp = {1, 0}; + clock_nanosleep(CLOCK_MONOTONIC, 0, &tp, NULL); + + long nb = atomic_load(&counter); + + fprintf(stderr, "Heard from %ld threads before killing them.\n", nb); + fwrite(&nb, sizeof(nb), 1, stdout); + fflush(stdout); + // do SYS_exit_group. All threads should be still blocked by mutex. + // SYS_exit_group should force all threads begin to exit. + syscall(SYS_exit_group, 0); + } else { + // Host mode: as test runner, run guest and check output. + char command[PATH_MAX] = + { + 0, + }, + program_path[PATH_MAX] = { + 0, + }; + + char* prog = realpath(argv[0], program_path); + snprintf(command, PATH_MAX, "%s %s", prog, SECRET_PARAM); + FILE* output = popen(command, "r"); + if (!output) { + fprintf( + stderr, "failed to run `%s`, error: %s\n", command, strerror(errno)); + exit(1); + } + + long val = 0; + size_t nb = fread(&val, sizeof(val), 1, output); + if (nb != 1 || val != NTHREADS) { + fprintf( + stderr, + "expecting %s output to be value %ld, got %ld\n", + command, + (long)NTHREADS, + val); + exit(1); + } + fprintf(stderr, "Success.\n"); + pclose(output); + } + return 0; +} diff --git a/tests/c_tests/threads_group_exit_blocking.c b/tests/c_tests/threads_group_exit_blocking.c new file mode 100644 index 0000000..cd9def0 --- /dev/null +++ b/tests/c_tests/threads_group_exit_blocking.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// create `NTHREADS`, all doing (indefinite) blocking futexes, while the thread +// group leader calling `SYS_exit_group`. +// This is to test all blocking futex syscall can be interrupted, and all +// threads can exit gracefully under `SYS_exit_group`. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NTHREADS 8 + +static _Atomic unsigned long counter; +static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; + +void* thread_pfn(void* param) { + atomic_fetch_add(&counter, 1); + + // Wait for enough time such that the main thread can kill this thread via + // `exit_group`. + pthread_mutex_lock(&mutex); + + return NULL; +} + +int main(int argc, char* argv[], char* envp[]) { + // Lock, but never unlock the mutex to force all threads to wait. All threads + // will get killed while waiting for for this mutex. + pthread_mutex_lock(&mutex); + + pthread_t threads[NTHREADS]; + + for (int i = 0; i < NTHREADS; i++) { + if (pthread_create(&threads[i], NULL, thread_pfn, NULL) != 0) { + fprintf( + stderr, + "pthread_create to create thread #%d failed: %s\n", + i, + strerror(errno)); + abort(); + } + } + + // Spin while we wait for the threads to finish initializing. + while (atomic_load(&counter) != NTHREADS) { + // Yield so that other threads have a chance to run. + sched_yield(); + } + + // do SYS_exit_group. All threads should be still blocked by mutex. + // SYS_exit_group should force all threads begin to exit. 
+ syscall(SYS_exit_group, 0); + + return 0; +} diff --git a/tests/c_tests/threads_group_exit_stress.c b/tests/c_tests/threads_group_exit_stress.c new file mode 100644 index 0000000..e1b628a --- /dev/null +++ b/tests/c_tests/threads_group_exit_stress.c @@ -0,0 +1,202 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// create `NTHREADS`, all doing blocking or non blocking syscall in a tight +// loop, while the thread group leader calling `SYS_exit_group`. This is to test +// while `SYS_exit_group` is called, the remaining threads can exit gracefully. +// +// NB: When running this program under a ptracer, due to doing syscalls in a +// tight loop, the syscall (`sched_yield`) might return +// +// - interrupted, by ptrace event exit +// - interrupted, by the real exit (WEXITED) +// - unavailable, waitpid returned ECHILD (_yes_, strace has this state) +// - returns normally even `exit_group` started in another thread, but +// subsequent waitpid +// should still indicate the thread (doing sched_yield) is exiting. +// while blocking syscalls (`futex`) most likely would get interrupted (by exit +// or event exit) +// +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NTESTS 100 +#define NTHREADS 16 + +struct thread_param { + pthread_mutex_t* mutex; + _Atomic unsigned long* counter; + int thread_id; +}; + +// Call `sched_yield` repeatly, which should always return 0 on Linux. +// we use `sched_yield` to simulate various outcomes when our main thread +// calls `exit_group`. 
+static inline void forever_yield(struct thread_param* param) { + while (1) { + sched_yield(); + } +} + +// call pthread_mutex_lock(), the lock should have been held by the thread +// group leader, hence this should translate to a futex syscall which would +// never return. +static inline void forever_block(struct thread_param* param) { + pthread_mutex_lock(param->mutex); +} + +static void* thread_pfn(void* param) { + struct thread_param* p = (struct thread_param*)param; + + atomic_fetch_add(p->counter, 1); + + if (p->thread_id % 2 == 0) { + forever_yield(p); + } else { + forever_block(p); + } + + return NULL; +} + +static __attribute__((noreturn)) void test_exit_group() { + struct thread_param param; + _Atomic unsigned long counter = 0; + pthread_t threads[NTHREADS]; + pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; + + param.counter = &counter; + param.mutex = &mutex; + + for (int i = 0; i < NTHREADS; i++) { + param.thread_id = i; + if (pthread_create(&threads[i], NULL, thread_pfn, (void*)&param) != 0) { + fprintf( + stderr, + "pthread_create to create thread #%d failed: %s\n", + i, + strerror(errno)); + abort(); + } + } + + // Spin while we wait for the threads to finish initializing. + while (atomic_load(&counter) != NTHREADS) { + // Yield so that other threads have a chance to run. + sched_yield(); + } + + // do SYS_exit_group. All threads should be still blocked by mutex. + // SYS_exit_group should force all threads begin to exit. + syscall(SYS_exit_group, 0); + + // should not reach here! + abort(); +} + +static int test_exit_group_helper(void) { + pid_t pid = fork(); + + if (pid < 0) { + perror("fork"); + return -1; + } else if (pid > 0) { + int status; + pid_t child_pid; + + if ((child_pid = waitpid(-1, &status, 0)) < 0) { + perror("waitpid"); + return -1; + } else { + if (WIFEXITED(status) && WEXITSTATUS(status) == 0) { + pid_t child; + status = 0; + // The second waitpid with WNOHANG should return ECHILD only. 
meaning + // all children has exited in previous waitpid without WNOHANG. + if ((child = waitpid(-1, &status, WNOHANG)) != -1 && errno != ECHILD) { + fprintf( + stderr, + "Second waitpid should return ECHILD, but returned %d with status 0x%x, errno: %d\n", + child, + status, + errno); + return -2; + } else { + return 0; + } + } else { + fprintf(stderr, "waitpid returned unknown status: 0x%x\n", status); + return -1; + } + } + } else { + test_exit_group(); + } +} + +unsigned long time_getus(void) { + struct timespec tp = { + 0, + }; + + clock_gettime(CLOCK_MONOTONIC, &tp); + + return tp.tv_sec * 1000000 + tp.tv_nsec / 1000; +} + +int main(int argc, char* argv[], char* envp[]) { + long i, ntests = NTESTS; + unsigned long begin, elapsed; + + if (argc == 2) { + ntests = strtol(argv[1], NULL, 0); + } + + if (ntests <= 0) { + ntests = 1; + } + + long increment = (99 + ntests) / 100, curr = increment; + + begin = time_getus(); + + for (i = 0; i < ntests; i++) { + if (test_exit_group_helper() < 0) { + fprintf(stdout, "stress test failed at %ld/%ld\n", 1 + i, ntests); + exit(1); + } else { + if (i >= curr) { + fputs(".", stdout); + fflush(stdout); + curr += increment; + } + } + } + elapsed = time_getus() - begin; + + printf(" passed %ld tests\n", ntests); + printf( + "time elapsed: %.3lf secs, time/test: %ld milli secs.\n", + elapsed * 1.0 / 1000000, + elapsed / 1000 / ntests); + return 0; +} diff --git a/tests/c_tests/vforkExec.c b/tests/c_tests/vforkExec.c new file mode 100644 index 0000000..db45f52 --- /dev/null +++ b/tests/c_tests/vforkExec.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char* argv[], char* envp[]) { + if (argc == 2 && strcmp(argv[1], "child") == 0) { + printf("exec pid: %u\n", getpid()); + _exit(0); + } + + pid_t pid = vfork(); + + if (pid < 0) { + perror("vfork failed: "); + exit(1); + } else if (pid == 0) { + char* prog = argv[0]; + char* const newArgv[] = {prog, "child", NULL}; + printf("child pid: %u\n", getpid()); + execve(prog, newArgv, envp); + printf("exec failed: %s\n", strerror(errno)); + } else { + int status; + printf("parent pid: %u\n", getpid()); + waitpid(pid, &status, 0); + if (WIFSIGNALED(status)) { + printf("%u terminated by signal: %u\n", pid, WTERMSIG(status)); + } + } +} diff --git a/tests/c_tests/write-many.c b/tests/c_tests/write-many.c new file mode 100644 index 0000000..b583c21 --- /dev/null +++ b/tests/c_tests/write-many.c @@ -0,0 +1,24 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include + +int main(int argc, char* argv[]) { + write(STDOUT_FILENO, "0", 1); + write(STDOUT_FILENO, "1", 1); + write(STDOUT_FILENO, "2", 1); + write(STDOUT_FILENO, "3", 1); + write(STDOUT_FILENO, "4", 1); + write(STDOUT_FILENO, "5", 1); + write(STDOUT_FILENO, "6", 1); + write(STDOUT_FILENO, "7", 1); + write(STDOUT_FILENO, "8", 1); + write(STDOUT_FILENO, "9", 1); + write(STDOUT_FILENO, "\n", 1); +} diff --git a/tests/convert.rs b/tests/convert.rs new file mode 100644 index 0000000..0090861 --- /dev/null +++ b/tests/convert.rs @@ -0,0 +1,113 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#![feature(llvm_asm)] + +// when we convert syscall, such as open -> openat, the old syscall +// args should not be clobbered, even with the conversion. + +use reverie::{syscalls::Syscall, Error, Guest, Tool}; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct LocalStateTailInject; + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct LocalStateInject; + +#[reverie::tool] +impl Tool for LocalStateTailInject { + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + match syscall { + Syscall::Open(open_syscall) => { + guest + .tail_inject(reverie::syscalls::Openat::from(open_syscall)) + .await + } + _ => guest.tail_inject(syscall).await, + } + } +} + +#[reverie::tool] +impl Tool for LocalStateInject { + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + match syscall { + Syscall::Open(open_syscall) => Ok(guest + .inject(reverie::syscalls::Openat::from(open_syscall)) + .await?), + _ => guest.tail_inject(syscall).await, + } + } +} + +#[cfg(all(not(sanitized), test))] +mod tests { + use super::*; + use nix::unistd; + use reverie_ptrace::testing::check_fn; + + #[cfg(target_arch = "x86_64")] + #[allow(unused_mut)] + unsafe fn open_syscall_sanity_check() -> i32 { + let mut ret; + let path = b"/dev/null\0"; + llvm_asm!(r#"movq %rdi, %r8 + movq $$0x8000, %rsi # O_LARGEFILE + movq $$0x1a4, %rdx # 0644 + mov $$2, %eax + syscall + cmp $$0xfffffffffffff001,%rax + jae 1f + cmp %rdi, %r8 + jne 1f + cmp $$0x8000, %rsi + jne 1f + cmp $$0x1a4, %rdx + jne 1f + jmp 2f + 1:mov $$1, %rdi + mov $$231, %rax # call exit_group(1) + syscall + 2: + "# + :"={rax}"(ret) + :"{rdi}"(path.as_ptr() as u64) + :"rcx", "r11", "memory"); + ret + } + + #[cfg(not(target_arch = "x86_64"))] + unsafe fn open_syscall_sanity_check() -> i32 { + unimplemented!() + } + + #[test] + fn open_into_openat_tail_inject_test() { + check_fn::(move || { + let fd = 
unsafe { open_syscall_sanity_check() }; + assert!(unistd::close(fd).is_ok()); + }) + } + + #[test] + fn open_into_openat_inject_test() { + check_fn::(move || { + let fd = unsafe { open_syscall_sanity_check() }; + assert!(unistd::close(fd).is_ok()); + }) + } +} diff --git a/tests/cpuid.rs b/tests/cpuid.rs new file mode 100644 index 0000000..a9b0994 --- /dev/null +++ b/tests/cpuid.rs @@ -0,0 +1,148 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Tests cpuid interception + +use raw_cpuid::CpuIdResult; + +use serde::{Deserialize, Serialize}; + +use reverie::{Errno, GlobalTool, Guest, Pid, Subscription, Tool}; + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct GlobalState { + clock: u64, +} + +#[reverie::global_tool] +impl GlobalTool for GlobalState { + type Request = (); + type Response = u64; + + // Just get the current time. + async fn receive_rpc(&self, _from: Pid, _request: ()) -> u64 { + // This could be turned into a logical clock by incrementing this. 
+ self.clock + } +} + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct LocalState {} + +#[reverie::tool] +impl Tool for LocalState { + type GlobalState = GlobalState; + + fn subscriptions(_cfg: &()) -> Subscription { + let mut s = Subscription::none(); + s.cpuid(); + s + } + + async fn handle_cpuid_event>( + &self, + _guest: &mut T, + eax: u32, + _ecx: u32, + ) -> Result { + let intercepted = InterceptedCpuid::new(); + Ok(intercepted.cpuid(eax).unwrap()) + } +} + +trait Cpuid { + fn cpuid(&self, index: u32) -> Option; +} + +#[derive(Debug, Clone, Copy)] +struct InterceptedCpuid(); + +impl InterceptedCpuid { + pub fn new() -> Self { + InterceptedCpuid() + } +} + +impl Cpuid for InterceptedCpuid { + fn cpuid(&self, index: u32) -> Option { + let request = index as usize; + if request >= 0x80000000 && request < 0x80000000 + EXTENDED_CPUIDS.len() { + Some(EXTENDED_CPUIDS[request - 0x80000000]) + } else if request < CPUIDS.len() { + Some(CPUIDS[request]) + } else { + None + } + } +} + +const fn cpuid_result(eax: u32, ebx: u32, ecx: u32, edx: u32) -> CpuIdResult { + CpuIdResult { eax, ebx, ecx, edx } +} + +// CPUID output from older CPU (broadwell?), with some features like RDRAND +// masked off to prevent non-determinism. 
+const CPUIDS: &[CpuIdResult] = &[ + cpuid_result(0x0000000D, 0x756E6547, 0x6C65746E, 0x49656E69), + cpuid_result(0x00000663, 0x00000800, 0x80202001, 0x078BFBFD), + cpuid_result(0x00000001, 0x00000000, 0x0000004D, 0x002C307D), + cpuid_result(0x00000000, 0x00000000, 0x00000000, 0x00000000), + cpuid_result(0x00000120, 0x01C0003F, 0x0000003F, 0x00000001), + cpuid_result(0x00000000, 0x00000000, 0x00000003, 0x00000000), + cpuid_result(0x00000000, 0x00000000, 0x00000000, 0x00000000), + cpuid_result(0x00000000, 0x00000000, 0x00000000, 0x00000000), + cpuid_result(0x00000000, 0x00000000, 0x00000000, 0x00000000), + cpuid_result(0x00000000, 0x00000000, 0x00000000, 0x00000000), + cpuid_result(0x00000000, 0x00000000, 0x00000000, 0x00000000), + cpuid_result(0x00000000, 0x00000001, 0x00000100, 0x00000001), + cpuid_result(0x00000000, 0x00000000, 0x00000000, 0x00000000), + cpuid_result(0x00000000, 0x00000000, 0x00000000, 0x00000000), +]; + +const EXTENDED_CPUIDS: &[CpuIdResult] = &[ + cpuid_result(0x8000000A, 0x756E6547, 0x6C65746E, 0x49656E69), + cpuid_result(0x00000663, 0x00000000, 0x00000001, 0x20100800), + cpuid_result(0x554D4551, 0x72695620, 0x6C617574, 0x55504320), + cpuid_result(0x72657620, 0x6E6F6973, 0x352E3220, 0x0000002B), + cpuid_result(0x00000000, 0x00000000, 0x00000000, 0x00000000), + cpuid_result(0x01FF01FF, 0x01FF01FF, 0x40020140, 0x40020140), + cpuid_result(0x00000000, 0x42004200, 0x02008140, 0x00808140), + cpuid_result(0x00000000, 0x00000000, 0x00000000, 0x00000000), + cpuid_result(0x00003028, 0x00000000, 0x00000000, 0x00000000), + cpuid_result(0x00000000, 0x00000000, 0x00000000, 0x00000000), + cpuid_result(0x00000000, 0x00000000, 0x00000000, 0x00000000), +]; + +#[test] +fn cpuid_leaf_count() { + assert_eq!(1 + CPUIDS[0].eax as usize, CPUIDS.len()); + assert_eq!( + 1 + (EXTENDED_CPUIDS[0].eax as usize & !0x80000000usize), + EXTENDED_CPUIDS.len() + ); +} + +#[cfg(not(sanitized))] +#[cfg(test)] +mod tests { + use super::*; + + use reverie_ptrace::testing::check_fn; 
+ + #[test] + fn run_guest_func_cpuid_intercepted_test() { + check_fn::(|| { + let cpuid = raw_cpuid::CpuId::new(); + let feature = cpuid.get_feature_info(); + assert!(feature.is_some()); + let feature = feature.unwrap(); + assert!(!feature.has_rdrand()); + }); + } +} diff --git a/tests/delay_signal.rs b/tests/delay_signal.rs new file mode 100644 index 0000000..e45c644 --- /dev/null +++ b/tests/delay_signal.rs @@ -0,0 +1,551 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! tests for delaying signal delivery +//! SIGALRM: suppressed +//! SIGVTALRM: delayed about 500ms, then delivered +//! SIGSYS: delayed till next syscall. +//! +//! NB: restarted syscalls should not count, as syscall +//! returning ERESTARTSYS could be automatically restarted + +use nix::sys::signal::{self, Signal}; +use reverie::{ + syscalls::{Errno, Syscall, SyscallInfo, Sysno, Tgkill}, + Error, Guest, Tool, +}; +use serde::{Deserialize, Serialize}; +use tokio::time::{sleep, Duration}; + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct LocalState; + +const GAP_MS: u64 = 500; + +#[derive(Debug, Serialize, Deserialize, Default)] +struct ThreadState { + sigpending: Option, + injected_signal: Option, +} + +// syscall is interrupted and may restart +fn is_syscall_restarted(errno: Errno) -> bool { + [ + Errno::ERESTARTSYS, + Errno::ERESTARTNOINTR, + Errno::ERESTARTNOHAND, + Errno::ERESTART_RESTARTBLOCK, + ] + .contains(&errno) +} + +#[reverie::tool] +impl Tool for LocalState { + type ThreadState = ThreadState; + async fn handle_signal_event>( + &self, + guest: &mut T, + signal: signal::Signal, + ) -> Result, Errno> { + Ok(if signal == Signal::SIGVTALRM { + sleep(Duration::from_millis(GAP_MS)).await; + Some(signal) + } else if signal == Signal::SIGALRM { + None // Suppress the signal. 
+ } else if signal == Signal::SIGSYS { + eprintln!( + "[pid = {}] delay delivery of signal {:?}, thread_state {:?}", + guest.tid(), + signal, + guest.thread_state(), + ); + match guest.thread_state_mut().injected_signal.take() { + None => { + guest.thread_state_mut().sigpending = Some(signal as i32); + None + } + Some(sig) => { + guest.thread_state_mut().sigpending = None; + Some(Signal::try_from(sig).unwrap()) + } + } + } else { + println!("[pid = {}] deliverying signal {:?}", guest.tid(), signal); + Some(signal) + }) + } + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + let pending = guest.thread_state().sigpending; + if pending.is_some() { + eprintln!( + "[pid = {}] syscall {:?} pending signal {:?}", + guest.tid(), + syscall, + pending, + ); + } + + if [ + Sysno::exit_group, + Sysno::exit, + Sysno::execve, + Sysno::execveat, + ] + .contains(&syscall.number()) + { + eprintln!("[pid = {}] tail injecting {:?}", guest.tid(), syscall); + guest.tail_inject(syscall).await + } else { + eprintln!("[pid = {}] injecting {:?}", guest.tid(), syscall); + let res = guest.inject(syscall).await; + if let Some(sig) = pending { + // NB: don't do signal delivery if syscall is interrupted + // and restarted. + if res.is_ok() || res.is_err() && is_syscall_restarted(res.unwrap_err()) { + eprintln!( + "[pid = {}] injecting tgkill to deliver signal {:?}", + guest.tid(), + sig + ); + let send_signal = Tgkill::new() + .with_tgid(guest.pid().as_raw()) + .with_tid(guest.tid().as_raw()) + .with_sig(sig); + guest.thread_state_mut().injected_signal = Some(sig); + let _ = guest.inject(send_signal).await; + } + } + Ok(res?) 
+ } + } +} + +#[cfg(all(not(sanitized), test))] +mod tests { + use super::*; + use reverie::ExitStatus; + use reverie_ptrace::testing::{check_fn, test_fn}; + use std::{io, mem::MaybeUninit, sync::mpsc, thread, time}; + + // kernel_sigset_t used by naked syscall + #[derive(Clone, Copy, PartialEq, Eq, Debug)] + struct KernelSigset(u64); + + impl From<&[Signal]> for KernelSigset { + fn from(signals: &[Signal]) -> Self { + let mut set: u64 = 0; + for &sig in signals { + set |= 1u64 << (sig as usize - 1); + } + KernelSigset(set) + } + } + + #[allow(dead_code)] + unsafe fn block_signals(signals: &[Signal]) -> io::Result { + let set = KernelSigset::from(signals); + let mut oldset: MaybeUninit = MaybeUninit::uninit(); + + if libc::syscall( + libc::SYS_rt_sigprocmask, + libc::SIG_BLOCK, + &set as *const _, + oldset.as_mut_ptr(), + 8, + ) != 0 + { + Err(io::Error::last_os_error()) + } else { + Ok(KernelSigset(oldset.assume_init())) + } + } + + // unblock signal(s) and set its handler to SIG_DFL + unsafe fn unblock_signals(signals: &[Signal]) -> io::Result { + let set = KernelSigset::from(signals); + let mut oldset: MaybeUninit = MaybeUninit::uninit(); + + if libc::syscall( + libc::SYS_rt_sigprocmask, + libc::SIG_UNBLOCK, + &set as *const _, + oldset.as_mut_ptr(), + 8, + ) != 0 + { + Err(io::Error::last_os_error()) + } else { + Ok(KernelSigset(oldset.assume_init())) + } + } + + unsafe fn restore_sig_handlers(signals: &[Signal]) -> io::Result<()> { + for &sig in signals { + libc::signal(sig as i32, libc::SIG_DFL); + } + Ok(()) + } + + #[no_mangle] + extern "C" fn sigprof_handler( + _sig: i32, + _siginfo: *mut libc::siginfo_t, + _ucontext: *const libc::c_void, + ) { + nix::unistd::write(2, b"caught SIGPROF!").unwrap(); + unsafe { + libc::syscall(libc::SYS_exit_group, 0); + } + } + + unsafe fn install_sigprof_handler() -> i32 { + let mut sa: libc::sigaction = MaybeUninit::zeroed().assume_init(); + sa.sa_flags = libc::SA_RESTART | libc::SA_SIGINFO | libc::SA_NODEFER; + 
sa.sa_sigaction = sigprof_handler as _; + + libc::sigaction(libc::SIGPROF, &sa as *const _, std::ptr::null_mut()) + } + + unsafe fn sigtimedwait(signals: &[Signal], timeout_ns: u64) -> io::Result { + let mut siginfo: MaybeUninit = MaybeUninit::zeroed(); + let sigset = KernelSigset::from(signals); + let timeout = libc::timespec { + tv_sec: timeout_ns as i64 / 1000000000, + tv_nsec: (timeout_ns % 1000000000) as i64, + }; + + match Signal::try_from(libc::syscall( + libc::SYS_rt_sigtimedwait, + &sigset as *const _, + siginfo.as_mut_ptr(), + &timeout as *const _, + 8, + ) as i32) + { + Ok(sig) => { + let siginfo = siginfo.assume_init(); + assert_eq!(siginfo.si_signo, sig as i32); + Ok(sig) + } + Err(_) => Err(io::Error::last_os_error()), + } + } + + unsafe fn sigsuspend(signals: &[Signal]) -> io::Result<()> { + let mut set: u64 = 0; + for &sig in signals { + set |= 1u64 << (sig as usize - 1); + } + + libc::syscall(libc::SYS_rt_sigsuspend, &set as *const _, 8); + // always return Err. + Err(io::Error::last_os_error()) + } + + // set timer with SIGPROF as signal + unsafe fn settimer(time_us: u64) -> io::Result<()> { + let zero = libc::timeval { + tv_sec: 0, + tv_usec: 0, + }; + + let mut next = libc::timeval { + tv_sec: time_us as i64 / 1000000, + tv_usec: time_us as i64 % 1000000, + }; + + if next.tv_usec > 1000000 { + next.tv_sec += 1; + next.tv_usec -= 1000000; + } + + let timer_val = libc::itimerval { + it_interval: zero, + it_value: next, + }; + + if libc::syscall( + libc::SYS_setitimer, + libc::ITIMER_PROF, + &timer_val as *const _, + 0, + ) != 0 + { + eprintln!("setitimer returned error: {:?}", io::Error::last_os_error()); + Err(io::Error::last_os_error()) + } else { + Ok(()) + } + } + + #[test] + fn signal_delay_500ms() { + check_fn::(|| { + assert!(unsafe { restore_sig_handlers(&[Signal::SIGVTALRM]) }.is_ok()); + assert!(unsafe { unblock_signals(&[Signal::SIGVTALRM]) }.is_ok()); + unsafe { + libc::signal(libc::SIGVTALRM, libc::SIG_IGN) + }; + let now = 
time::Instant::now(); + thread::sleep(time::Duration::from_millis(10)); + assert!(signal::raise(Signal::SIGVTALRM).is_ok()); + assert!(now.elapsed().as_millis() >= GAP_MS as u128 + 10); + }); + } + + #[test] + // signal is suppressed by handle_signal_event + fn signal_suppress() { + check_fn::(|| { + assert!(unsafe { restore_sig_handlers(&[Signal::SIGALRM]) }.is_ok()); + assert!(unsafe { unblock_signals(&[Signal::SIGALRM]) }.is_ok()); + let now = time::Instant::now(); + thread::sleep(time::Duration::from_millis(10)); + assert!(signal::raise(Signal::SIGALRM).is_ok()); + assert!(now.elapsed().as_millis() < GAP_MS as u128); + }); + } + + #[test] + fn sigtimedwait_sanity() { + check_fn::(|| { + let (sender, receiver) = mpsc::channel(); + let handle = thread::spawn(move || { + assert!(sender.send(nix::unistd::gettid()).is_ok()); + unsafe { + libc::signal(libc::SIGBUS, libc::SIG_DFL) + }; + assert_eq!( + unsafe { sigtimedwait(&[Signal::SIGBUS], 1000000000000u64) }.unwrap(), + Signal::SIGBUS, + ); + eprintln!("[thread] sigtimedwait returned SIGBUS"); + }); + + let thread_id = receiver.recv().unwrap(); + // wait until thread is blocked by rt_sigtimedwait.. + thread::sleep(Duration::from_millis(500)); + let signal_sent = unsafe { + libc::syscall(libc::SYS_tkill, thread_id.as_raw(), Signal::SIGBUS as i32) + }; + assert_eq!(signal_sent, 0); + assert!(handle.join().is_ok()); + }); + } + + #[test] + fn sigsuspend_sanity() { + let (output, _) = test_fn::(|| { + let (sender, receiver) = mpsc::channel(); + let handle = thread::spawn(move || { + assert!(sender.send(nix::unistd::gettid()).is_ok()); + unsafe { + libc::signal(libc::SIGBUS, libc::SIG_DFL) + }; + assert_eq!( + unsafe { sigsuspend(&[]) } + .err() + .and_then(|e| e.raw_os_error()), + Some(libc::EINTR) + ); + }); + + let thread_id = receiver.recv().unwrap(); + // wait until thread is blocked by rt_sigtimedwait.. 
+ thread::sleep(Duration::from_millis(500)); + let signal_sent = unsafe { + libc::syscall(libc::SYS_tkill, thread_id.as_raw(), Signal::SIGBUS as i32) + }; + assert_eq!(signal_sent, 0); + assert!(handle.join().is_ok()); + }) + .unwrap(); + assert_eq!(output.status, ExitStatus::Signaled(Signal::SIGBUS, true)); + } + + #[test] + // A sanity check ITIMER_PROF can indeed cause program to exit with SIGPROF + // NB: rust runtime masks most signals, hence SIGPROF has to be explicitly + // unmasked. + fn sigprof_sanity() { + check_fn::(|| { + assert_eq!(unsafe { install_sigprof_handler() }, 0); + // timer should expire + assert!(unsafe { unblock_signals(&[Signal::SIGPROF]) }.is_ok()); + assert!(unsafe { settimer(100000) }.is_ok()); + loop {} + }); + } + + #[test] + // SIGSYS is delayed till next syscall is trapped. However, since we send + // SIGSYS when rt_sigsuspend is called, rt_sigsuspend won't return because + // the signal is delayed till next syscall, which causes this test to time out. + // Please note this is expected behavior: it shows that we cannot assume signal + // delivery can always be delayed. + fn sigsuspend_delay_till_next_syscall_should_timeout_1() { + check_fn::(|| { + let (sender, receiver) = mpsc::channel(); + let _handle = thread::spawn(move || { + assert!(sender.send(nix::unistd::gettid()).is_ok()); + unsafe { + libc::signal(libc::SIGSYS, libc::SIG_DFL) + }; + + assert_eq!( + unsafe { sigsuspend(&[Signal::SIGPROF, Signal::SIGVTALRM]) } + .err() + .and_then(|e| e.raw_os_error()), + Some(libc::EINTR) + ); + }); + + let thread_id = receiver.recv().unwrap(); + // wait until thread is blocked by rt_sigsuspend.. 
+ thread::sleep(Duration::from_millis(500)); + let signal_sent = unsafe { + libc::syscall(libc::SYS_tkill, thread_id.as_raw(), Signal::SIGSYS as i32) + }; + assert_eq!(signal_sent, 0); + + assert!(unsafe { restore_sig_handlers(&[Signal::SIGPROF, Signal::SIGVTALRM]) }.is_ok()); + + assert_eq!(unsafe { install_sigprof_handler() }, 0); + + // timer should expire + assert!(unsafe { unblock_signals(&[Signal::SIGPROF, Signal::SIGVTALRM]) }.is_ok()); + assert!(unsafe { settimer(500000) }.is_ok()); + loop { /* sigprof handler calls exit_group */ } + }); + } + + #[test] + // Similar to sigsuspend_delay_till_next_syscall_should_timeout_1, but this test + // differs by only one line compared to sigsuspend_delay_till_next_syscall_should_pass + // to emphasize that SIGSYS is indeed not delivered without the extra syscall after tgkill, + // because we delay SIGSYS delivery to next syscall. + fn sigsuspend_delay_till_next_syscall_should_timeout_2() { + check_fn::(|| { + let (sender, receiver) = mpsc::channel(); + let _handle = thread::spawn(move || { + let tid = nix::unistd::gettid(); + let pid = nix::unistd::getpid(); + assert!(sender.send(tid).is_ok()); + unsafe { + libc::signal(libc::SIGSYS, libc::SIG_DFL); + }; + + // block SIGPROF as the parent task is setting up a timer + // with ITIMER_PROF. Linux does not guarantee which thread + // receives the signal. As a result, we simply mask SIGPROF + // in this thread, so that only parent task can receive it. 
+ assert!(unsafe { block_signals(&[Signal::SIGPROF]) }.is_ok()); + assert!(unsafe { unblock_signals(&[Signal::SIGSYS]) }.is_ok()); + assert_eq!( + unsafe { sigtimedwait(&[Signal::SIGSYS], 1000_000_000) }.ok(), + Some(Signal::SIGSYS) + ); + + unsafe { + libc::syscall( + libc::SYS_tgkill, + pid.as_raw(), + tid.as_raw(), + Signal::SIGSYS as i32, + ); + // expected to timeout, because we delay signal delivery to next + // syscall which returns success + loop {} + } + }); + + let thread_id = receiver.recv().unwrap(); + // wait until thread is blocked by rt_sigtimedwait.. + thread::sleep(Duration::from_millis(100)); + let signal_sent = unsafe { + libc::syscall(libc::SYS_tkill, thread_id.as_raw(), Signal::SIGSYS as i32) + }; + assert_eq!(signal_sent, 0); + + assert!(unsafe { restore_sig_handlers(&[Signal::SIGPROF, Signal::SIGVTALRM]) }.is_ok()); + + assert_eq!(unsafe { install_sigprof_handler() }, 0); + + // timer should expire + assert!(unsafe { unblock_signals(&[Signal::SIGPROF, Signal::SIGVTALRM]) }.is_ok()); + assert!(unsafe { settimer(500000) }.is_ok()); + + loop {} + }); + } + + #[test] + // since we delay SIGSYS till next syscall, adding a syscall like getsid should + // cause SIGSYS to be delivered, hence the program should be killed by SIGSYS. 
+ fn sigsuspend_delay_till_next_syscall_should_pass() { + let (output, _) = test_fn::(|| { + let (sender, receiver) = mpsc::channel(); + let _handle = thread::spawn(move || { + let tid = nix::unistd::gettid(); + let pid = nix::unistd::getpid(); + assert!(sender.send(tid).is_ok()); + unsafe { + libc::signal(libc::SIGSYS, libc::SIG_DFL); + }; + + assert!(unsafe { unblock_signals(&[Signal::SIGSYS]) }.is_ok()); + assert_eq!( + unsafe { sigtimedwait(&[Signal::SIGSYS], 1000_000_000) }.ok(), + Some(Signal::SIGSYS) + ); + + unsafe { + libc::syscall( + libc::SYS_tgkill, + pid.as_raw(), + tid.as_raw(), + Signal::SIGSYS as i32, + ); + // signal should delivered after SYS_getsid returned + // hence the program should be killed by SIGSYS + libc::syscall(libc::SYS_getsid); + + // will run into SIGSYS handler (SIG_DFL) hence below + // statement is not reachable. + unreachable!() + } + }); + + let thread_id = receiver.recv().unwrap(); + // wait until thread is blocked by rt_sigtimedwait.. + thread::sleep(Duration::from_millis(100)); + let signal_sent = unsafe { + libc::syscall(libc::SYS_tkill, thread_id.as_raw(), Signal::SIGSYS as i32) + }; + assert_eq!(signal_sent, 0); + + assert!(unsafe { restore_sig_handlers(&[Signal::SIGPROF, Signal::SIGVTALRM]) }.is_ok()); + + assert_eq!(unsafe { install_sigprof_handler() }, 0); + + + assert!(unsafe { unblock_signals(&[Signal::SIGPROF, Signal::SIGVTALRM]) }.is_ok()); + assert!(unsafe { settimer(500000) }.is_ok()); + + // SIGSYS handler (SIG_DFL) should be called before timer expire + unreachable!() + }) + .unwrap(); + assert_eq!(output.status, ExitStatus::Signaled(Signal::SIGSYS, true)); + } +} diff --git a/tests/disabled/clobbered.S b/tests/disabled/clobbered.S new file mode 100644 index 0000000..b55b259 --- /dev/null +++ b/tests/disabled/clobbered.S @@ -0,0 +1,108 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/** + * clobbered.S: used by reverie *only* + * to test inject/tail_inject wouldn't clobber any + * syscall registers. + * NB: %rcx is clobbered by `syscall`. + */ + .text + .global _start + .type _start, @function +_start: + sub $0x18, %rsp + + mov $9, %eax // mmap + movq $0, %rdi + movq $4096, %rsi + movq $1, %rdx + movq $0x22, %r10 + movq $-1, %r8 + movq $0, %r9 + syscall + cmp $0xfffffffffffff000,%rax + ja panic + + movq %rax, 0x8(%rsp) + + cmpq $0, %rdi + jne panic + + cmpq $4096, %rsi + jne panic + + cmpq $1, %rdx + jne panic + + cmpq $0x22, %r10 + jne panic + + cmpq $-1, %r8 + jne panic + + cmpq $0, %r9 + jne panic + + movq $0x12345678, %rdi + movq $0x17654321, %rsi + movq $0x42421234, %rdx + movq $0x1234abab, %r10 + movq $0x12123434, %r8 + movq $0x78781212, %r9 + + mov $39, %eax // getpid + syscall + + cmpq $0x12345678, %rdi + jne panic + + cmpq $0x17654321, %rsi + jne panic + + cmpq $0x42421234, %rdx + jne panic + + cmpq $0x1234abab, %r10 + jne panic + + cmpq $0x12123434, %r8 + jne panic + + cmpq $0x78781212, %r9 + jne panic + + mov $11, %eax // munmap + movq 8(%rsp), %rdi + movq $4096, %rsi + syscall + cmp $0xfffffffffffff000,%rax + ja panic + + cmpq $0x42421234, %rdx + jne panic + + cmpq $0x1234abab, %r10 + jne panic + + cmpq $0x12123434, %r8 + jne panic + + cmpq $0x78781212, %r9 + jne panic + + add $0x18, %rsp /* release the 0x18 bytes reserved at _start */ + mov $0xe7, %eax // exit_group + mov $0, %rdi + syscall + +panic: add $0x18, %rsp /* release the 0x18 bytes reserved at _start */ + mov $1, %rdi + mov $0xe7, %eax + syscall diff --git a/tests/disabled/openat2.S b/tests/disabled/openat2.S new file mode 100644 index 0000000..2fad9d3 --- /dev/null +++ b/tests/disabled/openat2.S @@ -0,0 +1,29 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/** + * openat2.S: create syscalls sequence that cannot be patchable + */ + .text + .global _start + .type _start, @function +_start: + sub $0x8, %rsp + movq $0x6d6f646e, %rax + push %rax + movabs $0x6172752f7665642f, %rax + push %rax + mov $0x101, %eax + mov $0xffffff9c, %rdi + mov %rsp, %rsi + mov $0x0, %rdx + syscall + mov $0xe7, %eax + mov $0, %rdi + syscall diff --git a/tests/disabled/segfault.c b/tests/disabled/segfault.c new file mode 100644 index 0000000..4d06b21 --- /dev/null +++ b/tests/disabled/segfault.c @@ -0,0 +1,17 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include + +int main(int argc, char* argv[]) { + long* invalid_ptr = (long*)0x123; + + *invalid_ptr = 0x12345678l; + + return 0; +} diff --git a/tests/disabled/signal4.c b/tests/disabled/signal4.c new file mode 100644 index 0000000..e47ccbb --- /dev/null +++ b/tests/disabled/signal4.c @@ -0,0 +1,68 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static volatile int quit = 0; + +static void handler(int sig, siginfo_t* info, void* ucontext) { + quit = 1; +} + +void* thread_main(void* param) { + struct timespec ts = {0, 100000000}; + nanosleep(&ts, NULL); + _exit(1); +} + +int main(int argc, char* argv[]) { + int ret; + sigset_t sigset; + struct sigaction old, new; + + pthread_t tid; + + pthread_create(&tid, NULL, thread_main, (void*)1UL); + + memset(&old, 0, sizeof(old)); + memset(&new, 0, sizeof(new)); + + sigemptyset(&sigset); + new.sa_sigaction = handler; + new.sa_mask = sigset; + new.sa_flags = SA_RESTART | SA_SIGINFO; + + ret = sigaction(SIGALRM, &new, &old); + if (ret < 0) { + perror("rt_sigaction"); + exit(1); + } + + ret = sigaction(SIGALRM, NULL, &old); + if (ret < 0) { + perror("rt_sigaction"); + exit(1); + } + + assert((unsigned long)old.sa_sigaction == (unsigned long)handler); + + alarm(1); + + while (!quit) + ; + + return 0; +} diff --git a/tests/disabled/threads7.c b/tests/disabled/threads7.c new file mode 100644 index 0000000..0cb2f1d --- /dev/null +++ b/tests/disabled/threads7.c @@ -0,0 +1,168 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define assert(b) \ + if (!(b)) \ + abort(); + +#define NR_THREADS 8L + +#define NSECS_PER_SEC 1000000000L +#define FIVE_SECONDS (5UL * NSECS_PER_SEC) + +static int ts_printf(const char* fmt, ...) 
{ + va_list ap; + struct timespec ts; + char buf[8192]; + int n; + + clock_gettime(CLOCK_REALTIME, &ts); + + va_start(ap, fmt); + n = snprintf(buf, 8192, "%lu.%06lu|", ts.tv_sec, ts.tv_nsec / 1000); + if (n < 8192) { + n += vsnprintf(buf + n, 8192 - n, fmt, ap); + } + va_end(ap); + + fputs(buf, stdout); + + return n; +} + +static pid_t gettid(void) { + return syscall(SYS_gettid, 0, 0, 0, 0, 0, 0); +} + +static void thread_delay(unsigned long ns) { + struct timespec req = { + .tv_sec = ns / NSECS_PER_SEC, + .tv_nsec = ns % NSECS_PER_SEC, + }; + struct timespec rem; + int ret; + + do { + ret = nanosleep(&req, &rem); + memcpy(&req, &rem, sizeof(req)); + } while (ret != 0 && errno == EINTR); +} + +static void run_exec(long k) { /* replaces the calling process with `cat /proc/self/stat`; exits with status 1 if exec fails */ + ts_printf( + "thread %lu pid %u tid %u ready to run exec.\n", k, getpid(), gettid()); + char* const args[] = { + (char* const)"cat", + (char* const)"/proc/self/stat", + (char* const)NULL, + }; + char* const envp[] = { + (char* const)"PATH=/bin:/usr/bin", /* PATH entries are colon-separated */ + (char* const)"SHELL=/bin/bash", + (char* const)NULL, + }; + execvpe(args[0], args, envp); + perror("exec"); + exit(1); +} + +static void* threaded(void* param) { + long k = (long)param; + unsigned long delay = FIVE_SECONDS; + + if (k == 5) { + delay = NSECS_PER_SEC; + } + + ts_printf("thread %lu enter. pid=%u, tid=%u\n", k, getpid(), gettid()); + thread_delay(delay); + if (k == 5) { + ts_printf("thread %lu call fork.\n", k); + + pid_t pid = fork(); + + assert(pid >= 0); + + if (pid > 0) { + int status; + ts_printf( + "after fork, I'm parent pid = %u, child pid = %u, tid = %u\n", + getpid(), + pid, + gettid()); + run_exec(k); + waitpid(pid, &status, 0); + ts_printf("parent pid = %u exit\n", getpid()); + } else { + ts_printf( + "after fork, I'm child pid = %u, parent = %u, tid = %u\n", + getpid(), + getppid(), + gettid()); + thread_delay(NSECS_PER_SEC); + ts_printf("child pid = %u exit\n", getpid()); + } + } + + ts_printf("thread %lu exit. 
pid=%u, tid=%u\n", k, getpid(), gettid()); + + return 0; +} + +static void atfork_prepare(void) { + ts_printf("pthread_atfork prepare.\n"); +} + +static void atfork_parent(void) { + ts_printf( + "pthread_atfork parent pid = %u, ppid = %u, tid = %u.\n", + getpid(), + getppid(), + gettid()); +} + +static void atfork_child(void) { + ts_printf( + "pthread_atfork child pid = %u, ppid = %u, tid = %u.\n", + getpid(), + getppid(), + gettid()); +} + +int main(int argc, char* argv[]) { + pthread_attr_t attr; + pthread_t threadid[NR_THREADS]; + + assert(pthread_attr_init(&attr) == 0); + + pthread_atfork(atfork_prepare, atfork_parent, atfork_child); + + for (long i = 0; i < NR_THREADS; i++) { + assert(pthread_create(&threadid[i], &attr, threaded, (void*)i) == 0); + } + + for (long i = 0; i < NR_THREADS; i++) { + assert(pthread_join(threadid[i], NULL) == 0); + } + + assert(pthread_attr_destroy(&attr) == 0); + + return 0; +} diff --git a/tests/disabled/x64-save-return-address.c b/tests/disabled/x64-save-return-address.c new file mode 100644 index 0000000..581405f --- /dev/null +++ b/tests/disabled/x64-save-return-address.c @@ -0,0 +1,58 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +/* must be compiled with -O */ +/* demo how we can save return address from previous `callq xxx` + * the byte code could be useful for us to generate temp trampoline + */ +#include +#include +#include +#include + +__attribute__((noinline)) static void test1(void) { + __asm__( + "callq test2\n\t" + "nop\n\t"); +} + +__attribute__((noinline, used)) static void test2(void) { + __asm__( + "push %rax\n\t" + "movq 0x8(%rsp), %rax\n\t" + "movq %rax, 0x65001010\n\t" + "addq $0x8, %rsp\n\t" + "nop"); +} + +__attribute__((noinline)) static void test3(void) { + test1(); +} + +static void prepare_mmap(void) { + void* addr = mmap( + (void*)0x65000000UL, + 0x2000, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS, + -1, + 0); + if (addr != (void*)0x65000000UL) + abort(); +} + +int main(int argc, char* argv[]) { + unsigned long* ret = (unsigned long*)0x65001010UL; + prepare_mmap(); + test3(); + printf("*ret = %lx, expected = %lx\n", *ret, (unsigned long)test1 + 5); + if (*ret != (unsigned long)test1 + 5) + abort(); + return 0; +} diff --git a/tests/exit.rs b/tests/exit.rs new file mode 100644 index 0000000..79c7c52 --- /dev/null +++ b/tests/exit.rs @@ -0,0 +1,85 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Tests surrounding exit logic. + +use reverie::{ + syscalls::{self, Syscall, SyscallInfo, Sysno}, + Error, ExitStatus, GlobalRPC, GlobalTool, Guest, Pid, Tool, +}; +use serde::{Deserialize, Serialize}; +use std::sync::Mutex; + +#[derive(Debug, Serialize, Deserialize, Default)] +struct GlobalState { + // FIXME: Can't use (Pid, ExitStatus) types here since they don't implement + // Serialize/Deserialize. 
+ exited: Mutex>, +} + +#[reverie::global_tool] +impl GlobalTool for GlobalState { + type Request = ExitStatus; + type Response = (); + + async fn receive_rpc(&self, from: Pid, exit_status: ExitStatus) -> Self::Response { + self.exited + .lock() + .unwrap() + .push((from.as_raw(), exit_status)); + } +} + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct InjectExitTool {} + +#[reverie::tool] +impl Tool for InjectExitTool { + type GlobalState = GlobalState; + + async fn on_exit_process>( + self, + _pid: Pid, + global_state: &G, + exit_status: ExitStatus, + ) -> Result<(), Error> { + global_state.send_rpc(exit_status).await?; + Ok(()) + } + + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + if syscall.number() == Sysno::getpid { + guest + .tail_inject(syscalls::ExitGroup::new().with_status(42)) + .await + } else { + guest.tail_inject(syscall).await + } + } +} + +#[cfg(not(sanitized))] +#[test] +fn smoke() { + use reverie_ptrace::testing::test_fn; + + let (output, state) = test_fn::(|| unsafe { + let _ = libc::getpid(); + libc::syscall(libc::SYS_exit_group, 0); + }) + .unwrap(); + assert_eq!(output.status, ExitStatus::Exited(42)); + let mut mg = state.exited.lock().unwrap(); + assert_eq!(mg.pop().map(|x| x.1), Some(ExitStatus::Exited(42))); + assert!(mg.is_empty()); +} diff --git a/tests/gdbserver-integration/gdbserver-helper/src/client.rs b/tests/gdbserver-integration/gdbserver-helper/src/client.rs new file mode 100644 index 0000000..d20b2e3 --- /dev/null +++ b/tests/gdbserver-integration/gdbserver-helper/src/client.rs @@ -0,0 +1,84 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +use std::io; +use std::path::PathBuf; +use std::process::Command; +use std::process::ExitStatus; + +pub struct GdbClientCommand { + gdb: PathBuf, + program_to_run: PathBuf, + init_command_to_queue: Vec, + command_to_queue: Vec, +} + +impl GdbClientCommand { + pub fn new>(gdb_client: P, program_to_run: P) -> Self { + GdbClientCommand { + gdb: gdb_client.into(), + program_to_run: program_to_run.into(), + command_to_queue: Vec::new(), + init_command_to_queue: Vec::new(), + } + } + pub fn init_command>(&mut self, command: P) -> &mut Self { + self.init_command_to_queue.push(command.into()); + self + } + pub fn init_commands(&mut self, commands: P) -> &mut Self + where + P: IntoIterator, + S: Into, + { + commands.into_iter().for_each(|ex| { + self.init_command_to_queue.push(ex.into()); + }); + self + } + pub fn command>(&mut self, command: P) -> &mut Self { + self.command_to_queue.push(command.into()); + self + } + pub fn commands(&mut self, commands: P) -> &mut Self + where + P: IntoIterator, + S: Into, + { + commands.into_iter().for_each(|ex| { + self.command_to_queue.push(ex.into()); + }); + self + } + + pub fn status(&mut self) -> io::Result { + let mut command = Command::new(&self.gdb); + command.arg(&self.program_to_run); + command.arg("-nh"); + command.arg("--batch"); + command.arg("-q"); + command.arg("-l"); + command.arg("2"); + command.arg("-iex"); + command.arg("set debug remote 1"); + command.arg("-iex"); + // NB: host io generates tons of packets which are not interesting, + // try not to get our remote (debug) packets too cluttered. 
+ command.arg("set remote hostio-open-packet 0"); + self.init_command_to_queue.iter().for_each(|iex| { + command.arg("-iex"); + command.arg(format!("{}", iex)); + }); + self.command_to_queue.iter().for_each(|ex| { + command.arg("-ex"); + command.arg(format!("{}", ex)); + }); + command.status() + } +} diff --git a/tests/gdbserver-integration/gdbserver-helper/src/main.rs b/tests/gdbserver-integration/gdbserver-helper/src/main.rs new file mode 100644 index 0000000..cb24124 --- /dev/null +++ b/tests/gdbserver-integration/gdbserver-helper/src/main.rs @@ -0,0 +1,272 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use anyhow::{bail, Error}; +use futures::future; +use structopt::StructOpt; +use tempfile::TempDir; + +use reverie::process::Output; + +use std::fs; +use std::io; +use std::io::BufRead; +use std::os::unix::process::ExitStatusExt; +use std::path::Path; +use std::path::PathBuf; +use std::process::ExitStatus; + +mod client; +mod server; + +pub use client::*; +pub use server::*; + +/// A remote gdb session with a hermit gdbserver and gdb. +struct RemoteGdbSession { + program_to_debug: PathBuf, + program_args: Vec, + gdb_client: PathBuf, + + // Temporary directory where the socket file lives. + tempdir: TempDir, +} + +impl RemoteGdbSession { + pub fn new(path_to_gdb: P1, program_to_debug: P2, program_args: A) -> Self + where + P1: AsRef, + P2: AsRef, + A: IntoIterator + Send, + S: AsRef, + { + // Need a tempdir for the socket file to get created in. We can't create + // a tempfile and use that as a socket since it cannot exist when we + // bind to it. 
+ let tempdir = tempfile::Builder::new() + .prefix("reverie-gdb-") + .tempdir() + .unwrap(); + + RemoteGdbSession { + program_to_debug: PathBuf::from(program_to_debug.as_ref()), + program_args: program_args + .into_iter() + .map(|s| String::from(s.as_ref())) + .collect(), + gdb_client: path_to_gdb.as_ref().into(), + tempdir, + } + } + + pub async fn run_server(&self) -> Result { + let path = self.tempdir.path().join("sock"); + + let server = GdbServerCommand::new( + &self.program_to_debug, + self.program_args.clone(), + path.into(), + ); + let output = + tokio::time::timeout(tokio::time::Duration::from_secs(60), server.output()).await??; + Ok(output) + } + + pub async fn run_client(&self, iex: P, ex: P) -> Result + where + P: IntoIterator, + S: Into, + { + let mut client = GdbClientCommand::new(&self.gdb_client, &self.program_to_debug); + + // Connect to remote gdbserver via a Unix domain socket. + client.command(format!( + "target remote {}", + self.tempdir.path().join("sock").display() + )); + client.commands(iex); + client.commands(ex); + // Final disconnect. this is not really necessary with `--batch` + // but we still keep it as-is. + client.command("q"); + + let client_status = tokio::time::timeout( + tokio::time::Duration::from_secs(60), + tokio::task::spawn_blocking(move || client.status()), + ) + .await???; // Nani??? 
+ + Ok(client_status) + } + + pub async fn run(self, gdb_iex: P1, gdb_ex: P2) -> Result + where + P1: IntoIterator + Send, + P2: IntoIterator + Send, + S: AsRef, + { + let gdb_iex: Vec = gdb_iex + .into_iter() + .map(|s| String::from(s.as_ref())) + .collect(); + let gdb_ex: Vec = gdb_ex + .into_iter() + .map(|s| String::from(s.as_ref())) + .collect(); + let (server_output, client_exit_status) = + future::try_join(self.run_server(), self.run_client(gdb_iex, gdb_ex)).await?; + if !client_exit_status.success() { + bail!("gdb client exited with {}", client_exit_status); + } + Ok(server_output) + } +} + +#[derive(StructOpt, Debug, Clone)] +struct GdbServerHelperArgs { + /// The binary to run. + #[structopt()] + test_binary: PathBuf, + + /// The arguments to pass to the binary. + #[structopt()] + test_binary_args: Vec, + + /// Path to the GDB binary. + #[structopt(long, default_value = "gdb")] + gdb: PathBuf, + + /// Path to a file containing GDB commands to execute before loading the + /// inferior. + #[structopt(long)] + iex: Option, + + /// Path to a file containing GDB commands to execute. + #[structopt(long)] + ex: Option, + + /// The expected exit code. + #[structopt(long)] + exit_code: i32, + + /// Path to the expected stderr. + #[structopt(long)] + stderr: Option, + + /// Path to the expected stdout. + #[structopt(long)] + stdout: Option, +} + +#[tokio::main(flavor = "current_thread")] +async fn main() -> Result<(), Error> { + let args = GdbServerHelperArgs::from_args(); + + let session = RemoteGdbSession::new(args.gdb, args.test_binary, args.test_binary_args); + + // TODO: Provide a way to pass file paths to gdb instead. + let iex = if let Some(iex) = args.iex { + let iex_file = io::BufReader::new(fs::File::open(iex)?); + iex_file.lines().collect::>>()? + } else { + Vec::new() + }; + + let ex = if let Some(ex) = args.ex { + let ex_file = io::BufReader::new(fs::File::open(ex)?); + ex_file.lines().collect::>>()? 
+ } else { + Vec::new() + }; + + let output = session.run(iex, ex).await?; + + if let Some(stderr) = args.stderr { + // TODO: Display a diff if these don't match. + let stderr = fs::read(stderr)?; + assert_eq!( + String::from_utf8(stderr)?, + String::from_utf8(output.stderr)? + ); + } + + if let Some(stdout) = args.stdout { + // TODO: Display a diff if these don't match. + let stdout = fs::read(stdout)?; + assert_eq!( + String::from_utf8(stdout)?, + String::from_utf8(output.stdout)? + ); + } + + assert_eq!(ExitStatus::from_raw(args.exit_code), output.status.into()); + Ok(()) +} + +#[cfg(test)] +mod test { + use super::*; + + const NO_ARGS: &[&str] = &[]; + + #[tokio::test(flavor = "current_thread")] + async fn debug_ls_b_main_detach() { + let session = RemoteGdbSession::new("gdb", "/bin/ls", NO_ARGS); + assert!( + session + .run([], &["b main", "c", "detach"]) + .await + .unwrap() + .status + .success() + ); + } + + #[tokio::test(flavor = "current_thread")] + async fn debug_ls_with_b_main_kill() { + let session = RemoteGdbSession::new("gdb", "/bin/ls", NO_ARGS); + assert_eq!( + session + .run([], &["b main", "c", "kill inferiors 1"]) + .await + .unwrap() + .status + .signal(), + Some(9), + ); + } + + #[tokio::test(flavor = "current_thread")] + async fn debug_ls_with_b_main_continue() { + let session = RemoteGdbSession::new("gdb", "/bin/ls", NO_ARGS); + assert!( + session + .run([], &["b main", "c", "c"]) + .await + .unwrap() + .status + .success() + ); + } + + #[tokio::test(flavor = "current_thread")] + async fn debug_uname_with_b_main_continue() { + let session = RemoteGdbSession::new("gdb", "/bin/uname", vec!["-s"]); + assert_eq!( + session.run([], &["b main", "c", "c"]).await.unwrap().stdout, + b"Linux\n", + ); + } + + #[tokio::test(flavor = "current_thread")] + async fn debug_file_does_not_exist_with_b_main_continue() { + let session = RemoteGdbSession::new("gdb", "/this_file/does/not/exist!", NO_ARGS); + assert!(session.run(None, &["b main", "c", 
"c"]).await.is_err()); + } +} diff --git a/tests/gdbserver-integration/gdbserver-helper/src/server.rs b/tests/gdbserver-integration/gdbserver-helper/src/server.rs new file mode 100644 index 0000000..acb4bf3 --- /dev/null +++ b/tests/gdbserver-integration/gdbserver-helper/src/server.rs @@ -0,0 +1,78 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use anyhow::Error; +use reverie::process::{Command, Mount, Namespace, Output, Stdio}; +use reverie::Subscription; +use reverie::Tool; +use reverie_ptrace::GdbConnection; +use serde::Deserialize; +use serde::Serialize; +use std::path::PathBuf; + +pub struct GdbServerCommand { + // NB: ideally we could also attach to a existing pid, but this is not + // supported by reverie yet.. + program_to_run: PathBuf, + program_args: Vec, + connection: GdbConnection, +} + +#[derive(Serialize, Deserialize, Default)] +struct TestTool; + +impl Tool for TestTool { + fn subscriptions(_cfg: &()) -> Subscription { + Subscription::all() + } +} + +async fn run(command: Command, connection: GdbConnection) -> Result { + let (output, _global_state) = reverie_ptrace::TracerBuilder::::new(command) + .gdbserver(connection) + .spawn() + .await? 
+ .wait_with_output() + .await?; + Ok(output) +} + +impl GdbServerCommand { + pub fn new(program_to_run: P, program_args: A, connection: GdbConnection) -> Self + where + P: Into, + A: IntoIterator, + S: AsRef, + { + GdbServerCommand { + program_to_run: program_to_run.into(), + program_args: program_args + .into_iter() + .map(|s| String::from(s.as_ref())) + .collect(), + connection, + } + } + + /// run gdbserver under namespace + pub async fn output(self) -> Result { + let mut command = Command::new(&self.program_to_run); + command.args(&self.program_args); + command + .unshare(Namespace::PID) + .map_root() + .hostname("hermetic-container.local") + .domainname("local") + .mount(Mount::proc()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + + run(command, self.connection).await + } +} diff --git a/tests/gdbserver-integration/test-src/forkExec.c b/tests/gdbserver-integration/test-src/forkExec.c new file mode 100644 index 0000000..43fcc5f --- /dev/null +++ b/tests/gdbserver-integration/test-src/forkExec.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char* argv[], char* envp[]) { + if (argc == 2 && strcmp(argv[1], "child") == 0) { + printf("exec pid: %u\n", getpid()); + _exit(0); + } + + pid_t pid = fork(); + + if (pid < 0) { + perror("fork failed: "); + exit(1); + } else if (pid == 0) { + char* prog = argv[0]; + char* const newArgv[] = {prog, "child", NULL}; + printf("child pid: %u\n", getpid()); + execve(prog, newArgv, envp); + printf("exec failed: %s\n", strerror(errno)); + } else { + int status; + printf("parent pid: %u\n", getpid()); + waitpid(pid, &status, 0); + if (WIFSIGNALED(status)) { + printf("%u terminated by signal: %u\n", pid, WTERMSIG(status)); + } + } +} diff --git a/tests/gdbserver-integration/test-src/manyThreads.c b/tests/gdbserver-integration/test-src/manyThreads.c new file mode 100644 index 0000000..2c8561a --- /dev/null +++ b/tests/gdbserver-integration/test-src/manyThreads.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NR_THREADS 8L +#define MAX_THREADS 2048 + +// NB: this counter is supposed to be set by gdb cli for testing only +static volatile unsigned int bkpt_resumed_count; + +static _Atomic unsigned int thread_count; + +extern pid_t gettid(void); + +__attribute__((noinline)) void bkpt(void) {} + +__attribute__((noinline)) void foo(void) { + atomic_fetch_add(&thread_count, 1); + bkpt(); +} + +__attribute__((noinline)) void* threaded(void* param) { + foo(); + + return 0; +} + +int main(int argc, char* argv[]) { + foo(); + + unsigned nthreads = NR_THREADS; + if (argc == 2) { + nthreads = atoi(argv[1]); + } + + if (nthreads > MAX_THREADS) { + nthreads = MAX_THREADS; + } + + pthread_t* threadid = calloc(nthreads, sizeof(pthread_t)); + + for (int i = 0; i < nthreads; i++) { + assert(pthread_create(&threadid[i], NULL, threaded, NULL) == 0); + } + for (int i = 0; i < nthreads; i++) { + pthread_join(threadid[i], NULL); + } + + struct timespec tp = { + .tv_sec = 0, + .tv_nsec = 100000000, + }; + clock_nanosleep(CLOCK_MONOTONIC, 0, &tp, NULL); + + printf("%d %d\n", thread_count, bkpt_resumed_count); + + return 0; +} diff --git a/tests/gdbserver-integration/test-src/nested.c b/tests/gdbserver-integration/test-src/nested.c new file mode 100644 index 0000000..a453b49 --- /dev/null +++ b/tests/gdbserver-integration/test-src/nested.c @@ -0,0 +1,33 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +/* compile with clang.par nested.c -o nested -O0 -g -Wall */ +#include + +int bar(int); +int baz(int, int); + +int foo(int a, int b) { + int x = a * a + b * b; + return bar(x); +} + +int bar(int x) { + int y = x * (1 + x); + return baz(x, y); +} + +int baz(int a, int b) { + return (a + b) * (a - b); +} + +int main(int argc, char* argv[]) { + printf("%d\n", foo(3, 4)); + return 0; +} diff --git a/tests/gdbserver-integration/test-src/openat1.c b/tests/gdbserver-integration/test-src/openat1.c new file mode 100644 index 0000000..80b4f13 --- /dev/null +++ b/tests/gdbserver-integration/test-src/openat1.c @@ -0,0 +1,53 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +int segv(int sig, siginfo_t* info, void* u) { + unsigned char* ip = info->si_addr; + printf("received signal: %d, si_addr: %p\n", sig, ip); + + for (int i = 0; i < 8; i++) { + printf("%02x ", (int)ip[i] & 0xff); + } + printf("\n"); + + return 0; +} + +int main(int argc, char* argv[]) { + struct sigaction sa, old_sa; + const char* file = "/dev/urandom"; + int fd; + + memset(&sa, 0, sizeof(sa)); + sa.sa_flags = SA_RESETHAND | SA_SIGINFO; + + sigaction(SIGSEGV, &sa, &old_sa); + + fd = open(file, 0); + printf("openat1: %d\n", fd); + if (fd < 0) { + fprintf(stderr, "open %s, error: %s\n", file, strerror(errno)); + } + + fd = open(file, 0); + printf("openat1: %d\n", fd); + if (fd < 0) { + fprintf(stderr, "open %s, error: %s\n", file, strerror(errno)); + } + + return 0; +} diff --git a/tests/gdbserver-integration/test-src/threads1.c b/tests/gdbserver-integration/test-src/threads1.c new file mode 100644 index 0000000..c5baa5e --- /dev/null +++ b/tests/gdbserver-integration/test-src/threads1.c @@ -0,0 +1,70 @@ +/* + * Copyright (c) Facebook, Inc. 
and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define NR_THREADS 2L +#define TIME_100MS 100000000UL + +__attribute__((noinline)) void bkpt(void) {} + +static void test_clock_nanosleep(unsigned long ns) { + struct timespec req = { + .tv_sec = 0, + .tv_nsec = ns, + }; + struct timespec rem; + int ret; + + do { + ret = clock_nanosleep(CLOCK_REALTIME, 0, &req, &rem); + bkpt(); + memcpy(&req, &rem, sizeof(req)); + } while (ret != 0 && errno == EINTR); +} + +static void* threaded(void* param) { + long k = (long)param; + + printf("thread %ld enter.\n", k); + + test_clock_nanosleep(TIME_100MS); + + printf("thread %ld exit.\n", k); + + return 0; +} + +int main(int argc, char* argv[]) { + // sleep in a non-threpaded context + test_clock_nanosleep(TIME_100MS); + + pthread_attr_t attr; + pthread_t threadid[NR_THREADS]; + + assert(pthread_attr_init(&attr) == 0); + + for (long i = 0; i < NR_THREADS; i++) { + assert(pthread_create(&threadid[i], &attr, threaded, (void*)i) == 0); + } + + for (long i = 0; i < NR_THREADS; i++) { + assert(pthread_join(threadid[i], NULL) == 0); + } + + assert(pthread_attr_destroy(&attr) == 0); + + return 0; +} diff --git a/tests/gdbserver-integration/test-src/threads2.c b/tests/gdbserver-integration/test-src/threads2.c new file mode 100644 index 0000000..d84db15 --- /dev/null +++ b/tests/gdbserver-integration/test-src/threads2.c @@ -0,0 +1,58 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NR_THREADS 4L + +// NB: this counter is supposed to be set by gdb cli for testing only +static unsigned int bkpt_resumed_count; + +static _Atomic unsigned int thread_count; + +extern pid_t gettid(void); + +__attribute__((noinline)) void bkpt(void) {} + +__attribute__((noinline)) void foo(void) { + atomic_fetch_add(&thread_count, 1); + bkpt(); +} + +__attribute__((noinline)) void* threaded(void* param) { + foo(); + + return 0; +} + +int main(int argc, char* argv[]) { + pthread_t threadid[NR_THREADS]; + + for (int j = 0; j < 2; j++) { + for (int i = 0; i < NR_THREADS; i++) { + assert(pthread_create(&threadid[i], NULL, threaded, NULL) == 0); + } + for (int i = 0; i < NR_THREADS; i++) { + pthread_join(threadid[i], NULL); + } + } + + printf("%d %d\n", thread_count, bkpt_resumed_count); + + return 0; +} diff --git a/tests/parallelism.rs b/tests/parallelism.rs new file mode 100644 index 0000000..e2f611c --- /dev/null +++ b/tests/parallelism.rs @@ -0,0 +1,159 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! 
Tests for parallelism and concurrency + +use reverie::{syscalls::Syscall, Error, GlobalTool, Guest, Tid, Tool}; +use serde::{Deserialize, Serialize}; +use tokio::time::{sleep, Duration}; + +#[derive(Debug, Serialize, Deserialize, Default)] +struct GlobalState {} + +#[derive(Debug, Serialize, Deserialize, Default)] +struct TestTool {} + +#[reverie::global_tool] +impl GlobalTool for GlobalState { + type Request = (); + type Response = (); + + async fn receive_rpc(&self, _from: Tid, _threads: Self::Request) -> Self::Response { + // TODO: replace this with an ivar read: + for _i in 0..400_000 { + tokio::task::yield_now().await; + } + sleep(Duration::from_millis(1000)).await; + // Overkill: spin this async task to make sure there are plenty of turns of the + // other thread. + for _i in 0..400_000 { + tokio::task::yield_now().await; + } + } +} + +#[reverie::tool] +impl Tool for TestTool { + type GlobalState = GlobalState; + type ThreadState = u64; + + fn init_thread_state( + &self, + _tid: Tid, + parent: Option<(Tid, &Self::ThreadState)>, + ) -> Self::ThreadState { + match parent { + None => 0, + Some((_, n)) => n + 1, + } + } + + async fn handle_thread_start>(&self, guest: &mut T) -> Result<(), Error> { + if guest.is_root_thread() { + eprintln!("Root thread starting..."); + } else { + eprintln!("Delaying child thread!"); + guest.send_rpc(()).await.unwrap(); + eprintln!("Done delaying child thread!"); + } + Ok(()) + } +} + +/// A test to interleave writes on memory. 
+#[test] +#[cfg(not(sanitized))] +pub fn delay_childprint_test() { + use reverie::ExitStatus; + use reverie_ptrace::testing::{print_tracee_output, test_fn}; + + let (output, _state) = test_fn::(|| { + let child = std::thread::spawn(move || { + for _ in 0..2 { + nix::unistd::write(1, b"a").unwrap(); + } + }); + for _ in 0..100 { + nix::unistd::write(1, b"b").unwrap(); + } + child.join().unwrap(); + nix::unistd::write(1, b"\n").unwrap(); + }) + .unwrap(); + + print_tracee_output(&output); + assert_eq!(output.status, ExitStatus::Exited(0)); + assert_eq!(output.stdout.len(), 103); + assert_eq!(output.stderr.len(), 0); + // Because the child was delayed it must finish last: + assert_eq!(output.stdout[101] as char, 'a'); +} + +// A test tool that blocks a handler indefinitely. +#[derive(Debug, Default, Serialize, Deserialize)] +struct TestTool2 {} + +#[reverie::tool] +impl Tool for TestTool2 { + async fn handle_syscall_event>( + &self, + guest: &mut T, + call: Syscall, + ) -> Result { + if let Syscall::Gettid(_) = call { + // Delay forever. When the main thread is killed, this future should + // get canceled. + futures::future::pending::<()>().await; + } + + guest.tail_inject(call).await + } +} + +#[cfg(not(sanitized))] +fn kill_blocked_child() { + use std::sync::{Arc, Barrier}; + + let barrier = Arc::new(Barrier::new(2)); + + let _handle = { + let barrier = barrier.clone(); + std::thread::spawn(move || { + barrier.wait(); + unsafe { + libc::syscall(libc::SYS_gettid, 0) + }; + }) + }; + + // Wait for the thread to start up. + barrier.wait(); + + // This should cause the handler future for the thread to get dropped the + // next time it is polled. + unsafe { + libc::syscall(libc::SYS_exit_group, 0) + }; + + unreachable!() +} + +/// Test where handle_syscall_event blocks a child thread forever. The +/// expectation is that calling `exit_group` will cancel the +/// `handle_syscall_event` future. 
+#[cfg(not(sanitized))] +#[test] +pub fn test_kill_blocked_child() { + use reverie::ExitStatus; + use reverie_ptrace::testing::test_fn; + + let (output, _state) = test_fn::(kill_blocked_child).unwrap(); + reverie_ptrace::testing::print_tracee_output(&output); + assert_eq!(output.status, ExitStatus::Exited(0)); +} diff --git a/tests/rdtsc.rs b/tests/rdtsc.rs new file mode 100644 index 0000000..60499a8 --- /dev/null +++ b/tests/rdtsc.rs @@ -0,0 +1,102 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use reverie::{Errno, GlobalTool, Guest, Rdtsc, RdtscResult, Subscription, Tid, Tool}; +use serde::{Deserialize, Serialize}; +use std::sync::atomic::{AtomicUsize, Ordering}; + +#[derive(Debug, Serialize, Deserialize, Default)] +struct GlobalState { + tsc: AtomicUsize, +} + +#[reverie::global_tool] +impl GlobalTool for GlobalState { + type Request = Rdtsc; + type Response = RdtscResult; + + async fn init_global_state(_: &Self::Config) -> Self { + GlobalState { + tsc: AtomicUsize::new(19200), + } + } + + async fn receive_rpc(&self, _from: Tid, args: Rdtsc) -> RdtscResult { + let tsc = self.tsc.load(Ordering::Relaxed); + self.tsc.store(1 + tsc, Ordering::Relaxed); + match args { + Rdtsc::Tsc => RdtscResult { + tsc: tsc as u64, + aux: None, + }, + Rdtsc::Tscp => RdtscResult { + tsc: tsc as u64, + aux: Some(0), + }, + } + } +} + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct LocalState {} + +#[reverie::tool] +impl Tool for LocalState { + type GlobalState = GlobalState; + + fn subscriptions(_cfg: &()) -> Subscription { + let mut s = Subscription::none(); + s.rdtsc(); + s + } + + async fn handle_rdtsc_event>( + &self, + guest: &mut T, + request: Rdtsc, + ) -> Result { + let tsc = guest.send_rpc(request).await.unwrap(); + println!("handle_rdtsc: returned {:?}", tsc); + Ok(tsc) + 
} +} + +#[cfg(all(not(sanitized), test))] +mod tests { + use super::*; + use reverie_ptrace::testing::check_fn; + + #[allow(unused_mut)] + #[inline(never)] + unsafe fn rdtscp() -> (u64, u32) { + let mut aux_val = core::mem::MaybeUninit::uninit(); + let tsc = core::arch::x86_64::__rdtscp(aux_val.as_mut_ptr()); + (tsc, aux_val.assume_init()) + } + + #[test] + fn run_guest_func_rdtsc_intercepted_test() { + let state = check_fn::(|| { + let tsc1 = unsafe { core::arch::x86_64::_rdtsc() }; + let tsc2 = unsafe { core::arch::x86_64::_rdtsc() }; + assert_eq!(1 + tsc1, tsc2); + }); + assert_ne!(state.tsc.load(Ordering::Relaxed), 0); + } + + #[test] + fn run_guest_func_rdtscp_intercepted_test() { + let state = check_fn::(move || { + let (tsc1, _) = unsafe { rdtscp() }; + let (tsc2, _) = unsafe { rdtscp() }; + assert_eq!(1 + tsc1, tsc2); + }); + assert_ne!(state.tsc.load(Ordering::Relaxed), 0); + } +} diff --git a/tests/shell_tests/build-musl.sh b/tests/shell_tests/build-musl.sh new file mode 100755 index 0000000..5a2e468 --- /dev/null +++ b/tests/shell_tests/build-musl.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -e + +MUSL_VER=1.2.1 +MUSL_SOURCE_WORK_DIR=$(mktemp -d) +MUSL_BUILD_DIR="" + +trap cleanup EXIT + +function cleanup { + echo "cleanup.." 
&& rm -fr "${MUSL_SOURCE_WORK_DIR}" +} + +function prepare { + mkdir -p "${MUSL_SOURCE_WORK_DIR}" && cd "${MUSL_SOURCE_WORK_DIR}" + curl -L $(fwdproxy-config curl) "https://git.musl-libc.org/cgit/musl/snapshot/musl-${MUSL_VER}.tar.gz" | tar -zxf - + mkdir -p "musl-${MUSL_VER}-build" && cd "musl-${MUSL_VER}-build" + MUSL_BUILD_DIR=$(pwd) +} + +function build { + if [ "${MUSL_BUILD_DIR}" != "" ]; then + cd "${MUSL_BUILD_DIR}" + ../musl-${MUSL_VER}/configure --prefix="" + make -j + fi +} + +prepare && build diff --git a/tests/signal.rs b/tests/signal.rs new file mode 100644 index 0000000..bd79877 --- /dev/null +++ b/tests/signal.rs @@ -0,0 +1,113 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// signal handling related tests. + +use nix::sys::signal::Signal; +use reverie::{ + syscalls::{AddrMut, ExitGroup, MemoryAccess, RtSigpending, Syscall, SyscallInfo, Sysno}, + Error, Guest, Tool, +}; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct LocalState; + +#[reverie::tool] +impl Tool for LocalState { + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + if syscall.number() == Sysno::exit_group { + let sigset_rptr = 0x7000_0100usize; + let sigset: AddrMut = AddrMut::from_raw(sigset_rptr as _).unwrap(); + let exit_failure = ExitGroup::new().with_status(1); + let exit_success = syscall; + if guest + .inject( + RtSigpending::new() + .with_set(Some(sigset)) + .with_sigsetsize(8usize), + ) + .await + .is_ok() + { + let memory = guest.memory(); + let pending: u64 = memory.read_value(sigset.cast())?; + if pending != 1u64 << (Signal::SIGVTALRM as i32 - 1) { + guest.tail_inject(exit_failure).await + } else { + guest.tail_inject(exit_success).await + } + } else { + 
guest.tail_inject(exit_success).await + } + } else { + guest.tail_inject(syscall).await + } + } +} + +#[cfg(all(not(sanitized), test))] +mod tests { + use super::*; + use nix::sys::signal; + use reverie_ptrace::testing::check_fn; + use std::{io, mem::MaybeUninit}; + + // kernel_sigset_t used by naked syscall + #[derive(Clone, Copy, PartialEq, Eq, Debug)] + struct KernelSigset(u64); + + impl From<&[Signal]> for KernelSigset { + fn from(signals: &[Signal]) -> Self { + let mut set: u64 = 0; + for &sig in signals { + set |= 1u64 << (sig as usize - 1); + } + KernelSigset(set) + } + } + + #[allow(dead_code)] + unsafe fn block_signals(signals: &[Signal]) -> io::Result { + let set = KernelSigset::from(signals); + let mut oldset: MaybeUninit = MaybeUninit::uninit(); + + if libc::syscall( + libc::SYS_rt_sigprocmask, + libc::SIG_BLOCK, + &set as *const _, + oldset.as_mut_ptr(), + 8, + ) != 0 + { + Err(io::Error::last_os_error()) + } else { + Ok(KernelSigset(oldset.assume_init())) + } + } + + #[test] + // The actual test is in `handle_syscall_event`. To test we can get + // pending signals from tracee, by injecting rt_sigpending. + fn can_get_pending_signals() { + check_fn::(|| { + assert!(unsafe { block_signals(&[Signal::SIGVTALRM]) }.is_ok()); + + assert!(signal::raise(Signal::SIGVTALRM).is_ok()); + + unsafe { + libc::syscall(libc::SYS_exit_group, 0) + }; + }); + } +} diff --git a/tests/signalfd.rs b/tests/signalfd.rs new file mode 100644 index 0000000..db16f7c --- /dev/null +++ b/tests/signalfd.rs @@ -0,0 +1,127 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// signal handling related tests. 
+ +use reverie::{ + syscalls::{ExitGroup, Syscall, SyscallInfo}, + Error, Guest, Tool, +}; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct LocalState; + +#[derive(Debug, Serialize, Deserialize, Default)] +struct ThreadState; + +#[reverie::tool] +impl Tool for LocalState { + type ThreadState = ThreadState; + + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + let exit_failure = ExitGroup::new().with_status(1); + match syscall { + // glibc should wrap signalfd -> signalfd4(2). + Syscall::Signalfd(_) => guest.tail_inject(exit_failure).await, + Syscall::Signalfd4(_) => { + let (_, args) = syscall.into_parts(); + assert_eq!(args.arg2, 8); + assert_eq!(args.arg3, libc::SFD_CLOEXEC as u64); + guest.tail_inject(syscall).await + } + _ => guest.tail_inject(syscall).await, + } + } +} + +#[cfg(all(not(sanitized), test))] +mod tests { + use super::*; + use nix::sys::signal::Signal; + use reverie_ptrace::testing::check_fn; + use std::{ + fs::File, + io::{self, Read}, + mem::{self, MaybeUninit}, + os::unix::io::FromRawFd, + }; + + // kernel_sigset_t used by naked syscall + #[derive(Clone, Copy, PartialEq, Eq, Debug)] + struct KernelSigset(u64); + + impl From<&[Signal]> for KernelSigset { + fn from(signals: &[Signal]) -> Self { + let mut set: u64 = 0; + for &sig in signals { + set |= 1u64 << (sig as usize - 1); + } + KernelSigset(set) + } + } + + #[allow(dead_code)] + unsafe fn unblock_signals(signals: &[Signal]) -> io::Result { + let set = KernelSigset::from(signals); + let mut oldset: MaybeUninit = MaybeUninit::uninit(); + + if libc::syscall( + libc::SYS_rt_sigprocmask, + libc::SIG_UNBLOCK, + &set as *const _, + oldset.as_mut_ptr(), + 8, + ) != 0 + { + Err(io::Error::last_os_error()) + } else { + Ok(KernelSigset(oldset.assume_init())) + } + } + + #[test] + // The actual test is in `handle_syscall_event`. 
To test we can get + // pending signals from tracee, by injecting rt_sigpending. + fn signalfd_sanity_check() { + check_fn::(|| { + assert!(unsafe { unblock_signals(&[Signal::SIGVTALRM, Signal::SIGALRM]) }.is_ok()); + let mut sigset: MaybeUninit = MaybeUninit::uninit(); + let sigset = unsafe { + libc::sigemptyset(sigset.as_mut_ptr()); + libc::sigaddset(sigset.as_mut_ptr(), libc::SIGALRM); + libc::sigaddset(sigset.as_mut_ptr(), libc::SIGVTALRM); + sigset.assume_init() + }; + let fd = unsafe { libc::signalfd(-1, &sigset as *const _, libc::SFD_CLOEXEC) }; + assert!(fd > 0); + + let mut file = unsafe { File::from_raw_fd(fd) }; + let mut siginfo = [0; mem::size_of::()]; + + unsafe { + libc::alarm(1) + }; + + assert!(file.read_exact(&mut siginfo).is_ok()); + + let siginfo: libc::signalfd_siginfo = unsafe { mem::transmute(siginfo) }; + + assert_eq!(siginfo.ssi_signo, libc::SIGALRM as u32); + + unsafe { + libc::syscall(libc::SYS_exit_group, 0) + }; + }); + } +} diff --git a/tests/spinlock.rs b/tests/spinlock.rs new file mode 100644 index 0000000..b14f95c --- /dev/null +++ b/tests/spinlock.rs @@ -0,0 +1,53 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +use reverie::Tool; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct LocalState; + +#[reverie::tool] +impl Tool for LocalState {} + +#[cfg(all(not(sanitized), test))] +mod tests { + use super::*; + use reverie_ptrace::testing::check_fn; + use std::{ + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, + thread, time, + }; + + #[test] + fn run_guest_spinlock_test() { + check_fn::(move || { + let lock = Arc::new(AtomicUsize::new(0)); + let mut handles: Vec<_> = (0..10) + .map(|_| { + let lock = lock.clone(); + thread::spawn(move || while lock.load(Ordering::Acquire) != 10 {}) + }) + .collect(); + handles.push(thread::spawn(move || { + for _ in 0..10 { + lock.fetch_add(1, Ordering::Release); + let dur = time::Duration::from_millis(10); + thread::sleep(dur); + } + })); + for h in handles { + let _ = h.join(); + } + }); + } +} diff --git a/tests/stack.rs b/tests/stack.rs new file mode 100644 index 0000000..6c270a5 --- /dev/null +++ b/tests/stack.rs @@ -0,0 +1,198 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Tests for process and thread state. 
+ +use serde::{Deserialize, Serialize}; + +use reverie::{ + syscalls::{ + Addr, AddrMut, ExitGroup, MemoryAccess, Nanosleep, Syscall, SyscallInfo, Sysno, Timespec, + Uname, + }, + Error, Guest, Stack, Tool, +}; + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct LocalState; + +#[reverie::tool] +impl Tool for LocalState { + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + let exit_failure = ExitGroup::new().with_status(1); + let exit_success = ExitGroup::new().with_status(0); + match syscall.number() { + Sysno::uname => { + let mut stack = guest.stack().await; + + let uname_on_stack: AddrMut = stack.reserve(); + + stack.commit()?; + // inject uname using stack allocator + let _ = guest + .inject(Uname::new().with_buf(Some(uname_on_stack))) + .await?; + + // (re-) inject the old uname, with buf allocated by caller + let _ = guest.inject(syscall).await?; + + let memory = guest.memory(); + + let unamebuf: Addr = + Addr::from_raw(syscall.into_parts().1.arg0 as usize).unwrap(); + let uname1 = memory.read_value(unamebuf)?; + let uname2 = memory.read_value(uname_on_stack)?; + + if uname1 != uname2 { + guest.tail_inject(exit_failure).await + } else { + Ok(0) + } + } + Sysno::exit_group => { + let request = Timespec { + tv_sec: 1, + tv_nsec: 2, + }; + + let mut stack = guest.stack().await; + + let req = stack.push(request); + let rem: AddrMut = stack.reserve(); + stack.commit()?; + let ret = guest + .inject(Nanosleep::new().with_req(Some(req)).with_rem(Some(rem))) + .await?; + + let memory = guest.memory(); + let rem = memory.read_value(rem)?; + if ret == 0 && rem.tv_sec != 0 { + guest.tail_inject(exit_failure).await + } else { + guest.tail_inject(exit_success).await + } + } + _ => guest.tail_inject(syscall).await, + } + } +} + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct LocalState2; + +#[reverie::tool] +impl Tool for LocalState2 { + async fn handle_syscall_event>( + &self, + guest: &mut T, 
+ syscall: Syscall, + ) -> Result { + let exit_success = ExitGroup::new().with_status(0); + match syscall.number() { + Sysno::exit_group => { + let mut stack = guest.stack().await; + let ptr1: AddrMut = stack.reserve(); + let _guard1 = stack.commit().unwrap(); + guest.memory().write_value(ptr1, &3333).unwrap(); + let v1 = guest.memory().read_value(ptr1).unwrap(); + assert_eq!(v1, 3333); + + let mut stack = guest.stack().await; + let ptr2: AddrMut = stack.reserve(); + let _guard2 = stack.commit().unwrap(); + guest.memory().write_value(ptr2, &4444).unwrap(); + let v2 = guest.memory().read_value(ptr2).unwrap(); + assert_eq!(v2, 4444); + + guest.tail_inject(exit_success).await + } + _ => guest.tail_inject(syscall).await, + } + } +} + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct LocalState3; + +#[reverie::tool] +impl Tool for LocalState3 { + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + let exit_success = ExitGroup::new().with_status(0); + match syscall.number() { + Sysno::exit_group => { + { + let mut stack = guest.stack().await; + let ptr1: AddrMut = stack.reserve(); + let _guard1 = stack.commit().unwrap(); + guest.memory().write_value(ptr1, &3333).unwrap(); + let v1 = guest.memory().read_value(ptr1).unwrap(); + assert_eq!(v1, 3333); + } + + { + let mut stack = guest.stack().await; + let ptr2: AddrMut = stack.reserve(); + let _guard2 = stack.commit().unwrap(); + guest.memory().write_value(ptr2, &4444).unwrap(); + let v2 = guest.memory().read_value(ptr2).unwrap(); + assert_eq!(v2, 4444); + } + + guest.tail_inject(exit_success).await + } + _ => guest.tail_inject(syscall).await, + } + } +} + +#[cfg(all(not(sanitized), test))] +mod tests { + use super::*; + use reverie_ptrace::testing::check_fn; + + #[test] + fn stack_allocator_should_work() { + check_fn::(|| { + assert_ne!(nix::sys::utsname::uname().sysname(), ""); + unsafe { + libc::syscall(libc::SYS_exit_group, 0) + }; + }); + } + + /// A test that 
allocates on the stack TWICE. + /// Currently failing because we attempt to grab a second stack while the guard is still alive. + #[test] + #[should_panic] + fn stack_two_allocs_bad() { + check_fn::(|| { + unsafe { + libc::syscall(libc::SYS_exit_group, 0) + }; + }); + } + + /// In contrast, this is ok because the guard is dropped. + #[test] + fn stack_two_allocs_good() { + check_fn::(|| { + unsafe { + libc::syscall(libc::SYS_exit_group, 0) + }; + }); + } +} diff --git a/tests/standalone/README.md b/tests/standalone/README.md new file mode 100644 index 0000000..cb5b196 --- /dev/null +++ b/tests/standalone/README.md @@ -0,0 +1,5 @@ + +Standalone tool+test programs +============================= + +These programs combine a specific Reverie tool with a specific guest program. diff --git a/tests/standalone/at_random.rs b/tests/standalone/at_random.rs new file mode 100644 index 0000000..5050e75 --- /dev/null +++ b/tests/standalone/at_random.rs @@ -0,0 +1,78 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +use reverie::{syscalls::MemoryAccess, Errno, Error, ExitStatus, Guest, Tool}; +use serde::{Deserialize, Serialize}; +use std::env; + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct TestTool {} + +const PRNG_SEED: [u8; 16] = [ + 0x12, 0x34, 0x56, 0x78, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe, 0xca, 0xfe, 0x87, 0x65, 0x43, 0x21, +]; + +#[reverie::tool] +impl Tool for TestTool { + async fn handle_post_exec>(&self, guest: &mut T) -> Result<(), Errno> { + if let Some(ptr) = guest.auxv().at_random() { + // It is safe to mutate this address since libc has not yet had a + // chance to modify or copy the auxv table. 
+ let ptr = unsafe { ptr.into_mut() }; + guest.memory().write_value(ptr, &PRNG_SEED)?; + } + + Ok(()) + } +} + +fn guest_mode() { + println!("Running in guest mode (actual test)."); + + let at_random = unsafe { libc::getauxval(libc::AT_RANDOM) as *const u8 }; + let slice = unsafe { std::slice::from_raw_parts(at_random, 16) }; + + println!("Entropy (intercepted) at at_random {:02x?}", slice); + + assert_eq!(slice, PRNG_SEED); +} + +async fn host_mode() -> Result { + println!("Running in HOST mode (ReverieTool)"); + + let at_random = unsafe { libc::getauxval(libc::AT_RANDOM) as *const u8 }; + let slice = unsafe { std::slice::from_raw_parts(at_random, 16) }; + + println!("Entropy (non-intercepted) at at_random {:02x?}", slice); + + let mut command = reverie::process::Command::new(std::env::current_exe().unwrap()); + command.arg("guest"); + + let tracer = reverie_ptrace::TracerBuilder::::new(command) + .spawn() + .await?; + let (status, _) = tracer.wait().await?; + + Ok(status) +} + +#[tokio::main] +async fn main() -> Result<(), Error> { + let args: Vec = env::args().collect(); + match &args[..] { + [_] => host_mode().await?.raise_or_exit(), + [_, s] if s == "guest" => guest_mode(), + _ => panic!( + "Expected 'guest' or no CLI argument. Got unexpected command line args ({}): {:?}", + args.len(), + args + ), + } + + Ok(()) +} diff --git a/tests/standalone/inject_then_tail_inject.rs b/tests/standalone/inject_then_tail_inject.rs new file mode 100644 index 0000000..ab1c88a --- /dev/null +++ b/tests/standalone/inject_then_tail_inject.rs @@ -0,0 +1,162 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +use nix::unistd; +use reverie::{ + syscalls::{Displayable, MemoryAccess, Syscall, Sysno}, + Error, GlobalTool, Guest, Pid, Tool, +}; +use serde::{Deserialize, Serialize}; +use std::{alloc, env, mem}; +use tracing::warn; + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct TestTool {} + +type Dupcount = u64; + +#[reverie::global_tool] +impl GlobalTool for TestTool { + type Config = Dupcount; + + async fn receive_rpc(&self, _from: Pid, _message: ()) {} +} + +/// How many bytes of randomness to peak at. +const RAND_SIZE: usize = mem::size_of::(); + +/// How many times to DUPLICATE select system calls that are intercepted. +const NUM_REPS: Dupcount = 3; + +#[reverie::tool] +impl Tool for TestTool { + type GlobalState = TestTool; + + async fn handle_syscall_event>( + &self, + guest: &mut T, + call: Syscall, + ) -> Result { + let reps = guest.config().clone(); + match call { + Syscall::Gettid(_) + | Syscall::Getgid(_) + | Syscall::Getsid(_) + | Syscall::Getppid(_) + | Syscall::Getpgid(_) + | Syscall::Getpid(_) => { + for i in 1..=reps { + let syscall_ret = guest.inject(call).await; + warn!( + "[pid {}] Duplicated syscall ({}/{})! 
{} = {}", + guest.tid(), + i, + reps, + call.display_with_outputs(&guest.memory()), + syscall_ret.unwrap_or_else(|errno| errno.into_raw() as i64) + ); + } + } + Syscall::Getrandom(r) => { + if r.buflen() < RAND_SIZE { + warn!( + "[pid {}] not touching getrandom, buflen too small.", + guest.tid() + ); + } else { + for i in 1..=reps { + let syscall_ret = guest.inject(call).await; + let bufaddr = r.buf().unwrap(); + let mut buf: [u8; RAND_SIZE] = [0; RAND_SIZE]; + guest.memory().read_exact(bufaddr, &mut buf).unwrap(); + let rand_word: u64 = u64::from_le_bytes(buf); + warn!( + "[pid {}] Duplicated getrandom syscall ({}/{}): {}, returned {}, first word {}", + guest.tid(), + i, + reps, + call.display_with_outputs(&guest.memory()), + syscall_ret.unwrap_or_else(|errno| errno.into_raw() as i64), + rand_word + ); + } + } + } + _ => {} + } + // Irrespective of above, run a tail_inject at the end: + guest.tail_inject(call).await + } +} + +fn guest_mode() { + println!("Running in guest mode (actual test)."); + let tid = unistd::gettid(); + let pid = unistd::getpid(); + let gid = unistd::getgid(); + let ppid = unistd::getppid(); + let pgid = unistd::getpgid(None).unwrap(); + let sid = unistd::getsid(None).unwrap(); + println!( + "Read IDs: t {}, p {}, g {}, pp {}, pg {}, s{}", + tid, pid, gid, ppid, pgid, sid, + ); + // let r = syscalls::syscall!(0, 100, 0); + let sz = RAND_SIZE; + let rand_num: u64 = unsafe { + let layout = alloc::Layout::from_size_align(sz, sz).unwrap(); + let buf = alloc::alloc(layout); + let no = Sysno::getrandom as i64; + let rand = libc::syscall(no, buf, sz, 0); + if rand < 0 { + panic!("getrandom returned error code {}\n", rand); + } else if rand != sz as i64 { + panic!( + "getrandom did not generate all {} bytes (instead {}\n", + sz, rand + ); + } + + #[allow(clippy::cast_ptr_alignment)] + let num: u64 = *(buf as *mut u64); + alloc::dealloc(buf, layout); + num + }; + println!("Generated random number: {}", rand_num); +} + +async fn host_mode(thisprog: 
&str) -> Result { + println!("Running in HOST mode (ReverieTool)"); + + let mut command = reverie::process::Command::new(thisprog); + command.arg("guest"); + + let tracer = reverie_ptrace::TracerBuilder::::new(command) + .config(NUM_REPS) + .spawn() + .await?; + let (status, _) = tracer.wait().await?; + + Ok(status.code().unwrap_or(1)) +} + +#[tokio::main] +async fn main() -> Result<(), Error> { + let args: Vec = env::args().collect(); + match &args[..] { + [p] => std::process::exit(host_mode(p).await?), + [_, s] if s == "guest" => guest_mode(), + _ => panic!( + "Expected 'guest' or no CLI argument. Got unexpected command line args ({}): {:?}", + args.len(), + args + ), + } + + Ok(()) +} diff --git a/tests/standalone/parallel_tasks.rs b/tests/standalone/parallel_tasks.rs new file mode 100644 index 0000000..f7f15a2 --- /dev/null +++ b/tests/standalone/parallel_tasks.rs @@ -0,0 +1,106 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#![feature(get_mut_unchecked)] +#![feature(thread_id_value)] +use reverie::{Error, Tool}; +use serde::{Deserialize, Serialize}; +use std::env; +use std::sync::atomic::AtomicUsize; +use std::sync::atomic::Ordering; +use std::sync::Arc; +use std::thread; + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct TestTool {} + +#[reverie::tool] +impl Tool for TestTool { + type GlobalState = (); +} + +const NUM_ELEMENTS: usize = 1_000_000; + +/// In guest mode two threads will try to fill up half of the data array with their thread id as +/// value. The threads grab indices through an atomic int. For sufficiently large arrays we expect +/// the thread ids to show up interleaved. 
+fn guest_mode() { + let shared_data = Arc::new(Box::new([0; NUM_ELEMENTS])); + let shared_idx = Arc::new(AtomicUsize::new(0)); + + let handles: Vec> = (0..2) + .map(|_| { + let idx = shared_idx.clone(); + let mut data = shared_data.clone(); + thread::spawn(move || { + let tid = thread::current().id().as_u64().get(); + + // Get a mutable reference to the data. This is unsafe, but we guarantee the + // threads are always accessing unique non-overlapping indices of the array. + let data = unsafe { Arc::get_mut_unchecked(&mut data) }; + + // Give each thread half of the fetch_add attempts. + for _ in 0..(NUM_ELEMENTS / 2) { + let idx = idx.fetch_add(1, Ordering::SeqCst); + data[idx] = tid; + } + }) + }) + .collect(); + + for h in handles { + h.join().unwrap(); + } + + // Calculate the number of switch points. I.e. the number of times we observed interleaved + // writes between the threads. + let mut switch_points = 0; + let mut prev = shared_data[0]; + for i in 1..shared_data.len() { + if prev != shared_data[i] { + prev = shared_data[i]; + switch_points += 1; + } + } + + println!("Switch points: {}", switch_points); + if switch_points <= 1 { + eprintln!("Expected more than 1 switch point!"); + std::process::exit(1); + } +} + +async fn host_mode(thisprog: &str) -> Result { + println!("Running in HOST mode (ReverieTool)"); + + let mut command = reverie::process::Command::new(thisprog); + command.arg("guest"); + + let tracer = reverie_ptrace::TracerBuilder::::new(command) + .spawn() + .await?; + let (status, _) = tracer.wait().await?; + + Ok(status.code().unwrap_or(1)) +} + +#[tokio::main] +async fn main() -> Result<(), Error> { + let args: Vec = env::args().collect(); + match &args[..] { + [p] => std::process::exit(host_mode(p).await?), + [_, s] if s == "guest" => guest_mode(), + _ => panic!( + "Expected 'guest' or no CLI argument. 
Got unexpected command line args ({}): {:?}", + args.len(), + args + ), + } + + Ok(()) +} diff --git a/tests/stat.rs b/tests/stat.rs new file mode 100644 index 0000000..1952f73 --- /dev/null +++ b/tests/stat.rs @@ -0,0 +1,119 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// reinject stat* as fstatat unittest + +use reverie::{ + syscalls::{self, Displayable, Syscall}, + Error, Guest, Tool, +}; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct LocalState; + +async fn handle_newfstatat>( + guest: &mut T, + call: syscalls::Newfstatat, +) -> Result { + let res = guest.inject(call).await; + + println!("{} = {:?}", call.display(&guest.memory()), res); + + Ok(res?) +} + +#[reverie::tool] +impl Tool for LocalState { + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + match syscall { + Syscall::Stat(stat) => handle_newfstatat(guest, stat.into()).await, + Syscall::Lstat(lstat) => handle_newfstatat(guest, lstat.into()).await, + _ => guest.tail_inject(syscall).await, + } + } +} + +#[cfg(all(not(sanitized), test))] +mod tests { + use super::*; + use reverie_ptrace::testing::check_fn; + use std::mem::MaybeUninit; + + #[test] + fn stat_can_be_reinjected() { + check_fn::(|| { + let path = "/proc/self/exe\0".as_ptr() as _; + let fd = unsafe { libc::open(path, libc::O_RDONLY) }; + assert!(fd > 0); + + let mut stat_result: MaybeUninit = MaybeUninit::uninit(); + let mut lstat_result: MaybeUninit = MaybeUninit::uninit(); + let mut fstat_result: MaybeUninit = MaybeUninit::uninit(); + + assert_eq!(0, unsafe { libc::stat(path, stat_result.as_mut_ptr()) }); + let stat_result = unsafe { stat_result.assume_init() }; + assert_eq!(0, unsafe { libc::lstat(path, lstat_result.as_mut_ptr()) }); + let lstat_result 
= unsafe { lstat_result.assume_init() }; + assert_eq!(0, unsafe { libc::fstat(fd, fstat_result.as_mut_ptr()) }); + let fstat_result = unsafe { fstat_result.assume_init() }; + assert_eq!(stat_result.st_ino, fstat_result.st_ino); + assert_ne!(stat_result.st_ino, lstat_result.st_ino); + }) + } + + // glibc doesn't provide wrapper for statx + unsafe fn statx( + dirfd: i32, + path: *const i8, + flags: i32, + mask: u32, + statxbuf: *mut libc::statx, + ) -> i64 { + libc::syscall(libc::SYS_statx, dirfd, path, flags, mask, statxbuf) + } + + #[test] + fn statx_fstat_returns_same_ino() { + check_fn::(|| { + let path = "/proc/self/exe\0".as_ptr() as _; + let dirfd = libc::AT_FDCWD; + + let mut fstatat_result: MaybeUninit = MaybeUninit::uninit(); + let mut statx_result: MaybeUninit = MaybeUninit::uninit(); + + assert_eq!(0, unsafe { + libc::fstatat( + dirfd, + path, + fstatat_result.as_mut_ptr(), + libc::AT_SYMLINK_NOFOLLOW, + ) + }); + let fstatat_result = unsafe { fstatat_result.assume_init() }; + + assert_eq!(0, unsafe { + statx( + dirfd, + path, + libc::AT_SYMLINK_NOFOLLOW, + libc::STATX_INO, + statx_result.as_mut_ptr(), + ) + }); + let statx_result = unsafe { statx_result.assume_init() }; + + assert_eq!(fstatat_result.st_ino, statx_result.stx_ino); + }) + } +} diff --git a/tests/state.rs b/tests/state.rs new file mode 100644 index 0000000..5d52250 --- /dev/null +++ b/tests/state.rs @@ -0,0 +1,169 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Tests for process and thread state. 
+ +use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Mutex, +}; + +use serde::{Deserialize, Serialize}; + +use reverie::{syscalls::Syscall, Error, ExitStatus, GlobalRPC, GlobalTool, Guest, Pid, Tool}; + +#[derive(Debug, Serialize, Deserialize, Default)] +struct GlobalState { + // Map of pids to parent pids and syscall counts. This is only updated when + // a process exits. + tree: Mutex)>>, +} + +#[derive(Debug, Serialize, Deserialize, Default)] +struct ThreadState { + // Number of syscalls executed by this thread. + syscalls: usize, + + // Number of children this thread has. + children: AtomicUsize, +} + +#[reverie::global_tool] +impl GlobalTool for GlobalState { + type Request = Vec<(i32, usize, usize)>; + type Response = (); + + async fn receive_rpc(&self, from: Pid, threads: Self::Request) -> Self::Response { + // Merge with global state. + self.tree.lock().unwrap().push((from.as_raw(), threads)); + } +} + +#[derive(Debug, Serialize, Deserialize, Default)] +struct TestTool { + // Vec of thread ids and their syscall counts and children. This is only + // updated when a thread exits. 
+ threads: Mutex>, +} + +#[reverie::tool] +impl Tool for TestTool { + type GlobalState = GlobalState; + type ThreadState = ThreadState; + + fn init_thread_state( + &self, + _tid: Pid, + parent: Option<(Pid, &Self::ThreadState)>, + ) -> Self::ThreadState { + if let Some((_, parent)) = parent { + parent.children.fetch_add(1, Ordering::Relaxed); + } + + ThreadState::default() + } + + async fn on_exit_process>( + self, + _pid: Pid, + global_state: &G, + _exit_status: ExitStatus, + ) -> Result<(), Error> { + global_state + .send_rpc(self.threads.into_inner().unwrap()) + .await?; + Ok(()) + } + + async fn on_exit_thread>( + &self, + tid: Pid, + _global_state: &G, + thread_state: Self::ThreadState, + _exit_status: ExitStatus, + ) -> Result<(), Error> { + self.threads.lock().unwrap().push(( + tid.as_raw(), + thread_state.syscalls, + thread_state.children.load(Ordering::Relaxed), + )); + Ok(()) + } + + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + guest.thread_state_mut().syscalls += 1; + guest.tail_inject(syscall).await + } +} + +#[cfg(not(sanitized))] +#[test] +fn basic_test() { + use reverie_ptrace::testing::check_fn; + + let state = check_fn::(|| { + // Spawn some top-level threads + let handles = (0..4) + .map(|_| { + std::thread::spawn(|| { + // ...with child threads + (0..4) + .map(|_| { + std::thread::spawn(|| { + // ...that call getpid a bunch of times. + for _ in 0..100 { + let _ = unsafe { libc::getpid() }; + } + }) + }) + .collect::>() + }) + }) + .collect::>(); + + for handles in handles { + for handle in handles.join().unwrap() { + handle.join().unwrap(); + } + } + }); + + let mut tree = state.tree.lock().unwrap(); + let threads = tree.pop().unwrap().1; + + // There should have been only a single top-level process. + assert!(tree.is_empty()); + + // We spawned 20 threads and one thread group loader (21 in total). 
+ // 4 of the threads simply wait on child threads and 16 of the + // threads do the bogus syscalls. + assert_eq!(threads.len(), 21); + + // Partition the threads based on how many children they had. + let (parent_threads, child_threads) = threads + .into_iter() + .partition::, _>(|&(_, _, children)| children > 0); + + assert_eq!(parent_threads.len(), 5); + assert_eq!(child_threads.len(), 16); + + // All 5 parent threads (the initial thread plus the 4 top-level threads) have 4 children each. + for (_tid, _count, children) in parent_threads { + assert_eq!(children, 4); + } + + // The other 16 threads do the syscalls. + for (_tid, count, _children) in child_threads { + assert!(count >= 100); + } +} diff --git a/tests/thread_start.rs b/tests/thread_start.rs new file mode 100644 index 0000000..4182b34 --- /dev/null +++ b/tests/thread_start.rs @@ -0,0 +1,51 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#![cfg(not(sanitized))] + +use reverie::{syscalls, Error, Guest, Pid, Tool}; +use serde::{Deserialize, Serialize}; + +#[test] +fn thread_start_inject() { + #[derive(Debug, Serialize, Deserialize, Default)] + struct TestTool; + + #[reverie::tool] + impl Tool for TestTool { + type GlobalState = (); + type ThreadState = (); + + async fn handle_thread_start>(&self, guest: &mut T) -> Result<(), Error> { + let ret = guest.inject(syscalls::Getpid::new()).await?; + assert_eq!(Pid::from_raw(ret as i32), guest.pid()); + Ok(()) + } + } + + reverie_ptrace::testing::check_fn::(|| {}); +} + +#[test] +fn thread_start_tail_inject() { + #[derive(Debug, Serialize, Deserialize, Default)] + struct TestTool; + + #[reverie::tool] + impl Tool for TestTool { + type GlobalState = (); + type ThreadState = (); + + async fn handle_thread_start>(&self, guest: &mut T) -> Result<(), Error> { + guest.tail_inject(syscalls::Getpid::new()).await; + unreachable!() + } + } + + reverie_ptrace::testing::check_fn::(|| {}); +} diff --git a/tests/timer_semantics.rs b/tests/timer_semantics.rs new file mode 100644 index 0000000..9f53e80 --- /dev/null +++ b/tests/timer_semantics.rs @@ -0,0 +1,712 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +//! Verifies precision, determinism, and cancellation of timer events and clocks. +//! +//! Syscalls are abused to communicate from the guest to the tool instructions +//! necessary to carry out the test, such as setting timers or reading clocks. 
+ +#![feature(llvm_asm)] + +use core::arch::x86_64::{__cpuid, __rdtscp, _rdtsc}; +use libc; +use reverie::{ + syscalls::{Getpid, Gettid, Syscall, SyscallInfo, Sysno, Tgkill}, + Errno, Error, GlobalTool, Guest, Pid, Signal, Subscription, TimerSchedule, Tool, +}; +use serde::{Deserialize, Serialize}; +use std::sync::atomic::{AtomicU64, Ordering}; + +#[derive(Debug, Serialize, Deserialize, Default)] +struct GlobalState { + num_timer_evts: AtomicU64, + num_signals: AtomicU64, +} + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct LocalState; + +#[derive(PartialEq, Debug, Eq, Clone, Copy, Serialize, Deserialize)] +enum IncrMsg { + Timer, + Signal, +} + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct Config { + sub_syscalls_only: bool, + run_basic_tests: bool, + timeout_rcbs: u64, + timeout_rcbs_alternate: u64, +} + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct ThreadClockState { + /// baseline for clock comparisons + last_tick: u64, + /// offset from baseline to assert at a timer event + timer_assertion: Option, +} + +#[reverie::global_tool] +impl GlobalTool for GlobalState { + type Request = IncrMsg; + type Response = (); + type Config = Config; + + async fn init_global_state(_: &Self::Config) -> Self { + GlobalState { + num_timer_evts: AtomicU64::new(0), + num_signals: AtomicU64::new(0), + } + } + + async fn receive_rpc(&self, _from: Pid, msg: IncrMsg) -> Self::Response { + match msg { + IncrMsg::Timer => self.num_timer_evts.fetch_add(1, Ordering::SeqCst), + IncrMsg::Signal => self.num_signals.fetch_add(1, Ordering::SeqCst), + }; + } +} + +const BULK_INJECTION_COUNT: u64 = 10; + +#[reverie::tool] +impl Tool for LocalState { + type GlobalState = GlobalState; + type ThreadState = ThreadClockState; + + fn subscriptions(cfg: &Config) -> Subscription { + if cfg.sub_syscalls_only { + Subscription::all_syscalls() + } else { + Subscription::all() + } + } + + async fn handle_thread_start>(&self, guest: &mut T) -> Result<(), 
Error> { + if guest.config().run_basic_tests { + assert_eq!(guest.read_clock().unwrap(), 0); + assert!(guest.set_timer(TimerSchedule::Rcbs(0)).is_err()); + } + Ok(()) + } + + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + let timeout = TimerSchedule::Rcbs(guest.config().timeout_rcbs); + let alt_timeout = TimerSchedule::Rcbs(guest.config().timeout_rcbs_alternate); + let (no, args) = syscall.into_parts(); + match no { + Sysno::clock_getres => { + guest.set_timer_precise(timeout).unwrap(); + } + Sysno::msgrcv => { + guest.set_timer_precise(alt_timeout).unwrap(); + } + Sysno::timer_getoverrun => { + guest.set_timer(timeout).unwrap(); + } + Sysno::fanotify_init => { + guest.set_timer_precise(timeout).unwrap(); + let kill_call = raise_sigwinch(guest).await; + guest.tail_inject(kill_call).await + } + Sysno::fanotify_mark => { + guest.set_timer(timeout).unwrap(); + let kill_call = raise_sigwinch(guest).await; + guest.tail_inject(kill_call).await + } + Sysno::msgctl => { + guest.set_timer_precise(timeout).unwrap(); + for _ in 0..BULK_INJECTION_COUNT { + guest.inject(Getpid::new()).await.unwrap(); + } + guest.tail_inject(Getpid::new()).await + } + Sysno::msgget => { + guest.set_timer(timeout).unwrap(); + for _ in 0..BULK_INJECTION_COUNT { + guest.inject(Getpid::new()).await.unwrap(); + } + guest.tail_inject(Getpid::new()).await + } + Sysno::clock_settime => { + let clock_value = guest.read_clock().unwrap(); + let ts = guest.thread_state_mut(); + ts.last_tick = clock_value; + ts.timer_assertion = None; + } + Sysno::timer_gettime => { + let clock_value = guest.read_clock().unwrap(); + let ts = guest.thread_state_mut(); + ts.last_tick = clock_value; + ts.timer_assertion = Some(args.arg0); + } + Sysno::clock_adjtime => assert_eq!( + guest.read_clock().unwrap(), + guest.thread_state_mut().last_tick + args.arg0 + ), + _ => guest.tail_inject(syscall).await, + }; + Ok(0) + } + + async fn handle_timer_event>(&self, guest: &mut T) { + 
guest.send_rpc(IncrMsg::Timer).await.unwrap(); + let clock_value = guest.read_clock().unwrap(); + let ts = guest.thread_state(); + if let Some(val) = ts.timer_assertion { + assert_eq!(ts.last_tick + val, clock_value); + } + } + + async fn handle_signal_event>( + &self, + guest: &mut T, + signal: Signal, + ) -> Result, Errno> { + guest.send_rpc(IncrMsg::Signal).await.unwrap(); + Ok(Some(signal)) + } +} + +async fn raise_sigwinch>(guest: &mut T) -> Tgkill { + let pid = guest.inject(Getpid::new()).await.unwrap(); + let tid = guest.inject(Gettid::new()).await.unwrap(); + Tgkill::new() + .with_tgid(pid as _) + .with_tid(tid as _) + .with_sig(libc::SIGWINCH) +} + +#[cfg(all(target_os = "linux", target_arch = "x86_64"))] +unsafe fn syscall_no_branches(no: libc::c_long, arg1: libc::c_long) { + llvm_asm!(" + mov $0, %rax + mov $1, %rdi + xor %rsi, %rsi + xor %rdx, %rdx + xor %r10, %r10 + xor %r8, %r8 + xor %r9, %r9 + syscall + " + : /* no output */ + : "r"(no), "r"(arg1) + : "cc", "rax", "rdi", "rsi", "rdx", "r10", "r8", "r9", /* from syscall: */ "rcx", "r11" + ); +} + +fn sched_precise() { + unsafe { syscall_no_branches(libc::SYS_clock_getres, 0) } +} + +fn sched_precise_alternate_rcb_count() { + unsafe { syscall_no_branches(libc::SYS_msgrcv, 0) } +} + +fn sched_imprecise() { + unsafe { syscall_no_branches(libc::SYS_timer_getoverrun, 0) } +} + +fn mark_clock() { + unsafe { syscall_no_branches(libc::SYS_clock_settime, 0) } +} + +fn assert_clock(delta: u64) { + unsafe { syscall_no_branches(libc::SYS_clock_adjtime, delta as i64) } +} + +fn assert_clock_at_next_timer(value: u64) { + unsafe { syscall_no_branches(libc::SYS_timer_gettime, value as i64) } +} + +fn do_syscall() { + unsafe { syscall_no_branches(libc::SYS_clock_gettime, 0) } +} + +fn immediate_exit() { + unsafe { syscall_no_branches(libc::SYS_exit, 0) } +} + +fn sched_precise_and_raise() { + unsafe { syscall_no_branches(libc::SYS_fanotify_init, 0) } +} + +fn sched_imprecise_and_raise() { + unsafe { 
syscall_no_branches(libc::SYS_fanotify_mark, 0) } +} + +fn sched_precise_and_inject() { + unsafe { syscall_no_branches(libc::SYS_msgctl, 0) } +} + +fn sched_imprecise_and_inject() { + unsafe { syscall_no_branches(libc::SYS_msgget, 0) } +} + +fn cpuid() { + unsafe { + __cpuid(0); + } +} + +fn rdtsc() { + unsafe { + _rdtsc(); + } +} + +fn rdtscp() { + unsafe { + let mut x = 0u32; + __rdtscp(&mut x as *mut _); + } +} + +fn ts_check_fn(rcbs: u64, f: impl FnOnce()) -> GlobalState { + use reverie_ptrace::testing::check_fn_with_config; + check_fn_with_config::( + f, + Config { + timeout_rcbs: rcbs, + ..Default::default() + }, + true, + ) +} + +const MANY_RCBS: u64 = 10000; // Normal perf signaling +const LESS_RCBS: u64 = 15; // Low enough to use artificial signaling + +#[cfg(all(not(sanitized), test))] +mod timer_tests { + //! These tests are highly sensitive to the number of branches executed + //! in the guest, and this must remain consistent between opt and debug + //! mode. If you pass non-constant values into do_branches and need them to + //! be exact, be sure to precompute them in the tracer before moving them + //! into the tracee, otherwise underflow or overflow checks will break the + //! tests. 
+ + use super::*; + use reverie_ptrace::ret_without_perf; + use reverie_ptrace::testing::{check_fn_with_config, do_branches}; + use test_case::test_case; + + #[test_case(MANY_RCBS, sched_precise)] + #[test_case(MANY_RCBS, sched_imprecise)] + #[test_case(LESS_RCBS, sched_precise)] + #[test_case(LESS_RCBS, sched_imprecise)] + fn timer_delays_timer(rcbs: u64, schedule_timer: fn() -> ()) { + ret_without_perf!(); + let rcbd2 = rcbs / 2; + let rcbx2 = rcbs * 2; + let gs = ts_check_fn(rcbs, move || { + schedule_timer(); + do_branches(rcbd2); + schedule_timer(); + do_branches(rcbd2); + schedule_timer(); + do_branches(rcbd2); + schedule_timer(); + do_branches(rcbx2); + }); + assert_eq!(gs.num_timer_evts.into_inner(), 1); + } + + #[test_case(MANY_RCBS, sched_precise)] + #[test_case(MANY_RCBS, sched_imprecise)] + #[test_case(LESS_RCBS, sched_precise)] + #[test_case(LESS_RCBS, sched_imprecise)] + fn timer_is_single(rcbs: u64, schedule_timer: fn() -> ()) { + ret_without_perf!(); + let rcbx2 = rcbs * 2; + let rcbx10 = rcbs * 10; + let rcbx20 = rcbs * 20; + let gs = ts_check_fn(rcbs, move || { + schedule_timer(); + do_branches(rcbx2); + schedule_timer(); + do_branches(rcbx10); + schedule_timer(); + do_branches(rcbx20); + }); + assert_eq!(gs.num_timer_evts.into_inner(), 3); + } + + #[test_case(MANY_RCBS, sched_precise)] + #[test_case(MANY_RCBS, sched_precise_and_inject)] + #[test_case(LESS_RCBS, sched_precise)] + #[test_case(LESS_RCBS, sched_precise_and_inject)] + fn precise_delivery_exact(rcbs: u64, schedule_timer: fn() -> ()) { + ret_without_perf!(); + + // `do_branches(n)` does n+1 branches, so `rcbs - 1` will be the + // first argument resulting in a timer event. 
+ // Precompute to avoid underflow checks in the guest + + let branch_ct = rcbs - 2; + let gs3 = ts_check_fn(rcbs, move || { + schedule_timer(); + do_branches(branch_ct); + immediate_exit(); + }); + assert_eq!(gs3.num_timer_evts.into_inner(), 0); + + let branch_ct = rcbs - 1; + let gs2 = ts_check_fn(rcbs, move || { + schedule_timer(); + do_branches(branch_ct); + immediate_exit(); + }); + assert_eq!(gs2.num_timer_evts.into_inner(), 1); + } + + fn early_stop_rcbs(rcbs: u64) -> Vec { + // Final value is 2 because do_branches adds 1 + [rcbs / 2, 1000, 100, 10, 2] + .iter() + .map(|x| *x) + .filter(|x| *x < rcbs) + .collect() + } + const LET_PASS_STOP_RCBS: u64 = 1; + + #[test_case(MANY_RCBS, do_syscall, sched_precise, 0)] + #[test_case(MANY_RCBS, do_syscall, sched_imprecise, 0)] + #[test_case(MANY_RCBS, cpuid, sched_precise, 0)] + #[test_case(MANY_RCBS, cpuid, sched_imprecise, 0)] + #[test_case(MANY_RCBS, rdtsc, sched_precise, 0)] + #[test_case(MANY_RCBS, rdtsc, sched_imprecise, 0)] + #[test_case(MANY_RCBS, rdtscp, sched_precise, 0)] + #[test_case(MANY_RCBS, rdtscp, sched_imprecise, 0)] + #[test_case(MANY_RCBS, sched_precise, sched_precise, 1)] + #[test_case(MANY_RCBS, sched_imprecise, sched_precise, 1)] + #[test_case(MANY_RCBS, sched_imprecise, sched_imprecise, 1)] + #[test_case(MANY_RCBS, sched_precise, sched_imprecise, 1)] + #[test_case(LESS_RCBS, do_syscall, sched_precise, 0)] + #[test_case(LESS_RCBS, do_syscall, sched_imprecise, 0)] + #[test_case(LESS_RCBS, cpuid, sched_precise, 0)] + #[test_case(LESS_RCBS, cpuid, sched_imprecise, 0)] + #[test_case(LESS_RCBS, rdtsc, sched_precise, 0)] + #[test_case(LESS_RCBS, rdtsc, sched_imprecise, 0)] + #[test_case(LESS_RCBS, rdtscp, sched_precise, 0)] + #[test_case(LESS_RCBS, rdtscp, sched_imprecise, 0)] + #[test_case(LESS_RCBS, sched_precise, sched_precise, 1)] + #[test_case(LESS_RCBS, sched_imprecise, sched_precise, 1)] + #[test_case(LESS_RCBS, sched_imprecise, sched_imprecise, 1)] + #[test_case(LESS_RCBS, sched_precise, 
sched_imprecise, 1)] + fn assert_cancels_timers( + rcbs: u64, + fun: fn() -> (), + schedule_timer: fn() -> (), + additional_evts: u64, + ) { + ret_without_perf!(); + let rcbx2 = rcbs * 2; + for e in early_stop_rcbs(rcbs) { + // Precompute to avoid underflow checks in the guest + let branch_ct = rcbs - e; + let gs = ts_check_fn(rcbs, move || { + schedule_timer(); + do_branches(branch_ct); + fun(); + do_branches(rcbx2); + }); + assert_eq!( + gs.num_timer_evts.into_inner(), + 0 + additional_evts, + "iter: {}", + e + ); + } + // Imprecise events can be delayed, in which case nothing fires, so only + // test this if precise: + if schedule_timer == sched_precise { + // Precompute to avoid underflow checks in the guest + let branch_ct = rcbs - LET_PASS_STOP_RCBS; + let gs = ts_check_fn(rcbs, move || { + schedule_timer(); + do_branches(branch_ct); + fun(); + do_branches(rcbx2); + }); + assert_eq!(gs.num_timer_evts.into_inner(), 1 + additional_evts); + } + } + + #[test_case(MANY_RCBS, sched_precise, sched_precise_and_raise)] + #[test_case(MANY_RCBS, sched_imprecise, sched_imprecise_and_raise)] + #[test_case(LESS_RCBS, sched_precise, sched_precise_and_raise)] + #[test_case(LESS_RCBS, sched_imprecise, sched_imprecise_and_raise)] + fn signals_cancel_timers( + rcbs: u64, + schedule_timer: fn() -> (), + schedule_timer_and_raise: fn() -> (), + ) { + ret_without_perf!(); + let rcbd2 = rcbs / 2; + let rcbx2 = rcbs * 2; + + // The signal after scheduling should immediately cancel the event + let gs = ts_check_fn(rcbs, move || { + schedule_timer(); + do_branches(rcbd2); + schedule_timer_and_raise(); + do_branches(rcbx2); + schedule_timer_and_raise(); + do_branches(rcbx2); + }); + assert_eq!(gs.num_signals.into_inner(), 2); // defensive + assert_eq!(gs.num_timer_evts.into_inner(), 0); + + // If we don't raise, events delivered as expected: + let gs = ts_check_fn(rcbs, move || { + schedule_timer(); + do_branches(rcbd2); + schedule_timer(); + do_branches(rcbx2); + schedule_timer(); + 
do_branches(rcbx2); + }); + assert_eq!(gs.num_signals.into_inner(), 0); // defensive + assert_eq!(gs.num_timer_evts.into_inner(), 2); + } + + #[test_case(MANY_RCBS, sched_precise)] + #[test_case(MANY_RCBS, sched_imprecise)] + #[test_case(LESS_RCBS, sched_precise)] + #[test_case(LESS_RCBS, sched_imprecise)] + fn not_subscribed_doesnt_cancel(rcbs: u64, schedule_timer: fn() -> ()) { + ret_without_perf!(); + let rcbd2 = rcbs / 2; + let rcbx2 = rcbs * 2; + let gs = check_fn_with_config::( + move || { + schedule_timer(); + do_branches(rcbd2); + cpuid(); + rdtsc(); + rdtscp(); + do_branches(rcbx2); + }, + Config { + timeout_rcbs: rcbs, + sub_syscalls_only: true, + ..Default::default() + }, + true, + ); + assert_eq!(gs.num_timer_evts.into_inner(), 1); + } + + fn loop_with_branch_ct( + rcbs: u64, + branch_ct: u64, + iters: u64, + schedule_timer: fn() -> (), + ) -> GlobalState { + ts_check_fn(rcbs, move || { + for _ in 0..iters { + schedule_timer(); + do_branches(branch_ct); + schedule_timer(); // cancel timer before loop branch + } + immediate_exit(); // RCBs in teardown would trigger the last iter's event + }) + } + + #[test_case(MANY_RCBS, sched_precise)] + #[test_case(MANY_RCBS, sched_precise_and_inject)] + #[test_case(LESS_RCBS, sched_precise)] + #[test_case(LESS_RCBS, sched_precise_and_inject)] + fn precise_deterministic_loop(rcbs: u64, schedule_timer: fn() -> ()) { + ret_without_perf!(); + let rcbm2 = rcbs - 2; + let rcbm1 = rcbs - 1; + const ITERS: u64 = 500; + assert_eq!( + loop_with_branch_ct(rcbs, rcbm2, ITERS, schedule_timer) + .num_timer_evts + .into_inner(), + 0 + ); + assert_eq!( + loop_with_branch_ct(rcbs, rcbm1, ITERS, schedule_timer) + .num_timer_evts + .into_inner(), + ITERS + ); + } + + #[test_case(MANY_RCBS, sched_imprecise)] + #[test_case(MANY_RCBS, sched_imprecise_and_inject)] + #[test_case(LESS_RCBS, sched_imprecise)] + #[test_case(LESS_RCBS, sched_imprecise_and_inject)] + fn imprecise_not_early_loop(rcbs: u64, schedule_timer: fn() -> ()) { + 
ret_without_perf!(); + const ITERS: u64 = 2000; + let rcbm2 = rcbs - 2; + assert_eq!( + loop_with_branch_ct(rcbs, rcbm2, ITERS, schedule_timer) + .num_timer_evts + .into_inner(), + 0 + ); + } + + /// Regression test: Verify that a shorter event isn't cancelled by the + /// occurrence of longer one if the two happen to align. + #[test] + fn long_short_not_cancelled() { + ret_without_perf!(); + // Doing this test correctly requires correctly predicting the amount of + // skid in the underlying RCB signal. For that reason, we run the gamut + // of possibilities in increments of 5, several times for each. + const ITERS: usize = 50; + + // <============ MANY_RCBS ==============> + // <===== SKID_MARGIN ====> + // <= skid ==> + // <= overlap => + // ---------------------------------------------------- + // ^ ^ | ^ ^ + // sched timeout | signal delivery + // | + // `> schedule short event to cause step overlap + // schedule time is therefore + // MANY_RCBS - (SKID_MARGIN - skid) - (LESS_RCBS / 2) + // skid_param = SKID_MARGIN - skid + for skid_param in (0u64..200).step_by(5) { + let branch_ct = MANY_RCBS - skid_param - (LESS_RCBS / 2); + for _ in 0..ITERS { + let gs = check_fn_with_config::( + move || { + sched_precise(); + do_branches(branch_ct); + sched_precise_alternate_rcb_count(); + do_branches(MANY_RCBS * 10); + }, + Config { + timeout_rcbs: MANY_RCBS, + timeout_rcbs_alternate: LESS_RCBS, + ..Default::default() + }, + true, + ); + assert_eq!(gs.num_timer_evts.into_inner(), 1); + } + } + } +} + +#[cfg(all(not(sanitized), test))] +mod clock_tests { + use super::*; + use reverie_ptrace::ret_without_perf; + use reverie_ptrace::testing::{check_fn, do_branches}; + use test_case::test_case; + + #[test] + fn clock_accuracy() { + ret_without_perf!(); + for r in [100, 1000, 10000, 100000, 1000000] { + let rp1 = r + 1; // precompute + check_fn::(move || { + mark_clock(); + do_branches(r); + assert_clock(rp1); + }); + } + } + + #[test] + fn clock_stays_without_branch() { + 
ret_without_perf!(); + let r = 2000; + let rp1 = r + 1; // precompute + check_fn::(move || { + mark_clock(); + do_branches(r); + assert_clock(rp1); + assert_clock(rp1); + assert_clock(rp1); + assert_clock(rp1); + assert_clock(rp1); + assert_clock(rp1); + assert_clock(rp1); + assert_clock(rp1); + assert_clock(rp1); + }); + } + + #[test_case(MANY_RCBS, sched_precise)] + #[test_case(MANY_RCBS, sched_imprecise)] + #[test_case(LESS_RCBS, sched_precise)] + #[test_case(LESS_RCBS, sched_imprecise)] + fn clock_with_timer(rcbs: u64, schedule_timer: fn() -> ()) { + ret_without_perf!(); + let a = rcbs * 2; + let b = a + 1; + let c = rcbs / 2; + let d = c + 1; + let gs = ts_check_fn(rcbs, move || { + mark_clock(); + schedule_timer(); + do_branches(a); + assert_clock(b); // timer received + + mark_clock(); + schedule_timer(); + do_branches(c); + assert_clock(d); // timer outstanding + }); + assert_eq!(gs.num_timer_evts.into_inner(), 1); + } + + #[test_case(MANY_RCBS)] + #[test_case(LESS_RCBS)] + fn clock_at_timer_delivery(rcbs: u64) { + ret_without_perf!(); + let rcbx2 = rcbs * 2; + let gs = ts_check_fn(rcbs, move || { + mark_clock(); + assert_clock_at_next_timer(rcbs); + sched_precise(); + do_branches(rcbx2); + }); + assert_eq!(gs.num_timer_evts.into_inner(), 1); + } +} + +#[cfg(all(not(sanitized), test))] +mod general { + use super::*; + use reverie_ptrace::ret_without_perf; + use reverie_ptrace::testing::check_fn_with_config; + + #[test] + fn basic() { + ret_without_perf!(); + let _gs = check_fn_with_config::( + move || { + do_syscall(); + }, + Config { + run_basic_tests: true, + ..Default::default() + }, + true, + ); + } +} diff --git a/tests/vdso.rs b/tests/vdso.rs new file mode 100644 index 0000000..4caf774 --- /dev/null +++ b/tests/vdso.rs @@ -0,0 +1,88 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use reverie::{ + syscalls::{Errno, MemoryAccess, Syscall}, + Error, Guest, Tool, +}; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct LocalState; + +#[reverie::tool] +impl Tool for LocalState { + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + match syscall { + Syscall::ClockGettime(_gettime) => Err(Errno::EINVAL.into()), + Syscall::Gettimeofday(gettimeofday) => { + let retval = guest.inject(syscall).await?; + if let Some(tod) = gettimeofday.tv() { + let mut tv = guest.memory().read_value(tod)?; + tv.tv_usec = (tv.tv_usec / 1000) * 1000 + 345; + guest.memory().write_value(tod, &tv)?; + } + + Ok(retval) + } + Syscall::Getcpu(getcpu) => { + if let Some(cpu) = getcpu.cpu() { + guest.memory().write_value(cpu, &0)?; + } + Ok(0) + } + otherwise => guest.tail_inject(otherwise).await, + } + } +} + +#[cfg(all(not(sanitized), test))] +mod tests { + use super::*; + use reverie_ptrace::testing::check_fn; + use std::{mem::MaybeUninit, time}; + + #[test] + #[should_panic] + fn run_guest_vdso_tod_test() { + check_fn::(|| { + // this calls clock_gettime. + let _now = time::Instant::now(); + }); + } + + #[test] + fn run_guest_vdso_getcpu_test() { + check_fn::(|| { + // this calls getcpu via vdso. + let cpu = unsafe { libc::sched_getcpu() }; + // NB: getcpu in vdso area always set cpu to 0. + // see symbol __vdso_getcpu. 
+ assert_eq!(cpu, 0); + }); + } + + #[test] + fn run_guest_vdso_gettimeofday_test() { + check_fn::(|| { + let mut tod = MaybeUninit::zeroed(); + let mut tz = MaybeUninit::zeroed(); + assert_eq!( + unsafe { libc::gettimeofday(tod.as_mut_ptr(), tz.as_mut_ptr()) }, + 0 + ); + let tod = unsafe { tod.assume_init() }; + assert_eq!(tod.tv_usec % 1000, 345); + }); + } +} diff --git a/tests/vfork.rs b/tests/vfork.rs new file mode 100644 index 0000000..e37d4c0 --- /dev/null +++ b/tests/vfork.rs @@ -0,0 +1,178 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// vfork handling related tests. + +use reverie::{ + syscalls::{Syscall, SyscallArgs, SyscallInfo}, + Error, Guest, Tool, +}; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct LocalStateVfork; + +#[derive(Debug, Serialize, Deserialize, Default, Clone)] +struct LocalStateVforkClone; + +#[reverie::tool] +impl Tool for LocalStateVfork { + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + match syscall { + Syscall::Vfork(_) => { + let (_, args) = syscall.into_parts(); + eprintln!( + "[pid = {}] tail_inject vfork (unchanged), args: {:x?}", + guest.tid(), + args + ); + guest.tail_inject(syscall).await + } + _ => guest.tail_inject(syscall).await, + } + } +} + +#[reverie::tool] +impl Tool for LocalStateVforkClone { + async fn handle_syscall_event>( + &self, + guest: &mut T, + syscall: Syscall, + ) -> Result { + match syscall { + Syscall::Vfork(_) => { + let (_, args) = syscall.into_parts(); + // NB: glibc's vfork is an assembly function, it uses %%rdi as return address (on stack) + // vfork is very tricky because child/parent share the same stack. see P153347946 for + // a bit more context.
+ let raw: SyscallArgs = SyscallArgs { + arg0: (libc::CLONE_VFORK | libc::CLONE_VM | libc::SIGCHLD) as u64, + arg1: 0, + arg2: 0, + arg3: 0, + arg4: 0, + arg5: 0, + }; + eprintln!( + "[pid = {}] inject vfork as clone, old arg: {:x?}, injected arg: {:x?}", + guest.tid(), + args, + raw + ); + guest.tail_inject(reverie::syscalls::Clone::from(raw)).await + } + _ => guest.tail_inject(syscall).await, + } + } +} + +#[cfg(all(not(sanitized), test))] +mod tests { + use super::*; + use nix::{ + sys::wait::{self, WaitStatus}, + unistd::Pid, + }; + use reverie_ptrace::testing::check_fn; + use std::ffi::CString; + + #[derive(Clone, Copy)] + enum VforkTestFlag { + ImplicitExit, // implicit exit, will run exit_handlers. + ExplicitExit, // explicit exit, exit_handlers ignored. + Execve, // call execve. + } + + fn implicit_exit(code: i32) -> ! { + unsafe { libc::exit(code) } + } + + fn vfork_test_helper(flag: VforkTestFlag) { + #[allow(deprecated)] + let pid = unsafe { libc::vfork() } as i32; + assert!(pid >= 0); + + if pid > 0 { + let pid = Pid::from_raw(pid); + let status = wait::waitpid(Some(pid), None).unwrap(); + assert_eq!(status, WaitStatus::Exited(pid, 0)); + } else { + // do something trivial to make sure the stack is altered. + let tp = libc::timespec { + tv_sec: 0, + tv_nsec: 10_000_000, + }; + unsafe { + libc::clock_nanosleep( + libc::CLOCK_MONOTONIC, + 0, + &tp as *const _, + std::ptr::null_mut(), + ) + }; + match flag { + VforkTestFlag::ExplicitExit => { + let _ = unsafe { libc::syscall(libc::SYS_exit_group, 0) }; + } + VforkTestFlag::ImplicitExit => { + // we should still call libc::exit here. Because `vfork` is not well + // supported by rust. see https://github.com/rust-lang/libc/pull/1574. + // note we've enabled #[ffi_return_twice], but if we don't call + // libc::exit(0) here, we'd end up calling library/std/src/sys/unix/os.rs + // then reached the `ud2` (inserted by never return type) instruction and + // get SIGILL.
So it seems even #[ffi_return_twice] doesn't fix the whole + // issue. (The difference might be calling library implicit exit may have + // extra heap allocation). + implicit_exit(0) + } + VforkTestFlag::Execve => { + let program = CString::new("/bin/date").unwrap(); + let env = CString::new("PATH=/bin:/usr/bin").unwrap(); + let res = nix::unistd::execve(&program, &[&program], &[&env]); + assert!(!res.is_err()); + } + } + } + } + + #[test] + fn vfork_then_exit_group() { + check_fn::(|| vfork_test_helper(VforkTestFlag::ExplicitExit)); + } + + #[test] + fn vfork_then_implicit_exit() { + check_fn::(|| vfork_test_helper(VforkTestFlag::ImplicitExit)); + } + + #[test] + fn vfork_then_execve() { + check_fn::(|| vfork_test_helper(VforkTestFlag::Execve)); + } + + #[test] + fn vfork_into_clone_then_exit_group() { + check_fn::(|| vfork_test_helper(VforkTestFlag::ExplicitExit)); + } + + #[test] + fn vfork_into_clone_then_implicit_exit() { + check_fn::(|| vfork_test_helper(VforkTestFlag::ImplicitExit)); + } + + #[test] + fn vfork_into_clone_then_execve() { + check_fn::(|| vfork_test_helper(VforkTestFlag::Execve)); + } +}