diff --git a/experimental/scrape-syscalls/main.rs b/experimental/scrape-syscalls/main.rs new file mode 100644 index 0000000..3049849 --- /dev/null +++ b/experimental/scrape-syscalls/main.rs @@ -0,0 +1,272 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +mod syscall_info; + +use std::collections::HashMap; +use std::fs; +use std::io; +use std::io::Write; +use std::mem; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; +use std::process::Stdio; +use std::slice; + +use clap::Parser; +use goblin::elf::section_header::SectionHeader; +use goblin::elf::sym::Sym; +use goblin::elf::Elf; +use scroll::Pread; + +use self::syscall_info::*; + +#[derive(Parser)] +struct Opts { + /// Path to the vmlinux ELF image. By default, `/boot/vmlinux-$(uname -r)` + /// is used. This must contain debug information so that we can search + /// through the symbol table for syscall metadata symbols. + #[clap()] + vmlinux: Option, + + /// Outputs the syscall list as Rust source code. + #[clap(long)] + rust: bool, +} + +/// Metadata associated with a syscall. This is defined in +/// [`include/trace/syscall.h`][syscall_metadata]. +/// +/// [syscall_metadata]: https://elixir.bootlin.com/linux/v5.5.3/source/include/trace/syscall.h +#[repr(C)] +#[derive(Debug, Pread)] +struct SyscallMetadata { + /// Address of the syscall name. + name: u64, + + /// Number of the syscall. This is always set to -1. The kernel sets this to + /// the real syscall number upon boot. Since we are reading the ELF file, we + /// need to look through `sys_call_table` to find out the real syscall + /// number. + syscall_nr: libc::c_int, + + /// Number of parameters it takes. + nb_args: libc::c_int, + + /// Address of the list of parameter types as strings. + types: u64, + + /// Address of the list of parameter names as strings (args[i] matches types[i]). 
+ args: u64, + // Don't care about these fields. + //struct list_head enter_fields; + //struct trace_event_call *enter_event; + //struct trace_event_call *exit_event; +} + +fn find_sym(elf: &Elf, search: &str) -> Option { + for sym in elf.syms.iter() { + if let Some(name) = elf.strtab.get_at(sym.st_name) { + if name == search { + return Some(sym); + } + } + } + + None +} + +fn find_section<'a>(elf: &'a Elf, search: &'a str) -> Option<&'a SectionHeader> { + for sh in &elf.section_headers { + if let Some(name) = elf.shdr_strtab.get_at(sh.sh_name) { + if name == search { + return Some(sh); + } + } + } + + None +} + +fn get_array<'a, T>(elf: &'a Elf, buf: &'a [u8], sym: Sym) -> &'a [T] { + let count = sym.st_size as usize / mem::size_of::(); + + let sh = &elf.section_headers[sym.st_shndx]; + + let offset = sym.st_value - sh.sh_addr + sh.sh_offset; + + unsafe { slice::from_raw_parts(buf.as_ptr().add(offset as usize).cast::(), count) } +} + +fn sym_offset(elf: &Elf, sym: Sym) -> usize { + let sh = &elf.section_headers[sym.st_shndx]; + let offset = sym.st_value - sh.sh_addr + sh.sh_offset; + offset as usize +} + +fn get_syscall_table<'a>(elf: &'a Elf, buf: &'a [u8]) -> Option<&'a [libc::c_ulong]> { + let sym = find_sym(elf, "sys_call_table")?; + Some(get_array(elf, buf, sym)) +} + +fn syscall_list(path: &Path) -> Result, Box> { + let buf = fs::read(path)?; + + let elf = Elf::parse(&buf)?; + + // This is a table of all the addresses of architecture-specific syscall + // symbols. We use this to create a mapping of symbol names to syscall IDs. + let table = match get_syscall_table(&elf, &buf) { + Some(table) => table, + None => return Err("Failed to find `sys_call_table`".into()), + }; + + // The syscall symbols live in this section. + let text_section = match find_section(&elf, ".text") { + Some(sec) => sec, + None => return Err("Failed to find .text section".into()), + }; + + // The `types`/`args` pointer arrays are resolved relative to this section. 
+ let data_section = match find_section(&elf, ".data") { + Some(sec) => sec, + None => return Err("Failed to find .data section".into()), + }; + + // Create a mapping of syscall symbol names to syscall numbers. + let mapping: HashMap<_, _> = elf + .syms + .iter() + .filter_map(|sym| { + table + .iter() + .position(|&addr| addr == sym.st_value) + .and_then(|id| { + // Look up the name, stripping off the `__x64_` prefix. + // TODO: Don't assume x64 architecture. Derive the prefix somehow. + let name = elf.strtab.get_at(sym.st_name)?.strip_prefix("__x64_")?; + Some((name, id)) + }) + }) + .collect(); + + let mut list = Vec::new(); + + for sym in elf.syms.iter() { + if let Some(sym_name) = elf.strtab.get_at(sym.st_name) { + if sym_name.starts_with("__syscall_meta") { + let syscall: SyscallMetadata = buf.pread(sym_offset(&elf, sym))?; + + // Look up the name in the .text section. + let name_offset = syscall.name - text_section.sh_addr + text_section.sh_offset; + let name: &str = buf.pread(name_offset as usize)?; + if name == "sys_ni_syscall" { + // This is a placeholder for syscalls that are not implemented. 
+ continue; + } + + if let Some(&nr) = mapping.get(name) { + let mut types = Vec::new(); + + // Chase pointers and gather arg types + if syscall.types != 0 { + let mut types_offset = (syscall.types - data_section.sh_addr + + data_section.sh_offset) + as usize; + + for _ in 0..syscall.nb_args { + let addr: u64 = buf.gread(&mut types_offset)?; + let offset = addr - text_section.sh_addr + text_section.sh_offset; + let name: &str = buf.pread(offset as usize)?; + types.push(name); + } + } + + let mut args = Vec::new(); + + // Chase pointers and gather arg names + if syscall.args != 0 { + let mut args_offset = + (syscall.args - data_section.sh_addr + data_section.sh_offset) as usize; + + for _ in 0..syscall.nb_args { + let addr: u64 = buf.gread(&mut args_offset)?; + let offset = addr - text_section.sh_addr + text_section.sh_offset; + let name: &str = buf.pread(offset as usize)?; + args.push(name); + } + } + + let params = types + .into_iter() + .zip(args) + .map(|(t, a)| (t.to_string(), a.to_string())) + .collect(); + + list.push(SyscallInfo { + num: nr, + name: name.into(), + params, + }); + } + } + } + } + + Ok(list) +} + +fn main() -> Result<(), Box> { + let args = Opts::from_args(); + + let vmlinux = match args.vmlinux { + Some(vmlinux) => vmlinux, + None => { + // Use /boot/vmlinux-$(uname -r) by default. + PathBuf::from(format!( + "/boot/vmlinux-{}", + nix::sys::utsname::uname() + .expect("Failed getting uname.") + .release() + .to_str() + .ok_or("OsStr release is not a valid Unicode.")? 
+ )) + } + }; + + let mut list = syscall_list(&vmlinux)?; + list.sort_by_key(|syscall| syscall.num); + + if args.rust { + generate_rust(&list)?; + } else { + for syscall in &list { + println!("{}", syscall); + } + } + + Ok(()) +} + +fn generate_rust(syscalls: &[SyscallInfo]) -> io::Result<()> { + let mut child = Command::new("rustfmt").stdin(Stdio::piped()).spawn()?; + + let mut f = child.stdin.take().unwrap(); + + writeln!( + &mut f, + "use syscalls::{{Errno, Sysno, syscall0, syscall1, syscall2, syscall3, syscall4, syscall5, syscall6}};\n" + )?; + + for syscall in syscalls { + writeln!(&mut f, "{}", syscall.display_as_rust())?; + } + + Ok(()) +} diff --git a/experimental/scrape-syscalls/syscall_info.rs b/experimental/scrape-syscalls/syscall_info.rs new file mode 100644 index 0000000..258b162 --- /dev/null +++ b/experimental/scrape-syscalls/syscall_info.rs @@ -0,0 +1,301 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +use core::fmt; + +/// Contains information about a syscall, including its parameters. 
+pub struct SyscallInfo { + pub num: usize, + pub name: String, + pub params: Vec<(String, String)>, +} + +impl fmt::Display for SyscallInfo { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "{:>3} => {}({})", + self.num, + self.name, + self.params + .iter() + .map(|(t, a)| format!("{} {}", t, a)) + .collect::>() + .join(", ") + ) + } +} + +impl SyscallInfo { + pub fn display_as_rust(&self) -> RustSyscall { + RustSyscall(self) + } +} + +pub struct RustSyscall<'a>(&'a SyscallInfo); + +impl<'a> fmt::Display for RustSyscall<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let params = self + .0 + .params + .iter() + .map(|(t, a)| RustParam::new(t, a)) + .collect::>(); + + let name = translate_syscall(&self.0.name); + + writeln!( + f, + "/// See [{name}(2)](http://man7.org/linux/man-pages/man2/{name}.2.html)\n\ + /// for more info on this syscall.", + name = name + )?; + writeln!(f, "#[inline(always)]")?; + + writeln!( + f, + "pub unsafe fn sys_{}({}) -> Result {{", + name, + params + .iter() + .map(|p| format!("{}", p)) + .collect::>() + .join(", ") + )?; + + let idents = params + .iter() + .map(|p| format!("{} as u64", p.ident)) + .collect::>() + .join(", "); + + if params.is_empty() { + writeln!(f, " syscall0(Sysno::{})", name)?; + } else { + writeln!( + f, + " syscall{}(Sysno::{}, {})", + params.len(), + name, + idents + )?; + } + + writeln!(f, "}}") + } +} + +/// Format a parameter as a Rust parameter. +struct RustParam<'a> { + /// The type of the parameter. + ty: &'a str, + /// The identifier of the parameter. 
+ ident: &'a str, +} + +impl<'a> RustParam<'a> { + pub fn new(ty: &'a str, ident: &'a str) -> Self { + let ident = translate_ident(ident); + Self { ty, ident } + } +} + +impl<'a> fmt::Display for RustParam<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}: {}", self.ident, to_rust_type(self.ident, self.ty)) + } +} + +fn translate_syscall(name: &str) -> &str { + let name = name.strip_prefix("sys_").unwrap_or(name); + + match name { + "newstat" => "stat", + "newfstat" => "fstat", + "newlstat" => "lstat", + "sendfile64" => "sendfile", + "sysctl" => "_sysctl", + "umount" => "umount2", + "newuname" => "uname", + _ => name, + } +} + +fn translate_ident(ident: &str) -> &str { + match ident { + "type" => "r#type", + "usize" => "size", + _ => ident, + } +} + +/// Converts a C parameter type to a Rust type. Panics on an unrecognized type. +fn to_rust_type(ident: &str, ty: &str) -> &'static str { + match ty { + "char *" => match ident { + "buf" => "*mut u8", + _ => "*mut libc::c_char", + }, + "const char *" => match ident { + "buf" => "*const u8", + _ => "*const libc::c_char", + }, + "unsigned char *" => "*mut u8", + "const unsigned char *" => "*const u8", + "int" => "i32", + "int *" => "*mut i32", + "const int *" => "*const i32", + "u32" => "u32", + "u32 *" => "*mut u32", + "__u64" => "u64", + "__s32" => "i32", + "long" => "i64", + "unsigned" => "u32", + "unsigned *" => "*mut u32", + "unsigned int" => "u32", + "unsigned int *" => "*mut u32", + "size_t" => "usize", + "size_t *" => "*mut usize", + "unsigned long" => "u64", + "unsigned long *" => "*mut u64", + "const unsigned long *" => "*const u64", + "umode_t" => "libc::mode_t", + "struct stat *" => "*mut libc::stat", + "struct pollfd *" => "*mut libc::pollfd", + "off_t" => "libc::off_t", + "const struct sigaction *" => "*const libc::sigaction", + "struct sigaction *" => "*mut libc::sigaction", + "sigset_t *" => "*mut libc::sigset_t", + "const sigset_t *" => "*const libc::sigset_t", + "siginfo_t *" => "*mut libc::siginfo_t", 
+ "struct siginfo *" => "*mut libc::siginfo_t", + "loff_t" => "libc::loff_t", + "loff_t *" => "*mut libc::loff_t", + "const struct iovec *" => "*const libc::iovec", + "fd_set *" => "*mut libc::fd_set", + "struct __kernel_old_timeval *" => "*mut libc::timeval", + "key_t" => "libc::key_t", + "struct shmid_ds *" => "*mut libc::shmid_ds", + "struct __kernel_timespec *" => "*mut libc::timespec", + "const struct __kernel_timespec *" => "*const libc::timespec", + "struct __kernel_old_itimerval *" => "*mut libc::itimerval", + "struct sockaddr *" => "*mut libc::sockaddr", + "void *" => "*mut libc::c_void", + "const void *" => "*const libc::c_void", + "const void * *" => "*mut *const libc::c_void", + "struct user_msghdr *" => "*mut libc::msghdr", + "const char *const *" => "*const *const libc::c_char", + "pid_t" => "libc::pid_t", + "struct rusage *" => "*mut libc::rusage", + "struct new_utsname *" => "*mut libc::utsname", + "struct sembuf *" => "*mut libc::sembuf", + "struct msgbuf *" => "*mut libc::c_void", + "struct msqid_ds *" => "*mut libc::msqid_ds", + "struct linux_dirent *" => "*mut libc::dirent", + "struct linux_dirent64 *" => "*mut libc::dirent64", + "uid_t" => "libc::uid_t", + "uid_t *" => "*mut libc::uid_t", + "gid_t" => "libc::gid_t", + "gid_t *" => "*mut libc::gid_t", + "struct timezone *" => "*mut libc::timezone", + "struct rlimit *" => "*mut libc::rlimit", + "struct rlimit64 *" => "*mut libc::rlimit64", + "const struct rlimit64 *" => "*const libc::rlimit64", + "struct sysinfo *" => "*mut libc::sysinfo", + "struct tms *" => "*mut libc::tms", + // FIXME: See https://man7.org/linux/man-pages/man2/capget.2.html for + // the definition of cap_user_header_t and cap_user_data_t. 
+ "cap_user_header_t" => "*mut libc::c_void", + "cap_user_data_t" => "*mut libc::c_void", + "const cap_user_data_t" => "*const libc::c_void", + "stack_t *" => "*mut libc::stack_t", + "const stack_t *" => "*const libc::stack_t", + "struct utimbuf *" => "*mut libc::utimbuf", + // FIXME: This should be using libc::ustat, but that doesn't exist yet. + "struct ustat *" => "*mut libc::c_void", + "struct statfs *" => "*mut libc::statfs", + "struct sched_param *" => "*mut libc::sched_param", + // FIXME: No equivalent exists. See + // https://man7.org/linux/man-pages/man2/sysctl.2.html for definition. + "struct __sysctl_args *" => "*mut libc::c_void", + "struct __kernel_timex *" => "*mut libc::timex", + "qid_t" => "i32", + "__kernel_old_time_t *" => "*mut libc::time_t", + // aio_context_t is defined as a simple `unsigned long`. + "aio_context_t *" => "*mut u64", + "aio_context_t" => "u64", + // FIXME: io_event is defined at + // https://elixir.bootlin.com/linux/v5.16.11/source/include/uapi/linux/aio_abi.h#L60. + "struct io_event *" => "*mut libc::c_void", + // FIXME: See https://man7.org/linux/man-pages/man2/io_submit.2.html for + // definition of iocb. + "struct iocb * *" => "*mut *mut libc::c_void", + "struct iocb *" => "*mut libc::c_void", + "const clockid_t" => "libc::clockid_t", + "struct sigevent *" => "*mut libc::sigevent", + "const struct sigevent *" => "*mut libc::sigevent", + "timer_t *" => "*mut i32", + "timer_t" => "i32", + "const struct __kernel_itimerspec *" => "*const libc::itimerspec", + "struct __kernel_itimerspec *" => "*mut libc::itimerspec", + "struct epoll_event *" => "*mut libc::epoll_event", + "struct mq_attr *" => "*mut libc::mq_attr", + "const struct mq_attr *" => "*const libc::mq_attr", + "mqd_t" => "libc::mqd_t", + // FIXME: See https://man7.org/linux/man-pages/man2/kexec_load.2.html + // for definition of kexec_segment. 
+ "struct kexec_segment *" => "*mut libc::c_void", + "key_serial_t" => "i32", + // FIXME: robust_list_head is defined at + // https://elixir.bootlin.com/linux/v5.16.11/source/include/uapi/linux/futex.h#L97 + "struct robust_list_head *" => "*mut libc::c_void", + "struct robust_list_head * *" => "*mut *mut libc::c_void", + // FIXME: perf_event_attr is a big struct and no definition in libc + // exists. For the real definition, see + // https://man7.org/linux/man-pages/man2/perf_event_open.2.html. + "struct perf_event_attr *" => "*mut libc::c_void", + "struct mmsghdr *" => "*mut libc::mmsghdr", + // FIXME: See + // https://man7.org/linux/man-pages/man2/name_to_handle_at.2.html for + // definition of file_handle. + "struct file_handle *" => "*mut libc::c_void", + // NOTE: getcpu_cache is an opaque type and should never be accessed by + // user code. + "struct getcpu_cache *" => "*mut libc::c_void", + // FIXME: For definition of sched_attr, see: + // https://elixir.bootlin.com/linux/v5.16.11/source/include/uapi/linux/sched/types.h#L102 + "struct sched_attr *" => "*mut libc::c_void", + // FIXME: For definition of bpf_attr, see: + // https://man7.org/linux/man-pages/man2/bpf.2.html + "union bpf_attr *" => "*mut libc::c_void", + "rwf_t" => "i32", + "struct statx *" => "*mut libc::statx", + // FIXME: `const` pointer is mapped to `*mut` below (NOTE(review): confirm intent). For definition of __aio_sigset, see: + // https://elixir.bootlin.com/linux/v5.16.11/source/fs/aio.c#L2216 + "const struct __aio_sigset *" => "*mut libc::c_void", + // FIXME: For definition of rseq, see: + // https://elixir.bootlin.com/linux/v5.16.11/source/include/uapi/linux/rseq.h#L62 + "struct rseq *" => "*mut libc::c_void", + // FIXME: For definition of io_uring_params, see: + // https://elixir.bootlin.com/linux/v5.16.11/source/include/uapi/linux/io_uring.h#L265 + "struct io_uring_params *" => "*mut libc::c_void", + // FIXME: This is used by the clone3 syscall and libc doesn't have this + // yet. 
For the definition of clone_args, see: + // https://elixir.bootlin.com/linux/v5.16.11/source/include/uapi/linux/sched.h#L92 + "struct clone_args *" => "*mut libc::c_void", + // FIXME: This is used by the openat2 syscall and libc doesn't have this + // yet. For the definition of open_how, see: + // https://elixir.bootlin.com/linux/v5.16.11/source/include/uapi/linux/openat2.h#L19 + "struct open_how *" => "*mut libc::c_void", + _ => panic!( + "Don't know how to convert this syscall parameter to Rust: {} {}", + ident, ty + ), + } +} diff --git a/public_autocargo/experimental/Cargo.toml b/public_autocargo/experimental/Cargo.toml new file mode 100644 index 0000000..fc8ad34 --- /dev/null +++ b/public_autocargo/experimental/Cargo.toml @@ -0,0 +1,20 @@ +# @generated by autocargo + +[package] +name = "scrape-syscalls" +version = "0.1.0" +authors = ["Meta Platforms"] +edition = "2021" +license = "BSD-2-Clause" + +[[bin]] +name = "scrape_syscalls" +path = "scrape-syscalls/main.rs" +edition = "2018" + +[dependencies] +clap = { version = "3.2.23", features = ["derive", "env", "regex", "unicode", "wrap_help"] } +goblin = "0.5.2" +libc = "0.2.139" +nix = "0.25" +scroll = { version = "0.10", features = ["derive"] }