diff --git a/.gitignore b/.gitignore index c0d236b7e..e94983bbe 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ __pycache__ init/init examples/chroot_vm test-prefix +examples/rootfs_* diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..220304add --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,9 @@ +{ + "clangd.fallbackFlags": [ + "-I../include" + ], + "rust-analyzer.cargo.features": [ + "blk", + "net" + ] +} \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index d5cb6561d..0ce8624b8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -71,6 +71,7 @@ dependencies = [ "kvm-bindings", "kvm-ioctls", "libc", + "linux-loader", "smbios", "utils", "vm-fdt", diff --git a/examples/Makefile b/examples/Makefile index 3eedfa471..8c163059e 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -11,7 +11,7 @@ ROOTFS_DIR = rootfs_$(ROOTFS_DISTRO) .PHONY: clean rootfs -EXAMPLES := chroot_vm external_kernel +EXAMPLES := chroot_vm ifeq ($(SEV),1) EXAMPLES := launch-tee endif @@ -36,12 +36,6 @@ ifeq ($(OS),Darwin) codesign --entitlements chroot_vm.entitlements --force -s - $@ endif -external_kernel: external_kernel.c - gcc -o $@ $< $(CFLAGS) $(LDFLAGS_$(ARCH)_$(OS)) -ifeq ($(OS),Darwin) - codesign --entitlements chroot_vm.entitlements --force -s - $@ -endif - # Build the rootfs to be used with chroot_vm. rootfs: mkdir -p $(ROOTFS_DIR) @@ -50,4 +44,4 @@ rootfs: podman rm libkrun_chroot_vm clean: - rm -rf chroot_vm $(ROOTFS_DIR) launch-tee boot_efi external_kernel + rm -rf chroot_vm $(ROOTFS_DIR) launch-tee boot_efi diff --git a/examples/chroot_vm.c b/examples/chroot_vm.c index 86321aeff..bbc6fa206 100644 --- a/examples/chroot_vm.c +++ b/examples/chroot_vm.c @@ -28,6 +28,12 @@ enum net_mode { NET_MODE_TSI, }; +#if defined(__x86_64__) +#define KERNEL_FORMAT KRUN_KERNEL_FORMAT_AUTO +#else +#define KERNEL_FORMAT KRUN_KERNEL_FORMAT_RAW +#endif + static void print_help(char *const name) { fprintf(stderr, @@ -37,8 +43,15 @@ static void print_help(char *const name) " --net=NET_MODE Set network mode\n" " --passt-socket=PATH Instead of starting passt, connect to passt socket at PATH" "NET_MODE can be either TSI (default) or PASST\n" + " --kernel Path for loading a kernel in place of one supplied by libkrunfw\n" + " --kernel-format Format of a custom kernel (default: autodetect)\n" + " --kernel-cmdline Cmdline for externally-loaded kernel\n" + " --initrd-path Initrd for externally-loaded kernel (optional)\n" + " --boot-disk Add a boot disk (virtio-blk)\n" + " --data-disk Add a data disk (virtio-blk)\n" + " --loglevel Set a logging level (0-5)\n" "\n" - "NEWROOT: the root directory of the vm\n" + "NEWROOT: the root directory of the vm (virtio-fs)\n" "COMMAND: the command you want to execute in the vm\n" "COMMAND_ARGS: arguments of COMMAND\n", name @@ -49,6 +62,13 @@ static const struct option long_options[] = { { "help", no_argument, NULL, 'h' }, { "net_mode", required_argument, NULL, 'N' }, { "passt-socket", required_argument, NULL, 'P' }, + { "kernel", required_argument, NULL, 'k'}, + { "kernel-cmdline", required_argument, NULL, 'c'}, + { "initrd-path", required_argument, NULL, 'i'}, + { "boot-disk", required_argument, NULL, 'b'}, + { "data-disk", required_argument, NULL, 'd'}, + { "loglevel", required_argument, NULL, 'l'}, + { "kernel-format", required_argument, NULL, 'F'}, { NULL, 0, NULL, 0 } }; @@ -58,6 +78,13 @@ struct cmdline { char const *passt_socket_path; char const *new_root; char *const *guest_argv; + char const *boot_disk; + char const *data_disk; + char const *kernel_path; + char const *kernel_cmdline; + char const *initrd_path; + uint loglevel; + int kernel_format; }; bool parse_cmdline(int argc, char *const argv[], struct cmdline *cmdline) @@ -71,6 +98,13 @@ bool parse_cmdline(int argc, char *const argv[], struct cmdline *cmdline) .passt_socket_path = NULL, .new_root = NULL, .guest_argv = NULL, + .kernel_path = NULL, + .kernel_cmdline = NULL, + .initrd_path = NULL, + .boot_disk = NULL, + .data_disk = NULL, + .loglevel = 0, + .kernel_format = KERNEL_FORMAT, }; int option_index = 0; @@ -94,6 +128,27 @@ bool parse_cmdline(int argc, char *const argv[], struct cmdline *cmdline) case 'P': cmdline->passt_socket_path = optarg; break; + case 'k': + cmdline->kernel_path = optarg; + break; + case 'c': + cmdline->kernel_cmdline = optarg; + break; + case 'i': + cmdline->initrd_path = optarg; + break; + case 'b': + cmdline->boot_disk = optarg; + break; + case 'd': + cmdline->data_disk = optarg; + break; + case 'l': + cmdline->loglevel = atoi(optarg); + break; + case 'F': + cmdline->kernel_format = atoi(optarg); + break; case '?': return false; default: @@ -108,12 +163,17 @@ bool parse_cmdline(int argc, char *const argv[], struct cmdline *cmdline) return true; } + // User must either supply rootfs and command or boot disk + if (cmdline->boot_disk) { + return true; + } + if (optind >= argc - 1) { - fprintf(stderr, "Missing COMMAND argument\n"); + fprintf(stderr, "Missing COMMAND argument, but no boot disk has been specified\n"); } if (optind == argc) { - fprintf(stderr, "Missing NEWROOT argument\n"); + fprintf(stderr, "Missing NEWROOT argument, but no boot disk has been specified\n"); } return false; @@ -217,8 +277,7 @@ int main(int argc, char *const argv[]) return 0; } - // Set the log level to "off". - err = krun_set_log_level(0); + err = krun_set_log_level(cmdline.loglevel); if (err) { errno = -err; perror("Error configuring log level"); @@ -245,7 +304,18 @@ int main(int argc, char *const argv[]) rlim.rlim_cur = rlim.rlim_max; setrlimit(RLIMIT_NOFILE, &rlim); - if (err = krun_set_root(ctx_id, cmdline.new_root)) { + if (cmdline.boot_disk && (err = krun_add_disk(ctx_id, "boot", cmdline.boot_disk, 0))) { + errno = -err, + perror("Error configuring boot disk"); + return -1; + } + if (cmdline.data_disk && (err = krun_add_disk(ctx_id, "data", cmdline.data_disk, 0))) { + errno = -err, + perror("Error configuring data disk"); + return -1; + } + + if (cmdline.new_root && (err = krun_set_root(ctx_id, cmdline.new_root))) { errno = -err; perror("Error configuring root path"); return -1; @@ -295,12 +365,21 @@ int main(int argc, char *const argv[]) } // Specify the path of the binary to be executed in the isolated context, relative to the root path. - if (err = krun_set_exec(ctx_id, cmdline.guest_argv[0], (const char* const*) &cmdline.guest_argv[1], &envp[0])) { + if (cmdline.guest_argv && (err = krun_set_exec(ctx_id, cmdline.guest_argv[0], (const char* const*) &cmdline.guest_argv[1], &envp[0]))) { errno = -err; perror("Error configuring the parameters for the executable to be run"); return -1; } + if (cmdline.kernel_path && + (err = krun_set_kernel(ctx_id, cmdline.kernel_path, cmdline.kernel_format, + cmdline.initrd_path, cmdline.kernel_cmdline))) + { + errno = -err; + perror("Error configuring kernel"); + return -1; + } + // Start and enter the microVM. Unless there is some error while creating the microVM // this function never returns. if (err = krun_start_enter(ctx_id)) { diff --git a/examples/external_kernel.c b/examples/external_kernel.c deleted file mode 100644 index c1b4dc1b5..000000000 --- a/examples/external_kernel.c +++ /dev/null @@ -1,321 +0,0 @@ -/* - * This is an example implementing chroot-like functionality with libkrun. - * - * It executes the requested command (relative to NEWROOT) inside a fresh - * Virtual Machine created and managed by libkrun. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MAX_ARGS_LEN 4096 -#ifndef MAX_PATH -#define MAX_PATH 4096 -#endif - -enum net_mode -{ - NET_MODE_PASST = 0, - NET_MODE_TSI, -}; - -#if defined(__x86_64__) -#define KERNEL_FORMAT KRUN_KERNEL_FORMAT_ELF -#else -#define KERNEL_FORMAT KRUN_KERNEL_FORMAT_RAW -#endif - -static void print_help(char *const name) -{ - fprintf(stderr, - "Usage: %s [OPTIONS] KERNEL\n" - "OPTIONS: \n" - " -b --boot-disk Path to a boot disk in raw format\n" - " -c --kernel-cmdline Kernel command line\n" - " -d --data-disk Path to a data disk in raw format\n" - " -h --help Show help\n" - " -i --initrd Path to initramfs\n" - " --net=NET_MODE Set network mode\n" - " --passt-socket=PATH Connect to passt socket at PATH" - "\n" - "NET_MODE can be either TSI (default) or PASST\n" - "\n" -#if defined(__x86_64__) - "KERNEL: path to the kernel image in ELF format\n", -#else - "KERNEL: path to the kernel image in RAW format\n", -#endif - name); -} - -static const struct option long_options[] = { - {"boot-disk", required_argument, NULL, 'b'}, - {"kernel-cmdline", required_argument, NULL, 'c'}, - {"data-disk", required_argument, NULL, 'd'}, - {"initrd-path", required_argument, NULL, 'i'}, - {"help", no_argument, NULL, 'h'}, - {"passt-socket", required_argument, NULL, 'P'}, - {NULL, 0, NULL, 0}}; - -struct cmdline -{ - bool show_help; - enum net_mode net_mode; - char const *boot_disk; - char const *data_disk; - char const *passt_socket_path; - char const *kernel_path; - char const *kernel_cmdline; - char const *initrd_path; -}; - -bool parse_cmdline(int argc, char *const argv[], struct cmdline *cmdline) -{ - assert(cmdline != NULL); - - // set the defaults - *cmdline = (struct cmdline){ - .show_help = false, - .net_mode = NET_MODE_TSI, - .passt_socket_path = "/tmp/network.sock", - .boot_disk = NULL, - .data_disk = NULL, - .kernel_path = NULL, - .kernel_cmdline = NULL, - .initrd_path = NULL, - }; - - int option_index = 0; - int c; - // the '+' in optstring is a GNU extension that disables permutating argv - while ((c = getopt_long(argc, argv, "+hb:c:d:i:", long_options, &option_index)) != -1) - { - switch (c) - { - case 'b': - cmdline->boot_disk = optarg; - break; - case 'c': - cmdline->kernel_cmdline = optarg; - break; - case 'd': - cmdline->data_disk = optarg; - break; - case 'h': - cmdline->show_help = true; - return true; - case 'i': - cmdline->initrd_path = optarg; - break; - case 'P': - cmdline->passt_socket_path = optarg; - break; - case '?': - return false; - default: - fprintf(stderr, "internal argument parsing error (returned character code 0x%x)\n", c); - return false; - } - } - - if (optind <= argc - 1) - { - cmdline->kernel_path = argv[optind]; - return true; - } - - if (optind == argc) - { - fprintf(stderr, "Missing KERNEL argument\n"); - } - - return false; -} - -int connect_to_passt(char *socket_path) -{ - struct sockaddr_un addr; - int socket_fd = socket(AF_UNIX, SOCK_STREAM, 0); - if (socket_fd < 0) - { - perror("Failed to create passt socket fd"); - return -1; - } - - memset(&addr, 0, sizeof(addr)); - addr.sun_family = AF_UNIX; - strncpy(addr.sun_path, socket_path, sizeof(addr.sun_path) - 1); - - if (connect(socket_fd, (const struct sockaddr *)&addr, sizeof(addr)) < 0) - { - perror("Failed to bind passt socket"); - return -1; - } - - return socket_fd; -} - -int start_passt() -{ - int socket_fds[2]; - const int PARENT = 0; - const int CHILD = 1; - - if (socketpair(AF_UNIX, SOCK_STREAM, 0, socket_fds) < 0) - { - perror("Failed to create passt socket fd"); - return -1; - } - - int pid = fork(); - if (pid < 0) - { - perror("fork"); - return -1; - } - - if (pid == 0) - { // child - if (close(socket_fds[PARENT]) < 0) - { - perror("close PARENT"); - } - - char fd_as_str[16]; - snprintf(fd_as_str, sizeof(fd_as_str), "%d", socket_fds[CHILD]); - - printf("passing fd %s to passt", fd_as_str); - - if (execlp("passt", "passt", "-f", "--fd", fd_as_str, NULL) < 0) - { - perror("execlp"); - return -1; - } - } - else - { // parent - if (close(socket_fds[CHILD]) < 0) - { - perror("close CHILD"); - } - - return socket_fds[PARENT]; - } -} - -int main(int argc, char *const argv[]) -{ - int ctx_id; - int err; - pthread_t thread; - struct cmdline cmdline; - - if (!parse_cmdline(argc, argv, &cmdline)) - { - putchar('\n'); - print_help(argv[0]); - return -1; - } - - if (cmdline.show_help) - { - print_help(argv[0]); - return 0; - } - - // Set the log level to "off". - err = krun_set_log_level(0); - if (err) - { - errno = -err; - perror("Error configuring log level"); - return -1; - } - - // Create the configuration context. - ctx_id = krun_create_ctx(); - if (ctx_id < 0) - { - errno = -ctx_id; - perror("Error creating configuration context"); - return -1; - } - - // Configure the number of vCPUs (2) and the amount of RAM (1024 MiB). - if (err = krun_set_vm_config(ctx_id, 2, 2048)) - { - errno = -err; - perror("Error configuring the number of vCPUs and/or the amount of RAM"); - return -1; - } - - if (cmdline.boot_disk) - { - if (err = krun_add_disk(ctx_id, "boot", cmdline.boot_disk, 0)) - { - errno = -err, - perror("Error configuring boot disk"); - return -1; - } - } - if (cmdline.data_disk) - { - if (err = krun_add_disk(ctx_id, "data", cmdline.data_disk, 0)) - { - errno = -err, - perror("Error configuring data disk"); - return -1; - } - } - - if (cmdline.net_mode == NET_MODE_PASST) - { - int passt_fd = cmdline.passt_socket_path ? connect_to_passt(cmdline.passt_socket_path) : start_passt(); - - if (passt_fd < 0) - { - return -1; - } - - if (err = krun_set_passt_fd(ctx_id, passt_fd)) - { - errno = -err; - perror("Error configuring net mode"); - return -1; - } - } - - fprintf(stderr, "kernel_path: %s\n", cmdline.kernel_path); - fprintf(stderr, "kernel_cmdline: %s\n", cmdline.kernel_cmdline); - fflush(stderr); - - if (err = krun_set_kernel(ctx_id, cmdline.kernel_path, KERNEL_FORMAT, - cmdline.initrd_path, cmdline.kernel_cmdline)) - { - errno = -err; - perror("Error configuring kernel"); - return -1; - } - - // Start and enter the microVM. Unless there is some error while creating the microVM - // this function never returns. - if (err = krun_start_enter(ctx_id)) - { - errno = -err; - perror("Error creating the microVM"); - return -1; - } - - // Not reached. - return 0; -} diff --git a/include/libkrun.h b/include/libkrun.h index d10f91785..1f87924d8 100644 --- a/include/libkrun.h +++ b/include/libkrun.h @@ -409,6 +409,7 @@ int32_t krun_set_exec(uint32_t ctx_id, #define KRUN_KERNEL_FORMAT_IMAGE_BZ2 3 #define KRUN_KERNEL_FORMAT_IMAGE_GZ 4 #define KRUN_KERNEL_FORMAT_IMAGE_ZSTD 5 +#define KRUN_KERNEL_FORMAT_AUTO 6 /** * Sets the path to the kernel to be loaded in the microVM. * diff --git a/src/arch/Cargo.toml b/src/arch/Cargo.toml index baaedda55..126e59c0d 100644 --- a/src/arch/Cargo.toml +++ b/src/arch/Cargo.toml @@ -24,5 +24,8 @@ kvm-ioctls = ">=0.17" [target.'cfg(target_arch = "aarch64")'.dependencies] vm-fdt = ">= 0.2.0" +[target.'cfg(target_arch = "x86_64")'.dependencies] +linux-loader = { version = "0.13.0", features = ["bzimage", "elf", "pe"] } + [dev-dependencies] utils = { path = "../utils" } diff --git a/src/arch/src/x86_64/gdt.rs b/src/arch/src/x86_64/gdt.rs index c7fcbf31b..b517da9b5 100644 --- a/src/arch/src/x86_64/gdt.rs +++ b/src/arch/src/x86_64/gdt.rs @@ -1,3 +1,5 @@ +// Copyright © 2020, Oracle and/or its affiliates. +// // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 // @@ -24,8 +26,15 @@ fn get_base(entry: u64) -> u64 { | (((entry) & 0x0000_0000_FFFF_0000) >> 16) } +// https://github.com/firecracker-microvm/firecracker/blob/20b50ce11ed45d99e514f3eda025c185188cd15d/src/vmm/src/arch/x86_64/gdt.rs#L29 fn get_limit(entry: u64) -> u32 { - ((((entry) & 0x000F_0000_0000_0000) >> 32) | ((entry) & 0x0000_0000_0000_FFFF)) as u32 + let limit = ((((entry) & 0x000F_0000_0000_0000) >> 32) | ((entry) & 0x0000_0000_0000_FFFF)) as u32; + + if get_g(entry) == 1 { + (limit << 12) | 0xFFF + } else { + limit + } } fn get_g(entry: u64) -> u8 { @@ -109,7 +118,7 @@ mod tests { assert_eq!(0xB, seg.type_); // base and limit assert_eq!(0x10_0000, seg.base); - assert_eq!(0xfffff, seg.limit); + assert_eq!(0xffffffff, seg.limit); assert_eq!(0x0, seg.unusable); } } diff --git a/src/arch/src/x86_64/layout.rs b/src/arch/src/x86_64/layout.rs index ca92736b0..0a3e5601e 100644 --- a/src/arch/src/x86_64/layout.rs +++ b/src/arch/src/x86_64/layout.rs @@ -31,6 +31,19 @@ pub const IRQ_MAX: u32 = 15; /// Address for the TSS setup. pub const KVM_TSS_ADDRESS: u64 = 0xfffb_d000; +/// Address of the hvm_start_info struct used in PVH boot +pub const PVH_INFO_START: u64 = 0x6000; + +/// Starting address of array of modules of hvm_modlist_entry type. +/// Used to enable initrd support using the PVH boot ABI. +pub const MODLIST_START: u64 = 0x6040; +/// Address of memory map table used in PVH boot. Can overlap +/// with the zero page address since they are mutually exclusive. +pub const MEMMAP_START: u64 = 0x7000; + +/// Location of RSDP pointer in x86 machines +pub const RSDP_ADDR: u64 = 0x000e_0000; + /// The 'zero page', a.k.a linux kernel bootparams. pub const ZERO_PAGE_START: u64 = 0x7000; diff --git a/src/arch/src/x86_64/mod.rs b/src/arch/src/x86_64/mod.rs index 4253554fc..94186b05e 100644 --- a/src/arch/src/x86_64/mod.rs +++ b/src/arch/src/x86_64/mod.rs @@ -1,3 +1,5 @@ +// Copyright © 2020, Oracle and/or its affiliates. +// // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 // @@ -18,12 +20,18 @@ pub mod msr; pub mod regs; use crate::{round_up, ArchMemoryInfo, InitrdConfig}; -use arch_gen::x86::bootparam::{boot_params, E820_RAM}; +use arch_gen::x86::bootparam::{boot_params, E820_RAM, E820_RESERVED}; +use linux_loader::configurator::{BootConfigurator, BootParams}; use vm_memory::Bytes; use vm_memory::{ Address, ByteValued, GuestAddress, GuestMemory, GuestMemoryMmap, GuestMemoryRegion, }; +use linux_loader::configurator::pvh::PvhBootConfigurator; +use linux_loader::loader::elf::start_info::{ + hvm_memmap_table_entry, hvm_modlist_entry, hvm_start_info, +}; + // This is a workaround to the Rust enforcement specifying that any implementation of a foreign // trait (in this case `ByteValued`) where: // * the type that is implementing the trait is foreign or @@ -47,6 +55,8 @@ pub enum Error { ZeroPageSetup, /// Failed to compute initrd address. InitrdAddress, + /// Error writing hvm_start_info to guest memory. + StartInfoSetup, } // Where BIOS/VGA magic would live on a real PC. @@ -240,6 +250,132 @@ pub fn configure_system( cmdline_size: usize, initrd: &Option, num_cpus: u8, + pvh: bool, +) -> Result<(), Error> { + // Note that this puts the mptable at the last 1k of Linux's 640k base RAM + #[cfg(not(feature = "tee"))] + mptable::setup_mptable(guest_mem, num_cpus).map_err(Error::MpTableSetup)?; + + match pvh { + true => { + configure_pvh(guest_mem, arch_memory_info, cmdline_addr, initrd)?; + } + false => { + configure_64bit_boot(guest_mem, arch_memory_info, cmdline_addr, cmdline_size, initrd)?; + } + } + Ok(()) +} + +fn configure_pvh( + guest_mem: &GuestMemoryMmap, + arch_memory_info: &ArchMemoryInfo, + cmdline_addr: GuestAddress, + initrd: &Option, +) -> Result<(), Error> { + const XEN_HVM_START_MAGIC_VALUE: u32 = 0x336e_c578; + let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS); + let end_32bit_gap_start = GuestAddress(MMIO_MEM_START); + let himem_start = GuestAddress(layout::HIMEM_START); + // Vector to hold modules (currently either empty or holding initrd). + let mut modules: Vec = Vec::new(); + if let Some(initrd_config) = initrd { + // The initrd has been written to guest memory already, here we just need to + // create the module structure that describes it. + modules.push(hvm_modlist_entry { + paddr: initrd_config.address.raw_value(), + size: initrd_config.size as u64, + ..Default::default() + }); + } + // Vector to hold the memory maps which needs to be written to guest memory + // at MEMMAP_START after all of the mappings are recorded. + let mut memmap: Vec = Vec::new(); + // Create the memory map entries. + add_memmap_entry(&mut memmap, 0, mptable::MPTABLE_START, E820_RAM); + add_memmap_entry( + &mut memmap, + mptable::MPTABLE_START, + layout::RSDP_ADDR - mptable::MPTABLE_START, + E820_RESERVED, + ); + let last_addr = GuestAddress(arch_memory_info.ram_last_addr); // firecracker: guest_mem.last_addr(); + println!("last_addr: {:?}/{:?}", last_addr, guest_mem.last_addr()); + if last_addr < end_32bit_gap_start { + add_memmap_entry( + &mut memmap, + himem_start.raw_value(), + last_addr.unchecked_offset_from(himem_start) + 1, + E820_RAM, + ); + } else { + add_memmap_entry( + &mut memmap, + himem_start.raw_value(), + end_32bit_gap_start.unchecked_offset_from(himem_start), + E820_RAM, + ); + if last_addr > first_addr_past_32bits { + add_memmap_entry( + &mut memmap, + first_addr_past_32bits.raw_value(), + last_addr.unchecked_offset_from(first_addr_past_32bits) + 1, + E820_RAM, + ); + } + } + + // Construct the hvm_start_info structure and serialize it into + // boot_params. This will be stored at PVH_INFO_START address, and %rbx + // will be initialized to contain PVH_INFO_START prior to starting the + // guest, as required by the PVH ABI. + let mut start_info = hvm_start_info { + magic: XEN_HVM_START_MAGIC_VALUE, + version: 1, + cmdline_paddr: cmdline_addr.raw_value(), + memmap_paddr: layout::MEMMAP_START, + memmap_entries: memmap.len() as u32, + nr_modules: modules.len() as u32, + ..Default::default() + }; + if !modules.is_empty() { + start_info.modlist_paddr = layout::MODLIST_START; + } + let mut boot_params = + BootParams::new::(&start_info, GuestAddress(layout::PVH_INFO_START)); + // Copy the vector with the memmap table to the MEMMAP_START address + // which is already saved in the memmap_paddr field of hvm_start_info struct. + boot_params.set_sections::(&memmap, GuestAddress(layout::MEMMAP_START)); + // Copy the vector with the modules list to the MODLIST_START address. + // Note that we only set the modlist_paddr address if there is a nonzero + // number of modules, but serializing an empty list is harmless. + boot_params.set_modules::(&modules, GuestAddress(layout::MODLIST_START)); + // Write the hvm_start_info struct to guest memory. + println!("start_info: {:#?}", start_info); + PvhBootConfigurator::write_bootparams(&boot_params, guest_mem) + .map_err(|_| Error::StartInfoSetup) +} + +fn add_memmap_entry( + memmap: &mut Vec, + addr: u64, + size: u64, + mem_type: u32, +) { + // Add the table entry to the vector + memmap.push(hvm_memmap_table_entry { + addr, + size, + type_: mem_type, + reserved: 0, + }); +} +fn configure_64bit_boot( + guest_mem: &GuestMemoryMmap, + arch_memory_info: &ArchMemoryInfo, + cmdline_addr: GuestAddress, + cmdline_size: usize, + initrd: &Option, ) -> super::Result<()> { const KERNEL_BOOT_FLAG_MAGIC: u16 = 0xaa55; const KERNEL_HDR_MAGIC: u32 = 0x5372_6448; @@ -250,10 +386,6 @@ pub fn configure_system( let himem_start = GuestAddress(layout::HIMEM_START); - // Note that this puts the mptable at the last 1k of Linux's 640k base RAM - #[cfg(not(feature = "tee"))] - mptable::setup_mptable(guest_mem, num_cpus).map_err(Error::MpTableSetup)?; - let mut params: BootParamsWrapper = BootParamsWrapper(boot_params::default()); params.0.hdr.type_of_loader = KERNEL_LOADER_OTHER; diff --git a/src/arch/src/x86_64/mptable.rs b/src/arch/src/x86_64/mptable.rs index 4d3eab588..702eb9eb6 100644 --- a/src/arch/src/x86_64/mptable.rs +++ b/src/arch/src/x86_64/mptable.rs @@ -44,7 +44,7 @@ unsafe impl ByteValued for MpcLintsrcWrapper {} unsafe impl ByteValued for MpfIntelWrapper {} // MPTABLE, describing VCPUS. -const MPTABLE_START: u64 = 0x9fc00; +pub const MPTABLE_START: u64 = 0x9fc00; #[derive(Debug, Eq, PartialEq)] pub enum Error { diff --git a/src/arch/src/x86_64/regs.rs b/src/arch/src/x86_64/regs.rs index 5f8ed13f4..86c2899a3 100644 --- a/src/arch/src/x86_64/regs.rs +++ b/src/arch/src/x86_64/regs.rs @@ -1,3 +1,5 @@ +// Copyright © 2020, Oracle and/or its affiliates. +// // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 // @@ -67,14 +69,18 @@ pub fn setup_regs(vcpu: &VcpuFd, boot_ip: u64, id: u8) -> Result<()> { kvm_regs { rflags: 0x0000_0000_0000_0002u64, rip: boot_ip, - // Frame pointer. It gets a snapshot of the stack pointer (rsp) so that when adjustments are - // made to rsp (i.e. reserving space for local variables or pushing values on to the stack), - // local variables and function parameters are still accessible from a constant offset from rbp. - rsp: super::layout::BOOT_STACK_POINTER, - // Starting stack pointer. - rbp: super::layout::BOOT_STACK_POINTER, - // Must point to zero page address per Linux ABI. This is x86_64 specific. - rsi: super::layout::ZERO_PAGE_START, + // pvh + rbx: super::layout::PVH_INFO_START, + + // linux boot protocol + // // Frame pointer. It gets a snapshot of the stack pointer (rsp) so that when adjustments are + // // made to rsp (i.e. reserving space for local variables or pushing values on to the stack), + // // local variables and function parameters are still accessible from a constant offset from rbp. + // rsp: super::layout::BOOT_STACK_POINTER, + // // Starting stack pointer. + // rbp: super::layout::BOOT_STACK_POINTER, + // // Must point to zero page address per Linux ABI. This is x86_64 specific. + // rsi: super::layout::ZERO_PAGE_START, ..Default::default() } } else { @@ -98,8 +104,10 @@ pub fn setup_sregs(mem: &GuestMemoryMmap, vcpu: &VcpuFd, id: u8) -> Result<()> { let mut sregs: kvm_sregs = vcpu.get_sregs().map_err(Error::GetStatusRegisters)?; if cfg!(not(feature = "tee")) { - configure_segments_and_sregs(mem, &mut sregs)?; - setup_page_tables(mem, &mut sregs)?; // TODO(dgreid) - Can this be done once per system instead + configure_segments_and_sregs(mem, &mut sregs, true)?; + if !true { // 64-bit Linux boot only + setup_page_tables(mem, &mut sregs)?; // TODO(dgreid) - Can this be done once per system instead + } } else if id != 0 { //sregs.cs.selector = 0x9100; //sregs.cs.base = 0x91000; @@ -117,6 +125,7 @@ const EFER_LMA: u64 = 0x400; const EFER_LME: u64 = 0x100; const X86_CR0_PE: u64 = 0x1; +const X86_CR0_ET: u64 = 0x10; const X86_CR0_PG: u64 = 0x8000_0000; const X86_CR4_PAE: u64 = 0x20; @@ -140,13 +149,22 @@ fn write_idt_value(val: u64, guest_mem: &GuestMemoryMmap) -> Result<()> { .map_err(|_| Error::WriteIDT) } -fn configure_segments_and_sregs(mem: &GuestMemoryMmap, sregs: &mut kvm_sregs) -> Result<()> { - let gdt_table: [u64; BOOT_GDT_MAX] = [ - gdt_entry(0, 0, 0), // NULL - gdt_entry(0xa09b, 0, 0xfffff), // CODE - gdt_entry(0xc093, 0, 0xfffff), // DATA - gdt_entry(0x808b, 0, 0xfffff), // TSS - ]; +fn configure_segments_and_sregs(mem: &GuestMemoryMmap, sregs: &mut kvm_sregs, pvh: bool) -> Result<()> { + let gdt_table: [u64; BOOT_GDT_MAX] = if pvh { + [ + gdt_entry(0, 0, 0), // NULL + gdt_entry(0xc09b, 0, 0xffff_ffff), // CODE + gdt_entry(0xc093, 0, 0xffff_ffff), // DATA + gdt_entry(0x008b, 0, 0x67), // TSS + ] + } else { + [ + gdt_entry(0, 0, 0), // NULL + gdt_entry(0xa09b, 0, 0xfffff), // CODE + gdt_entry(0xc093, 0, 0xfffff), // DATA + gdt_entry(0x808b, 0, 0xfffff), // TSS + ] + }; let code_seg = kvm_segment_from_gdt(gdt_table[1], 1); let data_seg = kvm_segment_from_gdt(gdt_table[2], 2); @@ -169,9 +187,15 @@ fn configure_segments_and_sregs(mem: &GuestMemoryMmap, sregs: &mut kvm_sregs) -> sregs.ss = data_seg; sregs.tr = tss_seg; - /* 64-bit protected mode */ - sregs.cr0 |= X86_CR0_PE; - sregs.efer |= EFER_LME | EFER_LMA; + // https://github.com/firecracker-microvm/firecracker/blob/20b50ce11ed45d99e514f3eda025c185188cd15d/src/vmm/src/arch/x86_64/regs.rs#L243 + if pvh { + sregs.cr0 = X86_CR0_PE | X86_CR0_ET; + sregs.cr4 = 0; + } else { + /* 64-bit protected mode */ + sregs.cr0 |= X86_CR0_PE; + sregs.efer |= EFER_LME | EFER_LMA; + } Ok(()) } diff --git a/src/devices/src/virtio/block/worker.rs b/src/devices/src/virtio/block/worker.rs index 19d7d2133..2d59f82b9 100644 --- a/src/devices/src/virtio/block/worker.rs +++ b/src/devices/src/virtio/block/worker.rs @@ -223,6 +223,7 @@ impl BlockWorker { ) -> result::Result { match request_header.request_type { VIRTIO_BLK_T_IN => { + println!("blk req VIRTIO_BLK_T_IN {}", request_header.sector); let data_len = writer.available_bytes() - 1; if data_len % 512 != 0 { Err(RequestError::InvalidDataLength) @@ -233,6 +234,7 @@ impl BlockWorker { } } VIRTIO_BLK_T_OUT => { + println!("blk req VIRTIO_BLK_T_OUT {}", request_header.sector); let data_len = reader.available_bytes(); if data_len % 512 != 0 { Err(RequestError::InvalidDataLength) @@ -252,6 +254,7 @@ impl BlockWorker { CacheType::Unsafe => Ok(0), }, VIRTIO_BLK_T_GET_ID => { + println!("blk req VIRTIO_BLK_T_GET_ID {}", request_header.sector); let data_len = writer.available_bytes(); let disk_id = self.disk.image_id(); if data_len < disk_id.len() { diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index 245d681bd..68c21bc0b 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -1166,6 +1166,46 @@ pub unsafe extern "C" fn krun_set_kernel( 3 => KernelFormat::ImageBz2, 4 => KernelFormat::ImageGz, 5 => KernelFormat::ImageZstd, + 6 => { + let data: Vec = std::fs::read(path.clone()).ok().unwrap(); + if data.len() >= 4 && &data[0..4] == [0x7f, b'E', b'L', b'F'] { + debug!("Found ELF header at offset 0"); + KernelFormat::Elf + } else if data.len() >= 0x202 + && &data[0x1FE..0x200] == [0x55, 0xAA] + && &data[0x202..0x206] == [b'H', b'd', b'r', b'S'] + { + // Linux 2.00+ boot protocol + if let Some(magic) = data + .windows(4) + .position(|window| window == [b'B', b'Z', b'h']) + { + debug!("Found BZIP2 header on Image file at: 0x{:x}", magic); + KernelFormat::ImageBz2 + } else if let Some(magic) = data + .windows(3) + .position(|window| window == [0x1f, 0x8b, 0x8]) + { + debug!("Found GZIP header on Image file at: 0x{:x}", magic); + KernelFormat::ImageGz + } else if let Some(magic) = data + .windows(4) + .position(|window| window == [0x28, 0xb5, 0x2f, 0xfd]) + { + debug!("Found ZSTD header on Image file at: 0x{:x}", magic); + KernelFormat::ImageZstd + } else { + info!("No known header found on Image, defaulting to raw"); + KernelFormat::Raw + } + } else if data.len() >= 2 && &data[0..2] == [b'M', b'Z'] { + debug!("Found PE header at offset 0"); + KernelFormat::PeGz + } else { + info!("No known header found, defaulting to raw"); + KernelFormat::Raw + } + } _ => { return -libc::EINVAL; } @@ -1413,6 +1453,7 @@ pub extern "C" fn krun_start_enter(ctx_id: u32) -> i32 { return -libc::EINVAL; } }; + println!("MicroVM created"); #[cfg(target_os = "macos")] let mapper_vmm = _vmm.clone(); diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index e279b7eaa..e67af76d7 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -824,8 +824,10 @@ pub fn build_microvm( println!("Starting TEE/microVM."); } + println!("Starting microVM."); vmm.start_vcpus(vcpus) .map_err(StartMicrovmError::Internal)?; + println!("MicroVM started."); // Clippy thinks we don't need Arc { + let data: Vec = std::fs::read(external_kernel.path.clone()) + .map_err(StartMicrovmError::ImageBz2OpenKernel)?; + if let Some(magic) = data + .windows(4) + .position(|window| window == [b'\x7f', b'E', b'L', b'F']) + { + debug!("Found ELF header on Image file at: 0x{:x}", magic); + } let mut file = File::options() .read(true) .write(false) @@ -863,7 +873,20 @@ fn load_external_kernel( .map_err(StartMicrovmError::ElfOpenKernel)?; let load_result = loader::Elf::load(guest_mem, None, &mut file, None) .map_err(StartMicrovmError::ElfLoadKernel)?; - load_result.kernel_load + match load_result.pvh_boot_cap { + loader::PvhBootCapability::PvhEntryPresent(guest_address) => { + println!("PvhEntryPresent"); + guest_address + }, + loader::PvhBootCapability::PvhEntryNotPresent => { + println!("PvhEntryNotPresent"); + load_result.kernel_load + }, + loader::PvhBootCapability::PvhEntryIgnored => { + println!("PvhEntryIgnored"); + load_result.kernel_load + }, + } } #[cfg(target_arch = "aarch64")] KernelFormat::PeGz => { diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 0e680a846..ae3017a16 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -279,6 +279,7 @@ impl Vmm { cmdline_len, initrd, vcpus.len() as u8, + true, ) .map_err(Error::ConfigureSystem)?; }