// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the THIRD-PARTY file.

//! Virtual Machine Monitor that leverages the Linux Kernel-based Virtual Machine (KVM),
//! and other virtualization features to run a single lightweight micro-virtual
//! machine (microVM).
#![deny(missing_docs)]

/// Handles setup and initialization of a `Vmm` object.
pub mod builder;
/// Syscalls allowed through the seccomp filter.
pub mod default_syscalls;
pub(crate) mod device_manager;
pub mod memory_snapshot;
/// Save/restore utilities.
pub mod persist;
/// Resource store for configured microVM resources.
pub mod resources;
/// microVM RPC API adapters.
pub mod rpc_interface;
/// Signal handling utilities.
pub mod signal_handler;
/// microVM state versions.
pub mod version_map;
/// Wrappers over structures used to configure the VMM.
pub mod vmm_config;
mod vstate;

use std::collections::HashMap;
use std::fmt::{Display, Formatter};
use std::io;
use std::os::unix::io::AsRawFd;
#[cfg(target_arch = "x86_64")]
use std::sync::mpsc::RecvTimeoutError;
use std::sync::Mutex;
use std::time::Duration;

#[cfg(target_arch = "x86_64")]
use crate::device_manager::legacy::PortIODeviceManager;
use crate::device_manager::mmio::MMIODeviceManager;
#[cfg(target_arch = "x86_64")]
use crate::memory_snapshot::SnapshotMemory;
#[cfg(target_arch = "x86_64")]
use crate::persist::{MicrovmState, MicrovmStateError, VmInfo};
#[cfg(target_arch = "x86_64")]
use crate::vstate::vcpu::VcpuState;
use crate::vstate::{
    vcpu::{Vcpu, VcpuEvent, VcpuHandle, VcpuResponse},
    vm::Vm,
};
use arch::DeviceType;
use devices::BusDevice;
use logger::{error, info, warn, LoggerError, MetricsError, METRICS};
use polly::event_manager::{EventManager, Subscriber};
use seccomp::BpfProgramRef;
#[cfg(target_arch = "x86_64")]
use snapshot::Persist;
use utils::epoll::{EpollEvent, EventSet};
use utils::eventfd::EventFd;
use vm_memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion, GuestRegionMmap};

// Process exit codes. `Vmm::stop` hands one of these (widened to `i32`) to `libc::_exit`.
// Note that the numeric values are not listed in ascending order below.

/// Success exit code.
pub const FC_EXIT_CODE_OK: u8 = 0;
/// Generic error exit code.
pub const FC_EXIT_CODE_GENERIC_ERROR: u8 = 1;
/// Generic exit code for an error considered not possible to occur if the program logic is sound.
pub const FC_EXIT_CODE_UNEXPECTED_ERROR: u8 = 2;
/// Firecracker was shut down after intercepting a restricted system call.
pub const FC_EXIT_CODE_BAD_SYSCALL: u8 = 148;
/// Firecracker was shut down after intercepting `SIGBUS`.
pub const FC_EXIT_CODE_SIGBUS: u8 = 149;
/// Firecracker was shut down after intercepting `SIGSEGV`.
pub const FC_EXIT_CODE_SIGSEGV: u8 = 150;
/// Firecracker was shut down after intercepting `SIGXFSZ`.
pub const FC_EXIT_CODE_SIGXFSZ: u8 = 151;
/// Firecracker was shut down after intercepting `SIGXCPU`.
pub const FC_EXIT_CODE_SIGXCPU: u8 = 154;
/// Firecracker was shut down after intercepting `SIGPIPE`.
pub const FC_EXIT_CODE_SIGPIPE: u8 = 155;
/// Firecracker was shut down after intercepting `SIGHUP`.
pub const FC_EXIT_CODE_SIGHUP: u8 = 156;
/// Firecracker was shut down after intercepting `SIGILL`.
pub const FC_EXIT_CODE_SIGILL: u8 = 157;
/// Bad configuration for microvm's resources, when using a single json.
pub const FC_EXIT_CODE_BAD_CONFIGURATION: u8 = 152;
/// Command line arguments parsing error.
pub const FC_EXIT_CODE_ARG_PARSING: u8 = 153;

/// Errors associated with the VMM internal logic. These errors cannot be generated by direct user
/// input, but can result from bad configuration of the host (for example if Firecracker doesn't
/// have permissions to open the KVM fd).
#[derive(Debug)]
pub enum Error {
    /// Legacy devices work with Event file descriptors and the creation can fail because
    /// of resource exhaustion.
    #[cfg(target_arch = "x86_64")]
    CreateLegacyDevice(device_manager::legacy::Error),
    /// Cannot fetch the KVM dirty bitmap.
    DirtyBitmap(kvm_ioctls::Error),
    /// Cannot read from an Event file descriptor.
    EventFd(io::Error),
    /// Error reported by the i8042 device, e.g. when injecting the CTRL+ALT+DEL key combo.
    I8042Error(devices::legacy::I8042DeviceError),
    /// Cannot access kernel file.
    KernelFile(io::Error),
    /// Cannot open /dev/kvm. Either the host does not have KVM or Firecracker does not have
    /// permission to open the file descriptor.
    KvmContext(vstate::system::Error),
    #[cfg(target_arch = "x86_64")]
    /// Cannot add devices to the Legacy I/O Bus.
    LegacyIOBus(device_manager::legacy::Error),
    /// Internal logger error.
    Logger(LoggerError),
    /// Internal metrics system error.
    Metrics(MetricsError),
    /// Cannot add a device to the MMIO Bus.
    RegisterMMIODevice(device_manager::mmio::Error),
    /// Cannot build seccomp filters.
    SeccompFilters(seccomp::Error),
    /// Write to the serial console failed.
    Serial(io::Error),
    /// Cannot create Timer file descriptor.
    TimerFd(io::Error),
    /// Vcpu configuration error.
    VcpuConfigure(vstate::vcpu::VcpuError),
    /// Vcpu create error.
    VcpuCreate(vstate::vcpu::Error),
    /// Cannot send event to vCPU.
    VcpuEvent(vstate::vcpu::Error),
    /// Cannot create a vCPU handle (returned when starting a vCPU thread fails).
    VcpuHandle(vstate::vcpu::Error),
    /// vCPU pause failed (the pause broadcast to the vCPUs did not succeed).
    VcpuPause,
    /// vCPU exit failed (the exit broadcast to the vCPUs did not succeed).
    VcpuExit,
    /// vCPU resume failed (the resume broadcast to the vCPUs did not succeed).
    VcpuResume,
    /// Sending an event to a vCPU failed, or a vCPU did not answer as expected within the
    /// response timeout.
    VcpuMessage,
    /// Cannot spawn a new Vcpu thread.
    VcpuSpawn(io::Error),
    /// Error reported by the `Vm` object, e.g. when setting up the KVM memory regions.
    Vm(vstate::vm::Error),
    /// Error thrown by observer object on Vmm initialization.
    VmmObserverInit(utils::errno::Error),
    /// Error thrown by observer object on Vmm teardown.
    VmmObserverTeardown(utils::errno::Error),
}

impl Display for Error {
    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
        use self::Error::*;

        match self {
            #[cfg(target_arch = "x86_64")]
            CreateLegacyDevice(e) => write!(f, "Error creating legacy device: {}", e),
            DirtyBitmap(e) => write!(f, "Error getting the KVM dirty bitmap. {}", e),
            EventFd(e) => write!(f, "Event fd error: {}", e),
            I8042Error(e) => write!(f, "I8042 error: {}", e),
            KernelFile(e) => write!(f, "Cannot access kernel file: {}", e),
            KvmContext(e) => write!(f, "Failed to validate KVM support: {}", e),
            #[cfg(target_arch = "x86_64")]
            LegacyIOBus(e) => write!(f, "Cannot add devices to the legacy I/O Bus. {}", e),
            Logger(e) => write!(f, "Logger error: {}", e),
            Metrics(e) => write!(f, "Metrics error: {}", e),
            RegisterMMIODevice(e) => write!(f, "Cannot add a device to the MMIO Bus. {}", e),
            SeccompFilters(e) => write!(f, "Cannot build seccomp filters: {}", e),
            Serial(e) => write!(f, "Error writing to the serial console: {}", e),
            TimerFd(e) => write!(f, "Error creating timer fd: {}", e),
            VcpuConfigure(e) => write!(f, "Error configuring the vcpu for boot: {}", e),
            VcpuCreate(e) => write!(f, "Error creating the vcpu: {}", e),
            VcpuEvent(e) => write!(f, "Cannot send event to vCPU. {}", e),
            VcpuHandle(e) => write!(f, "Cannot create a vCPU handle. {}", e),
            VcpuPause => write!(f, "Failed to pause the vCPUs."),
            VcpuExit => write!(f, "Failed to exit the vCPUs."),
            VcpuResume => write!(f, "Failed to resume the vCPUs."),
            VcpuMessage => write!(f, "Failed to message the vCPUs."),
            VcpuSpawn(e) => write!(f, "Cannot spawn Vcpu thread: {}", e),
            Vm(e) => write!(f, "Vm error: {}", e),
            VmmObserverInit(e) => write!(
                f,
                "Error thrown by observer object on Vmm initialization: {}",
                e
            ),
            VmmObserverTeardown(e) => {
                write!(f, "Error thrown by observer object on Vmm teardown: {}", e)
            }
        }
    }
}

/// Trait for objects that need custom initialization and teardown during the Vmm lifetime.
///
/// Both hooks have a default implementation that does nothing and returns `Ok(())`, so an
/// implementor only needs to override the hooks it is interested in.
pub trait VmmEventsObserver {
    /// This function will be called during microVm boot.
    ///
    /// `Vmm::start_vcpus` calls it before any vCPU thread is started; an `Err` aborts the
    /// start and is surfaced as `Error::VmmObserverInit`.
    fn on_vmm_boot(&mut self) -> std::result::Result<(), utils::errno::Error> {
        Ok(())
    }
    /// This function will be called on microVm teardown.
    ///
    /// `Vmm::stop` calls it before terminating the process; an `Err` is only logged as a
    /// warning and does not prevent the shutdown.
    fn on_vmm_stop(&mut self) -> std::result::Result<(), utils::errno::Error> {
        Ok(())
    }
}

/// Shorthand result type for internal VMM commands.
pub type Result<T> = std::result::Result<T, Error>;

/// Shorthand type for KVM dirty page bitmap.
///
/// As filled in by `Vmm::get_dirty_bitmap`, the key is the guest memory region slot and the
/// value is the dirty log KVM returned for that slot.
pub type DirtyBitmap = HashMap<usize, Vec<u64>>;

/// Contains the state and associated methods required for the Firecracker VMM.
pub struct Vmm {
    // Optional observer notified on boot (`start_vcpus`) and on teardown (`stop`).
    events_observer: Option<Box<dyn VmmEventsObserver>>,

    // Guest VM core resources.
    guest_memory: GuestMemoryMmap,

    // Handles of the vCPU threads started by `start_vcpus`; used to send events to the vCPUs
    // and to receive their responses.
    vcpus_handles: Vec<VcpuHandle>,
    // Event fd registered in the `Subscriber` interest list; a read event on it makes the
    // VMM stop.
    exit_evt: EventFd,
    // The `Vm` wrapper; used in this file for saving state, fetching the dirty log and
    // setting up the KVM memory regions.
    vm: Vm,

    // Guest VM devices.
    mmio_device_manager: MMIODeviceManager,
    #[cfg(target_arch = "x86_64")]
    pio_device_manager: PortIODeviceManager,
}

impl Vmm {
    /// Looks up a bus device by type and id in the MMIO device manager.
    ///
    /// Returns whatever the device manager's lookup yields; `None` means no device was found
    /// for the given `device_type` / `device_id` pair.
    pub fn get_bus_device(
        &self,
        device_type: DeviceType,
        device_id: &str,
    ) -> Option<&Mutex<dyn BusDevice>> {
        let mmio_devices = &self.mmio_device_manager;
        mmio_devices.get_device(device_type, device_id)
    }

    /// Starts the microVM vcpus.
    pub fn start_vcpus(
        &mut self,
        mut vcpus: Vec<Vcpu>,
        vcpu_seccomp_filter: BpfProgramRef,
    ) -> Result<()> {
        let vcpu_count = vcpus.len();

        if let Some(observer) = self.events_observer.as_mut() {
            observer.on_vmm_boot().map_err(Error::VmmObserverInit)?;
        }

        Vcpu::register_kick_signal_handler();

        self.vcpus_handles.reserve(vcpu_count as usize);

        for mut vcpu in vcpus.drain(..) {
            vcpu.set_mmio_bus(self.mmio_device_manager.bus.clone());
            #[cfg(target_arch = "x86_64")]
            vcpu.kvm_vcpu
                .set_pio_bus(self.pio_device_manager.io_bus.clone());

            self.vcpus_handles.push(
                vcpu.start_threaded(vcpu_seccomp_filter.to_vec())
                    .map_err(Error::VcpuHandle)?,
            );
        }

        Ok(())
    }

    // Checks that every vCPU responds with the same kind of response as `expected_response`.
    //
    // Waits up to one second per vCPU for a response. Returns `Err(())` as soon as one vCPU
    // times out, its channel fails, or it answers with a different `VcpuResponse` variant.
    //
    // Only the enum variant is compared (via `std::mem::discriminant`), not the payload: this
    // needs no `PartialEq` on `VcpuResponse`, and a variant such as `Exited(code)` is accepted
    // whatever code it carries.
    fn check_vcpus_response(
        &mut self,
        expected_response: VcpuResponse,
    ) -> std::result::Result<(), ()> {
        let expected_kind = std::mem::discriminant(&expected_response);

        for handle in self.vcpus_handles.iter() {
            match handle
                .response_receiver()
                .recv_timeout(Duration::from_millis(1000))
            {
                // A bare `Ok(name)` pattern would bind *any* response to a fresh variable
                // instead of comparing it, so the comparison has to live in a guard.
                Ok(response) if std::mem::discriminant(&response) == expected_kind => (),
                _ => return Err(()),
            }
        }
        Ok(())
    }

    /// Sends a resume command to the vCPUs.
    pub fn resume_vcpus(&mut self) -> Result<()> {
        self.broadcast_vcpu_event(VcpuEvent::Resume, VcpuResponse::Resumed)
            .map_err(|_| Error::VcpuResume)
    }

    /// Sends a pause command to the vCPUs.
    pub fn pause_vcpus(&mut self) -> Result<()> {
        self.broadcast_vcpu_event(VcpuEvent::Pause, VcpuResponse::Paused)
            .map_err(|_| Error::VcpuPause)
    }

    /// Sends an exit command to the vCPUs.
    pub fn exit_vcpus(&mut self) -> Result<()> {
        self.broadcast_vcpu_event(
            VcpuEvent::Exit,
            VcpuResponse::Exited(FC_EXIT_CODE_GENERIC_ERROR),
        )
        .map_err(|_| Error::VcpuExit)
    }
    /// Returns a reference to the `GuestMemoryMmap` object holding the guest memory.
    pub fn guest_memory(&self) -> &GuestMemoryMmap {
        &self.guest_memory
    }

    /// Asks the i8042 device to trigger the CTRL+ALT+DEL key combination.
    ///
    /// A device failure is returned as `Error::I8042Error`.
    ///
    /// # Panics
    ///
    /// Panics if the i8042 device lock is poisoned.
    #[cfg(target_arch = "x86_64")]
    pub fn send_ctrl_alt_del(&mut self) -> Result<()> {
        let i8042 = &self.pio_device_manager.i8042;
        i8042
            .lock()
            .expect("i8042 lock was poisoned")
            .trigger_ctrl_alt_del()
            .map_err(Error::I8042Error)
    }

    /// Waits for all vCPUs to exit and terminates the Firecracker process.
    pub fn stop(&mut self, exit_code: i32) {
        info!("Vmm is stopping.");

        if let Some(observer) = self.events_observer.as_mut() {
            if let Err(e) = observer.on_vmm_stop() {
                warn!("{}", Error::VmmObserverTeardown(e));
            }
        }

        // Write the metrics before exiting.
        if let Err(e) = METRICS.write() {
            error!("Failed to write metrics while stopping: {}", e);
        }

        // Exit from Firecracker using the provided exit code. Safe because we're terminating
        // the process anyway.
        unsafe {
            libc::_exit(exit_code);
        }
    }

    /// Saves the state of a paused Microvm.
    #[cfg(target_arch = "x86_64")]
    pub fn save_state(&mut self) -> std::result::Result<MicrovmState, MicrovmStateError> {
        use self::MicrovmStateError::SaveVmState;
        let vcpu_states = self.save_vcpu_states()?;

        let vm_state = self.vm.save_state().map_err(SaveVmState)?;

        let device_states = self.mmio_device_manager.save();

        let mem_size_mib = persist::mem_size_mib(self.guest_memory());
        let memory_state = self.guest_memory().describe();

        Ok(MicrovmState {
            vm_info: VmInfo { mem_size_mib },
            memory_state,
            vm_state,
            vcpu_states,
            device_states,
        })
    }

    #[cfg(target_arch = "x86_64")]
    fn save_vcpu_states(&mut self) -> std::result::Result<Vec<VcpuState>, MicrovmStateError> {
        use self::MicrovmStateError::*;
        for handle in self.vcpus_handles.iter() {
            handle
                .send_event(VcpuEvent::SaveState)
                .map_err(SignalVcpu)?;
        }

        let vcpu_responses = self
            .vcpus_handles
            .iter()
            // `Iterator::collect` can transform a `Vec<Result>` into a `Result<Vec>`.
            .map(|handle| {
                handle
                    .response_receiver()
                    .recv_timeout(Duration::from_millis(1000))
            })
            .collect::<std::result::Result<Vec<VcpuResponse>, RecvTimeoutError>>()
            .map_err(|_| UnexpectedVcpuResponse)?;

        let vcpu_states = vcpu_responses
            .into_iter()
            .map(|response| match response {
                VcpuResponse::SavedState(state) => Ok(*state),
                VcpuResponse::Error(e) => Err(SaveVcpuState(e)),
                VcpuResponse::NotAllowed(reason) => Err(MicrovmStateError::NotAllowed(reason)),
                _ => Err(UnexpectedVcpuResponse),
            })
            .collect::<std::result::Result<Vec<VcpuState>, MicrovmStateError>>()?;

        Ok(vcpu_states)
    }

    // Sends an event to all vCPUs and waits for a response.
    fn broadcast_vcpu_event(
        &mut self,
        event: VcpuEvent,
        expected_response: VcpuResponse,
    ) -> Result<()> {
        for handle in self.vcpus_handles.iter() {
            handle
                .send_event(event.clone())
                .map_err(|_| Error::VcpuMessage)?;
        }

        self.check_vcpus_response(expected_response)
            .map_err(|_| Error::VcpuMessage)
    }

    #[cfg(target_arch = "x86_64")]
    /// Restores vcpus kvm states.
    pub fn restore_vcpu_states(
        &mut self,
        mut vcpu_states: Vec<VcpuState>,
    ) -> std::result::Result<(), MicrovmStateError> {
        use self::MicrovmStateError::*;

        if vcpu_states.len() != self.vcpus_handles.len() {
            return Err(InvalidInput);
        }
        for (handle, state) in self.vcpus_handles.iter().zip(vcpu_states.drain(..)) {
            handle
                .send_event(VcpuEvent::RestoreState(Box::new(state)))
                .map_err(MicrovmStateError::SignalVcpu)?;
        }

        let vcpu_responses = self
            .vcpus_handles
            .iter()
            // `Iterator::collect` can transform a `Vec<Result>` into a `Result<Vec>`.
            .map(|handle| {
                handle
                    .response_receiver()
                    .recv_timeout(Duration::from_millis(1000))
            })
            .collect::<std::result::Result<Vec<VcpuResponse>, RecvTimeoutError>>()
            .map_err(|_| MicrovmStateError::UnexpectedVcpuResponse)?;

        for response in vcpu_responses.into_iter() {
            match response {
                VcpuResponse::RestoredState => (),
                VcpuResponse::Error(e) => {
                    error!("Fatal error: {}", e);
                    // Stop all vCPUs and exit.
                    let _ = self.exit_vcpus();
                    self.stop(i32::from(FC_EXIT_CODE_BAD_CONFIGURATION));
                    unreachable!()
                }
                VcpuResponse::NotAllowed(reason) => {
                    return Err(MicrovmStateError::NotAllowed(reason))
                }
                _ => return Err(MicrovmStateError::UnexpectedVcpuResponse),
            }
        }

        Ok(())
    }

    /// Retrieves the KVM dirty bitmap for each of the guest's memory regions.
    pub fn get_dirty_bitmap(&self) -> Result<DirtyBitmap> {
        let mut bitmap: DirtyBitmap = HashMap::new();
        self.guest_memory.with_regions_mut(
            |slot: usize, region: &GuestRegionMmap| -> Result<()> {
                let bitmap_region = self
                    .vm
                    .fd()
                    .get_dirty_log(slot as u32, region.len() as usize)
                    .map_err(Error::DirtyBitmap)?;
                bitmap.insert(slot, bitmap_region);
                Ok(())
            },
        )?;
        Ok(bitmap)
    }

    /// Enables or disables KVM dirty page tracking.
    pub fn set_dirty_page_tracking(&mut self, enable: bool) -> Result<()> {
        // This function _always_ results in an ioctl update. The VMM is stateless in the sense
        // that it's unaware of the current dirty page tracking setting.
        // The VMM's consumer will need to cache the dirty tracking setting internally. For
        // example, if this function were to be exposed through the VMM controller, the VMM
        // resources should cache the flag.
        self.vm
            .set_kvm_memory_regions(&self.guest_memory, enable)
            .map_err(Error::Vm)
    }
}

impl Subscriber for Vmm {
    /// Handle a read event (EPOLLIN).
    ///
    /// Only a plain `IN` event on the exit event fd is expected; it stops the VMM. Anything
    /// else is logged as spurious and ignored.
    fn process(&mut self, event: &EpollEvent, _: &mut EventManager) {
        let source = event.fd();
        let event_set = event.event_set();

        let is_exit_event = source == self.exit_evt.as_raw_fd() && event_set == EventSet::IN;
        if !is_exit_event {
            error!("Spurious EventManager event for handler: Vmm");
            return;
        }

        let _ = self.exit_evt.read();

        // Ask the vCPUs, in order, whether one of them reported an exit code. When none
        // did, the exit was requested by the i8042 controller and FC_EXIT_CODE_OK is used.
        let mut exit_code = FC_EXIT_CODE_OK;
        for handle in &self.vcpus_handles {
            if let Ok(VcpuResponse::Exited(code)) = handle.response_receiver().try_recv() {
                exit_code = code;
                break;
            }
        }

        self.stop(i32::from(exit_code));
    }

    /// Registers interest in read events on the exit event fd only.
    fn interest_list(&self) -> Vec<EpollEvent> {
        let exit_fd = self.exit_evt.as_raw_fd() as u64;
        vec![EpollEvent::new(EventSet::IN, exit_fd)]
    }
}
