Luna/kernel/src/thread/Scheduler.cpp
apio c604c074a1 Kernel: Rename ASSERT() to ensure()
Doesn't get stripped on release builds, so it shouldn't be named assert.
2022-11-02 19:38:15 +01:00

669 lines
20 KiB
C++

#define MODULE "sched"
#include "thread/Scheduler.h"
#include "interrupts/Interrupts.h"
#include "log/Log.h"
#include "memory/MemoryManager.h"
#include "memory/PMM.h"
#include "memory/VMM.h"
#include "misc/hang.h"
#include "misc/reboot.h"
#include "misc/utils.h"
#include "panic/Panic.h"
#include "std/assert.h"
#include "std/errno.h"
#include "std/stdlib.h"
#include "std/string.h"
#include "sys/UserMemory.h"
#include "sys/elf/ELFLoader.h"
#include "thread/PIT.h"
#include "thread/Task.h"
#include "utils/Addresses.h"
#include "utils/Registers.h"
// Number of tasks currently tracked by the scheduler (incremented on spawn,
// decremented in reap_task).
static uint64_t task_num = 0;
// Statically-allocated idle task (PID 0); runs when nothing else is Running.
static Task idle_task;
// Next PID to hand out; monotonically increasing, except that the PID of the
// most recently spawned task is reclaimed if that task dies or fails to spawn.
static uint64_t free_pid = 0;
// The task whose context is currently on the CPU.
static Task* sched_current_task;
// Head and tail of the circular doubly-linked list of schedulable tasks.
// The idle task is NOT part of this list.
static Task* base_task;
static Task* end_task;
// Entry point of the idle loop; implemented elsewhere (assembly, per the C linkage).
extern "C" void idle_task_function();
// Milliseconds per scheduler tick; computed in Scheduler::init from the PIT rate.
static uint64_t frequency;
template <typename Callback> void sched_for_each_task(Callback callback)
{
Task* task = base_task;
if (!task) return;
do {
bool will_continue = callback(task);
if (!will_continue) break;
task = task->next_task;
} while (task != base_task);
}
// Invokes `callback` for every task whose parent is `task`. Tasks that are
// not children are skipped; the callback's return value controls early exit
// (false stops the underlying queue walk).
template <typename Callback> void sched_for_each_child(Task* task, Callback callback)
{
    sched_for_each_task([&](Task* candidate) {
        if (candidate->ppid != task->id) return true; // not our child, keep scanning
        return callback(candidate);
    });
}
// Looks up a task in the run queue by PID.
// Returns nullptr when no task with that PID exists.
Task* Scheduler::find_by_pid(uint64_t pid)
{
    Task* found = nullptr;
    sched_for_each_task([&](Task* candidate) {
        if (candidate->id != pid) return true; // no match, keep searching
        found = candidate;
        return false; // match found, stop the walk
    });
    return found;
}
// Inserts `task` at the tail of the circular doubly-linked run queue,
// keeping base_task (head) and end_task (tail) consistent.
void Scheduler::append_task(Task* task)
{
    if (base_task)
    {
        // Splice the new task in between the current tail and the head.
        task->prev_task = end_task;
        task->next_task = base_task;
        end_task->next_task = task;
        base_task->prev_task = task;
        end_task = task;
    }
    else
    {
        // Empty queue: the new task becomes the only element, linked to itself.
        ensure(!end_task);
        base_task = end_task = task;
        task->next_task = task->prev_task = task;
    }
}
// Initialises the scheduler: builds the statically-allocated idle task
// (PID 0, running idle_task_function with interrupts enabled) and caches
// the tick period derived from the PIT. Must run before any other
// Scheduler function.
void Scheduler::init()
{
memset(&idle_task, 0, sizeof(Task));
idle_task.id = free_pid++; // idle task always claims PID 0
idle_task.regs.rip = (uint64_t)idle_task_function;
idle_task.regs.rsp = get_top_of_stack((uint64_t)MemoryManager::get_page(), 1); // single-page stack
idle_task.regs.cs = 0x08; // kernel code segment
idle_task.regs.ss = 0x10; // kernel data segment
idle_task.regs.rflags = (1 << 21) | (1 << 9); // ID flag + IF (interrupts enabled)
idle_task.task_sleep = 1000;
idle_task.user_task = false;
idle_task.block_reason = BlockReason::None;
idle_task.state = idle_task.Idle;
strlcpy(idle_task.name, "[cpu-idle]", sizeof(idle_task.name));
sched_current_task = &idle_task;
// Milliseconds per PIT tick. NOTE(review): integer division — a PIT rate
// above 1000 Hz would make this 0; confirm PIT::frequency() <= 1000.
frequency = 1000 / PIT::frequency();
}
// Spawns a kernel-mode task executing `task` on a freshly allocated
// TASK_PAGES_IN_STACK-page kernel stack. The task is appended to the run
// queue in the Running state and will be picked up on the next yield/tick.
void Scheduler::add_kernel_task(const char* taskname, void (*task)(void))
{
Task* new_task = new Task;
ensure(new_task);
new_task->user_task = false;
new_task->id = free_pid++;
new_task->ppid = 0; // kernel tasks are parented to PID 0
new_task->uid = new_task->euid = new_task->gid = new_task->egid = 0; // root credentials
new_task->regs.rip = (uint64_t)task;
// NOTE(review): get_pages() result is not checked; a failed allocation
// would leave a null stack pointer here — confirm it cannot fail this early.
new_task->allocated_stack =
(uint64_t)MemoryManager::get_pages(TASK_PAGES_IN_STACK); // 16 KB is enough for everyone, right?
new_task->regs.rsp = get_top_of_stack(new_task->allocated_stack, TASK_PAGES_IN_STACK);
// Kernel segment selectors.
new_task->regs.cs = 0x08;
new_task->regs.ss = 0x10;
new_task->regs.ds = 0x10;
new_task->regs.rflags = read_rflags() | 0x200; // enable interrupts
new_task->task_sleep = 0;
new_task->task_time = 0;
new_task->cpu_time = 0;
strlcpy(new_task->name, taskname, sizeof(new_task->name));
append_task(new_task);
new_task->block_reason = BlockReason::None;
new_task->state = new_task->Running;
task_num++;
kinfoln("Adding kernel task: %s, starts at %lx, PID %ld, stack at %lx, total tasks: %ld", new_task->name,
new_task->regs.rip, new_task->id, new_task->regs.rsp, task_num);
}
// Allocates and registers a bare user-task shell: zeroed registers, a fresh
// PID, and a spot in the run queue. The caller is responsible for filling in
// registers, stack, address space and name before the task may run.
// Returns nullptr when allocation fails.
Task* Scheduler::create_user_task()
{
    Task* task = new Task;
    if (!task) return nullptr;
    memset(&task->regs, 0, sizeof(Context));
    task->user_task = true;
    task->id = free_pid++;
    task->ppid = 0;
    task->task_sleep = task->task_time = task->cpu_time = 0;
    task->block_reason = BlockReason::None;
    append_task(task);
    task_num++;
    return task;
}
// Loads an ELF executable from the filesystem and spawns it as a new user
// task (ring 3, own address space, TASK_PAGES_IN_STACK-page stack at 1 MiB).
// Returns the new task's PID on success or a negative errno on failure.
// Interrupts are disabled for the duration; every path restores the previous
// interrupt state and leaves the CPU on the kernel address space.
long Scheduler::load_user_task(const char* filename)
{
    kinfoln("Loading user task: %s", filename);
    Interrupts::push_and_disable();
    Task* new_task = new Task;
    ensure(new_task);
    memset(&new_task->regs, 0, sizeof(Context));
    new_task->id = free_pid++;
    new_task->ppid = 0;
    new_task->uid = new_task->euid = new_task->gid = new_task->egid = 0; // root credentials
    if (!new_task->allocator.init())
    {
        delete new_task;
        free_pid--; // the task never spawned, so its PID can be reused
        Interrupts::pop();
        return -ENOMEM;
    }
    new_task->address_space = AddressSpace::create();
    VMM::switch_to_user_address_space(new_task->address_space);
    long result;
    if ((result = ELFLoader::check_elf_image_from_filesystem(filename)) < 0)
    {
        kerrorln("Failed to load %s from initrd", filename);
        // FIX: this path used to leak the freshly created address space and
        // leave the CPU on it; switch back and tear it down before bailing.
        VMM::switch_back_to_kernel_address_space();
        new_task->address_space.destroy();
        delete new_task;
        free_pid--;
        Interrupts::pop();
        return result;
    }
    if ((uint64_t)result > PMM::get_free())
    {
        kerrorln("Not enough memory for task %s", filename);
        // FIX: same leak as above — destroy the address space and switch back.
        VMM::switch_back_to_kernel_address_space();
        new_task->address_space.destroy();
        delete new_task;
        free_pid--;
        Interrupts::pop();
        return -ENOMEM;
    }
    ELFImage* image = ELFLoader::load_elf_from_filesystem(filename);
    ensure(image);
    new_task->user_task = true;
    new_task->regs.rip = image->entry;
    new_task->image = image;
    // User stack, owned by the task so it dies with the address space.
    new_task->allocated_stack = (uint64_t)MemoryManager::get_pages_at(
        0x100000, TASK_PAGES_IN_STACK, MAP_READ_WRITE | MAP_USER | MAP_AS_OWNED_BY_TASK); // 16 KB is enough for everyone, right?
    if (!new_task->allocated_stack)
    {
        new_task->address_space.destroy();
        delete new_task;
        free_pid--;
        ELFLoader::release_elf_image(image);
        VMM::switch_back_to_kernel_address_space();
        Interrupts::pop();
        return -ENOMEM;
    }
    new_task->regs.rsp = get_top_of_stack(new_task->allocated_stack, TASK_PAGES_IN_STACK);
    // Ring-3 segment selectors (RPL = 3).
    new_task->regs.cs = 0x18 | 0x03;
    new_task->regs.ss = 0x20 | 0x03;
    new_task->regs.ds = 0x20 | 0x03;
    new_task->regs.rflags = (1 << 21) | (1 << 9); // enable interrupts
    new_task->task_sleep = 0;
    new_task->task_time = 0;
    new_task->cpu_time = 0;
    strlcpy(new_task->name, filename, sizeof(new_task->name));
    append_task(new_task);
    new_task->block_reason = BlockReason::None;
    new_task->state = new_task->Running;
    task_num++;
    kinfoln("Adding user task: %s, loaded at %lx, PID %ld, stack at %lx, total tasks: %ld", new_task->name,
            new_task->regs.rip, new_task->id, new_task->regs.rsp, task_num);
    VMM::switch_back_to_kernel_address_space();
    Interrupts::pop();
    return (long)new_task->id;
}
// Reinitialises an existing task to run a freshly loaded ELF image (the
// exec-over-existing-process path). The existing stack allocation is reused;
// registers and accounting are reset so execution starts at the new entry.
void Scheduler::reset_task(Task* task, ELFImage* new_image)
{
    memset(&task->regs, 0, sizeof(Context));
    task->state = task->Running;
    task->image = new_image;
    task->regs.rip = new_image->entry;
    task->regs.rsp = get_top_of_stack(task->allocated_stack, TASK_PAGES_IN_STACK);
    // Ring-3 segment selectors (RPL = 3).
    task->regs.cs = 0x18 | 0x03;
    task->regs.ss = 0x20 | 0x03;
    task->regs.ds = 0x20 | 0x03;
    task->regs.rflags = (1 << 21) | (1 << 9); // enable interrupts
    task->task_sleep = 0;
    task->cpu_time = 0;
    task->block_reason = BlockReason::None;
    kinfoln("Resetting task: %s, loaded at %lx, PID %ld, stack at %lx, total tasks: %ld", task->name, task->regs.rip,
            task->id, task->regs.rsp, task_num);
}
// Frees all resources owned by an already-exited (and already-unlinked) task
// and destroys it. Must be called outside an interrupt handler, since the
// address-space teardown temporarily re-enables interrupts. Never called on
// the idle task (PID 0).
void Scheduler::reap_task(Task* task)
{
ensure(!Interrupts::is_in_handler());
task_num--;
Task* exiting_task = task;
ensure(task->id != 0); // WHY IN THE WORLD WOULD WE BE REAPING THE IDLE TASK?
// For user tasks, activate the task's own address space before touching its
// per-task state. NOTE(review): the switch-back/apply pair before switching
// appears to flush the current mapping first — confirm against VMM semantics.
if (exiting_task->is_user_task())
{
VMM::switch_back_to_kernel_address_space();
VMM::apply_address_space();
VMM::switch_to_user_address_space(exiting_task->address_space);
}
kinfoln("reaping task %s, PID %ld, exited with code %ld", exiting_task->name, exiting_task->id,
exiting_task->exit_status);
if (exiting_task->id == (free_pid - 1)) free_pid--; // If we are the last spawned thread, free our PID.
// Only kernel stacks are released here; user stacks were mapped with
// MAP_AS_OWNED_BY_TASK and are presumably freed with the address space.
if (exiting_task->allocated_stack && !exiting_task->is_user_task())
MemoryManager::release_pages((void*)exiting_task->allocated_stack, TASK_PAGES_IN_STACK);
if (exiting_task->image) kfree(exiting_task->image);
if (exiting_task->is_user_task())
{
exiting_task->allocator.free();
VMM::switch_back_to_kernel_address_space();
VMM::apply_address_space();
// Address-space destruction runs with interrupts enabled.
Interrupts::push_and_enable();
exiting_task->address_space.destroy();
Interrupts::pop();
}
// Close any file descriptors the task still had open.
for (int i = 0; i < TASK_MAX_FDS; i++) { exiting_task->files[i].close(); }
delete exiting_task;
}
// Shared tail of task_exit()/task_misbehave(): marks the current task dead,
// reparents its children to init (PID 1), and yields so it never runs again.
// If init itself exits, the system reboots (or hangs in test builds).
void sched_common_exit(Context* context, int64_t status)
{
// init is marked Exited (reaped directly); everyone else becomes Dying so a
// parent can still collect its status via waitpid().
if (sched_current_task->id == 1) sched_current_task->state = sched_current_task->Exited;
else
sched_current_task->state = sched_current_task->Dying;
sched_current_task->exit_status = status;
if (sched_current_task->id != 1)
{
// Orphaned (still-live) children are adopted by init.
sched_for_each_child(sched_current_task, [](Task* child) {
if (child->state != child->Exited) child->ppid = 1;
return true;
});
}
else
{
#ifndef RUN_TEST_AS_INIT
reboot();
#else
hang();
#endif
}
// Hand the CPU to the next runnable task; this task is no longer Running.
Scheduler::task_yield(context);
}
// Syscall backend for a normal exit: logs CPU-time accounting, then runs the
// shared exit sequence for the current task. Must run in an interrupt handler.
void Scheduler::task_exit(Context* context, int64_t status)
{
    ensure(Interrupts::is_in_handler());
    Task* task = sched_current_task;
    kdbgln("exit: task %ld finished running, used %ld ms of cpu time", task->id, task->cpu_time);
    sched_common_exit(context, status);
}
// Terminates a task that did something illegal (e.g. faulted); identical to
// task_exit except for the log message. Must run in an interrupt handler.
void Scheduler::task_misbehave(Context* context, int64_t status)
{
    ensure(Interrupts::is_in_handler());
    Task* task = sched_current_task;
    kdbgln("exit: task %ld misbehaved, used %ld ms of cpu time", task->id, task->cpu_time);
    sched_common_exit(context, status);
}
// Walks the run queue once, unlinks every Exited task into a private
// singly-linked "reap list", then frees them all via reap_task(). The scan
// runs with interrupts disabled so the queue cannot change underneath us.
void Scheduler::reap_tasks()
{
Interrupts::disable();
ensure(!Interrupts::is_in_handler());
Task* reap_base = nullptr; // head of the detached reap list
Task* reap_end = nullptr; // tail of the detached reap list
Task* task = base_task; // NOTE(review): assumes base_task is non-null here — confirm
Task* task_reaping;
uint64_t iter_index = 0;
do {
if (task->state == task->Exited)
{
// Keep base_task/end_task valid while unlinking this node.
if (task == base_task && task == end_task) { panic("Last task exited"); }
else if (task == base_task) { base_task = task->next_task; }
else if (task == end_task) { end_task = task->prev_task; }
if (!reap_base)
{
// First dead task found: it starts the reap list.
reap_base = task;
reap_end = task;
task->prev_task->next_task = task->next_task;
task->next_task->prev_task = task->prev_task;
task->prev_task = nullptr;
task_reaping = task;
task = task->next_task; // advance BEFORE severing next_task below
task_reaping->next_task = nullptr;
}
else
{
// Subsequent dead tasks are appended at the reap list's tail.
reap_end->next_task = task;
task->prev_task->next_task = task->next_task;
task->next_task->prev_task = task->prev_task;
task->prev_task = nullptr;
reap_end = task;
task_reaping = task;
task = task->next_task;
task_reaping->next_task = nullptr;
}
}
else { task = task->next_task; }
iter_index++;
} while (iter_index < task_num); // visit exactly task_num nodes
// Free everything on the reap list; reap_task() decrements task_num itself.
task = reap_base;
while (task)
{
Task* reaped_task = task;
task = task->next_task;
reap_task(reaped_task);
}
Interrupts::enable();
}
// Called once per tick: advances every task's sleep countdown by one tick
// period and wakes any Sleeping task whose countdown has expired.
static void sched_decrement_sleep_times()
{
    sched_for_each_task([](Task* task) {
        if (task->task_sleep > 0)
        {
            task->task_sleep -= frequency;
            if (task->task_sleep < 0) task->task_sleep = 0; // clamp, don't go negative
        }
        bool done_sleeping = task->task_sleep == 0 && task->state == task->Sleeping;
        if (done_sleeping) task->state = task->Running;
        return true; // always visit every task
    });
}
// PIT tick handler: updates sleep and CPU-time accounting, then preempts the
// current task once its time slice (task_time) is exhausted.
void Scheduler::task_tick(Context* context)
{
ensure(Interrupts::is_in_handler());
Interrupts::disable();
sched_decrement_sleep_times();
sched_current_task->task_time -= frequency;
sched_current_task->cpu_time += frequency;
// The idle task (PID 0) gives up the CPU on every tick.
if (sched_current_task->id == 0) return task_yield(context);
if (sched_current_task->task_time <= 0)
{
sched_current_task->task_time = 0;
task_yield(context);
}
// NOTE(review): on the yield paths above, interrupts are not re-enabled
// here — presumably restored by the interrupt-return path. Confirm.
Interrupts::enable();
}
// Core scheduling decision: saves the outgoing task's context, round-robins
// over the run queue for the next Running task (re-checking and resuming any
// Blocking tasks whose condition has cleared), and restores the chosen task's
// context into `context`. Falls back to the idle task if nothing is runnable.
void Scheduler::task_yield(Context* context)
{
ensure(Interrupts::is_in_handler());
Interrupts::disable();
sched_current_task->save_context(context);
bool was_idle = false;
// The idle task is not in the run queue; reposition to the queue tail so the
// scan below starts at base_task (the tail's successor).
if (sched_current_task->state == sched_current_task->Idle)
{
sched_current_task = end_task;
was_idle = true;
}
Task* original_task = sched_current_task;
do {
sched_current_task = sched_current_task->next_task;
if (sched_current_task->state == sched_current_task->Blocking)
{
// Block condition cleared? resume() puts the task back in Running.
if (!sched_current_task->is_still_blocking()) sched_current_task->resume();
}
if (sched_current_task->state == sched_current_task->Running)
{
// Only do the expensive state switching when actually changing tasks.
if (sched_current_task->id != original_task->id || was_idle)
{
// Save FPU/SSE state of the task we are leaving (if it's a live user task).
if (!was_idle && original_task->is_user_task() && !original_task->has_died())
{
original_task->save_floating();
}
if (sched_current_task->is_user_task())
{
sched_current_task->switch_to_address_space();
sched_current_task->restore_floating();
}
// user -> kernel transition: go back to the kernel's mappings.
else if (!was_idle && original_task->is_user_task() && !sched_current_task->is_user_task())
{
VMM::switch_back_to_kernel_address_space();
VMM::apply_address_space();
}
}
sched_current_task->task_time = 20; // fresh time slice (same unit as frequency: ms)
sched_current_task->restore_context(context);
return;
}
} while (sched_current_task != original_task);
// Full lap with nothing runnable: park the CPU in the idle task.
if (!was_idle && original_task->is_user_task() && original_task->state != original_task->Exited)
{
original_task->save_floating();
}
sched_current_task = &idle_task;
sched_current_task->task_time = frequency; // idle runs for exactly one tick
// If we were already idle, the saved context is the idle task's own — no
// restore needed.
if (!was_idle) { sched_current_task->restore_context(context); }
return;
}
// Voluntarily gives up the CPU: raises the scheduler software interrupt
// (vector 0x42) with function code 1 (yield) in rax.
void Scheduler::yield()
{
asm volatile("int $0x42" : : "a"(1));
}
// Requests termination of the calling task: scheduler interrupt 0x42 with
// function code 0 (exit) in rax and the exit status in rdi.
void Scheduler::exit(int status)
{
asm volatile("int $0x42" : : "a"(0), "D"(status));
}
// Puts the calling task to sleep for `ms` milliseconds: scheduler interrupt
// 0x42 with function code 2 (sleep) in rax and the duration in rdi.
void Scheduler::sleep(unsigned long ms)
{
asm volatile("int $0x42" : : "D"(ms), "a"(2));
}
// Returns the task currently executing on the CPU (may be the idle task).
Task* Scheduler::current_task()
{
return sched_current_task;
}
#define WNOHANG 1
// Syscall backend for waitpid(). pid == -1 waits for any child; a positive
// pid waits for that specific child. With WNOHANG set, returns 0 immediately
// when no child has exited; otherwise the caller blocks (BlockReason::Waiting)
// until Task::is_wait_still_blocking()/resume_wait() complete the wait.
// On success rax holds the collected child's PID; errors are negative errno
// values (-ECHILD, -EFAULT).
void sys_waitpid(Context* context, long pid, int* wstatus,
int options) // FIXME: only allow waiting for child processes when specifying a PID.
{
Task* child = nullptr;
if (pid == -1)
{
// Wait-for-any: look for a child that has already died (Dying = waiting
// to be collected).
sched_for_each_child(sched_current_task, [&](Task* task) {
if (task->state == task->Dying)
{
child = task;
return false;
}
return true;
});
if (!child)
{
if (options & WNOHANG)
{
context->rax = 0; // No child has exited, let's return 0.
return;
}
// Pin the user's wstatus pointer now so resume_wait() can write through
// it later without re-validating.
int* kwstatus;
if (wstatus)
{
kwstatus = obtain_user_ref(wstatus);
if (!kwstatus)
{
context->rax = -EFAULT;
return;
}
}
kdbgln("blocking wait on any child");
// Record what we're waiting for and block until a child dies.
sched_current_task->state = sched_current_task->Blocking;
sched_current_task->block_reason = BlockReason::Waiting;
sched_current_task->blocking_wait_info.pid = -1;
if (wstatus) sched_current_task->blocking_wait_info.wstatus = kwstatus;
else
sched_current_task->blocking_wait_info.wstatus = nullptr;
return Scheduler::task_yield(context);
}
}
else
{
child = Scheduler::find_by_pid(pid);
if (!child)
{
context->rax = -ECHILD;
return;
}
}
if (child->ppid != sched_current_task->id)
{
// We are trying to call waitpid() on a task that isn't a child of ours. This is not allowed.
context->rax = -ECHILD;
return;
}
// Child exists but hasn't died yet: either bail out (WNOHANG) or block on it.
if (child->state != child->Dying)
{
if (options & WNOHANG)
{
context->rax = 0; // No child has exited, let's return 0.
return;
}
int* kwstatus;
if (wstatus)
{
kwstatus = obtain_user_ref(wstatus);
if (!kwstatus)
{
context->rax = -EFAULT;
return;
}
}
sched_current_task->state = sched_current_task->Blocking;
sched_current_task->block_reason = BlockReason::Waiting;
sched_current_task->blocking_wait_info.pid = pid;
if (wstatus) sched_current_task->blocking_wait_info.wstatus = kwstatus;
else
sched_current_task->blocking_wait_info.wstatus = nullptr;
return Scheduler::task_yield(context);
}
// Fast path: the child is already dead — copy its status out and mark it
// Exited so the reaper can free it.
if (wstatus)
{
int* kwstatus = obtain_user_ref(wstatus);
if (kwstatus)
{
*kwstatus = (int)(child->exit_status & 0xff);
release_user_ref(kwstatus);
}
else
{
kinfoln("wstatus ptr is invalid: %p", (void*)wstatus);
// NOTE(review): the child is still marked Exited even though we return
// -EFAULT, so its status is lost — confirm this is intended.
child->state = child->Exited;
context->rax = -EFAULT;
return;
}
}
child->state = child->Exited;
context->rax = (long)child->id;
}
// Part of the Blocking-state machinery: returns true while a waitpid()-blocked
// task must stay blocked, false once a matching child has died and can be
// collected by resume_wait().
// NOTE(review): this scans the children of sched_current_task rather than
// `this`; the caller (task_yield) sets sched_current_task to this task before
// asking, so the two coincide — confirm before adding other callers.
bool Task::is_wait_still_blocking()
{
Task* child = nullptr;
if (blocking_wait_info.pid == -1)
{
// Waiting for any child: pick the first one found in the Dying state.
sched_for_each_child(sched_current_task, [&](Task* task) {
if (task->state == task->Dying)
{
child = task;
return false;
}
return true;
});
if (!child) return true;
else
{
blocking_wait_info.pid = child->id; // We're committed to this child now.
return false;
}
}
else
{
// Waiting for a specific PID.
child = Scheduler::find_by_pid(blocking_wait_info.pid);
ensure(child); // since sys_waitpid should have validated this child, and the only way for it to disappear from
// the process list is for someone to wait for it, this should be pretty safe.
if (child->state != child->Dying) return true;
else
return false;
}
}
// Completes a blocked waitpid() after is_wait_still_blocking() returned false:
// writes the child's exit status to the user buffer (if one was pinned in
// sys_waitpid), marks the child Exited so the reaper can free it, and places
// the syscall return value (the child's PID) in this task's saved registers.
void Task::resume_wait()
{
ensure(blocking_wait_info.pid != -1); // is_wait_still_blocking should have chosen a child for us if the user
// process told us to wait for any child.
Task* child = Scheduler::find_by_pid(blocking_wait_info.pid);
ensure(child); // This should also already have been validated.
if (blocking_wait_info.wstatus)
{
// wstatus was pinned with obtain_user_ref() back in sys_waitpid.
*blocking_wait_info.wstatus = (int)(child->exit_status & 0xff);
release_user_ref(blocking_wait_info.wstatus);
}
child->state = child->Exited;
regs.rax = (long)child->id;
}
// Process-status record copied out to userspace by sys_pstat().
// Layout must match the userspace definition of struct pstat.
struct pstat
{
long pt_pid; // process ID
long pt_ppid; // parent process ID
char pt_name[128]; // task name (NUL-terminated, truncated to fit)
int pt_state; // scheduler state, cast from the Task state enum
long pt_time; // accumulated CPU time in milliseconds
uid_t pt_uid; // real user ID
gid_t pt_gid; // real group ID
};
// Syscall backend for pstat(): copies scheduling information about a task
// into the user-supplied buffer. pid == 0 selects the idle task, pid == -1
// the most recently spawned task, anything else is a direct PID lookup.
// rax receives the task's PID on success, -ESRCH or -EFAULT on failure.
void sys_pstat(Context* context, long pid, struct pstat* buf)
{
    Task* target;
    if (pid == 0) target = &idle_task;
    else if (pid == -1)
        target = Scheduler::find_by_pid(free_pid - 1);
    else
        target = Scheduler::find_by_pid(pid);
    // A missing task, or one that has exited and is only awaiting the reaper,
    // is reported as "no such process".
    if (!target || target->state == target->Exited)
    {
        context->rax = -ESRCH;
        return;
    }
    if (buf)
    {
        struct pstat* kpstat = obtain_user_ref(buf);
        if (!kpstat)
        {
            context->rax = -EFAULT;
            return;
        }
        kpstat->pt_pid = target->id;
        kpstat->pt_ppid = target->ppid;
        kpstat->pt_state = (int)target->state;
        kpstat->pt_time = (long)target->cpu_time;
        kpstat->pt_uid = target->uid;
        kpstat->pt_gid = target->gid;
        strlcpy(kpstat->pt_name, target->name, sizeof(kpstat->pt_name));
        release_user_ref(kpstat);
    }
    context->rax = target->id;
}