Commit 0306ebb
Page fault: add implementation of fault handlers
This change adds a page fault handler implementation for the /dev/nvidiaXXX and /dev/nvidia-uvm special files.
1 parent 85a0679 commit 0306ebb

9 files changed: +328 -16 lines

kernel-open/common/inc/nv-nanos.h

Lines changed: 6 additions & 0 deletions

@@ -677,6 +677,12 @@ static inline sysreturn nv_io_remap_page_range(vmap vm, NvU64 phys_addr, NvU64 s
     return virt;
 }
 
+static inline status nv_insert_pfn(vmap vma, NvU64 virt_addr, NvU64 pfn, NvU32 extra_prot)
+{
+    map(virt_addr, pfn, PAGESIZE, pageflags_from_vmflags(vma->flags));
+    return STATUS_OK;
+}
+
 #define NV_GET_CURRENT_PROCESS() ({ thread t = current; int pid = t ? t->p->pid : 0; pid; })
 #define NV_COPY_TO_USER(to, from, n) (copy_to_user(to, from, n) == false)
 #define NV_COPY_FROM_USER(to, from, n) (copy_from_user(from, to, n) == false)
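Note (reviewer sketch, not part of the commit): nv_insert_pfn() gives the /dev/nvidiaXXX fault handlers a single-page mapping primitive whose protections are derived from the vmap flags. A minimal illustration of how a fault callback could use it, assuming the vma->fault hook signature installed in uvm.c below; nv_example_phys_for() is a made-up placeholder for the device's physical-address lookup:

// Hypothetical example only: resolve a fault by mapping one page.
static status nv_example_fault(process p, context ctx, u64 vaddr, vmap vma, pending_fault *pf)
{
    u64 page_addr = vaddr & ~(PAGESIZE - 1);           /* round fault address down to a page boundary */
    NvU64 phys = nv_example_phys_for(vma, page_addr);  /* placeholder: look up the backing physical page */

    /* nv_insert_pfn() calls map() with pageflags derived from vma->flags */
    return nv_insert_pfn(vma, page_addr, phys, 0);
}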

kernel-open/nvidia-uvm/uvm.c

Lines changed: 33 additions & 0 deletions

@@ -173,6 +173,36 @@ static NV_STATUS uvm_api_mm_initialize(UVM_MM_INITIALIZE_PARAMS *params, fdesc f
     return status;
 }
 
+static status uvm_vm_fault_sigbus(process p, context ctx, u64 vaddr, vmap vma, pending_fault *pf)
+{
+    return timm("result", "sigbus");
+}
+
+closure_func_basic(thunk, void, uvm_vm_fault_async)
+{
+    pending_fault pf = struct_from_closure(pending_fault, async_handler);
+    uvm_va_space_t *va_space = pf->custom;
+
+    pf->custom = uvm_va_space_cpu_fault_managed(va_space, pf->ctx, pf->addr);
+    thunk complete = (thunk)&pf->complete;
+    apply(complete);
+}
+
+static status uvm_vm_fault(process p, context ctx, u64 vaddr, vmap vm, pending_fault *pf)
+{
+    if (!*pf) {
+        pending_fault new_pf = new_pending_fault_locked(p, ctx, vaddr);
+        if (new_pf != INVALID_ADDRESS) {
+            new_pf->type = PENDING_FAULT_CUSTOM;
+            new_pf->custom = uvm_va_space_get(vm->fd);
+            init_closure_func(&new_pf->async_handler, thunk, uvm_vm_fault_async);
+        }
+        *pf = new_pf;
+        return STATUS_OK;
+    }
+    return (*pf)->custom;
+}
+
 closure_func_basic(fdesc_mmap, sysreturn, uvm_mmap,
                    vmap vma, u64 offset)
 {
@@ -207,6 +237,8 @@ closure_func_basic(fdesc_mmap, sysreturn, uvm_mmap,
         return -EINVAL;
     }
 
+    vma->fault = uvm_vm_fault;
+
     // This identity assignment is needed so uvm_vm_open can find its parent vma
     uvm_vma_wrapper_t *vma_wrapper = uvm_vma_wrapper_alloc(vma);
     if (!vma_wrapper) {
@@ -238,6 +270,7 @@ closure_func_basic(fdesc_mmap, sysreturn, uvm_mmap,
         va_range->type == UVM_VA_RANGE_TYPE_SEMAPHORE_POOL) {
         uvm_vma_wrapper_destroy(vma_wrapper);
         vma_wrapper_allocated = false;
+        vma->fault = uvm_vm_fault_sigbus;
         status = uvm_mem_map_cpu_user(va_range->semaphore_pool.mem, va_range->va_space, vma);
     }
 }
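The new uvm_vm_fault() hook services faults in two passes: the first invocation parks the fault (allocating a pending_fault, stashing the uvm_va_space_t in pf->custom and installing uvm_vm_fault_async()), and a later invocation returns whatever status the async handler left behind. A toy, userspace-only model of that handshake (illustrative names, no Nanos APIs), assuming the kernel re-invokes the hook after the completion thunk fires:

#include <stdio.h>

typedef struct { int result; } toy_pending_fault;

/* First call: no pending fault yet, so park it and report "in progress" (0).
 * Second call: the async worker has filled in a result, so return it. */
static int toy_fault_hook(toy_pending_fault **pf, toy_pending_fault *storage)
{
    if (!*pf) {
        *pf = storage;
        return 0;
    }
    return (*pf)->result;
}

/* Stand-in for uvm_vm_fault_async() calling uvm_va_space_cpu_fault_managed(). */
static void toy_async_worker(toy_pending_fault *pf)
{
    pf->result = 42;
}

int main(void)
{
    toy_pending_fault storage = {0}, *pf = NULL;
    printf("first call:  %d\n", toy_fault_hook(&pf, &storage)); /* 0: fault parked  */
    toy_async_worker(pf);
    printf("second call: %d\n", toy_fault_hook(&pf, &storage)); /* 42: async result */
    return 0;
}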

kernel-open/nvidia-uvm/uvm_gpu.h

Lines changed: 1 addition & 1 deletion

@@ -139,7 +139,7 @@ struct uvm_service_block_context_struct
         // section.
         unsigned long notifier_seq;
 
-        struct vm_fault *vmf;
+        context ctx;
     } cpu_fault;
 
     //
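Since the service context no longer carries a struct vm_fault, downstream code recovers per-fault information from the stored Nanos context instead. For example, the pid lookup in uvm_va_block.c below resolves the faulting thread like this (shown in isolation; the helper name is illustrative):

/* Illustrative helper mirroring the ctx handling added in uvm_va_block.c. */
static thread uvm_fault_ctx_to_thread(context ctx)
{
    return (ctx->type == CONTEXT_TYPE_SYSCALL) ? ((syscall_context)ctx)->t : (thread)ctx;
}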

kernel-open/nvidia-uvm/uvm_va_block.c

Lines changed: 35 additions & 5 deletions

@@ -6715,10 +6715,36 @@ NV_STATUS uvm_va_block_unmap(uvm_va_block_t *va_block,
 // each call to vm_insert_page. Multiple faults under one VMA in separate
 // blocks can be serviced concurrently, so the VMA wrapper lock is used
 // to protect access to vma->vm_page_prot.
-static NV_STATUS uvm_cpu_insert_page(NvU64 addr,
+static NV_STATUS uvm_cpu_insert_page(uvm_vma_wrapper_t *vma_wrapper, NvU64 addr,
                                      u64 page,
                                      uvm_prot_t new_prot)
 {
+    vmap vma = &vma_wrapper->vma;
+    unsigned long target_flags;
+
+    UVM_ASSERT(vma);
+
+    target_flags = vma->flags;
+
+    if (new_prot == UVM_PROT_READ_ONLY)
+        target_flags &= ~VMAP_FLAG_WRITABLE;
+
+    // Take VMA wrapper lock to check vma->vm_page_prot
+    uvm_down_read(&vma_wrapper->lock);
+
+    // Take a write lock if we need to modify the VMA vm_page_prot
+    // - vma->vm_page_prot creates writable PTEs but new prot is RO
+    // - vma->vm_page_prot creates read-only PTEs but new_prot is RW
+    if (vma->flags != target_flags) {
+        uvm_up_read(&vma_wrapper->lock);
+        uvm_down_write(&vma_wrapper->lock);
+
+        vma->flags = target_flags;
+
+        uvm_downgrade_write(&vma_wrapper->lock);
+    }
+    map(addr, page, PAGE_SIZE, pageflags_from_vmflags(target_flags));
+    uvm_up_read(&vma_wrapper->lock);
     return NV_OK;
 }
 
@@ -6810,6 +6836,7 @@ static NV_STATUS block_map_cpu_page_to(uvm_va_block_t *block,
     uvm_prot_t curr_prot = block_page_prot_cpu(block, page_index);
     uvm_va_range_t *va_range = block->va_range;
     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
+    uvm_vma_wrapper_t *vma_wrapper;
     NV_STATUS status;
     NvU64 addr;
     NvU64 page;
@@ -6856,6 +6883,7 @@ static NV_STATUS block_map_cpu_page_to(uvm_va_block_t *block,
             uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock);
         }
     }
+    vma_wrapper = va_range->managed.vma_wrapper;
 
     // Add the mapping
     addr = uvm_va_block_cpu_page_address(block, page_index);
@@ -6873,7 +6901,7 @@ static NV_STATUS block_map_cpu_page_to(uvm_va_block_t *block,
         return status;
 
     page = block_page_get(block, block_phys_page(resident_id, page_index));
-    return uvm_cpu_insert_page(addr, page, new_prot);
+    return uvm_cpu_insert_page(vma_wrapper, addr, page, new_prot);
 }
 
 // Maps the CPU to the given pages which are resident on resident_id.
@@ -10710,7 +10738,7 @@ NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block,
 // Check if we are faulting on a page with valid permissions to check if we can
 // skip fault handling. See uvm_va_block_t::cpu::fault_authorized for more
 // details
-static bool skip_cpu_fault_with_valid_permissions(uvm_va_block_t *va_block,
+static bool skip_cpu_fault_with_valid_permissions(context ctx, uvm_va_block_t *va_block,
                                                   uvm_page_index_t page_index,
                                                   uvm_fault_access_type_t fault_access_type)
 {
@@ -10724,7 +10752,8 @@ static bool skip_cpu_fault_with_valid_permissions(uvm_va_block_t *va_block,
                                           UVM_ID_CPU,
                                           uvm_fault_access_type_to_prot(fault_access_type))) {
         NvU64 now = NV_GETTIME();
-        int pid = current->p->pid;
+        thread t = (ctx->type == CONTEXT_TYPE_SYSCALL) ? ((syscall_context)ctx)->t : (thread)ctx;
+        int pid = t->p->pid;
 
         // Latch the pid/timestamp/page_index values for the first time
         if (!va_block->cpu.fault_authorized.first_fault_stamp) {
@@ -10800,7 +10829,8 @@ static NV_STATUS block_cpu_fault_locked(uvm_va_block_t *va_block,
 
     uvm_processor_mask_zero(&service_context->cpu_fault.gpus_to_check_for_ecc);
 
-    if (skip_cpu_fault_with_valid_permissions(va_block, page_index, fault_access_type))
+    if (skip_cpu_fault_with_valid_permissions(service_context->cpu_fault.ctx, va_block, page_index,
+                                              fault_access_type))
        return NV_OK;
 
     thrashing_hint = uvm_perf_thrashing_get_hint(va_block, fault_addr, UVM_ID_CPU);
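The reworked uvm_cpu_insert_page() keeps the common path under a read lock and only upgrades when the vmap flags actually need to change, dropping back to a read lock before mapping. The pattern in isolation (a sketch using the same uvm_* rwsem calls as the diff; the shared state and update are placeholders):

/* Sketch of the read -> write -> downgrade locking pattern used above. */
static void example_update_flags(uvm_rw_semaphore_t *lock, unsigned long *flags, unsigned long target)
{
    uvm_down_read(lock);                /* fast path: check under the read lock */
    if (*flags != target) {
        uvm_up_read(lock);              /* the rwsem cannot be upgraded in place... */
        uvm_down_write(lock);           /* ...so drop it and retake for writing */
        *flags = target;                /* idempotent even if another writer ran first */
        uvm_downgrade_write(lock);      /* continue holding a read lock */
    }
    /* ... do the work that only needs the read lock (e.g. the map() call) ... */
    uvm_up_read(lock);
}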

kernel-open/nvidia-uvm/uvm_va_range.c

Lines changed: 2 additions & 2 deletions

@@ -185,7 +185,7 @@ NV_STATUS uvm_va_range_create_mmap(uvm_va_space_t *va_space,
                                    uvm_va_range_t **out_va_range)
 {
     NV_STATUS status;
-    vmap vma = vma_wrapper->vma;
+    vmap vma = &vma_wrapper->vma;
     uvm_va_range_t *va_range = NULL;
 
     // Check for no overlap with HMM blocks.
@@ -1688,7 +1688,7 @@ uvm_vma_wrapper_t *uvm_vma_wrapper_alloc(vmap vma)
     if (!vma_wrapper)
         return NULL;
 
-    vma_wrapper->vma = vma;
+    runtime_memcpy(&vma_wrapper->vma, vma, sizeof(*vma));
     uvm_init_rwsem(&vma_wrapper->lock, UVM_LOCK_ORDER_LEAF);
 
     return vma_wrapper;

kernel-open/nvidia-uvm/uvm_va_range.h

Lines changed: 2 additions & 2 deletions

@@ -118,7 +118,7 @@ typedef struct
 {
     // Needed for creating CPU mappings on the va_range. Do not access this
    // directly, instead use uvm_va_range_vma and friends.
-    vmap vma;
+    struct vmap vma;
 
     uvm_rw_semaphore_t lock;
 } uvm_vma_wrapper_t;
@@ -651,7 +651,7 @@ static vmap uvm_va_range_vma(uvm_va_range_t *va_range)
     // vm_file, vm_private_data, vm_start, and vm_end are all safe to access
     // here because they can't change without the kernel calling vm_ops->open
     // or vm_ops->close, which both take va_space->lock.
-    vma = va_range->managed.vma_wrapper->vma;
+    vma = &va_range->managed.vma_wrapper->vma;
     UVM_ASSERT(vma);
     UVM_ASSERT_MSG(va_range->node.start >= vma->node.r.start,
                    "Range mismatch: va_range: [0x%lx, 0x%lx] vma: [0x%lx, 0x%lx]\n",

kernel-open/nvidia-uvm/uvm_va_space.c

Lines changed: 169 additions & 0 deletions

@@ -1818,3 +1818,172 @@ void uvm_service_block_context_exit(void)
     }
     INIT_LIST_HEAD(&g_cpu_service_block_context_list);
 }
+
+// Get a fault service context from the global list or allocate a new one if
+// there are no available entries.
+static uvm_service_block_context_t *service_block_context_cpu_alloc(void)
+{
+    uvm_service_block_context_t *service_context;
+
+    uvm_spin_lock(&g_cpu_service_block_context_list_lock);
+
+    service_context = list_first_entry_or_null(&g_cpu_service_block_context_list, uvm_service_block_context_t,
+                                               cpu_fault.service_context_list);
+
+    if (service_context)
+        list_del(&service_context->cpu_fault.service_context_list);
+
+    uvm_spin_unlock(&g_cpu_service_block_context_list_lock);
+
+    if (!service_context)
+        service_context = uvm_kvmalloc(sizeof(*service_context));
+
+    return service_context;
+}
+
+// Put a fault service context in the global list.
+static void service_block_context_cpu_free(uvm_service_block_context_t *service_context)
+{
+    uvm_spin_lock(&g_cpu_service_block_context_list_lock);
+
+    list_add(&service_context->cpu_fault.service_context_list, &g_cpu_service_block_context_list);
+
+    uvm_spin_unlock(&g_cpu_service_block_context_list_lock);
+}
+
+static status uvm_va_space_cpu_fault(uvm_va_space_t *va_space, context ctx, u64 fault_addr,
+                                     bool is_hmm)
+{
+    uvm_va_block_t *va_block;
+    bool is_write = is_write_fault(ctx->frame);
+    NV_STATUS status = uvm_global_get_status();
+    bool tools_enabled;
+    uvm_service_block_context_t *service_context;
+    uvm_global_processor_mask_t gpus_to_check_for_ecc;
+
+    if (status != NV_OK)
+        goto convert_error;
+
+    service_context = service_block_context_cpu_alloc();
+    if (!service_context) {
+        status = NV_ERR_NO_MEMORY;
+        goto convert_error;
+    }
+
+    service_context->cpu_fault.wakeup_time_stamp = 0;
+    service_context->cpu_fault.ctx = ctx;
+
+    do {
+        bool do_sleep = false;
+
+        if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
+            NvU64 now = NV_GETTIME();
+            if (now < service_context->cpu_fault.wakeup_time_stamp)
+                do_sleep = true;
+
+            if (do_sleep)
+                uvm_tools_record_throttling_start(va_space, fault_addr, UVM_ID_CPU);
+
+            // Drop the VA space lock while we sleep
+            uvm_va_space_up_read(va_space);
+
+            // usleep_range is preferred because msleep has a 20ms granularity
+            // and udelay uses a busy-wait loop. usleep_range uses
+            // high-resolution timers and, by adding a range, the Linux
+            // scheduler may coalesce our wakeup with others, thus saving some
+            // interrupts.
+            if (do_sleep) {
+                unsigned long nap_us = (service_context->cpu_fault.wakeup_time_stamp - now) / 1000;
+
+                kernel_delay(microseconds(nap_us));
+            }
+        }
+
+        uvm_va_space_down_read(va_space);
+
+        if (do_sleep)
+            uvm_tools_record_throttling_end(va_space, fault_addr, UVM_ID_CPU);
+
+        if (is_hmm) {
+            // Note that normally we should find a va_block for the faulting
+            // address because the block had to be created when migrating a
+            // page to the GPU and a device private PTE inserted into the CPU
+            // page tables in order for migrate_to_ram() to be called. Not
+            // finding it means the PTE was remapped to a different virtual
+            // address with mremap() so create a new va_block if needed.
+            status = uvm_hmm_va_block_find_create(va_space,
+                                                  fault_addr,
+                                                  &service_context->block_context.hmm.vma,
+                                                  &va_block);
+            if (status != NV_OK)
+                break;
+
+            status = uvm_hmm_migrate_begin(va_block);
+            if (status != NV_OK)
+                break;
+        }
+        else {
+            status = uvm_va_block_find_create_managed(va_space, fault_addr, &va_block);
+            if (status != NV_OK) {
+                UVM_ASSERT_MSG(status == NV_ERR_NO_MEMORY, "status: %s\n", nvstatusToString(status));
+                break;
+            }
+        }
+
+        // Loop until thrashing goes away.
+        status = uvm_va_block_cpu_fault(va_block, fault_addr, is_write, service_context);
+
+        if (is_hmm)
+            uvm_hmm_migrate_finish(va_block);
+    } while (status == NV_WARN_MORE_PROCESSING_REQUIRED);
+
+    if (status != NV_OK && !(is_hmm && status == NV_ERR_BUSY_RETRY)) {
+        UvmEventFatalReason reason;
+
+        reason = uvm_tools_status_to_fatal_fault_reason(status);
+        UVM_ASSERT(reason != UvmEventFatalReasonInvalid);
+
+        uvm_tools_record_cpu_fatal_fault(va_space, fault_addr, is_write, reason);
+    }
+
+    tools_enabled = va_space->tools.enabled;
+
+    if (status == NV_OK) {
+        uvm_va_space_global_gpus_in_mask(va_space,
+                                         &gpus_to_check_for_ecc,
+                                         &service_context->cpu_fault.gpus_to_check_for_ecc);
+        uvm_global_mask_retain(&gpus_to_check_for_ecc);
+    }
+
+    uvm_va_space_up_read(va_space);
+
+    if (status == NV_OK) {
+        status = uvm_global_mask_check_ecc_error(&gpus_to_check_for_ecc);
+        uvm_global_mask_release(&gpus_to_check_for_ecc);
+    }
+
+    if (tools_enabled)
+        uvm_tools_flush_events();
+
+    // Major faults involve I/O in order to resolve the fault.
+    // If any pages were DMA'ed between the GPU and host memory, that makes it
+    // a major fault. A process can also get statistics for major and minor
+    // faults by calling readproc().
+    service_block_context_cpu_free(service_context);
+
+convert_error:
+    switch (status) {
+        case NV_OK:
+        case NV_ERR_BUSY_RETRY:
+            return STATUS_OK;
+        case NV_ERR_NO_MEMORY:
+            return timm_oom;
+        default:
+            return timm("result", "sigbus");
+    }
+}
+
+status uvm_va_space_cpu_fault_managed(uvm_va_space_t *va_space, context ctx, u64 vaddr)
+{
+    return uvm_va_space_cpu_fault(va_space, ctx, vaddr, false);
+}
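uvm_va_space.h below also re-declares uvm_va_space_cpu_fault_hmm() with the new (context, vaddr) signature; its definition is not in the hunks shown here, but by symmetry with the managed variant it would presumably be a thin wrapper over the same helper:

/* Presumed counterpart of uvm_va_space_cpu_fault_managed() (not shown in this
 * commit's hunks): the HMM entry point with is_hmm = true. */
status uvm_va_space_cpu_fault_hmm(uvm_va_space_t *va_space, context ctx, u64 vaddr)
{
    return uvm_va_space_cpu_fault(va_space, ctx, vaddr, true);
}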

kernel-open/nvidia-uvm/uvm_va_space.h

Lines changed: 2 additions & 6 deletions

@@ -847,9 +847,7 @@ NV_STATUS uvm_test_destroy_gpu_va_space_delay(UVM_TEST_DESTROY_GPU_VA_SPACE_DELA
 // VM_FAULT_OOM: if system memory wasn't available.
 // VM_FAULT_SIGBUS: if a CPU mapping to fault_addr cannot be accessed,
 // for example because it's within a range group which is non-migratable.
-vm_fault_t uvm_va_space_cpu_fault_managed(uvm_va_space_t *va_space,
-                                          struct vm_area_struct *vma,
-                                          struct vm_fault *vmf);
+status uvm_va_space_cpu_fault_managed(uvm_va_space_t *va_space, context ctx, u64 vaddr);
 
 // Handle a CPU fault in the given VA space for a HMM allocation,
 // performing any operations necessary to establish a coherent CPU mapping
@@ -863,8 +861,6 @@ vm_fault_t uvm_va_space_cpu_fault_managed(uvm_va_space_t *va_space,
 // (possibly or'ed with VM_FAULT_MAJOR if a migration was needed).
 // VM_FAULT_OOM: if system memory wasn't available.
 // VM_FAULT_SIGBUS: if a CPU mapping to fault_addr cannot be accessed.
-vm_fault_t uvm_va_space_cpu_fault_hmm(uvm_va_space_t *va_space,
-                                      struct vm_area_struct *vma,
-                                      struct vm_fault *vmf);
+status uvm_va_space_cpu_fault_hmm(uvm_va_space_t *va_space, context ctx, u64 vaddr);
 
 #endif // __UVM_VA_SPACE_H__
