Commit 0306ebb
Page fault: add implementation of fault handlers
This change adds a page fault handler implementation for the /dev/nvidiaXXX and /dev/nvidia-uvm special files.
1 parent 85a0679 commit 0306ebb

9 files changed: +328 -16 lines

kernel-open/common/inc/nv-nanos.h

Lines changed: 6 additions & 0 deletions

@@ -677,6 +677,12 @@ static inline sysreturn nv_io_remap_page_range(vmap vm, NvU64 phys_addr, NvU64 s
     return virt;
 }
 
+static inline status nv_insert_pfn(vmap vma, NvU64 virt_addr, NvU64 pfn, NvU32 extra_prot)
+{
+    map(virt_addr, pfn, PAGESIZE, pageflags_from_vmflags(vma->flags));
+    return STATUS_OK;
+}
+
 #define NV_GET_CURRENT_PROCESS() ({ thread t = current; int pid = t ? t->p->pid : 0; pid; })
 #define NV_COPY_TO_USER(to, from, n) (copy_to_user(to, from, n) == false)
 #define NV_COPY_FROM_USER(to, from, n) (copy_from_user(from, to, n) == false)
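Note (reviewer sketch, not part of the commit): nv_insert_pfn() gives the /dev/nvidiaXXX fault handlers a single-page mapping primitive whose protections are derived from the vmap flags. A minimal illustration of how a fault callback could use it, assuming the vma->fault hook signature installed in uvm.c below; nv_example_phys_for() is a made-up placeholder for the device's physical-address lookup:

// Hypothetical example only: resolve a fault by mapping one page.
static status nv_example_fault(process p, context ctx, u64 vaddr, vmap vma, pending_fault *pf)
{
    u64 page_addr = vaddr & ~(PAGESIZE - 1);           /* round fault address down to a page boundary */
    NvU64 phys = nv_example_phys_for(vma, page_addr);  /* placeholder: look up the backing physical page */

    /* nv_insert_pfn() calls map() with pageflags derived from vma->flags */
    return nv_insert_pfn(vma, page_addr, phys, 0);
}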

kernel-open/nvidia-uvm/uvm.c

Lines changed: 33 additions & 0 deletions

@@ -173,6 +173,36 @@ static NV_STATUS uvm_api_mm_initialize(UVM_MM_INITIALIZE_PARAMS *params, fdesc f
     return status;
 }
 
+static status uvm_vm_fault_sigbus(process p, context ctx, u64 vaddr, vmap vma, pending_fault *pf)
+{
+    return timm("result", "sigbus");
+}
+
+closure_func_basic(thunk, void, uvm_vm_fault_async)
+{
+    pending_fault pf = struct_from_closure(pending_fault, async_handler);
+    uvm_va_space_t *va_space = pf->custom;
+
+    pf->custom = uvm_va_space_cpu_fault_managed(va_space, pf->ctx, pf->addr);
+    thunk complete = (thunk)&pf->complete;
+    apply(complete);
+}
+
+static status uvm_vm_fault(process p, context ctx, u64 vaddr, vmap vm, pending_fault *pf)
+{
+    if (!*pf) {
+        pending_fault new_pf = new_pending_fault_locked(p, ctx, vaddr);
+        if (new_pf != INVALID_ADDRESS) {
+            new_pf->type = PENDING_FAULT_CUSTOM;
+            new_pf->custom = uvm_va_space_get(vm->fd);
+            init_closure_func(&new_pf->async_handler, thunk, uvm_vm_fault_async);
+        }
+        *pf = new_pf;
+        return STATUS_OK;
+    }
+    return (*pf)->custom;
+}
+
 closure_func_basic(fdesc_mmap, sysreturn, uvm_mmap,
                    vmap vma, u64 offset)
 {
@@ -207,6 +237,8 @@ closure_func_basic(fdesc_mmap, sysreturn, uvm_mmap,
         return -EINVAL;
     }
 
+    vma->fault = uvm_vm_fault;
+
     // This identity assignment is needed so uvm_vm_open can find its parent vma
     uvm_vma_wrapper_t *vma_wrapper = uvm_vma_wrapper_alloc(vma);
     if (!vma_wrapper) {
@@ -238,6 +270,7 @@ closure_func_basic(fdesc_mmap, sysreturn, uvm_mmap,
         va_range->type == UVM_VA_RANGE_TYPE_SEMAPHORE_POOL) {
         uvm_vma_wrapper_destroy(vma_wrapper);
         vma_wrapper_allocated = false;
+        vma->fault = uvm_vm_fault_sigbus;
         status = uvm_mem_map_cpu_user(va_range->semaphore_pool.mem, va_range->va_space, vma);
     }
 }
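The new uvm_vm_fault() hook services faults in two passes: the first invocation parks the fault (allocating a pending_fault, stashing the uvm_va_space_t in pf->custom and installing uvm_vm_fault_async()), and a later invocation returns whatever status the async handler left behind. A toy, userspace-only model of that handshake (illustrative names, no Nanos APIs), assuming the kernel re-invokes the hook after the completion thunk fires:

#include <stdio.h>

typedef struct { int result; } toy_pending_fault;

/* First call: no pending fault yet, so park it and report "in progress" (0).
 * Second call: the async worker has filled in a result, so return it. */
static int toy_fault_hook(toy_pending_fault **pf, toy_pending_fault *storage)
{
    if (!*pf) {
        *pf = storage;
        return 0;
    }
    return (*pf)->result;
}

/* Stand-in for uvm_vm_fault_async() calling uvm_va_space_cpu_fault_managed(). */
static void toy_async_worker(toy_pending_fault *pf)
{
    pf->result = 42;
}

int main(void)
{
    toy_pending_fault storage = {0}, *pf = NULL;
    printf("first call:  %d\n", toy_fault_hook(&pf, &storage)); /* 0: fault parked  */
    toy_async_worker(pf);
    printf("second call: %d\n", toy_fault_hook(&pf, &storage)); /* 42: async result */
    return 0;
}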

kernel-open/nvidia-uvm/uvm_gpu.h

Lines changed: 1 addition & 1 deletion

@@ -139,7 +139,7 @@ struct uvm_service_block_context_struct
         // section.
         unsigned long notifier_seq;
 
-        struct vm_fault *vmf;
+        context ctx;
     } cpu_fault;
 
     //
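Since the service context no longer carries a struct vm_fault, downstream code recovers per-fault information from the stored Nanos context instead. For example, the pid lookup in uvm_va_block.c below resolves the faulting thread like this (shown in isolation; the helper name is illustrative):

/* Illustrative helper mirroring the ctx handling added in uvm_va_block.c. */
static thread uvm_fault_ctx_to_thread(context ctx)
{
    return (ctx->type == CONTEXT_TYPE_SYSCALL) ? ((syscall_context)ctx)->t : (thread)ctx;
}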

kernel-open/nvidia-uvm/uvm_va_block.c

Lines changed: 35 additions & 5 deletions

@@ -6715,10 +6715,36 @@ NV_STATUS uvm_va_block_unmap(uvm_va_block_t *va_block,
 // each call to vm_insert_page. Multiple faults under one VMA in separate
 // blocks can be serviced concurrently, so the VMA wrapper lock is used
 // to protect access to vma->vm_page_prot.
-static NV_STATUS uvm_cpu_insert_page(NvU64 addr,
+static NV_STATUS uvm_cpu_insert_page(uvm_vma_wrapper_t *vma_wrapper, NvU64 addr,
                                      u64 page,
                                      uvm_prot_t new_prot)
 {
+    vmap vma = &vma_wrapper->vma;
+    unsigned long target_flags;
+
+    UVM_ASSERT(vma);
+
+    target_flags = vma->flags;
+
+    if (new_prot == UVM_PROT_READ_ONLY)
+        target_flags &= ~VMAP_FLAG_WRITABLE;
+
+    // Take VMA wrapper lock to check vma->vm_page_prot
+    uvm_down_read(&vma_wrapper->lock);
+
+    // Take a write lock if we need to modify the VMA vm_page_prot
+    // - vma->vm_page_prot creates writable PTEs but new prot is RO
+    // - vma->vm_page_prot creates read-only PTEs but new_prot is RW
+    if (vma->flags != target_flags) {
+        uvm_up_read(&vma_wrapper->lock);
+        uvm_down_write(&vma_wrapper->lock);
+
+        vma->flags = target_flags;
+
+        uvm_downgrade_write(&vma_wrapper->lock);
+    }
+    map(addr, page, PAGE_SIZE, pageflags_from_vmflags(target_flags));
+    uvm_up_read(&vma_wrapper->lock);
     return NV_OK;
 }
 
@@ -6810,6 +6836,7 @@ static NV_STATUS block_map_cpu_page_to(uvm_va_block_t *block,
     uvm_prot_t curr_prot = block_page_prot_cpu(block, page_index);
     uvm_va_range_t *va_range = block->va_range;
     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
+    uvm_vma_wrapper_t *vma_wrapper;
     NV_STATUS status;
     NvU64 addr;
     NvU64 page;
@@ -6856,6 +6883,7 @@ static NV_STATUS block_map_cpu_page_to(uvm_va_block_t *block,
             uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock);
         }
     }
+    vma_wrapper = va_range->managed.vma_wrapper;
 
     // Add the mapping
     addr = uvm_va_block_cpu_page_address(block, page_index);
@@ -6873,7 +6901,7 @@ static NV_STATUS block_map_cpu_page_to(uvm_va_block_t *block,
         return status;
 
     page = block_page_get(block, block_phys_page(resident_id, page_index));
-    return uvm_cpu_insert_page(addr, page, new_prot);
+    return uvm_cpu_insert_page(vma_wrapper, addr, page, new_prot);
 }
 
 // Maps the CPU to the given pages which are resident on resident_id.
@@ -10710,7 +10738,7 @@ NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block,
 // Check if we are faulting on a page with valid permissions to check if we can
 // skip fault handling. See uvm_va_block_t::cpu::fault_authorized for more
 // details
-static bool skip_cpu_fault_with_valid_permissions(uvm_va_block_t *va_block,
+static bool skip_cpu_fault_with_valid_permissions(context ctx, uvm_va_block_t *va_block,
                                                   uvm_page_index_t page_index,
                                                   uvm_fault_access_type_t fault_access_type)
 {
@@ -10724,7 +10752,8 @@ static bool skip_cpu_fault_with_valid_permissions(uvm_va_block_t *va_block,
                                           UVM_ID_CPU,
                                           uvm_fault_access_type_to_prot(fault_access_type))) {
         NvU64 now = NV_GETTIME();
-        int pid = current->p->pid;
+        thread t = (ctx->type == CONTEXT_TYPE_SYSCALL) ? ((syscall_context)ctx)->t : (thread)ctx;
+        int pid = t->p->pid;
 
         // Latch the pid/timestamp/page_index values for the first time
         if (!va_block->cpu.fault_authorized.first_fault_stamp) {
@@ -10800,7 +10829,8 @@ static NV_STATUS block_cpu_fault_locked(uvm_va_block_t *va_block,
 
     uvm_processor_mask_zero(&service_context->cpu_fault.gpus_to_check_for_ecc);
 
-    if (skip_cpu_fault_with_valid_permissions(va_block, page_index, fault_access_type))
+    if (skip_cpu_fault_with_valid_permissions(service_context->cpu_fault.ctx, va_block, page_index,
+                                              fault_access_type))
        return NV_OK;
 
     thrashing_hint = uvm_perf_thrashing_get_hint(va_block, fault_addr, UVM_ID_CPU);
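The reworked uvm_cpu_insert_page() keeps the common path under a read lock and only upgrades when the vmap flags actually need to change, dropping back to a read lock before mapping. The pattern in isolation (a sketch using the same uvm_* rwsem calls as the diff; the shared state and update are placeholders):

/* Sketch of the read -> write -> downgrade locking pattern used above. */
static void example_update_flags(uvm_rw_semaphore_t *lock, unsigned long *flags, unsigned long target)
{
    uvm_down_read(lock);                /* fast path: check under the read lock */
    if (*flags != target) {
        uvm_up_read(lock);              /* the rwsem cannot be upgraded in place... */
        uvm_down_write(lock);           /* ...so drop it and retake for writing */
        *flags = target;                /* idempotent even if another writer ran first */
        uvm_downgrade_write(lock);      /* continue holding a read lock */
    }
    /* ... do the work that only needs the read lock (e.g. the map() call) ... */
    uvm_up_read(lock);
}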

kernel-open/nvidia-uvm/uvm_va_range.c

Lines changed: 2 additions & 2 deletions

@@ -185,7 +185,7 @@ NV_STATUS uvm_va_range_create_mmap(uvm_va_space_t *va_space,
                                    uvm_va_range_t **out_va_range)
 {
     NV_STATUS status;
-    vmap vma = vma_wrapper->vma;
+    vmap vma = &vma_wrapper->vma;
     uvm_va_range_t *va_range = NULL;
 
     // Check for no overlap with HMM blocks.
@@ -1688,7 +1688,7 @@ uvm_vma_wrapper_t *uvm_vma_wrapper_alloc(vmap vma)
     if (!vma_wrapper)
         return NULL;
 
-    vma_wrapper->vma = vma;
+    runtime_memcpy(&vma_wrapper->vma, vma, sizeof(*vma));
     uvm_init_rwsem(&vma_wrapper->lock, UVM_LOCK_ORDER_LEAF);
 
     return vma_wrapper;

kernel-open/nvidia-uvm/uvm_va_range.h

Lines changed: 2 additions & 2 deletions

@@ -118,7 +118,7 @@ typedef struct
 {
     // Needed for creating CPU mappings on the va_range. Do not access this
    // directly, instead use uvm_va_range_vma and friends.
-    vmap vma;
+    struct vmap vma;
 
     uvm_rw_semaphore_t lock;
 } uvm_vma_wrapper_t;
@@ -651,7 +651,7 @@ static vmap uvm_va_range_vma(uvm_va_range_t *va_range)
     // vm_file, vm_private_data, vm_start, and vm_end are all safe to access
     // here because they can't change without the kernel calling vm_ops->open
     // or vm_ops->close, which both take va_space->lock.
-    vma = va_range->managed.vma_wrapper->vma;
+    vma = &va_range->managed.vma_wrapper->vma;
     UVM_ASSERT(vma);
     UVM_ASSERT_MSG(va_range->node.start >= vma->node.r.start,
                    "Range mismatch: va_range: [0x%lx, 0x%lx] vma: [0x%lx, 0x%lx]\n",

kernel-open/nvidia-uvm/uvm_va_space.c

Lines changed: 169 additions & 0 deletions

@@ -1818,3 +1818,172 @@ void uvm_service_block_context_exit(void)
     }
     INIT_LIST_HEAD(&g_cpu_service_block_context_list);
 }
+
+// Get a fault service context from the global list or allocate a new one if
+// there are no available entries.
+static uvm_service_block_context_t *service_block_context_cpu_alloc(void)
+{
+    uvm_service_block_context_t *service_context;
+
+    uvm_spin_lock(&g_cpu_service_block_context_list_lock);
+
+    service_context = list_first_entry_or_null(&g_cpu_service_block_context_list, uvm_service_block_context_t,
+                                               cpu_fault.service_context_list);
+
+    if (service_context)
+        list_del(&service_context->cpu_fault.service_context_list);
+
+    uvm_spin_unlock(&g_cpu_service_block_context_list_lock);
+
+    if (!service_context)
+        service_context = uvm_kvmalloc(sizeof(*service_context));
+
+    return service_context;
+}
+
+// Put a fault service context in the global list.
+static void service_block_context_cpu_free(uvm_service_block_context_t *service_context)
+{
+    uvm_spin_lock(&g_cpu_service_block_context_list_lock);
+
+    list_add(&service_context->cpu_fault.service_context_list, &g_cpu_service_block_context_list);
+
+    uvm_spin_unlock(&g_cpu_service_block_context_list_lock);
+}
+
+static status uvm_va_space_cpu_fault(uvm_va_space_t *va_space, context ctx, u64 fault_addr,
+                                     bool is_hmm)
+{
+    uvm_va_block_t *va_block;
+    bool is_write = is_write_fault(ctx->frame);
+    NV_STATUS status = uvm_global_get_status();
+    bool tools_enabled;
+    uvm_service_block_context_t *service_context;
+    uvm_global_processor_mask_t gpus_to_check_for_ecc;
+
+    if (status != NV_OK)
+        goto convert_error;
+
+    service_context = service_block_context_cpu_alloc();
+    if (!service_context) {
+        status = NV_ERR_NO_MEMORY;
+        goto convert_error;
+    }
+
+    service_context->cpu_fault.wakeup_time_stamp = 0;
+    service_context->cpu_fault.ctx = ctx;
+
+    do {
+        bool do_sleep = false;
+
+        if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
+            NvU64 now = NV_GETTIME();
+            if (now < service_context->cpu_fault.wakeup_time_stamp)
+                do_sleep = true;
+
+            if (do_sleep)
+                uvm_tools_record_throttling_start(va_space, fault_addr, UVM_ID_CPU);
+
+            // Drop the VA space lock while we sleep
+            uvm_va_space_up_read(va_space);
+
+            // usleep_range is preferred because msleep has a 20ms granularity
+            // and udelay uses a busy-wait loop. usleep_range uses
+            // high-resolution timers and, by adding a range, the Linux
+            // scheduler may coalesce our wakeup with others, thus saving some
+            // interrupts.
+            if (do_sleep) {
+                unsigned long nap_us = (service_context->cpu_fault.wakeup_time_stamp - now) / 1000;
+
+                kernel_delay(microseconds(nap_us));
+            }
+        }
+
+        uvm_va_space_down_read(va_space);
+
+        if (do_sleep)
+            uvm_tools_record_throttling_end(va_space, fault_addr, UVM_ID_CPU);
+
+        if (is_hmm) {
+            // Note that normally we should find a va_block for the faulting
+            // address because the block had to be created when migrating a
+            // page to the GPU and a device private PTE inserted into the CPU
+            // page tables in order for migrate_to_ram() to be called. Not
+            // finding it means the PTE was remapped to a different virtual
+            // address with mremap() so create a new va_block if needed.
+            status = uvm_hmm_va_block_find_create(va_space,
+                                                  fault_addr,
+                                                  &service_context->block_context.hmm.vma,
+                                                  &va_block);
+            if (status != NV_OK)
+                break;
+
+            status = uvm_hmm_migrate_begin(va_block);
+            if (status != NV_OK)
+                break;
+        }
+        else {
+            status = uvm_va_block_find_create_managed(va_space, fault_addr, &va_block);
+            if (status != NV_OK) {
+                UVM_ASSERT_MSG(status == NV_ERR_NO_MEMORY, "status: %s\n", nvstatusToString(status));
+                break;
+            }
+        }
+
+        // Loop until thrashing goes away.
+        status = uvm_va_block_cpu_fault(va_block, fault_addr, is_write, service_context);
+
+        if (is_hmm)
+            uvm_hmm_migrate_finish(va_block);
+    } while (status == NV_WARN_MORE_PROCESSING_REQUIRED);
+
+    if (status != NV_OK && !(is_hmm && status == NV_ERR_BUSY_RETRY)) {
+        UvmEventFatalReason reason;
+
+        reason = uvm_tools_status_to_fatal_fault_reason(status);
+        UVM_ASSERT(reason != UvmEventFatalReasonInvalid);
+
+        uvm_tools_record_cpu_fatal_fault(va_space, fault_addr, is_write, reason);
+    }
+
+    tools_enabled = va_space->tools.enabled;
+
+    if (status == NV_OK) {
+        uvm_va_space_global_gpus_in_mask(va_space,
+                                         &gpus_to_check_for_ecc,
+                                         &service_context->cpu_fault.gpus_to_check_for_ecc);
+        uvm_global_mask_retain(&gpus_to_check_for_ecc);
+    }
+
+    uvm_va_space_up_read(va_space);
+
+    if (status == NV_OK) {
+        status = uvm_global_mask_check_ecc_error(&gpus_to_check_for_ecc);
+        uvm_global_mask_release(&gpus_to_check_for_ecc);
+    }
+
+    if (tools_enabled)
+        uvm_tools_flush_events();
+
+    // Major faults involve I/O in order to resolve the fault.
+    // If any pages were DMA'ed between the GPU and host memory, that makes it
+    // a major fault. A process can also get statistics for major and minor
+    // faults by calling readproc().
+    service_block_context_cpu_free(service_context);
+
+convert_error:
+    switch (status) {
+        case NV_OK:
+        case NV_ERR_BUSY_RETRY:
+            return STATUS_OK;
+        case NV_ERR_NO_MEMORY:
+            return timm_oom;
+        default:
+            return timm("result", "sigbus");
+    }
+}
+
+status uvm_va_space_cpu_fault_managed(uvm_va_space_t *va_space, context ctx, u64 vaddr)
+{
+    return uvm_va_space_cpu_fault(va_space, ctx, vaddr, false);
+}
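uvm_va_space.h below also re-declares uvm_va_space_cpu_fault_hmm() with the new (context, vaddr) signature; its definition is not in the hunks shown here, but by symmetry with the managed variant it would presumably be a thin wrapper over the same helper:

/* Presumed counterpart of uvm_va_space_cpu_fault_managed() (not shown in this
 * commit's hunks): the HMM entry point with is_hmm = true. */
status uvm_va_space_cpu_fault_hmm(uvm_va_space_t *va_space, context ctx, u64 vaddr)
{
    return uvm_va_space_cpu_fault(va_space, ctx, vaddr, true);
}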

kernel-open/nvidia-uvm/uvm_va_space.h

Lines changed: 2 additions & 6 deletions

@@ -847,9 +847,7 @@ NV_STATUS uvm_test_destroy_gpu_va_space_delay(UVM_TEST_DESTROY_GPU_VA_SPACE_DELA
 // VM_FAULT_OOM: if system memory wasn't available.
 // VM_FAULT_SIGBUS: if a CPU mapping to fault_addr cannot be accessed,
 // for example because it's within a range group which is non-migratable.
-vm_fault_t uvm_va_space_cpu_fault_managed(uvm_va_space_t *va_space,
-                                          struct vm_area_struct *vma,
-                                          struct vm_fault *vmf);
+status uvm_va_space_cpu_fault_managed(uvm_va_space_t *va_space, context ctx, u64 vaddr);
 
 // Handle a CPU fault in the given VA space for a HMM allocation,
 // performing any operations necessary to establish a coherent CPU mapping
@@ -863,8 +861,6 @@ vm_fault_t uvm_va_space_cpu_fault_managed(uvm_va_space_t *va_space,
 // (possibly or'ed with VM_FAULT_MAJOR if a migration was needed).
 // VM_FAULT_OOM: if system memory wasn't available.
 // VM_FAULT_SIGBUS: if a CPU mapping to fault_addr cannot be accessed.
-vm_fault_t uvm_va_space_cpu_fault_hmm(uvm_va_space_t *va_space,
-                                      struct vm_area_struct *vma,
-                                      struct vm_fault *vmf);
+status uvm_va_space_cpu_fault_hmm(uvm_va_space_t *va_space, context ctx, u64 vaddr);
 
 #endif // __UVM_VA_SPACE_H__
