@@ -87,7 +87,7 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
 static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
-		unsigned long start, unsigned long end);
+		unsigned long start, unsigned long end, bool take_locks);
 static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
 
 static void hugetlb_free_folio(struct folio *folio)
@@ -1218,7 +1218,7 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma)
 /*
  * Reset and decrement one ref on hugepage private reservation.
  * Called with mm->mmap_lock writer semaphore held.
- * This function should be only used by move_vma() and operate on
+ * This function should be only used by mremap and operate on
  * same sized vma. It should never come here with last ref on the
  * reservation.
  */
@@ -5093,26 +5093,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
 {
 	if (addr & ~(huge_page_mask(hstate_vma(vma))))
 		return -EINVAL;
+	return 0;
+}
 
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
 	/*
 	 * PMD sharing is only possible for PUD_SIZE-aligned address ranges
 	 * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
 	 * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+	 * This function is called in the middle of a VMA split operation, with
+	 * MM, VMA and rmap all write-locked to prevent concurrent page table
+	 * walks (except hardware and gup_fast()).
 	 */
+	vma_assert_write_locked(vma);
+	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
 	if (addr & ~PUD_MASK) {
-		/*
-		 * hugetlb_vm_op_split is called right before we attempt to
-		 * split the VMA. We will need to unshare PMDs in the old and
-		 * new VMAs, so let's unshare before we split.
-		 */
 		unsigned long floor = addr & PUD_MASK;
 		unsigned long ceil = floor + PUD_SIZE;
 
-		if (floor >= vma->vm_start && ceil <= vma->vm_end)
-			hugetlb_unshare_pmds(vma, floor, ceil);
+		if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+			/*
+			 * Locking:
+			 * Use take_locks=false here.
+			 * The file rmap lock is already held.
+			 * The hugetlb VMA lock can't be taken when we already
+			 * hold the file rmap lock, and we don't need it because
+			 * its purpose is to synchronize against concurrent page
+			 * table walks, which are not possible thanks to the
+			 * locks held by our caller.
+			 */
+			hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+		}
 	}
-
-	return 0;
 }
 
 static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
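hugetlb_split() only asserts its locking preconditions; it relies on the split path to have taken the locks already. For orientation, here is a minimal sketch of the calling convention it expects. It is illustrative only: split_vma_example() is a made-up name, the real caller lives in the VMA split code, and error handling is omitted.

/*
 * Illustrative sketch of the locking hugetlb_split() expects from its
 * caller; split_vma_example() is not the real split path.
 */
static void split_vma_example(struct vm_area_struct *vma, unsigned long addr)
{
	/* The caller is assumed to hold mmap_lock for writing already. */
	vma_start_write(vma);				/* write-lock the VMA */
	i_mmap_lock_write(vma->vm_file->f_mapping);	/* file rmap lock */

	/* Unshare PUD-shared page tables before the VMA boundary moves. */
	hugetlb_split(vma, addr);

	/* ... the actual split happens here, locks still held ... */

	i_mmap_unlock_write(vma->vm_file->f_mapping);
}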
@@ -7265,6 +7279,13 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 		return 0;
 
 	pud_clear(pud);
+	/*
+	 * Once our caller drops the rmap lock, some other process might be
+	 * using this page table as a normal, non-hugetlb page table.
+	 * Wait for pending gup_fast() in other threads to finish before letting
+	 * that happen.
+	 */
+	tlb_remove_table_sync_one();
 	put_page(virt_to_page(ptep));
 	mm_dec_nr_pmds(mm);
 	return 1;
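The tlb_remove_table_sync_one() call is needed because gup_fast() walks page tables with only interrupts disabled, not under the rmap or VMA locks. Conceptually the synchronization is an empty IPI broadcast: once every CPU has serviced the interrupt, no lockless walker can still be dereferencing the unshared PMD table. The snippet below is a simplified illustration of that idea, not the real implementation (which also has to cover the RCU-based table freeing configuration).

/* Conceptual sketch only; not the actual tlb_remove_table_sync_one(). */
static void ipi_nop(void *ignored)
{
	/* Nothing to do; arrival of the IPI is the synchronization point. */
}

static void sync_against_gup_fast_example(void)
{
	/*
	 * gup_fast() runs with IRQs disabled, so waiting for a no-op IPI
	 * on every CPU (wait == 1) guarantees that all such walkers which
	 * could see the old shared PMD table have finished.
	 */
	on_each_cpu(ipi_nop, NULL, 1);
}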
@@ -7497,9 +7518,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
 	}
 }
 
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 				 unsigned long start,
-				 unsigned long end)
+				 unsigned long end,
+				 bool take_locks)
 {
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
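The @take_locks comment above describes two calling conventions, and the hunks below switch the lock and unlock sites between them. Both conventions are summarized in the sketch below; the wrapper function is illustrative, and only the file rmap lock of the caller-held case is shown.

/* Illustrative only: the two ways the reworked helper can be invoked. */
static void unshare_pmds_usage_example(struct vm_area_struct *vma)
{
	/* Standalone: let the helper take the VMA lock and file rmap lock. */
	hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
			     ALIGN_DOWN(vma->vm_end, PUD_SIZE),
			     /* take_locks = */ true);

	/*
	 * Nested (as in hugetlb_split()): the caller already holds the file
	 * rmap lock and otherwise excludes concurrent page table walks, so
	 * the helper must only assert, never acquire, the locks.
	 */
	i_mmap_lock_write(vma->vm_file->f_mapping);
	hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
			     ALIGN_DOWN(vma->vm_end, PUD_SIZE),
			     /* take_locks = */ false);
	i_mmap_unlock_write(vma->vm_file->f_mapping);
}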
@@ -7523,8 +7551,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
 				start, end);
 	mmu_notifier_invalidate_range_start(&range);
-	hugetlb_vma_lock_write(vma);
-	i_mmap_lock_write(vma->vm_file->f_mapping);
+	if (take_locks) {
+		hugetlb_vma_lock_write(vma);
+		i_mmap_lock_write(vma->vm_file->f_mapping);
+	} else {
+		i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+	}
 	for (address = start; address < end; address += PUD_SIZE) {
 		ptep = hugetlb_walk(vma, address, sz);
 		if (!ptep)
@@ -7534,8 +7566,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 		spin_unlock(ptl);
 	}
 	flush_hugetlb_tlb_range(vma, start, end);
-	i_mmap_unlock_write(vma->vm_file->f_mapping);
-	hugetlb_vma_unlock_write(vma);
+	if (take_locks) {
+		i_mmap_unlock_write(vma->vm_file->f_mapping);
+		hugetlb_vma_unlock_write(vma);
+	}
 	/*
 	 * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
 	 * Documentation/mm/mmu_notifier.rst.
@@ -7550,7 +7584,22 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 {
 	hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
-			ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+			ALIGN_DOWN(vma->vm_end, PUD_SIZE),
+			/* take_locks = */ true);
+}
+
+/*
+ * For hugetlb, mremap() is an odd edge case - while the VMA copying is
+ * performed, we permit both the old and new VMAs to reference the same
+ * reservation.
+ *
+ * We fix this up after the operation succeeds, or if a newly allocated VMA
+ * is closed as a result of a failure to allocate memory.
+ */
+void fixup_hugetlb_reservations(struct vm_area_struct *vma)
+{
+	if (is_vm_hugetlb_page(vma))
+		clear_vma_resv_huge_pages(vma);
 }
 
 #ifdef CONFIG_CMA
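fixup_hugetlb_reservations() gives the mremap path a single place to drop the temporarily duplicated reservation reference. A hypothetical call site could look like the sketch below; the surrounding function is an assumption for illustration, and only the fixup call itself reflects the patch.

/* Hypothetical mremap-side caller; only fixup_hugetlb_reservations() is real. */
static void finish_move_example(struct vm_area_struct *old_vma)
{
	/* ... page tables have been moved to the new VMA at this point ... */

	/*
	 * The old and new VMAs temporarily referenced the same hugetlb
	 * reservation; drop the old VMA's reference now that the move is
	 * resolved. For non-hugetlb VMAs this is a no-op.
	 */
	fixup_hugetlb_reservations(old_vma);
}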