1 |
From 1d8d14641fd94a01b20a4abbf2749fd8eddcf57b Mon Sep 17 00:00:00 2001 |
2 |
From: David Hildenbrand <david@redhat.com> |
3 |
Date: Thu, 11 Aug 2022 12:34:35 +0200 |
4 |
Subject: mm/hugetlb: support write-faults in shared mappings |
5 |
|
6 |
From: David Hildenbrand <david@redhat.com> |
7 |
|
8 |
commit 1d8d14641fd94a01b20a4abbf2749fd8eddcf57b upstream. |
9 |
|
10 |
If we ever get a write-fault on a write-protected page in a shared |
11 |
mapping, we'd be in trouble (again). Instead, we can simply map the page |
12 |
writable. |
13 |
|
14 |
And in fact, there is even a way right now to trigger that code via |
15 |
uffd-wp ever since we started to support it for shmem in 5.19: |
16 |
|
17 |
-------------------------------------------------------------------------- |
18 |
#include <stdio.h> |
19 |
#include <stdlib.h> |
20 |
#include <string.h> |
21 |
#include <fcntl.h> |
22 |
#include <unistd.h> |
23 |
#include <errno.h> |
24 |
#include <sys/mman.h> |
25 |
#include <sys/syscall.h> |
26 |
#include <sys/ioctl.h> |
27 |
#include <linux/userfaultfd.h> |
28 |
|
29 |
/* Length of the mapping under test: one 2 MiB hugetlb page. */
#define HUGETLB_SIZE (2 * 1024 * 1024u)

/* Start of the shared hugetlb mapping; assigned by main() via mmap(). */
static char *map;
/*
 * userfaultfd descriptor; assigned by temp_setup_uffd(). File-local like
 * `map`: nothing outside this translation unit refers to it.
 */
static int uffd;
33 |
|
34 |
static int temp_setup_uffd(void) |
35 |
{ |
36 |
struct uffdio_api uffdio_api; |
37 |
struct uffdio_register uffdio_register; |
38 |
struct uffdio_writeprotect uffd_writeprotect; |
39 |
struct uffdio_range uffd_range; |
40 |
|
41 |
uffd = syscall(__NR_userfaultfd, |
42 |
O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); |
43 |
if (uffd < 0) { |
44 |
fprintf(stderr, "syscall() failed: %d\n", errno); |
45 |
return -errno; |
46 |
} |
47 |
|
48 |
uffdio_api.api = UFFD_API; |
49 |
uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP; |
50 |
if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) { |
51 |
fprintf(stderr, "UFFDIO_API failed: %d\n", errno); |
52 |
return -errno; |
53 |
} |
54 |
|
55 |
if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) { |
56 |
fprintf(stderr, "UFFD_FEATURE_WRITEPROTECT missing\n"); |
57 |
return -ENOSYS; |
58 |
} |
59 |
|
60 |
/* Register UFFD-WP */ |
61 |
uffdio_register.range.start = (unsigned long) map; |
62 |
uffdio_register.range.len = HUGETLB_SIZE; |
63 |
uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; |
64 |
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) { |
65 |
fprintf(stderr, "UFFDIO_REGISTER failed: %d\n", errno); |
66 |
return -errno; |
67 |
} |
68 |
|
69 |
/* Writeprotect a single page. */ |
70 |
uffd_writeprotect.range.start = (unsigned long) map; |
71 |
uffd_writeprotect.range.len = HUGETLB_SIZE; |
72 |
uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP; |
73 |
if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) { |
74 |
fprintf(stderr, "UFFDIO_WRITEPROTECT failed: %d\n", errno); |
75 |
return -errno; |
76 |
} |
77 |
|
78 |
/* Unregister UFFD-WP without prior writeunprotection. */ |
79 |
uffd_range.start = (unsigned long) map; |
80 |
uffd_range.len = HUGETLB_SIZE; |
81 |
if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_range)) { |
82 |
fprintf(stderr, "UFFDIO_UNREGISTER failed: %d\n", errno); |
83 |
return -errno; |
84 |
} |
85 |
|
86 |
return 0; |
87 |
} |
88 |
|
89 |
int main(int argc, char **argv) |
90 |
{ |
91 |
int fd; |
92 |
|
93 |
fd = open("/dev/hugepages/tmp", O_RDWR | O_CREAT); |
94 |
if (!fd) { |
95 |
fprintf(stderr, "open() failed\n"); |
96 |
return -errno; |
97 |
} |
98 |
if (ftruncate(fd, HUGETLB_SIZE)) { |
99 |
fprintf(stderr, "ftruncate() failed\n"); |
100 |
return -errno; |
101 |
} |
102 |
|
103 |
map = mmap(NULL, HUGETLB_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); |
104 |
if (map == MAP_FAILED) { |
105 |
fprintf(stderr, "mmap() failed\n"); |
106 |
return -errno; |
107 |
} |
108 |
|
109 |
*map = 0; |
110 |
|
111 |
if (temp_setup_uffd()) |
112 |
return 1; |
113 |
|
114 |
*map = 0; |
115 |
|
116 |
return 0; |
117 |
} |
118 |
-------------------------------------------------------------------------- |
119 |
|
120 |
Above test fails with SIGBUS when there is only a single free hugetlb page. |
121 |
# echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages |
122 |
# ./test |
123 |
Bus error (core dumped) |
124 |
|
125 |
And worse, with sufficient free hugetlb pages it will map an anonymous page |
126 |
into a shared mapping, for example, messing up accounting during unmap |
127 |
and breaking MAP_SHARED semantics: |
128 |
# echo 2 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages |
129 |
# ./test |
130 |
# cat /proc/meminfo | grep HugePages_ |
131 |
HugePages_Total: 2 |
132 |
HugePages_Free: 1 |
133 |
HugePages_Rsvd: 18446744073709551615 |
134 |
HugePages_Surp: 0 |
135 |
|
136 |
Reason is that uffd-wp doesn't clear the uffd-wp PTE bit when |
137 |
unregistering and consequently keeps the PTE writeprotected. Reason for |
138 |
this is to avoid the additional overhead when unregistering. Note that |
139 |
this is the case also for !hugetlb and that we will end up with writable |
140 |
PTEs that still have the uffd-wp PTE bit set once we return from |
141 |
hugetlb_wp(). I'm not touching the uffd-wp PTE bit for now, because it |
142 |
seems to be a generic thing -- wp_page_reuse() also doesn't clear it. |
143 |
|
144 |
VM_MAYSHARE handling in hugetlb_fault() for FAULT_FLAG_WRITE indicates |
145 |
that MAP_SHARED handling was at least envisioned, but could never have |
146 |
worked as expected. |
147 |
|
148 |
While at it, make sure that we never end up in hugetlb_wp() on write |
149 |
faults without VM_WRITE, because we don't support maybe_mkwrite() |
150 |
semantics as commonly used in the !hugetlb case -- for example, in |
151 |
wp_page_reuse(). |
152 |
|
153 |
Note that there is no need to do any kind of reservation in |
154 |
hugetlb_fault() in this case ... because we already have a hugetlb page |
155 |
mapped R/O that we will simply map writable and we are not dealing with |
156 |
COW/unsharing. |
157 |
|
158 |
Link: https://lkml.kernel.org/r/20220811103435.188481-3-david@redhat.com |
159 |
Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs") |
160 |
Signed-off-by: David Hildenbrand <david@redhat.com> |
161 |
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com> |
162 |
Cc: Bjorn Helgaas <bhelgaas@google.com> |
163 |
Cc: Cyrill Gorcunov <gorcunov@openvz.org> |
164 |
Cc: Hugh Dickins <hughd@google.com> |
165 |
Cc: Jamie Liu <jamieliu@google.com> |
166 |
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> |
167 |
Cc: Muchun Song <songmuchun@bytedance.com> |
168 |
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> |
169 |
Cc: Pavel Emelyanov <xemul@parallels.com> |
170 |
Cc: Peter Feiner <pfeiner@google.com> |
171 |
Cc: Peter Xu <peterx@redhat.com> |
172 |
Cc: <stable@vger.kernel.org> [5.19] |
173 |
Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
174 |
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
175 |
--- |
176 |
mm/hugetlb.c | 26 +++++++++++++++++++------- |
177 |
1 file changed, 19 insertions(+), 7 deletions(-) |
178 |
|
179 |
--- a/mm/hugetlb.c |
180 |
+++ b/mm/hugetlb.c |
181 |
@@ -5232,6 +5232,21 @@ static vm_fault_t hugetlb_wp(struct mm_s |
182 |
VM_BUG_ON(unshare && (flags & FOLL_WRITE)); |
183 |
VM_BUG_ON(!unshare && !(flags & FOLL_WRITE)); |
184 |
|
185 |
+ /* |
186 |
+ * hugetlb does not support FOLL_FORCE-style write faults that keep the |
187 |
+ * PTE mapped R/O such as maybe_mkwrite() would do. |
188 |
+ */ |
189 |
+ if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE))) |
190 |
+ return VM_FAULT_SIGSEGV; |
191 |
+ |
192 |
+ /* Let's take out MAP_SHARED mappings first. */ |
193 |
+ if (vma->vm_flags & VM_MAYSHARE) { |
194 |
+ if (unlikely(unshare)) |
195 |
+ return 0; |
196 |
+ set_huge_ptep_writable(vma, haddr, ptep); |
197 |
+ return 0; |
198 |
+ } |
199 |
+ |
200 |
pte = huge_ptep_get(ptep); |
201 |
old_page = pte_page(pte); |
202 |
|
203 |
@@ -5766,12 +5781,11 @@ vm_fault_t hugetlb_fault(struct mm_struc |
204 |
* If we are going to COW/unshare the mapping later, we examine the |
205 |
* pending reservations for this page now. This will ensure that any |
206 |
* allocations necessary to record that reservation occur outside the |
207 |
- * spinlock. For private mappings, we also lookup the pagecache |
208 |
- * page now as it is used to determine if a reservation has been |
209 |
- * consumed. |
210 |
+ * spinlock. Also lookup the pagecache page now as it is used to |
211 |
+ * determine if a reservation has been consumed. |
212 |
*/ |
213 |
if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && |
214 |
- !huge_pte_write(entry)) { |
215 |
+ !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) { |
216 |
if (vma_needs_reservation(h, vma, haddr) < 0) { |
217 |
ret = VM_FAULT_OOM; |
218 |
goto out_mutex; |
219 |
@@ -5779,9 +5793,7 @@ vm_fault_t hugetlb_fault(struct mm_struc |
220 |
/* Just decrements count, does not deallocate */ |
221 |
vma_end_reservation(h, vma, haddr); |
222 |
|
223 |
- if (!(vma->vm_flags & VM_MAYSHARE)) |
224 |
- pagecache_page = hugetlbfs_pagecache_page(h, |
225 |
- vma, haddr); |
226 |
+ pagecache_page = hugetlbfs_pagecache_page(h, vma, haddr); |
227 |
} |
228 |
|
229 |
ptl = huge_pte_lock(h, mm, ptep); |