/[packages]/backports/8/kernel/current/SOURCES/mm-hugetlb-support-write-faults-in-shared-mappings.patch
ViewVC logotype

Contents of /backports/8/kernel/current/SOURCES/mm-hugetlb-support-write-faults-in-shared-mappings.patch

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1880454 - (show annotations) (download)
Fri Aug 26 04:48:43 2022 UTC (19 months, 3 weeks ago) by tmb
File size: 7724 byte(s)
- update to 5.19.4
  * drop merged patches
- add current -stable queue


1 From 1d8d14641fd94a01b20a4abbf2749fd8eddcf57b Mon Sep 17 00:00:00 2001
2 From: David Hildenbrand <david@redhat.com>
3 Date: Thu, 11 Aug 2022 12:34:35 +0200
4 Subject: mm/hugetlb: support write-faults in shared mappings
5
6 From: David Hildenbrand <david@redhat.com>
7
8 commit 1d8d14641fd94a01b20a4abbf2749fd8eddcf57b upstream.
9
10 If we ever get a write-fault on a write-protected page in a shared
11 mapping, we'd be in trouble (again). Instead, we can simply map the page
12 writable.
13
14 And in fact, there is even a way right now to trigger that code via
15 uffd-wp ever since we started to support it for shmem in 5.19:
16
17 --------------------------------------------------------------------------
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <fcntl.h>
22 #include <unistd.h>
23 #include <errno.h>
24 #include <sys/mman.h>
25 #include <sys/syscall.h>
26 #include <sys/ioctl.h>
27 #include <linux/userfaultfd.h>
28
29 #define HUGETLB_SIZE (2 * 1024 * 1024u)
30
31 static char *map;
32 int uffd;
33
34 static int temp_setup_uffd(void)
35 {
36 struct uffdio_api uffdio_api;
37 struct uffdio_register uffdio_register;
38 struct uffdio_writeprotect uffd_writeprotect;
39 struct uffdio_range uffd_range;
40
41 uffd = syscall(__NR_userfaultfd,
42 O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
43 if (uffd < 0) {
44 fprintf(stderr, "syscall() failed: %d\n", errno);
45 return -errno;
46 }
47
48 uffdio_api.api = UFFD_API;
49 uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
50 if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
51 fprintf(stderr, "UFFDIO_API failed: %d\n", errno);
52 return -errno;
53 }
54
55 if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
56 fprintf(stderr, "UFFD_FEATURE_WRITEPROTECT missing\n");
57 return -ENOSYS;
58 }
59
60 /* Register UFFD-WP */
61 uffdio_register.range.start = (unsigned long) map;
62 uffdio_register.range.len = HUGETLB_SIZE;
63 uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
64 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
65 fprintf(stderr, "UFFDIO_REGISTER failed: %d\n", errno);
66 return -errno;
67 }
68
69 /* Writeprotect a single page. */
70 uffd_writeprotect.range.start = (unsigned long) map;
71 uffd_writeprotect.range.len = HUGETLB_SIZE;
72 uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP;
73 if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
74 fprintf(stderr, "UFFDIO_WRITEPROTECT failed: %d\n", errno);
75 return -errno;
76 }
77
78 /* Unregister UFFD-WP without prior writeunprotection. */
79 uffd_range.start = (unsigned long) map;
80 uffd_range.len = HUGETLB_SIZE;
81 if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_range)) {
82 fprintf(stderr, "UFFDIO_UNREGISTER failed: %d\n", errno);
83 return -errno;
84 }
85
86 return 0;
87 }
88
89 int main(int argc, char **argv)
90 {
91 int fd;
92
93 fd = open("/dev/hugepages/tmp", O_RDWR | O_CREAT);
94 if (!fd) {
95 fprintf(stderr, "open() failed\n");
96 return -errno;
97 }
98 if (ftruncate(fd, HUGETLB_SIZE)) {
99 fprintf(stderr, "ftruncate() failed\n");
100 return -errno;
101 }
102
103 map = mmap(NULL, HUGETLB_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
104 if (map == MAP_FAILED) {
105 fprintf(stderr, "mmap() failed\n");
106 return -errno;
107 }
108
109 *map = 0;
110
111 if (temp_setup_uffd())
112 return 1;
113
114 *map = 0;
115
116 return 0;
117 }
118 --------------------------------------------------------------------------
119
120 The above test fails with SIGBUS when there is only a single free hugetlb page.
121 # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
122 # ./test
123 Bus error (core dumped)
124
125 And worse, with sufficient free hugetlb pages it will map an anonymous page
126 into a shared mapping, for example, messing up accounting during unmap
127 and breaking MAP_SHARED semantics:
128 # echo 2 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
129 # ./test
130 # cat /proc/meminfo | grep HugePages_
131 HugePages_Total: 2
132 HugePages_Free: 1
133 HugePages_Rsvd: 18446744073709551615
134 HugePages_Surp: 0
135
136 The reason is that uffd-wp doesn't clear the uffd-wp PTE bit when
137 unregistering and consequently keeps the PTE writeprotected. The reason for
138 this is to avoid the additional overhead when unregistering. Note that
139 this is the case also for !hugetlb and that we will end up with writable
140 PTEs that still have the uffd-wp PTE bit set once we return from
141 hugetlb_wp(). I'm not touching the uffd-wp PTE bit for now, because it
142 seems to be a generic thing -- wp_page_reuse() also doesn't clear it.
143
144 VM_MAYSHARE handling in hugetlb_fault() for FAULT_FLAG_WRITE indicates
145 that MAP_SHARED handling was at least envisioned, but could never have
146 worked as expected.
147
148 While at it, make sure that we never end up in hugetlb_wp() on write
149 faults without VM_WRITE, because we don't support maybe_mkwrite()
150 semantics as commonly used in the !hugetlb case -- for example, in
151 wp_page_reuse().
152
153 Note that there is no need to do any kind of reservation in
154 hugetlb_fault() in this case ... because we already have a hugetlb page
155 mapped R/O that we will simply map writable and we are not dealing with
156 COW/unsharing.
157
158 Link: https://lkml.kernel.org/r/20220811103435.188481-3-david@redhat.com
159 Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs")
160 Signed-off-by: David Hildenbrand <david@redhat.com>
161 Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
162 Cc: Bjorn Helgaas <bhelgaas@google.com>
163 Cc: Cyrill Gorcunov <gorcunov@openvz.org>
164 Cc: Hugh Dickins <hughd@google.com>
165 Cc: Jamie Liu <jamieliu@google.com>
166 Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
167 Cc: Muchun Song <songmuchun@bytedance.com>
168 Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
169 Cc: Pavel Emelyanov <xemul@parallels.com>
170 Cc: Peter Feiner <pfeiner@google.com>
171 Cc: Peter Xu <peterx@redhat.com>
172 Cc: <stable@vger.kernel.org> [5.19]
173 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
174 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
175 ---
176 mm/hugetlb.c | 26 +++++++++++++++++++-------
177 1 file changed, 19 insertions(+), 7 deletions(-)
178
179 --- a/mm/hugetlb.c
180 +++ b/mm/hugetlb.c
181 @@ -5232,6 +5232,21 @@ static vm_fault_t hugetlb_wp(struct mm_s
182 VM_BUG_ON(unshare && (flags & FOLL_WRITE));
183 VM_BUG_ON(!unshare && !(flags & FOLL_WRITE));
184
185 + /*
186 + * hugetlb does not support FOLL_FORCE-style write faults that keep the
187 + * PTE mapped R/O such as maybe_mkwrite() would do.
188 + */
189 + if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
190 + return VM_FAULT_SIGSEGV;
191 +
192 + /* Let's take out MAP_SHARED mappings first. */
193 + if (vma->vm_flags & VM_MAYSHARE) {
194 + if (unlikely(unshare))
195 + return 0;
196 + set_huge_ptep_writable(vma, haddr, ptep);
197 + return 0;
198 + }
199 +
200 pte = huge_ptep_get(ptep);
201 old_page = pte_page(pte);
202
203 @@ -5766,12 +5781,11 @@ vm_fault_t hugetlb_fault(struct mm_struc
204 * If we are going to COW/unshare the mapping later, we examine the
205 * pending reservations for this page now. This will ensure that any
206 * allocations necessary to record that reservation occur outside the
207 - * spinlock. For private mappings, we also lookup the pagecache
208 - * page now as it is used to determine if a reservation has been
209 - * consumed.
210 + * spinlock. Also lookup the pagecache page now as it is used to
211 + * determine if a reservation has been consumed.
212 */
213 if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
214 - !huge_pte_write(entry)) {
215 + !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) {
216 if (vma_needs_reservation(h, vma, haddr) < 0) {
217 ret = VM_FAULT_OOM;
218 goto out_mutex;
219 @@ -5779,9 +5793,7 @@ vm_fault_t hugetlb_fault(struct mm_struc
220 /* Just decrements count, does not deallocate */
221 vma_end_reservation(h, vma, haddr);
222
223 - if (!(vma->vm_flags & VM_MAYSHARE))
224 - pagecache_page = hugetlbfs_pagecache_page(h,
225 - vma, haddr);
226 + pagecache_page = hugetlbfs_pagecache_page(h, vma, haddr);
227 }
228
229 ptl = huge_pte_lock(h, mm, ptep);

  ViewVC Help
Powered by ViewVC 1.1.30