1 |
From 031af50045ea97ed4386eb3751ca2c134d0fc911 Mon Sep 17 00:00:00 2001 |
2 |
From: Mark Rutland <mark.rutland@arm.com> |
3 |
Date: Wed, 4 Jan 2023 15:16:26 +0000 |
4 |
Subject: arm64: cmpxchg_double*: hazard against entire exchange variable |
5 |
|
6 |
From: Mark Rutland <mark.rutland@arm.com> |
7 |
|
8 |
commit 031af50045ea97ed4386eb3751ca2c134d0fc911 upstream. |
9 |
|
10 |
The inline assembly for arm64's cmpxchg_double*() implementations use a |
11 |
+Q constraint to hazard against other accesses to the memory location |
12 |
being exchanged. However, the pointer passed to the constraint is a |
13 |
pointer to unsigned long, and thus the hazard only applies to the first |
14 |
8 bytes of the location. |
15 |
|
16 |
GCC can take advantage of this, assuming that other portions of the |
17 |
location are unchanged, leading to a number of potential problems. |
18 |
|
19 |
This is similar to what we fixed back in commit: |
20 |
|
21 |
fee960bed5e857eb ("arm64: xchg: hazard against entire exchange variable") |
22 |
|
23 |
... but we forgot to adjust cmpxchg_double*() similarly at the same |
24 |
time. |
25 |
|
26 |
The same problem applies, as demonstrated with the following test: |
27 |
|
28 |
| struct big { |
29 |
| u64 lo, hi; |
30 |
| } __aligned(128); |
31 |
| |
32 |
| unsigned long foo(struct big *b) |
33 |
| { |
34 |
| u64 hi_old, hi_new; |
35 |
| |
36 |
| hi_old = b->hi; |
37 |
| cmpxchg_double_local(&b->lo, &b->hi, 0x12, 0x34, 0x56, 0x78); |
38 |
| hi_new = b->hi; |
39 |
| |
40 |
| return hi_old ^ hi_new; |
41 |
| } |
42 |
|
43 |
... which GCC 12.1.0 compiles as: |
44 |
|
45 |
| 0000000000000000 <foo>: |
46 |
| 0: d503233f paciasp |
47 |
| 4: aa0003e4 mov x4, x0 |
48 |
| 8: 1400000e b 40 <foo+0x40> |
49 |
| c: d2800240 mov x0, #0x12 // #18 |
50 |
| 10: d2800681 mov x1, #0x34 // #52 |
51 |
| 14: aa0003e5 mov x5, x0 |
52 |
| 18: aa0103e6 mov x6, x1 |
53 |
| 1c: d2800ac2 mov x2, #0x56 // #86 |
54 |
| 20: d2800f03 mov x3, #0x78 // #120 |
55 |
| 24: 48207c82 casp x0, x1, x2, x3, [x4] |
56 |
| 28: ca050000 eor x0, x0, x5 |
57 |
| 2c: ca060021 eor x1, x1, x6 |
58 |
| 30: aa010000 orr x0, x0, x1 |
59 |
| 34: d2800000 mov x0, #0x0 // #0 <--- BANG |
60 |
| 38: d50323bf autiasp |
61 |
| 3c: d65f03c0 ret |
62 |
| 40: d2800240 mov x0, #0x12 // #18 |
63 |
| 44: d2800681 mov x1, #0x34 // #52 |
64 |
| 48: d2800ac2 mov x2, #0x56 // #86 |
65 |
| 4c: d2800f03 mov x3, #0x78 // #120 |
66 |
| 50: f9800091 prfm pstl1strm, [x4] |
67 |
| 54: c87f1885 ldxp x5, x6, [x4] |
68 |
| 58: ca0000a5 eor x5, x5, x0 |
69 |
| 5c: ca0100c6 eor x6, x6, x1 |
70 |
| 60: aa0600a6 orr x6, x5, x6 |
71 |
| 64: b5000066 cbnz x6, 70 <foo+0x70> |
72 |
| 68: c8250c82 stxp w5, x2, x3, [x4] |
73 |
| 6c: 35ffff45 cbnz w5, 54 <foo+0x54> |
74 |
| 70: d2800000 mov x0, #0x0 // #0 <--- BANG |
75 |
| 74: d50323bf autiasp |
76 |
| 78: d65f03c0 ret |
77 |
|
78 |
Notice that at the lines with "BANG" comments, GCC has assumed that the |
79 |
higher 8 bytes are unchanged by the cmpxchg_double() call, and that |
80 |
`hi_old ^ hi_new` can be reduced to a constant zero, for both LSE and |
81 |
LL/SC versions of cmpxchg_double(). |
82 |
|
83 |
This patch fixes the issue by passing a pointer to __uint128_t into the |
84 |
+Q constraint, ensuring that the compiler hazards against the entire 16 |
85 |
bytes being modified. |
86 |
|
87 |
With this change, GCC 12.1.0 compiles the above test as: |
88 |
|
89 |
| 0000000000000000 <foo>: |
90 |
| 0: f9400407 ldr x7, [x0, #8] |
91 |
| 4: d503233f paciasp |
92 |
| 8: aa0003e4 mov x4, x0 |
93 |
| c: 1400000f b 48 <foo+0x48> |
94 |
| 10: d2800240 mov x0, #0x12 // #18 |
95 |
| 14: d2800681 mov x1, #0x34 // #52 |
96 |
| 18: aa0003e5 mov x5, x0 |
97 |
| 1c: aa0103e6 mov x6, x1 |
98 |
| 20: d2800ac2 mov x2, #0x56 // #86 |
99 |
| 24: d2800f03 mov x3, #0x78 // #120 |
100 |
| 28: 48207c82 casp x0, x1, x2, x3, [x4] |
101 |
| 2c: ca050000 eor x0, x0, x5 |
102 |
| 30: ca060021 eor x1, x1, x6 |
103 |
| 34: aa010000 orr x0, x0, x1 |
104 |
| 38: f9400480 ldr x0, [x4, #8] |
105 |
| 3c: d50323bf autiasp |
106 |
| 40: ca0000e0 eor x0, x7, x0 |
107 |
| 44: d65f03c0 ret |
108 |
| 48: d2800240 mov x0, #0x12 // #18 |
109 |
| 4c: d2800681 mov x1, #0x34 // #52 |
110 |
| 50: d2800ac2 mov x2, #0x56 // #86 |
111 |
| 54: d2800f03 mov x3, #0x78 // #120 |
112 |
| 58: f9800091 prfm pstl1strm, [x4] |
113 |
| 5c: c87f1885 ldxp x5, x6, [x4] |
114 |
| 60: ca0000a5 eor x5, x5, x0 |
115 |
| 64: ca0100c6 eor x6, x6, x1 |
116 |
| 68: aa0600a6 orr x6, x5, x6 |
117 |
| 6c: b5000066 cbnz x6, 78 <foo+0x78> |
118 |
| 70: c8250c82 stxp w5, x2, x3, [x4] |
119 |
| 74: 35ffff45 cbnz w5, 5c <foo+0x5c> |
120 |
| 78: f9400480 ldr x0, [x4, #8] |
121 |
| 7c: d50323bf autiasp |
122 |
| 80: ca0000e0 eor x0, x7, x0 |
123 |
| 84: d65f03c0 ret |
124 |
|
125 |
... sampling the high 8 bytes before and after the cmpxchg, and |
126 |
performing an EOR, as we'd expect. |
127 |
|
128 |
For backporting, I've tested this atop linux-4.9.y with GCC 5.5.0. Note |
129 |
that linux-4.9.y is oldest currently supported stable release, and |
130 |
mandates GCC 5.1+. Unfortunately I couldn't get a GCC 5.1 binary to run |
131 |
on my machines due to library incompatibilities. |
132 |
|
133 |
I've also used a standalone test to check that we can use a __uint128_t |
134 |
pointer in a +Q constraint at least as far back as GCC 4.8.5 and LLVM |
135 |
3.9.1. |
136 |
|
137 |
Fixes: 5284e1b4bc8a ("arm64: xchg: Implement cmpxchg_double") |
138 |
Fixes: e9a4b795652f ("arm64: cmpxchg_dbl: patch in lse instructions when supported by the CPU") |
139 |
Reported-by: Boqun Feng <boqun.feng@gmail.com> |
140 |
Link: https://lore.kernel.org/lkml/Y6DEfQXymYVgL3oJ@boqun-archlinux/ |
141 |
Reported-by: Peter Zijlstra <peterz@infradead.org> |
142 |
Link: https://lore.kernel.org/lkml/Y6GXoO4qmH9OIZ5Q@hirez.programming.kicks-ass.net/ |
143 |
Signed-off-by: Mark Rutland <mark.rutland@arm.com> |
144 |
Cc: stable@vger.kernel.org |
145 |
Cc: Arnd Bergmann <arnd@arndb.de> |
146 |
Cc: Catalin Marinas <catalin.marinas@arm.com> |
147 |
Cc: Steve Capper <steve.capper@arm.com> |
148 |
Cc: Will Deacon <will@kernel.org> |
149 |
Link: https://lore.kernel.org/r/20230104151626.3262137-1-mark.rutland@arm.com |
150 |
Signed-off-by: Will Deacon <will@kernel.org> |
151 |
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
152 |
--- |
153 |
arch/arm64/include/asm/atomic_ll_sc.h | 2 +- |
154 |
arch/arm64/include/asm/atomic_lse.h | 2 +- |
155 |
2 files changed, 2 insertions(+), 2 deletions(-) |
156 |
|
157 |
--- a/arch/arm64/include/asm/atomic_ll_sc.h |
158 |
+++ b/arch/arm64/include/asm/atomic_ll_sc.h |
159 |
@@ -315,7 +315,7 @@ __ll_sc__cmpxchg_double##name(unsigned l |
160 |
" cbnz %w0, 1b\n" \ |
161 |
" " #mb "\n" \ |
162 |
"2:" \ |
163 |
- : "=&r" (tmp), "=&r" (ret), "+Q" (*(unsigned long *)ptr) \ |
164 |
+ : "=&r" (tmp), "=&r" (ret), "+Q" (*(__uint128_t *)ptr) \ |
165 |
: "r" (old1), "r" (old2), "r" (new1), "r" (new2) \ |
166 |
: cl); \ |
167 |
\ |
168 |
--- a/arch/arm64/include/asm/atomic_lse.h |
169 |
+++ b/arch/arm64/include/asm/atomic_lse.h |
170 |
@@ -311,7 +311,7 @@ __lse__cmpxchg_double##name(unsigned lon |
171 |
" eor %[old2], %[old2], %[oldval2]\n" \ |
172 |
" orr %[old1], %[old1], %[old2]" \ |
173 |
: [old1] "+&r" (x0), [old2] "+&r" (x1), \ |
174 |
- [v] "+Q" (*(unsigned long *)ptr) \ |
175 |
+ [v] "+Q" (*(__uint128_t *)ptr) \ |
176 |
: [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4), \ |
177 |
[oldval1] "r" (oldval1), [oldval2] "r" (oldval2) \ |
178 |
: cl); \ |