/[packages]/updates/5/java-1.8.0-openjdk/current/SOURCES/8145913-pr3466-rh1498309.patch
ViewVC logotype

Contents of /updates/5/java-1.8.0-openjdk/current/SOURCES/8145913-pr3466-rh1498309.patch

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1183463 - (show annotations) (download)
Tue Dec 19 13:23:56 2017 UTC (6 years, 4 months ago) by ns80
File size: 18464 byte(s)
- SILENT: sync files

1 # HG changeset patch
2 # User mdoerr
3 # Date 1507750779 -3600
4 # Wed Oct 11 20:39:39 2017 +0100
5 # Node ID 92f0dbe76a13992cc27188e0f68e4b1771c7004a
6 # Parent 542c122b1d7d30c29189565248074aa28f21ae58
7 8145913, PR3466, RH1498309: PPC64: add Montgomery multiply intrinsic
8 Reviewed-by: aph, goetz
9
10 diff --git a/src/cpu/ppc/vm/assembler_ppc.hpp b/src/cpu/ppc/vm/assembler_ppc.hpp
11 --- openjdk/hotspot/src/cpu/ppc/vm/assembler_ppc.hpp
12 +++ openjdk/hotspot/src/cpu/ppc/vm/assembler_ppc.hpp
13 @@ -1179,6 +1179,8 @@
14 inline void mullw_( Register d, Register a, Register b);
15 inline void mulhw( Register d, Register a, Register b);
16 inline void mulhw_( Register d, Register a, Register b);
17 + inline void mulhwu( Register d, Register a, Register b);
18 + inline void mulhwu_(Register d, Register a, Register b);
19 inline void mulhd( Register d, Register a, Register b);
20 inline void mulhd_( Register d, Register a, Register b);
21 inline void mulhdu( Register d, Register a, Register b);
22 diff --git a/src/cpu/ppc/vm/assembler_ppc.inline.hpp b/src/cpu/ppc/vm/assembler_ppc.inline.hpp
23 --- openjdk/hotspot/src/cpu/ppc/vm/assembler_ppc.inline.hpp
24 +++ openjdk/hotspot/src/cpu/ppc/vm/assembler_ppc.inline.hpp
25 @@ -109,6 +109,8 @@
26 inline void Assembler::mullw_( Register d, Register a, Register b) { emit_int32(MULLW_OPCODE | rt(d) | ra(a) | rb(b) | oe(0) | rc(1)); }
27 inline void Assembler::mulhw( Register d, Register a, Register b) { emit_int32(MULHW_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); }
28 inline void Assembler::mulhw_( Register d, Register a, Register b) { emit_int32(MULHW_OPCODE | rt(d) | ra(a) | rb(b) | rc(1)); }
29 +inline void Assembler::mulhwu( Register d, Register a, Register b) { emit_int32(MULHWU_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); }
30 +inline void Assembler::mulhwu_(Register d, Register a, Register b) { emit_int32(MULHWU_OPCODE | rt(d) | ra(a) | rb(b) | rc(1)); }
31 inline void Assembler::mulhd( Register d, Register a, Register b) { emit_int32(MULHD_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); }
32 inline void Assembler::mulhd_( Register d, Register a, Register b) { emit_int32(MULHD_OPCODE | rt(d) | ra(a) | rb(b) | rc(1)); }
33 inline void Assembler::mulhdu( Register d, Register a, Register b) { emit_int32(MULHDU_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); }
34 diff --git a/src/cpu/ppc/vm/c2_init_ppc.cpp b/src/cpu/ppc/vm/c2_init_ppc.cpp
35 --- openjdk/hotspot/src/cpu/ppc/vm/c2_init_ppc.cpp
36 +++ openjdk/hotspot/src/cpu/ppc/vm/c2_init_ppc.cpp
37 @@ -45,4 +45,10 @@
38 FLAG_SET_ERGO(bool, InsertEndGroupPPC64, true);
39 }
40 }
41 +
42 + if (OptimizeFill) {
43 + warning("OptimizeFill is not supported on this CPU.");
44 + FLAG_SET_DEFAULT(OptimizeFill, false);
45 + }
46 +
47 }
48 diff --git a/src/cpu/ppc/vm/sharedRuntime_ppc.cpp b/src/cpu/ppc/vm/sharedRuntime_ppc.cpp
49 --- openjdk/hotspot/src/cpu/ppc/vm/sharedRuntime_ppc.cpp
50 +++ openjdk/hotspot/src/cpu/ppc/vm/sharedRuntime_ppc.cpp
51 @@ -42,6 +42,8 @@
52 #include "opto/runtime.hpp"
53 #endif
54
55 +#include <alloca.h>
56 +
57 #define __ masm->
58
59 #ifdef PRODUCT
60 @@ -3269,3 +3271,245 @@
61 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_bytes/wordSize,
62 oop_maps, true);
63 }
64 +
65 +
66 +//------------------------------Montgomery multiplication------------------------
67 +//
68 +
69 +// Subtract 0:b from carry:a, storing the difference back into a[]. Return the updated carry.
70 +static unsigned long
71 +sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
72 + long i = 0; // byte offset into a[] and b[]; advanced by 8 per longword
73 + unsigned long tmp, tmp2;
74 + __asm__ __volatile__ (
75 + "subfc %[tmp], %[tmp], %[tmp] \n" // pre-set CA (x - x produces no borrow, so CA = 1)
76 + "mtctr %[len] \n" // CTR = loop count; assumes len > 0 -- TODO confirm for all callers
77 + "0: \n"
78 + "ldx %[tmp], %[i], %[a] \n" // tmp = a[i/8]
79 + "ldx %[tmp2], %[i], %[b] \n" // tmp2 = b[i/8]
80 + "subfe %[tmp], %[tmp2], %[tmp] \n" // subtract extended: tmp = tmp - tmp2 - (1 - CA)
81 + "stdx %[tmp], %[i], %[a] \n" // a[i/8] = tmp
82 + "addi %[i], %[i], 8 \n" // step to the next longword
83 + "bdnz 0b \n" // decrement CTR and loop while it is non-zero
84 + "addme %[tmp], %[carry] \n" // carry + CA - 1
85 + : [i]"+b"(i), [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2) // "b" keeps i out of r0, which reads as 0 in the RA slot
86 + : [a]"r"(a), [b]"r"(b), [carry]"r"(carry), [len]"r"(len)
87 + : "ctr", "xer", "memory" // CTR is the loop counter, XER holds CA, a[] is written through memory
88 + );
89 + return tmp;
90 +}
91 +
92 +// Multiply (unsigned) Long A by Long B, accumulating the double-
93 +// length result into the accumulator formed of T0 (least significant), T1, and T2.
94 +inline void MACC(unsigned long A, unsigned long B, unsigned long &T0, unsigned long &T1, unsigned long &T2) {
95 + unsigned long hi, lo;
96 + __asm__ __volatile__ (
97 + "mulld %[lo], %[A], %[B] \n" // lo = low 64 bits of A * B
98 + "mulhdu %[hi], %[A], %[B] \n" // hi = high 64 bits of the unsigned product
99 + "addc %[T0], %[T0], %[lo] \n" // T0 += lo, carry out in CA
100 + "adde %[T1], %[T1], %[hi] \n" // T1 += hi + CA
101 + "addze %[T2], %[T2] \n" // T2 += CA
102 + : [hi]"=&r"(hi), [lo]"=&r"(lo), [T0]"+r"(T0), [T1]"+r"(T1), [T2]"+r"(T2)
103 + : [A]"r"(A), [B]"r"(B)
104 + : "xer" // the carry bit CA lives in XER
105 + );
106 +}
107 +
108 +// As above, but add twice the double-length result into the
109 +// accumulator (the product is computed once and added in two passes).
110 +inline void MACC2(unsigned long A, unsigned long B, unsigned long &T0, unsigned long &T1, unsigned long &T2) {
111 + unsigned long hi, lo;
112 + __asm__ __volatile__ (
113 + "mulld %[lo], %[A], %[B] \n" // lo = low 64 bits of A * B
114 + "mulhdu %[hi], %[A], %[B] \n" // hi = high 64 bits of the unsigned product
115 + "addc %[T0], %[T0], %[lo] \n" // first addition: T0 += lo, carry out in CA
116 + "adde %[T1], %[T1], %[hi] \n" // T1 += hi + CA
117 + "addze %[T2], %[T2] \n" // T2 += CA
118 + "addc %[T0], %[T0], %[lo] \n" // second addition of the same hi:lo pair
119 + "adde %[T1], %[T1], %[hi] \n" // T1 += hi + CA
120 + "addze %[T2], %[T2] \n" // T2 += CA
121 + : [hi]"=&r"(hi), [lo]"=&r"(lo), [T0]"+r"(T0), [T1]"+r"(T1), [T2]"+r"(T2)
122 + : [A]"r"(A), [B]"r"(B)
123 + : "xer" // the carry bit CA lives in XER
124 + );
125 +}
126 +
127 +// Fast Montgomery multiplication. The derivation of the algorithm is
128 +// in "A Cryptographic Library for the Motorola DSP56000,
129 +// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237".
130 +static void
131 +montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
132 + unsigned long m[], unsigned long inv, int len) { // m[0..len-1] receives the result; n is presumably the modulus
133 + unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
134 + int i;
135 +
136 + assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
137 + // First pass: columns 0..len-1. Each column sum is made to end in a zero low word.
138 + for (i = 0; i < len; i++) {
139 + int j;
140 + for (j = 0; j < i; j++) {
141 + MACC(a[j], b[i-j], t0, t1, t2); // partial product of a * b for column i
142 + MACC(m[j], n[i-j], t0, t1, t2); // partial product of m * n for column i
143 + }
144 + MACC(a[i], b[0], t0, t1, t2);
145 + m[i] = t0 * inv; // since inv * n[0] == -1, adding m[i] * n[0] clears t0
146 + MACC(m[i], n[0], t0, t1, t2);
147 +
148 + assert(t0 == 0, "broken Montgomery multiply");
149 +
150 + t0 = t1; t1 = t2; t2 = 0; // shift the accumulator down by one word
151 + }
152 + // Second pass: columns len..2*len-1. The low accumulator word of each column is a result word.
153 + for (i = len; i < 2*len; i++) {
154 + int j;
155 + for (j = i-len+1; j < len; j++) {
156 + MACC(a[j], b[i-j], t0, t1, t2);
157 + MACC(m[j], n[i-j], t0, t1, t2);
158 + }
159 + m[i-len] = t0; // overwrites the multiplier word that is no longer needed
160 + t0 = t1; t1 = t2; t2 = 0; // shift the accumulator down by one word
161 + }
162 + // Subtract n from m for as long as an overflow word remains above m.
163 + while (t0) {
164 + t0 = sub(m, n, t0, len);
165 + }
166 +}
167 +
168 +// Fast Montgomery squaring. This uses asymptotically 25% fewer
169 +// multiplies so it should be up to 25% faster than Montgomery
170 +// multiplication. However, its loop control is more complex and it
171 +// may actually run slower on some machines.
172 +static void
173 +montgomery_square(unsigned long a[], unsigned long n[],
174 + unsigned long m[], unsigned long inv, int len) { // m[0..len-1] receives the result; n is presumably the modulus
175 + unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
176 + int i;
177 +
178 + assert(inv * n[0] == -1UL, "broken inverse in Montgomery square");
179 + // First pass: columns 0..len-1. Each column sum is made to end in a zero low word.
180 + for (i = 0; i < len; i++) {
181 + int j;
182 + int end = (i+1)/2;
183 + for (j = 0; j < end; j++) {
184 + MACC2(a[j], a[i-j], t0, t1, t2); // off-diagonal product a[j]*a[i-j] occurs twice in a*a
185 + MACC(m[j], n[i-j], t0, t1, t2);
186 + }
187 + if ((i & 1) == 0) {
188 + MACC(a[j], a[j], t0, t1, t2); // even column: diagonal term, here j == i/2, added once
189 + }
190 + for (; j < i; j++) {
191 + MACC(m[j], n[i-j], t0, t1, t2); // remaining m * n products for this column
192 + }
193 + m[i] = t0 * inv; // since inv * n[0] == -1, adding m[i] * n[0] clears t0
194 + MACC(m[i], n[0], t0, t1, t2);
195 +
196 + assert(t0 == 0, "broken Montgomery square");
197 +
198 + t0 = t1; t1 = t2; t2 = 0; // shift the accumulator down by one word
199 + }
200 + // Second pass: columns len..2*len-1. The low accumulator word of each column is a result word.
201 + for (i = len; i < 2*len; i++) {
202 + int start = i-len+1;
203 + int end = start + (len - start)/2;
204 + int j;
205 + for (j = start; j < end; j++) {
206 + MACC2(a[j], a[i-j], t0, t1, t2); // off-diagonal product, counted twice
207 + MACC(m[j], n[i-j], t0, t1, t2);
208 + }
209 + if ((i & 1) == 0) {
210 + MACC(a[j], a[j], t0, t1, t2); // even column: diagonal term, here j == i/2, added once
211 + }
212 + for (; j < len; j++) {
213 + MACC(m[j], n[i-j], t0, t1, t2); // remaining m * n products for this column
214 + }
215 + m[i-len] = t0; // overwrites the multiplier word that is no longer needed
216 + t0 = t1; t1 = t2; t2 = 0; // shift the accumulator down by one word
217 + }
218 + // Subtract n from m for as long as an overflow word remains above m.
219 + while (t0) {
220 + t0 = sub(m, n, t0, len);
221 + }
222 +}
223 +
224 +// The threshold at which squaring is advantageous was determined
225 +// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
226 +// Doesn't seem to be relevant for Power8 so we use the same value.
227 +#define MONTGOMERY_SQUARING_THRESHOLD 64 // compared against len (a count of jints) in SharedRuntime::montgomery_square
228 +
229 +// Copy len longwords from s to d in reverse order (s[0] lands in d[len-1]). The two
230 +// 32-bit halves of each longword are swapped as well, but only when VM_LITTLE_ENDIAN is defined.
231 +static void reverse_words(unsigned long *s, unsigned long *d, int len) {
232 + d += len; // start one past the end of the destination and fill it backwards
233 + while(len-- > 0) {
234 + d--;
235 + unsigned long s_val = *s;
236 + // Swap words in a longword on little endian machines.
237 +#ifdef VM_LITTLE_ENDIAN
238 + s_val = (s_val << 32) | (s_val >> 32);
239 +#endif
240 + *d = s_val;
241 + s++;
242 + }
243 +}
244 +
245 +void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
246 + jint len, jlong inv,
247 + jint *m_ints) { // len counts jints per array; m_ints receives the result
248 + assert(len % 2 == 0, "array length in montgomery_multiply must be even");
249 + int longwords = len/2; // the arrays are processed as 64-bit longwords, two jints each
250 + assert(longwords > 0, "unsupported");
251 +
252 + // Make very sure we don't use so much space that the stack might
253 + // overflow. 512 jints corresponds to a 16384-bit integer and
254 + // will use here a total of 8k bytes of stack space.
255 + size_t total_allocation = longwords * sizeof (unsigned long) * 4; // kept in size_t: narrowing to int let a negative or huge len pass the check below
256 + guarantee(total_allocation <= 8192, "must be");
257 + unsigned long *scratch = (unsigned long *)alloca(total_allocation);
258 +
259 + // Local scratch arrays
260 + unsigned long
261 + *a = scratch + 0 * longwords,
262 + *b = scratch + 1 * longwords,
263 + *n = scratch + 2 * longwords,
264 + *m = scratch + 3 * longwords;
265 + // Copy the inputs into the scratch arrays, reversing the longword order.
266 + reverse_words((unsigned long *)a_ints, a, longwords);
267 + reverse_words((unsigned long *)b_ints, b, longwords);
268 + reverse_words((unsigned long *)n_ints, n, longwords);
269 +
270 + ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
271 + // Copy the result back out, undoing the reversal.
272 + reverse_words(m, (unsigned long *)m_ints, longwords);
273 +}
274 +
275 +void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
276 + jint len, jlong inv,
277 + jint *m_ints) { // len counts jints per array; m_ints receives the result
278 + assert(len % 2 == 0, "array length in montgomery_square must be even");
279 + int longwords = len/2; // the arrays are processed as 64-bit longwords, two jints each
280 + assert(longwords > 0, "unsupported");
281 +
282 + // Make very sure we don't use so much space that the stack might
283 + // overflow. 512 jints corresponds to a 16384-bit integer and
284 + // will use here a total of 6k bytes of stack space.
285 + size_t total_allocation = longwords * sizeof (unsigned long) * 3; // kept in size_t: narrowing to int let a negative or huge len pass the check below
286 + guarantee(total_allocation <= 8192, "must be");
287 + unsigned long *scratch = (unsigned long *)alloca(total_allocation);
288 +
289 + // Local scratch arrays
290 + unsigned long
291 + *a = scratch + 0 * longwords,
292 + *n = scratch + 1 * longwords,
293 + *m = scratch + 2 * longwords;
294 + // Copy the inputs into the scratch arrays, reversing the longword order.
295 + reverse_words((unsigned long *)a_ints, a, longwords);
296 + reverse_words((unsigned long *)n_ints, n, longwords);
297 + // Use the dedicated squaring routine only at or above the threshold; otherwise multiply a by itself.
298 + if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
299 + ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
300 + } else {
301 + ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
302 + }
303 + // Copy the result back out, undoing the reversal.
304 + reverse_words(m, (unsigned long *)m_ints, longwords);
305 +}
306 diff --git a/src/cpu/ppc/vm/stubGenerator_ppc.cpp b/src/cpu/ppc/vm/stubGenerator_ppc.cpp
307 --- openjdk/hotspot/src/cpu/ppc/vm/stubGenerator_ppc.cpp
308 +++ openjdk/hotspot/src/cpu/ppc/vm/stubGenerator_ppc.cpp
309 @@ -2094,6 +2094,14 @@
310 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
311 &StubRoutines::_safefetchN_fault_pc,
312 &StubRoutines::_safefetchN_continuation_pc);
313 + if (UseMontgomeryMultiplyIntrinsic) {
314 + StubRoutines::_montgomeryMultiply
315 + = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
316 + }
317 + if (UseMontgomerySquareIntrinsic) {
318 + StubRoutines::_montgomerySquare
319 + = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
320 + }
321 }
322
323 public:
324 diff --git a/src/cpu/ppc/vm/templateInterpreter_ppc.cpp b/src/cpu/ppc/vm/templateInterpreter_ppc.cpp
325 --- openjdk/hotspot/src/cpu/ppc/vm/templateInterpreter_ppc.cpp
326 +++ openjdk/hotspot/src/cpu/ppc/vm/templateInterpreter_ppc.cpp
327 @@ -265,7 +265,7 @@
328 __ cmpdi(CCR0, Rmdo, 0);
329 __ beq(CCR0, no_mdo);
330
331 - // Increment backedge counter in the MDO.
332 + // Increment invocation counter in the MDO.
333 const int mdo_bc_offs = in_bytes(MethodData::backedge_counter_offset()) + in_bytes(InvocationCounter::counter_offset());
334 __ lwz(Rscratch2, mdo_bc_offs, Rmdo);
335 __ addi(Rscratch2, Rscratch2, increment);
336 @@ -277,12 +277,12 @@
337 }
338
339 // Increment counter in MethodCounters*.
340 - const int mo_bc_offs = in_bytes(MethodCounters::backedge_counter_offset()) + in_bytes(InvocationCounter::counter_offset());
341 + const int mo_ic_offs = in_bytes(MethodCounters::invocation_counter_offset()) + in_bytes(InvocationCounter::counter_offset());
342 __ bind(no_mdo);
343 __ get_method_counters(R19_method, R3_counters, done);
344 - __ lwz(Rscratch2, mo_bc_offs, R3_counters);
345 + __ lwz(Rscratch2, mo_ic_offs, R3_counters);
346 __ addi(Rscratch2, Rscratch2, increment);
347 - __ stw(Rscratch2, mo_bc_offs, R3_counters);
348 + __ stw(Rscratch2, mo_ic_offs, R3_counters);
349 __ load_const_optimized(Rscratch1, mask, R0);
350 __ and_(Rscratch1, Rscratch2, Rscratch1);
351 __ beq(CCR0, *overflow);
352 diff --git a/src/cpu/ppc/vm/vm_version_ppc.cpp b/src/cpu/ppc/vm/vm_version_ppc.cpp
353 --- openjdk/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp
354 +++ openjdk/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp
355 @@ -177,6 +177,12 @@
356 FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
357 }
358
359 + if (FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) {
360 + UseMontgomeryMultiplyIntrinsic = true;
361 + }
362 + if (FLAG_IS_DEFAULT(UseMontgomerySquareIntrinsic)) {
363 + UseMontgomerySquareIntrinsic = true;
364 + }
365 }
366
367 void VM_Version::print_features() {
368 diff --git a/src/share/vm/opto/library_call.cpp b/src/share/vm/opto/library_call.cpp
369 --- openjdk/hotspot/src/share/vm/opto/library_call.cpp
370 +++ openjdk/hotspot/src/share/vm/opto/library_call.cpp
371 @@ -6031,11 +6031,21 @@
372 Node* n_start = array_element_address(n, intcon(0), n_elem);
373 Node* m_start = array_element_address(m, intcon(0), m_elem);
374
375 - Node* call = make_runtime_call(RC_LEAF,
376 - OptoRuntime::montgomeryMultiply_Type(),
377 - stubAddr, stubName, TypePtr::BOTTOM,
378 - a_start, b_start, n_start, len, inv, top(),
379 - m_start);
380 + Node* call = NULL;
381 + if (CCallingConventionRequiresIntsAsLongs) {
382 + Node* len_I2L = ConvI2L(len);
383 + call = make_runtime_call(RC_LEAF,
384 + OptoRuntime::montgomeryMultiply_Type(),
385 + stubAddr, stubName, TypePtr::BOTTOM,
386 + a_start, b_start, n_start, len_I2L XTOP, inv,
387 + top(), m_start);
388 + } else {
389 + call = make_runtime_call(RC_LEAF,
390 + OptoRuntime::montgomeryMultiply_Type(),
391 + stubAddr, stubName, TypePtr::BOTTOM,
392 + a_start, b_start, n_start, len, inv, top(),
393 + m_start);
394 + }
395 set_result(m);
396 }
397
398 @@ -6085,11 +6095,22 @@
399 Node* n_start = array_element_address(n, intcon(0), n_elem);
400 Node* m_start = array_element_address(m, intcon(0), m_elem);
401
402 - Node* call = make_runtime_call(RC_LEAF,
403 - OptoRuntime::montgomerySquare_Type(),
404 - stubAddr, stubName, TypePtr::BOTTOM,
405 - a_start, n_start, len, inv, top(),
406 - m_start);
407 + Node* call = NULL;
408 + if (CCallingConventionRequiresIntsAsLongs) {
409 + Node* len_I2L = ConvI2L(len);
410 + call = make_runtime_call(RC_LEAF,
411 + OptoRuntime::montgomerySquare_Type(),
412 + stubAddr, stubName, TypePtr::BOTTOM,
413 + a_start, n_start, len_I2L XTOP, inv, top(),
414 + m_start);
415 + } else {
416 + call = make_runtime_call(RC_LEAF,
417 + OptoRuntime::montgomerySquare_Type(),
418 + stubAddr, stubName, TypePtr::BOTTOM,
419 + a_start, n_start, len, inv, top(),
420 + m_start);
421 + }
422 +
423 set_result(m);
424 }
425
426 diff --git a/src/share/vm/opto/runtime.cpp b/src/share/vm/opto/runtime.cpp
427 --- openjdk/hotspot/src/share/vm/opto/runtime.cpp
428 +++ openjdk/hotspot/src/share/vm/opto/runtime.cpp
429 @@ -1005,12 +1005,20 @@
430 // create input type (domain)
431 int num_args = 7;
432 int argcnt = num_args;
433 + if (CCallingConventionRequiresIntsAsLongs) {
434 + argcnt++; // additional placeholder
435 + }
436 const Type** fields = TypeTuple::fields(argcnt);
437 int argp = TypeFunc::Parms;
438 fields[argp++] = TypePtr::NOTNULL; // a
439 fields[argp++] = TypePtr::NOTNULL; // b
440 fields[argp++] = TypePtr::NOTNULL; // n
441 - fields[argp++] = TypeInt::INT; // len
442 + if (CCallingConventionRequiresIntsAsLongs) {
443 + fields[argp++] = TypeLong::LONG; // len
444 + fields[argp++] = TypeLong::HALF; // placeholder
445 + } else {
446 + fields[argp++] = TypeInt::INT; // len
447 + }
448 fields[argp++] = TypeLong::LONG; // inv
449 fields[argp++] = Type::HALF;
450 fields[argp++] = TypePtr::NOTNULL; // result
451 @@ -1029,11 +1037,19 @@
452 // create input type (domain)
453 int num_args = 6;
454 int argcnt = num_args;
455 + if (CCallingConventionRequiresIntsAsLongs) {
456 + argcnt++; // additional placeholder
457 + }
458 const Type** fields = TypeTuple::fields(argcnt);
459 int argp = TypeFunc::Parms;
460 fields[argp++] = TypePtr::NOTNULL; // a
461 fields[argp++] = TypePtr::NOTNULL; // n
462 - fields[argp++] = TypeInt::INT; // len
463 + if (CCallingConventionRequiresIntsAsLongs) {
464 + fields[argp++] = TypeLong::LONG; // len
465 + fields[argp++] = TypeLong::HALF; // placeholder
466 + } else {
467 + fields[argp++] = TypeInt::INT; // len
468 + }
469 fields[argp++] = TypeLong::LONG; // inv
470 fields[argp++] = Type::HALF;
471 fields[argp++] = TypePtr::NOTNULL; // result

  ViewVC Help
Powered by ViewVC 1.1.30