1 |
# HG changeset patch |
2 |
# User mdoerr |
3 |
# Date 1507750779 -3600 |
4 |
# Wed Oct 11 20:39:39 2017 +0100 |
5 |
# Node ID 92f0dbe76a13992cc27188e0f68e4b1771c7004a |
6 |
# Parent 542c122b1d7d30c29189565248074aa28f21ae58 |
7 |
8145913, PR3466, RH1498309: PPC64: add Montgomery multiply intrinsic |
8 |
Reviewed-by: aph, goetz |
9 |
|
10 |
diff --git a/src/cpu/ppc/vm/assembler_ppc.hpp b/src/cpu/ppc/vm/assembler_ppc.hpp |
11 |
--- openjdk/hotspot/src/cpu/ppc/vm/assembler_ppc.hpp |
12 |
+++ openjdk/hotspot/src/cpu/ppc/vm/assembler_ppc.hpp |
13 |
@@ -1179,6 +1179,8 @@ |
14 |
inline void mullw_( Register d, Register a, Register b); |
15 |
inline void mulhw( Register d, Register a, Register b); |
16 |
inline void mulhw_( Register d, Register a, Register b); |
17 |
+ inline void mulhwu( Register d, Register a, Register b); |
18 |
+ inline void mulhwu_(Register d, Register a, Register b); |
19 |
inline void mulhd( Register d, Register a, Register b); |
20 |
inline void mulhd_( Register d, Register a, Register b); |
21 |
inline void mulhdu( Register d, Register a, Register b); |
22 |
diff --git a/src/cpu/ppc/vm/assembler_ppc.inline.hpp b/src/cpu/ppc/vm/assembler_ppc.inline.hpp |
23 |
--- openjdk/hotspot/src/cpu/ppc/vm/assembler_ppc.inline.hpp |
24 |
+++ openjdk/hotspot/src/cpu/ppc/vm/assembler_ppc.inline.hpp |
25 |
@@ -109,6 +109,8 @@ |
26 |
inline void Assembler::mullw_( Register d, Register a, Register b) { emit_int32(MULLW_OPCODE | rt(d) | ra(a) | rb(b) | oe(0) | rc(1)); } |
27 |
inline void Assembler::mulhw( Register d, Register a, Register b) { emit_int32(MULHW_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); } |
28 |
inline void Assembler::mulhw_( Register d, Register a, Register b) { emit_int32(MULHW_OPCODE | rt(d) | ra(a) | rb(b) | rc(1)); } |
29 |
+inline void Assembler::mulhwu( Register d, Register a, Register b) { emit_int32(MULHWU_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); } |
30 |
+inline void Assembler::mulhwu_(Register d, Register a, Register b) { emit_int32(MULHWU_OPCODE | rt(d) | ra(a) | rb(b) | rc(1)); } |
31 |
inline void Assembler::mulhd( Register d, Register a, Register b) { emit_int32(MULHD_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); } |
32 |
inline void Assembler::mulhd_( Register d, Register a, Register b) { emit_int32(MULHD_OPCODE | rt(d) | ra(a) | rb(b) | rc(1)); } |
33 |
inline void Assembler::mulhdu( Register d, Register a, Register b) { emit_int32(MULHDU_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); } |
34 |
diff --git a/src/cpu/ppc/vm/c2_init_ppc.cpp b/src/cpu/ppc/vm/c2_init_ppc.cpp |
35 |
--- openjdk/hotspot/src/cpu/ppc/vm/c2_init_ppc.cpp |
36 |
+++ openjdk/hotspot/src/cpu/ppc/vm/c2_init_ppc.cpp |
37 |
@@ -45,4 +45,10 @@ |
38 |
FLAG_SET_ERGO(bool, InsertEndGroupPPC64, true); |
39 |
} |
40 |
} |
41 |
+ |
42 |
+ if (OptimizeFill) { |
43 |
+ warning("OptimizeFill is not supported on this CPU."); |
44 |
+ FLAG_SET_DEFAULT(OptimizeFill, false); |
45 |
+ } |
46 |
+ |
47 |
} |
48 |
diff --git a/src/cpu/ppc/vm/sharedRuntime_ppc.cpp b/src/cpu/ppc/vm/sharedRuntime_ppc.cpp |
49 |
--- openjdk/hotspot/src/cpu/ppc/vm/sharedRuntime_ppc.cpp |
50 |
+++ openjdk/hotspot/src/cpu/ppc/vm/sharedRuntime_ppc.cpp |
51 |
@@ -42,6 +42,8 @@ |
52 |
#include "opto/runtime.hpp" |
53 |
#endif |
54 |
|
55 |
+#include <alloca.h> |
56 |
+ |
57 |
#define __ masm-> |
58 |
|
59 |
#ifdef PRODUCT |
60 |
@@ -3269,3 +3271,245 @@ |
61 |
return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_bytes/wordSize, |
62 |
oop_maps, true); |
63 |
} |
64 |
+ |
65 |
+ |
66 |
+//------------------------------Montgomery multiplication------------------------ |
67 |
+// |
68 |
+ |
69 |
+// Subtract 0:b from carry:a. Return carry. |
70 |
+static unsigned long |
71 |
+sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) { |
72 |
+ long i = 0; |
73 |
+ unsigned long tmp, tmp2; |
74 |
+ __asm__ __volatile__ ( |
75 |
+ "subfc %[tmp], %[tmp], %[tmp] \n" // pre-set CA |
76 |
+ "mtctr %[len] \n" |
77 |
+ "0: \n" |
78 |
+ "ldx %[tmp], %[i], %[a] \n" |
79 |
+ "ldx %[tmp2], %[i], %[b] \n" |
80 |
+ "subfe %[tmp], %[tmp2], %[tmp] \n" // subtract extended |
81 |
+ "stdx %[tmp], %[i], %[a] \n" |
82 |
+ "addi %[i], %[i], 8 \n" |
83 |
+ "bdnz 0b \n" |
84 |
+ "addme %[tmp], %[carry] \n" // carry + CA - 1 |
85 |
+ : [i]"+b"(i), [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2) |
86 |
+ : [a]"r"(a), [b]"r"(b), [carry]"r"(carry), [len]"r"(len) |
87 |
+ : "ctr", "xer", "memory" |
88 |
+ ); |
89 |
+ return tmp; |
90 |
+} |
91 |
+ |
92 |
+// Multiply (unsigned) Long A by Long B, accumulating the double- |
93 |
+// length result into the accumulator formed of T0, T1, and T2. |
94 |
+inline void MACC(unsigned long A, unsigned long B, unsigned long &T0, unsigned long &T1, unsigned long &T2) { |
95 |
+ unsigned long hi, lo; |
96 |
+ __asm__ __volatile__ ( |
97 |
+ "mulld %[lo], %[A], %[B] \n" |
98 |
+ "mulhdu %[hi], %[A], %[B] \n" |
99 |
+ "addc %[T0], %[T0], %[lo] \n" |
100 |
+ "adde %[T1], %[T1], %[hi] \n" |
101 |
+ "addze %[T2], %[T2] \n" |
102 |
+ : [hi]"=&r"(hi), [lo]"=&r"(lo), [T0]"+r"(T0), [T1]"+r"(T1), [T2]"+r"(T2) |
103 |
+ : [A]"r"(A), [B]"r"(B) |
104 |
+ : "xer" |
105 |
+ ); |
106 |
+} |
107 |
+ |
108 |
+// As above, but add twice the double-length result into the |
109 |
+// accumulator. |
110 |
+inline void MACC2(unsigned long A, unsigned long B, unsigned long &T0, unsigned long &T1, unsigned long &T2) { |
111 |
+ unsigned long hi, lo; |
112 |
+ __asm__ __volatile__ ( |
113 |
+ "mulld %[lo], %[A], %[B] \n" |
114 |
+ "mulhdu %[hi], %[A], %[B] \n" |
115 |
+ "addc %[T0], %[T0], %[lo] \n" |
116 |
+ "adde %[T1], %[T1], %[hi] \n" |
117 |
+ "addze %[T2], %[T2] \n" |
118 |
+ "addc %[T0], %[T0], %[lo] \n" |
119 |
+ "adde %[T1], %[T1], %[hi] \n" |
120 |
+ "addze %[T2], %[T2] \n" |
121 |
+ : [hi]"=&r"(hi), [lo]"=&r"(lo), [T0]"+r"(T0), [T1]"+r"(T1), [T2]"+r"(T2) |
122 |
+ : [A]"r"(A), [B]"r"(B) |
123 |
+ : "xer" |
124 |
+ ); |
125 |
+} |
126 |
+ |
127 |
+// Fast Montgomery multiplication. The derivation of the algorithm is |
128 |
+// in "A Cryptographic Library for the Motorola DSP56000, |
129 |
+// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237". |
130 |
+static void |
131 |
+montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[], |
132 |
+ unsigned long m[], unsigned long inv, int len) { |
133 |
+ unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator |
134 |
+ int i; |
135 |
+ |
136 |
+ assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); |
137 |
+ |
138 |
+ for (i = 0; i < len; i++) { |
139 |
+ int j; |
140 |
+ for (j = 0; j < i; j++) { |
141 |
+ MACC(a[j], b[i-j], t0, t1, t2); |
142 |
+ MACC(m[j], n[i-j], t0, t1, t2); |
143 |
+ } |
144 |
+ MACC(a[i], b[0], t0, t1, t2); |
145 |
+ m[i] = t0 * inv; |
146 |
+ MACC(m[i], n[0], t0, t1, t2); |
147 |
+ |
148 |
+ assert(t0 == 0, "broken Montgomery multiply"); |
149 |
+ |
150 |
+ t0 = t1; t1 = t2; t2 = 0; |
151 |
+ } |
152 |
+ |
153 |
+ for (i = len; i < 2*len; i++) { |
154 |
+ int j; |
155 |
+ for (j = i-len+1; j < len; j++) { |
156 |
+ MACC(a[j], b[i-j], t0, t1, t2); |
157 |
+ MACC(m[j], n[i-j], t0, t1, t2); |
158 |
+ } |
159 |
+ m[i-len] = t0; |
160 |
+ t0 = t1; t1 = t2; t2 = 0; |
161 |
+ } |
162 |
+ |
163 |
+ while (t0) { |
164 |
+ t0 = sub(m, n, t0, len); |
165 |
+ } |
166 |
+} |
167 |
+ |
168 |
+// Fast Montgomery squaring. This uses asymptotically 25% fewer |
169 |
+// multiplies so it should be up to 25% faster than Montgomery |
170 |
+// multiplication. However, its loop control is more complex and it |
171 |
+// may actually run slower on some machines. |
172 |
+static void |
173 |
+montgomery_square(unsigned long a[], unsigned long n[], |
174 |
+ unsigned long m[], unsigned long inv, int len) { |
175 |
+ unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator |
176 |
+ int i; |
177 |
+ |
178 |
+ assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); |
179 |
+ |
180 |
+ for (i = 0; i < len; i++) { |
181 |
+ int j; |
182 |
+ int end = (i+1)/2; |
183 |
+ for (j = 0; j < end; j++) { |
184 |
+ MACC2(a[j], a[i-j], t0, t1, t2); |
185 |
+ MACC(m[j], n[i-j], t0, t1, t2); |
186 |
+ } |
187 |
+ if ((i & 1) == 0) { |
188 |
+ MACC(a[j], a[j], t0, t1, t2); |
189 |
+ } |
190 |
+ for (; j < i; j++) { |
191 |
+ MACC(m[j], n[i-j], t0, t1, t2); |
192 |
+ } |
193 |
+ m[i] = t0 * inv; |
194 |
+ MACC(m[i], n[0], t0, t1, t2); |
195 |
+ |
196 |
+ assert(t0 == 0, "broken Montgomery square"); |
197 |
+ |
198 |
+ t0 = t1; t1 = t2; t2 = 0; |
199 |
+ } |
200 |
+ |
201 |
+ for (i = len; i < 2*len; i++) { |
202 |
+ int start = i-len+1; |
203 |
+ int end = start + (len - start)/2; |
204 |
+ int j; |
205 |
+ for (j = start; j < end; j++) { |
206 |
+ MACC2(a[j], a[i-j], t0, t1, t2); |
207 |
+ MACC(m[j], n[i-j], t0, t1, t2); |
208 |
+ } |
209 |
+ if ((i & 1) == 0) { |
210 |
+ MACC(a[j], a[j], t0, t1, t2); |
211 |
+ } |
212 |
+ for (; j < len; j++) { |
213 |
+ MACC(m[j], n[i-j], t0, t1, t2); |
214 |
+ } |
215 |
+ m[i-len] = t0; |
216 |
+ t0 = t1; t1 = t2; t2 = 0; |
217 |
+ } |
218 |
+ |
219 |
+ while (t0) { |
220 |
+ t0 = sub(m, n, t0, len); |
221 |
+ } |
222 |
+} |
223 |
+ |
224 |
+// The threshold at which squaring is advantageous was determined |
225 |
+// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz. |
226 |
+// Doesn't seem to be relevant for Power8 so we use the same value. |
227 |
+#define MONTGOMERY_SQUARING_THRESHOLD 64 |
228 |
+ |
229 |
+// Copy len longwords from s to d, word-swapping as we go. The |
230 |
+// destination array is reversed. |
231 |
+static void reverse_words(unsigned long *s, unsigned long *d, int len) { |
232 |
+ d += len; |
233 |
+ while(len-- > 0) { |
234 |
+ d--; |
235 |
+ unsigned long s_val = *s; |
236 |
+ // Swap words in a longword on little endian machines. |
237 |
+#ifdef VM_LITTLE_ENDIAN |
238 |
+ s_val = (s_val << 32) | (s_val >> 32); |
239 |
+#endif |
240 |
+ *d = s_val; |
241 |
+ s++; |
242 |
+ } |
243 |
+} |
244 |
+ |
245 |
+void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints, |
246 |
+ jint len, jlong inv, |
247 |
+ jint *m_ints) { |
248 |
+ assert(len % 2 == 0, "array length in montgomery_multiply must be even"); |
249 |
+ int longwords = len/2; |
250 |
+ assert(longwords > 0, "unsupported"); |
251 |
+ |
252 |
+ // Make very sure we don't use so much space that the stack might |
253 |
+ // overflow. 512 jints correspond to a 16384-bit integer and |
254 |
+ // will use here a total of 8k bytes of stack space. |
255 |
+ int total_allocation = longwords * sizeof (unsigned long) * 4; |
256 |
+ guarantee(total_allocation <= 8192, "must be"); |
257 |
+ unsigned long *scratch = (unsigned long *)alloca(total_allocation); |
258 |
+ |
259 |
+ // Local scratch arrays |
260 |
+ unsigned long |
261 |
+ *a = scratch + 0 * longwords, |
262 |
+ *b = scratch + 1 * longwords, |
263 |
+ *n = scratch + 2 * longwords, |
264 |
+ *m = scratch + 3 * longwords; |
265 |
+ |
266 |
+ reverse_words((unsigned long *)a_ints, a, longwords); |
267 |
+ reverse_words((unsigned long *)b_ints, b, longwords); |
268 |
+ reverse_words((unsigned long *)n_ints, n, longwords); |
269 |
+ |
270 |
+ ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords); |
271 |
+ |
272 |
+ reverse_words(m, (unsigned long *)m_ints, longwords); |
273 |
+} |
274 |
+ |
275 |
+void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints, |
276 |
+ jint len, jlong inv, |
277 |
+ jint *m_ints) { |
278 |
+ assert(len % 2 == 0, "array length in montgomery_square must be even"); |
279 |
+ int longwords = len/2; |
280 |
+ assert(longwords > 0, "unsupported"); |
281 |
+ |
282 |
+ // Make very sure we don't use so much space that the stack might |
283 |
+ // overflow. 512 jints correspond to a 16384-bit integer and |
284 |
+ // will use here a total of 6k bytes of stack space. |
285 |
+ int total_allocation = longwords * sizeof (unsigned long) * 3; |
286 |
+ guarantee(total_allocation <= 8192, "must be"); |
287 |
+ unsigned long *scratch = (unsigned long *)alloca(total_allocation); |
288 |
+ |
289 |
+ // Local scratch arrays |
290 |
+ unsigned long |
291 |
+ *a = scratch + 0 * longwords, |
292 |
+ *n = scratch + 1 * longwords, |
293 |
+ *m = scratch + 2 * longwords; |
294 |
+ |
295 |
+ reverse_words((unsigned long *)a_ints, a, longwords); |
296 |
+ reverse_words((unsigned long *)n_ints, n, longwords); |
297 |
+ |
298 |
+ if (len >= MONTGOMERY_SQUARING_THRESHOLD) { |
299 |
+ ::montgomery_square(a, n, m, (unsigned long)inv, longwords); |
300 |
+ } else { |
301 |
+ ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords); |
302 |
+ } |
303 |
+ |
304 |
+ reverse_words(m, (unsigned long *)m_ints, longwords); |
305 |
+} |
306 |
diff --git a/src/cpu/ppc/vm/stubGenerator_ppc.cpp b/src/cpu/ppc/vm/stubGenerator_ppc.cpp |
307 |
--- openjdk/hotspot/src/cpu/ppc/vm/stubGenerator_ppc.cpp |
308 |
+++ openjdk/hotspot/src/cpu/ppc/vm/stubGenerator_ppc.cpp |
309 |
@@ -2094,6 +2094,14 @@ |
310 |
generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, |
311 |
&StubRoutines::_safefetchN_fault_pc, |
312 |
&StubRoutines::_safefetchN_continuation_pc); |
313 |
+ if (UseMontgomeryMultiplyIntrinsic) { |
314 |
+ StubRoutines::_montgomeryMultiply |
315 |
+ = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply); |
316 |
+ } |
317 |
+ if (UseMontgomerySquareIntrinsic) { |
318 |
+ StubRoutines::_montgomerySquare |
319 |
+ = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square); |
320 |
+ } |
321 |
} |
322 |
|
323 |
public: |
324 |
diff --git a/src/cpu/ppc/vm/templateInterpreter_ppc.cpp b/src/cpu/ppc/vm/templateInterpreter_ppc.cpp |
325 |
--- openjdk/hotspot/src/cpu/ppc/vm/templateInterpreter_ppc.cpp |
326 |
+++ openjdk/hotspot/src/cpu/ppc/vm/templateInterpreter_ppc.cpp |
327 |
@@ -265,7 +265,7 @@ |
328 |
__ cmpdi(CCR0, Rmdo, 0); |
329 |
__ beq(CCR0, no_mdo); |
330 |
|
331 |
- // Increment backedge counter in the MDO. |
332 |
+ // Increment invocation counter in the MDO. |
333 |
const int mdo_bc_offs = in_bytes(MethodData::backedge_counter_offset()) + in_bytes(InvocationCounter::counter_offset()); |
334 |
__ lwz(Rscratch2, mdo_bc_offs, Rmdo); |
335 |
__ addi(Rscratch2, Rscratch2, increment); |
336 |
@@ -277,12 +277,12 @@ |
337 |
} |
338 |
|
339 |
// Increment counter in MethodCounters*. |
340 |
- const int mo_bc_offs = in_bytes(MethodCounters::backedge_counter_offset()) + in_bytes(InvocationCounter::counter_offset()); |
341 |
+ const int mo_ic_offs = in_bytes(MethodCounters::invocation_counter_offset()) + in_bytes(InvocationCounter::counter_offset()); |
342 |
__ bind(no_mdo); |
343 |
__ get_method_counters(R19_method, R3_counters, done); |
344 |
- __ lwz(Rscratch2, mo_bc_offs, R3_counters); |
345 |
+ __ lwz(Rscratch2, mo_ic_offs, R3_counters); |
346 |
__ addi(Rscratch2, Rscratch2, increment); |
347 |
- __ stw(Rscratch2, mo_bc_offs, R3_counters); |
348 |
+ __ stw(Rscratch2, mo_ic_offs, R3_counters); |
349 |
__ load_const_optimized(Rscratch1, mask, R0); |
350 |
__ and_(Rscratch1, Rscratch2, Rscratch1); |
351 |
__ beq(CCR0, *overflow); |
352 |
diff --git a/src/cpu/ppc/vm/vm_version_ppc.cpp b/src/cpu/ppc/vm/vm_version_ppc.cpp |
353 |
--- openjdk/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp |
354 |
+++ openjdk/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp |
355 |
@@ -177,6 +177,12 @@ |
356 |
FLAG_SET_DEFAULT(UseSHA512Intrinsics, false); |
357 |
} |
358 |
|
359 |
+ if (FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) { |
360 |
+ UseMontgomeryMultiplyIntrinsic = true; |
361 |
+ } |
362 |
+ if (FLAG_IS_DEFAULT(UseMontgomerySquareIntrinsic)) { |
363 |
+ UseMontgomerySquareIntrinsic = true; |
364 |
+ } |
365 |
} |
366 |
|
367 |
void VM_Version::print_features() { |
368 |
diff --git a/src/share/vm/opto/library_call.cpp b/src/share/vm/opto/library_call.cpp |
369 |
--- openjdk/hotspot/src/share/vm/opto/library_call.cpp |
370 |
+++ openjdk/hotspot/src/share/vm/opto/library_call.cpp |
371 |
@@ -6031,11 +6031,21 @@ |
372 |
Node* n_start = array_element_address(n, intcon(0), n_elem); |
373 |
Node* m_start = array_element_address(m, intcon(0), m_elem); |
374 |
|
375 |
- Node* call = make_runtime_call(RC_LEAF, |
376 |
- OptoRuntime::montgomeryMultiply_Type(), |
377 |
- stubAddr, stubName, TypePtr::BOTTOM, |
378 |
- a_start, b_start, n_start, len, inv, top(), |
379 |
- m_start); |
380 |
+ Node* call = NULL; |
381 |
+ if (CCallingConventionRequiresIntsAsLongs) { |
382 |
+ Node* len_I2L = ConvI2L(len); |
383 |
+ call = make_runtime_call(RC_LEAF, |
384 |
+ OptoRuntime::montgomeryMultiply_Type(), |
385 |
+ stubAddr, stubName, TypePtr::BOTTOM, |
386 |
+ a_start, b_start, n_start, len_I2L XTOP, inv, |
387 |
+ top(), m_start); |
388 |
+ } else { |
389 |
+ call = make_runtime_call(RC_LEAF, |
390 |
+ OptoRuntime::montgomeryMultiply_Type(), |
391 |
+ stubAddr, stubName, TypePtr::BOTTOM, |
392 |
+ a_start, b_start, n_start, len, inv, top(), |
393 |
+ m_start); |
394 |
+ } |
395 |
set_result(m); |
396 |
} |
397 |
|
398 |
@@ -6085,11 +6095,22 @@ |
399 |
Node* n_start = array_element_address(n, intcon(0), n_elem); |
400 |
Node* m_start = array_element_address(m, intcon(0), m_elem); |
401 |
|
402 |
- Node* call = make_runtime_call(RC_LEAF, |
403 |
- OptoRuntime::montgomerySquare_Type(), |
404 |
- stubAddr, stubName, TypePtr::BOTTOM, |
405 |
- a_start, n_start, len, inv, top(), |
406 |
- m_start); |
407 |
+ Node* call = NULL; |
408 |
+ if (CCallingConventionRequiresIntsAsLongs) { |
409 |
+ Node* len_I2L = ConvI2L(len); |
410 |
+ call = make_runtime_call(RC_LEAF, |
411 |
+ OptoRuntime::montgomerySquare_Type(), |
412 |
+ stubAddr, stubName, TypePtr::BOTTOM, |
413 |
+ a_start, n_start, len_I2L XTOP, inv, top(), |
414 |
+ m_start); |
415 |
+ } else { |
416 |
+ call = make_runtime_call(RC_LEAF, |
417 |
+ OptoRuntime::montgomerySquare_Type(), |
418 |
+ stubAddr, stubName, TypePtr::BOTTOM, |
419 |
+ a_start, n_start, len, inv, top(), |
420 |
+ m_start); |
421 |
+ } |
422 |
+ |
423 |
set_result(m); |
424 |
} |
425 |
|
426 |
diff --git a/src/share/vm/opto/runtime.cpp b/src/share/vm/opto/runtime.cpp |
427 |
--- openjdk/hotspot/src/share/vm/opto/runtime.cpp |
428 |
+++ openjdk/hotspot/src/share/vm/opto/runtime.cpp |
429 |
@@ -1005,12 +1005,20 @@ |
430 |
// create input type (domain) |
431 |
int num_args = 7; |
432 |
int argcnt = num_args; |
433 |
+ if (CCallingConventionRequiresIntsAsLongs) { |
434 |
+ argcnt++; // additional placeholder |
435 |
+ } |
436 |
const Type** fields = TypeTuple::fields(argcnt); |
437 |
int argp = TypeFunc::Parms; |
438 |
fields[argp++] = TypePtr::NOTNULL; // a |
439 |
fields[argp++] = TypePtr::NOTNULL; // b |
440 |
fields[argp++] = TypePtr::NOTNULL; // n |
441 |
- fields[argp++] = TypeInt::INT; // len |
442 |
+ if (CCallingConventionRequiresIntsAsLongs) { |
443 |
+ fields[argp++] = TypeLong::LONG; // len |
444 |
+ fields[argp++] = TypeLong::HALF; // placeholder |
445 |
+ } else { |
446 |
+ fields[argp++] = TypeInt::INT; // len |
447 |
+ } |
448 |
fields[argp++] = TypeLong::LONG; // inv |
449 |
fields[argp++] = Type::HALF; |
450 |
fields[argp++] = TypePtr::NOTNULL; // result |
451 |
@@ -1029,11 +1037,19 @@ |
452 |
// create input type (domain) |
453 |
int num_args = 6; |
454 |
int argcnt = num_args; |
455 |
+ if (CCallingConventionRequiresIntsAsLongs) { |
456 |
+ argcnt++; // additional placeholder |
457 |
+ } |
458 |
const Type** fields = TypeTuple::fields(argcnt); |
459 |
int argp = TypeFunc::Parms; |
460 |
fields[argp++] = TypePtr::NOTNULL; // a |
461 |
fields[argp++] = TypePtr::NOTNULL; // n |
462 |
- fields[argp++] = TypeInt::INT; // len |
463 |
+ if (CCallingConventionRequiresIntsAsLongs) { |
464 |
+ fields[argp++] = TypeLong::LONG; // len |
465 |
+ fields[argp++] = TypeLong::HALF; // placeholder |
466 |
+ } else { |
467 |
+ fields[argp++] = TypeInt::INT; // len |
468 |
+ } |
469 |
fields[argp++] = TypeLong::LONG; // inv |
470 |
fields[argp++] = Type::HALF; |
471 |
fields[argp++] = TypePtr::NOTNULL; // result |