ecp_nistp384.c

Last change on this file was 108206, checked in by vboxsync, 3 months ago
openssl-3.3.2: Exported all files to OSE and removed .scm-settings bugref:10757
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 68.8 KB

Line
1	/*
2	* Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
3	*
4	* Licensed under the Apache License 2.0 (the "License"). You may not use
5	* this file except in compliance with the License. You can obtain a copy
6	* in the file LICENSE in the source distribution or at
7	* https://www.openssl.org/source/license.html
8	*/
9
10	/* Copyright 2023 IBM Corp.
11	*
12	* Licensed under the Apache License, Version 2.0 (the "License");
13	*
14	* you may not use this file except in compliance with the License.
15	* You may obtain a copy of the License at
16	*
17	* http://www.apache.org/licenses/LICENSE-2.0
18	*
19	* Unless required by applicable law or agreed to in writing, software
20	* distributed under the License is distributed on an "AS IS" BASIS,
21	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22	* See the License for the specific language governing permissions and
23	* limitations under the License.
24	*/
25
26	/*
27	* Designed for 56-bit limbs by Rohan McLure <[email protected]>.
28	* The layout is based on that of ecp_nistp{224,521}.c, allowing even for asm
29	* acceleration of felem_{square,mul} as supported in these files.
30	*/
31
32	#include <openssl/e_os2.h>
33
34	#include <string.h>
35	#include <openssl/err.h>
36	#include "ec_local.h"
37
38	#include "internal/numbers.h"
39
40	#ifndef INT128_MAX
41	# error "Your compiler doesn't appear to support 128-bit integer types"
42	#endif
43
/* Short aliases for the fixed-width integer types used in this file. */
typedef uint8_t u8;
typedef uint64_t u64;

/*
 * The underlying field. P384 operates over GF(2^384-2^128-2^96+2^32-1), so
 * one field element serialises into exactly 48 bytes. Such a 48-byte buffer
 * is called an felem_bytearray.
 */
typedef u8 felem_bytearray[48];

/*
 * The parameters of P384, taken from FIPS 186-3, section D.1.2.4.
 * Each row is one 48-byte big-endian value; the rows are, in order,
 * p, a, b, and the generator's x and y coordinates.
 */
static const felem_bytearray nistp384_curve_params[5] = {
    /* p */
    {
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF
    },
    /* a = -3 */
    {
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFC
    },
    /* b */
    {
        0xB3, 0x31, 0x2F, 0xA7, 0xE2, 0x3E, 0xE7, 0xE4, 0x98, 0x8E, 0x05, 0x6B,
        0xE3, 0xF8, 0x2D, 0x19, 0x18, 0x1D, 0x9C, 0x6E, 0xFE, 0x81, 0x41, 0x12,
        0x03, 0x14, 0x08, 0x8F, 0x50, 0x13, 0x87, 0x5A, 0xC6, 0x56, 0x39, 0x8D,
        0x8A, 0x2E, 0xD1, 0x9D, 0x2A, 0x85, 0xC8, 0xED, 0xD3, 0xEC, 0x2A, 0xEF
    },
    /* x */
    {
        0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05, 0x37, 0x8E, 0xB1, 0xC7, 0x1E,
        0xF3, 0x20, 0xAD, 0x74, 0x6E, 0x1D, 0x3B, 0x62, 0x8B, 0xA7, 0x9B, 0x98,
        0x59, 0xF7, 0x41, 0xE0, 0x82, 0x54, 0x2A, 0x38, 0x55, 0x02, 0xF2, 0x5D,
        0xBF, 0x55, 0x29, 0x6C, 0x3A, 0x54, 0x5E, 0x38, 0x72, 0x76, 0x0A, 0xB7
    },
    /* y */
    {
        0x36, 0x17, 0xDE, 0x4A, 0x96, 0x26, 0x2C, 0x6F, 0x5D, 0x9E, 0x98, 0xBF,
        0x92, 0x92, 0xDC, 0x29, 0xF8, 0xF4, 0x1D, 0xBD, 0x28, 0x9A, 0x14, 0x7C,
        0xE9, 0xDA, 0x31, 0x13, 0xB5, 0xF0, 0xB8, 0xC0, 0x0A, 0x60, 0xB1, 0xCE,
        0x1D, 0x7E, 0x81, 0x9D, 0x7A, 0x43, 0x1D, 0x7C, 0x90, 0xEA, 0x0E, 0x5F
    },
};
81
82	/*-
83	* The representation of field elements.
84	* ------------------------------------
85	*
86	* We represent field elements with seven values. These values are either 64 or
87	* 128 bits and the field element represented is:
88	* v[0]2^0 + v[1]2^56 + v[2]2^112 + ... + v[6]2^336 (mod p)
89	* Each of the seven values is called a 'limb'. Since the limbs are spaced only
90	* 56 bits apart, but are greater than 56 bits in length, the most significant
91	* bits of each limb overlap with the least significant bits of the next
92	*
93	* This representation is considered to be 'redundant' in the sense that
94	* intermediate values can each contain more than a 56-bit value in each limb.
95	* Reduction causes all but the final limb to be reduced to contain a value less
96	* than 2^56, with the final value represented allowed to be larger than 2^384,
97	* inasmuch as we can be sure that arithmetic overflow remains impossible. The
98	* reduced value must of course be congruent to the unreduced value.
99	*
100	* A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
101	* 'widefelem', featuring enough bits to store the result of a multiplication
102	* and even some further arithmetic without need for immediate reduction.
103	*/
104
105	#define NLIMBS 7
106
107	typedef uint64_t limb;
108	typedef uint128_t widelimb;
109	typedef limb limb_aX __attribute((__aligned__(1)));
110	typedef limb felem[NLIMBS];
111	typedef widelimb widefelem[2*NLIMBS-1];
112
113	static const limb bottom56bits = 0xffffffffffffff;
114
115	/* Helper functions (de)serialising reduced field elements in little endian */
116	static void bin48_to_felem(felem out, const u8 in[48])
117	{
118	memset(out, 0, 56);
119	out[0] = (((limb ) & in[0])) & bottom56bits;
120	out[1] = (((limb_aX ) & in[7])) & bottom56bits;
121	out[2] = (((limb_aX ) & in[14])) & bottom56bits;
122	out[3] = (((limb_aX ) & in[21])) & bottom56bits;
123	out[4] = (((limb_aX ) & in[28])) & bottom56bits;
124	out[5] = (((limb_aX ) & in[35])) & bottom56bits;
125	memmove(&out[6], &in[42], 6);
126	}
127
128	static void felem_to_bin48(u8 out[48], const felem in)
129	{
130	memset(out, 0, 48);
131	(((limb ) & out[0])) \|= (in[0] & bottom56bits);
132	(((limb_aX ) & out[7])) \|= (in[1] & bottom56bits);
133	(((limb_aX ) & out[14])) \|= (in[2] & bottom56bits);
134	(((limb_aX ) & out[21])) \|= (in[3] & bottom56bits);
135	(((limb_aX ) & out[28])) \|= (in[4] & bottom56bits);
136	(((limb_aX ) & out[35])) \|= (in[5] & bottom56bits);
137	memmove(&out[42], &in[6], 6);
138	}
139
140	/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
141	static int BN_to_felem(felem out, const BIGNUM *bn)
142	{
143	felem_bytearray b_out;
144	int num_bytes;
145
146	if (BN_is_negative(bn)) {
147	ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
148	return 0;
149	}
150	num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
151	if (num_bytes < 0) {
152	ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
153	return 0;
154	}
155	bin48_to_felem(out, b_out);
156	return 1;
157	}
158
159	/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
160	static BIGNUM felem_to_BN(BIGNUM out, const felem in)
161	{
162	felem_bytearray b_out;
163
164	felem_to_bin48(b_out, in);
165	return BN_lebin2bn(b_out, sizeof(b_out), out);
166	}
167
168	/*-
169	* Field operations
170	* ----------------
171	*/
172
173	static void felem_one(felem out)
174	{
175	out[0] = 1;
176	memset(&out[1], 0, sizeof(limb) * (NLIMBS-1));
177	}
178
179	static void felem_assign(felem out, const felem in)
180	{
181	memcpy(out, in, sizeof(felem));
182	}
183
184	/* felem_sum64 sets out = out + in. */
185	static void felem_sum64(felem out, const felem in)
186	{
187	unsigned int i;
188
189	for (i = 0; i < NLIMBS; i++)
190	out[i] += in[i];
191	}
192
193	/* felem_scalar sets out = in * scalar */
194	static void felem_scalar(felem out, const felem in, limb scalar)
195	{
196	unsigned int i;
197
198	for (i = 0; i < NLIMBS; i++)
199	out[i] = in[i] * scalar;
200	}
201
202	/* felem_scalar64 sets out = out * scalar */
203	static void felem_scalar64(felem out, limb scalar)
204	{
205	unsigned int i;
206
207	for (i = 0; i < NLIMBS; i++)
208	out[i] *= scalar;
209	}
210
211	/* felem_scalar128 sets out = out * scalar */
212	static void felem_scalar128(widefelem out, limb scalar)
213	{
214	unsigned int i;
215
216	for (i = 0; i < 2*NLIMBS-1; i++)
217	out[i] *= scalar;
218	}
219
220	/*-
221	* felem_neg sets \|out\| to \|-in\|
222	* On entry:
223	* in[i] < 2^60 - 2^29
224	* On exit:
225	* out[i] < 2^60
226	*/
227	static void felem_neg(felem out, const felem in)
228	{
229	/*
230	* In order to prevent underflow, we add a multiple of p before subtracting.
231	* Use telescopic sums to represent 2^12 * p redundantly with each limb
232	* of the form 2^60 + ...
233	*/
234	static const limb two60m52m4 = (((limb) 1) << 60)
235	- (((limb) 1) << 52)
236	- (((limb) 1) << 4);
237	static const limb two60p44m12 = (((limb) 1) << 60)
238	+ (((limb) 1) << 44)
239	- (((limb) 1) << 12);
240	static const limb two60m28m4 = (((limb) 1) << 60)
241	- (((limb) 1) << 28)
242	- (((limb) 1) << 4);
243	static const limb two60m4 = (((limb) 1) << 60)
244	- (((limb) 1) << 4);
245
246	out[0] = two60p44m12 - in[0];
247	out[1] = two60m52m4 - in[1];
248	out[2] = two60m28m4 - in[2];
249	out[3] = two60m4 - in[3];
250	out[4] = two60m4 - in[4];
251	out[5] = two60m4 - in[5];
252	out[6] = two60m4 - in[6];
253	}
254
255	/*-
256	* felem_diff64 subtracts \|in\| from \|out\|
257	* On entry:
258	* in[i] < 2^60 - 2^52 - 2^4
259	* On exit:
260	* out[i] < out_orig[i] + 2^60 + 2^44
261	*/
262	static void felem_diff64(felem out, const felem in)
263	{
264	/*
265	* In order to prevent underflow, we add a multiple of p before subtracting.
266	* Use telescopic sums to represent 2^12 * p redundantly with each limb
267	* of the form 2^60 + ...
268	*/
269
270	static const limb two60m52m4 = (((limb) 1) << 60)
271	- (((limb) 1) << 52)
272	- (((limb) 1) << 4);
273	static const limb two60p44m12 = (((limb) 1) << 60)
274	+ (((limb) 1) << 44)
275	- (((limb) 1) << 12);
276	static const limb two60m28m4 = (((limb) 1) << 60)
277	- (((limb) 1) << 28)
278	- (((limb) 1) << 4);
279	static const limb two60m4 = (((limb) 1) << 60)
280	- (((limb) 1) << 4);
281
282	out[0] += two60p44m12 - in[0];
283	out[1] += two60m52m4 - in[1];
284	out[2] += two60m28m4 - in[2];
285	out[3] += two60m4 - in[3];
286	out[4] += two60m4 - in[4];
287	out[5] += two60m4 - in[5];
288	out[6] += two60m4 - in[6];
289	}
290
291	/*
292	* in[i] < 2^63
293	* out[i] < out_orig[i] + 2^64 + 2^48
294	*/
295	static void felem_diff_128_64(widefelem out, const felem in)
296	{
297	/*
298	* In order to prevent underflow, we add a multiple of p before subtracting.
299	* Use telescopic sums to represent 2^16 * p redundantly with each limb
300	* of the form 2^64 + ...
301	*/
302
303	static const widelimb two64m56m8 = (((widelimb) 1) << 64)
304	- (((widelimb) 1) << 56)
305	- (((widelimb) 1) << 8);
306	static const widelimb two64m32m8 = (((widelimb) 1) << 64)
307	- (((widelimb) 1) << 32)
308	- (((widelimb) 1) << 8);
309	static const widelimb two64m8 = (((widelimb) 1) << 64)
310	- (((widelimb) 1) << 8);
311	static const widelimb two64p48m16 = (((widelimb) 1) << 64)
312	+ (((widelimb) 1) << 48)
313	- (((widelimb) 1) << 16);
314	unsigned int i;
315
316	out[0] += two64p48m16;
317	out[1] += two64m56m8;
318	out[2] += two64m32m8;
319	out[3] += two64m8;
320	out[4] += two64m8;
321	out[5] += two64m8;
322	out[6] += two64m8;
323
324	for (i = 0; i < NLIMBS; i++)
325	out[i] -= in[i];
326	}
327
328	/*
329	* in[i] < 2^127 - 2^119 - 2^71
330	* out[i] < out_orig[i] + 2^127 + 2^111
331	*/
332	static void felem_diff128(widefelem out, const widefelem in)
333	{
334	/*
335	* In order to prevent underflow, we add a multiple of p before subtracting.
336	* Use telescopic sums to represent 2^415 * p redundantly with each limb
337	* of the form 2^127 + ...
338	*/
339
340	static const widelimb two127 = ((widelimb) 1) << 127;
341	static const widelimb two127m71 = (((widelimb) 1) << 127)
342	- (((widelimb) 1) << 71);
343	static const widelimb two127p111m79m71 = (((widelimb) 1) << 127)
344	+ (((widelimb) 1) << 111)
345	- (((widelimb) 1) << 79)
346	- (((widelimb) 1) << 71);
347	static const widelimb two127m119m71 = (((widelimb) 1) << 127)
348	- (((widelimb) 1) << 119)
349	- (((widelimb) 1) << 71);
350	static const widelimb two127m95m71 = (((widelimb) 1) << 127)
351	- (((widelimb) 1) << 95)
352	- (((widelimb) 1) << 71);
353	unsigned int i;
354
355	out[0] += two127;
356	out[1] += two127m71;
357	out[2] += two127m71;
358	out[3] += two127m71;
359	out[4] += two127m71;
360	out[5] += two127m71;
361	out[6] += two127p111m79m71;
362	out[7] += two127m119m71;
363	out[8] += two127m95m71;
364	out[9] += two127m71;
365	out[10] += two127m71;
366	out[11] += two127m71;
367	out[12] += two127m71;
368
369	for (i = 0; i < 2*NLIMBS-1; i++)
370	out[i] -= in[i];
371	}
372
373	static void felem_square_ref(widefelem out, const felem in)
374	{
375	felem inx2;
376	felem_scalar(inx2, in, 2);
377
378	out[0] = ((uint128_t) in[0]) * in[0];
379
380	out[1] = ((uint128_t) in[0]) * inx2[1];
381
382	out[2] = ((uint128_t) in[0]) * inx2[2]
383	+ ((uint128_t) in[1]) * in[1];
384
385	out[3] = ((uint128_t) in[0]) * inx2[3]
386	+ ((uint128_t) in[1]) * inx2[2];
387
388	out[4] = ((uint128_t) in[0]) * inx2[4]
389	+ ((uint128_t) in[1]) * inx2[3]
390	+ ((uint128_t) in[2]) * in[2];
391
392	out[5] = ((uint128_t) in[0]) * inx2[5]
393	+ ((uint128_t) in[1]) * inx2[4]
394	+ ((uint128_t) in[2]) * inx2[3];
395
396	out[6] = ((uint128_t) in[0]) * inx2[6]
397	+ ((uint128_t) in[1]) * inx2[5]
398	+ ((uint128_t) in[2]) * inx2[4]
399	+ ((uint128_t) in[3]) * in[3];
400
401	out[7] = ((uint128_t) in[1]) * inx2[6]
402	+ ((uint128_t) in[2]) * inx2[5]
403	+ ((uint128_t) in[3]) * inx2[4];
404
405	out[8] = ((uint128_t) in[2]) * inx2[6]
406	+ ((uint128_t) in[3]) * inx2[5]
407	+ ((uint128_t) in[4]) * in[4];
408
409	out[9] = ((uint128_t) in[3]) * inx2[6]
410	+ ((uint128_t) in[4]) * inx2[5];
411
412	out[10] = ((uint128_t) in[4]) * inx2[6]
413	+ ((uint128_t) in[5]) * in[5];
414
415	out[11] = ((uint128_t) in[5]) * inx2[6];
416
417	out[12] = ((uint128_t) in[6]) * in[6];
418	}
419
420	static void felem_mul_ref(widefelem out, const felem in1, const felem in2)
421	{
422	out[0] = ((uint128_t) in1[0]) * in2[0];
423
424	out[1] = ((uint128_t) in1[0]) * in2[1]
425	+ ((uint128_t) in1[1]) * in2[0];
426
427	out[2] = ((uint128_t) in1[0]) * in2[2]
428	+ ((uint128_t) in1[1]) * in2[1]
429	+ ((uint128_t) in1[2]) * in2[0];
430
431	out[3] = ((uint128_t) in1[0]) * in2[3]
432	+ ((uint128_t) in1[1]) * in2[2]
433	+ ((uint128_t) in1[2]) * in2[1]
434	+ ((uint128_t) in1[3]) * in2[0];
435
436	out[4] = ((uint128_t) in1[0]) * in2[4]
437	+ ((uint128_t) in1[1]) * in2[3]
438	+ ((uint128_t) in1[2]) * in2[2]
439	+ ((uint128_t) in1[3]) * in2[1]
440	+ ((uint128_t) in1[4]) * in2[0];
441
442	out[5] = ((uint128_t) in1[0]) * in2[5]
443	+ ((uint128_t) in1[1]) * in2[4]
444	+ ((uint128_t) in1[2]) * in2[3]
445	+ ((uint128_t) in1[3]) * in2[2]
446	+ ((uint128_t) in1[4]) * in2[1]
447	+ ((uint128_t) in1[5]) * in2[0];
448
449	out[6] = ((uint128_t) in1[0]) * in2[6]
450	+ ((uint128_t) in1[1]) * in2[5]
451	+ ((uint128_t) in1[2]) * in2[4]
452	+ ((uint128_t) in1[3]) * in2[3]
453	+ ((uint128_t) in1[4]) * in2[2]
454	+ ((uint128_t) in1[5]) * in2[1]
455	+ ((uint128_t) in1[6]) * in2[0];
456
457	out[7] = ((uint128_t) in1[1]) * in2[6]
458	+ ((uint128_t) in1[2]) * in2[5]
459	+ ((uint128_t) in1[3]) * in2[4]
460	+ ((uint128_t) in1[4]) * in2[3]
461	+ ((uint128_t) in1[5]) * in2[2]
462	+ ((uint128_t) in1[6]) * in2[1];
463
464	out[8] = ((uint128_t) in1[2]) * in2[6]
465	+ ((uint128_t) in1[3]) * in2[5]
466	+ ((uint128_t) in1[4]) * in2[4]
467	+ ((uint128_t) in1[5]) * in2[3]
468	+ ((uint128_t) in1[6]) * in2[2];
469
470	out[9] = ((uint128_t) in1[3]) * in2[6]
471	+ ((uint128_t) in1[4]) * in2[5]
472	+ ((uint128_t) in1[5]) * in2[4]
473	+ ((uint128_t) in1[6]) * in2[3];
474
475	out[10] = ((uint128_t) in1[4]) * in2[6]
476	+ ((uint128_t) in1[5]) * in2[5]
477	+ ((uint128_t) in1[6]) * in2[4];
478
479	out[11] = ((uint128_t) in1[5]) * in2[6]
480	+ ((uint128_t) in1[6]) * in2[5];
481
482	out[12] = ((uint128_t) in1[6]) * in2[6];
483	}
484
485	/*-
486	* Reduce thirteen 128-bit coefficients to seven 64-bit coefficients.
487	* in[i] < 2^128 - 2^125
488	* out[i] < 2^56 for i < 6,
489	* out[6] <= 2^48
490	*
491	* The technique in use here stems from the format of the prime modulus:
492	* P384 = 2^384 - delta
493	*
494	* Thus we can reduce numbers of the form (X + 2^384 * Y) by substituting
495	* them with (X + delta Y), with delta = 2^128 + 2^96 + (-2^32 + 1). These
496	* coefficients are still quite large, and so we repeatedly apply this
497	* technique on high-order bits in order to guarantee the desired bounds on
498	* the size of our output.
499	*
500	* The three phases of elimination are as follows:
501	* [1]: Y = 2^120 (in[12] \| in[11] \| in[10] \| in[9])
502	* [2]: Y = 2^8 (acc[8] \| acc[7])
503	* [3]: Y = 2^48 (acc[6] >> 48)
504	* (Where a \| b \| c \| d = (2^56)^3 a + (2^56)^2 b + (2^56) c + d)
505	*/
506	static void felem_reduce(felem out, const widefelem in)
507	{
508	/*
509	* In order to prevent underflow, we add a multiple of p before subtracting.
510	* Use telescopic sums to represent 2^76 * p redundantly with each limb
511	* of the form 2^124 + ...
512	*/
513	static const widelimb two124m68 = (((widelimb) 1) << 124)
514	- (((widelimb) 1) << 68);
515	static const widelimb two124m116m68 = (((widelimb) 1) << 124)
516	- (((widelimb) 1) << 116)
517	- (((widelimb) 1) << 68);
518	static const widelimb two124p108m76 = (((widelimb) 1) << 124)
519	+ (((widelimb) 1) << 108)
520	- (((widelimb) 1) << 76);
521	static const widelimb two124m92m68 = (((widelimb) 1) << 124)
522	- (((widelimb) 1) << 92)
523	- (((widelimb) 1) << 68);
524	widelimb temp, acc[9];
525	unsigned int i;
526
527	memcpy(acc, in, sizeof(widelimb) * 9);
528
529	acc[0] += two124p108m76;
530	acc[1] += two124m116m68;
531	acc[2] += two124m92m68;
532	acc[3] += two124m68;
533	acc[4] += two124m68;
534	acc[5] += two124m68;
535	acc[6] += two124m68;
536
537	/* [1]: Eliminate in[9], ..., in[12] */
538	acc[8] += in[12] >> 32;
539	acc[7] += (in[12] & 0xffffffff) << 24;
540	acc[7] += in[12] >> 8;
541	acc[6] += (in[12] & 0xff) << 48;
542	acc[6] -= in[12] >> 16;
543	acc[5] -= (in[12] & 0xffff) << 40;
544	acc[6] += in[12] >> 48;
545	acc[5] += (in[12] & 0xffffffffffff) << 8;
546
547	acc[7] += in[11] >> 32;
548	acc[6] += (in[11] & 0xffffffff) << 24;
549	acc[6] += in[11] >> 8;
550	acc[5] += (in[11] & 0xff) << 48;
551	acc[5] -= in[11] >> 16;
552	acc[4] -= (in[11] & 0xffff) << 40;
553	acc[5] += in[11] >> 48;
554	acc[4] += (in[11] & 0xffffffffffff) << 8;
555
556	acc[6] += in[10] >> 32;
557	acc[5] += (in[10] & 0xffffffff) << 24;
558	acc[5] += in[10] >> 8;
559	acc[4] += (in[10] & 0xff) << 48;
560	acc[4] -= in[10] >> 16;
561	acc[3] -= (in[10] & 0xffff) << 40;
562	acc[4] += in[10] >> 48;
563	acc[3] += (in[10] & 0xffffffffffff) << 8;
564
565	acc[5] += in[9] >> 32;
566	acc[4] += (in[9] & 0xffffffff) << 24;
567	acc[4] += in[9] >> 8;
568	acc[3] += (in[9] & 0xff) << 48;
569	acc[3] -= in[9] >> 16;
570	acc[2] -= (in[9] & 0xffff) << 40;
571	acc[3] += in[9] >> 48;
572	acc[2] += (in[9] & 0xffffffffffff) << 8;
573
574	/*
575	* [2]: Eliminate acc[7], acc[8], that is the 7 and eighth limbs, as
576	* well as the contributions made from eliminating higher limbs.
577	* acc[7] < in[7] + 2^120 + 2^56 < in[7] + 2^121
578	* acc[8] < in[8] + 2^96
579	*/
580	acc[4] += acc[8] >> 32;
581	acc[3] += (acc[8] & 0xffffffff) << 24;
582	acc[3] += acc[8] >> 8;
583	acc[2] += (acc[8] & 0xff) << 48;
584	acc[2] -= acc[8] >> 16;
585	acc[1] -= (acc[8] & 0xffff) << 40;
586	acc[2] += acc[8] >> 48;
587	acc[1] += (acc[8] & 0xffffffffffff) << 8;
588
589	acc[3] += acc[7] >> 32;
590	acc[2] += (acc[7] & 0xffffffff) << 24;
591	acc[2] += acc[7] >> 8;
592	acc[1] += (acc[7] & 0xff) << 48;
593	acc[1] -= acc[7] >> 16;
594	acc[0] -= (acc[7] & 0xffff) << 40;
595	acc[1] += acc[7] >> 48;
596	acc[0] += (acc[7] & 0xffffffffffff) << 8;
597
598	/*-
599	* acc[k] < in[k] + 2^124 + 2^121
600	* < in[k] + 2^125
601	* < 2^128, for k <= 6
602	*/
603
604	/*
605	* Carry 4 -> 5 -> 6
606	* This has the effect of ensuring that these more significant limbs
607	* will be small in value after eliminating high bits from acc[6].
608	*/
609	acc[5] += acc[4] >> 56;
610	acc[4] &= 0x00ffffffffffffff;
611
612	acc[6] += acc[5] >> 56;
613	acc[5] &= 0x00ffffffffffffff;
614
615	/*-
616	* acc[6] < in[6] + 2^124 + 2^121 + 2^72 + 2^16
617	* < in[6] + 2^125
618	* < 2^128
619	*/
620
621	/* [3]: Eliminate high bits of acc[6] */
622	temp = acc[6] >> 48;
623	acc[6] &= 0x0000ffffffffffff;
624
625	/* temp < 2^80 */
626
627	acc[3] += temp >> 40;
628	acc[2] += (temp & 0xffffffffff) << 16;
629	acc[2] += temp >> 16;
630	acc[1] += (temp & 0xffff) << 40;
631	acc[1] -= temp >> 24;
632	acc[0] -= (temp & 0xffffff) << 32;
633	acc[0] += temp;
634
635	/*-
636	* acc[k] < acc_old[k] + 2^64 + 2^56
637	* < in[k] + 2^124 + 2^121 + 2^72 + 2^64 + 2^56 + 2^16 , k < 4
638	*/
639
640	/* Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
641	acc[1] += acc[0] >> 56; /* acc[1] < acc_old[1] + 2^72 */
642	acc[0] &= 0x00ffffffffffffff;
643
644	acc[2] += acc[1] >> 56; /* acc[2] < acc_old[2] + 2^72 + 2^16 */
645	acc[1] &= 0x00ffffffffffffff;
646
647	acc[3] += acc[2] >> 56; /* acc[3] < acc_old[3] + 2^72 + 2^16 */
648	acc[2] &= 0x00ffffffffffffff;
649
650	/*-
651	* acc[k] < acc_old[k] + 2^72 + 2^16
652	* < in[k] + 2^124 + 2^121 + 2^73 + 2^64 + 2^56 + 2^17
653	* < in[k] + 2^125
654	* < 2^128 , k < 4
655	*/
656
657	acc[4] += acc[3] >> 56; /*-
658	* acc[4] < acc_old[4] + 2^72 + 2^16
659	* < 2^72 + 2^56 + 2^16
660	*/
661	acc[3] &= 0x00ffffffffffffff;
662
663	acc[5] += acc[4] >> 56; /*-
664	* acc[5] < acc_old[5] + 2^16 + 1
665	* < 2^56 + 2^16 + 1
666	*/
667	acc[4] &= 0x00ffffffffffffff;
668
669	acc[6] += acc[5] >> 56; /* acc[6] < 2^48 + 1 <= 2^48 */
670	acc[5] &= 0x00ffffffffffffff;
671
672	for (i = 0; i < NLIMBS; i++)
673	out[i] = acc[i];
674	}
675
#if defined(ECP_NISTP384_ASM)
/*
 * With assembly support compiled in, felem_square and felem_mul are reached
 * through the function pointers below. Both pointers start out at a wrapper;
 * the first call through either wrapper runs felem_select(), which repoints
 * both at the implementation chosen for this machine, and then forwards the
 * call through the updated pointer.
 */
static void felem_square_wrapper(widefelem out, const felem in);
static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2);

static void (*felem_square_p)(widefelem out, const felem in) =
    felem_square_wrapper;
static void (*felem_mul_p)(widefelem out, const felem in1, const felem in2) =
    felem_mul_wrapper;

/* Implementations provided outside this file */
void p384_felem_square(widefelem out, const felem in);
void p384_felem_mul(widefelem out, const felem in1, const felem in2);

# if defined(_ARCH_PPC64)
#  include "crypto/ppc_arch.h"
# endif

/*
 * felem_select picks the squaring and multiplication routines: the p384_*
 * routines on PPC64 when both the PPC_MADD300 and PPC_ALTIVEC capability
 * bits are set, the portable reference routines otherwise.
 */
static void felem_select(void)
{
    void (*square_impl)(widefelem out, const felem in) = felem_square_ref;
    void (*mul_impl)(widefelem out, const felem in1, const felem in2) =
        felem_mul_ref;

# if defined(_ARCH_PPC64)
    if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) {
        square_impl = p384_felem_square;
        mul_impl = p384_felem_mul;
    }
# endif

    felem_square_p = square_impl;
    felem_mul_p = mul_impl;
}

/* Initial target of felem_square_p: select, then forward the call */
static void felem_square_wrapper(widefelem out, const felem in)
{
    felem_select();
    felem_square_p(out, in);
}

/* Initial target of felem_mul_p: select, then forward the call */
static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2)
{
    felem_select();
    felem_mul_p(out, in1, in2);
}

# define felem_square felem_square_p
# define felem_mul felem_mul_p
#else
/* Without assembly support the reference routines are used directly */
# define felem_square felem_square_ref
# define felem_mul felem_mul_ref
#endif
726
727	static ossl_inline void felem_square_reduce(felem out, const felem in)
728	{
729	widefelem tmp;
730
731	felem_square(tmp, in);
732	felem_reduce(out, tmp);
733	}
734
735	static ossl_inline void felem_mul_reduce(felem out, const felem in1, const felem in2)
736	{
737	widefelem tmp;
738
739	felem_mul(tmp, in1, in2);
740	felem_reduce(out, tmp);
741	}
742
743	/*-
744	* felem_inv calculates \|out\| = \|in\|^{-1}
745	*
746	* Based on Fermat's Little Theorem:
747	* a^p = a (mod p)
748	* a^{p-1} = 1 (mod p)
749	* a^{p-2} = a^{-1} (mod p)
750	*/
751	static void felem_inv(felem out, const felem in)
752	{
753	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6;
754	unsigned int i = 0;
755
756	felem_square_reduce(ftmp, in); /* 2^1 */
757	felem_mul_reduce(ftmp, ftmp, in); /* 2^1 + 2^0 */
758	felem_assign(ftmp2, ftmp);
759
760	felem_square_reduce(ftmp, ftmp); /* 2^2 + 2^1 */
761	felem_mul_reduce(ftmp, ftmp, in); /* 2^2 + 2^1 * 2^0 */
762	felem_assign(ftmp3, ftmp);
763
764	for (i = 0; i < 3; i++)
765	felem_square_reduce(ftmp, ftmp); /* 2^5 + 2^4 + 2^3 */
766	felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0 */
767	felem_assign(ftmp4, ftmp);
768
769	for (i = 0; i < 6; i++)
770	felem_square_reduce(ftmp, ftmp); /* 2^11 + ... + 2^6 */
771	felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^11 + ... + 2^0 */
772
773	for (i = 0; i < 3; i++)
774	felem_square_reduce(ftmp, ftmp); /* 2^14 + ... + 2^3 */
775	felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^14 + ... + 2^0 */
776	felem_assign(ftmp5, ftmp);
777
778	for (i = 0; i < 15; i++)
779	felem_square_reduce(ftmp, ftmp); /* 2^29 + ... + 2^15 */
780	felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^29 + ... + 2^0 */
781	felem_assign(ftmp6, ftmp);
782
783	for (i = 0; i < 30; i++)
784	felem_square_reduce(ftmp, ftmp); /* 2^59 + ... + 2^30 */
785	felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^59 + ... + 2^0 */
786	felem_assign(ftmp4, ftmp);
787
788	for (i = 0; i < 60; i++)
789	felem_square_reduce(ftmp, ftmp); /* 2^119 + ... + 2^60 */
790	felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^119 + ... + 2^0 */
791	felem_assign(ftmp4, ftmp);
792
793	for (i = 0; i < 120; i++)
794	felem_square_reduce(ftmp, ftmp); /* 2^239 + ... + 2^120 */
795	felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^239 + ... + 2^0 */
796
797	for (i = 0; i < 15; i++)
798	felem_square_reduce(ftmp, ftmp); /* 2^254 + ... + 2^15 */
799	felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^254 + ... + 2^0 */
800
801	for (i = 0; i < 31; i++)
802	felem_square_reduce(ftmp, ftmp); /* 2^285 + ... + 2^31 */
803	felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^285 + ... + 2^31 + 2^29 + ... + 2^0 */
804
805	for (i = 0; i < 2; i++)
806	felem_square_reduce(ftmp, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^2 */
807	felem_mul_reduce(ftmp, ftmp2, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^0 */
808
809	for (i = 0; i < 94; i++)
810	felem_square_reduce(ftmp, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 */
811	felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 + 2^29 + ... + 2^0 */
812
813	for (i = 0; i < 2; i++)
814	felem_square_reduce(ftmp, ftmp); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 */
815	felem_mul_reduce(ftmp, in, ftmp); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 + 2^0 */
816
817	memcpy(out, ftmp, sizeof(felem));
818	}
819
820	/*
821	* Zero-check: returns a limb with all bits set if \|in\| == 0 (mod p)
822	* and 0 otherwise. We know that field elements are reduced to
823	* 0 < in < 2p, so we only need to check two cases:
824	* 0 and 2^384 - 2^128 - 2^96 + 2^32 - 1
825	* in[k] < 2^56, k < 6
826	* in[6] <= 2^48
827	*/
828	static limb felem_is_zero(const felem in)
829	{
830	limb zero, p384;
831
832	zero = in[0] \| in[1] \| in[2] \| in[3] \| in[4] \| in[5] \| in[6];
833	zero = ((int64_t) (zero) - 1) >> 63;
834	p384 = (in[0] ^ 0x000000ffffffff) \| (in[1] ^ 0xffff0000000000)
835	\| (in[2] ^ 0xfffffffffeffff) \| (in[3] ^ 0xffffffffffffff)
836	\| (in[4] ^ 0xffffffffffffff) \| (in[5] ^ 0xffffffffffffff)
837	\| (in[6] ^ 0xffffffffffff);
838	p384 = ((int64_t) (p384) - 1) >> 63;
839
840	return (zero \| p384);
841	}
842
843	static int felem_is_zero_int(const void *in)
844	{
845	return (int)(felem_is_zero(in) & ((limb) 1));
846	}
847
848	/*-
849	* felem_contract converts \|in\| to its unique, minimal representation.
850	* Assume we've removed all redundant bits.
851	* On entry:
852	* in[k] < 2^56, k < 6
853	* in[6] <= 2^48
854	*/
855	static void felem_contract(felem out, const felem in)
856	{
857	static const int64_t two56 = ((limb) 1) << 56;
858
859	/*
860	* We know for a fact that 0 <= \|in\| < 2*p, for p = 2^384 - 2^128 - 2^96 + 2^32 - 1
861	* Perform two successive, idempotent subtractions to reduce if \|in\| >= p.
862	*/
863
864	int64_t tmp[NLIMBS], cond[5], a;
865	unsigned int i;
866
867	memcpy(tmp, in, sizeof(felem));
868
869	/* Case 1: a = 1 iff \|in\| >= 2^384 */
870	a = (in[6] >> 48);
871	tmp[0] += a;
872	tmp[0] -= a << 32;
873	tmp[1] += a << 40;
874	tmp[2] += a << 16;
875	tmp[6] &= 0x0000ffffffffffff;
876
877	/*
878	* eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
879	* non-zero, so we only need one step
880	*/
881
882	a = tmp[0] >> 63;
883	tmp[0] += a & two56;
884	tmp[1] -= a & 1;
885
886	/* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
887	tmp[2] += tmp[1] >> 56;
888	tmp[1] &= 0x00ffffffffffffff;
889
890	tmp[3] += tmp[2] >> 56;
891	tmp[2] &= 0x00ffffffffffffff;
892
893	tmp[4] += tmp[3] >> 56;
894	tmp[3] &= 0x00ffffffffffffff;
895
896	tmp[5] += tmp[4] >> 56;
897	tmp[4] &= 0x00ffffffffffffff;
898
899	tmp[6] += tmp[5] >> 56; /* tmp[6] < 2^48 */
900	tmp[5] &= 0x00ffffffffffffff;
901
902	/*
903	* Case 2: a = all ones if p <= \|in\| < 2^384, 0 otherwise
904	*/
905
906	/* 0 iff (2^129..2^383) are all one */
907	cond[0] = ((tmp[6] \| 0xff000000000000) & tmp[5] & tmp[4] & tmp[3] & (tmp[2] \| 0x0000000001ffff)) + 1;
908	/* 0 iff 2^128 bit is one */
909	cond[1] = (tmp[2] \| ~0x00000000010000) + 1;
910	/* 0 iff (2^96..2^127) bits are all one */
911	cond[2] = ((tmp[2] \| 0xffffffffff0000) & (tmp[1] \| 0x0000ffffffffff)) + 1;
912	/* 0 iff (2^32..2^95) bits are all zero */
913	cond[3] = (tmp[1] & ~0xffff0000000000) \| (tmp[0] & ~((int64_t) 0x000000ffffffff));
914	/* 0 iff (2^0..2^31) bits are all one */
915	cond[4] = (tmp[0] \| 0xffffff00000000) + 1;
916
917	/*
918	* In effect, invert our conditions, so that 0 values become all 1's,
919	* any non-zero value in the low-order 56 bits becomes all 0's
920	*/
921	for (i = 0; i < 5; i++)
922	cond[i] = ((cond[i] & 0x00ffffffffffffff) - 1) >> 63;
923
924	/*
925	* The condition for determining whether in is greater than our
926	* prime is given by the following condition.
927	*/
928
929	/* First subtract 2^384 - 2^129 cheaply */
930	a = cond[0] & (cond[1] \| (cond[2] & (~cond[3] \| cond[4])));
931	tmp[6] &= ~a;
932	tmp[5] &= ~a;
933	tmp[4] &= ~a;
934	tmp[3] &= ~a;
935	tmp[2] &= ~a \| 0x0000000001ffff;
936
937	/*
938	* Subtract 2^128 - 2^96 by
939	* means of disjoint cases.
940	*/
941
942	/* subtract 2^128 if that bit is present, and add 2^96 */
943	a = cond[0] & cond[1];
944	tmp[2] &= ~a \| 0xfffffffffeffff;
945	tmp[1] += a & ((int64_t) 1 << 40);
946
947	/* otherwise, clear bits 2^127 .. 2^96 */
948	a = cond[0] & ~cond[1] & (cond[2] & (~cond[3] \| cond[4]));
949	tmp[2] &= ~a \| 0xffffffffff0000;
950	tmp[1] &= ~a \| 0x0000ffffffffff;
951
952	/* finally, subtract the last 2^32 - 1 */
953	a = cond[0] & (cond[1] \| (cond[2] & (~cond[3] \| cond[4])));
954	tmp[0] += a & (-((int64_t) 1 << 32) + 1);
955
956	/*
957	* eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
958	* non-zero, so we only need one step
959	*/
960	a = tmp[0] >> 63;
961	tmp[0] += a & two56;
962	tmp[1] -= a & 1;
963
964	/* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
965	tmp[2] += tmp[1] >> 56;
966	tmp[1] &= 0x00ffffffffffffff;
967
968	tmp[3] += tmp[2] >> 56;
969	tmp[2] &= 0x00ffffffffffffff;
970
971	tmp[4] += tmp[3] >> 56;
972	tmp[3] &= 0x00ffffffffffffff;
973
974	tmp[5] += tmp[4] >> 56;
975	tmp[4] &= 0x00ffffffffffffff;
976
977	tmp[6] += tmp[5] >> 56;
978	tmp[5] &= 0x00ffffffffffffff;
979
980	memcpy(out, tmp, sizeof(felem));
981	}
982
983	/*-
984	* Group operations
985	* ----------------
986	*
987	* Building on top of the field operations we have the operations on the
988	* elliptic curve group itself. Points on the curve are represented in Jacobian
989	* coordinates
990	*/
991
992	/*-
993	* point_double calculates 2*(x_in, y_in, z_in)
994	*
995	* The method is taken from:
996	* http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
997	*
998	* Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
999	* while x_out == y_in is not (maybe this works, but it's not tested).
1000	*/
1001	static void
1002	point_double(felem x_out, felem y_out, felem z_out,
1003	const felem x_in, const felem y_in, const felem z_in)
1004	{
1005	widefelem tmp, tmp2;
1006	felem delta, gamma, beta, alpha, ftmp, ftmp2;
1007
1008	felem_assign(ftmp, x_in);
1009	felem_assign(ftmp2, x_in);
1010
1011	/* delta = z^2 */
1012	felem_square_reduce(delta, z_in); /* delta[i] < 2^56 */
1013
1014	/* gamma = y^2 */
1015	felem_square_reduce(gamma, y_in); /* gamma[i] < 2^56 */
1016
1017	/* beta = xgamma /
1018	felem_mul_reduce(beta, x_in, gamma); /* beta[i] < 2^56 */
1019
1020	/* alpha = 3(x-delta)(x+delta) */
1021	felem_diff64(ftmp, delta); /* ftmp[i] < 2^60 + 2^58 + 2^44 */
1022	felem_sum64(ftmp2, delta); /* ftmp2[i] < 2^59 */
1023	felem_scalar64(ftmp2, 3); /* ftmp2[i] < 2^61 */
1024	felem_mul_reduce(alpha, ftmp, ftmp2); /* alpha[i] < 2^56 */
1025
1026	/* x' = alpha^2 - 8beta /
1027	felem_square(tmp, alpha); /* tmp[i] < 2^115 */
1028	felem_assign(ftmp, beta); /* ftmp[i] < 2^56 */
1029	felem_scalar64(ftmp, 8); /* ftmp[i] < 2^59 */
1030	felem_diff_128_64(tmp, ftmp); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1031	felem_reduce(x_out, tmp); /* x_out[i] < 2^56 */
1032
1033	/* z' = (y + z)^2 - gamma - delta */
1034	felem_sum64(delta, gamma); /* delta[i] < 2^57 */
1035	felem_assign(ftmp, y_in); /* ftmp[i] < 2^56 */
1036	felem_sum64(ftmp, z_in); /* ftmp[i] < 2^56 */
1037	felem_square(tmp, ftmp); /* tmp[i] < 2^115 */
1038	felem_diff_128_64(tmp, delta); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1039	felem_reduce(z_out, tmp); /* z_out[i] < 2^56 */
1040
1041	/* y' = alpha(4beta - x') - 8gamma^2 /
1042	felem_scalar64(beta, 4); /* beta[i] < 2^58 */
1043	felem_diff64(beta, x_out); /* beta[i] < 2^60 + 2^58 + 2^44 */
1044	felem_mul(tmp, alpha, beta); /* tmp[i] < 2^119 */
1045	felem_square(tmp2, gamma); /* tmp2[i] < 2^115 */
1046	felem_scalar128(tmp2, 8); /* tmp2[i] < 2^118 */
1047	felem_diff128(tmp, tmp2); /* tmp[i] < 2^127 + 2^119 + 2^111 */
1048	felem_reduce(y_out, tmp); /* tmp[i] < 2^56 */
1049	}
1050
1051	/* copy_conditional copies in to out iff mask is all ones. */
1052	static void copy_conditional(felem out, const felem in, limb mask)
1053	{
1054	unsigned int i;
1055
1056	for (i = 0; i < NLIMBS; i++)
1057	out[i] ^= mask & (in[i] ^ out[i]);
1058	}
1059
1060	/*-
1061	* point_add calculates (x1, y1, z1) + (x2, y2, z2)
1062	*
1063	* The method is taken from
1064	* http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1065	* adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1066	*
1067	* This function includes a branch for checking whether the two input points
1068	* are equal (while not equal to the point at infinity). See comment below
1069	* on constant-time.
1070	*/
1071	static void point_add(felem x3, felem y3, felem z3,
1072	const felem x1, const felem y1, const felem z1,
1073	const int mixed, const felem x2, const felem y2,
1074	const felem z2)
1075	{
1076	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1077	widefelem tmp, tmp2;
1078	limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1079	limb points_equal;
1080
1081	z1_is_zero = felem_is_zero(z1);
1082	z2_is_zero = felem_is_zero(z2);
1083
1084	/* ftmp = z1z1 = z1*2 /
1085	felem_square_reduce(ftmp, z1); /* ftmp[i] < 2^56 */
1086
1087	if (!mixed) {
1088	/* ftmp2 = z2z2 = z2*2 /
1089	felem_square_reduce(ftmp2, z2); /* ftmp2[i] < 2^56 */
1090
1091	/* u1 = ftmp3 = x1z2z2 /
1092	felem_mul_reduce(ftmp3, x1, ftmp2); /* ftmp3[i] < 2^56 */
1093
1094	/* ftmp5 = z1 + z2 */
1095	felem_assign(ftmp5, z1); /* ftmp5[i] < 2^56 */
1096	felem_sum64(ftmp5, z2); /* ftmp5[i] < 2^57 */
1097
1098	/* ftmp5 = (z1 + z2)*2 - z1z1 - z2z2 = 2z1z2 */
1099	felem_square(tmp, ftmp5); /* tmp[i] < 2^117 */
1100	felem_diff_128_64(tmp, ftmp); /* tmp[i] < 2^117 + 2^64 + 2^48 */
1101	felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^65 + 2^49 */
1102	felem_reduce(ftmp5, tmp); /* ftmp5[i] < 2^56 */
1103
1104	/* ftmp2 = z2 * z2z2 */
1105	felem_mul_reduce(ftmp2, ftmp2, z2); /* ftmp2[i] < 2^56 */
1106
1107	/* s1 = ftmp6 = y1 * z2*3 /
1108	felem_mul_reduce(ftmp6, y1, ftmp2); /* ftmp6[i] < 2^56 */
1109	} else {
1110	/*
1111	* We'll assume z2 = 1 (special case z2 = 0 is handled later)
1112	*/
1113
1114	/* u1 = ftmp3 = x1z2z2 /
1115	felem_assign(ftmp3, x1); /* ftmp3[i] < 2^56 */
1116
1117	/* ftmp5 = 2z1z2 /
1118	felem_scalar(ftmp5, z1, 2); /* ftmp5[i] < 2^57 */
1119
1120	/* s1 = ftmp6 = y1 * z2*3 /
1121	felem_assign(ftmp6, y1); /* ftmp6[i] < 2^56 */
1122	}
1123	/* ftmp3[i] < 2^56, ftmp5[i] < 2^57, ftmp6[i] < 2^56 */
1124
1125	/* u2 = x2z1z1 /
1126	felem_mul(tmp, x2, ftmp); /* tmp[i] < 2^115 */
1127
1128	/* h = ftmp4 = u2 - u1 */
1129	felem_diff_128_64(tmp, ftmp3); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1130	felem_reduce(ftmp4, tmp); /* ftmp[4] < 2^56 */
1131
1132	x_equal = felem_is_zero(ftmp4);
1133
1134	/* z_out = ftmp5 * h */
1135	felem_mul_reduce(z_out, ftmp5, ftmp4); /* z_out[i] < 2^56 */
1136
1137	/* ftmp = z1 * z1z1 */
1138	felem_mul_reduce(ftmp, ftmp, z1); /* ftmp[i] < 2^56 */
1139
1140	/* s2 = tmp = y2 * z1*3 /
1141	felem_mul(tmp, y2, ftmp); /* tmp[i] < 2^115 */
1142
1143	/* r = ftmp5 = (s2 - s1)2 /
1144	felem_diff_128_64(tmp, ftmp6); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1145	felem_reduce(ftmp5, tmp); /* ftmp5[i] < 2^56 */
1146	y_equal = felem_is_zero(ftmp5);
1147	felem_scalar64(ftmp5, 2); /* ftmp5[i] < 2^57 */
1148
1149	/*
1150	* The formulae are incorrect if the points are equal, in affine coordinates
1151	* (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1152	* happens.
1153	*
1154	* We use bitwise operations to avoid potential side-channels introduced by
1155	* the short-circuiting behaviour of boolean operators.
1156	*
1157	* The special case of either point being the point at infinity (z1 and/or
1158	* z2 are zero), is handled separately later on in this function, so we
1159	* avoid jumping to point_double here in those special cases.
1160	*
1161	* Notice the comment below on the implications of this branching for timing
1162	* leaks and why it is considered practically irrelevant.
1163	*/
1164	points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1165
1166	if (points_equal) {
1167	/*
1168	* This is obviously not constant-time but it will almost-never happen
1169	* for ECDH / ECDSA.
1170	*/
1171	point_double(x3, y3, z3, x1, y1, z1);
1172	return;
1173	}
1174
1175	/* I = ftmp = (2h)*2 /
1176	felem_assign(ftmp, ftmp4); /* ftmp[i] < 2^56 */
1177	felem_scalar64(ftmp, 2); /* ftmp[i] < 2^57 */
1178	felem_square_reduce(ftmp, ftmp); /* ftmp[i] < 2^56 */
1179
1180	/* J = ftmp2 = h * I */
1181	felem_mul_reduce(ftmp2, ftmp4, ftmp); /* ftmp2[i] < 2^56 */
1182
1183	/* V = ftmp4 = U1 * I */
1184	felem_mul_reduce(ftmp4, ftmp3, ftmp); /* ftmp4[i] < 2^56 */
1185
1186	/* x_out = r*2 - J - 2V /
1187	felem_square(tmp, ftmp5); /* tmp[i] < 2^117 */
1188	felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^64 + 2^48 */
1189	felem_assign(ftmp3, ftmp4); /* ftmp3[i] < 2^56 */
1190	felem_scalar64(ftmp4, 2); /* ftmp4[i] < 2^57 */
1191	felem_diff_128_64(tmp, ftmp4); /* tmp[i] < 2^117 + 2^65 + 2^49 */
1192	felem_reduce(x_out, tmp); /* x_out[i] < 2^56 */
1193
1194	/* y_out = r(V-x_out) - 2 * s1 * J */
1195	felem_diff64(ftmp3, x_out); /* ftmp3[i] < 2^60 + 2^56 + 2^44 */
1196	felem_mul(tmp, ftmp5, ftmp3); /* tmp[i] < 2^116 */
1197	felem_mul(tmp2, ftmp6, ftmp2); /* tmp2[i] < 2^115 */
1198	felem_scalar128(tmp2, 2); /* tmp2[i] < 2^116 */
1199	felem_diff128(tmp, tmp2); /* tmp[i] < 2^127 + 2^116 + 2^111 */
1200	felem_reduce(y_out, tmp); /* y_out[i] < 2^56 */
1201
1202	copy_conditional(x_out, x2, z1_is_zero);
1203	copy_conditional(x_out, x1, z2_is_zero);
1204	copy_conditional(y_out, y2, z1_is_zero);
1205	copy_conditional(y_out, y1, z2_is_zero);
1206	copy_conditional(z_out, z2, z1_is_zero);
1207	copy_conditional(z_out, z1, z2_is_zero);
1208	felem_assign(x3, x_out);
1209	felem_assign(y3, y_out);
1210	felem_assign(z3, z_out);
1211	}
1212
/*-
 * Base point pre computation
 * --------------------------
 *
 * Two different sorts of precomputed tables are used in the following code.
 * Each contains various points on the curve, where each point is three field
 * elements (x, y, z).
 *
 * For the base point table, z is usually 1 (0 for the point at infinity).
 * This table has 16 elements:
 * index | bits    | point
 * ------+---------+------------------------------
 *     0 | 0 0 0 0 | 0G
 *     1 | 0 0 0 1 | 1G
 *     2 | 0 0 1 0 | 2^95G
 *     3 | 0 0 1 1 | (2^95 + 1)G
 *     4 | 0 1 0 0 | 2^190G
 *     5 | 0 1 0 1 | (2^190 + 1)G
 *     6 | 0 1 1 0 | (2^190 + 2^95)G
 *     7 | 0 1 1 1 | (2^190 + 2^95 + 1)G
 *     8 | 1 0 0 0 | 2^285G
 *     9 | 1 0 0 1 | (2^285 + 1)G
 *    10 | 1 0 1 0 | (2^285 + 2^95)G
 *    11 | 1 0 1 1 | (2^285 + 2^95 + 1)G
 *    12 | 1 1 0 0 | (2^285 + 2^190)G
 *    13 | 1 1 0 1 | (2^285 + 2^190 + 1)G
 *    14 | 1 1 1 0 | (2^285 + 2^190 + 2^95)G
 *    15 | 1 1 1 1 | (2^285 + 2^190 + 2^95 + 1)G
 *
 * The reason for this is so that we can clock bits into four different
 * locations when doing simple scalar multiplies against the base point.
 *
 * Tables for other points have table[i] = i*P for i in 0 .. 16, where P is
 * the point the table was built for.
 */
1247
/*
 * gmul is the table of precomputed base points.
 *
 * Layout: gmul[index][coordinate][limb], with coordinate 0/1/2 = x/y/z and
 * seven limbs per field element. Entry 0 is all zeros (z = 0, the point at
 * infinity); every other entry has z = 1. The per-entry labels below repeat
 * the index -> multiple mapping from the base-point table description in
 * this file; the constants themselves are not re-derived here.
 */
static const felem gmul[16][3] = {
    /* 0: 0G */
    {{0, 0, 0, 0, 0, 0, 0},
     {0, 0, 0, 0, 0, 0, 0},
     {0, 0, 0, 0, 0, 0, 0}},
    /* 1: 1G */
    {{0x00545e3872760ab7, 0x00f25dbf55296c3a, 0x00e082542a385502, 0x008ba79b9859f741,
      0x0020ad746e1d3b62, 0x0005378eb1c71ef3, 0x0000aa87ca22be8b},
     {0x00431d7c90ea0e5f, 0x00b1ce1d7e819d7a, 0x0013b5f0b8c00a60, 0x00289a147ce9da31,
      0x0092dc29f8f41dbd, 0x002c6f5d9e98bf92, 0x00003617de4a9626},
     {1, 0, 0, 0, 0, 0, 0}},
    /* 2: 2^95G */
    {{0x00024711cc902a90, 0x00acb2e579ab4fe1, 0x00af818a4b4d57b1, 0x00a17c7bec49c3de,
      0x004280482d726a8b, 0x00128dd0f0a90f3b, 0x00004387c1c3fa3c},
     {0x002ce76543cf5c3a, 0x00de6cee5ef58f0a, 0x00403e42fa561ca6, 0x00bc54d6f9cb9731,
      0x007155f925fb4ff1, 0x004a9ce731b7b9bc, 0x00002609076bd7b2},
     {1, 0, 0, 0, 0, 0, 0}},
    /* 3: (2^95 + 1)G */
    {{0x00e74c9182f0251d, 0x0039bf54bb111974, 0x00b9d2f2eec511d2, 0x0036b1594eb3a6a4,
      0x00ac3bb82d9d564b, 0x00f9313f4615a100, 0x00006716a9a91b10},
     {0x0046698116e2f15c, 0x00f34347067d3d33, 0x008de4ccfdebd002, 0x00e838c6b8e8c97b,
      0x006faf0798def346, 0x007349794a57563c, 0x00002629e7e6ad84},
     {1, 0, 0, 0, 0, 0, 0}},
    /* 4: 2^190G */
    {{0x0075300e34fd163b, 0x0092e9db4e8d0ad3, 0x00254be9f625f760, 0x00512c518c72ae68,
      0x009bfcf162bede5a, 0x00bf9341566ce311, 0x0000cd6175bd41cf},
     {0x007dfe52af4ac70f, 0x0002159d2d5c4880, 0x00b504d16f0af8d0, 0x0014585e11f5e64c,
      0x0089c6388e030967, 0x00ffb270cbfa5f71, 0x00009a15d92c3947},
     {1, 0, 0, 0, 0, 0, 0}},
    /* 5: (2^190 + 1)G */
    {{0x0033fc1278dc4fe5, 0x00d53088c2caa043, 0x0085558827e2db66, 0x00c192bef387b736,
      0x00df6405a2225f2c, 0x0075205aa90fd91a, 0x0000137e3f12349d},
     {0x00ce5b115efcb07e, 0x00abc3308410deeb, 0x005dc6fc1de39904, 0x00907c1c496f36b4,
      0x0008e6ad3926cbe1, 0x00110747b787928c, 0x0000021b9162eb7e},
     {1, 0, 0, 0, 0, 0, 0}},
    /* 6: (2^190 + 2^95)G */
    {{0x008180042cfa26e1, 0x007b826a96254967, 0x0082473694d6b194, 0x007bd6880a45b589,
      0x00c0a5097072d1a3, 0x0019186555e18b4e, 0x000020278190e5ca},
     {0x00b4bef17de61ac0, 0x009535e3c38ed348, 0x002d4aa8e468ceab, 0x00ef40b431036ad3,
      0x00defd52f4542857, 0x0086edbf98234266, 0x00002025b3a7814d},
     {1, 0, 0, 0, 0, 0, 0}},
    /* 7: (2^190 + 2^95 + 1)G */
    {{0x00b238aa97b886be, 0x00ef3192d6dd3a32, 0x0079f9e01fd62df8, 0x00742e890daba6c5,
      0x008e5289144408ce, 0x0073bbcc8e0171a5, 0x0000c4fd329d3b52},
     {0x00c6f64a15ee23e7, 0x00dcfb7b171cad8b, 0x00039f6cbd805867, 0x00de024e428d4562,
      0x00be6a594d7c64c5, 0x0078467b70dbcd64, 0x0000251f2ed7079b},
     {1, 0, 0, 0, 0, 0, 0}},
    /* 8: 2^285G */
    {{0x000e5cc25fc4b872, 0x005ebf10d31ef4e1, 0x0061e0ebd11e8256, 0x0076e026096f5a27,
      0x0013e6fc44662e9a, 0x0042b00289d3597e, 0x000024f089170d88},
     {0x001604d7e0effbe6, 0x0048d77cba64ec2c, 0x008166b16da19e36, 0x006b0d1a0f28c088,
      0x000259fcd47754fd, 0x00cc643e4d725f9a, 0x00007b10f3c79c14},
     {1, 0, 0, 0, 0, 0, 0}},
    /* 9: (2^285 + 1)G */
    {{0x00430155e3b908af, 0x00b801e4fec25226, 0x00b0d4bcfe806d26, 0x009fc4014eb13d37,
      0x0066c94e44ec07e8, 0x00d16adc03874ba2, 0x000030c917a0d2a7},
     {0x00edac9e21eb891c, 0x00ef0fb768102eff, 0x00c088cef272a5f3, 0x00cbf782134e2964,
      0x0001044a7ba9a0e3, 0x00e363f5b194cf3c, 0x00009ce85249e372},
     {1, 0, 0, 0, 0, 0, 0}},
    /* 10: (2^285 + 2^95)G */
    {{0x001dd492dda5a7eb, 0x008fd577be539fd1, 0x002ff4b25a5fc3f1, 0x0074a8a1b64df72f,
      0x002ba3d8c204a76c, 0x009d5cff95c8235a, 0x0000e014b9406e0f},
     {0x008c2e4dbfc98aba, 0x00f30bb89f1a1436, 0x00b46f7aea3e259c, 0x009224454ac02f54,
      0x00906401f5645fa2, 0x003a1d1940eabc77, 0x00007c9351d680e6},
     {1, 0, 0, 0, 0, 0, 0}},
    /* 11: (2^285 + 2^95 + 1)G */
    {{0x005a35d872ef967c, 0x0049f1b7884e1987, 0x0059d46d7e31f552, 0x00ceb4869d2d0fb6,
      0x00e8e89eee56802a, 0x0049d806a774aaf2, 0x0000147e2af0ae24},
     {0x005fd1bd852c6e5e, 0x00b674b7b3de6885, 0x003b9ea5eb9b6c08, 0x005c9f03babf3ef7,
      0x00605337fecab3c7, 0x009a3f85b11bbcc8, 0x0000455470f330ec},
     {1, 0, 0, 0, 0, 0, 0}},
    /* 12: (2^285 + 2^190)G */
    {{0x002197ff4d55498d, 0x00383e8916c2d8af, 0x00eb203f34d1c6d2, 0x0080367cbd11b542,
      0x00769b3be864e4f5, 0x0081a8458521c7bb, 0x0000c531b34d3539},
     {0x00e2a3d775fa2e13, 0x00534fc379573844, 0x00ff237d2a8db54a, 0x00d301b2335a8882,
      0x000f75ea96103a80, 0x0018fecb3cdd96fa, 0x0000304bf61e94eb},
     {1, 0, 0, 0, 0, 0, 0}},
    /* 13: (2^285 + 2^190 + 1)G */
    {{0x00b2afc332a73dbd, 0x0029a0d5bb007bc5, 0x002d628eb210f577, 0x009f59a36dd05f50,
      0x006d339de4eca613, 0x00c75a71addc86bc, 0x000060384c5ea93c},
     {0x00aa9641c32a30b4, 0x00cc73ae8cce565d, 0x00ec911a4df07f61, 0x00aa4b762ea4b264,
      0x0096d395bb393629, 0x004efacfb7632fe0, 0x00006f252f46fa3f},
     {1, 0, 0, 0, 0, 0, 0}},
    /* 14: (2^285 + 2^190 + 2^95)G */
    {{0x00567eec597c7af6, 0x0059ba6795204413, 0x00816d4e6f01196f, 0x004ae6b3eb57951d,
      0x00420f5abdda2108, 0x003401d1f57ca9d9, 0x0000cf5837b0b67a},
     {0x00eaa64b8aeeabf9, 0x00246ddf16bcb4de, 0x000e7e3c3aecd751, 0x0008449f04fed72e,
      0x00307b67ccf09183, 0x0017108c3556b7b1, 0x0000229b2483b3bf},
     {1, 0, 0, 0, 0, 0, 0}},
    /* 15: (2^285 + 2^190 + 2^95 + 1)G */
    {{0x00e7c491a7bb78a1, 0x00eafddd1d3049ab, 0x00352c05e2bc7c98, 0x003d6880c165fa5c,
      0x00b6ac61cc11c97d, 0x00beeb54fcf90ce5, 0x0000dc1f0b455edc},
     {0x002db2e7aee34d60, 0x0073b5f415a2d8c0, 0x00dd84e4193e9a0c, 0x00d02d873467c572,
      0x0018baaeda60aee5, 0x0013fb11d697c61e, 0x000083aafcc3a973},
     {1, 0, 0, 0, 0, 0, 0}}
};
1329
1330	/*
1331	* select_point selects the \|idx\|th point from a precomputation table and
1332	* copies it to out.
1333	*
1334	* pre_comp below is of the size provided in \|size\|.
1335	*/
1336	static void select_point(const limb idx, unsigned int size,
1337	const felem pre_comp[][3], felem out[3])
1338	{
1339	unsigned int i, j;
1340	limb *outlimbs = &out[0][0];
1341
1342	memset(out, 0, sizeof(out) 3);
1343
1344	for (i = 0; i < size; i++) {
1345	const limb *inlimbs = &pre_comp[i][0][0];
1346	limb mask = i ^ idx;
1347
1348	mask \|= mask >> 4;
1349	mask \|= mask >> 2;
1350	mask \|= mask >> 1;
1351	mask &= 1;
1352	mask--;
1353	for (j = 0; j < NLIMBS * 3; j++)
1354	outlimbs[j] \|= inlimbs[j] & mask;
1355	}
1356	}
1357
1358	/* get_bit returns the \|i\|th bit in \|in\| */
1359	static char get_bit(const felem_bytearray in, int i)
1360	{
1361	if (i < 0 \|\| i >= 384)
1362	return 0;
1363	return (in[i >> 3] >> (i & 7)) & 1;
1364	}
1365
1366	/*
1367	* Interleaved point multiplication using precomputed point multiples: The
1368	* small point multiples 0P, 1P, ..., 16*P are in pre_comp[], the scalars
1369	* in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1370	* generator, using certain (large) precomputed multiples in g_pre_comp.
1371	* Output point (X, Y, Z) is stored in x_out, y_out, z_out
1372	*/
1373	static void batch_mul(felem x_out, felem y_out, felem z_out,
1374	const felem_bytearray scalars[],
1375	const unsigned int num_points, const u8 *g_scalar,
1376	const int mixed, const felem pre_comp[][17][3],
1377	const felem g_pre_comp[16][3])
1378	{
1379	int i, skip;
1380	unsigned int num, gen_mul = (g_scalar != NULL);
1381	felem nq[3], tmp[4];
1382	limb bits;
1383	u8 sign, digit;
1384
1385	/* set nq to the point at infinity */
1386	memset(nq, 0, sizeof(nq));
1387
1388	/*
1389	* Loop over all scalars msb-to-lsb, interleaving additions of multiples
1390	* of the generator (last quarter of rounds) and additions of other
1391	* points multiples (every 5th round).
1392	*/
1393	skip = 1; /* save two point operations in the first
1394	* round */
1395	for (i = (num_points ? 380 : 98); i >= 0; --i) {
1396	/* double */
1397	if (!skip)
1398	point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1399
1400	/* add multiples of the generator */
1401	if (gen_mul && (i <= 98)) {
1402	bits = get_bit(g_scalar, i + 285) << 3;
1403	if (i < 95) {
1404	bits \|= get_bit(g_scalar, i + 190) << 2;
1405	bits \|= get_bit(g_scalar, i + 95) << 1;
1406	bits \|= get_bit(g_scalar, i);
1407	}
1408	/* select the point to add, in constant time */
1409	select_point(bits, 16, g_pre_comp, tmp);
1410	if (!skip) {
1411	/* The 1 argument below is for "mixed" */
1412	point_add(nq[0], nq[1], nq[2],
1413	nq[0], nq[1], nq[2], 1,
1414	tmp[0], tmp[1], tmp[2]);
1415	} else {
1416	memcpy(nq, tmp, 3 * sizeof(felem));
1417	skip = 0;
1418	}
1419	}
1420
1421	/* do other additions every 5 doublings */
1422	if (num_points && (i % 5 == 0)) {
1423	/* loop over all scalars */
1424	for (num = 0; num < num_points; ++num) {
1425	bits = get_bit(scalars[num], i + 4) << 5;
1426	bits \|= get_bit(scalars[num], i + 3) << 4;
1427	bits \|= get_bit(scalars[num], i + 2) << 3;
1428	bits \|= get_bit(scalars[num], i + 1) << 2;
1429	bits \|= get_bit(scalars[num], i) << 1;
1430	bits \|= get_bit(scalars[num], i - 1);
1431	ossl_ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1432
1433	/*
1434	* select the point to add or subtract, in constant time
1435	*/
1436	select_point(digit, 17, pre_comp[num], tmp);
1437	felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1438	* point */
1439	copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1440
1441	if (!skip) {
1442	point_add(nq[0], nq[1], nq[2],
1443	nq[0], nq[1], nq[2], mixed,
1444	tmp[0], tmp[1], tmp[2]);
1445	} else {
1446	memcpy(nq, tmp, 3 * sizeof(felem));
1447	skip = 0;
1448	}
1449	}
1450	}
1451	}
1452	felem_assign(x_out, nq[0]);
1453	felem_assign(y_out, nq[1]);
1454	felem_assign(z_out, nq[2]);
1455	}
1456
/* Precomputation for the group generator. */
struct nistp384_pre_comp_st {
    /* 16 precomputed points, three field elements each (same shape as gmul) */
    felem g_pre_comp[16][3];
    /* reference counter, driven through the CRYPTO_*_REF helpers */
    CRYPTO_REF_COUNT references;
};
1462
/*
 * Returns a pointer to the EC_METHOD table for this P-384 implementation.
 *
 * The table is a function-local static const object, so every call returns
 * the same address. The initialisers are positional; slots left as 0 are
 * annotated with the name of the member that is not provided.
 */
const EC_METHOD *ossl_ec_GFp_nistp384_method(void)
{
    static const EC_METHOD ret = {
        EC_FLAGS_DEFAULT_OCT,
        NID_X9_62_prime_field,
        ossl_ec_GFp_nistp384_group_init,
        ossl_ec_GFp_simple_group_finish,
        ossl_ec_GFp_simple_group_clear_finish,
        ossl_ec_GFp_nist_group_copy,
        ossl_ec_GFp_nistp384_group_set_curve,
        ossl_ec_GFp_simple_group_get_curve,
        ossl_ec_GFp_simple_group_get_degree,
        ossl_ec_group_simple_order_bits,
        ossl_ec_GFp_simple_group_check_discriminant,
        ossl_ec_GFp_simple_point_init,
        ossl_ec_GFp_simple_point_finish,
        ossl_ec_GFp_simple_point_clear_finish,
        ossl_ec_GFp_simple_point_copy,
        ossl_ec_GFp_simple_point_set_to_infinity,
        ossl_ec_GFp_simple_point_set_affine_coordinates,
        ossl_ec_GFp_nistp384_point_get_affine_coordinates,
        0, /* point_set_compressed_coordinates */
        0, /* point2oct */
        0, /* oct2point */
        ossl_ec_GFp_simple_add,
        ossl_ec_GFp_simple_dbl,
        ossl_ec_GFp_simple_invert,
        ossl_ec_GFp_simple_is_at_infinity,
        ossl_ec_GFp_simple_is_on_curve,
        ossl_ec_GFp_simple_cmp,
        ossl_ec_GFp_simple_make_affine,
        ossl_ec_GFp_simple_points_make_affine,
        ossl_ec_GFp_nistp384_points_mul,
        ossl_ec_GFp_nistp384_precompute_mult,
        ossl_ec_GFp_nistp384_have_precompute_mult,
        ossl_ec_GFp_nist_field_mul,
        ossl_ec_GFp_nist_field_sqr,
        0, /* field_div */
        ossl_ec_GFp_simple_field_inv,
        0, /* field_encode */
        0, /* field_decode */
        0, /* field_set_to_one */
        ossl_ec_key_simple_priv2oct,
        ossl_ec_key_simple_oct2priv,
        0, /* set private */
        ossl_ec_key_simple_generate_key,
        ossl_ec_key_simple_check_key,
        ossl_ec_key_simple_generate_public_key,
        0, /* keycopy */
        0, /* keyfinish */
        ossl_ecdh_simple_compute_key,
        ossl_ecdsa_simple_sign_setup,
        ossl_ecdsa_simple_sign_sig,
        ossl_ecdsa_simple_verify_sig,
        0, /* field_inverse_mod_ord */
        0, /* blind_coordinates */
        0, /* ladder_pre */
        0, /* ladder_step */
        0 /* ladder_post */
    };

    return &ret;
}
1526
1527	/******************************************************************************/
1528	/*
1529	* FUNCTIONS TO MANAGE PRECOMPUTATION
1530	*/
1531
1532	static NISTP384_PRE_COMP *nistp384_pre_comp_new(void)
1533	{
1534	NISTP384_PRE_COMP ret = OPENSSL_zalloc(sizeof(ret));
1535
1536	if (ret == NULL)
1537	return ret;
1538
1539	if (!CRYPTO_NEW_REF(&ret->references, 1)) {
1540	OPENSSL_free(ret);
1541	return NULL;
1542	}
1543	return ret;
1544	}
1545
1546	NISTP384_PRE_COMP ossl_ec_nistp384_pre_comp_dup(NISTP384_PRE_COMP p)
1547	{
1548	int i;
1549
1550	if (p != NULL)
1551	CRYPTO_UP_REF(&p->references, &i);
1552	return p;
1553	}
1554
1555	void ossl_ec_nistp384_pre_comp_free(NISTP384_PRE_COMP *p)
1556	{
1557	int i;
1558
1559	if (p == NULL)
1560	return;
1561
1562	CRYPTO_DOWN_REF(&p->references, &i);
1563	REF_PRINT_COUNT("ossl_ec_nistp384", p);
1564	if (i > 0)
1565	return;
1566	REF_ASSERT_ISNT(i < 0);
1567
1568	CRYPTO_FREE_REF(&p->references);
1569	OPENSSL_free(p);
1570	}
1571
1572	/******************************************************************************/
1573	/*
1574	* OPENSSL EC_METHOD FUNCTIONS
1575	*/
1576
1577	int ossl_ec_GFp_nistp384_group_init(EC_GROUP *group)
1578	{
1579	int ret;
1580
1581	ret = ossl_ec_GFp_simple_group_init(group);
1582	group->a_is_minus3 = 1;
1583	return ret;
1584	}
1585
1586	int ossl_ec_GFp_nistp384_group_set_curve(EC_GROUP group, const BIGNUM p,
1587	const BIGNUM a, const BIGNUM b,
1588	BN_CTX *ctx)
1589	{
1590	int ret = 0;
1591	BIGNUM curve_p, curve_a, *curve_b;
1592	#ifndef FIPS_MODULE
1593	BN_CTX *new_ctx = NULL;
1594
1595	if (ctx == NULL)
1596	ctx = new_ctx = BN_CTX_new();
1597	#endif
1598	if (ctx == NULL)
1599	return 0;
1600
1601	BN_CTX_start(ctx);
1602	curve_p = BN_CTX_get(ctx);
1603	curve_a = BN_CTX_get(ctx);
1604	curve_b = BN_CTX_get(ctx);
1605	if (curve_b == NULL)
1606	goto err;
1607	BN_bin2bn(nistp384_curve_params[0], sizeof(felem_bytearray), curve_p);
1608	BN_bin2bn(nistp384_curve_params[1], sizeof(felem_bytearray), curve_a);
1609	BN_bin2bn(nistp384_curve_params[2], sizeof(felem_bytearray), curve_b);
1610	if ((BN_cmp(curve_p, p)) \|\| (BN_cmp(curve_a, a)) \|\| (BN_cmp(curve_b, b))) {
1611	ERR_raise(ERR_LIB_EC, EC_R_WRONG_CURVE_PARAMETERS);
1612	goto err;
1613	}
1614	group->field_mod_func = BN_nist_mod_384;
1615	ret = ossl_ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1616	err:
1617	BN_CTX_end(ctx);
1618	#ifndef FIPS_MODULE
1619	BN_CTX_free(new_ctx);
1620	#endif
1621	return ret;
1622	}
1623
1624	/*
1625	* Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1626	* (X/Z^2, Y/Z^3)
1627	*/
1628	int ossl_ec_GFp_nistp384_point_get_affine_coordinates(const EC_GROUP *group,
1629	const EC_POINT *point,
1630	BIGNUM x, BIGNUM y,
1631	BN_CTX *ctx)
1632	{
1633	felem z1, z2, x_in, y_in, x_out, y_out;
1634	widefelem tmp;
1635
1636	if (EC_POINT_is_at_infinity(group, point)) {
1637	ERR_raise(ERR_LIB_EC, EC_R_POINT_AT_INFINITY);
1638	return 0;
1639	}
1640	if ((!BN_to_felem(x_in, point->X)) \|\| (!BN_to_felem(y_in, point->Y)) \|\|
1641	(!BN_to_felem(z1, point->Z)))
1642	return 0;
1643	felem_inv(z2, z1);
1644	felem_square(tmp, z2);
1645	felem_reduce(z1, tmp);
1646	felem_mul(tmp, x_in, z1);
1647	felem_reduce(x_in, tmp);
1648	felem_contract(x_out, x_in);
1649	if (x != NULL) {
1650	if (!felem_to_BN(x, x_out)) {
1651	ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1652	return 0;
1653	}
1654	}
1655	felem_mul(tmp, z1, z2);
1656	felem_reduce(z1, tmp);
1657	felem_mul(tmp, y_in, z1);
1658	felem_reduce(y_in, tmp);
1659	felem_contract(y_out, y_in);
1660	if (y != NULL) {
1661	if (!felem_to_BN(y, y_out)) {
1662	ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1663	return 0;
1664	}
1665	}
1666	return 1;
1667	}
1668
1669	/* points below is of size \|num\|, and tmp_felems is of size \|num+1/ */
1670	static void make_points_affine(size_t num, felem points[][3],
1671	felem tmp_felems[])
1672	{
1673	/*
1674	* Runs in constant time, unless an input is the point at infinity (which
1675	* normally shouldn't happen).
1676	*/
1677	ossl_ec_GFp_nistp_points_make_affine_internal(num,
1678	points,
1679	sizeof(felem),
1680	tmp_felems,
1681	(void ()(void ))felem_one,
1682	felem_is_zero_int,
1683	(void ()(void , const void *))
1684	felem_assign,
1685	(void ()(void , const void *))
1686	felem_square_reduce,
1687	(void ()(void , const void , const void))
1688	felem_mul_reduce,
1689	(void ()(void , const void *))
1690	felem_inv,
1691	(void ()(void , const void *))
1692	felem_contract);
1693	}
1694
1695	/*
1696	* Computes scalargenerator + \sum scalars[i]points[i], ignoring NULL
1697	* values Result is stored in r (r can equal one of the inputs).
1698	*/
1699	int ossl_ec_GFp_nistp384_points_mul(const EC_GROUP group, EC_POINT r,
1700	const BIGNUM *scalar, size_t num,
1701	const EC_POINT *points[],
1702	const BIGNUM scalars[], BN_CTX ctx)
1703	{
1704	int ret = 0;
1705	int j;
1706	int mixed = 0;
1707	BIGNUM x, y, z, tmp_scalar;
1708	felem_bytearray g_secret;
1709	felem_bytearray *secrets = NULL;
1710	felem (*pre_comp)[17][3] = NULL;
1711	felem *tmp_felems = NULL;
1712	unsigned int i;
1713	int num_bytes;
1714	int have_pre_comp = 0;
1715	size_t num_points = num;
1716	felem x_in, y_in, z_in, x_out, y_out, z_out;
1717	NISTP384_PRE_COMP *pre = NULL;
1718	felem(*g_pre_comp)[3] = NULL;
1719	EC_POINT *generator = NULL;
1720	const EC_POINT *p = NULL;
1721	const BIGNUM *p_scalar = NULL;
1722
1723	BN_CTX_start(ctx);
1724	x = BN_CTX_get(ctx);
1725	y = BN_CTX_get(ctx);
1726	z = BN_CTX_get(ctx);
1727	tmp_scalar = BN_CTX_get(ctx);
1728	if (tmp_scalar == NULL)
1729	goto err;
1730
1731	if (scalar != NULL) {
1732	pre = group->pre_comp.nistp384;
1733	if (pre)
1734	/* we have precomputation, try to use it */
1735	g_pre_comp = &pre->g_pre_comp[0];
1736	else
1737	/* try to use the standard precomputation */
1738	g_pre_comp = (felem(*)[3]) gmul;
1739	generator = EC_POINT_new(group);
1740	if (generator == NULL)
1741	goto err;
1742	/* get the generator from precomputation */
1743	if (!felem_to_BN(x, g_pre_comp[1][0]) \|\|
1744	!felem_to_BN(y, g_pre_comp[1][1]) \|\|
1745	!felem_to_BN(z, g_pre_comp[1][2])) {
1746	ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1747	goto err;
1748	}
1749	if (!ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group,
1750	generator,
1751	x, y, z, ctx))
1752	goto err;
1753	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1754	/* precomputation matches generator */
1755	have_pre_comp = 1;
1756	else
1757	/*
1758	* we don't have valid precomputation: treat the generator as a
1759	* random point
1760	*/
1761	num_points++;
1762	}
1763
1764	if (num_points > 0) {
1765	if (num_points >= 2) {
1766	/*
1767	* unless we precompute multiples for just one point, converting
1768	* those into affine form is time well spent
1769	*/
1770	mixed = 1;
1771	}
1772	secrets = OPENSSL_zalloc(sizeof(secrets) num_points);
1773	pre_comp = OPENSSL_zalloc(sizeof(pre_comp) num_points);
1774	if (mixed)
1775	tmp_felems =
1776	OPENSSL_malloc(sizeof(tmp_felems) (num_points * 17 + 1));
1777	if ((secrets == NULL) \|\| (pre_comp == NULL)
1778	\|\| (mixed && (tmp_felems == NULL)))
1779	goto err;
1780
1781	/*
1782	* we treat NULL scalars as 0, and NULL points as points at infinity,
1783	* i.e., they contribute nothing to the linear combination
1784	*/
1785	for (i = 0; i < num_points; ++i) {
1786	if (i == num) {
1787	/*
1788	* we didn't have a valid precomputation, so we pick the
1789	* generator
1790	*/
1791	p = EC_GROUP_get0_generator(group);
1792	p_scalar = scalar;
1793	} else {
1794	/* the i^th point */
1795	p = points[i];
1796	p_scalar = scalars[i];
1797	}
1798	if (p_scalar != NULL && p != NULL) {
1799	/* reduce scalar to 0 <= scalar < 2^384 */
1800	if ((BN_num_bits(p_scalar) > 384)
1801	\|\| (BN_is_negative(p_scalar))) {
1802	/*
1803	* this is an unusual input, and we don't guarantee
1804	* constant-timeness
1805	*/
1806	if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1807	ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1808	goto err;
1809	}
1810	num_bytes = BN_bn2lebinpad(tmp_scalar,
1811	secrets[i], sizeof(secrets[i]));
1812	} else {
1813	num_bytes = BN_bn2lebinpad(p_scalar,
1814	secrets[i], sizeof(secrets[i]));
1815	}
1816	if (num_bytes < 0) {
1817	ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1818	goto err;
1819	}
1820	/* precompute multiples */
1821	if ((!BN_to_felem(x_out, p->X)) \|\|
1822	(!BN_to_felem(y_out, p->Y)) \|\|
1823	(!BN_to_felem(z_out, p->Z)))
1824	goto err;
1825	memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1826	memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1827	memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1828	for (j = 2; j <= 16; ++j) {
1829	if (j & 1) {
1830	point_add(pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1831	pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2], 0,
1832	pre_comp[i][j - 1][0], pre_comp[i][j - 1][1], pre_comp[i][j - 1][2]);
1833	} else {
1834	point_double(pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1835	pre_comp[i][j / 2][0], pre_comp[i][j / 2][1], pre_comp[i][j / 2][2]);
1836	}
1837	}
1838	}
1839	}
1840	if (mixed)
1841	make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1842	}
1843
1844	/* the scalar for the generator */
1845	if (scalar != NULL && have_pre_comp) {
1846	memset(g_secret, 0, sizeof(g_secret));
1847	/* reduce scalar to 0 <= scalar < 2^384 */
1848	if ((BN_num_bits(scalar) > 384) \|\| (BN_is_negative(scalar))) {
1849	/*
1850	* this is an unusual input, and we don't guarantee
1851	* constant-timeness
1852	*/
1853	if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1854	ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1855	goto err;
1856	}
1857	num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
1858	} else {
1859	num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
1860	}
1861	/* do the multiplication with generator precomputation */
1862	batch_mul(x_out, y_out, z_out,
1863	(const felem_bytearray(*))secrets, num_points,
1864	g_secret,
1865	mixed, (const felem(*)[17][3])pre_comp,
1866	(const felem(*)[3])g_pre_comp);
1867	} else {
1868	/* do the multiplication without generator precomputation */
1869	batch_mul(x_out, y_out, z_out,
1870	(const felem_bytearray(*))secrets, num_points,
1871	NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1872	}
1873	/* reduce the output to its unique minimal representation */
1874	felem_contract(x_in, x_out);
1875	felem_contract(y_in, y_out);
1876	felem_contract(z_in, z_out);
1877	if ((!felem_to_BN(x, x_in)) \|\| (!felem_to_BN(y, y_in)) \|\|
1878	(!felem_to_BN(z, z_in))) {
1879	ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1880	goto err;
1881	}
1882	ret = ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group, r, x, y, z,
1883	ctx);
1884
1885	err:
1886	BN_CTX_end(ctx);
1887	EC_POINT_free(generator);
1888	OPENSSL_free(secrets);
1889	OPENSSL_free(pre_comp);
1890	OPENSSL_free(tmp_felems);
1891	return ret;
1892	}
1893
1894	int ossl_ec_GFp_nistp384_precompute_mult(EC_GROUP group, BN_CTX ctx)
1895	{
1896	int ret = 0;
1897	NISTP384_PRE_COMP *pre = NULL;
1898	int i, j;
1899	BIGNUM x, y;
1900	EC_POINT *generator = NULL;
1901	felem tmp_felems[16];
1902	#ifndef FIPS_MODULE
1903	BN_CTX *new_ctx = NULL;
1904	#endif
1905
1906	/* throw away old precomputation */
1907	EC_pre_comp_free(group);
1908
1909	#ifndef FIPS_MODULE
1910	if (ctx == NULL)
1911	ctx = new_ctx = BN_CTX_new();
1912	#endif
1913	if (ctx == NULL)
1914	return 0;
1915
1916	BN_CTX_start(ctx);
1917	x = BN_CTX_get(ctx);
1918	y = BN_CTX_get(ctx);
1919	if (y == NULL)
1920	goto err;
1921	/* get the generator */
1922	if (group->generator == NULL)
1923	goto err;
1924	generator = EC_POINT_new(group);
1925	if (generator == NULL)
1926	goto err;
1927	BN_bin2bn(nistp384_curve_params[3], sizeof(felem_bytearray), x);
1928	BN_bin2bn(nistp384_curve_params[4], sizeof(felem_bytearray), y);
1929	if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
1930	goto err;
1931	if ((pre = nistp384_pre_comp_new()) == NULL)
1932	goto err;
1933	/*
1934	* if the generator is the standard one, use built-in precomputation
1935	*/
1936	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
1937	memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1938	goto done;
1939	}
1940	if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) \|\|
1941	(!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) \|\|
1942	(!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
1943	goto err;
1944	/* compute 2^95G, 2^190G, 2^285G /
1945	for (i = 1; i <= 4; i <<= 1) {
1946	point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
1947	pre->g_pre_comp[i][0], pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
1948	for (j = 0; j < 94; ++j) {
1949	point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
1950	pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2]);
1951	}
1952	}
1953	/* g_pre_comp[0] is the point at infinity */
1954	memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
1955	/* the remaining multiples */
1956	/* 2^95G + 2^190G */
1957	point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1], pre->g_pre_comp[6][2],
1958	pre->g_pre_comp[4][0], pre->g_pre_comp[4][1], pre->g_pre_comp[4][2], 0,
1959	pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]);
1960	/* 2^95G + 2^285G */
1961	point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1], pre->g_pre_comp[10][2],
1962	pre->g_pre_comp[8][0], pre->g_pre_comp[8][1], pre->g_pre_comp[8][2], 0,
1963	pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]);
1964	/* 2^190G + 2^285G */
1965	point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
1966	pre->g_pre_comp[8][0], pre->g_pre_comp[8][1], pre->g_pre_comp[8][2], 0,
1967	pre->g_pre_comp[4][0], pre->g_pre_comp[4][1], pre->g_pre_comp[4][2]);
1968	/* 2^95G + 2^190G + 2^285G /
1969	point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1], pre->g_pre_comp[14][2],
1970	pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2], 0,
1971	pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]);
1972	for (i = 1; i < 8; ++i) {
1973	/* odd multiples: add G */
1974	point_add(pre->g_pre_comp[2 * i + 1][0], pre->g_pre_comp[2 * i + 1][1], pre->g_pre_comp[2 * i + 1][2],
1975	pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
1976	pre->g_pre_comp[1][0], pre->g_pre_comp[1][1], pre->g_pre_comp[1][2]);
1977	}
1978	make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
1979
1980	done:
1981	SETPRECOMP(group, nistp384, pre);
1982	ret = 1;
1983	pre = NULL;
1984	err:
1985	BN_CTX_end(ctx);
1986	EC_POINT_free(generator);
1987	#ifndef FIPS_MODULE
1988	BN_CTX_free(new_ctx);
1989	#endif
1990	ossl_ec_nistp384_pre_comp_free(pre);
1991	return ret;
1992	}
1993
1994	int ossl_ec_GFp_nistp384_have_precompute_mult(const EC_GROUP *group)
1995	{
1996	return HAVEPRECOMP(group, nistp384);
1997	}

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/openssl-3.3.2/crypto/ec/ecp_nistp384.c

Download in other formats: