#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Christoph Müllner <[email protected]>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# The generated code of this file depends on the following RISC-V extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
# - RISC-V Vector Carryless Multiplication extension ('Zvbc')

use strict;
use warnings;

use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
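# Example invocation (the flavour argument is optional and the file names
# below are only illustrative):
#   perl ghash-riscv64-zvkb-zvbc.pl ghash-riscv64-zvkb-zvbc.S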

$output and open STDOUT,">$output";

my $code=<<___;
.text
___

################################################################################
# void gcm_init_rv64i_zvkb_zvbc(u128 Htable[16], const u64 H[2]);
#
# input: H: 128-bit H - secret parameter E(K, 0^128)
# output: Htable: Preprocessed key data for gcm_gmult_rv64i_zvkb_zvbc and
#                 gcm_ghash_rv64i_zvkb_zvbc
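#
# Note (summary of the code below): the preprocessing shifts H left by one
# bit and conditionally XORs in the Lpolymod constant when a bit falls out
# of the 128-bit range, i.e. it stores H multiplied by x reduced by the
# reduction polynomial. Only the first Htable entry is written by this
# routine.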
{
my ($Htable,$H,$TMP0,$TMP1,$TMP2) = ("a0","a1","t0","t1","t2");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");

$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zvkb_zvbc
.type gcm_init_rv64i_zvkb_zvbc,\@function
gcm_init_rv64i_zvkb_zvbc:
    # Load/store data in reverse order.
    # This is needed as part of the endianness swap.
    add $H, $H, 8
    li $TMP0, -8
    li $TMP1, 63
    la $TMP2, Lpolymod

    @{[vsetivli__x0_2_e64_m1_tu_mu]}    # vsetivli x0, 2, e64, m1, tu, mu

    @{[vlse64_v $V1, $H, $TMP0]}        # vlse64.v v1, (a1), t0
    @{[vle64_v $V2, $TMP2]}             # vle64.v v2, (t2)

    # Shift one left and get the carry bits.
    @{[vsrl_vx $V3, $V1, $TMP1]}        # vsrl.vx v3, v1, t1
    @{[vsll_vi $V1, $V1, 1]}            # vsll.vi v1, v1, 1

    # Use the fact that the polynomial degree is no more than 128,
    # i.e. only the LSB of the upper half can be set.
    # Thanks to this we don't need to do a full reduction here.
    # Instead we simply subtract (XOR) the reduction polynomial.
    # This idea was taken from the x86 GHASH implementation in OpenSSL.
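    # Concretely, with H = h1:h0 (two 64-bit halves): the bit shifted out
    # of h0 is slid up into bit 0 of the shifted h1, and the bit shifted
    # out of h1 (the carry out of the 128-bit value) selects whether the
    # 128-bit Lpolymod constant gets XORed in.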
    @{[vslideup_vi $V4, $V3, 1]}        # vslideup.vi v4, v3, 1
    @{[vslidedown_vi $V3, $V3, 1]}      # vslidedown.vi v3, v3, 1

    @{[vmv_v_i $V0, 2]}                 # vmv.v.i v0, 2
    @{[vor_vv_v0t $V1, $V1, $V4]}       # vor.vv v1, v1, v4, v0.t

    # The mask needs to be set to 3 if the carry bit is set.
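    # v0 receives the carry bit and vmerge expands it into the value 3
    # (0b11), so that, used as a mask, it selects both 64-bit elements and
    # the full 128-bit polynomial is XORed in only when the carry is set.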
    @{[vmv_v_v $V0, $V3]}               # vmv.v.v v0, v3
    @{[vmv_v_i $V3, 0]}                 # vmv.v.i v3, 0
    @{[vmerge_vim $V3, $V3, 3]}         # vmerge.vim v3, v3, 3, v0
    @{[vmv_v_v $V0, $V3]}               # vmv.v.v v0, v3

    @{[vxor_vv_v0t $V1, $V1, $V2]}      # vxor.vv v1, v1, v2, v0.t

    @{[vse64_v $V1, $Htable]}           # vse64.v v1, (a0)
    ret
.size gcm_init_rv64i_zvkb_zvbc,.-gcm_init_rv64i_zvkb_zvbc
___
}

################################################################################
# void gcm_gmult_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16]);
#
# input: Xi: current hash value
#        Htable: preprocessed H
# output: Xi: next hash value Xi = (Xi * H mod f)
{
my ($Xi,$Htable,$TMP0,$TMP1,$TMP2,$TMP3,$TMP4) = ("a0","a1","t0","t1","t2","t3","t4");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");

$code .= <<___;
.text
.p2align 3
.globl gcm_gmult_rv64i_zvkb_zvbc
.type gcm_gmult_rv64i_zvkb_zvbc,\@function
gcm_gmult_rv64i_zvkb_zvbc:
    ld $TMP0, ($Htable)
    ld $TMP1, 8($Htable)
    li $TMP2, 63
    la $TMP3, Lpolymod
    ld $TMP3, 8($TMP3)

    # Load/store data in reverse order.
    # This is needed as part of the endianness swap.
    add $Xi, $Xi, 8
    li $TMP4, -8

    @{[vsetivli__x0_2_e64_m1_tu_mu]}    # vsetivli x0, 2, e64, m1, tu, mu

    @{[vlse64_v $V5, $Xi, $TMP4]}       # vlse64.v v5, (a0), t4
    @{[vrev8_v $V5, $V5]}               # vrev8.v v5, v5

    # Multiplication

    # Do two 64x64 multiplications in one go to save some time
    # and simplify things.

    # A = a1a0 (t1, t0)
    # B = b1b0 (v5)
    # C = c1c0 (256 bit)
    # c1 = a1b1 + (a0b1)h + (a1b0)h
    # c0 = a0b0 + (a0b1)l + (a1b0)l
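    # Spelled out, with A = a1*2^64 + a0 and B = b1*2^64 + b0 (all products
    # carry-less, additions are XOR):
    #   A*B = a1b1*2^128 + (a1b0 + a0b1)*2^64 + a0b0
    # vclmul/vclmulh split each 64x64 product into its low ("l") and
    # high ("h") 64-bit halves, which is how c1 and c0 above are assembled.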

    # v1 = (a0b1)l,(a0b0)l
    @{[vclmul_vx $V1, $V5, $TMP0]}      # vclmul.vx v1, v5, t0
    # v3 = (a0b1)h,(a0b0)h
    @{[vclmulh_vx $V3, $V5, $TMP0]}     # vclmulh.vx v3, v5, t0

    # v4 = (a1b1)l,(a1b0)l
    @{[vclmul_vx $V4, $V5, $TMP1]}      # vclmul.vx v4, v5, t1
    # v2 = (a1b1)h,(a1b0)h
    @{[vclmulh_vx $V2, $V5, $TMP1]}     # vclmulh.vx v2, v5, t1

    # Is there a better way to do this?
    # Would need to swap the order of elements within a vector register.
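    # The cross terms straddle the two 64-bit lanes: vslideup moves the
    # halves that belong 64 bits higher into lane 1 (accumulated into v1),
    # while vslidedown moves the halves that belong 128 bits higher into
    # lane 0 (accumulated into v2).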
    @{[vslideup_vi $V5, $V3, 1]}        # vslideup.vi v5, v3, 1
    @{[vslideup_vi $V6, $V4, 1]}        # vslideup.vi v6, v4, 1
    @{[vslidedown_vi $V3, $V3, 1]}      # vslidedown.vi v3, v3, 1
    @{[vslidedown_vi $V4, $V4, 1]}      # vslidedown.vi v4, v4, 1

    @{[vmv_v_i $V0, 1]}                 # vmv.v.i v0, 1
    # v2 += (a0b1)h
    @{[vxor_vv_v0t $V2, $V2, $V3]}      # vxor.vv v2, v2, v3, v0.t
    # v2 += (a1b1)l
    @{[vxor_vv_v0t $V2, $V2, $V4]}      # vxor.vv v2, v2, v4, v0.t

    @{[vmv_v_i $V0, 2]}                 # vmv.v.i v0, 2
    # v1 += (a0b0)h,0
    @{[vxor_vv_v0t $V1, $V1, $V5]}      # vxor.vv v1, v1, v5, v0.t
    # v1 += (a1b0)l,0
    @{[vxor_vv_v0t $V1, $V1, $V6]}      # vxor.vv v1, v1, v6, v0.t

    # Now the 256-bit product is stored in (v2,v1)
    # v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
    # v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l

    # Reduction
    # Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
    # This is a slight variation of Gueron's Montgomery reduction.
    # The difference is that the order of some operations has been changed
    # to make better use of the vclmul(h) instructions.
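    # P below is the 64-bit constant 0xc200000000000000, loaded from the
    # second dword of Lpolymod into t3 above.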

    # First step:
    # c1 += (c0 * P)l
    # (v0 still contains the mask value 2 set above)
    @{[vslideup_vi_v0t $V3, $V1, 1]}    # vslideup.vi v3, v1, 1, v0.t
    @{[vclmul_vx_v0t $V3, $V3, $TMP3]}  # vclmul.vx v3, v3, t3, v0.t
    @{[vxor_vv_v0t $V1, $V1, $V3]}      # vxor.vv v1, v1, v3, v0.t

    # Second step:
    # D = d1,d0 is the final result
    # We want:
    # m1 = c1 + (c1 * P)h
    # m0 = (c1 * P)l + (c0 * P)h + c0
    # d1 = c3 + m1
    # d0 = c2 + m0

    # v3 = (c1 * P)l, 0
    @{[vclmul_vx_v0t $V3, $V1, $TMP3]}  # vclmul.vx v3, v1, t3, v0.t
    # v4 = (c1 * P)h, (c0 * P)h
    @{[vclmulh_vx $V4, $V1, $TMP3]}     # vclmulh.vx v4, v1, t3

    @{[vmv_v_i $V0, 1]}                 # vmv.v.i v0, 1
    @{[vslidedown_vi $V3, $V3, 1]}      # vslidedown.vi v3, v3, 1

    @{[vxor_vv $V1, $V1, $V4]}          # vxor.vv v1, v1, v4
    @{[vxor_vv_v0t $V1, $V1, $V3]}      # vxor.vv v1, v1, v3, v0.t

    # XOR the reduced lower half into the upper part of the product
    @{[vxor_vv $V2, $V2, $V1]}          # vxor.vv v2, v2, v1

    @{[vrev8_v $V2, $V2]}               # vrev8.v v2, v2
    @{[vsse64_v $V2, $Xi, $TMP4]}       # vsse64.v v2, (a0), t4
    ret
.size gcm_gmult_rv64i_zvkb_zvbc,.-gcm_gmult_rv64i_zvkb_zvbc
___
}

################################################################################
# void gcm_ghash_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16],
#                                const u8 *inp, size_t len);
#
# input: Xi: current hash value
#        Htable: preprocessed H
#        inp: pointer to input data
#        len: length of input data in bytes (multiple of block size)
# output: Xi: next hash value (updated in place once all blocks are processed)
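#
# For every 16-byte block the loop below computes
#   Xi = (Xi xor block) * H mod f
# i.e. each input block is folded into the running hash before the
# carry-less multiplication and reduction.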
{
my ($Xi,$Htable,$inp,$len,$TMP0,$TMP1,$TMP2,$TMP3,$M8,$TMP5,$TMP6) = ("a0","a1","a2","a3","t0","t1","t2","t3","t4","t5","t6");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6,$Vinp) = ("v0","v1","v2","v3","v4","v5","v6","v7");

$code .= <<___;
.p2align 3
.globl gcm_ghash_rv64i_zvkb_zvbc
.type gcm_ghash_rv64i_zvkb_zvbc,\@function
gcm_ghash_rv64i_zvkb_zvbc:
    ld $TMP0, ($Htable)
    ld $TMP1, 8($Htable)
    li $TMP2, 63
    la $TMP3, Lpolymod
    ld $TMP3, 8($TMP3)

    # Load/store data in reverse order.
    # This is needed as part of the endianness swap.
    add $Xi, $Xi, 8
    add $inp, $inp, 8
    li $M8, -8

    @{[vsetivli__x0_2_e64_m1_tu_mu]}    # vsetivli x0, 2, e64, m1, tu, mu

    @{[vlse64_v $V5, $Xi, $M8]}         # vlse64.v v5, (a0), t4

Lstep:
    # Read input data
    @{[vlse64_v $Vinp, $inp, $M8]}      # vlse64.v v7, (a2), t4
    add $inp, $inp, 16
    add $len, $len, -16
    # XOR them into Xi
    @{[vxor_vv $V5, $V5, $Vinp]}        # vxor.vv v5, v5, v7

    @{[vrev8_v $V5, $V5]}               # vrev8.v v5, v5

    # Multiplication

    # Do two 64x64 multiplications in one go to save some time
    # and simplify things.

    # A = a1a0 (t1, t0)
    # B = b1b0 (v5)
    # C = c1c0 (256 bit)
    # c1 = a1b1 + (a0b1)h + (a1b0)h
    # c0 = a0b0 + (a0b1)l + (a1b0)l

    # v1 = (a0b1)l,(a0b0)l
    @{[vclmul_vx $V1, $V5, $TMP0]}      # vclmul.vx v1, v5, t0
    # v3 = (a0b1)h,(a0b0)h
    @{[vclmulh_vx $V3, $V5, $TMP0]}     # vclmulh.vx v3, v5, t0

    # v4 = (a1b1)l,(a1b0)l
    @{[vclmul_vx $V4, $V5, $TMP1]}      # vclmul.vx v4, v5, t1
    # v2 = (a1b1)h,(a1b0)h
    @{[vclmulh_vx $V2, $V5, $TMP1]}     # vclmulh.vx v2, v5, t1

    # Is there a better way to do this?
    # Would need to swap the order of elements within a vector register.
    @{[vslideup_vi $V5, $V3, 1]}        # vslideup.vi v5, v3, 1
    @{[vslideup_vi $V6, $V4, 1]}        # vslideup.vi v6, v4, 1
    @{[vslidedown_vi $V3, $V3, 1]}      # vslidedown.vi v3, v3, 1
    @{[vslidedown_vi $V4, $V4, 1]}      # vslidedown.vi v4, v4, 1

    @{[vmv_v_i $V0, 1]}                 # vmv.v.i v0, 1
    # v2 += (a0b1)h
    @{[vxor_vv_v0t $V2, $V2, $V3]}      # vxor.vv v2, v2, v3, v0.t
    # v2 += (a1b1)l
    @{[vxor_vv_v0t $V2, $V2, $V4]}      # vxor.vv v2, v2, v4, v0.t

    @{[vmv_v_i $V0, 2]}                 # vmv.v.i v0, 2
    # v1 += (a0b0)h,0
    @{[vxor_vv_v0t $V1, $V1, $V5]}      # vxor.vv v1, v1, v5, v0.t
    # v1 += (a1b0)l,0
    @{[vxor_vv_v0t $V1, $V1, $V6]}      # vxor.vv v1, v1, v6, v0.t

    # Now the 256-bit product is stored in (v2,v1)
    # v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
    # v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l

    # Reduction
    # Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
    # This is a slight variation of Gueron's Montgomery reduction.
    # The difference is that the order of some operations has been changed
    # to make better use of the vclmul(h) instructions.

    # First step:
    # c1 += (c0 * P)l
    # (v0 still contains the mask value 2 set above)
    @{[vslideup_vi_v0t $V3, $V1, 1]}    # vslideup.vi v3, v1, 1, v0.t
    @{[vclmul_vx_v0t $V3, $V3, $TMP3]}  # vclmul.vx v3, v3, t3, v0.t
    @{[vxor_vv_v0t $V1, $V1, $V3]}      # vxor.vv v1, v1, v3, v0.t

    # Second step:
    # D = d1,d0 is the final result
    # We want:
    # m1 = c1 + (c1 * P)h
    # m0 = (c1 * P)l + (c0 * P)h + c0
    # d1 = c3 + m1
    # d0 = c2 + m0

    # v3 = (c1 * P)l, 0
    @{[vclmul_vx_v0t $V3, $V1, $TMP3]}  # vclmul.vx v3, v1, t3, v0.t
    # v4 = (c1 * P)h, (c0 * P)h
    @{[vclmulh_vx $V4, $V1, $TMP3]}     # vclmulh.vx v4, v1, t3

    @{[vmv_v_i $V0, 1]}                 # vmv.v.i v0, 1
    @{[vslidedown_vi $V3, $V3, 1]}      # vslidedown.vi v3, v3, 1

    @{[vxor_vv $V1, $V1, $V4]}          # vxor.vv v1, v1, v4
    @{[vxor_vv_v0t $V1, $V1, $V3]}      # vxor.vv v1, v1, v3, v0.t

    # XOR the reduced lower half into the upper part of the product
    @{[vxor_vv $V2, $V2, $V1]}          # vxor.vv v2, v2, v1

    @{[vrev8_v $V5, $V2]}               # vrev8.v v5, v2
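    # Keep the byte-reversed result in v5: it becomes Xi for the next
    # iteration and is only stored back to memory after the last block.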

    bnez $len, Lstep

    @{[vsse64_v $V5, $Xi, $M8]}         # vsse64.v v5, (a0), t4
    ret
.size gcm_ghash_rv64i_zvkb_zvbc,.-gcm_ghash_rv64i_zvkb_zvbc
___
}

$code .= <<___;
.p2align 4
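# Lpolymod holds the 128-bit reduction constant derived from the GCM
# polynomial (the same 0xc2...01 value used by the carry-less-multiply
# GHASH implementations on other platforms): gcm_init XORs in the whole
# 128-bit value, while gcm_gmult/gcm_ghash only load the upper dword for
# the reduction.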
Lpolymod:
    .dword 0x0000000000000001
    .dword 0xc200000000000000
.size Lpolymod,.-Lpolymod
___

print $code;

close STDOUT or die "error closing STDOUT: $!";