1 | #! /usr/bin/env perl
|
---|
2 | # This file is dual-licensed, meaning that you can use it under your
|
---|
3 | # choice of either of the following two licenses:
|
---|
4 | #
|
---|
5 | # Copyright 2023-2024 The OpenSSL Project Authors. All Rights Reserved.
|
---|
6 | #
|
---|
7 | # Licensed under the Apache License 2.0 (the "License"). You may not use
|
---|
8 | # this file except in compliance with the License. You can obtain a copy
|
---|
9 | # in the file LICENSE in the source distribution or at
|
---|
10 | # https://www.openssl.org/source/license.html
|
---|
11 | #
|
---|
12 | # or
|
---|
13 | #
|
---|
14 | # Copyright (c) 2023, Jerry Shih <[email protected]>
|
---|
15 | # All rights reserved.
|
---|
16 | #
|
---|
17 | # Redistribution and use in source and binary forms, with or without
|
---|
18 | # modification, are permitted provided that the following conditions
|
---|
19 | # are met:
|
---|
20 | # 1. Redistributions of source code must retain the above copyright
|
---|
21 | # notice, this list of conditions and the following disclaimer.
|
---|
22 | # 2. Redistributions in binary form must reproduce the above copyright
|
---|
23 | # notice, this list of conditions and the following disclaimer in the
|
---|
24 | # documentation and/or other materials provided with the distribution.
|
---|
25 | #
|
---|
26 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
---|
27 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
---|
28 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
---|
29 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
---|
30 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
---|
31 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
---|
32 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
---|
33 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
---|
34 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
---|
35 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
---|
36 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
---|
37 |
|
---|
# The generated code of this file depends on the following RISC-V extensions:
# - RV64I
|
---|
39 | # - RISC-V Vector ('V') with VLEN >= 128
|
---|
40 | # - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
|
---|
41 | # - RISC-V Basic Bit-manipulation extension ('Zbb')
|
---|
# - RISC-V Zicclsm (main memory supports misaligned loads/stores)
|
---|
43 |
|
---|
use strict;
use warnings;

use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Redirect STDOUT to the requested output file. Use the three-argument form
# of open so a filename cannot smuggle in an open mode (e.g. a leading '>'
# or trailing '|'), and fail loudly rather than silently discarding the
# generated assembly.
if (defined $output) {
    open STDOUT, '>', $output or die "Can't open $output: $!";
}

# Accumulator for the generated assembly text; emitted by the final print.
my $code = <<___;
.text
___
|
---|
62 |
|
---|
# void ChaCha20_ctr32_zbb_zvkb(unsigned char *out, const unsigned char *inp,
#                              size_t len, const unsigned int key[8],
#                              const unsigned int counter[4]);
################################################################################
# Function arguments, in RISC-V calling-convention order (a0..a4).
my ( $OUTPUT, $INPUT, $LEN, $KEY, $COUNTER ) = map { "a$_" } ( 0 .. 4 );

# Scalar registers carrying the 16 state words of the one ChaCha block that
# is computed on the scalar ALU in parallel with the vector lanes.
my ( $CONST_DATA0, $CONST_DATA1, $CONST_DATA2, $CONST_DATA3 ) =
    ( "a5", "a6", "a7", "s0" );
my ( $KEY0, $KEY1, $KEY2, $KEY3, $KEY4, $KEY5, $KEY6, $KEY7 ) =
    map { "s$_" } ( 1 .. 8 );
my ( $COUNTER0, $COUNTER1, $NONCE0, $NONCE1 ) = ( "s9", "s10", "s11", "t0" );

# $STATEn aliases word n of the scalar ChaCha state matrix, laid out in the
# standard order: constants, key, counter, nonce.
my ( $STATE0,  $STATE1,  $STATE2,  $STATE3,
     $STATE4,  $STATE5,  $STATE6,  $STATE7,
     $STATE8,  $STATE9,  $STATE10, $STATE11,
     $STATE12, $STATE13, $STATE14, $STATE15 ) =
    ( $CONST_DATA0, $CONST_DATA1, $CONST_DATA2, $CONST_DATA3,
      $KEY0,        $KEY1,        $KEY2,        $KEY3,
      $KEY4,        $KEY5,        $KEY6,        $KEY7,
      $COUNTER0,    $COUNTER1,    $NONCE0,      $NONCE1 );

# Scratch temporaries.
my ( $VL, $CURRENT_COUNTER, $T0, $T1, $T2, $T3 ) =
    ( "t1", "t2", "t3", "t4", "t5", "t6" );

# Vector register names v0..v31.
my ( $V0,  $V1,  $V2,  $V3,  $V4,  $V5,  $V6,  $V7,
     $V8,  $V9,  $V10, $V11, $V12, $V13, $V14, $V15,
     $V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23,
     $V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31 ) =
    map { "v$_" } ( 0 .. 31 );
|
---|
92 |
|
---|
# Emit one "quad round group": four ChaCha quarter-rounds performed in
# parallel, each on its own (a, b, c, d) column of the state. Every operation
# is issued twice — once on vector registers (one lane per block) and once on
# scalar registers (the extra block handled by the scalar ALU) — so the two
# pipelines are software-interleaved.
#
# Arguments (all register-name strings):
#   $A0..$D3   — 16 vector registers, four (a, b, c, d) quarter-round columns.
#   $S_A0..$S_D3 — the 16 scalar registers mirroring the same columns.
# Returns the generated assembly text as a string.
sub chacha_quad_round_group {
    my (
        # Vector quarter-round columns.
        $A0, $B0, $C0, $D0,
        $A1, $B1, $C1, $D1,
        $A2, $B2, $C2, $D2,
        $A3, $B3, $C3, $D3,

        # Scalar quarter-round columns for the scalar-ALU block.
        $S_A0, $S_B0, $S_C0, $S_D0,
        $S_A1, $S_B1, $S_C1, $S_D1,
        $S_A2, $S_B2, $S_C2, $S_D2,
        $S_A3, $S_B3, $S_C3, $S_D3,
    ) = @_;

    # Rotate-left by r is expressed as rotate-right by (32 - r) because
    # vror.vi (Zvkb) and roriw (Zbb) only provide right rotates.
    my $code = <<___;
    # a += b; d ^= a; d <<<= 16;
    @{[vadd_vv $A0, $A0, $B0]}
    add $S_A0, $S_A0, $S_B0
    @{[vadd_vv $A1, $A1, $B1]}
    add $S_A1, $S_A1, $S_B1
    @{[vadd_vv $A2, $A2, $B2]}
    add $S_A2, $S_A2, $S_B2
    @{[vadd_vv $A3, $A3, $B3]}
    add $S_A3, $S_A3, $S_B3
    @{[vxor_vv $D0, $D0, $A0]}
    xor $S_D0, $S_D0, $S_A0
    @{[vxor_vv $D1, $D1, $A1]}
    xor $S_D1, $S_D1, $S_A1
    @{[vxor_vv $D2, $D2, $A2]}
    xor $S_D2, $S_D2, $S_A2
    @{[vxor_vv $D3, $D3, $A3]}
    xor $S_D3, $S_D3, $S_A3
    @{[vror_vi $D0, $D0, 32 - 16]}
    @{[roriw $S_D0, $S_D0, 32 - 16]}
    @{[vror_vi $D1, $D1, 32 - 16]}
    @{[roriw $S_D1, $S_D1, 32 - 16]}
    @{[vror_vi $D2, $D2, 32 - 16]}
    @{[roriw $S_D2, $S_D2, 32 - 16]}
    @{[vror_vi $D3, $D3, 32 - 16]}
    @{[roriw $S_D3, $S_D3, 32 - 16]}
    # c += d; b ^= c; b <<<= 12;
    @{[vadd_vv $C0, $C0, $D0]}
    add $S_C0, $S_C0, $S_D0
    @{[vadd_vv $C1, $C1, $D1]}
    add $S_C1, $S_C1, $S_D1
    @{[vadd_vv $C2, $C2, $D2]}
    add $S_C2, $S_C2, $S_D2
    @{[vadd_vv $C3, $C3, $D3]}
    add $S_C3, $S_C3, $S_D3
    @{[vxor_vv $B0, $B0, $C0]}
    xor $S_B0, $S_B0, $S_C0
    @{[vxor_vv $B1, $B1, $C1]}
    xor $S_B1, $S_B1, $S_C1
    @{[vxor_vv $B2, $B2, $C2]}
    xor $S_B2, $S_B2, $S_C2
    @{[vxor_vv $B3, $B3, $C3]}
    xor $S_B3, $S_B3, $S_C3
    @{[vror_vi $B0, $B0, 32 - 12]}
    @{[roriw $S_B0, $S_B0, 32 - 12]}
    @{[vror_vi $B1, $B1, 32 - 12]}
    @{[roriw $S_B1, $S_B1, 32 - 12]}
    @{[vror_vi $B2, $B2, 32 - 12]}
    @{[roriw $S_B2, $S_B2, 32 - 12]}
    @{[vror_vi $B3, $B3, 32 - 12]}
    @{[roriw $S_B3, $S_B3, 32 - 12]}
    # a += b; d ^= a; d <<<= 8;
    @{[vadd_vv $A0, $A0, $B0]}
    add $S_A0, $S_A0, $S_B0
    @{[vadd_vv $A1, $A1, $B1]}
    add $S_A1, $S_A1, $S_B1
    @{[vadd_vv $A2, $A2, $B2]}
    add $S_A2, $S_A2, $S_B2
    @{[vadd_vv $A3, $A3, $B3]}
    add $S_A3, $S_A3, $S_B3
    @{[vxor_vv $D0, $D0, $A0]}
    xor $S_D0, $S_D0, $S_A0
    @{[vxor_vv $D1, $D1, $A1]}
    xor $S_D1, $S_D1, $S_A1
    @{[vxor_vv $D2, $D2, $A2]}
    xor $S_D2, $S_D2, $S_A2
    @{[vxor_vv $D3, $D3, $A3]}
    xor $S_D3, $S_D3, $S_A3
    @{[vror_vi $D0, $D0, 32 - 8]}
    @{[roriw $S_D0, $S_D0, 32 - 8]}
    @{[vror_vi $D1, $D1, 32 - 8]}
    @{[roriw $S_D1, $S_D1, 32 - 8]}
    @{[vror_vi $D2, $D2, 32 - 8]}
    @{[roriw $S_D2, $S_D2, 32 - 8]}
    @{[vror_vi $D3, $D3, 32 - 8]}
    @{[roriw $S_D3, $S_D3, 32 - 8]}
    # c += d; b ^= c; b <<<= 7;
    @{[vadd_vv $C0, $C0, $D0]}
    add $S_C0, $S_C0, $S_D0
    @{[vadd_vv $C1, $C1, $D1]}
    add $S_C1, $S_C1, $S_D1
    @{[vadd_vv $C2, $C2, $D2]}
    add $S_C2, $S_C2, $S_D2
    @{[vadd_vv $C3, $C3, $D3]}
    add $S_C3, $S_C3, $S_D3
    @{[vxor_vv $B0, $B0, $C0]}
    xor $S_B0, $S_B0, $S_C0
    @{[vxor_vv $B1, $B1, $C1]}
    xor $S_B1, $S_B1, $S_C1
    @{[vxor_vv $B2, $B2, $C2]}
    xor $S_B2, $S_B2, $S_C2
    @{[vxor_vv $B3, $B3, $C3]}
    xor $S_B3, $S_B3, $S_C3
    @{[vror_vi $B0, $B0, 32 - 7]}
    @{[roriw $S_B0, $S_B0, 32 - 7]}
    @{[vror_vi $B1, $B1, 32 - 7]}
    @{[roriw $S_B1, $S_B1, 32 - 7]}
    @{[vror_vi $B2, $B2, 32 - 7]}
    @{[roriw $S_B2, $S_B2, 32 - 7]}
    @{[vror_vi $B3, $B3, 32 - 7]}
    @{[roriw $S_B3, $S_B3, 32 - 7]}
___

    return $code;
}
|
---|
211 |
|
---|
# Main routine. Strategy per iteration of .Lblock_loop:
#   * VL full 64-byte blocks are processed in the vector unit (one block per
#     lane, e32/m1) while one additional block is processed on the scalar ALU
#     using the $STATEn registers — both pipelines interleaved instruction by
#     instruction.
#   * If no whole extra block remains for the scalar ALU, VL is reduced by
#     one so the scalar path computes the (possibly partial) tail block.
#   * The scalar keystream block is staged in the 64-byte scratch area at the
#     bottom of the stack frame and XORed with the input in .Lscalar_data_loop
#     (byte-wise, e8/m8, so a partial tail is handled naturally).
#
# Stack frame: 96 bytes to save the callee-saved s0..s11, plus 64 bytes of
# scratch for the scalar keystream block (sp points at the scratch area
# while inside the loop).
#
# NOTE(review): the asm comment before the counter/nonce loads previously
# said "init chacha key states" — a copy-paste duplicate of the section
# above it; it is corrected to "init chacha counter states" below.
$code .= <<___;
.p2align 3
.globl ChaCha20_ctr32_zbb_zvkb
.type ChaCha20_ctr32_zbb_zvkb,\@function
ChaCha20_ctr32_zbb_zvkb:
    addi sp, sp, -96
    sd s0, 0(sp)
    sd s1, 8(sp)
    sd s2, 16(sp)
    sd s3, 24(sp)
    sd s4, 32(sp)
    sd s5, 40(sp)
    sd s6, 48(sp)
    sd s7, 56(sp)
    sd s8, 64(sp)
    sd s9, 72(sp)
    sd s10, 80(sp)
    sd s11, 88(sp)
    addi sp, sp, -64

    lw $CURRENT_COUNTER, 0($COUNTER)

.Lblock_loop:
    # We will use the scalar ALU for 1 chacha block.
    srli $T0, $LEN, 6
    @{[vsetvli $VL, $T0, "e32", "m1", "ta", "ma"]}
    slli $T1, $VL, 6
    bltu $T1, $LEN, 1f
    # Since there is no more chacha block existed, we need to split 1 block
    # from vector ALU.
    addi $T1, $VL, -1
    @{[vsetvli $VL, $T1, "e32", "m1", "ta", "ma"]}
1:

    #### chacha block data
    # init chacha const states
    # "expa" little endian
    li $CONST_DATA0, 0x61707865
    @{[vmv_v_x $V0, $CONST_DATA0]}
    # "nd 3" little endian
    li $CONST_DATA1, 0x3320646e
    @{[vmv_v_x $V1, $CONST_DATA1]}
    # "2-by" little endian
    li $CONST_DATA2, 0x79622d32
    @{[vmv_v_x $V2, $CONST_DATA2]}
    # "te k" little endian
    li $CONST_DATA3, 0x6b206574
    lw $KEY0, 0($KEY)
    @{[vmv_v_x $V3, $CONST_DATA3]}

    # init chacha key states
    lw $KEY1, 4($KEY)
    @{[vmv_v_x $V4, $KEY0]}
    lw $KEY2, 8($KEY)
    @{[vmv_v_x $V5, $KEY1]}
    lw $KEY3, 12($KEY)
    @{[vmv_v_x $V6, $KEY2]}
    lw $KEY4, 16($KEY)
    @{[vmv_v_x $V7, $KEY3]}
    lw $KEY5, 20($KEY)
    @{[vmv_v_x $V8, $KEY4]}
    lw $KEY6, 24($KEY)
    @{[vmv_v_x $V9, $KEY5]}
    lw $KEY7, 28($KEY)
    @{[vmv_v_x $V10, $KEY6]}
    @{[vmv_v_x $V11, $KEY7]}

    # init chacha counter states
    lw $COUNTER1, 4($COUNTER)
    @{[vid_v $V12]}
    lw $NONCE0, 8($COUNTER)
    @{[vadd_vx $V12, $V12, $CURRENT_COUNTER]}
    lw $NONCE1, 12($COUNTER)
    @{[vmv_v_x $V13, $COUNTER1]}
    add $COUNTER0, $CURRENT_COUNTER, $VL

    # init chacha nonce states
    @{[vmv_v_x $V14, $NONCE0]}
    @{[vmv_v_x $V15, $NONCE1]}

    li $T0, 64
    # load the top-half of input data
    @{[vlsseg_nf_e32_v 8, $V16, $INPUT, $T0]}

    # 20 round groups
    li $T0, 10
.Lround_loop:
    addi $T0, $T0, -1
    @{[chacha_quad_round_group
      $V0, $V4, $V8, $V12,
      $V1, $V5, $V9, $V13,
      $V2, $V6, $V10, $V14,
      $V3, $V7, $V11, $V15,
      $STATE0, $STATE4, $STATE8, $STATE12,
      $STATE1, $STATE5, $STATE9, $STATE13,
      $STATE2, $STATE6, $STATE10, $STATE14,
      $STATE3, $STATE7, $STATE11, $STATE15]}
    @{[chacha_quad_round_group
      $V3, $V4, $V9, $V14,
      $V0, $V5, $V10, $V15,
      $V1, $V6, $V11, $V12,
      $V2, $V7, $V8, $V13,
      $STATE3, $STATE4, $STATE9, $STATE14,
      $STATE0, $STATE5, $STATE10, $STATE15,
      $STATE1, $STATE6, $STATE11, $STATE12,
      $STATE2, $STATE7, $STATE8, $STATE13]}
    bnez $T0, .Lround_loop

    li $T0, 64
    # load the bottom-half of input data
    addi $T1, $INPUT, 32
    @{[vlsseg_nf_e32_v 8, $V24, $T1, $T0]}

    # add chacha top-half initial block states
    # "expa" little endian
    li $T0, 0x61707865
    @{[vadd_vx $V0, $V0, $T0]}
    add $STATE0, $STATE0, $T0
    # "nd 3" little endian
    li $T1, 0x3320646e
    @{[vadd_vx $V1, $V1, $T1]}
    add $STATE1, $STATE1, $T1
    lw $T0, 0($KEY)
    # "2-by" little endian
    li $T2, 0x79622d32
    @{[vadd_vx $V2, $V2, $T2]}
    add $STATE2, $STATE2, $T2
    lw $T1, 4($KEY)
    # "te k" little endian
    li $T3, 0x6b206574
    @{[vadd_vx $V3, $V3, $T3]}
    add $STATE3, $STATE3, $T3
    lw $T2, 8($KEY)
    @{[vadd_vx $V4, $V4, $T0]}
    add $STATE4, $STATE4, $T0
    lw $T3, 12($KEY)
    @{[vadd_vx $V5, $V5, $T1]}
    add $STATE5, $STATE5, $T1
    @{[vadd_vx $V6, $V6, $T2]}
    add $STATE6, $STATE6, $T2
    @{[vadd_vx $V7, $V7, $T3]}
    add $STATE7, $STATE7, $T3

    # xor with the top-half input
    @{[vxor_vv $V16, $V16, $V0]}
    sw $STATE0, 0(sp)
    sw $STATE1, 4(sp)
    @{[vxor_vv $V17, $V17, $V1]}
    sw $STATE2, 8(sp)
    sw $STATE3, 12(sp)
    @{[vxor_vv $V18, $V18, $V2]}
    sw $STATE4, 16(sp)
    sw $STATE5, 20(sp)
    @{[vxor_vv $V19, $V19, $V3]}
    sw $STATE6, 24(sp)
    sw $STATE7, 28(sp)
    @{[vxor_vv $V20, $V20, $V4]}
    lw $T0, 16($KEY)
    @{[vxor_vv $V21, $V21, $V5]}
    lw $T1, 20($KEY)
    @{[vxor_vv $V22, $V22, $V6]}
    lw $T2, 24($KEY)
    @{[vxor_vv $V23, $V23, $V7]}

    # save the top-half of output
    li $T3, 64
    @{[vssseg_nf_e32_v 8, $V16, $OUTPUT, $T3]}

    # add chacha bottom-half initial block states
    @{[vadd_vx $V8, $V8, $T0]}
    add $STATE8, $STATE8, $T0
    lw $T3, 28($KEY)
    @{[vadd_vx $V9, $V9, $T1]}
    add $STATE9, $STATE9, $T1
    lw $T0, 4($COUNTER)
    @{[vadd_vx $V10, $V10, $T2]}
    add $STATE10, $STATE10, $T2
    lw $T1, 8($COUNTER)
    @{[vadd_vx $V11, $V11, $T3]}
    add $STATE11, $STATE11, $T3
    lw $T2, 12($COUNTER)
    @{[vid_v $V0]}
    add $STATE12, $STATE12, $CURRENT_COUNTER
    @{[vadd_vx $V12, $V12, $CURRENT_COUNTER]}
    add $STATE12, $STATE12, $VL
    @{[vadd_vx $V13, $V13, $T0]}
    add $STATE13, $STATE13, $T0
    @{[vadd_vx $V14, $V14, $T1]}
    add $STATE14, $STATE14, $T1
    @{[vadd_vx $V15, $V15, $T2]}
    add $STATE15, $STATE15, $T2
    @{[vadd_vv $V12, $V12, $V0]}
    # xor with the bottom-half input
    @{[vxor_vv $V24, $V24, $V8]}
    sw $STATE8, 32(sp)
    @{[vxor_vv $V25, $V25, $V9]}
    sw $STATE9, 36(sp)
    @{[vxor_vv $V26, $V26, $V10]}
    sw $STATE10, 40(sp)
    @{[vxor_vv $V27, $V27, $V11]}
    sw $STATE11, 44(sp)
    @{[vxor_vv $V29, $V29, $V13]}
    sw $STATE12, 48(sp)
    @{[vxor_vv $V28, $V28, $V12]}
    sw $STATE13, 52(sp)
    @{[vxor_vv $V30, $V30, $V14]}
    sw $STATE14, 56(sp)
    @{[vxor_vv $V31, $V31, $V15]}
    sw $STATE15, 60(sp)

    # save the bottom-half of output
    li $T0, 64
    addi $T1, $OUTPUT, 32
    @{[vssseg_nf_e32_v 8, $V24, $T1, $T0]}

    # the computed vector parts: `64 * VL`
    slli $T0, $VL, 6

    add $INPUT, $INPUT, $T0
    add $OUTPUT, $OUTPUT, $T0
    sub $LEN, $LEN, $T0
    add $CURRENT_COUNTER, $CURRENT_COUNTER, $VL

    # process the scalar data block
    addi $CURRENT_COUNTER, $CURRENT_COUNTER, 1
    li $T0, 64
    @{[minu $T1, $LEN, $T0]}
    sub $LEN, $LEN, $T1
    mv $T2, sp
.Lscalar_data_loop:
    @{[vsetvli $VL, $T1, "e8", "m8", "ta", "ma"]}
    @{[vle8_v $V8, $INPUT]}
    @{[vle8_v $V16, $T2]}
    @{[vxor_vv $V8, $V8, $V16]}
    @{[vse8_v $V8, $OUTPUT]}
    add $INPUT, $INPUT, $VL
    add $OUTPUT, $OUTPUT, $VL
    add $T2, $T2, $VL
    sub $T1, $T1, $VL
    bnez $T1, .Lscalar_data_loop

    bnez $LEN, .Lblock_loop

    addi sp, sp, 64
    ld s0, 0(sp)
    ld s1, 8(sp)
    ld s2, 16(sp)
    ld s3, 24(sp)
    ld s4, 32(sp)
    ld s5, 40(sp)
    ld s6, 48(sp)
    ld s7, 56(sp)
    ld s8, 64(sp)
    ld s9, 72(sp)
    ld s10, 80(sp)
    ld s11, 88(sp)
    addi sp, sp, 96

    ret
.size ChaCha20_ctr32_zbb_zvkb,.-ChaCha20_ctr32_zbb_zvkb
___
|
---|
473 |
|
---|
# Emit the accumulated assembly to STDOUT (redirected to $output when given).
print $code;

# Check close explicitly: buffered write errors only surface at close time,
# and a truncated .S file must not go unnoticed.
close STDOUT or die "error closing STDOUT: $!";
|
---|