Mercurial > hg > CbC > CbC_gcc
comparison gcc/config/xtensa/ieee754-sf.S @ 0:a06113de4d67
first commit
author | kent <kent@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Fri, 17 Jul 2009 14:47:48 +0900 |
parents | |
children |
comparison legend: equal | deleted | inserted | replaced
-1:000000000000 | 0:a06113de4d67 |
---|---|
1 /* IEEE-754 single-precision functions for Xtensa | |
2 Copyright (C) 2006, 2007, 2009 Free Software Foundation, Inc. | |
3 Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica. | |
4 | |
5 This file is part of GCC. | |
6 | |
7 GCC is free software; you can redistribute it and/or modify it | |
8 under the terms of the GNU General Public License as published by | |
9 the Free Software Foundation; either version 3, or (at your option) | |
10 any later version. | |
11 | |
12 GCC is distributed in the hope that it will be useful, but WITHOUT | |
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public | |
15 License for more details. | |
16 | |
17 Under Section 7 of GPL version 3, you are granted additional | |
18 permissions described in the GCC Runtime Library Exception, version | |
19 3.1, as published by the Free Software Foundation. | |
20 | |
21 You should have received a copy of the GNU General Public License and | |
22 a copy of the GCC Runtime Library Exception along with this program; | |
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
24 <http://www.gnu.org/licenses/>. */ | |
25 | |
26 #ifdef __XTENSA_EB__ | |
27 #define xh a2 | |
28 #define xl a3 | |
29 #define yh a4 | |
30 #define yl a5 | |
31 #else | |
32 #define xh a3 | |
33 #define xl a2 | |
34 #define yh a5 | |
35 #define yl a4 | |
36 #endif | |
37 | |
38 /* Warning! The branch displacements for some Xtensa branch instructions | |
39 are quite small, and this code has been carefully laid out to keep | |
40 branch targets in range. If you change anything, be sure to check that | |
41 the assembler is not relaxing anything to branch over a jump. */ | |
42 | |
43 #ifdef L_negsf2 | |
44 | |
/* float __negsf2 (float x)
   Negate a single-precision value by flipping its sign bit (bit 31).
   Input and output are the raw IEEE-754 bits in a2; a4 is scratch.
   No special cases are needed: flipping only bit 31 is correct for
   zeros, subnormals, infinities, and NaNs alike.  (leaf_entry and
   leaf_return are macros defined elsewhere in the Xtensa libgcc
   support code — TODO confirm their exact definition there.)  */
45 .align 4 | |
46 .global __negsf2 | |
47 .type __negsf2, @function | |
48 __negsf2: | |
49 leaf_entry sp, 16 | |
50 movi a4, 0x80000000 | |
51 xor a2, a2, a4 | |
52 leaf_return | |
53 | |
54 #endif /* L_negsf2 */ | |
55 | |
56 #ifdef L_addsubsf3 | |
57 | |
58 /* Addition */ | |
/* float __addsf3 (float x, float y)
   Single-precision addition.  Operands arrive as raw IEEE-754 bits
   in a2 (x) and a3 (y); the result is returned in a2.  a6 holds the
   exponent-field mask 0x7f800000 for the whole function.  Rounding
   is round-to-nearest, ties-to-even (see .Ladd_round, which tests
   the shifted-out bits in a9, and .Ladd_exactlyhalf, which clears
   the lsb on an exact tie).  When the operand signs differ this
   code jumps into the subtraction path (.Lsub_same_sign), and
   __subsf3 jumps back here (.Ladd_same_sign) for its opposite-sign
   case, so both functions must be assembled together in this same
   L_addsubsf3 section.  */
59 __addsf3_aux: | |
60 | |
61 /* Handle NaNs and Infinities. (This code is placed before the | |
62 start of the function just to keep it in range of the limited | |
63 branch displacements.) */ | |
64 | |
65 .Ladd_xnan_or_inf: | |
66 /* If y is neither Infinity nor NaN, return x. */ | |
67 bnall a3, a6, 1f | |
68 /* If x is a NaN, return it. Otherwise, return y. */ | |
69 slli a7, a2, 9 | |
70 beqz a7, .Ladd_ynan_or_inf | |
71 1: leaf_return | |
72 | |
73 .Ladd_ynan_or_inf: | |
74 /* Return y. */ | |
75 mov a2, a3 | |
76 leaf_return | |
77 | |
78 .Ladd_opposite_signs: | |
79 /* Operand signs differ. Do a subtraction. */ | |
80 slli a7, a6, 8 | |
81 xor a3, a3, a7 | |
82 j .Lsub_same_sign | |
83 | |
84 .align 4 | |
85 .global __addsf3 | |
86 .type __addsf3, @function | |
87 __addsf3: | |
88 leaf_entry sp, 16 | |
89 movi a6, 0x7f800000 | |
90 | |
91 /* Check if the two operands have the same sign. */ | |
92 xor a7, a2, a3 | |
93 bltz a7, .Ladd_opposite_signs | |
94 | |
95 .Ladd_same_sign: | |
96 /* Check if either exponent == 0x7f8 (i.e., NaN or Infinity). */ | |
97 ball a2, a6, .Ladd_xnan_or_inf | |
98 ball a3, a6, .Ladd_ynan_or_inf | |
99 | |
100 /* Compare the exponents. The smaller operand will be shifted | |
101 right by the exponent difference and added to the larger | |
102 one. */ | |
/* The 9-bit extracted fields include the sign bits, but the signs
   are known equal on this path, so comparing the fields still
   compares the exponents.  */
103 extui a7, a2, 23, 9 | |
104 extui a8, a3, 23, 9 | |
105 bltu a7, a8, .Ladd_shiftx | |
106 | |
107 .Ladd_shifty: | |
108 /* Check if the smaller (or equal) exponent is zero. */ | |
109 bnone a3, a6, .Ladd_yexpzero | |
110 | |
111 /* Replace y sign/exponent with 0x008. */ | |
112 or a3, a3, a6 | |
113 slli a3, a3, 8 | |
114 srli a3, a3, 8 | |
115 | |
116 .Ladd_yexpdiff: | |
117 /* Compute the exponent difference. */ | |
118 sub a10, a7, a8 | |
119 | |
120 /* Exponent difference > 32 -- just return the bigger value. */ | |
121 bgeui a10, 32, 1f | |
122 | |
123 /* Shift y right by the exponent difference. Any bits that are | |
124 shifted out of y are saved in a9 for rounding the result. */ | |
125 ssr a10 | |
126 movi a9, 0 | |
127 src a9, a3, a9 | |
128 srl a3, a3 | |
129 | |
130 /* Do the addition. */ | |
131 add a2, a2, a3 | |
132 | |
133 /* Check if the add overflowed into the exponent. */ | |
134 extui a10, a2, 23, 9 | |
135 beq a10, a7, .Ladd_round | |
136 mov a8, a7 | |
137 j .Ladd_carry | |
138 | |
139 .Ladd_yexpzero: | |
140 /* y is a subnormal value. Replace its sign/exponent with zero, | |
141 i.e., no implicit "1.0", and increment the apparent exponent | |
142 because subnormals behave as if they had the minimum (nonzero) | |
143 exponent. Test for the case when both exponents are zero. */ | |
144 slli a3, a3, 9 | |
145 srli a3, a3, 9 | |
146 bnone a2, a6, .Ladd_bothexpzero | |
147 addi a8, a8, 1 | |
148 j .Ladd_yexpdiff | |
149 | |
150 .Ladd_bothexpzero: | |
151 /* Both exponents are zero. Handle this as a special case. There | |
152 is no need to shift or round, and the normal code for handling | |
153 a carry into the exponent field will not work because it | |
154 assumes there is an implicit "1.0" that needs to be added. */ | |
155 add a2, a2, a3 | |
156 1: leaf_return | |
157 | |
158 .Ladd_xexpzero: | |
159 /* Same as "yexpzero" except skip handling the case when both | |
160 exponents are zero. */ | |
161 slli a2, a2, 9 | |
162 srli a2, a2, 9 | |
163 addi a7, a7, 1 | |
164 j .Ladd_xexpdiff | |
165 | |
166 .Ladd_shiftx: | |
167 /* Same thing as the "shifty" code, but with x and y swapped. Also, | |
168 because the exponent difference is always nonzero in this version, | |
169 the shift sequence can use SLL and skip loading a constant zero. */ | |
170 bnone a2, a6, .Ladd_xexpzero | |
171 | |
172 or a2, a2, a6 | |
173 slli a2, a2, 8 | |
174 srli a2, a2, 8 | |
175 | |
176 .Ladd_xexpdiff: | |
177 sub a10, a8, a7 | |
178 bgeui a10, 32, .Ladd_returny | |
179 | |
180 ssr a10 | |
181 sll a9, a2 | |
182 srl a2, a2 | |
183 | |
184 add a2, a2, a3 | |
185 | |
186 /* Check if the add overflowed into the exponent. */ | |
187 extui a10, a2, 23, 9 | |
188 bne a10, a8, .Ladd_carry | |
189 | |
190 .Ladd_round: | |
191 /* Round up if the leftover fraction is >= 1/2. */ | |
192 bgez a9, 1f | |
193 addi a2, a2, 1 | |
194 | |
195 /* Check if the leftover fraction is exactly 1/2. */ | |
196 slli a9, a9, 1 | |
197 beqz a9, .Ladd_exactlyhalf | |
198 1: leaf_return | |
199 | |
200 .Ladd_returny: | |
201 mov a2, a3 | |
202 leaf_return | |
203 | |
204 .Ladd_carry: | |
205 /* The addition has overflowed into the exponent field, so the | |
206 value needs to be renormalized. The mantissa of the result | |
207 can be recovered by subtracting the original exponent and | |
208 adding 0x800000 (which is the explicit "1.0" for the | |
209 mantissa of the non-shifted operand -- the "1.0" for the | |
210 shifted operand was already added). The mantissa can then | |
211 be shifted right by one bit. The explicit "1.0" of the | |
212 shifted mantissa then needs to be replaced by the exponent, | |
213 incremented by one to account for the normalizing shift. | |
214 It is faster to combine these operations: do the shift first | |
215 and combine the additions and subtractions. If x is the | |
216 original exponent, the result is: | |
217 shifted mantissa - (x << 22) + (1 << 22) + (x << 23) | |
218 or: | |
219 shifted mantissa + ((x + 1) << 22) | |
220 Note that the exponent is incremented here by leaving the | |
221 explicit "1.0" of the mantissa in the exponent field. */ | |
222 | |
223 /* Shift x right by one bit. Save the lsb. */ | |
224 mov a10, a2 | |
225 srli a2, a2, 1 | |
226 | |
227 /* See explanation above. The original exponent is in a8. */ | |
228 addi a8, a8, 1 | |
229 slli a8, a8, 22 | |
230 add a2, a2, a8 | |
231 | |
232 /* Return an Infinity if the exponent overflowed. */ | |
233 ball a2, a6, .Ladd_infinity | |
234 | |
235 /* Same thing as the "round" code except the msb of the leftover | |
236 fraction is bit 0 of a10, with the rest of the fraction in a9. */ | |
237 bbci.l a10, 0, 1f | |
238 addi a2, a2, 1 | |
239 beqz a9, .Ladd_exactlyhalf | |
240 1: leaf_return | |
241 | |
242 .Ladd_infinity: | |
243 /* Clear the mantissa. */ | |
244 srli a2, a2, 23 | |
245 slli a2, a2, 23 | |
246 | |
247 /* The sign bit may have been lost in a carry-out. Put it back. */ | |
248 slli a8, a8, 1 | |
249 or a2, a2, a8 | |
250 leaf_return | |
251 | |
252 .Ladd_exactlyhalf: | |
253 /* Round down to the nearest even value. */ | |
254 srli a2, a2, 1 | |
255 slli a2, a2, 1 | |
256 leaf_return | |
257 | |
258 | |
259 /* Subtraction */ | |
/* float __subsf3 (float x, float y)
   Single-precision subtraction, x - y, with the same register
   conventions as __addsf3: operands as raw IEEE-754 bits in a2 (x)
   and a3 (y), result in a2, and a6 = 0x7f800000 throughout.  Only
   magnitude subtraction is done here; when the operand signs differ
   the job is really an addition, so control jumps to
   .Ladd_same_sign (and __addsf3 jumps here, to .Lsub_same_sign,
   for its own opposite-sign case).  Rounding is round-to-nearest,
   ties-to-even, as in __addsf3.  */
260 __subsf3_aux: | |
261 | |
262 /* Handle NaNs and Infinities. (This code is placed before the | |
263 start of the function just to keep it in range of the limited | |
264 branch displacements.) */ | |
265 | |
266 .Lsub_xnan_or_inf: | |
267 /* If y is neither Infinity nor NaN, return x. */ | |
268 bnall a3, a6, 1f | |
269 /* Both x and y are either NaN or Inf, so the result is NaN. */ | |
270 movi a4, 0x400000 /* make it a quiet NaN */ | |
271 or a2, a2, a4 | |
272 1: leaf_return | |
273 | |
274 .Lsub_ynan_or_inf: | |
275 /* Negate y and return it. */ | |
276 slli a7, a6, 8 | |
277 xor a2, a3, a7 | |
278 leaf_return | |
279 | |
280 .Lsub_opposite_signs: | |
281 /* Operand signs differ. Do an addition. */ | |
282 slli a7, a6, 8 | |
283 xor a3, a3, a7 | |
284 j .Ladd_same_sign | |
285 | |
286 .align 4 | |
287 .global __subsf3 | |
288 .type __subsf3, @function | |
289 __subsf3: | |
290 leaf_entry sp, 16 | |
291 movi a6, 0x7f800000 | |
292 | |
293 /* Check if the two operands have the same sign. */ | |
294 xor a7, a2, a3 | |
295 bltz a7, .Lsub_opposite_signs | |
296 | |
297 .Lsub_same_sign: | |
298 /* Check if either exponent == 0x7f8 (i.e., NaN or Infinity). */ | |
299 ball a2, a6, .Lsub_xnan_or_inf | |
300 ball a3, a6, .Lsub_ynan_or_inf | |
301 | |
302 /* Compare the operands. In contrast to addition, the entire | |
303 value matters here. */ | |
304 extui a7, a2, 23, 8 | |
305 extui a8, a3, 23, 8 | |
306 bltu a2, a3, .Lsub_xsmaller | |
307 | |
308 .Lsub_ysmaller: | |
309 /* Check if the smaller (or equal) exponent is zero. */ | |
310 bnone a3, a6, .Lsub_yexpzero | |
311 | |
312 /* Replace y sign/exponent with 0x008. */ | |
313 or a3, a3, a6 | |
314 slli a3, a3, 8 | |
315 srli a3, a3, 8 | |
316 | |
317 .Lsub_yexpdiff: | |
318 /* Compute the exponent difference. */ | |
319 sub a10, a7, a8 | |
320 | |
321 /* Exponent difference > 32 -- just return the bigger value. */ | |
322 bgeui a10, 32, 1f | |
323 | |
324 /* Shift y right by the exponent difference. Any bits that are | |
325 shifted out of y are saved in a9 for rounding the result. */ | |
326 ssr a10 | |
327 movi a9, 0 | |
328 src a9, a3, a9 | |
329 srl a3, a3 | |
330 | |
331 sub a2, a2, a3 | |
332 | |
333 /* Subtract the leftover bits in a9 from zero and propagate any | |
334 borrow from a2. */ | |
335 neg a9, a9 | |
336 addi a10, a2, -1 | |
337 movnez a2, a10, a9 | |
338 | |
339 /* Check if the subtract underflowed into the exponent. */ | |
340 extui a10, a2, 23, 8 | |
341 beq a10, a7, .Lsub_round | |
342 j .Lsub_borrow | |
343 | |
344 .Lsub_yexpzero: | |
345 /* Return zero if the inputs are equal. (For the non-subnormal | |
346 case, subtracting the "1.0" will cause a borrow from the exponent | |
347 and this case can be detected when handling the borrow.) */ | |
348 beq a2, a3, .Lsub_return_zero | |
349 | |
350 /* y is a subnormal value. Replace its sign/exponent with zero, | |
351 i.e., no implicit "1.0". Unless x is also a subnormal, increment | |
352 y's apparent exponent because subnormals behave as if they had | |
353 the minimum (nonzero) exponent. */ | |
354 slli a3, a3, 9 | |
355 srli a3, a3, 9 | |
356 bnone a2, a6, .Lsub_yexpdiff | |
357 addi a8, a8, 1 | |
358 j .Lsub_yexpdiff | |
359 | |
360 .Lsub_returny: | |
361 /* Negate and return y. */ | |
362 slli a7, a6, 8 | |
363 xor a2, a3, a7 | |
364 1: leaf_return | |
365 | |
366 .Lsub_xsmaller: | |
367 /* Same thing as the "ysmaller" code, but with x and y swapped and | |
368 with y negated. */ | |
369 bnone a2, a6, .Lsub_xexpzero | |
370 | |
371 or a2, a2, a6 | |
372 slli a2, a2, 8 | |
373 srli a2, a2, 8 | |
374 | |
375 .Lsub_xexpdiff: | |
376 sub a10, a8, a7 | |
377 bgeui a10, 32, .Lsub_returny | |
378 | |
379 ssr a10 | |
380 movi a9, 0 | |
381 src a9, a2, a9 | |
382 srl a2, a2 | |
383 | |
384 /* Negate y. */ | |
385 slli a11, a6, 8 | |
386 xor a3, a3, a11 | |
387 | |
388 sub a2, a3, a2 | |
389 | |
390 neg a9, a9 | |
391 addi a10, a2, -1 | |
392 movnez a2, a10, a9 | |
393 | |
394 /* Check if the subtract underflowed into the exponent. */ | |
395 extui a10, a2, 23, 8 | |
396 bne a10, a8, .Lsub_borrow | |
397 | |
398 .Lsub_round: | |
399 /* Round up if the leftover fraction is >= 1/2. */ | |
400 bgez a9, 1f | |
401 addi a2, a2, 1 | |
402 | |
403 /* Check if the leftover fraction is exactly 1/2. */ | |
404 slli a9, a9, 1 | |
405 beqz a9, .Lsub_exactlyhalf | |
406 1: leaf_return | |
407 | |
408 .Lsub_xexpzero: | |
409 /* Same as "yexpzero". */ | |
410 beq a2, a3, .Lsub_return_zero | |
411 slli a2, a2, 9 | |
412 srli a2, a2, 9 | |
413 bnone a3, a6, .Lsub_xexpdiff | |
414 addi a7, a7, 1 | |
415 j .Lsub_xexpdiff | |
416 | |
417 .Lsub_return_zero: | |
418 movi a2, 0 | |
419 leaf_return | |
420 | |
421 .Lsub_borrow: | |
422 /* The subtraction has underflowed into the exponent field, so the | |
423 value needs to be renormalized. Shift the mantissa left as | |
424 needed to remove any leading zeros and adjust the exponent | |
425 accordingly. If the exponent is not large enough to remove | |
426 all the leading zeros, the result will be a subnormal value. */ | |
427 | |
428 slli a8, a2, 9 | |
429 beqz a8, .Lsub_xzero | |
430 do_nsau a6, a8, a7, a11 | |
431 srli a8, a8, 9 | |
432 bge a6, a10, .Lsub_subnormal | |
433 addi a6, a6, 1 | |
434 | |
435 .Lsub_normalize_shift: | |
436 /* Shift the mantissa (a8/a9) left by a6. */ | |
437 ssl a6 | |
438 src a8, a8, a9 | |
439 sll a9, a9 | |
440 | |
441 /* Combine the shifted mantissa with the sign and exponent, | |
442 decrementing the exponent by a6. (The exponent has already | |
443 been decremented by one due to the borrow from the subtraction, | |
444 but adding the mantissa will increment the exponent by one.) */ | |
445 srli a2, a2, 23 | |
446 sub a2, a2, a6 | |
447 slli a2, a2, 23 | |
448 add a2, a2, a8 | |
449 j .Lsub_round | |
450 | |
451 .Lsub_exactlyhalf: | |
452 /* Round down to the nearest even value. */ | |
453 srli a2, a2, 1 | |
454 slli a2, a2, 1 | |
455 leaf_return | |
456 | |
457 .Lsub_xzero: | |
458 /* If there was a borrow from the exponent, and the mantissa and | |
459 guard digits are all zero, then the inputs were equal and the | |
460 result should be zero. */ | |
461 beqz a9, .Lsub_return_zero | |
462 | |
463 /* Only the guard digit is nonzero. Shift by min(24, a10). */ | |
464 addi a11, a10, -24 | |
465 movi a6, 24 | |
466 movltz a6, a10, a11 | |
467 j .Lsub_normalize_shift | |
468 | |
469 .Lsub_subnormal: | |
470 /* The exponent is too small to shift away all the leading zeros. | |
471 Set a6 to the current exponent (which has already been | |
472 decremented by the borrow) so that the exponent of the result | |
473 will be zero. Do not add 1 to a6 in this case, because: (1) | |
474 adding the mantissa will not increment the exponent, so there is | |
475 no need to subtract anything extra from the exponent to | |
476 compensate, and (2) the effective exponent of a subnormal is 1 | |
477 not 0 so the shift amount must be 1 smaller than normal. */ | |
478 mov a6, a10 | |
479 j .Lsub_normalize_shift | |
480 | |
481 #endif /* L_addsubsf3 */ | |
482 | |
483 #ifdef L_mulsf3 | |
484 | |
485 /* Multiplication */ | |
486 #if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16 | |
487 #define XCHAL_NO_MUL 1 | |
488 #endif | |
489 | |
/* Out-of-line special cases for __mulsf3: subnormal operands are
   normalized (with the adjusted exponents kept in a8 for x and a9
   for y), zero operands produce a correctly signed zero, and
   NaN/Inf operands produce a quiet NaN or a signed result.  a7
   holds x ^ y, whose bit 31 is the sign of the result.  Placed
   before the __mulsf3 entry point to keep the branches to it
   within the limited displacement range.  */
490 __mulsf3_aux: | |
491 | |
492 /* Handle unusual cases (zeros, subnormals, NaNs and Infinities). | |
493 (This code is placed before the start of the function just to | |
494 keep it in range of the limited branch displacements.) */ | |
495 | |
496 .Lmul_xexpzero: | |
497 /* Clear the sign bit of x. */ | |
498 slli a2, a2, 1 | |
499 srli a2, a2, 1 | |
500 | |
501 /* If x is zero, return zero. */ | |
502 beqz a2, .Lmul_return_zero | |
503 | |
504 /* Normalize x. Adjust the exponent in a8. */ | |
505 do_nsau a10, a2, a11, a12 | |
506 addi a10, a10, -8 | |
507 ssl a10 | |
508 sll a2, a2 | |
509 movi a8, 1 | |
510 sub a8, a8, a10 | |
511 j .Lmul_xnormalized | |
512 | |
513 .Lmul_yexpzero: | |
514 /* Clear the sign bit of y. */ | |
515 slli a3, a3, 1 | |
516 srli a3, a3, 1 | |
517 | |
518 /* If y is zero, return zero. */ | |
519 beqz a3, .Lmul_return_zero | |
520 | |
521 /* Normalize y. Adjust the exponent in a9. */ | |
522 do_nsau a10, a3, a11, a12 | |
523 addi a10, a10, -8 | |
524 ssl a10 | |
525 sll a3, a3 | |
526 movi a9, 1 | |
527 sub a9, a9, a10 | |
528 j .Lmul_ynormalized | |
529 | |
530 .Lmul_return_zero: | |
531 /* Return zero with the appropriate sign bit. */ | |
532 srli a2, a7, 31 | |
533 slli a2, a2, 31 | |
534 j .Lmul_done | |
535 | |
536 .Lmul_xnan_or_inf: | |
537 /* If y is zero, return NaN. */ | |
538 slli a8, a3, 1 | |
539 bnez a8, 1f | |
540 movi a4, 0x400000 /* make it a quiet NaN */ | |
541 or a2, a2, a4 | |
542 j .Lmul_done | |
543 1: | |
544 /* If y is NaN, return y. */ | |
545 bnall a3, a6, .Lmul_returnx | |
546 slli a8, a3, 9 | |
547 beqz a8, .Lmul_returnx | |
548 | |
549 .Lmul_returny: | |
550 mov a2, a3 | |
551 | |
552 .Lmul_returnx: | |
553 /* Set the sign bit and return. */ | |
554 extui a7, a7, 31, 1 | |
555 slli a2, a2, 1 | |
556 ssai 1 | |
557 src a2, a7, a2 | |
558 j .Lmul_done | |
559 | |
560 .Lmul_ynan_or_inf: | |
561 /* If x is zero, return NaN. */ | |
562 slli a8, a2, 1 | |
563 bnez a8, .Lmul_returny | |
564 movi a7, 0x400000 /* make it a quiet NaN */ | |
565 or a2, a3, a7 | |
566 j .Lmul_done | |
567 | |
/* float __mulsf3 (float x, float y)
   Single-precision multiply.  Operands arrive as raw IEEE-754 bits
   in a2 (x) and a3 (y); the result is returned in a2.  a6 holds the
   exponent mask 0x7f800000 and a7 holds x ^ y (the result sign in
   bit 31).  The 24x24-bit mantissa product is computed as a full
   32x32 -> 64-bit multiply whose result lands in a2 (high) / a6
   (low), using whichever multiply option the configuration provides
   (MUL32_HIGH, MUL16, MUL32, MAC16), or the .Lmul_mulsi3 software
   routine when there is no multiply hardware.  Underflow produces a
   subnormal or, if the shift is too large, a signed zero.  */
568 .align 4 | |
569 .global __mulsf3 | |
570 .type __mulsf3, @function | |
571 __mulsf3: | |
572 #if __XTENSA_CALL0_ABI__ | |
573 leaf_entry sp, 32 | |
574 addi sp, sp, -32 | |
575 s32i a12, sp, 16 | |
576 s32i a13, sp, 20 | |
577 s32i a14, sp, 24 | |
578 s32i a15, sp, 28 | |
579 #elif XCHAL_NO_MUL | |
580 /* This is not really a leaf function; allocate enough stack space | |
581 to allow CALL12s to a helper function. */ | |
582 leaf_entry sp, 64 | |
583 #else | |
584 leaf_entry sp, 32 | |
585 #endif | |
586 movi a6, 0x7f800000 | |
587 | |
588 /* Get the sign of the result. */ | |
589 xor a7, a2, a3 | |
590 | |
591 /* Check for NaN and infinity. */ | |
592 ball a2, a6, .Lmul_xnan_or_inf | |
593 ball a3, a6, .Lmul_ynan_or_inf | |
594 | |
595 /* Extract the exponents. */ | |
596 extui a8, a2, 23, 8 | |
597 extui a9, a3, 23, 8 | |
598 | |
599 beqz a8, .Lmul_xexpzero | |
600 .Lmul_xnormalized: | |
601 beqz a9, .Lmul_yexpzero | |
602 .Lmul_ynormalized: | |
603 | |
604 /* Add the exponents. */ | |
605 add a8, a8, a9 | |
606 | |
607 /* Replace sign/exponent fields with explicit "1.0". */ | |
608 movi a10, 0xffffff | |
609 or a2, a2, a6 | |
610 and a2, a2, a10 | |
611 or a3, a3, a6 | |
612 and a3, a3, a10 | |
613 | |
614 /* Multiply 32x32 to 64 bits. The result ends up in a2/a6. */ | |
615 | |
616 #if XCHAL_HAVE_MUL32_HIGH | |
617 | |
618 mull a6, a2, a3 | |
619 muluh a2, a2, a3 | |
620 | |
621 #else | |
622 | |
623 /* Break the inputs into 16-bit chunks and compute 4 32-bit partial | |
624 products. These partial products are: | |
625 | |
626 0 xl * yl | |
627 | |
628 1 xl * yh | |
629 2 xh * yl | |
630 | |
631 3 xh * yh | |
632 | |
633 If using the Mul16 or Mul32 multiplier options, these input | |
634 chunks must be stored in separate registers. For Mac16, the | |
635 UMUL.AA.* opcodes can specify that the inputs come from either | |
636 half of the registers, so there is no need to shift them out | |
637 ahead of time. If there is no multiply hardware, the 16-bit | |
638 chunks can be extracted when setting up the arguments to the | |
639 separate multiply function. */ | |
640 | |
641 #if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL | |
642 /* Calling a separate multiply function will clobber a0 and requires | |
643 use of a8 as a temporary, so save those values now. (The function | |
644 uses a custom ABI so nothing else needs to be saved.) */ | |
645 s32i a0, sp, 0 | |
646 s32i a8, sp, 4 | |
647 #endif | |
648 | |
649 #if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32 | |
650 | |
651 #define a2h a4 | |
652 #define a3h a5 | |
653 | |
654 /* Get the high halves of the inputs into registers. */ | |
655 srli a2h, a2, 16 | |
656 srli a3h, a3, 16 | |
657 | |
658 #define a2l a2 | |
659 #define a3l a3 | |
660 | |
661 #if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16 | |
662 /* Clear the high halves of the inputs. This does not matter | |
663 for MUL16 because the high bits are ignored. */ | |
664 extui a2, a2, 0, 16 | |
665 extui a3, a3, 0, 16 | |
666 #endif | |
667 #endif /* MUL16 || MUL32 */ | |
668 | |
669 | |
670 #if XCHAL_HAVE_MUL16 | |
671 | |
672 #define do_mul(dst, xreg, xhalf, yreg, yhalf) \ | |
673 mul16u dst, xreg ## xhalf, yreg ## yhalf | |
674 | |
675 #elif XCHAL_HAVE_MUL32 | |
676 | |
677 #define do_mul(dst, xreg, xhalf, yreg, yhalf) \ | |
678 mull dst, xreg ## xhalf, yreg ## yhalf | |
679 | |
680 #elif XCHAL_HAVE_MAC16 | |
681 | |
682 /* The preprocessor insists on inserting a space when concatenating after | |
683 a period in the definition of do_mul below. These macros are a workaround | |
684 using underscores instead of periods when doing the concatenation. */ | |
685 #define umul_aa_ll umul.aa.ll | |
686 #define umul_aa_lh umul.aa.lh | |
687 #define umul_aa_hl umul.aa.hl | |
688 #define umul_aa_hh umul.aa.hh | |
689 | |
690 #define do_mul(dst, xreg, xhalf, yreg, yhalf) \ | |
691 umul_aa_ ## xhalf ## yhalf xreg, yreg; \ | |
692 rsr dst, ACCLO | |
693 | |
694 #else /* no multiply hardware */ | |
695 | |
696 #define set_arg_l(dst, src) \ | |
697 extui dst, src, 0, 16 | |
698 #define set_arg_h(dst, src) \ | |
699 srli dst, src, 16 | |
700 | |
701 #if __XTENSA_CALL0_ABI__ | |
702 #define do_mul(dst, xreg, xhalf, yreg, yhalf) \ | |
703 set_arg_ ## xhalf (a13, xreg); \ | |
704 set_arg_ ## yhalf (a14, yreg); \ | |
705 call0 .Lmul_mulsi3; \ | |
706 mov dst, a12 | |
707 #else | |
708 #define do_mul(dst, xreg, xhalf, yreg, yhalf) \ | |
709 set_arg_ ## xhalf (a14, xreg); \ | |
710 set_arg_ ## yhalf (a15, yreg); \ | |
711 call12 .Lmul_mulsi3; \ | |
712 mov dst, a14 | |
713 #endif /* __XTENSA_CALL0_ABI__ */ | |
714 | |
715 #endif /* no multiply hardware */ | |
716 | |
717 /* Add pp1 and pp2 into a6 with carry-out in a9. */ | |
718 do_mul(a6, a2, l, a3, h) /* pp 1 */ | |
719 do_mul(a11, a2, h, a3, l) /* pp 2 */ | |
720 movi a9, 0 | |
721 add a6, a6, a11 | |
722 bgeu a6, a11, 1f | |
723 addi a9, a9, 1 | |
724 1: | |
725 /* Shift the high half of a9/a6 into position in a9. Note that | |
726 this value can be safely incremented without any carry-outs. */ | |
727 ssai 16 | |
728 src a9, a9, a6 | |
729 | |
730 /* Compute the low word into a6. */ | |
731 do_mul(a11, a2, l, a3, l) /* pp 0 */ | |
732 sll a6, a6 | |
733 add a6, a6, a11 | |
734 bgeu a6, a11, 1f | |
735 addi a9, a9, 1 | |
736 1: | |
737 /* Compute the high word into a2. */ | |
738 do_mul(a2, a2, h, a3, h) /* pp 3 */ | |
739 add a2, a2, a9 | |
740 | |
741 #if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL | |
742 /* Restore values saved on the stack during the multiplication. */ | |
743 l32i a0, sp, 0 | |
744 l32i a8, sp, 4 | |
745 #endif | |
746 #endif /* ! XCHAL_HAVE_MUL32_HIGH */ | |
747 | |
748 /* Shift left by 9 bits, unless there was a carry-out from the | |
749 multiply, in which case, shift by 8 bits and increment the | |
750 exponent. */ | |
751 movi a4, 9 | |
752 srli a5, a2, 24 - 9 | |
753 beqz a5, 1f | |
754 addi a4, a4, -1 | |
755 addi a8, a8, 1 | |
756 1: ssl a4 | |
757 src a2, a2, a6 | |
758 sll a6, a6 | |
759 | |
760 /* Subtract the extra bias from the exponent sum (plus one to account | |
761 for the explicit "1.0" of the mantissa that will be added to the | |
762 exponent in the final result). */ | |
763 movi a4, 0x80 | |
764 sub a8, a8, a4 | |
765 | |
766 /* Check for over/underflow. The value in a8 is one less than the | |
767 final exponent, so values in the range 0..fd are OK here. */ | |
768 movi a4, 0xfe | |
769 bgeu a8, a4, .Lmul_overflow | |
770 | |
771 .Lmul_round: | |
772 /* Round. */ | |
773 bgez a6, .Lmul_rounded | |
774 addi a2, a2, 1 | |
775 slli a6, a6, 1 | |
776 beqz a6, .Lmul_exactlyhalf | |
777 | |
778 .Lmul_rounded: | |
779 /* Add the exponent to the mantissa. */ | |
780 slli a8, a8, 23 | |
781 add a2, a2, a8 | |
782 | |
783 .Lmul_addsign: | |
784 /* Add the sign bit. */ | |
785 srli a7, a7, 31 | |
786 slli a7, a7, 31 | |
787 or a2, a2, a7 | |
788 | |
789 .Lmul_done: | |
790 #if __XTENSA_CALL0_ABI__ | |
791 l32i a12, sp, 16 | |
792 l32i a13, sp, 20 | |
793 l32i a14, sp, 24 | |
794 l32i a15, sp, 28 | |
795 addi sp, sp, 32 | |
796 #endif | |
797 leaf_return | |
798 | |
799 .Lmul_exactlyhalf: | |
800 /* Round down to the nearest even value. */ | |
801 srli a2, a2, 1 | |
802 slli a2, a2, 1 | |
803 j .Lmul_rounded | |
804 | |
805 .Lmul_overflow: | |
806 bltz a8, .Lmul_underflow | |
807 /* Return +/- Infinity. */ | |
808 movi a8, 0xff | |
809 slli a2, a8, 23 | |
810 j .Lmul_addsign | |
811 | |
812 .Lmul_underflow: | |
813 /* Create a subnormal value, where the exponent field contains zero, | |
814 but the effective exponent is 1. The value of a8 is one less than | |
815 the actual exponent, so just negate it to get the shift amount. */ | |
816 neg a8, a8 | |
817 mov a9, a6 | |
818 ssr a8 | |
819 bgeui a8, 32, .Lmul_flush_to_zero | |
820 | |
821 /* Shift a2 right. Any bits that are shifted out of a2 are saved | |
822 in a6 (combined with the shifted-out bits currently in a6) for | |
823 rounding the result. */ | |
824 sll a6, a2 | |
825 srl a2, a2 | |
826 | |
827 /* Set the exponent to zero. */ | |
828 movi a8, 0 | |
829 | |
830 /* Pack any nonzero bits shifted out into a6. */ | |
831 beqz a9, .Lmul_round | |
832 movi a9, 1 | |
833 or a6, a6, a9 | |
834 j .Lmul_round | |
835 | |
836 .Lmul_flush_to_zero: | |
837 /* Return zero with the appropriate sign bit. */ | |
838 srli a2, a7, 31 | |
839 slli a2, a2, 31 | |
840 j .Lmul_done | |
841 | |
842 #if XCHAL_NO_MUL | |
843 | |
844 /* For Xtensa processors with no multiply hardware, this simplified | |
845 version of _mulsi3 is used for multiplying 16-bit chunks of | |
846 the floating-point mantissas. When using CALL0, this function | |
847 uses a custom ABI: the inputs are passed in a13 and a14, the | |
848 result is returned in a12, and a8 and a15 are clobbered. */ | |
849 .align 4 | |
850 .Lmul_mulsi3: | |
851 leaf_entry sp, 16 | |
852 .macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2 | |
/* Shift-and-add multiply: each loop iteration consumes the low 4
   bits of src1, selecting dst + src2*1/2/4/8 with movnez
   conditional moves instead of branches, then steps src1 right and
   src2 left by 4 bits.  The loop exits as soon as src1 is zero.  */
853 movi \dst, 0 | |
854 1: add \tmp1, \src2, \dst | |
855 extui \tmp2, \src1, 0, 1 | |
856 movnez \dst, \tmp1, \tmp2 | |
857 | |
858 do_addx2 \tmp1, \src2, \dst, \tmp1 | |
859 extui \tmp2, \src1, 1, 1 | |
860 movnez \dst, \tmp1, \tmp2 | |
861 | |
862 do_addx4 \tmp1, \src2, \dst, \tmp1 | |
863 extui \tmp2, \src1, 2, 1 | |
864 movnez \dst, \tmp1, \tmp2 | |
865 | |
866 do_addx8 \tmp1, \src2, \dst, \tmp1 | |
867 extui \tmp2, \src1, 3, 1 | |
868 movnez \dst, \tmp1, \tmp2 | |
869 | |
870 srli \src1, \src1, 4 | |
871 slli \src2, \src2, 4 | |
872 bnez \src1, 1b | |
873 .endm | |
874 #if __XTENSA_CALL0_ABI__ | |
875 mul_mulsi3_body a12, a13, a14, a15, a8 | |
876 #else | |
877 /* The result will be written into a2, so save that argument in a4. */ | |
878 mov a4, a2 | |
879 mul_mulsi3_body a2, a4, a3, a5, a6 | |
880 #endif | |
881 leaf_return | |
882 #endif /* XCHAL_NO_MUL */ | |
883 #endif /* L_mulsf3 */ | |
884 | |
885 #ifdef L_divsf3 | |
886 | |
887 /* Division */ | |
888 __divsf3_aux: | |
889 | |
890 /* Handle unusual cases (zeros, subnormals, NaNs and Infinities). | |
891 (This code is placed before the start of the function just to | |
892 keep it in range of the limited branch displacements.) */ | |
893 | |
894 .Ldiv_yexpzero: | |
895 /* Clear the sign bit of y. */ | |
896 slli a3, a3, 1 | |
897 srli a3, a3, 1 | |
898 | |
899 /* Check for division by zero. */ | |
900 beqz a3, .Ldiv_yzero | |
901 | |
902 /* Normalize y. Adjust the exponent in a9. */ | |
903 do_nsau a10, a3, a4, a5 | |
904 addi a10, a10, -8 | |
905 ssl a10 | |
906 sll a3, a3 | |
907 movi a9, 1 | |
908 sub a9, a9, a10 | |
909 j .Ldiv_ynormalized | |
910 | |
911 .Ldiv_yzero: | |
912 /* y is zero. Return NaN if x is also zero; otherwise, infinity. */ | |
913 slli a4, a2, 1 | |
914 srli a4, a4, 1 | |
915 srli a2, a7, 31 | |
916 slli a2, a2, 31 | |
917 or a2, a2, a6 | |
918 bnez a4, 1f | |
919 movi a4, 0x400000 /* make it a quiet NaN */ | |
920 or a2, a2, a4 | |
921 1: leaf_return | |
922 | |
923 .Ldiv_xexpzero: | |
924 /* Clear the sign bit of x. */ | |
925 slli a2, a2, 1 | |
926 srli a2, a2, 1 | |
927 | |
928 /* If x is zero, return zero. */ | |
929 beqz a2, .Ldiv_return_zero | |
930 | |
931 /* Normalize x. Adjust the exponent in a8. */ | |
932 do_nsau a10, a2, a4, a5 | |
933 addi a10, a10, -8 | |
934 ssl a10 | |
935 sll a2, a2 | |
936 movi a8, 1 | |
937 sub a8, a8, a10 | |
938 j .Ldiv_xnormalized | |
939 | |
940 .Ldiv_return_zero: | |
941 /* Return zero with the appropriate sign bit. */ | |
942 srli a2, a7, 31 | |
943 slli a2, a2, 31 | |
944 leaf_return | |
945 | |
946 .Ldiv_xnan_or_inf: | |
947 /* Set the sign bit of the result. */ | |
948 srli a7, a3, 31 | |
949 slli a7, a7, 31 | |
950 xor a2, a2, a7 | |
951 /* If y is NaN or Inf, return NaN. */ | |
952 bnall a3, a6, 1f | |
953 movi a4, 0x400000 /* make it a quiet NaN */ | |
954 or a2, a2, a4 | |
955 1: leaf_return | |
956 | |
957 .Ldiv_ynan_or_inf: | |
958 /* If y is Infinity, return zero. */ | |
959 slli a8, a3, 9 | |
960 beqz a8, .Ldiv_return_zero | |
961 /* y is NaN; return it. */ | |
962 mov a2, a3 | |
963 leaf_return | |
964 | |
	.align	4
	.global	__divsf3
	.type	__divsf3, @function

/* float __divsf3 (float x, float y)
   Single-precision soft-float division, round-to-nearest-even.
   In:   a2 = x, a3 = y (raw IEEE-754 binary32 bits)
   Out:  a2 = x / y
   Scratch: a4-a10; a7 carries the result sign (x ^ y) throughout.
   The zero/subnormal and NaN/Inf paths (.Ldiv_yexpzero, .Ldiv_xexpzero,
   .Ldiv_xnan_or_inf, .Ldiv_ynan_or_inf) are defined earlier in this
   file.  */
__divsf3:
	leaf_entry sp, 16
	movi	a6, 0x7f800000		/* exponent-field mask */

	/* Get the sign of the result. */
	xor	a7, a2, a3

	/* Check for NaN and infinity (all exponent bits set). */
	ball	a2, a6, .Ldiv_xnan_or_inf
	ball	a3, a6, .Ldiv_ynan_or_inf

	/* Extract the exponents. */
	extui	a8, a2, 23, 8
	extui	a9, a3, 23, 8

	beqz	a9, .Ldiv_yexpzero	/* y is zero or subnormal */
.Ldiv_ynormalized:
	beqz	a8, .Ldiv_xexpzero	/* x is zero or subnormal */
.Ldiv_xnormalized:

	/* Subtract the exponents. */
	sub	a8, a8, a9

	/* Replace sign/exponent fields with explicit "1.0". */
	movi	a10, 0xffffff
	or	a2, a2, a6
	and	a2, a2, a10
	or	a3, a3, a6
	and	a3, a3, a10

	/* The first digit of the mantissa division must be a one.
	   Shift x (and adjust the exponent) as needed to make this true. */
	bltu	a3, a2, 1f
	slli	a2, a2, 1
	addi	a8, a8, -1
1:
	/* Do the first subtraction and shift. */
	sub	a2, a2, a3
	slli	a2, a2, 1

	/* Put the quotient into a10. */
	movi	a10, 1

	/* Restoring division, one quotient bit per iteration, for 23 bits. */
	movi	a9, 23
#if XCHAL_HAVE_LOOPS
	loop	a9, .Ldiv_loopend	/* zero-overhead hardware loop */
#endif
.Ldiv_loop:
	/* Shift the quotient << 1. */
	slli	a10, a10, 1

	/* Is this digit a 0 or 1? */
	bltu	a2, a3, 1f

	/* Output a 1 and subtract. */
	addi	a10, a10, 1
	sub	a2, a2, a3

	/* Shift the dividend << 1. */
1:	slli	a2, a2, 1

#if !XCHAL_HAVE_LOOPS
	addi	a9, a9, -1
	bnez	a9, .Ldiv_loop
#endif
.Ldiv_loopend:

	/* Add the exponent bias (less one to account for the explicit "1.0"
	   of the mantissa that will be added to the exponent in the final
	   result). */
	addi	a8, a8, 0x7e

	/* Check for over/underflow.  The value in a8 is one less than the
	   final exponent, so values in the range 0..fd are OK here. */
	movi	a4, 0xfe
	bgeu	a8, a4, .Ldiv_overflow	/* unsigned: also catches a8 < 0 */

.Ldiv_round:
	/* Round to nearest.  The remainder (<< 1) is in a2. */
	bltu	a2, a3, .Ldiv_rounded	/* remainder < half the divisor */
	addi	a10, a10, 1		/* round up */
	beq	a2, a3, .Ldiv_exactlyhalf

.Ldiv_rounded:
	/* Add the exponent to the mantissa. */
	slli	a8, a8, 23
	add	a2, a10, a8

.Ldiv_addsign:
	/* Add the sign bit (bit 31 of a7). */
	srli	a7, a7, 31
	slli	a7, a7, 31
	or	a2, a2, a7
	leaf_return

.Ldiv_overflow:
	bltz	a8, .Ldiv_underflow
	/* Return +/- Infinity. */
	addi	a8, a4, 1	/* 0xff */
	slli	a2, a8, 23
	j	.Ldiv_addsign

.Ldiv_exactlyhalf:
	/* Remainder is exactly half the divisor.  Round even. */
	srli	a10, a10, 1
	slli	a10, a10, 1
	j	.Ldiv_rounded

.Ldiv_underflow:
	/* Create a subnormal value, where the exponent field contains zero,
	   but the effective exponent is 1.  The value of a8 is one less than
	   the actual exponent, so just negate it to get the shift amount. */
	neg	a8, a8
	ssr	a8
	bgeui	a8, 32, .Ldiv_flush_to_zero

	/* Shift a10 right.  Any bits that are shifted out of a10 are
	   saved in a6 for rounding the result. */
	sll	a6, a10
	srl	a10, a10

	/* Set the exponent to zero. */
	movi	a8, 0

	/* Pack any nonzero remainder (in a2) into a6 as a sticky bit. */
	beqz	a2, 1f
	movi	a9, 1
	or	a6, a6, a9

	/* Round a10 based on the bits shifted out into a6. */
1:	bgez	a6, .Ldiv_rounded	/* guard bit clear: round down */
	addi	a10, a10, 1
	slli	a6, a6, 1
	bnez	a6, .Ldiv_rounded	/* sticky bits set: round-up stands */
	srli	a10, a10, 1		/* exactly half: round to even */
	slli	a10, a10, 1
	j	.Ldiv_rounded

.Ldiv_flush_to_zero:
	/* Return zero with the appropriate sign bit. */
	srli	a2, a7, 31
	slli	a2, a2, 31
	leaf_return
1112 | |
1113 #endif /* L_divsf3 */ | |
1114 | |
1115 #ifdef L_cmpsf2 | |
1116 | |
1117 /* Equal and Not Equal */ | |
1118 | |
	.align	4
	.global	__eqsf2
	.global	__nesf2
	.set	__nesf2, __eqsf2
	.type	__eqsf2, @function

/* int __eqsf2 (float x, float y) -- also aliased as __nesf2.
   In:   a2 = x, a3 = y (raw IEEE-754 binary32 bits)
   Out:  a2 = 0 if x == y, nonzero otherwise.
   NaNs compare unequal (even to themselves); +0.0 == -0.0.  */
__eqsf2:
	leaf_entry sp, 16
	bne	a2, a3, 4f		/* different bit patterns */

	/* The bits are equal, but NaN != NaN.  Check the exponent. */
	movi	a6, 0x7f800000
	ball	a2, a6, 3f

	/* Equal. */
	movi	a2, 0
	leaf_return

	/* Not equal.  (NOTE(review): no visible branch targets this label;
	   apparently retained for layout -- confirm before removing.) */
2:	movi	a2, 1
	leaf_return

	/* Exponent is all ones: check if the mantissa is nonzero (NaN). */
3:	slli	a7, a2, 9
	j	5f

	/* Check if x and y are zero with different signs. */
4:	or	a7, a2, a3
	slli	a7, a7, 1	/* discard the sign bit */

	/* Equal if a7 == 0, where a7 is either abs(x | y) or the mantissa
	   of x when exponent(x) == 0xff and x == y (bitwise). */
5:	movi	a2, 0
	movi	a3, 1
	movnez	a2, a3, a7
	leaf_return
1154 | |
1155 | |
1156 /* Greater Than */ | |
1157 | |
	.align	4
	.global	__gtsf2
	.type	__gtsf2, @function

/* int __gtsf2 (float x, float y)
   In:   a2 = x, a3 = y.  Out: a2 = 1 if x > y, else 0.
   Returns 0 ("not greater") if either operand is a NaN.
   Shares the ordered-compare tail .Lle_cmp with __lesf2 below.  */
__gtsf2:
	leaf_entry sp, 16
	movi	a6, 0x7f800000
	ball	a2, a6, 2f		/* x may be NaN or Inf */
1:	bnall	a3, a6, .Lle_cmp	/* y is finite: ordered compare */

	/* Check if y is a NaN. */
	slli	a7, a3, 9
	beqz	a7, .Lle_cmp		/* Infinity, not NaN */
	movi	a2, 0
	leaf_return

	/* Check if x is a NaN. */
2:	slli	a7, a2, 9
	beqz	a7, 1b			/* Infinity, not NaN */
	movi	a2, 0
	leaf_return
1178 | |
1179 | |
1180 /* Less Than or Equal */ | |
1181 | |
	.align	4
	.global	__lesf2
	.type	__lesf2, @function

/* int __lesf2 (float x, float y)
   In:   a2 = x, a3 = y.  Out: a2 = 0 if x <= y, else 1.
   Returns 1 ("not less-or-equal") if either operand is a NaN.  */
__lesf2:
	leaf_entry sp, 16
	movi	a6, 0x7f800000
	ball	a2, a6, 2f		/* x may be NaN or Inf */
1:	bnall	a3, a6, .Lle_cmp	/* y is finite: ordered compare */

	/* Check if y is a NaN. */
	slli	a7, a3, 9
	beqz	a7, .Lle_cmp		/* Infinity, not NaN */
	movi	a2, 1
	leaf_return

	/* Check if x is a NaN. */
2:	slli	a7, a2, 9
	beqz	a7, 1b			/* Infinity, not NaN */
	movi	a2, 1
	leaf_return

	/* Ordered comparison, shared with __gtsf2:
	   returns a2 = 0 if x <= y, 1 if x > y.  */
.Lle_cmp:
	/* Check if x and y have different signs. */
	xor	a7, a2, a3
	bltz	a7, .Lle_diff_signs

	/* Check if x is negative. */
	bltz	a2, .Lle_xneg

	/* Both nonnegative: float order matches unsigned order of the
	   bit patterns.  Check if x <= y. */
	bltu	a3, a2, 5f
4:	movi	a2, 0
	leaf_return

.Lle_xneg:
	/* Both negative: bit-pattern order is reversed.  Check y <= x. */
	bgeu	a2, a3, 4b
5:	movi	a2, 1
	leaf_return

.Lle_diff_signs:
	bltz	a2, 4b		/* x negative, y positive: x <= y */

	/* x positive, y negative: x <= y only if both x and y are zero. */
	or	a7, a2, a3
	slli	a7, a7, 1
	movi	a2, 1
	movi	a3, 0
	moveqz	a2, a3, a7
	leaf_return
1232 | |
1233 | |
1234 /* Greater Than or Equal */ | |
1235 | |
	.align	4
	.global	__gesf2
	.type	__gesf2, @function

/* int __gesf2 (float x, float y)
   In:   a2 = x, a3 = y.  Out: a2 = 0 if x >= y, else -1.
   Returns -1 ("not greater-or-equal") if either operand is a NaN.
   Shares the ordered-compare tail .Llt_cmp with __ltsf2 below.  */
__gesf2:
	leaf_entry sp, 16
	movi	a6, 0x7f800000
	ball	a2, a6, 2f		/* x may be NaN or Inf */
1:	bnall	a3, a6, .Llt_cmp	/* y is finite: ordered compare */

	/* Check if y is a NaN. */
	slli	a7, a3, 9
	beqz	a7, .Llt_cmp		/* Infinity, not NaN */
	movi	a2, -1
	leaf_return

	/* Check if x is a NaN. */
2:	slli	a7, a2, 9
	beqz	a7, 1b			/* Infinity, not NaN */
	movi	a2, -1
	leaf_return
1256 | |
1257 | |
1258 /* Less Than */ | |
1259 | |
	.align	4
	.global	__ltsf2
	.type	__ltsf2, @function

/* int __ltsf2 (float x, float y)
   In:   a2 = x, a3 = y.  Out: a2 = -1 if x < y, else 0.
   Returns 0 ("not less") if either operand is a NaN.  */
__ltsf2:
	leaf_entry sp, 16
	movi	a6, 0x7f800000
	ball	a2, a6, 2f		/* x may be NaN or Inf */
1:	bnall	a3, a6, .Llt_cmp	/* y is finite: ordered compare */

	/* Check if y is a NaN. */
	slli	a7, a3, 9
	beqz	a7, .Llt_cmp		/* Infinity, not NaN */
	movi	a2, 0
	leaf_return

	/* Check if x is a NaN. */
2:	slli	a7, a2, 9
	beqz	a7, 1b			/* Infinity, not NaN */
	movi	a2, 0
	leaf_return

	/* Ordered comparison, shared with __gesf2:
	   returns a2 = -1 if x < y, 0 otherwise.  */
.Llt_cmp:
	/* Check if x and y have different signs. */
	xor	a7, a2, a3
	bltz	a7, .Llt_diff_signs

	/* Check if x is negative. */
	bltz	a2, .Llt_xneg

	/* Both nonnegative: float order matches unsigned order of the
	   bit patterns.  Check if x < y. */
	bgeu	a2, a3, 5f
4:	movi	a2, -1
	leaf_return

.Llt_xneg:
	/* Both negative: bit-pattern order is reversed.  Check y < x. */
	bltu	a3, a2, 4b
5:	movi	a2, 0
	leaf_return

.Llt_diff_signs:
	bgez	a2, 5b		/* x positive, y negative: not less */

	/* x negative, y positive: x < y unless both x and y are zero. */
	or	a7, a2, a3
	slli	a7, a7, 1
	movi	a2, 0
	movi	a3, -1
	movnez	a2, a3, a7
	leaf_return
1310 | |
1311 | |
1312 /* Unordered */ | |
1313 | |
	.align	4
	.global	__unordsf2
	.type	__unordsf2, @function

/* int __unordsf2 (float x, float y)
   In:   a2 = x, a3 = y.
   Out:  a2 = 1 if either operand is a NaN, else 0.  */
__unordsf2:
	leaf_entry sp, 16
	movi	a6, 0x7f800000
	ball	a2, a6, 3f		/* x has all exponent bits set */
1:	ball	a3, a6, 4f		/* y has all exponent bits set */
2:	movi	a2, 0
	leaf_return

	/* x may be a NaN: check its mantissa. */
3:	slli	a7, a2, 9
	beqz	a7, 1b			/* Infinity, not NaN */
	movi	a2, 1
	leaf_return

	/* y may be a NaN: check its mantissa. */
4:	slli	a7, a3, 9
	beqz	a7, 2b			/* Infinity, not NaN */
	movi	a2, 1
	leaf_return
1334 | |
1335 #endif /* L_cmpsf2 */ | |
1336 | |
1337 #ifdef L_fixsfsi | |
1338 | |
	.align	4
	.global	__fixsfsi
	.type	__fixsfsi, @function

/* int __fixsfsi (float x)
   Convert single-precision float to signed 32-bit int, truncating
   toward zero.  Out-of-range values saturate to 0x7fffffff (positive)
   or 0x80000000 (negative); NaN converts to 0x7fffffff.
   In:  a2 = x (raw bits).  Out: a2 = (int) x.  */
__fixsfsi:
	leaf_entry sp, 16

	/* Check for NaN and Infinity. */
	movi	a6, 0x7f800000
	ball	a2, a6, .Lfixsfsi_nan_or_inf

	/* Extract the exponent and check if 0 < (exp - 0x7e) < 32. */
	extui	a4, a2, 23, 8
	addi	a4, a4, -0x7e
	bgei	a4, 32, .Lfixsfsi_maxint	/* |x| >= 2^31 */
	blti	a4, 1, .Lfixsfsi_zero		/* |x| < 1 */

	/* Add explicit "1.0" and shift << 8 (mantissa to the top bits). */
	or	a7, a2, a6
	slli	a5, a7, 8

	/* Shift back to the right, based on the exponent. */
	ssl	a4		/* shift by 32 - a4 */
	srl	a5, a5

	/* Negate the result if sign != 0 (a7 bit 31 is still the sign). */
	neg	a2, a5
	movgez	a2, a5, a7
	leaf_return

.Lfixsfsi_nan_or_inf:
	/* Handle Infinity and NaN. */
	slli	a4, a2, 9
	beqz	a4, .Lfixsfsi_maxint

	/* Translate NaN to +maxint by forcing a nonnegative selector. */
	movi	a2, 0

.Lfixsfsi_maxint:
	slli	a4, a6, 8	/* 0x80000000 */
	addi	a5, a4, -1	/* 0x7fffffff */
	movgez	a4, a5, a2	/* pick by the sign of x */
	mov	a2, a4
	leaf_return

.Lfixsfsi_zero:
	movi	a2, 0
	leaf_return
1386 | |
1387 #endif /* L_fixsfsi */ | |
1388 | |
1389 #ifdef L_fixsfdi | |
1390 | |
	.align	4
	.global	__fixsfdi
	.type	__fixsfdi, @function

/* long long __fixsfdi (float x)
   Convert single-precision float to signed 64-bit int, truncating
   toward zero.  Out-of-range values saturate to the maximum positive
   or minimum negative value; NaN converts to the maximum positive.
   In:  a2 = x.  Out: xh:xl (a2/a3 per endianness, see file head)
   = (long long) x.  */
__fixsfdi:
	leaf_entry sp, 16

	/* Check for NaN and Infinity. */
	movi	a6, 0x7f800000
	ball	a2, a6, .Lfixsfdi_nan_or_inf

	/* Extract the exponent and check if 0 < (exp - 0x7e) < 64. */
	extui	a4, a2, 23, 8
	addi	a4, a4, -0x7e
	bgei	a4, 64, .Lfixsfdi_maxint	/* |x| >= 2^63 */
	blti	a4, 1, .Lfixsfdi_zero		/* |x| < 1 */

	/* Add explicit "1.0" and shift << 8. */
	or	a7, a2, a6
	slli	xh, a7, 8

	/* Shift back to the right, based on the exponent. */
	ssl	a4		/* shift by 64 - a4 */
	bgei	a4, 32, .Lfixsfdi_smallshift
	srl	xl, xh
	movi	xh, 0

.Lfixsfdi_shifted:
	/* Negate the 64-bit result if sign != 0. */
	bgez	a7, 1f
	neg	xl, xl
	neg	xh, xh
	beqz	xl, 1f
	addi	xh, xh, -1	/* propagate the borrow to the high word */
1:	leaf_return

.Lfixsfdi_smallshift:
	/* Shift amount < 32: the result spans both words. */
	movi	xl, 0
	sll	xl, xh
	srl	xh, xh
	j	.Lfixsfdi_shifted

.Lfixsfdi_nan_or_inf:
	/* Handle Infinity and NaN. */
	slli	a4, a2, 9
	beqz	a4, .Lfixsfdi_maxint

	/* Translate NaN to +maxint by forcing a nonnegative selector. */
	movi	a2, 0

.Lfixsfdi_maxint:
	slli	a7, a6, 8	/* 0x80000000 */
	bgez	a2, 1f
	mov	xh, a7		/* 0x8000000000000000 */
	movi	xl, 0
	leaf_return

1:	addi	xh, a7, -1	/* 0x7fffffff */
	movi	xl, -1		/* -> 0x7fffffffffffffff */
	leaf_return

.Lfixsfdi_zero:
	movi	xh, 0
	movi	xl, 0
	leaf_return
1455 | |
1456 #endif /* L_fixsfdi */ | |
1457 | |
1458 #ifdef L_fixunssfsi | |
1459 | |
	.align	4
	.global	__fixunssfsi
	.type	__fixunssfsi, @function

/* unsigned __fixunssfsi (float x)
   Convert single-precision float to unsigned 32-bit int, truncating
   toward zero.  Values >= 2^32 saturate to 0xffffffff; NaN converts
   to 0xffffffff; large negative values yield 0x80000000.  (C leaves
   the result for negative input undefined; small negative magnitudes
   follow the signed path and are negated.)
   In:  a2 = x.  Out: a2 = (unsigned) x.  */
__fixunssfsi:
	leaf_entry sp, 16

	/* Check for NaN and Infinity. */
	movi	a6, 0x7f800000
	ball	a2, a6, .Lfixunssfsi_nan_or_inf

	/* Extract the exponent and check if 0 <= (exp - 0x7f) < 32. */
	extui	a4, a2, 23, 8
	addi	a4, a4, -0x7f
	bgei	a4, 32, .Lfixunssfsi_maxint	/* |x| >= 2^32 */
	bltz	a4, .Lfixunssfsi_zero		/* |x| < 1 */

	/* Add explicit "1.0" and shift << 8. */
	or	a7, a2, a6
	slli	a5, a7, 8

	/* Shift back to the right, based on the exponent. */
	addi	a4, a4, 1
	beqi	a4, 32, .Lfixunssfsi_bigexp	/* shift count would be 0 */
	ssl	a4		/* shift by 32 - a4 */
	srl	a5, a5

	/* Negate the result if sign != 0. */
	neg	a2, a5
	movgez	a2, a5, a7
	leaf_return

.Lfixunssfsi_nan_or_inf:
	/* Handle Infinity and NaN. */
	slli	a4, a2, 9
	beqz	a4, .Lfixunssfsi_maxint

	/* Translate NaN to 0xffffffff. */
	movi	a2, -1
	leaf_return

.Lfixunssfsi_maxint:
	slli	a4, a6, 8	/* 0x80000000 (negative overflow) */
	movi	a5, -1		/* 0xffffffff (positive overflow) */
	movgez	a4, a5, a2	/* pick by the sign of x */
	mov	a2, a4
	leaf_return

.Lfixunssfsi_zero:
	movi	a2, 0
	leaf_return

.Lfixunssfsi_bigexp:
	/* Maximum in-range exponent (2^31 <= |x| < 2^32): the mantissa
	   in a5 is already in position. */
	bltz	a2, 1f
	mov	a2, a5		/* no shift needed */
	leaf_return

	/* Return 0x80000000 if negative. */
1:	slli	a2, a6, 8
	leaf_return
1520 | |
1521 #endif /* L_fixunssfsi */ | |
1522 | |
1523 #ifdef L_fixunssfdi | |
1524 | |
	.align	4
	.global	__fixunssfdi
	.type	__fixunssfdi, @function

/* unsigned long long __fixunssfdi (float x)
   Convert single-precision float to unsigned 64-bit int, truncating
   toward zero.  Values >= 2^64 saturate to all-ones; NaN converts to
   all-ones; large negative values yield 0x8000000000000000.  (C
   leaves the result for negative input undefined.)
   In:  a2 = x.  Out: xh:xl = (unsigned long long) x.  */
__fixunssfdi:
	leaf_entry sp, 16

	/* Check for NaN and Infinity. */
	movi	a6, 0x7f800000
	ball	a2, a6, .Lfixunssfdi_nan_or_inf

	/* Extract the exponent and check if 0 <= (exp - 0x7f) < 64. */
	extui	a4, a2, 23, 8
	addi	a4, a4, -0x7f
	bgei	a4, 64, .Lfixunssfdi_maxint	/* |x| >= 2^64 */
	bltz	a4, .Lfixunssfdi_zero		/* |x| < 1 */

	/* Add explicit "1.0" and shift << 8. */
	or	a7, a2, a6
	slli	xh, a7, 8

	/* Shift back to the right, based on the exponent. */
	addi	a4, a4, 1
	beqi	a4, 64, .Lfixunssfdi_bigexp	/* shift count would be 0 */
	ssl	a4		/* shift by 64 - a4 */
	bgei	a4, 32, .Lfixunssfdi_smallshift
	srl	xl, xh
	movi	xh, 0

.Lfixunssfdi_shifted:
	/* Negate the 64-bit result if sign != 0. */
	bgez	a7, 1f
	neg	xl, xl
	neg	xh, xh
	beqz	xl, 1f
	addi	xh, xh, -1	/* propagate the borrow to the high word */
1:	leaf_return

.Lfixunssfdi_smallshift:
	/* Shift amount < 32: the result spans both words. */
	movi	xl, 0
	src	xl, xh, xl
	srl	xh, xh
	j	.Lfixunssfdi_shifted

.Lfixunssfdi_nan_or_inf:
	/* Handle Infinity and NaN. */
	slli	a4, a2, 9
	beqz	a4, .Lfixunssfdi_maxint

	/* Translate NaN to 0xffffffff.... */
1:	movi	xh, -1
	movi	xl, -1
	leaf_return

.Lfixunssfdi_maxint:
	bgez	a2, 1b		/* positive overflow: all ones */
2:	slli	xh, a6, 8	/* 0x8000000000000000 */
	movi	xl, 0
	leaf_return

.Lfixunssfdi_zero:
	movi	xh, 0
	movi	xl, 0
	leaf_return

.Lfixunssfdi_bigexp:
	/* Maximum in-range exponent (2^63 <= |x| < 2^64): xh already
	   holds the mantissa in position. */
	bltz	a7, 2b		/* negative: return 0x8000000000000000 */
	movi	xl, 0
	leaf_return		/* no shift needed */
1594 | |
1595 #endif /* L_fixunssfdi */ | |
1596 | |
1597 #ifdef L_floatsisf | |
1598 | |
	.align	4
	.global	__floatunsisf
	.type	__floatunsisf, @function

/* float __floatunsisf (unsigned int x)
   Convert unsigned 32-bit int to single-precision float,
   round-to-nearest-even.  Shares its tail with __floatsisf below.
   In:  a2 = x.  Out: a2 = (float) x.  */
__floatunsisf:
	leaf_entry sp, 16
	beqz	a2, .Lfloatsisf_return	/* zero converts to +0.0 */

	/* Set the sign to zero and jump to the floatsisf code. */
	movi	a7, 0
	j	.Lfloatsisf_normalize

	.align	4
	.global	__floatsisf
	.type	__floatsisf, @function

/* float __floatsisf (int x)
   Convert signed 32-bit int to single-precision float,
   round-to-nearest-even.
   In:  a2 = x.  Out: a2 = (float) x.  */
__floatsisf:
	leaf_entry sp, 16

	/* Check for zero. */
	beqz	a2, .Lfloatsisf_return

	/* Save the sign. */
	extui	a7, a2, 31, 1

	/* Get the absolute value. */
#if XCHAL_HAVE_ABS
	abs	a2, a2
#else
	neg	a4, a2
	movltz	a2, a4, a2
#endif

.Lfloatsisf_normalize:
	/* Normalize with the first 1 bit in the msb.
	   a4 = leading-zero count (via do_nsau). */
	do_nsau	a4, a2, a5, a6
	ssl	a4
	sll	a5, a2

	/* Shift the mantissa into position, with rounding bits in a6. */
	srli	a2, a5, 8
	slli	a6, a5, (32 - 8)

	/* Set the exponent. */
	movi	a5, 0x9d	/* 0x7e + 31 */
	sub	a5, a5, a4
	slli	a5, a5, 23
	add	a2, a2, a5

	/* Add the sign. */
	slli	a7, a7, 31
	or	a2, a2, a7

	/* Round up if the leftover fraction is >= 1/2 (msb of a6 set). */
	bgez	a6, .Lfloatsisf_return
	addi	a2, a2, 1	/* Overflow to the exponent is OK. */

	/* Check if the leftover fraction is exactly 1/2. */
	slli	a6, a6, 1
	beqz	a6, .Lfloatsisf_exactlyhalf

.Lfloatsisf_return:
	leaf_return

.Lfloatsisf_exactlyhalf:
	/* Round down to the nearest even value. */
	srli	a2, a2, 1
	slli	a2, a2, 1
	leaf_return
1666 | |
1667 #endif /* L_floatsisf */ | |
1668 | |
1669 #ifdef L_floatdisf | |
1670 | |
	.align	4
	.global	__floatundisf
	.type	__floatundisf, @function

/* float __floatundisf (unsigned long long x)
   Convert unsigned 64-bit int to single-precision float,
   round-to-nearest-even.  Shares its tail with __floatdisf below.
   In:  xh:xl = x.  Out: a2 = (float) x.  */
__floatundisf:
	leaf_entry sp, 16

	/* Check for zero. */
	or	a4, xh, xl
	beqz	a4, 2f		/* a2 is already a zero word: +0.0 */

	/* Set the sign to zero and jump to the floatdisf code. */
	movi	a7, 0
	j	.Lfloatdisf_normalize

	.align	4
	.global	__floatdisf
	.type	__floatdisf, @function

/* float __floatdisf (long long x)
   Convert signed 64-bit int to single-precision float,
   round-to-nearest-even.
   In:  xh:xl = x.  Out: a2 = (float) x.  */
__floatdisf:
	leaf_entry sp, 16

	/* Check for zero. */
	or	a4, xh, xl
	beqz	a4, 2f

	/* Save the sign. */
	extui	a7, xh, 31, 1

	/* Get the absolute value (64-bit negate). */
	bgez	xh, .Lfloatdisf_normalize
	neg	xl, xl
	neg	xh, xh
	beqz	xl, .Lfloatdisf_normalize
	addi	xh, xh, -1	/* propagate the borrow to the high word */

.Lfloatdisf_normalize:
	/* Normalize with the first 1 bit in the msb of xh.
	   a4 = leading-zero count of the 64-bit value. */
	beqz	xh, .Lfloatdisf_bigshift
	do_nsau	a4, xh, a5, a6
	ssl	a4
	src	xh, xh, xl
	sll	xl, xl

.Lfloatdisf_shifted:
	/* Shift the mantissa into position, with rounding bits in a6.
	   Bits shifted out of xl are folded into a6 as a sticky bit. */
	ssai	8
	sll	a5, xl
	src	a6, xh, xl
	srl	xh, xh
	beqz	a5, 1f
	movi	a5, 1
	or	a6, a6, a5
1:
	/* Set the exponent. */
	movi	a5, 0xbd	/* 0x7e + 63 */
	sub	a5, a5, a4
	slli	a5, a5, 23
	add	a2, xh, a5

	/* Add the sign. */
	slli	a7, a7, 31
	or	a2, a2, a7

	/* Round up if the leftover fraction is >= 1/2 (msb of a6 set). */
	bgez	a6, 2f
	addi	a2, a2, 1	/* Overflow to the exponent is OK. */

	/* Check if the leftover fraction is exactly 1/2. */
	slli	a6, a6, 1
	beqz	a6, .Lfloatdisf_exactlyhalf
2:	leaf_return

.Lfloatdisf_bigshift:
	/* xh is zero.  Normalize with first 1 bit of xl in the msb of xh. */
	do_nsau	a4, xl, a5, a6
	ssl	a4
	sll	xh, xl
	movi	xl, 0
	addi	a4, a4, 32	/* account for the 32 zero bits of xh */
	j	.Lfloatdisf_shifted

.Lfloatdisf_exactlyhalf:
	/* Round down to the nearest even value. */
	srli	a2, a2, 1
	slli	a2, a2, 1
	leaf_return
1756 | |
1757 #endif /* L_floatdisf */ |