comparison libgcc/config/libbid/bid64_sqrt.c @ 0:a06113de4d67

first commit
author kent <kent@cr.ie.u-ryukyu.ac.jp>
date Fri, 17 Jul 2009 14:47:48 +0900
parents
children 04ced10e8804
comparison
equal deleted inserted replaced
-1:000000000000 0:a06113de4d67
1 /* Copyright (C) 2007, 2009 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify it under
6 the terms of the GNU General Public License as published by the Free
7 Software Foundation; either version 3, or (at your option) any later
8 version.
9
10 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11 WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 for more details.
14
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
18
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
23
24 /*****************************************************************************
25 * BID64 square root
26 *****************************************************************************
27 *
28 * Algorithm description:
29 *
30 * if(exponent_x is odd)
31 * scale coefficient_x by 10, adjust exponent
32 * - get lower estimate for number of digits in coefficient_x
33 * - scale coefficient x to between 31 and 33 decimal digits
34 * - in parallel, check for exact case and return if true
35 * - get high part of result coefficient using double precision sqrt
36 * - compute remainder and refine coefficient in one iteration (which
37 * modifies it by at most 1)
38 * - result exponent is easy to compute from the adjusted arg. exponent
39 *
40 ****************************************************************************/
41
42 #include "bid_internal.h"
43 #include "bid_sqrt_macros.h"
44 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
45 #include <fenv.h>
46
47 #define FE_ALL_FLAGS FE_INVALID|FE_DIVBYZERO|FE_OVERFLOW|FE_UNDERFLOW|FE_INEXACT
48 #endif
49
50 extern double sqrt (double);
51
52 #if DECIMAL_CALL_BY_REFERENCE
53
54 void
55 bid64_sqrt (UINT64 * pres,
56 UINT64 *
57 px _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
58 _EXC_INFO_PARAM) {
59 UINT64 x;
60 #else
61
62 UINT64
63 bid64_sqrt (UINT64 x _RND_MODE_PARAM _EXC_FLAGS_PARAM
64 _EXC_MASKS_PARAM _EXC_INFO_PARAM) {
65 #endif
66 UINT128 CA, CT;
67 UINT64 sign_x, coefficient_x;
68 UINT64 Q, Q2, A10, C4, R, R2, QE, res;
69 SINT64 D;
70 int_double t_scale;
71 int_float tempx;
72 double da, dq, da_h, da_l, dqe;
73 int exponent_x, exponent_q, bin_expon_cx;
74 int digits_x;
75 int scale;
76 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
77 fexcept_t binaryflags = 0;
78 #endif
79
80 #if DECIMAL_CALL_BY_REFERENCE
81 #if !DECIMAL_GLOBAL_ROUNDING
82 _IDEC_round rnd_mode = *prnd_mode;
83 #endif
84 x = *px;
85 #endif
86
87 // unpack arguments, check for NaN or Infinity
88 if (!unpack_BID64 (&sign_x, &exponent_x, &coefficient_x, x)) {
89 // x is Inf. or NaN or 0
90 if ((x & INFINITY_MASK64) == INFINITY_MASK64) {
91 res = coefficient_x;
92 if ((coefficient_x & SSNAN_MASK64) == SINFINITY_MASK64) // -Infinity
93 {
94 res = NAN_MASK64;
95 #ifdef SET_STATUS_FLAGS
96 __set_status_flags (pfpsf, INVALID_EXCEPTION);
97 #endif
98 }
99 #ifdef SET_STATUS_FLAGS
100 if ((x & SNAN_MASK64) == SNAN_MASK64) // sNaN
101 __set_status_flags (pfpsf, INVALID_EXCEPTION);
102 #endif
103 BID_RETURN (res & QUIET_MASK64);
104 }
105 // x is 0
106 exponent_x = (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1;
107 res = sign_x | (((UINT64) exponent_x) << 53);
108 BID_RETURN (res);
109 }
110 // x<0?
111 if (sign_x && coefficient_x) {
112 res = NAN_MASK64;
113 #ifdef SET_STATUS_FLAGS
114 __set_status_flags (pfpsf, INVALID_EXCEPTION);
115 #endif
116 BID_RETURN (res);
117 }
118 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
119 (void) fegetexceptflag (&binaryflags, FE_ALL_FLAGS);
120 #endif
121 //--- get number of bits in the coefficient of x ---
122 tempx.d = (float) coefficient_x;
123 bin_expon_cx = ((tempx.i >> 23) & 0xff) - 0x7f;
124 digits_x = estimate_decimal_digits[bin_expon_cx];
125 // add test for range
126 if (coefficient_x >= power10_index_binexp[bin_expon_cx])
127 digits_x++;
128
129 A10 = coefficient_x;
130 if (exponent_x & 1) {
131 A10 = (A10 << 2) + A10;
132 A10 += A10;
133 }
134
135 dqe = sqrt ((double) A10);
136 QE = (UINT32) dqe;
137 if (QE * QE == A10) {
138 res =
139 very_fast_get_BID64 (0, (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1,
140 QE);
141 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
142 (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
143 #endif
144 BID_RETURN (res);
145 }
146 // if exponent is odd, scale coefficient by 10
147 scale = 31 - digits_x;
148 exponent_q = exponent_x - scale;
149 scale += (exponent_q & 1); // exp. bias is even
150
151 CT = power10_table_128[scale];
152 __mul_64x128_short (CA, coefficient_x, CT);
153
154 // 2^64
155 t_scale.i = 0x43f0000000000000ull;
156 // convert CA to DP
157 da_h = CA.w[1];
158 da_l = CA.w[0];
159 da = da_h * t_scale.d + da_l;
160
161 dq = sqrt (da);
162
163 Q = (UINT64) dq;
164
165 // get sign(sqrt(CA)-Q)
166 R = CA.w[0] - Q * Q;
167 R = ((SINT64) R) >> 63;
168 D = R + R + 1;
169
170 exponent_q = (exponent_q + DECIMAL_EXPONENT_BIAS) >> 1;
171
172 #ifdef SET_STATUS_FLAGS
173 __set_status_flags (pfpsf, INEXACT_EXCEPTION);
174 #endif
175
176 #ifndef IEEE_ROUND_NEAREST
177 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
178 if (!((rnd_mode) & 3)) {
179 #endif
180 #endif
181
182 // midpoint to check
183 Q2 = Q + Q + D;
184 C4 = CA.w[0] << 2;
185
186 // get sign(-sqrt(CA)+Midpoint)
187 R2 = Q2 * Q2 - C4;
188 R2 = ((SINT64) R2) >> 63;
189
190 // adjust Q if R!=R2
191 Q += (D & (R ^ R2));
192 #ifndef IEEE_ROUND_NEAREST
193 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
194 } else {
195 C4 = CA.w[0];
196 Q += D;
197 if ((SINT64) (Q * Q - C4) > 0)
198 Q--;
199 if (rnd_mode == ROUNDING_UP)
200 Q++;
201 }
202 #endif
203 #endif
204
205 res = fast_get_BID64 (0, exponent_q, Q);
206 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
207 (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
208 #endif
209 BID_RETURN (res);
210 }
211
212
213 TYPE0_FUNCTION_ARG1 (UINT64, bid64q_sqrt, x)
214
215 UINT256 M256, C4, C8;
216 UINT128 CX, CX2, A10, S2, T128, CS, CSM, CS2, C256, CS1,
217 mul_factor2_long = { {0x0ull, 0x0ull} }, QH, Tmp, TP128, Qh, Ql;
218 UINT64 sign_x, Carry, B10, res, mul_factor, mul_factor2 = 0x0ull, CS0;
219 SINT64 D;
220 int_float fx, f64;
221 int exponent_x, bin_expon_cx, done = 0;
222 int digits, scale, exponent_q = 0, exact = 1, amount, extra_digits;
223 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
224 fexcept_t binaryflags = 0;
225 #endif
226
227 // unpack arguments, check for NaN or Infinity
228 if (!unpack_BID128_value (&sign_x, &exponent_x, &CX, x)) {
229 res = CX.w[1];
230 // NaN ?
231 if ((x.w[1] & 0x7c00000000000000ull) == 0x7c00000000000000ull) {
232 #ifdef SET_STATUS_FLAGS
233 if ((x.w[1] & 0x7e00000000000000ull) == 0x7e00000000000000ull) // sNaN
234 __set_status_flags (pfpsf, INVALID_EXCEPTION);
235 #endif
236 Tmp.w[1] = (CX.w[1] & 0x00003fffffffffffull);
237 Tmp.w[0] = CX.w[0];
238 TP128 = reciprocals10_128[18];
239 __mul_128x128_full (Qh, Ql, Tmp, TP128);
240 amount = recip_scale[18];
241 __shr_128 (Tmp, Qh, amount);
242 res = (CX.w[1] & 0xfc00000000000000ull) | Tmp.w[0];
243 BID_RETURN (res);
244 }
245 // x is Infinity?
246 if ((x.w[1] & 0x7800000000000000ull) == 0x7800000000000000ull) {
247 if (sign_x) {
248 // -Inf, return NaN
249 res = 0x7c00000000000000ull;
250 #ifdef SET_STATUS_FLAGS
251 __set_status_flags (pfpsf, INVALID_EXCEPTION);
252 #endif
253 }
254 BID_RETURN (res);
255 }
256 // x is 0 otherwise
257
258 exponent_x =
259 ((exponent_x - DECIMAL_EXPONENT_BIAS_128) >> 1) +
260 DECIMAL_EXPONENT_BIAS;
261 if (exponent_x < 0)
262 exponent_x = 0;
263 if (exponent_x > DECIMAL_MAX_EXPON_64)
264 exponent_x = DECIMAL_MAX_EXPON_64;
265 //res= sign_x | (((UINT64)exponent_x)<<53);
266 res = get_BID64 (sign_x, exponent_x, 0, rnd_mode, pfpsf);
267 BID_RETURN (res);
268 }
269 if (sign_x) {
270 res = 0x7c00000000000000ull;
271 #ifdef SET_STATUS_FLAGS
272 __set_status_flags (pfpsf, INVALID_EXCEPTION);
273 #endif
274 BID_RETURN (res);
275 }
276 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
277 (void) fegetexceptflag (&binaryflags, FE_ALL_FLAGS);
278 #endif
279
280 // 2^64
281 f64.i = 0x5f800000;
282
283 // fx ~ CX
284 fx.d = (float) CX.w[1] * f64.d + (float) CX.w[0];
285 bin_expon_cx = ((fx.i >> 23) & 0xff) - 0x7f;
286 digits = estimate_decimal_digits[bin_expon_cx];
287
288 A10 = CX;
289 if (exponent_x & 1) {
290 A10.w[1] = (CX.w[1] << 3) | (CX.w[0] >> 61);
291 A10.w[0] = CX.w[0] << 3;
292 CX2.w[1] = (CX.w[1] << 1) | (CX.w[0] >> 63);
293 CX2.w[0] = CX.w[0] << 1;
294 __add_128_128 (A10, A10, CX2);
295 }
296
297 C256.w[1] = A10.w[1];
298 C256.w[0] = A10.w[0];
299 CS.w[0] = short_sqrt128 (A10);
300 CS.w[1] = 0;
301 mul_factor = 0;
302 // check for exact result
303 if (CS.w[0] < 10000000000000000ull) {
304 if (CS.w[0] * CS.w[0] == A10.w[0]) {
305 __sqr64_fast (S2, CS.w[0]);
306 if (S2.w[1] == A10.w[1]) // && S2.w[0]==A10.w[0])
307 {
308 res =
309 get_BID64 (0,
310 ((exponent_x - DECIMAL_EXPONENT_BIAS_128) >> 1) +
311 DECIMAL_EXPONENT_BIAS, CS.w[0], rnd_mode, pfpsf);
312 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
313 (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
314 #endif
315 BID_RETURN (res);
316 }
317 }
318 if (CS.w[0] >= 1000000000000000ull) {
319 done = 1;
320 exponent_q = exponent_x;
321 C256.w[1] = A10.w[1];
322 C256.w[0] = A10.w[0];
323 }
324 #ifdef SET_STATUS_FLAGS
325 __set_status_flags (pfpsf, INEXACT_EXCEPTION);
326 #endif
327 exact = 0;
328 } else {
329 B10 = 0x3333333333333334ull;
330 __mul_64x64_to_128_full (CS2, CS.w[0], B10);
331 CS0 = CS2.w[1] >> 1;
332 if (CS.w[0] != ((CS0 << 3) + (CS0 << 1))) {
333 #ifdef SET_STATUS_FLAGS
334 __set_status_flags (pfpsf, INEXACT_EXCEPTION);
335 #endif
336 exact = 0;
337 }
338 done = 1;
339 CS.w[0] = CS0;
340 exponent_q = exponent_x + 2;
341 mul_factor = 10;
342 mul_factor2 = 100;
343 if (CS.w[0] >= 10000000000000000ull) {
344 __mul_64x64_to_128_full (CS2, CS.w[0], B10);
345 CS0 = CS2.w[1] >> 1;
346 if (CS.w[0] != ((CS0 << 3) + (CS0 << 1))) {
347 #ifdef SET_STATUS_FLAGS
348 __set_status_flags (pfpsf, INEXACT_EXCEPTION);
349 #endif
350 exact = 0;
351 }
352 exponent_q += 2;
353 CS.w[0] = CS0;
354 mul_factor = 100;
355 mul_factor2 = 10000;
356 }
357 if (exact) {
358 CS0 = CS.w[0] * mul_factor;
359 __sqr64_fast (CS1, CS0)
360 if ((CS1.w[0] != A10.w[0]) || (CS1.w[1] != A10.w[1])) {
361 #ifdef SET_STATUS_FLAGS
362 __set_status_flags (pfpsf, INEXACT_EXCEPTION);
363 #endif
364 exact = 0;
365 }
366 }
367 }
368
369 if (!done) {
370 // get number of digits in CX
371 D = CX.w[1] - power10_index_binexp_128[bin_expon_cx].w[1];
372 if (D > 0
373 || (!D && CX.w[0] >= power10_index_binexp_128[bin_expon_cx].w[0]))
374 digits++;
375
376 // if exponent is odd, scale coefficient by 10
377 scale = 31 - digits;
378 exponent_q = exponent_x - scale;
379 scale += (exponent_q & 1); // exp. bias is even
380
381 T128 = power10_table_128[scale];
382 __mul_128x128_low (C256, CX, T128);
383
384
385 CS.w[0] = short_sqrt128 (C256);
386 }
387 //printf("CS=%016I64x\n",CS.w[0]);
388
389 exponent_q =
390 ((exponent_q - DECIMAL_EXPONENT_BIAS_128) >> 1) +
391 DECIMAL_EXPONENT_BIAS;
392 if ((exponent_q < 0) && (exponent_q + MAX_FORMAT_DIGITS >= 0)) {
393 extra_digits = -exponent_q;
394 exponent_q = 0;
395
396 // get coeff*(2^M[extra_digits])/10^extra_digits
397 __mul_64x64_to_128 (QH, CS.w[0], reciprocals10_64[extra_digits]);
398
399 // now get P/10^extra_digits: shift Q_high right by M[extra_digits]-128
400 amount = short_recip_scale[extra_digits];
401
402 CS0 = QH.w[1] >> amount;
403
404 #ifdef SET_STATUS_FLAGS
405 if (exact) {
406 if (CS.w[0] != CS0 * power10_table_128[extra_digits].w[0])
407 exact = 0;
408 }
409 if (!exact)
410 __set_status_flags (pfpsf, UNDERFLOW_EXCEPTION | INEXACT_EXCEPTION);
411 #endif
412
413 CS.w[0] = CS0;
414 if (!mul_factor)
415 mul_factor = 1;
416 mul_factor *= power10_table_128[extra_digits].w[0];
417 __mul_64x64_to_128 (mul_factor2_long, mul_factor, mul_factor);
418 if (mul_factor2_long.w[1])
419 mul_factor2 = 0;
420 else
421 mul_factor2 = mul_factor2_long.w[1];
422 }
423 // 4*C256
424 C4.w[1] = (C256.w[1] << 2) | (C256.w[0] >> 62);
425 C4.w[0] = C256.w[0] << 2;
426
427 #ifndef IEEE_ROUND_NEAREST
428 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
429 if (!((rnd_mode) & 3)) {
430 #endif
431 #endif
432 // compare to midpoints
433 CSM.w[0] = (CS.w[0] + CS.w[0]) | 1;
434 //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x\n",C4.w[1],C4.w[0],CSM.w[1],CSM.w[0], CS.w[0]);
435 if (mul_factor)
436 CSM.w[0] *= mul_factor;
437 // CSM^2
438 __mul_64x64_to_128 (M256, CSM.w[0], CSM.w[0]);
439 //__mul_128x128_to_256(M256, CSM, CSM);
440
441 if (C4.w[1] > M256.w[1] ||
442 (C4.w[1] == M256.w[1] && C4.w[0] > M256.w[0])) {
443 // round up
444 CS.w[0]++;
445 } else {
446 C8.w[0] = CS.w[0] << 3;
447 C8.w[1] = 0;
448 if (mul_factor) {
449 if (mul_factor2) {
450 __mul_64x64_to_128 (C8, C8.w[0], mul_factor2);
451 } else {
452 __mul_64x128_low (C8, C8.w[0], mul_factor2_long);
453 }
454 }
455 // M256 - 8*CSM
456 __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
457 M256.w[1] = M256.w[1] - C8.w[1] - Carry;
458
459 // if CSM' > C256, round up
460 if (M256.w[1] > C4.w[1] ||
461 (M256.w[1] == C4.w[1] && M256.w[0] > C4.w[0])) {
462 // round down
463 if (CS.w[0])
464 CS.w[0]--;
465 }
466 }
467 #ifndef IEEE_ROUND_NEAREST
468 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
469 } else {
470 CS.w[0]++;
471 CSM.w[0] = CS.w[0];
472 C8.w[0] = CSM.w[0] << 1;
473 if (mul_factor)
474 CSM.w[0] *= mul_factor;
475 __mul_64x64_to_128 (M256, CSM.w[0], CSM.w[0]);
476 C8.w[1] = 0;
477 if (mul_factor) {
478 if (mul_factor2) {
479 __mul_64x64_to_128 (C8, C8.w[0], mul_factor2);
480 } else {
481 __mul_64x128_low (C8, C8.w[0], mul_factor2_long);
482 }
483 }
484 //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x\n",C256.w[1],C256.w[0],M256.w[1],M256.w[0], CS.w[0]);
485
486 if (M256.w[1] > C256.w[1] ||
487 (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0])) {
488 __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
489 M256.w[1] = M256.w[1] - Carry - C8.w[1];
490 M256.w[0]++;
491 if (!M256.w[0]) {
492 M256.w[1]++;
493
494 }
495
496 if ((M256.w[1] > C256.w[1] ||
497 (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0]))
498 && (CS.w[0] > 1)) {
499
500 CS.w[0]--;
501
502 if (CS.w[0] > 1) {
503 __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
504 M256.w[1] = M256.w[1] - Carry - C8.w[1];
505 M256.w[0]++;
506 if (!M256.w[0]) {
507 M256.w[1]++;
508 }
509
510 if (M256.w[1] > C256.w[1] ||
511 (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0]))
512 CS.w[0]--;
513 }
514 }
515 }
516
517 else {
518 /*__add_carry_out(M256.w[0], Carry, M256.w[0], C8.w[0]);
519 M256.w[1] = M256.w[1] + Carry + C8.w[1];
520 M256.w[0]++;
521 if(!M256.w[0])
522 {
523 M256.w[1]++;
524 }
525 CS.w[0]++;
526 if(M256.w[1]<C256.w[1] ||
527 (M256.w[1]==C256.w[1] && M256.w[0]<=C256.w[0]))
528 {
529 CS.w[0]++;
530 }*/
531 CS.w[0]++;
532 }
533 //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x %d\n",C4.w[1],C4.w[0],M256.w[1],M256.w[0], CS.w[0], exact);
534 // RU?
535 if (((rnd_mode) != ROUNDING_UP) || exact) {
536 if (CS.w[0])
537 CS.w[0]--;
538 }
539
540 }
541 #endif
542 #endif
543 //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x %d\n",C4.w[1],C4.w[0],M256.w[1],M256.w[0], CS.w[0], exact);
544
545 res = get_BID64 (0, exponent_q, CS.w[0], rnd_mode, pfpsf);
546 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
547 (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
548 #endif
549 BID_RETURN (res);
550
551
552 }