/* This is an assembly language implementation of mulsi3, divsi3, and modsi3
   for the sparc processor.

   These routines are derived from the SPARC Architecture Manual, version 8,
   slightly edited to match the desired calling convention, and also to
   optimize them for our purposes.  */

#ifdef L_mulsi3
/*
 * .umul -- 32-bit multiply built from mulscc (multiply step) instructions.
 *
 * Input:   %o0 = multiplier, %o1 = multiplicand.
 * Output:  %o0 = low 32 bits of the product (the upper 32 bits are not
 *	    produced correctly and are discarded).
 * Uses:    %o4 (partial product), %o5 (scratch), %y, and the integer
 *	    condition codes.
 *
 * If neither operand has a bit set above bit 11, a short form with 12
 * multiply steps is used; otherwise the long form with 32 multiply steps
 * runs.  Both forms finish with one extra step that only shifts.
 */
	.text
	.align	4
	.global	.umul
	.proc	4
.umul:
	or	%o0, %o1, %o4	! logical or of multiplier and multiplicand
	mov	%o0, %y		! multiplier to Y register
	andncc	%o4, 0xfff, %o5	! mask out lower 12 bits
	be	mul_shortway	! nothing left: can do it the short way
	andcc	%g0, %g0, %o4	! (delay slot) zero the partial product and clear NV cc
	!
	! long multiply
	!
	! steps 1-8
	mulscc	%o4, %o1, %o4	! first iteration of 33
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	! steps 9-16
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	! steps 17-24
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	! steps 25-32
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4	! 32nd iteration
	mulscc	%o4, %g0, %o4	! last iteration only shifts
	! the upper 32 bits of product are wrong, but we do not care
	retl
	rd	%y, %o0		! (delay slot) low word of the product is in Y
	!
	! short multiply
	!
mul_shortway:
	! steps 1-4
	mulscc	%o4, %o1, %o4	! first iteration of 13
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	! steps 5-8
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	! steps 9-12
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4	! 12th iteration
	mulscc	%o4, %g0, %o4	! last iteration only shifts
	! After 12 steps the product is split between %o4 and the top of Y;
	! the two pieces are shifted into place and merged below.
	rd	%y, %o5
	sll	%o4, 12, %o4	! left shift partial product by 12 bits
	srl	%o5, 20, %o5	! right shift partial product by 20 bits
	retl
	or	%o5, %o4, %o0	! (delay slot) merge for true product
#endif

#ifdef L_divsi3
/*
 * Division and remainder, from Appendix E of the SPARC Version 8
 * Architecture Manual, with fixes from Gordon Irlam.
 */

/*
 * Input: dividend and divisor in %o0 and %o1 respectively.
 * Output: quotient in %o0.
 *
 * Entry points:
 *  .udiv	unsigned divide (%g3 is cleared, so the result is never negated)
 *  .div	signed divide (%g3 = %o0 xor %o1; operands are made nonnegative
 *		and the quotient is negated at the end if %g3 is negative)
 *
 * m4 parameters (this text is apparently the expanded output of an m4
 * template, so the parameter names appear already replaced by their
 * values):
 *  .div	name of function to generate
 *  div		div=div => %o0 / %o1; div=rem => %o0 % %o1
 *  true	true=true => signed; true=false => unsigned
 *
 * Algorithm parameters:
 *  N		how many bits per iteration we try to get (4)
 *  WORDSIZE	total number of bits (32)
 *
 * Derived constants:
 *  TOPBITS	number of bits in the top decade of a number
 *
 * Important variables:
 *  Q		the partial quotient under development (initially 0)
 *  R		the remainder so far, initially the dividend
 *  ITER	number of main division loop iterations required;
 *		equal to ceil(log2(quotient) / N).  Note that this
 *		is the log base (2^N) of the quotient.
 *  V		the current comparand, initially divisor*2^(ITER*N-1)
 *
 * Register use in the code below:
 *  %o2		Q
 *  %o3		R
 *  %o4		ITER
 *  %o5		V
 *  %g1		threshold constant 1 << (32 - 4 - 1)
 *  %g2		count of single-bit divide steps (large dividend path)
 *  %g3		sign word for the result
 *
 * Cost:
 *  Current estimate for non-large dividend is
 *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
 *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
 *  different path, as the upper bits of the quotient must be developed
 *  one bit at a time.
 */
	.global	.udiv
	.align	4
	.proc	4
	.text
.udiv:
	b	ready_to_divide
	mov	0, %g3		! (delay slot) result is always positive

	.global	.div
	.align	4
	.proc	4
	.text
.div:
	! compute sign of result; if neither is negative, no problem
	orcc	%o1, %o0, %g0	! either negative?
	bge	ready_to_divide	! no, go do the divide
	xor	%o1, %o0, %g3	! (delay slot) compute sign in any case
	tst	%o1
	bge	1f
	tst	%o0		! (delay slot) test dividend for the branch below
	! %o1 is definitely negative; %o0 might also be negative
	bge	ready_to_divide	! if %o0 not negative...
	sub	%g0, %o1, %o1	! (delay slot) in any case, make %o1 nonneg
1:	! %o0 is negative, %o1 is nonnegative
	sub	%g0, %o0, %o0	! make %o0 nonnegative


ready_to_divide:

	! Ready to divide.  Compute size of quotient; scale comparand.
	orcc	%o1, %g0, %o5	! V = divisor, and test it for zero
	bne	1f
	mov	%o0, %o3	! (delay slot) R = dividend

	! Divide by zero trap.  If it returns, return 0 (about as
	! wrong as possible, but that is what SunOS does...).
	ta	0x2		! ST_DIV0
	retl
	clr	%o0

1:
	cmp	%o3, %o5	! if %o1 exceeds %o0, done
	blu	got_result	! (and algorithm fails otherwise)
	clr	%o2		! (delay slot) Q = 0
	sethi	%hi(1 << (32 - 4 - 1)), %g1
	cmp	%o3, %g1
	blu	not_really_big
	clr	%o4		! (delay slot) ITER = 0

	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
	! as our usual N-at-a-shot divide step will cause overflow and havoc.
	! The number of bits in the result here is N*ITER+SC, where SC <= N.
	! Compute ITER in an unorthodox manner: know we need to shift V into
	! the top decade: so do not even bother to compare to R.
1:
	cmp	%o5, %g1
	bgeu	3f
	mov	1, %g2		! (delay slot) single-bit step count starts at 1
	sll	%o5, 4, %o5
	b	1b
	add	%o4, 1, %o4	! (delay slot) one more 4-bit iteration

	! Now compute %g2.
2:	addcc	%o5, %o5, %o5
	bcc	not_too_big
	add	%g2, 1, %g2	! (delay slot)

	! We get here if the %o1 overflowed while shifting.
	! This means that %o3 has the high-order bit set.
	! Restore %o5 and subtract from %o3.
	sll	%g1, 4, %g1	! high order bit
	srl	%o5, 1, %o5	! rest of %o5
	add	%o5, %g1, %o5
	b	do_single_div
	sub	%g2, 1, %g2	! (delay slot)

not_too_big:
3:	cmp	%o5, %o3
	blu	2b
	nop
	be	do_single_div
	nop
	/* NB: these are commented out in the V8-SPARC manual as well */
	/* (I do not understand this) */
	! %o5 > %o3: went too far: back up 1 step
	!	srl	%o5, 1, %o5
	!	dec	%g2
	! do single-bit divide steps
	!
	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
	! first divide step without thinking.  BUT, the others are conditional,
	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
	! order bit set in the first step, just falling into the regular
	! division loop will mess up the first time around.
	! So we unroll slightly...
do_single_div:
	subcc	%g2, 1, %g2
	bl	end_regular_divide
	nop
	sub	%o3, %o5, %o3
	mov	1, %o2
	b	end_single_divloop
	nop
single_divloop:
	sll	%o2, 1, %o2
	bl	1f		! condition codes come from tst %o3 at the loop branch
	srl	%o5, 1, %o5	! (delay slot)
	! %o3 >= 0
	sub	%o3, %o5, %o3
	b	2f
	add	%o2, 1, %o2	! (delay slot)
1:	! %o3 < 0
	add	%o3, %o5, %o3
	sub	%o2, 1, %o2
2:
end_single_divloop:
	subcc	%g2, 1, %g2
	bge	single_divloop
	tst	%o3		! (delay slot) set cc for the next step
	b,a	end_regular_divide

not_really_big:
1:
	sll	%o5, 4, %o5
	cmp	%o5, %o3
	bleu	1b
	addcc	%o4, 1, %o4	! (delay slot)
	be	got_result
	sub	%o4, 1, %o4	! (delay slot)

	tst	%o3		! set up for initial iteration
	! Each pass through divloop develops 4 quotient bits by walking a
	! binary decision tree.  Labels are Ldepth.K where K is 16 plus the
	! accumulated bits at that node.
divloop:
	sll	%o2, 4, %o2
	! depth 1, accumulated bits 0
	bl	L1.16
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 2, accumulated bits 1
	bl	L2.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 3, accumulated bits 3
	bl	L3.19
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 4, accumulated bits 7
	bl	L4.23
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (7*2+1), %o2

L4.23:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (7*2-1), %o2


L3.19:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 4, accumulated bits 5
	bl	L4.21
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (5*2+1), %o2

L4.21:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (5*2-1), %o2

L2.17:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 3, accumulated bits 1
	bl	L3.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 4, accumulated bits 3
	bl	L4.19
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (3*2+1), %o2

L4.19:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (3*2-1), %o2

L3.17:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 4, accumulated bits 1
	bl	L4.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (1*2+1), %o2

L4.17:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (1*2-1), %o2

L1.16:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 2, accumulated bits -1
	bl	L2.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 3, accumulated bits -1
	bl	L3.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 4, accumulated bits -1
	bl	L4.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-1*2+1), %o2

L4.15:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-1*2-1), %o2

L3.15:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 4, accumulated bits -3
	bl	L4.13
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-3*2+1), %o2

L4.13:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-3*2-1), %o2

L2.15:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 3, accumulated bits -3
	bl	L3.13
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 4, accumulated bits -5
	bl	L4.11
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-5*2+1), %o2

L4.11:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-5*2-1), %o2

L3.13:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 4, accumulated bits -7
	bl	L4.9
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-7*2+1), %o2

L4.9:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-7*2-1), %o2

9:
end_regular_divide:
	subcc	%o4, 1, %o4
	bge	divloop
	tst	%o3		! (delay slot) set cc from the remainder
	bl,a	got_result
	! non-restoring fixup here (one instruction only!)
	sub	%o2, 1, %o2	! (annulled delay slot) runs only if remainder < 0


got_result:
	! check to see if answer should be < 0
	tst	%g3
	bl,a	1f
	sub	%g0, %o2, %o2	! (annulled delay slot) negate quotient if %g3 < 0
1:
	retl
	mov	%o2, %o0	! (delay slot) return the quotient
#endif

#ifdef L_modsi3
/* This implementation was taken from glibc:
 *
 * Input: dividend and divisor in %o0 and %o1 respectively.
 * Output: remainder in %o0.
 *
 * Entry points:
 *  .urem	unsigned remainder (%g3 is cleared, so the result is never
 *		negated)
 *  .rem	signed remainder (%g3 = original %o0; operands are made
 *		nonnegative and the remainder is negated at the end if
 *		%g3 is negative)
 *
 * Algorithm parameters:
 *  N		how many bits per iteration we try to get (4)
 *  WORDSIZE	total number of bits (32)
 *
 * Derived constants:
 *  TOPBITS	number of bits in the top decade of a number
 *
 * Important variables:
 *  Q		the partial quotient under development (initially 0)
 *  R		the remainder so far, initially the dividend
 *  ITER	number of main division loop iterations required;
 *		equal to ceil(log2(quotient) / N).  Note that this
 *		is the log base (2^N) of the quotient.
 *  V		the current comparand, initially divisor*2^(ITER*N-1)
 *
 * Register use in the code below:
 *  %o2		Q (developed but not returned)
 *  %o3		R (returned)
 *  %o4		ITER
 *  %o5		V
 *  %g1		threshold constant 1 << (32 - 4 - 1)
 *  %g2		count of single-bit divide steps (large dividend path)
 *  %g3		sign word for the result
 *
 * Cost:
 *  Current estimate for non-large dividend is
 *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
 *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
 *  different path, as the upper bits of the quotient must be developed
 *  one bit at a time.
 */
	.text
	.align	4
	.global	.urem
	.proc	4
.urem:
	b	divide
	mov	0, %g3		! (delay slot) result always positive

	.align	4
	.global	.rem
	.proc	4
.rem:
	! compute sign of result; if neither is negative, no problem
	orcc	%o1, %o0, %g0	! either negative?
	bge	2f		! no, go do the divide
	mov	%o0, %g3	! (delay slot) sign of remainder matches %o0
	tst	%o1
	bge	1f
	tst	%o0		! (delay slot) test dividend for the branch below
	! %o1 is definitely negative; %o0 might also be negative
	bge	2f		! if %o0 not negative...
	sub	%g0, %o1, %o1	! (delay slot) in any case, make %o1 nonneg
1:	! %o0 is negative, %o1 is nonnegative
	sub	%g0, %o0, %o0	! make %o0 nonnegative
2:

	! Ready to divide.  Compute size of quotient; scale comparand.
divide:
	orcc	%o1, %g0, %o5	! V = divisor, and test it for zero
	bne	1f
	mov	%o0, %o3	! (delay slot) R = dividend

	! Divide by zero trap.  If it returns, return 0 (about as
	! wrong as possible, but that is what SunOS does...).
	ta	0x2		!ST_DIV0
	retl
	clr	%o0

1:
	cmp	%o3, %o5	! if %o1 exceeds %o0, done
	blu	got_result	! (and algorithm fails otherwise)
	clr	%o2		! (delay slot) Q = 0
	sethi	%hi(1 << (32 - 4 - 1)), %g1
	cmp	%o3, %g1
	blu	not_really_big
	clr	%o4		! (delay slot) ITER = 0

	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
	! as our usual N-at-a-shot divide step will cause overflow and havoc.
	! The number of bits in the result here is N*ITER+SC, where SC <= N.
	! Compute ITER in an unorthodox manner: know we need to shift V into
	! the top decade: so do not even bother to compare to R.
1:
	cmp	%o5, %g1
	bgeu	3f
	mov	1, %g2		! (delay slot) single-bit step count starts at 1
	sll	%o5, 4, %o5
	b	1b
	add	%o4, 1, %o4	! (delay slot) one more 4-bit iteration

	! Now compute %g2.
2:	addcc	%o5, %o5, %o5
	bcc	not_too_big
	add	%g2, 1, %g2	! (delay slot)

	! We get here if the %o1 overflowed while shifting.
	! This means that %o3 has the high-order bit set.
	! Restore %o5 and subtract from %o3.
	sll	%g1, 4, %g1	! high order bit
	srl	%o5, 1, %o5	! rest of %o5
	add	%o5, %g1, %o5
	b	do_single_div
	sub	%g2, 1, %g2	! (delay slot)

not_too_big:
3:	cmp	%o5, %o3
	blu	2b
	nop
	be	do_single_div
	nop
	/* NB: these are commented out in the V8-SPARC manual as well */
	/* (I do not understand this) */
	! %o5 > %o3: went too far: back up 1 step
	!	srl	%o5, 1, %o5
	!	dec	%g2
	! do single-bit divide steps
	!
	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
	! first divide step without thinking.  BUT, the others are conditional,
	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
	! order bit set in the first step, just falling into the regular
	! division loop will mess up the first time around.
	! So we unroll slightly...
do_single_div:
	subcc	%g2, 1, %g2
	bl	end_regular_divide
	nop
	sub	%o3, %o5, %o3
	mov	1, %o2
	b	end_single_divloop
	nop
single_divloop:
	sll	%o2, 1, %o2
	bl	1f		! condition codes come from tst %o3 at the loop branch
	srl	%o5, 1, %o5	! (delay slot)
	! %o3 >= 0
	sub	%o3, %o5, %o3
	b	2f
	add	%o2, 1, %o2	! (delay slot)
1:	! %o3 < 0
	add	%o3, %o5, %o3
	sub	%o2, 1, %o2
2:
end_single_divloop:
	subcc	%g2, 1, %g2
	bge	single_divloop
	tst	%o3		! (delay slot) set cc for the next step
	b,a	end_regular_divide

not_really_big:
1:
	sll	%o5, 4, %o5
	cmp	%o5, %o3
	bleu	1b
	addcc	%o4, 1, %o4	! (delay slot)
	be	got_result
	sub	%o4, 1, %o4	! (delay slot)

	tst	%o3		! set up for initial iteration
	! Each pass through divloop develops 4 quotient bits by walking a
	! binary decision tree.  Labels are Ldepth.K where K is 16 plus the
	! accumulated bits at that node.
divloop:
	sll	%o2, 4, %o2
	! depth 1, accumulated bits 0
	bl	L1.16
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 2, accumulated bits 1
	bl	L2.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 3, accumulated bits 3
	bl	L3.19
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 4, accumulated bits 7
	bl	L4.23
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (7*2+1), %o2
L4.23:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (7*2-1), %o2

L3.19:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 4, accumulated bits 5
	bl	L4.21
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (5*2+1), %o2

L4.21:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (5*2-1), %o2

L2.17:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 3, accumulated bits 1
	bl	L3.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 4, accumulated bits 3
	bl	L4.19
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (3*2+1), %o2

L4.19:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (3*2-1), %o2

L3.17:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 4, accumulated bits 1
	bl	L4.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (1*2+1), %o2

L4.17:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (1*2-1), %o2

L1.16:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 2, accumulated bits -1
	bl	L2.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 3, accumulated bits -1
	bl	L3.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 4, accumulated bits -1
	bl	L4.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-1*2+1), %o2

L4.15:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-1*2-1), %o2

L3.15:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 4, accumulated bits -3
	bl	L4.13
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-3*2+1), %o2

L4.13:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-3*2-1), %o2

L2.15:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 3, accumulated bits -3
	bl	L3.13
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 4, accumulated bits -5
	bl	L4.11
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-5*2+1), %o2

L4.11:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-5*2-1), %o2

L3.13:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 4, accumulated bits -7
	bl	L4.9
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-7*2+1), %o2

L4.9:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-7*2-1), %o2

9:
end_regular_divide:
	subcc	%o4, 1, %o4
	bge	divloop
	tst	%o3		! (delay slot) set cc from the remainder
	bl,a	got_result
	! non-restoring fixup here (one instruction only!)
	add	%o3, %o1, %o3	! (annulled delay slot) add divisor back if remainder < 0

got_result:
	! check to see if answer should be < 0
	tst	%g3
	bl,a	1f
	sub	%g0, %o3, %o3	! (annulled delay slot) negate remainder if %g3 < 0
1:
	retl
	mov	%o3, %o0	! (delay slot) return the remainder

#endif
