gcc/config/rs6000/rs6000-string.c @ 111:04ced10e8804 (gcc 7 branch)
author: kono
date:   Fri, 27 Oct 2017 22:46:09 +0900
/* Subroutines used to expand string and block move, clear,
   compare and other operations for PowerPC.
   Copyright (C) 1991-2017 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ira.h"
#include "print-tree.h"
#include "varasm.h"
#include "explow.h"
#include "expr.h"
#include "output.h"
#include "target.h"
/* Expand a block clear operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the length
   operands[3] is the alignment */

int
expand_block_clear (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx bytes_rtx = operands[1];
  rtx align_rtx = operands[3];
  bool constp = (GET_CODE (bytes_rtx) == CONST_INT);
  HOST_WIDE_INT align;
  HOST_WIDE_INT bytes;
  int offset;
  int clear_bytes;
  int clear_step;

  /* If this is not a fixed size clear, just call memset.  */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment.  */
  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to clear? */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  /* Use the builtin memset after a point, to avoid huge code bloat.
     When optimize_size, avoid any significant code bloat; calling
     memset is about 4 instructions, so allow for one instruction to
     load zero and three to do clearing.  */
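  /* For example, on a 64-bit target without AltiVec, clear_step is 8,
     so at most 8 * 8 = 64 bytes are cleared inline when optimizing for
     speed and at most 3 * 8 = 24 bytes when optimizing for size; larger
     blocks fall back to the memset call.  */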
  if (TARGET_ALTIVEC && align >= 128)
    clear_step = 16;
  else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
    clear_step = 8;
  else
    clear_step = 4;

  if (optimize_size && bytes > 3 * clear_step)
    return 0;
  if (! optimize_size && bytes > 8 * clear_step)
    return 0;

  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
    {
      machine_mode mode = BLKmode;
      rtx dest;

      if (bytes >= 16 && TARGET_ALTIVEC && align >= 128)
        {
          clear_bytes = 16;
          mode = V4SImode;
        }
      else if (bytes >= 8 && TARGET_POWERPC64
               && (align >= 64 || !STRICT_ALIGNMENT))
        {
          clear_bytes = 8;
          mode = DImode;
          if (offset == 0 && align < 64)
            {
              rtx addr;

              /* If the address form is reg+offset with offset not a
                 multiple of four, reload into reg indirect form here
                 rather than waiting for reload.  This way we get one
                 reload, not one per store.  */
              addr = XEXP (orig_dest, 0);
              if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
                  && GET_CODE (XEXP (addr, 1)) == CONST_INT
                  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
                {
                  addr = copy_addr_to_reg (addr);
                  orig_dest = replace_equiv_address (orig_dest, addr);
                }
            }
        }
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
        { /* clear 4 bytes */
          clear_bytes = 4;
          mode = SImode;
        }
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
        { /* clear 2 bytes */
          clear_bytes = 2;
          mode = HImode;
        }
      else /* clear 1 byte at a time */
        {
          clear_bytes = 1;
          mode = QImode;
        }

      dest = adjust_address (orig_dest, mode, offset);

      emit_move_insn (dest, CONST0_RTX (mode));
    }

  return 1;
}
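
/* A minimal sketch of what the loop above emits: clearing 10 bytes at an
   8-byte-aligned destination on a 64-bit target becomes one DImode store
   and one HImode store of zero, roughly

       li 9,0
       std 9,0(3)
       sth 9,8(3)

   (register numbers are illustrative only).  */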

/* Figure out the correct instructions to generate to load data for
   block compare.  MODE is used for the read from memory, and
   data is zero extended if REG is wider than MODE.  If LE code
   is being generated, bswap loads are used.

   REG is the destination register to move the data into.
   MEM is the memory block being read.
   MODE is the mode of memory to use for the read.  */
static void
do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
{
  switch (GET_MODE (reg))
    {
    case E_DImode:
      switch (mode)
        {
        case E_QImode:
          emit_insn (gen_zero_extendqidi2 (reg, mem));
          break;
        case E_HImode:
          {
            rtx src = mem;
            if (!BYTES_BIG_ENDIAN)
              {
                src = gen_reg_rtx (HImode);
                emit_insn (gen_bswaphi2 (src, mem));
              }
            emit_insn (gen_zero_extendhidi2 (reg, src));
            break;
          }
        case E_SImode:
          {
            rtx src = mem;
            if (!BYTES_BIG_ENDIAN)
              {
                src = gen_reg_rtx (SImode);
                emit_insn (gen_bswapsi2 (src, mem));
              }
            emit_insn (gen_zero_extendsidi2 (reg, src));
          }
          break;
        case E_DImode:
          if (!BYTES_BIG_ENDIAN)
            emit_insn (gen_bswapdi2 (reg, mem));
          else
            emit_insn (gen_movdi (reg, mem));
          break;
        default:
          gcc_unreachable ();
        }
      break;

    case E_SImode:
      switch (mode)
        {
        case E_QImode:
          emit_insn (gen_zero_extendqisi2 (reg, mem));
          break;
        case E_HImode:
          {
            rtx src = mem;
            if (!BYTES_BIG_ENDIAN)
              {
                src = gen_reg_rtx (HImode);
                emit_insn (gen_bswaphi2 (src, mem));
              }
            emit_insn (gen_zero_extendhisi2 (reg, src));
            break;
          }
        case E_SImode:
          if (!BYTES_BIG_ENDIAN)
            emit_insn (gen_bswapsi2 (reg, mem));
          else
            emit_insn (gen_movsi (reg, mem));
          break;
        case E_DImode:
          /* DImode is larger than the destination reg so is not expected.  */
          gcc_unreachable ();
          break;
        default:
          gcc_unreachable ();
        }
      break;
    default:
      gcc_unreachable ();
      break;
    }
}
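
/* For illustration: on a little-endian target with lwbrx available, the
   E_SImode case above (an SImode chunk loaded into a DImode register)
   becomes a byte-reversed word load followed by a zero extend, roughly
   "lwbrx 10,0,9" then "rldicl 10,10,0,32".  The exact instructions are up
   to the bswap and zero-extend patterns, so this is only a sketch.  */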

/* Select the mode to be used for reading the next chunk of bytes
   in the compare.

   OFFSET is the current read offset from the beginning of the block.
   BYTES is the number of bytes remaining to be read.
   ALIGN is the minimum alignment of the memory blocks being compared in bytes.
   WORD_MODE_OK indicates using WORD_MODE is allowed, else SImode is
   the largest allowable mode.  */
static machine_mode
select_block_compare_mode (unsigned HOST_WIDE_INT offset,
                           unsigned HOST_WIDE_INT bytes,
                           unsigned HOST_WIDE_INT align, bool word_mode_ok)
{
  /* First see if we can do a whole load unit
     as that will be more efficient than a larger load + shift.  */

  /* If big, use biggest chunk.
     If exactly chunk size, use that size.
     If remainder can be done in one piece with shifting, do that.
     Do largest chunk possible without violating alignment rules.  */

  /* The most we can read without potential page crossing.  */
  unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);

  if (word_mode_ok && bytes >= UNITS_PER_WORD)
    return word_mode;
  else if (bytes == GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes == GET_MODE_SIZE (HImode))
    return HImode;
  else if (bytes == GET_MODE_SIZE (QImode))
    return QImode;
  else if (bytes < GET_MODE_SIZE (SImode)
           && offset >= GET_MODE_SIZE (SImode) - bytes)
    /* This matches the case where we have SImode and 3 bytes
       and offset >= 1 and permits us to move back one and overlap
       with the previous read, thus avoiding having to shift
       unwanted bytes off of the input.  */
    return SImode;
  else if (word_mode_ok && bytes < UNITS_PER_WORD
           && offset >= UNITS_PER_WORD - bytes)
    /* Similarly, if we can use DImode it will get matched here and
       can do an overlapping read that ends at the end of the block.  */
    return word_mode;
  else if (word_mode_ok && maxread >= UNITS_PER_WORD)
    /* It is safe to do all remaining in one load of largest size,
       possibly with a shift to get rid of unwanted bytes.  */
    return word_mode;
  else if (maxread >= GET_MODE_SIZE (SImode))
    /* It is safe to do all remaining in one SImode load,
       possibly with a shift to get rid of unwanted bytes.  */
    return SImode;
  else if (bytes > GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes > GET_MODE_SIZE (HImode))
    return HImode;

  /* Final fallback: do one byte at a time.  */
  return QImode;
}
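
/* Worked example: with word_mode_ok, bytes == 6 and offset == 8, none of
   the exact-size cases match, but bytes < UNITS_PER_WORD (8) and
   offset >= 8 - 6, so word_mode is returned and the caller backs the read
   up two bytes to overlap the previous chunk instead of shifting.  */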

/* Compute the alignment of pointer+OFFSET where the original alignment
   of pointer was BASE_ALIGN.  */
static unsigned HOST_WIDE_INT
compute_current_alignment (unsigned HOST_WIDE_INT base_align,
                           unsigned HOST_WIDE_INT offset)
{
  if (offset == 0)
    return base_align;
  return MIN (base_align, offset & -offset);
}
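
/* Note that offset & -offset isolates the lowest set bit of OFFSET, i.e.
   the largest power of two dividing it: with base_align 8 and offset 4
   this returns MIN (8, 4) = 4, so only 4-byte alignment can be assumed
   at that position.  */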

/* Expand a block compare operation, and return true if successful.
   Return false if we should let the compiler generate normal code,
   probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_block_compare (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];
  HOST_WIDE_INT cmp_bytes = 0;
  rtx src1 = orig_src1;
  rtx src2 = orig_src2;

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  /* If this is not a fixed size compare, just call memcmp.  */
  if (!CONST_INT_P (bytes_rtx))
    return false;

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;

  /* If targetm.slow_unaligned_access says unaligned accesses are slow,
     don't do unaligned loads here; fall back to memcmp.  */
  if (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
      || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2)))
    return false;

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to compare? */
  unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
  if (bytes == 0)
    return true;

  /* The code generated for p7 and older is not faster than glibc
     memcmp if alignment is small and length is not short, so bail
     out to avoid those conditions.  */
  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
      && ((base_align == 1 && bytes > 16)
          || (base_align == 2 && bytes > 32)))
    return false;

  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
  /* P7/P8 code uses cond for subfc., but P9 uses
     it for cmpld, which needs CCUNSmode.  */
  rtx cond;
  if (TARGET_P9_MISC)
    cond = gen_reg_rtx (CCUNSmode);
  else
    cond = gen_reg_rtx (CCmode);

  /* If we have an LE target without ldbrx and word_mode is DImode,
     then we must avoid using word_mode.  */
  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
                       && word_mode == DImode);

  /* Strategy phase.  How many ops will this take and should we expand it?  */

  unsigned HOST_WIDE_INT offset = 0;
  machine_mode load_mode =
    select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
  unsigned int load_mode_size = GET_MODE_SIZE (load_mode);

  /* We don't want to generate too much code.  */
  unsigned HOST_WIDE_INT max_bytes =
    load_mode_size * (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_limit;
  if (!IN_RANGE (bytes, 1, max_bytes))
    return false;
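  /* For example, if the first chunk can be loaded in DImode and
     -mblock-compare-inline-limit is 8, blocks of up to 8 * 8 = 64 bytes
     are expanded inline; anything longer goes to memcmp.  */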

  bool generate_6432_conversion = false;
  rtx convert_label = NULL;
  rtx final_label = NULL;

  /* Example of code generated for 18 bytes with 1-byte alignment.
     Compiled with -fno-reorder-blocks for clarity.
       ldbrx 10,31,8
       ldbrx 9,7,8
       subfc. 9,9,10
       bne 0,.L6487
       addi 9,12,8
       addi 5,11,8
       ldbrx 10,0,9
       ldbrx 9,0,5
       subfc. 9,9,10
       bne 0,.L6487
       addi 9,12,16
       lhbrx 10,0,9
       addi 9,11,16
       lhbrx 9,0,9
       subf 9,9,10
       b .L6488
       .p2align 4,,15
     .L6487: #convert_label
       popcntd 9,9
       subfe 10,10,10
       or 9,9,10
     .L6488: #final_label
       extsw 10,9

     We start off with DImode for two blocks that jump to the DI->SI conversion
     if the difference is found there, then a final block of HImode that skips
     the DI->SI conversion.  */

  while (bytes > 0)
    {
      unsigned int align = compute_current_alignment (base_align, offset);
      if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
        load_mode = select_block_compare_mode (offset, bytes, align,
                                               word_mode_ok);
      else
        load_mode = select_block_compare_mode (0, bytes, align, word_mode_ok);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes >= load_mode_size)
        cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
        {
          /* Move this load back so it doesn't go past the end.
             P8/P9 can do this efficiently.  */
          unsigned int extra_bytes = load_mode_size - bytes;
          cmp_bytes = bytes;
          if (extra_bytes < offset)
            {
              offset -= extra_bytes;
              cmp_bytes = load_mode_size;
              bytes = cmp_bytes;
            }
        }
      else
        /* P7 and earlier can't do the overlapping load trick fast,
           so this forces a non-overlapping load and a shift to get
           rid of the extra bytes.  */
        cmp_bytes = bytes;
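
      /* Overlap example for the P8/P9 path above: with 5 bytes left at
         offset 8 and an 8-byte load_mode, extra_bytes is 3; since 3 < 8
         the offset moves back to 5 and a full 8-byte load compares the
         final 8 bytes, 3 of which were already checked by the previous
         chunk.  */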

      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      if (!REG_P (XEXP (src1, 0)))
        {
          rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
          src1 = replace_equiv_address (src1, src1_reg);
        }
      set_mem_size (src1, cmp_bytes);

      if (!REG_P (XEXP (src2, 0)))
        {
          rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
          src2 = replace_equiv_address (src2, src2_reg);
        }
      set_mem_size (src2, cmp_bytes);

      do_load_for_compare (tmp_reg_src1, src1, load_mode);
      do_load_for_compare (tmp_reg_src2, src2, load_mode);

      if (cmp_bytes < load_mode_size)
        {
          /* Shift unneeded bytes off.  */
          rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
          if (word_mode == DImode)
            {
              emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
              emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
            }
          else
            {
              emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
              emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
            }
        }

      int remain = bytes - cmp_bytes;
      if (GET_MODE_SIZE (GET_MODE (target)) > GET_MODE_SIZE (load_mode))
        {
          /* Target is larger than load size so we don't need to
             reduce result size.  */

          /* We previously did a block that needed 64->32 conversion but
             the current block does not, so a label is needed to jump
             to the end.  */
          if (generate_6432_conversion && !final_label)
            final_label = gen_label_rtx ();

          if (remain > 0)
            {
              /* This is not the last block, branch to the end if the result
                 of this subtract is not zero.  */
              if (!final_label)
                final_label = gen_label_rtx ();
              rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
              rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
              rtx cr = gen_reg_rtx (CCmode);
              rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
              emit_insn (gen_movsi (target,
                                    gen_lowpart (SImode, tmp_reg_src2)));
              rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
              rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
                                                 fin_ref, pc_rtx);
              rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
              JUMP_LABEL (j) = final_label;
              LABEL_NUSES (final_label) += 1;
            }
          else
            {
              if (word_mode == DImode)
                {
                  emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
                                         tmp_reg_src2));
                  emit_insn (gen_movsi (target,
                                        gen_lowpart (SImode, tmp_reg_src2)));
                }
              else
                emit_insn (gen_subsi3 (target, tmp_reg_src1, tmp_reg_src2));

              if (final_label)
                {
                  rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
                  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
                  JUMP_LABEL (j) = final_label;
                  LABEL_NUSES (final_label) += 1;
                  emit_barrier ();
                }
            }
        }
      else
        {
          /* Do we need a 64->32 conversion block?  We need the 64->32
             conversion even if target size == load_mode size because
             the subtract generates one extra bit.  */
          generate_6432_conversion = true;

          if (remain > 0)
            {
              if (!convert_label)
                convert_label = gen_label_rtx ();

              /* Compare to zero and branch to convert_label if not zero.  */
              rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
              if (TARGET_P9_MISC)
                {
                  /* Generate a compare, and convert with a setb later.  */
                  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
                                             tmp_reg_src2);
                  emit_insn (gen_rtx_SET (cond, cmp));
                }
              else
                /* Generate a subfc. and use the longer
                   sequence for conversion.  */
                if (TARGET_64BIT)
                  emit_insn (gen_subfdi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
                                                     tmp_reg_src1, cond));
                else
                  emit_insn (gen_subfsi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
                                                     tmp_reg_src1, cond));
              rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
              rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
                                                 cvt_ref, pc_rtx);
              rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
              JUMP_LABEL (j) = convert_label;
              LABEL_NUSES (convert_label) += 1;
            }
          else
            {
              /* Just do the subtract/compare.  Since this is the last block
                 the convert code will be generated immediately following.  */
              if (TARGET_P9_MISC)
                {
                  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
                                             tmp_reg_src2);
                  emit_insn (gen_rtx_SET (cond, cmp));
                }
              else
                if (TARGET_64BIT)
                  emit_insn (gen_subfdi3_carry (tmp_reg_src2, tmp_reg_src2,
                                                tmp_reg_src1));
                else
                  emit_insn (gen_subfsi3_carry (tmp_reg_src2, tmp_reg_src2,
                                                tmp_reg_src1));
            }
        }

      offset += cmp_bytes;
      bytes -= cmp_bytes;
    }

  if (generate_6432_conversion)
    {
      if (convert_label)
        emit_label (convert_label);

      /* We need to produce DI result from sub, then convert to target SI
         while maintaining <0 / ==0 / >0 properties.  This sequence works:
         subfc L,A,B
         subfe H,H,H
         popcntd L,L
         rldimi L,H,6,0

         This is an alternate one Segher cooked up if somebody
         wants to expand this for something that doesn't have popcntd:
         subfc L,a,b
         subfe H,x,x
         addic t,L,-1
         subfe v,t,L
         or z,v,H

         And finally, p9 can just do this:
         cmpld A,B
         setb r */

      if (TARGET_P9_MISC)
        {
          emit_insn (gen_setb_unsigned (target, cond));
        }
      else
        {
          if (TARGET_64BIT)
            {
              rtx tmp_reg_ca = gen_reg_rtx (DImode);
              emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
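              /* gen_subfdi3_carry_in_xx emits "subfe x,x,x", so tmp_reg_ca
                 becomes 0 when the earlier subtract left the carry set
                 (first operand >= second, unsigned) and -1 otherwise; this
                 is the H term of the sequence documented above.  */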
              emit_insn (gen_popcntddi2 (tmp_reg_src2, tmp_reg_src2));
              emit_insn (gen_iordi3 (tmp_reg_src2, tmp_reg_src2, tmp_reg_ca));
              emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2)));
            }
          else
            {
              rtx tmp_reg_ca = gen_reg_rtx (SImode);
              emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
              emit_insn (gen_popcntdsi2 (tmp_reg_src2, tmp_reg_src2));
              emit_insn (gen_iorsi3 (target, tmp_reg_src2, tmp_reg_ca));
            }
        }
    }

  if (final_label)
    emit_label (final_label);

  gcc_assert (bytes == 0);
  return true;
}

/* Generate alignment check and branch code to set up for
   strncmp when we don't have DI alignment.
   STRNCMP_LABEL is the label to branch to if there is a page crossing.
   SRC is the string pointer to be examined.
   BYTES is the max number of bytes to compare.  */
static void
expand_strncmp_align_check (rtx strncmp_label, rtx src, HOST_WIDE_INT bytes)
{
  rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label);
  rtx src_check = copy_addr_to_reg (XEXP (src, 0));
  if (GET_MODE (src_check) == SImode)
    emit_insn (gen_andsi3 (src_check, src_check, GEN_INT (0xfff)));
  else
    emit_insn (gen_anddi3 (src_check, src_check, GEN_INT (0xfff)));
  rtx cond = gen_reg_rtx (CCmode);
  emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_check,
                                         GEN_INT (4096 - bytes)));
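
  /* The branch below is taken when (src & 0xfff) >= 4096 - bytes, i.e.
     when reading BYTES bytes starting at SRC might cross a 4 KiB page
     boundary; in that case the caller's fallback library call is used.  */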

  rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx);

  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
                                     lab_ref, pc_rtx);
  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
  JUMP_LABEL (j) = strncmp_label;
  LABEL_NUSES (strncmp_label) += 1;
}

/* Expand a string compare operation with length, and return
   true if successful.  Return false if we should let the
   compiler generate normal code, probably a strncmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   If NO_LENGTH is zero, then:
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment in bytes.
   If NO_LENGTH is nonzero, then:
   OPERANDS[3] is the alignment in bytes.  */
bool
expand_strn_compare (rtx operands[], int no_length)
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx, align_rtx;
  if (no_length)
    {
      bytes_rtx = NULL;
      align_rtx = operands[3];
    }
  else
    {
      bytes_rtx = operands[3];
      align_rtx = operands[4];
    }
  unsigned HOST_WIDE_INT cmp_bytes = 0;
  rtx src1 = orig_src1;
  rtx src2 = orig_src2;

  /* If we have a length, it must be constant.  This simplifies things
     a bit as we don't have to generate code to check if we've exceeded
     the length.  Later this could be expanded to handle this case.  */
  if (!no_length && !CONST_INT_P (bytes_rtx))
    return false;

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx);
  int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;

  /* If targetm.slow_unaligned_access says unaligned accesses are slow,
     don't do unaligned loads here; fall back to the library call.  */
  if (targetm.slow_unaligned_access (word_mode, align1)
      || targetm.slow_unaligned_access (word_mode, align2))
    return false;

  gcc_assert (GET_MODE (target) == SImode);

  /* If we have an LE target without ldbrx and word_mode is DImode,
     then we must avoid using word_mode.  */
  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
                       && word_mode == DImode);

  unsigned int word_mode_size = GET_MODE_SIZE (word_mode);

  unsigned HOST_WIDE_INT offset = 0;
  unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available.  */
  unsigned HOST_WIDE_INT compare_length; /* How much to compare inline.  */
  if (no_length)
    /* Use this as a standin to determine the mode to use.  */
    bytes = rs6000_string_compare_inline_limit * word_mode_size;
  else
    bytes = UINTVAL (bytes_rtx);

  machine_mode load_mode =
    select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
  unsigned int load_mode_size = GET_MODE_SIZE (load_mode);
  compare_length = rs6000_string_compare_inline_limit * load_mode_size;

  /* If we have equality at the end of the last compare and we have not
     found the end of the string, we need to call strcmp/strncmp to
     compare the remainder.  */
  bool equality_compare_rest = false;

  if (no_length)
    {
      bytes = compare_length;
      equality_compare_rest = true;
    }
  else
    {
      if (bytes <= compare_length)
        compare_length = bytes;
      else
        equality_compare_rest = true;
    }

  rtx result_reg = gen_reg_rtx (word_mode);
  rtx final_move_label = gen_label_rtx ();
  rtx final_label = gen_label_rtx ();
  rtx begin_compare_label = NULL;

  if (base_align < 8)
    {
      /* Generate code that checks distance to 4k boundary for this case.  */
      begin_compare_label = gen_label_rtx ();
      rtx strncmp_label = gen_label_rtx ();
      rtx jmp;

      /* Strncmp for power8 in glibc does this:
         rldicl r8,r3,0,52
         cmpldi cr7,r8,4096-16
         bgt cr7,L(pagecross) */

      /* Make sure that the length we use for the alignment test and
         the subsequent code generation are in agreement so we do not
         go past the length we tested for a 4k boundary crossing.  */
      unsigned HOST_WIDE_INT align_test = compare_length;
      if (align_test < 8)
        {
          align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
          base_align = align_test;
        }
      else
        {
          align_test = ROUND_UP (align_test, 8);
          base_align = 8;
        }
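
      /* E.g. a compare_length of 3 gives align_test = 4 and base_align = 4,
         while a compare_length of 13 gives align_test = 16 and
         base_align = 8.  */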

      if (align1 < 8)
        expand_strncmp_align_check (strncmp_label, src1, align_test);
      if (align2 < 8)
        expand_strncmp_align_check (strncmp_label, src2, align_test);

      /* Now generate the following sequence:
         - branch to begin_compare
         - strncmp_label
         - call to strncmp
         - branch to final_label
         - begin_compare_label */

      rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
      JUMP_LABEL (jmp) = begin_compare_label;
      LABEL_NUSES (begin_compare_label) += 1;
      emit_barrier ();

      emit_label (strncmp_label);

      if (!REG_P (XEXP (src1, 0)))
        {
          rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
          src1 = replace_equiv_address (src1, src1_reg);
        }

      if (!REG_P (XEXP (src2, 0)))
        {
          rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
          src2 = replace_equiv_address (src2, src2_reg);
        }

      if (no_length)
        {
          tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
          emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                                   target, LCT_NORMAL, GET_MODE (target),
                                   force_reg (Pmode, XEXP (src1, 0)), Pmode,
                                   force_reg (Pmode, XEXP (src2, 0)), Pmode);
        }
      else
        {
          /* -m32 -mpowerpc64 results in word_mode being DImode even
             though otherwise it is 32-bit.  The length arg to strncmp
             is a size_t which will be the same size as pointers.  */
          rtx len_rtx;
          if (TARGET_64BIT)
            len_rtx = gen_reg_rtx (DImode);
          else
            len_rtx = gen_reg_rtx (SImode);

          emit_move_insn (len_rtx, bytes_rtx);

          tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
          emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                                   target, LCT_NORMAL, GET_MODE (target),
                                   force_reg (Pmode, XEXP (src1, 0)), Pmode,
                                   force_reg (Pmode, XEXP (src2, 0)), Pmode,
                                   len_rtx, GET_MODE (len_rtx));
        }

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
      emit_label (begin_compare_label);
    }

  rtx cleanup_label = NULL;
  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);

  /* Generate sequence of ld/ldbrx, cmpb to compare out
     to the length specified.  */
  unsigned HOST_WIDE_INT bytes_to_compare = compare_length;
  while (bytes_to_compare > 0)
    {
      /* Compare sequence:
         check each 8B with: ld/ld cmpd bne
         If equal, use rldicr/cmpb to check for zero byte.
         cleanup code at end:
         cmpb get byte that differs
         cmpb look for zero byte
         orc combine
         cntlzd get bit of first zero/diff byte
         subfic convert for rldcl use
         rldcl rldcl extract diff/zero byte
         subf subtract for final result

         The last compare can branch around the cleanup code if the
         result is zero because the strings are exactly equal.  */
      unsigned int align = compute_current_alignment (base_align, offset);
      if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
        load_mode = select_block_compare_mode (offset, bytes_to_compare, align,
                                               word_mode_ok);
      else
        load_mode = select_block_compare_mode (0, bytes_to_compare, align,
                                               word_mode_ok);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes_to_compare >= load_mode_size)
        cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
        {
          /* Move this load back so it doesn't go past the end.
             P8/P9 can do this efficiently.  */
          unsigned int extra_bytes = load_mode_size - bytes_to_compare;
          cmp_bytes = bytes_to_compare;
          if (extra_bytes < offset)
            {
              offset -= extra_bytes;
              cmp_bytes = load_mode_size;
              bytes_to_compare = cmp_bytes;
            }
        }
      else
        /* P7 and earlier can't do the overlapping load trick fast,
           so this forces a non-overlapping load and a shift to get
           rid of the extra bytes.  */
        cmp_bytes = bytes_to_compare;

      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      if (!REG_P (XEXP (src1, 0)))
        {
          rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
          src1 = replace_equiv_address (src1, src1_reg);
        }
      set_mem_size (src1, cmp_bytes);

      if (!REG_P (XEXP (src2, 0)))
        {
          rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
          src2 = replace_equiv_address (src2, src2_reg);
        }
      set_mem_size (src2, cmp_bytes);

      do_load_for_compare (tmp_reg_src1, src1, load_mode);
      do_load_for_compare (tmp_reg_src2, src2, load_mode);

      /* We must always left-align the data we read, and
         clear any bytes to the right that are beyond the string.
         Otherwise the cmpb sequence won't produce the correct
         results.  The beginning of the compare will be done
         with word_mode so will not have any extra shifts or
         clear rights.  */

      if (load_mode_size < word_mode_size)
        {
          /* Rotate left first.  */
          rtx sh = GEN_INT (BITS_PER_UNIT * (word_mode_size - load_mode_size));
          if (word_mode == DImode)
            {
              emit_insn (gen_rotldi3 (tmp_reg_src1, tmp_reg_src1, sh));
              emit_insn (gen_rotldi3 (tmp_reg_src2, tmp_reg_src2, sh));
            }
          else
            {
              emit_insn (gen_rotlsi3 (tmp_reg_src1, tmp_reg_src1, sh));
              emit_insn (gen_rotlsi3 (tmp_reg_src2, tmp_reg_src2, sh));
            }
        }

      if (cmp_bytes < word_mode_size)
        {
          /* Now clear right.  This plus the rotate can be
             turned into a rldicr instruction.  */
          HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
          rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
          if (word_mode == DImode)
            {
              emit_insn (gen_anddi3_mask (tmp_reg_src1, tmp_reg_src1, mask));
              emit_insn (gen_anddi3_mask (tmp_reg_src2, tmp_reg_src2, mask));
            }
          else
            {
              emit_insn (gen_andsi3_mask (tmp_reg_src1, tmp_reg_src1, mask));
              emit_insn (gen_andsi3_mask (tmp_reg_src2, tmp_reg_src2, mask));
            }
        }
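
      /* E.g. for word_mode DImode and cmp_bytes == 5, mb is 24 and the
         mask is 0xffffffffff000000: the five loaded bytes stay in the
         high-order positions and the three bytes past the end are
         zeroed.  */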

      /* Cases to handle.  A and B are chunks of the two strings.
         1: Not end of comparison:
         A != B: branch to cleanup code to compute result.
         A == B: check for 0 byte, next block if not found.
         2: End of the inline comparison:
         A != B: branch to cleanup code to compute result.
         A == B: check for 0 byte, call strcmp/strncmp.
         3: Compared the requested N bytes:
         A == B: branch to result 0.
         A != B: cleanup code to compute result.  */

      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;

      rtx dst_label;
      if (remain > 0 || equality_compare_rest)
        {
          /* Branch to cleanup code, otherwise fall through to do
             more compares.  */
          if (!cleanup_label)
            cleanup_label = gen_label_rtx ();
          dst_label = cleanup_label;
        }
      else
        /* Branch to end and produce result of 0.  */
        dst_label = final_move_label;

      rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
      rtx cond = gen_reg_rtx (CCmode);

      /* Always produce the 0 result, it is needed if
         cmpb finds a 0 byte in this chunk.  */
      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
      rs6000_emit_dot_insn (result_reg, tmp, 1, cond);

      rtx cmp_rtx;
      if (remain == 0 && !equality_compare_rest)
        cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
      else
        cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
                                         lab_ref, pc_rtx);
      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
      JUMP_LABEL (j) = dst_label;
      LABEL_NUSES (dst_label) += 1;

      if (remain > 0 || equality_compare_rest)
        {
          /* Generate a cmpb to test for a 0 byte and branch
             to final result if found.  */
          rtx cmpb_zero = gen_reg_rtx (word_mode);
          rtx lab_ref_fin = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
          rtx condz = gen_reg_rtx (CCmode);
          rtx zero_reg = gen_reg_rtx (word_mode);
          if (word_mode == SImode)
            {
              emit_insn (gen_movsi (zero_reg, GEN_INT (0)));
              emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg));
              if (cmp_bytes < word_mode_size)
                {
                  /* Don't want to look at zero bytes past end.  */
                  HOST_WIDE_INT mb =
                    BITS_PER_UNIT * (word_mode_size - cmp_bytes);
                  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
                  emit_insn (gen_andsi3_mask (cmpb_zero, cmpb_zero, mask));
                }
            }
          else
            {
              emit_insn (gen_movdi (zero_reg, GEN_INT (0)));
              emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg));
              if (cmp_bytes < word_mode_size)
                {
                  /* Don't want to look at zero bytes past end.  */
                  HOST_WIDE_INT mb =
                    BITS_PER_UNIT * (word_mode_size - cmp_bytes);
                  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
                  emit_insn (gen_anddi3_mask (cmpb_zero, cmpb_zero, mask));
                }
            }

          emit_move_insn (condz, gen_rtx_COMPARE (CCmode, cmpb_zero, zero_reg));
          rtx cmpnz_rtx = gen_rtx_NE (VOIDmode, condz, const0_rtx);
          rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmpnz_rtx,
                                             lab_ref_fin, pc_rtx);
          rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
          JUMP_LABEL (j2) = final_move_label;
          LABEL_NUSES (final_move_label) += 1;
        }

      offset += cmp_bytes;
      bytes_to_compare -= cmp_bytes;
    }

  if (equality_compare_rest)
    {
      /* Update pointers past what has been compared already.  */
      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      if (!REG_P (XEXP (src1, 0)))
        {
          rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
          src1 = replace_equiv_address (src1, src1_reg);
        }
      set_mem_size (src1, cmp_bytes);

      if (!REG_P (XEXP (src2, 0)))
        {
          rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
          src2 = replace_equiv_address (src2, src2_reg);
        }
      set_mem_size (src2, cmp_bytes);

      /* Construct call to strcmp/strncmp to compare the rest of the string.  */
      if (no_length)
        {
          tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
          emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                                   target, LCT_NORMAL, GET_MODE (target),
                                   force_reg (Pmode, XEXP (src1, 0)), Pmode,
                                   force_reg (Pmode, XEXP (src2, 0)), Pmode);
        }
      else
        {
          rtx len_rtx;
          if (TARGET_64BIT)
            len_rtx = gen_reg_rtx (DImode);
          else
            len_rtx = gen_reg_rtx (SImode);

          emit_move_insn (len_rtx, GEN_INT (bytes - compare_length));
          tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
          emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                                   target, LCT_NORMAL, GET_MODE (target),
                                   force_reg (Pmode, XEXP (src1, 0)), Pmode,
                                   force_reg (Pmode, XEXP (src2, 0)), Pmode,
                                   len_rtx, GET_MODE (len_rtx));
        }

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }

  if (cleanup_label)
    emit_label (cleanup_label);

  /* Generate the final sequence that identifies the differing
     byte and generates the final result, taking into account
     zero bytes:

     cmpb cmpb_result1, src1, src2
     cmpb cmpb_result2, src1, zero
     orc cmpb_result1, cmpb_result1, cmpb_result2
     cntlzd get bit of first zero/diff byte
     addi convert for rldcl use
     rldcl rldcl extract diff/zero byte
     subf subtract for final result
  */

  rtx cmpb_diff = gen_reg_rtx (word_mode);
  rtx cmpb_zero = gen_reg_rtx (word_mode);
  rtx rot_amt = gen_reg_rtx (word_mode);
  rtx zero_reg = gen_reg_rtx (word_mode);

  rtx rot1_1 = gen_reg_rtx (word_mode);
  rtx rot1_2 = gen_reg_rtx (word_mode);
  rtx rot2_1 = gen_reg_rtx (word_mode);
  rtx rot2_2 = gen_reg_rtx (word_mode);

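  /* Rotate-amount trick: after the cmpb/complement/ior combination below,
     the first byte where the strings differ (or where src1 has a zero
     byte) is the first 0xff byte, so the count-leading-zeros result is
     8 * (byte index).  Adding 8 and rotating left by that amount brings
     exactly that byte into the low 8 bits of each operand for the final
     subtract.  */
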
  if (word_mode == SImode)
    {
      emit_insn (gen_cmpbsi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2));
      emit_insn (gen_movsi (zero_reg, GEN_INT (0)));
      emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg));
      emit_insn (gen_one_cmplsi2 (cmpb_diff, cmpb_diff));
      emit_insn (gen_iorsi3 (cmpb_diff, cmpb_diff, cmpb_zero));
      emit_insn (gen_clzsi2 (rot_amt, cmpb_diff));
      emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
      emit_insn (gen_rotlsi3 (rot1_1, tmp_reg_src1,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotlsi3 (rot2_1, tmp_reg_src2,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      emit_insn (gen_subsi3 (result_reg, rot1_2, rot2_2));
    }
  else
    {
      emit_insn (gen_cmpbdi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2));
      emit_insn (gen_movdi (zero_reg, GEN_INT (0)));
      emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg));
      emit_insn (gen_one_cmpldi2 (cmpb_diff, cmpb_diff));
      emit_insn (gen_iordi3 (cmpb_diff, cmpb_diff, cmpb_zero));
      emit_insn (gen_clzdi2 (rot_amt, cmpb_diff));
      emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)));
      emit_insn (gen_rotldi3 (rot1_1, tmp_reg_src1,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotldi3 (rot2_1, tmp_reg_src2,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      emit_insn (gen_subdi3 (result_reg, rot1_2, rot2_2));
    }

  emit_label (final_move_label);
  emit_insn (gen_movsi (target,
                        gen_lowpart (SImode, result_reg)));
  emit_label (final_label);
  return true;
}

/* Expand a block move operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the source
   operands[2] is the length
   operands[3] is the alignment */

#define MAX_MOVE_REG 4

int
expand_block_move (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx orig_src = operands[1];
  rtx bytes_rtx = operands[2];
  rtx align_rtx = operands[3];
  int constp = (GET_CODE (bytes_rtx) == CONST_INT);
  int align;
  int bytes;
  int offset;
  int move_bytes;
  rtx stores[MAX_MOVE_REG];
  int num_reg = 0;
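  /* Loads are emitted as soon as they are generated, but the matching
     stores are queued in STORES and flushed in batches of up to
     MAX_MOVE_REG (see the bottom of the loop below), so several loads
     can be scheduled before the stores that consume their results.  */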

  /* If this is not a fixed size move, just call memcpy.  */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment.  */
  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to move? */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  if (bytes > rs6000_block_move_inline_limit)
    return 0;

  for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
    {
      union {
        rtx (*movmemsi) (rtx, rtx, rtx, rtx);
        rtx (*mov) (rtx, rtx);
      } gen_func;
      machine_mode mode = BLKmode;
      rtx src, dest;

      /* Altivec first, since it will be faster than a string move
         when it applies, and usually not significantly larger.  */
      if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
        {
          move_bytes = 16;
          mode = V4SImode;
          gen_func.mov = gen_movv4si;
        }
      else if (TARGET_STRING
               && bytes > 24 /* move up to 32 bytes at a time */
               && ! fixed_regs[5]
               && ! fixed_regs[6]
               && ! fixed_regs[7]
               && ! fixed_regs[8]
               && ! fixed_regs[9]
               && ! fixed_regs[10]
               && ! fixed_regs[11]
               && ! fixed_regs[12])
        {
          move_bytes = (bytes > 32) ? 32 : bytes;
          gen_func.movmemsi = gen_movmemsi_8reg;
        }
      else if (TARGET_STRING
               && bytes > 16 /* move up to 24 bytes at a time */
               && ! fixed_regs[5]
               && ! fixed_regs[6]
               && ! fixed_regs[7]
               && ! fixed_regs[8]
               && ! fixed_regs[9]
               && ! fixed_regs[10])
        {
          move_bytes = (bytes > 24) ? 24 : bytes;
          gen_func.movmemsi = gen_movmemsi_6reg;
        }
      else if (TARGET_STRING
               && bytes > 8 /* move up to 16 bytes at a time */
               && ! fixed_regs[5]
               && ! fixed_regs[6]
               && ! fixed_regs[7]
               && ! fixed_regs[8])
        {
          move_bytes = (bytes > 16) ? 16 : bytes;
          gen_func.movmemsi = gen_movmemsi_4reg;
        }
      else if (bytes >= 8 && TARGET_POWERPC64
               && (align >= 64 || !STRICT_ALIGNMENT))
        {
          move_bytes = 8;
          mode = DImode;
          gen_func.mov = gen_movdi;
          if (offset == 0 && align < 64)
            {
              rtx addr;

              /* If the address form is reg+offset with offset not a
                 multiple of four, reload into reg indirect form here
                 rather than waiting for reload.  This way we get one
                 reload, not one per load and/or store.  */
              addr = XEXP (orig_dest, 0);
              if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
                  && GET_CODE (XEXP (addr, 1)) == CONST_INT
                  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
                {
                  addr = copy_addr_to_reg (addr);
                  orig_dest = replace_equiv_address (orig_dest, addr);
                }
              addr = XEXP (orig_src, 0);
              if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
                  && GET_CODE (XEXP (addr, 1)) == CONST_INT
                  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
                {
                  addr = copy_addr_to_reg (addr);
                  orig_src = replace_equiv_address (orig_src, addr);
                }
            }
        }
      else if (TARGET_STRING && bytes > 4 && !TARGET_POWERPC64)
        { /* move up to 8 bytes at a time */
          move_bytes = (bytes > 8) ? 8 : bytes;
          gen_func.movmemsi = gen_movmemsi_2reg;
        }
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
        { /* move 4 bytes */
          move_bytes = 4;
          mode = SImode;
          gen_func.mov = gen_movsi;
        }
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
        { /* move 2 bytes */
          move_bytes = 2;
          mode = HImode;
          gen_func.mov = gen_movhi;
        }
      else if (TARGET_STRING && bytes > 1)
        { /* move up to 4 bytes at a time */
          move_bytes = (bytes > 4) ? 4 : bytes;
          gen_func.movmemsi = gen_movmemsi_1reg;
        }
      else /* move 1 byte at a time */
        {
          move_bytes = 1;
          mode = QImode;
          gen_func.mov = gen_movqi;
        }

      src = adjust_address (orig_src, mode, offset);
      dest = adjust_address (orig_dest, mode, offset);

      if (mode != BLKmode)
        {
          rtx tmp_reg = gen_reg_rtx (mode);

          emit_insn ((*gen_func.mov) (tmp_reg, src));
          stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
        }

      if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
        {
          int i;
          for (i = 0; i < num_reg; i++)
            emit_insn (stores[i]);
          num_reg = 0;
        }

      if (mode == BLKmode)
        {
          /* Move the address into scratch registers.  The movmemsi
             patterns require zero offset.  */
          if (!REG_P (XEXP (src, 0)))
            {
              rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
              src = replace_equiv_address (src, src_reg);
            }
          set_mem_size (src, move_bytes);

          if (!REG_P (XEXP (dest, 0)))
            {
              rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
              dest = replace_equiv_address (dest, dest_reg);
            }
          set_mem_size (dest, move_bytes);

          emit_insn ((*gen_func.movmemsi) (dest, src,
                                           GEN_INT (move_bytes & 31),
                                           align_rtx));
        }
    }

  return 1;
}


/* Return a string to perform a load_multiple operation.
   operands[0] is the vector.
   operands[1] is the source address.
   operands[2] is the first destination register.  */

const char *
rs6000_output_load_multiple (rtx operands[3])
{
  /* We have to handle the case where the pseudo used to contain the address
     is assigned to one of the output registers.  */
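  /* For example, loading 4 words into r3..r6 with the address in r5: a
     single lswi would clobber r5 before all the words are loaded, so each
     case below sequences the loads so that the word destined for the
     address register is loaded last.  */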
  int i, j;
  int words = XVECLEN (operands[0], 0);
  rtx xop[10];

  if (XVECLEN (operands[0], 0) == 1)
    return "lwz %2,0(%1)";

  for (i = 0; i < words; i++)
    if (refers_to_regno_p (REGNO (operands[2]) + i, operands[1]))
      {
        if (i == words-1)
          {
            xop[0] = GEN_INT (4 * (words-1));
            xop[1] = operands[1];
            xop[2] = operands[2];
            output_asm_insn ("lswi %2,%1,%0\n\tlwz %1,%0(%1)", xop);
            return "";
          }
        else if (i == 0)
          {
            xop[0] = GEN_INT (4 * (words-1));
            xop[1] = operands[1];
            xop[2] = gen_rtx_REG (SImode, REGNO (operands[2]) + 1);
            output_asm_insn ("addi %1,%1,4\n\tlswi %2,%1,%0\n\tlwz %1,-4(%1)", xop);
            return "";
          }
        else
          {
            for (j = 0; j < words; j++)
              if (j != i)
                {
                  xop[0] = GEN_INT (j * 4);
                  xop[1] = operands[1];
                  xop[2] = gen_rtx_REG (SImode, REGNO (operands[2]) + j);
                  output_asm_insn ("lwz %2,%0(%1)", xop);
                }
            xop[0] = GEN_INT (i * 4);
            xop[1] = operands[1];
            output_asm_insn ("lwz %1,%0(%1)", xop);
            return "";
          }
      }

  return "lswi %2,%1,%N0";
}
1466