Mercurial > hg > CbC > CbC_gcc
comparison gcc/config/rs6000/rs6000-string.c @ 111:04ced10e8804
gcc 7
author | kono |
---|---|
date | Fri, 27 Oct 2017 22:46:09 +0900 |
parents | |
children | 84e7813d76e9 |
comparison
equal
deleted
inserted
replaced
68:561a7518be6b | 111:04ced10e8804 |
---|---|
1 /* Subroutines used to expand string and block move, clear, | |
2 compare and other operations for PowerPC. | |
3 Copyright (C) 1991-2017 Free Software Foundation, Inc. | |
4 | |
5 This file is part of GCC. | |
6 | |
7 GCC is free software; you can redistribute it and/or modify it | |
8 under the terms of the GNU General Public License as published | |
9 by the Free Software Foundation; either version 3, or (at your | |
10 option) any later version. | |
11 | |
12 GCC is distributed in the hope that it will be useful, but WITHOUT | |
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public | |
15 License for more details. | |
16 | |
17 You should have received a copy of the GNU General Public License | |
18 along with GCC; see the file COPYING3. If not see | |
19 <http://www.gnu.org/licenses/>. */ | |
20 | |
21 #include "config.h" | |
22 #include "system.h" | |
23 #include "coretypes.h" | |
24 #include "backend.h" | |
25 #include "rtl.h" | |
26 #include "tree.h" | |
27 #include "memmodel.h" | |
28 #include "tm_p.h" | |
29 #include "ira.h" | |
30 #include "print-tree.h" | |
31 #include "varasm.h" | |
32 #include "explow.h" | |
33 #include "expr.h" | |
34 #include "output.h" | |
35 #include "target.h" | |
36 | |
/* Expand a block clear operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the length
   operands[3] is the alignment (operands[2] is not used by this expander).  */

int
expand_block_clear (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx bytes_rtx = operands[1];
  rtx align_rtx = operands[3];
  bool constp = (GET_CODE (bytes_rtx) == CONST_INT);
  HOST_WIDE_INT align;
  HOST_WIDE_INT bytes;
  int offset;
  int clear_bytes;		/* Bytes cleared by the current store.  */
  int clear_step;		/* Widest store size available.  */

  /* If this is not a fixed size clear, just let the caller fall back
     to a library call (memset).  */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment.  Convert to bits.  */
  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to clear?  */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  /* Use the builtin memset after a point, to avoid huge code bloat.
     When optimize_size, avoid any significant code bloat; calling
     memset is about 4 instructions, so allow for one instruction to
     load zero and three to do clearing.  */
  if (TARGET_ALTIVEC && align >= 128)
    clear_step = 16;
  else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
    clear_step = 8;
  else
    clear_step = 4;

  if (optimize_size && bytes > 3 * clear_step)
    return 0;
  if (! optimize_size && bytes > 8 * clear_step)
    return 0;

  /* Emit a sequence of stores of zero, widest mode first, narrowing as
     the remaining byte count shrinks.  */
  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
    {
      machine_mode mode = BLKmode;
      rtx dest;

      if (bytes >= 16 && TARGET_ALTIVEC && align >= 128)
	{
	  clear_bytes = 16;
	  mode = V4SImode;
	}
      else if (bytes >= 8 && TARGET_POWERPC64
	       && (align >= 64 || !STRICT_ALIGNMENT))
	{
	  clear_bytes = 8;
	  mode = DImode;
	  if (offset == 0 && align < 64)
	    {
	      rtx addr;

	      /* If the address form is reg+offset with offset not a
		 multiple of four, reload into reg indirect form here
		 rather than waiting for reload.  This way we get one
		 reload, not one per store.  */
	      addr = XEXP (orig_dest, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_dest = replace_equiv_address (orig_dest, addr);
		}
	    }
	}
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
	{ /* move 4 bytes */
	  clear_bytes = 4;
	  mode = SImode;
	}
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
	{ /* move 2 bytes */
	  clear_bytes = 2;
	  mode = HImode;
	}
      else /* move 1 byte at a time */
	{
	  clear_bytes = 1;
	  mode = QImode;
	}

      dest = adjust_address (orig_dest, mode, offset);

      emit_move_insn (dest, CONST0_RTX (mode));
    }

  return 1;
}
142 | |
/* Figure out the correct instructions to generate to load data for
   block compare.  MODE is used for the read from memory, and
   data is zero extended if REG is wider than MODE.  If LE code
   is being generated, bswap loads are used so the comparison below
   sees the bytes in big-endian (memory) order.

   REG is the destination register to move the data into.
   MEM is the memory block being read.
   MODE is the mode of memory to use for the read.  */
static void
do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
{
  switch (GET_MODE (reg))
    {
    case E_DImode:
      switch (mode)
	{
	case E_QImode:
	  /* A single byte has no endianness; just zero extend.  */
	  emit_insn (gen_zero_extendqidi2 (reg, mem));
	  break;
	case E_HImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		/* Byte swap through a temporary so the zero extend
		   sees big-endian byte order.  */
		src = gen_reg_rtx (HImode);
		emit_insn (gen_bswaphi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendhidi2 (reg, src));
	    break;
	  }
	case E_SImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (SImode);
		emit_insn (gen_bswapsi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendsidi2 (reg, src));
	  }
	  break;
	case E_DImode:
	  /* Full-width load: byte-swapping load on LE, plain move on BE.  */
	  if (!BYTES_BIG_ENDIAN)
	    emit_insn (gen_bswapdi2 (reg, mem));
	  else
	    emit_insn (gen_movdi (reg, mem));
	  break;
	default:
	  gcc_unreachable ();
	}
      break;

    case E_SImode:
      switch (mode)
	{
	case E_QImode:
	  emit_insn (gen_zero_extendqisi2 (reg, mem));
	  break;
	case E_HImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (HImode);
		emit_insn (gen_bswaphi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendhisi2 (reg, src));
	    break;
	  }
	case E_SImode:
	  if (!BYTES_BIG_ENDIAN)
	    emit_insn (gen_bswapsi2 (reg, mem));
	  else
	    emit_insn (gen_movsi (reg, mem));
	  break;
	case E_DImode:
	  /* DImode is larger than the destination reg so is not expected.  */
	  gcc_unreachable ();
	  break;
	default:
	  gcc_unreachable ();
	}
      break;
    default:
      gcc_unreachable ();
      break;
    }
}
231 | |
/* Select the mode to be used for reading the next chunk of bytes
   in the compare.  Returns one of QI/HI/SImode or word_mode.

   OFFSET is the current read offset from the beginning of the block.
   BYTES is the number of bytes remaining to be read.
   ALIGN is the minimum alignment of the memory blocks being compared in bytes.
   WORD_MODE_OK indicates using WORD_MODE is allowed, else SImode is
   the largest allowable mode.

   The order of the clauses below is significant: earlier clauses pick
   exact-fit or whole-word loads; later ones allow overlapping reads or
   shifts to dispose of unwanted bytes.  */
static machine_mode
select_block_compare_mode (unsigned HOST_WIDE_INT offset,
			   unsigned HOST_WIDE_INT bytes,
			   unsigned HOST_WIDE_INT align, bool word_mode_ok)
{
  /* First see if we can do a whole load unit
     as that will be more efficient than a larger load + shift.  */

  /* If big, use biggest chunk.
     If exactly chunk size, use that size.
     If remainder can be done in one piece with shifting, do that.
     Do largest chunk possible without violating alignment rules.  */

  /* The most we can read without potential page crossing.  */
  unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);

  if (word_mode_ok && bytes >= UNITS_PER_WORD)
    return word_mode;
  else if (bytes == GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes == GET_MODE_SIZE (HImode))
    return HImode;
  else if (bytes == GET_MODE_SIZE (QImode))
    return QImode;
  else if (bytes < GET_MODE_SIZE (SImode)
	   && offset >= GET_MODE_SIZE (SImode) - bytes)
    /* This matches the case were we have SImode and 3 bytes
       and offset >= 1 and permits us to move back one and overlap
       with the previous read, thus avoiding having to shift
       unwanted bytes off of the input.  */
    return SImode;
  else if (word_mode_ok && bytes < UNITS_PER_WORD
	   && offset >= UNITS_PER_WORD-bytes)
    /* Similarly, if we can use DImode it will get matched here and
       can do an overlapping read that ends at the end of the block.  */
    return word_mode;
  else if (word_mode_ok && maxread >= UNITS_PER_WORD)
    /* It is safe to do all remaining in one load of largest size,
       possibly with a shift to get rid of unwanted bytes.  */
    return word_mode;
  else if (maxread >= GET_MODE_SIZE (SImode))
    /* It is safe to do all remaining in one SImode load,
       possibly with a shift to get rid of unwanted bytes.  */
    return SImode;
  else if (bytes > GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes > GET_MODE_SIZE (HImode))
    return HImode;

  /* final fallback is do one byte */
  return QImode;
}
292 | |
293 /* Compute the alignment of pointer+OFFSET where the original alignment | |
294 of pointer was BASE_ALIGN. */ | |
295 static unsigned HOST_WIDE_INT | |
296 compute_current_alignment (unsigned HOST_WIDE_INT base_align, | |
297 unsigned HOST_WIDE_INT offset) | |
298 { | |
299 if (offset == 0) | |
300 return base_align; | |
301 return MIN (base_align, offset & -offset); | |
302 } | |
303 | |
/* Expand a block compare operation, and return true if successful.
   Return false if we should let the compiler generate normal code,
   probably a memcmp call.

   OPERANDS[0] is the target (result), always SImode.
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_block_compare (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];
  HOST_WIDE_INT cmp_bytes = 0;	/* Bytes compared by the current chunk.  */
  rtx src1 = orig_src1;
  rtx src2 = orig_src2;

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  /* If this is not a fixed size compare, just call memcmp.  */
  if (!CONST_INT_P (bytes_rtx))
    return false;

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  /* Alignment in bytes of both blocks.  */
  unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.  */
  if (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
      || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2)))
    return false;

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to move?  */
  unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
  if (bytes == 0)
    return true;

  /* The code generated for p7 and older is not faster than glibc
     memcmp if alignment is small and length is not short, so bail
     out to avoid those conditions.  */
  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
      && ((base_align == 1 && bytes > 16)
	  || (base_align == 2 && bytes > 32)))
    return false;

  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
  /* P7/P8 code uses cond for subfc. but P9 uses
     it for cmpld which needs CCUNSmode.  */
  rtx cond;
  if (TARGET_P9_MISC)
    cond = gen_reg_rtx (CCUNSmode);
  else
    cond = gen_reg_rtx (CCmode);

  /* If we have an LE target without ldbrx and word_mode is DImode,
     then we must avoid using word_mode.  */
  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
		       && word_mode == DImode);

  /* Strategy phase.  How many ops will this take and should we expand it?  */

  unsigned HOST_WIDE_INT offset = 0;
  machine_mode load_mode =
    select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
  unsigned int load_mode_size = GET_MODE_SIZE (load_mode);

  /* We don't want to generate too much code.  */
  unsigned HOST_WIDE_INT max_bytes =
    load_mode_size * (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_limit;
  if (!IN_RANGE (bytes, 1, max_bytes))
    return false;

  bool generate_6432_conversion = false;
  rtx convert_label = NULL;
  rtx final_label = NULL;

  /* Example of generated code for 18 bytes aligned 1 byte.
     Compiled with -fno-reorder-blocks for clarity.
	     ldbrx 10,31,8
	     ldbrx 9,7,8
	     subfc. 9,9,10
	     bne 0,.L6487
	     addi 9,12,8
	     addi 5,11,8
	     ldbrx 10,0,9
	     ldbrx 9,0,5
	     subfc. 9,9,10
	     bne 0,.L6487
	     addi 9,12,16
	     lhbrx 10,0,9
	     addi 9,11,16
	     lhbrx 9,0,9
	     subf 9,9,10
	     b .L6488
	     .p2align 4,,15
     .L6487: #convert_label
	     popcntd 9,9
	     subfe 10,10,10
	     or 9,9,10
     .L6488: #final_label
	     extsw 10,9

     We start off with DImode for two blocks that jump to the DI->SI conversion
     if the difference is found there, then a final block of HImode that skips
     the DI->SI conversion.  */

  while (bytes > 0)
    {
      unsigned int align = compute_current_alignment (base_align, offset);
      if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	load_mode = select_block_compare_mode (offset, bytes, align,
					       word_mode_ok);
      else
	load_mode = select_block_compare_mode (0, bytes, align, word_mode_ok);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes >= load_mode_size)
	cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	{
	  /* Move this load back so it doesn't go past the end.
	     P8/P9 can do this efficiently.  */
	  unsigned int extra_bytes = load_mode_size - bytes;
	  cmp_bytes = bytes;
	  if (extra_bytes < offset)
	    {
	      /* Back the offset up so a full load_mode read ends exactly
		 at the end of the block; this makes it the last chunk.  */
	      offset -= extra_bytes;
	      cmp_bytes = load_mode_size;
	      bytes = cmp_bytes;
	    }
	}
      else
	/* P7 and earlier can't do the overlapping load trick fast,
	   so this forces a non-overlapping load and a shift to get
	   rid of the extra bytes.  */
	cmp_bytes = bytes;

      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      if (!REG_P (XEXP (src1, 0)))
	{
	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
	  src1 = replace_equiv_address (src1, src1_reg);
	}
      set_mem_size (src1, cmp_bytes);

      if (!REG_P (XEXP (src2, 0)))
	{
	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
	  src2 = replace_equiv_address (src2, src2_reg);
	}
      set_mem_size (src2, cmp_bytes);

      do_load_for_compare (tmp_reg_src1, src1, load_mode);
      do_load_for_compare (tmp_reg_src2, src2, load_mode);

      if (cmp_bytes < load_mode_size)
	{
	  /* Shift unneeded bytes off.  */
	  rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
	  if (word_mode == DImode)
	    {
	      emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	  else
	    {
	      emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	}

      int remain = bytes - cmp_bytes;
      if (GET_MODE_SIZE (GET_MODE (target)) > GET_MODE_SIZE (load_mode))
	{
	  /* Target is larger than load size so we don't need to
	     reduce result size.  */

	  /* We previously did a block that need 64->32 conversion but
	     the current block does not, so a label is needed to jump
	     to the end.  */
	  if (generate_6432_conversion && !final_label)
	    final_label = gen_label_rtx ();

	  if (remain > 0)
	    {
	      /* This is not the last block, branch to the end if the result
		 of this subtract is not zero.  */
	      if (!final_label)
		final_label = gen_label_rtx ();
	      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
	      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
	      rtx cr = gen_reg_rtx (CCmode);
	      rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
	      emit_insn (gen_movsi (target,
				    gen_lowpart (SImode, tmp_reg_src2)));
	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
						 fin_ref, pc_rtx);
	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      JUMP_LABEL (j) = final_label;
	      LABEL_NUSES (final_label) += 1;
	    }
	  else
	    {
	      /* Last block: just subtract and store the result.  */
	      if (word_mode == DImode)
		{
		  emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
					 tmp_reg_src2));
		  emit_insn (gen_movsi (target,
					gen_lowpart (SImode, tmp_reg_src2)));
		}
	      else
		emit_insn (gen_subsi3 (target, tmp_reg_src1, tmp_reg_src2));

	      if (final_label)
		{
		  rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
		  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
		  JUMP_LABEL(j) = final_label;
		  LABEL_NUSES (final_label) += 1;
		  emit_barrier ();
		}
	    }
	}
      else
	{
	  /* Do we need a 64->32 conversion block?  We need the 64->32
	     conversion even if target size == load_mode size because
	     the subtract generates one extra bit.  */
	  generate_6432_conversion = true;

	  if (remain > 0)
	    {
	      if (!convert_label)
		convert_label = gen_label_rtx ();

	      /* Compare to zero and branch to convert_label if not zero.  */
	      rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
	      if (TARGET_P9_MISC)
		{
		  /* Generate a compare, and convert with a setb later.  */
		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
					     tmp_reg_src2);
		  emit_insn (gen_rtx_SET (cond, cmp));
		}
	      else
		/* Generate a subfc. and use the longer
		   sequence for conversion.  */
		if (TARGET_64BIT)
		  emit_insn (gen_subfdi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
						     tmp_reg_src1, cond));
		else
		  emit_insn (gen_subfsi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
						     tmp_reg_src1, cond));
	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
						 cvt_ref, pc_rtx);
	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      JUMP_LABEL(j) = convert_label;
	      LABEL_NUSES (convert_label) += 1;
	    }
	  else
	    {
	      /* Just do the subtract/compare.  Since this is the last block
		 the convert code will be generated immediately following.  */
	      if (TARGET_P9_MISC)
		{
		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
					     tmp_reg_src2);
		  emit_insn (gen_rtx_SET (cond, cmp));
		}
	      else
		if (TARGET_64BIT)
		  emit_insn (gen_subfdi3_carry (tmp_reg_src2, tmp_reg_src2,
						tmp_reg_src1));
		else
		  emit_insn (gen_subfsi3_carry (tmp_reg_src2, tmp_reg_src2,
						tmp_reg_src1));
	    }
	}

      offset += cmp_bytes;
      bytes -= cmp_bytes;
    }

  if (generate_6432_conversion)
    {
      if (convert_label)
	emit_label (convert_label);

      /* We need to produce DI result from sub, then convert to target SI
	 while maintaining <0 / ==0 / >0 properties.  This sequence works:
	 subfc L,A,B
	 subfe H,H,H
	 popcntd L,L
	 rldimi L,H,6,0

	 This is an alternate one Segher cooked up if somebody
	 wants to expand this for something that doesn't have popcntd:
	 subfc L,a,b
	 subfe H,x,x
	 addic t,L,-1
	 subfe v,t,L
	 or z,v,H

	 And finally, p9 can just do this:
	 cmpld A,B
	 setb r */

      if (TARGET_P9_MISC)
	{
	  emit_insn (gen_setb_unsigned (target, cond));
	}
      else
	{
	  if (TARGET_64BIT)
	    {
	      rtx tmp_reg_ca = gen_reg_rtx (DImode);
	      emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
	      emit_insn (gen_popcntddi2 (tmp_reg_src2, tmp_reg_src2));
	      emit_insn (gen_iordi3 (tmp_reg_src2, tmp_reg_src2, tmp_reg_ca));
	      emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2)));
	    }
	  else
	    {
	      rtx tmp_reg_ca = gen_reg_rtx (SImode);
	      emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
	      emit_insn (gen_popcntdsi2 (tmp_reg_src2, tmp_reg_src2));
	      emit_insn (gen_iorsi3 (target, tmp_reg_src2, tmp_reg_ca));
	    }
	}
    }

  if (final_label)
    emit_label (final_label);

  gcc_assert (bytes == 0);
  return true;
}
658 | |
659 /* Generate alignment check and branch code to set up for | |
660 strncmp when we don't have DI alignment. | |
661 STRNCMP_LABEL is the label to branch if there is a page crossing. | |
662 SRC is the string pointer to be examined. | |
663 BYTES is the max number of bytes to compare. */ | |
664 static void | |
665 expand_strncmp_align_check (rtx strncmp_label, rtx src, HOST_WIDE_INT bytes) | |
666 { | |
667 rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label); | |
668 rtx src_check = copy_addr_to_reg (XEXP (src, 0)); | |
669 if (GET_MODE (src_check) == SImode) | |
670 emit_insn (gen_andsi3 (src_check, src_check, GEN_INT (0xfff))); | |
671 else | |
672 emit_insn (gen_anddi3 (src_check, src_check, GEN_INT (0xfff))); | |
673 rtx cond = gen_reg_rtx (CCmode); | |
674 emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_check, | |
675 GEN_INT (4096 - bytes))); | |
676 | |
677 rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx); | |
678 | |
679 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, | |
680 lab_ref, pc_rtx); | |
681 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); | |
682 JUMP_LABEL (j) = strncmp_label; | |
683 LABEL_NUSES (strncmp_label) += 1; | |
684 } | |
685 | |
686 /* Expand a string compare operation with length, and return | |
687 true if successful. Return false if we should let the | |
688 compiler generate normal code, probably a strncmp call. | |
689 | |
690 OPERANDS[0] is the target (result). | |
691 OPERANDS[1] is the first source. | |
692 OPERANDS[2] is the second source. | |
693 If NO_LENGTH is zero, then: | |
694 OPERANDS[3] is the length. | |
695 OPERANDS[4] is the alignment in bytes. | |
696 If NO_LENGTH is nonzero, then: | |
697 OPERANDS[3] is the alignment in bytes. */ | |
698 bool | |
699 expand_strn_compare (rtx operands[], int no_length) | |
700 { | |
701 rtx target = operands[0]; | |
702 rtx orig_src1 = operands[1]; | |
703 rtx orig_src2 = operands[2]; | |
704 rtx bytes_rtx, align_rtx; | |
705 if (no_length) | |
706 { | |
707 bytes_rtx = NULL; | |
708 align_rtx = operands[3]; | |
709 } | |
710 else | |
711 { | |
712 bytes_rtx = operands[3]; | |
713 align_rtx = operands[4]; | |
714 } | |
715 unsigned HOST_WIDE_INT cmp_bytes = 0; | |
716 rtx src1 = orig_src1; | |
717 rtx src2 = orig_src2; | |
718 | |
719 /* If we have a length, it must be constant. This simplifies things | |
720 a bit as we don't have to generate code to check if we've exceeded | |
721 the length. Later this could be expanded to handle this case. */ | |
722 if (!no_length && !CONST_INT_P (bytes_rtx)) | |
723 return false; | |
724 | |
725 /* This must be a fixed size alignment. */ | |
726 if (!CONST_INT_P (align_rtx)) | |
727 return false; | |
728 | |
729 unsigned int base_align = UINTVAL (align_rtx); | |
730 int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT; | |
731 int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT; | |
732 | |
733 /* targetm.slow_unaligned_access -- don't do unaligned stuff. */ | |
734 if (targetm.slow_unaligned_access (word_mode, align1) | |
735 || targetm.slow_unaligned_access (word_mode, align2)) | |
736 return false; | |
737 | |
738 gcc_assert (GET_MODE (target) == SImode); | |
739 | |
740 /* If we have an LE target without ldbrx and word_mode is DImode, | |
741 then we must avoid using word_mode. */ | |
742 int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX | |
743 && word_mode == DImode); | |
744 | |
745 unsigned int word_mode_size = GET_MODE_SIZE (word_mode); | |
746 | |
747 unsigned HOST_WIDE_INT offset = 0; | |
748 unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available. */ | |
749 unsigned HOST_WIDE_INT compare_length; /* How much to compare inline. */ | |
750 if (no_length) | |
751 /* Use this as a standin to determine the mode to use. */ | |
752 bytes = rs6000_string_compare_inline_limit * word_mode_size; | |
753 else | |
754 bytes = UINTVAL (bytes_rtx); | |
755 | |
756 machine_mode load_mode = | |
757 select_block_compare_mode (offset, bytes, base_align, word_mode_ok); | |
758 unsigned int load_mode_size = GET_MODE_SIZE (load_mode); | |
759 compare_length = rs6000_string_compare_inline_limit * load_mode_size; | |
760 | |
761 /* If we have equality at the end of the last compare and we have not | |
762 found the end of the string, we need to call strcmp/strncmp to | |
763 compare the remainder. */ | |
764 bool equality_compare_rest = false; | |
765 | |
766 if (no_length) | |
767 { | |
768 bytes = compare_length; | |
769 equality_compare_rest = true; | |
770 } | |
771 else | |
772 { | |
773 if (bytes <= compare_length) | |
774 compare_length = bytes; | |
775 else | |
776 equality_compare_rest = true; | |
777 } | |
778 | |
779 rtx result_reg = gen_reg_rtx (word_mode); | |
780 rtx final_move_label = gen_label_rtx (); | |
781 rtx final_label = gen_label_rtx (); | |
782 rtx begin_compare_label = NULL; | |
783 | |
784 if (base_align < 8) | |
785 { | |
786 /* Generate code that checks distance to 4k boundary for this case. */ | |
787 begin_compare_label = gen_label_rtx (); | |
788 rtx strncmp_label = gen_label_rtx (); | |
789 rtx jmp; | |
790 | |
791 /* Strncmp for power8 in glibc does this: | |
792 rldicl r8,r3,0,52 | |
793 cmpldi cr7,r8,4096-16 | |
794 bgt cr7,L(pagecross) */ | |
795 | |
796 /* Make sure that the length we use for the alignment test and | |
797 the subsequent code generation are in agreement so we do not | |
798 go past the length we tested for a 4k boundary crossing. */ | |
799 unsigned HOST_WIDE_INT align_test = compare_length; | |
800 if (align_test < 8) | |
801 { | |
802 align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test); | |
803 base_align = align_test; | |
804 } | |
805 else | |
806 { | |
807 align_test = ROUND_UP (align_test, 8); | |
808 base_align = 8; | |
809 } | |
810 | |
811 if (align1 < 8) | |
812 expand_strncmp_align_check (strncmp_label, src1, align_test); | |
813 if (align2 < 8) | |
814 expand_strncmp_align_check (strncmp_label, src2, align_test); | |
815 | |
816 /* Now generate the following sequence: | |
817 - branch to begin_compare | |
818 - strncmp_label | |
819 - call to strncmp | |
820 - branch to final_label | |
821 - begin_compare_label */ | |
822 | |
823 rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label); | |
824 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref)); | |
825 JUMP_LABEL (jmp) = begin_compare_label; | |
826 LABEL_NUSES (begin_compare_label) += 1; | |
827 emit_barrier (); | |
828 | |
829 emit_label (strncmp_label); | |
830 | |
831 if (!REG_P (XEXP (src1, 0))) | |
832 { | |
833 rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0)); | |
834 src1 = replace_equiv_address (src1, src1_reg); | |
835 } | |
836 | |
837 if (!REG_P (XEXP (src2, 0))) | |
838 { | |
839 rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0)); | |
840 src2 = replace_equiv_address (src2, src2_reg); | |
841 } | |
842 | |
843 if (no_length) | |
844 { | |
845 tree fun = builtin_decl_explicit (BUILT_IN_STRCMP); | |
846 emit_library_call_value (XEXP (DECL_RTL (fun), 0), | |
847 target, LCT_NORMAL, GET_MODE (target), | |
848 force_reg (Pmode, XEXP (src1, 0)), Pmode, | |
849 force_reg (Pmode, XEXP (src2, 0)), Pmode); | |
850 } | |
851 else | |
852 { | |
853 /* -m32 -mpowerpc64 results in word_mode being DImode even | |
854 though otherwise it is 32-bit. The length arg to strncmp | |
855 is a size_t which will be the same size as pointers. */ | |
856 rtx len_rtx; | |
857 if (TARGET_64BIT) | |
858 len_rtx = gen_reg_rtx (DImode); | |
859 else | |
860 len_rtx = gen_reg_rtx (SImode); | |
861 | |
862 emit_move_insn (len_rtx, bytes_rtx); | |
863 | |
864 tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP); | |
865 emit_library_call_value (XEXP (DECL_RTL (fun), 0), | |
866 target, LCT_NORMAL, GET_MODE (target), | |
867 force_reg (Pmode, XEXP (src1, 0)), Pmode, | |
868 force_reg (Pmode, XEXP (src2, 0)), Pmode, | |
869 len_rtx, GET_MODE (len_rtx)); | |
870 } | |
871 | |
872 rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); | |
873 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref)); | |
874 JUMP_LABEL (jmp) = final_label; | |
875 LABEL_NUSES (final_label) += 1; | |
876 emit_barrier (); | |
877 emit_label (begin_compare_label); | |
878 } | |
879 | |
880 rtx cleanup_label = NULL; | |
881 rtx tmp_reg_src1 = gen_reg_rtx (word_mode); | |
882 rtx tmp_reg_src2 = gen_reg_rtx (word_mode); | |
883 | |
884 /* Generate sequence of ld/ldbrx, cmpb to compare out | |
885 to the length specified. */ | |
886 unsigned HOST_WIDE_INT bytes_to_compare = compare_length; | |
887 while (bytes_to_compare > 0) | |
888 { | |
889 /* Compare sequence: | |
890 check each 8B with: ld/ld cmpd bne | |
891 If equal, use rldicr/cmpb to check for zero byte. | |
892 cleanup code at end: | |
893 cmpb get byte that differs | |
894 cmpb look for zero byte | |
895 orc combine | |
896 cntlzd get bit of first zero/diff byte | |
897 subfic convert for rldcl use | |
898 rldcl rldcl extract diff/zero byte | |
899 subf subtract for final result | |
900 | |
901 The last compare can branch around the cleanup code if the | |
902 result is zero because the strings are exactly equal. */ | |
903 unsigned int align = compute_current_alignment (base_align, offset); | |
904 if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED) | |
905 load_mode = select_block_compare_mode (offset, bytes_to_compare, align, | |
906 word_mode_ok); | |
907 else | |
908 load_mode = select_block_compare_mode (0, bytes_to_compare, align, | |
909 word_mode_ok); | |
910 load_mode_size = GET_MODE_SIZE (load_mode); | |
911 if (bytes_to_compare >= load_mode_size) | |
912 cmp_bytes = load_mode_size; | |
913 else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED) | |
914 { | |
915 /* Move this load back so it doesn't go past the end. | |
916 P8/P9 can do this efficiently. */ | |
917 unsigned int extra_bytes = load_mode_size - bytes_to_compare; | |
918 cmp_bytes = bytes_to_compare; | |
919 if (extra_bytes < offset) | |
920 { | |
921 offset -= extra_bytes; | |
922 cmp_bytes = load_mode_size; | |
923 bytes_to_compare = cmp_bytes; | |
924 } | |
925 } | |
926 else | |
927 /* P7 and earlier can't do the overlapping load trick fast, | |
928 so this forces a non-overlapping load and a shift to get | |
929 rid of the extra bytes. */ | |
930 cmp_bytes = bytes_to_compare; | |
931 | |
932 src1 = adjust_address (orig_src1, load_mode, offset); | |
933 src2 = adjust_address (orig_src2, load_mode, offset); | |
934 | |
935 if (!REG_P (XEXP (src1, 0))) | |
936 { | |
937 rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0)); | |
938 src1 = replace_equiv_address (src1, src1_reg); | |
939 } | |
940 set_mem_size (src1, cmp_bytes); | |
941 | |
942 if (!REG_P (XEXP (src2, 0))) | |
943 { | |
944 rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0)); | |
945 src2 = replace_equiv_address (src2, src2_reg); | |
946 } | |
947 set_mem_size (src2, cmp_bytes); | |
948 | |
949 do_load_for_compare (tmp_reg_src1, src1, load_mode); | |
950 do_load_for_compare (tmp_reg_src2, src2, load_mode); | |
951 | |
952 /* We must always left-align the data we read, and | |
953 clear any bytes to the right that are beyond the string. | |
954 Otherwise the cmpb sequence won't produce the correct | |
955 results. The beginning of the compare will be done | |
956 with word_mode so will not have any extra shifts or | |
957 clear rights. */ | |
958 | |
959 if (load_mode_size < word_mode_size) | |
960 { | |
961 /* Rotate left first. */ | |
962 rtx sh = GEN_INT (BITS_PER_UNIT * (word_mode_size - load_mode_size)); | |
963 if (word_mode == DImode) | |
964 { | |
965 emit_insn (gen_rotldi3 (tmp_reg_src1, tmp_reg_src1, sh)); | |
966 emit_insn (gen_rotldi3 (tmp_reg_src2, tmp_reg_src2, sh)); | |
967 } | |
968 else | |
969 { | |
970 emit_insn (gen_rotlsi3 (tmp_reg_src1, tmp_reg_src1, sh)); | |
971 emit_insn (gen_rotlsi3 (tmp_reg_src2, tmp_reg_src2, sh)); | |
972 } | |
973 } | |
974 | |
975 if (cmp_bytes < word_mode_size) | |
976 { | |
977 /* Now clear right. This plus the rotate can be | |
978 turned into a rldicr instruction. */ | |
979 HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes); | |
980 rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb); | |
981 if (word_mode == DImode) | |
982 { | |
983 emit_insn (gen_anddi3_mask (tmp_reg_src1, tmp_reg_src1, mask)); | |
984 emit_insn (gen_anddi3_mask (tmp_reg_src2, tmp_reg_src2, mask)); | |
985 } | |
986 else | |
987 { | |
988 emit_insn (gen_andsi3_mask (tmp_reg_src1, tmp_reg_src1, mask)); | |
989 emit_insn (gen_andsi3_mask (tmp_reg_src2, tmp_reg_src2, mask)); | |
990 } | |
991 } | |
992 | |
993 /* Cases to handle. A and B are chunks of the two strings. | |
994 1: Not end of comparison: | |
995 A != B: branch to cleanup code to compute result. | |
996 A == B: check for 0 byte, next block if not found. | |
997 2: End of the inline comparison: | |
998 A != B: branch to cleanup code to compute result. | |
999 A == B: check for 0 byte, call strcmp/strncmp | |
1000 3: compared requested N bytes: | |
1001 A == B: branch to result 0. | |
1002 A != B: cleanup code to compute result. */ | |
1003 | |
1004 unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes; | |
1005 | |
1006 rtx dst_label; | |
1007 if (remain > 0 || equality_compare_rest) | |
1008 { | |
1009 /* Branch to cleanup code, otherwise fall through to do | |
1010 more compares. */ | |
1011 if (!cleanup_label) | |
1012 cleanup_label = gen_label_rtx (); | |
1013 dst_label = cleanup_label; | |
1014 } | |
1015 else | |
1016 /* Branch to end and produce result of 0. */ | |
1017 dst_label = final_move_label; | |
1018 | |
1019 rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label); | |
1020 rtx cond = gen_reg_rtx (CCmode); | |
1021 | |
1022 /* Always produce the 0 result, it is needed if | |
1023 cmpb finds a 0 byte in this chunk. */ | |
1024 rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2); | |
1025 rs6000_emit_dot_insn (result_reg, tmp, 1, cond); | |
1026 | |
1027 rtx cmp_rtx; | |
1028 if (remain == 0 && !equality_compare_rest) | |
1029 cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx); | |
1030 else | |
1031 cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx); | |
1032 | |
1033 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, | |
1034 lab_ref, pc_rtx); | |
1035 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); | |
1036 JUMP_LABEL (j) = dst_label; | |
1037 LABEL_NUSES (dst_label) += 1; | |
1038 | |
1039 if (remain > 0 || equality_compare_rest) | |
1040 { | |
1041 /* Generate a cmpb to test for a 0 byte and branch | |
1042 to final result if found. */ | |
1043 rtx cmpb_zero = gen_reg_rtx (word_mode); | |
1044 rtx lab_ref_fin = gen_rtx_LABEL_REF (VOIDmode, final_move_label); | |
1045 rtx condz = gen_reg_rtx (CCmode); | |
1046 rtx zero_reg = gen_reg_rtx (word_mode); | |
1047 if (word_mode == SImode) | |
1048 { | |
1049 emit_insn (gen_movsi (zero_reg, GEN_INT (0))); | |
1050 emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg)); | |
1051 if (cmp_bytes < word_mode_size) | |
1052 { | |
1053 /* Don't want to look at zero bytes past end. */ | |
1054 HOST_WIDE_INT mb = | |
1055 BITS_PER_UNIT * (word_mode_size - cmp_bytes); | |
1056 rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb); | |
1057 emit_insn (gen_andsi3_mask (cmpb_zero, cmpb_zero, mask)); | |
1058 } | |
1059 } | |
1060 else | |
1061 { | |
1062 emit_insn (gen_movdi (zero_reg, GEN_INT (0))); | |
1063 emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg)); | |
1064 if (cmp_bytes < word_mode_size) | |
1065 { | |
1066 /* Don't want to look at zero bytes past end. */ | |
1067 HOST_WIDE_INT mb = | |
1068 BITS_PER_UNIT * (word_mode_size - cmp_bytes); | |
1069 rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb); | |
1070 emit_insn (gen_anddi3_mask (cmpb_zero, cmpb_zero, mask)); | |
1071 } | |
1072 } | |
1073 | |
1074 emit_move_insn (condz, gen_rtx_COMPARE (CCmode, cmpb_zero, zero_reg)); | |
1075 rtx cmpnz_rtx = gen_rtx_NE (VOIDmode, condz, const0_rtx); | |
1076 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmpnz_rtx, | |
1077 lab_ref_fin, pc_rtx); | |
1078 rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); | |
1079 JUMP_LABEL (j2) = final_move_label; | |
1080 LABEL_NUSES (final_move_label) += 1; | |
1081 | |
1082 } | |
1083 | |
1084 offset += cmp_bytes; | |
1085 bytes_to_compare -= cmp_bytes; | |
1086 } | |
1087 | |
1088 if (equality_compare_rest) | |
1089 { | |
1090 /* Update pointers past what has been compared already. */ | |
1091 src1 = adjust_address (orig_src1, load_mode, offset); | |
1092 src2 = adjust_address (orig_src2, load_mode, offset); | |
1093 | |
1094 if (!REG_P (XEXP (src1, 0))) | |
1095 { | |
1096 rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0)); | |
1097 src1 = replace_equiv_address (src1, src1_reg); | |
1098 } | |
1099 set_mem_size (src1, cmp_bytes); | |
1100 | |
1101 if (!REG_P (XEXP (src2, 0))) | |
1102 { | |
1103 rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0)); | |
1104 src2 = replace_equiv_address (src2, src2_reg); | |
1105 } | |
1106 set_mem_size (src2, cmp_bytes); | |
1107 | |
1108 /* Construct call to strcmp/strncmp to compare the rest of the string. */ | |
1109 if (no_length) | |
1110 { | |
1111 tree fun = builtin_decl_explicit (BUILT_IN_STRCMP); | |
1112 emit_library_call_value (XEXP (DECL_RTL (fun), 0), | |
1113 target, LCT_NORMAL, GET_MODE (target), | |
1114 force_reg (Pmode, XEXP (src1, 0)), Pmode, | |
1115 force_reg (Pmode, XEXP (src2, 0)), Pmode); | |
1116 } | |
1117 else | |
1118 { | |
1119 rtx len_rtx; | |
1120 if (TARGET_64BIT) | |
1121 len_rtx = gen_reg_rtx (DImode); | |
1122 else | |
1123 len_rtx = gen_reg_rtx (SImode); | |
1124 | |
1125 emit_move_insn (len_rtx, GEN_INT (bytes - compare_length)); | |
1126 tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP); | |
1127 emit_library_call_value (XEXP (DECL_RTL (fun), 0), | |
1128 target, LCT_NORMAL, GET_MODE (target), | |
1129 force_reg (Pmode, XEXP (src1, 0)), Pmode, | |
1130 force_reg (Pmode, XEXP (src2, 0)), Pmode, | |
1131 len_rtx, GET_MODE (len_rtx)); | |
1132 } | |
1133 | |
1134 rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); | |
1135 rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref)); | |
1136 JUMP_LABEL (jmp) = final_label; | |
1137 LABEL_NUSES (final_label) += 1; | |
1138 emit_barrier (); | |
1139 } | |
1140 | |
1141 if (cleanup_label) | |
1142 emit_label (cleanup_label); | |
1143 | |
1144 /* Generate the final sequence that identifies the differing | |
1145 byte and generates the final result, taking into account | |
1146 zero bytes: | |
1147 | |
1148 cmpb cmpb_result1, src1, src2 | |
1149 cmpb cmpb_result2, src1, zero | |
1150 orc cmpb_result1, cmpb_result1, cmpb_result2 | |
1151 cntlzd get bit of first zero/diff byte | |
1152 addi convert for rldcl use | |
1153 rldcl rldcl extract diff/zero byte | |
1154 subf subtract for final result | |
1155 */ | |
1156 | |
1157 rtx cmpb_diff = gen_reg_rtx (word_mode); | |
1158 rtx cmpb_zero = gen_reg_rtx (word_mode); | |
1159 rtx rot_amt = gen_reg_rtx (word_mode); | |
1160 rtx zero_reg = gen_reg_rtx (word_mode); | |
1161 | |
1162 rtx rot1_1 = gen_reg_rtx (word_mode); | |
1163 rtx rot1_2 = gen_reg_rtx (word_mode); | |
1164 rtx rot2_1 = gen_reg_rtx (word_mode); | |
1165 rtx rot2_2 = gen_reg_rtx (word_mode); | |
1166 | |
1167 if (word_mode == SImode) | |
1168 { | |
1169 emit_insn (gen_cmpbsi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2)); | |
1170 emit_insn (gen_movsi (zero_reg, GEN_INT (0))); | |
1171 emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg)); | |
1172 emit_insn (gen_one_cmplsi2 (cmpb_diff,cmpb_diff)); | |
1173 emit_insn (gen_iorsi3 (cmpb_diff, cmpb_diff, cmpb_zero)); | |
1174 emit_insn (gen_clzsi2 (rot_amt, cmpb_diff)); | |
1175 emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8))); | |
1176 emit_insn (gen_rotlsi3 (rot1_1, tmp_reg_src1, | |
1177 gen_lowpart (SImode, rot_amt))); | |
1178 emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff))); | |
1179 emit_insn (gen_rotlsi3 (rot2_1, tmp_reg_src2, | |
1180 gen_lowpart (SImode, rot_amt))); | |
1181 emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff))); | |
1182 emit_insn (gen_subsi3 (result_reg, rot1_2, rot2_2)); | |
1183 } | |
1184 else | |
1185 { | |
1186 emit_insn (gen_cmpbdi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2)); | |
1187 emit_insn (gen_movdi (zero_reg, GEN_INT (0))); | |
1188 emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg)); | |
1189 emit_insn (gen_one_cmpldi2 (cmpb_diff,cmpb_diff)); | |
1190 emit_insn (gen_iordi3 (cmpb_diff, cmpb_diff, cmpb_zero)); | |
1191 emit_insn (gen_clzdi2 (rot_amt, cmpb_diff)); | |
1192 emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8))); | |
1193 emit_insn (gen_rotldi3 (rot1_1, tmp_reg_src1, | |
1194 gen_lowpart (SImode, rot_amt))); | |
1195 emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff))); | |
1196 emit_insn (gen_rotldi3 (rot2_1, tmp_reg_src2, | |
1197 gen_lowpart (SImode, rot_amt))); | |
1198 emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff))); | |
1199 emit_insn (gen_subdi3 (result_reg, rot1_2, rot2_2)); | |
1200 } | |
1201 | |
1202 emit_label (final_move_label); | |
1203 emit_insn (gen_movsi (target, | |
1204 gen_lowpart (SImode, result_reg))); | |
1205 emit_label (final_label); | |
1206 return true; | |
1207 } | |
1208 | |
/* Expand a block move operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the source
   operands[2] is the length
   operands[3] is the alignment */

/* Maximum number of register moves buffered before the queued stores
   are flushed.  All loads of a batch are emitted before their stores,
   so a partial overlap between source and destination within one batch
   still reads the original bytes.  */
#define MAX_MOVE_REG 4

int
expand_block_move (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx orig_src	= operands[1];
  rtx bytes_rtx	= operands[2];
  rtx align_rtx = operands[3];
  int constp	= (GET_CODE (bytes_rtx) == CONST_INT);
  int align;			/* Alignment, converted below to bits.  */
  int bytes;			/* Bytes remaining to be moved.  */
  int offset;			/* Byte offset of the current chunk.  */
  int move_bytes;		/* Bytes handled by the current chunk.  */
  rtx stores[MAX_MOVE_REG];	/* Pending store insns, emitted in batches.  */
  int num_reg = 0;

  /* If this is not a fixed size move, just call memcpy */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment */
  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to move? */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  /* Above this size, an out-of-line memcpy call is assumed cheaper
     than the inline expansion.  */
  if (bytes > rs6000_block_move_inline_limit)
    return 0;

  for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
    {
      /* Each chunk is copied either through a temporary register
	 (gen_func.mov, mode != BLKmode) or by a multi-register string
	 pattern (gen_func.movmemsi, mode == BLKmode).  The union holds
	 whichever generator the selected strategy needs.  */
      union {
	rtx (*movmemsi) (rtx, rtx, rtx, rtx);
	rtx (*mov) (rtx, rtx);
      } gen_func;
      machine_mode mode = BLKmode;
      rtx src, dest;

      /* Altivec first, since it will be faster than a string move
	 when it applies, and usually not significantly larger.  */
      if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
	{
	  move_bytes = 16;
	  mode = V4SImode;
	  gen_func.mov = gen_movv4si;
	}
      /* The movmemsi_Nreg patterns use strings of call-clobbered GPRs
	 starting at r5; they can only be used when none of those
	 registers is fixed.  */
      else if (TARGET_STRING
	  && bytes > 24		/* move up to 32 bytes at a time */
	  && ! fixed_regs[5]
	  && ! fixed_regs[6]
	  && ! fixed_regs[7]
	  && ! fixed_regs[8]
	  && ! fixed_regs[9]
	  && ! fixed_regs[10]
	  && ! fixed_regs[11]
	  && ! fixed_regs[12])
	{
	  move_bytes = (bytes > 32) ? 32 : bytes;
	  gen_func.movmemsi = gen_movmemsi_8reg;
	}
      else if (TARGET_STRING
	       && bytes > 16	/* move up to 24 bytes at a time */
	       && ! fixed_regs[5]
	       && ! fixed_regs[6]
	       && ! fixed_regs[7]
	       && ! fixed_regs[8]
	       && ! fixed_regs[9]
	       && ! fixed_regs[10])
	{
	  move_bytes = (bytes > 24) ? 24 : bytes;
	  gen_func.movmemsi = gen_movmemsi_6reg;
	}
      else if (TARGET_STRING
	       && bytes > 8	/* move up to 16 bytes at a time */
	       && ! fixed_regs[5]
	       && ! fixed_regs[6]
	       && ! fixed_regs[7]
	       && ! fixed_regs[8])
	{
	  move_bytes = (bytes > 16) ? 16 : bytes;
	  gen_func.movmemsi = gen_movmemsi_4reg;
	}
      else if (bytes >= 8 && TARGET_POWERPC64
	       && (align >= 64 || !STRICT_ALIGNMENT))
	{
	  move_bytes = 8;
	  mode = DImode;
	  gen_func.mov = gen_movdi;
	  if (offset == 0 && align < 64)
	    {
	      rtx addr;

	      /* If the address form is reg+offset with offset not a
		 multiple of four, reload into reg indirect form here
		 rather than waiting for reload.  This way we get one
		 reload, not one per load and/or store.  */
	      addr = XEXP (orig_dest, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_dest = replace_equiv_address (orig_dest, addr);
		}
	      addr = XEXP (orig_src, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_src = replace_equiv_address (orig_src, addr);
		}
	    }
	}
      else if (TARGET_STRING && bytes > 4 && !TARGET_POWERPC64)
	{			/* move up to 8 bytes at a time */
	  move_bytes = (bytes > 8) ? 8 : bytes;
	  gen_func.movmemsi = gen_movmemsi_2reg;
	}
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
	{			/* move 4 bytes */
	  move_bytes = 4;
	  mode = SImode;
	  gen_func.mov = gen_movsi;
	}
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
	{			/* move 2 bytes */
	  move_bytes = 2;
	  mode = HImode;
	  gen_func.mov = gen_movhi;
	}
      else if (TARGET_STRING && bytes > 1)
	{			/* move up to 4 bytes at a time */
	  move_bytes = (bytes > 4) ? 4 : bytes;
	  gen_func.movmemsi = gen_movmemsi_1reg;
	}
      else /* move 1 byte at a time */
	{
	  move_bytes = 1;
	  mode = QImode;
	  gen_func.mov = gen_movqi;
	}

      src = adjust_address (orig_src, mode, offset);
      dest = adjust_address (orig_dest, mode, offset);

      if (mode != BLKmode)
	{
	  /* Register-sized chunk: emit the load now, queue the store
	     so batched loads precede their stores (see MAX_MOVE_REG).  */
	  rtx tmp_reg = gen_reg_rtx (mode);

	  emit_insn ((*gen_func.mov) (tmp_reg, src));
	  stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
	}

      /* Flush queued stores when the buffer is full, at the final
	 chunk, or before a BLKmode string move.  */
      if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
	{
	  int i;
	  for (i = 0; i < num_reg; i++)
	    emit_insn (stores[i]);
	  num_reg = 0;
	}

      if (mode == BLKmode)
	{
	  /* Move the address into scratch registers.  The movmemsi
	     patterns require zero offset.  */
	  if (!REG_P (XEXP (src, 0)))
	    {
	      rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
	      src = replace_equiv_address (src, src_reg);
	    }
	  set_mem_size (src, move_bytes);

	  if (!REG_P (XEXP (dest, 0)))
	    {
	      rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
	      dest = replace_equiv_address (dest, dest_reg);
	    }
	  set_mem_size (dest, move_bytes);

	  /* NOTE(review): move_bytes & 31 wraps 32 to 0 — presumably the
	     string-move patterns encode a full 32-byte move as length 0;
	     confirm against the movmemsi_8reg insn pattern.  */
	  emit_insn ((*gen_func.movmemsi) (dest, src,
					   GEN_INT (move_bytes & 31),
					   align_rtx));
	}
    }

  return 1;
}
1409 | |
1410 | |
1411 /* Return a string to perform a load_multiple operation. | |
1412 operands[0] is the vector. | |
1413 operands[1] is the source address. | |
1414 operands[2] is the first destination register. */ | |
1415 | |
1416 const char * | |
1417 rs6000_output_load_multiple (rtx operands[3]) | |
1418 { | |
1419 /* We have to handle the case where the pseudo used to contain the address | |
1420 is assigned to one of the output registers. */ | |
1421 int i, j; | |
1422 int words = XVECLEN (operands[0], 0); | |
1423 rtx xop[10]; | |
1424 | |
1425 if (XVECLEN (operands[0], 0) == 1) | |
1426 return "lwz %2,0(%1)"; | |
1427 | |
1428 for (i = 0; i < words; i++) | |
1429 if (refers_to_regno_p (REGNO (operands[2]) + i, operands[1])) | |
1430 { | |
1431 if (i == words-1) | |
1432 { | |
1433 xop[0] = GEN_INT (4 * (words-1)); | |
1434 xop[1] = operands[1]; | |
1435 xop[2] = operands[2]; | |
1436 output_asm_insn ("lswi %2,%1,%0\n\tlwz %1,%0(%1)", xop); | |
1437 return ""; | |
1438 } | |
1439 else if (i == 0) | |
1440 { | |
1441 xop[0] = GEN_INT (4 * (words-1)); | |
1442 xop[1] = operands[1]; | |
1443 xop[2] = gen_rtx_REG (SImode, REGNO (operands[2]) + 1); | |
1444 output_asm_insn ("addi %1,%1,4\n\tlswi %2,%1,%0\n\tlwz %1,-4(%1)", xop); | |
1445 return ""; | |
1446 } | |
1447 else | |
1448 { | |
1449 for (j = 0; j < words; j++) | |
1450 if (j != i) | |
1451 { | |
1452 xop[0] = GEN_INT (j * 4); | |
1453 xop[1] = operands[1]; | |
1454 xop[2] = gen_rtx_REG (SImode, REGNO (operands[2]) + j); | |
1455 output_asm_insn ("lwz %2,%0(%1)", xop); | |
1456 } | |
1457 xop[0] = GEN_INT (i * 4); | |
1458 xop[1] = operands[1]; | |
1459 output_asm_insn ("lwz %1,%0(%1)", xop); | |
1460 return ""; | |
1461 } | |
1462 } | |
1463 | |
1464 return "lswi %2,%1,%N0"; | |
1465 } | |
1466 |