111
|
1 /* Subroutines used to expand string and block move, clear,
|
|
2 compare and other operations for PowerPC.
|
|
3 Copyright (C) 1991-2017 Free Software Foundation, Inc.
|
|
4
|
|
5 This file is part of GCC.
|
|
6
|
|
7 GCC is free software; you can redistribute it and/or modify it
|
|
8 under the terms of the GNU General Public License as published
|
|
9 by the Free Software Foundation; either version 3, or (at your
|
|
10 option) any later version.
|
|
11
|
|
12 GCC is distributed in the hope that it will be useful, but WITHOUT
|
|
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
|
|
15 License for more details.
|
|
16
|
|
17 You should have received a copy of the GNU General Public License
|
|
18 along with GCC; see the file COPYING3. If not see
|
|
19 <http://www.gnu.org/licenses/>. */
|
|
20
|
|
21 #include "config.h"
|
|
22 #include "system.h"
|
|
23 #include "coretypes.h"
|
|
24 #include "backend.h"
|
|
25 #include "rtl.h"
|
|
26 #include "tree.h"
|
|
27 #include "memmodel.h"
|
|
28 #include "tm_p.h"
|
|
29 #include "ira.h"
|
|
30 #include "print-tree.h"
|
|
31 #include "varasm.h"
|
|
32 #include "explow.h"
|
|
33 #include "expr.h"
|
|
34 #include "output.h"
|
|
35 #include "target.h"
|
|
36
|
|
/* Expand a block clear operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the length
   operands[3] is the alignment */

int
expand_block_clear (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx bytes_rtx = operands[1];
  rtx align_rtx = operands[3];
  bool constp	= (GET_CODE (bytes_rtx) == CONST_INT);
  HOST_WIDE_INT align;
  HOST_WIDE_INT bytes;
  int offset;
  int clear_bytes;		/* Size of the chunk cleared this iteration.  */
  int clear_step;		/* Widest store unit the target supports here.  */

  /* If this is not a fixed size move, just call memcpy */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment  */
  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to clear? */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  /* Use the builtin memset after a point, to avoid huge code bloat.
     When optimize_size, avoid any significant code bloat; calling
     memset is about 4 instructions, so allow for one instruction to
     load zero and three to do clearing.  */
  if (TARGET_ALTIVEC && align >= 128)
    clear_step = 16;
  else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
    clear_step = 8;
  else
    clear_step = 4;

  /* Inline-expansion size caps; beyond these, a memset call is smaller.  */
  if (optimize_size && bytes > 3 * clear_step)
    return 0;
  if (! optimize_size && bytes > 8 * clear_step)
    return 0;

  /* Emit one zero store per iteration, using the widest mode that the
     remaining length and alignment allow.  */
  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
    {
      machine_mode mode = BLKmode;
      rtx dest;

      if (bytes >= 16 && TARGET_ALTIVEC && align >= 128)
	{
	  clear_bytes = 16;
	  mode = V4SImode;
	}
      else if (bytes >= 8 && TARGET_POWERPC64
	       && (align >= 64 || !STRICT_ALIGNMENT))
	{
	  clear_bytes = 8;
	  mode = DImode;
	  if (offset == 0 && align < 64)
	    {
	      rtx addr;

	      /* If the address form is reg+offset with offset not a
		 multiple of four, reload into reg indirect form here
		 rather than waiting for reload.  This way we get one
		 reload, not one per store.  */
	      addr = XEXP (orig_dest, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_dest = replace_equiv_address (orig_dest, addr);
		}
	    }
	}
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
	{			/* move 4 bytes */
	  clear_bytes = 4;
	  mode = SImode;
	}
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
	{			/* move 2 bytes */
	  clear_bytes = 2;
	  mode = HImode;
	}
      else /* move 1 byte at a time */
	{
	  clear_bytes = 1;
	  mode = QImode;
	}

      dest = adjust_address (orig_dest, mode, offset);

      /* Store a zero of the chosen width.  */
      emit_move_insn (dest, CONST0_RTX (mode));
    }

  return 1;
}
|
|
142
|
|
/* Figure out the correct instructions to generate to load data for
   block compare.  MODE is used for the read from memory, and
   data is zero extended if REG is wider than MODE.  If LE code
   is being generated, bswap loads are used.

   REG is the destination register to move the data into.
   MEM is the memory block being read.
   MODE is the mode of memory to use for the read.  */
static void
do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
{
  /* Outer switch: width of the destination register.
     Inner switch: width of the memory read; REG is zero-extended when
     it is wider than MODE.  On little-endian, a byte-swapping load is
     used so register contents compare in big-endian byte order.  */
  switch (GET_MODE (reg))
    {
    case E_DImode:
      switch (mode)
	{
	case E_QImode:
	  /* Single byte: no swap needed regardless of endianness.  */
	  emit_insn (gen_zero_extendqidi2 (reg, mem));
	  break;
	case E_HImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		/* Swap into a temporary, then zero-extend to DImode.  */
		src = gen_reg_rtx (HImode);
		emit_insn (gen_bswaphi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendhidi2 (reg, src));
	    break;
	  }
	case E_SImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (SImode);
		emit_insn (gen_bswapsi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendsidi2 (reg, src));
	  }
	  break;
	case E_DImode:
	  /* Full-width read: plain move, or bswap load on LE.  */
	  if (!BYTES_BIG_ENDIAN)
	    emit_insn (gen_bswapdi2 (reg, mem));
	  else
	    emit_insn (gen_movdi (reg, mem));
	  break;
	default:
	  gcc_unreachable ();
	}
      break;

    case E_SImode:
      switch (mode)
	{
	case E_QImode:
	  emit_insn (gen_zero_extendqisi2 (reg, mem));
	  break;
	case E_HImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (HImode);
		emit_insn (gen_bswaphi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendhisi2 (reg, src));
	    break;
	  }
	case E_SImode:
	  if (!BYTES_BIG_ENDIAN)
	    emit_insn (gen_bswapsi2 (reg, mem));
	  else
	    emit_insn (gen_movsi (reg, mem));
	  break;
	case E_DImode:
	  /* DImode is larger than the destination reg so is not expected.  */
	  gcc_unreachable ();
	  break;
	default:
	  gcc_unreachable ();
	}
      break;
    default:
      gcc_unreachable ();
      break;
    }
}
|
|
231
|
|
232 /* Select the mode to be used for reading the next chunk of bytes
|
|
233 in the compare.
|
|
234
|
|
235 OFFSET is the current read offset from the beginning of the block.
|
|
236 BYTES is the number of bytes remaining to be read.
|
|
237 ALIGN is the minimum alignment of the memory blocks being compared in bytes.
|
|
238 WORD_MODE_OK indicates using WORD_MODE is allowed, else SImode is
|
|
239 the largest allowable mode. */
|
|
240 static machine_mode
|
|
241 select_block_compare_mode (unsigned HOST_WIDE_INT offset,
|
|
242 unsigned HOST_WIDE_INT bytes,
|
|
243 unsigned HOST_WIDE_INT align, bool word_mode_ok)
|
|
244 {
|
|
245 /* First see if we can do a whole load unit
|
|
246 as that will be more efficient than a larger load + shift. */
|
|
247
|
|
248 /* If big, use biggest chunk.
|
|
249 If exactly chunk size, use that size.
|
|
250 If remainder can be done in one piece with shifting, do that.
|
|
251 Do largest chunk possible without violating alignment rules. */
|
|
252
|
|
253 /* The most we can read without potential page crossing. */
|
|
254 unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);
|
|
255
|
|
256 if (word_mode_ok && bytes >= UNITS_PER_WORD)
|
|
257 return word_mode;
|
|
258 else if (bytes == GET_MODE_SIZE (SImode))
|
|
259 return SImode;
|
|
260 else if (bytes == GET_MODE_SIZE (HImode))
|
|
261 return HImode;
|
|
262 else if (bytes == GET_MODE_SIZE (QImode))
|
|
263 return QImode;
|
|
264 else if (bytes < GET_MODE_SIZE (SImode)
|
|
265 && offset >= GET_MODE_SIZE (SImode) - bytes)
|
|
266 /* This matches the case were we have SImode and 3 bytes
|
|
267 and offset >= 1 and permits us to move back one and overlap
|
|
268 with the previous read, thus avoiding having to shift
|
|
269 unwanted bytes off of the input. */
|
|
270 return SImode;
|
|
271 else if (word_mode_ok && bytes < UNITS_PER_WORD
|
|
272 && offset >= UNITS_PER_WORD-bytes)
|
|
273 /* Similarly, if we can use DImode it will get matched here and
|
|
274 can do an overlapping read that ends at the end of the block. */
|
|
275 return word_mode;
|
|
276 else if (word_mode_ok && maxread >= UNITS_PER_WORD)
|
|
277 /* It is safe to do all remaining in one load of largest size,
|
|
278 possibly with a shift to get rid of unwanted bytes. */
|
|
279 return word_mode;
|
|
280 else if (maxread >= GET_MODE_SIZE (SImode))
|
|
281 /* It is safe to do all remaining in one SImode load,
|
|
282 possibly with a shift to get rid of unwanted bytes. */
|
|
283 return SImode;
|
|
284 else if (bytes > GET_MODE_SIZE (SImode))
|
|
285 return SImode;
|
|
286 else if (bytes > GET_MODE_SIZE (HImode))
|
|
287 return HImode;
|
|
288
|
|
289 /* final fallback is do one byte */
|
|
290 return QImode;
|
|
291 }
|
|
292
|
|
293 /* Compute the alignment of pointer+OFFSET where the original alignment
|
|
294 of pointer was BASE_ALIGN. */
|
|
295 static unsigned HOST_WIDE_INT
|
|
296 compute_current_alignment (unsigned HOST_WIDE_INT base_align,
|
|
297 unsigned HOST_WIDE_INT offset)
|
|
298 {
|
|
299 if (offset == 0)
|
|
300 return base_align;
|
|
301 return MIN (base_align, offset & -offset);
|
|
302 }
|
|
303
|
|
/* Expand a block compare operation, and return true if successful.
   Return false if we should let the compiler generate normal code,
   probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_block_compare (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];
  HOST_WIDE_INT cmp_bytes = 0;	/* Bytes compared by the current chunk.  */
  rtx src1 = orig_src1;
  rtx src2 = orig_src2;

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  /* If this is not a fixed size compare, just call memcmp.  */
  if (!CONST_INT_P (bytes_rtx))
    return false;

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  /* Alignment of both sources, converted from bits to bytes.  */
  unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.  */
  if (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
      || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2)))
    return false;

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to move?  */
  unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
  if (bytes == 0)
    return true;

  /* The code generated for p7 and older is not faster than glibc
     memcmp if alignment is small and length is not short, so bail
     out to avoid those conditions.  */
  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
      && ((base_align == 1 && bytes > 16)
	  || (base_align == 2 && bytes > 32)))
    return false;

  /* Word-sized temporaries holding the chunks read from each source.  */
  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
  /* P7/P8 code uses cond for subfc. but P9 uses
     it for cmpld which needs CCUNSmode.  */
  rtx cond;
  if (TARGET_P9_MISC)
    cond = gen_reg_rtx (CCUNSmode);
  else
    cond = gen_reg_rtx (CCmode);

  /* If we have an LE target without ldbrx and word_mode is DImode,
     then we must avoid using word_mode.  */
  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
		       && word_mode == DImode);

  /* Strategy phase.  How many ops will this take and should we expand it?  */

  unsigned HOST_WIDE_INT offset = 0;
  machine_mode load_mode =
    select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
  unsigned int load_mode_size = GET_MODE_SIZE (load_mode);

  /* We don't want to generate too much code.  */
  unsigned HOST_WIDE_INT max_bytes =
    load_mode_size * (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_limit;
  if (!IN_RANGE (bytes, 1, max_bytes))
    return false;

  /* Set when at least one chunk produced a word-mode difference that must
     be narrowed to the SImode result while preserving its sign.  */
  bool generate_6432_conversion = false;
  rtx convert_label = NULL;	/* Jump target for the 64->32 fixup code.  */
  rtx final_label = NULL;	/* Jump target past the fixup code.  */

  /* Example of generated code for 18 bytes aligned 1 byte.
     Compiled with -fno-reorder-blocks for clarity.
     ldbrx 10,31,8
     ldbrx 9,7,8
     subfc. 9,9,10
     bne 0,.L6487
     addi 9,12,8
     addi 5,11,8
     ldbrx 10,0,9
     ldbrx 9,0,5
     subfc. 9,9,10
     bne 0,.L6487
     addi 9,12,16
     lhbrx 10,0,9
     addi 9,11,16
     lhbrx 9,0,9
     subf 9,9,10
     b .L6488
     .p2align 4,,15
     .L6487: #convert_label
     popcntd 9,9
     subfe 10,10,10
     or 9,9,10
     .L6488: #final_label
     extsw 10,9

     We start off with DImode for two blocks that jump to the DI->SI conversion
     if the difference is found there, then a final block of HImode that skips
     the DI->SI conversion.  */

  while (bytes > 0)
    {
      unsigned int align = compute_current_alignment (base_align, offset);
      /* Only P8 and later can cheaply do overlapping unaligned loads, so
	 only then may the mode selection step the read backwards.  */
      if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	load_mode = select_block_compare_mode (offset, bytes, align,
					       word_mode_ok);
      else
	load_mode = select_block_compare_mode (0, bytes, align, word_mode_ok);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes >= load_mode_size)
	cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	{
	  /* Move this load back so it doesn't go past the end.
	     P8/P9 can do this efficiently.  */
	  unsigned int extra_bytes = load_mode_size - bytes;
	  cmp_bytes = bytes;
	  if (extra_bytes < offset)
	    {
	      offset -= extra_bytes;
	      cmp_bytes = load_mode_size;
	      bytes = cmp_bytes;
	    }
	}
      else
	/* P7 and earlier can't do the overlapping load trick fast,
	   so this forces a non-overlapping load and a shift to get
	   rid of the extra bytes.  */
	cmp_bytes = bytes;

      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      /* Force reg-indirect addressing so the loads are valid.  */
      if (!REG_P (XEXP (src1, 0)))
	{
	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
	  src1 = replace_equiv_address (src1, src1_reg);
	}
      set_mem_size (src1, cmp_bytes);

      if (!REG_P (XEXP (src2, 0)))
	{
	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
	  src2 = replace_equiv_address (src2, src2_reg);
	}
      set_mem_size (src2, cmp_bytes);

      /* Loads are byte-swapped on LE so the subtract/compare below sees
	 memory order.  */
      do_load_for_compare (tmp_reg_src1, src1, load_mode);
      do_load_for_compare (tmp_reg_src2, src2, load_mode);

      if (cmp_bytes < load_mode_size)
	{
	  /* Shift unneeded bytes off.  */
	  rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
	  if (word_mode == DImode)
	    {
	      emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	  else
	    {
	      emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	}

      int remain = bytes - cmp_bytes;
      if (GET_MODE_SIZE (GET_MODE (target)) > GET_MODE_SIZE (load_mode))
	{
	  /* Target is larger than load size so we don't need to
	     reduce result size.  */

	  /* We previously did a block that need 64->32 conversion but
	     the current block does not, so a label is needed to jump
	     to the end.  */
	  if (generate_6432_conversion && !final_label)
	    final_label = gen_label_rtx ();

	  if (remain > 0)
	    {
	      /* This is not the last block, branch to the end if the result
		 of this subtract is not zero.  */
	      if (!final_label)
		final_label = gen_label_rtx ();
	      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
	      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
	      rtx cr = gen_reg_rtx (CCmode);
	      /* Record-form subtract so the branch can test the result.  */
	      rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
	      emit_insn (gen_movsi (target,
				    gen_lowpart (SImode, tmp_reg_src2)));
	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
						 fin_ref, pc_rtx);
	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      JUMP_LABEL (j) = final_label;
	      LABEL_NUSES (final_label) += 1;
	    }
	  else
	    {
	      /* Last chunk: the plain subtract is the final result.  */
	      if (word_mode == DImode)
		{
		  emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
					 tmp_reg_src2));
		  emit_insn (gen_movsi (target,
					gen_lowpart (SImode, tmp_reg_src2)));
		}
	      else
		emit_insn (gen_subsi3 (target, tmp_reg_src1, tmp_reg_src2));

	      /* Skip over the 64->32 conversion code that follows.  */
	      if (final_label)
		{
		  rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
		  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
		  JUMP_LABEL(j) = final_label;
		  LABEL_NUSES (final_label) += 1;
		  emit_barrier ();
		}
	    }
	}
      else
	{
	  /* Do we need a 64->32 conversion block? We need the 64->32
	     conversion even if target size == load_mode size because
	     the subtract generates one extra bit.  */
	  generate_6432_conversion = true;

	  if (remain > 0)
	    {
	      if (!convert_label)
		convert_label = gen_label_rtx ();

	      /* Compare to zero and branch to convert_label if not zero.  */
	      rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
	      if (TARGET_P9_MISC)
		{
		  /* Generate a compare, and convert with a setb later.  */
		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
					     tmp_reg_src2);
		  emit_insn (gen_rtx_SET (cond, cmp));
		}
	      else
		/* Generate a subfc. and use the longer
		   sequence for conversion.  */
		if (TARGET_64BIT)
		  emit_insn (gen_subfdi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
						     tmp_reg_src1, cond));
		else
		  emit_insn (gen_subfsi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
						     tmp_reg_src1, cond));
	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
						 cvt_ref, pc_rtx);
	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      JUMP_LABEL(j) = convert_label;
	      LABEL_NUSES (convert_label) += 1;
	    }
	  else
	    {
	      /* Just do the subtract/compare.  Since this is the last block
		 the convert code will be generated immediately following.  */
	      if (TARGET_P9_MISC)
		{
		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
					     tmp_reg_src2);
		  emit_insn (gen_rtx_SET (cond, cmp));
		}
	      else
		if (TARGET_64BIT)
		  emit_insn (gen_subfdi3_carry (tmp_reg_src2, tmp_reg_src2,
						tmp_reg_src1));
		else
		  emit_insn (gen_subfsi3_carry (tmp_reg_src2, tmp_reg_src2,
						tmp_reg_src1));
	    }
	}

      offset += cmp_bytes;
      bytes -= cmp_bytes;
    }

  if (generate_6432_conversion)
    {
      if (convert_label)
	emit_label (convert_label);

      /* We need to produce DI result from sub, then convert to target SI
	 while maintaining <0 / ==0 / >0 properties.  This sequence works:
	 subfc L,A,B
	 subfe H,H,H
	 popcntd L,L
	 rldimi L,H,6,0

	 This is an alternate one Segher cooked up if somebody
	 wants to expand this for something that doesn't have popcntd:
	 subfc L,a,b
	 subfe H,x,x
	 addic t,L,-1
	 subfe v,t,L
	 or z,v,H

	 And finally, p9 can just do this:
	 cmpld A,B
	 setb r */

      if (TARGET_P9_MISC)
	{
	  emit_insn (gen_setb_unsigned (target, cond));
	}
      else
	{
	  if (TARGET_64BIT)
	    {
	      rtx tmp_reg_ca = gen_reg_rtx (DImode);
	      emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
	      emit_insn (gen_popcntddi2 (tmp_reg_src2, tmp_reg_src2));
	      emit_insn (gen_iordi3 (tmp_reg_src2, tmp_reg_src2, tmp_reg_ca));
	      emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2)));
	    }
	  else
	    {
	      rtx tmp_reg_ca = gen_reg_rtx (SImode);
	      emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
	      emit_insn (gen_popcntdsi2 (tmp_reg_src2, tmp_reg_src2));
	      emit_insn (gen_iorsi3 (target, tmp_reg_src2, tmp_reg_ca));
	    }
	}
    }

  if (final_label)
    emit_label (final_label);

  gcc_assert (bytes == 0);
  return true;
}
|
|
658
|
|
659 /* Generate alignment check and branch code to set up for
|
|
660 strncmp when we don't have DI alignment.
|
|
661 STRNCMP_LABEL is the label to branch if there is a page crossing.
|
|
662 SRC is the string pointer to be examined.
|
|
663 BYTES is the max number of bytes to compare. */
|
|
664 static void
|
|
665 expand_strncmp_align_check (rtx strncmp_label, rtx src, HOST_WIDE_INT bytes)
|
|
666 {
|
|
667 rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label);
|
|
668 rtx src_check = copy_addr_to_reg (XEXP (src, 0));
|
|
669 if (GET_MODE (src_check) == SImode)
|
|
670 emit_insn (gen_andsi3 (src_check, src_check, GEN_INT (0xfff)));
|
|
671 else
|
|
672 emit_insn (gen_anddi3 (src_check, src_check, GEN_INT (0xfff)));
|
|
673 rtx cond = gen_reg_rtx (CCmode);
|
|
674 emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_check,
|
|
675 GEN_INT (4096 - bytes)));
|
|
676
|
|
677 rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx);
|
|
678
|
|
679 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
|
|
680 lab_ref, pc_rtx);
|
|
681 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
|
|
682 JUMP_LABEL (j) = strncmp_label;
|
|
683 LABEL_NUSES (strncmp_label) += 1;
|
|
684 }
|
|
685
|
|
686 /* Expand a string compare operation with length, and return
|
|
687 true if successful. Return false if we should let the
|
|
688 compiler generate normal code, probably a strncmp call.
|
|
689
|
|
690 OPERANDS[0] is the target (result).
|
|
691 OPERANDS[1] is the first source.
|
|
692 OPERANDS[2] is the second source.
|
|
693 If NO_LENGTH is zero, then:
|
|
694 OPERANDS[3] is the length.
|
|
695 OPERANDS[4] is the alignment in bytes.
|
|
696 If NO_LENGTH is nonzero, then:
|
|
697 OPERANDS[3] is the alignment in bytes. */
|
|
698 bool
|
|
699 expand_strn_compare (rtx operands[], int no_length)
|
|
700 {
|
|
701 rtx target = operands[0];
|
|
702 rtx orig_src1 = operands[1];
|
|
703 rtx orig_src2 = operands[2];
|
|
704 rtx bytes_rtx, align_rtx;
|
|
705 if (no_length)
|
|
706 {
|
|
707 bytes_rtx = NULL;
|
|
708 align_rtx = operands[3];
|
|
709 }
|
|
710 else
|
|
711 {
|
|
712 bytes_rtx = operands[3];
|
|
713 align_rtx = operands[4];
|
|
714 }
|
|
715 unsigned HOST_WIDE_INT cmp_bytes = 0;
|
|
716 rtx src1 = orig_src1;
|
|
717 rtx src2 = orig_src2;
|
|
718
|
|
719 /* If we have a length, it must be constant. This simplifies things
|
|
720 a bit as we don't have to generate code to check if we've exceeded
|
|
721 the length. Later this could be expanded to handle this case. */
|
|
722 if (!no_length && !CONST_INT_P (bytes_rtx))
|
|
723 return false;
|
|
724
|
|
725 /* This must be a fixed size alignment. */
|
|
726 if (!CONST_INT_P (align_rtx))
|
|
727 return false;
|
|
728
|
|
729 unsigned int base_align = UINTVAL (align_rtx);
|
|
730 int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
|
|
731 int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
|
|
732
|
|
733 /* targetm.slow_unaligned_access -- don't do unaligned stuff. */
|
|
734 if (targetm.slow_unaligned_access (word_mode, align1)
|
|
735 || targetm.slow_unaligned_access (word_mode, align2))
|
|
736 return false;
|
|
737
|
|
738 gcc_assert (GET_MODE (target) == SImode);
|
|
739
|
|
740 /* If we have an LE target without ldbrx and word_mode is DImode,
|
|
741 then we must avoid using word_mode. */
|
|
742 int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
|
|
743 && word_mode == DImode);
|
|
744
|
|
745 unsigned int word_mode_size = GET_MODE_SIZE (word_mode);
|
|
746
|
|
747 unsigned HOST_WIDE_INT offset = 0;
|
|
748 unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available. */
|
|
749 unsigned HOST_WIDE_INT compare_length; /* How much to compare inline. */
|
|
750 if (no_length)
|
|
751 /* Use this as a standin to determine the mode to use. */
|
|
752 bytes = rs6000_string_compare_inline_limit * word_mode_size;
|
|
753 else
|
|
754 bytes = UINTVAL (bytes_rtx);
|
|
755
|
|
756 machine_mode load_mode =
|
|
757 select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
|
|
758 unsigned int load_mode_size = GET_MODE_SIZE (load_mode);
|
|
759 compare_length = rs6000_string_compare_inline_limit * load_mode_size;
|
|
760
|
|
761 /* If we have equality at the end of the last compare and we have not
|
|
762 found the end of the string, we need to call strcmp/strncmp to
|
|
763 compare the remainder. */
|
|
764 bool equality_compare_rest = false;
|
|
765
|
|
766 if (no_length)
|
|
767 {
|
|
768 bytes = compare_length;
|
|
769 equality_compare_rest = true;
|
|
770 }
|
|
771 else
|
|
772 {
|
|
773 if (bytes <= compare_length)
|
|
774 compare_length = bytes;
|
|
775 else
|
|
776 equality_compare_rest = true;
|
|
777 }
|
|
778
|
|
779 rtx result_reg = gen_reg_rtx (word_mode);
|
|
780 rtx final_move_label = gen_label_rtx ();
|
|
781 rtx final_label = gen_label_rtx ();
|
|
782 rtx begin_compare_label = NULL;
|
|
783
|
|
784 if (base_align < 8)
|
|
785 {
|
|
786 /* Generate code that checks distance to 4k boundary for this case. */
|
|
787 begin_compare_label = gen_label_rtx ();
|
|
788 rtx strncmp_label = gen_label_rtx ();
|
|
789 rtx jmp;
|
|
790
|
|
791 /* Strncmp for power8 in glibc does this:
|
|
792 rldicl r8,r3,0,52
|
|
793 cmpldi cr7,r8,4096-16
|
|
794 bgt cr7,L(pagecross) */
|
|
795
|
|
796 /* Make sure that the length we use for the alignment test and
|
|
797 the subsequent code generation are in agreement so we do not
|
|
798 go past the length we tested for a 4k boundary crossing. */
|
|
799 unsigned HOST_WIDE_INT align_test = compare_length;
|
|
800 if (align_test < 8)
|
|
801 {
|
|
802 align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
|
|
803 base_align = align_test;
|
|
804 }
|
|
805 else
|
|
806 {
|
|
807 align_test = ROUND_UP (align_test, 8);
|
|
808 base_align = 8;
|
|
809 }
|
|
810
|
|
811 if (align1 < 8)
|
|
812 expand_strncmp_align_check (strncmp_label, src1, align_test);
|
|
813 if (align2 < 8)
|
|
814 expand_strncmp_align_check (strncmp_label, src2, align_test);
|
|
815
|
|
816 /* Now generate the following sequence:
|
|
817 - branch to begin_compare
|
|
818 - strncmp_label
|
|
819 - call to strncmp
|
|
820 - branch to final_label
|
|
821 - begin_compare_label */
|
|
822
|
|
823 rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
|
|
824 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
|
|
825 JUMP_LABEL (jmp) = begin_compare_label;
|
|
826 LABEL_NUSES (begin_compare_label) += 1;
|
|
827 emit_barrier ();
|
|
828
|
|
829 emit_label (strncmp_label);
|
|
830
|
|
831 if (!REG_P (XEXP (src1, 0)))
|
|
832 {
|
|
833 rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
|
|
834 src1 = replace_equiv_address (src1, src1_reg);
|
|
835 }
|
|
836
|
|
837 if (!REG_P (XEXP (src2, 0)))
|
|
838 {
|
|
839 rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
|
|
840 src2 = replace_equiv_address (src2, src2_reg);
|
|
841 }
|
|
842
|
|
843 if (no_length)
|
|
844 {
|
|
845 tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
|
|
846 emit_library_call_value (XEXP (DECL_RTL (fun), 0),
|
|
847 target, LCT_NORMAL, GET_MODE (target),
|
|
848 force_reg (Pmode, XEXP (src1, 0)), Pmode,
|
|
849 force_reg (Pmode, XEXP (src2, 0)), Pmode);
|
|
850 }
|
|
851 else
|
|
852 {
|
|
853 /* -m32 -mpowerpc64 results in word_mode being DImode even
|
|
854 though otherwise it is 32-bit. The length arg to strncmp
|
|
855 is a size_t which will be the same size as pointers. */
|
|
856 rtx len_rtx;
|
|
857 if (TARGET_64BIT)
|
|
858 len_rtx = gen_reg_rtx (DImode);
|
|
859 else
|
|
860 len_rtx = gen_reg_rtx (SImode);
|
|
861
|
|
862 emit_move_insn (len_rtx, bytes_rtx);
|
|
863
|
|
864 tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
|
|
865 emit_library_call_value (XEXP (DECL_RTL (fun), 0),
|
|
866 target, LCT_NORMAL, GET_MODE (target),
|
|
867 force_reg (Pmode, XEXP (src1, 0)), Pmode,
|
|
868 force_reg (Pmode, XEXP (src2, 0)), Pmode,
|
|
869 len_rtx, GET_MODE (len_rtx));
|
|
870 }
|
|
871
|
|
872 rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
|
|
873 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
|
|
874 JUMP_LABEL (jmp) = final_label;
|
|
875 LABEL_NUSES (final_label) += 1;
|
|
876 emit_barrier ();
|
|
877 emit_label (begin_compare_label);
|
|
878 }
|
|
879
|
|
880 rtx cleanup_label = NULL;
|
|
881 rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
|
|
882 rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
|
|
883
|
|
884 /* Generate sequence of ld/ldbrx, cmpb to compare out
|
|
885 to the length specified. */
|
|
886 unsigned HOST_WIDE_INT bytes_to_compare = compare_length;
|
|
887 while (bytes_to_compare > 0)
|
|
888 {
|
|
889 /* Compare sequence:
|
|
890 check each 8B with: ld/ld cmpd bne
|
|
891 If equal, use rldicr/cmpb to check for zero byte.
|
|
892 cleanup code at end:
|
|
893 cmpb get byte that differs
|
|
894 cmpb look for zero byte
|
|
895 orc combine
|
|
896 cntlzd get bit of first zero/diff byte
|
|
897 subfic convert for rldcl use
|
|
898 rldcl rldcl extract diff/zero byte
|
|
899 subf subtract for final result
|
|
900
|
|
901 The last compare can branch around the cleanup code if the
|
|
902 result is zero because the strings are exactly equal. */
|
|
903 unsigned int align = compute_current_alignment (base_align, offset);
|
|
904 if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
|
|
905 load_mode = select_block_compare_mode (offset, bytes_to_compare, align,
|
|
906 word_mode_ok);
|
|
907 else
|
|
908 load_mode = select_block_compare_mode (0, bytes_to_compare, align,
|
|
909 word_mode_ok);
|
|
910 load_mode_size = GET_MODE_SIZE (load_mode);
|
|
911 if (bytes_to_compare >= load_mode_size)
|
|
912 cmp_bytes = load_mode_size;
|
|
913 else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
|
|
914 {
|
|
915 /* Move this load back so it doesn't go past the end.
|
|
916 P8/P9 can do this efficiently. */
|
|
917 unsigned int extra_bytes = load_mode_size - bytes_to_compare;
|
|
918 cmp_bytes = bytes_to_compare;
|
|
919 if (extra_bytes < offset)
|
|
920 {
|
|
921 offset -= extra_bytes;
|
|
922 cmp_bytes = load_mode_size;
|
|
923 bytes_to_compare = cmp_bytes;
|
|
924 }
|
|
925 }
|
|
926 else
|
|
927 /* P7 and earlier can't do the overlapping load trick fast,
|
|
928 so this forces a non-overlapping load and a shift to get
|
|
929 rid of the extra bytes. */
|
|
930 cmp_bytes = bytes_to_compare;
|
|
931
|
|
932 src1 = adjust_address (orig_src1, load_mode, offset);
|
|
933 src2 = adjust_address (orig_src2, load_mode, offset);
|
|
934
|
|
935 if (!REG_P (XEXP (src1, 0)))
|
|
936 {
|
|
937 rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
|
|
938 src1 = replace_equiv_address (src1, src1_reg);
|
|
939 }
|
|
940 set_mem_size (src1, cmp_bytes);
|
|
941
|
|
942 if (!REG_P (XEXP (src2, 0)))
|
|
943 {
|
|
944 rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
|
|
945 src2 = replace_equiv_address (src2, src2_reg);
|
|
946 }
|
|
947 set_mem_size (src2, cmp_bytes);
|
|
948
|
|
949 do_load_for_compare (tmp_reg_src1, src1, load_mode);
|
|
950 do_load_for_compare (tmp_reg_src2, src2, load_mode);
|
|
951
|
|
952 /* We must always left-align the data we read, and
|
|
953 clear any bytes to the right that are beyond the string.
|
|
954 Otherwise the cmpb sequence won't produce the correct
|
|
955 results. The beginning of the compare will be done
|
|
956 with word_mode so will not have any extra shifts or
|
|
957 clear rights. */
|
|
958
|
|
959 if (load_mode_size < word_mode_size)
|
|
960 {
|
|
961 /* Rotate left first. */
|
|
962 rtx sh = GEN_INT (BITS_PER_UNIT * (word_mode_size - load_mode_size));
|
|
963 if (word_mode == DImode)
|
|
964 {
|
|
965 emit_insn (gen_rotldi3 (tmp_reg_src1, tmp_reg_src1, sh));
|
|
966 emit_insn (gen_rotldi3 (tmp_reg_src2, tmp_reg_src2, sh));
|
|
967 }
|
|
968 else
|
|
969 {
|
|
970 emit_insn (gen_rotlsi3 (tmp_reg_src1, tmp_reg_src1, sh));
|
|
971 emit_insn (gen_rotlsi3 (tmp_reg_src2, tmp_reg_src2, sh));
|
|
972 }
|
|
973 }
|
|
974
|
|
975 if (cmp_bytes < word_mode_size)
|
|
976 {
|
|
977 /* Now clear right. This plus the rotate can be
|
|
978 turned into a rldicr instruction. */
|
|
979 HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
|
|
980 rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
|
|
981 if (word_mode == DImode)
|
|
982 {
|
|
983 emit_insn (gen_anddi3_mask (tmp_reg_src1, tmp_reg_src1, mask));
|
|
984 emit_insn (gen_anddi3_mask (tmp_reg_src2, tmp_reg_src2, mask));
|
|
985 }
|
|
986 else
|
|
987 {
|
|
988 emit_insn (gen_andsi3_mask (tmp_reg_src1, tmp_reg_src1, mask));
|
|
989 emit_insn (gen_andsi3_mask (tmp_reg_src2, tmp_reg_src2, mask));
|
|
990 }
|
|
991 }
|
|
992
|
|
993 /* Cases to handle. A and B are chunks of the two strings.
|
|
994 1: Not end of comparison:
|
|
995 A != B: branch to cleanup code to compute result.
|
|
996 A == B: check for 0 byte, next block if not found.
|
|
997 2: End of the inline comparison:
|
|
998 A != B: branch to cleanup code to compute result.
|
|
999 A == B: check for 0 byte, call strcmp/strncmp
|
|
1000 3: compared requested N bytes:
|
|
1001 A == B: branch to result 0.
|
|
1002 A != B: cleanup code to compute result. */
|
|
1003
|
|
1004 unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;
|
|
1005
|
|
1006 rtx dst_label;
|
|
1007 if (remain > 0 || equality_compare_rest)
|
|
1008 {
|
|
1009 /* Branch to cleanup code, otherwise fall through to do
|
|
1010 more compares. */
|
|
1011 if (!cleanup_label)
|
|
1012 cleanup_label = gen_label_rtx ();
|
|
1013 dst_label = cleanup_label;
|
|
1014 }
|
|
1015 else
|
|
1016 /* Branch to end and produce result of 0. */
|
|
1017 dst_label = final_move_label;
|
|
1018
|
|
1019 rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
|
|
1020 rtx cond = gen_reg_rtx (CCmode);
|
|
1021
|
|
1022 /* Always produce the 0 result, it is needed if
|
|
1023 cmpb finds a 0 byte in this chunk. */
|
|
1024 rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
|
|
1025 rs6000_emit_dot_insn (result_reg, tmp, 1, cond);
|
|
1026
|
|
1027 rtx cmp_rtx;
|
|
1028 if (remain == 0 && !equality_compare_rest)
|
|
1029 cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
|
|
1030 else
|
|
1031 cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
|
|
1032
|
|
1033 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
|
|
1034 lab_ref, pc_rtx);
|
|
1035 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
|
|
1036 JUMP_LABEL (j) = dst_label;
|
|
1037 LABEL_NUSES (dst_label) += 1;
|
|
1038
|
|
1039 if (remain > 0 || equality_compare_rest)
|
|
1040 {
|
|
1041 /* Generate a cmpb to test for a 0 byte and branch
|
|
1042 to final result if found. */
|
|
1043 rtx cmpb_zero = gen_reg_rtx (word_mode);
|
|
1044 rtx lab_ref_fin = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
|
|
1045 rtx condz = gen_reg_rtx (CCmode);
|
|
1046 rtx zero_reg = gen_reg_rtx (word_mode);
|
|
1047 if (word_mode == SImode)
|
|
1048 {
|
|
1049 emit_insn (gen_movsi (zero_reg, GEN_INT (0)));
|
|
1050 emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg));
|
|
1051 if (cmp_bytes < word_mode_size)
|
|
1052 {
|
|
1053 /* Don't want to look at zero bytes past end. */
|
|
1054 HOST_WIDE_INT mb =
|
|
1055 BITS_PER_UNIT * (word_mode_size - cmp_bytes);
|
|
1056 rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
|
|
1057 emit_insn (gen_andsi3_mask (cmpb_zero, cmpb_zero, mask));
|
|
1058 }
|
|
1059 }
|
|
1060 else
|
|
1061 {
|
|
1062 emit_insn (gen_movdi (zero_reg, GEN_INT (0)));
|
|
1063 emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg));
|
|
1064 if (cmp_bytes < word_mode_size)
|
|
1065 {
|
|
1066 /* Don't want to look at zero bytes past end. */
|
|
1067 HOST_WIDE_INT mb =
|
|
1068 BITS_PER_UNIT * (word_mode_size - cmp_bytes);
|
|
1069 rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
|
|
1070 emit_insn (gen_anddi3_mask (cmpb_zero, cmpb_zero, mask));
|
|
1071 }
|
|
1072 }
|
|
1073
|
|
1074 emit_move_insn (condz, gen_rtx_COMPARE (CCmode, cmpb_zero, zero_reg));
|
|
1075 rtx cmpnz_rtx = gen_rtx_NE (VOIDmode, condz, const0_rtx);
|
|
1076 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmpnz_rtx,
|
|
1077 lab_ref_fin, pc_rtx);
|
|
1078 rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
|
|
1079 JUMP_LABEL (j2) = final_move_label;
|
|
1080 LABEL_NUSES (final_move_label) += 1;
|
|
1081
|
|
1082 }
|
|
1083
|
|
1084 offset += cmp_bytes;
|
|
1085 bytes_to_compare -= cmp_bytes;
|
|
1086 }
|
|
1087
|
|
1088 if (equality_compare_rest)
|
|
1089 {
|
|
1090 /* Update pointers past what has been compared already. */
|
|
1091 src1 = adjust_address (orig_src1, load_mode, offset);
|
|
1092 src2 = adjust_address (orig_src2, load_mode, offset);
|
|
1093
|
|
1094 if (!REG_P (XEXP (src1, 0)))
|
|
1095 {
|
|
1096 rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
|
|
1097 src1 = replace_equiv_address (src1, src1_reg);
|
|
1098 }
|
|
1099 set_mem_size (src1, cmp_bytes);
|
|
1100
|
|
1101 if (!REG_P (XEXP (src2, 0)))
|
|
1102 {
|
|
1103 rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
|
|
1104 src2 = replace_equiv_address (src2, src2_reg);
|
|
1105 }
|
|
1106 set_mem_size (src2, cmp_bytes);
|
|
1107
|
|
1108 /* Construct call to strcmp/strncmp to compare the rest of the string. */
|
|
1109 if (no_length)
|
|
1110 {
|
|
1111 tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
|
|
1112 emit_library_call_value (XEXP (DECL_RTL (fun), 0),
|
|
1113 target, LCT_NORMAL, GET_MODE (target),
|
|
1114 force_reg (Pmode, XEXP (src1, 0)), Pmode,
|
|
1115 force_reg (Pmode, XEXP (src2, 0)), Pmode);
|
|
1116 }
|
|
1117 else
|
|
1118 {
|
|
1119 rtx len_rtx;
|
|
1120 if (TARGET_64BIT)
|
|
1121 len_rtx = gen_reg_rtx (DImode);
|
|
1122 else
|
|
1123 len_rtx = gen_reg_rtx (SImode);
|
|
1124
|
|
1125 emit_move_insn (len_rtx, GEN_INT (bytes - compare_length));
|
|
1126 tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
|
|
1127 emit_library_call_value (XEXP (DECL_RTL (fun), 0),
|
|
1128 target, LCT_NORMAL, GET_MODE (target),
|
|
1129 force_reg (Pmode, XEXP (src1, 0)), Pmode,
|
|
1130 force_reg (Pmode, XEXP (src2, 0)), Pmode,
|
|
1131 len_rtx, GET_MODE (len_rtx));
|
|
1132 }
|
|
1133
|
|
1134 rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
|
|
1135 rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
|
|
1136 JUMP_LABEL (jmp) = final_label;
|
|
1137 LABEL_NUSES (final_label) += 1;
|
|
1138 emit_barrier ();
|
|
1139 }
|
|
1140
|
|
1141 if (cleanup_label)
|
|
1142 emit_label (cleanup_label);
|
|
1143
|
|
1144 /* Generate the final sequence that identifies the differing
|
|
1145 byte and generates the final result, taking into account
|
|
1146 zero bytes:
|
|
1147
|
|
1148 cmpb cmpb_result1, src1, src2
|
|
1149 cmpb cmpb_result2, src1, zero
|
|
1150 orc cmpb_result1, cmpb_result1, cmpb_result2
|
|
1151 cntlzd get bit of first zero/diff byte
|
|
1152 addi convert for rldcl use
|
|
1153 rldcl rldcl extract diff/zero byte
|
|
1154 subf subtract for final result
|
|
1155 */
|
|
1156
|
|
1157 rtx cmpb_diff = gen_reg_rtx (word_mode);
|
|
1158 rtx cmpb_zero = gen_reg_rtx (word_mode);
|
|
1159 rtx rot_amt = gen_reg_rtx (word_mode);
|
|
1160 rtx zero_reg = gen_reg_rtx (word_mode);
|
|
1161
|
|
1162 rtx rot1_1 = gen_reg_rtx (word_mode);
|
|
1163 rtx rot1_2 = gen_reg_rtx (word_mode);
|
|
1164 rtx rot2_1 = gen_reg_rtx (word_mode);
|
|
1165 rtx rot2_2 = gen_reg_rtx (word_mode);
|
|
1166
|
|
1167 if (word_mode == SImode)
|
|
1168 {
|
|
1169 emit_insn (gen_cmpbsi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2));
|
|
1170 emit_insn (gen_movsi (zero_reg, GEN_INT (0)));
|
|
1171 emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg));
|
|
1172 emit_insn (gen_one_cmplsi2 (cmpb_diff,cmpb_diff));
|
|
1173 emit_insn (gen_iorsi3 (cmpb_diff, cmpb_diff, cmpb_zero));
|
|
1174 emit_insn (gen_clzsi2 (rot_amt, cmpb_diff));
|
|
1175 emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
|
|
1176 emit_insn (gen_rotlsi3 (rot1_1, tmp_reg_src1,
|
|
1177 gen_lowpart (SImode, rot_amt)));
|
|
1178 emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
|
|
1179 emit_insn (gen_rotlsi3 (rot2_1, tmp_reg_src2,
|
|
1180 gen_lowpart (SImode, rot_amt)));
|
|
1181 emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
|
|
1182 emit_insn (gen_subsi3 (result_reg, rot1_2, rot2_2));
|
|
1183 }
|
|
1184 else
|
|
1185 {
|
|
1186 emit_insn (gen_cmpbdi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2));
|
|
1187 emit_insn (gen_movdi (zero_reg, GEN_INT (0)));
|
|
1188 emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg));
|
|
1189 emit_insn (gen_one_cmpldi2 (cmpb_diff,cmpb_diff));
|
|
1190 emit_insn (gen_iordi3 (cmpb_diff, cmpb_diff, cmpb_zero));
|
|
1191 emit_insn (gen_clzdi2 (rot_amt, cmpb_diff));
|
|
1192 emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)));
|
|
1193 emit_insn (gen_rotldi3 (rot1_1, tmp_reg_src1,
|
|
1194 gen_lowpart (SImode, rot_amt)));
|
|
1195 emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
|
|
1196 emit_insn (gen_rotldi3 (rot2_1, tmp_reg_src2,
|
|
1197 gen_lowpart (SImode, rot_amt)));
|
|
1198 emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
|
|
1199 emit_insn (gen_subdi3 (result_reg, rot1_2, rot2_2));
|
|
1200 }
|
|
1201
|
|
1202 emit_label (final_move_label);
|
|
1203 emit_insn (gen_movsi (target,
|
|
1204 gen_lowpart (SImode, result_reg)));
|
|
1205 emit_label (final_label);
|
|
1206 return true;
|
|
1207 }
|
|
1208
|
|
1209 /* Expand a block move operation, and return 1 if successful. Return 0
|
|
1210 if we should let the compiler generate normal code.
|
|
1211
|
|
1212 operands[0] is the destination
|
|
1213 operands[1] is the source
|
|
1214 operands[2] is the length
|
|
1215 operands[3] is the alignment */
|
|
1216
|
|
1217 #define MAX_MOVE_REG 4
|
|
1218
|
|
1219 int
|
|
1220 expand_block_move (rtx operands[])
|
|
1221 {
|
|
1222 rtx orig_dest = operands[0];
|
|
1223 rtx orig_src = operands[1];
|
|
1224 rtx bytes_rtx = operands[2];
|
|
1225 rtx align_rtx = operands[3];
|
|
1226 int constp = (GET_CODE (bytes_rtx) == CONST_INT);
|
|
1227 int align;
|
|
1228 int bytes;
|
|
1229 int offset;
|
|
1230 int move_bytes;
|
|
1231 rtx stores[MAX_MOVE_REG];
|
|
1232 int num_reg = 0;
|
|
1233
|
|
1234 /* If this is not a fixed size move, just call memcpy */
|
|
1235 if (! constp)
|
|
1236 return 0;
|
|
1237
|
|
1238 /* This must be a fixed size alignment */
|
|
1239 gcc_assert (GET_CODE (align_rtx) == CONST_INT);
|
|
1240 align = INTVAL (align_rtx) * BITS_PER_UNIT;
|
|
1241
|
|
1242 /* Anything to move? */
|
|
1243 bytes = INTVAL (bytes_rtx);
|
|
1244 if (bytes <= 0)
|
|
1245 return 1;
|
|
1246
|
|
1247 if (bytes > rs6000_block_move_inline_limit)
|
|
1248 return 0;
|
|
1249
|
|
1250 for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
|
|
1251 {
|
|
1252 union {
|
|
1253 rtx (*movmemsi) (rtx, rtx, rtx, rtx);
|
|
1254 rtx (*mov) (rtx, rtx);
|
|
1255 } gen_func;
|
|
1256 machine_mode mode = BLKmode;
|
|
1257 rtx src, dest;
|
|
1258
|
|
1259 /* Altivec first, since it will be faster than a string move
|
|
1260 when it applies, and usually not significantly larger. */
|
|
1261 if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
|
|
1262 {
|
|
1263 move_bytes = 16;
|
|
1264 mode = V4SImode;
|
|
1265 gen_func.mov = gen_movv4si;
|
|
1266 }
|
|
1267 else if (TARGET_STRING
|
|
1268 && bytes > 24 /* move up to 32 bytes at a time */
|
|
1269 && ! fixed_regs[5]
|
|
1270 && ! fixed_regs[6]
|
|
1271 && ! fixed_regs[7]
|
|
1272 && ! fixed_regs[8]
|
|
1273 && ! fixed_regs[9]
|
|
1274 && ! fixed_regs[10]
|
|
1275 && ! fixed_regs[11]
|
|
1276 && ! fixed_regs[12])
|
|
1277 {
|
|
1278 move_bytes = (bytes > 32) ? 32 : bytes;
|
|
1279 gen_func.movmemsi = gen_movmemsi_8reg;
|
|
1280 }
|
|
1281 else if (TARGET_STRING
|
|
1282 && bytes > 16 /* move up to 24 bytes at a time */
|
|
1283 && ! fixed_regs[5]
|
|
1284 && ! fixed_regs[6]
|
|
1285 && ! fixed_regs[7]
|
|
1286 && ! fixed_regs[8]
|
|
1287 && ! fixed_regs[9]
|
|
1288 && ! fixed_regs[10])
|
|
1289 {
|
|
1290 move_bytes = (bytes > 24) ? 24 : bytes;
|
|
1291 gen_func.movmemsi = gen_movmemsi_6reg;
|
|
1292 }
|
|
1293 else if (TARGET_STRING
|
|
1294 && bytes > 8 /* move up to 16 bytes at a time */
|
|
1295 && ! fixed_regs[5]
|
|
1296 && ! fixed_regs[6]
|
|
1297 && ! fixed_regs[7]
|
|
1298 && ! fixed_regs[8])
|
|
1299 {
|
|
1300 move_bytes = (bytes > 16) ? 16 : bytes;
|
|
1301 gen_func.movmemsi = gen_movmemsi_4reg;
|
|
1302 }
|
|
1303 else if (bytes >= 8 && TARGET_POWERPC64
|
|
1304 && (align >= 64 || !STRICT_ALIGNMENT))
|
|
1305 {
|
|
1306 move_bytes = 8;
|
|
1307 mode = DImode;
|
|
1308 gen_func.mov = gen_movdi;
|
|
1309 if (offset == 0 && align < 64)
|
|
1310 {
|
|
1311 rtx addr;
|
|
1312
|
|
1313 /* If the address form is reg+offset with offset not a
|
|
1314 multiple of four, reload into reg indirect form here
|
|
1315 rather than waiting for reload. This way we get one
|
|
1316 reload, not one per load and/or store. */
|
|
1317 addr = XEXP (orig_dest, 0);
|
|
1318 if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
|
|
1319 && GET_CODE (XEXP (addr, 1)) == CONST_INT
|
|
1320 && (INTVAL (XEXP (addr, 1)) & 3) != 0)
|
|
1321 {
|
|
1322 addr = copy_addr_to_reg (addr);
|
|
1323 orig_dest = replace_equiv_address (orig_dest, addr);
|
|
1324 }
|
|
1325 addr = XEXP (orig_src, 0);
|
|
1326 if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
|
|
1327 && GET_CODE (XEXP (addr, 1)) == CONST_INT
|
|
1328 && (INTVAL (XEXP (addr, 1)) & 3) != 0)
|
|
1329 {
|
|
1330 addr = copy_addr_to_reg (addr);
|
|
1331 orig_src = replace_equiv_address (orig_src, addr);
|
|
1332 }
|
|
1333 }
|
|
1334 }
|
|
1335 else if (TARGET_STRING && bytes > 4 && !TARGET_POWERPC64)
|
|
1336 { /* move up to 8 bytes at a time */
|
|
1337 move_bytes = (bytes > 8) ? 8 : bytes;
|
|
1338 gen_func.movmemsi = gen_movmemsi_2reg;
|
|
1339 }
|
|
1340 else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
|
|
1341 { /* move 4 bytes */
|
|
1342 move_bytes = 4;
|
|
1343 mode = SImode;
|
|
1344 gen_func.mov = gen_movsi;
|
|
1345 }
|
|
1346 else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
|
|
1347 { /* move 2 bytes */
|
|
1348 move_bytes = 2;
|
|
1349 mode = HImode;
|
|
1350 gen_func.mov = gen_movhi;
|
|
1351 }
|
|
1352 else if (TARGET_STRING && bytes > 1)
|
|
1353 { /* move up to 4 bytes at a time */
|
|
1354 move_bytes = (bytes > 4) ? 4 : bytes;
|
|
1355 gen_func.movmemsi = gen_movmemsi_1reg;
|
|
1356 }
|
|
1357 else /* move 1 byte at a time */
|
|
1358 {
|
|
1359 move_bytes = 1;
|
|
1360 mode = QImode;
|
|
1361 gen_func.mov = gen_movqi;
|
|
1362 }
|
|
1363
|
|
1364 src = adjust_address (orig_src, mode, offset);
|
|
1365 dest = adjust_address (orig_dest, mode, offset);
|
|
1366
|
|
1367 if (mode != BLKmode)
|
|
1368 {
|
|
1369 rtx tmp_reg = gen_reg_rtx (mode);
|
|
1370
|
|
1371 emit_insn ((*gen_func.mov) (tmp_reg, src));
|
|
1372 stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
|
|
1373 }
|
|
1374
|
|
1375 if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
|
|
1376 {
|
|
1377 int i;
|
|
1378 for (i = 0; i < num_reg; i++)
|
|
1379 emit_insn (stores[i]);
|
|
1380 num_reg = 0;
|
|
1381 }
|
|
1382
|
|
1383 if (mode == BLKmode)
|
|
1384 {
|
|
1385 /* Move the address into scratch registers. The movmemsi
|
|
1386 patterns require zero offset. */
|
|
1387 if (!REG_P (XEXP (src, 0)))
|
|
1388 {
|
|
1389 rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
|
|
1390 src = replace_equiv_address (src, src_reg);
|
|
1391 }
|
|
1392 set_mem_size (src, move_bytes);
|
|
1393
|
|
1394 if (!REG_P (XEXP (dest, 0)))
|
|
1395 {
|
|
1396 rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
|
|
1397 dest = replace_equiv_address (dest, dest_reg);
|
|
1398 }
|
|
1399 set_mem_size (dest, move_bytes);
|
|
1400
|
|
1401 emit_insn ((*gen_func.movmemsi) (dest, src,
|
|
1402 GEN_INT (move_bytes & 31),
|
|
1403 align_rtx));
|
|
1404 }
|
|
1405 }
|
|
1406
|
|
1407 return 1;
|
|
1408 }
|
|
1409
|
|
1410
|
|
1411 /* Return a string to perform a load_multiple operation.
|
|
1412 operands[0] is the vector.
|
|
1413 operands[1] is the source address.
|
|
1414 operands[2] is the first destination register. */
|
|
1415
|
|
1416 const char *
|
|
1417 rs6000_output_load_multiple (rtx operands[3])
|
|
1418 {
|
|
1419 /* We have to handle the case where the pseudo used to contain the address
|
|
1420 is assigned to one of the output registers. */
|
|
1421 int i, j;
|
|
1422 int words = XVECLEN (operands[0], 0);
|
|
1423 rtx xop[10];
|
|
1424
|
|
1425 if (XVECLEN (operands[0], 0) == 1)
|
|
1426 return "lwz %2,0(%1)";
|
|
1427
|
|
1428 for (i = 0; i < words; i++)
|
|
1429 if (refers_to_regno_p (REGNO (operands[2]) + i, operands[1]))
|
|
1430 {
|
|
1431 if (i == words-1)
|
|
1432 {
|
|
1433 xop[0] = GEN_INT (4 * (words-1));
|
|
1434 xop[1] = operands[1];
|
|
1435 xop[2] = operands[2];
|
|
1436 output_asm_insn ("lswi %2,%1,%0\n\tlwz %1,%0(%1)", xop);
|
|
1437 return "";
|
|
1438 }
|
|
1439 else if (i == 0)
|
|
1440 {
|
|
1441 xop[0] = GEN_INT (4 * (words-1));
|
|
1442 xop[1] = operands[1];
|
|
1443 xop[2] = gen_rtx_REG (SImode, REGNO (operands[2]) + 1);
|
|
1444 output_asm_insn ("addi %1,%1,4\n\tlswi %2,%1,%0\n\tlwz %1,-4(%1)", xop);
|
|
1445 return "";
|
|
1446 }
|
|
1447 else
|
|
1448 {
|
|
1449 for (j = 0; j < words; j++)
|
|
1450 if (j != i)
|
|
1451 {
|
|
1452 xop[0] = GEN_INT (j * 4);
|
|
1453 xop[1] = operands[1];
|
|
1454 xop[2] = gen_rtx_REG (SImode, REGNO (operands[2]) + j);
|
|
1455 output_asm_insn ("lwz %2,%0(%1)", xop);
|
|
1456 }
|
|
1457 xop[0] = GEN_INT (i * 4);
|
|
1458 xop[1] = operands[1];
|
|
1459 output_asm_insn ("lwz %1,%0(%1)", xop);
|
|
1460 return "";
|
|
1461 }
|
|
1462 }
|
|
1463
|
|
1464 return "lswi %2,%1,%N0";
|
|
1465 }
|
|
1466
|