Mercurial > hg > CbC > CbC_gcc
diff gcc/config/nds32/nds32-memory-manipulation.c @ 131:84e7813d76e9
gcc-8.2
author | mir3636 |
---|---|
date | Thu, 25 Oct 2018 07:37:49 +0900 |
parents | 04ced10e8804 |
children | 1830386684a0 |
line wrap: on
line diff
--- a/gcc/config/nds32/nds32-memory-manipulation.c Fri Oct 27 22:46:09 2017 +0900 +++ b/gcc/config/nds32/nds32-memory-manipulation.c Thu Oct 25 07:37:49 2018 +0900 @@ -1,6 +1,6 @@ /* Auxiliary functions for expand movmem, setmem, cmpmem, load_multiple and store_multiple pattern of Andes NDS32 cpu for GNU compiler - Copyright (C) 2012-2017 Free Software Foundation, Inc. + Copyright (C) 2012-2018 Free Software Foundation, Inc. Contributed by Andes Technology Corporation. This file is part of GCC. @@ -21,6 +21,8 @@ /* ------------------------------------------------------------------------ */ +#define IN_TARGET_CODE 1 + #include "config.h" #include "system.h" #include "coretypes.h" @@ -30,6 +32,1035 @@ #include "memmodel.h" #include "emit-rtl.h" #include "explow.h" +#include "tree.h" +#include "expr.h" +#include "optabs.h" +#include "nds32-protos.h" + +/* ------------------------------------------------------------------------ */ + +/* Auxiliary static function definitions. */ + +static void +nds32_emit_load_store (rtx reg, rtx mem, + enum machine_mode mode, + int offset, bool load_p) +{ + rtx new_mem; + new_mem = adjust_address (mem, mode, offset); + if (load_p) + emit_move_insn (reg, new_mem); + else + emit_move_insn (new_mem, reg); +} + +static void +nds32_emit_post_inc_load_store (rtx reg, rtx base_reg, + enum machine_mode mode, + bool load_p) +{ + gcc_assert (GET_MODE (reg) == mode); + gcc_assert (GET_MODE (base_reg) == Pmode); + + /* Do not gen (set (reg) (mem (post_inc (reg)))) directly here since it may + not recognize by gcc, so let gcc combine it at auto_inc_dec pass. */ + if (load_p) + emit_move_insn (reg, + gen_rtx_MEM (mode, + base_reg)); + else + emit_move_insn (gen_rtx_MEM (mode, + base_reg), + reg); + + emit_move_insn (base_reg, + plus_constant(Pmode, base_reg, GET_MODE_SIZE (mode))); +} + +static void +nds32_emit_mem_move (rtx src, rtx dst, + enum machine_mode mode, + int addr_offset) +{ + gcc_assert (MEM_P (src) && MEM_P (dst)); + rtx tmp_reg = gen_reg_rtx (mode); + nds32_emit_load_store (tmp_reg, src, mode, + addr_offset, /* load_p */ true); + nds32_emit_load_store (tmp_reg, dst, mode, + addr_offset, /* load_p */ false); +} + +static void +nds32_emit_mem_move_block (int base_regno, int count, + rtx *dst_base_reg, rtx *dst_mem, + rtx *src_base_reg, rtx *src_mem, + bool update_base_reg_p) +{ + rtx new_base_reg; + + emit_insn (nds32_expand_load_multiple (base_regno, count, + *src_base_reg, *src_mem, + update_base_reg_p, &new_base_reg)); + if (update_base_reg_p) + { + *src_base_reg = new_base_reg; + *src_mem = gen_rtx_MEM (SImode, *src_base_reg); + } + + emit_insn (nds32_expand_store_multiple (base_regno, count, + *dst_base_reg, *dst_mem, + update_base_reg_p, &new_base_reg)); + + if (update_base_reg_p) + { + *dst_base_reg = new_base_reg; + *dst_mem = gen_rtx_MEM (SImode, *dst_base_reg); + } +} + +/* ------------------------------------------------------------------------ */ + +/* Auxiliary function for expand movmem pattern. */ + +static bool +nds32_expand_movmemsi_loop_unknown_size (rtx dstmem, rtx srcmem, + rtx size, + rtx alignment) +{ + /* Emit loop version of movmem. + + andi $size_least_3_bit, $size, #~7 + add $dst_end, $dst, $size + move $dst_itr, $dst + move $src_itr, $src + beqz $size_least_3_bit, .Lbyte_mode_entry ! Not large enough. + add $double_word_end, $dst, $size_least_3_bit + + .Ldouble_word_mode_loop: + lmw.bim $tmp-begin, [$src_itr], $tmp-end, #0 ! $src_itr' = $src_itr + smw.bim $tmp-begin, [$dst_itr], $tmp-end, #0 ! $dst_itr' = $dst_itr + ! move will delete after register allocation + move $src_itr, $src_itr' + move $dst_itr, $dst_itr' + ! Not readch upper bound. Loop. + bne $double_word_end, $dst_itr, .Ldouble_word_mode_loop + + .Lbyte_mode_entry: + beq $dst_itr, $dst_end, .Lend_label + .Lbyte_mode_loop: + lbi.bi $tmp, [$src_itr], #1 + sbi.bi $tmp, [$dst_itr], #1 + ! Not readch upper bound. Loop. + bne $dst_itr, $dst_end, .Lbyte_mode_loop + .Lend_label: + */ + rtx dst_base_reg, src_base_reg; + rtx dst_itr, src_itr; + rtx dstmem_m, srcmem_m, dst_itr_m, src_itr_m; + rtx dst_end; + rtx size_least_3_bit; + rtx double_word_end; + rtx double_word_mode_loop, byte_mode_entry, byte_mode_loop, end_label; + rtx tmp; + rtx mask_least_3_bit; + int start_regno; + bool align_to_4_bytes = (INTVAL (alignment) & 3) == 0; + + if (TARGET_ISA_V3M && !align_to_4_bytes) + return 0; + + if (TARGET_REDUCED_REGS) + start_regno = 2; + else + start_regno = 16; + + dst_itr = gen_reg_rtx (Pmode); + src_itr = gen_reg_rtx (Pmode); + dst_end = gen_reg_rtx (Pmode); + tmp = gen_reg_rtx (QImode); + mask_least_3_bit = GEN_INT (~7); + + double_word_mode_loop = gen_label_rtx (); + byte_mode_entry = gen_label_rtx (); + byte_mode_loop = gen_label_rtx (); + end_label = gen_label_rtx (); + + dst_base_reg = copy_to_mode_reg (Pmode, XEXP (dstmem, 0)); + src_base_reg = copy_to_mode_reg (Pmode, XEXP (srcmem, 0)); + /* andi $size_least_3_bit, $size, #~7 */ + size_least_3_bit = expand_binop (SImode, and_optab, size, mask_least_3_bit, + NULL_RTX, 0, OPTAB_WIDEN); + /* add $dst_end, $dst, $size */ + dst_end = expand_binop (Pmode, add_optab, dst_base_reg, size, + NULL_RTX, 0, OPTAB_WIDEN); + + /* move $dst_itr, $dst + move $src_itr, $src */ + emit_move_insn (dst_itr, dst_base_reg); + emit_move_insn (src_itr, src_base_reg); + + /* beqz $size_least_3_bit, .Lbyte_mode_entry ! Not large enough. */ + emit_cmp_and_jump_insns (size_least_3_bit, const0_rtx, EQ, NULL, + SImode, 1, byte_mode_entry); + /* add $double_word_end, $dst, $size_least_3_bit */ + double_word_end = expand_binop (Pmode, add_optab, + dst_base_reg, size_least_3_bit, + NULL_RTX, 0, OPTAB_WIDEN); + + /* .Ldouble_word_mode_loop: */ + emit_label (double_word_mode_loop); + /* lmw.bim $tmp-begin, [$src_itr], $tmp-end, #0 ! $src_itr' = $src_itr + smw.bim $tmp-begin, [$dst_itr], $tmp-end, #0 ! $dst_itr' = $dst_itr */ + src_itr_m = src_itr; + dst_itr_m = dst_itr; + srcmem_m = srcmem; + dstmem_m = dstmem; + nds32_emit_mem_move_block (start_regno, 2, + &dst_itr_m, &dstmem_m, + &src_itr_m, &srcmem_m, + true); + /* move $src_itr, $src_itr' + move $dst_itr, $dst_itr' */ + emit_move_insn (dst_itr, dst_itr_m); + emit_move_insn (src_itr, src_itr_m); + + /* ! Not readch upper bound. Loop. + bne $double_word_end, $dst_itr, .Ldouble_word_mode_loop */ + emit_cmp_and_jump_insns (double_word_end, dst_itr, NE, NULL, + Pmode, 1, double_word_mode_loop); + /* .Lbyte_mode_entry: */ + emit_label (byte_mode_entry); + + /* beq $dst_itr, $dst_end, .Lend_label */ + emit_cmp_and_jump_insns (dst_itr, dst_end, EQ, NULL, + Pmode, 1, end_label); + /* .Lbyte_mode_loop: */ + emit_label (byte_mode_loop); + + /* lbi.bi $tmp, [$src_itr], #1 */ + nds32_emit_post_inc_load_store (tmp, src_itr, QImode, true); + + /* sbi.bi $tmp, [$dst_itr], #1 */ + nds32_emit_post_inc_load_store (tmp, dst_itr, QImode, false); + /* ! Not readch upper bound. Loop. + bne $dst_itr, $dst_end, .Lbyte_mode_loop */ + emit_cmp_and_jump_insns (dst_itr, dst_end, NE, NULL, + SImode, 1, byte_mode_loop); + + /* .Lend_label: */ + emit_label (end_label); + + return true; +} + +static bool +nds32_expand_movmemsi_loop_known_size (rtx dstmem, rtx srcmem, + rtx size, rtx alignment) +{ + rtx dst_base_reg, src_base_reg; + rtx dst_itr, src_itr; + rtx dstmem_m, srcmem_m, dst_itr_m, src_itr_m; + rtx dst_end; + rtx double_word_mode_loop, byte_mode_loop; + rtx tmp; + int start_regno; + bool align_to_4_bytes = (INTVAL (alignment) & 3) == 0; + unsigned HOST_WIDE_INT total_bytes = UINTVAL (size); + + if (TARGET_ISA_V3M && !align_to_4_bytes) + return 0; + + if (TARGET_REDUCED_REGS) + start_regno = 2; + else + start_regno = 16; + + dst_itr = gen_reg_rtx (Pmode); + src_itr = gen_reg_rtx (Pmode); + dst_end = gen_reg_rtx (Pmode); + tmp = gen_reg_rtx (QImode); + + double_word_mode_loop = gen_label_rtx (); + byte_mode_loop = gen_label_rtx (); + + dst_base_reg = copy_to_mode_reg (Pmode, XEXP (dstmem, 0)); + src_base_reg = copy_to_mode_reg (Pmode, XEXP (srcmem, 0)); + + if (total_bytes < 8) + { + /* Emit total_bytes less than 8 loop version of movmem. + add $dst_end, $dst, $size + move $dst_itr, $dst + .Lbyte_mode_loop: + lbi.bi $tmp, [$src_itr], #1 + sbi.bi $tmp, [$dst_itr], #1 + ! Not readch upper bound. Loop. + bne $dst_itr, $dst_end, .Lbyte_mode_loop */ + + /* add $dst_end, $dst, $size */ + dst_end = expand_binop (Pmode, add_optab, dst_base_reg, size, + NULL_RTX, 0, OPTAB_WIDEN); + /* move $dst_itr, $dst + move $src_itr, $src */ + emit_move_insn (dst_itr, dst_base_reg); + emit_move_insn (src_itr, src_base_reg); + + /* .Lbyte_mode_loop: */ + emit_label (byte_mode_loop); + + /* lbi.bi $tmp, [$src_itr], #1 */ + nds32_emit_post_inc_load_store (tmp, src_itr, QImode, true); + + /* sbi.bi $tmp, [$dst_itr], #1 */ + nds32_emit_post_inc_load_store (tmp, dst_itr, QImode, false); + /* ! Not readch upper bound. Loop. + bne $dst_itr, $dst_end, .Lbyte_mode_loop */ + emit_cmp_and_jump_insns (dst_itr, dst_end, NE, NULL, + SImode, 1, byte_mode_loop); + return true; + } + else if (total_bytes % 8 == 0) + { + /* Emit multiple of 8 loop version of movmem. + + add $dst_end, $dst, $size + move $dst_itr, $dst + move $src_itr, $src + + .Ldouble_word_mode_loop: + lmw.bim $tmp-begin, [$src_itr], $tmp-end, #0 ! $src_itr' = $src_itr + smw.bim $tmp-begin, [$dst_itr], $tmp-end, #0 ! $dst_itr' = $dst_itr + ! move will delete after register allocation + move $src_itr, $src_itr' + move $dst_itr, $dst_itr' + ! Not readch upper bound. Loop. + bne $double_word_end, $dst_itr, .Ldouble_word_mode_loop */ + + /* add $dst_end, $dst, $size */ + dst_end = expand_binop (Pmode, add_optab, dst_base_reg, size, + NULL_RTX, 0, OPTAB_WIDEN); + + /* move $dst_itr, $dst + move $src_itr, $src */ + emit_move_insn (dst_itr, dst_base_reg); + emit_move_insn (src_itr, src_base_reg); + + /* .Ldouble_word_mode_loop: */ + emit_label (double_word_mode_loop); + /* lmw.bim $tmp-begin, [$src_itr], $tmp-end, #0 ! $src_itr' = $src_itr + smw.bim $tmp-begin, [$dst_itr], $tmp-end, #0 ! $dst_itr' = $dst_itr */ + src_itr_m = src_itr; + dst_itr_m = dst_itr; + srcmem_m = srcmem; + dstmem_m = dstmem; + nds32_emit_mem_move_block (start_regno, 2, + &dst_itr_m, &dstmem_m, + &src_itr_m, &srcmem_m, + true); + /* move $src_itr, $src_itr' + move $dst_itr, $dst_itr' */ + emit_move_insn (dst_itr, dst_itr_m); + emit_move_insn (src_itr, src_itr_m); + + /* ! Not readch upper bound. Loop. + bne $double_word_end, $dst_itr, .Ldouble_word_mode_loop */ + emit_cmp_and_jump_insns (dst_end, dst_itr, NE, NULL, + Pmode, 1, double_word_mode_loop); + } + else + { + /* Handle size greater than 8, and not a multiple of 8. */ + return nds32_expand_movmemsi_loop_unknown_size (dstmem, srcmem, + size, alignment); + } + + return true; +} + +static bool +nds32_expand_movmemsi_loop (rtx dstmem, rtx srcmem, + rtx size, rtx alignment) +{ + if (CONST_INT_P (size)) + return nds32_expand_movmemsi_loop_known_size (dstmem, srcmem, + size, alignment); + else + return nds32_expand_movmemsi_loop_unknown_size (dstmem, srcmem, + size, alignment); +} + +static bool +nds32_expand_movmemsi_unroll (rtx dstmem, rtx srcmem, + rtx total_bytes, rtx alignment) +{ + rtx dst_base_reg, src_base_reg; + rtx tmp_reg; + int maximum_bytes; + int maximum_bytes_per_inst; + int maximum_regs; + int start_regno; + int i, inst_num; + HOST_WIDE_INT remain_bytes, remain_words; + bool align_to_4_bytes = (INTVAL (alignment) & 3) == 0; + bool align_to_2_bytes = (INTVAL (alignment) & 1) == 0; + + /* Because reduced-set regsiters has few registers + (r0~r5, r6~10, r15, r28~r31, where 'r15' and 'r28~r31' + cannot be used for register allocation), + using 8 registers (32 bytes) for moving memory block + may easily consume all of them. + It makes register allocation/spilling hard to work. + So we only allow maximum=4 registers (16 bytes) for + moving memory block under reduced-set registers. */ + if (TARGET_REDUCED_REGS) + { + maximum_regs = 4; + maximum_bytes = 64; + start_regno = 2; + } + else + { + /* $r25 is $tp so we use up to 8 registers. */ + maximum_regs = 8; + maximum_bytes = 160; + start_regno = 16; + } + maximum_bytes_per_inst = maximum_regs * UNITS_PER_WORD; + + /* 1. Total_bytes is integer for sure. + 2. Alignment is integer for sure. + 3. Maximum 4 or 10 registers and up to 4 instructions, + 4 * 4 * 4 = 64 bytes, 8 * 4 * 10 = 160 bytes. + 4. The dstmem cannot be volatile memory access. + 5. The srcmem cannot be volatile memory access. + 6. Known shared alignment not align to 4 byte in v3m since lmw/smw *NOT* + support unalign access with v3m configure. */ + if (GET_CODE (total_bytes) != CONST_INT + || GET_CODE (alignment) != CONST_INT + || INTVAL (total_bytes) > maximum_bytes + || MEM_VOLATILE_P (dstmem) + || MEM_VOLATILE_P (srcmem) + || (TARGET_ISA_V3M && !align_to_4_bytes)) + return false; + + dst_base_reg = copy_to_mode_reg (SImode, XEXP (dstmem, 0)); + src_base_reg = copy_to_mode_reg (SImode, XEXP (srcmem, 0)); + remain_bytes = INTVAL (total_bytes); + + /* Do not update base address for last lmw/smw pair. */ + inst_num = ((INTVAL (total_bytes) + (maximum_bytes_per_inst - 1)) + / maximum_bytes_per_inst) - 1; + + for (i = 0; i < inst_num; i++) + { + nds32_emit_mem_move_block (start_regno, maximum_regs, + &dst_base_reg, &dstmem, + &src_base_reg, &srcmem, + true); + } + remain_bytes -= maximum_bytes_per_inst * inst_num; + + remain_words = remain_bytes / UNITS_PER_WORD; + remain_bytes = remain_bytes - (remain_words * UNITS_PER_WORD); + + if (remain_words != 0) + { + if (remain_bytes != 0) + nds32_emit_mem_move_block (start_regno, remain_words, + &dst_base_reg, &dstmem, + &src_base_reg, &srcmem, + true); + else + { + /* Do not update address if no further byte to move. */ + if (remain_words == 1) + { + /* emit move instruction if align to 4 byte and only 1 + word to move. */ + if (align_to_4_bytes) + nds32_emit_mem_move (srcmem, dstmem, SImode, 0); + else + { + tmp_reg = gen_reg_rtx (SImode); + emit_insn ( + gen_unaligned_load_w (tmp_reg, + gen_rtx_MEM (SImode, src_base_reg))); + emit_insn ( + gen_unaligned_store_w (gen_rtx_MEM (SImode, dst_base_reg), + tmp_reg)); + } + } + else + nds32_emit_mem_move_block (start_regno, remain_words, + &dst_base_reg, &dstmem, + &src_base_reg, &srcmem, + false); + } + } + + switch (remain_bytes) + { + case 3: + case 2: + { + if (align_to_2_bytes) + nds32_emit_mem_move (srcmem, dstmem, HImode, 0); + else + { + nds32_emit_mem_move (srcmem, dstmem, QImode, 0); + nds32_emit_mem_move (srcmem, dstmem, QImode, 1); + } + + if (remain_bytes == 3) + nds32_emit_mem_move (srcmem, dstmem, QImode, 2); + break; + } + case 1: + nds32_emit_mem_move (srcmem, dstmem, QImode, 0); + break; + case 0: + break; + default: + gcc_unreachable (); + } + + /* Successfully create patterns, return true. */ + return true; +} + +/* Function to move block memory content by + using load_multiple and store_multiple. + This is auxiliary extern function to help create rtx template. + Check nds32-multiple.md file for the patterns. */ +bool +nds32_expand_movmemsi (rtx dstmem, rtx srcmem, rtx total_bytes, rtx alignment) +{ + if (nds32_expand_movmemsi_unroll (dstmem, srcmem, total_bytes, alignment)) + return true; + + if (!optimize_size && optimize > 2) + return nds32_expand_movmemsi_loop (dstmem, srcmem, total_bytes, alignment); + + return false; +} + +/* ------------------------------------------------------------------------ */ + +/* Auxiliary function for expand setmem pattern. */ + +static rtx +nds32_gen_dup_4_byte_to_word_value_aux (rtx value, rtx value4word) +{ + gcc_assert (GET_MODE (value) == QImode || CONST_INT_P (value)); + + if (CONST_INT_P (value)) + { + unsigned HOST_WIDE_INT val = UINTVAL (value) & GET_MODE_MASK(QImode); + rtx new_val = gen_int_mode (val | (val << 8) + | (val << 16) | (val << 24), SImode); + /* Just calculate at here if it's constant value. */ + emit_move_insn (value4word, new_val); + } + else + { + if (NDS32_EXT_DSP_P ()) + { + /* ! prepare word + insb $tmp, $value, 1 ! $tmp <- 0x0000abab + pkbb16 $tmp6, $tmp2, $tmp2 ! $value4word <- 0xabababab */ + rtx tmp = gen_reg_rtx (SImode); + + convert_move (tmp, value, true); + + emit_insn ( + gen_insvsi_internal (tmp, gen_int_mode (0x8, SImode), tmp)); + + emit_insn (gen_pkbbsi_1 (value4word, tmp, tmp)); + } + else + { + /* ! prepare word + andi $tmp1, $value, 0xff ! $tmp1 <- 0x000000ab + slli $tmp2, $tmp1, 8 ! $tmp2 <- 0x0000ab00 + or $tmp3, $tmp1, $tmp2 ! $tmp3 <- 0x0000abab + slli $tmp4, $tmp3, 16 ! $tmp4 <- 0xabab0000 + or $val4word, $tmp3, $tmp4 ! $value4word <- 0xabababab */ + + rtx tmp1, tmp2, tmp3, tmp4; + tmp1 = expand_binop (SImode, and_optab, value, + gen_int_mode (0xff, SImode), + NULL_RTX, 0, OPTAB_WIDEN); + tmp2 = expand_binop (SImode, ashl_optab, tmp1, + gen_int_mode (8, SImode), + NULL_RTX, 0, OPTAB_WIDEN); + tmp3 = expand_binop (SImode, ior_optab, tmp1, tmp2, + NULL_RTX, 0, OPTAB_WIDEN); + tmp4 = expand_binop (SImode, ashl_optab, tmp3, + gen_int_mode (16, SImode), + NULL_RTX, 0, OPTAB_WIDEN); + + emit_insn (gen_iorsi3 (value4word, tmp3, tmp4)); + } + } + + return value4word; +} + +static rtx +nds32_gen_dup_4_byte_to_word_value (rtx value) +{ + rtx value4word = gen_reg_rtx (SImode); + nds32_gen_dup_4_byte_to_word_value_aux (value, value4word); + + return value4word; +} + +static rtx +nds32_gen_dup_8_byte_to_double_word_value (rtx value) +{ + rtx value4doubleword = gen_reg_rtx (DImode); + + nds32_gen_dup_4_byte_to_word_value_aux ( + value, nds32_di_low_part_subreg(value4doubleword)); + + emit_move_insn (nds32_di_high_part_subreg(value4doubleword), + nds32_di_low_part_subreg(value4doubleword)); + return value4doubleword; +} + + +static rtx +emit_setmem_doubleword_loop (rtx itr, rtx size, rtx value) +{ + rtx word_mode_label = gen_label_rtx (); + rtx word_mode_end_label = gen_label_rtx (); + rtx byte_mode_size = gen_reg_rtx (SImode); + rtx byte_mode_size_tmp = gen_reg_rtx (SImode); + rtx word_mode_end = gen_reg_rtx (SImode); + rtx size_for_word = gen_reg_rtx (SImode); + + /* and $size_for_word, $size, #~0x7 */ + size_for_word = expand_binop (SImode, and_optab, size, + gen_int_mode (~0x7, SImode), + NULL_RTX, 0, OPTAB_WIDEN); + + emit_move_insn (byte_mode_size, size); + + /* beqz $size_for_word, .Lbyte_mode_entry */ + emit_cmp_and_jump_insns (size_for_word, const0_rtx, EQ, NULL, + SImode, 1, word_mode_end_label); + /* add $word_mode_end, $dst, $size_for_word */ + word_mode_end = expand_binop (Pmode, add_optab, itr, size_for_word, + NULL_RTX, 0, OPTAB_WIDEN); + + /* andi $byte_mode_size, $size, 0x7 */ + byte_mode_size_tmp = expand_binop (SImode, and_optab, size, GEN_INT (0x7), + NULL_RTX, 0, OPTAB_WIDEN); + + emit_move_insn (byte_mode_size, byte_mode_size_tmp); + + /* .Lword_mode: */ + emit_label (word_mode_label); + /* ! word-mode set loop + smw.bim $value4word, [$dst_itr], $value4word, 0 + bne $word_mode_end, $dst_itr, .Lword_mode */ + emit_insn (gen_unaligned_store_update_base_dw (itr, + itr, + value)); + emit_cmp_and_jump_insns (word_mode_end, itr, NE, NULL, + Pmode, 1, word_mode_label); + + emit_label (word_mode_end_label); + + return byte_mode_size; +} + +static rtx +emit_setmem_byte_loop (rtx itr, rtx size, rtx value, bool need_end) +{ + rtx end = gen_reg_rtx (Pmode); + rtx byte_mode_label = gen_label_rtx (); + rtx end_label = gen_label_rtx (); + + value = force_reg (QImode, value); + + if (need_end) + end = expand_binop (Pmode, add_optab, itr, size, + NULL_RTX, 0, OPTAB_WIDEN); + /* beqz $byte_mode_size, .Lend + add $byte_mode_end, $dst_itr, $byte_mode_size */ + emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL, + SImode, 1, end_label); + + if (!need_end) + end = expand_binop (Pmode, add_optab, itr, size, + NULL_RTX, 0, OPTAB_WIDEN); + + /* .Lbyte_mode: */ + emit_label (byte_mode_label); + + /* ! byte-mode set loop + sbi.bi $value, [$dst_itr] ,1 + bne $byte_mode_end, $dst_itr, .Lbyte_mode */ + nds32_emit_post_inc_load_store (value, itr, QImode, false); + + emit_cmp_and_jump_insns (end, itr, NE, NULL, + Pmode, 1, byte_mode_label); + /* .Lend: */ + emit_label (end_label); + + if (need_end) + return end; + else + return NULL_RTX; +} + +static bool +nds32_expand_setmem_loop (rtx dstmem, rtx size, rtx value) +{ + rtx value4doubleword; + rtx value4byte; + rtx dst; + rtx byte_mode_size; + + /* Emit loop version of setmem. + memset: + ! prepare word + andi $tmp1, $val, 0xff ! $tmp1 <- 0x000000ab + slli $tmp2, $tmp1, 8 ! $tmp2 <- 0x0000ab00 + or $tmp3, $val, $tmp2 ! $tmp3 <- 0x0000abab + slli $tmp4, $tmp3, 16 ! $tmp4 <- 0xabab0000 + or $val4word, $tmp3, $tmp4 ! $value4word <- 0xabababab + + and $size_for_word, $size, #-4 + beqz $size_for_word, .Lword_mode_end + + add $word_mode_end, $dst, $size_for_word + andi $byte_mode_size, $size, 3 + + .Lword_mode: + ! word-mode set loop + smw.bim $value4word, [$dst], $value4word, 0 + bne $word_mode_end, $dst, .Lword_mode + + .Lword_mode_end: + beqz $byte_mode_size, .Lend + add $byte_mode_end, $dst, $byte_mode_size + + .Lbyte_mode: + ! byte-mode set loop + sbi.bi $value4word, [$dst] ,1 + bne $byte_mode_end, $dst, .Lbyte_mode + .Lend: */ + + dst = copy_to_mode_reg (SImode, XEXP (dstmem, 0)); + + /* ! prepare word + andi $tmp1, $value, 0xff ! $tmp1 <- 0x000000ab + slli $tmp2, $tmp1, 8 ! $tmp2 <- 0x0000ab00 + or $tmp3, $tmp1, $tmp2 ! $tmp3 <- 0x0000abab + slli $tmp4, $tmp3, 16 ! $tmp4 <- 0xabab0000 + or $val4word, $tmp3, $tmp4 ! $value4word <- 0xabababab */ + value4doubleword = nds32_gen_dup_8_byte_to_double_word_value (value); + + /* and $size_for_word, $size, #-4 + beqz $size_for_word, .Lword_mode_end + + add $word_mode_end, $dst, $size_for_word + andi $byte_mode_size, $size, 3 + + .Lword_mode: + ! word-mode set loop + smw.bim $value4word, [$dst], $value4word, 0 + bne $word_mode_end, $dst, .Lword_mode + .Lword_mode_end: */ + byte_mode_size = emit_setmem_doubleword_loop (dst, size, value4doubleword); + + /* beqz $byte_mode_size, .Lend + add $byte_mode_end, $dst, $byte_mode_size + + .Lbyte_mode: + ! byte-mode set loop + sbi.bi $value, [$dst] ,1 + bne $byte_mode_end, $dst, .Lbyte_mode + .Lend: */ + + value4byte = simplify_gen_subreg (QImode, value4doubleword, DImode, + subreg_lowpart_offset (QImode, DImode)); + + emit_setmem_byte_loop (dst, byte_mode_size, value4byte, false); + + return true; +} + +static bool +nds32_expand_setmem_loop_v3m (rtx dstmem, rtx size, rtx value) +{ + rtx base_reg = copy_to_mode_reg (Pmode, XEXP (dstmem, 0)); + rtx need_align_bytes = gen_reg_rtx (SImode); + rtx last_2_bit = gen_reg_rtx (SImode); + rtx byte_loop_base = gen_reg_rtx (SImode); + rtx byte_loop_size = gen_reg_rtx (SImode); + rtx remain_size = gen_reg_rtx (SImode); + rtx new_base_reg; + rtx value4byte, value4doubleword; + rtx byte_mode_size; + rtx last_byte_loop_label = gen_label_rtx (); + + size = force_reg (SImode, size); + + value4doubleword = nds32_gen_dup_8_byte_to_double_word_value (value); + value4byte = simplify_gen_subreg (QImode, value4doubleword, DImode, + subreg_lowpart_offset (QImode, DImode)); + + emit_move_insn (byte_loop_size, size); + emit_move_insn (byte_loop_base, base_reg); + + /* Jump to last byte loop if size is less than 16. */ + emit_cmp_and_jump_insns (size, gen_int_mode (16, SImode), LE, NULL, + SImode, 1, last_byte_loop_label); + + /* Make sure align to 4 byte first since v3m can't unalign access. */ + emit_insn (gen_andsi3 (last_2_bit, + base_reg, + gen_int_mode (0x3, SImode))); + + emit_insn (gen_subsi3 (need_align_bytes, + gen_int_mode (4, SImode), + last_2_bit)); + + /* Align to 4 byte. */ + new_base_reg = emit_setmem_byte_loop (base_reg, + need_align_bytes, + value4byte, + true); + + /* Calculate remain size. */ + emit_insn (gen_subsi3 (remain_size, size, need_align_bytes)); + + /* Set memory word by word. */ + byte_mode_size = emit_setmem_doubleword_loop (new_base_reg, + remain_size, + value4doubleword); + + emit_move_insn (byte_loop_base, new_base_reg); + emit_move_insn (byte_loop_size, byte_mode_size); + + emit_label (last_byte_loop_label); + + /* And set memory for remain bytes. */ + emit_setmem_byte_loop (byte_loop_base, byte_loop_size, value4byte, false); + return true; +} + +static bool +nds32_expand_setmem_unroll (rtx dstmem, rtx size, rtx value, + rtx align ATTRIBUTE_UNUSED, + rtx expected_align ATTRIBUTE_UNUSED, + rtx expected_size ATTRIBUTE_UNUSED) +{ + unsigned maximum_regs, maximum_bytes, start_regno, regno; + rtx value4word; + rtx dst_base_reg, new_base_reg; + unsigned HOST_WIDE_INT remain_bytes, remain_words, prepare_regs, fill_per_smw; + unsigned HOST_WIDE_INT real_size; + + if (TARGET_REDUCED_REGS) + { + maximum_regs = 4; + maximum_bytes = 64; + start_regno = 2; + } + else + { + maximum_regs = 8; + maximum_bytes = 128; + start_regno = 16; + } + + real_size = UINTVAL (size) & GET_MODE_MASK(SImode); + + if (!(CONST_INT_P (size) && real_size <= maximum_bytes)) + return false; + + remain_bytes = real_size; + + gcc_assert (GET_MODE (value) == QImode || CONST_INT_P (value)); + + value4word = nds32_gen_dup_4_byte_to_word_value (value); + + prepare_regs = remain_bytes / UNITS_PER_WORD; + + dst_base_reg = copy_to_mode_reg (SImode, XEXP (dstmem, 0)); + + if (prepare_regs > maximum_regs) + prepare_regs = maximum_regs; + + fill_per_smw = prepare_regs * UNITS_PER_WORD; + + regno = start_regno; + switch (prepare_regs) + { + case 2: + default: + { + rtx reg0 = gen_rtx_REG (SImode, regno); + rtx reg1 = gen_rtx_REG (SImode, regno+1); + unsigned last_regno = start_regno + prepare_regs - 1; + + emit_move_insn (reg0, value4word); + emit_move_insn (reg1, value4word); + rtx regd = gen_rtx_REG (DImode, regno); + regno += 2; + + /* Try to utilize movd44! */ + while (regno <= last_regno) + { + if ((regno + 1) <=last_regno) + { + rtx reg = gen_rtx_REG (DImode, regno); + emit_move_insn (reg, regd); + regno += 2; + } + else + { + rtx reg = gen_rtx_REG (SImode, regno); + emit_move_insn (reg, reg0); + regno += 1; + } + } + break; + } + case 1: + { + rtx reg = gen_rtx_REG (SImode, regno++); + emit_move_insn (reg, value4word); + } + break; + case 0: + break; + } + + if (fill_per_smw) + for (;remain_bytes >= fill_per_smw;remain_bytes -= fill_per_smw) + { + emit_insn (nds32_expand_store_multiple (start_regno, prepare_regs, + dst_base_reg, dstmem, + true, &new_base_reg)); + dst_base_reg = new_base_reg; + dstmem = gen_rtx_MEM (SImode, dst_base_reg); + } + + remain_words = remain_bytes / UNITS_PER_WORD; + + if (remain_words) + { + emit_insn (nds32_expand_store_multiple (start_regno, remain_words, + dst_base_reg, dstmem, + true, &new_base_reg)); + dst_base_reg = new_base_reg; + dstmem = gen_rtx_MEM (SImode, dst_base_reg); + } + + remain_bytes = remain_bytes - (remain_words * UNITS_PER_WORD); + + if (remain_bytes) + { + value = simplify_gen_subreg (QImode, value4word, SImode, + subreg_lowpart_offset(QImode, SImode)); + int offset = 0; + for (;remain_bytes;--remain_bytes, ++offset) + { + nds32_emit_load_store (value, dstmem, QImode, offset, false); + } + } + + return true; +} + +bool +nds32_expand_setmem (rtx dstmem, rtx size, rtx value, rtx align, + rtx expected_align, + rtx expected_size) +{ + bool align_to_4_bytes = (INTVAL (align) & 3) == 0; + + /* Only expand at O3 */ + if (optimize_size || optimize < 3) + return false; + + if (TARGET_ISA_V3M && !align_to_4_bytes) + return nds32_expand_setmem_loop_v3m (dstmem, size, value); + + if (nds32_expand_setmem_unroll (dstmem, size, value, + align, expected_align, expected_size)) + return true; + + return nds32_expand_setmem_loop (dstmem, size, value); +} + +/* ------------------------------------------------------------------------ */ + +/* Auxiliary function for expand strlen pattern. */ + +bool +nds32_expand_strlen (rtx result, rtx str, + rtx target_char, rtx align ATTRIBUTE_UNUSED) +{ + rtx base_reg, backup_base_reg; + rtx ffb_result; + rtx target_char_ptr, length; + rtx loop_label, tmp; + + if (optimize_size || optimize < 3) + return false; + + gcc_assert (MEM_P (str)); + gcc_assert (CONST_INT_P (target_char) || REG_P (target_char)); + + base_reg = copy_to_mode_reg (SImode, XEXP (str, 0)); + loop_label = gen_label_rtx (); + + ffb_result = gen_reg_rtx (Pmode); + tmp = gen_reg_rtx (SImode); + backup_base_reg = gen_reg_rtx (SImode); + + /* Emit loop version of strlen. + move $backup_base, $base + .Lloop: + lmw.bim $tmp, [$base], $tmp, 0 + ffb $ffb_result, $tmp, $target_char ! is there $target_char? + beqz $ffb_result, .Lloop + add $last_char_ptr, $base, $ffb_result + sub $length, $last_char_ptr, $backup_base */ + + /* move $backup_base, $base */ + emit_move_insn (backup_base_reg, base_reg); + + /* .Lloop: */ + emit_label (loop_label); + /* lmw.bim $tmp, [$base], $tmp, 0 */ + emit_insn (gen_unaligned_load_update_base_w (base_reg, tmp, base_reg)); + + /* ffb $ffb_result, $tmp, $target_char ! is there $target_char? */ + emit_insn (gen_unspec_ffb (ffb_result, tmp, target_char)); + + /* beqz $ffb_result, .Lloop */ + emit_cmp_and_jump_insns (ffb_result, const0_rtx, EQ, NULL, + SImode, 1, loop_label); + + /* add $target_char_ptr, $base, $ffb_result */ + target_char_ptr = expand_binop (Pmode, add_optab, base_reg, + ffb_result, NULL_RTX, 0, OPTAB_WIDEN); + + /* sub $length, $target_char_ptr, $backup_base */ + length = expand_binop (Pmode, sub_optab, target_char_ptr, + backup_base_reg, NULL_RTX, 0, OPTAB_WIDEN); + + emit_move_insn (result, length); + + return true; +} /* ------------------------------------------------------------------------ */ @@ -38,16 +1069,50 @@ Check nds32-multiple.md file for the patterns. */ rtx nds32_expand_load_multiple (int base_regno, int count, - rtx base_addr, rtx basemem) + rtx base_addr, rtx basemem, + bool update_base_reg_p, + rtx *update_base_reg) { int par_index; int offset; + int start_idx; rtx result; rtx new_addr, mem, reg; - /* Create the pattern that is presented in nds32-multiple.md. */ + /* Generate a unaligned load to prevent load instruction pull out from + parallel, and then it will generate lwi, and lose unaligned acces */ + if (count == 1) + { + reg = gen_rtx_REG (SImode, base_regno); + if (update_base_reg_p) + { + *update_base_reg = gen_reg_rtx (SImode); + return gen_unaligned_load_update_base_w (*update_base_reg, reg, base_addr); + } + else + return gen_unaligned_load_w (reg, gen_rtx_MEM (SImode, base_addr)); + } - result = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (count)); + /* Create the pattern that is presented in nds32-multiple.md. */ + if (update_base_reg_p) + { + result = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (count + 1)); + start_idx = 1; + } + else + { + result = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (count)); + start_idx = 0; + } + + if (update_base_reg_p) + { + offset = count * 4; + new_addr = plus_constant (Pmode, base_addr, offset); + *update_base_reg = gen_reg_rtx (SImode); + + XVECEXP (result, 0, 0) = gen_rtx_SET (*update_base_reg, new_addr); + } for (par_index = 0; par_index < count; par_index++) { @@ -58,7 +1123,7 @@ new_addr, offset); reg = gen_rtx_REG (SImode, base_regno + par_index); - XVECEXP (result, 0, par_index) = gen_rtx_SET (reg, mem); + XVECEXP (result, 0, (par_index + start_idx)) = gen_rtx_SET (reg, mem); } return result; @@ -66,16 +1131,49 @@ rtx nds32_expand_store_multiple (int base_regno, int count, - rtx base_addr, rtx basemem) + rtx base_addr, rtx basemem, + bool update_base_reg_p, + rtx *update_base_reg) { int par_index; int offset; + int start_idx; rtx result; rtx new_addr, mem, reg; + if (count == 1) + { + reg = gen_rtx_REG (SImode, base_regno); + if (update_base_reg_p) + { + *update_base_reg = gen_reg_rtx (SImode); + return gen_unaligned_store_update_base_w (*update_base_reg, base_addr, reg); + } + else + return gen_unaligned_store_w (gen_rtx_MEM (SImode, base_addr), reg); + } + /* Create the pattern that is presented in nds32-multiple.md. */ - result = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (count)); + if (update_base_reg_p) + { + result = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (count + 1)); + start_idx = 1; + } + else + { + result = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (count)); + start_idx = 0; + } + + if (update_base_reg_p) + { + offset = count * 4; + new_addr = plus_constant (Pmode, base_addr, offset); + *update_base_reg = gen_reg_rtx (SImode); + + XVECEXP (result, 0, 0) = gen_rtx_SET (*update_base_reg, new_addr); + } for (par_index = 0; par_index < count; par_index++) { @@ -86,58 +1184,10 @@ new_addr, offset); reg = gen_rtx_REG (SImode, base_regno + par_index); - XVECEXP (result, 0, par_index) = gen_rtx_SET (mem, reg); + XVECEXP (result, 0, par_index + start_idx) = gen_rtx_SET (mem, reg); } return result; } -/* Function to move block memory content by - using load_multiple and store_multiple. - This is auxiliary extern function to help create rtx template. - Check nds32-multiple.md file for the patterns. */ -int -nds32_expand_movmemqi (rtx dstmem, rtx srcmem, rtx total_bytes, rtx alignment) -{ - HOST_WIDE_INT in_words, out_words; - rtx dst_base_reg, src_base_reg; - int maximum_bytes; - - /* Because reduced-set regsiters has few registers - (r0~r5, r6~10, r15, r28~r31, where 'r15' and 'r28~r31' - cannot be used for register allocation), - using 8 registers (32 bytes) for moving memory block - may easily consume all of them. - It makes register allocation/spilling hard to work. - So we only allow maximum=4 registers (16 bytes) for - moving memory block under reduced-set registers. */ - if (TARGET_REDUCED_REGS) - maximum_bytes = 16; - else - maximum_bytes = 32; - - /* 1. Total_bytes is integer for sure. - 2. Alignment is integer for sure. - 3. Maximum 4 or 8 registers, 4 * 4 = 16 bytes, 8 * 4 = 32 bytes. - 4. Requires (n * 4) block size. - 5. Requires 4-byte alignment. */ - if (GET_CODE (total_bytes) != CONST_INT - || GET_CODE (alignment) != CONST_INT - || INTVAL (total_bytes) > maximum_bytes - || INTVAL (total_bytes) & 3 - || INTVAL (alignment) & 3) - return 0; - - dst_base_reg = copy_to_mode_reg (SImode, XEXP (dstmem, 0)); - src_base_reg = copy_to_mode_reg (SImode, XEXP (srcmem, 0)); - - out_words = in_words = INTVAL (total_bytes) / UNITS_PER_WORD; - - emit_insn (nds32_expand_load_multiple (0, in_words, src_base_reg, srcmem)); - emit_insn (nds32_expand_store_multiple (0, out_words, dst_base_reg, dstmem)); - - /* Successfully create patterns, return 1. */ - return 1; -} - /* ------------------------------------------------------------------------ */