diff gcc/config/nds32/nds32-memory-manipulation.c @ 131:84e7813d76e9

gcc-8.2
author mir3636
date Thu, 25 Oct 2018 07:37:49 +0900
parents 04ced10e8804
children 1830386684a0
--- a/gcc/config/nds32/nds32-memory-manipulation.c	Fri Oct 27 22:46:09 2017 +0900
+++ b/gcc/config/nds32/nds32-memory-manipulation.c	Thu Oct 25 07:37:49 2018 +0900
@@ -1,6 +1,6 @@
 /* Auxiliary functions for expand movmem, setmem, cmpmem, load_multiple
    and store_multiple pattern of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   Copyright (C) 2012-2018 Free Software Foundation, Inc.
    Contributed by Andes Technology Corporation.
 
    This file is part of GCC.
@@ -21,6 +21,8 @@
 
 /* ------------------------------------------------------------------------ */
 
+#define IN_TARGET_CODE 1
+
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
@@ -30,6 +32,1035 @@
 #include "memmodel.h"
 #include "emit-rtl.h"
 #include "explow.h"
+#include "tree.h"
+#include "expr.h"
+#include "optabs.h"
+#include "nds32-protos.h"
+
+/* ------------------------------------------------------------------------ */
+
+/* Auxiliary static function definitions.  */
+
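+/* Emit a load of REG from MEM at byte OFFSET in MODE, or the corresponding
+   store of REG into that location, depending on LOAD_P.  */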
+static void
+nds32_emit_load_store (rtx reg, rtx mem,
+		       enum machine_mode mode,
+		       int offset, bool load_p)
+{
+  rtx new_mem;
+  new_mem = adjust_address (mem, mode, offset);
+  if (load_p)
+    emit_move_insn (reg, new_mem);
+  else
+    emit_move_insn (new_mem, reg);
+}
+
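+/* Emit REG <- [BASE_REG] (or the store when !LOAD_P) followed by an explicit
+   BASE_REG += size-of-MODE, so that the auto_inc_dec pass can later fuse the
+   two insns into a post-increment access.  */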
+static void
+nds32_emit_post_inc_load_store (rtx reg, rtx base_reg,
+				enum machine_mode mode,
+				bool load_p)
+{
+  gcc_assert (GET_MODE (reg) == mode);
+  gcc_assert (GET_MODE (base_reg) == Pmode);
+
+  /* Do not gen (set (reg) (mem (post_inc (reg)))) directly here since it
+     may not be recognized by gcc; let gcc combine it in the auto_inc_dec
+     pass.  */
+  if (load_p)
+    emit_move_insn (reg,
+		    gen_rtx_MEM (mode,
+				 base_reg));
+  else
+    emit_move_insn (gen_rtx_MEM (mode,
+				 base_reg),
+		    reg);
+
+  emit_move_insn (base_reg,
+		  plus_constant(Pmode, base_reg, GET_MODE_SIZE (mode)));
+}
+
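+/* Copy one MODE-sized value at byte ADDR_OFFSET from SRC to DST (both MEMs)
+   through a fresh temporary register.  */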
+static void
+nds32_emit_mem_move (rtx src, rtx dst,
+		     enum machine_mode mode,
+		     int addr_offset)
+{
+  gcc_assert (MEM_P (src) && MEM_P (dst));
+  rtx tmp_reg = gen_reg_rtx (mode);
+  nds32_emit_load_store (tmp_reg, src, mode,
+			 addr_offset, /* load_p */ true);
+  nds32_emit_load_store (tmp_reg, dst, mode,
+			 addr_offset, /* load_p */ false);
+}
+
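+/* Copy COUNT words from *SRC_MEM to *DST_MEM with one load-multiple/
+   store-multiple pair using registers BASE_REGNO .. BASE_REGNO + COUNT - 1.
+   If UPDATE_BASE_REG_P, rewrite the base registers and MEMs to point just
+   past the copied block.  */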
+static void
+nds32_emit_mem_move_block (int base_regno, int count,
+			   rtx *dst_base_reg, rtx *dst_mem,
+			   rtx *src_base_reg, rtx *src_mem,
+			   bool update_base_reg_p)
+{
+  rtx new_base_reg;
+
+  emit_insn (nds32_expand_load_multiple (base_regno, count,
+					 *src_base_reg, *src_mem,
+					 update_base_reg_p, &new_base_reg));
+  if (update_base_reg_p)
+    {
+      *src_base_reg = new_base_reg;
+      *src_mem = gen_rtx_MEM (SImode, *src_base_reg);
+    }
+
+  emit_insn (nds32_expand_store_multiple (base_regno, count,
+					  *dst_base_reg, *dst_mem,
+					  update_base_reg_p, &new_base_reg));
+
+  if (update_base_reg_p)
+    {
+      *dst_base_reg = new_base_reg;
+      *dst_mem = gen_rtx_MEM (SImode, *dst_base_reg);
+    }
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* Auxiliary function for expand movmem pattern.  */
+
+static bool
+nds32_expand_movmemsi_loop_unknown_size (rtx dstmem, rtx srcmem,
+					 rtx size,
+					 rtx alignment)
+{
+  /* Emit loop version of movmem.
+
+       andi    $size_least_3_bit, $size, #~7
+       add     $dst_end, $dst, $size
+       move    $dst_itr, $dst
+       move    $src_itr, $src
+       beqz    $size_least_3_bit, .Lbyte_mode_entry ! Not large enough.
+       add     $double_word_end, $dst, $size_least_3_bit
+
+     .Ldouble_word_mode_loop:
+       lmw.bim $tmp-begin, [$src_itr], $tmp-end, #0 ! $src_itr' = $src_itr
+       smw.bim $tmp-begin, [$dst_itr], $tmp-end, #0 ! $dst_itr' = $dst_itr
+       ! move will be deleted after register allocation
+       move    $src_itr, $src_itr'
+       move    $dst_itr, $dst_itr'
+       ! Not reached the upper bound.  Loop.
+       bne     $double_word_end, $dst_itr, .Ldouble_word_mode_loop
+
+     .Lbyte_mode_entry:
+       beq     $dst_itr, $dst_end, .Lend_label
+     .Lbyte_mode_loop:
+       lbi.bi  $tmp, [$src_itr], #1
+       sbi.bi  $tmp, [$dst_itr], #1
+       ! Not reached the upper bound.  Loop.
+       bne     $dst_itr, $dst_end, .Lbyte_mode_loop
+     .Lend_label:
+  */
+  rtx dst_base_reg, src_base_reg;
+  rtx dst_itr, src_itr;
+  rtx dstmem_m, srcmem_m, dst_itr_m, src_itr_m;
+  rtx dst_end;
+  rtx size_least_3_bit;
+  rtx double_word_end;
+  rtx double_word_mode_loop, byte_mode_entry, byte_mode_loop, end_label;
+  rtx tmp;
+  rtx mask_least_3_bit;
+  int start_regno;
+  bool align_to_4_bytes = (INTVAL (alignment) & 3) == 0;
+
+  if (TARGET_ISA_V3M && !align_to_4_bytes)
+    return 0;
+
+  if (TARGET_REDUCED_REGS)
+    start_regno = 2;
+  else
+    start_regno = 16;
+
+  dst_itr = gen_reg_rtx (Pmode);
+  src_itr = gen_reg_rtx (Pmode);
+  dst_end = gen_reg_rtx (Pmode);
+  tmp = gen_reg_rtx (QImode);
+  mask_least_3_bit = GEN_INT (~7);
+
+  double_word_mode_loop = gen_label_rtx ();
+  byte_mode_entry = gen_label_rtx ();
+  byte_mode_loop = gen_label_rtx ();
+  end_label = gen_label_rtx ();
+
+  dst_base_reg = copy_to_mode_reg (Pmode, XEXP (dstmem, 0));
+  src_base_reg = copy_to_mode_reg (Pmode, XEXP (srcmem, 0));
+  /* andi   $size_least_3_bit, $size, #~7 */
+  size_least_3_bit = expand_binop (SImode, and_optab, size, mask_least_3_bit,
+				   NULL_RTX, 0, OPTAB_WIDEN);
+  /* add     $dst_end, $dst, $size */
+  dst_end = expand_binop (Pmode, add_optab, dst_base_reg, size,
+			  NULL_RTX, 0, OPTAB_WIDEN);
+
+  /* move    $dst_itr, $dst
+     move    $src_itr, $src */
+  emit_move_insn (dst_itr, dst_base_reg);
+  emit_move_insn (src_itr, src_base_reg);
+
+  /* beqz    $size_least_3_bit, .Lbyte_mode_entry ! Not large enough. */
+  emit_cmp_and_jump_insns (size_least_3_bit, const0_rtx, EQ, NULL,
+			   SImode, 1, byte_mode_entry);
+  /* add     $double_word_end, $dst, $size_least_3_bit */
+  double_word_end = expand_binop (Pmode, add_optab,
+				  dst_base_reg, size_least_3_bit,
+				  NULL_RTX, 0, OPTAB_WIDEN);
+
+  /* .Ldouble_word_mode_loop: */
+  emit_label (double_word_mode_loop);
+  /* lmw.bim $tmp-begin, [$src_itr], $tmp-end, #0 ! $src_itr' = $src_itr
+     smw.bim $tmp-begin, [$dst_itr], $tmp-end, #0 ! $dst_itr' = $dst_itr */
+  src_itr_m = src_itr;
+  dst_itr_m = dst_itr;
+  srcmem_m = srcmem;
+  dstmem_m = dstmem;
+  nds32_emit_mem_move_block (start_regno, 2,
+			     &dst_itr_m, &dstmem_m,
+			     &src_itr_m, &srcmem_m,
+			     true);
+  /* move    $src_itr, $src_itr'
+     move    $dst_itr, $dst_itr' */
+  emit_move_insn (dst_itr, dst_itr_m);
+  emit_move_insn (src_itr, src_itr_m);
+
+  /* ! Not reached the upper bound.  Loop.
+     bne     $double_word_end, $dst_itr, .Ldouble_word_mode_loop */
+  emit_cmp_and_jump_insns (double_word_end, dst_itr, NE, NULL,
+			   Pmode, 1, double_word_mode_loop);
+  /* .Lbyte_mode_entry: */
+  emit_label (byte_mode_entry);
+
+  /* beq     $dst_itr, $dst_end, .Lend_label */
+  emit_cmp_and_jump_insns (dst_itr, dst_end, EQ, NULL,
+			   Pmode, 1, end_label);
+  /* .Lbyte_mode_loop: */
+  emit_label (byte_mode_loop);
+
+  /* lbi.bi  $tmp, [$src_itr], #1 */
+  nds32_emit_post_inc_load_store (tmp, src_itr, QImode, true);
+
+  /* sbi.bi  $tmp, [$dst_itr], #1 */
+  nds32_emit_post_inc_load_store (tmp, dst_itr, QImode, false);
+  /* ! Not reached the upper bound.  Loop.
+     bne     $dst_itr, $dst_end, .Lbyte_mode_loop */
+  emit_cmp_and_jump_insns (dst_itr, dst_end, NE, NULL,
+			   SImode, 1, byte_mode_loop);
+
+  /* .Lend_label: */
+  emit_label (end_label);
+
+  return true;
+}
+
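+/* Like the unknown-size expander, but specialized for a compile-time
+   constant SIZE: sizes below 8 use only the byte loop, exact multiples of 8
+   use only the double-word loop, and everything else falls back to the
+   unknown-size loop.  */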
+static bool
+nds32_expand_movmemsi_loop_known_size (rtx dstmem, rtx srcmem,
+				       rtx size, rtx alignment)
+{
+  rtx dst_base_reg, src_base_reg;
+  rtx dst_itr, src_itr;
+  rtx dstmem_m, srcmem_m, dst_itr_m, src_itr_m;
+  rtx dst_end;
+  rtx double_word_mode_loop, byte_mode_loop;
+  rtx tmp;
+  int start_regno;
+  bool align_to_4_bytes = (INTVAL (alignment) & 3) == 0;
+  unsigned HOST_WIDE_INT total_bytes = UINTVAL (size);
+
+  if (TARGET_ISA_V3M && !align_to_4_bytes)
+    return 0;
+
+  if (TARGET_REDUCED_REGS)
+    start_regno = 2;
+  else
+    start_regno = 16;
+
+  dst_itr = gen_reg_rtx (Pmode);
+  src_itr = gen_reg_rtx (Pmode);
+  dst_end = gen_reg_rtx (Pmode);
+  tmp = gen_reg_rtx (QImode);
+
+  double_word_mode_loop = gen_label_rtx ();
+  byte_mode_loop = gen_label_rtx ();
+
+  dst_base_reg = copy_to_mode_reg (Pmode, XEXP (dstmem, 0));
+  src_base_reg = copy_to_mode_reg (Pmode, XEXP (srcmem, 0));
+
+  if (total_bytes < 8)
+    {
+      /* Emit loop version of movmem for total_bytes less than 8.
+	add     $dst_end, $dst, $size
+	move    $dst_itr, $dst
+	.Lbyte_mode_loop:
+	lbi.bi  $tmp, [$src_itr], #1
+	sbi.bi  $tmp, [$dst_itr], #1
+	! Not reached the upper bound.  Loop.
+	bne     $dst_itr, $dst_end, .Lbyte_mode_loop */
+
+      /* add     $dst_end, $dst, $size */
+      dst_end = expand_binop (Pmode, add_optab, dst_base_reg, size,
+			      NULL_RTX, 0, OPTAB_WIDEN);
+      /* move    $dst_itr, $dst
+	 move    $src_itr, $src */
+      emit_move_insn (dst_itr, dst_base_reg);
+      emit_move_insn (src_itr, src_base_reg);
+
+      /* .Lbyte_mode_loop: */
+      emit_label (byte_mode_loop);
+
+      /* lbi.bi  $tmp, [$src_itr], #1 */
+      nds32_emit_post_inc_load_store (tmp, src_itr, QImode, true);
+
+      /* sbi.bi  $tmp, [$dst_itr], #1 */
+      nds32_emit_post_inc_load_store (tmp, dst_itr, QImode, false);
+      /* ! Not reached the upper bound.  Loop.
+	 bne     $dst_itr, $dst_end, .Lbyte_mode_loop */
+      emit_cmp_and_jump_insns (dst_itr, dst_end, NE, NULL,
+			       SImode, 1, byte_mode_loop);
+      return true;
+    }
+  else if (total_bytes % 8 == 0)
+    {
+      /* Emit loop version of movmem for total_bytes that is a multiple of 8.
+
+	 add     $dst_end, $dst, $size
+	 move    $dst_itr, $dst
+	 move    $src_itr, $src
+
+	.Ldouble_word_mode_loop:
+	lmw.bim $tmp-begin, [$src_itr], $tmp-end, #0 ! $src_itr' = $src_itr
+	smw.bim $tmp-begin, [$dst_itr], $tmp-end, #0 ! $dst_itr' = $dst_itr
+	! move will be deleted after register allocation
+	move    $src_itr, $src_itr'
+	move    $dst_itr, $dst_itr'
+	! Not reached the upper bound.  Loop.
+	bne     $dst_end, $dst_itr, .Ldouble_word_mode_loop */
+
+      /* add     $dst_end, $dst, $size */
+      dst_end = expand_binop (Pmode, add_optab, dst_base_reg, size,
+			      NULL_RTX, 0, OPTAB_WIDEN);
+
+      /* move    $dst_itr, $dst
+	 move    $src_itr, $src */
+      emit_move_insn (dst_itr, dst_base_reg);
+      emit_move_insn (src_itr, src_base_reg);
+
+      /* .Ldouble_word_mode_loop: */
+      emit_label (double_word_mode_loop);
+      /* lmw.bim $tmp-begin, [$src_itr], $tmp-end, #0 ! $src_itr' = $src_itr
+	 smw.bim $tmp-begin, [$dst_itr], $tmp-end, #0 ! $dst_itr' = $dst_itr */
+      src_itr_m = src_itr;
+      dst_itr_m = dst_itr;
+      srcmem_m = srcmem;
+      dstmem_m = dstmem;
+      nds32_emit_mem_move_block (start_regno, 2,
+				 &dst_itr_m, &dstmem_m,
+				 &src_itr_m, &srcmem_m,
+				 true);
+      /* move    $src_itr, $src_itr'
+	 move    $dst_itr, $dst_itr' */
+      emit_move_insn (dst_itr, dst_itr_m);
+      emit_move_insn (src_itr, src_itr_m);
+
+      /* ! Not reached the upper bound.  Loop.
+	 bne     $dst_end, $dst_itr, .Ldouble_word_mode_loop */
+      emit_cmp_and_jump_insns (dst_end, dst_itr, NE, NULL,
+			       Pmode, 1, double_word_mode_loop);
+    }
+  else
+    {
+      /* Handle size greater than 8, and not a multiple of 8.  */
+      return nds32_expand_movmemsi_loop_unknown_size (dstmem, srcmem,
+						      size, alignment);
+    }
+
+  return true;
+}
+
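+/* Dispatch to the known-size or unknown-size loop expander depending on
+   whether SIZE is a compile-time constant.  */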
+static bool
+nds32_expand_movmemsi_loop (rtx dstmem, rtx srcmem,
+			    rtx size, rtx alignment)
+{
+  if (CONST_INT_P (size))
+    return nds32_expand_movmemsi_loop_known_size (dstmem, srcmem,
+						  size, alignment);
+  else
+    return nds32_expand_movmemsi_loop_unknown_size (dstmem, srcmem,
+						    size, alignment);
+}
+
+static bool
+nds32_expand_movmemsi_unroll (rtx dstmem, rtx srcmem,
+			      rtx total_bytes, rtx alignment)
+{
+  rtx dst_base_reg, src_base_reg;
+  rtx tmp_reg;
+  int maximum_bytes;
+  int maximum_bytes_per_inst;
+  int maximum_regs;
+  int start_regno;
+  int i, inst_num;
+  HOST_WIDE_INT remain_bytes, remain_words;
+  bool align_to_4_bytes = (INTVAL (alignment) & 3) == 0;
+  bool align_to_2_bytes = (INTVAL (alignment) & 1) == 0;
+
+  /* Because the reduced register set has only a few registers
+     (r0~r5, r6~r10, r15, r28~r31, where 'r15' and 'r28~r31'
+      cannot be used for register allocation),
+     using 8 registers (32 bytes) for moving a memory block
+     may easily consume all of them.
+     That makes register allocation/spilling hard to work.
+     So we only allow a maximum of 4 registers (16 bytes per
+     instruction) for moving memory blocks with reduced-set registers.  */
+  if (TARGET_REDUCED_REGS)
+    {
+      maximum_regs  = 4;
+      maximum_bytes = 64;
+      start_regno   = 2;
+    }
+  else
+    {
+      /* $r25 is $tp so we use up to 8 registers.  */
+      maximum_regs  = 8;
+      maximum_bytes = 160;
+      start_regno   = 16;
+    }
+  maximum_bytes_per_inst = maximum_regs * UNITS_PER_WORD;
+
+  /* 1. Total_bytes is integer for sure.
+     2. Alignment is integer for sure.
+     3. Maximum 4 or 8 registers and up to 4 or 5 instructions,
+	4 * 4 * 4 = 64 bytes, 8 * 4 * 5 = 160 bytes.
+     4. The dstmem cannot be volatile memory access.
+     5. The srcmem cannot be volatile memory access.
+     6. On v3m the known shared alignment must be a multiple of 4 bytes,
+	since lmw/smw do *NOT* support unaligned access with the v3m
+	configuration.  */
+  if (GET_CODE (total_bytes) != CONST_INT
+      || GET_CODE (alignment) != CONST_INT
+      || INTVAL (total_bytes) > maximum_bytes
+      || MEM_VOLATILE_P (dstmem)
+      || MEM_VOLATILE_P (srcmem)
+      || (TARGET_ISA_V3M && !align_to_4_bytes))
+    return false;
+
+  dst_base_reg = copy_to_mode_reg (SImode, XEXP (dstmem, 0));
+  src_base_reg = copy_to_mode_reg (SImode, XEXP (srcmem, 0));
+  remain_bytes = INTVAL (total_bytes);
+
+  /* Do not update base address for last lmw/smw pair.  */
+  inst_num = ((INTVAL (total_bytes) + (maximum_bytes_per_inst - 1))
+	      / maximum_bytes_per_inst) - 1;
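+  /* This is a ceiling division minus one.  For example, with a hypothetical
+     total_bytes of 50 and maximum_bytes_per_inst of 16 (the reduced-set
+     case), we get (50 + 15) / 16 - 1 = 3 full lmw/smw pairs; the remaining
+     2 bytes are handled by the remain_words/remain_bytes code below.  */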
+
+  for (i = 0; i < inst_num; i++)
+    {
+      nds32_emit_mem_move_block (start_regno, maximum_regs,
+				 &dst_base_reg, &dstmem,
+				 &src_base_reg, &srcmem,
+				 true);
+    }
+  remain_bytes -= maximum_bytes_per_inst * inst_num;
+
+  remain_words = remain_bytes / UNITS_PER_WORD;
+  remain_bytes = remain_bytes - (remain_words * UNITS_PER_WORD);
+
+  if (remain_words != 0)
+    {
+      if (remain_bytes != 0)
+	nds32_emit_mem_move_block (start_regno, remain_words,
+				   &dst_base_reg, &dstmem,
+				   &src_base_reg, &srcmem,
+				   true);
+      else
+	{
+	  /* Do not update the address if there are no further bytes
+	     to move.  */
+	  if (remain_words == 1)
+	   {
+	      /* Emit a move instruction if aligned to 4 bytes and only 1
+		 word to move.  */
+	      if (align_to_4_bytes)
+		nds32_emit_mem_move (srcmem, dstmem, SImode, 0);
+	      else
+		{
+		  tmp_reg = gen_reg_rtx (SImode);
+		  emit_insn (
+		    gen_unaligned_load_w (tmp_reg,
+					  gen_rtx_MEM (SImode, src_base_reg)));
+		  emit_insn (
+		    gen_unaligned_store_w (gen_rtx_MEM (SImode, dst_base_reg),
+					   tmp_reg));
+		}
+	    }
+	  else
+	    nds32_emit_mem_move_block (start_regno, remain_words,
+				       &dst_base_reg, &dstmem,
+				       &src_base_reg, &srcmem,
+				       false);
+	}
+    }
+
+  switch (remain_bytes)
+    {
+    case 3:
+    case 2:
+      {
+	if (align_to_2_bytes)
+	  nds32_emit_mem_move (srcmem, dstmem, HImode, 0);
+	else
+	  {
+	    nds32_emit_mem_move (srcmem, dstmem, QImode, 0);
+	    nds32_emit_mem_move (srcmem, dstmem, QImode, 1);
+	  }
+
+	if (remain_bytes == 3)
+	  nds32_emit_mem_move (srcmem, dstmem, QImode, 2);
+	break;
+      }
+    case 1:
+      nds32_emit_mem_move (srcmem, dstmem, QImode, 0);
+      break;
+    case 0:
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Successfully create patterns, return true.  */
+  return true;
+}
+
+/* Function to move block memory content by
+   using load_multiple and store_multiple.
+   This is an auxiliary extern function to help create the rtx template.
+   Check nds32-multiple.md file for the patterns.  */
+bool
+nds32_expand_movmemsi (rtx dstmem, rtx srcmem, rtx total_bytes, rtx alignment)
+{
+  if (nds32_expand_movmemsi_unroll (dstmem, srcmem, total_bytes, alignment))
+    return true;
+
+  if (!optimize_size && optimize > 2)
+    return nds32_expand_movmemsi_loop (dstmem, srcmem, total_bytes, alignment);
+
+  return false;
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* Auxiliary function for expand setmem pattern.  */
+
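+/* Broadcast the least significant byte of VALUE into all four bytes of the
+   SImode register VALUE4WORD, e.g. 0xab becomes 0xabababab.  */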
+static rtx
+nds32_gen_dup_4_byte_to_word_value_aux (rtx value, rtx value4word)
+{
+  gcc_assert (GET_MODE (value) == QImode || CONST_INT_P (value));
+
+  if (CONST_INT_P (value))
+    {
+      unsigned HOST_WIDE_INT val = UINTVAL (value) & GET_MODE_MASK(QImode);
+      rtx new_val = gen_int_mode (val | (val << 8)
+				  | (val << 16) | (val << 24), SImode);
+      /* Just calculate it here if it's a constant value.  */
+      emit_move_insn (value4word, new_val);
+    }
+  else
+    {
+      if (NDS32_EXT_DSP_P ())
+	{
+	  /* ! prepare word
+	     insb    $tmp, $value, 1         ! $tmp  <- 0x0000abab
+	     pkbb16  $value4word, $tmp, $tmp  ! $value4word  <- 0xabababab */
+	  rtx tmp = gen_reg_rtx (SImode);
+
+	  convert_move (tmp, value, true);
+
+	  emit_insn (
+	    gen_insvsi_internal (tmp, gen_int_mode (0x8, SImode), tmp));
+
+	  emit_insn (gen_pkbbsi_1 (value4word, tmp, tmp));
+	}
+      else
+	{
+	  /* ! prepare word
+	     andi    $tmp1, $value, 0xff       ! $tmp1  <- 0x000000ab
+	     slli    $tmp2, $tmp1, 8           ! $tmp2  <- 0x0000ab00
+	     or      $tmp3, $tmp1, $tmp2       ! $tmp3  <- 0x0000abab
+	     slli    $tmp4, $tmp3, 16          ! $tmp4  <- 0xabab0000
+	     or      $val4word, $tmp3, $tmp4   ! $value4word  <- 0xabababab  */
+
+	  rtx tmp1, tmp2, tmp3, tmp4;
+	  tmp1 = expand_binop (SImode, and_optab, value,
+			       gen_int_mode (0xff, SImode),
+			       NULL_RTX, 0, OPTAB_WIDEN);
+	  tmp2 = expand_binop (SImode, ashl_optab, tmp1,
+			       gen_int_mode (8, SImode),
+			       NULL_RTX, 0, OPTAB_WIDEN);
+	  tmp3 = expand_binop (SImode, ior_optab, tmp1, tmp2,
+			       NULL_RTX, 0, OPTAB_WIDEN);
+	  tmp4 = expand_binop (SImode, ashl_optab, tmp3,
+			       gen_int_mode (16, SImode),
+			       NULL_RTX, 0, OPTAB_WIDEN);
+
+	  emit_insn (gen_iorsi3 (value4word, tmp3, tmp4));
+	}
+    }
+
+  return value4word;
+}
+
+static rtx
+nds32_gen_dup_4_byte_to_word_value (rtx value)
+{
+  rtx value4word = gen_reg_rtx (SImode);
+  nds32_gen_dup_4_byte_to_word_value_aux (value, value4word);
+
+  return value4word;
+}
+
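+/* Like nds32_gen_dup_4_byte_to_word_value, but return a DImode register with
+   the byte replicated into all eight bytes.  */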
+static rtx
+nds32_gen_dup_8_byte_to_double_word_value (rtx value)
+{
+  rtx value4doubleword = gen_reg_rtx (DImode);
+
+  nds32_gen_dup_4_byte_to_word_value_aux (
+    value, nds32_di_low_part_subreg(value4doubleword));
+
+  emit_move_insn (nds32_di_high_part_subreg(value4doubleword),
+		  nds32_di_low_part_subreg(value4doubleword));
+  return value4doubleword;
+}
+
+
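+/* Emit a loop that stores the DImode register VALUE to [ITR] in 8-byte
+   steps, advancing ITR, for the portion of SIZE that is a multiple of 8.
+   Return an SImode register holding the leftover byte count (SIZE & 7) so
+   the caller can finish with a byte loop.  */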
+static rtx
+emit_setmem_doubleword_loop (rtx itr, rtx size, rtx value)
+{
+  rtx word_mode_label = gen_label_rtx ();
+  rtx word_mode_end_label = gen_label_rtx ();
+  rtx byte_mode_size = gen_reg_rtx (SImode);
+  rtx byte_mode_size_tmp = gen_reg_rtx (SImode);
+  rtx word_mode_end = gen_reg_rtx (SImode);
+  rtx size_for_word = gen_reg_rtx (SImode);
+
+  /* and     $size_for_word, $size, #~0x7  */
+  size_for_word = expand_binop (SImode, and_optab, size,
+				gen_int_mode (~0x7, SImode),
+				NULL_RTX, 0, OPTAB_WIDEN);
+
+  emit_move_insn (byte_mode_size, size);
+
+  /* beqz    $size_for_word, .Lword_mode_end  */
+  emit_cmp_and_jump_insns (size_for_word, const0_rtx, EQ, NULL,
+			   SImode, 1, word_mode_end_label);
+  /* add     $word_mode_end, $dst, $size_for_word  */
+  word_mode_end = expand_binop (Pmode, add_optab, itr, size_for_word,
+				NULL_RTX, 0, OPTAB_WIDEN);
+
+  /* andi    $byte_mode_size, $size, 0x7  */
+  byte_mode_size_tmp = expand_binop (SImode, and_optab, size, GEN_INT (0x7),
+				     NULL_RTX, 0, OPTAB_WIDEN);
+
+  emit_move_insn (byte_mode_size, byte_mode_size_tmp);
+
+  /* .Lword_mode:  */
+  emit_label (word_mode_label);
+  /*   ! word-mode set loop
+       smw.bim $value4word, [$dst_itr], $value4word, 0
+       bne     $word_mode_end, $dst_itr, .Lword_mode  */
+  emit_insn (gen_unaligned_store_update_base_dw (itr,
+						 itr,
+						 value));
+  emit_cmp_and_jump_insns (word_mode_end, itr, NE, NULL,
+			   Pmode, 1, word_mode_label);
+
+  emit_label (word_mode_end_label);
+
+  return byte_mode_size;
+}
+
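+/* Emit a loop that stores the QImode VALUE to [ITR] SIZE times, one byte at
+   a time, advancing ITR.  If NEED_END, return the final address ITR + SIZE;
+   otherwise return NULL_RTX.  */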
+static rtx
+emit_setmem_byte_loop (rtx itr, rtx size, rtx value, bool need_end)
+{
+  rtx end  = gen_reg_rtx (Pmode);
+  rtx byte_mode_label = gen_label_rtx ();
+  rtx end_label = gen_label_rtx ();
+
+  value = force_reg (QImode, value);
+
+  if (need_end)
+    end = expand_binop (Pmode, add_optab, itr, size,
+			NULL_RTX, 0, OPTAB_WIDEN);
+  /*   beqz    $byte_mode_size, .Lend
+       add     $byte_mode_end, $dst_itr, $byte_mode_size  */
+  emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL,
+			   SImode, 1, end_label);
+
+  if (!need_end)
+    end = expand_binop (Pmode, add_optab, itr, size,
+			NULL_RTX, 0, OPTAB_WIDEN);
+
+  /* .Lbyte_mode:  */
+  emit_label (byte_mode_label);
+
+  /*   ! byte-mode set loop
+       sbi.bi  $value, [$dst_itr], 1
+       bne     $byte_mode_end, $dst_itr, .Lbyte_mode */
+  nds32_emit_post_inc_load_store (value, itr, QImode, false);
+
+  emit_cmp_and_jump_insns (end, itr, NE, NULL,
+			   Pmode, 1, byte_mode_label);
+  /* .Lend: */
+  emit_label (end_label);
+
+  if (need_end)
+    return end;
+  else
+    return NULL_RTX;
+}
+
+static bool
+nds32_expand_setmem_loop (rtx dstmem, rtx size, rtx value)
+{
+  rtx value4doubleword;
+  rtx value4byte;
+  rtx dst;
+  rtx byte_mode_size;
+
+  /* Emit loop version of setmem.
+     memset:
+       ! prepare word
+       andi    $tmp1, $val, 0xff               ! $tmp1  <- 0x000000ab
+       slli    $tmp2, $tmp1, 8                 ! $tmp2  <- 0x0000ab00
+       or      $tmp3, $val, $tmp2              ! $tmp3  <- 0x0000abab
+       slli    $tmp4, $tmp3, 16                ! $tmp4  <- 0xabab0000
+       or      $val4word, $tmp3, $tmp4         ! $value4word  <- 0xabababab
+
+       and     $size_for_word, $size, #-4
+       beqz    $size_for_word, .Lword_mode_end
+
+       add     $word_mode_end, $dst, $size_for_word
+       andi    $byte_mode_size, $size, 3
+
+     .Lword_mode:
+       ! word-mode set loop
+       smw.bim $value4word, [$dst], $value4word, 0
+       bne     $word_mode_end, $dst, .Lword_mode
+
+     .Lword_mode_end:
+       beqz    $byte_mode_size, .Lend
+       add     $byte_mode_end, $dst, $byte_mode_size
+
+     .Lbyte_mode:
+       ! byte-mode set loop
+       sbi.bi  $value4word, [$dst], 1
+       bne     $byte_mode_end, $dst, .Lbyte_mode
+     .Lend: */
+
+  dst = copy_to_mode_reg (SImode, XEXP (dstmem, 0));
+
+  /* ! prepare word
+     andi    $tmp1, $value, 0xff             ! $tmp1  <- 0x000000ab
+     slli    $tmp2, $tmp1, 8                 ! $tmp2  <- 0x0000ab00
+     or      $tmp3, $tmp1, $tmp2             ! $tmp3  <- 0x0000abab
+     slli    $tmp4, $tmp3, 16                ! $tmp4  <- 0xabab0000
+     or      $val4word, $tmp3, $tmp4         ! $value4word  <- 0xabababab  */
+  value4doubleword = nds32_gen_dup_8_byte_to_double_word_value (value);
+
+  /*   and     $size_for_word, $size, #-4
+       beqz    $size_for_word, .Lword_mode_end
+
+       add     $word_mode_end, $dst, $size_for_word
+       andi    $byte_mode_size, $size, 3
+
+     .Lword_mode:
+       ! word-mode set loop
+       smw.bim $value4word, [$dst], $value4word, 0
+       bne     $word_mode_end, $dst, .Lword_mode
+     .Lword_mode_end:  */
+  byte_mode_size = emit_setmem_doubleword_loop (dst, size, value4doubleword);
+
+  /*   beqz    $byte_mode_size, .Lend
+       add     $byte_mode_end, $dst, $byte_mode_size
+
+     .Lbyte_mode:
+       ! byte-mode set loop
+       sbi.bi  $value, [$dst], 1
+       bne     $byte_mode_end, $dst, .Lbyte_mode
+     .Lend: */
+
+  value4byte = simplify_gen_subreg (QImode, value4doubleword, DImode,
+				    subreg_lowpart_offset (QImode, DImode));
+
+  emit_setmem_byte_loop (dst, byte_mode_size, value4byte, false);
+
+  return true;
+}
+
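+/* Loop version of setmem for v3m, which has no unaligned access: for sizes
+   larger than 16 bytes, first run a byte loop so the destination becomes
+   4-byte aligned, then the double-word loop, and finally a byte loop for
+   the tail; sizes of 16 bytes or less use the byte loop only.  */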
+static bool
+nds32_expand_setmem_loop_v3m (rtx dstmem, rtx size, rtx value)
+{
+  rtx base_reg = copy_to_mode_reg (Pmode, XEXP (dstmem, 0));
+  rtx need_align_bytes = gen_reg_rtx (SImode);
+  rtx last_2_bit = gen_reg_rtx (SImode);
+  rtx byte_loop_base = gen_reg_rtx (SImode);
+  rtx byte_loop_size = gen_reg_rtx (SImode);
+  rtx remain_size = gen_reg_rtx (SImode);
+  rtx new_base_reg;
+  rtx value4byte, value4doubleword;
+  rtx byte_mode_size;
+  rtx last_byte_loop_label = gen_label_rtx ();
+
+  size = force_reg (SImode, size);
+
+  value4doubleword = nds32_gen_dup_8_byte_to_double_word_value (value);
+  value4byte = simplify_gen_subreg (QImode, value4doubleword, DImode,
+				    subreg_lowpart_offset (QImode, DImode));
+
+  emit_move_insn (byte_loop_size, size);
+  emit_move_insn (byte_loop_base, base_reg);
+
+  /* Jump to the last byte loop if size is 16 or less.  */
+  emit_cmp_and_jump_insns (size, gen_int_mode (16, SImode), LE, NULL,
+			   SImode, 1, last_byte_loop_label);
+
+  /* Make sure we align to 4 bytes first, since v3m can't do unaligned
+     access.  */
+  emit_insn (gen_andsi3 (last_2_bit,
+			 base_reg,
+			 gen_int_mode (0x3, SImode)));
+
+  emit_insn (gen_subsi3 (need_align_bytes,
+			 gen_int_mode (4, SImode),
+			 last_2_bit));
+
+  /* Align to 4 bytes.  */
+  new_base_reg = emit_setmem_byte_loop (base_reg,
+					need_align_bytes,
+					value4byte,
+					true);
+
+  /* Calculate the remaining size.  */
+  emit_insn (gen_subsi3 (remain_size, size, need_align_bytes));
+
+  /* Set memory double-word by double-word.  */
+  byte_mode_size = emit_setmem_doubleword_loop (new_base_reg,
+						remain_size,
+						value4doubleword);
+
+  emit_move_insn (byte_loop_base, new_base_reg);
+  emit_move_insn (byte_loop_size, byte_mode_size);
+
+  emit_label (last_byte_loop_label);
+
+  /* And set memory for the remaining bytes.  */
+  emit_setmem_byte_loop (byte_loop_base, byte_loop_size, value4byte, false);
+  return true;
+}
+
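+/* Unrolled version of setmem for a small constant SIZE: broadcast VALUE into
+   up to 4 (reduced register set) or 8 consecutive registers, pairing them
+   with double-word moves where possible, emit store-multiple instructions
+   for the bulk, and store any remaining tail bytes one at a time.  */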
+static bool
+nds32_expand_setmem_unroll (rtx dstmem, rtx size, rtx value,
+			    rtx align ATTRIBUTE_UNUSED,
+			    rtx expected_align ATTRIBUTE_UNUSED,
+			    rtx expected_size ATTRIBUTE_UNUSED)
+{
+  unsigned maximum_regs, maximum_bytes, start_regno, regno;
+  rtx value4word;
+  rtx dst_base_reg, new_base_reg;
+  unsigned HOST_WIDE_INT remain_bytes, remain_words, prepare_regs, fill_per_smw;
+  unsigned HOST_WIDE_INT real_size;
+
+  if (TARGET_REDUCED_REGS)
+    {
+      maximum_regs  = 4;
+      maximum_bytes = 64;
+      start_regno   = 2;
+    }
+  else
+    {
+      maximum_regs  = 8;
+      maximum_bytes = 128;
+      start_regno   = 16;
+    }
+
+  real_size = UINTVAL (size) & GET_MODE_MASK(SImode);
+
+  if (!(CONST_INT_P (size) && real_size <= maximum_bytes))
+    return false;
+
+  remain_bytes = real_size;
+
+  gcc_assert (GET_MODE (value) == QImode || CONST_INT_P (value));
+
+  value4word = nds32_gen_dup_4_byte_to_word_value (value);
+
+  prepare_regs = remain_bytes / UNITS_PER_WORD;
+
+  dst_base_reg = copy_to_mode_reg (SImode, XEXP (dstmem, 0));
+
+  if (prepare_regs > maximum_regs)
+    prepare_regs = maximum_regs;
+
+  fill_per_smw = prepare_regs * UNITS_PER_WORD;
+
+  regno = start_regno;
+  switch (prepare_regs)
+    {
+    case 2:
+    default:
+      {
+	rtx reg0 = gen_rtx_REG (SImode, regno);
+	rtx reg1 = gen_rtx_REG (SImode, regno+1);
+	unsigned last_regno = start_regno + prepare_regs - 1;
+
+	emit_move_insn (reg0, value4word);
+	emit_move_insn (reg1, value4word);
+	rtx regd = gen_rtx_REG (DImode, regno);
+	regno += 2;
+
+	/* Try to utilize movd44!  */
+	while (regno <= last_regno)
+	  {
+	    if ((regno + 1) <= last_regno)
+	      {
+		rtx reg = gen_rtx_REG (DImode, regno);
+		emit_move_insn (reg, regd);
+		regno += 2;
+	      }
+	    else
+	      {
+		rtx reg = gen_rtx_REG (SImode, regno);
+		emit_move_insn (reg, reg0);
+		regno += 1;
+	      }
+	  }
+	break;
+      }
+    case 1:
+      {
+	rtx reg = gen_rtx_REG (SImode, regno++);
+	emit_move_insn (reg, value4word);
+      }
+      break;
+    case 0:
+      break;
+    }
+
+  if (fill_per_smw)
+    for (; remain_bytes >= fill_per_smw; remain_bytes -= fill_per_smw)
+      {
+	emit_insn (nds32_expand_store_multiple (start_regno, prepare_regs,
+						dst_base_reg, dstmem,
+						true, &new_base_reg));
+	dst_base_reg = new_base_reg;
+	dstmem = gen_rtx_MEM (SImode, dst_base_reg);
+      }
+
+  remain_words = remain_bytes / UNITS_PER_WORD;
+
+  if (remain_words)
+    {
+      emit_insn (nds32_expand_store_multiple (start_regno, remain_words,
+					      dst_base_reg, dstmem,
+					      true, &new_base_reg));
+      dst_base_reg = new_base_reg;
+      dstmem = gen_rtx_MEM (SImode, dst_base_reg);
+    }
+
+  remain_bytes = remain_bytes - (remain_words * UNITS_PER_WORD);
+
+  if (remain_bytes)
+    {
+      value = simplify_gen_subreg (QImode, value4word, SImode,
+				   subreg_lowpart_offset(QImode, SImode));
+      int offset = 0;
+      for (; remain_bytes; --remain_bytes, ++offset)
+	{
+	  nds32_emit_load_store (value, dstmem, QImode, offset, false);
+	}
+    }
+
+  return true;
+}
+
+bool
+nds32_expand_setmem (rtx dstmem, rtx size, rtx value, rtx align,
+		     rtx expected_align,
+		     rtx expected_size)
+{
+  bool align_to_4_bytes = (INTVAL (align) & 3) == 0;
+
+  /* Only expand at -O3 or above, and not when optimizing for size.  */
+  if (optimize_size || optimize < 3)
+    return false;
+
+  if (TARGET_ISA_V3M && !align_to_4_bytes)
+    return nds32_expand_setmem_loop_v3m (dstmem, size, value);
+
+  if (nds32_expand_setmem_unroll (dstmem, size, value,
+				  align, expected_align, expected_size))
+    return true;
+
+  return nds32_expand_setmem_loop (dstmem, size, value);
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* Auxiliary function for expand strlen pattern.  */
+
+bool
+nds32_expand_strlen (rtx result, rtx str,
+		     rtx target_char, rtx align ATTRIBUTE_UNUSED)
+{
+  rtx base_reg, backup_base_reg;
+  rtx ffb_result;
+  rtx target_char_ptr, length;
+  rtx loop_label, tmp;
+
+  if (optimize_size || optimize < 3)
+    return false;
+
+  gcc_assert (MEM_P (str));
+  gcc_assert (CONST_INT_P (target_char) || REG_P (target_char));
+
+  base_reg = copy_to_mode_reg (SImode, XEXP (str, 0));
+  loop_label = gen_label_rtx ();
+
+  ffb_result = gen_reg_rtx (Pmode);
+  tmp = gen_reg_rtx (SImode);
+  backup_base_reg = gen_reg_rtx (SImode);
+
+  /* Emit loop version of strlen.
+       move  $backup_base, $base
+     .Lloop:
+       lmw.bim $tmp, [$base], $tmp, 0
+       ffb   $ffb_result, $tmp, $target_char   ! is there $target_char?
+       beqz  $ffb_result, .Lloop
+       add   $target_char_ptr, $base, $ffb_result
+       sub   $length, $target_char_ptr, $backup_base  */
+
+  /* move  $backup_base, $base  */
+  emit_move_insn (backup_base_reg, base_reg);
+
+  /* .Lloop:  */
+  emit_label (loop_label);
+  /* lmw.bim $tmp, [$base], $tmp, 0  */
+  emit_insn (gen_unaligned_load_update_base_w (base_reg, tmp, base_reg));
+
+  /*  ffb   $ffb_result, $tmp, $target_char   ! is there $target_char?  */
+  emit_insn (gen_unspec_ffb (ffb_result, tmp, target_char));
+
+  /* beqz  $ffb_result, .Lloop  */
+  emit_cmp_and_jump_insns (ffb_result, const0_rtx, EQ, NULL,
+			   SImode, 1, loop_label);
+
+  /* add   $target_char_ptr, $base, $ffb_result   */
+  target_char_ptr = expand_binop (Pmode, add_optab, base_reg,
+				ffb_result, NULL_RTX, 0, OPTAB_WIDEN);
+
+  /* sub   $length, $target_char_ptr, $backup_base  */
+  length = expand_binop (Pmode, sub_optab, target_char_ptr,
+			 backup_base_reg, NULL_RTX, 0, OPTAB_WIDEN);
+
+  emit_move_insn (result, length);
+
+  return true;
+}
 
 /* ------------------------------------------------------------------------ */
 
@@ -38,16 +1069,50 @@
    Check nds32-multiple.md file for the patterns.  */
 rtx
 nds32_expand_load_multiple (int base_regno, int count,
-			    rtx base_addr, rtx basemem)
+			    rtx base_addr, rtx basemem,
+			    bool update_base_reg_p,
+			    rtx *update_base_reg)
 {
   int par_index;
   int offset;
+  int start_idx;
   rtx result;
   rtx new_addr, mem, reg;
 
-  /* Create the pattern that is presented in nds32-multiple.md.  */
+  /* Generate an unaligned load to prevent the load instruction from being
+     pulled out of the parallel; otherwise it would become a plain lwi and
+     the unaligned access would be lost.  */
+  if (count == 1)
+    {
+      reg = gen_rtx_REG (SImode, base_regno);
+      if (update_base_reg_p)
+	{
+	  *update_base_reg = gen_reg_rtx (SImode);
+	  return gen_unaligned_load_update_base_w (*update_base_reg, reg, base_addr);
+	}
+      else
+	return gen_unaligned_load_w (reg, gen_rtx_MEM (SImode, base_addr));
+    }
 
-  result = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (count));
+  /* Create the pattern that is presented in nds32-multiple.md.  */
+  if (update_base_reg_p)
+    {
+      result = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (count + 1));
+      start_idx = 1;
+    }
+  else
+    {
+      result = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (count));
+      start_idx = 0;
+    }
+
+  if (update_base_reg_p)
+    {
+      offset           = count * 4;
+      new_addr         = plus_constant (Pmode, base_addr, offset);
+      *update_base_reg = gen_reg_rtx (SImode);
+
+      XVECEXP (result, 0, 0) = gen_rtx_SET (*update_base_reg, new_addr);
+    }
 
   for (par_index = 0; par_index < count; par_index++)
     {
@@ -58,7 +1123,7 @@
 					       new_addr, offset);
       reg      = gen_rtx_REG (SImode, base_regno + par_index);
 
-      XVECEXP (result, 0, par_index) = gen_rtx_SET (reg, mem);
+      XVECEXP (result, 0, (par_index + start_idx)) = gen_rtx_SET (reg, mem);
     }
 
   return result;
@@ -66,16 +1131,49 @@
 
 rtx
 nds32_expand_store_multiple (int base_regno, int count,
-			     rtx base_addr, rtx basemem)
+			     rtx base_addr, rtx basemem,
+			     bool update_base_reg_p,
+			     rtx *update_base_reg)
 {
   int par_index;
   int offset;
+  int start_idx;
   rtx result;
   rtx new_addr, mem, reg;
 
+  if (count == 1)
+    {
+      reg = gen_rtx_REG (SImode, base_regno);
+      if (update_base_reg_p)
+	{
+	  *update_base_reg = gen_reg_rtx (SImode);
+	  return gen_unaligned_store_update_base_w (*update_base_reg, base_addr, reg);
+	}
+      else
+	return gen_unaligned_store_w (gen_rtx_MEM (SImode, base_addr), reg);
+    }
+
   /* Create the pattern that is presented in nds32-multiple.md.  */
 
-  result = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (count));
+  if (update_base_reg_p)
+    {
+      result = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (count + 1));
+      start_idx = 1;
+    }
+  else
+    {
+      result = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (count));
+      start_idx = 0;
+    }
+
+  if (update_base_reg_p)
+    {
+      offset           = count * 4;
+      new_addr         = plus_constant (Pmode, base_addr, offset);
+      *update_base_reg = gen_reg_rtx (SImode);
+
+      XVECEXP (result, 0, 0) = gen_rtx_SET (*update_base_reg, new_addr);
+    }
 
   for (par_index = 0; par_index < count; par_index++)
     {
@@ -86,58 +1184,10 @@
 					       new_addr, offset);
       reg      = gen_rtx_REG (SImode, base_regno + par_index);
 
-      XVECEXP (result, 0, par_index) = gen_rtx_SET (mem, reg);
+      XVECEXP (result, 0, par_index + start_idx) = gen_rtx_SET (mem, reg);
     }
 
   return result;
 }
 
-/* Function to move block memory content by
-   using load_multiple and store_multiple.
-   This is auxiliary extern function to help create rtx template.
-   Check nds32-multiple.md file for the patterns.  */
-int
-nds32_expand_movmemqi (rtx dstmem, rtx srcmem, rtx total_bytes, rtx alignment)
-{
-  HOST_WIDE_INT in_words, out_words;
-  rtx dst_base_reg, src_base_reg;
-  int maximum_bytes;
-
-  /* Because reduced-set regsiters has few registers
-     (r0~r5, r6~10, r15, r28~r31, where 'r15' and 'r28~r31'
-      cannot be used for register allocation),
-     using 8 registers (32 bytes) for moving memory block
-     may easily consume all of them.
-     It makes register allocation/spilling hard to work.
-     So we only allow maximum=4 registers (16 bytes) for
-     moving memory block under reduced-set registers.  */
-  if (TARGET_REDUCED_REGS)
-    maximum_bytes = 16;
-  else
-    maximum_bytes = 32;
-
-  /* 1. Total_bytes is integer for sure.
-     2. Alignment is integer for sure.
-     3. Maximum 4 or 8 registers, 4 * 4 = 16 bytes, 8 * 4 = 32 bytes.
-     4. Requires (n * 4) block size.
-     5. Requires 4-byte alignment.  */
-  if (GET_CODE (total_bytes) != CONST_INT
-      || GET_CODE (alignment) != CONST_INT
-      || INTVAL (total_bytes) > maximum_bytes
-      || INTVAL (total_bytes) & 3
-      || INTVAL (alignment) & 3)
-    return 0;
-
-  dst_base_reg = copy_to_mode_reg (SImode, XEXP (dstmem, 0));
-  src_base_reg = copy_to_mode_reg (SImode, XEXP (srcmem, 0));
-
-  out_words = in_words = INTVAL (total_bytes) / UNITS_PER_WORD;
-
-  emit_insn (nds32_expand_load_multiple (0, in_words, src_base_reg, srcmem));
-  emit_insn (nds32_expand_store_multiple (0, out_words, dst_base_reg, dstmem));
-
-  /* Successfully create patterns, return 1.  */
-  return 1;
-}
-
 /* ------------------------------------------------------------------------ */