CbC/CbC_gcc: gcc/config/score/mul-div.S comparison

comparison gcc/config/score/mul-div.S @ 0:a06113de4d67

first commit

author	kent <kent@cr.ie.u-ryukyu.ac.jp>
date	Fri, 17 Jul 2009 14:47:48 +0900
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:a06113de4d67
+/* Copyright (C) 2005, 2007 Free Software Foundation, Inc.
+Contributed by Sunnorth
+This file is part of GCC.
+GCC is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published
+by the Free Software Foundation; either version 3, or (at your
+option) any later version.
+GCC is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+License for more details.
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+#define ra r3    /* return address: routines here leave through br ra or a saved copy of it */
+#define a0 r4    /* first operand; results are also moved into r4 before returning */
+#define a1 r5    /* second operand; __udivsi3 leaves the remainder here */
+#define a2 r6    /* not referenced by any instruction visible in this file */
+#define a3 r7    /* not referenced by any instruction visible in this file */
+#define v0 r23   /* named only in the header comments; no visible instruction uses it */
+#define t0 r8    /* scratch: multiplier bit in __mulsi3, quotient bit mask in __udivsi3 */
+#define t1 r9    /* scratch: running sum in __mulsi3 */
+#define t2 r10   /* scratch: sign source in __divsi3 and __modsi3 */
+#define t3 r11   /* scratch: saved return address in the wrapper routines */
+#define t4 r22   /* scratch: quotient accumulator in __udivsi3 */
+#ifndef __pic__
+#if !defined(L_mulsi3) && !defined(L_divsi3)
+.text
+.global _flush_cache
+/* _flush_cache: cache maintenance over the range that starts at the
+   address in r4 and whose length is in r5.  r5 >> 4 is loaded into sr0
+   as the iteration count and the address advances 16 bytes per step.
+   Pass 1 issues cache op 0xe on every step (write back and invalidate
+   the data cache).  The LDM and LIM bits read from cr4 then select two
+   optional cache ops on address 0.  Pass 2 issues cache op 0x2 on every
+   step (invalidate the instruction cache).
+   On __score3__ the routine returns immediately.
+   Clobbers r8, r9, r10, sr0 and the condition flags.
+   r10 is loaded before the LDM test so that the LIM cache op never sees
+   a stale r10 when the LDM branch is skipped.
+   NOTE(review): a length below 16 gives a count of 0 and a length that
+   is not a multiple of 16 is rounded down; confirm how bcnz treats a
+   zero count and whether callers always pass whole 16 byte lines.  */
+#ifdef __score3__
+_flush_cache:
+br      r3
+#else
+_flush_cache:
+srli    r9, r5, 4               # r9 = length / 16 = number of steps
+mv      r8, r4                  # r8 = running address
+mtsr    r9, sr0                 # sr0 = counter consumed by bcnz
+1:
+cache   0xe, [r8, 0]            # write back invalid dcache
+addi    r8, 16
+bcnz    1b
+mfcr    r8, cr4
+ldi     r10, 0                  # address for both optional cache ops, set on every path
+bittst! r8, 0x3                 # if LDM is enable, write back LDM
+beq!    6f
+cache   0xc, [r10, 0]
+6:
+bittst! r8, 0x2                 # if LIM is enable, refill it
+beq!    7f
+cache   0x4, [r10, 0]
+7:
+#nop!
+#nop!
+#nop!
+#nop!
+#nop!
+mv      r8, r4                  # restart at the beginning of the range
+mtsr    r9, sr0                 # same step count as pass 1
+2:
+cache   0x2, [r8, 0]            # invalid unlock icache
+#nop!
+#nop!
+#nop!
+#nop!
+#nop!
+addi    r8, 16
+bcnz    2b
+br      r3
+#endif
+#endif
+/* FUNCTION
+(U) INT32 v0 = __mulsi3 ((U) INT32 a0, (U) INT32 a1);
+(the code below hands the result back in r4, not in the register named v0)
+REGISTERS:
+use     t0, t1
+modify  a0
+a1      -> becomes 0
+NOTE:
+it seems to give better performance to just shift and add.  */
+#ifdef L_mulsi3
+.text
+.global __umulsi3
+.global __mulsi3
+/* signed multiplication (32x32)
+   __mulsi3 and __umulsi3 are two labels on the same code: a shift-and-add
+   loop that consumes the multiplier in a1 one bit at a time, lowest bit
+   first, and stops as soon as a1 has become zero.
+   In:  a0 = multiplicand, a1 = multiplier.
+   Out: r4 = the sum accumulated in t1.
+   Clobbers t0, t1, a1 (ends as 0) and the condition flags.  */
+.ent    __mulsi3
+__umulsi3:
+__mulsi3:
+li      t1, 0                   # t1 = running sum
+__mulsi3_loop:
+andri.c t0, a1, 1               # t0 = multiplier[0]
+srli    a1, a1, 1               # a1 /= 2
+beq     __mulsi3_loop2          # skip if (t0 == 0)
+add     t1, t1, a0              # add multiplicand
+__mulsi3_loop2:
+slli    a0, a0, 1               # multiplicand mul 2
+cmpi.c  a1, 0
+bne     __mulsi3_loop           # repeat while multiplier bits remain
+mv      r4, t1                  # hand the sum back in r4
+br      ra
+.end    __mulsi3
+#endif /* L_mulsi3 */
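+/* FUNCTION
+UINT32 (v0) = __udivsi3 (UINT32 (a0), UINT32 (a1));
+INT32 (v0) = __divsi3 (INT32 (a0),  INT32 (a1));
+UINT32 (v0) = __umodsi3 (UINT32 (a0), UINT32 (a1));
+INT32 (v0) = __modsi3 (INT32 (a0),  INT32 (a1));
+(the code below hands each result back in r4, not in the register named v0)
+DESCRIPTION
+performs 32-bit division/modulo.
+REGISTERS
+used t0      bit-index
+t4           quotient
+t2, t3       sign and saved return address (signed and modulo entry points)
+modify a0    becomes remainder
+a1           shifted during the divide, then receives the remainder  */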
+#ifdef L_divsi3 /* 32-bit divide and modulo family; __orgsi3 below is not exported */
+.text
+.global __udivsi3               # unsigned quotient
+.global __umodsi3               # unsigned remainder
+.global __divsi3                # signed quotient
+.global __modsi3                # signed remainder
+/* unsigned division
+   Shift-and-subtract divide of a0 by a1.
+   In:  a0 = dividend, a1 = divisor.
+   Out: r4 = quotient, a1 = remainder (what is left of a0).
+   A zero divisor goes straight to the exit: quotient 0, remainder = a0.
+   Phase 1 (__uds_normalize) doubles the divisor and the bit mask in t0
+   until the divisor exceeds the dividend or its top bit is set.
+   Phase 2 (__uds_loop2) walks back down, subtracting where the divisor
+   fits and setting the matching quotient bit.
+   Clobbers t0, t4, a0, a1 and the condition flags.
+   NOTE(review): the bcc tests assume carry clear after cmp.c means
+   a0 is below a1 (unsigned); confirm against the ISA manual.  */
+.ent    __udivsi3
+__udivsi3:
+li      t4, 0                   # t4 = quotient, built one bit at a time
+cmpi.c  a1, 0
+beq     __uds_exit              # divisor == 0: leave with quotient 0
+li      t0, 1                   # t0 = quotient bit paired with the current divisor
+blt     __uds_ok                # flags still from cmpi.c: top bit set, nothing to normalize
+__uds_normalize:
+cmp.c   a0, a1
+bcc     __uds_ok                # divisor now exceeds the dividend: stop doubling
+slli    a1, a1, 1               # divisor *= 2
+slli    t0, t0, 1               # quotient bit moves up with it
+cmpi.c  a1, 0
+bge     __uds_normalize         # continue until the divisor top bit is set
+__uds_ok:
+__uds_loop2:
+cmp.c   a0, a1
+bcc     __uds_loop3             # divisor does not fit: leave this quotient bit clear
+sub     a0, a0, a1              # take the divisor out of the running remainder
+or      t4, t4, t0              # and record the quotient bit
+__uds_loop3:
+srli    t0, t0, 1               # next lower quotient bit
+srli    a1, a1, 1               # divisor /= 2
+cmpi.c  t0, 0
+bne     __uds_loop2             # until the bit mask has shifted out
+__uds_exit:
+mv      a1, a0                  # remainder -> a1 (read by the modulo wrappers)
+mv      r4, t4                  # quotient  -> r4
+br      ra
+.end    __udivsi3
+/* unsigned modulus
+   Calls __udivsi3 and returns the remainder that routine leaves in a1.
+   In:  a0 = dividend, a1 = divisor.   Out: r4 = remainder.
+   The return address is parked in t3 across the call, since the callee
+   itself returns through ra.  Clobbers t3 plus whatever __udivsi3 does.  */
+.ent    __umodsi3
+__umodsi3:
+mv      t3, ra                  # keep our own return address
+jl      __udivsi3
+mv      r4, a1                  # remainder becomes the result
+br      t3
+.end    __umodsi3
+/* abs and div
+   Internal helper (no .global for it): replaces a0 and a1 by their
+   negations when they are negative, then continues in __udivsi3, which
+   returns straight to whoever called __orgsi3.  */
+.ent    __orgsi3
+__orgsi3:
+cmpi.c  a0, 0
+bge     __orgsi3_a0p            # dividend already non-negative
+neg     a0, a0
+__orgsi3_a0p:
+cmpi.c  a1, 0
+bge     __udivsi3               # divisor already non-negative: divide as is
+neg     a1, a1
+b       __udivsi3               # goto udivsi3
+.end    __orgsi3
+/* signed division
+   Quotient of a0 / a1: the magnitudes are divided through __orgsi3 and
+   the quotient in r4 is negated when the operand signs differ.
+   Uses t2 (xor of the operands, its sign bit is the result sign) and
+   t3 (saved return address).  The __divsi3_adjust tail is shared with
+   __modsi3, which branches to it.  */
+.ent    __divsi3
+__divsi3:
+mv      t3, ra                  # keep our own return address across the call
+xor     t2, a0, a1              # negative exactly when the operand signs differ
+jl      __orgsi3
+__divsi3_adjust:
+cmpi.c  t2, 0
+bge     __divsi3_exit           # t2 non-negative: result stays as computed
+neg     r4, r4
+__divsi3_exit:
+br      t3
+.end    __divsi3
+/* signed modulus
+   Remainder of a0 / a1, negated when the dividend is negative: t2 holds
+   the original a0, the remainder left in a1 by the unsigned divide is
+   moved to r4, and the shared __divsi3_adjust tail negates r4 when t2 is
+   negative and returns through t3.  */
+.ent    __modsi3
+__modsi3:
+mv      t3, ra                  # keep our own return address across the call
+mv      t2, a0                  # sign of the dividend decides the result sign
+jl      __orgsi3
+mv      r4, a1                  # remainder becomes the result
+b       __divsi3_adjust
+.end    __modsi3
+#endif /* L_divsi3 */
+#else /* -fPIC */
+#if !defined(L_mulsi3) && !defined(L_divsi3)
+.set pic
+.text
+.global _flush_cache
+/* _flush_cache (PIC build): cache maintenance over the range that
+   starts at the address in r4 and whose length is in r5.  r5 >> 4 is
+   loaded into sr0 as the iteration count and the address advances 16
+   bytes per step.
+   Pass 1 issues cache op 0xe on every step (write back and invalidate
+   the data cache).  The LDM and LIM bits read from cr4 then select two
+   optional cache ops on address 0.  Pass 2 issues cache op 0x2 on every
+   step (invalidate the instruction cache).
+   On __score3__ the routine returns immediately.
+   Clobbers r8, r9, r10, sr0 and the condition flags; the entry and exit
+   adjust r0 by 8 and carry the .cpload / .cprestore directives.
+   r10 is loaded before the LDM test so that the LIM cache op never sees
+   a stale r10 when the LDM branch is skipped.
+   NOTE(review): a length below 16 gives a count of 0 and a length that
+   is not a multiple of 16 is rounded down; confirm how bcnz treats a
+   zero count and whether callers always pass whole 16 byte lines.  */
+#ifdef __score3__
+_flush_cache:
+br      r3
+#else
+_flush_cache:
+addi    r0, -8                  # pic used
+.cpload r29                     # pic used
+srli    r9, r5, 4               # r9 = length / 16 = number of steps
+mv      r8, r4                  # r8 = running address
+mtsr    r9, sr0                 # sr0 = counter consumed by bcnz
+1:
+cache   0xe, [r8, 0]            # write back invalid dcache
+addi    r8, 16
+bcnz    1b
+mfcr    r8, cr4
+ldi     r10, 0                  # address for both optional cache ops, set on every path
+bittst! r8, 0x3                 # if LDM is enable, write back LDM
+beq!    6f
+cache   0xc, [r10, 0]
+6:
+bittst! r8, 0x2                 # if LIM is enable, refill it
+beq!    7f
+cache   0x4, [r10, 0]
+7:
+#nop!
+#nop!
+#nop!
+#nop!
+#nop!
+mv      r8, r4                  # restart at the beginning of the range
+mtsr    r9, sr0                 # same step count as pass 1
+2:
+cache   0x2, [r8, 0]            # invalid unlock icache
+#nop!
+#nop!
+#nop!
+#nop!
+#nop!
+addi    r8, 16
+bcnz    2b
+.cprestore r0, 12               # pic used
+addi    r0, 8                   # pic used
+br      r3
+#endif
+#endif
+/* FUNCTION
+(U) INT32 v0 = __mulsi3 ((U) INT32 a0, (U) INT32 a1);
+(the code below hands the result back in r4, not in the register named v0)
+REGISTERS:
+use     t0, t1
+modify  a0
+a1      -> becomes 0
+NOTE:
+it seems to give better performance to just shift and add.  */
+#ifdef L_mulsi3
+.set pic
+.text
+.global __umulsi3
+.global __mulsi3
+/* signed multiplication (32x32)
+   PIC build.  __mulsi3 and __umulsi3 are two labels on the same code: a
+   shift-and-add loop that consumes the multiplier in a1 one bit at a
+   time, lowest bit first, and stops as soon as a1 has become zero.
+   In:  a0 = multiplicand, a1 = multiplier.
+   Out: r4 = the sum accumulated in t1.
+   Clobbers t0, t1, a1 (ends as 0) and the condition flags.  The entry
+   and exit adjust r0 by 8 (presumably the stack pointer; TODO confirm)
+   around the .cpload / .cprestore directives.  */
+.ent    __mulsi3
+__umulsi3:
+__mulsi3:
+addi    r0, -8                  # pic used
+.cpload r29                     # pic used
+li      t1, 0                   # t1 = running sum
+__mulsi3_loop:
+andri.c t0, a1, 1               # t0 = multiplier[0]
+srli    a1, a1, 1               # a1 /= 2
+beq     __mulsi3_loop2          # skip if (t0 == 0)
+add     t1, t1, a0              # add multiplicand
+__mulsi3_loop2:
+slli    a0, a0, 1               # multiplicand mul 2
+cmpi.c  a1, 0
+bne     __mulsi3_loop           # repeat while multiplier bits remain
+mv      r4, t1                  # hand the sum back in r4
+.cprestore r0, 12               # pic used
+addi    r0, 8                   # pic used
+br      ra
+.end    __mulsi3
+#endif /* L_mulsi3 */
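+/* FUNCTION
+UINT32 (v0) = __udivsi3 (UINT32 (a0), UINT32 (a1));
+INT32 (v0) = __divsi3 (INT32 (a0),  INT32 (a1));
+UINT32 (v0) = __umodsi3 (UINT32 (a0), UINT32 (a1));
+INT32 (v0) = __modsi3 (INT32 (a0),  INT32 (a1));
+(the code below hands each result back in r4, not in the register named v0)
+DESCRIPTION
+performs 32-bit division/modulo.
+REGISTERS
+used t0      bit-index
+t4           quotient
+t2, t3       sign and saved return address (signed and modulo entry points)
+modify a0    becomes remainder
+a1           shifted during the divide, then receives the remainder  */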
+#ifdef L_divsi3 /* PIC build of the divide and modulo family; __orgsi3 below is not exported */
+.set pic
+.text
+.global __udivsi3               # unsigned quotient
+.global __umodsi3               # unsigned remainder
+.global __divsi3                # signed quotient
+.global __modsi3                # signed remainder
+/* unsigned division
+   PIC build.  Shift-and-subtract divide of a0 by a1.
+   In:  a0 = dividend, a1 = divisor.
+   Out: r4 = quotient, a1 = remainder (what is left of a0).
+   A zero divisor goes straight to the exit: quotient 0, remainder = a0.
+   Phase 1 (__uds_normalize) doubles the divisor and the bit mask in t0
+   until the divisor exceeds the dividend or its top bit is set.
+   Phase 2 (__uds_loop2) walks back down, subtracting where the divisor
+   fits and setting the matching quotient bit.
+   Clobbers t0, t4, a0, a1 and the condition flags; the entry and exit
+   adjust r0 by 8 around the .cpload / .cprestore directives.
+   NOTE(review): the bcc tests assume carry clear after cmp.c means
+   a0 is below a1 (unsigned); confirm against the ISA manual.
+   NOTE(review): the callers that use brl load r29 with the callee
+   address first, but __orgsi3 branches here with r29 still holding the
+   address of __orgsi3 while this routine passes r29 to .cpload;
+   confirm that this is harmless.  */
+.ent    __udivsi3
+__udivsi3:
+addi    r0, -8                  # pic used
+.cpload r29                     # pic used
+li      t4, 0                   # t4 = quotient, built one bit at a time
+cmpi.c  a1, 0
+beq     __uds_exit              # divisor == 0: leave with quotient 0
+li      t0, 1                   # t0 = quotient bit paired with the current divisor
+blt     __uds_ok                # flags still from cmpi.c: top bit set, nothing to normalize
+__uds_normalize:
+cmp.c   a0, a1
+bcc     __uds_ok                # divisor now exceeds the dividend: stop doubling
+slli    a1, a1, 1               # divisor *= 2
+slli    t0, t0, 1               # quotient bit moves up with it
+cmpi.c  a1, 0
+bge     __uds_normalize         # continue until the divisor top bit is set
+__uds_ok:
+__uds_loop2:
+cmp.c   a0, a1
+bcc     __uds_loop3             # divisor does not fit: leave this quotient bit clear
+sub     a0, a0, a1              # take the divisor out of the running remainder
+or      t4, t4, t0              # and record the quotient bit
+__uds_loop3:
+srli    t0, t0, 1               # next lower quotient bit
+srli    a1, a1, 1               # divisor /= 2
+cmpi.c  t0, 0
+bne     __uds_loop2             # until the bit mask has shifted out
+__uds_exit:
+mv      a1, a0                  # remainder -> a1 (read by the modulo wrappers)
+mv      r4, t4                  # quotient  -> r4
+.cprestore r0, 12               # pic used
+addi    r0, 8                   # pic used
+br      ra
+.end    __udivsi3
+/* unsigned modulus
+   PIC build.  Calls __udivsi3 through r29 and returns the remainder
+   that routine leaves in a1.
+   In:  a0 = dividend, a1 = divisor.   Out: r4 = remainder.
+   The return address is parked in t3 across the call, since the callee
+   itself returns through ra.  No scratch register needs initializing
+   before the call: __udivsi3 sets up t0 and t4 itself and nothing on
+   this path reads t1.
+   Clobbers t3, r29 plus whatever __udivsi3 does; the entry and exit
+   adjust r0 by 8 around the .cpload / .cprestore directives.  */
+.ent    __umodsi3
+__umodsi3:
+addi    r0, -8                  # pic used
+.cpload r29                     # pic used
+mv      t3, ra                  # keep our own return address
+la      r29, __udivsi3          # callee address goes in r29 for the indirect call
+brl     r29
+mv      r4, a1                  # remainder becomes the result
+.cprestore r0, 12               # pic used
+addi    r0, 8                   # pic used
+br      t3
+.end    __umodsi3
+/* abs and div
+   PIC build.  Internal helper (no .global for it): replaces a0 and a1
+   by their negations when they are negative, then continues in
+   __udivsi3, which returns straight to whoever called __orgsi3.
+   Unlike the other PIC routines it has no r0 adjustment or .cpload of
+   its own; those happen once control reaches __udivsi3.  */
+.ent    __orgsi3
+__orgsi3:
+cmpi.c  a0, 0
+bge     __orgsi3_a0p            # dividend already non-negative
+neg     a0, a0
+__orgsi3_a0p:
+cmpi.c  a1, 0
+bge     __udivsi3               # divisor already non-negative: divide as is
+neg     a1, a1
+b       __udivsi3               # goto udivsi3
+.end    __orgsi3
+/* signed division
+   PIC build.  Quotient of a0 / a1: the magnitudes are divided through
+   __orgsi3 (called via r29) and the quotient in r4 is negated when the
+   operand signs differ.
+   Uses t2 (xor of the operands, its sign bit is the result sign) and
+   t3 (saved return address).  The __divsi3_adjust tail, including the
+   r0 restore at __divsi3_exit, is shared with __modsi3.  */
+.ent    __divsi3
+__divsi3:
+addi    r0, -8                  # pic used
+.cpload r29                     # pic used
+mv      t3, ra                  # keep our own return address across the call
+xor     t2, a0, a1              # negative exactly when the operand signs differ
+la      r29, __orgsi3           # callee address goes in r29 for the indirect call
+brl     r29
+__divsi3_adjust:
+cmpi.c  t2, 0
+bge     __divsi3_exit           # t2 non-negative: result stays as computed
+neg     r4, r4
+__divsi3_exit:
+.cprestore r0, 12               # pic used
+addi    r0, 8                   # pic used
+br      t3
+.end    __divsi3
+/* signed modulus
+   PIC build.  Remainder of a0 / a1, negated when the dividend is
+   negative: t2 holds the original a0, the remainder left in a1 by the
+   unsigned divide is moved to r4, and the shared __divsi3_adjust tail
+   negates r4 when t2 is negative.  That tail also undoes the r0
+   adjustment made at entry here and returns through t3.  */
+.ent    __modsi3
+__modsi3:
+addi    r0, -8                  # pic used
+.cpload r29                     # pic used
+mv      t3, ra                  # keep our own return address across the call
+mv      t2, a0                  # sign of the dividend decides the result sign
+la      r29, __orgsi3           # callee address goes in r29 for the indirect call
+brl     r29
+mv      r4, a1                  # remainder becomes the result
+b       __divsi3_adjust
+.end    __modsi3
+#endif /*L_divsi3 */
+#endif

Mercurial > hg > CbC > CbC_gcc

comparison gcc/config/score/mul-div.S @ 0:a06113de4d67