File: lib1funcs.asm

package info (click to toggle)
gccxml 0.9.0%2Bcvs20100501-2
links: PTS
area: main
in suites: squeeze
size: 79,132 kB
ctags: 73,371
sloc: ansic: 751,436; cpp: 34,175; asm: 26,833; sh: 5,077; makefile: 4,696; lex: 589; awk: 566; perl: 334; yacc: 271; pascal: 86; python: 29
file content (3893 lines) | stat: -rw-r--r-- 115,469 bytes
parent folder | download | duplicates (3)
/* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
   2004, 2005, 2006
   Free Software Foundation, Inc.

This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version.

In addition to the permissions in the GNU General Public License, the
Free Software Foundation gives you unlimited permission to link the
compiled version of this file into combinations with other programs,
and to distribute those combinations without any restriction coming
from the use of this file.  (The General Public License restrictions
do apply in other respects; for example, they cover modification of
the file, and distribution when not linked into a combine
executable.)

This file is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; see the file COPYING.  If not, write to
the Free Software Foundation, 51 Franklin Street, Fifth Floor,
Boston, MA 02110-1301, USA.  */

!! libgcc routines for the Renesas / SuperH SH CPUs.
!! Contributed by Steve Chamberlain.
!! sac@cygnus.com

!! ashiftrt_r4_x, ___ashrsi3, ___ashlsi3, ___lshrsi3 routines
!! recoded in assembly by Toshiyasu Morita
!! tm@netcom.com

/* SH2 optimizations for ___ashrsi3, ___ashlsi3, ___lshrsi3 and
   ELF local label prefixes by J"orn Rennecke
   amylaar@cygnus.com  */

#include "lib1funcs.h"

#if ! __SH5__
#ifdef L_ashiftrt
!
! GLOBAL(ashiftrt_r4_0) ... GLOBAL(ashiftrt_r4_32)
!
! Arithmetic right shift of r4 by a constant count.  There is one entry
! point per count, so the count is selected by the symbol that is called
! and no count register is read.
!
! Entry:
!
! r4: Value to shift
!
! Exit:
!
! r4: Value shifted right arithmetically by the count in the symbol name
!
! Destroys:
!
! T bit (may be written by the shift instructions)
!
! The entry points are laid out as fall-through ladders.  Entering at
! count N runs one shar per rung down to the tail of its ladder
! (count 1, 16 or 24), which completes the shift and returns.  The
! instruction that follows each rts sits in the branch delay slot and
! still executes before control returns to the caller.
!
        .global        GLOBAL(ashiftrt_r4_0)
        .global        GLOBAL(ashiftrt_r4_1)
        .global        GLOBAL(ashiftrt_r4_2)
        .global        GLOBAL(ashiftrt_r4_3)
        .global        GLOBAL(ashiftrt_r4_4)
        .global        GLOBAL(ashiftrt_r4_5)
        .global        GLOBAL(ashiftrt_r4_6)
        .global        GLOBAL(ashiftrt_r4_7)
        .global        GLOBAL(ashiftrt_r4_8)
        .global        GLOBAL(ashiftrt_r4_9)
        .global        GLOBAL(ashiftrt_r4_10)
        .global        GLOBAL(ashiftrt_r4_11)
        .global        GLOBAL(ashiftrt_r4_12)
        .global        GLOBAL(ashiftrt_r4_13)
        .global        GLOBAL(ashiftrt_r4_14)
        .global        GLOBAL(ashiftrt_r4_15)
        .global        GLOBAL(ashiftrt_r4_16)
        .global        GLOBAL(ashiftrt_r4_17)
        .global        GLOBAL(ashiftrt_r4_18)
        .global        GLOBAL(ashiftrt_r4_19)
        .global        GLOBAL(ashiftrt_r4_20)
        .global        GLOBAL(ashiftrt_r4_21)
        .global        GLOBAL(ashiftrt_r4_22)
        .global        GLOBAL(ashiftrt_r4_23)
        .global        GLOBAL(ashiftrt_r4_24)
        .global        GLOBAL(ashiftrt_r4_25)
        .global        GLOBAL(ashiftrt_r4_26)
        .global        GLOBAL(ashiftrt_r4_27)
        .global        GLOBAL(ashiftrt_r4_28)
        .global        GLOBAL(ashiftrt_r4_29)
        .global        GLOBAL(ashiftrt_r4_30)
        .global        GLOBAL(ashiftrt_r4_31)
        .global        GLOBAL(ashiftrt_r4_32)

        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_0))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_1))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_2))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_3))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_4))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_5))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_6))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_7))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_8))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_9))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_10))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_11))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_12))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_13))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_14))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_15))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_16))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_17))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_18))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_19))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_20))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_21))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_22))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_23))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_24))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_25))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_26))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_27))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_28))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_29))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_30))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_31))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_32))

        .align        1
! Counts 31 and 32 share one body: the result is just the sign of r4,
! i.e. 0 or -1.  rotcl moves the sign bit into T, and the subc in the
! delay slot computes r4 - r4 - T.
GLOBAL(ashiftrt_r4_32):
GLOBAL(ashiftrt_r4_31):
        rotcl        r4
        rts
        subc        r4,r4

! Counts 25..30: one shar per rung, then fall into the count-24 tail.
GLOBAL(ashiftrt_r4_30):
        shar        r4
GLOBAL(ashiftrt_r4_29):
        shar        r4
GLOBAL(ashiftrt_r4_28):
        shar        r4
GLOBAL(ashiftrt_r4_27):
        shar        r4
GLOBAL(ashiftrt_r4_26):
        shar        r4
GLOBAL(ashiftrt_r4_25):
        shar        r4
! Count 24: bring the top byte down with logical shifts (16 + 8), then
! sign-extend that byte in the delay slot.
GLOBAL(ashiftrt_r4_24):
        shlr16        r4
        shlr8        r4
        rts
        exts.b        r4,r4

! Counts 17..23: one shar per rung, then fall into the count-16 tail.
GLOBAL(ashiftrt_r4_23):
        shar        r4
GLOBAL(ashiftrt_r4_22):
        shar        r4
GLOBAL(ashiftrt_r4_21):
        shar        r4
GLOBAL(ashiftrt_r4_20):
        shar        r4
GLOBAL(ashiftrt_r4_19):
        shar        r4
GLOBAL(ashiftrt_r4_18):
        shar        r4
GLOBAL(ashiftrt_r4_17):
        shar        r4
! Count 16: bring the top half down with a logical shift, then
! sign-extend that halfword in the delay slot.
GLOBAL(ashiftrt_r4_16):
        shlr16        r4
        rts
        exts.w        r4,r4

! Counts 2..15: one shar per rung, then fall into the count-1 tail.
GLOBAL(ashiftrt_r4_15):
        shar        r4
GLOBAL(ashiftrt_r4_14):
        shar        r4
GLOBAL(ashiftrt_r4_13):
        shar        r4
GLOBAL(ashiftrt_r4_12):
        shar        r4
GLOBAL(ashiftrt_r4_11):
        shar        r4
GLOBAL(ashiftrt_r4_10):
        shar        r4
GLOBAL(ashiftrt_r4_9):
        shar        r4
GLOBAL(ashiftrt_r4_8):
        shar        r4
GLOBAL(ashiftrt_r4_7):
        shar        r4
GLOBAL(ashiftrt_r4_6):
        shar        r4
GLOBAL(ashiftrt_r4_5):
        shar        r4
GLOBAL(ashiftrt_r4_4):
        shar        r4
GLOBAL(ashiftrt_r4_3):
        shar        r4
GLOBAL(ashiftrt_r4_2):
        shar        r4
! Count 1: the single remaining shar is done in the delay slot.
GLOBAL(ashiftrt_r4_1):
        rts
        shar        r4

! Count 0: r4 is returned unchanged.
GLOBAL(ashiftrt_r4_0):
        rts
        nop

        ENDFUNC(GLOBAL(ashiftrt_r4_0))
        ENDFUNC(GLOBAL(ashiftrt_r4_1))
        ENDFUNC(GLOBAL(ashiftrt_r4_2))
        ENDFUNC(GLOBAL(ashiftrt_r4_3))
        ENDFUNC(GLOBAL(ashiftrt_r4_4))
        ENDFUNC(GLOBAL(ashiftrt_r4_5))
        ENDFUNC(GLOBAL(ashiftrt_r4_6))
        ENDFUNC(GLOBAL(ashiftrt_r4_7))
        ENDFUNC(GLOBAL(ashiftrt_r4_8))
        ENDFUNC(GLOBAL(ashiftrt_r4_9))
        ENDFUNC(GLOBAL(ashiftrt_r4_10))
        ENDFUNC(GLOBAL(ashiftrt_r4_11))
        ENDFUNC(GLOBAL(ashiftrt_r4_12))
        ENDFUNC(GLOBAL(ashiftrt_r4_13))
        ENDFUNC(GLOBAL(ashiftrt_r4_14))
        ENDFUNC(GLOBAL(ashiftrt_r4_15))
        ENDFUNC(GLOBAL(ashiftrt_r4_16))
        ENDFUNC(GLOBAL(ashiftrt_r4_17))
        ENDFUNC(GLOBAL(ashiftrt_r4_18))
        ENDFUNC(GLOBAL(ashiftrt_r4_19))
        ENDFUNC(GLOBAL(ashiftrt_r4_20))
        ENDFUNC(GLOBAL(ashiftrt_r4_21))
        ENDFUNC(GLOBAL(ashiftrt_r4_22))
        ENDFUNC(GLOBAL(ashiftrt_r4_23))
        ENDFUNC(GLOBAL(ashiftrt_r4_24))
        ENDFUNC(GLOBAL(ashiftrt_r4_25))
        ENDFUNC(GLOBAL(ashiftrt_r4_26))
        ENDFUNC(GLOBAL(ashiftrt_r4_27))
        ENDFUNC(GLOBAL(ashiftrt_r4_28))
        ENDFUNC(GLOBAL(ashiftrt_r4_29))
        ENDFUNC(GLOBAL(ashiftrt_r4_30))
        ENDFUNC(GLOBAL(ashiftrt_r4_31))
        ENDFUNC(GLOBAL(ashiftrt_r4_32))
#endif

#ifdef L_ashiftrt_n

!
! GLOBAL(ashrsi3)
!
! Arithmetic (sign-propagating) right shift by a variable count.
! Only the low five bits of the count are used.
!
! Entry:
!
! r4: Value to shift
! r5: Shift count
!
! Exit:
!
! r0: Result
!
! Destroys:
!
! r5 (overwritten with the masked count and then with the table offset)
! T bit (may be written by the shift instructions)
!

        .global        GLOBAL(ashrsi3)
        HIDDEN_FUNC(GLOBAL(ashrsi3))
        .align        2
GLOBAL(ashrsi3):
! r5 = count & 31, then r5 = byte offset of the handler for that count,
! read from the table below (mov.b sign-extends the byte).
        mov        #31,r0
        and        r0,r5
        mova        LOCAL(ashrsi3_table),r0
        mov.b        @(r0,r5),r5
#ifdef __sh1__
! No braf on this path: add the offset to the table address in r0 and
! jump to the absolute address.
        add        r5,r0
        jmp        @r0
#else
! braf is relative to the address 4 bytes past the braf itself.
! NOTE(review): this relies on the table starting exactly there, i.e. on
! the .align below inserting no padding on this path.
        braf        r5
#endif
! Delay slot of the jump: r0 = value to shift.
        mov        r4,r0

        .align        2
! One byte per shift count 0..31: distance from the table to the handler.
LOCAL(ashrsi3_table):
        .byte                LOCAL(ashrsi3_0)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_1)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_2)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_3)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_4)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_5)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_6)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_7)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_8)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_9)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_10)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_11)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_12)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_13)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_14)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_15)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_16)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_17)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_18)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_19)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_20)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_21)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_22)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_23)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_24)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_25)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_26)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_27)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_28)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_29)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_30)-LOCAL(ashrsi3_table)
        .byte                LOCAL(ashrsi3_31)-LOCAL(ashrsi3_table)

! Count 31: the result is the sign of r0, i.e. 0 or -1.  rotcl moves the
! sign bit into T, and the subc in the delay slot computes r0 - r0 - T.
LOCAL(ashrsi3_31):
        rotcl        r0
        rts
        subc        r0,r0

! Counts 25..30: one shar per rung, then fall into the count-24 tail.
LOCAL(ashrsi3_30):
        shar        r0
LOCAL(ashrsi3_29):
        shar        r0
LOCAL(ashrsi3_28):
        shar        r0
LOCAL(ashrsi3_27):
        shar        r0
LOCAL(ashrsi3_26):
        shar        r0
LOCAL(ashrsi3_25):
        shar        r0
! Count 24: logical shift by 16 + 8, then sign-extend the remaining byte.
LOCAL(ashrsi3_24):
        shlr16        r0
        shlr8        r0
        rts
        exts.b        r0,r0

! Counts 17..23: one shar per rung, then fall into the count-16 tail.
LOCAL(ashrsi3_23):
        shar        r0
LOCAL(ashrsi3_22):
        shar        r0
LOCAL(ashrsi3_21):
        shar        r0
LOCAL(ashrsi3_20):
        shar        r0
LOCAL(ashrsi3_19):
        shar        r0
LOCAL(ashrsi3_18):
        shar        r0
LOCAL(ashrsi3_17):
        shar        r0
! Count 16: logical shift by 16, then sign-extend the remaining halfword.
LOCAL(ashrsi3_16):
        shlr16        r0
        rts
        exts.w        r0,r0

! Counts 2..15: one shar per rung, then fall into the count-1 tail.
LOCAL(ashrsi3_15):
        shar        r0
LOCAL(ashrsi3_14):
        shar        r0
LOCAL(ashrsi3_13):
        shar        r0
LOCAL(ashrsi3_12):
        shar        r0
LOCAL(ashrsi3_11):
        shar        r0
LOCAL(ashrsi3_10):
        shar        r0
LOCAL(ashrsi3_9):
        shar        r0
LOCAL(ashrsi3_8):
        shar        r0
LOCAL(ashrsi3_7):
        shar        r0
LOCAL(ashrsi3_6):
        shar        r0
LOCAL(ashrsi3_5):
        shar        r0
LOCAL(ashrsi3_4):
        shar        r0
LOCAL(ashrsi3_3):
        shar        r0
LOCAL(ashrsi3_2):
        shar        r0
! Count 1: the single remaining shar is done in the delay slot.
LOCAL(ashrsi3_1):
        rts
        shar        r0

! Count 0: r0 already holds the unshifted value.
LOCAL(ashrsi3_0):
        rts
        nop

        ENDFUNC(GLOBAL(ashrsi3))
#endif

#ifdef L_ashiftlt

!
! GLOBAL(ashlsi3)
!
! Left shift by a variable count.
! Only the low five bits of the count are used.
!
! Entry:
!
! r4: Value to shift
! r5: Shift count
!
! Exit:
!
! r0: Result
!
! Destroys:
!
! r5 (overwritten with the masked count and then with the table offset)
! T bit (may be written by the single-bit shll used for odd counts)
!
        .global        GLOBAL(ashlsi3)
        HIDDEN_FUNC(GLOBAL(ashlsi3))
        .align        2
GLOBAL(ashlsi3):
! r5 = count & 31, then r5 = byte offset of the handler for that count,
! read from the table below (mov.b sign-extends the byte).
        mov        #31,r0
        and        r0,r5
        mova        LOCAL(ashlsi3_table),r0
        mov.b        @(r0,r5),r5
#ifdef __sh1__
! No braf on this path: add the offset to the table address in r0 and
! jump to the absolute address.
        add        r5,r0
        jmp        @r0
#else
! braf is relative to the address 4 bytes past the braf itself.
! NOTE(review): this relies on the table starting exactly there, i.e. on
! the .align below inserting no padding on this path.
        braf        r5
#endif
! Delay slot of the jump: r0 = value to shift.
        mov        r4,r0

        .align        2
! One byte per shift count 0..31: distance from the table to the handler.
LOCAL(ashlsi3_table):
        .byte                LOCAL(ashlsi3_0)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_1)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_2)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_3)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_4)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_5)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_6)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_7)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_8)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_9)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_10)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_11)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_12)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_13)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_14)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_15)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_16)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_17)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_18)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_19)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_20)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_21)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_22)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_23)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_24)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_25)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_26)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_27)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_28)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_29)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_30)-LOCAL(ashlsi3_table)
        .byte                LOCAL(ashlsi3_31)-LOCAL(ashlsi3_table)

! The handlers are fall-through ladders.  Every rung above a tail adds a
! shift by 2; the tail supplies the rest and returns, with its last shift
! in the rts delay slot.

! Counts 6, 4, 2: tail shifts by 2.
LOCAL(ashlsi3_6):
        shll2        r0
LOCAL(ashlsi3_4):
        shll2        r0
LOCAL(ashlsi3_2):
        rts
        shll2        r0

! Counts 7, 5, 3, 1: tail shifts by 1.
LOCAL(ashlsi3_7):
        shll2        r0
LOCAL(ashlsi3_5):
        shll2        r0
LOCAL(ashlsi3_3):
        shll2        r0
LOCAL(ashlsi3_1):
        rts
        shll        r0

! Counts 14, 12, 10, 8: tail shifts by 8.
LOCAL(ashlsi3_14):
        shll2        r0
LOCAL(ashlsi3_12):
        shll2        r0
LOCAL(ashlsi3_10):
        shll2        r0
LOCAL(ashlsi3_8):
        rts
        shll8        r0

! Counts 15, 13, 11, 9: tail shifts by 8 + 1.
LOCAL(ashlsi3_15):
        shll2        r0
LOCAL(ashlsi3_13):
        shll2        r0
LOCAL(ashlsi3_11):
        shll2        r0
LOCAL(ashlsi3_9):
        shll8        r0
        rts
        shll        r0

! Counts 22, 20, 18, 16: tail shifts by 16.
LOCAL(ashlsi3_22):
        shll2        r0
LOCAL(ashlsi3_20):
        shll2        r0
LOCAL(ashlsi3_18):
        shll2        r0
LOCAL(ashlsi3_16):
        rts
        shll16        r0

! Counts 23, 21, 19, 17: tail shifts by 16 + 1.
LOCAL(ashlsi3_23):
        shll2        r0
LOCAL(ashlsi3_21):
        shll2        r0
LOCAL(ashlsi3_19):
        shll2        r0
LOCAL(ashlsi3_17):
        shll16        r0
        rts
        shll        r0

! Counts 30, 28, 26, 24: tail shifts by 16 + 8.
LOCAL(ashlsi3_30):
        shll2        r0
LOCAL(ashlsi3_28):
        shll2        r0
LOCAL(ashlsi3_26):
        shll2        r0
LOCAL(ashlsi3_24):
        shll16        r0
        rts
        shll8        r0

! Counts 31, 29, 27, 25: tail shifts by 16 + 8 + 1.
LOCAL(ashlsi3_31):
        shll2        r0
LOCAL(ashlsi3_29):
        shll2        r0
LOCAL(ashlsi3_27):
        shll2        r0
LOCAL(ashlsi3_25):
        shll16        r0
        shll8        r0
        rts
        shll        r0

! Count 0: r0 already holds the unshifted value.
LOCAL(ashlsi3_0):
        rts
        nop

        ENDFUNC(GLOBAL(ashlsi3))
#endif

#ifdef L_lshiftrt

!
! GLOBAL(lshrsi3)
!
! Logical (zero-filling) right shift by a variable count.
! Only the low five bits of the count are used.
!
! Entry:
!
! r4: Value to shift
! r5: Shift count
!
! Exit:
!
! r0: Result
!
! Destroys:
!
! r5 (overwritten with the masked count and then with the table offset)
! T bit (may be written by the single-bit shlr used for odd counts)
!
        .global        GLOBAL(lshrsi3)
        HIDDEN_FUNC(GLOBAL(lshrsi3))
        .align        2
GLOBAL(lshrsi3):
! r5 = count & 31, then r5 = byte offset of the handler for that count,
! read from the table below (mov.b sign-extends the byte).
        mov        #31,r0
        and        r0,r5
        mova        LOCAL(lshrsi3_table),r0
        mov.b        @(r0,r5),r5
#ifdef __sh1__
! No braf on this path: add the offset to the table address in r0 and
! jump to the absolute address.
        add        r5,r0
        jmp        @r0
#else
! braf is relative to the address 4 bytes past the braf itself.
! NOTE(review): this relies on the table starting exactly there, i.e. on
! the .align below inserting no padding on this path.
        braf        r5
#endif
! Delay slot of the jump: r0 = value to shift.
        mov        r4,r0

        .align        2
! One byte per shift count 0..31: distance from the table to the handler.
LOCAL(lshrsi3_table):
        .byte                LOCAL(lshrsi3_0)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_1)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_2)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_3)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_4)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_5)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_6)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_7)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_8)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_9)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_10)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_11)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_12)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_13)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_14)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_15)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_16)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_17)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_18)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_19)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_20)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_21)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_22)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_23)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_24)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_25)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_26)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_27)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_28)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_29)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_30)-LOCAL(lshrsi3_table)
        .byte                LOCAL(lshrsi3_31)-LOCAL(lshrsi3_table)

! The handlers are fall-through ladders.  Every rung above a tail adds a
! shift by 2; the tail supplies the rest and returns, with its last shift
! in the rts delay slot.

! Counts 6, 4, 2: tail shifts by 2.
LOCAL(lshrsi3_6):
        shlr2        r0
LOCAL(lshrsi3_4):
        shlr2        r0
LOCAL(lshrsi3_2):
        rts
        shlr2        r0

! Counts 7, 5, 3, 1: tail shifts by 1.
LOCAL(lshrsi3_7):
        shlr2        r0
LOCAL(lshrsi3_5):
        shlr2        r0
LOCAL(lshrsi3_3):
        shlr2        r0
LOCAL(lshrsi3_1):
        rts
        shlr        r0

! Counts 14, 12, 10, 8: tail shifts by 8.
LOCAL(lshrsi3_14):
        shlr2        r0
LOCAL(lshrsi3_12):
        shlr2        r0
LOCAL(lshrsi3_10):
        shlr2        r0
LOCAL(lshrsi3_8):
        rts
        shlr8        r0

! Counts 15, 13, 11, 9: tail shifts by 8 + 1.
LOCAL(lshrsi3_15):
        shlr2        r0
LOCAL(lshrsi3_13):
        shlr2        r0
LOCAL(lshrsi3_11):
        shlr2        r0
LOCAL(lshrsi3_9):
        shlr8        r0
        rts
        shlr        r0

! Counts 22, 20, 18, 16: tail shifts by 16.
LOCAL(lshrsi3_22):
        shlr2        r0
LOCAL(lshrsi3_20):
        shlr2        r0
LOCAL(lshrsi3_18):
        shlr2        r0
LOCAL(lshrsi3_16):
        rts
        shlr16        r0

! Counts 23, 21, 19, 17: tail shifts by 16 + 1.
LOCAL(lshrsi3_23):
        shlr2        r0
LOCAL(lshrsi3_21):
        shlr2        r0
LOCAL(lshrsi3_19):
        shlr2        r0
LOCAL(lshrsi3_17):
        shlr16        r0
        rts
        shlr        r0

! Counts 30, 28, 26, 24: tail shifts by 16 + 8.
LOCAL(lshrsi3_30):
        shlr2        r0
LOCAL(lshrsi3_28):
        shlr2        r0
LOCAL(lshrsi3_26):
        shlr2        r0
LOCAL(lshrsi3_24):
        shlr16        r0
        rts
        shlr8        r0

! Counts 31, 29, 27, 25: tail shifts by 16 + 8 + 1.
LOCAL(lshrsi3_31):
        shlr2        r0
LOCAL(lshrsi3_29):
        shlr2        r0
LOCAL(lshrsi3_27):
        shlr2        r0
LOCAL(lshrsi3_25):
        shlr16        r0
        shlr8        r0
        rts
        shlr        r0

! Count 0: r0 already holds the unshifted value.
LOCAL(lshrsi3_0):
        rts
        nop

        ENDFUNC(GLOBAL(lshrsi3))
#endif

#ifdef L_movmem
!
! GLOBAL(movmem)
!
! Block copy in 64-byte groups, built on the unrolled
! movmemSI64 .. movmemSI4 chain further down in this section.
!
! Entry:
!
! r4: Destination address
! r5: Source address
! r6: Loop control value.  It is multiplied by 4 on entry and then
!     stepped by -64 per group.
!     NOTE(review): the exact encoding is chosen by the caller and is
!     not visible here - confirm against the compiler side.
!
! Destroys:
!
! r0, r4, r5, r6, T bit (pr is saved on the stack and restored)
!
        .text
        .balign        4
        .global        GLOBAL(movmem)
        HIDDEN_FUNC(GLOBAL(movmem))
        HIDDEN_ALIAS(movstr,movmem)
        /* This would be a lot simpler if r6 contained the byte count
           minus 64, and we wouldn't be called here for a byte count of 64.  */
GLOBAL(movmem):
        sts.l        pr,@-r15
        shll2        r6
! Call into the unrolled chain one instruction past movmemSI52: the load
! that movmemSI52 starts with is done in the delay slot instead.  The chain
! copies offsets 48 down to 0 and its final rts returns to movmem_loop.
        bsr        GLOBAL(movmemSI52+2)
        mov.l        @(48,r5),r0
        .balign        4
LOCAL(movmem_loop): /* Reached with rts */
! Finish the current group (offsets 60, 56, 52) and step r6 by -64.
! pr is left pointing here, so every pass through the chain comes back
! to this label.
        mov.l        @(60,r5),r0
        add        #-64,r6
        mov.l        r0,@(60,r4)
        tst        r6,r6
        mov.l        @(56,r5),r0
! r6 == 0: nothing follows this group; finish it in movmem_done.
        bt        LOCAL(movmem_done)
        mov.l        r0,@(56,r4)
        cmp/pl        r6
        mov.l        @(52,r5),r0
        add        #64,r5
        mov.l        r0,@(52,r4)
        add        #64,r4
! r6 > 0: run the chain again for offsets 48..0 of the next group.
        bt        GLOBAL(movmemSI52)
! done all the large groups, do the remainder
! jump to movmem+
! r6 is negative here.  It is added to the address just past movmemSI4 to
! pick the chain entry point to jump to (each entry point is 4 bytes of code).
        mova        GLOBAL(movmemSI4)+4,r0
        add        r6,r0
        jmp        @r0
LOCAL(movmem_done): ! share slot insn, works out aligned.
! The lds.l below is both the delay slot of the jmp above (restoring pr so
! that the rts at the end of the chain returns to the caller) and the first
! instruction of the r6 == 0 exit path, which stores offsets 56 and 52.
        lds.l        @r15+,pr
        mov.l        r0,@(56,r4)
        mov.l        @(52,r5),r0
        rts
        mov.l        r0,@(52,r4)
        .balign        4
! movmemSI64 .. movmemSI4: fixed-size copies of 64 .. 4 bytes.
! Each entry point is one load/store pair (4 bytes of code) that copies the
! longword at offset size-4 and falls through to the next smaller size,
! ending with the rts after movmemSI4.  r4 = destination, r5 = source,
! r0 is the scratch register.  The fixed 4-byte spacing of the entry points
! is relied on by the computed jump in movmem above.
! ??? We need aliases movstr* for movmem* for the older libraries.  These
! aliases will be removed at some point in the future.
        .global        GLOBAL(movmemSI64)
        HIDDEN_FUNC(GLOBAL(movmemSI64))
        HIDDEN_ALIAS(movstrSI64,movmemSI64)
GLOBAL(movmemSI64):
        mov.l        @(60,r5),r0
        mov.l        r0,@(60,r4)
        .global        GLOBAL(movmemSI60)
        HIDDEN_FUNC(GLOBAL(movmemSI60))
        HIDDEN_ALIAS(movstrSI60,movmemSI60)
GLOBAL(movmemSI60):
        mov.l        @(56,r5),r0
        mov.l        r0,@(56,r4)
        .global        GLOBAL(movmemSI56)
        HIDDEN_FUNC(GLOBAL(movmemSI56))
        HIDDEN_ALIAS(movstrSI56,movmemSI56)
GLOBAL(movmemSI56):
        mov.l        @(52,r5),r0
        mov.l        r0,@(52,r4)
        .global        GLOBAL(movmemSI52)
        HIDDEN_FUNC(GLOBAL(movmemSI52))
        HIDDEN_ALIAS(movstrSI52,movmemSI52)
GLOBAL(movmemSI52):
        mov.l        @(48,r5),r0
        mov.l        r0,@(48,r4)
        .global        GLOBAL(movmemSI48)
        HIDDEN_FUNC(GLOBAL(movmemSI48))
        HIDDEN_ALIAS(movstrSI48,movmemSI48)
GLOBAL(movmemSI48):
        mov.l        @(44,r5),r0
        mov.l        r0,@(44,r4)
        .global        GLOBAL(movmemSI44)
        HIDDEN_FUNC(GLOBAL(movmemSI44))
        HIDDEN_ALIAS(movstrSI44,movmemSI44)
GLOBAL(movmemSI44):
        mov.l        @(40,r5),r0
        mov.l        r0,@(40,r4)
        .global        GLOBAL(movmemSI40)
        HIDDEN_FUNC(GLOBAL(movmemSI40))
        HIDDEN_ALIAS(movstrSI40,movmemSI40)
GLOBAL(movmemSI40):
        mov.l        @(36,r5),r0
        mov.l        r0,@(36,r4)
        .global        GLOBAL(movmemSI36)
        HIDDEN_FUNC(GLOBAL(movmemSI36))
        HIDDEN_ALIAS(movstrSI36,movmemSI36)
GLOBAL(movmemSI36):
        mov.l        @(32,r5),r0
        mov.l        r0,@(32,r4)
        .global        GLOBAL(movmemSI32)
        HIDDEN_FUNC(GLOBAL(movmemSI32))
        HIDDEN_ALIAS(movstrSI32,movmemSI32)
GLOBAL(movmemSI32):
        mov.l        @(28,r5),r0
        mov.l        r0,@(28,r4)
        .global        GLOBAL(movmemSI28)
        HIDDEN_FUNC(GLOBAL(movmemSI28))
        HIDDEN_ALIAS(movstrSI28,movmemSI28)
GLOBAL(movmemSI28):
        mov.l        @(24,r5),r0
        mov.l        r0,@(24,r4)
        .global        GLOBAL(movmemSI24)
        HIDDEN_FUNC(GLOBAL(movmemSI24))
        HIDDEN_ALIAS(movstrSI24,movmemSI24)
GLOBAL(movmemSI24):
        mov.l        @(20,r5),r0
        mov.l        r0,@(20,r4)
        .global        GLOBAL(movmemSI20)
        HIDDEN_FUNC(GLOBAL(movmemSI20))
        HIDDEN_ALIAS(movstrSI20,movmemSI20)
GLOBAL(movmemSI20):
        mov.l        @(16,r5),r0
        mov.l        r0,@(16,r4)
        .global        GLOBAL(movmemSI16)
        HIDDEN_FUNC(GLOBAL(movmemSI16))
        HIDDEN_ALIAS(movstrSI16,movmemSI16)
GLOBAL(movmemSI16):
        mov.l        @(12,r5),r0
        mov.l        r0,@(12,r4)
        .global        GLOBAL(movmemSI12)
        HIDDEN_FUNC(GLOBAL(movmemSI12))
        HIDDEN_ALIAS(movstrSI12,movmemSI12)
GLOBAL(movmemSI12):
        mov.l        @(8,r5),r0
        mov.l        r0,@(8,r4)
        .global        GLOBAL(movmemSI8)
        HIDDEN_FUNC(GLOBAL(movmemSI8))
        HIDDEN_ALIAS(movstrSI8,movmemSI8)
GLOBAL(movmemSI8):
        mov.l        @(4,r5),r0
        mov.l        r0,@(4,r4)
        .global        GLOBAL(movmemSI4)
        HIDDEN_FUNC(GLOBAL(movmemSI4))
        HIDDEN_ALIAS(movstrSI4,movmemSI4)
GLOBAL(movmemSI4):
        mov.l        @(0,r5),r0
        rts
        mov.l        r0,@(0,r4)

        ENDFUNC(GLOBAL(movmemSI64))
        ENDFUNC(GLOBAL(movmemSI60))
        ENDFUNC(GLOBAL(movmemSI56))
        ENDFUNC(GLOBAL(movmemSI52))
        ENDFUNC(GLOBAL(movmemSI48))
        ENDFUNC(GLOBAL(movmemSI44))
        ENDFUNC(GLOBAL(movmemSI40))
        ENDFUNC(GLOBAL(movmemSI36))
        ENDFUNC(GLOBAL(movmemSI32))
        ENDFUNC(GLOBAL(movmemSI28))
        ENDFUNC(GLOBAL(movmemSI24))
        ENDFUNC(GLOBAL(movmemSI20))
        ENDFUNC(GLOBAL(movmemSI16))
        ENDFUNC(GLOBAL(movmemSI12))
        ENDFUNC(GLOBAL(movmemSI8))
        ENDFUNC(GLOBAL(movmemSI4))
        ENDFUNC(GLOBAL(movmem))
#endif

#ifdef L_movmem_i4
!
! GLOBAL(movmem_i4_even), GLOBAL(movmem_i4_odd)
!
! Longword block copy in which the loads from the source run ahead of
! the stores to the destination.
!
! Entry:
!
! r4: Destination address
! r5: Source address (read with post-increment)
! r6: Loop counter; each dt r6 in the loop accounts for two longwords.
!     NOTE(review): the relation between r6 and the byte count is fixed
!     by the caller - confirm against the compiler side.
!
! Destroys:
!
! r0, r1, r2, r3, r4, r5, r6, T bit
!
! movmem_i4_even enters the loop with two longwords loaded (r0, r1).
! movmem_i4_odd enters with three (r1, r2, r3) and biases r4 by -4 so that
! the store offsets used inside the loop still line up.
!
! GLOBAL(movmemSI12_i4) copies exactly three longwords from r5 to r4 and
! uses r0, r1 and r2 as scratch registers.
!
        .text
        .global        GLOBAL(movmem_i4_even)
        .global        GLOBAL(movmem_i4_odd)
        .global        GLOBAL(movmemSI12_i4)

        HIDDEN_FUNC(GLOBAL(movmem_i4_even))
        HIDDEN_FUNC(GLOBAL(movmem_i4_odd))
        HIDDEN_FUNC(GLOBAL(movmemSI12_i4))

        HIDDEN_ALIAS(movstr_i4_even,movmem_i4_even)
        HIDDEN_ALIAS(movstr_i4_odd,movmem_i4_odd)
        HIDDEN_ALIAS(movstrSI12_i4,movmemSI12_i4)

        .p2align        5
! Exit taken when the first dt in the loop reaches zero: the two longwords
! already loaded into r0 and r1 are stored at offsets 16 and 20 (r4 has not
! been advanced on this path), the second one in the rts delay slot.
L_movmem_2mod4_end:
        mov.l        r0,@(16,r4)
        rts
        mov.l        r1,@(20,r4)

        .p2align        2

GLOBAL(movmem_i4_even):
        mov.l        @r5+,r0
        bra        L_movmem_start_even
! Delay slot of the bra: second longword of the first pair.
        mov.l        @r5+,r1

GLOBAL(movmem_i4_odd):
        mov.l        @r5+,r1
! Bias the destination so that offsets 4 and 8 below address the first
! two destination longwords.
        add        #-4,r4
        mov.l        @r5+,r2
        mov.l        @r5+,r3
        mov.l        r1,@(4,r4)
        mov.l        r2,@(8,r4)

L_movmem_loop:
        mov.l        r3,@(12,r4)
        dt        r6
        mov.l        @r5+,r0
! bt/s and bf/s are delayed branches: the instruction that follows each of
! them is executed whether or not the branch is taken.
        bt/s        L_movmem_2mod4_end
        mov.l        @r5+,r1
        add        #16,r4
L_movmem_start_even:
        mov.l        @r5+,r2
        mov.l        @r5+,r3
        mov.l        r0,@r4
        dt        r6
        mov.l        r1,@(4,r4)
        bf/s        L_movmem_loop
        mov.l        r2,@(8,r4)
! Counter reached zero at the second dt: store the last longword in the
! rts delay slot.
        rts
        mov.l        r3,@(12,r4)

        ENDFUNC(GLOBAL(movmem_i4_even))
        ENDFUNC(GLOBAL(movmem_i4_odd))

        .p2align        4
! All three loads are issued before the first store.
GLOBAL(movmemSI12_i4):
        mov.l        @r5,r0
        mov.l        @(4,r5),r1
        mov.l        @(8,r5),r2
        mov.l        r0,@r4
        mov.l        r1,@(4,r4)
        rts
        mov.l        r2,@(8,r4)

        ENDFUNC(GLOBAL(movmemSI12_i4))
#endif

#ifdef L_mulsi3

!
! GLOBAL(mulsi3)
!
! 32-bit integer multiply built from 16x16 -> 32 bit mulu.w partial
! products.
!
! Entry:
!
! r4 = aabb   (aa = upper 16 bits, bb = lower 16 bits)
! r5 = ccdd   (cc = upper 16 bits, dd = lower 16 bits)
!
! Exit:
!
! r0 = low 32 bits of aabb*ccdd
!
! Destroys:
!
! r1, r2, r3, macl, T bit
!
! if aa == 0 and cc == 0
! r0 = bb*dd
!
! else
! r0 = bb*dd + (aa*dd*65536) + (cc*bb*65536)
!
! The aa*cc term is not computed: it would be scaled by 2**32 and so
! cannot contribute to the 32-bit result.
!

        .global        GLOBAL(mulsi3)
        HIDDEN_FUNC(GLOBAL(mulsi3))

GLOBAL(mulsi3):
        mulu.w  r4,r5                   ! multiply the lsws  macl=bb*dd
        mov     r5,r3                   ! r3 = ccdd
        swap.w  r4,r2                   ! r2 = bbaa
        xtrct   r2,r3                   ! r3 = aacc
        tst     r3,r3                   ! msws zero ?
        bf      LOCAL(mulsi3_hiset)     ! no - cross products are needed
        rts                             ! yes - then we have the answer
        sts     macl,r0                 ! delay slot: r0 = bb*dd

! Slow path: at least one upper half is non-zero.  The label goes through
! LOCAL() like the other internal labels in this file, so it does not show
! up as an ordinary symbol in the object file.
LOCAL(mulsi3_hiset):
        sts     macl,r0                 ! r0 = bb*dd
        mulu.w  r2,r5                   ! brewing macl = aa*dd
        sts     macl,r1
        mulu.w  r3,r4                   ! brewing macl = cc*bb
        sts     macl,r2
        add     r1,r2                   ! r2 = aa*dd + cc*bb
        shll16  r2                      ! scale the cross products by 65536
        rts
        add     r2,r0                   ! delay slot: add them to bb*dd

        ENDFUNC(GLOBAL(mulsi3))
#endif
#endif /* ! __SH5__ */
#ifdef L_sdivsi3_i4
        .title "SH DIVIDE"
!! 4 byte integer Divide code for the Renesas SH
!!
!! GLOBAL(sdivsi3_i4): signed 32-bit division carried out in the FPU.
!! Both operands are converted to double precision (dr0, dr2), divided,
!! and the quotient is converted back to an integer with ftrc and left
!! in fpul.  Neither variant tests for a zero divisor.
#ifdef __SH4__
!! args in r4 and r5, result in fpul, clobber dr0, dr2
!! This variant does not touch fpscr.
!! NOTE(review): presumably fpscr already selects double precision when
!! this variant is called - confirm against the ABI for this multilib.

        .global        GLOBAL(sdivsi3_i4)
        HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
GLOBAL(sdivsi3_i4):
        lds r4,fpul
        float fpul,dr0
        lds r5,fpul
        float fpul,dr2
        fdiv dr2,dr0
        rts
!! Delay slot: convert the quotient in dr0 to an integer in fpul.
        ftrc dr0,fpul

        ENDFUNC(GLOBAL(sdivsi3_i4))
#elif defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) || (defined (__SH5__) && ! defined __SH4_NOFPU__)
!! args in r4 and r5, result in fpul, clobber r2, dr0, dr2
!! This variant saves fpscr on the stack, loads its own value for the
!! duration of the division and restores the saved value on return.

#if ! __SH5__ || __SH5__ == 32
#if __SH5__
        .mode        SHcompact
#endif
        .global        GLOBAL(sdivsi3_i4)
        HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
GLOBAL(sdivsi3_i4):
        sts.l fpscr,@-r15
!! r2 = 8 with its 16-bit halves swapped, i.e. 0x00080000, goes into fpscr.
!! NOTE(review): presumably this sets the double precision (PR) bit -
!! confirm against the FPSCR layout.
        mov #8,r2
        swap.w r2,r2
        lds r2,fpscr
        lds r4,fpul
        float fpul,dr0
        lds r5,fpul
        float fpul,dr2
        fdiv dr2,dr0
        ftrc dr0,fpul
        rts
!! Delay slot: restore the fpscr value saved on entry.
        lds.l @r15+,fpscr

        ENDFUNC(GLOBAL(sdivsi3_i4))
#endif /* ! __SH5__ || __SH5__ == 32 */
#endif /* ! __SH4__ */
#endif

#ifdef L_sdivsi3
/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with
   sh2e/sh3e code.  */
#if (! defined(__SH4__) && ! defined (__SH4_SINGLE__)) || defined (__linux__)
!!
!! Steve Chamberlain
!! sac@cygnus.com
!!
!!

!! args in r4 and r5, result in r0 clobber r1, r2, r3, and t bit

        .global        GLOBAL(sdivsi3)
#if __SHMEDIA__
#if __SH5__ == 32
        .section        .text..SHmedia32,"ax"
#else
        .text
#endif
        .align        2
#if 0
/* The assembly code that follows is a hand-optimized version of the C
   code that follows.  Note that the registers that are modified are
   exactly those listed as clobbered in the patterns divsi3_i1 and
   divsi3_i1_media.
        
int __sdivsi3 (i, j)
     int i, j;
{
  register unsigned long long r18 asm ("r18");
  register unsigned long long r19 asm ("r19");
  register unsigned long long r0 asm ("r0") = 0;
  register unsigned long long r1 asm ("r1") = 1;
  register int r2 asm ("r2") = i >> 31;
  register int r3 asm ("r3") = j >> 31;

  r2 = r2 ? r2 : r1;
  r3 = r3 ? r3 : r1;
  r18 = i * r2;
  r19 = j * r3;
  r2 *= r3;
  
  r19 <<= 31;
  r1 <<= 31;
  do
    if (r18 >= r19)
      r0 |= r1, r18 -= r19;
  while (r19 >>= 1, r1 >>= 1);

  return r2 * (int)r0;
}
*/
// Disabled variant (first branch of the conditional, never assembled).
// It follows the C code in the comment above step by step:
//   r0      = quotient being built
//   r1      = quotient bit currently being tried
//   r2, r3  = +1 or -1 sign factors of r4 and r5 (r2 later holds their product)
//   r18/r19 = magnitudes of r4 / r5; r19 is pre-shifted left by 31
GLOBAL(sdivsi3):
// tr2 / tr1 = branch targets used by the loop, tr0 = return target from r18.
        pt/l        LOCAL(sdivsi3_dontadd), tr2
        pt/l        LOCAL(sdivsi3_loop), tr1
        ptabs/l        r18, tr0
        movi        0, r0
        movi        1, r1
// r2 = r4 >> 31 and r3 = r5 >> 31; cmveq then replaces a zero by r1 (1).
        shari.l        r4, 31, r2
        shari.l        r5, 31, r3
        cmveq        r2, r1, r2
        cmveq        r3, r1, r3
        muls.l        r4, r2, r18
        muls.l        r5, r3, r19
        muls.l        r2, r3, r2
        shlli        r19, 31, r19
        shlli        r1, 31, r1
// Shift-and-subtract loop; it ends when the bit in r1 has been shifted out.
LOCAL(sdivsi3_loop):
// Skip the subtract when the shifted divisor exceeds the remainder (unsigned).
        bgtu        r19, r18, tr2
        or        r0, r1, r0
        sub        r18, r19, r18
LOCAL(sdivsi3_dontadd):
        shlri        r1, 1, r1
        shlri        r19, 1, r19
        bnei        r1, 0, tr1
// Apply the sign factor to the quotient and return through tr0.
// NOTE(review): the add.l with r63 presumably just re-forms the 32-bit
// result (r63 reading as zero) - confirm against the SHmedia manual.
        muls.l        r0, r2, r0
        add.l        r0, r63, r0
        blink        tr0, r63
#elif 0 /* ! 0 */
 // inputs: r4,r5
 // clobbered: r1,r2,r3,r18,r19,r20,r21,r25,tr0
 // result in r0
/* Disabled (#elif 0) straight-line variant without a loop.
   It works on the absolute values of both operands (r20 = |r5|,
   r21 = |r4|) and applies the sign of r4 ^ r5 (mask in r18) at the end.
   NOTE(review): the mmulfx.w / msub.w / mmacnfx.wl sequence appears to
   build a fixed-point reciprocal estimate of the normalized divisor that
   the three mulu.l steps use to accumulate the quotient in r19 -- confirm
   against the SHmedia instruction set manual before relying on this.  */
GLOBAL(sdivsi3):
 // can create absolute value without extra latency,
 // but dependent on proper sign extension of inputs:
 // shari.l r5,31,r2
 // xor r5,r2,r20
 // sub r20,r2,r20 // r20 is now absolute value of r5, zero-extended.
 shari.l r5,31,r2
 ori r2,1,r2
 muls.l r5,r2,r20 // r20 is now absolute value of r5, zero-extended.
 movi 0xffffffffffffbb0c,r19 // shift count equiv 76
 shari.l r4,31,r3
 nsb r20,r0
 shlld r20,r0,r25
 shlri r25,48,r25
 sub r19,r25,r1
 mmulfx.w r1,r1,r2
 mshflo.w r1,r63,r1
 // If r4 was to be used in-place instead of r21, could use this sequence
 // to compute absolute:
 // sub r63,r4,r19 // compute absolute value of r4
 // shlri r4,32,r3 // into lower 32 bit of r4, keeping
 // mcmv r19,r3,r4 // the sign in the upper 32 bits intact.
 ori r3,1,r3
 mmulfx.w r25,r2,r2
 sub r19,r0,r0
 muls.l r4,r3,r21
 msub.w r1,r2,r2
 addi r2,-2,r1
 mulu.l r21,r1,r19
 mmulfx.w r2,r2,r2
 shlli r1,15,r1
 shlrd r19,r0,r19
 mulu.l r19,r20,r3
 mmacnfx.wl r25,r2,r1
 ptabs r18,tr0
 sub r21,r3,r25

 mulu.l r25,r1,r2
 addi r0,14,r0
 xor r4,r5,r18
 shlrd r2,r0,r2
 mulu.l r2,r20,r3
 add r19,r2,r19
 shari.l r18,31,r18
 sub r25,r3,r25

 mulu.l r25,r1,r2
 sub r25,r20,r25
 add r19,r18,r19
 shlrd r2,r0,r2
 mulu.l r2,r20,r3
 addi r25,1,r25
 add r19,r2,r19

 cmpgt r25,r3,r25
 add.l r19,r25,r0
 xor r0,r18,r0
 blink tr0,r63
#else /* ! 0 && ! 0 */

 // inputs: r4,r5
 // clobbered: r1,r18,r19,r20,r21,r25,tr0
 // result in r0
        /* Table-driven SHmedia signed 32-bit division (the variant that
           is actually assembled).
           Entry points:
             sdivsi3 / sdivsi3_1 (only when __pic__ is not defined): load
               the address of div_table_internal into r20 with movi/shori,
               then fall through into sdivsi3_2.
             sdivsi3_2: expects the table address already in r20.
           In:  r4 = dividend, r5 = divisor, r18 = return address.
           Out: r0 = quotient.
           The fixed-point annotations (s2.62, u0.8, ...) on the individual
           instructions are the original author's format notes.  */
        HIDDEN_FUNC(GLOBAL(sdivsi3_2))
#ifndef __pic__
        FUNC(GLOBAL(sdivsi3))
GLOBAL(sdivsi3): /* this is the shcompact entry point */
 // The special SHmedia entry point sdivsi3_1 prevents accidental linking
 // with the SHcompact implementation, which clobbers tr1 / tr2.
 .global GLOBAL(sdivsi3_1)
GLOBAL(sdivsi3_1):
 .global GLOBAL(div_table_internal)
 movi (GLOBAL(div_table_internal) >> 16) & 65535, r20
 shori GLOBAL(div_table_internal) & 65535, r20
#endif
 .global GLOBAL(sdivsi3_2)
 // div_table in r20
 // clobbered: r1,r18,r19,r21,r25,tr0
GLOBAL(sdivsi3_2):
 nsb r5, r1
 shlld r5, r1, r25    // normalize; [-2 ..1, 1..2) in s2.62
 shari r25, 58, r21   // extract 5(6) bit index (s2.4 with hole -1..1)
 ldx.ub r20, r21, r19 // u0.8
 shari r25, 32, r25   // normalize to s2.30
 shlli r21, 1, r21
 muls.l r25, r19, r19 // s2.38
 ldx.w r20, r21, r21  // s2.14
  ptabs r18, tr0
 shari r19, 24, r19   // truncate to s2.14
 sub r21, r19, r19    // some 11 bit inverse in s1.14
 muls.l r19, r19, r21 // u0.28
  sub r63, r1, r1
  addi r1, 92, r1
 muls.l r25, r21, r18 // s2.58
 shlli r19, 45, r19   // multiply by two and convert to s2.58
  /* bubble */
 sub r19, r18, r18
 shari r18, 28, r18   // some 22 bit inverse in s1.30
 muls.l r18, r25, r0  // s2.60
  muls.l r18, r4, r25 // s32.30
  /* bubble */
 shari r0, 16, r19   // s-16.44
 muls.l r19, r18, r19 // s-16.74
  shari r25, 63, r0
  shari r4, 14, r18   // s19.-14
 shari r19, 30, r19   // s-16.44
 muls.l r19, r18, r19 // s15.30
  xor r21, r0, r21    // You could also use the constant 1 << 27.
  add r21, r25, r21
 sub r21, r19, r21
 shard r21, r1, r21
 sub r21, r0, r0
 blink tr0, r63
#ifndef __pic__
        ENDFUNC(GLOBAL(sdivsi3))
#endif
        ENDFUNC(GLOBAL(sdivsi3_2))
#endif
#elif defined __SHMEDIA__
/* m5compact-nofpu */
 // clobbered: r18,r19,r20,r21,r25,tr0,tr1,tr2
        .mode        SHmedia
        .section        .text..SHmedia32,"ax"
        .align        2
        FUNC(GLOBAL(sdivsi3))
/* Compact SHmedia signed 32-bit division: a 32-iteration
   shift-and-subtract loop on the absolute values.
   In:  r4 = dividend, r5 = divisor, r18 = return address.
   Out: r0 = quotient.
   Register roles inside the loop:
     r20  remainder in the upper half, quotient bits collected in the
          lower half
     r21  (|divisor| << 32) - 1, so one subtraction both removes the
          divisor from the upper half and sets the new low quotient bit
     r25  loop counter: its low 32 bits start at 0 and are stepped by
          addi.l down to -32
     r19  sign mask of the result (sign of r4 xor sign of r5).  */
GLOBAL(sdivsi3):
        pt/l LOCAL(sdivsi3_dontsub), tr0
        pt/l LOCAL(sdivsi3_loop), tr1
        ptabs/l r18,tr2        /* return target; r18 is reused below */
        shari.l r4,31,r18        /* r18 = sign mask of the dividend */
        shari.l r5,31,r19        /* r19 = sign mask of the divisor */
        xor r4,r18,r20
        xor r5,r19,r21
        sub.l r20,r18,r20        /* r20 = |dividend| */
        sub.l r21,r19,r21        /* r21 = |divisor| */
        xor r18,r19,r19        /* r19 = sign mask of the quotient */
        shlli r21,32,r25
        addi r25,-1,r21
        addz.l r20,r63,r20        /* zero-extend |dividend| */
LOCAL(sdivsi3_loop):
        shlli r20,1,r20
        bgeu/u r21,r20,tr0        /* skip the subtraction when r21 >= r20 */
        sub r20,r21,r20
LOCAL(sdivsi3_dontsub):
        addi.l r25,-1,r25
        bnei r25,-32,tr1        /* 32 iterations in total */
        xor r20,r19,r20
        sub.l r20,r19,r0        /* negate the quotient if the mask is -1 */
        blink tr2,r63
        ENDFUNC(GLOBAL(sdivsi3))
#else /* ! __SHMEDIA__ */
        FUNC(GLOBAL(sdivsi3))
/* Signed 32-bit division for SHcompact, built from the div0s / div1
   step instructions.
   In:  r4 = dividend, r5 = divisor.
   Out: r0 = quotient; 0 is returned when the divisor is 0.
   Clobbers r1, r2, r3 and the T bit.
   The 32 identical div1 / rotcl steps are generated with .rept; the
   emitted instruction stream is the same as a hand-unrolled sequence.  */
GLOBAL(sdivsi3):
        mov        r4,r1        /* r1 = dividend, becomes the quotient */
        mov        r5,r0        /* r0 = divisor */

        tst        r0,r0
        bt        LOCAL(sdivsi3_div0)        /* divisor == 0 */
        mov        #0,r2
        div0s        r2,r1        /* T = sign of the dividend */
        subc        r3,r3        /* r3 = 0 or -1: upper word of the dividend */
        subc        r2,r1        /* r1 -= T */
        div0s        r0,r3        /* set up the signed division flags */
        rotcl        r1
        .rept        32        /* one quotient bit per iteration */
        div1        r0,r3
        rotcl        r1
        .endr
        addc        r2,r1        /* r1 += T: final quotient correction */
        rts
        mov        r1,r0        /* delay slot: return value */


LOCAL(sdivsi3_div0):
        rts
        mov        #0,r0        /* delay slot: division by zero yields 0 */

        ENDFUNC(GLOBAL(sdivsi3))
#endif /* ! __SHMEDIA__ */
#endif /* ! __SH4__ */
#endif
#ifdef L_udivsi3_i4

        .title "SH DIVIDE"
!! 4 byte integer Divide code for the Renesas SH
#ifdef __SH4__
!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4,
!! and t bit

        .global        GLOBAL(udivsi3_i4)
        HIDDEN_FUNC(GLOBAL(udivsi3_i4))
/* Unsigned 32-bit division on SH4 using the double-precision FPU.
   In:  r4 = dividend, r5 = divisor.
   Out: fpul = quotient.
   A divisor of 0 or 1 takes the "trivial" path, which returns the
   dividend unchanged.  Otherwise both operands have their top bit
   flipped (xor with 0x80000000) so that the signed int-to-double
   conversion can be used; adding the constant 2147483648.0 stored at L1
   afterwards restores the unsigned value.
   NOTE(review): DR40 / DR41 are macros defined elsewhere in this file;
   presumably they name the two single-precision halves of dr4 in the
   order required by the target endianness -- confirm.  */
GLOBAL(udivsi3_i4):
        mov #1,r1
        cmp/hi r1,r5        /* T = (divisor > 1), unsigned compare */
        bf trivial
        rotr r1        /* r1 = 0x80000000 */
        xor r1,r4
        lds r4,fpul
        mova L1,r0
#ifdef FMOVD_WORKS
        fmov.d @r0+,dr4
#else
        fmov.s @r0+,DR40
        fmov.s @r0,DR41
#endif
        float fpul,dr0
        xor r1,r5
        lds r5,fpul
        float fpul,dr2
        fadd dr4,dr0        /* dr0 = dividend as an unsigned value */
        fadd dr4,dr2        /* dr2 = divisor as an unsigned value */
        fdiv dr2,dr0
        rts
        ftrc dr0,fpul        /* delay slot: truncate the quotient into fpul */

trivial:
        rts
        lds r4,fpul        /* delay slot: result = dividend */

        .align 2
#ifdef FMOVD_WORKS
        .align 3        ! make double below 8 byte aligned.
#endif
L1:
        .double 2147483648

        ENDFUNC(GLOBAL(udivsi3_i4))
#elif defined (__SH5__) && ! defined (__SH4_NOFPU__)
#if ! __SH5__ || __SH5__ == 32
!! args in r4 and r5, result in fpul, clobber r20, r21, dr0, fr33
        .mode        SHmedia
        .global        GLOBAL(udivsi3_i4)
        HIDDEN_FUNC(GLOBAL(udivsi3_i4))
/* Unsigned 32-bit division on SH5 (SHmedia) using the FPU.
   In:  r4 = dividend, r5 = divisor, r18 = return address.
   Out: fr32, which the comment above this function refers to as fpul.
   Both operands are zero-extended to 64 bits, converted to double,
   divided, and the quotient is truncated back to an integer.
   Besides the registers listed in the comment above, the code also
   writes dr32 and tr0.  */
GLOBAL(udivsi3_i4):
        addz.l        r4,r63,r20        /* zero-extend the dividend */
        addz.l        r5,r63,r21        /* zero-extend the divisor */
        fmov.qd        r20,dr0
        fmov.qd        r21,dr32
        ptabs        r18,tr0
        float.qd dr0,dr0
        float.qd dr32,dr32
        fdiv.d        dr0,dr32,dr0
        ftrc.dq dr0,dr32
        fmov.s fr33,fr32        /* NOTE(review): presumably moves the low word of the result -- confirm */
        blink tr0,r63

        ENDFUNC(GLOBAL(udivsi3_i4))
#endif /* ! __SH5__ || __SH5__ == 32 */
#elif defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__)
!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4

        .global        GLOBAL(udivsi3_i4)
        HIDDEN_FUNC(GLOBAL(udivsi3_i4))
/* Unsigned 32-bit division for the SH4 single-precision ABIs, computed
   in double precision.
   In:  r4 = dividend, r5 = divisor.
   Out: fpul = quotient.
   A divisor of 0 or 1 takes the "trivial" path, which returns the
   dividend unchanged and does not touch fpscr.  Otherwise the current
   fpscr is pushed on the stack, replaced by the word stored at L1 for
   the duration of the computation, and popped again in the rts delay
   slot.  The arithmetic is the same bias trick as in the plain SH4
   variant: flip the top bit of each operand, convert as signed, then add
   2147483648.0.
   NOTE(review): the fpscr words 0x80000 / 0x180000 presumably select
   double precision (and 64-bit fmov when FMOVD_WORKS) -- confirm against
   the SH-4 FPSCR bit layout.  */
GLOBAL(udivsi3_i4):
        mov #1,r1
        cmp/hi r1,r5        /* T = (divisor > 1), unsigned compare */
        bf trivial
        sts.l fpscr,@-r15        /* save the caller's fpscr */
        mova L1,r0
        lds.l @r0+,fpscr        /* load the mode word; r0 now points at the double */
        rotr r1        /* r1 = 0x80000000 */
        xor r1,r4
        lds r4,fpul
#ifdef FMOVD_WORKS
        fmov.d @r0+,dr4
#else
        fmov.s @r0+,DR40
        fmov.s @r0,DR41
#endif
        float fpul,dr0
        xor r1,r5
        lds r5,fpul
        float fpul,dr2
        fadd dr4,dr0
        fadd dr4,dr2
        fdiv dr2,dr0
        ftrc dr0,fpul
        rts
        lds.l @r15+,fpscr        /* delay slot: restore the caller's fpscr */

#ifdef FMOVD_WORKS
        .align 3        ! make double below 8 byte aligned.
#endif
trivial:
        rts
        lds r4,fpul        /* delay slot: result = dividend */

        .align 2
L1:
#ifndef FMOVD_WORKS
        .long 0x80000
#else
        .long 0x180000
#endif
        .double 2147483648

        ENDFUNC(GLOBAL(udivsi3_i4))
#endif /* ! __SH4__ */
#endif

#ifdef L_udivsi3
/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with
   sh2e/sh3e code.  */
#if (! defined(__SH4__) && ! defined (__SH4_SINGLE__)) || defined (__linux__)

!! args in r4 and r5, result in r0, clobbers r4, pr, and t bit
        .global        GLOBAL(udivsi3)
        HIDDEN_FUNC(GLOBAL(udivsi3))

#if __SHMEDIA__
#if __SH5__ == 32
        .section        .text..SHmedia32,"ax"
#else
        .text
#endif
        .align        2
#if 0
/* The assembly code that follows is a hand-optimized version of the C
   code that follows.  Note that the registers that are modified are
   exactly those listed as clobbered in the patterns udivsi3_i1 and
   udivsi3_i1_media.
        
unsigned 
__udivsi3 (i, j)
    unsigned i, j; 
{
  register unsigned long long r0 asm ("r0") = 0;
  register unsigned long long r18 asm ("r18") = 1;
  register unsigned long long r4 asm ("r4") = i;
  register unsigned long long r19 asm ("r19") = j;

  r19 <<= 31;
  r18 <<= 31;
  do
    if (r4 >= r19)
      r0 |= r18, r4 -= r19;
  while (r19 >>= 1, r18 >>= 1);

  return r0;
}
*/
/* Disabled (#if 0) shift-and-subtract unsigned division; it mirrors the
   C reference code in the comment above statement by statement.
   In:  r4 = dividend (i), r5 = divisor (j), r18 = return address.
   Out: r0 = quotient.
   Writes r0, r4, r18, r19 and tr0-tr2.  */
GLOBAL(udivsi3):
        pt/l        LOCAL(udivsi3_dontadd), tr2
        pt/l        LOCAL(udivsi3_loop), tr1
        ptabs/l        r18, tr0        /* return target; r18 is reused as data below */
        movi        0, r0        /* r0 = quotient accumulator */
        movi        1, r18        /* r18 = 1, later the current quotient bit */
        addz.l        r5, r63, r19        /* r19 = j, zero-extended */
        addz.l        r4, r63, r4        /* r4 = i, zero-extended */
        shlli        r19, 31, r19
        shlli        r18, 31, r18
LOCAL(udivsi3_loop):
        bgtu        r19, r4, tr2        /* skip the update unless r4 >= r19 */
        or        r0, r18, r0        /* set this quotient bit */
        sub        r4, r19, r4
LOCAL(udivsi3_dontadd):
        shlri        r18, 1, r18
        shlri        r19, 1, r19
        bnei        r18, 0, tr1        /* loop until the bit has been shifted out */
        blink        tr0, r63
#else
/* Straight-line SHmedia unsigned 32-bit division (no loop).
   In:  r4 = dividend, r5 = divisor, r18 = return address.
   Out: r0 = quotient.
   r22 holds the zero-extended divisor and r18 accumulates the quotient
   over three mulu.l based steps; a final compare adds the last
   correction.
   NOTE(review): the mmulfx.w / msub.w / mmacnfx.wl sequence appears to
   build a fixed-point reciprocal estimate of the normalized divisor in
   r21 -- confirm against the SHmedia instruction set manual.
   The symbol is declared with HIDDEN_FUNC above, so the matching ENDFUNC
   is emitted at the end to give it a size as well.  */
GLOBAL(udivsi3):
 // inputs: r4,r5
 // clobbered: r18,r19,r20,r21,r22,r25,tr0
 // result in r0.
 addz.l r5,r63,r22
 nsb r22,r0
 shlld r22,r0,r25
 shlri r25,48,r25
 movi 0xffffffffffffbb0c,r20 // shift count equiv 76
 sub r20,r25,r21
 mmulfx.w r21,r21,r19
 mshflo.w r21,r63,r21
 ptabs r18,tr0
 mmulfx.w r25,r19,r19
 sub r20,r0,r0
 /* bubble */
 msub.w r21,r19,r19
 addi r19,-2,r21 /* It would be nice for scheduling to do this add to r21
                    before the msub.w, but we need a different value for
                    r19 to keep errors under control.  */
 mulu.l r4,r21,r18
 mmulfx.w r19,r19,r19
 shlli r21,15,r21
 shlrd r18,r0,r18
 mulu.l r18,r22,r20
 mmacnfx.wl r25,r19,r21
 /* bubble */
 sub r4,r20,r25

 mulu.l r25,r21,r19
 addi r0,14,r0
 /* bubble */
 shlrd r19,r0,r19
 mulu.l r19,r22,r20
 add r18,r19,r18
 /* bubble */
 sub.l r25,r20,r25

 mulu.l r25,r21,r19
 addz.l r25,r63,r25
 sub r25,r22,r25
 shlrd r19,r0,r19
 mulu.l r19,r22,r20
 addi r25,1,r25
 add r18,r19,r18

 cmpgt r25,r20,r25
 add.l r18,r25,r0
 blink tr0,r63
        ENDFUNC(GLOBAL(udivsi3))
#endif
#elif defined (__SHMEDIA__)
/* m5compact-nofpu - more emphasis on code size than on speed, but don't
   ignore speed altogether - div1 needs 9 cycles, subc 7 and rotcl 4.
   So use a short shmedia loop.  */
 // clobbered: r20,r21,r25,tr0,tr1,tr2
        .mode        SHmedia
        .section        .text..SHmedia32,"ax"
        .align        2
/* Compact SHmedia unsigned 32-bit division: a 32-iteration
   shift-and-subtract loop.
   In:  r4 = dividend, r5 = divisor, r18 = return address.
   Out: r0 = quotient.
   Register roles inside the loop:
     r20  remainder in the upper half, quotient bits collected in the
          lower half
     r21  (divisor << 32) - 1, so one subtraction both removes the
          divisor from the upper half and sets the new low quotient bit
     r25  loop counter: its low 32 bits start at 0 and are stepped by
          addi.l down to -32.
   The symbol is declared with HIDDEN_FUNC above, so the matching ENDFUNC
   is emitted at the end to give it a size as well.  */
GLOBAL(udivsi3):
 pt/l LOCAL(udivsi3_dontsub), tr0
 pt/l LOCAL(udivsi3_loop), tr1
 ptabs/l r18,tr2
 shlli r5,32,r25
 addi r25,-1,r21
 addz.l r4,r63,r20        /* zero-extend the dividend */
LOCAL(udivsi3_loop):
 shlli r20,1,r20
 bgeu/u r21,r20,tr0        /* skip the subtraction when r21 >= r20 */
 sub r20,r21,r20
LOCAL(udivsi3_dontsub):
 addi.l r25,-1,r25
 bnei r25,-32,tr1        /* 32 iterations in total */
 add.l r20,r63,r0        /* result = low 32 bits of r20 */
 blink tr2,r63
        ENDFUNC(GLOBAL(udivsi3))
#else /* ! defined (__SHMEDIA__) */
/* Unsigned 32-bit division for SHcompact, built from div1 steps.
   In:  r4 = dividend, r5 = divisor.
   Out: r0 = quotient.
   pr is pushed on entry and popped before returning because the helper
   sequences below are called with bsr.

   Helpers (each returns with rts and uses its delay slot for one more
   div1):
     div8   eight div1 steps; entering at div7 performs seven
     divx4  four div1 steps interleaved with three rotcl r0.  */
LOCAL(div8):
 div1 r5,r4
LOCAL(div7):
 div1 r5,r4; div1 r5,r4; div1 r5,r4
 div1 r5,r4; div1 r5,r4; div1 r5,r4; rts; div1 r5,r4

LOCAL(divx4):
 div1 r5,r4; rotcl r0
 div1 r5,r4; rotcl r0
 div1 r5,r4; rotcl r0
 rts; div1 r5,r4

GLOBAL(udivsi3):
 sts.l pr,@-r15
 extu.w r5,r0
 cmp/eq r5,r0        /* T = divisor fits in 16 bits */
/* The sh1 branch has no delay slot, so div0u below only runs on the
   fall-through path and is repeated at large_divisor for sh1.  */
#ifdef __sh1__
 bf LOCAL(large_divisor)
#else
 bf/s LOCAL(large_divisor)
#endif
 div0u
 /* Divisor fits in 16 bits: the divisor is moved to the upper half of
    r5 and two rounds of 16 div1 steps are run; r5 is shifted back in
    the final delay slot.  */
 swap.w r4,r0
 shlr16 r4
 bsr LOCAL(div8)
 shll16 r5
 bsr LOCAL(div7)
 div1 r5,r4
 xtrct r4,r0
 xtrct r0,r4
 bsr LOCAL(div8)
 swap.w r4,r4
 bsr LOCAL(div7)
 div1 r5,r4
 lds.l @r15+,pr
 xtrct r4,r0
 swap.w r0,r0
 rotcl r0
 rts
 shlr16 r5

LOCAL(large_divisor):
#ifdef __sh1__
 div0u
#endif
 /* Divisor needs more than 16 bits: four calls to divx4 with rotcl r0
    in each delay slot, plus the final rotcl, collect the quotient
    in r0.  */
 mov #0,r0
 xtrct r4,r0
 xtrct r0,r4
 bsr LOCAL(divx4)
 rotcl r0
 bsr LOCAL(divx4)
 rotcl r0
 bsr LOCAL(divx4)
 rotcl r0
 bsr LOCAL(divx4)
 rotcl r0
 lds.l @r15+,pr
 rts
 rotcl r0

        ENDFUNC(GLOBAL(udivsi3))
#endif /* ! __SHMEDIA__ */
#endif /* __SH4__ */
#endif /* L_udivsi3 */

#ifdef L_udivdi3
#ifdef __SHMEDIA__
        .mode        SHmedia
        .section        .text..SHmedia32,"ax"
        .align        2
        .global        GLOBAL(udivdi3)
        FUNC(GLOBAL(udivdi3))
/* Unsigned 64-bit division for SHmedia.
   In:  r2 = dividend, r3 = divisor, r18 = return address.
   Out: r2 = quotient.
   Registers written: r0, r1, r2, r4-r9, r19-r22, r25 and tr0.
   udivdi3_internal is declared as a hidden alias of this entry point;
   divdi3 in this file branches to it.

   The divisor is normalized by r22 bits (r6 = r3 << r22) and a
   reciprocal estimate is refined in r1.  Two paths follow:
     - fall-through ("small divisor"): three multiply/subtract divide
       steps accumulate the quotient in r8;
     - large_divisor, taken when r9 = 32 - r22 is greater than zero:
       two steps plus a final adjustment.
   NOTE(review): r9 > 0 presumably means the divisor does not fit in 32
   bits -- confirm the exact nsb semantics in the SHmedia manual.  */
GLOBAL(udivdi3):
        HIDDEN_ALIAS(udivdi3_internal,udivdi3)
        shlri r3,1,r4
        nsb r4,r22
        shlld r3,r22,r6
        shlri r6,49,r5
        movi 0xffffffffffffbaf1,r21 /* .l shift count 17.  */
        sub r21,r5,r1
        mmulfx.w r1,r1,r4
        mshflo.w r1,r63,r1
        sub r63,r22,r20 // r63 == 64 % 64
        mmulfx.w r5,r4,r4
        pta LOCAL(large_divisor),tr0
        addi r20,32,r9
        msub.w r1,r4,r1
        madd.w r1,r1,r1
        mmulfx.w r1,r1,r4
        shlri r6,32,r7
        bgt/u r9,r63,tr0 // large_divisor
        mmulfx.w r5,r4,r4
        shlri r2,32+14,r19
        addi r22,-31,r0
        msub.w r1,r4,r1

        mulu.l r1,r7,r4
        addi r1,-3,r5
        mulu.l r5,r19,r5
        sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
        shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
                         the case may be, %0000000000000000 000.11111111111, still */
        muls.l r1,r4,r4 /* leaving at least one sign bit.  */
        mulu.l r5,r3,r8
        mshalds.l r1,r21,r1
        shari r4,26,r4
        shlld r8,r0,r8
        add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
        sub r2,r8,r2
        /* Can do second step of 64 : 32 div now, using r1 and the rest in r2.  */

        shlri r2,22,r21
        mulu.l r21,r1,r21
        shlld r5,r0,r8
        addi r20,30-22,r0
        shlrd r21,r0,r21
        mulu.l r21,r3,r5
        add r8,r21,r8
        mcmpgt.l r21,r63,r21 // See Note 1
        addi r20,30,r0
        mshfhi.l r63,r21,r21
        sub r2,r5,r2
        andc r2,r21,r2

        /* small divisor: need a third divide step */
        mulu.l r2,r1,r7
        ptabs r18,tr0
        addi r2,1,r2
        shlrd r7,r0,r7
        mulu.l r7,r3,r5
        add r8,r7,r8
        sub r2,r3,r2
        cmpgt r2,r5,r5
        add r8,r5,r2
        /* could test r3 here to check for divide by zero.  */
        blink tr0,r63

LOCAL(large_divisor):
        mmulfx.w r5,r4,r4
        shlrd r2,r9,r25
        shlri r25,32,r8
        msub.w r1,r4,r1

        mulu.l r1,r7,r4
        addi r1,-3,r5
        mulu.l r5,r8,r5
        sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
        shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
                         the case may be, %0000000000000000 000.11111111111, still */
        muls.l r1,r4,r4 /* leaving at least one sign bit.  */
        shlri r5,14-1,r8
        mulu.l r8,r7,r5
        mshalds.l r1,r21,r1
        shari r4,26,r4
        add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
        sub r25,r5,r25
        /* Can do second step of 64 : 32 div now, using r1 and the rest in r25.  */

        shlri r25,22,r21
        mulu.l r21,r1,r21
        pta LOCAL(no_lo_adj),tr0
        addi r22,32,r0
        shlri r21,40,r21
        mulu.l r21,r7,r5
        add r8,r21,r8
        shlld r2,r0,r2
        sub r25,r5,r25
        bgtu/u r7,r25,tr0 // no_lo_adj
        addi r8,1,r8
        sub r25,r7,r25
LOCAL(no_lo_adj):
        mextr4 r2,r25,r2

        /* large_divisor: only needs a few adjustments.  */
        mulu.l r8,r6,r5
        ptabs r18,tr0
        /* bubble */
        cmpgtu r5,r2,r5
        sub r8,r5,r2
        blink tr0,r63
        ENDFUNC(GLOBAL(udivdi3))
/* Note 1: To shift the result of the second divide stage so that the result
   always fits into 32 bits, yet we still reduce the rest sufficiently
   would require a lot of instructions to do the shifts just right.  Using
   the full 64 bit shift result to multiply with the divisor would require
   four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
   Fortunately, if the upper 32 bits of the shift result are nonzero, we
   know that the rest after taking this partial result into account will
   fit into 32 bits.  So we just clear the upper 32 bits of the rest if the
   upper 32 bits of the partial result are nonzero.  */
#endif /* __SHMEDIA__ */
#endif /* L_udivdi3 */

#ifdef L_divdi3
#ifdef __SHMEDIA__
        .mode        SHmedia
        .section        .text..SHmedia32,"ax"
        .align        2
        .global        GLOBAL(divdi3)
        FUNC(GLOBAL(divdi3))
/* Signed 64-bit division for SHmedia, implemented on top of
   udivdi3_internal.
   In:  r2 = dividend, r3 = divisor, r18 = return address.
   Out: r2 = quotient.
   Both operands are replaced by their absolute values.  When their signs
   are equal the code branches straight to udivdi3_internal, which then
   returns to the original caller through r18.  Otherwise
   udivdi3_internal is called, the quotient is negated and control
   returns through tr1, so this relies on udivdi3_internal leaving tr1
   untouched.  Also writes r22, r23, tr0 and whatever udivdi3_internal
   writes.  */
GLOBAL(divdi3):
        pta GLOBAL(udivdi3_internal),tr0
        shari r2,63,r22        /* r22 = sign mask of the dividend */
        shari r3,63,r23        /* r23 = sign mask of the divisor */
        xor r2,r22,r2
        xor r3,r23,r3
        sub r2,r22,r2        /* r2 = |dividend| */
        sub r3,r23,r3        /* r3 = |divisor| */
        beq/u r22,r23,tr0        /* same sign: tail-branch, no fix-up needed */
        ptabs r18,tr1        /* keep the real return address in tr1 */
        blink tr0,r18        /* call udivdi3_internal */
        sub r63,r2,r2        /* negate the quotient */
        blink tr1,r63
        ENDFUNC(GLOBAL(divdi3))
#endif /* __SHMEDIA__ */
#endif /* L_divdi3 */

#ifdef L_umoddi3
#ifdef __SHMEDIA__
        .mode        SHmedia
        .section        .text..SHmedia32,"ax"
        .align        2
        .global        GLOBAL(umoddi3)
        FUNC(GLOBAL(umoddi3))
/* Unsigned 64-bit remainder for SHmedia.
   In:  r2 = dividend, r3 = divisor, r18 = return address.
   Out: r2 = remainder.
   Registers written: r0, r1, r2, r4-r9, r19-r22, r25 and tr0.
   umoddi3_internal is declared as a hidden alias of this entry point;
   moddi3 in this file branches to it.

   The structure matches udivdi3 in this file (normalized divisor in r6,
   reciprocal estimate in r1, a fall-through "small divisor" path and a
   large_divisor path selected by r9 = 32 - r22 > 0), but the running
   rest is kept instead of the quotient.  On the large_divisor path the
   rest is computed against the normalized divisor and shifted back by
   r22 at the end.  */
GLOBAL(umoddi3):
        HIDDEN_ALIAS(umoddi3_internal,umoddi3)
        shlri r3,1,r4
        nsb r4,r22
        shlld r3,r22,r6
        shlri r6,49,r5
        movi 0xffffffffffffbaf1,r21 /* .l shift count 17.  */
        sub r21,r5,r1
        mmulfx.w r1,r1,r4
        mshflo.w r1,r63,r1
        sub r63,r22,r20 // r63 == 64 % 64
        mmulfx.w r5,r4,r4
        pta LOCAL(large_divisor),tr0
        addi r20,32,r9
        msub.w r1,r4,r1
        madd.w r1,r1,r1
        mmulfx.w r1,r1,r4
        shlri r6,32,r7
        bgt/u r9,r63,tr0 // large_divisor
        mmulfx.w r5,r4,r4
        shlri r2,32+14,r19
        addi r22,-31,r0
        msub.w r1,r4,r1

        mulu.l r1,r7,r4
        addi r1,-3,r5
        mulu.l r5,r19,r5
        sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
        shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
                         the case may be, %0000000000000000 000.11111111111, still */
        muls.l r1,r4,r4 /* leaving at least one sign bit.  */
        mulu.l r5,r3,r5
        mshalds.l r1,r21,r1
        shari r4,26,r4
        shlld r5,r0,r5
        add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
        sub r2,r5,r2
        /* Can do second step of 64 : 32 div now, using r1 and the rest in r2.  */

        shlri r2,22,r21
        mulu.l r21,r1,r21
        addi r20,30-22,r0
        /* bubble */ /* could test r3 here to check for divide by zero.  */
        shlrd r21,r0,r21
        mulu.l r21,r3,r5
        mcmpgt.l r21,r63,r21 // See Note 1
        addi r20,30,r0
        mshfhi.l r63,r21,r21
        sub r2,r5,r2
        andc r2,r21,r2

        /* small divisor: need a third divide step */
        mulu.l r2,r1,r7
        ptabs r18,tr0
        sub r2,r3,r8 /* re-use r8 here for rest - r3 */
        shlrd r7,r0,r7
        mulu.l r7,r3,r5
        /* bubble */
        addi r8,1,r7
        cmpgt r7,r5,r7
        cmvne r7,r8,r2
        sub r2,r5,r2
        blink tr0,r63

LOCAL(large_divisor):
        mmulfx.w r5,r4,r4
        shlrd r2,r9,r25
        shlri r25,32,r8
        msub.w r1,r4,r1

        mulu.l r1,r7,r4
        addi r1,-3,r5
        mulu.l r5,r8,r5
        sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
        shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
                         the case may be, %0000000000000000 000.11111111111, still */
        muls.l r1,r4,r4 /* leaving at least one sign bit.  */
        shlri r5,14-1,r8
        mulu.l r8,r7,r5
        mshalds.l r1,r21,r1
        shari r4,26,r4
        add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
        sub r25,r5,r25
        /* Can do second step of 64 : 32 div now, using r1 and the rest in r25.  */

        shlri r25,22,r21
        mulu.l r21,r1,r21
        pta LOCAL(no_lo_adj),tr0
        addi r22,32,r0
        shlri r21,40,r21
        mulu.l r21,r7,r5
        add r8,r21,r8
        shlld r2,r0,r2
        sub r25,r5,r25
        bgtu/u r7,r25,tr0 // no_lo_adj
        addi r8,1,r8
        sub r25,r7,r25
LOCAL(no_lo_adj):
        mextr4 r2,r25,r2

        /* large_divisor: only needs a few adjustments.  */
        mulu.l r8,r6,r5
        ptabs r18,tr0
        add r2,r6,r7
        cmpgtu r5,r2,r8
        cmvne r8,r7,r2
        sub r2,r5,r2
        shlrd r2,r22,r2
        blink tr0,r63
        ENDFUNC(GLOBAL(umoddi3))
/* Note 1: To shift the result of the second divide stage so that the result
   always fits into 32 bits, yet we still reduce the rest sufficiently
   would require a lot of instructions to do the shifts just right.  Using
   the full 64 bit shift result to multiply with the divisor would require
   four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
   Fortunately, if the upper 32 bits of the shift result are nonzero, we
   know that the rest after taking this partial result into account will
   fit into 32 bits.  So we just clear the upper 32 bits of the rest if the
   upper 32 bits of the partial result are nonzero.  */
#endif /* __SHMEDIA__ */
#endif /* L_umoddi3 */

#ifdef L_moddi3
#ifdef __SHMEDIA__
        .mode        SHmedia
        .section        .text..SHmedia32,"ax"
        .align        2
        .global        GLOBAL(moddi3)
        FUNC(GLOBAL(moddi3))
/* Signed 64-bit remainder for SHmedia, implemented on top of
   umoddi3_internal.
   In:  r2 = dividend, r3 = divisor, r18 = return address.
   Out: r2 = remainder, carrying the sign of the dividend.
   Both operands are replaced by their absolute values.  When the
   dividend was non-negative (r22 == 0) the code branches straight to
   umoddi3_internal, which returns to the original caller through r18.
   Otherwise umoddi3_internal is called, the remainder is negated and
   control returns through tr1, so this relies on umoddi3_internal
   leaving tr1 untouched.  Unlike divdi3, the sign of the divisor does
   not affect the fix-up.  */
GLOBAL(moddi3):
        pta GLOBAL(umoddi3_internal),tr0
        shari r2,63,r22        /* r22 = sign mask of the dividend */
        shari r3,63,r23        /* r23 = sign mask of the divisor */
        xor r2,r22,r2
        xor r3,r23,r3
        sub r2,r22,r2        /* r2 = |dividend| */
        sub r3,r23,r3        /* r3 = |divisor| */
        beq/u r22,r63,tr0        /* dividend >= 0: tail-branch, no fix-up needed */
        ptabs r18,tr1        /* keep the real return address in tr1 */
        blink tr0,r18        /* call umoddi3_internal */
        sub r63,r2,r2        /* negate the remainder */
        blink tr1,r63
        ENDFUNC(GLOBAL(moddi3))
#endif /* __SHMEDIA__ */
#endif /* L_moddi3 */

#ifdef L_set_fpscr
#if !defined (__SH2A_NOFPU__)
#if defined (__SH2E__) || defined (__SH2A__) || defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || __SH5__ == 32
#ifdef __SH5__
        .mode        SHcompact
#endif
        .global GLOBAL(set_fpscr)
        HIDDEN_FUNC(GLOBAL(set_fpscr))
/* Load fpscr from r4 and refresh the two-word fpscr_values array.
   In:  r4 = new fpscr value.
   Writes r0, r1 and r2 / r3; in PIC builds r12 is saved and restored
   around the GOT lookup of fpscr_values.

   Two variants of r4 are stored.  The bit arithmetic is done on the
   half-swapped value, so the immediates 8 and 16 correspond to bits 19
   and 20 of fpscr:
     variant A: bit 19 set; bit 20 set only when FMOVD_WORKS is defined
     variant B: bits 19 and 20 both clear
   With __SH4__ or __SH2A_DOUBLE__, variant A goes to offset 4 and
   variant B to offset 0; otherwise A goes to offset 0 and B to offset 4.
   NOTE(review): bits 19 / 20 are presumably the FPSCR PR / SZ bits --
   confirm against the SH-4 manual.  */
GLOBAL(set_fpscr):
        lds r4,fpscr
#ifdef __PIC__
        mov.l        r12,@-r15
        mova        LOCAL(set_fpscr_L0),r0
        mov.l        LOCAL(set_fpscr_L0),r12
        add        r0,r12
        mov.l        LOCAL(set_fpscr_L1),r0
        mov.l        @(r0,r12),r1
        mov.l        @r15+,r12
#else
        mov.l LOCAL(set_fpscr_L1),r1
#endif
        /* r1 = address of fpscr_values from here on.  */
        swap.w r4,r0
        or #24,r0
#ifndef FMOVD_WORKS
        xor #16,r0
#endif
#if defined(__SH4__) || defined (__SH2A_DOUBLE__)
        swap.w r0,r3
        mov.l r3,@(4,r1)
#else /* defined (__SH2E__) || defined(__SH3E__) || defined(__SH4_SINGLE*__) */
        swap.w r0,r2
        mov.l r2,@r1
#endif
        /* Clear whichever of the two bits are still set to form the
           second variant.  */
#ifndef FMOVD_WORKS
        xor #8,r0
#else
        xor #24,r0
#endif
#if defined(__SH4__) || defined (__SH2A_DOUBLE__)
        swap.w r0,r2
        rts
        mov.l r2,@r1
#else /* defined(__SH2E__) || defined(__SH3E__) || defined(__SH4_SINGLE*__) */
        swap.w r0,r3
        rts
        mov.l r3,@(4,r1)
#endif
        .align 2
#ifdef __PIC__
LOCAL(set_fpscr_L0):
        .long _GLOBAL_OFFSET_TABLE_
LOCAL(set_fpscr_L1):
        .long GLOBAL(fpscr_values@GOT)
#else
LOCAL(set_fpscr_L1):
        .long GLOBAL(fpscr_values)
#endif

        ENDFUNC(GLOBAL(set_fpscr))
#ifndef NO_FPSCR_VALUES
#ifdef __ELF__
        .comm   GLOBAL(fpscr_values),8,4
#else
        .comm   GLOBAL(fpscr_values),8
#endif /* ELF */
#endif /* NO_FPSCR_VALUES */
#endif /* SH2E / SH3E / SH4 */
#endif /* __SH2A_NOFPU__ */
#endif /* L_set_fpscr */
#ifdef L_ic_invalidate
#if __SH5__ == 32
        .mode        SHmedia
        .section        .text..SHmedia32,"ax"
        .align        2
        .global        GLOBAL(init_trampoline)
        HIDDEN_FUNC(GLOBAL(init_trampoline))
/* Fill in a 16-byte trampoline at the address in r0.
   In:  r0 = trampoline address, r2 and r3 = the two 32-bit words to
        store at offsets 8 and 12.
   Layout written:
     offset 0  eight bytes: the 16-bit words 0xd002, 0xd101, 0x402b,
               0x0009 in ascending address order (r20 is assembled
               differently per endianness to obtain the same memory image)
     offset 8  r2
     offset 12 r3
   NOTE(review): the four 16-bit words are presumably SHcompact
   instructions that load the two stored words and jump -- confirm
   against the SH instruction encodings.
   There is no return instruction here: execution continues into the
   code that follows this function in the section.  */
GLOBAL(init_trampoline):
        st.l        r0,8,r2
#ifdef __LITTLE_ENDIAN__
        movi        9,r20
        shori        0x402b,r20
        shori        0xd101,r20
        shori        0xd002,r20
#else
        movi        0xffffffffffffd002,r20
        shori        0xd101,r20
        shori        0x402b,r20
        shori        9,r20
#endif
        st.q        r0,0,r20
        st.l        r0,12,r3
        ENDFUNC(GLOBAL(init_trampoline))
        .global        GLOBAL(ic_invalidate)
        HIDDEN_FUNC(GLOBAL(ic_invalidate))
/* SH5 (SHmedia) instruction-cache invalidation for the address in r0.
   In:  r0 = address, r18 = return address.
   Sequence: write back the data cache block (ocbwb), synco, invalidate
   the instruction cache block (icbi), synci, then return through tr0.  */
GLOBAL(ic_invalidate):
        ocbwb        r0,0
        synco
        icbi        r0, 0
        ptabs        r18, tr0
        synci
        blink        tr0, r63
        ENDFUNC(GLOBAL(ic_invalidate))
#elif defined(__SH4A__)
        .global GLOBAL(ic_invalidate)
        HIDDEN_FUNC(GLOBAL(ic_invalidate))
/* SH4A instruction-cache invalidation for the address in r4.
   Sequence: write back the data cache block (ocbwb), synco, then
   invalidate the instruction cache block with icbi in the rts delay
   slot.  */
GLOBAL(ic_invalidate):
        ocbwb        @r4
        synco
        rts
        icbi        @r4
        ENDFUNC(GLOBAL(ic_invalidate))
#elif defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__))
        /* For system code, we use ic_invalidate_line_i, but user code
           needs a different mechanism.  A kernel call is generally not
           available, and it would also be slow.  Different SH4 variants use
           different sizes and associativities of the Icache.  We use a small
           bit of dispatch code that can be put hidden in every shared object,
           which calls the actual processor-specific invalidation code in a
           separate module.
           Or if you have operating system support, the OS could mmap the
           processor-specific code from a single page, since it is highly
           repetitive.  */
        .global GLOBAL(ic_invalidate)
        HIDDEN_FUNC(GLOBAL(ic_invalidate))
/* SH4 instruction-cache invalidation dispatcher.
   In:  r4 = address whose cache line is to be invalidated.
   Steps:
     1. r1 = address of ic_invalidate_array (loaded directly, or through
        the GOT in PIC builds).
     2. ocbwb writes back the data cache block at r4.
     3. r0 = ((r4 - r1) & mask) + r1, where mask is the word stored at
        offset 8 of ic_invalidate_array; control jumps there.
     4. In the jmp delay slot r0 is loaded with the word at offset 4 of
        ic_invalidate_array for use by the code jumped to.
   Note that r4 is left holding (address - r1) when the array code runs.
   ENDFUNC is emitted after the PIC / non-PIC literal pool so that the
   symbol gets its size in both configurations.  */
GLOBAL(ic_invalidate):
        mov.l        0f,r1
#ifdef __pic__
        mova        0f,r0
        mov.l        1f,r2
        add        r1,r0
        mov.l        @(r0,r2),r1
#endif
        ocbwb        @r4
        mov.l        @(8,r1),r0
        sub        r1,r4
        and        r4,r0
        add        r1,r0
        jmp        @r0
        mov.l        @(4,r1),r0
#ifndef __pic__
0:        .long   GLOBAL(ic_invalidate_array)
#else /* __pic__ */
        .global GLOBAL(ic_invalidate_array)
        /* ??? Why won't the assembler allow to add these two constants?  */
0:        .long   _GLOBAL_OFFSET_TABLE_
1:        .long   GLOBAL(ic_invalidate_array)@GOT
#endif /* __pic__ */
        ENDFUNC(GLOBAL(ic_invalidate))
#endif /* SH4 */
#endif /* L_ic_invalidate */

#ifdef L_ic_invalidate_array
#if defined(__SH4A__) || (defined (__FORCE_SH4A__) && (defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__))))
        /* This is needed when an SH4 dso with trampolines is used on SH4A.  */
        .global GLOBAL(ic_invalidate_array)
        FUNC(GLOBAL(ic_invalidate_array))
/* SH4A replacement for the SH4 invalidation array, entered from the SH4
   ic_invalidate dispatcher in this file.
   In:  r1 = address of this array, r4 = (target address - r1), as left
        by the dispatcher's "sub r1,r4".
   The four instructions occupy eight bytes, so the .long 0 sits at
   offset 8, where the dispatcher reads its mask: a zero mask makes the
   dispatcher always jump to the start of this code.  add r1,r4 rebuilds
   the target address, and icbi invalidates it in the rts delay slot.  */
GLOBAL(ic_invalidate_array):
        add        r1,r4
        synco
        rts
        icbi        @r4
        .long        0
        ENDFUNC(GLOBAL(ic_invalidate_array))
#elif defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__))
        .global GLOBAL(ic_invalidate_array)
        .p2align 5
        FUNC(GLOBAL(ic_invalidate_array))
/* SH4 invalidation array: a block of code and data generated with .rept
   in units of 32 bytes, sized from WAYS and WAY_SIZE (defaulting to 4
   and 0x4000 when WAYS is not predefined).
   The SH4 ic_invalidate dispatcher in this file reads the word at offset
   8 as an address mask (WAY_SIZE - 32 in every layout below) and the
   word at offset 4 into r0, then jumps to the 32-byte unit selected by
   the masked address.
   NOTE(review): the intent is presumably that executing code at matching
   cache indices in every way displaces the stale instruction-cache
   line -- confirm against the SH-4 cache documentation.  */
/* This must be aligned to the beginning of a cache line.  */
GLOBAL(ic_invalidate_array):
#ifndef WAYS
#define WAYS 4
#define WAY_SIZE 0x4000
#endif
#if WAYS == 1
        /* Each unit: rts / nop followed by seven copies of the mask word.  */
        .rept        WAY_SIZE * WAYS / 32
        rts
        nop
        .rept        7
        .long        WAY_SIZE - 32
        .endr
        .endr
#elif WAYS <= 6
        /* Each unit: a braf that uses the r0 value loaded by the
           dispatcher (the word at offset 4), the two data words, then
           WAYS-2 further braf hops and rts / nop padding up to 32 bytes.  */
        .rept        WAY_SIZE * WAYS / 32
        braf        r0
        add        #-8,r0
        .long        WAY_SIZE + 8
        .long        WAY_SIZE - 32
        .rept        WAYS-2
        braf        r0
        nop
        .endr
        .rept        7 - WAYS
        rts
        nop
        .endr
        .endr
#else /* WAYS > 6 */
        /* This variant needs two different pages for mmap-ing.  */
         .rept        WAYS-1
        .rept        WAY_SIZE / 32
        braf        r0
        nop
        .long        WAY_SIZE
        .rept 6
        .long        WAY_SIZE - 32
        .endr
        .endr
        .endr
        .rept        WAY_SIZE / 32
        rts
        .rept        15
        nop
        .endr
        .endr
#endif /* WAYS */
        ENDFUNC(GLOBAL(ic_invalidate_array))
#endif /* SH4 */
#endif /* L_ic_invalidate_array */

#if defined (__SH5__) && __SH5__ == 32
#ifdef L_shcompact_call_trampoline
        .section        .rodata
        .align        1
/* Dispatch table for GCC_shcompact_call_trampoline.  Each entry is the
   16-bit offset of a handler from ct_main_label, the ptrel instruction
   that consumes it.  The trampoline biases the table address by -31 * 2
   and indexes it with 2 * nsb (zero-extended cookie), so the entries
   are ordered by the highest cookie bit still set, starting with bit 31;
   the last entry is the one selected once no cookie bits remain.  */
LOCAL(ct_main_table):
/* r2 .. r5: three entries each (fp, ld, pop).  */
.word        LOCAL(ct_r2_fp) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r2_ld) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r2_pop) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r3_fp) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r3_ld) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r3_pop) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r4_fp) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r4_ld) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r4_pop) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r5_fp) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r5_ld) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r5_pop) - datalabel LOCAL(ct_main_label)
/* r6 .. r9: four entries each (fph, fpl, ld, pop).  */
.word        LOCAL(ct_r6_fph) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r6_fpl) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r6_ld) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r6_pop) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r7_fph) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r7_fpl) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r7_ld) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r7_pop) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r8_fph) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r8_fpl) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r8_ld) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r8_pop) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r9_fph) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r9_fpl) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r9_ld) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r9_pop) - datalabel LOCAL(ct_main_label)
/* Pop-sequence entries, wide-return flag, and the plain call used when
   the cookie is exhausted.  */
.word        LOCAL(ct_pop_seq) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_pop_seq) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_r9_pop) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_ret_wide) - datalabel LOCAL(ct_main_label)
.word        LOCAL(ct_call_func) - datalabel LOCAL(ct_main_label)
        .mode        SHmedia
        .section        .text..SHmedia32, "ax"
        .align        2
        
     /* This function loads 64-bit general-purpose registers from the
        stack, from a memory address contained in them or from an FP
        register, according to a cookie passed in r1.  Its execution
        time is linear on the number of registers that actually have
        to be copied.  See sh.h for details on the actual bit pattern.

        The function to be called is passed in r0.  If a 32-bit return
        value is expected, the actual function will be tail-called,
        otherwise the return address will be stored in r10 (that the
        caller should expect to be clobbered) and the return value
        will be expanded into r2/r3 upon return.

        Register usage inside the trampoline:
          tr0      target function (taken from r0 on entry); ct_ret_wide
                   later reloads it with the saved return address.
          tr1      ct_loop, the dispatch loop that handlers branch back to.
          tr2      scratch branch target.
          r0       after entry, the biased address of ct_main_table.
          r1       cookie, zero-extended to 64 bits; handlers that loop
                   clear the bits they have consumed first.
          r15      stack pointer the pop handlers read from and advance.
          r28-r34  scratch.  */
        
        .global        GLOBAL(GCC_shcompact_call_trampoline)
        FUNC(GLOBAL(GCC_shcompact_call_trampoline))
GLOBAL(GCC_shcompact_call_trampoline):
        ptabs/l        r0, tr0        /* Prepare to call the actual function.  */
        movi        ((datalabel LOCAL(ct_main_table) - 31 * 2) >> 16) & 65535, r0
        pt/l        LOCAL(ct_loop), tr1
        addz.l        r1, r63, r1        /* Zero-extend the 32-bit cookie.  */
        shori        ((datalabel LOCAL(ct_main_table) - 31 * 2)) & 65535, r0
LOCAL(ct_loop):
        /* Dispatch on the highest cookie bit still set: nsb (r1) is
           doubled to index the 16-bit entries of the biased table, and
           the loaded offset is taken relative to ct_main_label.  */
        nsb        r1, r28
        shlli        r28, 1, r29
        ldx.w        r0, r29, r30
LOCAL(ct_main_label):
        ptrel/l        r30, tr2
        blink        tr2, r63
LOCAL(ct_r2_fp):        /* Copy r2 from an FP register.  */
        /* It must be dr0, so just do it.  */
        fmov.dq        dr0, r2
        movi        7, r30
        shlli        r30, 29, r31        /* r31 = 7 << 29, the r2 cookie field.  */
        andc        r1, r31, r1        /* Clear it and loop.  */
        blink        tr1, r63
LOCAL(ct_r3_fp):        /* Copy r3 from an FP register.  */
        /* It is either dr0 or dr2.  */
        movi        7, r30
        shlri        r1, 26, r32        /* r32 = cookie >> 26, sampled before clearing.  */
        shlli        r30, 26, r31
        andc        r1, r31, r1        /* Clear the r3 field (7 << 26).  */
        fmov.dq        dr0, r3
        beqi/l        r32, 4, tr1        /* Field value 4: dr0 was right, loop.  */
        fmov.dq        dr2, r3
        blink        tr1, r63
LOCAL(ct_r4_fp):        /* Copy r4 from an FP register.  */
        /* r33 = (cookie bits 24:23) * 8 selects one of the two-instruction
           (8-byte) entries at ct_r4_fp_copy; the branch target is computed
           relative to ct_r4_fp_base.  */
        shlri        r1, 23 - 3, r34
        andi        r34, 3 << 3, r33
        addi        r33, LOCAL(ct_r4_fp_copy) - datalabel LOCAL(ct_r4_fp_base), r32
LOCAL(ct_r4_fp_base):
        ptrel/l        r32, tr2
        movi        7, r30
        shlli        r30, 23, r31
        andc        r1, r31, r1        /* Clear the r4 field (7 << 23).  */
        blink        tr2, r63
LOCAL(ct_r4_fp_copy):
        fmov.dq        dr0, r4
        blink        tr1, r63
        fmov.dq        dr2, r4
        blink        tr1, r63
        fmov.dq        dr4, r4
        blink        tr1, r63
LOCAL(ct_r5_fp):        /* Copy r5 from an FP register.  */
        /* Same scheme as ct_r4_fp, indexed by cookie bits 21:20.  */
        shlri        r1, 20 - 3, r34
        andi        r34, 3 << 3, r33
        addi        r33, LOCAL(ct_r5_fp_copy) - datalabel LOCAL(ct_r5_fp_base), r32
LOCAL(ct_r5_fp_base):
        ptrel/l        r32, tr2
        movi        7, r30
        shlli        r30, 20, r31
        andc        r1, r31, r1        /* Clear the r5 field (7 << 20).  */
        blink        tr2, r63
LOCAL(ct_r5_fp_copy):
        fmov.dq        dr0, r5
        blink        tr1, r63
        fmov.dq        dr2, r5
        blink        tr1, r63
        fmov.dq        dr4, r5
        blink        tr1, r63
        fmov.dq        dr6, r5
        blink        tr1, r63
LOCAL(ct_r6_fph):        /* Copy r6 from a high FP register.  */
        /* It must be dr8.  */
        fmov.dq        dr8, r6
        movi        15, r30
        shlli        r30, 16, r31
        andc        r1, r31, r1        /* Clear the r6 field (15 << 16).  */
        blink        tr1, r63
LOCAL(ct_r6_fpl):        /* Copy r6 from a low FP register.  */
        /* Indexed by cookie bits 17:16; 8-byte entries at ct_r6_fp_copy.  */
        shlri        r1, 16 - 3, r34
        andi        r34, 3 << 3, r33
        addi        r33, LOCAL(ct_r6_fp_copy) - datalabel LOCAL(ct_r6_fp_base), r32
LOCAL(ct_r6_fp_base):
        ptrel/l        r32, tr2
        movi        7, r30
        shlli        r30, 16, r31
        andc        r1, r31, r1        /* Clear 7 << 16.  */
        blink        tr2, r63
LOCAL(ct_r6_fp_copy):
        fmov.dq        dr0, r6
        blink        tr1, r63
        fmov.dq        dr2, r6
        blink        tr1, r63
        fmov.dq        dr4, r6
        blink        tr1, r63
        fmov.dq        dr6, r6
        blink        tr1, r63
LOCAL(ct_r7_fph):        /* Copy r7 from a high FP register.  */
        /* It is either dr8 or dr10.  */
        movi        15 << 12, r31
        shlri        r1, 12, r32        /* r32 = cookie >> 12, sampled before clearing.  */
        andc        r1, r31, r1        /* Clear the r7 field (15 << 12).  */
        fmov.dq        dr8, r7
        beqi/l        r32, 8, tr1        /* Field value 8: dr8 was right, loop.  */
        fmov.dq        dr10, r7
        blink        tr1, r63
LOCAL(ct_r7_fpl):        /* Copy r7 from a low FP register.  */
        /* Indexed by cookie bits 13:12; 8-byte entries at ct_r7_fp_copy.  */
        shlri        r1, 12 - 3, r34
        andi        r34, 3 << 3, r33
        addi        r33, LOCAL(ct_r7_fp_copy) - datalabel LOCAL(ct_r7_fp_base), r32
LOCAL(ct_r7_fp_base):
        ptrel/l        r32, tr2
        movi        7 << 12, r31
        andc        r1, r31, r1        /* Clear 7 << 12.  */
        blink        tr2, r63
LOCAL(ct_r7_fp_copy):
        fmov.dq        dr0, r7
        blink        tr1, r63
        fmov.dq        dr2, r7
        blink        tr1, r63
        fmov.dq        dr4, r7
        blink        tr1, r63
        fmov.dq        dr6, r7
        blink        tr1, r63
LOCAL(ct_r8_fph):        /* Copy r8 from a high FP register.  */
        /* It is either dr8 or dr10.  */
        movi        15 << 8, r31
        andi        r1, 1 << 8, r32        /* r32 = cookie bit 8, sampled before clearing.  */
        andc        r1, r31, r1        /* Clear the r8 field (15 << 8).  */
        fmov.dq        dr8, r8
        beq/l        r32, r63, tr1        /* Bit 8 clear: dr8 was right, loop.  */
        fmov.dq        dr10, r8
        blink        tr1, r63
LOCAL(ct_r8_fpl):        /* Copy r8 from a low FP register.  */
        /* Indexed by cookie bits 9:8; 8-byte entries at ct_r8_fp_copy.  */
        shlri        r1, 8 - 3, r34
        andi        r34, 3 << 3, r33
        addi        r33, LOCAL(ct_r8_fp_copy) - datalabel LOCAL(ct_r8_fp_base), r32
LOCAL(ct_r8_fp_base):
        ptrel/l        r32, tr2
        movi        7 << 8, r31
        andc        r1, r31, r1        /* Clear 7 << 8.  */
        blink        tr2, r63
LOCAL(ct_r8_fp_copy):
        fmov.dq        dr0, r8
        blink        tr1, r63
        fmov.dq        dr2, r8
        blink        tr1, r63
        fmov.dq        dr4, r8
        blink        tr1, r63
        fmov.dq        dr6, r8
        blink        tr1, r63
LOCAL(ct_r9_fph):        /* Copy r9 from a high FP register.  */
        /* It is either dr8 or dr10.  */
        movi        15 << 4, r31
        andi        r1, 1 << 4, r32        /* r32 = cookie bit 4, sampled before clearing.  */
        andc        r1, r31, r1        /* Clear the r9 field (15 << 4).  */
        fmov.dq        dr8, r9
        beq/l        r32, r63, tr1        /* Bit 4 clear: dr8 was right, loop.  */
        fmov.dq        dr10, r9
        blink        tr1, r63
LOCAL(ct_r9_fpl):        /* Copy r9 from a low FP register.  */
        /* Indexed by cookie bits 5:4; 8-byte entries at ct_r9_fp_copy.  */
        shlri        r1, 4 - 3, r34
        andi        r34, 3 << 3, r33
        addi        r33, LOCAL(ct_r9_fp_copy) - datalabel LOCAL(ct_r9_fp_base), r32
LOCAL(ct_r9_fp_base):
        ptrel/l        r32, tr2
        movi        7 << 4, r31
        andc        r1, r31, r1        /* Clear 7 << 4.  */
        blink        tr2, r63
LOCAL(ct_r9_fp_copy):
        fmov.dq        dr0, r9
        blink        tr1, r63
        fmov.dq        dr2, r9
        blink        tr1, r63
        fmov.dq        dr4, r9
        blink        tr1, r63
        fmov.dq        dr6, r9
        blink        tr1, r63
        /* Memory-address handlers for r2 .. r8 share one pattern.  r31 is
           a two-bit mask (3 << n) over the register's ld/pop cookie bits
           and r32 is the cookie restricted to that mask; the mask is then
           cleared from the cookie.  If both bits were set, control goes to
           ct_rN_load, which loads rN through its own value and returns to
           the loop.  Otherwise the next register is set to rN + 8, rN is
           loaded through its own value, and control falls through to the
           next register's handler.  */
LOCAL(ct_r2_ld):        /* Copy r2 from a memory address.  */
        pt/l        LOCAL(ct_r2_load), tr2
        movi        3, r30
        shlli        r30, 29, r31
        and        r1, r31, r32
        andc        r1, r31, r1
        beq/l        r31, r32, tr2
        addi.l        r2, 8, r3
        ldx.q        r2, r63, r2
        /* Fall through.  */
LOCAL(ct_r3_ld):        /* Copy r3 from a memory address.  */
        pt/l        LOCAL(ct_r3_load), tr2
        movi        3, r30
        shlli        r30, 26, r31
        and        r1, r31, r32
        andc        r1, r31, r1
        beq/l        r31, r32, tr2
        addi.l        r3, 8, r4
        ldx.q        r3, r63, r3
        /* Fall through.  */
LOCAL(ct_r4_ld):        /* Copy r4 from a memory address.  */
        pt/l        LOCAL(ct_r4_load), tr2
        movi        3, r30
        shlli        r30, 23, r31
        and        r1, r31, r32
        andc        r1, r31, r1
        beq/l        r31, r32, tr2
        addi.l        r4, 8, r5
        ldx.q        r4, r63, r4
        /* Fall through.  */
LOCAL(ct_r5_ld):        /* Copy r5 from a memory address.  */
        pt/l        LOCAL(ct_r5_load), tr2
        movi        3, r30
        shlli        r30, 20, r31
        and        r1, r31, r32
        andc        r1, r31, r1
        beq/l        r31, r32, tr2
        addi.l        r5, 8, r6
        ldx.q        r5, r63, r5
        /* Fall through.  */
LOCAL(ct_r6_ld):        /* Copy r6 from a memory address.  */
        pt/l        LOCAL(ct_r6_load), tr2
        movi        3 << 16, r31
        and        r1, r31, r32
        andc        r1, r31, r1
        beq/l        r31, r32, tr2
        addi.l        r6, 8, r7
        ldx.q        r6, r63, r6
        /* Fall through.  */
LOCAL(ct_r7_ld):        /* Copy r7 from a memory address.  */
        pt/l        LOCAL(ct_r7_load), tr2
        movi        3 << 12, r31
        and        r1, r31, r32
        andc        r1, r31, r1
        beq/l        r31, r32, tr2
        addi.l        r7, 8, r8
        ldx.q        r7, r63, r7
        /* Fall through.  */
LOCAL(ct_r8_ld):        /* Copy r8 from a memory address.  */
        pt/l        LOCAL(ct_r8_load), tr2
        movi        3 << 8, r31
        and        r1, r31, r32
        andc        r1, r31, r1
        beq/l        r31, r32, tr2
        addi.l        r8, 8, r9
        ldx.q        r8, r63, r8
        /* Fall through.  */
LOCAL(ct_r9_ld):        /* Copy r9 from a memory address.  */
        /* r9 is the last register: load it and go straight to the final
           check instead of returning to the dispatch loop.  */
        pt/l        LOCAL(ct_check_tramp), tr2
        ldx.q        r9, r63, r9
        blink        tr2, r63
        /* Load-only tails used by the handlers above when both cookie
           bits of the register were set.  */
LOCAL(ct_r2_load):
        ldx.q        r2, r63, r2
        blink        tr1, r63
LOCAL(ct_r3_load):
        ldx.q        r3, r63, r3
        blink        tr1, r63
LOCAL(ct_r4_load):
        ldx.q        r4, r63, r4
        blink        tr1, r63
LOCAL(ct_r5_load):
        ldx.q        r5, r63, r5
        blink        tr1, r63
LOCAL(ct_r6_load):
        ldx.q        r6, r63, r6
        blink        tr1, r63
LOCAL(ct_r7_load):
        ldx.q        r7, r63, r7
        blink        tr1, r63
LOCAL(ct_r8_load):
        ldx.q        r8, r63, r8
        blink        tr1, r63
        /* Single-register pop handlers: load 8 bytes from r15, advance
           r15 by 8, clear the register's pop bit in the cookie, loop.  */
LOCAL(ct_r2_pop):        /* Pop r2 from the stack.  */
        movi        1, r30
        ldx.q        r15, r63, r2
        shlli        r30, 29, r31
        addi.l        r15, 8, r15
        andc        r1, r31, r1
        blink        tr1, r63
LOCAL(ct_r3_pop):        /* Pop r3 from the stack.  */
        movi        1, r30
        ldx.q        r15, r63, r3
        shlli        r30, 26, r31
        addi.l        r15, 8, r15
        andc        r1, r31, r1
        blink        tr1, r63
LOCAL(ct_r4_pop):        /* Pop r4 from the stack.  */
        movi        1, r30
        ldx.q        r15, r63, r4
        shlli        r30, 23, r31
        addi.l        r15, 8, r15
        andc        r1, r31, r1
        blink        tr1, r63
LOCAL(ct_r5_pop):        /* Pop r5 from the stack.  */
        movi        1, r30
        ldx.q        r15, r63, r5
        shlli        r30, 20, r31
        addi.l        r15, 8, r15
        andc        r1, r31, r1
        blink        tr1, r63
LOCAL(ct_r6_pop):        /* Pop r6 from the stack.  */
        movi        1, r30
        ldx.q        r15, r63, r6
        shlli        r30, 16, r31
        addi.l        r15, 8, r15
        andc        r1, r31, r1
        blink        tr1, r63
LOCAL(ct_r7_pop):        /* Pop r7 from the stack.  */
        ldx.q        r15, r63, r7
        movi        1 << 12, r31
        addi.l        r15, 8, r15
        andc        r1, r31, r1
        blink        tr1, r63
LOCAL(ct_r8_pop):        /* Pop r8 from the stack.  */
        ldx.q        r15, r63, r8
        movi        1 << 8, r31
        addi.l        r15, 8, r15
        andc        r1, r31, r1
        blink        tr1, r63
LOCAL(ct_pop_seq):        /* Pop a sequence of registers off the stack.  */
        /* Cookie bits 3:1 are scaled to 8 bytes per unit (one ldx.q/addi.l
           pair) and subtracted from ct_end_of_pop_seq, giving the entry
           point into the straight-line pop sequence below.  */
        andi        r1, 7 << 1, r30
        movi        (LOCAL(ct_end_of_pop_seq) >> 16) & 65535, r32
        shlli        r30, 2, r31
        shori        LOCAL(ct_end_of_pop_seq) & 65535, r32
        sub.l        r32, r31, r33
        ptabs/l        r33, tr2
        blink        tr2, r63
LOCAL(ct_start_of_pop_seq):        /* Beginning of pop sequence.  */
        ldx.q        r15, r63, r3
        addi.l        r15, 8, r15
        ldx.q        r15, r63, r4
        addi.l        r15, 8, r15
        ldx.q        r15, r63, r5
        addi.l        r15, 8, r15
        ldx.q        r15, r63, r6
        addi.l        r15, 8, r15
        ldx.q        r15, r63, r7
        addi.l        r15, 8, r15
        ldx.q        r15, r63, r8
        addi.l        r15, 8, r15
LOCAL(ct_r9_pop):        /* Pop r9 from the stack.  */
        ldx.q        r15, r63, r9
        addi.l        r15, 8, r15
LOCAL(ct_end_of_pop_seq): /* Label used to compute first pop instruction.  */
LOCAL(ct_check_tramp):        /* Check whether we need a trampoline.  */
        /* Cookie bit 0 set selects ct_ret_wide; otherwise fall through
           to the plain tail call.  */
        pt/u        LOCAL(ct_ret_wide), tr2
        andi        r1, 1, r1
        bne/u        r1, r63, tr2
LOCAL(ct_call_func):        /* Just branch to the function.  */
        blink        tr0, r63
LOCAL(ct_ret_wide):        /* Call the function, so that we can unpack its
                           64-bit return value.  */
        add.l        r18, r63, r10        /* Keep the return address in r10.  */
        blink        tr0, r18        /* Call; the link goes to r18.  */
        ptabs        r10, tr0        /* Prepare to return to the original caller.  */
        /* Split the 64-bit value in r2 into two 32-bit halves in r2/r3.  */
#if __LITTLE_ENDIAN__
        shari        r2, 32, r3
        add.l        r2, r63, r2
#else
        add.l        r2, r63, r3
        shari        r2, 32, r2
#endif
        blink        tr0, r63

        ENDFUNC(GLOBAL(GCC_shcompact_call_trampoline))
#endif /* L_shcompact_call_trampoline */

#ifdef L_shcompact_return_trampoline
     /* This function does the converse of the code in `ret_wide'
        above.  It is tail-called by SHcompact functions returning
        64-bit non-floating-point values, to pack the 32-bit values in
        r2 and r3 into r2.

        The return address is taken from r18.  The register holding the
        low half is zero-extended, the one holding the high half is
        shifted left by 32, and the two are ORed into r2; which of r2/r3
        is the low half depends on __LITTLE_ENDIAN__.  r3 is modified.  */

        .mode        SHmedia
        .section        .text..SHmedia32, "ax"
        .align        2
        .global        GLOBAL(GCC_shcompact_return_trampoline)
        HIDDEN_FUNC(GLOBAL(GCC_shcompact_return_trampoline))
GLOBAL(GCC_shcompact_return_trampoline):
        ptabs/l        r18, tr0        /* Prepare to return.  */
#if __LITTLE_ENDIAN__
        addz.l        r2, r63, r2        /* Low half: zero-extend r2.  */
        shlli        r3, 32, r3        /* High half: r3 << 32.  */
#else
        addz.l        r3, r63, r3        /* Low half: zero-extend r3.  */
        shlli        r2, 32, r2        /* High half: r2 << 32.  */
#endif
        or        r3, r2, r2
        blink        tr0, r63

        ENDFUNC(GLOBAL(GCC_shcompact_return_trampoline))
#endif /* L_shcompact_return_trampoline */

#ifdef L_shcompact_incoming_args
        .section        .rodata
        .align        1
/* Dispatch table for GCC_shcompact_incoming_args.  Each entry is a
   16-bit offset from ia_main_label, the ptrel instruction that consumes
   it.  The function biases the table address by -31 * 2 and indexes it
   with 2 * nsb (cookie), so entries are ordered by the highest cookie
   bit still set, starting with bit 31; the last entry is the one
   selected once no cookie bits remain.  Entries marked invalid hold the
   offset 1 and, as their comments say, simply loop.  */
LOCAL(ia_main_table):
/* r2 .. r5: three entries each (invalid, ld, push).  */
.word        1 /* Invalid, just loop */
.word        LOCAL(ia_r2_ld) - datalabel LOCAL(ia_main_label)
.word        LOCAL(ia_r2_push) - datalabel LOCAL(ia_main_label)
.word        1 /* Invalid, just loop */
.word        LOCAL(ia_r3_ld) - datalabel LOCAL(ia_main_label)
.word        LOCAL(ia_r3_push) - datalabel LOCAL(ia_main_label)
.word        1 /* Invalid, just loop */
.word        LOCAL(ia_r4_ld) - datalabel LOCAL(ia_main_label)
.word        LOCAL(ia_r4_push) - datalabel LOCAL(ia_main_label)
.word        1 /* Invalid, just loop */
.word        LOCAL(ia_r5_ld) - datalabel LOCAL(ia_main_label)
.word        LOCAL(ia_r5_push) - datalabel LOCAL(ia_main_label)
/* r6 .. r9: four entries each (invalid, invalid, ld, push).  */
.word        1 /* Invalid, just loop */
.word        1 /* Invalid, just loop */
.word        LOCAL(ia_r6_ld) - datalabel LOCAL(ia_main_label)
.word        LOCAL(ia_r6_push) - datalabel LOCAL(ia_main_label)
.word        1 /* Invalid, just loop */
.word        1 /* Invalid, just loop */
.word        LOCAL(ia_r7_ld) - datalabel LOCAL(ia_main_label)
.word        LOCAL(ia_r7_push) - datalabel LOCAL(ia_main_label)
.word        1 /* Invalid, just loop */
.word        1 /* Invalid, just loop */
.word        LOCAL(ia_r8_ld) - datalabel LOCAL(ia_main_label)
.word        LOCAL(ia_r8_push) - datalabel LOCAL(ia_main_label)
.word        1 /* Invalid, just loop */
.word        1 /* Invalid, just loop */
.word        LOCAL(ia_r9_ld) - datalabel LOCAL(ia_main_label)
.word        LOCAL(ia_r9_push) - datalabel LOCAL(ia_main_label)
/* Push-sequence entries and the two return entries.  */
.word        LOCAL(ia_push_seq) - datalabel LOCAL(ia_main_label)
.word        LOCAL(ia_push_seq) - datalabel LOCAL(ia_main_label)
.word        LOCAL(ia_r9_push) - datalabel LOCAL(ia_main_label)
.word        LOCAL(ia_return) - datalabel LOCAL(ia_main_label)
.word        LOCAL(ia_return) - datalabel LOCAL(ia_main_label)
        .mode        SHmedia
        .section        .text..SHmedia32, "ax"
        .align        2
        
     /* This function stores 64-bit general-purpose registers back in
        the stack, and loads the address in which each register
        was stored into itself.  The lower 32 bits of r17 hold the address
        to begin storing, and the upper 32 bits of r17 hold the cookie.
        Its execution time is linear on the
        number of registers that actually have to be copied, and it is
        optimized for structures larger than 64 bits, as opposed to
        individual `long long' arguments.  See sh.h for details on the
        actual bit pattern.

        Register usage inside the function:
          tr0      return address (taken from r18 on entry).
          tr1      ia_loop, the dispatch loop that handlers branch back to.
          tr2      scratch branch target.
          r0       cookie (upper half of the incoming r17); handlers that
                   loop clear the bits they have consumed first.
          r17      current store address, advanced by 8 after each store
                   that is followed by another one.
          r43      biased address of ia_main_table.
          r36-r41  scratch.  */
        
        .global        GLOBAL(GCC_shcompact_incoming_args)
        FUNC(GLOBAL(GCC_shcompact_incoming_args))
GLOBAL(GCC_shcompact_incoming_args):
        ptabs/l        r18, tr0        /* Prepare to return.  */
        shlri        r17, 32, r0        /* Load the cookie.  */
        movi        ((datalabel LOCAL(ia_main_table) - 31 * 2) >> 16) & 65535, r43
        pt/l        LOCAL(ia_loop), tr1
        add.l        r17, r63, r17        /* Keep only the 32-bit store address.  */
        shori        ((datalabel LOCAL(ia_main_table) - 31 * 2)) & 65535, r43
LOCAL(ia_loop):
        /* Dispatch on the highest cookie bit still set: nsb (r0) is
           doubled to index the 16-bit entries of the biased table, and
           the loaded offset is taken relative to ia_main_label.  */
        nsb        r0, r36
        shlli        r36, 1, r37
        ldx.w        r43, r37, r38
LOCAL(ia_main_label):
        ptrel/l        r38, tr2
        blink        tr2, r63
        /* Store-and-load-address handlers for r2 .. r8 share one pattern.
           r39 is a two-bit mask (3 << n) over the register's cookie bits
           and r40 is the cookie restricted to that mask; the mask is then
           cleared from the cookie.  The register is stored at r17,
           replaced by that address, and r17 advances by 8.  If both mask
           bits were set the handler returns to the dispatch loop,
           otherwise it falls through to the next register's handler.  */
LOCAL(ia_r2_ld):        /* Store r2 and load its address.  */
        movi        3, r38
        shlli        r38, 29, r39
        and        r0, r39, r40
        andc        r0, r39, r0
        stx.q        r17, r63, r2
        add.l        r17, r63, r2
        addi.l        r17, 8, r17
        beq/u        r39, r40, tr1
LOCAL(ia_r3_ld):        /* Store r3 and load its address.  */
        movi        3, r38
        shlli        r38, 26, r39
        and        r0, r39, r40
        andc        r0, r39, r0
        stx.q        r17, r63, r3
        add.l        r17, r63, r3
        addi.l        r17, 8, r17
        beq/u        r39, r40, tr1
LOCAL(ia_r4_ld):        /* Store r4 and load its address.  */
        movi        3, r38
        shlli        r38, 23, r39
        and        r0, r39, r40
        andc        r0, r39, r0
        stx.q        r17, r63, r4
        add.l        r17, r63, r4
        addi.l        r17, 8, r17
        beq/u        r39, r40, tr1
LOCAL(ia_r5_ld):        /* Store r5 and load its address.  */
        movi        3, r38
        shlli        r38, 20, r39
        and        r0, r39, r40
        andc        r0, r39, r0
        stx.q        r17, r63, r5
        add.l        r17, r63, r5
        addi.l        r17, 8, r17
        beq/u        r39, r40, tr1
LOCAL(ia_r6_ld):        /* Store r6 and load its address.  */
        movi        3, r38
        shlli        r38, 16, r39
        and        r0, r39, r40
        andc        r0, r39, r0
        stx.q        r17, r63, r6
        add.l        r17, r63, r6
        addi.l        r17, 8, r17
        beq/u        r39, r40, tr1
LOCAL(ia_r7_ld):        /* Store r7 and load its address.  */
        movi        3 << 12, r39
        and        r0, r39, r40
        andc        r0, r39, r0
        stx.q        r17, r63, r7
        add.l        r17, r63, r7
        addi.l        r17, 8, r17
        beq/u        r39, r40, tr1
LOCAL(ia_r8_ld):        /* Store r8 and load its address.  */
        movi        3 << 8, r39
        and        r0, r39, r40
        andc        r0, r39, r0
        stx.q        r17, r63, r8
        add.l        r17, r63, r8
        addi.l        r17, 8, r17
        beq/u        r39, r40, tr1
LOCAL(ia_r9_ld):        /* Store r9 and load its address.  */
        /* r9 is the last register, so return directly afterwards.  */
        stx.q        r17, r63, r9
        add.l        r17, r63, r9
        blink        tr0, r63
        /* Single-register push handlers: clear the register's push bit
           in the cookie, store the register at r17, advance r17 by 8 and
           return to the dispatch loop.  */
LOCAL(ia_r2_push):        /* Push r2 onto the stack.  */
        movi        1, r38
        shlli        r38, 29, r39
        andc        r0, r39, r0
        stx.q        r17, r63, r2
        addi.l        r17, 8, r17
        blink        tr1, r63
LOCAL(ia_r3_push):        /* Push r3 onto the stack.  */
        movi        1, r38
        shlli        r38, 26, r39
        andc        r0, r39, r0
        stx.q        r17, r63, r3
        addi.l        r17, 8, r17
        blink        tr1, r63
LOCAL(ia_r4_push):        /* Push r4 onto the stack.  */
        movi        1, r38
        shlli        r38, 23, r39
        andc        r0, r39, r0
        stx.q        r17, r63, r4
        addi.l        r17, 8, r17
        blink        tr1, r63
LOCAL(ia_r5_push):        /* Push r5 onto the stack.  */
        movi        1, r38
        shlli        r38, 20, r39
        andc        r0, r39, r0
        stx.q        r17, r63, r5
        addi.l        r17, 8, r17
        blink        tr1, r63
LOCAL(ia_r6_push):        /* Push r6 onto the stack.  */
        movi        1, r38
        shlli        r38, 16, r39
        andc        r0, r39, r0
        stx.q        r17, r63, r6
        addi.l        r17, 8, r17
        blink        tr1, r63
LOCAL(ia_r7_push):        /* Push r7 onto the stack.  */
        movi        1 << 12, r39
        andc        r0, r39, r0
        stx.q        r17, r63, r7
        addi.l        r17, 8, r17
        blink        tr1, r63
LOCAL(ia_r8_push):        /* Push r8 onto the stack.  */
        movi        1 << 8, r39
        andc        r0, r39, r0
        stx.q        r17, r63, r8
        addi.l        r17, 8, r17
        blink        tr1, r63
LOCAL(ia_push_seq):        /* Push a sequence of registers onto the stack.  */
        /* Cookie bits 3:1 are scaled to 8 bytes per unit and subtracted
           from ia_end_of_push_seq, giving the entry point into the
           straight-line push sequence below.  Every slot there is two
           instructions (8 bytes); the last slot is the r9 store plus the
           return branch.  */
        andi        r0, 7 << 1, r38
        movi        (LOCAL(ia_end_of_push_seq) >> 16) & 65535, r40
        shlli        r38, 2, r39
        shori        LOCAL(ia_end_of_push_seq) & 65535, r40
        sub.l        r40, r39, r41
        ptabs/l        r41, tr2
        blink        tr2, r63
LOCAL(ia_start_of_push_seq):        /* Beginning of push sequence.  */
        stx.q        r17, r63, r3
        addi.l        r17, 8, r17
        stx.q        r17, r63, r4
        addi.l        r17, 8, r17
        stx.q        r17, r63, r5
        addi.l        r17, 8, r17
        stx.q        r17, r63, r6
        addi.l        r17, 8, r17
        stx.q        r17, r63, r7
        addi.l        r17, 8, r17
        stx.q        r17, r63, r8
        addi.l        r17, 8, r17
LOCAL(ia_r9_push):        /* Push r9 onto the stack.  */
        stx.q        r17, r63, r9
LOCAL(ia_return):        /* Return.  */
        blink        tr0, r63
LOCAL(ia_end_of_push_seq): /* Label used to compute the first push instruction.  */
        ENDFUNC(GLOBAL(GCC_shcompact_incoming_args))
#endif /* L_shcompact_incoming_args */
#endif
#if __SH5__
#ifdef L_nested_trampoline
/* Position-independent trampoline code.  It obtains its own address,
   loads a branch target from offset 24 and a second value into r1 from
   offset 28 (32-bit ABI) or 32 (64-bit ABI), then branches to the
   target.  The six instructions below occupy 24 bytes, so both values
   live directly after the code.
   NOTE(review): the two values are presumably the nested function's
   address and its static chain, filled in when the trampoline is
   instantiated; confirm against the trampoline setup in the compiler.  */
#if __SH5__ == 32
        .section        .text..SHmedia32,"ax"
#else
        .text
#endif
        .align        3 /* It is copied in units of 8 bytes in SHmedia mode.  */
        .global        GLOBAL(GCC_nested_trampoline)
        HIDDEN_FUNC(GLOBAL(GCC_nested_trampoline))
GLOBAL(GCC_nested_trampoline):
        .mode        SHmedia
        ptrel/u        r63, tr0        /* Zero offset: tr0 = this code's own address.  */
        gettr        tr0, r0        /* r0 = base for the loads below.  */
#if __SH5__ == 64
        ld.q        r0, 24, r1
#else
        ld.l        r0, 24, r1
#endif
        ptabs/l        r1, tr1        /* tr1 = branch target loaded from offset 24.  */
#if __SH5__ == 64
        ld.q        r0, 32, r1
#else
        ld.l        r0, 28, r1
#endif
        blink        tr1, r63        /* Branch to the target with the second value in r1.  */

        ENDFUNC(GLOBAL(GCC_nested_trampoline))
#endif /* L_nested_trampoline */
#endif /* __SH5__ */
#if __SH5__ == 32
#ifdef L_push_pop_shmedia_regs
        .section        .text..SHmedia32,"ax"
        .mode        SHmedia
        .align        2
/* GCC_push_shmedia_regs / GCC_push_shmedia_regs_nofpu: save registers
   on the stack at r15 and return through r18.

   With an FPU, the entry point first lowers r15 by 14*8 and stores the
   even-numbered double registers dr36 .. dr62 there, then continues
   into the common integer part.  Without an FPU only the integer part
   exists, under the _nofpu name.

   The integer part lowers r15 by a further 27*8 and fills the frame as
   follows (offsets from the new r15):
     24*8 .. 26*8   tr5 .. tr7, staged through r60 .. r62 (whose own
                    previous contents are overwritten, not saved)
      8*8 .. 23*8   r44 .. r59
      0*8 ..  7*8   r28 .. r35  */
#ifndef __SH4_NOFPU__        
        .global        GLOBAL(GCC_push_shmedia_regs)
        FUNC(GLOBAL(GCC_push_shmedia_regs))
GLOBAL(GCC_push_shmedia_regs):
        addi.l        r15, -14*8, r15
        fst.d        r15, 13*8, dr62
        fst.d        r15, 12*8, dr60
        fst.d        r15, 11*8, dr58
        fst.d        r15, 10*8, dr56
        fst.d        r15,  9*8, dr54
        fst.d        r15,  8*8, dr52
        fst.d        r15,  7*8, dr50
        fst.d        r15,  6*8, dr48
        fst.d        r15,  5*8, dr46
        fst.d        r15,  4*8, dr44
        fst.d        r15,  3*8, dr42
        fst.d        r15,  2*8, dr40
        fst.d        r15,  1*8, dr38
        fst.d        r15,  0*8, dr36
        /* Fall through into the integer part.  */
#else /* ! __SH4_NOFPU__ */
        .global        GLOBAL(GCC_push_shmedia_regs_nofpu)
        FUNC(GLOBAL(GCC_push_shmedia_regs_nofpu))
GLOBAL(GCC_push_shmedia_regs_nofpu):
#endif /* ! __SH4_NOFPU__ */
        ptabs/l        r18, tr0        /* Prepare to return.  */
        addi.l        r15, -27*8, r15
        gettr        tr7, r62        /* Read tr5 .. tr7 into r60 .. r62.  */
        gettr        tr6, r61
        gettr        tr5, r60
        st.q        r15, 26*8, r62
        st.q        r15, 25*8, r61
        st.q        r15, 24*8, r60
        st.q        r15, 23*8, r59
        st.q        r15, 22*8, r58
        st.q        r15, 21*8, r57
        st.q        r15, 20*8, r56
        st.q        r15, 19*8, r55
        st.q        r15, 18*8, r54
        st.q        r15, 17*8, r53
        st.q        r15, 16*8, r52
        st.q        r15, 15*8, r51
        st.q        r15, 14*8, r50
        st.q        r15, 13*8, r49
        st.q        r15, 12*8, r48
        st.q        r15, 11*8, r47
        st.q        r15, 10*8, r46
        st.q        r15,  9*8, r45
        st.q        r15,  8*8, r44
        st.q        r15,  7*8, r35
        st.q        r15,  6*8, r34
        st.q        r15,  5*8, r33
        st.q        r15,  4*8, r32
        st.q        r15,  3*8, r31
        st.q        r15,  2*8, r30
        st.q        r15,  1*8, r29
        st.q        r15,  0*8, r28
        blink        tr0, r63
#ifndef __SH4_NOFPU__        
        ENDFUNC(GLOBAL(GCC_push_shmedia_regs))
#else
        ENDFUNC(GLOBAL(GCC_push_shmedia_regs_nofpu))
#endif
/* GCC_pop_shmedia_regs / GCC_pop_shmedia_regs_nofpu: reload registers
   from the frame at r15, release the frame and return through r18.

   r0 holds the number of bytes to release: 41*8 for the FPU entry
   point, which first reloads dr36 .. dr62 from offsets 27*8 .. 40*8 and
   then joins the common code at .L0, or 27*8 for the _nofpu entry point.

   The common code reloads tr5 .. tr7 from offsets 24*8 .. 26*8 (through
   r60 .. r62, which are left holding those values), r44 .. r59 from
   8*8 .. 23*8 and r28 .. r35 from 0*8 .. 7*8, then adds r0 to r15.  */
#ifndef __SH4_NOFPU__        
        .global        GLOBAL(GCC_pop_shmedia_regs)
        FUNC(GLOBAL(GCC_pop_shmedia_regs))
GLOBAL(GCC_pop_shmedia_regs):
        pt        .L0, tr1
        movi        41*8, r0        /* Frame size including the FP area.  */
        fld.d        r15, 40*8, dr62
        fld.d        r15, 39*8, dr60
        fld.d        r15, 38*8, dr58
        fld.d        r15, 37*8, dr56
        fld.d        r15, 36*8, dr54
        fld.d        r15, 35*8, dr52
        fld.d        r15, 34*8, dr50
        fld.d        r15, 33*8, dr48
        fld.d        r15, 32*8, dr46
        fld.d        r15, 31*8, dr44
        fld.d        r15, 30*8, dr42
        fld.d        r15, 29*8, dr40
        fld.d        r15, 28*8, dr38
        fld.d        r15, 27*8, dr36
        blink        tr1, r63        /* Skip the movi below; r0 is already set.  */
#else /* ! __SH4_NOFPU__ */
        .global        GLOBAL(GCC_pop_shmedia_regs_nofpu)
        FUNC(GLOBAL(GCC_pop_shmedia_regs_nofpu))
GLOBAL(GCC_pop_shmedia_regs_nofpu):
#endif /* ! __SH4_NOFPU__ */
        movi        27*8, r0        /* Frame size of the integer part only.  */
.L0:
        ptabs        r18, tr0        /* Prepare to return.  */
        ld.q        r15, 26*8, r62
        ld.q        r15, 25*8, r61
        ld.q        r15, 24*8, r60
        ptabs        r62, tr7        /* Restore tr5 .. tr7 from the loaded values.  */
        ptabs        r61, tr6
        ptabs        r60, tr5
        ld.q        r15, 23*8, r59
        ld.q        r15, 22*8, r58
        ld.q        r15, 21*8, r57
        ld.q        r15, 20*8, r56
        ld.q        r15, 19*8, r55
        ld.q        r15, 18*8, r54
        ld.q        r15, 17*8, r53
        ld.q        r15, 16*8, r52
        ld.q        r15, 15*8, r51
        ld.q        r15, 14*8, r50
        ld.q        r15, 13*8, r49
        ld.q        r15, 12*8, r48
        ld.q        r15, 11*8, r47
        ld.q        r15, 10*8, r46
        ld.q        r15,  9*8, r45
        ld.q        r15,  8*8, r44
        ld.q        r15,  7*8, r35
        ld.q        r15,  6*8, r34
        ld.q        r15,  5*8, r33
        ld.q        r15,  4*8, r32
        ld.q        r15,  3*8, r31
        ld.q        r15,  2*8, r30
        ld.q        r15,  1*8, r29
        ld.q        r15,  0*8, r28
        add.l        r15, r0, r15        /* Release the frame.  */
        blink        tr0, r63

#ifndef __SH4_NOFPU__
        ENDFUNC(GLOBAL(GCC_pop_shmedia_regs))
#else
        ENDFUNC(GLOBAL(GCC_pop_shmedia_regs_nofpu))
#endif
#endif /* L_push_pop_shmedia_regs */
#endif /* __SH5__ == 32 */

#ifdef L_div_table
#if __SH5__
#if defined(__pic__) && defined(__SHMEDIA__)
        /* PIC SHmedia variant of sdivsi3: divides r4 by r5 and returns the
           result in r0.  The divisor is normalized and used to index a
           lookup table of inverse approximations; the looked-up value is
           refined in two steps (see the "11 bit" and "22 bit" notes below)
           and multiplied by the dividend.  The fixed-point format of each
           intermediate is given in the per-instruction comments.  */
        .global        GLOBAL(sdivsi3)
        FUNC(GLOBAL(sdivsi3))
#if __SH5__ == 32
        .section        .text..SHmedia32,"ax"
#else
        .text
#endif
/* TEXT_DATA_BUG is never defined here, because its definition sits inside
   the "#if 0" below; the code therefore always takes the
   div_table_internal path.  */
#if 0
/* ??? FIXME: Presumably due to a linker bug, exporting data symbols
   in a text section does not work (at least for shared libraries):
   the linker sets the LSB of the address as if this was SHmedia code.  */
#define TEXT_DATA_BUG
#endif
        .align        2
 // inputs: r4,r5
 // clobbered: r1,r18,r19,r20,r21,r25,tr0
 // result in r0
 /* This second .global repeats the declaration made before FUNC above.  */
 .global GLOBAL(sdivsi3)
GLOBAL(sdivsi3):
 /* tr0 temporarily carries the table address; it is read back into r20
    with gettr below and later reloaded with the return address.  */
#ifdef TEXT_DATA_BUG
 ptb datalabel Local_div_table,tr0
#else
 ptb GLOBAL(div_table_internal),tr0
#endif
 nsb r5, r1
 shlld r5, r1, r25    // normalize; [-2 ..1, 1..2) in s2.62
 shari r25, 58, r21   // extract 5(6) bit index (s2.4 with hole -1..1)
 /* bubble */
 gettr tr0,r20
 ldx.ub r20, r21, r19 // u0.8
 shari r25, 32, r25   // normalize to s2.30
 shlli r21, 1, r21
 muls.l r25, r19, r19 // s2.38
 ldx.w r20, r21, r21  // s2.14
 /* Return address goes to tr0; r18 is reused as scratch from here on.  */
  ptabs r18, tr0
 shari r19, 24, r19   // truncate to s2.14
 sub r21, r19, r19    // some 11 bit inverse in s1.14
 muls.l r19, r19, r21 // u0.28
 /* r1 = 92 - nsb (r5): the shift count used by the final shard.  */
  sub r63, r1, r1
  addi r1, 92, r1
 muls.l r25, r21, r18 // s2.58
 shlli r19, 45, r19   // multiply by two and convert to s2.58
  /* bubble */
 sub r19, r18, r18
 shari r18, 28, r18   // some 22 bit inverse in s1.30
 muls.l r18, r25, r0  // s2.60
  muls.l r18, r4, r25 // s32.30
  /* bubble */
 shari r0, 16, r19   // s-16.44
 muls.l r19, r18, r19 // s-16.74
  shari r25, 63, r0
  shari r4, 14, r18   // s19.-14
 shari r19, 30, r19   // s-16.44
 muls.l r19, r18, r19 // s15.30
  xor r21, r0, r21    // You could also use the constant 1 << 27.
  add r21, r25, r21
 sub r21, r19, r21
 shard r21, r1, r21
 sub r21, r0, r0
 blink tr0, r63
        ENDFUNC(GLOBAL(sdivsi3))
/* This table has been generated by divtab.c.
Defects for bias -330:
   Max defect: 6.081536e-07 at -1.000000e+00
   Min defect: 2.849516e-08 at 1.030651e+00
   Max 2nd step defect: 9.606539e-12 at -1.000000e+00
   Min 2nd step defect: 0.000000e+00 at 0.000000e+00
   Defect at 1: 1.238659e-07
   Defect at -2: 1.061708e-07 */
#else /* ! __pic__ || ! __SHMEDIA__ */
        .section        .rodata
#endif /* __pic__ */
#if defined(TEXT_DATA_BUG) && defined(__pic__) && defined(__SHMEDIA__)
/* Private copy of the division table, only assembled when TEXT_DATA_BUG is
   defined for SHmedia PIC code.  The label sits in the middle of the 128
   bytes of data: sdivsi3 in this file addresses the one-byte factors at
   label + index and the 16-bit constants at label + 2 * index, where index
   is signed and never falls into the hole around zero -- hence the two
   16-byte gaps next to the label.  */
        .balign 2
        .type        Local_div_table,@object
        .size        Local_div_table,128
/* Division constants for negative indices (16 entries, 2 bytes each).  */
        .word        -16638, -17135, -17737, -18433
        .word        -19103, -19751, -20583, -21383
        .word        -22343, -23353, -24407, -25582
        .word        -26863, -28382, -29965, -31800
/* Division factors for negative indices (16 entries, 1 byte each).  */
        .byte        66, 70, 75, 81, 87, 93, 101, 109
        .byte        119, 130, 142, 156, 172, 192, 214, 241
/* Unused gap below the label.  */
        .skip 16
Local_div_table:
/* Unused gap above the label.  */
        .skip 16
/* Division factors for positive indices (16 entries, 1 byte each).  */
        .byte        241, 214, 192, 172, 156, 142, 130, 119
        .byte        109, 101, 93, 87, 81, 75, 70, 66
/* Division constants for positive indices (16 entries, 2 bytes each).  */
        .word        31801, 29966, 28383, 26864
        .word        25583, 24408, 23354, 22344
        .word        21384, 20584, 19752, 19104
        .word        18434, 17738, 17136, 16639
/* What follows is emitted into .rodata.  */
        .section        .rodata
#endif /* TEXT_DATA_BUG */
        /* Exported division table (also reachable through the hidden alias
           div_table_internal).  The label sits in the middle of the 128
           bytes of data: sdivsi3 in this file addresses the one-byte
           factors at label + index and the 16-bit constants at
           label + 2 * index, where index is signed and never falls into
           the hole around zero -- hence the two 16-byte gaps next to the
           label.  */
        .balign 2
        .type        GLOBAL(div_table),@object
        .size        GLOBAL(div_table),128
/* Division constants for negative indices (16 entries, 2 bytes each).  */
        .word        -16638, -17135, -17737, -18433
        .word        -19103, -19751, -20583, -21383
        .word        -22343, -23353, -24407, -25582
        .word        -26863, -28382, -29965, -31800
/* Division factors for negative indices (16 entries, 1 byte each).  */
        .byte        66, 70, 75, 81, 87, 93, 101, 109
        .byte        119, 130, 142, 156, 172, 192, 214, 241
/* Unused gap below the label.  */
        .skip 16
        .global        GLOBAL(div_table)
GLOBAL(div_table):
        HIDDEN_ALIAS(div_table_internal,div_table)
/* Unused gap above the label.  */
        .skip 16
/* Division factors for positive indices (16 entries, 1 byte each).  */
        .byte        241, 214, 192, 172, 156, 142, 130, 119
        .byte        109, 101, 93, 87, 81, 75, 70, 66
/* Division constants for positive indices (16 entries, 2 bytes each).  */
        .word        31801, 29966, 28383, 26864
        .word        25583, 24408, 23354, 22344
        .word        21384, 20584, 19752, 19104
        .word        18434, 17738, 17136, 16639

#elif defined (__SH3__) || defined (__SH3E__) || defined (__SH4__) || defined (__SH4_SINGLE__) || defined (__SH4_SINGLE_ONLY__) || defined (__SH4_NOFPU__)
/* This code uses shld, and thus is not suitable for SH1 / SH2.  */

/* Signed / unsigned division without use of FPU, optimized for SH4.
   Uses a lookup table for divisors in the range -128 .. +128, and
   div1 with case distinction for larger divisors in three more ranges.
   The code is lumped together with the table to allow the use of mova.  */
/* Byte offsets into a 32-bit word in memory, selected by endianness:
   L_LSB addresses bits 0..7, L_LSWMSB bits 8..15 and L_MSWLSB bits 16..23.
   The code below uses them to drop partial quotient bytes into a scratch
   word on the stack.  */
#ifdef __LITTLE_ENDIAN__
#define L_LSB 0
#define L_LSWMSB 1
#define L_MSWLSB 2
#else
#define L_LSB 3
#define L_LSWMSB 2
#define L_MSWLSB 1
#endif

/* udivsi3_i4i: unsigned 32-bit division, r0 = r4 / r5.
   r4 and r5 are pushed on the stack and popped again on every return path;
   r1 is scratch.  The divisor selects one of four paths:
     divisor <= 128                 table lookup and dmulu.l (udiv_le128)
     128 < divisor < 65536          div1 with the divisor shifted left by 16
     divisor >= 65536               div1 with the divisor shifted left by 8
                                    (udiv_ge64k), or
     divisor > (dividend >> 8)      eight div1 steps only (udiv_r8).
   The labels div_le128, div_by_1_neg and div_ge64k located inside this
   function are entered from sdivsi3_i4i, which has already pushed r4 and
   r5; conversely this function branches to udiv_25 and div_ge64k_end,
   which are located in sdivsi3_i4i.  */
        .balign 4
        .global        GLOBAL(udivsi3_i4i)
        FUNC(GLOBAL(udivsi3_i4i))
GLOBAL(udivsi3_i4i):
        mov.w LOCAL(c128_w), r1         /* r1 = 128 */
        div0u                           /* initialize the flags for unsigned div1 */
        mov r4,r0
        shlr8 r0                        /* r0 = dividend >> 8 */
        cmp/hi r1,r5                    /* T = divisor > 128 (unsigned) */
        extu.w r5,r1                    /* r1 = low 16 bits of the divisor */
        bf LOCAL(udiv_le128)
        cmp/eq r5,r1                    /* T = divisor fits into 16 bits */
        bf LOCAL(udiv_ge64k)
        /* 128 < divisor < 65536.  */
        shlr r0                         /* r0 = dividend >> 9 */
        mov r5,r1                       /* keep the unshifted divisor for the save below */
        shll16 r5
        mov.l r4,@-r15                  /* save the caller's r4 */
        div1 r5,r0
        mov.l r1,@-r15                  /* save the caller's r5 */
        div1 r5,r0
        div1 r5,r0
        bra LOCAL(udiv_25)              /* continue in the code shared with sdivsi3_i4i */
        div1 r5,r0                      /* (delay slot) */

/* Entry from sdivsi3_i4i for a non-negative result and |divisor| <= 128;
   r4 / r5 are already on the stack.  */
LOCAL(div_le128):
        mova LOCAL(div_table_ix),r0
        bra LOCAL(div_le128_2)
        mov.b @(r0,r5),r1               /* (delay slot) r1 = div_table_ix[divisor] */
LOCAL(udiv_le128):
        mov.l r4,@-r15                  /* save the caller's r4 */
        mova LOCAL(div_table_ix),r0
        mov.b @(r0,r5),r1               /* r1 = div_table_ix[divisor], a signed byte offset */
        mov.l r5,@-r15                  /* save the caller's r5 */
LOCAL(div_le128_2):
        mova LOCAL(div_table_inv),r0
        mov.l @(r0,r1),r1               /* r1 = normalized inverse; for divisor 0 the offset
                                           read from the table makes this load unaligned
                                           (see the comment at div_table_ix) */
        mov r5,r0
        tst #0xfe,r0                    /* T = ((divisor & 0xfe) == 0), i.e. divisor is 1 here */
        mova LOCAL(div_table_clz),r0
        dmulu.l r1,r4                   /* mach:macl = inverse * dividend */
        mov.b @(r0,r5),r1               /* r1 = shift count from div_table_clz[divisor] */
        bt/s LOCAL(div_by_1)
        mov r4,r0                       /* (delay slot) r0 = dividend */
        mov.l @r15+,r5                  /* restore r5 */
        sts mach,r0                     /* high word of the product */
        /* clrt */                      /* not needed: T is clear when the bt/s above falls through */
        addc r4,r0                      /* add the dividend for the implicit leading 1; carry -> T */
        mov.l @r15+,r4                  /* restore r4 */
        rotcr r0                        /* shift right by one, bringing the carry in as bit 31 */
        rts
        shld r1,r0                      /* (delay slot) apply the table shift count */

/* Division by one with a negated result; entered from sdivsi3_i4i.  */
LOCAL(div_by_1_neg):
        neg r4,r0
/* Division by one: r0 already holds the result; restore r5 / r4 and return.  */
LOCAL(div_by_1):
        mov.l @r15+,r5
        rts
        mov.l @r15+,r4                  /* (delay slot) */

/* Entry from sdivsi3_i4i for a non-negative result and a divisor that does
   not fit into 16 bits.  On entry T = divisor > (dividend >> 8).  */
LOCAL(div_ge64k):
        bt/s LOCAL(div_r8)
        div0u                           /* (delay slot) */
        shll8 r5
        bra LOCAL(div_ge64k_2)
        div1 r5,r0                      /* (delay slot) */
LOCAL(udiv_ge64k):
        cmp/hi r0,r5                    /* T = divisor > (dividend >> 8) */
        mov r5,r1                       /* keep the unshifted divisor for the save below */
        bt LOCAL(udiv_r8)
        shll8 r5
        mov.l r4,@-r15                  /* save the caller's r4 */
        div1 r5,r0
        mov.l r1,@-r15                  /* save the caller's r5 */
LOCAL(div_ge64k_2):
        div1 r5,r0
        mov.l LOCAL(zero_l),r1          /* r1 = 0 */
        .rept 4
        div1 r5,r0
        .endr
        mov.l r1,@-r15                  /* push a zeroed scratch word for quotient bytes */
        div1 r5,r0
        mov.w LOCAL(m256_w),r1          /* r1 = mask used by the and below */
        div1 r5,r0
        mov.b r0,@(L_LSWMSB,r15)        /* stash the low byte of r0 in bits 8..15 of the scratch word */
        xor r4,r0                       /* the xor / and / xor sequence keeps the bits of r0 */
        and r1,r0                       /* selected by the mask in r1 and takes the */
        bra LOCAL(div_ge64k_end)        /* remaining bits from the dividend in r4 */
        xor r4,r0                       /* (delay slot) */
        
/* Entry from div_ge64k (signed caller) when only eight div1 steps are
   needed; r4 / r5 are already on the stack.  */
LOCAL(div_r8):
        shll16 r4
        bra LOCAL(div_r8_2)
        shll8 r4                        /* (delay slot) r4 = dividend << 24 */
LOCAL(udiv_r8):
        mov.l r4,@-r15                  /* save the caller's r4 */
        shll16 r4
        clrt                            /* T was set by the compare that led here */
        shll8 r4                        /* r4 = dividend << 24 */
        mov.l r5,@-r15                  /* save the caller's r5 */
/* Eight div1 steps on r1 (initially dividend >> 8).  Each rotcl moves the
   next bit into T for div1 and rotates the bit that the previous div1 left
   in T into the bottom of the same register; the quotient ends up in r0.  */
LOCAL(div_r8_2):
        rotcl r4
        mov r0,r1
        div1 r5,r1
        mov r4,r0
        rotcl r0
        mov r5,r4                       /* r4 = divisor, so that r5 can be restored early */
        div1 r5,r1
        .rept 5
        rotcl r0; div1 r5,r1
        .endr
        rotcl r0
        mov.l @r15+,r5                  /* restore r5 */
        div1 r4,r1                      /* last step uses the copy of the divisor in r4 */
        mov.l @r15+,r4                  /* restore r4 */
        rts
        rotcl r0                        /* (delay slot) shift in the final T bit */

        ENDFUNC(GLOBAL(udivsi3_i4i))

        /* sdivsi3_i4i: signed 32-bit division, r0 = r4 / r5.
           Both operands are pushed on the stack, replaced by their
           negations where needed so that the division itself is carried
           out on non-negative values, and restored before returning.
           The sign of the result selects between two copies of the
           division code: pos_result (result used as is) and neg_result
           (result negated at the end).  Several targets used below
           (div_le128, div_ge64k, div_by_1_neg) are located in
           udivsi3_i4i, and the tables follow the code.  */
        .global        GLOBAL(sdivsi3_i4i)
        FUNC(GLOBAL(sdivsi3_i4i))
        /* This is link-compatible with a GLOBAL(sdivsi3) call,
           but we effectively clobber only r1.  */
GLOBAL(sdivsi3_i4i):
        mov.l r4,@-r15                  /* save the caller's r4 */
        cmp/pz r5                       /* T = divisor >= 0 */
        mov.w LOCAL(c128_w), r1         /* r1 = 128 */
        bt/s LOCAL(pos_divisor)
        cmp/pz r4                       /* (delay slot) T = dividend >= 0 */
        /* Negative divisor.  */
        mov.l r5,@-r15                  /* save the caller's r5 */
        neg r5,r5                       /* r5 = -divisor */
        bt/s LOCAL(neg_result)          /* dividend >= 0, divisor < 0 */
        cmp/hi r1,r5                    /* (delay slot) T = r5 > 128 (unsigned) */
        neg r4,r4                       /* both operands negative: negate the dividend too */
/* Result is not negated.  On entry T = r5 > 128.  */
LOCAL(pos_result):
        extu.w r5,r0                    /* r0 = low 16 bits of the divisor */
        bf LOCAL(div_le128)             /* divisor <= 128: table path in udivsi3_i4i */
        cmp/eq r5,r0                    /* T = divisor fits into 16 bits */
        mov r4,r0
        shlr8 r0                        /* r0 = dividend >> 8 */
        bf/s LOCAL(div_ge64k)
        cmp/hi r0,r5                    /* (delay slot) T = divisor > (dividend >> 8) */
        div0u
        shll16 r5
        div1 r5,r0
        div1 r5,r0
        div1 r5,r0
/* Also the continuation of the 128 < divisor < 65536 path of udivsi3_i4i.  */
LOCAL(udiv_25):
        mov.l LOCAL(zero_l),r1          /* r1 = 0 */
        div1 r5,r0
        div1 r5,r0
        mov.l r1,@-r15                  /* push a zeroed scratch word for quotient bytes */
        .rept 3
        div1 r5,r0
        .endr
        mov.b r0,@(L_MSWLSB,r15)        /* stash the low byte of r0 in bits 16..23 of the scratch word */
        xtrct r4,r0                     /* xtrct + swap.w: keep the upper half of r0 and */
        swap.w r0,r0                    /* put the low 16 bits of the dividend in the lower half */
        .rept 8
        div1 r5,r0
        .endr
        mov.b r0,@(L_LSWMSB,r15)        /* stash the low byte of r0 in bits 8..15 of the scratch word */
/* Common tail; udivsi3_i4i joins here from div_ge64k_2.  */
LOCAL(div_ge64k_end):
        .rept 8
        div1 r5,r0
        .endr
        mov.l @r15+,r4 ! zero-extension and swap using LS unit.
        extu.b r0,r0                    /* keep only the low byte of r0 */
        mov.l @r15+,r5                  /* restore r5 */
        or r4,r0                        /* merge with the bytes collected in the scratch word */
        mov.l @r15+,r4                  /* restore r4 */
        rts
        rotcl r0                        /* (delay slot) shift in the final T bit */

/* Table path for a negated result.  On entry r0 = low 16 bits of the
   divisor magnitude, which is <= 128 here.  */
LOCAL(div_le128_neg):
        tst #0xfe,r0                    /* T = ((r0 & 0xfe) == 0), i.e. divisor magnitude is 1 here */
        mova LOCAL(div_table_ix),r0
        mov.b @(r0,r5),r1               /* r1 = div_table_ix[r5], a signed byte offset */
        mova LOCAL(div_table_inv),r0
        bt/s LOCAL(div_by_1_neg)
        mov.l @(r0,r1),r1               /* (delay slot) r1 = normalized inverse */
        mova LOCAL(div_table_clz),r0
        dmulu.l r1,r4                   /* mach:macl = inverse * dividend magnitude */
        mov.b @(r0,r5),r1               /* r1 = shift count from div_table_clz[r5] */
        mov.l @r15+,r5                  /* restore r5 */
        sts mach,r0                     /* high word of the product */
        /* clrt */                      /* not needed: T is clear when the bt/s above falls through */
        addc r4,r0                      /* add the dividend for the implicit leading 1; carry -> T */
        mov.l @r15+,r4                  /* restore r4 */
        rotcr r0                        /* shift right by one, bringing the carry in as bit 31 */
        shld r1,r0                      /* apply the table shift count */
        rts
        neg r0,r0                       /* (delay slot) negate the result */

/* Non-negative divisor.  On entry T = dividend >= 0.  */
LOCAL(pos_divisor):
        mov.l r5,@-r15                  /* save the caller's r5 */
        bt/s LOCAL(pos_result)
        cmp/hi r1,r5                    /* (delay slot) T = divisor > 128 (unsigned) */
        neg r4,r4                       /* negative dividend: divide its negation */
/* Result is negated at the end.  On entry T = r5 > 128.  */
LOCAL(neg_result):
        extu.w r5,r0                    /* r0 = low 16 bits of the divisor */
        bf LOCAL(div_le128_neg)
        cmp/eq r5,r0                    /* T = divisor fits into 16 bits */
        mov r4,r0
        shlr8 r0                        /* r0 = dividend >> 8 */
        bf/s LOCAL(div_ge64k_neg)
        cmp/hi r0,r5                    /* (delay slot) T = divisor > (dividend >> 8) */
        div0u
        mov.l LOCAL(zero_l),r1          /* r1 = 0 */
        shll16 r5
        div1 r5,r0
        mov.l r1,@-r15                  /* push a zeroed scratch word for quotient bytes */
        .rept 7
        div1 r5,r0
        .endr
        mov.b r0,@(L_MSWLSB,r15)        /* stash the low byte of r0 in bits 16..23 of the scratch word */
        xtrct r4,r0                     /* xtrct + swap.w: keep the upper half of r0 and */
        swap.w r0,r0                    /* put the low 16 bits of the dividend in the lower half */
        .rept 8
        div1 r5,r0
        .endr
        mov.b r0,@(L_LSWMSB,r15)        /* stash the low byte of r0 in bits 8..15 of the scratch word */
LOCAL(div_ge64k_neg_end):
        .rept 8
        div1 r5,r0
        .endr
        mov.l @r15+,r4 ! zero-extension and swap using LS unit.
        extu.b r0,r1                    /* r1 = low byte of r0 */
        mov.l @r15+,r5                  /* restore r5 */
        or r4,r1                        /* merge with the bytes collected in the scratch word */
/* div_r8_neg joins here with its quotient bits in r1 and r5 already restored.  */
LOCAL(div_r8_neg_end):
        mov.l @r15+,r4                  /* restore r4 */
        rotcl r1                        /* shift in the final T bit */
        rts
        neg r1,r0                       /* (delay slot) r0 = negated quotient */

/* Negated result, divisor does not fit into 16 bits.
   On entry T = divisor > (dividend >> 8).  */
LOCAL(div_ge64k_neg):
        bt/s LOCAL(div_r8_neg)
        div0u                           /* (delay slot) */
        shll8 r5
        mov.l LOCAL(zero_l),r1          /* r1 = 0 */
        .rept 6
        div1 r5,r0
        .endr
        mov.l r1,@-r15                  /* push a zeroed scratch word for quotient bytes */
        div1 r5,r0
        mov.w LOCAL(m256_w),r1          /* r1 = mask used by the and below */
        div1 r5,r0
        mov.b r0,@(L_LSWMSB,r15)        /* stash the low byte of r0 in bits 8..15 of the scratch word */
        xor r4,r0                       /* the xor / and / xor sequence keeps the bits of r0 */
        and r1,r0                       /* selected by the mask in r1 and takes the */
        bra LOCAL(div_ge64k_neg_end)    /* remaining bits from the dividend in r4 */
        xor r4,r0                       /* (delay slot) */

/* 16-bit constant 128, loaded with mov.w at both function entries.  */
LOCAL(c128_w):
        .word 128

/* Negated result, only eight div1 steps needed.  The dividend bits are
   rotated out of r1 into T for each div1, and the bit each div1 leaves in T
   is rotated back into r1.  */
LOCAL(div_r8_neg):
        clrt
        shll16 r4
        mov r4,r1
        shll8 r1                        /* r1 = dividend << 24 */
        mov r5,r4                       /* r4 = divisor, so that r5 can be restored early */
        .rept 7
        rotcl r1; div1 r5,r0
        .endr
        mov.l @r15+,r5                  /* restore r5 */
        rotcl r1
        bra LOCAL(div_r8_neg_end)
        div1 r4,r0                      /* (delay slot) last step uses the copy of the divisor in r4 */

/* 16-bit constant loaded with mov.w as the mask for the xor / and / xor
   sequences in the div_ge64k paths.  */
LOCAL(m256_w):
        .word 0xff00
/* The tables below have been generated by divtab-sh4.c.  They are shown
   here in rows; the emitted bytes and their order are unchanged.  */
        .balign 4
/* Shift counts, indexed by the divisor (0 .. 127 here; index 128 is the
   first byte of div_table_ix).  The table path applies the count with
   shld.  Each row below holds 16 entries.  */
LOCAL(div_table_clz):
        .byte        0, 1, 0, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3
        .byte        -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4
        .byte        -4, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5
        .byte        -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5
        .byte        -5, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6
        .byte        -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6
        .byte        -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6
        .byte        -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6
/* Lookup table translating a positive divisor (0 .. 128) into a byte
   offset relative to div_table_inv that selects its normalized inverse.
   N.B. the entry for 0 doubles as the last entry (index 128) of
   div_table_clz; used as an offset it causes an unaligned access, which is
   what happens on division by zero.  */
LOCAL(div_table_ix):
        /* divisor 0 (shared with div_table_clz[128]) */
        .byte        -6
        /* divisor 1 */
        .byte        -128
        /* divisors 2 .. 3 */
        .byte        -128, 0
        /* divisors 4 .. 7 */
        .byte        -128, -64, 0, 64
        /* divisors 8 .. 15 */
        .byte        -128, -96, -64, -32, 0, 32, 64, 96
        /* divisors 16 .. 31 */
        .byte        -128, -112, -96, -80, -64, -48, -32, -16
        .byte        0, 16, 32, 48, 64, 80, 96, 112
        /* divisors 32 .. 63 */
        .byte        -128, -120, -112, -104, -96, -88, -80, -72
        .byte        -64, -56, -48, -40, -32, -24, -16, -8
        .byte        0, 8, 16, 24, 32, 40, 48, 56
        .byte        64, 72, 80, 88, 96, 104, 112, 120
        /* divisors 64 .. 127 */
        .byte        -128, -124, -120, -116, -112, -108, -104, -100
        .byte        -96, -92, -88, -84, -80, -76, -72, -68
        .byte        -64, -60, -56, -52, -48, -44, -40, -36
        .byte        -32, -28, -24, -20, -16, -12, -8, -4
        .byte        0, 4, 8, 12, 16, 20, 24, 28
        .byte        32, 36, 40, 44, 48, 52, 56, 60
        .byte        64, 68, 72, 76, 80, 84, 88, 92
        .byte        96, 100, 104, 108, 112, 116, 120, 124
        /* divisor 128 */
        .byte        -128
/* Inverses 1/64 .. 1/127, normalized; the leading 1 in bit 32 is implicit
   and not stored.  The first 32 entries start at zero_l, whose first word
   also serves as the constant 0 loaded by the division code; the second 32
   entries start at div_table_inv, the base the offsets above refer to.  */
        .balign 4
LOCAL(zero_l):
        .long        0x0, 0xF81F81F9, 0xF07C1F08, 0xE9131AC0
        .long        0xE1E1E1E2, 0xDAE6076C, 0xD41D41D5, 0xCD856891
        .long        0xC71C71C8, 0xC0E07039, 0xBACF914D, 0xB4E81B4F
        .long        0xAF286BCB, 0xA98EF607, 0xA41A41A5, 0x9EC8E952
        .long        0x9999999A, 0x948B0FCE, 0x8F9C18FA, 0x8ACB90F7
        .long        0x86186187, 0x81818182, 0x7D05F418, 0x78A4C818
        .long        0x745D1746, 0x702E05C1, 0x6C16C16D, 0x68168169
        .long        0x642C8591, 0x60581606, 0x5C9882BA, 0x58ED2309
LOCAL(div_table_inv):
        .long        0x55555556, 0x51D07EAF, 0x4E5E0A73, 0x4AFD6A06
        .long        0x47AE147B, 0x446F8657, 0x41414142, 0x3E22CBCF
        .long        0x3B13B13C, 0x38138139, 0x3521CFB3, 0x323E34A3
        .long        0x2F684BDB, 0x2C9FB4D9, 0x29E4129F, 0x27350B89
        .long        0x24924925, 0x21FB7813, 0x1F7047DD, 0x1CF06ADB
        .long        0x1A7B9612, 0x18118119, 0x15B1E5F8, 0x135C8114
        .long        0x11111112, 0xECF56BF, 0xC9714FC, 0xA6810A7
        .long        0x8421085, 0x624DD30, 0x4104105, 0x2040811
        /* maximum error: 0.987342 scaled: 0.921875*/

        ENDFUNC(GLOBAL(sdivsi3_i4i))
#endif /* SH3 / SH4 */

#endif /* L_div_table */

#ifdef L_udiv_qrnnd_16
#if !__SHMEDIA__
        /* udiv_qrnnd_16: one 16-bit quotient step of a multi-word unsigned
           division.  Register usage, as given by the original note below:
             in:   r0 = n1, r4 = n0, r5 = d, r6 = d1
             out:  r0 = rn (remainder), r1 = qn (quotient)
             temp: r2 = __m
           NOTE(review): the names match the udiv_qrnnd helper convention;
           the exact scaling of d1 relative to d is set up by the caller
           and is not visible here - confirm against the caller.  */
        HIDDEN_FUNC(GLOBAL(udiv_qrnnd_16))
        /* r0: rn r1: qn */ /* r0: n1 r4: n0 r5: d r6: d1 */ /* r2: __m */
        /* n1 < d, but n1 might be larger than d1.  */
        .global GLOBAL(udiv_qrnnd_16)
        .balign 8
GLOBAL(udiv_qrnnd_16):
        div0u                           /* initialize the flags for unsigned div1 */
        cmp/hi r6,r0                    /* T = n1 > d1 (unsigned) */
        bt .Lots                        /* handled separately below */
        .rept 16
        div1 r6,r0 
        .endr
        extu.w r0,r1                    /* r1 = low 16 bits of r0 after the 16 steps */
        bt 0f                           /* T from the last div1; target is the 0: just below */
        add r6,r0                       /* taken only when that T is clear */
                                        /* NOTE(review): presumably restores the remainder - confirm */
0:        rotcl r1                      /* shift the last T bit into r1 */
        mulu.w r1,r5                    /* macl = low 16 bits of r1 * low 16 bits of d */
        xtrct r4,r0                     /* xtrct + swap.w: keep the upper half of r0 and */
        swap.w r0,r0                    /* put the low 16 bits of n0 in the lower half */
        sts macl,r2                     /* r2 = __m */
        cmp/hs r2,r0                    /* T = r0 >= __m (unsigned), i.e. no borrow below */
        sub r2,r0                       /* r0 -= __m */
        bt 0f                           /* no borrow: done (target is the 0: before the final rts) */
        addc r5,r0                      /* r0 += d, carry -> T (T is clear on entry) */
        add #-1,r1                      /* quotient one too large: decrement */
        bt 0f                           /* carry out: done */
1:        add #-1,r1                    /* second correction of the quotient */
        rts
        add r5,r0                       /* (delay slot) r0 += d */
        .balign 8
/* n1 > d1.  */
.Lots:
        sub r5,r0                       /* r0 = n1 - d */
        swap.w r4,r1                    /* swap.w + xtrct: r1 = low half of r0 in the upper half, */
        xtrct r0,r1                     /* low 16 bits of n0 in the lower half */
        clrt
        mov r1,r0
        addc r5,r0                      /* r0 += d, carry -> T */
        mov #-1,r1
        /* SL1 is a macro defined elsewhere in this file.
           NOTE(review): it appears to combine "bf 1b" with "shlr16 r1"
           (r1 = 0xffff), placing the latter in the delay slot where the
           target supports it - confirm against the macro definition.  */
        SL1(bf, 1b,
        shlr16 r1)
0:        rts
        nop
        ENDFUNC(GLOBAL(udiv_qrnnd_16))
#endif /* !__SHMEDIA__ */
#endif /* L_udiv_qrnnd_16 */