File: powr.S

package info (click to toggle)
libffm 0.28-1
links: PTS
area: main
in suites: potato
size: 220 kB
ctags: 185
sloc: asm: 3,028; makefile: 95; ansic: 12; sh: 2
file content (140 lines) | stat: -rw-r--r-- 3,351 bytes
parent folder | download | duplicates (4)
/*
   libffm	- Free, pretty fast replacement for some math (libm) routines 
			on Linux/AXP, optimized for the 21164

   Copyright (C) 1998  Joachim Wesner <joachim.wesner@frankfurt.netsurf.de>
                  and  Kazushige Goto <goto@statabo.rim.or.jp>

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public
   License as published by the Free Software Foundation; either
   version 2 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library (see file COPYING.LIB); if not, write 
   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, 
   MA 02139, USA.
*/
/*
   Simple, fast powr(x, y) helper function for the fast log2/exp2 routines, 
   by Joachim Wesner <joachim.wesner@frankfurt.netsurf.de>, July 1998.
    
   This routine is "quick & dirty" in the same way as the Cray powr function,
   i.e. NO extended precision is used in calculating log2(x)*y, so the
   relative error in the final result will grow to ~ 0.4*|log2(x)|*|y| ulp.
    
   However, for integral values of y, repeated multiplication is used
   to calculate the result. So, for "random" x, the relative error seems to
   be on the order of 0.4*|y| ulp, and for (not too large) integral values
   of x, the final result should be EXACT.
    
   In contrast to the Cray powr function, a different multiplication scheme
   is used, where the number of multiplications only increases with log2(|y|).

   No special handling of illegal arguments or NANs, yet !!!
*/

/* modified and improved by Kazushige Goto <goto@statabo.rim.or.jp> */

	.set noat
	.set noreorder

/*
 * double powr(double x, double y)
 *
 * In:    $f16 = x, $f17 = y
 * Out:   $f0  = result
 *
 * Behaviour, in the order the cases are tested:
 *   x == 0            -> 0.0 for every y (y is not looked at on this path)
 *   y == 0            -> 1.0
 *   |y| is integral   -> z = x^|y| by square-and-multiply over the bits of
 *                        n = (integer)|y|; returns z, or 1.0/z when y < 0
 *   otherwise         -> exp2(y * log2(x))
 *
 * Registers written: $1, $2, $f0, $f1, $f10, $f11, $f16; $27 and $29 on the
 * log2/exp2 path. $f2 and $26 are saved in the frame and restored there.
 *
 * Stack: one 16-byte frame.
 *   0($30)  integer 1 on entry (reloaded into $f0), later the saved $26
 *   8($30)  FP->integer transfer slot for n, later the saved $f2
 *
 * No handling of NaNs or otherwise illegal arguments.
 */
.text
	.align 5
	.globl powr
	.ent powr
powr:
	fabs	$f17,  $f10		# $f10 = |y|
	mov	1, $1			# integer 1, converted to 1.0 further down
	lda	$30,-16($30)		# allocate the 16-byte frame
	fbeq	$f16,$x_eq_0		# x == 0

	stq	$1,  0($30)		# integer 1 goes through memory into $f0
	ldgp	$29,.-powr($27)		# offset = distance of this ldgp from powr
	.frame	$30,16,$26,0
	.prologue	1
	nop

#ifdef PROF
	lda	$28, _mcount
	jsr	$28, ($28), _mcount
	unop
	unop
#endif
	ldt	$f0, 0($30)		# $f0 = bit pattern of integer 1
	cvttqc	$f10, $f11		# $f11 = (integer)|y|, chopped
	fbeq	$f17,$y_eq_0		# y == 0
	cvtqt	$f0,  $f0		# $f0 = 1.0

	cvtqt	$f11, $f1		# $f1 = (double)n
	stt	$f11,8($30)		# n goes through memory into $2
	cmpteq	$f1,$f10,$f1		# $f1 != 0  <=>  |y| is integral
	ldq	$2,  8($30)		# $2 = n

	fmov	$f0, $f10		# z = 1.0
	fbeq	$f1,$L8			# |y| not integral: use log2/exp2
	unop
	beq	$2,$L10			# while(n)
	.align	4

	/*
	 * Square-and-multiply, least significant bit of n first.
	 * The exit test sits BEFORE the squaring so that no unused x*x is
	 * ever computed: that product could overflow although the result
	 * itself is representable (e.g. y == 1 with |x| above about 1.3e154),
	 * and mult carries no trap qualifier.
	 */
$loop:
	blbc	$2,$L12			# if (n&1)
	mult	$f10,$f16,$f10		#   z = z * x
$L12:
	sra	$2,1,$2			# n >>= 1
	beq	$2,$L10			# no bits left: done, skip the squaring
	mult	$f16,$f16,$f16		# x = x * x
	br	$31,$loop

	.align	4
$L10:
	fblt	$f17, $L14		# y < 0: return the reciprocal
	fmov	$f10, $f0		# return z
	addq	$30,16,$30
	ret	$31,($26),1
	.align	4
$L14:
	divt	$f0,$f10,$f0		# return 1.0 / z   ($f0 still holds 1.0)
	addq	$30,16,$30
	ret	$31,($26),1
	.align	4

	/* General case: result = exp2(y * log2(x)). */
$L8:
	lda	$27, log2
	stt	$f2,8($30)		# save $f2, it is restored before returning
	fmov	$f17, $f2		# keep y in $f2 across the call to log2
	stq	$26,0($30)		# save the return address
	jsr	$26, ($27), log2	# $f0 = log2(x), x is still in $f16
	mult	$f0,$f2,$f16		# $f16 = log2(x) * y
	ldgp	$29,4($26)		# $26 points at the mult above, hence the 4
	jsr	$26,exp2		# $f0 = exp2($f16)
	ldq	$26,0($30)
	ldt	$f2,8($30)
$L17:
	addq	$30,16,$30
	ret	$31,($26),1
	.align 4

$x_eq_0:
	fclr	$f0			# return 0.0
	addq	$30,16,$30
	ret	$31,($26),1
	.align 4

$y_eq_0:
	cvtqt	$f0,  $f0		# $f0 = 1.0 (the branch here skipped the conversion)
	addq	$30,16,$30
	ret	$31,($26),1
	.end powr

/* Optional alias: when POW is defined for the preprocessor, also export
   the global symbol pow, assigned the same value as powr. */
#ifdef POW
	.globl pow
	pow = powr
#endif