File: pixel_resample_s16.s

package info (click to toggle)
pdp 1%3A0.14.1%2Bdarcs20180201-2
links: PTS, VCS
area: main
in suites: bullseye, buster
size: 3,220 kB
sloc: ansic: 22,424; asm: 2,088; makefile: 532; perl: 145; sh: 108; cpp: 4
file content (314 lines) | stat: -rw-r--r-- 7,373 bytes
parent folder | download | duplicates (6)
	

#interpolation data:
#* 4 vectors: neighbourhood for samples (TL, TR, BL, BR)
#* 2 vectors: fractional part (unsigned)
#* 2 vectors: addresses of pixel blocks

#coord conversion data:
#1 vector: 32bit splatted address	
#1 vector: 16bit splatted w-1
#1 vector: 16bit splatted h-1
#1 vector: 16bit splatted w (reuse w-1 with add?)
#1 dword:  32 bit line offset

#coord generation data:	several vectors for parameter update stuff..

#coordinate systems: 16 bit virtual coordinates (signed, center relative)
#* 2 vectors: virtual coordinates
#(possibly an intermediate step + conversion to 16 bit virtual)


#step 1:	generate virtual coords

		
#step 2:	virtual coords -> block addresses + fractional addresses
#* mulhigh: real coords (x,y) (center relative)
#* add center -> unsigned (top left relative)
#* mullow: fractional part (x_frac, y_frac)
#* mulhigh, mullow, pack 32bit: y_offset
#* pack 32bit: x_offset
#* add, shift, add start address: real addresses
	

#step3:		data fetch using generated addresses: 
#		this step would be much simpler in 4x16bit rgba. life's a bitch..

#step4:		bilinear interpolation

#step5:		store



		# this can be simplified by doing 32 bit unaligned moves
		# and vector unpacking on the data

	

		# Layout of the "cooked" resample work area (byte offsets from %esi).
		#
		# 0x00-0x1F: pixel neighbourhood scratch. One 4 x 16bit vector per
		#            corner (top-left, top-right, bottom-left, bottom-right);
		#            the digit 1-4 selects the pixel within the vector.
		.equ	TL1, 0x00
		.equ	TL2, 0x02
		.equ	TL3, 0x04
		.equ	TL4, 0x06
		.equ	TR1, 0x08
		.equ	TR2, 0x0A
		.equ	TR3, 0x0C
		.equ	TR4, 0x0E
		.equ	BL1, 0x10
		.equ	BL2, 0x12
		.equ	BL3, 0x14
		.equ	BL4, 0x16
		.equ	BR1, 0x18
		.equ	BR2, 0x1A
		.equ	BR3, 0x1C
		.equ	BR4, 0x1E

		# 0x20-0x2F: 32bit address of the top-left source pixel of each
		#            of the four pixel blocks
		.equ	ADDRESS1, 0x20
		.equ	ADDRESS2, 0x24
		.equ	ADDRESS3, 0x28
		.equ	ADDRESS4, 0x2C

		# 0x30-0x5F: second neighbourhood + address buffer
		#            (testing: not used)
		.equ	SECONDBUFFER, 0x30

		# 0x60: bitmap address, splatted into both 32bit lanes
		.equ	V2PLANEADDRESS, 0x60

		# 0x68-0x7F: image constants, each splatted into four 16bit lanes
		.equ	V4TWOWIDTHM1, 0x68
		.equ	V4TWOHEIGHTM1, 0x70
		.equ	V4LINEOFFSET, 0x78

		# total size of the work area; data placed behind it starts here
		.equ	RESAMPLEDATASIZE, 0x80
	
	

		# getpixelsbilin: bilinear interpolation of 4 pixels at once (MMX).
		#
		# input:	%mm0, %mm1 4 x 16bit unsigned top left relative virtual x and y coordinates
		#		%esi: temp & algo data structure (cooked work area, see the
		#		      offsets TL1..V4LINEOFFSET; the dword at RESAMPLEDATASIZE(%esi)
		#		      is read as the source line offset)
		# output:	%mm0: 4 x 16bit interpolated pixels
		# clobbers:	%eax, %ebx, %ecx, %edx, %edi, %mm1-%mm7, flags
		#		and the TL*/TR*/BL*/BR*/ADDRESS* scratch fields at (%esi)
		# preserves:	%esi (only read)
		#
		# NOTE(review): every fetch also reads the pixel to the right (+2 bytes)
		# and the pixel one line below; this presumably relies on the splatted
		# constants being 2*(width-1) and 2*(height-1) so the integer coordinate
		# never reaches the last column/row -- confirm against the setup code.

getpixelsbilin:	psrlw $1, %mm0			# convert to range 0->0x7fff [0,0.5[
		psrlw $1, %mm1
		movq %mm0, %mm2				# keep copies: one feeds the high half (integer part),
		movq %mm1, %mm3				# the other the low half (fraction) of the product
		movq V4TWOWIDTHM1(%esi), %mm4	# 2 * (width - 1)
		movq V4TWOHEIGHTM1(%esi), %mm5	# 2 * (height - 1)
		pmulhw %mm5, %mm3		# mm3 == y coord (topleft relative), high 16 bits of product
		pmulhw %mm4, %mm2		# mm2 == x coord (topleft relative), high 16 bits of product
		pmullw %mm5, %mm1		# mm1 == y frac (unsigned), low 16 bits of product
		pmullw %mm4, %mm0		# mm0 == x frac (unsigned), low 16 bits of product

						# 32 bit product y * line offset, built from two 16 bit halves
		movq %mm3, %mm5			# copy y coord 
		pmullw V4LINEOFFSET(%esi), %mm3	# low part of line offset
		pmulhw V4LINEOFFSET(%esi), %mm5	# high part of line offset

						# zero-extend the four x coords to 32 bit
		movq %mm2, %mm7			# copy x coord vector
		pxor %mm4, %mm4			# mm4 = 0
		punpcklwd %mm4, %mm2		# low part in %mm2 (pixels 1,2)
		punpckhwd %mm4, %mm7		# high part in %mm7 (pixels 3,4)
	
						# interleave low/high halves into 32 bit y offsets
		movq %mm3, %mm6			# copy
		punpcklwd %mm5, %mm3		# unpack low part in %mm3 (pixels 1,2)
		punpckhwd %mm5, %mm6		# high part in %mm6 (pixels 3,4)

		paddd %mm2, %mm3		# pixel index = y * line offset + x
		paddd %mm7, %mm6
		pslld $1, %mm3			# convert to word addresses (index * 2 bytes)
		pslld $1, %mm6

		paddd V2PLANEADDRESS(%esi), %mm3	# add pixel plane address
		paddd V2PLANEADDRESS(%esi), %mm6

		movq %mm3, ADDRESS1(%esi)	# store addresses: fills ADDRESS1 and ADDRESS2
		movq %mm6, ADDRESS3(%esi)	# fills ADDRESS3 and ADDRESS4

		pcmpeqw %mm2, %mm2		# all ones
		movq %mm0, %mm4			# copy x frac
		movq %mm1, %mm5			# copy y frac
		pxor %mm2, %mm4			# compute complement (approx 1 - frac)
		pxor %mm2, %mm5

						# halve all four weights so they fit in 0..0x7fff:
						# pmulhw below is a signed multiply
		psrlw $1, %mm0			# shift right (0.5 * (frac x)
		psrlw $1, %mm1			# shift right (0.5 * (frac y)
		psrlw $1, %mm4			# shift right (0.5 * (1 - frac x)
		psrlw $1, %mm5			# shift right (0.5 * (1 - frac y)

						# x weights are needed twice (top row and bottom row)
		movq %mm0, %mm2			# copy of frac x
		movq %mm4, %mm3			# copy of (1-frac x)
						# fetch data

		#jmp skipfetch			# seems the fetch is the real killer. try to optimize this
						# using 32 bit accesses & shifts

						# the src image data struct is padded to the cooked data struct
						# i.e. this dword is the first field of the source image struct
		movl RESAMPLEDATASIZE(%esi), %edi
		shll $1, %edi			# line offset * 2 -> byte distance to the next line

						# pixels 1 and 2: gather the 2x2 neighbourhood, one word at a time
		movl ADDRESS1(%esi), %ecx 
		movl ADDRESS2(%esi), %edx
	
		movw (%ecx), %ax		# top left
		movw (%edx), %bx
		movw %ax, TL1(%esi)
		movw %bx, TL2(%esi)
		movw 2(%ecx), %ax		# top right (next 16 bit pixel)
		movw 2(%edx), %bx
		movw %ax, TR1(%esi)
		movw %bx, TR2(%esi)

		addl %edi, %ecx			# advance both pointers one line down
		addl %edi, %edx

		movw (%ecx), %ax		# bottom left
		movw (%edx), %bx
		movw %ax, BL1(%esi)
		movw %bx, BL2(%esi)
		movw 2(%ecx), %ax		# bottom right
		movw 2(%edx), %bx
		movw %ax, BR1(%esi)
		movw %bx, BR2(%esi)

		
						# pixels 3 and 4: same gather
		movl ADDRESS3(%esi), %ecx 
		movl ADDRESS4(%esi), %edx


		movw (%ecx), %ax		# top left
		movw (%edx), %bx
		movw %ax, TL3(%esi)
		movw %bx, TL4(%esi)
		movw 2(%ecx), %ax		# top right
		movw 2(%edx), %bx
		movw %ax, TR3(%esi)
		movw %bx, TR4(%esi)
	
		addl %edi, %ecx			# one line down
		addl %edi, %edx

		movw (%ecx), %ax		# bottom left
		movw (%edx), %bx
		movw %ax, BL3(%esi)
		movw %bx, BL4(%esi)
		movw 2(%ecx), %ax		# bottom right
		movw 2(%edx), %bx
		movw %ax, BR3(%esi)
		movw %bx, BR4(%esi)

	
skipfetch:	
						# horizontal pass: weight each corner with its x weight
		pmulhw TL1(%esi), %mm4		# bilin interpolation: 0.5 * (1 - frac x) * TL
		pmulhw TR1(%esi), %mm0		# 0.5 * frac x * TR
		pmulhw BL1(%esi), %mm3		# 0.5 * (1 - frac x) * BL
		pmulhw BR1(%esi), %mm2		# 0.5 * frac x * BR


		paddw %mm4, %mm0		# mm0 = 0.5 * top row value
		paddw %mm3, %mm2		# mm2 = 0.5 * bottom row value

						# vertical pass
		pmulhw %mm5, %mm0		# top row * 0.5 * (1 - frac y)
		pmulhw %mm1, %mm2		# bottom row * 0.5 * frac y

		paddw %mm2, %mm0		# mm0 = 0.25 * interpolated pixel
		psllw $2, %mm0			# compensate for gain reduction (two halvings -> * 4)

		ret


		# Linear mapping data struct: eight 8-byte (one MMX register)
		# fields, byte offsets relative to the start of the struct.
		.equ	ROWSTATEX, 0x0
		.equ	ROWSTATEY, 0x8
		.equ	COLSTATEX, 0x10
		.equ	COLSTATEY, 0x18
		.equ	ROWINCX, 0x20
		.equ	ROWINCY, 0x28
		.equ	COLINCX, 0x30
		.equ	COLINCY, 0x38

		# Image data struct: four 32bit fields, byte offsets relative to
		# the start of the struct, followed by the struct size.
		.equ	LINEOFFSET, 0x0
		.equ	IMAGEADDRESS, 0x4
		.equ	WIDTH, 0x8
		.equ	HEIGHT, 0xC
		.equ	IMAGEDATASIZE, 0x10
		


#-----------------------------------------------------------------------
# void pixel_resample_linmap_s16(void *x)
#
# Resamples the source image into the destination image through a
# linear coordinate mapping, 4 destination pixels (one MMX vector) at a
# time, using getpixelsbilin for the bilinear fetch.
#
# ABI:   32bit, single stack argument read from 8(%ebp)
# In:    x = pointer to one contiguous block laid out as
#            cooked work area  (RESAMPLEDATASIZE bytes)
#            source image struct (IMAGEDATASIZE bytes)
#            dest image struct   (IMAGEDATASIZE bytes)
#            linear mapping data (ROWSTATEX .. COLINCY)
# Out:   nothing in registers; the destination pixels are written and
#        the ROWSTATE*/COLSTATE* fields of the mapping data are advanced
#        in place.
# Saved: %ebp, %esi, %edi, %ebx
# Clobb: %eax, %ecx, %edx, %mm0-%mm7, flags (MMX state is released with
#        emms before returning)
#
# Only width/4 full vectors are written per row; a width that is not a
# multiple of 4 leaves the trailing pixels of each row untouched.
# A destination with fewer than 4 columns or with zero rows is a no-op
# (previously the zero loop counters wrapped around in decl/jnz and the
# routine ran far past the end of the destination buffer).
#
# The state vectors hold two 32bit coordinates each; psrad $16 keeps the
# upper 16 bits of every coordinate, and ROWINC* is added twice per
# 4-pixel vector (once per pixel pair).
#
# NOTE(review): the first row starts from ROWSTATE* as found in memory,
# later rows start from COLSTATE* + COLINC*; the caller presumably
# initialises ROWSTATE* equal to COLSTATE* -- confirm against the C side.
#-----------------------------------------------------------------------
.globl pixel_resample_linmap_s16
.type  pixel_resample_linmap_s16,@function

		# offsets of the structs that follow the cooked work area
		SOURCEIMAGE = RESAMPLEDATASIZE
		DESTIMAGE = SOURCEIMAGE + IMAGEDATASIZE
		LINMAPDATA = DESTIMAGE + IMAGEDATASIZE

pixel_resample_linmap_s16:
		pushl %ebp
		movl %esp, %ebp
		pushl %esi
		pushl %edi
		pushl %ebx				# getpixelsbilin uses %bx as scratch

		movl 8(%ebp),  %esi			# get data struct
		movl DESTIMAGE+HEIGHT(%esi), %edx	# image height (row counter)
		movl DESTIMAGE+IMAGEADDRESS(%esi), %edi # dest image address
		movl DESTIMAGE+WIDTH(%esi), %ecx	# image width
		shrl $2, %ecx				# vector count (4 pixels per vector)

		# Both loops are decl/jnz loops that run at least once, so a
		# zero counter must be rejected before entering them.
		testl %ecx, %ecx
		jz .Llinmap_done			# fewer than 4 columns: nothing to do
		testl %edx, %edx
		jz .Llinmap_done			# no rows: nothing to do
		.align 16

		# Label names are historical: linmap_looprow is the entry of the
		# inner (per vector) loop, which reloads the running row state;
		# linmap_loopcol is where the outer (per row) loop re-enters with
		# the new row start coordinates already in %mm0/%mm1.
linmap_looprow:
		movq LINMAPDATA+ROWSTATEX(%esi), %mm0	# get current coordinates
		movq LINMAPDATA+ROWSTATEY(%esi), %mm1

linmap_loopcol:
		movq %mm0, %mm4				# copy
		movq %mm1, %mm5
		paddd LINMAPDATA+ROWINCX(%esi), %mm4	# increment: coords of pixels 3,4
		paddd LINMAPDATA+ROWINCY(%esi), %mm5
		movq %mm4, %mm6				# copy
		movq %mm5, %mm7
		paddd LINMAPDATA+ROWINCX(%esi), %mm6	# increment: coords of the next vector
		paddd LINMAPDATA+ROWINCY(%esi), %mm7
		movq %mm6, LINMAPDATA+ROWSTATEX(%esi)	# store next state
		movq %mm7, LINMAPDATA+ROWSTATEY(%esi)

		psrad $16, %mm0				# round to 16 bit
		psrad $16, %mm1
		psrad $16, %mm4
		psrad $16, %mm5
		packssdw %mm4, %mm0			# pack new coordinates: 4 x 16bit x
		packssdw %mm5, %mm1			# 4 x 16bit y

		pushl %ecx				# getpixelsbilin clobbers these
		pushl %edx
		pushl %edi

		call getpixelsbilin			# do interpolation, result in %mm0

		popl %edi
		popl %edx
		popl %ecx
		movq %mm0, (%edi)			# store 4 pixels
		addl $0x8, %edi				# point to next 4 pixels
		decl %ecx				# dec vector counter of this row
		jnz linmap_looprow

		# end of row: step the column state to get the next row start
		movq LINMAPDATA+COLSTATEX(%esi), %mm0	# get column state vector
		movq LINMAPDATA+COLSTATEY(%esi), %mm1
		movl DESTIMAGE+WIDTH(%esi), %ecx	# image width
		shrl $2, %ecx				# vector count (non-zero, checked on entry)
		paddd LINMAPDATA+COLINCX(%esi), %mm0	# increment
		paddd LINMAPDATA+COLINCY(%esi), %mm1
		movq %mm0, LINMAPDATA+COLSTATEX(%esi)	# store
		movq %mm1, LINMAPDATA+COLSTATEY(%esi)
		decl %edx				# dec remaining row counter
		jnz linmap_loopcol

.Llinmap_done:
		emms					# leave MMX state clean for FPU users
		popl %ebx
		popl %edi
		popl %esi
		leave
		ret
.size  pixel_resample_linmap_s16, .-pixel_resample_linmap_s16