337 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			337 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| ; PowerPC optimized zoom for Goom
 | |
| ; © 2001-2003 Guillaume Borios
 | |
| ; This library is free software; you can redistribute it and/or
 | |
| ; modify it under the terms of the GNU Library General Public
 | |
| ; License as published by the Free Software Foundation; either
 | |
| ; version 2 of the License, or (at your option) any later version.
 | |
| ;
 | |
| ; This library is distributed in the hope that it will be useful,
 | |
| ; but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
| ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | |
| ; Library General Public License for more details.
 | |
| ;
 | |
| ; You should have received a copy of the GNU Library General Public
 | |
| ; License along with this library; if not, write to the
 | |
| ; Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 | |
| ; Boston, MA 02110-1301, USA.
 | |
| 
 | |
| ; Change log :
 | |
| ; 21 Dec 2003 : Use of altivec is now determined with a parameter
 | |
| 
 | |
| ; Section definition : We use a read only section
 | |
| .text
 | |
| 
 | |
| ; names of the functions to call by C program : ppc_zoom_generic and ppc_zoom_G4
 | |
| ; We declare these labels as globals to extend their scope outside this file
 | |
| .globl _ppc_zoom_generic
 | |
| .globl _ppc_zoom_G4
 | |
| 
 | |
| ; Description :
 | |
| ; This routine dynamically computes and applies a zoom filter
 | |
| 
 | |
| ; parameters :
 | |
| ; r3  <=> unsigned int sizeX (in pixels)
 | |
| ; r4  <=> unsigned int sizeY (in pixels)
 | |
| ; r5  <=> unsigned int * frompixmap
 | |
| ; r6  <=> unsigned int * topixmap
 | |
| ; r7  <=> unsigned int * brutS
 | |
| ; r8  <=> unsigned int * brutD
 | |
| ; r9  <=> unsigned int buffratio
 | |
| ; r10 <=> int [16][16] precalccoeffs
 | |
| 
 | |
| ; globals after init
 | |
| ; r3  <=> ax = x max in 16th of pixels = (sizeX - 1) * 16 (replaces old r3)
 | |
| ; r4  <=> ay = y max in 16th of pixels = (sizeY - 1) * 16 (replaces old r4)
 | |
| ; r5  <=> frompixmap (unchanged, used as base address for the source pixel loads)
 | |
| ; r6  <=> topixmap - 4 bytes (one pixel) : it is advanced by 4 before each pixel is stored
 | |
| ; r11 <=> precalccoeffs (copied from r10, which is reused as a scratch register)
 | |
| ; r12 <=> 0xFF00FF (mask for parallel 32 bits pixs computing)
 | |
| ; r18 <=> 0 (black), the value stored for out of range pixels
 | |
| ; r19 <=> row size in pixels
 | |
| ; r20 <=> row size in bytes
 | |
| ; r30 <=> brutD (copied from r8), advanced by 8 bytes per computed pixel
 | |
| ; r31 <=> brutS (copied from r7), advanced by 8 bytes per computed pixel
 | |
| 
 | |
| ; ABI notes :
 | |
| ; r1 is the Stack Pointer (SP) => Do not use
 | |
| ; r13..r31 are non-volatiles => Save them before use and restore them before returning
 | |
| 
 | |
;-----------------------------------------------------------------------
; _ppc_zoom_generic
;
; Computes sizeX * sizeY destination pixels.  For each pixel, a source
; position (px,py), expressed in 16th of pixels, is interpolated between
; the matching entries of brutS and brutD, then the destination colour is
; the weighted sum of the 2x2 source pixels around that position.
;
; In (32-bit PowerPC, Mach-O calling convention) :
;   r3  = sizeX (pixels)          r4  = sizeY (pixels)
;   r5  = frompixmap              r6  = topixmap
;   r7  = brutS                   r8  = brutD
;   r9  = buffratio               r10 = precalccoeffs (int [16][16])
; Out : topixmap filled, no return value.
;
; Registers inside the loop :
;   r2/r29  px/py      r8/r10  px2/py2, then scratch
;   r11     precalccoeffs          r12  0x00FF00FF
;   r18     0 (black)              r19/r20 row size in pixels/bytes
;   r21-r24 coef1..coef4           r25-r28 col1..col4
;   r30     walks brutD            r31  walks brutS
;
; Clobbers : r2-r12, ctr, cr0, cr6, cr7 (all volatile).
; r18..r31 are saved in / restored from the red zone below r1; this is
; only valid because the routine calls nothing (leaf function).
; The x range test uses cr6 rather than cr4, because cr2..cr4 are
; non-volatile and this routine does not save the condition register.
;-----------------------------------------------------------------------
_ppc_zoom_generic:

; Saves the used non volatile registers (r18..r31 = 14 words = 56 bytes)
; in the Mach-O stack's Red-Zone
	stmw	r18,-56(r1)

; init
	li	r18,0			; Default value if out of range : 0 (Black)
	mr	r11,r10			; r11 <- precalccoeffs (r10 becomes scratch)
	lis	r12,0xFF
	mullw	r2,r3,r4		; Number of pixels to compute
	cmplwi	cr0,r2,0		; Nothing to draw ? A zero count would make
	beq-	cr0,L3			; bdnz wrap around and loop 2^32 times
	subi	r30,r8,0		; r30 <- brutD
	slwi	r20,r3,2		; r20 <- row size in bytes
	srawi	r19,r20,2		; r19 <- row size in pixels
	ori	r12,r12,0xFF		; r12 <- 0x00FF00FF
	subi	r3,r3,1
	subi	r4,r4,1
	mtspr	ctr,r2			; Init the loop count (one loop per pixel computed)
	subi	r31,r7,0		; r31 <- brutS
	subi	r6,r6,4			; stwu pre-increments r6 by 4 before each store
	slwi	r3,r3,4			; r3 <- ax = (sizeX - 1) * 16
	slwi	r4,r4,4			; r4 <- ay = (sizeY - 1) * 16

; pre init for loop : position pair of the first pixel
	lwz	r2,0(r31)		; px
	lwz	r29,4(r31)		; py
	lwz	r8,0(r30)		; px2
	lwz	r10,4(r30)		; py2

	b	L1
.align	5
L1:

; computes dynamically the position to fetch :
;   px += ((px2 - px) * buffratio) >> 16     (same for py)
	sub	r8,r8,r2
	sub	r10,r10,r29
	mullw	r8,r8,r9
	addi	r31,r31,8		; next brutS entry
	mullw	r10,r10,r9
	addi	r30,r30,8		; next brutD entry

	srawi	r8,r8,16
	srawi	r10,r10,16
	add	r2,r2,r8
	add	r29,r29,r10

; if px>=ax or py>=ay goto outofrange (unsigned compares, so negative
; positions are rejected as well)
; computes the attenuation coeffs and the original point address
	rlwinm	r10,r2,6,28-6,31-6	; r10 <- (r2 << 6) & 0x000003C0   (r10 = (px%16)*16*4)
	cmpl	cr6,0,r2,r3
	rlwimi	r10,r29,2,28-2,31-2	; r10 <- ((r29 << 2) & 0x0000003C) | (r10 & ~0x0000003C)   (r10 |= (py%16)*4)
	cmpl	cr7,0,r29,r4
	srawi	r29,r29,4		; pos computing : y in pixels
	bge-	cr6,L4
	srawi	r2,r2,4			; pos computing : x in pixels
	mullw	r29,r29,r19		; pos computing : y * row size in pixels
	bge-	cr7,L4

; Channels notation : 00112233 (AARRVVBB)

	add	r2,r2,r29		; pos computing
	lwzx	r10,r11,r10		; Loads coefs : precalccoeffs[px%16][py%16]
	slwi	r2,r2,2			; pos computing : pixel index -> byte offset
	add	r2,r2,r5		; pos computing : address in frompixmap
	rlwinm	r21,r10,0,24,31		; Isolates coef1 (??????11 -> 00000011)
	lwz	r25,0(r2)		; Loads col1 -> r25
	lwz	r26,4(r2)		; Loads col2 -> r26
	rlwinm	r22,r10,24,24,31	; Isolates coef2 (????22?? -> 00000022)
	rlwinm	r23,r10,16,24,31	; Isolates coef3 (??33???? -> 00000033)
	add	r2,r2,r20		; Adds one line for future load of col3 and col4
	and	r8,r25,r12		; Masks col1 channels 1 & 3 : 0x00XX00XX
	rlwinm	r24,r10,8,24,31		; Isolates coef4 (44?????? -> 00000044)
	andi.	r25,r25,0xFF00		; Masks col1 channel 2 : 0x0000XX00
	mullw	r8,r8,r21		; Applies coef1 on col1 channels 1 & 3

; computes final pixel color
; (the px/py/px2/py2 loads of the next iteration are interleaved below;
;  on the last iteration they read the entry following the last one used)
	and	r10,r26,r12		; Masks col2 channels 1 & 3 : 0x00XX00XX
	lwz	r27,0(r2)		; Loads col3 -> r27
	mullw	r10,r10,r22		; Applies coef2 on col2 channels 1 & 3
	mullw	r25,r25,r21		; Applies coef1 on col1 channel 2
	andi.	r29,r26,0xFF00		; Masks col2 channel 2 : 0x0000XX00
	mullw	r29,r29,r22		; Applies coef2 on col2 channel 2
	lwz	r28,4(r2)		; Loads col4 -> r28
	add	r8,r8,r10		; Adds col1 & col2 channels 1 & 3
	and	r10,r27,r12		; Masks col3 channels 1 & 3 : 0x00XX00XX
	add	r25,r25,r29		; Adds col1 & col2 channel 2
	mullw	r10,r10,r23		; Applies coef3 on col3 channels 1 & 3
	andi.	r29,r27,0xFF00		; Masks col3 channel 2 : 0x0000XX00
	mullw	r29,r29,r23		; Applies coef3 on col3 channel 2
	lwz	r2,0(r31)		; px (next pixel)
	add	r7,r8,r10		; Adds col3 to (col1 + col2) channels 1 & 3
	and	r10,r28,r12		; Masks col4 channels 1 & 3 : 0x00XX00XX
	mullw	r10,r10,r24		; Applies coef4 on col4 channels 1 & 3
	add	r25,r25,r29		; Adds col 3 to (col1 + col2) channel 2
	lwz	r8,0(r30)		; px2 (next pixel)
	andi.	r28,r28,0xFF00		; Masks col4 channel 2 : 0x0000XX00
	add	r7,r7,r10		; Adds col4 to (col1 + col2 + col3) channels 1 & 3
	lwz	r10,4(r30)		; py2 (next pixel)
	mullw	r28,r28,r24		; Applies coef4 on col4 channel 2
	srawi	r7,r7,8			; (sum of channels 1 & 3) >> 8 (arithmetic : channel 0 gets the sign of the sum)
	lwz	r29,4(r31)		; py (next pixel)
	add	r25,r25,r28		; Adds col 4 to (col1 + col2 + col3) channel 2
	rlwimi	r7,r25,24,16,23		; (((sum of channels 2) >> 8 ) & 0x0000FF00) | ((sum of channels 1 and 3) & 0xFFFF00FF)
	stwu	r7,4(r6)		; Stores the computed pixel
	bdnz	L1			; Iterate again if needed
	b	L3	;goto end	; If not, returns from the function

; if out of range : stores black and loads the next position pair
L4:
	stwu	r18,4(r6)
	lwz	r8,0(r30)		; px2
	lwz	r10,4(r30)		; py2
	lwz	r2,0(r31)		; px
	lwz	r29,4(r31)		; py
	bdnz	L1

L3:

; Restore saved registers and return
	lmw	r18,-56(r1)
	blr
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
;-----------------------------------------------------------------------
; _ppc_zoom_G4
;
; Same zoom filter as the generic routine, with two cache hints added :
;   - dcbz on the destination each time the store address is 32-byte
;     aligned, so the destination block is established in the cache
;     without being read from memory;
;   - AltiVec dst (data stream touch) prefetches on brutD, brutS and on
;     the second source row.
; NOTE(review): dcbz clears a whole cache block starting at r6; this
; presumably assumes 32-byte cache blocks and a destination buffer that
; covers every block it touches -- confirm against the callers.
;
; In (32-bit PowerPC, Mach-O calling convention) :
;   r3  = sizeX (pixels)          r4  = sizeY (pixels)
;   r5  = frompixmap              r6  = topixmap
;   r7  = brutS                   r8  = brutD
;   r9  = buffratio               r10 = precalccoeffs (int [16][16])
; Out : topixmap filled, no return value.
;
; Registers inside the loop :
;   r2/r29  px/py      r8/r10  px2/py2, then scratch
;   r11     precalccoeffs          r12  0x00FF00FF
;   r17     dst control word       r18  0 (black)
;   r19/r20 row size in pixels/bytes
;   r21-r24 coef1..coef4           r25-r28 col1..col4
;   r30     walks brutD            r31  walks brutS
;
; Clobbers : r0, r2-r12, ctr, cr0, cr6, cr7 (all volatile).
; r17..r31 are saved in / restored from the red zone below r1; this is
; only valid because the routine calls nothing (leaf function).
; The x range test uses cr6 rather than cr4, because cr2..cr4 are
; non-volatile and this routine does not save the condition register.
;-----------------------------------------------------------------------
_ppc_zoom_G4:

; Saves the used non volatile registers (r17..r31 = 15 words = 60 bytes)
; in the Mach-O stack's Red-Zone
	stmw	r17,-60(r1)

; init
	li	r18,0			; Default value if out of range : 0 (Black)
	mr	r11,r10			; r11 <- precalccoeffs (r10 becomes scratch)
	lis	r12,0xFF
	mullw	r2,r3,r4		; Number of pixels to compute
	cmplwi	cr0,r2,0		; Nothing to draw ? A zero count would make
	beq-	cr0,L300		; bdnz wrap around and loop 2^32 times
	subi	r30,r8,0		; r30 <- brutD
	slwi	r20,r3,2		; r20 <- row size in bytes
	srawi	r19,r20,2		; r19 <- row size in pixels
	ori	r12,r12,0xFF		; r12 <- 0x00FF00FF
	subi	r3,r3,1
	subi	r4,r4,1
	mtspr	ctr,r2			; Init the loop count (one loop per pixel computed)
	subi	r31,r7,0		; r31 <- brutS
	subi	r6,r6,4			; the loop adds 4 to r6 before each store
	slwi	r3,r3,4			; r3 <- ax = (sizeX - 1) * 16
	slwi	r4,r4,4			; r4 <- ay = (sizeY - 1) * 16

; pre init for loop : position pair of the first pixel
	lwz	r2,0(r31)		; px
	lwz	r29,4(r31)		; py
	lwz	r8,0(r30)		; px2
	lwz	r10,4(r30)		; py2

;*********************
	lis	r17,0x0F01		; dst control word 0x0F010000 : block size 15 vectors, 1 block, stride 0

	b	L100
.align	5
L100:

	addi	r6,r6,4			; r6 <- address of the pixel to store

; Optimization to ensure the destination buffer
; won't be loaded into the data cache
	rlwinm.	r0,r6,0,27,31		; r0 <- r6 & 0x1F : zero when r6 is 32-byte aligned
	bne+	L500
	dcbz	0,r6
L500:

; computes dynamically the position to fetch :
;   px += ((px2 - px) * buffratio) >> 16     (same for py)
	sub	r8,r8,r2
	sub	r10,r10,r29
	mullw	r8,r8,r9
	addi	r31,r31,8		; next brutS entry
	mullw	r10,r10,r9
	addi	r30,r30,8		; next brutD entry

	dst	r30,r17,0		; prefetch hint on brutD (stream 0)

	srawi	r8,r8,16
	srawi	r10,r10,16
	add	r2,r2,r8
	add	r29,r29,r10

	dst	r31,r17,1		; prefetch hint on brutS (stream 1)

; if px>=ax or py>=ay goto outofrange (unsigned compares, so negative
; positions are rejected as well)
; computes the attenuation coeffs and the original point address
	rlwinm	r10,r2,6,28-6,31-6	; r10 <- (r2 << 6) & 0x000003C0   (r10 = (px%16)*16*4)
	cmpl	cr6,0,r2,r3
	rlwimi	r10,r29,2,28-2,31-2	; r10 <- ((r29 << 2) & 0x0000003C) | (r10 & ~0x0000003C)   (r10 |= (py%16)*4)
	cmpl	cr7,0,r29,r4
	srawi	r29,r29,4		; pos computing : y in pixels
	bge-	cr6,L400
	srawi	r2,r2,4			; pos computing : x in pixels
	mullw	r29,r29,r19		; pos computing : y * row size in pixels
	bge-	cr7,L400

; Channels notation : 00112233 (AARRVVBB)

	add	r2,r2,r29		; pos computing
	lwzx	r10,r11,r10		; Loads coefs : precalccoeffs[px%16][py%16]
	slwi	r2,r2,2			; pos computing : pixel index -> byte offset
	add	r2,r2,r5		; pos computing : address in frompixmap
	rlwinm	r21,r10,0,24,31		; Isolates coef1 (??????11 -> 00000011)
	lwz	r25,0(r2)		; Loads col1 -> r25
	lwz	r26,4(r2)		; Loads col2 -> r26
	rlwinm	r22,r10,24,24,31	; Isolates coef2 (????22?? -> 00000022)
	rlwinm	r23,r10,16,24,31	; Isolates coef3 (??33???? -> 00000033)
	add	r2,r2,r20		; Adds one line for future load of col3 and col4
	and	r8,r25,r12		; Masks col1 channels 1 & 3 : 0x00XX00XX
	rlwinm	r24,r10,8,24,31		; Isolates coef4 (44?????? -> 00000044)
	dst	r2,r17,2		; prefetch hint on the second source row (stream 2)
	rlwinm	r25,r25,0,16,23		; Masks col1 channel 2 : 0x0000XX00 (rlwinm : no CR0 update, unlike andi.)
	mullw	r8,r8,r21		; Applies coef1 on col1 channels 1 & 3

; computes final pixel color
; (the px/py/px2/py2 loads of the next iteration are interleaved below;
;  on the last iteration they read the entry following the last one used)
	and	r10,r26,r12		; Masks col2 channels 1 & 3 : 0x00XX00XX
	lwz	r27,0(r2)		; Loads col3 -> r27
	mullw	r10,r10,r22		; Applies coef2 on col2 channels 1 & 3
	mullw	r25,r25,r21		; Applies coef1 on col1 channel 2
	rlwinm	r29,r26,0,16,23		; Masks col2 channel 2 : 0x0000XX00
	mullw	r29,r29,r22		; Applies coef2 on col2 channel 2
	lwz	r28,4(r2)		; Loads col4 -> r28
	add	r8,r8,r10		; Adds col1 & col2 channels 1 & 3
	and	r10,r27,r12		; Masks col3 channels 1 & 3 : 0x00XX00XX
	add	r25,r25,r29		; Adds col1 & col2 channel 2
	mullw	r10,r10,r23		; Applies coef3 on col3 channels 1 & 3
	rlwinm	r29,r27,0,16,23		; Masks col3 channel 2 : 0x0000XX00
	mullw	r29,r29,r23		; Applies coef3 on col3 channel 2
	lwz	r2,0(r31)		; px (next pixel)
	add	r7,r8,r10		; Adds col3 to (col1 + col2) channels 1 & 3
	and	r10,r28,r12		; Masks col4 channels 1 & 3 : 0x00XX00XX
	mullw	r10,r10,r24		; Applies coef4 on col4 channels 1 & 3
	add	r25,r25,r29		; Adds col 3 to (col1 + col2) channel 2
	lwz	r8,0(r30)		; px2 (next pixel)
	rlwinm	r28,r28,0,16,23		; Masks col4 channel 2 : 0x0000XX00
	add	r7,r7,r10		; Adds col4 to (col1 + col2 + col3) channels 1 & 3
	lwz	r10,4(r30)		; py2 (next pixel)
	mullw	r28,r28,r24		; Applies coef4 on col4 channel 2
	srawi	r7,r7,8			; (sum of channels 1 & 3) >> 8 (arithmetic : channel 0 gets the sign of the sum)
	lwz	r29,4(r31)		; py (next pixel)
	add	r25,r25,r28		; Adds col 4 to (col1 + col2 + col3) channel 2
	rlwimi	r7,r25,24,16,23		; (((sum of channels 2) >> 8 ) & 0x0000FF00) | ((sum of channels 1 and 3) & 0xFFFF00FF)
	stw	r7,0(r6)		; Stores the computed pixel
	bdnz	L100			; Iterate again if needed
	b	L300	;goto end	; If not, returns from the function

; if out of range : stores black and loads the next position pair
L400:
	stw	r18,0(r6)
	lwz	r8,0(r30)		; px2
	lwz	r10,4(r30)		; py2
	lwz	r2,0(r31)		; px
	lwz	r29,4(r31)		; py
	bdnz	L100

L300:

; Restore saved registers and return
	lmw	r17,-60(r1)
	blr
 |