alephnull on 26 May 2009
Here is an ascii step-by-step of what's going on in the loop
x[] = | b3/c3 | b2/c2 | b1/c1 | b0/c0 |
g[] = | a3>0 | a2>0 | a1>0 | a0>0 | //guards
y[] = | !(a3>0) & a3 | !(a2>0) & a2 | !(a1>0) & a1 | !(a0>0) & a0 | //mask1
z[] = | (a3>0) & x3 | (a2>0) & x2 | (a1>0) & x1 | (a0>0) & x0 | //mask2
a[] = | y3|z3 | y2|z2 | y1|z1 | y0|z0 | //combine: mask1 OR mask2
And here's the same thing in assembly (in GNU syntax) if anyone wants it.
# Vectorised form of:  for each i: if (a[i] > 0) a[i] = b[i] / c[i];
# Four packed single-precision floats are handled per iteration
# (32-bit x86, SSE, GNU/AT&T operand order: op src, dst).
#
# In:    N        = element count, loaded from the memory location N;
#                   must be non-negative with 4*N representable as a signed 32-bit value
#        a, b, c  = float arrays; they are accessed with movaps/divps memory
#                   operands, so each must be 16-byte aligned
# Out:   a[] updated in place for the first (N rounded down to a multiple of 4)
#        elements; a tail of 1-3 leftover elements is NOT processed here
# Clobb: eax, edi, xmm0-xmm3, flags
loopInit:
        xor     %eax, %eax              # eax = byte offset into the arrays, starts at 0
        mov     N, %edi                 # edi = N (number of floats)
        and     $-4, %edi               # round N down to a whole number of 4-float groups
        shl     $2, %edi                # edi = byte limit (4 bytes per float); same unit as eax,
                                        # which advances 16 bytes per iteration
        cmp     %edi, %eax              # flags <- eax - edi
        jge     loopEnd                 # skip the loop when there is no complete group (N < 4)
loop:
        # Load the current group and compute the candidate result b/c
        movaps  a(%eax), %xmm0          # xmm0 = a[i..i+3]
        movaps  b(%eax), %xmm2          # xmm2 = b[i..i+3]
        divps   c(%eax), %xmm2          # xmm2 = b / c, lane by lane

        # Build the guard g = (a > 0): all 1s in lanes where it holds, all 0s elsewhere
        xorps   %xmm1, %xmm1            # xmm1 = 0.0 in every lane
        cmpltps %xmm0, %xmm1            # xmm1 = (0 < a) per lane
                                        # e.g. a0..a2 > 0 and a3 <= 0 gives
                                        # xmm1 = 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
                                        # (lane 3 is the most significant 32 bits)

        # Branch-free select: result = (~g & a) | (g & b/c)
        movaps  %xmm1, %xmm3            # xmm3 = copy of g (andnps overwrites its destination)
        andnps  %xmm0, %xmm3            # mask 1: xmm3 = ~g & a    (old a where a > 0 is false)
        andps   %xmm1, %xmm2            # mask 2: xmm2 =  g & b/c  (quotient where a > 0)
        orps    %xmm2, %xmm3            # combine the two masked halves

        # Store the result back into a and advance to the next group
        movaps  %xmm3, a(%eax)
        add     $16, %eax               # next group: 4 floats = 16 bytes
        cmp     %edi, %eax              # flags <- eax - edi
        jl      loop                    # keep going while byte offset < byte limit
loopEnd:







