Deano Calver
Newcomer
I'm doing some planning for VS 3.0 and one algorithm really needed a linked list, sounds like the kind of thing a GPU has no chance of doing.
Well had a play and got this code to compile, haven't actually used it yet but it compiles o.k using effect edit. Don't think there's anything wrong with the approach (except the speed )
I was also optimisating a tone-mapping process and came up with a 511 instruction shader that uses loops to execute 64900 instructions per pixel . Haven't actually tested if it actual faster yet, but theoritically it is faster than the current recursive decent process. The thing I really likem is that it involves the first optimisation I ever made way back on an Atari ST . Loops costing more than the operation was a rule back then (as now on SM 3.0), so for something like clearing the screen you would manually unroll the loop hundreds of times. This SM 3.0 repeats a few instructions 100 times to sum 100 pixel horizontally before looping back to do the next y line.
Any way some untested code to look at...
Well had a play and got this code to compile, haven't actually used it yet but it compiles o.k using effect edit. Don't think there's anything wrong with the approach (except the speed )
I was also optimisating a tone-mapping process and came up with a 511 instruction shader that uses loops to execute 64900 instructions per pixel . Haven't actually tested if it actual faster yet, but theoritically it is faster than the current recursive decent process. The thing I really likem is that it involves the first optimisation I ever made way back on an Atari ST . Loops costing more than the operation was a rule back then (as now on SM 3.0), so for something like clearing the screen you would manually unroll the loop hundreds of times. This SM 3.0 repeats a few instructions 100 times to sum 100 pixel horizontally before looping back to do the next y line.
Any way some untested code to look at...
Code:
//------- CUT HERE -------------
// VS 3.0 linked list (HLSL)
// end of list is actuall any negative number
#define END_OF_LIST -1
float TextureWidth = 256;
struct ListNode
{
// x = data index
// y = prev
// z = next
// w = unused
float4 payload;
};
sampler2D sampListNodes;
sampler2D sampData;
// converts an index into a 2D u,v coordinate
// Requires D3DTADDRESS_WARP in the U dimension to work
float2 GPUConvert1DAddressTo2D( float OneD )
{
return( OneD, OneD * (1.f / TextureWidth) );
}
void GetNode( in float nodeIndex,
out float dataIndex,
out float prevIndex,
out float nextIndex )
{
float2 nodeUV = GPUConvert1DAddressTo2D( nodeIndex );
float4 listNode = tex2Dlod( sampListNodes, float4(nodeUV,0,0) );
dataIndex = listNode.x;
prevIndex = listNode.y;
nextIndex = listNode.z;
}
// assumes startNode is a valid list node index
void WalkLinkedListForward( in float startNode )
{
float curNode = startNode;
float dataIndex, prevNode, nextNode;
do
{
// get the list node structure at curNode
GetNode( curNode, dataIndex, prevNode, nextNode );
// get the data held in the current list node
float2 dataUV = GPUConvert1DAddressTo2D( dataIndex );
float4 data = tex2Dlod( sampData, float4(dataUV,0,0) );
// do something with the data
// Process(data);
// next index
curNode = nextNode;
} while( curNode > 0 );
// loop while we have a posive index (there floats remember)
}
//------- cut here ----------
// PS 3.0 1 pass screen log accumlator
mov r0.xy, v0.xy // set uv to start of this block 1
mov r3.r, 0 // clear accumulator 1
mov r1.a, c4.a // set a = sqrt(delta) so r1.a * r1.a = delta 1
rep i2 // loop across y 3
BEGIN_REPEAT_BLOCK(100) // imaginary macro to physical repeat the code 100 times
texld r1.rgb, r0, s0 // get the pixel from the framebuffer
dp4 r2.r, r1.rgba, c3.rgba // delta + Lw(x,y)
log r2.r, r2.r // log(x)
add r3.r, r3.r, r2.r // Sum of log-luminance
add r0.x, c1.x // increment u
END_REPEAT_BLOCK // total cost of block 500
add r0.y, c1.y // increment v 1
mov r0.x, c0.x // reset u to 0 1
endrep // end y loop 2
mov 0C0.rgba, r3.xxxx // output accumulator