IIRC, given the simplicity of SM2.0 looping, it may not be necessary, perhaps apart from very small loops.
tcchiu said: Does the ATI or nVidia driver unroll the loops (SM 2.0 flow control)?
If the loops can be unrolled, they are. Basically, any loop that doesn't depend upon per-vertex or per-pixel information is unrolled. This is simply because it is typically assumed that many pixels and vertices will be drawn per branch, and the branching itself will incur a performance hit. Thus it makes more sense to just eat that little bit of extra data swapping that is needed when the new shader is loaded, instead of eating some constant performance hit for each pixel/vertex.
tcchiu said: Does the ATI or nVidia driver unroll the loops (SM 2.0 flow control)?
Well, not for pixel shaders, but for vertex shaders there seems to be...
DemoCoder said: It is the HLSL compiler in SM2.0 which unrolls the loops, not the driver. There is no loop instruction in SM2.0 assembly.
Ostsol said: Well, not for pixel shaders, but for vertex shaders there seems to be...
DemoCoder said: It is the HLSL compiler in SM2.0 which unrolls the loops, not the driver. There is no loop instruction in SM2.0 assembly.
VS 2.0 Instructions
// Minimal pixel shader used to see how the compiler treats a fixed-count loop:
// eight times over, the coordinate is incremented by 3.7x its own reversed
// (wzyx) swizzle, and the accumulated value is returned as the output colour.
float4 main(float4 texCoord : TEXCOORD0) : COLOR
{
    const float kScale = 3.7;

    // [loop] asks the compiler to keep this as a real loop rather than unroll it.
    [loop]
    for (int pass = 0; pass < 8; ++pass)
    {
        texCoord = texCoord + kScale * texCoord.wzyx;
    }

    return texCoord;
}
// Compiled ps_3_0 listing in which the 8-iteration loop has been fully
// unrolled: one mad per iteration, eight in total.
ps_3_0
// c0.x holds the 3.7 scale factor
def c0, 3.70000005, 0, 0, 0
// v0 is the incoming texture coordinate
dcl_texcoord v0
// first iteration reads the input register directly: r0 = v0.wzyx * 3.7 + v0
mad r0, v0.wzyx, c0.x, v0
// iterations 2-7 accumulate in r0: r0 = r0.wzyx * 3.7 + r0
mad r0, r0.wzyx, c0.x, r0
mad r0, r0.wzyx, c0.x, r0
mad r0, r0.wzyx, c0.x, r0
mad r0, r0.wzyx, c0.x, r0
mad r0, r0.wzyx, c0.x, r0
mad r0, r0.wzyx, c0.x, r0
// last iteration writes straight to the colour output
mad oC0, r0.wzyx, c0.x, r0
// Compiled ps_3_0 listing in which the loop is preserved as a rep/endrep block
// instead of being unrolled.
ps_3_0
// c0.y holds the 3.7 scale factor
def c0, 0, 3.70000005, 0, 0
// i0.x = 8: the repeat count consumed by rep
defi i0, 8, 0, 0, 0
// v0 is the incoming texture coordinate
dcl_texcoord v0
// copy the input into a temporary so the loop body can accumulate into it
mov r0, v0
rep i0
// loop body: r0 = r0.wzyx * 3.7 + r0
mad r0, r0.wzyx, c0.y, r0
endrep
// write the accumulated value to the colour output
mov oC0, r0
Does the ATI or nVidia driver unroll the loops (SM 2.0 flow control)?
Yes, but the vendor compilers will sometimes unroll the loop themselves if it's statically analyzable... Making it dynamic will avoid this, unless they recompile on parameter changes.
Use the [loop] attribute on loops and [branch] on branches.
I think you'll have to use a recent version of the SDK though for the compiler to support these [] attributes.
// Excerpt: accumulates diffuse and specular contributions of the two
// view-space lights for the current pixel. `input` is the pixel shader's
// interpolated input declared outside this excerpt.
float diffuse_light = 0;
float specular_light = 0;
float3 view_normal = normalize( input.ViewNormal );
[loop]
for ( int i = 0; i < 2; i++ )
{
    float3 view_light = normalize( input.ViewLights[ i ] );
    diffuse_light += saturate( dot( view_normal, view_light ));
    // Half vector between the light direction and the direction back towards the viewer.
    float3 halfvector = normalize( view_light - normalize( input.Position.xyz ));
    // Clamp before pow(): the dot product is negative when the half vector
    // faces away from the normal, and pow() with a negative base is undefined.
    specular_light += pow( saturate( dot( view_normal, halfvector )), 64.0 );
}
Yes, but the vendor compilers will sometimes unroll the loop themselves if it's statically analyzable... Making it dynamic will avoid this, unless they recompile on parameter changes.
Any ideas why I get this error message?
// Per-vertex input consumed by the vertex shader.
struct VS_INPUT
{
// Vertex position; transformed by WorldViewTransform in the vertex shader.
float4 Position : POSITION0;
// Vertex normal, in the same space as Position.
float3 Normal : NORMAL0;
};
// Values written by the vertex shader and read (interpolated) by the pixel shader.
struct VS_OUTPUT
{
// Position after WorldViewTransform and Projection.
float4 ProjPosition : POSITION;
// Position after WorldViewTransform only; the pixel shader uses it for the
// view direction and for distance fog.
float4 Position : TEXCOORD0;
// Input normal passed through untransformed.
float3 Normal : TEXCOORD1;
// Normal transformed by WorldViewTransform.
float3 ViewNormal : TEXCOORD2;
// One transformed light vector per light source.
// NOTE(review): the two elements presumably occupy TEXCOORD3 and TEXCOORD4,
// which is why the next member starts at TEXCOORD5 - confirm.
float3 ViewLights[ 2 ] : TEXCOORD3;
// One texture coordinate set per axis-rotated projection of the position.
float2 TexCoords[ 3 ] : TEXCOORD5;
};
// Pixel shader result: a single colour bound to the COLOR0 output.
struct PS_OUTPUT
{
// Final lit, textured and fogged colour.
float4 Color : COLOR0;
};
// Shader constants, each pinned to an explicit constant register.
// Applied to the already WorldViewTransform-ed position to get the output position.
float4x4 Projection : register( c0 );
// Applied to positions, normals and light vectors in the vertex shader.
float4x4 WorldViewTransform : register( c4 );
// The two colours blended by the sampled texture value.
float4 Material_Color1 : register( c8 );
float4 Material_Color2 : register( c9 );
// Specular colour of the material.
float4 Material_Specular : register( c10 );
// One vector per light source; transformed by WorldViewTransform per vertex.
float3 LightVectors[ 2 ] : register( c11 );
// Four unrelated scalars packed into one register; use the macros below
// rather than the raw components.
float4 Reg13 : register( c13 );
// Added to the position component when animating texture coordinates.
#define Time Reg13.x
// Scalar weights of the ambient, diffuse and specular terms in the final colour.
#define Global_Ambient Reg13.y
#define Global_Diffuse Reg13.z
#define Global_Specular Reg13.w
// xyz: per-axis texture scale (rotated once per projection);
// w: scale applied to the (Time + position) offset.
float4 TexScale : register( c14 );
// --------------------------------------------------------------------------------------------------------------------
// VertexShader
// --------------------------------------------------------------------------------------------------------------------
#ifdef VERTEXSHADER
// Vertex shader entry point.
//
// Transforms the vertex position by WorldViewTransform and Projection, passes
// the normal through both untransformed and transformed, transforms the two
// light vectors, and generates three texture coordinate sets by projecting the
// position onto three axis-rotated planes.
//
// input:   per-vertex position and normal.
// returns: the VS_OUTPUT record consumed by the pixel shader.
VS_OUTPUT __VertexShader( VS_INPUT input )
{
    VS_OUTPUT output;

    // Position & normal
    output.Position = mul( input.Position, WorldViewTransform );
    output.ProjPosition = mul( output.Position, Projection );
    output.Normal = input.Normal;
    // Normals and light vectors are float3 directions: state the 3x3 part of
    // the transform explicitly instead of relying on an implicit size
    // conversion between float3 and float4x4.
    output.ViewNormal = mul( input.Normal, (float3x3)WorldViewTransform );

    // Light sources
    // Each loop uses its own counter name: some HLSL compilers place a for-loop
    // variable in the enclosing scope, so reusing one name in two loops conflicts.
    [unroll]
    for ( int light = 0; light < 2; light++ )
    {
        output.ViewLights[ light ] = mul( LightVectors[ light ], (float3x3)WorldViewTransform );
    }

    // Generate texcoords
    // Work on local copies: TexScale is a global shader constant and must not
    // be assigned to, and rotating a local leaves the input untouched as well.
    float3 position = input.Position.xyz;
    float4 tex_scale = TexScale;
    // Kept unrolled as in the original, whose author noted that indexed
    // l-values were otherwise rejected by the compiler.
    [unroll]
    for ( int plane = 0; plane < 3; plane++ )
    {
        output.TexCoords[ plane ] = tex_scale.xy * position.xy;
        output.TexCoords[ plane ] += tex_scale.w * ( Time + position.z );
        // Rotate the axes so the next iteration projects onto the next plane.
        position = position.yzx;
        tex_scale.xyz = tex_scale.yzx;
    }

    return output;
}
#endif
// --------------------------------------------------------------------------------------------------------------------
// PixelShader
// --------------------------------------------------------------------------------------------------------------------
#ifdef PIXELSHADER
// The two textures that are multiplied together for every projection.
sampler2D Texture1;
sampler2D Texture2;

// Pixel shader entry point.
//
// Accumulates diffuse and specular lighting from the two light vectors, blends
// three planar texture projections weighted by the normal, combines them with
// the material colours and global lighting weights, and applies distance fog.
//
// input:   interpolated vertex shader output.
// returns: PS_OUTPUT holding the final colour.
PS_OUTPUT __PixelShader( VS_OUTPUT input )
{
    PS_OUTPUT output;

    // Light sources
    float diffuse_light = 0;
    float specular_light = 0;
    float3 normal = normalize( input.Normal );
    float3 view_normal = normalize( input.ViewNormal );
    // Direction from the surface back towards the viewer; identical for every
    // light, so it is computed once outside the loop.
    float3 view_dir = -normalize( input.Position.xyz );
    // Each loop uses its own counter name: some HLSL compilers place a for-loop
    // variable in the enclosing scope, so reusing one name in two loops conflicts.
    [loop]
    for ( int light = 0; light < 2; light++ )
    {
        float3 view_light = normalize( input.ViewLights[ light ] );
        float3 halfvector = normalize( view_light + view_dir );
        // Clamp before pow(): the dot product can be negative, and pow() with
        // a negative base is undefined.
        specular_light += pow( saturate( dot( view_normal, halfvector )), 64.0 );
        diffuse_light += saturate( dot( view_normal, view_light ));
    }

    // Material: texture & colors
    float4 texture_map = 0;
    [loop]
    for ( int plane = 0; plane < 3; plane++ )
    {
        float4 texture_1 = tex2D( Texture1, input.TexCoords[ plane ] );
        float4 texture_2 = tex2D( Texture2, input.TexCoords[ plane ] );
        // Weight each projection by the normal component along its projection
        // axis. The rotated local `normal` is used here; the original read
        // input.Normal.z, which gave all three projections the same weight and
        // left the rotation below without effect.
        texture_map += texture_1 * texture_2 * abs( normal.z );
        // Rotate in step with the axis rotation applied in the vertex shader.
        normal.xyz = normal.yzx;
    }
    float4 diffuse_color =
        lerp( Material_Color1, Material_Color2, texture_map );
    float4 specular_color =
        Material_Specular * texture_map * saturate( -view_normal.z );

    // Compute final color
    output.Color =
        Global_Ambient * diffuse_color +
        Global_Diffuse * diffuse_color * diffuse_light +
        Global_Specular * specular_color * specular_light;

    // Per-pixel distance-fog
    output.Color = lerp( output.Color, 0.933333333, saturate( abs( input.Position.z ) * 0.02f ));
    return output;
}
#endif