Chalnoth said:
Were the moves used to reduce the number of registers used? If so, then this would be in line with what is expected when optimizing for the NV3x architecture.
It's also why I agree with one very specific part of Cg: it has the capability for hardware-specific compiler targets.
Well, I might as well post the code here, since it's not that secret: this is the delux mapping shader from Tenebrae 2.
The original HLSL/Cg code (the same compiles for both):
// Interpolated per-pixel inputs consumed by main(). Each field is bound to
// one input semantic; main() reads every field.
// NOTE(review): norVec arrives through COLOR0 rather than a TEXCOORD.
// Color interpolators are commonly range-limited on ps_2_0-class hardware,
// so confirm how the vertex stage encodes the normal before relying on
// negative components.
struct inputVertex
{
float3 norVec : COLOR0; // used as the normal (third basis row) in main()
float3 tanVec : TEXCOORD0; // used as the tangent (first basis row) in main()
float3 binVec : TEXCOORD1; // used as the binormal (second basis row) in main()
float2 deLuxCoord : TEXCOORD2; // lookup coordinate for deLuxMap
float2 texCoord : TEXCOORD3; // lookup coordinate for normalMap, baseMap and lightMap
float3 position: TEXCOORD4; // subtracted from lightposition and eyeposition in main()
};
// Shader constants. main() forms (lightposition - I.position) and
// (eyeposition - I.position) from these to build its half vector.
// Presumably both are expressed in the same space as I.position — confirm
// against the code that uploads them.
uniform float3 eyeposition;
uniform float3 lightposition;
// Delux-mapping pixel shader.
//
// Computes ((base * diffuse) + specular) * lightmap, where the diffuse term
// uses a light direction fetched from deLuxMap and the specular term uses a
// half vector built from lightposition / eyeposition. Both directions are
// projected onto the interpolated (tangent, binormal, normal) basis before
// being dotted with the normal-map normal.
//
// Parameters:
//   I            - interpolated inputs (see inputVertex).
//   tangentCube  - declared but never sampled in this body.
//   binormalCube - declared but never sampled in this body.
//   deLuxMap     - per-texel light direction, sampled at I.deLuxCoord.
//   normalMap    - surface normal, sampled at I.texCoord.
//   baseMap      - base colour, sampled at I.texCoord.
//   lightMap     - light-map colour, sampled at I.texCoord.
// Returns:
//   float4 colour whose xyz is the lit result and whose w is a copy of z.
float4 main(inputVertex I, uniform samplerCUBE tangentCube,
uniform samplerCUBE binormalCube,
uniform sampler2D deLuxMap, uniform sampler2D normalMap,
uniform sampler2D baseMap,
uniform sampler2D lightMap) : COLOR
{
// Basis vectors, taken straight from the interpolators (not re-normalized here).
// normal
float3 normal = I.norVec.xyz;
// tangent
float3 tangent = I.tanVec.xyz;
// binormal
float3 binormal = I.binVec.xyz;
// Fetch the world-space delux (light direction) and remap it with 2*x - 1
// (i.e. [0,1] -> [-1,1], assuming the texture stores values in [0,1]).
float3 wDelux = 2 * tex2D(deLuxMap,I.deLuxCoord).xyz - 1;
// Project onto the tangent/binormal/normal basis (tangent space), then normalize.
float3 tDelux;
tDelux.x = dot(wDelux,tangent);
tDelux.y = dot(wDelux,binormal);
tDelux.z = dot(wDelux,normal);
tDelux = normalize(tDelux);
// Fetch the normal from the normal map, remapped with the same 2*x - 1.
float3 matNormal = 2 * tex2D(normalMap, I.texCoord).xyz - 1;
// Diffuse term: normal . light vector, clamped to [0,1].
float diffdot = saturate(dot(tDelux, matNormal));
// Half vector: sum of the position->light and position->eye vectors.
// Note the two vectors are summed without being normalized individually;
// only the projected result is normalized below.
float3 halfvec = lightposition - I.position + eyeposition - I.position;
// Project the half vector onto the same basis and normalize it.
float3 trans;
trans.x = dot(halfvec, tangent);
trans.y = dot(halfvec, binormal);
trans.z = dot(halfvec, normal);
halfvec = normalize(trans);
// Specular term: clamped normal . half, raised to the 16th power.
float specdot = saturate(dot(halfvec, matNormal));
specdot = pow(specdot, 16);
float3 base = tex2D(baseMap, I.texCoord).xyz;
float3 lmap = tex2D(lightMap, I.texCoord).xyz;
// The scalar specular term is added to all three channels before the
// light-map modulation.
float3 res = ((base * diffdot) + specdot) * lmap;
// w receives a copy of z (the .xyzz swizzle).
return res.xyzz;
}
Cg compiler output:
// DX9 Pixel Shader by NVIDIA Cg compiler
ps_2_0
// cgc version 1.1.0003, build date Mar 4 2003 12:32:10
// command line args: -profile ps_2_0
//vendor NVIDIA Corporation
//version 1.0.02
//profile ps_2_0
//program main
//semantic main.tangentCube
//semantic main.binormalCube
//semantic main.deLuxMap
//semantic main.normalMap
//semantic main.baseMap
//semantic main.lightMap
//semantic lightposition
//semantic eyeposition
//var float3 lightposition : : c[2] : -1 : 1
//var float3 eyeposition : : c[1] : -1 : 1
//var float3 I.norVec : $vin.COLOR0 : COLOR0 : 0 : 1
//var float3 I.tanVec : $vin.TEXCOORD0 : TEXCOORD0 : 0 : 1
//var float3 I.binVec : $vin.TEXCOORD1 : TEXCOORD1 : 0 : 1
//var float2 I.deLuxCoord : $vin.TEXCOORD2 : TEXCOORD2 : 0 : 1
//var float2 I.texCoord : $vin.TEXCOORD3 : TEXCOORD3 : 0 : 1
//var float3 I.position : $vin.TEXCOORD4 : TEXCOORD4 : 0 : 1
//var samplerCUBE tangentCube : : texunit 0 : 1 : 1
//var samplerCUBE binormalCube : : texunit 1 : 2 : 1
//var sampler2D deLuxMap : : texunit 2 : 3 : 1
//var sampler2D normalMap : : texunit 3 : 4 : 1
//var sampler2D baseMap : : texunit 4 : 5 : 1
//var sampler2D lightMap : : texunit 5 : 6 : 1
//var float4 main : $vout.COLOR : COLOR : -1 : 1
// [annotation] Register map, from the //var lines above:
//   c1 = eyeposition, c2 = lightposition
//   v0 = I.norVec, t0 = I.tanVec, t1 = I.binVec,
//   t2 = I.deLuxCoord, t3 = I.texCoord, t4 = I.position
//   s2 = deLuxMap, s3 = normalMap, s4 = baseMap, s5 = lightMap
//   (texunits 0 and 1, the two cube maps, are never declared or sampled below)
dcl_2d s2
dcl_2d s3
dcl_2d s4
dcl_2d s5
// [annotation] c0 = (2, 1, 16, 0): scale and bias for the 2*x - 1 remap, and the pow exponent.
def c0, 2.000000, 1.000000, 16.000000, 0.000000
dcl v0.xyz
dcl t0.xyz
dcl t1.xyz
dcl t2.xy
dcl t3.xy
dcl t4.xyz
// [annotation] r0 = deLuxMap sample, r1 = normalMap sample.
texld r0, t2, s2
texld r1, t3, s3
// [annotation] r0.xyz = 2*r0 - 1 (wDelux); r1.xyz = 2*r1 - 1 (matNormal).
mad r0.xyz, c0.x, r0, -c0.y
mad r1.xyz, c0.x, r1, -c0.y
// [annotation] r2 = lightposition - position + eyeposition - position (halfvec).
add r2.xyz, c2, -t4
add r2.xyz, r2, c1
add r2.xyz, r2, -t4
// [annotation] r3 = halfvec projected onto (tangent, binormal, normal).
// Each dot product is written to the scalar r0.w and then copied into its
// destination component with a separate mov.
dp3 r0.w, r2, t0
mov r3.x, r0.w
dp3 r0.w, r2, t1
mov r3.y, r0.w
dp3 r0.w, r2, v0
mov r3.z, r0.w
// [annotation] r2 = wDelux projected onto the same basis (tDelux), using the
// same dp3 + mov pattern; the last dot product reuses r0.x as its scratch.
dp3 r0.w, r0, t0
mov r2.x, r0.w
dp3 r0.w, r0, t1
mov r2.y, r0.w
dp3 r0.x, r0, v0
mov r2.z, r0.x
// [annotation] r3 = normalize(r3).
dp3 r0.x, r3, r3
rsq r0.x, r0.x
mul r3.xyz, r0.x, r3
// [annotation] r1.w = pow(saturate(dot(halfvec, matNormal)), 16) (specdot).
dp3_sat r0.x, r3, r1
pow r1.w, r0.x, c0.z
// [annotation] r2 = normalize(r2); r0.x = saturate(dot(tDelux, matNormal)) (diffdot).
dp3 r0.x, r2, r2
rsq r0.x, r0.x
mul r2.xyz, r0.x, r2
dp3_sat r0.x, r2, r1
// [annotation] r2 = baseMap sample, r3 = lightMap sample (both at texCoord).
texld r2, t3, s4
texld r3, t3, s5
// [annotation] r3.xyz = (base * diffdot + specdot) * lmap.
mad r0.xyz, r2, r0.x, r1.w
mul r3.xyz, r0, r3
// [annotation] Build the .xyzz result in r0 (w = copy of z) and write it to the colour output.
mov r0.xyz, r3
mov r0.w, r3.z
mov oC0, r0
// 35 instructions, 4 R-regs.
// End of program
58 lines, 0 errors.
DX9 HLSL compiler output:
//
// Generated by Microsoft (R) D3DX9 Shader Compiler
//
// Source: deluxearb.cg
// Flags: /E:main /T:ps_2_0
//
// Parameters:
//
// sampler2D $baseMap;
// sampler2D $deLuxMap;
// sampler2D $lightMap;
// sampler2D $normalMap;
// float3 eyeposition;
// float3 lightposition;
//
//
// Registers:
//
// Name Reg Size
// ------------- ----- ----
// eyeposition c0 1
// lightposition c1 1
// $deLuxMap s0 1
// $normalMap s1 1
// $baseMap s2 1
// $lightMap s3 1
//
ps_2_0
// [annotation] Register map, from the compiler's "Registers" table:
//   c0 = eyeposition, c1 = lightposition
//   s0 = $deLuxMap, s1 = $normalMap, s2 = $baseMap, s3 = $lightMap
// Inputs follow the shader's semantics: v0 = COLOR0 (norVec),
//   t0 = tanVec, t1 = binVec, t2 = deLuxCoord, t3 = texCoord, t4 = position.
// [annotation] c2 = (2, -1, 16, 0): scale and bias for the 2*x - 1 remap, and the pow exponent.
def c2, 2, -1, 16, 0
dcl v0.xyz
dcl t0.xyz
dcl t1.xyz
dcl t2.xy
dcl t3.xy
dcl t4.xyz
dcl_2d s0
dcl_2d s1
dcl_2d s2
dcl_2d s3
// [annotation] All four texture fetches are issued first:
//   r3 = normalMap, r2 = deLuxMap, r1 = baseMap, r0 = lightMap.
texld r3, t3, s1
texld r2, t2, s0
texld r1, t3, s2
texld r0, t3, s3
// [annotation] r5 = lightposition - position + eyeposition - position (halfvec).
add r4.xyz, -t4, c1
add r4.xyz, r4, c0
add r5.xyz, r4, -t4
// [annotation] r4 = halfvec projected onto (tangent, binormal, normal); each dp3
// writes its destination component directly.
dp3 r4.x, r5, t0
dp3 r4.y, r5, t1
dp3 r4.z, r5, v0
// [annotation] r4 = normalize(r4).
dp3 r0.w, r4, r4
rsq r0.w, r0.w
mul r4.xyz, r4, r0.w
// [annotation] r3.xyz = 2*r3 - 1 (matNormal).
mad r3.xyz, c2.x, r3, c2.y
// [annotation] r1.w = pow(saturate(dot(halfvec, matNormal)), 16) (specdot).
dp3_sat r0.w, r4, r3
pow r1.w, r0.w, c2.z
// [annotation] r4.xyz = 2*r2 - 1 (wDelux); r2 = wDelux projected onto the basis (tDelux).
mad r4.xyz, c2.x, r2, c2.y
dp3 r2.x, r4, t0
dp3 r2.y, r4, t1
dp3 r2.z, r4, v0
// [annotation] r2 = normalize(r2); r0.w = saturate(dot(tDelux, matNormal)) (diffdot).
dp3 r0.w, r2, r2
rsq r0.w, r0.w
mul r2.xyz, r2, r0.w
dp3_sat r0.w, r2, r3
// [annotation] r1.xyz = (base * diffdot + specdot) * lmap.
mad r1.xyz, r1, r0.w, r1.w
mul r1.xyz, r0, r1
// [annotation] Build the .xyzz result in r0 (w = copy of z) and write it to the colour output.
mov r0.xyz, r1
mov r0.w, r1.z
mov oC0, r0
// approximately 31 instruction slots used (4 texture, 27 arithmetic)
The only differences from my own hand-written assembler version were a slightly different instruction order and not doing the final movs quite as literally, since alpha is ignored anyway.