Dawn's skin shader:
Also, note how only H0->H3 is used. Sounds like nVidia knows about the register performance
Okay, so in that program, there are:
ADDX: 2
MULX: 9
MADX: 3
DP3X: 4
TEX: 7
MULH: 1
MOVH: 7
-> 5a, 13m, 3S, 4T, 1M and 7MOVs
All MOVs seem to do FP->FP
According to thepkrl's data, the float/texture part would take 7 cycles ( 1.5+4+1 = 6.5 -> 7 )
I didn't check whether "m"s are dependent.
If they aren't, then it's possible to do the INT part in 7 cycles too, in parallel to all of the FP/Tex stuff:
12/3 + 6/2 = 4 + 3 = 7
Does that make sense?
BTW, I'd like to know how many rounds SM, SA, TM and TA take. If they only take one, then the FP/Tex part would only take 6 cycles...
Uttar
Code:
!!FP1.0
# NV_fragment_program generated by NVIDIA Cg compiler
# cgc version 1.5.0001, build date Oct 15 2002 01:04:08
# command line args: -profile fp30
#vendor NVIDIA Corporation
#version 1.0.1
#profile fp30
#program main
#semantic main.skinColor_frontSpec
#semantic main.skin_norm_sideSpec
#semantic main.g_specular_colorShift
#semantic main.g_blood_texture
#semantic main.g_transmission_terms
#semantic main.g_diffuse_Cube
#semantic main.g_specular_Cube
#semantic main.g_nrmalize_Cube
#semantic main.g_dappleProjection
#semantic main.g_hilight_Cube
#semantic main.g_oiliness
#var float4 v2f.skinColor_frontSpec : $vin.TEXCOORD0 : TEXCOORD0 : 0 : 1
#var float3 v2f.worldEyeDir : $vin.TEXCOORD2 : TEXCOORD2 : 0 : 1
#var float3 v2f.worldTanMatrixX : $vin.TEXCOORD5 : TEXCOORD5 : 0 : 1
#var float3 v2f.worldTanMatrixY : $vin.TEXCOORD6 : TEXCOORD6 : 0 : 1
#var float3 v2f.worldTanMatrixZ : $vin.TEXCOORD7 : TEXCOORD7 : 0 : 1
#var float4 v2f.SkinSilouetteVec : $vin.TEXCOORD3 : TEXCOORD3 : 0 : 1
#var samplerRECT skinColor_frontSpec : : texunit 0 : 1 : 1
#var samplerRECT skin_norm_sideSpec : : texunit 1 : 2 : 1
#var samplerRECT g_specular_colorShift : : texunit 2 : 3 : 1
#var samplerRECT g_blood_texture : : texunit 3 : 4 : 1
#var samplerRECT g_transmission_terms : : texunit 4 : 5 : 1
#var samplerCUBE g_diffuse_Cube : : texunit 5 : 6 : 1
#var samplerCUBE g_specular_Cube : : texunit 6 : 7 : 1
#var samplerCUBE g_nrmalize_Cube : : texunit 7 : 8 : 1
#var samplerRECT g_dappleProjection : : texunit 8 : 9 : 1
#var samplerCUBE g_hilight_Cube : : texunit 9 : 10 : 1
#var float2 g_oiliness : : : 11 : 1
#var half4 COL : $vout.COLOR : COLOR : -1 : 1
#
# Overview (derived from the instructions below; names are taken from the #var table above):
#   1. Fetch texunit 1 (skin_norm_sideSpec) into H0 and texunit 0 (skinColor_frontSpec) into H1,
#      scale H0.xy by g_oiliness.y and look the result up in the texunit 7 cube map.
#   2. Take three dot products of that vector with the TEX5/TEX6/TEX7 interpolants -> H2.xyz.
#   3. Build a reflection vector in H3.xyz from H2 and the TEX2 interpolant (worldEyeDir).
#   4. Combine diffuse, side specular, hilight and direct specular and write o[COLH].
# Only the half registers H0..H3 are used as temporaries; texunits 2, 3, 4 and 8 are never sampled.
# NOTE(review): the #var table declares texunits 0 and 1 as samplerRECT, but the two fetches
# from them below use the 2D target - confirm which one is intended.
DEFINE LUMINANCE = {0.299, 0.587, 0.114, 0.0}; # not referenced by any instruction in this program
DECLARE g_oiliness; # program parameter: .y is read for the normal scale, .x for the specular scale
############################################################################################
# These two blocks slow code down 10% #
############################################################################################
# NOTE(review): the banner above does not say which two blocks it refers to.
# --- Fetch inputs and build the perturbed normal ---
TEX H0, f[TEX0], TEX1, 2D; # store range-compressed normal (and side spec) in H0
TEX H1, f[TEX0], TEX0, 2D; # store skin color in H1 (H1.w is used further down as the spec map)
MOVH H2.x, g_oiliness.y;
MULX H0.xy, H0, H2.x; # scale the xy of the fetched normal by g_oiliness.y
TEX H0, H0, TEX7, CUBE; # look H0 up in the texunit 7 cube (g_nrmalize_Cube) - presumably a normalization lookup; confirm against the cube's contents
# --- Three dot products with the TEX5/TEX6/TEX7 interpolants (worldTanMatrixX/Y/Z in the #var table) ---
MOVH H3, f[TEX5];
DP3X H2.x, H0, H3;
MOVH H3, f[TEX6];
DP3X H2.y, H0, H3;
MOVH H3, f[TEX7];
DP3X H2.z, H0, H3; # H2.xyz now contains worldNormal
# --- View vector and reflection vector ---
MOVH H0.xyz, f[TEX2]; # store v2f.worldEyeDir (view vector V) in H0.xyz
DP3X H2.w, H0, H2; # H2.w = dot(Normal, View)
MULX H2.xyz,-H2, -2; # (-H2) * (-2) = twice the normal
MADX H3.xyz, -H2, H2.w, H0; # H3 = -(2N)*dot(N, V) + V = reflection vector. Normal doesn't need to be fixed, because it's uniformly scaled
# --- Diffuse and side specular (H2.xyz still holds the doubled normal for both lookups) ---
TEX H0.xyz, H2, TEX5, CUBE; # diffuse lighting (texunit 5, g_diffuse_Cube)
TEX H2, H2, TEX6, CUBE; # side specular (texunit 6, g_specular_Cube)
MULX H0.xyz, H0, H1; # skin diffuse*diffuse color
#MULX H2, H2, H0.w; # side_spec*side_spec term (H0.w is the side_spec term)
MULX H2, H2, H1.w; # active variant of the commented-out line above: scales side spec by H1.w instead of H0.w
MOVH H1.xyz, f[TEX3]; # H1.xyz = v2f.SkinSilouetteVec ("ndotv" in the original comment); H1.w is untouched and still holds the spec map
MULX H0, H0, H1.x;
MADX H0, H2, H1.y, H0; # diffuse + side_spec (H2 avail)
# --- Hilight and direct specular ---
TEX H2, f[TEX2], TEX9, CUBE; # fetch hilight (texunit 9, g_hilight_Cube), indexed by the TEX2 interpolant
TEX H3, H3, TEX6, CUBE; # fetch direct specular (texunit 6 again), indexed by the reflection vector in H3.xyz
MULX H2.xyz, H2, H1.y; # hilight by facing ratio
MULX H2.xyz, H2, H1.x;
MULX H3, H3, H1.w; # spec*spec_map
MULX H3, H3, 0.02; # scale specular
MOVH H3.w, g_oiliness.x;
MULH H3, H3, H3.w; # scale by the oiliness (g_oiliness.x); note H3.w itself is also multiplied by H3.w here
MADX H2.xyz, H2, 0.7, H3; # 0.7*hilight + direct spec
# --- Final sum and output ---
ADDX H0, H2, H0; # diffuse specs and hilight
#MADX o[COLH], H0, H1.x, H2.w; # add haze
ADDX o[COLH], H0, H2.w; # add haze: H2.w (w of the hilight cube sample) is added to every output component
END
# End of program
Also, note how only H0->H3 is used. Sounds like nVidia knows about the register performance
Okay, so in that program, there are:
ADDX: 2
MULX: 9
MADX: 3
DP3X: 4
TEX: 7
MULH: 1
MOVH: 7
-> 5a, 13m, 3S, 4T, 1M and 7MOVs
All MOVs seem to do FP->FP
According to thepkrl's data, the float/texture part would take 7 cycles ( 1.5+4+1 = 6.5 -> 7 )
I didn't check whether "m"s are dependent.
If they aren't, then it's possible to do the INT part in 7 cycles too, in parallel to all of the FP/Tex stuff:
12/3 + 6/2 = 4 + 3 = 7
Does that make sense?
BTW, I'd like to know how many rounds SM, SA, TM and TA take. If they only take one, then the FP/Tex part would only take 6 cycles...
Uttar