This one comes out at 6400/626.0 ≈ 10 clocks, although I count 11 passes.
Code:
ps_2_0
// Per-pixel lighting with a normal map: diffuse (N.L) plus specular
// (N.H)^c1.w, modulated by a base texture.
//
// Inputs
//   t0.xy  - texture coordinate, used for both s0 and s1
//   t1.xyz - light direction (normalized here)
//   t2.xyz - subtracted from c0 to build the view vector
//            (presumably the surface position - confirm against the vertex shader)
//   s0     - normal map
//   s1     - base texture
// Constants
//   c0.xyz - (0, 0, 2), the point the view vector is measured from
//   c1.xyz - per-channel scale applied to N.L
//   c1.w   - specular exponent (16)
// Output
//   oC0.rgb = (N.L * c1.rgb + spec) * base.rgb
//   oC0.a   = spec * base.a
//
// The "pass N" / "c-i" / "free fp16 nrm" notes are the original author's
// per-instruction scheduling estimates and are kept unchanged.
def c0, 0.0f, 0.0f, 2.0f, 0.0f
def c1, 0.4f, 0.5f, 0.9f, 16.0f
dcl t0.xy
dcl t1.xyz
dcl t2.xyz
dcl_2d s0
dcl_2d s1
// Normalize light direction: r1.xyz = L
dp3_pp r1.w, t1, t1 // free fp16 nrm
rsq_pp r1.w, r1.w
mul_pp r1.xyz, t1, r1.w
// Calculate halfway vector: r0.xyz = normalize(normalize(c0 - t2) + L)
add_pp r0.xyz, c0, -t2 //pass 1
dp3_pp r0.w, r0, r0 //pass 2
rsq_pp r0.w, r0.w //pass 2 c-i
mad_pp r0.xyz, r0, r0.w, r1 //pass 3
dp3_pp r0.w, r0, r0 // free fp16 nrm
rsq_pp r0.w, r0.w
mul_pp r0.xyz, r0, r0.w
// Load and normalize normal: r2.xyz = N
// NOTE(review): the sample is used without a *2-1 expansion, so this
// assumes s0 holds signed data - confirm the texture format.
texld_pp r2, t0, s0 //pass 4
dp3_pp r2.w, r2, r2 //pass 5
rsq_pp r2.w, r2.w //pass 5 c-i
mul_pp r2.xyz, r2, r2.w //pass 6
// Calculate lighting
// Both dot products are saturated: pow evaluates abs(src0)^src1, so an
// unclamped negative N.H would produce a highlight on surfaces facing
// away from the half vector, and a negative N.L would subtract from
// the specular term in the mad below.
dp3_sat_pp r1.w, r2, r0 // pass 7      r1.w   = saturate(N.H)
dp3_sat_pp r1.xyz, r2, r1 // pass 8    r1.xyz = saturate(N.L), replicated
pow_pp r1.w, r1.w, c1.w //pass 8 c-i   r1.w   = spec = (N.H)^16
mad_pp r1.xyz, r1, c1, r1.www //pass 9 r1.xyz = N.L * c1.xyz + spec
// Add base texture (modulates all four channels, including alpha)
texld_pp r0, t0, s1 //pass 10
mul_pp r0, r1, r0 //pass 11
mov_pp oC0, r0