littlehead
Newcomer
David mentioned that the L1 texture cache size is 8 KB per TPC, so the total L1 on the chip should be 8 KB * 8 on G80 or 8 KB * 10 on G200.
OK. If the L1 is private, as is usually the case on the CPU side, then the maximum L1 size that a single pixel-shading thread can access is 8 KB.
So I used a pointer-chasing approach to verify this. The data looks quite interesting: the point where a large L1 miss latency appears is at about 20 KB.
The thread loops over a given working set and incurs significant latency if the working set is larger than the cache.
The data:
workset latency
8192Byte 0.174301
12288Byte 0.180389
16384Byte 0.187164
20480Byte 0.289429
24576Byte 58.757538
28672Byte 58.551575
32768Byte 59.707016
36864Byte 59.930389
Here's the shader code:
; Pointer-chasing pixel shader: performs a chain of dependent texture reads,
; where the result of each fetch is used as the coordinate of the next fetch.
; NOTE(review): the Direct3D 9 documentation spells the version instruction
; "ps_3_0"; confirm the assembler in use accepts the "ps.3.0" spelling.
ps.3.0;
; Input: xy components of texture coordinate 0.
dcl_texcoord0 v0.xy;
; Four 2D samplers, s0-s3, read in turn inside the loop body.
dcl_2d s0;
dcl_2d s1;
dcl_2d s2;
dcl_2d s3;
; Seed the chain with the incoming texture coordinate.
; NOTE(review): only r1.xy is written before the first texld reads r1;
; confirm the shader validator accepts this for a 2D sampler.
mov r1.xy, v0.xy;
; Three nested repeat blocks. The repeat counts come from the integer constant
; registers i0, i1 and i2, which are not defined in this listing (presumably
; set by the host application - verify against the caller).
rep i0;
rep i1;
rep i2;
; Each texld uses r1 as both coordinate source and destination, so every
; fetch depends on the result of the previous one.
texld r1.rgba , r1, s0;
texld r1.rgba , r1, s1;
texld r1.rgba , r1, s2;
texld r1.rgba , r1, s3;
endrep;
endrep;
endrep;
; Normalize the last fetched value and write it to color output 0, so the
; shader's output depends on the whole fetch chain.
nrm r2, r1;
mov oC0.rgba ,r2 ;
OK. If the L1 is private, as is usually the case on the CPU side, then the maximum L1 size that a single pixel-shading thread can access is 8 KB.
So I used a pointer-chasing approach to verify this. The data looks quite interesting: the point where a large L1 miss latency appears is at about 20 KB.
The thread loops over a given working set and incurs significant latency if the working set is larger than the cache.
The data:
workset latency
8192Byte 0.174301
12288Byte 0.180389
16384Byte 0.187164
20480Byte 0.289429
24576Byte 58.757538
28672Byte 58.551575
32768Byte 59.707016
36864Byte 59.930389
Here's the shader code:
; Pointer-chasing pixel shader: performs a chain of dependent texture reads,
; where the result of each fetch is used as the coordinate of the next fetch.
; NOTE(review): the Direct3D 9 documentation spells the version instruction
; "ps_3_0"; confirm the assembler in use accepts the "ps.3.0" spelling.
ps.3.0;
; Input: xy components of texture coordinate 0.
dcl_texcoord0 v0.xy;
; Four 2D samplers, s0-s3, read in turn inside the loop body.
dcl_2d s0;
dcl_2d s1;
dcl_2d s2;
dcl_2d s3;
; Seed the chain with the incoming texture coordinate.
; NOTE(review): only r1.xy is written before the first texld reads r1;
; confirm the shader validator accepts this for a 2D sampler.
mov r1.xy, v0.xy;
; Three nested repeat blocks. The repeat counts come from the integer constant
; registers i0, i1 and i2, which are not defined in this listing (presumably
; set by the host application - verify against the caller).
rep i0;
rep i1;
rep i2;
; Each texld uses r1 as both coordinate source and destination, so every
; fetch depends on the result of the previous one.
texld r1.rgba , r1, s0;
texld r1.rgba , r1, s1;
texld r1.rgba , r1, s2;
texld r1.rgba , r1, s3;
endrep;
endrep;
endrep;
; Normalize the last fetched value and write it to color output 0, so the
; shader's output depends on the whole fetch chain.
nrm r2, r1;
mov oC0.rgba ,r2 ;