Less than 500mm2, supposed to be smaller than G200b which was 485mm2(?). I thought GT300 was expected to be in the 500-600 size range.
Last edited by a moderator:
Less than 500mm2, supposed to be smaller than G200b which was 485mm2(?). I thought GT300 was expected to be in the 500-600 size range.
Surely you mean that a tape out costs $6M? That'd be reasonable, plus an easy $1-10M for any extra engineering and validation effort.
DK
Less than 500mm2, supposed to be smaller than G200b which was 285mm2(?).
How large a gain you see by going to a compute shader depends on your ALU:TEX ratio. A compute shader implementation will likely require somewhat more ALU operations than a pixel shader implementation, whereas the TEX operations drop significantly. ATI chips, which have plenty of ALU and relatively little TEX, naturally see a large boost in performance. For Nvidia, which has less ALU power and more TEX power, the performance boost will be smaller.
// Direct3D Shader Model 4.0 pixel shader (disassembly listing).
// Summary of what the instructions below do:
//   1. Sample t1 at the interpolated coordinate v1.zw and keep the .x channel (r0.x).
//   2. Derive a per-pixel step count r0.w; if it is below 1, write 1.0 to o0 and return.
//   3. Fetch a per-pixel vector from t0, indexed by (pixel position & 63), and use it to
//      build a 2D step (r0.yz) and a start offset.
//   4. March r0.w steps along +step, then r0.w steps along -step, sampling t1 each step
//      and accumulating a weighted value in a running accumulator.
//   5. Remap the accumulator and write it (replicated to all four channels) to o0.
// NOTE(review): the structure looks like a screen-space ambient-occlusion horizon search
// with t1 as a depth texture and t0 as a 64x64 random/jitter texture -- confirm against
// the HLSL source; the meanings of the cb0 fields are not visible in this listing.
ps_4_0
// Resource declarations: one constant buffer of 5 float4 registers, one sampler,
// two float4 2D textures.
dcl_constantbuffer cb0[5], immediateIndexed
dcl_sampler s0, mode_default
dcl_resource_texture2d (float,float,float,float) t0
dcl_resource_texture2d (float,float,float,float) t1
// Inputs: v0.xy is the system-value pixel position; v1.zw is an interpolated 2D
// coordinate (used below as the texture coordinate for t1).
dcl_input_ps_siv linear noperspective v0.xy, position
dcl_input_ps linear v1.zw
dcl_output o0.xyzw
dcl_temps 8
// r0.x = t1 sampled at v1.zw, mip level 0. Only .x survives; .yzw are overwritten below.
// NOTE(review): presumably the centre pixel's depth -- confirm.
sample_l r0.xyzw, v1.zwzz, t1.xyzw, s0, l(0.000000)
// r0.yz = (0.5 * cb0[0].z) * cb0[3].xy / r0.x
mul r0.y, cb0[0].z, l(0.500000)
mul r0.yz, r0.yyyy, cb0[3].xxyx
div r0.yz, r0.yyzy, r0.xxxx
// r0.w = min(r0.y * cb0[4].z, r0.z * cb0[4].w, cb0[0].x): the step count used by both loops.
// NOTE(review): cb0[4].zw looks like a resolution and cb0[4].xy its reciprocal -- confirm.
mul r1.xy, r0.yzyy, cb0[4].zwzz
min r0.w, r1.y, r1.x
min r0.w, r0.w, cb0[0].x
// Early out: when the step count is below 1, output 1.0 in every channel.
lt r1.x, r0.w, l(1.000000)
if_nz r1.x
mov o0.xyzw, l(1.000000,1.000000,1.000000,1.000000)
ret
endif
// r1.xy = (v1.zw * (2,-2) + (-1,1)) * cb0[3].zw * r0.x
// i.e. the coordinate remapped from [0,1] to [-1,1] (y flipped), scaled, then multiplied
// by the sampled value. r1.z is filled with r0.x further down, completing r1.xyz.
// NOTE(review): presumably a view-space position reconstruction -- confirm.
mad r1.xy, v1.zwzz, l(2.000000, -2.000000, 0.000000, 0.000000), l(-1.000000, 1.000000, 0.000000, 0.000000)
mul r1.xy, r1.xyxx, cb0[3].zwzz
mul r1.xy, r0.xxxx, r1.xyxx
// r0.yz /= (r0.w + 1): divide the extent by (step count + 1).
add r1.w, r0.w, l(1.000000)
div r0.yz, r0.yyzy, r1.wwww
// Fetch texel (int(v0.x) & 63, int(v0.y) & 63) of t0 at mip 0, so the lookup repeats
// every 64 pixels in each axis. Result in r2.xyzw.
ftoi r2.xy, v0.xyxx
and r2.xy, r2.xyxx, l(63, 63, 0, 0)
mov r2.zw, l(0,0,0,0)
ld r2.xyzw, r2.xyzw, t0.xyzw
// r0.yz *= fetched.xy (component-wise): the per-pixel step.
mul r0.yz, r0.yyzy, r2.xxyx
// Start coordinate for the first march (r2.xy):
//   round_ne(fetched.z * step * cb0[4].zw) * cb0[4].xy + v1.zw
mul r2.xy, r2.zzzz, r0.yzyy
mul r2.xy, r2.xyxx, cb0[4].zwzz
round_ne r2.xy, r2.xyxx
mad r2.xy, r2.xyxx, cb0[4].xyxx, v1.zwzz
// r0.yz = round_ne(step * cb0[4].zw); r3.xy = r0.yz * cb0[4].xy.
// The loops advance by r0.yz * cb0[4].xy per iteration, which equals r3.xy.
mul r0.yz, r0.yyzy, cb0[4].zzwz
round_ne r0.yz, r0.yyzy
mul r3.xy, r0.yzyy, cb0[4].xyxx
// r1.w = tan(cb0[1].y - 1.570796), computed as sin/cos. 1.570796 is pi/2.
add r1.w, cb0[1].y, l(-1.570796)
sincos r4.x, r5.x, r1.w
div r1.w, r4.x, r5.x
// r2.w = (cb0[1].y != 0) ? r1.w / sqrt(r1.w^2 + 1) : -1.0
ne r2.w, cb0[1].y, l(0.000000)
mad r3.z, r1.w, r1.w, l(1.000000)
sqrt r3.z, r3.z
div r3.z, r1.w, r3.z
movc r2.w, r2.w, r3.z, l(-1.000000)
// r1.z = r0.x, so r1.xyz is the reference point for the difference vectors in the loops.
mov r1.z, r0.x
// First-march state:
//   r3.zw = current sample coordinate (initialised to the start coordinate r2.xy)
//   r4.x  = accumulator (0)
//   r4.y  = running ratio, initialised to r1.w
//   r4.z  = running t/sqrt(t^2+1) value, initialised to r2.w
//   r4.w  = loop counter, starting at 1.0
mov r3.zw, r2.xxxy
mov r4.x, l(0)
mov r4.y, r1.w
mov r4.z, r2.w
mov r4.w, l(1.000000)
loop
// Exit once the counter exceeds the step count r0.w.
lt r5.x, r0.w, r4.w
breakc_nz r5.x
// Advance the coordinate by one step and sample t1 there (mip 0); r5.x = sampled .x.
mad r3.zw, r0.yyyz, cb0[4].xxxy, r3.zzzw
sample_l r5.xyzw, r3.zwzz, t1.xyzw, s0, l(0.000000)
// r6.xyz = the sample's point, built the same way as r1.xyz for the centre pixel.
mad r5.yz, r3.zzwz, l(0.000000, 2.000000, -2.000000, 0.000000), l(0.000000, -1.000000, 1.000000, 0.000000)
mul r5.yz, r5.yyzy, cb0[3].zzwz
mul r6.xy, r5.xxxx, r5.yzyy
mov r6.z, r5.x
// r5.yzw = r6.xyz - r1.xyz; r5.w = its squared length.
add r5.yzw, -r1.xxyz, r6.xxyz
dp3 r5.w, r5.yzwy, r5.yzwy
// r6.x = squared length < cb0[1].x; gates the state update at the end of the iteration.
lt r6.x, r5.w, cb0[1].x
// r5.y = (r0.x - sampled.x) / length(difference.xy)
add r5.x, r0.x, -r5.x
dp2 r5.y, r5.yzyy, r5.yzyy
sqrt r5.y, r5.y
div r5.y, r5.x, r5.y
// Only contribute when this ratio exceeds the running ratio r4.y.
lt r6.y, r4.y, r5.y
if_nz r6.y
// r5.z = r5.y / sqrt(r5.y^2 + 1)
mad r6.y, r5.y, r5.y, l(1.000000)
sqrt r6.y, r6.y
div r5.z, r5.y, r6.y
// r5.w = 1 - (length * cb0[0].w)^2 * cb0[1].w   (weight that falls off with distance)
sqrt r5.w, r5.w
mul r5.w, r5.w, cb0[0].w
mul r5.w, r5.w, r5.w
mad r5.w, -r5.w, cb0[1].w, l(1.000000)
// r5.x = accumulator + weight * (r5.z - running r4.z)
add r6.y, -r4.z, r5.z
mad r5.x, r5.w, r6.y, r4.x
else
// No new maximum: candidate state equals the current state.
mov r5.xyz, r4.xyzx
endif
// Commit the candidate state only when the sample passed the distance test (r6.x).
movc r4.xyz, r6.xxxx, r5.xyzx, r4.xyzx
add r4.w, r4.w, l(1.000000)
endloop
// Start coordinate for the second march, using the negated step:
//   round_ne(fetched.z * -r3.xy * cb0[4].zw) * cb0[4].xy + v1.zw
mul r2.xy, r2.zzzz, -r3.xyxx
mul r2.xy, r2.xyxx, cb0[4].zwzz
round_ne r2.xy, r2.xyxx
mad r2.xy, r2.xyxx, cb0[4].xyxx, v1.zwzz
// Second-march state:
//   r3.xy = current sample coordinate
//   r5.x  = accumulator, carried over from the first march (r4.x)
//   r5.y  = running ratio, reset to r1.w
//   r5.z  = running t/sqrt(t^2+1) value, reset to r2.w
//   r2.z  = loop counter, starting at 1.0
mov r3.xy, r2.xyxx
mov r5.x, r4.x
mov r5.y, r1.w
mov r5.z, r2.w
mov r2.z, l(1.000000)
loop
// Exit once the counter exceeds the step count r0.w.
lt r3.z, r0.w, r2.z
breakc_nz r3.z
// Step in the opposite direction (note the negated r0.yz) and sample t1; r6.x = sampled .x.
mad r3.xy, -r0.yzyy, cb0[4].xyxx, r3.xyxx
sample_l r6.xyzw, r3.xyxx, t1.xyzw, s0, l(0.000000)
// r7.xyz = the sample's point, built the same way as r1.xyz.
mad r3.zw, r3.xxxy, l(0.000000, 0.000000, 2.000000, -2.000000), l(0.000000, 0.000000, -1.000000, 1.000000)
mul r3.zw, r3.zzzw, cb0[3].zzzw
mul r7.xy, r6.xxxx, r3.zwzz
mov r7.z, r6.x
// r4.yzw = r7.xyz - r1.xyz; r3.z = its squared length; r3.w = squared length < cb0[1].x.
add r4.yzw, -r1.xxyz, r7.xxyz
dp3 r3.z, r4.yzwy, r4.yzwy
lt r3.w, r3.z, cb0[1].x
// r6.y = (r0.x - sampled.x) / length(difference.xy)
add r4.w, r0.x, -r6.x
dp2 r4.y, r4.yzyy, r4.yzyy
sqrt r4.y, r4.y
div r6.y, r4.w, r4.y
// Only contribute when this ratio exceeds the running ratio r5.y.
lt r4.y, r5.y, r6.y
if_nz r4.y
// r6.z = r6.y / sqrt(r6.y^2 + 1)
mad r4.y, r6.y, r6.y, l(1.000000)
sqrt r4.y, r4.y
div r6.z, r6.y, r4.y
// r3.z = 1 - (length * cb0[0].w)^2 * cb0[1].w
sqrt r3.z, r3.z
mul r3.z, r3.z, cb0[0].w
mul r3.z, r3.z, r3.z
mad r3.z, -r3.z, cb0[1].w, l(1.000000)
// r6.x = accumulator + weight * (r6.z - running r5.z)
add r4.y, -r5.z, r6.z
mad r6.x, r3.z, r4.y, r5.x
else
// No new maximum: candidate state equals the current state.
mov r6.xyz, r5.xyzx
endif
// Commit the candidate state only when the sample passed the distance test (r3.w).
movc r5.xyz, r3.wwww, r6.xyzx, r5.xyzx
add r2.z, r2.z, l(1.000000)
endloop
// Final remap: o0 = 1 - (2 * max(0.5 * r5.x - 1, 0) / cb0[0].y) * cb0[2].x, in all channels.
mad r0.x, r5.x, l(0.500000), l(-1.000000)
max r0.x, r0.x, l(0.000000)
add r0.x, r0.x, r0.x
div r0.x, r0.x, cb0[0].y
mad o0.xyzw, -r0.xxxx, cb0[2].xxxx, l(1.000000, 1.000000, 1.000000, 1.000000)
ret
G200b is *called* GTX 285, it's way larger than that
gt200b is > 400mm^2.
484 mm² if I remember correctly -- quite similar to the G80 die-size.
Seems like ATI benefits substantially more from a CS implementation, I bet with multiple pixels being computed per thread, which claws-back ALU utilisation.
Does that hold true for getting the data into the chip in the first place also? I was under the impression that fetching depth values (which is now done from a single-channel texture) dozens of times is limiting performance foremost - somewhere in the back of my head I have a number of up to 160 fetches, thus 160 TEX instructions involved with SSAO (accordingly fewer with fetch4 or gather).
I thought that was the major bottleneck right now and that the use of CS LDS could at least alleviate some double- and triple-fetching of the same data in different threads.
The inner loop (the two inner loops seen here are effectively the same as each other), on its shortest path, has 20 ALU instructions and 1 TEX. So it is ALU bound on NVidia.
It's particularly problematic because NVidia doesn't like high register allocations. But if average cumulative latency per thread is very low (due to shared memory usage) then that problem diminishes. Yeah, that goes back to the discussion we were having a while back. It remains to be seen whether devs are willing to go the extra mile to do this sort of thing. Though in this case doing multiple pixels per thread may be the more natural thing to do anyway.
Well people still didn't know any facts about ALU or TMUs up to 2 weeks before the 4850 hit the shops in Asia. How long before the RV770 came out did we get reliable information about the chip? I'm wondering about this because it seems even quieter than last time! Where are our insider rumours?! Maybe AMD employees have taken to heart that they have to keep it a secret to help keep themselves and their workmates in jobs during this recession?
Well people still didn't know any facts about ALU or TMUs up to 2 weeks before the 4850 hit the shops in Asia
Oh dear, so we won't know anything for sure until sometime in August at the earliest since the release date is set for late September.
Oh I'm sure before then the real specs will be leaked by "someone," just like RV770. Maybe even months before, like RV770. If I remember correctly the 800 SP number was known months before launch, but most people blew it off as improbable, with other more conservative rumors given more credibility.
The problem is, there is absolutely no way to know which rumor is true or if any rumors are true until someone has an actual card to play with.
Regards,
SB