AMD: R8xx Speculation

LordEC911 · Jul 15, 2009

aaronspink said:
I thought GT300 was expected to be in the 500-600 size range.

Less than 500mm2, supposed to be smaller than G200b which was 485mm2(?).

Lukfi · Jul 15, 2009

G200b is *called* GTX 285, it's way larger than that

aaronspink · Jul 15, 2009

dkanter said:
Surely you mean that a tape out costs $6M? That'd be reasonable, plus an easy $1-10M for any extra engineering and validation effort.

DK

Extra setups, test wafer runs, extra lab time, etc. It all adds up pretty quick for another post-si design. Then you need yet more mask sets for an actual production run.

aaronspink · Jul 15, 2009

LordEC911 said:
Less than 500mm2, supposed to be smaller than G200b which was 285mm2(?).

gt200b is > 400mm^2.

fellix · Jul 15, 2009

484 mm² if I remember correctly -- quite similar to the G80 die-size.

CarstenS · Jul 15, 2009

Humus said:
How large a gain you see by going compute shader depends on your ALU:TEX ratio. A compute shader implementation will likely require somewhat more ALU operations than a pixel shader implementation, whereas the TEX operations drop significantly. ATI chips that have plenty of ALU and relatively few TEX naturally see a large boost in performance. For Nvidia who has less ALU power and more TEX power the performance boost will be smaller.

Does that hold true for getting the data into the chip in the first place also? I was under the impression that fetching depth-values (which now is done from a single-channel texture) dozens of times is limiting performance foremost - somewhere in the back of my head I have a number of up to 160 fetches, thus 160 TEX-instructions involved with SSAO (accordingly less with fetch4 or gather)

I thought, that was the major bottleneck right now and that the use of CS LDS could at least alleviate some double- and triple-fetching of the same data in different threads.

Jawed · Jul 15, 2009

The original codename on the first video was "EG Broadway", which I interpret to mean "Evergreen Broadway". I've just realised that "Broadway" would be the name of a road (like Avenue, or Boulevard). So Evergreen Broadway would be a road lined with Evergreen trees. Which implies "multiple" evergreens. Which makes me think MCM

Jawed

Jawed · Jul 15, 2009

I downloaded the NVidia SSAO sample (D3D10) and edited the code to hard code the lowest quality version of the shader with no normals (since quality and use of normals is parameter driven) - here is the D3D assembly:

Code:

// Direct3D 10 pixel shader (shader model 4.0) disassembly. Per the surrounding
// post this is the lowest-quality, no-normals variant of the NVidia SSAO sample.
// Structure: one centre fetch from t1, an early-out, setup of a per-pixel step
// vector, then two near-identical marching loops (forward and reversed step)
// that each issue one sample_l per iteration, followed by a final scale that
// is written to all four channels of o0.
// NOTE(review): meanings of the cb0 fields are not visible here; the comments
// below describe register data flow and hedge any domain interpretation.
ps_4_0
// Resources: one constant buffer of 5 vec4s, one sampler, two float 2D textures.
dcl_constantbuffer cb0[5], immediateIndexed
dcl_sampler s0, mode_default
dcl_resource_texture2d (float,float,float,float) t0
dcl_resource_texture2d (float,float,float,float) t1
// Inputs: v0.xy is the position system value; v1.zw is used below as the
// texture coordinate for t1.
dcl_input_ps_siv linear noperspective v0.xy, position
dcl_input_ps linear v1.zw
dcl_output o0.xyzw
// Eight vec4 temporaries (r0..r7).
dcl_temps 8
// r0.x = centre sample of t1 at v1.zw, LOD 0 (presumably depth - TODO confirm).
sample_l r0.xyzw, v1.zwzz, t1.xyzw, s0, l(0.000000)
// r0.yz = (cb0[0].z * 0.5) * cb0[3].xy / r0.x
mul r0.y, cb0[0].z, l(0.500000)
mul r0.yz, r0.yyyy, cb0[3].xxyx
div r0.yz, r0.yyzy, r0.xxxx
// r0.w = min(r0.y * cb0[4].z, r0.z * cb0[4].w, cb0[0].x); used below as the
// iteration limit of both loops.
mul r1.xy, r0.yzyy, cb0[4].zwzz
min r0.w, r1.y, r1.x
min r0.w, r0.w, cb0[0].x
// Early out: if the limit is below 1.0, write 1.0 to every channel and return.
lt r1.x, r0.w, l(1.000000)
if_nz r1.x
  mov o0.xyzw, l(1.000000,1.000000,1.000000,1.000000)
  ret 
endif 
// r1.xy = r0.x * ((v1.zw * (2,-2) + (-1,1)) * cb0[3].zw); together with
// r1.z = r0.x (set further down) this is the centre point P that both loops
// subtract from each sample.
mad r1.xy, v1.zwzz, l(2.000000, -2.000000, 0.000000, 0.000000), l(-1.000000, 1.000000, 0.000000, 0.000000)
mul r1.xy, r1.xyxx, cb0[3].zwzz
mul r1.xy, r0.xxxx, r1.xyxx
// Divide r0.yz by (limit + 1).
add r1.w, r0.w, l(1.000000)
div r0.yz, r0.yyzy, r1.wwww
// Integer pixel position masked with 63 (wraps to a 64x64 range), zw cleared,
// then a texel load from t0 (presumably a per-pixel random/rotation texture -
// TODO confirm).
ftoi r2.xy, v0.xyxx
and r2.xy, r2.xyxx, l(63, 63, 0, 0)
mov r2.zw, l(0,0,0,0)
ld r2.xyzw, r2.xyzw, t0.xyzw
// Scale the step by the loaded t0.xy.
mul r0.yz, r0.yyzy, r2.xxyx
// r2.xy = start coordinate for loop 1: v1.zw plus (t0.z * step), where the
// offset is multiplied by cb0[4].zw, rounded to nearest-even, and multiplied
// by cb0[4].xy.
mul r2.xy, r2.zzzz, r0.yzyy
mul r2.xy, r2.xyxx, cb0[4].zwzz
round_ne r2.xy, r2.xyxx
mad r2.xy, r2.xyxx, cb0[4].xyxx, v1.zwzz
// r0.yz = step multiplied by cb0[4].zw and rounded; r3.xy = that times cb0[4].xy.
mul r0.yz, r0.yyzy, cb0[4].zzwz
round_ne r0.yz, r0.yyzy
mul r3.xy, r0.yzyy, cb0[4].xyxx
// r1.w = sin(a)/cos(a) with a = cb0[1].y - 1.570796 (the literal is ~pi/2).
add r1.w, cb0[1].y, l(-1.570796)
sincos r4.x, r5.x, r1.w
div r1.w, r4.x, r5.x
// r2.w = (cb0[1].y != 0) ? r1.w / sqrt(r1.w^2 + 1) : -1.0
ne r2.w, cb0[1].y, l(0.000000)
mad r3.z, r1.w, r1.w, l(1.000000)
sqrt r3.z, r3.z
div r3.z, r1.w, r3.z
movc r2.w, r2.w, r3.z, l(-1.000000)
// Complete P: r1.z = centre sample value.
mov r1.z, r0.x
// Loop 1 state: r3.zw = current coordinate, r4.x = accumulator (starts at 0),
// r4.y = running ratio (starts at r1.w), r4.z = running normalised ratio
// (starts at r2.w), r4.w = iteration counter (starts at 1.0).
mov r3.zw, r2.xxxy
mov r4.x, l(0)
mov r4.y, r1.w
mov r4.z, r2.w
mov r4.w, l(1.000000)
loop 
  // Exit once the counter exceeds the limit in r0.w.
  lt r5.x, r0.w, r4.w
  breakc_nz r5.x
  // Advance the coordinate by r0.yz * cb0[4].xy and fetch t1 there (LOD 0).
  mad r3.zw, r0.yyyz, cb0[4].xxxy, r3.zzzw
  sample_l r5.xyzw, r3.zwzz, t1.xyzw, s0, l(0.000000)
  // r6.xyz = sample point S, built the same way as P from the coordinate and
  // the sampled value.
  mad r5.yz, r3.zzwz, l(0.000000, 2.000000, -2.000000, 0.000000), l(0.000000, -1.000000, 1.000000, 0.000000)
  mul r5.yz, r5.yyzy, cb0[3].zzwz
  mul r6.xy, r5.xxxx, r5.yzyy
  mov r6.z, r5.x
  // r5.yzw = S - P; r5.w = squared length; r6.x = (squared length < cb0[1].x).
  add r5.yzw, -r1.xxyz, r6.xxyz
  dp3 r5.w, r5.yzwy, r5.yzwy
  lt r6.x, r5.w, cb0[1].x
  // r5.y = (centre value - sample value) / length of the xy part of S - P.
  add r5.x, r0.x, -r5.x
  dp2 r5.y, r5.yzyy, r5.yzyy
  sqrt r5.y, r5.y
  div r5.y, r5.x, r5.y
  // Only a ratio larger than the running one (r4.y) contributes.
  lt r6.y, r4.y, r5.y
  if_nz r6.y
    // r5.z = t / sqrt(t^2 + 1) for the new ratio t = r5.y.
    mad r6.y, r5.y, r5.y, l(1.000000)
    sqrt r6.y, r6.y
    div r5.z, r5.y, r6.y
    // Weight r5.w = 1 - (|S - P| * cb0[0].w)^2 * cb0[1].w.
    sqrt r5.w, r5.w
    mul r5.w, r5.w, cb0[0].w
    mul r5.w, r5.w, r5.w
    mad r5.w, -r5.w, cb0[1].w, l(1.000000)
    // Candidate accumulator = weight * (new - running normalised ratio) + old.
    add r6.y, -r4.z, r5.z
    mad r5.x, r5.w, r6.y, r4.x
  else 
    // No improvement: candidate state is the unchanged running state.
    mov r5.xyz, r4.xyzx
  endif 
  // Commit the candidate state only when the sample passed the r6.x test.
  movc r4.xyz, r6.xxxx, r5.xyzx, r4.xyzx
  add r4.w, r4.w, l(1.000000)
endloop 
// Loop 2 setup: start coordinate recomputed from the negated r3.xy, so this
// pass marches the opposite way. The accumulator carries over from loop 1
// (r5.x = r4.x); the running ratios restart from r1.w / r2.w; r2.z is the
// iteration counter.
mul r2.xy, r2.zzzz, -r3.xyxx
mul r2.xy, r2.xyxx, cb0[4].zwzz
round_ne r2.xy, r2.xyxx
mad r2.xy, r2.xyxx, cb0[4].xyxx, v1.zwzz
mov r3.xy, r2.xyxx
mov r5.x, r4.x
mov r5.y, r1.w
mov r5.z, r2.w
mov r2.z, l(1.000000)
loop 
  // Exit once the counter exceeds the limit in r0.w.
  lt r3.z, r0.w, r2.z
  breakc_nz r3.z
  // Step the coordinate by -r0.yz * cb0[4].xy and fetch t1 there (LOD 0).
  mad r3.xy, -r0.yzyy, cb0[4].xyxx, r3.xyxx
  sample_l r6.xyzw, r3.xyxx, t1.xyzw, s0, l(0.000000)
  // r7.xyz = sample point S (same construction as in loop 1).
  mad r3.zw, r3.xxxy, l(0.000000, 0.000000, 2.000000, -2.000000), l(0.000000, 0.000000, -1.000000, 1.000000)
  mul r3.zw, r3.zzzw, cb0[3].zzzw
  mul r7.xy, r6.xxxx, r3.zwzz
  mov r7.z, r6.x
  // r4.yzw = S - P; r3.z = squared length; r3.w = (squared length < cb0[1].x).
  add r4.yzw, -r1.xxyz, r7.xxyz
  dp3 r3.z, r4.yzwy, r4.yzwy
  lt r3.w, r3.z, cb0[1].x
  // r6.y = (centre value - sample value) / length of the xy part of S - P.
  add r4.w, r0.x, -r6.x
  dp2 r4.y, r4.yzyy, r4.yzyy
  sqrt r4.y, r4.y
  div r6.y, r4.w, r4.y
  // Only a ratio larger than the running one (r5.y) contributes.
  lt r4.y, r5.y, r6.y
  if_nz r4.y
    // r6.z = t / sqrt(t^2 + 1) for the new ratio t = r6.y.
    mad r4.y, r6.y, r6.y, l(1.000000)
    sqrt r4.y, r4.y
    div r6.z, r6.y, r4.y
    // Weight r3.z = 1 - (|S - P| * cb0[0].w)^2 * cb0[1].w.
    sqrt r3.z, r3.z
    mul r3.z, r3.z, cb0[0].w
    mul r3.z, r3.z, r3.z
    mad r3.z, -r3.z, cb0[1].w, l(1.000000)
    // Candidate accumulator = weight * (new - running normalised ratio) + old.
    add r4.y, -r5.z, r6.z
    mad r6.x, r3.z, r4.y, r5.x
  else 
    // No improvement: candidate state is the unchanged running state.
    mov r6.xyz, r5.xyzx
  endif 
  // Commit the candidate state only when the sample passed the r3.w test.
  movc r5.xyz, r3.wwww, r6.xyzx, r5.xyzx
  add r2.z, r2.z, l(1.000000)
endloop 
// Final: x = max(acc * 0.5 - 1.0, 0) doubled, divided by cb0[0].y, then
// o0 = 1 - x * cb0[2].x replicated to all four channels.
mad r0.x, r5.x, l(0.500000), l(-1.000000)
max r0.x, r0.x, l(0.000000)
add r0.x, r0.x, r0.x
div r0.x, r0.x, cb0[0].y
mad o0.xyzw, -r0.xxxx, cb0[2].xxxx, l(1.000000, 1.000000, 1.000000, 1.000000)
ret

The inner loop (the two inner loops seen here are effectively the same as each other), on its shortest path, has 20 ALU instructions and 1 TEX. So it is ALU bound on NVidia. Though in this case of the code I haven't worked out what typical iteration counts for the loops would be.

These are the factors affecting performance and how a CUDA version might help (though CUDA might only help in the higher quality versions?):

register count - for this shader it is quite high, 10 vec4 registers are mentioned in that code, and 40 scalar registers is a lot for NVidia's architecture (though compilation could easily get rid of 8 or 10, say). So using shared memory in a CUDA version of this shader might help in relieving register pressure
latency-hiding - the substantial register allocation reduces the number of warps in flight, e.g. 16 warps, which impacts upon overall capability to hide all latencies (register fetches, TEX, dynamic branching etc.). A CUDA version with samples fetched into shared memory for use by multiple pixels would massively reduce the average latency experienced per pixel. Perhaps to the point where the warp count is a non-issue
redundant computation - I've not investigated this thoroughly, so I've no idea how much computation is redundant across neighbouring pixels in the result. If partial results for samples can be shared by multiple pixels then that would reduce ALU:TEX

I think it's 14 ALU cycles for this inner loop on ATI (or 20 for the longest path), for each TEX. The inner loop is quite inefficient on ATI, 41/44% ALU utilisation.

Jawed

Jawed · Jul 15, 2009

I've realised, from the notes on using this SDK example, that the main shader is recommended to be used at half-resolution with a follow-up pass (consisting of two passes itself) using a bilateral blur filter at full resolution.

There are a number of options for the blur filter. The simplest appears to be TEX bound with 2 TEX and 14 ALUs in the D3D assembly (almost entirely scalar instructions).

So shared memory should make this substantially faster, with samples shared by pixels as per the GDC09 example of a gaussian blur :smile:

On ATI the inner loop of this blur shader is miserably ALU-bound with terrible efficiency, only 28% ALU lane utilisation (10 cycles), a throughput of 12G loops per second.

Seems like ATI benefits substantially more from a CS implementation, I bet with multiple pixels being computed per thread, which claws-back ALU utilisation.

In absolute performance, comparing a CS4.x implementation using shared memory, I wouldn't be surprised if GTX285 is substantially faster than HD4890. Shared memory fetch bandwidth is vastly higher in NVidia:

GTX285 - 240 scalar fetches per clock at 1476MHz = 354G fetches per second
HD4890 - 160 scalar fetches per clock at 850MHz = 136G fetches per second

If a thread uses a sample more than once then ATI rapidly catches up - as subsequent uses of such a value are theoretically 800 ALU lanes * 3 operands = 2400 scalar fetches per clock = 2040G fetches per second. This is because subsequent fetches are from register file (and in-pipe registers), not from shared memory. The programmer can do something similar on NVidia by copying from shared memory to a register, before such multiple uses, getting an effective 1151G fetches per second (240 lanes, 3.25 operands per clock - counting transcendental functions at 1/4 throughput).

On both architectures there is the problem of register allocation for such copies from shared memory, though this is much less of an issue on ATI (though ATI always suffers the register overhead, while it's a trade-off in NVidia).

For the overall SSAO CS implementation (HBAO pass followed by bilateral pass), who knows...

Jawed

LordEC911 · Jul 15, 2009

Lukfi said:
G200b is *called* GTX 285, it's way larger than that

aaronspink said:
gt200b is > 400mm^2.

fellix said:
484 mm² if I remember correctly -- quite similar to the G80 die-size.

Thanks guys, was a typo that is now fixed, meant to put 485mm2. (I just typed in 285 again... but caught myself)
Edit- I guess that's what I get for not proof reading my post.

trinibwoy · Jul 15, 2009

Jawed said:
Seems like ATI benefits substantially more from a CS implementation, I bet with multiple pixels being computed per thread, which claws-back ALU utilisation.

Yeah, that goes back to the discussion we were having a while back. It remains to be seen whether devs are willing to go the extra mile to do this sort of thing. Though in this case doing multiple pixels per thread may be the more natural thing to do anyway.

Humus · Jul 15, 2009

CarstenS said:
Does that hold true for getting the data into the chip in the first place also? I was under the impression that fetching depth-values (which now is done from a single-channel texture) dozens of times is limiting performance foremost - somewhere in the back of my head I have a number of up to 160 fetches, thus 160 TEX-instructions involved with SSAO (accordingly less with fetch4 or gather)

I thought, that was the major bottleneck right now and that the use of CS LDS could at least alleviate some double- and triple-fetching of the same data in different threads.

160 TEX is of course an arbitrary number. An implementation could use any number of samples it needs to get the best quality / cost ratio. 160 is way more than what an ordinary implementation would use. A normal number would be perhaps 32 samples. But let's say 32 TEX are replaced by a single fetch per thread, which plus the margin in the threadgroup might mean an average of perhaps two fetches per pixel, it's fairly easy to see how that would shift a heavily TEX limited situation to something heavily ALU bound.

If TEX is the bottleneck on both ATI and Nvidia, you could perhaps see Nvidia winning by 50%. If a CS implementation shifts this to being ALU bound instead you might instead see ATI winning by 50%. That doesn't mean Nvidia didn't get any performance gain, just that their gain would be smaller. So if the original implementation was say 3ms on ATI and 2ms on Nvidia, after CS optimization it could perhaps be 0.7ms on ATI and 1ms on Nvidia.

Humus · Jul 15, 2009

Jawed said:
The inner loop (the two inner loops seen here are effectively the same as each other), on its shortest path, has 20 ALU instructions and 1 TEX. So it is ALU bound on NVidia.

It's definitely possible to make a substantially more optimized SSAO implementation than this sample though. With that said, SSAO probably doesn't represent where the largest gains are to be had with a CS implementation. Most implementations are fairly heavy on ALU as well. The largest gains are probably on blur filters, depth of field, and the like, where ALU is low and TEX is high.

Jawed · Jul 15, 2009

trinibwoy said:
Yeah, that goes back to the discussion we were having a while back. It remains to be seen whether devs are willing to go the extra mile to do this sort of thing. Though in this case doing multiple pixels per thread may be the more natural thing to do anyway.

It's particularly problematic because NVidia doesn't like high register allocations. But if average cumulative latency per thread is very low (due to shared memory usage) then that problem diminishes.

I don't know if there are any pragmas in SM5 that cover this kind of hardware variability (not sure if it's reasonable) - the unroll pragma is a start, so any kind of loop that has independent iterations is naturally parallelising, meaning it's up to the compiler writers - but that only works for loops with a constant iteration count (and not very large, e.g. <=32 iterations). It's also possible to make an overlapping loop, which isn't constant-iterations, but there's enough iteration independence to overlap computation, e.g. have 4 iterations in flight at a time.

Standardised shared memory sizes per SM spec is a start, as that constrains one dimension of this variability.

The performance differences between shared memory on these architectures (just like the differences in ALU:TEX or dynamic-branching divergence penalty) obviously all cause some grief.

The other side of the coin is the kind of "auto-tuning" CUDA stuff we've seen a fair amount of, something that should be feasible under SM5. Parameterisation of the shaders in these terms is something that SM5 is a lot friendlier for. Though there is still the combinatorial problem, e.g. the optimum profile for a kernel that runs as CS is "straight-forward", but when you have VS-HS-TS-HS-GS-PS configured the meaningful profiling of individual shaders is gonna be pretty horrible, as the combinations explode in your face. Though there's some research on heuristics for steering the tuning - but it could be years before that can cope with more than just a single CS kernel configured.

Jawed

Squilliam · Jul 16, 2009

How long before the RV770 came out did we get reliable information about the chip? I'm wondering about this because it seems even quieter than last time! Where are our insider rumours?! Maybe AMD employees have taken to heart that they have to keep it a secret to help keep themselves and their workmates in jobs during this recession?

Nebuchadnezzar · Jul 16, 2009

Squilliam said:
How long before the RV770 came out did we get reliable information about the chip? I'm wondering about this because it seems even quieter than last time! Where are our insider rumours?! Maybe AMD employees have taken to heart that they have to keep it a secret to help keep themselves and their workmates in jobs during this recession?

Well people still didn't know any facts about ALU or TMUs up to 2 weeks before the 4850 hit the shops in Asia

Squilliam · Jul 16, 2009

Nebuchadnezzar said:
Well people still didn't know any facts about ALU or TMUs up to 2 weeks before the 4850 hit the shops in Asia

Oh dear, so we won't know anything for sure until sometime in August at the earliest since the release date is set for late September.

Silent_Buddha · Jul 16, 2009

Squilliam said:
Oh dear, so we won't know anything for sure until sometime in August at the earliest since the release date is set for late September.

Oh I'm sure before then the real specs will be leaked by "someone," just like Rv770. Maybe even months before like Rv770. If I remember correctly the 800 sp number was known months before launch, but most people blew it off as improbable. With other more conservative rumors given more credibility.

The problem is, there is absolutely no way to know which rumor is true or if any rumors are true until someone has an actual card to play with.

Regards,
SB

LordEC911 · Jul 16, 2009

Silent_Buddha said:
Oh I'm sure before then the real specs will be leaked by "someone," just like Rv770. Maybe even months before like Rv770. If I remember correctly the 800 sp number was known months before launch, but most people blew it off as improbable. With other more conservative rumors given more credibility.

The problem is, there is absolutely no way to know which rumor is true or if any rumors are true until someone has an actual card to play with.

Regards,
SB

Most of the time it isn't AMD flooding the rumor mill with misinformation...
I believe there were only a few sources from Asia that were stating the 480SP increase TO 800SP while almost all other sources were stating the 480SP total rumor.

Dave Baumann · Jul 16, 2009

First time I ever saw the notion of 480 SP was in this forum...

AMD: R8xx Speculation

How soon will Nvidia respond with GT300 to upcoming ATI-RV870 lineup GPUs

Within 1 or 2 weeks

Within a month

Within couple months

Very late this year

Not until next year

LordEC911

Lukfi

aaronspink

aaronspink

fellix

CarstenS

Moderator

Jawed

Jawed

Jawed

LordEC911

trinibwoy

Meh

Humus

Crazy coder

Humus

Crazy coder

Jawed

Squilliam

Beyond3d isn't defined yet

Nebuchadnezzar

Squilliam

Beyond3d isn't defined yet

Silent_Buddha

LordEC911

Dave Baumann

Gamerscore Wh...

Similar threads