AMD: R8xx Speculation

How soon will Nvidia respond with GT300 to upcoming ATI-RV870 lineup GPUs

  • Within 1 or 2 weeks

    Votes: 1 0.6%
  • Within a month

    Votes: 5 3.2%
  • Within couple months

    Votes: 28 18.1%
  • Very late this year

    Votes: 52 33.5%
  • Not until next year

    Votes: 69 44.5%

  • Total voters
    155
  • Poll closed .
We need an answer to the "is fp16-filtering important?" question...

I'd say it's not that important for it to be single cycle. The vast majority of textures are still DXT or RGBA8. With DX10 there are also good alternatives like R11G11B10 that works fine for most HDR stuff.
 
Different Assembly depending on chip

I've been playing with the arithmetic testing shaders from Rightmark 2 (SM4), Mineral and Fire:

Code:
//
// Project:
//
// D3D RightMark
//
// Description:
//
// Effect file for "Pixel Shading 4 Arithmetic" test.
//
// Effect Version: 
//
// DirectX 10.0 (Apr 2007 SDK Update)
//
// Shader Profile:
//
// Pixel Shaders 4.0
//
// Vertex Shaders 4.0
//
//
// Global Variables
//
// Constant buffer holding scalar tuning values, each declared with a default.
cbuffer cb0
{
 // Scale applied to the world-space position when MineralVS builds the light vector.
 float fGeometryScale = 0.01;
 // Exponent passed to pow() for the specular term in MineralPS.
 float fMaterialPower = 32.0;
 // Approximation of pi used by GetGradient to turn hash values into angles.
 float fPI = 3.14159;
}
// Light
// Description of the single light used by the effect.
struct rm_light
{
 // Light position; MineralVS reads .xyz when building the light vector.
 float4 vPosition;
 // Diffuse colour; not referenced by the shaders in this listing.
 float4 vDiffuse;
 // Specular colour; not referenced by the shaders in this listing.
 float4 vSpecular;
 // Factor MineralVS multiplies the light vector by before it is interpolated.
 float  fAttenuation;
};
// Constant buffer holding the light, camera, transforms and test controls.
cbuffer cb1
{
 // The scene light (see rm_light).
 rm_light Light;
 float4 vCameraPos;   // Camera Position
 float4x4 mW;    // World Matrix
 float4x4 mWVP;    // World * View * Projection Matrix
 // Time value; FireVS uses it to scroll the fire noise coordinates.
 float fTime;
 // Not referenced by the shaders in this listing.
 int nShaderType;
 // Not referenced by the shaders in this listing.
 bool nSSAA;
}
// Textures
// 1D colour ramp; MineralPS and FirePS index it with the final noise value.
Texture1D texLookup;
// Normal map sampled by MineralPS with the mesh texture coordinates.
Texture2D texNormal;
//
// Samplers
//
// Anisotropic sampler with wrap addressing; MineralPS uses it for texNormal.
SamplerState AnisoSampler
{
 Filter = ANISOTROPIC;
 AddressU  = Wrap;
 AddressV  = Wrap;
 AddressW  = Wrap;
 // Upper bound on the anisotropy ratio.
 MaxAnisotropy = 16;
};
// Trilinear sampler with wrap addressing; both pixel shaders use it for texLookup.
SamplerState LinearSampler
{
    Filter = MIN_MAG_MIP_LINEAR;
 AddressU  = Wrap;
 AddressV  = Wrap;
 AddressW  = Wrap;
};
// Trilinear sampler with clamp addressing; not referenced by the shaders in this listing.
SamplerState LinearClampSampler
{
    Filter = MIN_MAG_MIP_LINEAR;
 AddressU  = Clamp;
 AddressV  = Clamp;
 AddressW  = Clamp;
};
// Point sampler with wrap addressing; not referenced by the shaders in this listing.
SamplerState PointSampler
{
    Filter = MIN_MAG_MIP_POINT;
 AddressU  = Wrap;
 AddressV  = Wrap;
 AddressW  = Wrap;
};
 
//-----------------------------------------------------------------------------------------
// State Structures
//-----------------------------------------------------------------------------------------
// Blend state with alpha-to-coverage and blending on render target 0 both disabled.
// The technique that binds it is not part of this listing.
BlendState NoBlending
{
    AlphaToCoverageEnable = FALSE;
    BlendEnable[0] = FALSE;
};
// Rasterizer state that culls back-facing triangles.
// The technique that binds it is not part of this listing.
RasterizerState CullBack
{
    CullMode = Back;
};
//
// Shaders
//
// Per-vertex input shared by MineralVS and FireVS.
struct VS_IN
{
 // Vertex position; multiplied by mW and mWVP in the vertex shaders.
 float4 vPosition : POSITION;
 // 2D texture coordinate; MineralVS forwards it as the normal-map coordinate.
 float2 vTexCoord : TEXCOORD;
 // Vertex normal; MineralVS multiplies it by mW.
 float3 vNormal  : NORMAL;
};
//
// Marble Shader ( (c) Paul Baker )
//
// Values MineralVS hands to MineralPS through the interpolators.
struct MINERAL_DATA
{
 // Position after the mWVP transform.
 float4 vPosition : SV_POSITION;
 // 3D coordinate that drives the procedural noise.
 float3 vTexCoord : TEXCOORD0;
 // Normal multiplied by mW.
 float3 vNormal  : TEXCOORD1;
 // World position minus camera position (not normalized).
 float3 vView  : TEXCOORD2;
 // Scaled light vector; MineralPS also derives attenuation from its squared length.
 float3 vLight  : TEXCOORD3;
 // Coordinate used to sample texNormal.
 float2 vBumpCoord : TEXCOORD4;
};
// Vertex shader for the marble test.
// Produces the projected position plus the noise coordinate, normal, view vector,
// light vector and normal-map coordinate consumed by MineralPS.
MINERAL_DATA MineralVS( VS_IN In )
{
 MINERAL_DATA result;

 // Projected position for the rasterizer.
 result.vPosition = mul( In.vPosition, mWVP );

 // The normal-map coordinate is passed through unchanged.
 result.vBumpCoord = In.vTexCoord;

 // The noise coordinate is built from the untransformed input position:
 // bias by 100, then scale down.
 result.vTexCoord = (In.vPosition.xyz + 100.0) * 0.0008;

 // Normal multiplied by the world matrix.
 result.vNormal = mul( In.vNormal.xyz, mW );

 // Only xyz of the world-space position is needed below.
 float3 worldPos = mul( In.vPosition, mW ).xyz;

 // View vector: world position relative to the camera.
 result.vView = worldPos - vCameraPos.xyz;

 // Light vector: light position relative to the scaled world position,
 // multiplied by the light's attenuation factor.
 result.vLight = ( Light.vPosition.xyz - worldPos * fGeometryScale ) * Light.fAttenuation;

 return result;
}
// Integer hash of a 3D coordinate, returned as a float in [0, 1].
//
// vCoord is scaled by 1,000,000 and truncated to integers per component; the
// components are mixed as x * y + z, and bits 16..30 of that value are kept and
// divided by 32767.
//
// The cast must be to int3. A scalar (int) cast keeps only the first component
// and splats it into all three lanes, which makes the hash ignore y and z.
float Random(float3 vCoord)
{
 int3 rnd = (int3)(vCoord * 1000000);
 return (float)( ((rnd.x * rnd.y + rnd.z) >> 16) & 0x7fff ) / 32767.0;
}
// Weight of one grid corner for a given (absolute) corner vector.
//
// Each component is clamped to [0, 1] and run through the cubic 3c^2 - 2c^3;
// the result is 4 times the product of (1 - cubic) over x, y and z.
// It is 4 at the corner itself and 0 once any component reaches 1.
float GetWeight(float3 vCoord)
{
 float3 c = saturate(vCoord);
 // Same per-component operation order as a scalar evaluation, all lanes at once.
 float3 fade = 3*c*c - 2*c*c*c;
 float3 keep = 1 - fade;
 return 4.0 * keep.x * keep.y * keep.z;
}
// Pseudo-random gradient vector for a grid corner.
//
// Two angles are derived from Random(vCoord): one scaled to [0, 2*fPI] and one to
// [0, fPI]. Random is called with the same argument for both, so the first angle
// is always twice the second. The result is
// (cos(b) * cos(a), cos(b) * sin(a), sin(b)) with a the first angle, b the second.
float3 GetGradient(float3 vCoord)
{
 float angleA = 2.0 * fPI * Random(vCoord);
 float angleB = fPI * Random(vCoord);

 float sinA, cosA;
 sincos(angleA, sinA, cosA);

 float sinB, cosB;
 sincos(angleB, sinB, cosB);

 float3 gradient = float3(cosB * cosA, cosB * sinA, sinB);
 return gradient;
}
// Step table for visiting the eight corners of one noise grid cell.
// The noise loops apply the entries cumulatively (gridCorner += c_GridOffset[i]),
// so each entry is the step from the previous corner, not an offset from the
// cell origin. 0.015625 is 1/64, matching the 64.0 scale used to snap to the grid.
// With d = 0.015625 the running sum visits:
// (0,0,0) (d,0,0) (d,d,0) (0,d,0) (0,0,d) (d,0,d) (d,d,d) (0,d,d)
const float3 c_GridOffset[8] = {
 float3( 0, 0, 0 ),
 float3( 0.015625, 0.0, 0.0 ),
 float3( 0.0, 0.015625, 0.0 ),
 float3( -0.015625, 0.0, 0.0 ),
 // Back to the origin in x/y and up one step in z.
 float3( 0.0,-0.015625, 0.015625 ),
 float3( 0.015625, 0.0, 0.0 ),
 float3( 0.0, 0.015625, 0.0 ),
 float3( -0.015625, 0.0, 0.0 ),
};
// Marble pixel shader.
//
// Combines a normal-mapped diffuse/specular lighting term with two octaves of
// gradient noise evaluated on a 1/64 grid. The noise perturbs a sine stripe
// pattern, whose value indexes the texLookup colour ramp.
// Returns the lit colour with alpha = 1.
float4 MineralPS( MINERAL_DATA In ) : SV_TARGET
{
 // LIGHTING CALCULATIONS
 float3 view = normalize( In.vView );
 float3 light = normalize( In.vLight );
 // Attenuation comes from the squared length of the unnormalized light vector
 float attenuation = saturate( 1.0 - dot( In.vLight, In.vLight ) );
 // The shading normal is the normal-map sample expanded from [0, 1] to [-1, 1].
 // In.vNormal is not used here; only the sampled normal feeds the lighting.
 float3 normal = texNormal.Sample( AnisoSampler, In.vBumpCoord ).xyz * 2 - 1;
 // Calculate L dot N and add ambient
 float diffuse = saturate( saturate( dot( light, normal ) ) + 0.4 );
 // NOTE(review): reflect(i, n) returns i - 2 * n * dot(i, n), so this evaluates
 // N - 2(N.L)L rather than the textbook R = 2(L.N)N - L. Left unchanged pending
 // confirmation of the intended vector.
 float3 reflection = reflect( normal, light );
 // Calculate R dot V
 float specular = saturate( dot( reflection, view ) );
 // Sharpen the highlight with the material exponent
 specular = pow( specular, fMaterialPower );
 // TEXTURE CALCULATIONS
 float3 noise = 0;
 float3 noise2 = 0;
 // First octave: snap to the grid cell that contains the noise coordinate
 float3 gridCorner = floor( In.vTexCoord * 64.0 ) * 0.015625;
 for ( int i = 0; i < 8; ++i )
 {
  // Step to the next corner of the cell (the offsets are cumulative)
  gridCorner += c_GridOffset[i];
  // Pseudo-random gradient at this corner
  float3 gradient = GetGradient( gridCorner );
  // Vector from the grid corner to the sample point, in cell units
  float3 cornerVector = ( In.vTexCoord - gridCorner ) * 64.0;
  // Dot the vector with the gradient
  float gradientDotCornerVector = dot( cornerVector, gradient );
  // Weighting factor this corner has on the result
  float3 absCornerVector = abs( cornerVector );
  float3 weighting = GetWeight( absCornerVector );
  // Bias the dot product by the weighting factor and accumulate
  noise += gradientDotCornerVector * weighting;
 }
 // Second octave: the same evaluation at twice the coordinate.
 // The doubled coordinate must be used both to snap to the grid and to measure the
 // corner vector. Measuring from the undoubled coordinate puts the sample far
 // outside the cell, so GetWeight returns 0 and the octave contributes nothing.
 float3 texCoord2 = 2 * In.vTexCoord;
 gridCorner = floor( texCoord2 * 64.0 ) * 0.015625;
 for ( int k = 0; k < 8; ++k )
 {
  gridCorner += c_GridOffset[k];
  // Pseudo-random gradient at this corner
  float3 gradient = GetGradient( gridCorner );
  // Vector from the grid corner to the doubled sample point, in cell units
  float3 cornerVector = ( texCoord2 - gridCorner ) * 64.0;
  // Dot the vector with the gradient
  float gradientDotCornerVector = dot( cornerVector, gradient );
  // Weighting factor this corner has on the result
  float3 absCornerVector = abs( cornerVector );
  float3 weighting = GetWeight( absCornerVector );
  // Bias the dot product by the weighting factor and accumulate
  noise2 += gradientDotCornerVector * weighting;
 }
 // Final noise value: second octave at half amplitude
 noise = noise + 0.5 * noise2;
 // Calculate sin(120*(vTexCoord.x+vTexCoord.z)+3*noise) to get the marble effect
 noise *= 3;
 noise += 120 * In.vTexCoord.x;
 noise += 120 * In.vTexCoord.z;
 noise = sin( noise.x );
 // Scale and bias the sine from [-1, 1] to [-0.3, 1]; LinearSampler uses wrap
 // addressing, so negative coordinates wrap around the ramp.
 noise = noise * 0.65 + 0.35;
 // Look up the colour in the 1D ramp (only noise.x is used as the coordinate)
 noise = texLookup.SampleLevel( LinearSampler, noise.x, 0 ).xyz;
 // Modulate the result by the diffuse lighting and add specular
 return float4( attenuation * (diffuse * noise + specular), 1);
}
//
// Noise-based fire shader
//
// Values FireVS hands to FirePS through the interpolators.
struct FIRE_DATA
{
 // Position after the mWVP transform.
 float4 vPosition : SV_POSITION;
 // First noise coordinate (scrolls with fTime * 15, scale 0.0018).
 float3 vTexCoord0 : TEXCOORD0;
 // Second noise coordinate (scrolls with fTime * 6, scale 0.00021).
 float3 vTexCoord1 : TEXCOORD1;
};
// Vertex shader for the fire test.
// Outputs the projected position and two time-scrolled 3D noise coordinates
// built from the input position with its components rotated to yzx order.
FIRE_DATA FireVS( VS_IN In )
{
 FIRE_DATA result;

 // Both coordinates share the same biased, swizzled position.
 float3 biased = In.vPosition.yzx + 100.0;

 // Faster-scrolling coordinate at the larger scale.
 result.vTexCoord0 = (biased - fTime * 15) * 0.0018;
 // Slower-scrolling coordinate at the smaller scale.
 result.vTexCoord1 = (biased - fTime * 6) * 0.00021;

 // Projected position for the rasterizer.
 result.vPosition = mul( In.vPosition, mWVP );

 return result;
}
// Fire pixel shader.
//
// Evaluates gradient noise four times: each of the two input coordinates at
// 1x and 2x scale. Each coordinate's pair is combined (2x at half amplitude)
// and used to perturb a sine pattern. The two sines are blended, remapped and
// used to index the texLookup colour ramp. Pixels whose looked-up red value is
// below 0.2 are discarded; otherwise the colour is returned with alpha = 1.
float4 FirePS ( FIRE_DATA In ) : SV_TARGET
{
 // Noise sums: [0],[1] belong to vTexCoord0 (1x, 2x); [2],[3] to vTexCoord1.
 float octaveNoise[4];
 for ( int octave = 0; octave < 4; ++octave )
 {
  // Bit 1 selects the input coordinate, bit 0 selects the 1x or 2x scale.
  float3 samplePos = ((octave & 1) + 1) * ((octave & 2) ? In.vTexCoord1 : In.vTexCoord0);
  // Snap to the grid cell that contains the sample point
  float3 corner = floor( samplePos * 64.0 ) * 0.015625;
  float sum = 0;
  for ( int c = 0; c < 8; ++c )
  {
   // Step to the next corner of the cell (the offsets are cumulative)
   corner += c_GridOffset[c];
   // Pseudo-random gradient at this corner
   float3 grad = GetGradient( corner );
   // Vector from the corner to the sample point, in cell units
   float3 toSample = ( samplePos - corner ) * 64.0;
   // Gradient contribution scaled by this corner's weight
   sum += dot( toSample, grad ) * GetWeight( abs( toSample ) );
  }
  octaveNoise[octave] = sum;
 }
 // One perturbed sine per input coordinate
 float wave[2];
 for ( int layer = 0; layer < 2; ++layer )
 {
  float3 layerCoord = (layer & 1) ? In.vTexCoord1 : In.vTexCoord0;
  // 1x octave plus the 2x octave at half amplitude
  float phase = octaveNoise[layer*2] + 0.5 * octaveNoise[layer*2+1];
  // sin(120*(x+z) + 3*noise)
  phase *= 3;
  phase += 120 * layerCoord.x;
  phase += 120 * layerCoord.z;
  wave[layer] = sin( phase );
 }
 // Blend the two layers, weighted 0.7 towards the vTexCoord1 layer
 float rampCoord = lerp( wave[0], wave[1], 0.7 );
 // Scale and bias before the lookup
 rampCoord = rampCoord * 0.65 + 0.35;
 // Look up the colour in the 1D ramp
 float3 color = texLookup.SampleLevel( LinearSampler, rampCoord, 0 ).xyz;
 // Drop pixels whose red channel is too dark
 if ( color.x < 0.2 ) discard;
 return float4( color, 1);
}
and I've discovered that R6xx and RV7xx GPUs get different assembly code:

Code:
MineralPS
               Instr.
         GPRs  Groups
HD2900    49    201
HD2400    11    266
HD2600    11    266
HD3870    49    201
HD4870    45    198
HD4670    13    227
 
FirePS
               Instr.
         GPRs  Groups
HD2900    10    502
HD2400    10    502
HD2600    10    502
HD3870    10    502
HD4870    10    432
HD4670    10    432
My first thought is that RV7xx GPUs have more functionality on the XYZW ALU lanes, I think there's extra bitwise and perhaps integer functionality.

But apart from that, the MineralPS results seem to suggest that the "smaller" GPUs have a register file that's smaller than merely scaled down from the bigger GPUs according to SIMD-width.

The only problem with this is that MineralPS on HD4670, according to ixbt:

RV730 - http://www.ixbt.com/video3/rv730-part2.shtml
RV770 - http://www.ixbt.com/video3/rv770-2-part2.shtml

comes in at 41% of the performance of HD4870, instead of being about 35% as indicated by GPUSA :???:

So maybe GPUSA is not representing RV770 performance correctly. Additionally, perhaps the RV770 results on ixbt need to be updated (old driver)?

GPUSA also produces 3 different assemblies for MineralVS: R6xx, RV770 and RV730 :???:

It's worth noting that according to GPUSA the MineralPS shader should run at 2.2x the fillrate of FirePS. But as measured by ixbt, it's only about 1.7x. That's a pretty huge shortfall.

Maybe what's happening is the slower assembly (that suits HD4670's smaller register file) is also running on HD4870, thus slowing HD4870 unnecessarily? That's only a 14.6% discrepancy, far short of explaining the gap between 1.7x and 2.2x.

So, overall I'm not sure if the smaller GPUs do have this reduced-capacity-per-ALU register file or not.

Jawed
 
9400GT seems to be worse, being G96b to compete with RV710, which is about 73mm2. How big is G96b?
roughly 120mm^2.
http://forum.beyond3d.com/showthread.php?t=47604

Has nvidia given up on lowest-end chips completely (G98b)?. HD4550 vs. 9400GT is a somewhat interesting fight, since both have somewhat comparable performance in the end (4550 is clearly faster though), arriving with very different means: 64bit vs 128bit memory interface (though the former being twice as fast thus same memory bandwidth), very small (but fully enabled) chip vs. much larger chip (but with half the shader units disabled).
 
My first thought is that RV7xx GPUs have more functionality on the XYZW ALU lanes, I think there's extra bitwise and perhaps integer functionality.
We know that rv7xx gpus can do bitshifts on all units instead of only the fat unit.
 
Ah, I thought that's the 65nm version. That thread seems to indicate there's a "C" version too. Maybe G96c is a 55nm chip that's smaller than 120mm2?

If not, 120 versus 73 is just awe-inspiring. But NVidia's had a few months' head start.

Has nvidia given up on lowest-end chips completely (G98b)?. HD4550 vs. 9400GT is a somewhat interesting fight, since both have somewhat comparable performance in the end (4550 is clearly faster though), arriving with very different means: 64bit vs 128bit memory interface (though the former being twice as fast thus same memory bandwidth), very small (but fully enabled) chip vs. much larger chip (but with half the shader units disabled).
Follow the breadcrumbs that start here:

http://forum.beyond3d.com/showthread.php?p=1162236#post1162236

:LOL:

Jawed
 
We know that rv7xx gpus can do bitshifts on all units instead of only the fat unit.
OK, so I've just had a look and found a sprinkling of ASHR instructions in MineralPS that are solely T: in R6xx but any lane in RV7xx. Since there's a lot of transcendentals in this shader, RV7xx gets 15 ASHRs away from T: with only one on T:.

According to GPUSA, that only makes for a 3-instruction group difference between R600/RV670 and RV770, 201 versus 198 - surprising.

Jawed
 
HD4850 is definitely short on bandwidth, while HD4870 has too much.
That's a rather silly statement. The right amount of BW depends entirely on the cost scaling of the BW and the application's workload distribution. Unless you have an obscene amount of BW (like R600), you will always have parts of the frame that are BW starved and other parts that use only a fraction of the BW.

Both solutions are very sensible. Increase the RAM speed on the 4850, and the associated cost probably won't make any more competitive of a product. Decrease the RAM speed on the 4870 (by cost or type), and the reduction in cost won't get you many more sales, especially since it would allow NVidia to honestly claim that the GTX260 is faster.
 
Correct me if I am wrong, but IMO pad size also scales with memory clock frequency - albeit presumably not linearly - and both RV380 and RV515 had much lower memory clocks than RV730. Plus, pads supposedly scale very well with process technology, but is it really the case that they do not scale at all?
Why would pad size scale at all with frequency or with process tech?
Pad sizes are almost exclusively determined by advances in packaging technology, not silicon process technology.
 
Why would pad size scale at all with frequency or with process tech?
Pad sizes are almost exclusively determined by advances in packaging technology, not silicon process technology.
It's just what I've been told. I am not an engineer, you know? :)

So, 'tis not the case you say? I am equally fine with that and glad to be a bit more educated now.
 
Actually, they're just being dumb, they probably heard "Pentium-D" style and concluded that on their own; however Intel uses MCMs (two chips next to each other on the same package), not SiPs. Not that it matters since this all sounds like so much bullshit.

Intel uses MCMs now (or at least until Nehalem), but the Pentium D product was two P4 dies that were cut out as a single chip and their FSBs were tied together.
The "on top of" stuff would make more sense if they meant to say "right next to".

GPUs lack FSBs with multisocket capability wired in, so it's not as conceptually straightforward as it was for the P4. Some additional thought in the design or wiring magic in the package would be necessary.
 
That's a rather silly statement.
I made that statement as a caution against the "28% bandwidth" figure I posted. When comparing RV730 with RV770 the bandwidth situation is a bit muddy :cry:

Of course a bandwidth comparison can never be "idealised", but with HD4870 usually maxxing out at around 30% or so faster than HD4850, its 115.2GB/s as a baseline for RV730 is perhaps a bit misleading.

If we say that a realistic figure for HD4870's bandwidth, for such a comparison with HD4670 is about 90GB/s (realistic meaning "maximum of 4xAA"), then HD4670's bandwidth is more like 36% of HD4870's.

That's more in line with the kind of performance range we see, 40-60% of HD4870.

I for one decry the limited testing of HD4870 versus HD4850 at 8xMSAA, here's something:

http://www.pcgameshardware.com/aid,..._Nvidia_Geforce_GTX_260_with_216_ALUs/?page=7

44% at 1920 :p

Jawed
 
Sorry, Jawed, but I can't really understand what you're trying to say in half of that last post of yours.

RV730 is huge compared to G96 at 121mm2 on 65nm. Sure, G96 is pathetic, but does RV730 need to be 20% bigger?...
It's twice the speed of G96, and the 4650 can use dirt cheap DDR2 yet still be faster.

Making RV730 even slower would leave a rather large gap between it and the 4830 (or RV740, if it arrives). I think ATI targetted a very good level of performance, and NVidia can't even catch it with a 50% larger bus and die (9600 GSO, discounting disabled units). Nobody will recommend G96 when $10-20 more gets you twice the performance. If you don't need performance then RV710 will be fine.
 
Mintmaster: Don't be so sure. nVidia pays $10 to OEMs for each GF9500 1GB sold. I don't know if that's true for the US, but it works that way here... Customers are happy (a PC with a 1GB GF9 for a bargain price!) and OEMs are too...
 
Mintmaster: Don't be so sure. nVidia pays $10 to OEMs for each sold GF9500 1GB. I don't know if it's true for US, but it works here in this way... Customers ar happy (PC with 1GB GF9 for a bargain price!) and OEMs too...

Forgive my ignorance, but is that legal?
 
Mintmaster: Don't be so sure. nVidia pays $10 to OEMs for each sold GF9500 1GB. I don't know if it's true for US, but it works here in this way... Customers ar happy (PC with 1GB GF9 for a bargain price!) and OEMs too...
The 4650 also has a 1GB DDR2 model, and if NVidia wants to erode margins that way then they are free to do so.

I still think ATI made the right choice. Cost/perf is almost like 7600 vs. X1600, and at that time NVidia was able to charge a lot more for the 7600. The same is now true for ATI, as they are the undisputed kings of 128-bit video cards.
 
I've been playing with the arithmetic testing shaders from Rightmark 2 (SM4), Mineral and Fire:

Code:
MineralPS
               Instr.
         GPRs  Groups
HD2900    49    201
HD2400    11    266
HD2600    11    266
HD3870    49    201
HD4870    45    198
HD4670    13    227
 
FirePS
               Instr.
         GPRs  Groups
HD2900    10    502
HD2400    10    502
HD2600    10    502
HD3870    10    502
HD4870    10    432
HD4670    10    432
It's worth noting that according to GPUSA the MineralPS shader should run at 2.2x the fillrate of FirePS. But as measured by ibxt, it's only about 1.7x. That's a pretty huge shortfall.
I've realised that 45 or 49 registers is going to reduce the number of batches in flight in each SIMD.

If 128 batches can each have 2 registers, then 45 registers will only allow for 5 batches. Somehow I suspect the number of batches must be even, i.e. 4.

With 4 batches, I can't help wondering if the two texturing instructions (one being dependent) are somehow not having their latency fully hidden. Instruction 0 is TEX, used in instruction 4 and the other TEX instruction, 197, is dependent upon 196 and used in 198.

Jawed
 
I don't know why you suspect that, but it's not true.
If it's not true, then maybe that provides an alternative mechanism for reduced performance. Batches are issued in pairs to the ALUs and TUs.

So if one batch pairing is only half populated, then that's theoretically 5/6th performance. Instead of 2.2x scaling from the Fire shader, it would amount to 1.8x scaling. That's kinda close to the observed 1.7x scaling in ixbt's tests.

Jawed
 
Back
Top