37-gpudrivenrendering: Fixed GL shaders.
This commit is contained in:
parent
e474666a55
commit
90aadf835b
@ -14,29 +14,33 @@ uniform vec4 u_inputRTSize;
|
||||
NUM_THREADS(16, 16, 1)
|
||||
void main()
|
||||
{
|
||||
//this shader can be used to both copy a mip over to the output and downscale it.
|
||||
// this shader can be used to both copy a mip over to the output and downscale it.
|
||||
|
||||
ivec2 coord = gl_GlobalInvocationID.xy;
|
||||
ivec2 coord = ivec2(gl_GlobalInvocationID.xy);
|
||||
|
||||
if (all(coord.xy < u_inputRTSize.xy))
|
||||
if (all(lessThan(coord.xy, u_inputRTSize.xy) ) )
|
||||
{
|
||||
float maxDepth = 1.0;
|
||||
|
||||
if ( u_inputRTSize.z > 1)
|
||||
if (u_inputRTSize.z > 1)
|
||||
{
|
||||
vec4 depths = vec4( imageLoad(s_texOcclusionDepthIn, u_inputRTSize.zw * coord.xy ).r,
|
||||
imageLoad(s_texOcclusionDepthIn, u_inputRTSize.zw * coord.xy + ivec2(1,0) ).r,
|
||||
imageLoad(s_texOcclusionDepthIn, u_inputRTSize.zw * coord.xy + ivec2(0,1)).r,
|
||||
imageLoad(s_texOcclusionDepthIn, u_inputRTSize.zw * coord.xy + ivec2(1,1)).r
|
||||
);
|
||||
vec4 depths = vec4(
|
||||
imageLoad(s_texOcclusionDepthIn, ivec2(u_inputRTSize.zw * coord.xy ) ).x
|
||||
, imageLoad(s_texOcclusionDepthIn, ivec2(u_inputRTSize.zw * coord.xy + ivec2(1.0, 0.0) ) ).x
|
||||
, imageLoad(s_texOcclusionDepthIn, ivec2(u_inputRTSize.zw * coord.xy + ivec2(0.0, 1.0) ) ).x
|
||||
, imageLoad(s_texOcclusionDepthIn, ivec2(u_inputRTSize.zw * coord.xy + ivec2(1.0, 1.0) ) ).x
|
||||
);
|
||||
|
||||
//find and return max depth
|
||||
maxDepth = max(max(depths.x, depths.y), max(depths.z, depths.w));
|
||||
// find and return max depth
|
||||
maxDepth = max(
|
||||
max(depths.x, depths.y)
|
||||
, max(depths.z, depths.w)
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
//do not downscale, just copy the value over to the output rendertarget
|
||||
maxDepth = imageLoad(s_texOcclusionDepthIn, coord.xy ).r;
|
||||
// do not downscale, just copy the value over to the output rendertarget
|
||||
maxDepth = imageLoad(s_texOcclusionDepthIn, coord.xy).x;
|
||||
}
|
||||
|
||||
imageStore(s_texOcclusionDepthOut, coord, vec4(maxDepth,0,0,1) );
|
||||
|
@ -21,31 +21,32 @@ void main()
|
||||
bool predicate = false;
|
||||
|
||||
//make sure that we are not processing more instances than available
|
||||
if (gl_GlobalInvocationID.x < (int)u_cullingConfig.x)
|
||||
if (gl_GlobalInvocationID.x < uint(u_cullingConfig.x) )
|
||||
{
|
||||
//get the bounding box for this instance
|
||||
vec4 bboxMin = instanceDataIn[2 * gl_GlobalInvocationID.x] ;
|
||||
vec3 bboxMax = instanceDataIn[2 * gl_GlobalInvocationID.x + 1].xyz;
|
||||
|
||||
int drawcallID = bboxMin.w;
|
||||
int drawcallID = int(bboxMin.w);
|
||||
|
||||
//Adapted from http://blog.selfshadow.com/publications/practical-visibility/
|
||||
vec3 bboxSize = bboxMax.xyz - bboxMin.xyz;
|
||||
|
||||
vec3 boxCorners[] = { bboxMin.xyz,
|
||||
bboxMin.xyz + vec3(bboxSize.x,0,0),
|
||||
bboxMin.xyz + vec3(0, bboxSize.y,0),
|
||||
bboxMin.xyz + vec3(0, 0, bboxSize.z),
|
||||
bboxMin.xyz + vec3(bboxSize.xy,0),
|
||||
bboxMin.xyz + vec3(0, bboxSize.yz),
|
||||
bboxMin.xyz + vec3(bboxSize.x, 0, bboxSize.z),
|
||||
bboxMin.xyz + bboxSize.xyz
|
||||
};
|
||||
float minZ = 1;
|
||||
vec2 minXY = vec2(1,1);
|
||||
vec2 maxXY = vec2(0,0);
|
||||
vec3 boxCorners[] = {
|
||||
bboxMin.xyz,
|
||||
bboxMin.xyz + vec3(bboxSize.x,0,0),
|
||||
bboxMin.xyz + vec3(0, bboxSize.y,0),
|
||||
bboxMin.xyz + vec3(0, 0, bboxSize.z),
|
||||
bboxMin.xyz + vec3(bboxSize.xy,0),
|
||||
bboxMin.xyz + vec3(0, bboxSize.yz),
|
||||
bboxMin.xyz + vec3(bboxSize.x, 0, bboxSize.z),
|
||||
bboxMin.xyz + bboxSize.xyz
|
||||
};
|
||||
float minZ = 1.0;
|
||||
vec2 minXY = vec2(1.0, 1.0);
|
||||
vec2 maxXY = vec2(0.0, 0.0);
|
||||
|
||||
[unroll]
|
||||
UNROLL
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
//transform World space aaBox to NDC
|
||||
@ -67,14 +68,14 @@ void main()
|
||||
vec4 boxUVs = vec4(minXY, maxXY);
|
||||
|
||||
// Calculate hi-Z buffer mip
|
||||
ivec2 size = (maxXY - minXY) * u_inputRTSize.xy;
|
||||
ivec2 size = ivec2( (maxXY - minXY) * u_inputRTSize.xy);
|
||||
float mip = ceil(log2(max(size.x, size.y)));
|
||||
|
||||
mip = clamp(mip, 0, u_cullingConfig.z);
|
||||
|
||||
// Texel footprint for the lower (finer-grained) level
|
||||
float level_lower = max(mip - 1, 0);
|
||||
vec2 scale = exp2(-level_lower);
|
||||
float level_lower = max(mip - 1, 0);
|
||||
vec2 scale = vec2_splat(exp2(-level_lower) );
|
||||
vec2 a = floor(boxUVs.xy*scale);
|
||||
vec2 b = ceil(boxUVs.zw*scale);
|
||||
vec2 dims = b - a;
|
||||
@ -98,7 +99,7 @@ void main()
|
||||
predicate = true;
|
||||
|
||||
//increase instance count for this particular prop type
|
||||
InterlockedAdd( drawcallInstanceCount[ drawcallID ], 1);
|
||||
atomicAdd(drawcallInstanceCount[ drawcallID ], 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -23,77 +23,80 @@ BUFFER_WR(instanceDataOut, vec4, 5);
|
||||
uniform vec4 u_cullingConfig;
|
||||
|
||||
// Based on Parallel Prefix Sum (Scan) with CUDA by Mark Harris
|
||||
groupshared uint temp[2048];
|
||||
SHARED uint temp[2048];
|
||||
|
||||
NUM_THREADS(1024, 1, 1)
|
||||
void main()
|
||||
{
|
||||
int tID = gl_GlobalInvocationID.x;
|
||||
int NoofInstancesPowOf2 = u_cullingConfig.y;
|
||||
int NoofDrawcalls = u_cullingConfig.w;
|
||||
uint tID = gl_GlobalInvocationID.x;
|
||||
int NoofInstancesPowOf2 = int(u_cullingConfig.y);
|
||||
int NoofDrawcalls = int(u_cullingConfig.w);
|
||||
|
||||
int offset = 1;
|
||||
temp[2 * tID] = instancePredicates[2 * tID]; // load input into shared memory
|
||||
temp[2 * tID + 1] = instancePredicates[2 * tID + 1];
|
||||
temp[2 * tID ] = uint(instancePredicates[2 * tID ]); // load input into shared memory
|
||||
temp[2 * tID + 1] = uint(instancePredicates[2 * tID + 1]);
|
||||
|
||||
int d;
|
||||
|
||||
//perform reduction
|
||||
for (d = NoofInstancesPowOf2 >> 1; d > 0; d >>= 1)
|
||||
{
|
||||
GroupMemoryBarrierWithGroupSync();
|
||||
barrier();
|
||||
|
||||
if (tID < d)
|
||||
{
|
||||
int ai = offset * (2 * tID + 1) - 1;
|
||||
int bi = offset * (2 * tID + 2) - 1;
|
||||
int ai = int(offset * (2 * tID + 1) - 1);
|
||||
int bi = int(offset * (2 * tID + 2) - 1);
|
||||
temp[bi] += temp[ai];
|
||||
}
|
||||
|
||||
offset *= 2;
|
||||
}
|
||||
|
||||
// clear the last element
|
||||
if (tID == 0)
|
||||
{
|
||||
temp[NoofInstancesPowOf2 - 1] = 0;
|
||||
}
|
||||
|
||||
//perform downsweep and build scan
|
||||
// perform downsweep and build scan
|
||||
for ( d = 1; d < NoofInstancesPowOf2; d *= 2)
|
||||
{
|
||||
offset >>= 1;
|
||||
|
||||
GroupMemoryBarrierWithGroupSync();
|
||||
barrier();
|
||||
|
||||
if (tID < d)
|
||||
{
|
||||
int ai = offset * (2 * tID + 1) - 1;
|
||||
int bi = offset * (2 * tID + 2) - 1;
|
||||
int t = temp[ai];
|
||||
int ai = int(offset * (2 * tID + 1) - 1);
|
||||
int bi = int(offset * (2 * tID + 2) - 1);
|
||||
int t = int(temp[ai]);
|
||||
temp[ai] = temp[bi];
|
||||
temp[bi] += t;
|
||||
}
|
||||
}
|
||||
|
||||
GroupMemoryBarrierWithGroupSync();
|
||||
barrier();
|
||||
|
||||
int index = 2 * tID;
|
||||
int index = int(2 * tID);
|
||||
|
||||
//scatter results
|
||||
if (instancePredicates[index] != 0)
|
||||
// scatter results
|
||||
if (instancePredicates[index])
|
||||
{
|
||||
instanceDataOut[ 4 * temp[index] ] = instanceDataIn[ 4 * index ];
|
||||
instanceDataOut[ 4 * temp[index] + 1 ] = instanceDataIn[ 4 * index + 1 ];
|
||||
instanceDataOut[ 4 * temp[index] + 2 ] = instanceDataIn[ 4 * index + 2 ];
|
||||
instanceDataOut[ 4 * temp[index] + 3 ] = instanceDataIn[ 4 * index + 3 ];
|
||||
instanceDataOut[4 * temp[index] ] = instanceDataIn[4 * index ];
|
||||
instanceDataOut[4 * temp[index] + 1] = instanceDataIn[4 * index + 1];
|
||||
instanceDataOut[4 * temp[index] + 2] = instanceDataIn[4 * index + 2];
|
||||
instanceDataOut[4 * temp[index] + 3] = instanceDataIn[4 * index + 3];
|
||||
}
|
||||
|
||||
index = 2 * tID + 1;
|
||||
index = int(2 * tID + 1);
|
||||
|
||||
if (instancePredicates[index] != 0)
|
||||
if (instancePredicates[index])
|
||||
{
|
||||
instanceDataOut[ 4 * temp[index] ] = instanceDataIn[ 4 * index ];
|
||||
instanceDataOut[ 4 * temp[index] + 1 ] = instanceDataIn[ 4 * index + 1 ];
|
||||
instanceDataOut[ 4 * temp[index] + 2 ] = instanceDataIn[ 4 * index + 2 ];
|
||||
instanceDataOut[ 4 * temp[index] + 3 ] = instanceDataIn[ 4 * index + 3 ];
|
||||
instanceDataOut[4 * temp[index] ] = instanceDataIn[4 * index ];
|
||||
instanceDataOut[4 * temp[index] + 1] = instanceDataIn[4 * index + 1];
|
||||
instanceDataOut[4 * temp[index] + 2] = instanceDataIn[4 * index + 2];
|
||||
instanceDataOut[4 * temp[index] + 3] = instanceDataIn[4 * index + 3];
|
||||
}
|
||||
|
||||
if (tID == 0)
|
||||
|
@ -11,12 +11,12 @@ uniform vec4 u_colour[50];
|
||||
|
||||
void main()
|
||||
{
|
||||
vec4 colour = u_colour[v_materialID.x];
|
||||
vec4 colour = u_colour[uint(v_materialID)];
|
||||
|
||||
if ( colour.w < 1.0f )
|
||||
{
|
||||
//render dithered alpha
|
||||
if ( (gl_FragCoord.x % 2) == (gl_FragCoord.y % 2) )
|
||||
if ( (int(gl_FragCoord.x) % 2) == (int(gl_FragCoord.y) % 2) )
|
||||
discard;
|
||||
}
|
||||
|
||||
|
@ -418,6 +418,11 @@ public:
|
||||
// Enable debug text.
|
||||
bgfx::setDebug(m_debug);
|
||||
|
||||
//create uniforms
|
||||
u_inputRTSize = bgfx::createUniform("u_inputRTSize", bgfx::UniformType::Vec4);
|
||||
u_cullingConfig = bgfx::createUniform("u_cullingConfig", bgfx::UniformType::Vec4);
|
||||
u_colour = bgfx::createUniform("u_colour", bgfx::UniformType::Vec4);
|
||||
|
||||
//create props
|
||||
{
|
||||
m_totalInstancesCount = 0;
|
||||
@ -769,11 +774,6 @@ public:
|
||||
//create samplers
|
||||
s_texOcclusionDepthIn = bgfx::createUniform("s_texOcclusionDepthIn", bgfx::UniformType::Int1);
|
||||
|
||||
//create uniforms
|
||||
u_inputRTSize = bgfx::createUniform("u_inputRTSize", bgfx::UniformType::Vec4);
|
||||
u_cullingConfig = bgfx::createUniform("u_cullingConfig", bgfx::UniformType::Vec4);
|
||||
u_colour = bgfx::createUniform("u_colour", bgfx::UniformType::Vec4);
|
||||
|
||||
m_timeOffset = bx::getHPCounter();
|
||||
|
||||
m_useIndirect = true;
|
||||
|
@ -1,7 +1,9 @@
|
||||
uint v_materialID : TEXCOORD0;
|
||||
|
||||
vec3 a_position : POSITION;
|
||||
vec2 a_texcoord0 : TEXCOORD0;
|
||||
vec4 i_data0 : TEXCOORD7;
|
||||
vec4 i_data1 : TEXCOORD6;
|
||||
vec4 i_data2 : TEXCOORD5;
|
||||
vec4 i_data3 : TEXCOORD4;
|
||||
|
||||
vec2 v_texcoord0 : TEXCOORD0;
|
||||
float v_materialID : TEXCOORD0;
|
||||
|
@ -1,4 +0,0 @@
|
||||
vec2 v_texcoord0 : TEXCOORD0;
|
||||
|
||||
vec3 a_position : POSITION;
|
||||
vec2 a_texcoord0 : TEXCOORD0;
|
@ -464,6 +464,7 @@ or _OPTIONS["with-combined-examples"] then
|
||||
, "34-mvs"
|
||||
, "35-dynamic"
|
||||
, "36-sky"
|
||||
, "37-gpudrivenrendering"
|
||||
)
|
||||
|
||||
-- C99 source doesn't compile under WinRT settings
|
||||
|
@ -251,39 +251,16 @@ __IMAGE_IMPL_A(r32ui, x, uvec4, xxxx)
|
||||
__IMAGE_IMPL_A(rg32ui, xy, uvec4, xyyy)
|
||||
__IMAGE_IMPL_A(rgba32ui, xyzw, uvec4, xyzw)
|
||||
|
||||
#define __ATOMIC_IMPL_TYPE(_genType, _glFunc, _dxFunc) \
|
||||
_genType _glFunc(inout _genType _mem, _genType _data) \
|
||||
{ \
|
||||
_genType result; \
|
||||
_dxFunc(_mem, _data, result); \
|
||||
return result; \
|
||||
}
|
||||
#define atomicAdd(_mem, _data) InterlockedAdd(_mem, _data)
|
||||
#define atomicAnd(_mem, _data) InterlockedAnd(_mem, _data)
|
||||
#define atomicExchange(_mem, _data) InterlockedExchange(_mem, _data)
|
||||
#define atomicMax(_mem, _data) InterlockedMax(_mem, _data)
|
||||
#define atomicMin(_mem, _data) InterlockedMin(_mem, _data)
|
||||
#define atomicOr(_mem, _data) InterlockedOr(_mem, _data)
|
||||
#define atomicXor(_mem, _data) InterlockedXor(_mem, _data)
|
||||
|
||||
#define __ATOMIC_IMPL(_glFunc, _dxFunc) \
|
||||
__ATOMIC_IMPL_TYPE(int, _glFunc, _dxFunc) \
|
||||
__ATOMIC_IMPL_TYPE(uint, _glFunc, _dxFunc)
|
||||
|
||||
__ATOMIC_IMPL(atomicAdd, InterlockedAdd);
|
||||
__ATOMIC_IMPL(atomicAnd, InterlockedAnd);
|
||||
__ATOMIC_IMPL(atomicExchange, InterlockedExchange);
|
||||
__ATOMIC_IMPL(atomicMax, InterlockedMax);
|
||||
__ATOMIC_IMPL(atomicMin, InterlockedMin);
|
||||
__ATOMIC_IMPL(atomicOr, InterlockedOr);
|
||||
__ATOMIC_IMPL(atomicXor, InterlockedXor);
|
||||
|
||||
int atomicCompSwap(inout int _mem, int _compare, int _data)
|
||||
{
|
||||
int result;
|
||||
InterlockedCompareExchange(_mem, _compare, _data, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
uint atomicCompSwap(inout uint _mem, uint _compare, uint _data)
|
||||
{
|
||||
uint result;
|
||||
InterlockedCompareExchange(_mem, _compare, _data, result);
|
||||
return result;
|
||||
}
|
||||
#define atomicCompSwap(_mem, _compare, _data) \
|
||||
InterlockedCompareExchange(_mem,_compare, _data)
|
||||
|
||||
// InterlockedCompareStore
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user