attilaz 2018-12-14 22:09:37 +01:00
parent d2812b9318
commit ead0e5b0fb
29 changed files with 2995 additions and 0 deletions

examples/39-assao/assao.cpp (new file, 1147 additions)

File diff suppressed because it is too large.

@@ -0,0 +1,103 @@
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#include "bgfx_compute.sh"
#include "uniforms.sh"
IMAGE2D_WR(s_target, r8, 0);
SAMPLER2DARRAY(s_finalSSAO, 1);
// unpacking for edges; 2 bits per edge mean 4 gradient values (0, 0.33, 0.66, 1) for smoother transitions!
vec4 UnpackEdges( float _packedVal )
{
uint packedVal = uint(_packedVal * 255.5);
vec4 edgesLRTB;
edgesLRTB.x = float((packedVal >> 6) & 0x03) / 3.0; // there's really no need for the mask (it's an 8-bit input) but I'll leave it in so it doesn't cause any trouble in the future
edgesLRTB.y = float((packedVal >> 4) & 0x03) / 3.0;
edgesLRTB.z = float((packedVal >> 2) & 0x03) / 3.0;
edgesLRTB.w = float((packedVal >> 0) & 0x03) / 3.0;
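// adding u_invSharpness biases the unpacked edge values towards 1.0 (no edge), so a lower blur sharpness lets the final blur bleed further across detected edges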
return saturate( edgesLRTB + u_invSharpness );
}
NUM_THREADS(8, 8, 1)
void main()
{
uvec2 dtID = uvec2(gl_GlobalInvocationID.xy) + uvec2(u_rect.xy);
if (all(lessThan(dtID.xy, u_rect.zw) ) )
{
float ao;
uvec2 pixPos = uvec2(dtID.xy);
uvec2 pixPosHalf = pixPos / uvec2(2, 2);
// calculate index in the four deinterleaved source array texture
int mx = (int(pixPos.x) % 2);
#if BGFX_SHADER_LANGUAGE_GLSL
int dimy = imageSize(s_target).y;
int my = (int(dimy-1-pixPos.y) % 2);
#else
int my = (int(pixPos.y) % 2);
#endif
int ic = mx + my * 2; // center index
int ih = (1-mx) + my * 2; // neighbouring, horizontal
int iv = mx + (1-my) * 2; // neighbouring, vertical
int id = (1-mx) + (1-my)*2; // diagonal
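// ic is the deinterleaved half-res layer that contains this full-res pixel; ih/iv/id are the layers holding its horizontal, vertical and diagonal neighbours, all fetched around pixPosHalf below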
vec2 centerVal = texelFetch(s_finalSSAO, ivec3(pixPosHalf, ic), 0 ).xy;
ao = centerVal.x;
#if 1 // change to 0 if you want to disable last pass high-res blur (for debugging purposes, etc.)
vec4 edgesLRTB = UnpackEdges( centerVal.y );
// return 1.0 - vec4( edgesLRTB.x, edgesLRTB.y * 0.5 + edgesLRTB.w * 0.5, edgesLRTB.z, 0.0 ); // debug show edges
// convert index shifts to sampling offsets
float fmx = float(mx);
float fmy = float(my);
// in case of an edge, push sampling offsets away from the edge (towards pixel center)
float fmxe = (edgesLRTB.y - edgesLRTB.x);
float fmye = (edgesLRTB.w - edgesLRTB.z);
// calculate final sampling offsets and sample using bilinear filter
#if BGFX_SHADER_LANGUAGE_GLSL
vec2 uvH = (dtID.xy + vec2( fmx + fmxe - 0.5, 1.0 - (0.5 - fmy) ) ) * 0.5 * u_halfViewportPixelSize;
#else
vec2 uvH = (dtID.xy + vec2( fmx + fmxe - 0.5, 0.5 - fmy ) ) * 0.5 * u_halfViewportPixelSize;
#endif
float aoH = texture2DArrayLod(s_finalSSAO, vec3( uvH, ih ), 0 ).x;
#if BGFX_SHADER_LANGUAGE_GLSL
vec2 uvV = (dtID.xy + vec2( 0.5 - fmx, 1.0 - (fmy - 0.5 + fmye) ) ) * 0.5 * u_halfViewportPixelSize;
#else
vec2 uvV = (dtID.xy + vec2( 0.5 - fmx, fmy - 0.5 + fmye ) ) * 0.5 * u_halfViewportPixelSize;
#endif
float aoV = texture2DArrayLod(s_finalSSAO, vec3( uvV, iv ), 0 ).x;
#if BGFX_SHADER_LANGUAGE_GLSL
vec2 uvD = (dtID.xy + vec2( fmx - 0.5 + fmxe, 1.0 - (fmy - 0.5 + fmye) ) ) * 0.5 * u_halfViewportPixelSize;
#else
vec2 uvD = (dtID.xy + vec2( fmx - 0.5 + fmxe, fmy - 0.5 + fmye ) ) * 0.5 * u_halfViewportPixelSize;
#endif
float aoD = texture2DArrayLod(s_finalSSAO, vec3( uvD, id ), 0 ).x;
// reduce weight for samples near edge - if the edge is on both sides, weight goes to 0
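// .x = centre sample, .y = horizontal neighbour (gated by the left/right edge values), .z = vertical neighbour (top/bottom edges), .w = diagonal neighbour (average of the two)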
vec4 blendWeights;
blendWeights.x = 1.0;
blendWeights.y = (edgesLRTB.x + edgesLRTB.y) * 0.5;
blendWeights.z = (edgesLRTB.z + edgesLRTB.w) * 0.5;
blendWeights.w = (blendWeights.y + blendWeights.z) * 0.5;
// calculate weighted average
float blendWeightsSum = dot( blendWeights, vec4( 1.0, 1.0, 1.0, 1.0 ) );
ao = dot( vec4( ao, aoH, aoV, aoD ), blendWeights ) / blendWeightsSum;
#endif
ao = pow(ao,1.0/2.2);
imageStore(s_target, ivec2(dtID.xy), ao.xxxx);
}
}

@@ -0,0 +1,50 @@
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#include "bgfx_compute.sh"
#include "uniforms.sh"
IMAGE2D_WR(s_target, r8, 0);
SAMPLER2DARRAY(s_finalSSAO, 1);
NUM_THREADS(8, 8, 1)
void main()
{
uvec2 dtID = uvec2(gl_GlobalInvocationID.xy);
uvec2 dim = imageSize(s_target).xy;
if (all(lessThan(dtID.xy, dim) ) )
{
uvec2 basePos = uvec2(dtID.xy) * 2;
vec2 baseUV = (vec2(basePos) + vec2( 0.5, 0.5 ) ) * u_halfViewportPixelSize;
vec2 gatherUV = (vec2(basePos) + vec2( 1.0, 1.0 ) ) * u_halfViewportPixelSize;
float avg = 0.0;
float minV = 1.0;
float maxV = 0.0;
UNROLL
for( int i = 0; i < 4; i++ )
{
vec4 vals = textureGather(s_finalSSAO, vec3( gatherUV, i ) );
// apply the same modifications that would have been applied in the main shader
vals = u_effectShadowStrength * vals;
vals = 1-vals;
vals = pow( saturate( vals ), u_effectShadowPow.xxxx );
avg += dot( vec4( vals.x, vals.y, vals.z, vals.w ), vec4( 1.0 / 16.0, 1.0 / 16.0, 1.0 / 16.0, 1.0 / 16.0 ) );
maxV = max( maxV, max( max( vals.x, vals.y ), max( vals.z, vals.w ) ) );
minV = min( minV, min( min( vals.x, vals.y ), min( vals.z, vals.w ) ) );
}
float minMaxDiff = maxV - minV;
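// importance is driven by local AO contrast: a large min/max spread marks areas that benefit from extra adaptive samples, while flat areas get few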
imageStore(s_target, ivec2(dtID.xy), pow( saturate( minMaxDiff * 2.0 ), 0.8 ).xxxx);
}
}

@@ -0,0 +1,520 @@
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#include "bgfx_compute.sh"
#include "uniforms.sh"
// progressive poisson-like pattern; x, y are in [-1, 1] range, .z is length( vec2(x,y) ), .w is log2( z )
#define INTELSSAO_MAIN_DISK_SAMPLE_COUNT (32)
CONST(vec4 g_samplePatternMain[INTELSSAO_MAIN_DISK_SAMPLE_COUNT]) =
{
{ 0.78488064, 0.56661671, 1.500000, -0.126083}, { 0.26022232, -0.29575172, 1.500000, -1.064030}, { 0.10459357, 0.08372527, 1.110000, -2.730563}, {-0.68286800, 0.04963045, 1.090000, -0.498827},
{-0.13570161, -0.64190155, 1.250000, -0.532765}, {-0.26193795, -0.08205118, 0.670000, -1.783245}, {-0.61177456, 0.66664219, 0.710000, -0.044234}, { 0.43675563, 0.25119025, 0.610000, -1.167283},
{ 0.07884444, 0.86618668, 0.640000, -0.459002}, {-0.12790935, -0.29869005, 0.600000, -1.729424}, {-0.04031125, 0.02413622, 0.600000, -4.792042}, { 0.16201244, -0.52851415, 0.790000, -1.067055},
{-0.70991218, 0.47301072, 0.640000, -0.335236}, { 0.03277707, -0.22349690, 0.600000, -1.982384}, { 0.68921727, 0.36800742, 0.630000, -0.266718}, { 0.29251814, 0.37775412, 0.610000, -1.422520},
{-0.12224089, 0.96582592, 0.600000, -0.426142}, { 0.11071457, -0.16131058, 0.600000, -2.165947}, { 0.46562141, -0.59747696, 0.600000, -0.189760}, {-0.51548797, 0.11804193, 0.600000, -1.246800},
{ 0.89141309, -0.42090443, 0.600000, 0.028192}, {-0.32402530, -0.01591529, 0.600000, -1.543018}, { 0.60771245, 0.41635221, 0.600000, -0.605411}, { 0.02379565, -0.08239821, 0.600000, -3.809046},
{ 0.48951152, -0.23657045, 0.600000, -1.189011}, {-0.17611565, -0.81696892, 0.600000, -0.513724}, {-0.33930185, -0.20732205, 0.600000, -1.698047}, {-0.91974425, 0.05403209, 0.600000, 0.062246},
{-0.15064627, -0.14949332, 0.600000, -1.896062}, { 0.53180975, -0.35210401, 0.600000, -0.758838}, { 0.41487166, 0.81442589, 0.600000, -0.505648}, {-0.24106961, -0.32721516, 0.600000, -1.665244}
};
// these values can be changed (up to SSAO_MAX_TAPS) with no changes required elsewhere; values for the 4th and 5th presets are ignored but the array is needed to avoid compilation errors
// the actual number of texture samples is two times this value (each "tap" has two symmetrical depth texture samples)
CONST(uint g_numTaps[5]) = { 3, 5, 12, 0, 0 };
// an example of higher quality low/medium/high settings
// CONST(uint g_numTaps[5]) = { 4, 9, 16, 0, 0 };
// ** WARNING ** if changing anything here, please remember to update the corresponding C++ code!
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Optional parts that can be enabled for a required quality preset level and above (0 == Low, 1 == Medium, 2 == High, 3 == Highest/Adaptive, 4 == reference/unused )
// Each has its own cost. To disable just set to 5 or above.
//
// (experimental) tilts the disk (although only half of the samples!) towards surface normal; this helps with effect uniformity between objects but reduces effect distance and has other side-effects
#define SSAO_TILT_SAMPLES_ENABLE_AT_QUALITY_PRESET (99) // to disable simply set to 99 or similar
#define SSAO_TILT_SAMPLES_AMOUNT (0.4)
//
#define SSAO_HALOING_REDUCTION_ENABLE_AT_QUALITY_PRESET (1) // to disable simply set to 99 or similar
#define SSAO_HALOING_REDUCTION_AMOUNT (0.6) // values from 0.0 - 1.0, 1.0 means max weighting (will cause artifacts, 0.8 is more reasonable)
//
#define SSAO_NORMAL_BASED_EDGES_ENABLE_AT_QUALITY_PRESET (2) // to disable simply set to 99 or similar
#define SSAO_NORMAL_BASED_EDGES_DOT_THRESHOLD (0.5) // use 0-0.1 for super-sharp normal-based edges
//
#define SSAO_DETAIL_AO_ENABLE_AT_QUALITY_PRESET (1) // whether to use DetailAOStrength; to disable simply set to 99 or similar
//
#define SSAO_DEPTH_MIPS_ENABLE_AT_QUALITY_PRESET (2) // !!warning!! the MIP generation on the C++ side will be enabled on quality preset 2 regardless of this value, so if changing here, change the C++ side too
#define SSAO_DEPTH_MIPS_GLOBAL_OFFSET (-4.3) // best noise/quality/performance tradeoff, found empirically
//
// !!warning!! the edge handling is hard-coded to 'disabled' on quality level 0, and enabled above, on the C++ side; while toggling it here will work for
// testing purposes, it will not yield performance gains (or correct results)
#define SSAO_DEPTH_BASED_EDGES_ENABLE_AT_QUALITY_PRESET (1)
//
#define SSAO_REDUCE_RADIUS_NEAR_SCREEN_BORDER_ENABLE_AT_QUALITY_PRESET (99) // 99 means disabled; only helpful if artifacts at the edges caused by lack of out of screen depth data are not acceptable with the depth sampler in either clamp or mirror modes
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
SAMPLER2D(s_viewspaceDepthSource, 0);
SAMPLER2D(s_viewspaceDepthSourceMirror, 1);
IMAGE2D_RO(s_normalmapSource, rgba8, 2);
UIMAGE2D_RO(s_loadCounter, r32ui, 3);
SAMPLER2D(s_importanceMap, 4);
IMAGE2D_ARRAY_RO(s_baseSSAO, rg8, 5);
IMAGE2D_ARRAY_WR(s_target, rg8, 6);
// packing/unpacking for edges; 2 bits per edge mean 4 gradient values (0, 0.33, 0.66, 1) for smoother transitions!
float PackEdges( vec4 edgesLRTB )
{
// ivec4 edgesLRTBi = ivec4( saturate( edgesLRTB ) * 3.0 + 0.5 );
// return ( (edgesLRTBi.x << 6) + (edgesLRTBi.y << 4) + (edgesLRTBi.z << 2) + (edgesLRTBi.w << 0) ) / 255.0;
// optimized, should be same as above
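// e.g. edgesLRTB = (1.0, 2/3, 1/3, 0.0) rounds to (3, 2, 1, 0) and packs to (3*64 + 2*16 + 1*4 + 0) / 255 = 228/255, matching the bit-shift version above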
edgesLRTB = round( saturate( edgesLRTB ) * 3.05 );
return dot( edgesLRTB, vec4( 64.0 / 255.0, 16.0 / 255.0, 4.0 / 255.0, 1.0 / 255.0 ) ) ;
}
vec3 NDCToViewspace( vec2 pos, float viewspaceDepth )
{
vec3 ret;
ret.xy = (u_ndcToViewMul * pos.xy + u_ndcToViewAdd) * viewspaceDepth;
ret.z = viewspaceDepth;
return ret;
}
// calculate effect radius and fit our screen sampling pattern inside it
void CalculateRadiusParameters( const float pixCenterLength, const vec2 pixelDirRBViewspaceSizeAtCenterZ, out float pixLookupRadiusMod, out float effectRadius, out float falloffCalcMulSq )
{
effectRadius = u_effectRadius;
// leaving this out for performance reasons: use something similar if radius needs to scale based on distance
//effectRadius *= pow( pixCenterLength, u_radiusDistanceScalingFunctionPow);
// when too close, on-screen sampling disk will grow beyond screen size; limit this to avoid closeup temporal artifacts
const float tooCloseLimitMod = saturate( pixCenterLength * u_effectSamplingRadiusNearLimitRec ) * 0.8 + 0.2;
effectRadius *= tooCloseLimitMod;
// 0.85 is to reduce the radius to allow for more samples on a slope to still stay within influence
pixLookupRadiusMod = (0.85 * effectRadius) / pixelDirRBViewspaceSizeAtCenterZ.x;
// used to calculate falloff (both for AO samples and per-sample weights)
falloffCalcMulSq= -1.0f / (effectRadius*effectRadius);
}
vec4 CalculateEdges( const float centerZ, const float leftZ, const float rightZ, const float topZ, const float bottomZ )
{
// slope-sensitive depth-based edge detection
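// result convention: 1.0 = no edge, 0.0 = edge; the threshold scales with centerZ so the test is depth-relative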
vec4 edgesLRTB = vec4( leftZ, rightZ, topZ, bottomZ ) - centerZ;
vec4 edgesLRTBSlopeAdjusted = edgesLRTB + edgesLRTB.yxwz;
edgesLRTB = min( abs( edgesLRTB ), abs( edgesLRTBSlopeAdjusted ) );
return saturate( ( 1.3 - edgesLRTB / (centerZ * 0.040) ) );
// cheaper version but has artifacts
// edgesLRTB = abs( vec4( leftZ, rightZ, topZ, bottomZ ) - centerZ );
// return saturate( ( 1.3 - edgesLRTB / (pixZ * 0.06 + 0.1) ) );
}
vec3 DecodeNormal( vec3 encodedNormal )
{
vec3 normal = encodedNormal * u_normalsUnpackMul.xxx + u_normalsUnpackAdd.xxx;
#if SSAO_ENABLE_NORMAL_WORLD_TO_VIEW_CONVERSION
normal = vec3( dot(normal, u_normalsWorldToViewspaceMatrix0.xyz),
dot(normal, u_normalsWorldToViewspaceMatrix1.xyz),
dot(normal, u_normalsWorldToViewspaceMatrix2.xyz));
#endif
// normal = normalize( normal ); // normalize adds around 2.5% cost on High settings but makes little (PSNR 66.7) visual difference when normals are as in the sample (stored in R8G8B8A8_UNORM,
// // decoded in the shader), however it will likely be required if using different encoding/decoding or the inputs are not normalized, etc.
return normal;
}
vec3 LoadNormal( ivec2 pos )
{
vec3 encodedNormal = imageLoad(s_normalmapSource, pos).xyz;
return DecodeNormal( encodedNormal );
}
vec3 LoadNormal( ivec2 pos, ivec2 offset )
{
vec3 encodedNormal = imageLoad(s_normalmapSource, pos + offset ).xyz;
return DecodeNormal( encodedNormal );
}
// all vectors in viewspace
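// NdotD is the cosine of the angle between the surface normal and the direction to the sample; anything below u_effectHorizonAngleThreshold contributes nothing, and falloffMult fades the contribution quadratically to zero at the effect radius (falloffCalcMulSq == -1/radius^2)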
float CalculatePixelObscurance( vec3 pixelNormal, vec3 hitDelta, float falloffCalcMulSq )
{
float lengthSq = dot( hitDelta, hitDelta );
float NdotD = dot(pixelNormal, hitDelta) / sqrt(lengthSq);
float falloffMult = max( 0.0, lengthSq * falloffCalcMulSq + 1.0 );
return max( 0, NdotD - u_effectHorizonAngleThreshold ) * falloffMult;
}
void SSAOTapInner( const int qualityLevel, inout float obscuranceSum, inout float weightSum, const vec2 samplingUV, const float mipLevel, const vec3 pixCenterPos, const vec3 negViewspaceDir,vec3 pixelNormal, const float falloffCalcMulSq, const float weightMod, const int dbgTapIndex)
{
// get depth at sample
float viewspaceSampleZ = texture2DLod(s_viewspaceDepthSource, samplingUV.xy, mipLevel ).x;
// convert to viewspace
vec3 hitPos = NDCToViewspace( samplingUV.xy, viewspaceSampleZ ).xyz;
vec3 hitDelta = hitPos - pixCenterPos;
float obscurance = CalculatePixelObscurance( pixelNormal, hitDelta, falloffCalcMulSq );
float weight = 1.0;
if( qualityLevel >= SSAO_HALOING_REDUCTION_ENABLE_AT_QUALITY_PRESET )
{
//float reduct = max( 0, dot( hitDelta, negViewspaceDir ) );
float reduct = max( 0, -hitDelta.z ); // cheaper, less correct version
reduct = saturate( reduct * u_negRecEffectRadius + 2.0 ); // saturate( 2.0 - reduct / u_effectRadius );
weight = SSAO_HALOING_REDUCTION_AMOUNT * reduct + (1.0 - SSAO_HALOING_REDUCTION_AMOUNT);
}
weight *= weightMod;
obscuranceSum += obscurance * weight;
weightSum += weight;
}
void SSAOTap( const int qualityLevel, inout float obscuranceSum, inout float weightSum, const int tapIndex, const mat2 rotScale, const vec3 pixCenterPos, const vec3 negViewspaceDir, vec3 pixelNormal, const vec2 normalizedScreenPos, const float mipOffset, const float falloffCalcMulSq, float weightMod, vec2 normXY, float normXYLength)
{
vec2 sampleOffset;
float samplePow2Len;
// patterns
{
vec4 newSample = g_samplePatternMain[tapIndex];
sampleOffset = mul( rotScale, newSample.xy );
samplePow2Len = newSample.w; // precalculated, same as: samplePow2Len = log2( length( newSample.xy ) );
weightMod *= newSample.z;
}
// snap to pixel center (more correct obscurance math, avoids artifacts)
sampleOffset = round(sampleOffset);
// calculate MIP based on the sample distance from the centre, similar to the approach described
// in http://graphics.cs.williams.edu/papers/SAOHPG12/.
float mipLevel = ( qualityLevel < SSAO_DEPTH_MIPS_ENABLE_AT_QUALITY_PRESET )?(0):(samplePow2Len + mipOffset);
#if BGFX_SHADER_LANGUAGE_GLSL
sampleOffset.y = -sampleOffset.y;
#endif
vec2 samplingUV = sampleOffset * u_viewport2xPixelSize + normalizedScreenPos;
SSAOTapInner( qualityLevel, obscuranceSum, weightSum, samplingUV, mipLevel, pixCenterPos, negViewspaceDir, pixelNormal, falloffCalcMulSq, weightMod, tapIndex * 2);
// for the second tap, just use the mirrored offset
vec2 sampleOffsetMirroredUV = -sampleOffset;
// tilt the second set of samples so that the disk is effectively rotated by the normal
// effective at removing one set of artifacts, but too expensive for lower quality settings
if( qualityLevel >= SSAO_TILT_SAMPLES_ENABLE_AT_QUALITY_PRESET )
{
float dotNorm = dot( sampleOffsetMirroredUV, normXY );
sampleOffsetMirroredUV -= dotNorm * normXYLength * normXY;
sampleOffsetMirroredUV = round(sampleOffsetMirroredUV);
}
// snap to pixel center (more correct obscurance math, avoids artifacts)
vec2 samplingMirroredUV = sampleOffsetMirroredUV * u_viewport2xPixelSize + normalizedScreenPos;
SSAOTapInner( qualityLevel, obscuranceSum, weightSum, samplingMirroredUV, mipLevel, pixCenterPos, negViewspaceDir, pixelNormal, falloffCalcMulSq, weightMod, tapIndex * 2 + 1);
}
// this function is designed to only work with half/half depth at the moment - there are a couple of hardcoded paths that expect pixel/texel size, so it will not work for full res
void GenerateSSAOShadowsInternal( out float outShadowTerm, out vec4 outEdges, out float outWeight,
const vec2 SVPos, const int qualityLevel, bool adaptiveBase)
{
vec2 SVPosRounded = trunc( SVPos );
uvec2 SVPosui = uvec2( SVPosRounded ); //same as uvec2( SVPos )
const uint numberOfTaps = (adaptiveBase)?(SSAO_ADAPTIVE_TAP_BASE_COUNT) : ( g_numTaps[qualityLevel] );
float pixZ, pixLZ, pixTZ, pixRZ, pixBZ;
#if BGFX_SHADER_LANGUAGE_GLSL
vec4 valuesUL = textureGather(s_viewspaceDepthSourceMirror, SVPosRounded * u_halfViewportPixelSize + vec2(0.0,u_halfViewportPixelSize.y)).wzyx;
vec4 valuesBR = textureGatherOffset(s_viewspaceDepthSourceMirror, SVPosRounded * u_halfViewportPixelSize + vec2(0.0,u_halfViewportPixelSize.y), ivec2( 1, -1 ) ).wzyx;
#else
vec4 valuesUL = textureGather(s_viewspaceDepthSourceMirror, SVPosRounded * u_halfViewportPixelSize );
vec4 valuesBR = textureGatherOffset(s_viewspaceDepthSourceMirror, SVPosRounded * u_halfViewportPixelSize, ivec2( 1, 1 ) );
#endif
// get this pixel's viewspace depth
pixZ = valuesUL.y;
// get left right top bottom neighbouring pixels for edge detection (gets compiled out on qualityLevel == 0)
pixLZ = valuesUL.x;
pixTZ = valuesUL.z;
pixRZ = valuesBR.z;
pixBZ = valuesBR.x;
vec2 normalizedScreenPos = SVPosRounded * u_viewport2xPixelSize + u_viewport2xPixelSize_x_025;
vec3 pixCenterPos = NDCToViewspace( normalizedScreenPos, pixZ );
// Load this pixel's viewspace normal
uvec2 fullResCoord = uvec2(SVPosui * 2 + u_perPassFullResCoordOffset.xy);
vec3 pixelNormal = LoadNormal( ivec2(fullResCoord) );
const vec2 pixelDirRBViewspaceSizeAtCenterZ = pixCenterPos.z * u_ndcToViewMul * u_viewport2xPixelSize; // optimized approximation of: vec2 pixelDirRBViewspaceSizeAtCenterZ = NDCToViewspace( normalizedScreenPos.xy + u_viewportPixelSize.xy, pixCenterPos.z ).xy - pixCenterPos.xy;
float pixLookupRadiusMod;
float falloffCalcMulSq;
// calculate effect radius and fit our screen sampling pattern inside it
float effectViewspaceRadius;
CalculateRadiusParameters( length( pixCenterPos ), pixelDirRBViewspaceSizeAtCenterZ, pixLookupRadiusMod, effectViewspaceRadius, falloffCalcMulSq );
// calculate samples rotation/scaling
mat2 rotScale;
{
// reduce effect radius near the screen edges slightly; ideally, one would render a larger depth buffer (5% on each side) instead
if( !adaptiveBase && (qualityLevel >= SSAO_REDUCE_RADIUS_NEAR_SCREEN_BORDER_ENABLE_AT_QUALITY_PRESET) )
{
float nearScreenBorder = min( min( normalizedScreenPos.x, 1.0 - normalizedScreenPos.x ), min( normalizedScreenPos.y, 1.0 - normalizedScreenPos.y ) );
nearScreenBorder = saturate( 10.0 * nearScreenBorder + 0.6 );
pixLookupRadiusMod *= nearScreenBorder;
}
// load & update pseudo-random rotation matrix
#if BGFX_SHADER_LANGUAGE_GLSL
uint pseudoRandomIndex = uint( (imageSize(s_target).y-1.0-SVPosRounded.y) * 2 + SVPosRounded.x ) % 5;
#else
uint pseudoRandomIndex = uint( SVPosRounded.y * 2 + SVPosRounded.x ) % 5;
#endif
vec4 rs = u_patternRotScaleMatrices( pseudoRandomIndex );
rotScale = mat2( rs.x * pixLookupRadiusMod, rs.y * pixLookupRadiusMod, rs.z * pixLookupRadiusMod, rs.w * pixLookupRadiusMod );
}
// the main obscurance & sample weight storage
float obscuranceSum = 0.0;
float weightSum = 0.0;
// edge mask for between this and left/right/top/bottom neighbour pixels - not used in quality level 0 so initialize to "no edge" (1 is no edge, 0 is edge)
vec4 edgesLRTB = vec4( 1.0, 1.0, 1.0, 1.0 );
// Move center pixel slightly towards camera to avoid imprecision artifacts due to using a 16-bit depth buffer; much smaller offsets are needed when using 32-bit floats
pixCenterPos *= u_depthPrecisionOffsetMod;
if( !adaptiveBase && (qualityLevel >= SSAO_DEPTH_BASED_EDGES_ENABLE_AT_QUALITY_PRESET) )
{
edgesLRTB = CalculateEdges( pixZ, pixLZ, pixRZ, pixTZ, pixBZ );
}
// adds a more high definition sharp effect, which gets blurred out (reuses left/right/top/bottom samples that we used for edge detection)
if( !adaptiveBase && (qualityLevel >= SSAO_DETAIL_AO_ENABLE_AT_QUALITY_PRESET) )
{
// disable in case of quality level 4 (reference)
if( qualityLevel != 4 )
{
//approximate neighbouring pixels positions (actually just deltas or "positions - pixCenterPos" )
vec3 viewspaceDirZNormalized = vec3( pixCenterPos.xy / pixCenterPos.zz, 1.0 );
vec3 pixLDelta = vec3( -pixelDirRBViewspaceSizeAtCenterZ.x, 0.0, 0.0 ) + viewspaceDirZNormalized * (pixLZ - pixCenterPos.z); // very close approximation of: vec3 pixLPos = NDCToViewspace( normalizedScreenPos + vec2( -u_halfViewportPixelSize.x, 0.0 ), pixLZ ).xyz - pixCenterPos.xyz;
vec3 pixRDelta = vec3( +pixelDirRBViewspaceSizeAtCenterZ.x, 0.0, 0.0 ) + viewspaceDirZNormalized * (pixRZ - pixCenterPos.z); // very close approximation of: vec3 pixRPos = NDCToViewspace( normalizedScreenPos + vec2( +u_halfViewportPixelSize.x, 0.0 ), pixRZ ).xyz - pixCenterPos.xyz;
#if BGFX_SHADER_LANGUAGE_GLSL
vec3 pixTDelta = vec3( 0.0, +pixelDirRBViewspaceSizeAtCenterZ.y, 0.0 ) + viewspaceDirZNormalized * (pixTZ - pixCenterPos.z); // very close approximation of: vec3 pixTPos = NDCToViewspace( normalizedScreenPos + vec2( 0.0, -u_halfViewportPixelSize.y ), pixTZ ).xyz - pixCenterPos.xyz;
vec3 pixBDelta = vec3( 0.0, -pixelDirRBViewspaceSizeAtCenterZ.y, 0.0 ) + viewspaceDirZNormalized * (pixBZ - pixCenterPos.z); // very close approximation of: vec3 pixBPos = NDCToViewspace( normalizedScreenPos + vec2( 0.0, +u_halfViewportPixelSize.y ), pixBZ ).xyz - pixCenterPos.xyz;
#else
vec3 pixTDelta = vec3( 0.0, -pixelDirRBViewspaceSizeAtCenterZ.y, 0.0 ) + viewspaceDirZNormalized * (pixTZ - pixCenterPos.z); // very close approximation of: vec3 pixTPos = NDCToViewspace( normalizedScreenPos + vec2( 0.0, -u_halfViewportPixelSize.y ), pixTZ ).xyz - pixCenterPos.xyz;
vec3 pixBDelta = vec3( 0.0, +pixelDirRBViewspaceSizeAtCenterZ.y, 0.0 ) + viewspaceDirZNormalized * (pixBZ - pixCenterPos.z); // very close approximation of: vec3 pixBPos = NDCToViewspace( normalizedScreenPos + vec2( 0.0, +u_halfViewportPixelSize.y ), pixBZ ).xyz - pixCenterPos.xyz;
#endif
const float rangeReductionConst = 4.0f; // this is to avoid various artifacts
const float modifiedFalloffCalcMulSq = rangeReductionConst * falloffCalcMulSq;
vec4 additionalObscurance;
additionalObscurance.x = CalculatePixelObscurance( pixelNormal, pixLDelta, modifiedFalloffCalcMulSq );
additionalObscurance.y = CalculatePixelObscurance( pixelNormal, pixRDelta, modifiedFalloffCalcMulSq );
additionalObscurance.z = CalculatePixelObscurance( pixelNormal, pixTDelta, modifiedFalloffCalcMulSq );
additionalObscurance.w = CalculatePixelObscurance( pixelNormal, pixBDelta, modifiedFalloffCalcMulSq );
obscuranceSum += u_detailAOStrength * dot( additionalObscurance, edgesLRTB );
}
}
// Sharp normals also create edges - but this adds to the cost as well
if( !adaptiveBase && (qualityLevel >= SSAO_NORMAL_BASED_EDGES_ENABLE_AT_QUALITY_PRESET ) )
{
vec3 neighbourNormalL = LoadNormal( ivec2(fullResCoord), ivec2( -2, 0 ) );
vec3 neighbourNormalR = LoadNormal( ivec2(fullResCoord), ivec2( 2, 0 ) );
#if BGFX_SHADER_LANGUAGE_GLSL
vec3 neighbourNormalT = LoadNormal( ivec2(fullResCoord), ivec2( 0, 2 ) );
vec3 neighbourNormalB = LoadNormal( ivec2(fullResCoord), ivec2( 0, -2 ) );
#else
vec3 neighbourNormalT = LoadNormal( ivec2(fullResCoord), ivec2( 0, -2 ) );
vec3 neighbourNormalB = LoadNormal( ivec2(fullResCoord), ivec2( 0, 2 ) );
#endif
const float dotThreshold = SSAO_NORMAL_BASED_EDGES_DOT_THRESHOLD;
vec4 normalEdgesLRTB;
normalEdgesLRTB.x = saturate( (dot( pixelNormal, neighbourNormalL ) + dotThreshold ) );
normalEdgesLRTB.y = saturate( (dot( pixelNormal, neighbourNormalR ) + dotThreshold ) );
normalEdgesLRTB.z = saturate( (dot( pixelNormal, neighbourNormalT ) + dotThreshold ) );
normalEdgesLRTB.w = saturate( (dot( pixelNormal, neighbourNormalB ) + dotThreshold ) );
//#define SSAO_SMOOTHEN_NORMALS // fixes some aliasing artifacts but kills a lot of high detail and adds to the cost - not worth it probably but feel free to play with it
#ifdef SSAO_SMOOTHEN_NORMALS
//neighbourNormalL = LoadNormal( fullResCoord, ivec2( -1, 0 ) );
//neighbourNormalR = LoadNormal( fullResCoord, ivec2( 1, 0 ) );
//neighbourNormalT = LoadNormal( fullResCoord, ivec2( 0, -1 ) );
//neighbourNormalB = LoadNormal( fullResCoord, ivec2( 0, 1 ) );
pixelNormal += neighbourNormalL * edgesLRTB.x + neighbourNormalR * edgesLRTB.y + neighbourNormalT * edgesLRTB.z + neighbourNormalB * edgesLRTB.w;
pixelNormal = normalize( pixelNormal );
#endif
edgesLRTB *= normalEdgesLRTB;
}
const float globalMipOffset = SSAO_DEPTH_MIPS_GLOBAL_OFFSET;
float mipOffset = ( qualityLevel < SSAO_DEPTH_MIPS_ENABLE_AT_QUALITY_PRESET ) ? ( 0 ) : ( log2( pixLookupRadiusMod ) + globalMipOffset );
// Used to tilt the second set of samples so that the disk is effectively rotated by the normal
// effective at removing one set of artifacts, but too expensive for lower quality settings
vec2 normXY = vec2( pixelNormal.x, pixelNormal.y );
float normXYLength = length( normXY );
normXY /= vec2( normXYLength, -normXYLength );
normXYLength *= SSAO_TILT_SAMPLES_AMOUNT;
const vec3 negViewspaceDir = -normalize( pixCenterPos );
// standard, non-adaptive approach
if( (qualityLevel != 3) || adaptiveBase )
{
// [unroll] // <- doesn't seem to help on any platform, although the compilers seem to unroll anyway if const number of tap used!
for( uint i = 0; i < numberOfTaps; i++ )
{
SSAOTap( qualityLevel, obscuranceSum, weightSum, int(i), rotScale, pixCenterPos, negViewspaceDir, pixelNormal, normalizedScreenPos, mipOffset, falloffCalcMulSq, 1.0, normXY, normXYLength);
}
}
else // if( qualityLevel == 3 ) adaptive approach
{
// add new ones if needed
vec2 fullResUV = normalizedScreenPos + u_perPassFullResUVOffset.xy;
float importance = texture2DLod(s_importanceMap, fullResUV, 0.0 ).x;
// this is to normalize SSAO_DETAIL_AO_AMOUNT across all pixels regardless of importance
obscuranceSum *= (SSAO_ADAPTIVE_TAP_BASE_COUNT / float(SSAO_MAX_TAPS)) + (importance * SSAO_ADAPTIVE_TAP_FLEXIBLE_COUNT / float(SSAO_MAX_TAPS));
// load existing base values
vec2 baseValues = imageLoad(s_baseSSAO, ivec3( SVPosui, u_passIndex ) ).xy;
weightSum += baseValues.y * (float(SSAO_ADAPTIVE_TAP_BASE_COUNT) * 4.0);
obscuranceSum += (baseValues.x) * weightSum;
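// baseValues.x is the obscurance term from the adaptive base pass and baseValues.y its weight sum normalized by (SSAO_ADAPTIVE_TAP_BASE_COUNT * 4.0) (see the ASSAO_ADAPTIVE_BASE output in main() below); the two lines above reconstruct the base pass accumulators before the extra adaptive taps are added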
// increase importance around edges
float edgeCount = dot( 1.0-edgesLRTB, vec4( 1.0, 1.0, 1.0, 1.0 ) );
//importance += edgeCount / (float)SSAO_ADAPTIVE_TAP_FLEXIBLE_COUNT;
float avgTotalImportance = float(imageLoad(s_loadCounter,ivec2(0,0)).x) * u_loadCounterAvgDiv;
float importanceLimiter = saturate( u_adaptiveSampleCountLimit / avgTotalImportance );
importance *= importanceLimiter;
float additionalSampleCountFlt = SSAO_ADAPTIVE_TAP_FLEXIBLE_COUNT * importance;
const float blendRange = 3.0; // use 1 to just blend the last one; use a larger number to blend over more samples for a smoother transition
const float blendRangeInv = 1.0 / blendRange;
additionalSampleCountFlt += 0.5;
uint additionalSamples = uint( additionalSampleCountFlt );
uint additionalSamplesTo = min( SSAO_MAX_TAPS, additionalSamples + SSAO_ADAPTIVE_TAP_BASE_COUNT );
// additional manual unroll doesn't help unfortunately
LOOP
for( uint i = SSAO_ADAPTIVE_TAP_BASE_COUNT; i < additionalSamplesTo; i++ )
{
additionalSampleCountFlt -= 1.0f;
float weightMod = saturate(additionalSampleCountFlt * blendRangeInv); // slowly blend in the last few samples
SSAOTap( qualityLevel, obscuranceSum, weightSum, int(i), rotScale, pixCenterPos, negViewspaceDir, pixelNormal, normalizedScreenPos, mipOffset, falloffCalcMulSq, weightMod, normXY, normXYLength);
}
}
// early out for adaptive base - just output weight (used for the next pass)
if( adaptiveBase )
{
float obscurance = obscuranceSum / weightSum;
outShadowTerm = obscurance;
outEdges = vec4(0,0,0,0);
outWeight = weightSum;
return;
}
// calculate weighted average
float obscurance = obscuranceSum / weightSum;
// calculate fadeout (1 close, gradient, 0 far)
float fadeOut = saturate( pixCenterPos.z * u_effectFadeOutMul + u_effectFadeOutAdd );
// Reduce the SSAO shadowing if we're on the edge to remove artifacts on edges (we don't care for the lower quality one)
if( !adaptiveBase && (qualityLevel >= SSAO_DEPTH_BASED_EDGES_ENABLE_AT_QUALITY_PRESET) )
{
// float edgeCount = dot( 1.0-edgesLRTB, vec4( 1.0, 1.0, 1.0, 1.0 ) );
// when there are more than 2 opposite edges, start fading out the occlusion to reduce aliasing artifacts
float edgeFadeoutFactor = saturate( (1.0 - edgesLRTB.x - edgesLRTB.y) * 0.35) + saturate( (1.0 - edgesLRTB.z - edgesLRTB.w) * 0.35 );
// (experimental) if you want to reduce the effect next to any edge
// edgeFadeoutFactor += 0.1 * saturate( dot( 1 - edgesLRTB, vec4( 1, 1, 1, 1 ) ) );
fadeOut *= saturate( 1.0 - edgeFadeoutFactor );
}
// same as above, but a much more conservative version
// fadeOut *= saturate( dot( edgesLRTB, vec4( 0.9, 0.9, 0.9, 0.9 ) ) - 2.6 );
// strength
obscurance = u_effectShadowStrength * obscurance;
// clamp
obscurance = min( obscurance, u_effectShadowClamp );
// fadeout
obscurance *= fadeOut;
// conceptually switch from obscurance to an 'occlusion' term that actually encodes visibility (it grows with visibility; a value of 1 means fully visible),
// to be in line with what is more commonly used.
float occlusion = 1.0 - obscurance;
// modify the gradient
// note: this cannot be moved to a later pass because of loss of precision after storing in the render target
occlusion = pow( saturate( occlusion ), u_effectShadowPow );
// outputs!
outShadowTerm = occlusion; // Our final 'occlusion' term (0 means fully occluded, 1 means fully lit)
outEdges = edgesLRTB; // These are used to prevent blurring across edges, 1 means no edge, 0 means edge, 0.5 means half way there, etc.
outWeight = weightSum;
}
NUM_THREADS(8, 8, 1)
void main()
{
uvec2 dtID = uvec2(gl_GlobalInvocationID.xy) + uvec2(u_rect.xy);
if (all(lessThan(dtID.xy, u_rect.zw) ) )
{
float outShadowTerm;
float outWeight;
vec4 outEdges;
GenerateSSAOShadowsInternal( outShadowTerm, outEdges, outWeight, vec2(dtID.xy), ASSAO_QUALITY, ASSAO_ADAPTIVE_BASE);
vec2 out0;
out0.x = outShadowTerm;
if ( ASSAO_ADAPTIVE_BASE )
{
out0.y = outWeight / (float(SSAO_ADAPTIVE_TAP_BASE_COUNT) * 4.0); //0.0; //frac(outWeight / 6.0);// / (float)(SSAO_MAX_TAPS * 4.0);
}
else
{
if (ASSAO_QUALITY == 0)
out0.y = PackEdges( vec4( 1, 1, 1, 1 ) ); // no edges in low quality
else
out0.y = PackEdges( outEdges );
}
imageStore(s_target, ivec3(dtID.xy, u_layer), out0.xyyy);
}
}

@@ -0,0 +1,9 @@
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#define ASSAO_QUALITY 0
#define ASSAO_ADAPTIVE_BASE false
#include "cs_assao_generate_q.sh"

@@ -0,0 +1,9 @@
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#define ASSAO_QUALITY 1
#define ASSAO_ADAPTIVE_BASE false
#include "cs_assao_generate_q.sh"

@@ -0,0 +1,9 @@
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#define ASSAO_QUALITY 2
#define ASSAO_ADAPTIVE_BASE false
#include "cs_assao_generate_q.sh"

@@ -0,0 +1,9 @@
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#define ASSAO_QUALITY 3
#define ASSAO_ADAPTIVE_BASE false
#include "cs_assao_generate_q.sh"

@@ -0,0 +1,9 @@
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#define ASSAO_QUALITY 3
#define ASSAO_ADAPTIVE_BASE true
#include "cs_assao_generate_q.sh"

@@ -0,0 +1,15 @@
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#include "bgfx_compute.sh"
#include "uniforms.sh"
UIMAGE2D_WR(s_loadCounterOutputUAV, r32ui, 0);
NUM_THREADS(1, 1, 1)
void main()
{
imageStore(s_loadCounterOutputUAV, ivec2(0, 0), uvec4(0,0,0,0));
}

@@ -0,0 +1,29 @@
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#include "bgfx_compute.sh"
#include "uniforms.sh"
IMAGE2D_WR(s_target, r8, 0);
SAMPLER2DARRAY(s_finalSSAO, 1);
// edge-ignorant blur & apply (for the lowest quality level 0)
NUM_THREADS(8, 8, 1)
void main()
{
uvec2 dtID = uvec2(gl_GlobalInvocationID.xy) + uvec2(u_rect.xy);
if (all(lessThan(dtID.xy, u_rect.zw) ) )
{
vec2 inUV = (dtID.xy+vec2(0.5,0.5)) * u_viewportPixelSize;
float a = texture2DArrayLod(s_finalSSAO, vec3( inUV.xy, 0 ), 0.0 ).x;
float b = texture2DArrayLod(s_finalSSAO, vec3( inUV.xy, 1 ), 0.0 ).x;
float c = texture2DArrayLod(s_finalSSAO, vec3( inUV.xy, 2 ), 0.0 ).x;
float d = texture2DArrayLod(s_finalSSAO, vec3( inUV.xy, 3 ), 0.0 ).x;
float avg = (a+b+c+d) * 0.25;
avg = pow(avg,1.0/2.2);
imageStore(s_target, ivec2(dtID.xy), avg.xxxx);
}
}

@@ -0,0 +1,37 @@
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#include "bgfx_compute.sh"
#include "uniforms.sh"
IMAGE2D_ARRAY_WR(s_target, rg8, 0);
SAMPLER2DARRAY(s_blurInput, 1);
// edge-ignorant blur in x and y directions, 9 pixels touched (for the lowest quality level 0)
NUM_THREADS(8, 8, 1)
void main()
{
uvec2 dtID = uvec2(gl_GlobalInvocationID.xy) + uvec2(u_rect.xy);
if (all(lessThan(dtID.xy, u_rect.zw) ) )
{
vec2 inUV = (dtID.xy+vec2(0.5,0.5)) * u_halfViewportPixelSize;
vec2 halfPixel = u_halfViewportPixelSize * 0.5f;
#if BGFX_SHADER_LANGUAGE_GLSL
halfPixel.y = -halfPixel.y;
#endif
vec2 centre = texture2DArrayLod(s_blurInput, vec3(inUV, 0.0), 0.0 ).xy;
vec4 vals;
vals.x = texture2DArrayLod(s_blurInput, vec3(inUV + vec2( -halfPixel.x * 3, -halfPixel.y ),0.0) , 0.0 ).x;
vals.y = texture2DArrayLod(s_blurInput, vec3(inUV + vec2( +halfPixel.x, -halfPixel.y * 3 ),0.0) , 0.0 ).x;
vals.z = texture2DArrayLod(s_blurInput, vec3(inUV + vec2( -halfPixel.x, +halfPixel.y * 3 ),0.0) , 0.0 ).x;
vals.w = texture2DArrayLod(s_blurInput, vec3(inUV + vec2( +halfPixel.x * 3, +halfPixel.y ),0.0) , 0.0 ).x;
imageStore(s_target, ivec3(dtID.xy,u_layer), vec4(dot( vals, 0.2.xxxx ) + centre.x * 0.2, centre.y, 0.0, 0.0));
}
}

@@ -0,0 +1,26 @@
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#include "bgfx_compute.sh"
#include "uniforms.sh"
IMAGE2D_WR(s_target, r8, 0);
SAMPLER2DARRAY(s_finalSSAO, 1);
// edge-ignorant blur & apply, skipping half pixels in checkerboard pattern (for the Lowest quality level 0 and Settings::SkipHalfPixelsOnLowQualityLevel == true )
NUM_THREADS(8, 8, 1)
void main()
{
uvec2 dtID = uvec2(gl_GlobalInvocationID.xy) + uvec2(u_rect.xy);
if (all(lessThan(dtID.xy, u_rect.zw) ) )
{
vec2 inUV = (dtID.xy+vec2(0.5,0.5)) * u_viewportPixelSize;
float a = texture2DArrayLod(s_finalSSAO, vec3( inUV.xy, 0 ), 0.0 ).x;
float d = texture2DArrayLod(s_finalSSAO, vec3( inUV.xy, 3 ), 0.0 ).x;
float avg = (a+d) * 0.5;
avg = pow(avg,1.0/2.2);
imageStore(s_target, ivec2(dtID.xy), avg.xxxx);
}
}

@@ -0,0 +1,47 @@
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#include "bgfx_compute.sh"
#include "uniforms.sh"
IMAGE2D_WR(s_target, r8, 0);
SAMPLER2D(s_importanceMap, 1);
// Shaders below are only needed for the adaptive quality level
CONST(float cSmoothenImportance) = 1.0;
NUM_THREADS(8, 8, 1)
void main()
{
uvec2 dtID = uvec2(gl_GlobalInvocationID.xy);
uvec2 dim = imageSize(s_target).xy;
if (all(lessThan(dtID.xy, dim) ) )
{
uvec2 pos = uvec2(dtID.xy);
vec2 inUV = (dtID.xy+vec2(0.5,0.5)) * u_quarterResPixelSize;
float centre = texture2DLod(s_importanceMap, inUV, 0.0 ).x;
//return centre;
vec2 halfPixel = u_quarterResPixelSize * 0.5f;
#if BGFX_SHADER_LANGUAGE_GLSL
halfPixel.y = -halfPixel.y;
#endif
vec4 vals;
vals.x = texture2DLod(s_importanceMap, inUV + vec2( -halfPixel.x * 3, -halfPixel.y ), 0.0 ).x;
vals.y = texture2DLod(s_importanceMap, inUV + vec2( +halfPixel.x, -halfPixel.y * 3 ), 0.0 ).x;
vals.z = texture2DLod(s_importanceMap, inUV + vec2( +halfPixel.x * 3, +halfPixel.y ), 0.0 ).x;
vals.w = texture2DLod(s_importanceMap, inUV + vec2( -halfPixel.x, +halfPixel.y * 3 ), 0.0 ).x;
float avgVal = dot( vals, vec4( 0.25, 0.25, 0.25, 0.25 ) );
vals.xy = max( vals.xy, vals.zw );
float maxVal = max( centre, max( vals.x, vals.y ) );
imageStore(s_target, ivec2(dtID.xy), mix( maxVal, avgVal, cSmoothenImportance ).xxxx);
}
}

@@ -0,0 +1,55 @@
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#include "bgfx_compute.sh"
#include "uniforms.sh"
IMAGE2D_WR(s_target, r8, 0);
SAMPLER2D(s_importanceMap, 1);
UIMAGE2D_RW(s_loadCounterOutputUAV, r32ui, 2);
CONST(float cSmoothenImportance) = 1.0;
// Shaders below are only needed for the adaptive quality level
NUM_THREADS(8, 8, 1)
void main()
{
uvec2 dtID = uvec2(gl_GlobalInvocationID.xy);
uvec2 dim = imageSize(s_target).xy;
if (all(lessThan(dtID.xy, dim) ) )
{
vec2 inUV = (dtID.xy+vec2(0.5,0.5)) * u_quarterResPixelSize;
float centre = texture2DLod(s_importanceMap, inUV, 0.0 ).x;
//return centre;
vec2 halfPixel = u_quarterResPixelSize * 0.5f;
vec4 vals;
vals.x = texture2DLod(s_importanceMap, inUV + vec2( -halfPixel.x, -halfPixel.y * 3 ), 0.0 ).x;
vals.y = texture2DLod(s_importanceMap, inUV + vec2( +halfPixel.x * 3, -halfPixel.y ), 0.0 ).x;
vals.z = texture2DLod(s_importanceMap, inUV + vec2( +halfPixel.x, +halfPixel.y * 3 ), 0.0 ).x;
vals.w = texture2DLod(s_importanceMap, inUV + vec2( -halfPixel.x * 3, +halfPixel.y ), 0.0 ).x;
float avgVal = dot( vals, vec4( 0.25, 0.25, 0.25, 0.25 ) );
vals.xy = max( vals.xy, vals.zw );
float maxVal = max( centre, max( vals.x, vals.y ) );
float retVal = mix( maxVal, avgVal, cSmoothenImportance );
// sum the average; to avoid overflowing we assume max AO resolution is not bigger than 16384x16384; so quarter res (used here) will be 4096x4096, which leaves us with 8 bits per pixel
uint sum = uint(saturate(retVal) * 255.0 + 0.5);
// save every 9th to avoid InterlockedAdd congestion - since we're blurring, this is good enough; compensated by multiplying LoadCounterAvgDiv by 9
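// ((x % 3) + (y % 3)) == 0 only holds when both terms are 0, i.e. exactly one pixel per 3x3 block performs the atomic add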
#if BGFX_SHADER_LANGUAGE_GLSL
if( ((dtID.x % 3) + ((dim.y-1-dtID.y) % 3)) == 0 )
#else
if( ((dtID.x % 3) + (dtID.y % 3)) == 0 )
#endif
imageAtomicAdd(s_loadCounterOutputUAV, ivec2(0, 0), sum );
imageStore(s_target, ivec2(dtID.xy), retVal.xxxx);
}
}

@@ -0,0 +1,103 @@
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#include "bgfx_compute.sh"
#include "uniforms.sh"
IMAGE2D_RO(s_viewspaceDepthSource0, r16f, 0);
IMAGE2D_RO(s_viewspaceDepthSource1, r16f, 1);
IMAGE2D_RO(s_viewspaceDepthSource2, r16f, 2);
IMAGE2D_RO(s_viewspaceDepthSource3, r16f, 3);
IMAGE2D_WR(s_target0, r16f, 4);
IMAGE2D_WR(s_target1, r16f, 5);
IMAGE2D_WR(s_target2, r16f, 6);
IMAGE2D_WR(s_target3, r16f, 7);
// calculate effect radius and fit our screen sampling pattern inside it
void CalculateRadiusParameters( const float pixCenterLength, const vec2 pixelDirRBViewspaceSizeAtCenterZ, out float pixLookupRadiusMod, out float effectRadius, out float falloffCalcMulSq )
{
effectRadius = u_effectRadius;
// leaving this out for performance reasons: use something similar if radius needs to scale based on distance
//effectRadius *= pow( pixCenterLength, u_radiusDistanceScalingFunctionPow);
// when too close, on-screen sampling disk will grow beyond screen size; limit this to avoid closeup temporal artifacts
const float tooCloseLimitMod = saturate( pixCenterLength * u_effectSamplingRadiusNearLimitRec ) * 0.8 + 0.2;
effectRadius *= tooCloseLimitMod;
// 0.85 is to reduce the radius to allow for more samples on a slope to still stay within influence
pixLookupRadiusMod = (0.85 * effectRadius) / pixelDirRBViewspaceSizeAtCenterZ.x;
// used to calculate falloff (both for AO samples and per-sample weights)
falloffCalcMulSq= -1.0f / (effectRadius*effectRadius);
}
NUM_THREADS(8, 8, 1)
void main()
{
uvec2 dtID = uvec2(gl_GlobalInvocationID.xy);
uvec2 dim = uvec2(u_rect.zw);
if (all(lessThan(dtID.xy, dim) ) )
{
ivec2 baseCoords = ivec2(dtID.xy) * 2;
vec4 depthsArr[4];
float depthsOutArr[4];
// how to Gather a specific mip level?
depthsArr[0].x = imageLoad(s_viewspaceDepthSource0, baseCoords + ivec2( 0, 0 )).x ;
depthsArr[0].y = imageLoad(s_viewspaceDepthSource0, baseCoords + ivec2( 1, 0 )).x ;
depthsArr[0].z = imageLoad(s_viewspaceDepthSource0, baseCoords + ivec2( 0, 1 )).x ;
depthsArr[0].w = imageLoad(s_viewspaceDepthSource0, baseCoords + ivec2( 1, 1 )).x ;
depthsArr[1].x = imageLoad(s_viewspaceDepthSource1, baseCoords + ivec2( 0, 0 )).x;
depthsArr[1].y = imageLoad(s_viewspaceDepthSource1, baseCoords + ivec2( 1, 0 )).x;
depthsArr[1].z = imageLoad(s_viewspaceDepthSource1, baseCoords + ivec2( 0, 1 )).x;
depthsArr[1].w = imageLoad(s_viewspaceDepthSource1, baseCoords + ivec2( 1, 1 )).x;
depthsArr[2].x = imageLoad(s_viewspaceDepthSource2, baseCoords + ivec2( 0, 0 )).x;
depthsArr[2].y = imageLoad(s_viewspaceDepthSource2, baseCoords + ivec2( 1, 0 )).x;
depthsArr[2].z = imageLoad(s_viewspaceDepthSource2, baseCoords + ivec2( 0, 1 )).x;
depthsArr[2].w = imageLoad(s_viewspaceDepthSource2, baseCoords + ivec2( 1, 1 )).x;
depthsArr[3].x = imageLoad(s_viewspaceDepthSource3, baseCoords + ivec2( 0, 0 )).x;
depthsArr[3].y = imageLoad(s_viewspaceDepthSource3, baseCoords + ivec2( 1, 0 )).x;
depthsArr[3].z = imageLoad(s_viewspaceDepthSource3, baseCoords + ivec2( 0, 1 )).x;
depthsArr[3].w = imageLoad(s_viewspaceDepthSource3, baseCoords + ivec2( 1, 1 )).x;
const uvec2 SVPosui = uvec2( dtID.xy );
const uint pseudoRandomA = (SVPosui.x ) + 2 * (SVPosui.y );
float dummyUnused1;
float dummyUnused2;
float falloffCalcMulSq, falloffCalcAdd;
UNROLL
for( int i = 0; i < 4; i++ )
{
vec4 depths = depthsArr[i];
float closest = min( min( depths.x, depths.y ), min( depths.z, depths.w ) );
CalculateRadiusParameters( abs( closest ), vec2(1.0,1.0), dummyUnused1, dummyUnused2, falloffCalcMulSq );
vec4 dists = depths - closest.xxxx;
vec4 weights = saturate( dists * dists * falloffCalcMulSq + 1.0 );
float smartAvg = dot( weights, depths ) / dot( weights, vec4( 1.0, 1.0, 1.0, 1.0 ) );
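// depth-aware average: taps within the effect radius of the closest depth keep most of their weight while farther taps fall to ~0, so silhouettes aren't averaged away when downsampling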
const uint pseudoRandomIndex = ( pseudoRandomA + i ) % 4;
//depthsOutArr[i] = closest;
//depthsOutArr[i] = depths[ pseudoRandomIndex ];
depthsOutArr[i] = smartAvg;
}
imageStore(s_target0, ivec2(dtID.xy), depthsOutArr[0].xxxx);
imageStore(s_target1, ivec2(dtID.xy), depthsOutArr[1].xxxx);
imageStore(s_target2, ivec2(dtID.xy), depthsOutArr[2].xxxx);
imageStore(s_target3, ivec2(dtID.xy), depthsOutArr[3].xxxx);
}
}

@@ -0,0 +1,58 @@
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#include "bgfx_compute.sh"
#include "uniforms.sh"
SAMPLER2D(s_depthSource, 0);
IMAGE2D_WR(s_target0, r16f, 1);
IMAGE2D_WR(s_target1, r16f, 2);
IMAGE2D_WR(s_target2, r16f, 3);
IMAGE2D_WR(s_target3, r16f, 4);
float ScreenSpaceToViewSpaceDepth( float screenDepth )
{
float depthLinearizeMul = u_depthUnpackConsts.x;
float depthLinearizeAdd = u_depthUnpackConsts.y;
// Optimised version of "-cameraClipNear / (cameraClipFar - projDepth * (cameraClipFar - cameraClipNear)) * cameraClipFar"
// Set your depthLinearizeMul and depthLinearizeAdd to:
// depthLinearizeMul = ( cameraClipFar * cameraClipNear) / ( cameraClipFar - cameraClipNear );
// depthLinearizeAdd = cameraClipFar / ( cameraClipFar - cameraClipNear );
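// sanity check: with the constants set as above (standard, non-reversed projection), screenDepth == 0 yields cameraClipNear and screenDepth == 1 yields cameraClipFar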
return depthLinearizeMul / ( depthLinearizeAdd - screenDepth );
}
NUM_THREADS(8, 8, 1)
void main()
{
uvec2 dtID = uvec2(gl_GlobalInvocationID.xy);
uvec2 dim = imageSize(s_target0).xy;
if (all(lessThan(dtID.xy, dim) ) )
{
ivec2 baseCoord = ivec2(dtID.xy) * 2;
#if BGFX_SHADER_LANGUAGE_GLSL
float a = texelFetch(s_depthSource, baseCoord + ivec2( 0, 1 ), 0).x;
float b = texelFetch(s_depthSource, baseCoord + ivec2( 1, 1 ), 0).x;
float c = texelFetch(s_depthSource, baseCoord + ivec2( 0, 0 ), 0).x;
float d = texelFetch(s_depthSource, baseCoord + ivec2( 1, 0 ), 0).x;
#else
float a = texelFetch(s_depthSource, baseCoord + ivec2( 0, 0 ), 0).x;
float b = texelFetch(s_depthSource, baseCoord + ivec2( 1, 0 ), 0).x;
float c = texelFetch(s_depthSource, baseCoord + ivec2( 0, 1 ), 0).x;
float d = texelFetch(s_depthSource, baseCoord + ivec2( 1, 1 ), 0).x;
#endif
imageStore(s_target0, ivec2(dtID.xy), ScreenSpaceToViewSpaceDepth( a ).xxxx);
imageStore(s_target1, ivec2(dtID.xy), ScreenSpaceToViewSpaceDepth( b ).xxxx);
imageStore(s_target2, ivec2(dtID.xy), ScreenSpaceToViewSpaceDepth( c ).xxxx);
imageStore(s_target3, ivec2(dtID.xy), ScreenSpaceToViewSpaceDepth( d ).xxxx);
}
}

@@ -0,0 +1,192 @@
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#include "bgfx_compute.sh"
#include "uniforms.sh"
SAMPLER2D(s_depthSource, 0);
IMAGE2D_WR(s_target0, r16f, 1);
IMAGE2D_WR(s_target1, r16f, 2);
IMAGE2D_WR(s_target2, r16f, 3);
IMAGE2D_WR(s_target3, r16f, 4);
IMAGE2D_WR(s_normalsOutputUAV, rgba8, 5);
float ScreenSpaceToViewSpaceDepth( float screenDepth )
{
float depthLinearizeMul = u_depthUnpackConsts.x;
float depthLinearizeAdd = u_depthUnpackConsts.y;
// Optimised version of "-cameraClipNear / (cameraClipFar - projDepth * (cameraClipFar - cameraClipNear)) * cameraClipFar"
// Set your depthLinearizeMul and depthLinearizeAdd to:
// depthLinearizeMul = ( cameraClipFar * cameraClipNear) / ( cameraClipFar - cameraClipNear );
// depthLinearizeAdd = cameraClipFar / ( cameraClipFar - cameraClipNear );
return depthLinearizeMul / ( depthLinearizeAdd - screenDepth );
}
vec3 NDCToViewspace( vec2 pos, float viewspaceDepth )
{
vec3 ret;
ret.xy = (u_ndcToViewMul * pos.xy + u_ndcToViewAdd) * viewspaceDepth;
ret.z = viewspaceDepth;
return ret;
}
vec4 CalculateEdges( const float centerZ, const float leftZ, const float rightZ, const float topZ, const float bottomZ )
{
// slope-sensitive depth-based edge detection
vec4 edgesLRTB = vec4( leftZ, rightZ, topZ, bottomZ ) - centerZ;
vec4 edgesLRTBSlopeAdjusted = edgesLRTB + edgesLRTB.yxwz;
edgesLRTB = min( abs( edgesLRTB ), abs( edgesLRTBSlopeAdjusted ) );
return saturate( ( 1.3 - edgesLRTB / (centerZ * 0.040) ) );
// cheaper version but has artifacts
// edgesLRTB = abs( vec4( leftZ, rightZ, topZ, bottomZ ) - centerZ );
// return saturate( ( 1.3 - edgesLRTB / (pixZ * 0.06 + 0.1) ) );
}
vec3 CalculateNormal( const vec4 edgesLRTB, vec3 pixCenterPos, vec3 pixLPos, vec3 pixRPos, vec3 pixTPos, vec3 pixBPos )
{
// Get this pixel's viewspace normal
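// each quadrant contributes the cross product of its two neighbour directions; acceptedNormals multiplies adjacent edge weights so quadrants straddling a depth discontinuity are rejected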
vec4 acceptedNormals = vec4( edgesLRTB.x*edgesLRTB.z, edgesLRTB.z*edgesLRTB.y, edgesLRTB.y*edgesLRTB.w, edgesLRTB.w*edgesLRTB.x );
pixLPos = normalize(pixLPos - pixCenterPos);
pixRPos = normalize(pixRPos - pixCenterPos);
pixTPos = normalize(pixTPos - pixCenterPos);
pixBPos = normalize(pixBPos - pixCenterPos);
vec3 pixelNormal = vec3( 0, 0, -0.0005 );
pixelNormal += ( acceptedNormals.x ) * cross( pixLPos, pixTPos );
pixelNormal += ( acceptedNormals.y ) * cross( pixTPos, pixRPos );
pixelNormal += ( acceptedNormals.z ) * cross( pixRPos, pixBPos );
pixelNormal += ( acceptedNormals.w ) * cross( pixBPos, pixLPos );
pixelNormal = normalize( pixelNormal );
return pixelNormal;
}
NUM_THREADS(8, 8, 1)
void main()
{
uvec2 dtID = uvec2(gl_GlobalInvocationID.xy);
uvec2 dim = imageSize(s_target0).xy;
if (all(lessThan(dtID.xy, dim) ) )
{
ivec2 baseCoords = ivec2(dtID.xy) * 2;
#if BGFX_SHADER_LANGUAGE_GLSL
vec2 upperLeftUV = (vec2(dtID.xy) + vec2(0.25,0.75)) * u_viewport2xPixelSize;
#else
vec2 upperLeftUV = (vec2(dtID.xy) + vec2(0.25,0.25)) * u_viewport2xPixelSize;
#endif
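// upperLeftUV is the centre of the top-left full-res pixel of this 2x2 quad (0.25 of a 2x pixel = half a full-res pixel); the 0.75 in the GLSL path presumably compensates for the flipped vertical axis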
ivec2 baseCoord = ivec2(dtID.xy) * 2;
#if BGFX_SHADER_LANGUAGE_GLSL
float z0 = ScreenSpaceToViewSpaceDepth( texelFetchOffset(s_depthSource, baseCoord, 0, ivec2( 0, 1 ) ).x );
float z1 = ScreenSpaceToViewSpaceDepth( texelFetchOffset(s_depthSource, baseCoord, 0, ivec2( 1, 1 ) ).x );
float z2 = ScreenSpaceToViewSpaceDepth( texelFetchOffset(s_depthSource, baseCoord, 0, ivec2( 0, 0 ) ).x );
float z3 = ScreenSpaceToViewSpaceDepth( texelFetchOffset(s_depthSource, baseCoord, 0, ivec2( 1, 0 ) ).x );
#else
float z0 = ScreenSpaceToViewSpaceDepth( texelFetchOffset(s_depthSource, baseCoord, 0, ivec2( 0, 0 ) ).x );
float z1 = ScreenSpaceToViewSpaceDepth( texelFetchOffset(s_depthSource, baseCoord, 0, ivec2( 1, 0 ) ).x );
float z2 = ScreenSpaceToViewSpaceDepth( texelFetchOffset(s_depthSource, baseCoord, 0, ivec2( 0, 1 ) ).x );
float z3 = ScreenSpaceToViewSpaceDepth( texelFetchOffset(s_depthSource, baseCoord, 0, ivec2( 1, 1 ) ).x );
#endif
imageStore(s_target0, ivec2(dtID.xy), z0.xxxx );
imageStore(s_target1, ivec2(dtID.xy), z1.xxxx );
imageStore(s_target2, ivec2(dtID.xy), z2.xxxx );
imageStore(s_target3, ivec2(dtID.xy), z3.xxxx );
float pixZs[4][4];
// middle 4
pixZs[1][1] = z0;
pixZs[2][1] = z1;
pixZs[1][2] = z2;
pixZs[2][2] = z3;
#if BGFX_SHADER_LANGUAGE_GLSL
// left 2
pixZs[0][1] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( -1, 0 ) ).x );
pixZs[0][2] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( -1, -1 ) ).x );
// right 2
pixZs[3][1] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 2, 0 ) ).x );
pixZs[3][2] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 2, -1 ) ).x );
// top 2
pixZs[1][0] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 0, 1 ) ).x );
pixZs[2][0] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 1, 1 ) ).x );
// bottom 2
pixZs[1][3] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 0, -2 ) ).x );
pixZs[2][3] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 1, -2 ) ).x );
#else
// left 2
pixZs[0][1] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( -1, 0 ) ).x );
pixZs[0][2] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( -1, 1 ) ).x );
// right 2
pixZs[3][1] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 2, 0 ) ).x );
pixZs[3][2] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 2, 1 ) ).x );
// top 2
pixZs[1][0] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 0, -1 ) ).x );
pixZs[2][0] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 1, -1 ) ).x );
// bottom 2
pixZs[1][3] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 0, 2 ) ).x );
pixZs[2][3] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 1, 2 ) ).x );
#endif
vec4 edges0 = CalculateEdges( pixZs[1][1], pixZs[0][1], pixZs[2][1], pixZs[1][0], pixZs[1][2] );
vec4 edges1 = CalculateEdges( pixZs[2][1], pixZs[1][1], pixZs[3][1], pixZs[2][0], pixZs[2][2] );
vec4 edges2 = CalculateEdges( pixZs[1][2], pixZs[0][2], pixZs[2][2], pixZs[1][1], pixZs[1][3] );
vec4 edges3 = CalculateEdges( pixZs[2][2], pixZs[1][2], pixZs[3][2], pixZs[2][1], pixZs[2][3] );
vec2 viewportPixelSize = u_viewportPixelSize;
#if BGFX_SHADER_LANGUAGE_GLSL
viewportPixelSize.y = -viewportPixelSize.y;
#endif
vec3 pixPos[4][4];
// middle 4
pixPos[1][1] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( 0.0, 0.0 ), pixZs[1][1] );
pixPos[2][1] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( 1.0, 0.0 ), pixZs[2][1] );
pixPos[1][2] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( 0.0, 1.0 ), pixZs[1][2] );
pixPos[2][2] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( 1.0, 1.0 ), pixZs[2][2] );
// left 2
pixPos[0][1] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( -1.0, 0.0), pixZs[0][1] );
pixPos[0][2] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( -1.0, 1.0), pixZs[0][2] );
// right 2
pixPos[3][1] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( 2.0, 0.0), pixZs[3][1] );
pixPos[3][2] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( 2.0, 1.0), pixZs[3][2] );
// top 2
pixPos[1][0] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( 0.0, -1.0 ), pixZs[1][0] );
pixPos[2][0] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( 1.0, -1.0 ), pixZs[2][0] );
// bottom 2
pixPos[1][3] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( 0.0, 2.0 ), pixZs[1][3] );
pixPos[2][3] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( 1.0, 2.0 ), pixZs[2][3] );
vec3 norm0 = CalculateNormal( edges0, pixPos[1][1], pixPos[0][1], pixPos[2][1], pixPos[1][0], pixPos[1][2] );
vec3 norm1 = CalculateNormal( edges1, pixPos[2][1], pixPos[1][1], pixPos[3][1], pixPos[2][0], pixPos[2][2] );
vec3 norm2 = CalculateNormal( edges2, pixPos[1][2], pixPos[0][2], pixPos[2][2], pixPos[1][1], pixPos[1][3] );
vec3 norm3 = CalculateNormal( edges3, pixPos[2][2], pixPos[1][2], pixPos[3][2], pixPos[2][1], pixPos[2][3] );
#if BGFX_SHADER_LANGUAGE_GLSL
imageStore(s_normalsOutputUAV, baseCoords + ivec2( 0, 1 ), vec4( norm0 * 0.5 + 0.5, 0.0 ));
imageStore(s_normalsOutputUAV, baseCoords + ivec2( 1, 1 ), vec4( norm1 * 0.5 + 0.5, 0.0 ));
imageStore(s_normalsOutputUAV, baseCoords + ivec2( 0, 0 ), vec4( norm2 * 0.5 + 0.5, 0.0 ));
imageStore(s_normalsOutputUAV, baseCoords + ivec2( 1, 0 ), vec4( norm3 * 0.5 + 0.5, 0.0 ));
#else
imageStore(s_normalsOutputUAV, baseCoords + ivec2( 0, 0 ), vec4( norm0 * 0.5 + 0.5, 0.0 ));
imageStore(s_normalsOutputUAV, baseCoords + ivec2( 1, 0 ), vec4( norm1 * 0.5 + 0.5, 0.0 ));
imageStore(s_normalsOutputUAV, baseCoords + ivec2( 0, 1 ), vec4( norm2 * 0.5 + 0.5, 0.0 ));
imageStore(s_normalsOutputUAV, baseCoords + ivec2( 1, 1 ), vec4( norm3 * 0.5 + 0.5, 0.0 ));
#endif
}
}

@@ -0,0 +1,188 @@
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#include "bgfx_compute.sh"
#include "uniforms.sh"
SAMPLER2D(s_depthSource, 0);
IMAGE2D_WR(s_target0, r16f, 1);
IMAGE2D_WR(s_target1, r16f, 2);
IMAGE2D_WR(s_normalsOutputUAV, rgba8, 5);
float ScreenSpaceToViewSpaceDepth( float screenDepth )
{
float depthLinearizeMul = u_depthUnpackConsts.x;
float depthLinearizeAdd = u_depthUnpackConsts.y;
// Optimised version of "-cameraClipNear / (cameraClipFar - projDepth * (cameraClipFar - cameraClipNear)) * cameraClipFar"
// Set your depthLinearizeMul and depthLinearizeAdd to:
// depthLinearizeMul = ( cameraClipFar * cameraClipNear) / ( cameraClipFar - cameraClipNear );
// depthLinearizeAdd = cameraClipFar / ( cameraClipFar - cameraClipNear );
return depthLinearizeMul / ( depthLinearizeAdd - screenDepth );
}
vec3 NDCToViewspace( vec2 pos, float viewspaceDepth )
{
vec3 ret;
ret.xy = (u_ndcToViewMul * pos.xy + u_ndcToViewAdd) * viewspaceDepth;
ret.z = viewspaceDepth;
return ret;
}
vec4 CalculateEdges( const float centerZ, const float leftZ, const float rightZ, const float topZ, const float bottomZ )
{
// slope-sensitive depth-based edge detection
vec4 edgesLRTB = vec4( leftZ, rightZ, topZ, bottomZ ) - centerZ;
vec4 edgesLRTBSlopeAdjusted = edgesLRTB + edgesLRTB.yxwz;
edgesLRTB = min( abs( edgesLRTB ), abs( edgesLRTBSlopeAdjusted ) );
return saturate( ( 1.3 - edgesLRTB / (centerZ * 0.040) ) );
// cheaper version but has artifacts
// edgesLRTB = abs( vec4( leftZ, rightZ, topZ, bottomZ ) - centerZ );
// return saturate( ( 1.3 - edgesLRTB / (pixZ * 0.06 + 0.1) ) );
}
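// Reading of the constants above: a neighbour whose (slope-adjusted) depth differs from centerZ by less than
// 0.3 * 0.040 = ~1.2% of centerZ keeps a weight of 1.0 (no edge), anything beyond 1.3 * 0.040 = ~5.2% clamps
// to 0.0 (full edge), and values in between fall off linearly.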
vec3 CalculateNormal( const vec4 edgesLRTB, vec3 pixCenterPos, vec3 pixLPos, vec3 pixRPos, vec3 pixTPos, vec3 pixBPos )
{
// Get this pixel's viewspace normal
vec4 acceptedNormals = vec4( edgesLRTB.x*edgesLRTB.z, edgesLRTB.z*edgesLRTB.y, edgesLRTB.y*edgesLRTB.w, edgesLRTB.w*edgesLRTB.x );
pixLPos = normalize(pixLPos - pixCenterPos);
pixRPos = normalize(pixRPos - pixCenterPos);
pixTPos = normalize(pixTPos - pixCenterPos);
pixBPos = normalize(pixBPos - pixCenterPos);
vec3 pixelNormal = vec3( 0, 0, -0.0005 );
pixelNormal += ( acceptedNormals.x ) * cross( pixLPos, pixTPos );
pixelNormal += ( acceptedNormals.y ) * cross( pixTPos, pixRPos );
pixelNormal += ( acceptedNormals.z ) * cross( pixRPos, pixBPos );
pixelNormal += ( acceptedNormals.w ) * cross( pixBPos, pixLPos );
pixelNormal = normalize( pixelNormal );
return pixelNormal;
}
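// Each cross product above is the face normal of one of the four triangles sharing the centre sample
// (left-top, top-right, right-bottom, bottom-left); acceptedNormals multiplies the two edge weights bounding
// each triangle, so triangles straddling a depth discontinuity contribute almost nothing. The small initial
// z of -0.0005 keeps normalize() well defined if all four weights happen to be zero.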
NUM_THREADS(8, 8, 1)
void main()
{
uvec2 dtID = uvec2(gl_GlobalInvocationID.xy);
uvec2 dim = imageSize(s_target0).xy;
if (all(lessThan(dtID.xy, dim) ) )
{
ivec2 baseCoords = ivec2(dtID.xy) * 2;
#if BGFX_SHADER_LANGUAGE_GLSL
vec2 upperLeftUV = (vec2(dtID.xy) + vec2(0.25,0.75)) * u_viewport2xPixelSize;
#else
vec2 upperLeftUV = (vec2(dtID.xy) + vec2(0.25,0.25)) * u_viewport2xPixelSize;
#endif
ivec2 baseCoord = ivec2(dtID.xy) * 2;
#if BGFX_SHADER_LANGUAGE_GLSL
float z0 = ScreenSpaceToViewSpaceDepth( texelFetchOffset(s_depthSource, baseCoord, 0, ivec2( 0, 1 ) ).x );
float z1 = ScreenSpaceToViewSpaceDepth( texelFetchOffset(s_depthSource, baseCoord, 0, ivec2( 1, 1 ) ).x );
float z2 = ScreenSpaceToViewSpaceDepth( texelFetchOffset(s_depthSource, baseCoord, 0, ivec2( 0, 0 ) ).x );
float z3 = ScreenSpaceToViewSpaceDepth( texelFetchOffset(s_depthSource, baseCoord, 0, ivec2( 1, 0 ) ).x );
#else
float z0 = ScreenSpaceToViewSpaceDepth( texelFetchOffset(s_depthSource, baseCoord, 0, ivec2( 0, 0 ) ).x );
float z1 = ScreenSpaceToViewSpaceDepth( texelFetchOffset(s_depthSource, baseCoord, 0, ivec2( 1, 0 ) ).x );
float z2 = ScreenSpaceToViewSpaceDepth( texelFetchOffset(s_depthSource, baseCoord, 0, ivec2( 0, 1 ) ).x );
float z3 = ScreenSpaceToViewSpaceDepth( texelFetchOffset(s_depthSource, baseCoord, 0, ivec2( 1, 1 ) ).x );
#endif
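// The BGFX_SHADER_LANGUAGE_GLSL branches throughout this pass flip the vertical offsets because OpenGL's
// texture/image origin is bottom-left, while the D3D-style targets assume top-left; both variants read the
// same 2x2 block of full-resolution depth samples.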
imageStore(s_target0, ivec2(dtID.xy), z0.xxxx );
imageStore(s_target1, ivec2(dtID.xy), z3.xxxx );
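// Only two of the four de-interleaved depth samples (z0 and z3) are written out here, and only the two
// matching normals (norm0, norm3) are generated below; this variant skips the other half of each 2x2 block.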
float pixZs[4][4];
// middle 4
pixZs[1][1] = z0;
pixZs[2][1] = z1;
pixZs[1][2] = z2;
pixZs[2][2] = z3;
#if BGFX_SHADER_LANGUAGE_GLSL
// left 2
pixZs[0][1] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( -1, 0 ) ).x );
pixZs[0][2] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( -1, -1 ) ).x );
// right 2
pixZs[3][1] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 2, 0 ) ).x );
pixZs[3][2] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 2, -1 ) ).x );
// top 2
pixZs[1][0] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 0, 1 ) ).x );
pixZs[2][0] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 1, 1 ) ).x );
// bottom 2
pixZs[1][3] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 0, -2 ) ).x );
pixZs[2][3] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 1, -2 ) ).x );
#else
// left 2
pixZs[0][1] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( -1, 0 ) ).x );
pixZs[0][2] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( -1, 1 ) ).x );
// right 2
pixZs[3][1] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 2, 0 ) ).x );
pixZs[3][2] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 2, 1 ) ).x );
// top 2
pixZs[1][0] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 0, -1 ) ).x );
pixZs[2][0] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 1, -1 ) ).x );
// bottom 2
pixZs[1][3] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 0, 2 ) ).x );
pixZs[2][3] = ScreenSpaceToViewSpaceDepth( texture2DLodOffset(s_depthSource, upperLeftUV, 0.0, ivec2( 1, 2 ) ).x );
#endif
vec4 edges0 = CalculateEdges( pixZs[1][1], pixZs[0][1], pixZs[2][1], pixZs[1][0], pixZs[1][2] );
vec4 edges1 = CalculateEdges( pixZs[2][1], pixZs[1][1], pixZs[3][1], pixZs[2][0], pixZs[2][2] );
vec4 edges2 = CalculateEdges( pixZs[1][2], pixZs[0][2], pixZs[2][2], pixZs[1][1], pixZs[1][3] );
vec4 edges3 = CalculateEdges( pixZs[2][2], pixZs[1][2], pixZs[3][2], pixZs[2][1], pixZs[2][3] );
vec2 viewportPixelSize = u_viewportPixelSize;
#if BGFX_SHADER_LANGUAGE_GLSL
viewportPixelSize.y = -viewportPixelSize.y;
#endif
vec3 pixPos[4][4];
// there is probably a way to optimize the math below, but no approximation will work; it has to be precise.
// middle 4
pixPos[1][1] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( 0.0, 0.0 ), pixZs[1][1] );
pixPos[2][1] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( 1.0, 0.0 ), pixZs[2][1] );
pixPos[1][2] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( 0.0, 1.0 ), pixZs[1][2] );
pixPos[2][2] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( 1.0, 1.0 ), pixZs[2][2] );
// left 2
pixPos[0][1] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( -1.0, 0.0), pixZs[0][1] );
//pixPos[0][2] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( -1.0, 1.0), pixZs[0][2] );
// right 2
//pixPos[3][1] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( 2.0, 0.0), pixZs[3][1] );
pixPos[3][2] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( 2.0, 1.0), pixZs[3][2] );
// top 2
pixPos[1][0] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( 0.0, -1.0 ), pixZs[1][0] );
//pixPos[2][0] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( 1.0, -1.0 ), pixZs[2][0] );
// bottom 2
//pixPos[1][3] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( 0.0, 2.0 ), pixZs[1][3] );
pixPos[2][3] = NDCToViewspace( upperLeftUV + viewportPixelSize * vec2( 1.0, 2.0 ), pixZs[2][3] );
vec3 norm0 = CalculateNormal( edges0, pixPos[1][1], pixPos[0][1], pixPos[2][1], pixPos[1][0], pixPos[1][2] );
vec3 norm3 = CalculateNormal( edges3, pixPos[2][2], pixPos[1][2], pixPos[3][2], pixPos[2][1], pixPos[2][3] );
#if BGFX_SHADER_LANGUAGE_GLSL
imageStore(s_normalsOutputUAV, baseCoords + ivec2( 0, 1 ), vec4( norm0 * 0.5 + 0.5, 0.0 ));
imageStore(s_normalsOutputUAV, baseCoords + ivec2( 1, 0 ), vec4( norm3 * 0.5 + 0.5, 0.0 ));
#else
imageStore(s_normalsOutputUAV, baseCoords + ivec2( 0, 0 ), vec4( norm0 * 0.5 + 0.5, 0.0 ));
imageStore(s_normalsOutputUAV, baseCoords + ivec2( 1, 1 ), vec4( norm3 * 0.5 + 0.5, 0.0 ));
#endif
}
}

View File

@ -0,0 +1,48 @@
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#include "bgfx_compute.sh"
#include "uniforms.sh"
SAMPLER2D(s_depthSource, 0);
IMAGE2D_WR(s_target0, r16f, 1);
IMAGE2D_WR(s_target1, r16f, 2);
float ScreenSpaceToViewSpaceDepth( float screenDepth )
{
float depthLinearizeMul = u_depthUnpackConsts.x;
float depthLinearizeAdd = u_depthUnpackConsts.y;
// Optimised version of "-cameraClipNear / (cameraClipFar - projDepth * (cameraClipFar - cameraClipNear)) * cameraClipFar"
// Set your depthLinearizeMul and depthLinearizeAdd to:
// depthLinearizeMul = ( cameraClipFar * cameraClipNear) / ( cameraClipFar - cameraClipNear );
// depthLinearizeAdd = cameraClipFar / ( cameraClipFar - cameraClipNear );
return depthLinearizeMul / ( depthLinearizeAdd - screenDepth );
}
NUM_THREADS(8, 8, 1)
void main()
{
uvec2 dtID = uvec2(gl_GlobalInvocationID.xy);
uvec2 dim = imageSize(s_target0).xy;
if (all(lessThan(dtID.xy, dim) ) )
{
ivec2 baseCoord = ivec2(dtID.xy) * 2;
#if BGFX_SHADER_LANGUAGE_GLSL
float a = texelFetch(s_depthSource, baseCoord + ivec2( 0, 1 ), 0).x;
float d = texelFetch(s_depthSource, baseCoord + ivec2( 1, 0 ), 0).x;
#else
float a = texelFetch(s_depthSource, baseCoord + ivec2( 0, 0 ), 0).x;
float d = texelFetch(s_depthSource, baseCoord + ivec2( 1, 1 ), 0).x;
#endif
imageStore(s_target0, ivec2(dtID.xy), ScreenSpaceToViewSpaceDepth( a ).xxxx);
imageStore(s_target1, ivec2(dtID.xy), ScreenSpaceToViewSpaceDepth( d ).xxxx);
}
}

View File

@ -0,0 +1,82 @@
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#include "bgfx_compute.sh"
#include "uniforms.sh"
IMAGE2D_ARRAY_WR(s_target, rg8, 0);
SAMPLER2DARRAY(s_blurInput, 1);
// unpacking for edges; 2 bits per edge mean 4 gradient values (0, 0.33, 0.66, 1) for smoother transitions!
vec4 UnpackEdges( float _packedVal )
{
uint packedVal = uint(_packedVal * 255.5);
vec4 edgesLRTB;
edgesLRTB.x = float((packedVal >> 6) & 0x03) / 3.0; // there's really no need for mask (as it's an 8 bit input) but I'll leave it in so it doesn't cause any trouble in the future
edgesLRTB.y = float((packedVal >> 4) & 0x03) / 3.0;
edgesLRTB.z = float((packedVal >> 2) & 0x03) / 3.0;
edgesLRTB.w = float((packedVal >> 0) & 0x03) / 3.0;
return saturate( edgesLRTB + u_invSharpness );
}
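// Example: a packed byte of 0xB4 (binary 10 11 01 00) unpacks to edgesLRTB = (0.67, 1.0, 0.33, 0.0) before
// u_invSharpness is added, i.e. the right neighbour would blend fully, the left partially, the top only
// weakly and the bottom not at all.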
// ********************************************************************************************************
// Edge-aware ("smart") blur that avoids bleeding AO across depth edges
void AddSample( float ssaoValue, float edgeValue, inout float sum, inout float sumWeight )
{
float weight = edgeValue;
sum += (weight * ssaoValue);
sumWeight += weight;
}
vec2 SampleBlurred( ivec2 inPos, vec2 coord )
{
float packedEdges = texelFetch(s_blurInput, ivec3(inPos.xy, 0), 0 ).y;
vec4 edgesLRTB = UnpackEdges( packedEdges );
#if BGFX_SHADER_LANGUAGE_GLSL
vec4 valuesUL = textureGather(s_blurInput, vec3(coord - u_halfViewportPixelSize * 0.5 + vec2(0.0,u_halfViewportPixelSize.y), 0.0)).wzyx;
vec4 valuesBR = textureGather(s_blurInput, vec3(coord + u_halfViewportPixelSize * 0.5 + vec2(0.0,-u_halfViewportPixelSize.y), 0.0)).wzyx;
#else
vec4 valuesUL = textureGather(s_blurInput, vec3(coord - u_halfViewportPixelSize * 0.5, 0.0));
vec4 valuesBR = textureGather(s_blurInput, vec3(coord + u_halfViewportPixelSize * 0.5, 0.0));
#endif
float ssaoValue = valuesUL.y;
float ssaoValueL = valuesUL.x;
float ssaoValueT = valuesUL.z;
float ssaoValueR = valuesBR.z;
float ssaoValueB = valuesBR.x;
float sumWeight = 0.5f;
float sum = ssaoValue * sumWeight;
AddSample( ssaoValueL, edgesLRTB.x, sum, sumWeight );
AddSample( ssaoValueR, edgesLRTB.y, sum, sumWeight );
AddSample( ssaoValueT, edgesLRTB.z, sum, sumWeight );
AddSample( ssaoValueB, edgesLRTB.w, sum, sumWeight );
float ssaoAvg = sum / sumWeight;
ssaoValue = ssaoAvg; //min( ssaoValue, ssaoAvg ) * 0.2 + ssaoAvg * 0.8;
return vec2( ssaoValue, packedEdges );
}
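// The centre sample keeps a fixed weight of 0.5 and every neighbour is weighted by its unpacked edge value,
// so on a hard edge (all four edge weights at 0.0) the blur degenerates to the unfiltered centre AO value.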
// edge-sensitive blur
NUM_THREADS(8, 8, 1)
void main()
{
uvec2 dtID = uvec2(gl_GlobalInvocationID.xy) + uvec2(u_rect.xy);
if (all(lessThan(dtID.xy, u_rect.zw) ) )
{
vec2 inUV = (dtID.xy+vec2(0.5,0.5)) * u_halfViewportPixelSize;
imageStore(s_target, ivec3(dtID.xy, u_layer), SampleBlurred( ivec2(dtID.xy), inUV ).xyyy);
}
}

View File

@ -0,0 +1,83 @@
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#include "bgfx_compute.sh"
#include "uniforms.sh"
IMAGE2D_ARRAY_WR(s_target, rg8, 0);
SAMPLER2DARRAY(s_blurInput, 1);
// unpacking for edges; 2 bits per edge mean 4 gradient values (0, 0.33, 0.66, 1) for smoother transitions!
vec4 UnpackEdges( float _packedVal )
{
uint packedVal = uint(_packedVal * 255.5);
vec4 edgesLRTB;
edgesLRTB.x = float((packedVal >> 6) & 0x03) / 3.0; // there's really no need for mask (as it's an 8 bit input) but I'll leave it in so it doesn't cause any trouble in the future
edgesLRTB.y = float((packedVal >> 4) & 0x03) / 3.0;
edgesLRTB.z = float((packedVal >> 2) & 0x03) / 3.0;
edgesLRTB.w = float((packedVal >> 0) & 0x03) / 3.0;
return saturate( edgesLRTB + u_invSharpness );
}
// ********************************************************************************************************
// Edge-aware ("smart") blur that avoids bleeding AO across depth edges
void AddSample( float ssaoValue, float edgeValue, inout float sum, inout float sumWeight )
{
float weight = edgeValue;
sum += (weight * ssaoValue);
sumWeight += weight;
}
vec2 SampleBlurredWide(vec3 coord)
{
vec2 vC = texture2DArrayLodOffset(s_blurInput, coord, 0.0, ivec2(0, 0)).xy;
vec2 vL = texture2DArrayLodOffset(s_blurInput, coord, 0.0, ivec2(-2, 0)).xy;
vec2 vT = texture2DArrayLodOffset(s_blurInput, coord, 0.0, ivec2(0, -2)).xy;
vec2 vR = texture2DArrayLodOffset(s_blurInput, coord, 0.0, ivec2(2, 0)).xy;
vec2 vB = texture2DArrayLodOffset(s_blurInput, coord, 0.0, ivec2(0, 2)).xy;
float packedEdges = vC.y;
vec4 edgesLRTB = UnpackEdges(packedEdges);
edgesLRTB.x *= UnpackEdges(vL.y).y;
edgesLRTB.z *= UnpackEdges(vT.y).w;
edgesLRTB.y *= UnpackEdges(vR.y).x;
edgesLRTB.w *= UnpackEdges(vB.y).z;
float ssaoValue = vC.x;
float ssaoValueL = vL.x;
float ssaoValueT = vT.x;
float ssaoValueR = vR.x;
float ssaoValueB = vB.x;
float sumWeight = 0.8f;
float sum = ssaoValue * sumWeight;
AddSample(ssaoValueL, edgesLRTB.x, sum, sumWeight);
AddSample(ssaoValueR, edgesLRTB.y, sum, sumWeight);
AddSample(ssaoValueT, edgesLRTB.z, sum, sumWeight);
AddSample(ssaoValueB, edgesLRTB.w, sum, sumWeight);
float ssaoAvg = sum / sumWeight;
ssaoValue = ssaoAvg; //min( ssaoValue, ssaoAvg ) * 0.2 + ssaoAvg * 0.8;
return vec2(ssaoValue, packedEdges);
}
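// Compared to the narrow blur, this kernel reaches two texels out and also multiplies each edge weight by the
// sampled neighbour's opposite edge (e.g. the left weight by the left sample's own right-edge value), so a
// neighbour only contributes when both sides agree there is no discontinuity between them.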
// edge-sensitive blur (wider kernel)
NUM_THREADS(8, 8, 1)
void main()
{
uvec2 dtID = uvec2(gl_GlobalInvocationID.xy) + uvec2(u_rect.xy);
if (all(lessThan(dtID.xy, u_rect.zw) ) )
{
vec2 inUV = (dtID.xy+vec2(0.5,0.5)) * u_halfViewportPixelSize;
imageStore(s_target, ivec3(dtID.xy,u_layer), SampleBlurredWide(vec3(inUV,0.0)).xyyy);
}
}

View File

@ -0,0 +1,43 @@
$input v_texcoord0
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#include "../common/common.sh"
SAMPLER2D(s_color, 0);
SAMPLER2D(s_normal, 1);
SAMPLER2D(s_ao, 2);
uniform vec4 u_combineParams[2];
void main()
{
vec2 tc0 = v_texcoord0 * u_combineParams[1].xy + u_combineParams[1].zw;
vec3 albedoColor = vec3(1.0,1.0,1.0);
if (u_combineParams[0].x > 0.0)
{
albedoColor = texture2D(s_color, tc0).rgb;
}
float light = 1.0;
if (u_combineParams[0].x > 0.0)
{
vec3 n = texture2D(s_normal, tc0).xyz;
// Expand out normal
n = n*2.0-1.0;
vec3 l = normalize(vec3(-0.8,0.75,-1.0));
light = max(0.0, dot(n, l)) * 1.2 + 0.3;
}
float ao = 1.0;
if ( u_combineParams[0].y > 0.0)
{
ao = texture2D(s_ao, tc0).x;
}
gl_FragColor = vec4(albedoColor * light * ao, 1.0f);
}

View File

@ -0,0 +1,22 @@
$input v_normal, v_texcoord0
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#include "../common/common.sh"
SAMPLER2D(s_albedo, 0);
void main()
{
vec3 normalWorldSpace = v_normal;
// Write normal
gl_FragData[0].xyz = normalWorldSpace.xyz; // Normal is already compressed to [0,1] so can fit in gbuffer
gl_FragData[0].w = 0.0;
// Write color
gl_FragData[1] = texture2D(s_albedo, v_texcoord0);
}

View File

@ -0,0 +1,10 @@
#
# Copyright 2011-2018 Branimir Karadzic. All rights reserved.
# License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
#
BGFX_DIR=../..
RUNTIME_DIR=$(BGFX_DIR)/examples/runtime
BUILD_DIR=../../.build
include $(BGFX_DIR)/scripts/shader.mk

View File

@ -0,0 +1,42 @@
uniform vec4 u_params[19];
uniform vec4 u_rect;
#define u_viewportPixelSize u_params[0].xy
#define u_halfViewportPixelSize u_params[0].zw
#define u_depthUnpackConsts u_params[1].xy
#define u_ndcToViewMul u_params[2].xy
#define u_ndcToViewAdd u_params[2].zw
#define u_perPassFullResCoordOffset u_params[3].xy
#define u_perPassFullResUVOffset u_params[3].zw
#define u_viewport2xPixelSize u_params[4].xy
#define u_viewport2xPixelSize_x_025 u_params[4].zw
#define u_effectRadius u_params[5].x
#define u_effectShadowStrength u_params[5].y
#define u_effectShadowPow u_params[5].z
#define u_effectShadowClamp u_params[5].w
#define u_effectFadeOutMul u_params[6].x
#define u_effectFadeOutAdd u_params[6].y
#define u_effectHorizonAngleThreshold u_params[6].z
#define u_effectSamplingRadiusNearLimitRec u_params[6].w
#define u_depthPrecisionOffsetMod u_params[7].x
#define u_negRecEffectRadius u_params[7].y
#define u_loadCounterAvgDiv u_params[7].z
#define u_adaptiveSampleCountLimit u_params[7].w
#define u_invSharpness u_params[8].x
#define u_passIndex u_params[8].y
#define u_quarterResPixelSize u_params[8].zw
#define u_patternRotScaleMatrices(i) u_params[9+(i)]
#define u_normalsUnpackMul u_params[14].x
#define u_normalsUnpackAdd u_params[14].y
#define u_detailAOStrength u_params[14].z
#define u_layer u_params[14].w
#define u_normalsWorldToViewspaceMatrix0 u_params[15]
#define u_normalsWorldToViewspaceMatrix1 u_params[16]
#define u_normalsWorldToViewspaceMatrix2 u_params[17]
#define u_normalsWorldToViewspaceMatrix3 u_params[18]
#define SSAO_MAX_TAPS 32
#define SSAO_ADAPTIVE_TAP_BASE_COUNT 5
#define SSAO_ADAPTIVE_TAP_FLEXIBLE_COUNT (SSAO_MAX_TAPS-SSAO_ADAPTIVE_TAP_BASE_COUNT)
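// With SSAO_MAX_TAPS at 32 and a base count of 5, up to 27 extra taps are left for the adaptive quality path.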
#define SSAO_DEPTH_MIP_LEVELS 4
#define SSAO_ENABLE_NORMAL_WORLD_TO_VIEW_CONVERSION 1

View File

@ -0,0 +1,7 @@
vec4 a_position : POSITION;
vec2 a_texcoord0 : TEXCOORD0;
vec3 a_normal : NORMAL;
vec2 v_texcoord0 : TEXCOORD0;
vec3 v_normal : NORMAL = vec3(0.0, 0.0, 1.0);

View File

@ -0,0 +1,16 @@
$input a_position, a_texcoord0
$output v_texcoord0
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#include "../common/common.sh"
void main()
{
gl_Position = mul(u_modelViewProj, vec4(a_position.xyz, 1.0) );
v_texcoord0 = a_texcoord0;
}

View File

@ -0,0 +1,27 @@
$input a_position, a_normal, a_texcoord0
$output v_normal, v_texcoord0
/*
* Copyright 2018 Attila Kocsis. All rights reserved.
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
*/
#include "../common/common.sh"
void main()
{
// Calculate vertex position
vec3 pos = a_position.xyz;
gl_Position = mul(u_modelViewProj, vec4(pos, 1.0) );
// Calculate normal. Note that compressed normal is stored in the vertices
vec3 normalObjectSpace = a_normal.xyz*2.0 - 1.0; // Normal is stored in [0,1], remap to [-1,1].
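// e.g. a stored vertex normal of (0.5, 0.5, 1.0) decodes to the object-space normal (0.0, 0.0, 1.0) here.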
// Transform normal into world space.
vec3 normalWorldSpace = mul(u_model[0], vec4(normalObjectSpace, 0.0) ).xyz;
// Normalize to remove (uniform) scaling, then recompress into [0,1] for storage.
v_normal.xyz = normalize(normalWorldSpace)*0.5+0.5;
v_texcoord0 = a_texcoord0 * 16.0;
}