coffee.pygments/tests/examplefiles/hlsl/example.hlsl

// A few random snippets of HLSL shader code I gathered...

// Macro inside a single-line comment: #define COMMENT_MACRO 1

/* Macro inside a block comment: #define COMMENT_MACRO 2 */

	# define INDENTED_MACRO 5.0

#define SINGLELINE_MACRO 10.0

#define MULTILINE_MACRO(a, b) float2( \
	a, \
	b \
)

[numthreads(256, 1, 1)]
void cs_main(uint3 threadId : SV_DispatchThreadID)
{
	// Seed the PRNG using the thread ID
	rng_state = threadId.x;

	// Generate a few numbers...
	uint r0 = rand_xorshift();
	uint r1 = rand_xorshift();
	// Do some stuff with them...

	// Generate a random float in [0, 1)...
	float f0 = float(rand_xorshift()) * (1.0 / 4294967296.0);

	// ...etc.
}

// Constant buffer of parameters
cbuffer IntegratorParams : register(b0)
{
	float2 specPow;		// Spec powers in XY directions (equal for isotropic BRDFs)
	float3 L;			// Unit vector toward light
	int2 cThread;		// Total threads launched in XY dimensions
	int2 xyOutput;		// Where in the output buffer to store the result
}

static const float pi = 3.141592654;

float AshikhminShirleyNDF(float3 H)
{
	float normFactor = sqrt((specPow.x + 2.0f) * (specPow.y + 2.0)) * (0.5f / pi);
	float NdotH = H.z;
	float2 Hxy = normalize(H.xy);
	return normFactor * pow(NdotH, dot(specPow, Hxy * Hxy));
}

float BeckmannNDF(float3 H)
{
	float glossFactor = specPow.x * 0.5f + 1.0f;	// This is 1/m^2 in the usual Beckmann formula
	float normFactor = glossFactor * (1.0f / pi);
	float NdotHSq = H.z * H.z;
	return normFactor / (NdotHSq * NdotHSq) * exp(glossFactor * (1.0f - 1.0f / NdotHSq));
}

// Output buffer for compute shader (actually float, but must be declared as uint
// for atomic operations to work)
globallycoherent RWTexture2D<uint> o_data : register(u0);

// Sum up the outputs of all threads and store to the output location
static const uint threadGroupSize2D = 16;
static const uint threadGroupSize1D = threadGroupSize2D * threadGroupSize2D;
groupshared float g_partialSums[threadGroupSize1D];
void SumAcrossThreadsAndStore(float value, uint iThreadInGroup)
{
	// First reduce within the threadgroup: partial sums of 2, 4, 8... elements
	// are calculated by 1/2, 1/4, 1/8... of the threads, always keeping the
	// active threads at the front of the group to minimize divergence.

	// NOTE: there are faster ways of doing this...but this is simple to code
	// and good enough.

	g_partialSums[iThreadInGroup] = value;
	GroupMemoryBarrierWithGroupSync();

	[unroll] for (uint i = threadGroupSize1D / 2; i > 0; i /= 2)
	{
		if (iThreadInGroup < i)
		{
			g_partialSums[iThreadInGroup] += g_partialSums[iThreadInGroup + i];
		}
		GroupMemoryBarrierWithGroupSync();
	}

	// Then reduce across threadgroups: one thread from each group adds the group
	// total to the final output location, using a software transactional memory
	// style since D3D11 doesn't support atomic add on floats.
	// (Assumes the output value has been cleared to zero beforehand.)

	if (iThreadInGroup == 0)
	{
		float threadGroupSum = g_partialSums[0];
		uint outputValueRead = o_data[xyOutput];
		while (true)
		{
			uint newOutputValue = asuint(asfloat(outputValueRead) + threadGroupSum);
			uint previousOutputValue;
			InterlockedCompareExchange(
				o_data[xyOutput], outputValueRead, newOutputValue, previousOutputValue);
			if (previousOutputValue == outputValueRead)
				break;
			outputValueRead = previousOutputValue;
		}
	}
}

void main(
	in Vertex i_vtx,
	out Vertex o_vtx,
	out float3 o_vecCamera : CAMERA,
	out float4 o_uvzwShadow : UVZW_SHADOW,
	out float4 o_posClip : SV_Position)
{
	o_vtx = i_vtx;
	o_vecCamera = g_posCamera - i_vtx.m_pos;
	o_uvzwShadow = mul(float4(i_vtx.m_pos, 1.0), g_matWorldToUvzwShadow);
	o_posClip = mul(float4(i_vtx.m_pos, 1.0), g_matWorldToClip);
}

#pragma pack_matrix(row_major)

struct Vertex
{
	float3		m_pos		: POSITION;
	float3		m_normal	: NORMAL;
	float2		m_uv		: UV;
};

cbuffer CBFrame : CB_FRAME					// matches struct CBFrame in test.cpp
{
	float4x4	g_matWorldToClip;
	float4x4	g_matWorldToUvzwShadow;
	float3x3	g_matWorldToUvzShadowNormal;
	float3		g_posCamera;

	float3		g_vecDirectionalLight;
	float3		g_rgbDirectionalLight;

	float2		g_dimsShadowMap;
	float		g_normalOffsetShadow;
	float		g_shadowSharpening;

	float		g_exposure;					// Exposure multiplier
}

Texture2D<float3> g_texDiffuse : register(t0);
SamplerState g_ss : register(s0);

void main(
	in Vertex i_vtx,
	in float3 i_vecCamera : CAMERA,
	in float4 i_uvzwShadow : UVZW_SHADOW,
	out float3 o_rgb : SV_Target)
{
	float3 normal = normalize(i_vtx.m_normal);

	// Sample shadow map
	float shadow = EvaluateShadow(i_uvzwShadow, normal);

	// Evaluate diffuse lighting
	float3 diffuseColor = g_texDiffuse.Sample(g_ss, i_vtx.m_uv);
	float3 diffuseLight = g_rgbDirectionalLight * (shadow * saturate(dot(normal, g_vecDirectionalLight)));
	diffuseLight += SimpleAmbient(normal);

	o_rgb = diffuseColor * diffuseLight;
}

[domain("quad")]
void ds(
	in float edgeFactors[4] : SV_TessFactor,
	in float insideFactors[2] : SV_InsideTessFactor,
	in OutputPatch<VData, 4> inp,
	in float2 uv : SV_DomainLocation,
	out float4 o_pos : SV_Position)
{
	o_pos = lerp(lerp(inp[0].pos, inp[1].pos, uv.x), lerp(inp[2].pos, inp[3].pos, uv.x), uv.y);
}