Skip to content

Commit

Permalink
Update llpc from commit 879e8809
Browse files Browse the repository at this point in the history
[CONTINUATIONS] Add metadata required by RGP
[CONTINUATIONS] Run some optimization pass for gpurt module
Add NoContraction decoration to have a test for fmul_legacy mapping
Add readfirstlane on the result of subgroupClusterReduction on gfx11+
Add RobustGsEmits to GFX10
Add support for GS patch primitive type
amdllpc: Add more helpful info with PipelineLib* dumps
Avoid upgrade to seqcst ordering
Debug Printf refactor
Downgrade SequentiallyConsistent to AcquireRelease
Expose getResourceMappingNodeTypeName() to the driver
lgc: Improve TANH expansion to avoid overflow
lgc: Refactor getShaderStageAbbreviation
lgc: Use agent scope in more places
llpcSpirvLowerGlobal: Fix originUpperLeft handling
llpcSpirvLowerGlobal: Refactor input/output lowering
llvmraytracing: Separate out header file for pointee type metadata
Postpone descriptor load to ImageBuilder
Promote llvm-dialects submodule
Set last-use for load from the continuation stack
Support for LogRayTracingPipelineSummary
Update shader tests after LLVM update
Fix a typo on paClVsOutCntl
Fix primitive type for barycentric
Fix the cooperativematrix issues(convert+muladd) on gfx1010
Fixes for lit tests on standalone amdllpc build
  • Loading branch information
qiaojbao committed Jun 26, 2024
1 parent 6c770c7 commit f64d106
Show file tree
Hide file tree
Showing 182 changed files with 2,520 additions and 1,911 deletions.
3 changes: 3 additions & 0 deletions compilerutils/include/compilerutils/CompilerUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@ llvm::Function *cloneFunctionHeader(llvm::Function &f, llvm::FunctionType *newTy
// Add an unreachable at the current position and remove the rest of the basic block.
void createUnreachable(llvm::IRBuilder<> &b);

// Marks the given load as the last use of the memory it reads.
void setIsLastUseLoad(llvm::LoadInst &Load);

struct CrossModuleInlinerResult {
llvm::Value *returnValue;
llvm::iterator_range<llvm::Function::iterator> newBBs;
Expand Down
8 changes: 8 additions & 0 deletions compilerutils/lib/CompilerUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@

using namespace llvm;

// Name of the metadata kind attached to a load instruction to mark it as one
// that should translate to a last_use load.
static constexpr const char *MDIsLastUseName = "amdgpu.last.use";

// =====================================================================================================================
// Create an LLVM function call to the named function. The callee is built
// automatically based on return type and its parameters.
Expand Down Expand Up @@ -150,6 +154,10 @@ void CompilerUtils::createUnreachable(llvm::IRBuilder<> &b) {
DeleteDeadBlock(oldCode);
}

// Tags a load with the "amdgpu.last.use" metadata kind (MDIsLastUseName), using an empty metadata tuple as the
// attached node.
//
// @param Load : Load instruction to tag
void CompilerUtils::setIsLastUseLoad(llvm::LoadInst &Load) {
  auto *emptyTuple = MDTuple::get(Load.getContext(), {});
  Load.setMetadata(MDIsLastUseName, emptyTuple);
}

namespace {

// Get the name of a global that is copied to a different module for inlining.
Expand Down
12 changes: 6 additions & 6 deletions gfxruntime/src/shaders/AdvancedBlend.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@
float4 AmdExtFragCoord() DUMMY_FLOAT4_FUNC
int AmdExtSampleId() DUMMY_INT_FUNC

float4 AmdAdvancedBlendTexelLoad(int4 imageLow, int4 imageHigh, int2 iCoord, int lod) DUMMY_FLOAT4_FUNC
float4 AmdAdvancedBlendTexelLoadFmask(int4 imageMsLow, int4 imageMsHigh, int4 fmaskLow, int4 fmaskHigh, int2 iCoord, int lod) DUMMY_FLOAT4_FUNC
float4 AmdAdvancedBlendTexelLoad(int64_t imageDesc, int2 iCoord, int lod) DUMMY_FLOAT4_FUNC
float4 AmdAdvancedBlendTexelLoadFmask(int64_t imageDesc, int64_t fmaskDesc, int2 iCoord, int lod) DUMMY_FLOAT4_FUNC

float4 AmdAdvancedBlendCoherentTexelLoad(float4 color, int2 iCoord, int sampleId) DUMMY_FLOAT4_FUNC
void AmdAdvancedBlendCoherentTexelStore(float4 color, int2 iCoord, int sampleId) DUMMY_VOID_FUNC
Expand Down Expand Up @@ -224,8 +224,8 @@ float AmdAdvancedBlendDivide(float dividend, float divisor) {
}
}

export float4 AmdAdvancedBlendInternal(float4 inColor, int4 imageMsLow, int4 imageMsHigh, int4 imageLow, int4 imageHigh,
int4 fmaskLow, int4 fmaskHigh, int mode, bool isMsaa) {
export float4 AmdAdvancedBlendInternal(float4 inColor, int64_t imageDescMs, int64_t imageDesc, int64_t fmaskDesc,
int mode, bool isMsaa) {
float4 srcColor = inColor;
if (mode == 0) {
return srcColor;
Expand All @@ -234,9 +234,9 @@ export float4 AmdAdvancedBlendInternal(float4 inColor, int4 imageMsLow, int4 ima
int2 iCoord = int2(fragCoord.x, fragCoord.y);
float4 dstColor;
if (isMsaa) {
dstColor = AmdAdvancedBlendTexelLoadFmask(imageMsLow, imageMsHigh, fmaskLow, fmaskHigh, iCoord, 0);
dstColor = AmdAdvancedBlendTexelLoadFmask(imageDescMs, fmaskDesc, iCoord, 0);
} else {
dstColor = AmdAdvancedBlendTexelLoad(imageLow, imageHigh, iCoord, 0);
dstColor = AmdAdvancedBlendTexelLoad(imageDesc, iCoord, 0);
}
// TODO: Uncomment them once ROV is supported in LLPC
// int sampleId = AmdExtSampleId();
Expand Down
99 changes: 62 additions & 37 deletions include/vkgcDefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,7 @@ struct PipelineOptions {
bool internalRtShaders; ///< Whether this pipeline has internal raytracing shaders
unsigned forceNonUniformResourceIndexStageMask; ///< Mask of the stage to force using non-uniform resource index.
bool reserved16;
#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 73
bool replaceSetWithResourceType; ///< For OGL only, replace 'set' with resource type during spirv translate
bool disableSampleMask; ///< For OGL only, disabled if framebuffer doesn't attach multisample texture
bool buildResourcesDataForShaderModule; ///< For OGL only, build resources usage data while building shader module
Expand All @@ -482,6 +483,25 @@ struct PipelineOptions {
bool enableFragColor; ///< For OGL only, need to do frag color broadcast if it is enabled.
bool disableBaseVertex; ///< For OGL only, force the BaseVertex builtin to 0 instead of
/// loading it from userdata
bool bindlessTextureMode; ///< For OGL only, true if bindless textures are used
bool bindlessImageMode; ///< For OGL only, true if bindless images are used
const auto &getGlState() const { return *this; }
#else
/// GL-state pipeline options, grouped into a nested struct on this side of the
/// LLPC_CLIENT_INTERFACE_MAJOR_VERSION check. The instance is the `glState` member declared at the closing brace.
struct GLState {
bool replaceSetWithResourceType; ///< For OGL only, replace 'set' with resource type during spirv translate
bool disableSampleMask; ///< For OGL only, disabled if framebuffer doesn't attach multisample texture
bool buildResourcesDataForShaderModule; ///< For OGL only, build resources usage data while building shader module
bool disableTruncCoordForGather; ///< If set, trunc_coord of sampler srd is disabled for gather4
bool enableCombinedTexture; ///< For OGL only, use the 'set' for DescriptorCombinedTexture
///< for sampled images and samplers
bool vertex64BitsAttribSingleLoc; ///< For OGL only, dvec3/dvec4 vertex attrib only consumes 1 location.
bool enableFragColor; ///< For OGL only, need to do frag color broadcast if it is enabled.
bool disableBaseVertex; ///< For OGL only, force the BaseVertex builtin to 0 instead of
/// loading it from userdata
bool bindlessTextureMode; ///< For OGL only, true if bindless textures are used
bool bindlessImageMode; ///< For OGL only, true if bindless images are used
} glState;
const auto &getGlState() const { return glState; }
#endif
unsigned reserved20;
bool enablePrimGeneratedQuery; ///< If set, primitive generated query is enabled
bool disablePerCompFetch; ///< Disable per component fetch in uber fetch shader.
Expand Down Expand Up @@ -512,6 +532,7 @@ struct ResourceNodeData {
unsigned isTexelFetchUsed; ///< TRUE if texelFetch is used
unsigned isDefaultUniformSampler; ///< TRUE if it's sampler image in default uniform struct
unsigned columnCount; ///< Column count if this is a matrix variable.
unsigned componentCount; ///< Component count if this is a vector, row count if it is a matrix.
BasicType basicType; ///< Type of the variable or element
};

Expand Down Expand Up @@ -545,6 +566,43 @@ struct ResourcesNodes {
unsigned defaultUniformInfoCount;
};

// Ray tracing system value usage flags: one bit per built-in that a shader uses.
// The bits are split into two 16-bit groups, `ray` and `primitive`. Each group can also be accessed as a whole
// through its `u16All` member, and `u32All` overlays both groups.
union RayTracingSystemValueUsage {
struct {
// Ray-related built-ins
union {
struct {
uint16_t flags : 1; // Shader calls gl_IncomingRayFlagsEXT
uint16_t worldRayOrigin : 1; // Shader calls gl_WorldRayOriginEXT
uint16_t tMin : 1; // Shader calls gl_RayTminEXT
uint16_t worldRayDirection : 1; // Shader calls gl_WorldRayDirectionEXT
uint16_t tCurrent : 1; // Shader calls gl_HitTEXT
uint16_t launchId : 1; // Shader calls gl_LaunchIDEXT
uint16_t launchSize : 1; // Shader calls gl_LaunchSizeEXT
uint16_t reserved : 9; // Reserved
};
// All `ray` flags viewed as a single 16-bit value
uint16_t u16All;
} ray;

// Hit/primitive-related built-ins
union {
struct {
uint16_t hitKind : 1; // Shader calls gl_HitKindEXT
uint16_t instanceIndex : 1; // Shader calls gl_InstanceCustomIndexEXT
uint16_t instanceID : 1; // Shader calls gl_InstanceID
uint16_t primitiveIndex : 1; // Shader calls gl_PrimitiveID
uint16_t geometryIndex : 1; // Shader calls gl_GeometryIndexEXT
uint16_t objectToWorld : 1; // Shader calls gl_ObjectToWorldEXT
uint16_t objectRayOrigin : 1; // Shader calls gl_ObjectRayOriginEXT
uint16_t objectRayDirection : 1; // Shader calls gl_ObjectRayDirectionEXT
uint16_t worldToObject : 1; // Shader calls gl_WorldToObjectEXT
uint16_t hitTrianglePosition : 1; // Shader calls gl_HitTriangleVertexPositionsEXT
uint16_t reserved : 6; // Reserved
};
// All `primitive` flags viewed as a single 16-bit value
uint16_t u16All;
} primitive;
};
// Both groups viewed as a single 32-bit value
uint32_t u32All;
};

/// Represents usage info of a shader module
struct ShaderModuleUsage {
bool enableVarPtrStorageBuf; ///< Whether to enable "VariablePointerStorageBuffer" capability
Expand Down Expand Up @@ -573,12 +631,14 @@ struct ShaderModuleUsage {
bool pixelCenterInteger; ///< Whether pixel coord is Integer
bool useGenericBuiltIn; ///< Whether to use builtIn inputs that include gl_PointCoord, gl_PrimitiveId,
/// gl_Layer, gl_ClipDistance or gl_CullDistance.
bool useBarycentric; ///< Whether to use gl_BarycentricXX or pervertexEXT decoration
bool enableXfb; ///< Whether transform feedback is enabled
unsigned localSizeX; ///< Compute shader work-group size in the X dimension
unsigned localSizeY; ///< Compute shader work-group size in the Y dimension
unsigned localSizeZ; ///< Compute shader work-group size in the Z dimension
bool disableDualSource; ///< Whether disable dualSource blend
uint32_t clipDistanceArraySize; ///< Count of output clip distance
RayTracingSystemValueUsage rtSystemValueUsage; ///< Usage flags for ray tracing builtins
};

/// Represents common part of shader module data
Expand Down Expand Up @@ -1001,43 +1061,6 @@ enum RayTracingRayFlag : unsigned {
};

// =====================================================================================================================
// Ray tracing system value usage flags: one bit per built-in that a shader uses.
// The bits are split into two 16-bit groups, `ray` and `primitive`. Each group can also be accessed as a whole
// through its `u16All` member, and `u32All` overlays both groups.
union RayTracingSystemValueUsage {
struct {
// Ray-related built-ins
union {
struct {
uint16_t flags : 1; // Shader calls gl_IncomingRayFlagsEXT
uint16_t worldRayOrigin : 1; // Shader calls gl_WorldRayOriginEXT
uint16_t tMin : 1; // Shader calls gl_RayTminEXT
uint16_t worldRayDirection : 1; // Shader calls gl_WorldRayDirectionEXT
uint16_t tCurrent : 1; // Shader calls gl_HitTEXT
uint16_t launchId : 1; // Shader calls gl_LaunchIDEXT
uint16_t launchSize : 1; // Shader calls gl_LaunchSizeEXT
uint16_t reserved : 9; // Reserved
};
// All `ray` flags viewed as a single 16-bit value
uint16_t u16All;
} ray;

// Hit/primitive-related built-ins
union {
struct {
uint16_t hitKind : 1; // Shader calls gl_HitKindEXT
uint16_t instanceIndex : 1; // Shader calls gl_InstanceCustomIndexEXT
uint16_t instanceID : 1; // Shader calls gl_InstanceID
uint16_t primitiveIndex : 1; // Shader calls gl_PrimitiveID
uint16_t geometryIndex : 1; // Shader calls gl_GeometryIndexEXT
uint16_t objectToWorld : 1; // Shader calls gl_ObjectToWorldEXT
uint16_t objectRayOrigin : 1; // Shader calls gl_ObjectRayOriginEXT
uint16_t objectRayDirection : 1; // Shader calls gl_ObjectRayDirectionEXT
uint16_t worldToObject : 1; // Shader calls gl_WorldToObjectEXT
uint16_t hitTrianglePosition : 1; // Shader calls gl_HitTriangleVertexPositionsEXT
uint16_t reserved : 6; // Reserved
};
// All `primitive` flags viewed as a single 16-bit value
uint16_t u16All;
} primitive;
};
// Both groups viewed as a single 32-bit value
uint32_t u32All;
};

/// Represents ray-tracing shader export configuration
struct RayTracingShaderExportConfig {
unsigned indirectCallingConvention; ///< Indirect calling convention
Expand Down Expand Up @@ -1299,6 +1322,7 @@ struct GraphicsPipelineBuildInfo {
float pixelTransferBias[4]; ///< Bias apply to render color target
bool enableColorClampVs; ///< Enable clamp vertex output color
bool enableColorClampFs; ///< Enable clamp fragment output color
bool enableFlatShade; ///< Whether enable flat shade.
} glState;
const auto &getGlState() const { return glState; }
#endif
Expand Down Expand Up @@ -1597,6 +1621,7 @@ class IUtil {
///
/// @param [in] spvBin SPIR-V binary
static const char *VKAPI_CALL GetEntryPointNameFromSpirvBinary(const BinaryData *spvBin);
static const char *VKAPI_CALL GetResourceMappingNodeTypeName(ResourceMappingNodeType type);
};

/// 128-bit hash compatible structure
Expand Down
1 change: 1 addition & 0 deletions lgc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ target_sources(LLVMlgc PRIVATE
state/ShaderModes.cpp
state/ShaderStage.cpp
state/TargetInfo.cpp
state/RuntimeContext.cpp
)

# lgc/util
Expand Down
35 changes: 13 additions & 22 deletions lgc/builder/ArithBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -507,29 +507,20 @@ Value *BuilderImpl::CreateCosh(Value *x, const Twine &instName) {
// @param x : Input value X
// @param instName : Name to give instruction(s)
Value *BuilderImpl::CreateTanh(Value *x, const Twine &instName) {
// sinh(x) / cosh(x)
// (e^x - e^(-x))/(e^x + e^(-x))
// tanh(x) = copysign(1-2/(e^-|2x|+1),x)
// 1/log(2) = 1.442695
// e^x = 2^(x*(1/log(2))) = 2^(x*1.442695))
Value *divLog2 = CreateFMul(x, getRecipLog2(x->getType()));
Value *negDivLog2 = CreateFSub(ConstantFP::get(x->getType(), 0.0), divLog2);
Value *exp = CreateUnaryIntrinsic(Intrinsic::exp2, divLog2);
Value *expNeg = CreateUnaryIntrinsic(Intrinsic::exp2, negDivLog2);
Value *doubleSinh = CreateFSub(exp, expNeg);
Value *doubleCosh = CreateFAdd(exp, expNeg);
Value *result = fDivFast(doubleSinh, doubleCosh);

if (!getFastMathFlags().noInfs()) {
// NOTE: If the fast math flags might have INFs, we should check the special case when the input is +INF or -INF.
// According to the limit of tanh(x), we have following definitions:
// / 1.0, when x -> +INF
// lim(tanh(x)) =
// \ -1.0, when x -> -INF
Value *one = ConstantFP::get(x->getType(), 1.0);
Value *isInf = CreateIsInf(x);
result = CreateSelect(isInf, CreateCopySign(one, x), result);
}

// e = 2^(1/log(2))
// e^-|2x| = 2^(-|2x|*(1/log(2)))
auto vTy = x->getType();
Value *result = CreateIntrinsic(Intrinsic::fabs, vTy, x);
result = CreateFNeg(result);
result = CreateFMul(ConstantFP::get(vTy, 2.0), result);
result = CreateFMul(getRecipLog2(vTy), result);
result = CreateUnaryIntrinsic(Intrinsic::exp2, result);
result = CreateFAdd(ConstantFP::get(vTy, 1.0), result);
result = fDivFast(ConstantFP::get(vTy, 2.0), result);
result = CreateFSub(ConstantFP::get(vTy, 1.0), result);
result = CreateCopySign(result, x);
result->setName(instName);
return result;
}
Expand Down
12 changes: 5 additions & 7 deletions lgc/builder/BuilderImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -249,22 +249,20 @@ Value *BuilderImpl::CreateIntegerDotProduct(Value *vector1, Value *vector2, Valu

// =====================================================================================================================
// Get whether the context we are building in supports ds_bpermute or v_bpermute across all lanes in the wave
bool BuilderImpl::supportWaveWideBPermute() const {
//
// @param shaderStage : shader stage enum.
bool BuilderImpl::supportWaveWideBPermute(ShaderStageEnum shaderStage) const {
auto gfxIp = getPipelineState()->getTargetInfo().getGfxIpVersion().major;
auto supportBPermute = gfxIp == 8 || gfxIp == 9;
auto shaderStage = getShaderStage(GetInsertBlock()->getParent());
auto waveSize = getPipelineState()->getShaderWaveSize(shaderStage.value());
auto waveSize = getPipelineState()->getShaderWaveSize(shaderStage);
supportBPermute = supportBPermute || waveSize == 32;
return supportBPermute;
}

// =====================================================================================================================
// Get whether the context we are building in supports permute lane 64 DPP operations.
bool BuilderImpl::supportPermLane64Dpp() const {
auto gfxip = getPipelineState()->getTargetInfo().getGfxIpVersion().major;
auto shaderStage = getShaderStage(GetInsertBlock()->getParent());
auto waveSize = getPipelineState()->getShaderWaveSize(shaderStage.value());
return gfxip >= 11 && waveSize == 64;
return getPipelineState()->getTargetInfo().getGfxIpVersion().major >= 11;
}

// =====================================================================================================================
Expand Down
76 changes: 39 additions & 37 deletions lgc/builder/DescBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -394,45 +394,47 @@ Value *BuilderImpl::buildBufferCompactDesc(Value *desc, unsigned stride) {
Value *descElem1 = CreateExtractElement(desc, 1);

// Build normal buffer descriptor
// Dword 0
Value *bufDesc = PoisonValue::get(FixedVectorType::get(getInt32Ty(), 4));
bufDesc = CreateInsertElement(bufDesc, descElem0, uint64_t(0));

// Dword 1
SqBufRsrcWord1 sqBufRsrcWord1 = {};
sqBufRsrcWord1.bits.baseAddressHi = UINT16_MAX;
descElem1 = CreateAnd(descElem1, getInt32(sqBufRsrcWord1.u32All));
if (stride) {
SqBufRsrcWord1 sqBufRsrcWord1Stride = {};
sqBufRsrcWord1Stride.bits.stride = stride;
descElem1 = CreateOr(descElem1, getInt32(sqBufRsrcWord1Stride.u32All));
}
bufDesc = CreateInsertElement(bufDesc, descElem1, 1);

// Dword 2
SqBufRsrcWord2 sqBufRsrcWord2 = {};
sqBufRsrcWord2.bits.numRecords = UINT32_MAX;
bufDesc = CreateInsertElement(bufDesc, getInt32(sqBufRsrcWord2.u32All), 2);

// Dword 3
SqBufRsrcWord3 sqBufRsrcWord3 = {};
sqBufRsrcWord3.bits.dstSelX = BUF_DST_SEL_X;
sqBufRsrcWord3.bits.dstSelY = BUF_DST_SEL_Y;
sqBufRsrcWord3.bits.dstSelZ = BUF_DST_SEL_Z;
sqBufRsrcWord3.bits.dstSelW = BUF_DST_SEL_W;
if (gfxIp.major == 10) {
sqBufRsrcWord3.gfx10.format = BUF_FORMAT_32_UINT;
sqBufRsrcWord3.gfx10.resourceLevel = 1;
sqBufRsrcWord3.gfx10.oobSelect = stride ? 3 : 2;
assert(sqBufRsrcWord3.u32All == 0x21014FAC || sqBufRsrcWord3.u32All == 0x31014FAC);
} else if (gfxIp.major >= 11) {
sqBufRsrcWord3.gfx11.format = BUF_FORMAT_32_UINT;
sqBufRsrcWord3.gfx11.oobSelect = stride ? 3 : 2;
assert(sqBufRsrcWord3.u32All == 0x20014FAC || sqBufRsrcWord3.u32All == 0x30014FAC);
} else {
llvm_unreachable("Not implemented!");
{
// Dword 0
bufDesc = CreateInsertElement(bufDesc, descElem0, uint64_t(0));

// Dword 1
SqBufRsrcWord1 sqBufRsrcWord1 = {};
sqBufRsrcWord1.bits.baseAddressHi = UINT16_MAX;
descElem1 = CreateAnd(descElem1, getInt32(sqBufRsrcWord1.u32All));
if (stride) {
SqBufRsrcWord1 sqBufRsrcWord1Stride = {};
sqBufRsrcWord1Stride.bits.stride = stride;
descElem1 = CreateOr(descElem1, getInt32(sqBufRsrcWord1Stride.u32All));
}
bufDesc = CreateInsertElement(bufDesc, descElem1, 1);

// Dword 2
SqBufRsrcWord2 sqBufRsrcWord2 = {};
sqBufRsrcWord2.bits.numRecords = UINT32_MAX;
bufDesc = CreateInsertElement(bufDesc, getInt32(sqBufRsrcWord2.u32All), 2);

// Dword 3
SqBufRsrcWord3 sqBufRsrcWord3 = {};
sqBufRsrcWord3.bits.dstSelX = BUF_DST_SEL_X;
sqBufRsrcWord3.bits.dstSelY = BUF_DST_SEL_Y;
sqBufRsrcWord3.bits.dstSelZ = BUF_DST_SEL_Z;
sqBufRsrcWord3.bits.dstSelW = BUF_DST_SEL_W;
if (gfxIp.major == 10) {
sqBufRsrcWord3.gfx10.format = BUF_FORMAT_32_UINT;
sqBufRsrcWord3.gfx10.resourceLevel = 1;
sqBufRsrcWord3.gfx10.oobSelect = stride ? 3 : 2;
assert(sqBufRsrcWord3.u32All == 0x21014FAC || sqBufRsrcWord3.u32All == 0x31014FAC);
} else if (gfxIp.major >= 11) {
sqBufRsrcWord3.gfx11.format = BUF_FORMAT_32_UINT;
sqBufRsrcWord3.gfx11.oobSelect = stride ? 3 : 2;
assert(sqBufRsrcWord3.u32All == 0x20014FAC || sqBufRsrcWord3.u32All == 0x30014FAC);
} else {
llvm_unreachable("Not implemented!");
}
bufDesc = CreateInsertElement(bufDesc, getInt32(sqBufRsrcWord3.u32All), 3);
}
bufDesc = CreateInsertElement(bufDesc, getInt32(sqBufRsrcWord3.u32All), 3);

return bufDesc;
}
Loading

0 comments on commit f64d106

Please sign in to comment.