Skip to content

Commit

Permalink
Merge pull request oneapi-src#1099 from jandres742/largeallocations
Browse files Browse the repository at this point in the history
[UR][L0] Unify use of large allocation in L0 adapter
  • Loading branch information
kbenzie committed Dec 15, 2023
1 parent 1584019 commit 0b95702
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 6 deletions.
24 changes: 23 additions & 1 deletion source/adapters/level_zero/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
return ReturnValue(uint32_t{64});
}
case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE:
return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize});
// If the device is not optimized for 32-bit access, return the total
// memory size; otherwise, return only the maximum allocatable size.
if (Device->useOptimized32bitAccess() == 0) {
return ReturnValue(uint64_t{calculateGlobalMemSize(Device)});
} else {
return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize});
}
case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: {
// Support to read physicalSize depends on kernel,
// so fallback into reading totalSize if physicalSize
Expand Down Expand Up @@ -911,6 +917,22 @@ ur_device_handle_t_::useImmediateCommandLists() {
}
}

// Returns the 32-bit-access optimization mode for this device:
//   -1  this device is an Intel(R) Data Center GPU Max (PVC); the default
//       provided by the L0 driver is used.
//    0  UR_L0_USE_OPTIMIZED_32BIT_ACCESS is not set (or parses to 0).
//   otherwise, the integer parsed from UR_L0_USE_OPTIMIZED_32BIT_ACCESS.
int32_t ur_device_handle_t_::useOptimized32bitAccess() {
  // The PVC check is a per-device property, so it must be evaluated on every
  // call. Caching it in a function-local static would make the first device
  // that calls this function decide the answer for every other device in the
  // process, which is wrong on systems mixing PVC and non-PVC devices.
  // TODO: Use IP versioning to select based on range of devices
  if (this->isPVC())
    return -1;

  // The environment variable is process-wide, so it is safe to read and
  // parse it only once and share the result between all devices.
  static const int32_t EnvOptimize32bitAccessMode = [] {
    const char *UrRet = std::getenv("UR_L0_USE_OPTIMIZED_32BIT_ACCESS");
    if (!UrRet)
      return 0;
    return std::atoi(UrRet);
  }();

  return EnvOptimize32bitAccessMode;
}

ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal,
int SubSubDeviceIndex) {
// Maintain various device properties cache.
Expand Down
16 changes: 16 additions & 0 deletions source/adapters/level_zero/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,22 @@ struct ur_device_handle_t_ : _ur_object {
// Whether immediate command lists are used on this device.
ImmCmdlistMode ImmCommandListUsed{};

// Reports the 32-bit-access optimization mode of this device, which
// determines whether large allocations are being used
// or not to have a consistent behavior throughout
// the adapter between the creation of large allocations
// and the compilation of kernels into stateful and
// stateless modes.
// With stateful mode, kernels are compiled with
// pointer-arithmetic optimizations for optimized
// access of allocations smaller than 4GB.
// In stateless mode, such optimizations are not
// applied.
// Even if a GPU supports both modes, L0 driver may
// provide support for only one, like for Intel(R)
// Data Center GPU Max, for which L0 driver only
// supports stateless.
//
// Return value (note: this is NOT a boolean):
//   -1  the device is a Data Center GPU Max (PVC); the default provided
//       by the L0 driver is used.
//    0  UR_L0_USE_OPTIMIZED_32BIT_ACCESS is not set. Callers treat 0 as
//       "large allocations in use": they relax the allocation size limit
//       and pass -ze-opt-greater-than-4GB-buffer-required when building.
//   otherwise, the integer value parsed from the
//       UR_L0_USE_OPTIMIZED_32BIT_ACCESS environment variable.
int32_t useOptimized32bitAccess();

bool isSubDevice() { return RootDevice != nullptr; }

// Is this a Data Center GPU Max series (aka PVC)?
Expand Down
28 changes: 26 additions & 2 deletions source/adapters/level_zero/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp(
ZeModuleDesc.format = (hProgram->State == ur_program_handle_t_::IL)
? ZE_MODULE_FORMAT_IL_SPIRV
: ZE_MODULE_FORMAT_NATIVE;

ZeModuleDesc.inputSize = hProgram->CodeLength;
ZeModuleDesc.pInputModule = hProgram->Code.get();
ZeModuleDesc.pBuildFlags = pOptions;

// if large allocations are selected, then pass
// ze-opt-greater-than-4GB-buffer-required to disable
// stateful optimizations and be able to use larger than
// 4GB allocations on these kernels.
std::string ZeBuildOptions{};
if (pOptions) {
ZeBuildOptions += pOptions;
}

if (phDevices[0]->useOptimized32bitAccess() == 0) {
ZeBuildOptions += " -ze-opt-greater-than-4GB-buffer-required";
}

ZeModuleDesc.pBuildFlags = ZeBuildOptions.c_str();
ZeModuleDesc.pConstants = Shim.ze();

ze_device_handle_t ZeDevice = phDevices[0]->ZeDevice;
Expand Down Expand Up @@ -234,8 +249,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile(
// This produces better code because the driver can do cross-module
// optimizations. Therefore, we just remember the compilation flags, so we
// can use them later.
if (Options)
if (Options) {
Program->BuildFlags = Options;

// if large allocations are selected, then pass
// ze-opt-greater-than-4GB-buffer-required to disable
// stateful optimizations and be able to use larger than
// 4GB allocations on these kernels.
if (Context->Devices[0]->useOptimized32bitAccess() == 0) {
Program->BuildFlags += " -ze-opt-greater-than-4GB-buffer-required";
}
}
Program->State = ur_program_handle_t_::Object;

return UR_RESULT_SUCCESS;
Expand Down
8 changes: 5 additions & 3 deletions source/adapters/level_zero/usm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,11 @@ static ur_result_t USMDeviceAllocImpl(void **ResultPtr,
ZeDesc.flags = 0;
ZeDesc.ordinal = 0;

ZeStruct<ze_relaxed_allocation_limits_exp_desc_t> RelaxedDesc;
if (Size > Device->ZeDeviceProperties->maxMemAllocSize) {
// Tell Level-Zero to accept Size > maxMemAllocSize
if (Device->useOptimized32bitAccess() == 0 &&
(Size > Device->ZeDeviceProperties->maxMemAllocSize)) {
// Tell Level-Zero to accept Size > maxMemAllocSize if
// large allocations are used.
ZeStruct<ze_relaxed_allocation_limits_exp_desc_t> RelaxedDesc;
RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE;
ZeDesc.pNext = &RelaxedDesc;
}
Expand Down

0 comments on commit 0b95702

Please sign in to comment.