Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[UR][L0] Unify use of large allocation in L0 adapter #1099

Merged
merged 2 commits into from
Dec 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 46 additions & 13 deletions source/adapters/level_zero/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(
return UR_RESULT_SUCCESS;
}

// Returns the global memory size reported for Device.
//
// The size is the sum of physicalSize over the extended memory properties
// (Device->ZeDeviceMemoryProperties->second). Support for reading
// physicalSize depends on the kernel, so when that sum is zero the totalSize
// of the base memory properties (Device->ZeDeviceMemoryProperties->first) is
// summed instead.
//
// The computation is installed as the Compute callback of
// Device->ZeGlobalMemSize and the result is read back through that cache.
uint64_t calculateGlobalMemSize(ur_device_handle_t Device) {
  Device->ZeGlobalMemSize.Compute =
      [Device](struct ze_global_memsize &GlobalMemSize) {
        // Sum into a local that starts at zero and publish it with a single
        // assignment, so the result never depends on whatever the cache slot
        // held before the callback ran.
        uint64_t Total = 0;
        for (const auto &ExtProperty :
             Device->ZeDeviceMemoryProperties->second) {
          Total += ExtProperty.physicalSize;
        }
        if (Total == 0) {
          // physicalSize was not available; fall back to totalSize.
          for (const auto &Property :
               Device->ZeDeviceMemoryProperties->first) {
            Total += Property.totalSize;
          }
        }
        GlobalMemSize.value = Total;
      };
  return Device->ZeGlobalMemSize->value;
}

UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
ur_device_handle_t Device, ///< [in] handle of the device instance
ur_device_info_t ParamName, ///< [in] type of the info to retrieve
Expand Down Expand Up @@ -249,22 +267,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
return ReturnValue(uint32_t{64});
}
case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE:
return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize});
// if not optimized for 32-bit access, return total memory size.
// otherwise, return only maximum allocatable size.
if (Device->useOptimized32bitAccess() == 0) {
return ReturnValue(uint64_t{calculateGlobalMemSize(Device)});
} else {
return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize});
}
case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: {
uint64_t GlobalMemSize = 0;
// Support to read physicalSize depends on kernel,
// so fallback into reading totalSize if physicalSize
// is not available.
for (const auto &ZeDeviceMemoryExtProperty :
Device->ZeDeviceMemoryProperties->second) {
GlobalMemSize += ZeDeviceMemoryExtProperty.physicalSize;
}
if (GlobalMemSize == 0) {
for (const auto &ZeDeviceMemoryProperty :
Device->ZeDeviceMemoryProperties->first) {
GlobalMemSize += ZeDeviceMemoryProperty.totalSize;
}
}
uint64_t GlobalMemSize = calculateGlobalMemSize(Device);
return ReturnValue(uint64_t{GlobalMemSize});
}
case UR_DEVICE_INFO_LOCAL_MEM_SIZE:
Expand Down Expand Up @@ -637,6 +651,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
static_cast<int32_t>(ZE_RESULT_ERROR_UNINITIALIZED));
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
}
// Calculate the global memory size as the max limit that can be reported as
// "free" memory for the user to allocate.
uint64_t GlobalMemSize = calculateGlobalMemSize(Device);
// Only report device memory which zeMemAllocDevice can allocate from.
// Currently this is only the one enumerated with ordinal 0.
uint64_t FreeMemory = 0;
Expand All @@ -661,7 +678,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
}
}
}
return ReturnValue(FreeMemory);
return ReturnValue(std::min(GlobalMemSize, FreeMemory));
}
case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: {
// If there are not any memory modules then return 0.
Expand Down Expand Up @@ -900,6 +917,22 @@ ur_device_handle_t_::useImmediateCommandLists() {
}
}

// Returns the optimized-32-bit-access mode for this device:
//   -1  when isPVC() is true (Intel(R) Data Center GPU Max): use the default
//       provided by the L0 driver;
//   otherwise the integer value of the UR_L0_USE_OPTIMIZED_32BIT_ACCESS
//   environment variable, or 0 when the variable is not set.
// Callers in this adapter compare the result against 0 to decide whether
// large allocations are in use.
int32_t ur_device_handle_t_::useOptimized32bitAccess() {
  // Only the environment lookup is cached in a function-local static. A
  // function-local static is shared by every device object, so caching the
  // isPVC() decision there would make all devices report the answer of
  // whichever device called first, which is wrong on non-uniform systems.
  static const int32_t EnvMode = [] {
    const char *UrRet = std::getenv("UR_L0_USE_OPTIMIZED_32BIT_ACCESS");
    if (!UrRet)
      return 0;
    return std::atoi(UrRet);
  }();

  // TODO: Use IP versioning to select based on range of devices
  if (this->isPVC())
    return -1;

  return EnvMode;
}

ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal,
int SubSubDeviceIndex) {
// Maintain various device properties cache.
Expand Down
21 changes: 21 additions & 0 deletions source/adapters/level_zero/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ enum EventsScope {
LastCommandInBatchHostVisible
};

// Holder for a device's global memory size so it can be stored in a ZeCache.
struct ze_global_memsize {
  // Starts at zero: the value is filled in by accumulation, so a
  // default-constructed instance must not carry an indeterminate value.
  uint64_t value = 0;
};

struct ur_device_handle_t_ : _ur_object {
ur_device_handle_t_(ze_device_handle_t Device, ur_platform_handle_t Plt,
ur_device_handle_t ParentDevice = nullptr)
Expand Down Expand Up @@ -141,6 +145,22 @@ struct ur_device_handle_t_ : _ur_object {
// Returns whether immediate command lists are used on this device.
ImmCmdlistMode ImmCommandListUsed{};

// Returns whether large allocations are being used
// or not to have a consistent behavior throughout
// the adapter between the creation of large allocations
// and the compilation of kernels into stateful and
// stateless modes.
// With stateful mode, kernels are compiled with
// pointer-arithmetic optimizations for optimized
// access of allocations smaller than 4GB.
// In stateless mode, such optimizations are not
// applied.
// Even if a GPU supports both modes, L0 driver may
// provide support for only one, like for Intel(R)
// Data Center GPU Max, for which L0 driver only
// supports stateless.
int32_t useOptimized32bitAccess();

bool isSubDevice() { return RootDevice != nullptr; }

// Is this a Data Center GPU Max series (aka PVC)?
Expand Down Expand Up @@ -170,4 +190,5 @@ struct ur_device_handle_t_ : _ur_object {
ZeDeviceMemoryAccessProperties;
ZeCache<ZeStruct<ze_device_cache_properties_t>> ZeDeviceCacheProperties;
ZeCache<ZeStruct<ze_device_ip_version_ext_t>> ZeDeviceIpVersionExt;
ZeCache<struct ze_global_memsize> ZeGlobalMemSize;
};
28 changes: 26 additions & 2 deletions source/adapters/level_zero/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp(
ZeModuleDesc.format = (hProgram->State == ur_program_handle_t_::IL)
? ZE_MODULE_FORMAT_IL_SPIRV
: ZE_MODULE_FORMAT_NATIVE;

ZeModuleDesc.inputSize = hProgram->CodeLength;
ZeModuleDesc.pInputModule = hProgram->Code.get();
ZeModuleDesc.pBuildFlags = pOptions;

// if large allocations are selected, then pass
// ze-opt-greater-than-4GB-buffer-required to disable
// stateful optimizations and be able to use larger than
// 4GB allocations on these kernels.
std::string ZeBuildOptions{};
if (pOptions) {
ZeBuildOptions += pOptions;
}

if (phDevices[0]->useOptimized32bitAccess() == 0) {
ZeBuildOptions += " -ze-opt-greater-than-4GB-buffer-required";
}

ZeModuleDesc.pBuildFlags = ZeBuildOptions.c_str();
ZeModuleDesc.pConstants = Shim.ze();

ze_device_handle_t ZeDevice = phDevices[0]->ZeDevice;
Expand Down Expand Up @@ -234,8 +249,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile(
// This produces better code because the driver can do cross-module
// optimizations. Therefore, we just remember the compilation flags, so we
// can use them later.
if (Options)
if (Options) {
Program->BuildFlags = Options;

// if large allocations are selected, then pass
// ze-opt-greater-than-4GB-buffer-required to disable
// stateful optimizations and be able to use larger than
// 4GB allocations on these kernels.
if (Context->Devices[0]->useOptimized32bitAccess() == 0) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be in the Exp function and have the Exp function have the Compile code now? That way you check the specific device being passed in and not just the Context->Devices[0] since you might be on a non-uniform system.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, @nrspruit. Good idea. However, urProgramCompileExp is currently unimplemented, so adding an implementation of urProgramCompileExp on top of these changes would make this PR too big. I think it is better to merge this patch first and then add support for urProgramCompileExp, including the functionality from this patch. What do you think?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, that would be fine; a follow-up patch would be a good improvement on this.

Program->BuildFlags += " -ze-opt-greater-than-4GB-buffer-required";
}
}
Program->State = ur_program_handle_t_::Object;

return UR_RESULT_SUCCESS;
Expand Down
8 changes: 5 additions & 3 deletions source/adapters/level_zero/usm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,11 @@ static ur_result_t USMDeviceAllocImpl(void **ResultPtr,
ZeDesc.flags = 0;
ZeDesc.ordinal = 0;

ZeStruct<ze_relaxed_allocation_limits_exp_desc_t> RelaxedDesc;
if (Size > Device->ZeDeviceProperties->maxMemAllocSize) {
// Tell Level-Zero to accept Size > maxMemAllocSize
if (Device->useOptimized32bitAccess() == 0 &&
(Size > Device->ZeDeviceProperties->maxMemAllocSize)) {
// Tell Level-Zero to accept Size > maxMemAllocSize if
// large allocations are used.
ZeStruct<ze_relaxed_allocation_limits_exp_desc_t> RelaxedDesc;
RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE;
ZeDesc.pNext = &RelaxedDesc;
}
Expand Down