diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index f5b00d80cc..acc7c755f4 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -88,6 +88,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet( return UR_RESULT_SUCCESS; } +uint64_t calculateGlobalMemSize(ur_device_handle_t Device) { + // Cache GlobalMemSize + Device->ZeGlobalMemSize.Compute = + [Device](struct ze_global_memsize &GlobalMemSize) { + for (const auto &ZeDeviceMemoryExtProperty : + Device->ZeDeviceMemoryProperties->second) { + GlobalMemSize.value += ZeDeviceMemoryExtProperty.physicalSize; + } + if (GlobalMemSize.value == 0) { + for (const auto &ZeDeviceMemoryProperty : + Device->ZeDeviceMemoryProperties->first) { + GlobalMemSize.value += ZeDeviceMemoryProperty.totalSize; + } + } + }; + return Device->ZeGlobalMemSize.operator->()->value; +} + UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( ur_device_handle_t Device, ///< [in] handle of the device instance ur_device_info_t ParamName, ///< [in] type of the info to retrieve @@ -249,22 +267,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(uint32_t{64}); } case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: - return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize}); + // if not optimized for 32-bit access, return total memory size. + // otherwise, return only maximum allocatable size. + if (Device->useOptimized32bitAccess() == 0) { + return ReturnValue(uint64_t{calculateGlobalMemSize(Device)}); + } else { + return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize}); + } case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: { - uint64_t GlobalMemSize = 0; // Support to read physicalSize depends on kernel, // so fallback into reading totalSize if physicalSize // is not available. - for (const auto &ZeDeviceMemoryExtProperty : - Device->ZeDeviceMemoryProperties->second) { - GlobalMemSize += ZeDeviceMemoryExtProperty.physicalSize; - } - if (GlobalMemSize == 0) { - for (const auto &ZeDeviceMemoryProperty : - Device->ZeDeviceMemoryProperties->first) { - GlobalMemSize += ZeDeviceMemoryProperty.totalSize; - } - } + uint64_t GlobalMemSize = calculateGlobalMemSize(Device); return ReturnValue(uint64_t{GlobalMemSize}); } case UR_DEVICE_INFO_LOCAL_MEM_SIZE: @@ -637,6 +651,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( static_cast(ZE_RESULT_ERROR_UNINITIALIZED)); return UR_RESULT_ERROR_ADAPTER_SPECIFIC; } + // Calculate the global memory size as the max limit that can be reported as + // "free" memory for the user to allocate. + uint64_t GlobalMemSize = calculateGlobalMemSize(Device); // Only report device memory which zeMemAllocDevice can allocate from. // Currently this is only the one enumerated with ordinal 0. uint64_t FreeMemory = 0; @@ -661,7 +678,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( } } } - return ReturnValue(FreeMemory); + return ReturnValue(std::min(GlobalMemSize, FreeMemory)); } case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: { // If there are not any memory modules then return 0. @@ -900,6 +917,22 @@ ur_device_handle_t_::useImmediateCommandLists() { } } +int32_t ur_device_handle_t_::useOptimized32bitAccess() { + static const int32_t Optimize32bitAccessMode = [this] { + // If device is Intel(R) Data Center GPU Max, + // use default provided by L0 driver. + // TODO: Use IP versioning to select based on range of devices + if (this->isPVC()) + return -1; + const char *UrRet = std::getenv("UR_L0_USE_OPTIMIZED_32BIT_ACCESS"); + if (!UrRet) + return 0; + return std::atoi(UrRet); + }(); + + return Optimize32bitAccessMode; +} + ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal, int SubSubDeviceIndex) { // Maintain various device properties cache. diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp index 35404c6525..5f34efab44 100644 --- a/source/adapters/level_zero/device.hpp +++ b/source/adapters/level_zero/device.hpp @@ -39,6 +39,10 @@ enum EventsScope { LastCommandInBatchHostVisible }; +struct ze_global_memsize { + uint64_t value; +}; + struct ur_device_handle_t_ : _ur_object { ur_device_handle_t_(ze_device_handle_t Device, ur_platform_handle_t Plt, ur_device_handle_t ParentDevice = nullptr) @@ -141,6 +145,22 @@ struct ur_device_handle_t_ : _ur_object { // Returns whether immediate command lists are used on this device. ImmCmdlistMode ImmCommandListUsed{}; + // Returns whether large allocations are being used + // or not to have a consistent behavior throughout + // the adapter between the creation of large allocations + // and the compilation of kernels into stateful and + // stateless modes. + // With stateful mode, kernels are compiled with + // pointer-arithmetic optimizations for optimized + // access of allocations smaller than 4GB. + // In stateless mode, such optimizations are not + // applied. + // Even if a GPU supports both modes, L0 driver may + // provide support for only one, like for Intel(R) + // Data Center GPU Max, for which L0 driver only + // supports stateless. + int32_t useOptimized32bitAccess(); + bool isSubDevice() { return RootDevice != nullptr; } // Is this a Data Center GPU Max series (aka PVC)? @@ -170,4 +190,5 @@ struct ur_device_handle_t_ : _ur_object { ZeDeviceMemoryAccessProperties; ZeCache> ZeDeviceCacheProperties; ZeCache> ZeDeviceIpVersionExt; + ZeCache ZeGlobalMemSize; }; diff --git a/source/adapters/level_zero/program.cpp b/source/adapters/level_zero/program.cpp index 92a3c87aea..f118a5b9dd 100644 --- a/source/adapters/level_zero/program.cpp +++ b/source/adapters/level_zero/program.cpp @@ -148,9 +148,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp( ZeModuleDesc.format = (hProgram->State == ur_program_handle_t_::IL) ? ZE_MODULE_FORMAT_IL_SPIRV : ZE_MODULE_FORMAT_NATIVE; + ZeModuleDesc.inputSize = hProgram->CodeLength; ZeModuleDesc.pInputModule = hProgram->Code.get(); - ZeModuleDesc.pBuildFlags = pOptions; + + // if large allocations are selected, then pass + // ze-opt-greater-than-4GB-buffer-required to disable + // stateful optimizations and be able to use larger than + // 4GB allocations on these kernels. + std::string ZeBuildOptions{}; + if (pOptions) { + ZeBuildOptions += pOptions; + } + + if (phDevices[0]->useOptimized32bitAccess() == 0) { + ZeBuildOptions += " -ze-opt-greater-than-4GB-buffer-required"; + } + + ZeModuleDesc.pBuildFlags = ZeBuildOptions.c_str(); ZeModuleDesc.pConstants = Shim.ze(); ze_device_handle_t ZeDevice = phDevices[0]->ZeDevice; @@ -234,8 +249,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile( // This produces better code because the driver can do cross-module // optimizations. Therefore, we just remember the compilation flags, so we // can use them later. - if (Options) + if (Options) { Program->BuildFlags = Options; + + // if large allocations are selected, then pass + // ze-opt-greater-than-4GB-buffer-required to disable + // stateful optimizations and be able to use larger than + // 4GB allocations on these kernels. + if (Context->Devices[0]->useOptimized32bitAccess() == 0) { + Program->BuildFlags += " -ze-opt-greater-than-4GB-buffer-required"; + } + } Program->State = ur_program_handle_t_::Object; return UR_RESULT_SUCCESS; diff --git a/source/adapters/level_zero/usm.cpp b/source/adapters/level_zero/usm.cpp index daec0408fb..c6d98855e7 100644 --- a/source/adapters/level_zero/usm.cpp +++ b/source/adapters/level_zero/usm.cpp @@ -178,9 +178,11 @@ static ur_result_t USMDeviceAllocImpl(void **ResultPtr, ZeDesc.flags = 0; ZeDesc.ordinal = 0; - ZeStruct RelaxedDesc; - if (Size > Device->ZeDeviceProperties->maxMemAllocSize) { - // Tell Level-Zero to accept Size > maxMemAllocSize + if (Device->useOptimized32bitAccess() == 0 && + (Size > Device->ZeDeviceProperties->maxMemAllocSize)) { + // Tell Level-Zero to accept Size > maxMemAllocSize if + // large allocations are used. + ZeStruct RelaxedDesc; RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; ZeDesc.pNext = &RelaxedDesc; }