Merge pull request #76 from qingshui/paddlebox
support h800
qingshui committed Jan 18, 2024
2 parents 61fa982 + 2809f5f commit 61a9d60
Showing 47 changed files with 2,841 additions and 465 deletions.
56 changes: 39 additions & 17 deletions cmake/cuda.cmake
@@ -6,28 +6,34 @@ if(WITH_NV_JETSON)
add_definitions(-DWITH_NV_JETSON)
set(paddle_known_gpu_archs "53 62 72")
set(paddle_known_gpu_archs10 "53 62 72")
set(paddle_known_gpu_archs11 "53 62 72 87")
set(paddle_known_gpu_archs12 "53 62 72 87 90")
elseif(NEW_RELEASE_ALL)
message("Using New Release Strategy - All Arches Packge")
add_definitions(-DNEW_RELEASE_ALL)
set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75")
set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90")
set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
set(paddle_known_gpu_archs11 "50 60 61 70 75 80")
set(paddle_known_gpu_archs12 "50 60 61 70 75 80 90")
elseif(NEW_RELEASE_PYPI)
message("Using New Release Strategy - Cubin Packge")
add_definitions(-DNEW_RELEASE_PYPI)
set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90")
set(paddle_known_gpu_archs10 "")
set(paddle_known_gpu_archs11 "60 61 70 75 80")
set(paddle_known_gpu_archs11 "61 70 75 80")
set(paddle_known_gpu_archs12 "61 70 75 80 90")
elseif(NEW_RELEASE_JIT)
message("Using New Release Strategy - JIT Packge")
add_definitions(-DNEW_RELEASE_JIT)
set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
set(paddle_known_gpu_archs10 "35 50 60 70 75")
set(paddle_known_gpu_archs11 "35 50 60 70 75 80")
set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90")
set(paddle_known_gpu_archs10 "50 60 70 75")
set(paddle_known_gpu_archs11 "50 60 70 75 80")
set(paddle_known_gpu_archs12 "50 60 70 75 80 90")
else()
set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80")
set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75")
set(paddle_known_gpu_archs "70 80")
set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
set(paddle_known_gpu_archs12 "70 80")
endif()

######################################################################################
@@ -98,12 +104,12 @@ endfunction()
function(select_nvcc_arch_flags out_variable)
# List of arch names
set(archs_names
"Kepler"
"Maxwell"
"Pascal"
"Volta"
"Turing"
"Ampere"
"Hopper"
"All"
"Manual")
set(archs_name_default "Auto")
@@ -142,9 +148,7 @@ function(select_nvcc_arch_flags out_variable)
unset(CUDA_ARCH_PTX CACHE)
endif()

if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
set(cuda_arch_bin "30 35")
elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
if(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
if(WITH_NV_JETSON)
set(cuda_arch_bin "53")
else()
@@ -165,11 +169,17 @@ function(select_nvcc_arch_flags out_variable)
elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
set(cuda_arch_bin "75")
elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.1) # CUDA 11.0
set(cuda_arch_bin "80")
elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.1+
set(cuda_arch_bin "80 86")
if(WITH_NV_JETSON)
set(cuda_arch_bin "87")
else()
if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.1) # CUDA 11.0
set(cuda_arch_bin "80")
else()
set(cuda_arch_bin "80 86")
endif()
endif()
elseif(${CUDA_ARCH_NAME} STREQUAL "Hopper")
set(cuda_arch_bin "90")
elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(cuda_arch_bin ${paddle_known_gpu_archs})
elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
@@ -186,6 +196,13 @@ function(select_nvcc_arch_flags out_variable)
set(cuda_arch_bin ${CUDA_ARCH_BIN})
endif()

# cuda11.4
if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.6)
set(cuda_arch_bin "70 80")
else()
set(cuda_arch_bin "70 80 90")
endif()

if(NEW_RELEASE_JIT)
set(cuda_arch_ptx "${cuda_arch_ptx}${cuda_arch_bin}")
set(cuda_arch_bin "")
@@ -249,6 +266,11 @@ elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.2+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 13.0) # CUDA 12.0+
set(paddle_known_gpu_archs "${paddle_known_gpu_archs12} 90")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
endif()

if(NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
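For context on how the architecture lists above are consumed: each entry in cuda_arch_bin ultimately becomes an nvcc -gencode pair. The sketch below is a simplified illustration of that mapping only, not the exact flag assembly in select_nvcc_arch_flags (which also handles PTX-only entries); the example_-prefixed variable names are hypothetical.

# Simplified illustration (hypothetical variable names): expand an arch list
# such as "70 80 90" into nvcc -gencode flags.
set(example_arch_bin "70 80 90")
string(REPLACE " " ";" example_arch_list "${example_arch_bin}")
set(example_gencode_flags "")
foreach(arch ${example_arch_list})
  list(APPEND example_gencode_flags
       "-gencode arch=compute_${arch},code=sm_${arch}")
endforeach()
message(STATUS "Example nvcc flags: ${example_gencode_flags}")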
9 changes: 7 additions & 2 deletions cmake/external/gloo.cmake
@@ -25,8 +25,13 @@ set(GLOO_LIBRARY_DIR
"${GLOO_INSTALL_DIR}/lib"
CACHE PATH "gloo library directory." FORCE)
# As we add extra features for gloo, we use the non-official repo
set(GLOO_REPOSITORY ${GIT_URL}/sandyhouse/gloo.git)
set(GLOO_TAG v0.0.2)
if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0)
set(GLOO_REPOSITORY ${GIT_URL}/sandyhouse/gloo.git)
set(GLOO_TAG v0.0.2)
else()
set(GLOO_REPOSITORY ${GIT_URL}/ziyoujiyi/gloo.git)
set(GLOO_TAG v0.0.3)
endif()
set(GLOO_LIBRARIES
"${GLOO_INSTALL_DIR}/lib/libgloo.a"
CACHE FILEPATH "gloo library." FORCE)
6 changes: 5 additions & 1 deletion cmake/external/warpctc.cmake
@@ -23,7 +23,11 @@ set(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
# in case of low internet speed
#set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git)
set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b)
if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0)
set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b)
else()
set(WARPCTC_TAG bdc2b4550453e0ef2d3b5190f9c6103a84eff184)
endif()

set(WARPCTC_INCLUDE_DIR
"${WARPCTC_INSTALL_DIR}/include"
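Both gloo.cmake and warpctc.cmake now pin a different tag depending on whether the CUDA compiler is older than 12.0. A hedged sketch of one way to factor out that repeated check follows; the helper name select_tag_by_cuda12 is hypothetical and not part of this commit, and it assumes CMAKE_CUDA_COMPILER_VERSION has already been populated by CUDA language detection.

# Hypothetical helper, not part of this commit: pick a dependency tag based on
# whether the detected CUDA toolkit is older than 12.0.
function(select_tag_by_cuda12 out_var pre12_tag post12_tag)
  if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0)
    set(${out_var} ${pre12_tag} PARENT_SCOPE)
  else()
    set(${out_var} ${post12_tag} PARENT_SCOPE)
  endif()
endfunction()

# Example usage mirroring the warpctc change above:
select_tag_by_cuda12(WARPCTC_TAG
  37ece0e1bbe8a0019a63ac7e6462c36591c66a5b
  bdc2b4550453e0ef2d3b5190f9c6103a84eff184)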
24 changes: 24 additions & 0 deletions cmake/version.cmake
@@ -71,3 +71,27 @@ math(EXPR PADDLE_VERSION_INTEGER "${PADDLE_MAJOR_VER} * 1000000
add_definitions(-DPADDLE_VERSION=${PADDLE_VERSION})
add_definitions(-DPADDLE_VERSION_INTEGER=${PADDLE_VERSION_INTEGER})
message(STATUS "Paddle version is ${PADDLE_VERSION}")

#add git version
set(COMMIT_HASH "")
set(BRANCH_NAME "")
find_package(Git QUIET)
if(GIT_FOUND)
execute_process(
COMMAND ${GIT_EXECUTABLE} log -1 --pretty=format:%H
OUTPUT_VARIABLE COMMIT_HASH
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
execute_process(
COMMAND ${GIT_EXECUTABLE} symbolic-ref --short -q HEAD
OUTPUT_VARIABLE BRANCH_NAME
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
endif()
message(STATUS "Git version is ${BRANCH_NAME}:${COMMIT_HASH}")
add_definitions(-DPADDLE_BRANCH_NAME="${BRANCH_NAME}")
add_definitions(-DPADDLE_COMMIT_HASH="${COMMIT_HASH}")
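The two add_definitions calls bake the detected branch and commit into every translation unit as PADDLE_BRANCH_NAME and PADDLE_COMMIT_HASH. If the source tree is not a git checkout (or git is not found), both variables stay empty and the macros expand to empty strings. A small, hypothetical guard, placed before the add_definitions calls, could substitute a placeholder in that case; this is a sketch, not part of the commit.

# Hypothetical fallback, not part of this commit: keep the definitions
# meaningful when the build does not run inside a git checkout.
if(COMMIT_HASH STREQUAL "")
  set(COMMIT_HASH "unknown")
endif()
if(BRANCH_NAME STREQUAL "")
  set(BRANCH_NAME "unknown")
endif()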
10 changes: 6 additions & 4 deletions paddle/fluid/framework/boxps_worker.cc
@@ -968,10 +968,6 @@ void BoxPSWorker::CreateThreadScopeForNorm(const ProgramDesc& program) {
auto dim = root_tensor->dims();
param_sync_.share(gpu_tensor, len).Resize(dim);
skip_vars_.push_back(name);
// add copy back to root scope
if (device_id_ == 0) {
need_copy_vars_.push_back(name);
}
}
}
// data norm copy and learning rate
@@ -985,6 +981,11 @@ void BoxPSWorker::CreateThreadScopeForNorm(const ProgramDesc& program) {
place_,
static_cast<Tensor*>(gpu_tensor));
++copy_persist_num;
// add copy back to root scope
if (device_id_ == 0) {
need_copy_vars_.push_back(name);
skip_vars_.push_back(name);
}
}
} else {
auto* ptr = thread_scope_->Var(name);
@@ -1104,6 +1105,7 @@ void BoxPSWorker::CreateThreadScopeForSharding(const ProgramDesc& program) {
// device 0 need sync datanorm and learning rate to root scope
if (device_id_ == 0) {
need_copy_vars_.push_back(name);
skip_vars_.push_back(name);
}
}
} else {
20 changes: 15 additions & 5 deletions paddle/fluid/framework/fleet/box_wrapper.h
@@ -55,7 +55,7 @@ DECLARE_int32(padbox_dataset_shuffle_thread_num);

namespace paddle {
namespace framework {
extern int make_day_id(const int &y, const int &m, const int &d);
extern int make_day_id(const int& y, const int& m, const int& d);
#ifdef PADDLE_WITH_BOX_PS
#define MAX_GPU_NUM 16

@@ -322,6 +322,11 @@ class MetricMsg {
platform::errors::NotFound("Error: var %s is not found in scope.",
varname.c_str()));
auto& gpu_tensor = var->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(
gpu_tensor.IsInitialized(),
true,
platform::errors::InvalidArgument(
"Error: monitor var `%s` uninitialized Tensor.", varname.c_str()));
*data = gpu_tensor.data<T>();
*len = gpu_tensor.numel();
}
@@ -335,6 +340,11 @@ class MetricMsg {
platform::errors::NotFound("Error: var %s is not found in scope.",
varname.c_str()));
auto& gpu_tensor = var->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(
gpu_tensor.IsInitialized(),
true,
platform::errors::InvalidArgument(
"Error: monitor var `%s` uninitialized Tensor.", varname.c_str()));
auto* gpu_data = gpu_tensor.data<T>();
auto len = gpu_tensor.numel();
data->resize(len);
@@ -424,7 +434,7 @@ class BoxWrapper {
}
int GetMpiSize() { return boxps::MPICluster::Ins().size(); }
int GetMpiRank() { return boxps::MPICluster::Ins().rank(); }
int GetNCCLRankId(const int &device_id) {
int GetNCCLRankId(const int& device_id) {
return (GetMpiRank() * gpu_num_ + device_id);
}
int GetGpuNum() { return gpu_num_; }
@@ -832,7 +842,7 @@ class BoxWrapper {
for (auto& name : var_names) {
auto it = std::find(skip_gc_vars_.begin(), skip_gc_vars_.end(), name);
if (it != skip_gc_vars_.end()) {
return;
continue;
}
skip_gc_vars_.push_back(name);
}
@@ -1026,8 +1036,8 @@ class BoxHelper {

void SetDate(int year, int month, int day) {
day_id_ = make_day_id(year, month, day);
VLOG(0) << "BoxHelpler set year=" << year << ", month="
<< month << ", day=" << day << ", day id=" << day_id_;
VLOG(0) << "BoxHelpler set year=" << year << ", month=" << month
<< ", day=" << day << ", day id=" << day_id_;
}
void BeginPass() {
#ifdef PADDLE_WITH_BOX_PS
25 changes: 16 additions & 9 deletions paddle/fluid/framework/operator.cc
@@ -60,6 +60,9 @@ DECLARE_bool(check_nan_inf);
DECLARE_bool(enable_unused_var_check);
DECLARE_bool(run_kp_kernel);
DECLARE_bool(enable_host_event_recorder_hook);
PADDLE_DEFINE_EXPORTED_bool(enable_check_input_var,
false,
"enable check input var");

namespace paddle {
namespace framework {
@@ -1773,7 +1776,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
os << "\n";
printf("%s", os.str().c_str());
}
PADDLE_ENFORCE(false, "ERROR: check INF and NAN: %s",
PADDLE_ENFORCE(false,
"ERROR: check INF and NAN: %s",
DebugStringEx(&exec_scope).c_str());
}
#else
@@ -1938,7 +1942,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
} else if (!paddle::platform::is_xpu_support_op(type_, expected_kernel_key)) {
} else if (!paddle::platform::is_xpu_support_op(type_,
expected_kernel_key)) {
VLOG(3) << "fluid XPU not support kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
@@ -2419,13 +2424,15 @@ void OperatorWithKernel::ParseInputDataType(
}
}
if (t != nullptr) {
// PADDLE_ENFORCE_EQ(
// t->IsInitialized(),
// true,
// platform::errors::InvalidArgument("The %s Op's Input Variable `%s` "
// "contains uninitialized Tensor.",
// Type(),
// name));
if (FLAGS_enable_check_input_var) {
PADDLE_ENFORCE_EQ(
t->IsInitialized(),
true,
platform::errors::InvalidArgument("The %s Op's Input Variable `%s` "
"contains uninitialized Tensor.",
Type(),
name));
}
*data_type = paddle::framework::TransToProtoVarType(t->dtype());
}
}