Improve cost model on 4 models #826

Status: Open · wants to merge 2 commits into base: cmb_vodla_demo_0.2.1

Changes from 1 commit
4 changes: 3 additions & 1 deletion include/halo/halo.h
@@ -54,6 +54,7 @@ struct AnalyzerOpts {
bool print_details = false;
int batch_size = 1;
int qps = 0; // image per second
int model_type = 0;
};

struct CXXCodeGenOpts {
@@ -150,7 +151,8 @@ int halo_Analyze(halo::ModelFormat model_format, unsigned num_models,
const char* const input_shapes[], unsigned num_inputs,
const char* const inputs[], unsigned num_outputs,
const char* const outputs[], const HaloCodeGenOpts* cg_opts,
const char* main_output_file, HaloModelInfo* model_info);
const char* main_output_file, HaloModelInfo* model_info,
const int model_type);
}

#endif // HALO_HALO_H_
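
The new model_type field and trailing parameter select which fitted cost model the analyzer applies. The patch leaves the values as bare integers; a hypothetical enum (not part of this PR), with values inferred from the dispatch code in analyzer.cc and python/halo/halo.py below, would document them as:

// Hypothetical names for the model_type values; the numbers come from the
// dispatch logic later in this PR, but the enum itself is illustrative only.
enum AnalyzerModelType : int {
  kModelOther    = 0,  // fall back to the generic cost model
  kModelResNet50 = 1,
  kModelDBNet    = 2,
  kModelCRNN     = 3,
  kModelBERT     = 4,
};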
32 changes: 29 additions & 3 deletions lib/interface/interface.cc
@@ -53,7 +53,8 @@ static int InvokeCompiler(Module* m, const std::string& target, int batch,
ModelFormat model_format,
const CXXCodeGenOpts& cg_opts,
const std::string& main_output_file_name,
ModelInfo* model_info, bool is_compile_model = true) {
ModelInfo* model_info, bool is_compile_model = true,
const int model_type = 0) {
auto& ctx = m->GetGlobalContext();
ctx.SetVerbosity(1);
ctx.SetBasePath(GetBaseDir());
@@ -107,6 +108,7 @@ static int InvokeCompiler(Module* m, const std::string& target, int batch,
alz_opts.batch_size = model_info->adaptive_bsz;
alz_opts.print_details = false;
alz_opts.qps = model_info->input_qps;
alz_opts.model_type = model_type;
Analyzer* analyzer =
static_cast<Analyzer*>(pm.AddAnalyzerPass(&std::cout, alz_opts));
pm.Run(m);
@@ -149,6 +151,29 @@ int Compile(ModelFormat format, const std::vector<const void*>& model_defs,
format, cg_opts, main_output_file_name, model_info);
}

HL_API_EXPORT
int Compile(halo::ModelFormat format, const std::vector<const char*>& models,
const std::vector<size_t>& model_sizes, const std::string& target,
int batch, const std::vector<std::string>& input_shapes,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const CXXCodeGenOpts& cg_opts,
const std::string& main_output_file_name, ModelInfo* model_info,
bool is_compile_model, const int model_type) {
GlobalContext ctx;
Function* func;
std::unique_ptr<Module> m;
std::tie(m, func) = CreateModule(&ctx, target);
if (auto status = Parser::Parse(func, models, model_sizes, format);
status != Status::SUCCESS) {
return 1;
}

return InvokeCompiler(m.get(), target, batch, input_shapes, inputs, outputs,
format, cg_opts, main_output_file_name, model_info,
is_compile_model, model_type);
}

HL_API_EXPORT
int Compile(halo::ModelFormat format, const std::vector<const char*>& models,
const std::vector<size_t>& model_sizes, const std::string& target,
@@ -250,7 +275,8 @@ int halo_Analyze(halo::ModelFormat model_format, unsigned num_models,
const char* const input_shapes[], unsigned num_inputs,
const char* const inputs[], unsigned num_outputs,
const char* const outputs[], const HaloCodeGenOpts* cg_opts,
const char* main_output_file, HaloModelInfo* model_info) {
const char* main_output_file, HaloModelInfo* model_info,
const int model_type) {
const halo::CXXCodeGenOpts& opts =
*reinterpret_cast<const halo::CXXCodeGenOpts*>(cg_opts);
std::vector<const char*> models_data(num_models);
…
model_format, models_data, models_sizes, std::string(target), batch,
ToStrings(num_input_shapes, input_shapes), ToStrings(num_inputs, inputs),
ToStrings(num_outputs, outputs), opts, std::string(main_output_file),
model_info, false);
model_info, false, model_type);
}
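
For illustration, a minimal sketch of driving the new Compile overload from C++; only the signature and the analyze-only convention (is_compile_model = false, as halo_Analyze uses above) come from the patch, while the wrapper name, target string, and input handling are hypothetical:

// Hypothetical wrapper, not part of this PR; assumes Compile, ModelInfo, and
// CXXCodeGenOpts live in namespace halo as declared in include/halo/halo.h.
#include <cstddef>
#include <vector>
#include "halo/halo.h"

int AnalyzeOneModel(halo::ModelFormat format, const char* data, size_t size,
                    halo::ModelInfo* info, int model_type) {
  halo::CXXCodeGenOpts cg_opts{};          // default codegen options
  std::vector<const char*> models{data};   // a single in-memory model
  std::vector<size_t> sizes{size};
  // is_compile_model = false requests analysis only, mirroring halo_Analyze.
  return halo::Compile(format, models, sizes, /*target=*/"cxx", /*batch=*/1,
                       /*input_shapes=*/{}, /*inputs=*/{}, /*outputs=*/{},
                       cg_opts, /*main_output_file_name=*/"analysis.cc", info,
                       /*is_compile_model=*/false, model_type);
}

The "cxx" target string matches the one AnalyzeModel passes in python/halo/halo.py below.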
135 changes: 132 additions & 3 deletions lib/transforms/analyzer.cc
@@ -565,6 +565,32 @@ bool Analyzer::RunOnModule(Module* m) {
// return l;
// }

static float NewtonSolver(const std::array<double, 4> func, int iteration,
float error) {
const std::array<double, 3> func_de{func[1], func[2] * 2, func[3] * 3};
const float init = 50;
const float max_per = 100;
const float min_per = 0;
float per = init;

for (int i = 0; i < iteration; i++) {
if (fabs(func[0] + func[1] * per + func[2] * per * per +
func[3] * per * per * per) < error) {
break;
}
per = per - (func[0] + func[1] * per + func[2] * per * per +
func[3] * per * per * per) /
(func_de[0] + func_de[1] * per + func_de[2] * per * per);
}
if (per > max_per) {
per = max_per;
} else if (per < min_per) {
per = min_per;
}

return per;
}
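
In effect, NewtonSolver runs the standard Newton–Raphson update on a cubic in the GPU-share percentage $p$:

\[
f(p) = c_0 + c_1 p + c_2 p^2 + c_3 p^3, \qquad
f'(p) = c_1 + 2 c_2 p + 3 c_3 p^2, \qquad
p_{k+1} = p_k - \frac{f(p_k)}{f'(p_k)},
\]

starting from $p_0 = 50$, stopping after iteration steps or once $|f(p_k)| < \text{error}$, and clamping the result to $[0, 100]$.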

void Analyzer::GenerateRscInfo(std::ostream& os) {
static constexpr float mflops = 1000000.0F;
static constexpr float gflops = 1000 * mflops;
@@ -688,8 +714,8 @@ void Analyzer::GenerateRscInfo(std::ostream& os) {
hw_paras_step["GPU_t4"].other_time *
(total_flops - conv_flops - conv_act_flops - matmul_flops);
// knl_latency_temp *= static_cast<float>(adaptive_bsz_);
cur_qps = opts_.batch_size * ms2s * 4 /
(init_latency + knl_latency_temp * opts_.batch_size);
cur_qps = float(opts_.batch_size) * ms2s * 4 /
(init_latency + knl_latency_temp * float(opts_.batch_size));
// os << "step:" << step << " ,cur_qps:" << cur_qps << "\n";
// os << "hw_paras_step[GPU_t4].conv_time" <<
// hw_paras_step["GPU_t4"].conv_time
@@ -742,11 +768,114 @@ void Analyzer::GenerateRscInfo(std::ostream& os) {

float est_latency = init_latency + knl_latency;
const float t4 = t4_flops / 100;
const double u_sec = 1e+6;
const int iteration = 10;
const float error_rate = 0.001;
const float max_percent = 100;
if (opts_.model_type == 1) {
// const std::array<double, 10> model{64073.283167584894, -88.91731411,
// 12.78189374, 26.05789414,
// 8533.30914793, -2900.88985761};
// const std::array<double, 4> func{
// model[0] +
// model[2] * float(opts_.batch_size) * float(opts_.batch_size) +
// model[4] * float(opts_.batch_size) -
// u_sec * float(opts_.batch_size) / opts_.qps,
// model[1] * float(opts_.batch_size) + model[5], model[3]};
const int resnet_max_batch = 64;
if (opts_.batch_size > resnet_max_batch) {
opts_.batch_size = resnet_max_batch;
}
const std::array<double, 10> model{
49902.8906207358, -3.30238451e+02, -2.17410190e+01, 9.51439925e+01,
1.34280387e+04, -4.18767285e+03, 2.18543166e+00, -4.47421309e-03,
7.32400224e-01, -5.81271182e-01};
const std::array<double, 4> func{
model[0] + model[4] * float(opts_.batch_size) +
model[8] * float(opts_.batch_size) * float(opts_.batch_size) *
float(opts_.batch_size) +
model[2] * float(opts_.batch_size) * float(opts_.batch_size) -
u_sec * float(opts_.batch_size) / opts_.qps,
model[1] * float(opts_.batch_size) + model[5] +
model[7] * float(opts_.batch_size) * float(opts_.batch_size),
model[3] + model[6] * float(opts_.batch_size), model[9]};
float per =
NewtonSolver(func, iteration,
error_rate * u_sec * float(opts_.batch_size) / opts_.qps);
floatsrate = per * t4_flops / max_percent;
os << "Model: resnet50"
<< "\n";
os << "est latency: "
<< func[0] + func[1] * per + func[2] * per * per +
u_sec * float(opts_.batch_size) / opts_.qps +
func[3] * per * per * per
<< "\n";
} else if (opts_.model_type == 2) {
const std::array<double, 4> func{
88324.13992776436 - u_sec * float(opts_.batch_size) / opts_.qps,
-2.75316291e+03, 3.90359192e+01, -1.81786268e-01};
float per =
NewtonSolver(func, iteration,
error_rate * u_sec * float(opts_.batch_size) / opts_.qps);
floatsrate = per * t4_flops / max_percent;
os << "Model: dbnet"
<< "\n";
os << "est latency: "
<< func[0] + func[1] * per + func[2] * per * per +
func[3] * per * per * per +
u_sec * float(opts_.batch_size) / opts_.qps
<< "\n";
} else if (opts_.model_type == 3) {
const std::array<double, 4> func{
31525.584310580438 - u_sec * float(opts_.batch_size) / opts_.qps,
-475.78524037, 2.58107976, 0.0};
float per =
NewtonSolver(func, iteration,
error_rate * u_sec * float(opts_.batch_size) / opts_.qps);
floatsrate = per * t4_flops / max_percent;
os << "Model: crnn"
<< "\n";
os << "est latency: "
<< func[0] + func[1] * per + func[2] * per * per +
func[3] * per * per * per +
u_sec * float(opts_.batch_size) / opts_.qps
<< "\n";
} else if (opts_.model_type == 4) {
const int bert_max_batch = 128;
if (opts_.batch_size > bert_max_batch) {
opts_.batch_size = bert_max_batch;
}
const std::array<double, 6> model{438429.4914344477, -5.17070386e+02,
1.96615464e+01, 2.23010546e+02,
5.54527169e+04, -2.45070285e+04};
const std::array<double, 4> func{
model[0] +
model[2] * float(opts_.batch_size) * float(opts_.batch_size) +
model[4] * float(opts_.batch_size) -
u_sec * float(opts_.batch_size) / opts_.qps,
model[1] * float(opts_.batch_size) + model[5], model[3]};
float per =
NewtonSolver(func, iteration,
error_rate * u_sec * float(opts_.batch_size) / opts_.qps);
floatsrate = per * t4_flops / max_percent;
os << "Model: bert"
<< "\n";
os << "est latency: "
<< func[0] + func[1] * per + func[2] * per * per +
func[3] * per * per * per +
u_sec * float(opts_.batch_size) / opts_.qps
<< "\n";
} else {
os << "Model: other"
<< "\n";
}
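
Each model_type branch instantiates the same pattern: the fitted latency (in microseconds, at batch size $b$) is a cubic in the GPU-share percentage $p$, with the latency budget implied by the requested QPS folded into the constant term, so NewtonSolver looks for a root of

\[
f(p) = \Big(c_0(b) - \frac{10^6\, b}{qps}\Big) + c_1(b)\, p + c_2(b)\, p^2 + c_3\, p^3 .
\]

For resnet50 the lower-order coefficients are themselves polynomials in $b$; for dbnet and crnn they are constants; for bert the initializer list supplies only three values, so the cubic coefficient func[3] is value-initialized to 0. The solved percentage is then converted to a FLOPs requirement via floatsrate = per * t4_flops / max_percent, which the output lines below report as a share of one T4.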

os << "Device: GPU T4"
<< "\n";
os << "batch size: " << adaptive_bsz_ << "\n";
os << "est FLOPs: " << floatsrate << " gFlops\n";
os << "est split: " << floatsrate / t4 << "% T4\n";
os << "model FLOPs: " << floatsrate / t4 << "% T4\n";
os << "est latency: " << est_latency << " ms\n";
os << "est mem: " << trt_mem << " MB\n";
/*-----Generated T4 parameters-----------------*/
…
rsc_req_.append("\"size\":1,");
rsc_req_.append("\"flops\":\"");
// std::string s = std::to_string(total_flops * gflops);
std::string s = std::to_string(ceil(floatsrate));
std::string s = std::to_string(ceil(int(floatsrate)));
rsc_req_.append(s.substr(0, s.find('.')));
rsc_req_.append("\",");
rsc_req_.append("\"precision\":\"Fp32\",");
13 changes: 12 additions & 1 deletion python/halo/halo.py
@@ -118,6 +118,7 @@ class ModelInfo(Structure):
c_void_p, # cg_opts
c_char_p, # filename
c_void_p, # model_info
c_int, # model_type
]


@@ -228,7 +229,16 @@ def AnalyzeModel(model_file, input_shapes, batch, format, model_info):

target = "cxx".encode("utf-8")
output_filename = output_file.encode("utf-8")

if("resnet50" in model_file):
[Inline review comment]
Collaborator: Can we pass the model_type from the user's command-line flags?

Author: I have changed the way the model_type parameter is obtained; we can now pass -t resnet50/bert/crnn/dbnet to choose the specific model.

model_type = 1
elif("dbnet" in model_file):
model_type = 2
elif("crnn" in model_file):
model_type = 3
elif("bert" in model_file):
model_type = 4
else:
model_type = 0
Analyze(
format_val,
model_num,
…
pointer(opts),
output_filename,
pointer(model_info),
model_type,
)

def CompileODLAModel(files, device, debug=False):