Improve cost model on 4 models #826

Status: Open · wants to merge 2 commits into base: cmb_vodla_demo_0.2.1

Changes from 1 commit
4 changes: 3 additions & 1 deletion include/halo/halo.h
@@ -54,6 +54,7 @@ struct AnalyzerOpts {
bool print_details = false;
int batch_size = 1;
int qps = 0; // image per second
int model_type = 0;
};

struct CXXCodeGenOpts {
@@ -150,7 +151,8 @@ int halo_Analyze(halo::ModelFormat model_format, unsigned num_models,
const char* const input_shapes[], unsigned num_inputs,
const char* const inputs[], unsigned num_outputs,
const char* const outputs[], const HaloCodeGenOpts* cg_opts,
const char* main_output_file, HaloModelInfo* model_info);
const char* main_output_file, HaloModelInfo* model_info,
const int model_type);
}

#endif // HALO_HALO_H_
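
The new model_type field and trailing parameter select which fitted cost model the analyzer applies. The patch leaves the values as bare integers; a hypothetical enum (not part of this PR), with values inferred from the dispatch code in analyzer.cc and python/halo/halo.py below, would document them as:

// Hypothetical names for the model_type values; the numbers come from the
// dispatch logic later in this PR, but the enum itself is illustrative only.
enum AnalyzerModelType : int {
  kModelOther    = 0,  // fall back to the generic cost model
  kModelResNet50 = 1,
  kModelDBNet    = 2,
  kModelCRNN     = 3,
  kModelBERT     = 4,
};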
32 changes: 29 additions & 3 deletions lib/interface/interface.cc
@@ -53,7 +53,8 @@ static int InvokeCompiler(Module* m, const std::string& target, int batch,
ModelFormat model_format,
const CXXCodeGenOpts& cg_opts,
const std::string& main_output_file_name,
ModelInfo* model_info, bool is_compile_model = true) {
ModelInfo* model_info, bool is_compile_model = true,
const int model_type = 0) {
auto& ctx = m->GetGlobalContext();
ctx.SetVerbosity(1);
ctx.SetBasePath(GetBaseDir());
@@ -107,6 +108,7 @@ static int InvokeCompiler(Module* m, const std::string& target, int batch,
alz_opts.batch_size = model_info->adaptive_bsz;
alz_opts.print_details = false;
alz_opts.qps = model_info->input_qps;
alz_opts.model_type = model_type;
Analyzer* analyzer =
static_cast<Analyzer*>(pm.AddAnalyzerPass(&std::cout, alz_opts));
pm.Run(m);
@@ -149,6 +151,29 @@ int Compile(ModelFormat format, const std::vector<const void*>& model_defs,
format, cg_opts, main_output_file_name, model_info);
}

HL_API_EXPORT
int Compile(halo::ModelFormat format, const std::vector<const char*>& models,
const std::vector<size_t>& model_sizes, const std::string& target,
int batch, const std::vector<std::string>& input_shapes,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const CXXCodeGenOpts& cg_opts,
const std::string& main_output_file_name, ModelInfo* model_info,
bool is_compile_model, const int model_type) {
GlobalContext ctx;
Function* func;
std::unique_ptr<Module> m;
std::tie(m, func) = CreateModule(&ctx, target);
if (auto status = Parser::Parse(func, models, model_sizes, format);
status != Status::SUCCESS) {
return 1;
}

return InvokeCompiler(m.get(), target, batch, input_shapes, inputs, outputs,
format, cg_opts, main_output_file_name, model_info,
is_compile_model, model_type);
}

HL_API_EXPORT
int Compile(halo::ModelFormat format, const std::vector<const char*>& models,
const std::vector<size_t>& model_sizes, const std::string& target,
@@ -250,7 +275,8 @@ int halo_Analyze(halo::ModelFormat model_format, unsigned num_models,
const char* const input_shapes[], unsigned num_inputs,
const char* const inputs[], unsigned num_outputs,
const char* const outputs[], const HaloCodeGenOpts* cg_opts,
const char* main_output_file, HaloModelInfo* model_info) {
const char* main_output_file, HaloModelInfo* model_info,
const int model_type) {
const halo::CXXCodeGenOpts& opts =
*reinterpret_cast<const halo::CXXCodeGenOpts*>(cg_opts);
std::vector<const char*> models_data(num_models);
…
model_format, models_data, models_sizes, std::string(target), batch,
ToStrings(num_input_shapes, input_shapes), ToStrings(num_inputs, inputs),
ToStrings(num_outputs, outputs), opts, std::string(main_output_file),
model_info, false);
model_info, false, model_type);
}
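
For illustration, a minimal sketch of driving the new Compile overload from C++; only the signature and the analyze-only convention (is_compile_model = false, as halo_Analyze uses above) come from the patch, while the wrapper name, target string, and input handling are hypothetical:

// Hypothetical wrapper, not part of this PR; assumes Compile, ModelInfo, and
// CXXCodeGenOpts live in namespace halo as declared in include/halo/halo.h.
#include <cstddef>
#include <vector>
#include "halo/halo.h"

int AnalyzeOneModel(halo::ModelFormat format, const char* data, size_t size,
                    halo::ModelInfo* info, int model_type) {
  halo::CXXCodeGenOpts cg_opts{};          // default codegen options
  std::vector<const char*> models{data};   // a single in-memory model
  std::vector<size_t> sizes{size};
  // is_compile_model = false requests analysis only, mirroring halo_Analyze.
  return halo::Compile(format, models, sizes, /*target=*/"cxx", /*batch=*/1,
                       /*input_shapes=*/{}, /*inputs=*/{}, /*outputs=*/{},
                       cg_opts, /*main_output_file_name=*/"analysis.cc", info,
                       /*is_compile_model=*/false, model_type);
}

The "cxx" target string matches the one AnalyzeModel passes in python/halo/halo.py below.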
135 changes: 132 additions & 3 deletions lib/transforms/analyzer.cc
@@ -565,6 +565,32 @@ bool Analyzer::RunOnModule(Module* m) {
// return l;
// }

static float NewtonSolver(const std::array<double, 4> func, int iteration,
float error) {
const std::array<double, 3> func_de{func[1], func[2] * 2, func[3] * 3};
const float init = 50;
const float max_per = 100;
const float min_per = 0;
float per = init;

for (int i = 0; i < iteration; i++) {
if (fabs(func[0] + func[1] * per + func[2] * per * per +
func[3] * per * per * per) < error) {
break;
}
per = per - (func[0] + func[1] * per + func[2] * per * per +
func[3] * per * per * per) /
(func_de[0] + func_de[1] * per + func_de[2] * per * per);
}
if (per > max_per) {
per = max_per;
} else if (per < min_per) {
per = min_per;
}

return per;
}
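
In effect, NewtonSolver runs the standard Newton–Raphson update on a cubic in the GPU-share percentage $p$:

\[
f(p) = c_0 + c_1 p + c_2 p^2 + c_3 p^3, \qquad
f'(p) = c_1 + 2 c_2 p + 3 c_3 p^2, \qquad
p_{k+1} = p_k - \frac{f(p_k)}{f'(p_k)},
\]

starting from $p_0 = 50$, stopping after iteration steps or once $|f(p_k)| < \text{error}$, and clamping the result to $[0, 100]$.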

void Analyzer::GenerateRscInfo(std::ostream& os) {
static constexpr float mflops = 1000000.0F;
static constexpr float gflops = 1000 * mflops;
@@ -688,8 +714,8 @@ void Analyzer::GenerateRscInfo(std::ostream& os) {
hw_paras_step["GPU_t4"].other_time *
(total_flops - conv_flops - conv_act_flops - matmul_flops);
// knl_latency_temp *= static_cast<float>(adaptive_bsz_);
cur_qps = opts_.batch_size * ms2s * 4 /
(init_latency + knl_latency_temp * opts_.batch_size);
cur_qps = float(opts_.batch_size) * ms2s * 4 /
(init_latency + knl_latency_temp * float(opts_.batch_size));
// os << "step:" << step << " ,cur_qps:" << cur_qps << "\n";
// os << "hw_paras_step[GPU_t4].conv_time" <<
// hw_paras_step["GPU_t4"].conv_time
@@ -742,11 +768,114 @@ void Analyzer::GenerateRscInfo(std::ostream& os) {

float est_latency = init_latency + knl_latency;
const float t4 = t4_flops / 100;
const double u_sec = 1e+6;
const int iteration = 10;
const float error_rate = 0.001;
const float max_percent = 100;
if (opts_.model_type == 1) {
// const std::array<double, 10> model{64073.283167584894, -88.91731411,
// 12.78189374, 26.05789414,
// 8533.30914793, -2900.88985761};
// const std::array<double, 4> func{
// model[0] +
// model[2] * float(opts_.batch_size) * float(opts_.batch_size) +
// model[4] * float(opts_.batch_size) -
// u_sec * float(opts_.batch_size) / opts_.qps,
// model[1] * float(opts_.batch_size) + model[5], model[3]};
const int resnet_max_batch = 64;
if (opts_.batch_size > resnet_max_batch) {
opts_.batch_size = resnet_max_batch;
}
const std::array<double, 10> model{
49902.8906207358, -3.30238451e+02, -2.17410190e+01, 9.51439925e+01,
1.34280387e+04, -4.18767285e+03, 2.18543166e+00, -4.47421309e-03,
7.32400224e-01, -5.81271182e-01};
const std::array<double, 4> func{
model[0] + model[4] * float(opts_.batch_size) +
model[8] * float(opts_.batch_size) * float(opts_.batch_size) *
float(opts_.batch_size) +
model[2] * float(opts_.batch_size) * float(opts_.batch_size) -
u_sec * float(opts_.batch_size) / opts_.qps,
model[1] * float(opts_.batch_size) + model[5] +
model[7] * float(opts_.batch_size) * float(opts_.batch_size),
model[3] + model[6] * float(opts_.batch_size), model[9]};
float per =
NewtonSolver(func, iteration,
error_rate * u_sec * float(opts_.batch_size) / opts_.qps);
floatsrate = per * t4_flops / max_percent;
os << "Model: resnet50"
<< "\n";
os << "est latency: "
<< func[0] + func[1] * per + func[2] * per * per +
u_sec * float(opts_.batch_size) / opts_.qps +
func[3] * per * per * per
<< "\n";
} else if (opts_.model_type == 2) {
const std::array<double, 4> func{
88324.13992776436 - u_sec * float(opts_.batch_size) / opts_.qps,
-2.75316291e+03, 3.90359192e+01, -1.81786268e-01};
float per =
NewtonSolver(func, iteration,
error_rate * u_sec * float(opts_.batch_size) / opts_.qps);
floatsrate = per * t4_flops / max_percent;
os << "Model: dbnet"
<< "\n";
os << "est latency: "
<< func[0] + func[1] * per + func[2] * per * per +
func[3] * per * per * per +
u_sec * float(opts_.batch_size) / opts_.qps
<< "\n";
} else if (opts_.model_type == 3) {
const std::array<double, 4> func{
31525.584310580438 - u_sec * float(opts_.batch_size) / opts_.qps,
-475.78524037, 2.58107976, 0.0};
float per =
NewtonSolver(func, iteration,
error_rate * u_sec * float(opts_.batch_size) / opts_.qps);
floatsrate = per * t4_flops / max_percent;
os << "Model: crnn"
<< "\n";
os << "est latency: "
<< func[0] + func[1] * per + func[2] * per * per +
func[3] * per * per * per +
u_sec * float(opts_.batch_size) / opts_.qps
<< "\n";
} else if (opts_.model_type == 4) {
const int bert_max_batch = 128;
if (opts_.batch_size > bert_max_batch) {
opts_.batch_size = bert_max_batch;
}
const std::array<double, 6> model{438429.4914344477, -5.17070386e+02,
1.96615464e+01, 2.23010546e+02,
5.54527169e+04, -2.45070285e+04};
const std::array<double, 4> func{
model[0] +
model[2] * float(opts_.batch_size) * float(opts_.batch_size) +
model[4] * float(opts_.batch_size) -
u_sec * float(opts_.batch_size) / opts_.qps,
model[1] * float(opts_.batch_size) + model[5], model[3]};
float per =
NewtonSolver(func, iteration,
error_rate * u_sec * float(opts_.batch_size) / opts_.qps);
floatsrate = per * t4_flops / max_percent;
os << "Model: bert"
<< "\n";
os << "est latency: "
<< func[0] + func[1] * per + func[2] * per * per +
func[3] * per * per * per +
u_sec * float(opts_.batch_size) / opts_.qps
<< "\n";
} else {
os << "Model: other"
<< "\n";
}
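
Each model_type branch instantiates the same pattern: the fitted latency (in microseconds, at batch size $b$) is a cubic in the GPU-share percentage $p$, with the latency budget implied by the requested QPS folded into the constant term, so NewtonSolver looks for a root of

\[
f(p) = \Big(c_0(b) - \frac{10^6\, b}{qps}\Big) + c_1(b)\, p + c_2(b)\, p^2 + c_3\, p^3 .
\]

For resnet50 the lower-order coefficients are themselves polynomials in $b$; for dbnet and crnn they are constants; for bert the initializer list supplies only three values, so the cubic coefficient func[3] is value-initialized to 0. The solved percentage is then converted to a FLOPs requirement via floatsrate = per * t4_flops / max_percent, which the output lines below report as a share of one T4.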

os << "Device: GPU T4"
<< "\n";
os << "batch size: " << adaptive_bsz_ << "\n";
os << "est FLOPs: " << floatsrate << " gFlops\n";
os << "est split: " << floatsrate / t4 << "% T4\n";
os << "model FLOPs: " << floatsrate / t4 << "% T4\n";
os << "est latency: " << est_latency << " ms\n";
os << "est mem: " << trt_mem << " MB\n";
/*-----Generated T4 parameters-----------------*/
…
rsc_req_.append("\"size\":1,");
rsc_req_.append("\"flops\":\"");
// std::string s = std::to_string(total_flops * gflops);
std::string s = std::to_string(ceil(floatsrate));
std::string s = std::to_string(ceil(int(floatsrate)));
rsc_req_.append(s.substr(0, s.find('.')));
rsc_req_.append("\",");
rsc_req_.append("\"precision\":\"Fp32\",");
13 changes: 12 additions & 1 deletion python/halo/halo.py
@@ -118,6 +118,7 @@ class ModelInfo(Structure):
c_void_p, # cg_opts
c_char_p, # filename
c_void_p, # model_info
c_int, # model_type
]


@@ -228,7 +229,16 @@ def AnalyzeModel(model_file, input_shapes, batch, format, model_info):

target = "cxx".encode("utf-8")
output_filename = output_file.encode("utf-8")

if("resnet50" in model_file):
[Inline review comment]
Collaborator: Can we pass the model_type from the user's command-line flags?

Author: I have changed the way the model_type parameter is obtained; we can now pass -t resnet50/bert/crnn/dbnet to choose the specific model.

model_type = 1
elif("dbnet" in model_file):
model_type = 2
elif("crnn" in model_file):
model_type = 3
elif("bert" in model_file):
model_type = 4
else:
model_type = 0
Analyze(
format_val,
model_num,
…
pointer(opts),
output_filename,
pointer(model_info),
model_type,
)

def CompileODLAModel(files, device, debug=False):