[adams2019] Add caching to autoscheduler (#5697)
* add feature caching and block caching to adams2019 autoscheduler

* added caching verification for features

* add caching docstrings
rootjalex committed Apr 20, 2021
1 parent ac23987 commit c1de142
Showing 11 changed files with 1,387 additions and 377 deletions.
52 changes: 38 additions & 14 deletions src/autoschedulers/adams2019/AutoSchedule.cpp
@@ -6,8 +6,8 @@
The most interesting classes to look at are:
LoopNest Represents one node in our tree representation of loop nests.
State A state in the beam search. Holds a root loop nest.
LoopNest Represents one node in our tree representation of loop nests. (Now in LoopNest.(h | cpp)).
State A state in the beam search. Holds a root loop nest. (Now in State.(h | cpp)).
Interesting functions below are:
@@ -61,6 +61,14 @@
HL_AUTOSCHEDULE_MEMORY_LIMIT
If set, only consider schedules that allocate at most this much memory (measured in bytes).
HL_DISABLE_MEMOIZED_FEATURES
If set, features of possible schedules are always recalculated, and are not cached across passes.
(see Cache.h for more information)
HL_DISABLE_MEMOIZED_BLOCKS
If set, then tiling sizes are not cached across passes.
(see Cache.h for more information)
TODO: expose these settings by adding some means to pass args to
generator plugins instead of environment vars.
*/
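
For reference, the two new environment variables above feed a pair of boolean caching flags. Below is a minimal sketch of how CachingOptions::MakeOptionsFromEnviron() (used further down in this file) could assemble them from the helpers defined in Cache.cpp later in this diff; the field names are assumptions, since the actual struct lives in Cache.h, which is not part of this excerpt:

struct CachingOptions {
    bool cache_features = true;  // assumed field; cleared when HL_DISABLE_MEMOIZED_FEATURES=1
    bool cache_blocks = true;    // used below as options.cache_blocks; cleared when HL_DISABLE_MEMOIZED_BLOCKS=1

    static CachingOptions MakeOptionsFromEnviron() {
        CachingOptions options;
        options.cache_features = use_memoized_features();    // helper defined in Cache.cpp below
        options.cache_blocks = is_memoize_blocks_enabled();   // helper defined in Cache.cpp below
        return options;
    }
};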
@@ -79,6 +87,7 @@

#include "ASLog.h"
#include "AutoSchedule.h"
#include "Cache.h"
#include "CostModel.h"
#include "DefaultCostModel.h"
#include "Errors.h"
@@ -261,7 +270,8 @@ IntrusivePtr<State> optimal_schedule_pass(FunctionDAG &dag,
int pass_idx,
int num_passes,
ProgressBar &tick,
std::unordered_set<uint64_t> &permitted_hashes) {
std::unordered_set<uint64_t> &permitted_hashes,
Cache *cache) {

if (cost_model) {
configure_pipeline_features(dag, params, cost_model);
@@ -320,7 +330,8 @@ IntrusivePtr<State> optimal_schedule_pass(FunctionDAG &dag,
pass_idx,
num_passes,
tick,
permitted_hashes);
permitted_hashes,
cache);
} else {
internal_error << "Ran out of legal states with beam size " << beam_size << "\n";
}
@@ -408,7 +419,7 @@ IntrusivePtr<State> optimal_schedule_pass(FunctionDAG &dag,
return best;
}

state->generate_children(dag, params, cost_model, memory_limit, enqueue_new_children);
state->generate_children(dag, params, cost_model, memory_limit, enqueue_new_children, cache);
expanded++;
}

@@ -431,7 +442,7 @@ IntrusivePtr<State> optimal_schedule_pass(FunctionDAG &dag,
auto state = q[choice_label];
aslog(0) << "\n[" << choice_label << "]:\n";
state->dump();
state->calculate_cost(dag, params, cost_model, memory_limit, true);
state->calculate_cost(dag, params, cost_model, cache->options, memory_limit, true);
}
cost_model->evaluate_costs();

@@ -457,12 +468,16 @@ IntrusivePtr<State> optimal_schedule(FunctionDAG &dag,
CostModel *cost_model,
std::mt19937 &rng,
int beam_size,
int64_t memory_limit) {
int64_t memory_limit,
const CachingOptions &options) {

IntrusivePtr<State> best;

std::unordered_set<uint64_t> permitted_hashes;

// Set up cache with options and size.
Cache cache(options, dag.nodes.size());

// If the beam size is one, it's pointless doing multiple passes.
int num_passes = (beam_size == 1) ? 1 : 5;

@@ -486,7 +501,7 @@ IntrusivePtr<State> optimal_schedule(FunctionDAG &dag,

auto pass = optimal_schedule_pass(dag, outputs, params, cost_model,
rng, beam_size, memory_limit,
i, num_passes, tick, permitted_hashes);
i, num_passes, tick, permitted_hashes, &cache);

std::chrono::duration<double> total_time = timer.elapsed();
auto milli = std::chrono::duration_cast<std::chrono::milliseconds>(total_time).count();
@@ -509,6 +524,11 @@ IntrusivePtr<State> optimal_schedule(FunctionDAG &dag,

aslog(0) << "Best cost: " << best->cost << "\n";

if (options.cache_blocks) {
aslog(0) << "Cache (block) hits: " << cache.cache_hits << "\n";
aslog(0) << "Cache (block) misses: " << cache.cache_misses << "\n";
}

return best;
}
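
Cache.h itself is not among the excerpted files, but the uses above (the constructor taking the options and node count, the hit/miss counters) together with the definitions in Cache.cpp below suggest roughly the following interface. This is only a reading aid for the rest of the diff; the map type and the constructor body are assumptions:

struct Cache {
    CachingOptions options;

    // Per FunctionDAG node: vector_dim -> memoized compute_root tilings (one LoopNest per stage).
    NodeMap<std::map<int, std::vector<IntrusivePtr<const LoopNest>>>> memoized_compute_root_blocks;

    // Mutable so the const add_memoized_blocks() below can record hits.
    mutable size_t cache_hits = 0;
    mutable size_t cache_misses = 0;

    Cache(const CachingOptions &options, size_t nodes)
        : options(options) {
        memoized_compute_root_blocks.make_large(nodes);  // assumed NodeMap sizing call
    }

    // Defined in Cache.cpp below.
    bool add_memoized_blocks(const State *state,
                             std::function<void(IntrusivePtr<State> &&)> &accept_child,
                             const FunctionDAG::Node *node, int &num_children,
                             const FunctionDAG &dag, const MachineParams &params,
                             CostModel *cost_model, int64_t memory_limit) const;

    void memoize_blocks(const FunctionDAG::Node *node, LoopNest *new_root);
};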

@@ -568,8 +588,11 @@ void generate_schedule(const std::vector<Function> &outputs,

IntrusivePtr<State> optimal;

// Options generated from environment variables; these decide whether or not to cache features and/or tilings.
CachingOptions cache_options = CachingOptions::MakeOptionsFromEnviron();

// Run beam search
optimal = optimal_schedule(dag, outputs, params, cost_model.get(), rng, beam_size, memory_limit);
optimal = optimal_schedule(dag, outputs, params, cost_model.get(), rng, beam_size, memory_limit, cache_options);

HALIDE_TOC;

@@ -579,7 +602,7 @@ void generate_schedule(const std::vector<Function> &outputs,
aslog(1) << "** Optimal schedule:\n";

// Just to get the debugging prints to fire
optimal->calculate_cost(dag, params, cost_model.get(), memory_limit, aslog::aslog_level() > 0);
optimal->calculate_cost(dag, params, cost_model.get(), cache_options, memory_limit, aslog::aslog_level() > 0);

// Apply the schedules to the pipeline
optimal->apply_schedule(dag, params);
@@ -607,7 +630,7 @@ void generate_schedule(const std::vector<Function> &outputs,
if (!feature_file.empty()) {
user_warning << "HL_FEATURE_FILE is deprecated; use the featurization output from Generator instead\n";
std::ofstream binfile(feature_file, std::ios::binary | std::ios_base::trunc);
optimal->save_featurization(dag, params, binfile);
optimal->save_featurization(dag, params, cache_options, binfile);
binfile.close();
internal_assert(!binfile.fail()) << "Failed to write " << feature_file;
}
@@ -617,7 +640,7 @@ void generate_schedule(const std::vector<Function> &outputs,
auto_scheduler_results->schedule_source = optimal->schedule_source;
{
std::ostringstream out;
optimal->save_featurization(dag, params, out);
optimal->save_featurization(dag, params, cache_options, out);
auto_scheduler_results->featurization.resize(out.str().size());
memcpy(auto_scheduler_results->featurization.data(), out.str().data(), out.str().size());
}
@@ -646,13 +669,14 @@ void find_and_apply_schedule(FunctionDAG &dag,
StageMap<ScheduleFeatures> *schedule_features) {

std::mt19937 rng(12345);
IntrusivePtr<State> optimal = optimal_schedule(dag, outputs, params, cost_model, rng, beam_size, memory_limit);
CachingOptions cache_options = CachingOptions::MakeOptionsFromEnviron();
IntrusivePtr<State> optimal = optimal_schedule(dag, outputs, params, cost_model, rng, beam_size, memory_limit, cache_options);

// Apply the schedules
optimal->apply_schedule(dag, params);

if (schedule_features) {
optimal->compute_featurization(dag, params, schedule_features);
optimal->compute_featurization(dag, params, schedule_features, cache_options);
}
}

1 change: 1 addition & 0 deletions src/autoschedulers/adams2019/CMakeLists.txt
@@ -37,6 +37,7 @@ add_autoscheduler(NAME Adams2019
SOURCES
ASLog.cpp
AutoSchedule.cpp
Cache.cpp
DefaultCostModel.cpp
FunctionDAG.cpp
LoopNest.cpp
117 changes: 117 additions & 0 deletions src/autoschedulers/adams2019/Cache.cpp
@@ -0,0 +1,117 @@
#include "Cache.h"
#include "LoopNest.h"
#include "State.h"

namespace Halide {
namespace Internal {
namespace Autoscheduler {

bool use_memoized_features() {
return get_env_variable("HL_DISABLE_MEMOIZED_FEATURES") != "1";
}

bool is_memoize_blocks_enabled() {
return get_env_variable("HL_DISABLE_MEMOIZED_BLOCKS") != "1";
}

bool Cache::add_memoized_blocks(const State *state,
std::function<void(IntrusivePtr<State> &&)> &accept_child,
const FunctionDAG::Node *node, int &num_children,
const FunctionDAG &dag,
const MachineParams &params,
CostModel *cost_model,
int64_t memory_limit) const {
if (!options.cache_blocks || !memoized_compute_root_blocks.contains(node)) {
// either memoization is turned off, or we haven't cached this node yet.
return false;
}

// Get the correct vector dimension.
int vector_dims = -1;
for (const auto &child : state->root->children) {
if (child->node == node && child->stage->index == 0) {
vector_dims = child->vector_dim;
break;
}
}

const auto &vector_dim_map = memoized_compute_root_blocks.get(node);

if (vector_dim_map.count(vector_dims) == 0) {
// Never cached this vector dimension before.
return false;
}

auto blocks = vector_dim_map.at(vector_dims);

size_t num_stages = node->stages.size();

for (size_t i = 0; i < blocks.size(); i += num_stages) {
// Construct child from memoization.
IntrusivePtr<State> child = state->make_child();
LoopNest *new_root = new LoopNest;
new_root->copy_from(*(state->root));
child->root = new_root;
child->num_decisions_made++;

int block_index = 0;
for (const auto &new_child : new_root->children) {
if (new_child->node == node) {
break;
}
block_index++;
}

// Copy all stages into new_root.
for (size_t j = 0; j < num_stages; j++) {
LoopNest *new_block = new LoopNest;
new_block->copy_from_including_features(*blocks[i + j]);
new_root->children[block_index++] = new_block;
}

if (child->calculate_cost(dag, params, cost_model, this->options, memory_limit)) {
num_children++;
accept_child(std::move(child));
cache_hits++;
}
}

// Successfully added cached items!
return true;
}

void Cache::memoize_blocks(const FunctionDAG::Node *node, LoopNest *new_root) {
if (!options.cache_blocks) {
return;
}

int vector_dim = -1;
bool loop_nest_found = false;

for (auto &child : new_root->children) {
if (child->node == node && child->stage->index == 0) {
vector_dim = child->vector_dim;
loop_nest_found = true;
break;
}
}

internal_assert(loop_nest_found) << "memoize_blocks did not find loop nest!\n";

auto &blocks = memoized_compute_root_blocks.get_or_create(node)[vector_dim];

for (auto &child : new_root->children) {
if (child->node == node) {
LoopNest *new_block = new LoopNest;
// Need const reference for copy.
const LoopNest *child_ptr = child.get();
new_block->copy_from_including_features(*child_ptr);
blocks.emplace_back(new_block);
cache_misses++;
}
}
}

} // namespace Autoscheduler
} // namespace Internal
} // namespace Halide
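
The call sites for these two functions are in State::generate_children (State.cpp, changed elsewhere in this commit but not shown in this excerpt). A hedged sketch of the intended pattern when enumerating compute_root tilings for a node; the surrounding control flow and variable names are illustrative, not the committed code:

// Inside State::generate_children, for the compute_root case of a given node:
if (cache->add_memoized_blocks(this, accept_child, node, num_children,
                               dag, params, cost_model, memory_limit)) {
    // Cached tilings for this node and vector_dim were replayed as children.
    return;
}
// ...otherwise enumerate tilings as before, building each candidate new_root,
// and once the tilings for this node have been generated, record them for later passes:
cache->memoize_blocks(node, new_root);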