[adams2019] Add caching to autoscheduler (#5697)
* add feature caching and block caching to adams2019 autoscheduler

* added caching verification for features

* add caching docstrings
rootjalex committed Apr 20, 2021
1 parent ac23987 commit c1de142
Showing 11 changed files with 1,387 additions and 377 deletions.
52 changes: 38 additions & 14 deletions src/autoschedulers/adams2019/AutoSchedule.cpp
@@ -6,8 +6,8 @@
The most interesting classes to look at are:
LoopNest Represents one node in our tree representation of loop nests.
State A state in the beam search. Holds a root loop nest.
LoopNest Represents one node in our tree representation of loop nests. (Now in LoopNest.(h | cpp)).
State A state in the beam search. Holds a root loop nest. (Now in State.(h | cpp)).
Interesting functions below are:
@@ -61,6 +61,14 @@
HL_AUTOSCHEDULE_MEMORY_LIMIT
If set, only consider schedules that allocate at most this much memory (measured in bytes).
HL_DISABLE_MEMOIZED_FEATURES
If set, features of possible schedules are always recalculated, and are not cached across passes.
(see Cache.h for more information)
HL_DISABLE_MEMOIZED_BLOCKS
If set, then tiling sizes are not cached across passes.
(see Cache.h for more information)
TODO: expose these settings by adding some means to pass args to
generator plugins instead of environment vars.
*/
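
For reference, the two new environment variables above feed a pair of boolean caching flags. Below is a minimal sketch of how CachingOptions::MakeOptionsFromEnviron() (used further down in this file) could assemble them from the helpers defined in Cache.cpp later in this diff; the field names are assumptions, since the actual struct lives in Cache.h, which is not part of this excerpt:

struct CachingOptions {
    bool cache_features = true;  // assumed field; cleared when HL_DISABLE_MEMOIZED_FEATURES=1
    bool cache_blocks = true;    // used below as options.cache_blocks; cleared when HL_DISABLE_MEMOIZED_BLOCKS=1

    static CachingOptions MakeOptionsFromEnviron() {
        CachingOptions options;
        options.cache_features = use_memoized_features();    // helper defined in Cache.cpp below
        options.cache_blocks = is_memoize_blocks_enabled();   // helper defined in Cache.cpp below
        return options;
    }
};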
@@ -79,6 +87,7 @@

#include "ASLog.h"
#include "AutoSchedule.h"
#include "Cache.h"
#include "CostModel.h"
#include "DefaultCostModel.h"
#include "Errors.h"
@@ -261,7 +270,8 @@ IntrusivePtr<State> optimal_schedule_pass(FunctionDAG &dag,
int pass_idx,
int num_passes,
ProgressBar &tick,
std::unordered_set<uint64_t> &permitted_hashes) {
std::unordered_set<uint64_t> &permitted_hashes,
Cache *cache) {

if (cost_model) {
configure_pipeline_features(dag, params, cost_model);
@@ -320,7 +330,8 @@ IntrusivePtr<State> optimal_schedule_pass(FunctionDAG &dag,
pass_idx,
num_passes,
tick,
permitted_hashes);
permitted_hashes,
cache);
} else {
internal_error << "Ran out of legal states with beam size " << beam_size << "\n";
}
@@ -408,7 +419,7 @@ IntrusivePtr<State> optimal_schedule_pass(FunctionDAG &dag,
return best;
}

state->generate_children(dag, params, cost_model, memory_limit, enqueue_new_children);
state->generate_children(dag, params, cost_model, memory_limit, enqueue_new_children, cache);
expanded++;
}

@@ -431,7 +442,7 @@ IntrusivePtr<State> optimal_schedule_pass(FunctionDAG &dag,
auto state = q[choice_label];
aslog(0) << "\n[" << choice_label << "]:\n";
state->dump();
state->calculate_cost(dag, params, cost_model, memory_limit, true);
state->calculate_cost(dag, params, cost_model, cache->options, memory_limit, true);
}
cost_model->evaluate_costs();

@@ -457,12 +468,16 @@ IntrusivePtr<State> optimal_schedule(FunctionDAG &dag,
CostModel *cost_model,
std::mt19937 &rng,
int beam_size,
int64_t memory_limit) {
int64_t memory_limit,
const CachingOptions &options) {

IntrusivePtr<State> best;

std::unordered_set<uint64_t> permitted_hashes;

// Set up cache with options and size.
Cache cache(options, dag.nodes.size());

// If the beam size is one, it's pointless doing multiple passes.
int num_passes = (beam_size == 1) ? 1 : 5;

@@ -486,7 +501,7 @@ IntrusivePtr<State> optimal_schedule(FunctionDAG &dag,

auto pass = optimal_schedule_pass(dag, outputs, params, cost_model,
rng, beam_size, memory_limit,
i, num_passes, tick, permitted_hashes);
i, num_passes, tick, permitted_hashes, &cache);

std::chrono::duration<double> total_time = timer.elapsed();
auto milli = std::chrono::duration_cast<std::chrono::milliseconds>(total_time).count();
@@ -509,6 +524,11 @@ IntrusivePtr<State> optimal_schedule(FunctionDAG &dag,

aslog(0) << "Best cost: " << best->cost << "\n";

if (options.cache_blocks) {
aslog(0) << "Cache (block) hits: " << cache.cache_hits << "\n";
aslog(0) << "Cache (block) misses: " << cache.cache_misses << "\n";
}

return best;
}
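
Cache.h itself is not among the excerpted files, but the uses above (the constructor taking the options and node count, the hit/miss counters) together with the definitions in Cache.cpp below suggest roughly the following interface. This is only a reading aid for the rest of the diff; the map type and the constructor body are assumptions:

struct Cache {
    CachingOptions options;

    // Per FunctionDAG node: vector_dim -> memoized compute_root tilings (one LoopNest per stage).
    NodeMap<std::map<int, std::vector<IntrusivePtr<const LoopNest>>>> memoized_compute_root_blocks;

    // Mutable so the const add_memoized_blocks() below can record hits.
    mutable size_t cache_hits = 0;
    mutable size_t cache_misses = 0;

    Cache(const CachingOptions &options, size_t nodes)
        : options(options) {
        memoized_compute_root_blocks.make_large(nodes);  // assumed NodeMap sizing call
    }

    // Defined in Cache.cpp below.
    bool add_memoized_blocks(const State *state,
                             std::function<void(IntrusivePtr<State> &&)> &accept_child,
                             const FunctionDAG::Node *node, int &num_children,
                             const FunctionDAG &dag, const MachineParams &params,
                             CostModel *cost_model, int64_t memory_limit) const;

    void memoize_blocks(const FunctionDAG::Node *node, LoopNest *new_root);
};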

@@ -568,8 +588,11 @@ void generate_schedule(const std::vector<Function> &outputs,

IntrusivePtr<State> optimal;

// Options generated from environment variables; these decide whether or not to cache features and/or tilings.
CachingOptions cache_options = CachingOptions::MakeOptionsFromEnviron();

// Run beam search
optimal = optimal_schedule(dag, outputs, params, cost_model.get(), rng, beam_size, memory_limit);
optimal = optimal_schedule(dag, outputs, params, cost_model.get(), rng, beam_size, memory_limit, cache_options);

HALIDE_TOC;

@@ -579,7 +602,7 @@ void generate_schedule(const std::vector<Function> &outputs,
aslog(1) << "** Optimal schedule:\n";

// Just to get the debugging prints to fire
optimal->calculate_cost(dag, params, cost_model.get(), memory_limit, aslog::aslog_level() > 0);
optimal->calculate_cost(dag, params, cost_model.get(), cache_options, memory_limit, aslog::aslog_level() > 0);

// Apply the schedules to the pipeline
optimal->apply_schedule(dag, params);
@@ -607,7 +630,7 @@ void generate_schedule(const std::vector<Function> &outputs,
if (!feature_file.empty()) {
user_warning << "HL_FEATURE_FILE is deprecated; use the featurization output from Generator instead\n";
std::ofstream binfile(feature_file, std::ios::binary | std::ios_base::trunc);
optimal->save_featurization(dag, params, binfile);
optimal->save_featurization(dag, params, cache_options, binfile);
binfile.close();
internal_assert(!binfile.fail()) << "Failed to write " << feature_file;
}
@@ -617,7 +640,7 @@ void generate_schedule(const std::vector<Function> &outputs,
auto_scheduler_results->schedule_source = optimal->schedule_source;
{
std::ostringstream out;
optimal->save_featurization(dag, params, out);
optimal->save_featurization(dag, params, cache_options, out);
auto_scheduler_results->featurization.resize(out.str().size());
memcpy(auto_scheduler_results->featurization.data(), out.str().data(), out.str().size());
}
@@ -646,13 +669,14 @@ void find_and_apply_schedule(FunctionDAG &dag,
StageMap<ScheduleFeatures> *schedule_features) {

std::mt19937 rng(12345);
IntrusivePtr<State> optimal = optimal_schedule(dag, outputs, params, cost_model, rng, beam_size, memory_limit);
CachingOptions cache_options = CachingOptions::MakeOptionsFromEnviron();
IntrusivePtr<State> optimal = optimal_schedule(dag, outputs, params, cost_model, rng, beam_size, memory_limit, cache_options);

// Apply the schedules
optimal->apply_schedule(dag, params);

if (schedule_features) {
optimal->compute_featurization(dag, params, schedule_features);
optimal->compute_featurization(dag, params, schedule_features, cache_options);
}
}

1 change: 1 addition & 0 deletions src/autoschedulers/adams2019/CMakeLists.txt
@@ -37,6 +37,7 @@ add_autoscheduler(NAME Adams2019
SOURCES
ASLog.cpp
AutoSchedule.cpp
Cache.cpp
DefaultCostModel.cpp
FunctionDAG.cpp
LoopNest.cpp
117 changes: 117 additions & 0 deletions src/autoschedulers/adams2019/Cache.cpp
@@ -0,0 +1,117 @@
#include "Cache.h"
#include "LoopNest.h"
#include "State.h"

namespace Halide {
namespace Internal {
namespace Autoscheduler {

bool use_memoized_features() {
return get_env_variable("HL_DISABLE_MEMOIZED_FEATURES") != "1";
}

bool is_memoize_blocks_enabled() {
return get_env_variable("HL_DISABLE_MEMOIZED_BLOCKS") != "1";
}

bool Cache::add_memoized_blocks(const State *state,
std::function<void(IntrusivePtr<State> &&)> &accept_child,
const FunctionDAG::Node *node, int &num_children,
const FunctionDAG &dag,
const MachineParams &params,
CostModel *cost_model,
int64_t memory_limit) const {
if (!options.cache_blocks || !memoized_compute_root_blocks.contains(node)) {
// either memoization is turned off, or we haven't cached this node yet.
return false;
}

// Get the correct vector dimension.
int vector_dims = -1;
for (const auto &child : state->root->children) {
if (child->node == node && child->stage->index == 0) {
vector_dims = child->vector_dim;
break;
}
}

const auto &vector_dim_map = memoized_compute_root_blocks.get(node);

if (vector_dim_map.count(vector_dims) == 0) {
// Never cached this vector dimension before.
return false;
}

auto blocks = vector_dim_map.at(vector_dims);

size_t num_stages = node->stages.size();

for (size_t i = 0; i < blocks.size(); i += num_stages) {
// Construct child from memoization.
IntrusivePtr<State> child = state->make_child();
LoopNest *new_root = new LoopNest;
new_root->copy_from(*(state->root));
child->root = new_root;
child->num_decisions_made++;

int block_index = 0;
for (const auto &new_child : new_root->children) {
if (new_child->node == node) {
break;
}
block_index++;
}

// Copy all stages into new_root.
for (size_t j = 0; j < num_stages; j++) {
LoopNest *new_block = new LoopNest;
new_block->copy_from_including_features(*blocks[i + j]);
new_root->children[block_index++] = new_block;
}

if (child->calculate_cost(dag, params, cost_model, this->options, memory_limit)) {
num_children++;
accept_child(std::move(child));
cache_hits++;
}
}

// Successfully added cached items!
return true;
}

void Cache::memoize_blocks(const FunctionDAG::Node *node, LoopNest *new_root) {
if (!options.cache_blocks) {
return;
}

int vector_dim = -1;
bool loop_nest_found = false;

for (auto &child : new_root->children) {
if (child->node == node && child->stage->index == 0) {
vector_dim = child->vector_dim;
loop_nest_found = true;
break;
}
}

internal_assert(loop_nest_found) << "memoize_blocks did not find loop nest!\n";

auto &blocks = memoized_compute_root_blocks.get_or_create(node)[vector_dim];

for (auto &child : new_root->children) {
if (child->node == node) {
LoopNest *new_block = new LoopNest;
// Need const reference for copy.
const LoopNest *child_ptr = child.get();
new_block->copy_from_including_features(*child_ptr);
blocks.emplace_back(new_block);
cache_misses++;
}
}
}

} // namespace Autoscheduler
} // namespace Internal
} // namespace Halide
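
The call sites for these two functions are in State::generate_children (State.cpp, changed elsewhere in this commit but not shown in this excerpt). A hedged sketch of the intended pattern when enumerating compute_root tilings for a node; the surrounding control flow and variable names are illustrative, not the committed code:

// Inside State::generate_children, for the compute_root case of a given node:
if (cache->add_memoized_blocks(this, accept_child, node, num_children,
                               dag, params, cost_model, memory_limit)) {
    // Cached tilings for this node and vector_dim were replayed as children.
    return;
}
// ...otherwise enumerate tilings as before, building each candidate new_root,
// and once the tilings for this node have been generated, record them for later passes:
cache->memoize_blocks(node, new_root);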