Merge pull request #4 from bigno78/autotuning-dia

Merge the final changes
bigno78 · May 15, 2023 · fe66995 · fe66995
2 parents 6847393 + b65e416
commit fe66995
Show file tree

Hide file tree

Showing 27 changed files with 2,680 additions and 587 deletions.
diff --git a/.gitignore b/.gitignore
@@ -122,3 +122,4 @@ performance/spmm/spmm
 *.user
 
 .vscode
+a.out
diff --git a/build.sh b/build.sh
@@ -1,8 +1,8 @@
 nvcc -DCUSP_PATH=$(realpath .) \
      -I . -I ../KTT/Source \
-     -l cuda -l ktt -L ../KTT/Build/x86_64_Debug/ \
-     --linker-options=-rpath,$(realpath ../KTT/Build/x86_64_Debug/) \
-     -std=c++17 -g -O3 -DKTT_LINE_INFO -lineinfo main.cu
+     -l cuda -l ktt -L ../KTT/Build/x86_64_Release/ \
+     --linker-options=-rpath,$(realpath ../KTT/Build/x86_64_Release/) \
+     -std=c++17 -g -O3 -lineinfo main.cu
 
 [ "$?" -eq "0" ] || exit 1
 

diff --git a/build/build-env.py b/build/build-env.py
@@ -277,6 +277,10 @@ def addKTT(env):
     env.Append(RPATH = [ ktt_lib_path ])
     env.Append(LIBS = [ "ktt" ])
 
+    if env['mode'] == 'debug':
+        env.Append(CFLAGS   = ['-DKTT_LINE_INFO'])
+        env.Append(CXXFLAGS = ['-DKTT_LINE_INFO'])
+
 
 def Environment(buildDir):
     # allow the user discretion to choose the MSVC version
@@ -465,7 +469,7 @@ def Environment(buildDir):
             env['ENV']['DYLD_LIBRARY_PATH'] = os.environ['DYLD_LIBRARY_PATH']
         elif 'LD_LIBRARY_PATH' in os.environ:
             env['ENV']['LD_LIBRARY_PATH'] = os.environ['LD_LIBRARY_PATH']
-    
+
     addKTT(env)
 
     # generate help text

diff --git a/cusp/detail/temporary_array.h b/cusp/detail/temporary_array.h
@@ -28,7 +28,7 @@
 #include <thrust/detail/temporary_array.h>
 
 #if THRUST_VERSION >= 100800
-#define TEMP_HOST_DEVICE_DECORATORS __host__ __device__
+#define TEMP_HOST_DEVICE_DECORATORS __host__ // NOTE(review): the replaced definition also carried __device__; confirm nothing decorated with this macro is called from device code
 #else
 #define TEMP_HOST_DEVICE_DECORATORS
 #endif

diff --git a/cusp/ktt/detail/ellr_matrix.inl b/cusp/ktt/detail/ellr_matrix.inl
@@ -0,0 +1,137 @@
+#include <cusp/array2d.h>
+#include <cusp/convert.h>
+#include <cusp/detail/utils.h>
+
+#include <thrust/transform.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/device_ptr.h> // raw_pointer_cast
+
+namespace cusp
+{
+
+namespace ktt
+{
+
+
+template<typename IndexType> // IndexType: integer type of the stored column indices and of the returned length
+struct ell_row_length // Unary functor: for a row index, counts that row's leading entries whose column index is >= 0.
+{
+    const IndexType* column_indices; // base pointer of the column-index storage that is scanned
+    size_t num_cols_per_row; // upper bound on the number of entries examined per row
+    size_t pitch; // element offset between consecutive entries of the same row
+
+    __host__ __device__ IndexType operator()(IndexType row_idx) // NOTE(review): modifies no members, so this could be declared const
+    {
+        IndexType len = 0;
+
+        while (len < num_cols_per_row // NOTE(review): compares IndexType against size_t (mixed signedness when IndexType is signed)
+                && column_indices[row_idx + len*pitch] >= 0) // entry number `len` of this row; a negative index stops the count (presumably the padding marker - confirm)
+        {
+            len++;
+        }
+
+        return len; // entries counted before the first negative index or before reaching num_cols_per_row
+    }
+};
+
+template<typename IndexType, typename ValueType, typename MemorySpace>
+void // Writes one ell_row_length result per row of A into A.row_lengths.
+compute_row_lengths(cusp::ktt::ellr_matrix<IndexType, ValueType, MemorySpace>& A) // A.row_lengths is written from begin() and is not resized here, so it must already hold A.num_rows entries
+{
+    thrust::counting_iterator<IndexType> row_idx_it(0); // yields the row indices 0, 1, 2, ...
+
+    thrust::transform(
+        row_idx_it,
+        row_idx_it + A.num_rows, // one functor call per row
+        A.row_lengths.begin(),
+        ell_row_length<IndexType>{
+            thrust::raw_pointer_cast(&A.column_indices(0, 0)), // raw address of element (0, 0); NOTE(review): confirm this is well-defined when column_indices is empty
+            A.column_indices.num_cols, // becomes num_cols_per_row
+            A.column_indices.pitch }
+    );
+}
+
+
+//////////////////
+// Constructors //
+//////////////////
+
+template <typename IndexType, typename ValueType, class MemorySpace>
+ellr_matrix<IndexType, ValueType, MemorySpace> // Sizing constructor: forwards every argument to Parent, then sizes row_lengths.
+::ellr_matrix(const size_t num_rows, const size_t num_cols, const size_t num_entries,
+             const size_t num_entries_per_row, const size_t alignment)
+    : Parent(num_rows, num_cols, num_entries, num_entries_per_row, alignment)
+{
+    row_lengths.resize(num_rows); // one slot per row; the lengths themselves are not computed here
+}
+
+// construct from a different matrix: convert it into this object, then recompute row_lengths
+template <typename IndexType, typename ValueType, class MemorySpace>
+template <typename MatrixType>
+ellr_matrix<IndexType, ValueType, MemorySpace>
+::ellr_matrix(const MatrixType& matrix)
+{
+    cusp::convert(matrix, *this); // fill this object from `matrix`
+    update_row_lengths(); // row_lengths is resized if needed and recomputed from the column indices
+}
+
+//////////////////////
+// Member Functions //
+//////////////////////
+
+template <typename IndexType, typename ValueType, class MemorySpace>
+void // Exchanges the contents of this matrix with `matrix`, including row_lengths.
+ellr_matrix<IndexType, ValueType, MemorySpace>
+::swap(ellr_matrix& matrix)
+{
+    Parent::swap(matrix); // exchange the state handled by Parent
+    row_lengths.swap(matrix.row_lengths); // exchange the per-row length arrays as well
+}
+
+template <typename IndexType, typename ValueType, class MemorySpace>
+void // Resizes via Parent::resize (four-argument form) and keeps row_lengths at num_rows entries.
+ellr_matrix<IndexType, ValueType, MemorySpace>
+::resize(const size_t num_rows, const size_t num_cols, const size_t num_entries,
+         const size_t num_entries_per_row)
+{
+    Parent::resize(num_rows, num_cols, num_entries, num_entries_per_row);
+    row_lengths.resize(num_rows); // only the size is adjusted; lengths are not recomputed here
+}
+
+template <typename IndexType, typename ValueType, class MemorySpace>
+void // Resizes via Parent::resize (form taking an alignment) and keeps row_lengths at num_rows entries.
+ellr_matrix<IndexType, ValueType, MemorySpace>
+::resize(const size_t num_rows, const size_t num_cols, const size_t num_entries,
+         const size_t num_entries_per_row, const size_t alignment)
+{
+    Parent::resize(num_rows, num_cols, num_entries,
+                   num_entries_per_row, alignment);
+    row_lengths.resize(num_rows); // only the size is adjusted; lengths are not recomputed here
+}
+
+// assignment from another matrix: converts it into *this, recomputes row_lengths and returns *this
+template <typename IndexType, typename ValueType, class MemorySpace>
+template <typename MatrixType>
+ellr_matrix<IndexType, ValueType, MemorySpace>&
+ellr_matrix<IndexType, ValueType, MemorySpace>
+::operator=(const MatrixType& matrix)
+{
+    cusp::convert(matrix, *this); // fill this object from `matrix`
+    update_row_lengths(); // row_lengths is resized if needed and recomputed from the column indices
+
+    return *this;
+}
+
+template <typename IndexType, typename ValueType, class MemorySpace>
+void ellr_matrix<IndexType, ValueType, MemorySpace>::update_row_lengths() // Recomputes row_lengths from the current column indices.
+{
+    if (row_lengths.size() != this->num_rows) // make sure there is exactly one slot per row before writing
+        row_lengths.resize(this->num_rows);
+
+    compute_row_lengths(*this); // overwrites row_lengths for rows 0 .. num_rows - 1
+}
+
+
+} // namespace ktt
+
+} // namespace cusp
diff --git a/cusp/ktt/detail/ktt.inl b/cusp/ktt/detail/ktt.inl
@@ -15,6 +15,7 @@ namespace ktt {
 
 namespace detail {
 
+
 inline std::unique_ptr<::ktt::Tuner> tuner;
 inline bool is_enabled = true;
 
@@ -41,17 +42,18 @@ inline void lazy_init()
             CUstream stream;
             cuStreamCreate(&stream, CU_STREAM_DEFAULT);
 
-            ::ktt::ComputeApiInitializer initializer(context, std::vector<::ktt::ComputeQueue>{ stream });
+            std::vector<::ktt::ComputeQueue> queues = { stream };
+            ::ktt::ComputeApiInitializer initializer(context, queues);
 
-            tuner = std::make_unique<::ktt::Tuner>(::ktt::ComputeApi::CUDA, initializer);
+            tuner = std::make_unique<::ktt::Tuner>(::ktt::ComputeApi::CUDA,
+                                                   initializer);
         }
 
         std::string compiler_flags = "-std=c++17 ";
 #ifdef KTT_LINE_INFO
         compiler_flags += "-lineinfo ";
 #endif
         tuner->SetCompilerOptions(compiler_flags);
-
         tuner->SetValidationMode(::ktt::ValidationMode::OfflineTuning);
 
         std::atexit(cleanup);
@@ -77,70 +79,62 @@ inline ::ktt::Tuner& get_tuner()
 }
 
 
-template <typename Matrix,
+template <template<typename, typename, typename> typename Matrix, // Matrix: a class template taking three type arguments
+          typename IndexType,
           typename ValueType1,
-          typename ValueType2>
-::ktt::KernelResult multiply(const Matrix& A,
-              const cusp::array1d<ValueType1, cusp::device_memory>& x,
-              cusp::array1d<ValueType2, cusp::device_memory>& y)
+          typename ValueType2,
+          typename ValueType3>
+::ktt::KernelResult multiply( // Passes A, x and y, together with the tuner from get_tuner(), to cusp::system::cuda::ktt::multiply and returns its result.
+     const Matrix<IndexType, ValueType1, cusp::device_memory>& A, // third template argument is fixed to cusp::device_memory
+     const cusp::array1d<ValueType2, cusp::device_memory>& x,
+     cusp::array1d<ValueType3, cusp::device_memory>& y)
 {
     return cusp::system::cuda::ktt::multiply(get_tuner(), A, x, y);
 }
 
 
-template <typename Matrix,
+template <template<typename, typename, typename> typename Matrix, // Matrix: a class template taking three type arguments
+          typename IndexType,
           typename ValueType1,
-          typename ValueType2>
-::ktt::KernelResult multiply(const Matrix& A,
-                             const cusp::array1d<ValueType1, cusp::device_memory>& x,
-                             cusp::array1d<ValueType2, cusp::device_memory>& y,
-                             const ::ktt::KernelConfiguration& configuration,
-                             bool run_with_profiling)
+          typename ValueType2,
+          typename ValueType3>
+::ktt::KernelResult multiply( // Same forwarding as the three-argument overload, additionally passing configuration and run_with_profiling through.
+     const Matrix<IndexType, ValueType1, cusp::device_memory>& A, // third template argument is fixed to cusp::device_memory
+     const cusp::array1d<ValueType2, cusp::device_memory>& x,
+     cusp::array1d<ValueType3, cusp::device_memory>& y,
+     const ::ktt::KernelConfiguration& configuration, // forwarded unchanged
+     bool run_with_profiling) // forwarded unchanged
 {
     return cusp::system::cuda::ktt::multiply(get_tuner(), A, x, y, configuration, run_with_profiling);
 }
 
 
-template <typename IndexType,
+template <template<typename, typename, typename> typename Matrix, // Matrix: a class template taking three type arguments
+          typename IndexType,
           typename ValueType1,
           typename ValueType2,
           typename ValueType3>
 std::vector<::ktt::KernelResult>
-tune(const cusp::dia_matrix<IndexType, ValueType1, cusp::device_memory>& A,
+tune(const Matrix<IndexType, ValueType1, cusp::device_memory>& A, // replaces the dia_matrix-only parameter; all arguments plus the tuner from get_tuner() go to cusp::system::cuda::ktt::tune
      const cusp::array1d<ValueType2, cusp::device_memory>& x,
      cusp::array1d<ValueType3, cusp::device_memory>& y,
      std::optional<::ktt::ReferenceComputation> reference_computation)
 {
     return cusp::system::cuda::ktt::tune(get_tuner(), A, x, y, reference_computation);
 }
 
-
-template<typename IndexType,
-         typename ValueType1,
-         typename ValueType2,
-         typename ValueType3,
-         typename Format>
-void reset_tuning()
-{
-    using namespace cusp::system::cuda::ktt;
-
-    kernel_context kernel = get_kernel<IndexType, ValueType1, ValueType2, ValueType3>(get_tuner(), Format{});
-    detail::tuner->ClearData(kernel.kernel_id);
-}
-
-
 template <typename MatrixType,
           typename ValueType1,
-          typename ValueType2>
+          typename ValueType2,
+          typename MemorySpace1, // memory space of x
+          typename MemorySpace2> // memory space of y
 void reset_tuning(const MatrixType& A,
-                  const cusp::array1d<ValueType1, cusp::device_memory>& x,
-                  cusp::array1d<ValueType2, cusp::device_memory>& y)
+                  const cusp::array1d<ValueType1, MemorySpace1>& x, // x and y are only passed on to get_kernel
+                  cusp::array1d<ValueType2, MemorySpace2>& y)
 {
-    using IndexType = typename MatrixType::index_type;
-    using ValueType = typename MatrixType::value_type;
-    using Format = typename MatrixType::format;
-
-    return reset_tuning<IndexType, ValueType, ValueType2, ValueType2, Format>();
+    auto& tuner = get_tuner(); // tuner whose stored data is cleared below
+    const auto& kernel = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); // kernel that get_kernel yields for these arguments
+    tuner.ClearData(kernel.kernel_id); // clear the tuner's data for that kernel id
 }