diff --git a/cpp/demo/poisson_matrix_free/main.cpp b/cpp/demo/poisson_matrix_free/main.cpp
index 4ecbf9b3122..0c2512f429a 100644
--- a/cpp/demo/poisson_matrix_free/main.cpp
+++ b/cpp/demo/poisson_matrix_free/main.cpp
@@ -41,9 +41,7 @@ namespace linalg
 /// @param[in] alpha
 /// @param[in] x
 /// @param[in] y
-template <typename U>
-void axpy(la::Vector<U>& r, U alpha, const la::Vector<U>& x,
-          const la::Vector<U>& y)
+void axpy(auto& r, auto alpha, const auto& x, const auto& y)
 {
   std::transform(x.array().begin(), x.array().end(), y.array().begin(),
                  r.mutable_array().begin(),
@@ -61,19 +59,21 @@ void axpy(la::Vector<U>& r, U alpha, const la::Vector<U>& x,
 /// @return The number of iterations
 /// @pre It is required that the ghost values of `x` and `b` have been
 /// updated before this function is called
-template <typename U, typename ApplyFunction>
-int cg(la::Vector<U>& x, const la::Vector<U>& b, ApplyFunction&& action,
-       int kmax = 50, double rtol = 1e-8)
+template <typename ApplyFunction>
+int cg(auto& x, auto& b, ApplyFunction&& action, int kmax = 50,
+       double rtol = 1e-8)
 {
+  using T = typename std::decay_t<decltype(x)>::value_type;
+
   // Create working vectors
-  la::Vector<U> r(b), y(b);
+  la::Vector<T> r(b), y(b);
 
   // Compute initial residual r0 = b - Ax0
   action(x, y);
-  axpy(r, U(-1), y, b);
+  axpy(r, T(-1), y, b);
 
   // Create p work vector
-  la::Vector<U> p(r);
+  la::Vector<T> p(r);
 
   // Iterations of CG
   auto rnorm0 = la::squared_norm(r);
@@ -88,7 +88,7 @@ int cg(la::Vector<U>& x, const la::Vector<U>& b, ApplyFunction&& action,
     action(p, y);
 
     // Compute alpha = r.r/p.y
-    const U alpha = rnorm / la::inner_product(p, y);
+    const T alpha = rnorm / la::inner_product(p, y);
 
     // Update x (x <- x + alpha*p)
     axpy(x, alpha, p, x);
@@ -98,7 +98,7 @@ int cg(la::Vector<U>& x, const la::Vector<U>& b, ApplyFunction&& action,
 
     // Update residual norm
     const auto rnorm_new = la::squared_norm(r);
-    const U beta = rnorm_new / rnorm;
+    const T beta = rnorm_new / rnorm;
     rnorm = rnorm_new;
 
     if (rnorm / rnorm0 < rtol2)
@@ -182,8 +182,7 @@ int main(int argc, char* argv[])
     const std::vector<T> constants = fem::pack_constants(*M);
 
     // Create function for computing the action of A on x (y = Ax)
-    std::function<void(la::Vector<T>&, la::Vector<T>&)> action
-        = [&M, &ui, &bc, &coeff, &constants](la::Vector<T>& x, la::Vector<T>& y)
+    auto action = [&M, &ui, &bc, &coeff, &constants](auto& x, auto& y)
     {
       // Zero y
       y.set(0.0);
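A note on the new style in this demo: `axpy` and the `action` lambda now use C++20 abbreviated function templates, where each `auto` parameter introduces an invented template parameter, so the functions are still templates. A minimal self-contained sketch (plain `std::vector` stand-ins, not `la::Vector`):

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// r = alpha*x + y; each `auto` parameter is a separate template parameter
void axpy(auto& r, auto alpha, const auto& x, const auto& y)
{
  std::transform(x.begin(), x.end(), y.begin(), r.begin(),
                 [alpha](auto a, auto b) { return alpha * a + b; });
}

int main()
{
  std::vector<double> x{1, 2}, y{3, 4}, r(2);
  axpy(r, 2.0, x, y);
  assert(r[0] == 5 and r[1] == 8);
}
```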
diff --git a/cpp/dolfinx/fem/utils.h b/cpp/dolfinx/fem/utils.h
index 1739166dc5e..1f4476190b7 100644
--- a/cpp/dolfinx/fem/utils.h
+++ b/cpp/dolfinx/fem/utils.h
@@ -814,12 +814,13 @@ void pack(std::span<T> coeffs, std::int32_t cell, int bs, std::span<const T> v,
 /// @private
 /// @brief Concept for a function that returns a cell index
 template <typename F>
-concept FetchCells = requires(F&& f, std::span<const std::int32_t> v) {
-  requires std::invocable<F, std::span<const std::int32_t>>;
-  {
-    f(v)
-  } -> std::convertible_to<std::int32_t>;
-};
+concept FetchCells
+    = requires(F&& f, std::span<const std::int32_t> v) {
+        requires std::invocable<F, std::span<const std::int32_t>>;
+        {
+          f(v)
+        } -> std::convertible_to<std::int32_t>;
+      };
 
 /// @brief Pack a single coefficient for a set of active entities.
 ///
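For readers new to C++20 concepts: the `FetchCells` requires-expression accepts any callable that takes a span of cell indices and returns something convertible to `std::int32_t`. A compilable sketch of the same constraint:

```cpp
#include <concepts>
#include <cstdint>
#include <span>

template <typename F>
concept FetchCells = requires(F&& f, std::span<const std::int32_t> v) {
  requires std::invocable<F, std::span<const std::int32_t>>;
  { f(v) } -> std::convertible_to<std::int32_t>;
};

// A lambda picking the first entry satisfies the concept
auto fetch = [](std::span<const std::int32_t> cells) { return cells.front(); };
static_assert(FetchCells<decltype(fetch)>);

int main() {}
```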
diff --git a/cpp/dolfinx/graph/AdjacencyList.h b/cpp/dolfinx/graph/AdjacencyList.h
index a79055c5801..30a47d78fdf 100644
--- a/cpp/dolfinx/graph/AdjacencyList.h
+++ b/cpp/dolfinx/graph/AdjacencyList.h
@@ -172,10 +172,10 @@ AdjacencyList(T, U) -> AdjacencyList<typename T::value_type>;
 /// @return An adjacency list
 template <typename U>
   requires requires {
-  typename std::decay_t<U>::value_type;
-  requires std::convertible_to<
-      U, std::vector<typename std::decay_t<U>::value_type>>;
-}
+    typename std::decay_t<U>::value_type;
+    requires std::convertible_to<
+        U, std::vector<typename std::decay_t<U>::value_type>>;
+  }
 AdjacencyList<typename std::decay_t<U>::value_type>
 regular_adjacency_list(U&& data, int degree)
 {
diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h
index a324e02099c..d88268837a0 100644
--- a/cpp/dolfinx/la/MatrixCSR.h
+++ b/cpp/dolfinx/la/MatrixCSR.h
@@ -18,10 +18,9 @@
 namespace dolfinx::la
 {
-
 namespace impl
 {
-/// @brief Set data in a CSR matrix
+/// @brief Set data in a CSR matrix.
 ///
 /// @param[out] data The CSR matrix data
 /// @param[in] cols The CSR column indices
 ///
@@ -33,38 +32,9 @@ namespace impl
 /// @param[in] local_size The maximum row index that can be set. Used
 /// when debugging is on to check that rows beyond a permitted range
 /// are not being set.
-template <typename U, typename V, typename W, typename X>
-void set_csr(U&& data, const V& cols, const V& row_ptr, const W& x,
-             const X& xrows, const X& xcols,
-             [[maybe_unused]] typename X::value_type local_size)
-{
-  assert(x.size() == xrows.size() * xcols.size());
-  for (std::size_t r = 0; r < xrows.size(); ++r)
-  {
-    // Row index and current data row
-    auto row = xrows[r];
-    using T = typename W::value_type;
-    const T* xr = x.data() + r * xcols.size();
-
-#ifndef NDEBUG
-    if (row >= local_size)
-      throw std::runtime_error("Local row out of range");
-#endif
-
-    // Columns indices for row
-    auto cit0 = std::next(cols.begin(), row_ptr[row]);
-    auto cit1 = std::next(cols.begin(), row_ptr[row + 1]);
-    for (std::size_t c = 0; c < xcols.size(); ++c)
-    {
-      // Find position of column index
-      auto it = std::lower_bound(cit0, cit1, xcols[c]);
-      assert(it != cit1);
-      std::size_t d = std::distance(cols.begin(), it);
-      assert(d < data.size());
-      data[d] = xr[c];
-    }
-  }
-}
+template <typename U, typename V, typename W, typename X, typename Y>
+void set_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
+             const Y& xrows, const Y& xcols, typename Y::value_type local_size);
 
 /// @brief Add data to a CSR matrix
 ///
 /// @param[out] data The CSR matrix data
 /// @param[in] cols The CSR column indices
 /// @param[in] row_ptr The pointer to the ith row in the CSR data
 /// @param[in] x The `m` by `n` dense block of values (row-major) to add
 /// to the matrix
 /// @param[in] xrows The row indices of `x`
 /// @param[in] xcols The column indices of `x`
-template <typename U, typename V, typename W, typename X>
-void add_csr(U&& data, const V& cols, const V& row_ptr, const W& x,
-             const X& xrows, const X& xcols)
-{
-  assert(x.size() == xrows.size() * xcols.size());
-  for (std::size_t r = 0; r < xrows.size(); ++r)
-  {
-    // Row index and current data row
-    auto row = xrows[r];
-    using T = typename W::value_type;
-    const T* xr = x.data() + r * xcols.size();
-
-#ifndef NDEBUG
-    if (row >= (int)row_ptr.size())
-      throw std::runtime_error("Local row out of range");
-#endif
-
-    // Columns indices for row
-    auto cit0 = std::next(cols.begin(), row_ptr[row]);
-    auto cit1 = std::next(cols.begin(), row_ptr[row + 1]);
-    for (std::size_t c = 0; c < xcols.size(); ++c)
-    {
-      // Find position of column index
-      auto it = std::lower_bound(cit0, cit1, xcols[c]);
-      assert(it != cit1);
-      std::size_t d = std::distance(cols.begin(), it);
-      assert(d < data.size());
-      data[d] += xr[c];
-    }
-  }
-}
+template <typename U, typename V, typename W, typename X, typename Y>
+void add_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
+             const Y& xrows, const Y& xcols);
 } // namespace impl
 
 /// Distributed sparse matrix
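The declarations above keep the insertion scheme unchanged: each target column is located by binary search in the sorted column indices of its row. A self-contained model of that lookup, with hypothetical toy data rather than the DOLFINx API:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
  // 2x3 matrix with nonzeros at (0,0), (0,2), (1,1)
  std::vector<double> data{0, 0, 0};
  std::vector<std::int32_t> cols{0, 2, 1};
  std::vector<std::int64_t> row_ptr{0, 2, 3};

  auto set_entry = [&](std::int32_t r, std::int32_t c, double v)
  {
    auto it0 = std::next(cols.begin(), row_ptr[r]);
    auto it1 = std::next(cols.begin(), row_ptr[r + 1]);
    auto it = std::lower_bound(it0, it1, c); // cols are sorted within a row
    assert(it != it1 and *it == c);          // entry must be in the pattern
    data[std::distance(cols.begin(), it)] = v;
  };

  set_entry(0, 2, 3.5);
  assert(data[1] == 3.5);
}
```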
@@ -113,232 +55,83 @@ void add_csr(U&& data, const V& cols, const V& row_ptr, const W& x,
 /// The matrix storage format is compressed sparse row. The matrix is
 /// partitioned row-wise across MPI ranks.
 ///
-/// @tparam T The data type for the matrix
-/// @tparam Allocator The memory allocator type for the data storage
-///
-/// @note Highly "experimental" storage of a matrix in CSR format which
-/// can be assembled into using the usual dolfinx assembly routines
+/// @warning Highly experimental storage of a matrix in CSR format which
+/// can be assembled into using the usual DOLFINx assembly routines.
 /// Matrix internal data can be accessed for interfacing with other
 /// code.
-///
 /// @todo Handle block sizes
-template <typename T, class Allocator = std::allocator<T>>
+///
+/// @tparam Scalar Scalar type of matrix entries
+/// @tparam Container Sequence container type to store matrix entries
+/// @tparam ColContainer Column index container type
+/// @tparam RowPtrContainer Row pointer container type
+template <class Scalar, class Container = std::vector<Scalar>,
+          class ColContainer = std::vector<std::int32_t>,
+          class RowPtrContainer = std::vector<std::int64_t>>
 class MatrixCSR
 {
+  static_assert(std::is_same_v<typename Container::value_type, Scalar>);
+
 public:
-  /// The value type
-  using value_type = T;
+  /// Scalar type
+  using value_type = Scalar;
 
-  /// The allocator type
-  using allocator_type = Allocator;
+  /// Matrix entries container type
+  using container_type = Container;
 
-  /// Insertion functor for setting values in matrix. It is typically
-  /// used in finite element assembly functions.
+  /// Column index container type
+  using column_container_type = ColContainer;
+
+  /// Row pointer container type
+  using rowptr_container_type = RowPtrContainer;
+
+  static_assert(
+      std::is_same_v<value_type, typename container_type::value_type>,
+      "Scalar type and container value type must be the same.");
+
+  /// @brief Insertion functor for setting values in matrix. It is
+  /// typically used in finite element assembly functions.
   /// @return Function for inserting values into `A`
   /// @todo clarify setting on non-owned entries
   auto mat_set_values()
   {
-    return [&](const std::span<const std::int32_t>& rows,
-               const std::span<const std::int32_t>& cols,
-               const std::span<const T>& data) -> int
+    return [&](std::span<const std::int32_t> rows,
+               std::span<const std::int32_t> cols,
+               std::span<const value_type> data) -> int
     {
       this->set(data, rows, cols);
       return 0;
     };
   }
 
-  /// Insertion functor for accumulating values in matrix. It is
+  /// @brief Insertion functor for accumulating values in matrix. It is
   /// typically used in finite element assembly functions.
   /// @return Function for inserting values into `A`
   auto mat_add_values()
   {
-    return [&](const std::span<const std::int32_t>& rows,
-               const std::span<const std::int32_t>& cols,
-               const std::span<const T>& data) -> int
+    return [&](std::span<const std::int32_t> rows,
+               std::span<const std::int32_t> cols,
+               std::span<const value_type> data) -> int
     {
       this->add(data, rows, cols);
       return 0;
     };
   }
 
-  /// Create a distributed matrix
-  /// @param[in] p The sparsty pattern the describes the parallel
-  /// distribution and the non-zero structure
-  /// @param[in] alloc The memory allocator for the data storafe
-  MatrixCSR(const SparsityPattern& p, const Allocator& alloc = Allocator())
-      : _index_maps({p.index_map(0),
-                     std::make_shared<common::IndexMap>(p.column_index_map())}),
-        _bs({p.block_size(0), p.block_size(1)}),
-        _data(p.num_nonzeros(), 0, alloc),
-        _cols(p.graph().array().begin(), p.graph().array().end()),
-        _row_ptr(p.graph().offsets().begin(), p.graph().offsets().end()),
-        _comm(MPI_COMM_NULL)
-  {
-    // TODO: handle block sizes
-    if (_bs[0] > 1 or _bs[1] > 1)
-      throw std::runtime_error("Block size not yet supported");
-
-    // Compute off-diagonal offset for each row
-    std::span<const std::int32_t> num_diag_nnz = p.off_diagonal_offset();
-    _off_diagonal_offset.reserve(num_diag_nnz.size());
-    std::transform(num_diag_nnz.begin(), num_diag_nnz.end(), _row_ptr.begin(),
-                   std::back_inserter(_off_diagonal_offset), std::plus{});
-
-    // Some short-hand
-    const std::array<std::int32_t, 2> local_size
-        = {_index_maps[0]->size_local(), _index_maps[1]->size_local()};
-    const std::array<std::array<std::int64_t, 2>, 2> local_range
-        = {_index_maps[0]->local_range(), _index_maps[1]->local_range()};
-    const std::vector<std::int64_t>& ghosts1 = _index_maps[1]->ghosts();
-
-    const std::vector<std::int64_t>& ghosts0 = _index_maps[0]->ghosts();
-    const std::vector<int>& src_ranks = _index_maps[0]->src();
-    const std::vector<int>& dest_ranks = _index_maps[0]->dest();
-
-    // Create neighbourhood communicator (owner <- ghost)
-    MPI_Comm comm;
-    MPI_Dist_graph_create_adjacent(_index_maps[0]->comm(), dest_ranks.size(),
-                                   dest_ranks.data(), MPI_UNWEIGHTED,
-                                   src_ranks.size(), src_ranks.data(),
-                                   MPI_UNWEIGHTED, MPI_INFO_NULL, false, &comm);
-    _comm = dolfinx::MPI::Comm(comm, false);
-
-    // Build map from ghost row index position to owning (neighborhood)
-    // rank
-    _ghost_row_to_rank.reserve(_index_maps[0]->owners().size());
-    for (int r : _index_maps[0]->owners())
-    {
-      auto it = std::lower_bound(src_ranks.begin(), src_ranks.end(), r);
-      assert(it != src_ranks.end() and *it == r);
-      int pos = std::distance(src_ranks.begin(), it);
-      _ghost_row_to_rank.push_back(pos);
-    }
-
-    // Compute size of data to send to each neighbor
-    std::vector<int> data_per_proc(src_ranks.size(), 0);
-    for (std::size_t i = 0; i < _ghost_row_to_rank.size(); ++i)
-    {
-      assert(_ghost_row_to_rank[i] < data_per_proc.size());
-      std::size_t pos = local_size[0] + i;
-      data_per_proc[_ghost_row_to_rank[i]] += _row_ptr[pos + 1] - _row_ptr[pos];
-    }
-
-    // Compute send displacements
-    _val_send_disp.resize(src_ranks.size() + 1, 0);
-    std::partial_sum(data_per_proc.begin(), data_per_proc.end(),
-                     std::next(_val_send_disp.begin()));
-
-    // For each ghost row, pack and send indices to neighborhood
-    std::vector<std::int64_t> ghost_index_data(2 * _val_send_disp.back());
-    {
-      std::vector<int> insert_pos = _val_send_disp;
-      for (std::size_t i = 0; i < _ghost_row_to_rank.size(); ++i)
-      {
-        const int rank = _ghost_row_to_rank[i];
-        int row_id = local_size[0] + i;
-        for (int j = _row_ptr[row_id]; j < _row_ptr[row_id + 1]; ++j)
-        {
-          // Get position in send buffer
-          const std::int32_t idx_pos = 2 * insert_pos[rank];
-
-          // Pack send data (row, col) as global indices
-          ghost_index_data[idx_pos] = ghosts0[i];
-          if (std::int32_t col_local = _cols[j]; col_local < local_size[1])
-            ghost_index_data[idx_pos + 1] = col_local + local_range[1][0];
-          else
-            ghost_index_data[idx_pos + 1] = ghosts1[col_local - local_size[1]];
-
-          insert_pos[rank] += 1;
-        }
-      }
-    }
-
-    // Communicate data with neighborhood
-    std::vector<std::int64_t> ghost_index_array;
-    std::vector<int> recv_disp;
-    {
-      std::vector<int> send_sizes;
-      std::transform(data_per_proc.begin(), data_per_proc.end(),
-                     std::back_inserter(send_sizes),
-                     [](auto x) { return 2 * x; });
-
-      std::vector<int> recv_sizes(dest_ranks.size());
-      send_sizes.reserve(1);
-      recv_sizes.reserve(1);
-      MPI_Neighbor_alltoall(send_sizes.data(), 1, MPI_INT, recv_sizes.data(), 1,
-                            MPI_INT, _comm.comm());
-
-      // Build send/recv displacement
-      std::vector<int> send_disp = {0};
-      std::partial_sum(send_sizes.begin(), send_sizes.end(),
-                       std::back_inserter(send_disp));
-      recv_disp = {0};
-      std::partial_sum(recv_sizes.begin(), recv_sizes.end(),
-                       std::back_inserter(recv_disp));
-
-      ghost_index_array.resize(recv_disp.back());
-      MPI_Neighbor_alltoallv(ghost_index_data.data(), send_sizes.data(),
-                             send_disp.data(), MPI_INT64_T,
-                             ghost_index_array.data(), recv_sizes.data(),
-                             recv_disp.data(), MPI_INT64_T, _comm.comm());
-    }
-
-    // Store receive displacements for future use, when transferring
-    // data values
-    _val_recv_disp.resize(recv_disp.size());
-    std::transform(recv_disp.begin(), recv_disp.end(), _val_recv_disp.begin(),
-                   [](int d) { return d / 2; });
-
-    // Global-to-local map for ghost columns
-    std::vector<std::pair<std::int64_t, std::int32_t>> global_to_local;
-    global_to_local.reserve(ghosts1.size());
-    std::int32_t local_i = local_size[1];
-    for (std::int64_t idx : ghosts1)
-      global_to_local.push_back({idx, global_to_local.size() + local_size[1]});
-    std::sort(global_to_local.begin(), global_to_local.end());
-
-    // Compute location in which data for each index should be stored
-    // when received
-    for (std::size_t i = 0; i < ghost_index_array.size(); i += 2)
-    {
-      // Row must be on this process
-      const std::int32_t local_row = ghost_index_array[i] - local_range[0][0];
-      assert(local_row >= 0 and local_row < local_size[0]);
-
-      // Column may be owned or unowned
-      std::int32_t local_col = ghost_index_array[i + 1] - local_range[1][0];
-      if (local_col < 0 or local_col >= local_size[1])
-      {
-        auto it = std::lower_bound(
-            global_to_local.begin(), global_to_local.end(),
-            std::pair(ghost_index_array[i + 1], -1),
-            [](auto& a, auto& b) { return a.first < b.first; });
-        assert(it != global_to_local.end()
-               and it->first == ghost_index_array[i + 1]);
-        local_col = it->second;
-      }
-      auto cit0 = std::next(_cols.begin(), _row_ptr[local_row]);
-      auto cit1 = std::next(_cols.begin(), _row_ptr[local_row + 1]);
-
-      // Find position of column index and insert data
-      auto cit = std::lower_bound(cit0, cit1, local_col);
-      assert(cit != cit1);
-      assert(*cit == local_col);
-      std::size_t d = std::distance(_cols.begin(), cit);
-      _unpack_pos.push_back(d);
-    }
-  }
+  /// @brief Create a distributed matrix.
+  /// @param[in] p The sparsity pattern that describes the parallel
+  /// distribution and the non-zero structure.
+  MatrixCSR(const SparsityPattern& p);
 
   /// Move constructor
   /// @todo Check handling of MPI_Request
   MatrixCSR(MatrixCSR&& A) = default;
 
-  /// Set all non-zero local entries to a value
-  /// including entries in ghost rows
+  /// @brief Set all non-zero local entries to a value including entries
+  /// in ghost rows.
   /// @param[in] x The value to set non-zero matrix entries to
-  void set(T x) { std::fill(_data.begin(), _data.end(), x); }
+  void set(value_type x) { std::fill(_data.begin(), _data.end(), x); }
 
-  /// Set values in the matrix
+  /// @brief Set values in the matrix.
   /// @note Only entries included in the sparsity pattern used to
   /// initialize the matrix can be set
   /// @note All indices are local to the calling MPI rank and entries
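Assuming the defaults shown above (`std::vector` storage, `std::int32_t` column indices, `std::int64_t` row pointers), `MatrixCSR<double>` is shorthand for the fully spelled-out instantiation. A stand-in sketch (not the real class) that type-checks the equivalence:

```cpp
#include <cstdint>
#include <type_traits>
#include <vector>

template <class Scalar, class Container = std::vector<Scalar>,
          class ColContainer = std::vector<std::int32_t>,
          class RowPtrContainer = std::vector<std::int64_t>>
class MatrixCSR
{
  // Same guard as in the real class: container must hold the scalar type
  static_assert(std::is_same_v<typename Container::value_type, Scalar>);
};

static_assert(
    std::is_same_v<MatrixCSR<double>,
                   MatrixCSR<double, std::vector<double>,
                             std::vector<std::int32_t>,
                             std::vector<std::int64_t>>>);

int main() {}
```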
@@ -350,15 +143,14 @@ class MatrixCSR
   /// set in the matrix
   /// @param[in] rows The row indices of `x`
   /// @param[in] cols The column indices of `x`
-  void set(const std::span<const T>& x,
-           const std::span<const std::int32_t>& rows,
-           const std::span<const std::int32_t>& cols)
+  void set(std::span<const value_type> x, std::span<const std::int32_t> rows,
+           std::span<const std::int32_t> cols)
   {
     impl::set_csr(_data, _cols, _row_ptr, x, rows, cols,
                   _index_maps[0]->size_local());
   }
 
-  /// Accumulate values in the matrix
+  /// @brief Accumulate values in the matrix
   /// @note Only entries included in the sparsity pattern used to
   /// initialize the matrix can be accumulated in to
   /// @note All indices are local to the calling MPI rank and entries
@@ -370,9 +162,8 @@ class MatrixCSR
   /// add to the matrix
   /// @param[in] rows The row indices of `x`
   /// @param[in] cols The column indices of `x`
-  void add(const std::span<const T>& x,
-           const std::span<const std::int32_t>& rows,
-           const std::span<const std::int32_t>& cols)
+  void add(std::span<const value_type> x, std::span<const std::int32_t> rows,
+           std::span<const std::int32_t> cols)
   {
     impl::add_csr(_data, _cols, _row_ptr, x, rows, cols);
   }
@@ -390,115 +181,41 @@ class MatrixCSR
   /// manually by using num_owned_rows() if required.
   /// @return Dense copy of the part of the matrix on the calling rank.
   /// Storage is row-major.
-  std::vector<T> to_dense() const
-  {
-    const std::size_t nrows = num_all_rows();
-    const std::size_t ncols
-        = _index_maps[1]->size_local() + _index_maps[1]->num_ghosts();
-    std::vector<T> A(nrows * ncols);
-    for (std::size_t r = 0; r < nrows; ++r)
-      for (std::int32_t j = _row_ptr[r]; j < _row_ptr[r + 1]; ++j)
-        A[r * ncols + _cols[j]] = _data[j];
-
-    return A;
-  }
+  std::vector<value_type> to_dense() const;
 
-  /// Transfer ghost row data to the owning ranks
-  /// accumulating received values on the owned rows, and zeroing any existing
-  /// data in ghost rows.
+  /// @brief Transfer ghost row data to the owning ranks accumulating
+  /// received values on the owned rows, and zeroing any existing data
+  /// in ghost rows.
   void finalize()
   {
     finalize_begin();
     finalize_end();
   }
 
-  /// Begin transfer of ghost row data to owning ranks, where it will be
-  /// accumulated into existing owned rows.
+  /// @brief Begin transfer of ghost row data to owning ranks, where it
+  /// will be accumulated into existing owned rows.
   /// @note Calls to this function must be followed by
   /// MatrixCSR::finalize_end(). Between the two calls matrix values
   /// must not be changed.
   /// @note This function does not change the matrix data. Data update only
   /// occurs with `finalize_end()`.
-  void finalize_begin()
-  {
-    const std::int32_t local_size0 = _index_maps[0]->size_local();
-    const std::int32_t num_ghosts0 = _index_maps[0]->num_ghosts();
-
-    // For each ghost row, pack and send values to send to neighborhood
-    std::vector<int> insert_pos = _val_send_disp;
-    std::vector<T> ghost_value_data(_val_send_disp.back());
-    for (int i = 0; i < num_ghosts0; ++i)
-    {
-      const int rank = _ghost_row_to_rank[i];
-
-      // Get position in send buffer to place data to send to this
-      // neighbour
-      const std::int32_t val_pos = insert_pos[rank];
-      std::copy(std::next(_data.data(), _row_ptr[local_size0 + i]),
-                std::next(_data.data(), _row_ptr[local_size0 + i + 1]),
-                std::next(ghost_value_data.begin(), val_pos));
-      insert_pos[rank]
-          += _row_ptr[local_size0 + i + 1] - _row_ptr[local_size0 + i];
-    }
-
-    _ghost_value_data_in.resize(_val_recv_disp.back());
-
-    // Compute data sizes for send and receive from displacements
-    std::vector<int> val_send_count(_val_send_disp.size() - 1);
-    std::adjacent_difference(std::next(_val_send_disp.begin()),
-                             _val_send_disp.end(), val_send_count.begin());
-
-    std::vector<int> val_recv_count(_val_recv_disp.size() - 1);
-    std::adjacent_difference(std::next(_val_recv_disp.begin()),
-                             _val_recv_disp.end(), val_recv_count.begin());
-
-    int status = MPI_Ineighbor_alltoallv(
-        ghost_value_data.data(), val_send_count.data(), _val_send_disp.data(),
-        dolfinx::MPI::mpi_type<T>(), _ghost_value_data_in.data(),
-        val_recv_count.data(), _val_recv_disp.data(),
-        dolfinx::MPI::mpi_type<T>(), _comm.comm(), &_request);
-    assert(status == MPI_SUCCESS);
-  }
+  void finalize_begin();
 
-  /// End transfer of ghost row data to owning ranks
+  /// @brief End transfer of ghost row data to owning ranks.
   /// @note Must be preceded by MatrixCSR::finalize_begin()
   /// @note Matrix data received from other processes will be
   /// accumulated into locally owned rows, and ghost rows will be
   /// zeroed.
-  void finalize_end()
-  {
-    int status = MPI_Wait(&_request, MPI_STATUS_IGNORE);
-    assert(status == MPI_SUCCESS);
-
-    // Add to local rows
-    assert(_ghost_value_data_in.size() == _unpack_pos.size());
-    for (std::size_t i = 0; i < _ghost_value_data_in.size(); ++i)
-      _data[_unpack_pos[i]] += _ghost_value_data_in[i];
-
-    // Set ghost row data to zero
-    const std::int32_t local_size0 = _index_maps[0]->size_local();
-    std::fill(std::next(_data.begin(), _row_ptr[local_size0]), _data.end(), 0);
-  }
+  void finalize_end();
 
   /// Compute the Frobenius norm squared
-  double norm_squared() const
-  {
-    const std::size_t num_owned_rows = _index_maps[0]->size_local();
-    assert(num_owned_rows < _row_ptr.size());
-
-    const double norm_sq_local = std::accumulate(
-        _data.cbegin(), std::next(_data.cbegin(), _row_ptr[num_owned_rows]),
-        double(0), [](auto norm, T y) { return norm + std::norm(y); });
-    double norm_sq;
-    MPI_Allreduce(&norm_sq_local, &norm_sq, 1, MPI_DOUBLE, MPI_SUM,
-                  _comm.comm());
-    return norm_sq;
-  }
+  double norm_squared() const;
 
-  /// Index maps for the row and column space. The row IndexMap contains
-  /// ghost entries for rows which may be inserted into and the column
-  /// IndexMap contains all local and ghost columns that may exist in
-  /// the owned rows.
+  /// @brief Index maps for the row and column space.
+  ///
+  /// The row IndexMap contains ghost entries for rows which may be
+  /// inserted into and the column IndexMap contains all local and ghost
+  /// columns that may exist in the owned rows.
 
  ///
  /// @return Row (0) and column (1) index maps
  const std::array<std::shared_ptr<const common::IndexMap>, 2>&
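The begin/end split exists so the non-blocking ghost-row reduction can overlap local work. A toy stand-in showing the intended calling pattern (not the real class or its MPI internals):

```cpp
#include <iostream>

struct Matrix // stand-in for la::MatrixCSR
{
  void finalize_begin() { std::cout << "start ghost-row send\n"; }
  void finalize_end() { std::cout << "wait, accumulate, zero ghost rows\n"; }
};

int main()
{
  Matrix A;
  A.finalize_begin(); // non-blocking communication is started
  // ... unrelated local computation can run here ...
  A.finalize_end();   // wait, then add received values into owned rows
}
```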
@@ -509,19 +226,19 @@ class MatrixCSR
   index_maps() const
   {
     return _index_maps;
   }
 
   /// Get local data values
   /// @note Includes ghost values
-  std::vector<T>& values() { return _data; }
+  container_type& values() { return _data; }
 
   /// Get local values (const version)
   /// @note Includes ghost values
-  const std::vector<T>& values() const { return _data; }
+  const container_type& values() const { return _data; }
 
   /// Get local row pointers
   /// @note Includes pointers to ghost rows
-  const std::vector<std::int32_t>& row_ptr() const { return _row_ptr; }
+  const rowptr_container_type& row_ptr() const { return _row_ptr; }
 
   /// Get local column indices
   /// @note Includes columns in ghost rows
-  const std::vector<std::int32_t>& cols() const { return _cols; }
+  const column_container_type& cols() const { return _cols; }
 
   /// Get the start of off-diagonal (unowned columns) on each row,
   /// allowing the matrix to be split (virtually) into two parts.
@@ -530,7 +247,7 @@ class MatrixCSR
   /// from operations on the unowned parts.
   /// @note Includes ghost rows, which should be truncated manually if
   /// not required.
-  const std::vector<std::int32_t>& off_diag_offset() const
+  const rowptr_container_type& off_diag_offset() const
   {
     return _off_diagonal_offset;
   }
@@ -543,11 +260,12 @@ class MatrixCSR
   std::array<int, 2> _bs;
 
   // Matrix data
-  std::vector<T, Allocator> _data;
-  std::vector<std::int32_t> _cols, _row_ptr;
+  container_type _data;
+  column_container_type _cols;
+  rowptr_container_type _row_ptr;
 
   // Start of off-diagonal (unowned columns) on each row
-  std::vector<std::int32_t> _off_diagonal_offset;
+  rowptr_container_type _off_diagonal_offset;
 
   // Neighborhood communicator (ghost->owner communicator for rows)
   dolfinx::MPI::Comm _comm;
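The `norm_squared()` definition in the final hunk below reduces a per-rank partial sum over the owned rows with `MPI_Allreduce`. A stripped-down equivalent of that pattern, assuming an MPI environment:

```cpp
#include <mpi.h>
#include <numeric>
#include <vector>

int main(int argc, char* argv[])
{
  MPI_Init(&argc, &argv);
  std::vector<double> owned{1.0, 2.0}; // stand-in for the owned matrix values
  // Local sum of squares (inner product of the range with itself)
  double local = std::transform_reduce(owned.begin(), owned.end(),
                                       owned.begin(), 0.0);
  double global = 0.0;
  MPI_Allreduce(&local, &global, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Finalize();
}
```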
@@ -569,7 +287,324 @@ class MatrixCSR
   std::vector<int> _ghost_row_to_rank;
 
   // Temporary store for finalize data during non-blocking communication
-  std::vector<T, Allocator> _ghost_value_data_in;
+  container_type _ghost_value_data_in;
 };
+//-----------------------------------------------------------------------------
+template <typename U, typename V, typename W, typename X, typename Y>
+void impl::set_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
+                   const Y& xrows, const Y& xcols,
+                   [[maybe_unused]] typename Y::value_type local_size)
+{
+  assert(x.size() == xrows.size() * xcols.size());
+  for (std::size_t r = 0; r < xrows.size(); ++r)
+  {
+    // Row index and current data row
+    auto row = xrows[r];
+    using T = typename X::value_type;
+    const T* xr = x.data() + r * xcols.size();
+
+#ifndef NDEBUG
+    if (row >= local_size)
+      throw std::runtime_error("Local row out of range");
+#endif
+
+    // Column indices for row
+    auto cit0 = std::next(cols.begin(), row_ptr[row]);
+    auto cit1 = std::next(cols.begin(), row_ptr[row + 1]);
+    for (std::size_t c = 0; c < xcols.size(); ++c)
+    {
+      // Find position of column index
+      auto it = std::lower_bound(cit0, cit1, xcols[c]);
+      assert(it != cit1);
+      std::size_t d = std::distance(cols.begin(), it);
+      assert(d < data.size());
+      data[d] = xr[c];
+    }
+  }
+}
+//-----------------------------------------------------------------------------
+template <typename U, typename V, typename W, typename X, typename Y>
+void impl::add_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
+                   const Y& xrows, const Y& xcols)
+{
+  assert(x.size() == xrows.size() * xcols.size());
+  for (std::size_t r = 0; r < xrows.size(); ++r)
+  {
+    // Row index and current data row
+    auto row = xrows[r];
+    using T = typename X::value_type;
+    const T* xr = x.data() + r * xcols.size();
+
+#ifndef NDEBUG
+    if (row >= (int)row_ptr.size())
+      throw std::runtime_error("Local row out of range");
+#endif
+
+    // Column indices for row
+    auto cit0 = std::next(cols.begin(), row_ptr[row]);
+    auto cit1 = std::next(cols.begin(), row_ptr[row + 1]);
+    for (std::size_t c = 0; c < xcols.size(); ++c)
+    {
+      // Find position of column index
+      auto it = std::lower_bound(cit0, cit1, xcols[c]);
+      assert(it != cit1);
+      std::size_t d = std::distance(cols.begin(), it);
+      assert(d < data.size());
+      data[d] += xr[c];
+    }
+  }
+}
+//-----------------------------------------------------------------------------
+template <class U, class V, class W, class X>
+MatrixCSR<U, V, W, X>::MatrixCSR(const SparsityPattern& p)
+    : _index_maps({p.index_map(0),
+                   std::make_shared<common::IndexMap>(p.column_index_map())}),
+      _bs({p.block_size(0), p.block_size(1)}), _data(p.num_nonzeros(), 0),
+      _cols(p.graph().array().begin(), p.graph().array().end()),
+      _row_ptr(p.graph().offsets().begin(), p.graph().offsets().end()),
+      _comm(MPI_COMM_NULL)
+{
+  // TODO: handle block sizes
+  if (_bs[0] > 1 or _bs[1] > 1)
+    throw std::runtime_error("Block size not yet supported");
+
+  // Compute off-diagonal offset for each row
+  std::span<const std::int32_t> num_diag_nnz = p.off_diagonal_offset();
+  _off_diagonal_offset.reserve(num_diag_nnz.size());
+  std::transform(num_diag_nnz.begin(), num_diag_nnz.end(), _row_ptr.begin(),
+                 std::back_inserter(_off_diagonal_offset), std::plus{});
+
+  // Some short-hand
+  const std::array<std::int32_t, 2> local_size
+      = {_index_maps[0]->size_local(), _index_maps[1]->size_local()};
+  const std::array<std::array<std::int64_t, 2>, 2> local_range
+      = {_index_maps[0]->local_range(), _index_maps[1]->local_range()};
+  const std::vector<std::int64_t>& ghosts1 = _index_maps[1]->ghosts();
+
+  const std::vector<std::int64_t>& ghosts0 = _index_maps[0]->ghosts();
+  const std::vector<int>& src_ranks = _index_maps[0]->src();
+  const std::vector<int>& dest_ranks = _index_maps[0]->dest();
+
+  // Create neighbourhood communicator (owner <- ghost)
+  MPI_Comm comm;
+  MPI_Dist_graph_create_adjacent(_index_maps[0]->comm(), dest_ranks.size(),
+                                 dest_ranks.data(), MPI_UNWEIGHTED,
+                                 src_ranks.size(), src_ranks.data(),
+                                 MPI_UNWEIGHTED, MPI_INFO_NULL, false, &comm);
+  _comm = dolfinx::MPI::Comm(comm, false);
+
+  // Build map from ghost row index position to owning (neighborhood)
+  // rank
+  _ghost_row_to_rank.reserve(_index_maps[0]->owners().size());
+  for (int r : _index_maps[0]->owners())
+  {
+    auto it = std::lower_bound(src_ranks.begin(), src_ranks.end(), r);
+    assert(it != src_ranks.end() and *it == r);
+    int pos = std::distance(src_ranks.begin(), it);
+    _ghost_row_to_rank.push_back(pos);
+  }
+
+  // Compute size of data to send to each neighbor
+  std::vector<int> data_per_proc(src_ranks.size(), 0);
+  for (std::size_t i = 0; i < _ghost_row_to_rank.size(); ++i)
+  {
+    assert(_ghost_row_to_rank[i] < data_per_proc.size());
+    std::size_t pos = local_size[0] + i;
+    data_per_proc[_ghost_row_to_rank[i]] += _row_ptr[pos + 1] - _row_ptr[pos];
+  }
+
+  // Compute send displacements
+  _val_send_disp.resize(src_ranks.size() + 1, 0);
+  std::partial_sum(data_per_proc.begin(), data_per_proc.end(),
+                   std::next(_val_send_disp.begin()));
+
+  // For each ghost row, pack and send indices to neighborhood
+  std::vector<std::int64_t> ghost_index_data(2 * _val_send_disp.back());
+  {
+    std::vector<int> insert_pos = _val_send_disp;
+    for (std::size_t i = 0; i < _ghost_row_to_rank.size(); ++i)
+    {
+      const int rank = _ghost_row_to_rank[i];
+      int row_id = local_size[0] + i;
+      for (std::int64_t j = _row_ptr[row_id]; j < _row_ptr[row_id + 1]; ++j)
+      {
+        // Get position in send buffer
+        const std::int32_t idx_pos = 2 * insert_pos[rank];
+
+        // Pack send data (row, col) as global indices
+        ghost_index_data[idx_pos] = ghosts0[i];
+        if (std::int32_t col_local = _cols[j]; col_local < local_size[1])
+          ghost_index_data[idx_pos + 1] = col_local + local_range[1][0];
+        else
+          ghost_index_data[idx_pos + 1] = ghosts1[col_local - local_size[1]];
+
+        insert_pos[rank] += 1;
+      }
+    }
+  }
+
+  // Communicate data with neighborhood
+  std::vector<std::int64_t> ghost_index_array;
+  std::vector<int> recv_disp;
+  {
+    std::vector<int> send_sizes;
+    std::transform(data_per_proc.begin(), data_per_proc.end(),
+                   std::back_inserter(send_sizes),
+                   [](auto x) { return 2 * x; });
+
+    std::vector<int> recv_sizes(dest_ranks.size());
+    send_sizes.reserve(1);
+    recv_sizes.reserve(1);
+    MPI_Neighbor_alltoall(send_sizes.data(), 1, MPI_INT, recv_sizes.data(), 1,
+                          MPI_INT, _comm.comm());
+
+    // Build send/recv displacement
+    std::vector<int> send_disp = {0};
+    std::partial_sum(send_sizes.begin(), send_sizes.end(),
+                     std::back_inserter(send_disp));
+    recv_disp = {0};
+    std::partial_sum(recv_sizes.begin(), recv_sizes.end(),
+                     std::back_inserter(recv_disp));
+
+    ghost_index_array.resize(recv_disp.back());
+    MPI_Neighbor_alltoallv(ghost_index_data.data(), send_sizes.data(),
+                           send_disp.data(), MPI_INT64_T,
+                           ghost_index_array.data(), recv_sizes.data(),
+                           recv_disp.data(), MPI_INT64_T, _comm.comm());
+  }
+
+  // Store receive displacements for future use, when transferring
+  // data values
+  _val_recv_disp.resize(recv_disp.size());
+  std::transform(recv_disp.begin(), recv_disp.end(), _val_recv_disp.begin(),
+                 [](int d) { return d / 2; });
+
+  // Global-to-local map for ghost columns
+  std::vector<std::pair<std::int64_t, std::int32_t>> global_to_local;
+  global_to_local.reserve(ghosts1.size());
+  std::int32_t local_i = local_size[1];
+  for (std::int64_t idx : ghosts1)
+    global_to_local.push_back({idx, local_i++});
+  std::sort(global_to_local.begin(), global_to_local.end());
+
+  // Compute location in which data for each index should be stored
+  // when received
+  for (std::size_t i = 0; i < ghost_index_array.size(); i += 2)
+  {
+    // Row must be on this process
+    const std::int32_t local_row = ghost_index_array[i] - local_range[0][0];
+    assert(local_row >= 0 and local_row < local_size[0]);
+
+    // Column may be owned or unowned
+    std::int32_t local_col = ghost_index_array[i + 1] - local_range[1][0];
+    if (local_col < 0 or local_col >= local_size[1])
+    {
+      auto it = std::lower_bound(global_to_local.begin(), global_to_local.end(),
+                                 std::pair(ghost_index_array[i + 1], -1),
+                                 [](auto& a, auto& b)
+                                 { return a.first < b.first; });
+      assert(it != global_to_local.end()
+             and it->first == ghost_index_array[i + 1]);
+      local_col = it->second;
+    }
+    auto cit0 = std::next(_cols.begin(), _row_ptr[local_row]);
+    auto cit1 = std::next(_cols.begin(), _row_ptr[local_row + 1]);
+
+    // Find position of column index and insert data
+    auto cit = std::lower_bound(cit0, cit1, local_col);
+    assert(cit != cit1);
+    assert(*cit == local_col);
+    std::size_t d = std::distance(_cols.begin(), cit);
+    _unpack_pos.push_back(d);
+  }
+}
+//-----------------------------------------------------------------------------
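The constructor above resolves received global column indices through a sorted `(global, local)` table queried with `std::lower_bound`. The lookup in isolation, with hypothetical toy values:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

int main()
{
  // Sorted (global index, local index) pairs for ghost columns
  std::vector<std::pair<std::int64_t, std::int32_t>> g2l{{5, 10}, {42, 11}};

  std::int64_t global = 42;
  auto it = std::lower_bound(g2l.begin(), g2l.end(),
                             std::pair<std::int64_t, std::int32_t>(global, -1),
                             [](auto& a, auto& b) { return a.first < b.first; });
  assert(it != g2l.end() and it->first == global);
  std::int32_t local = it->second; // 11
  (void)local;
}
```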
+template <class U, class V, class W, class X>
+std::vector<typename MatrixCSR<U, V, W, X>::value_type>
+MatrixCSR<U, V, W, X>::to_dense() const
+{
+  const std::size_t nrows = num_all_rows();
+  const std::size_t ncols
+      = _index_maps[1]->size_local() + _index_maps[1]->num_ghosts();
+  std::vector<value_type> A(nrows * ncols);
+  for (std::size_t r = 0; r < nrows; ++r)
+    for (std::int64_t j = _row_ptr[r]; j < _row_ptr[r + 1]; ++j)
+      A[r * ncols + _cols[j]] = _data[j];
+
+  return A;
+}
+//-----------------------------------------------------------------------------
+template <class U, class V, class W, class X>
+void MatrixCSR<U, V, W, X>::finalize_begin()
+{
+  const std::int32_t local_size0 = _index_maps[0]->size_local();
+  const std::int32_t num_ghosts0 = _index_maps[0]->num_ghosts();
+
+  // For each ghost row, pack and send values to send to neighborhood
+  std::vector<int> insert_pos = _val_send_disp;
+  std::vector<value_type> ghost_value_data(_val_send_disp.back());
+  for (int i = 0; i < num_ghosts0; ++i)
+  {
+    const int rank = _ghost_row_to_rank[i];
+
+    // Get position in send buffer to place data to send to this
+    // neighbour
+    const std::int32_t val_pos = insert_pos[rank];
+    std::copy(std::next(_data.data(), _row_ptr[local_size0 + i]),
+              std::next(_data.data(), _row_ptr[local_size0 + i + 1]),
+              std::next(ghost_value_data.begin(), val_pos));
+    insert_pos[rank]
+        += _row_ptr[local_size0 + i + 1] - _row_ptr[local_size0 + i];
+  }
+
+  _ghost_value_data_in.resize(_val_recv_disp.back());
+
+  // Compute data sizes for send and receive from displacements
+  std::vector<int> val_send_count(_val_send_disp.size() - 1);
+  std::adjacent_difference(std::next(_val_send_disp.begin()),
+                           _val_send_disp.end(), val_send_count.begin());
+
+  std::vector<int> val_recv_count(_val_recv_disp.size() - 1);
+  std::adjacent_difference(std::next(_val_recv_disp.begin()),
+                           _val_recv_disp.end(), val_recv_count.begin());
+
+  int status = MPI_Ineighbor_alltoallv(
+      ghost_value_data.data(), val_send_count.data(), _val_send_disp.data(),
+      dolfinx::MPI::mpi_type<value_type>(), _ghost_value_data_in.data(),
+      val_recv_count.data(), _val_recv_disp.data(),
+      dolfinx::MPI::mpi_type<value_type>(), _comm.comm(), &_request);
+  assert(status == MPI_SUCCESS);
+}
+//-----------------------------------------------------------------------------
+template <class U, class V, class W, class X>
+void MatrixCSR<U, V, W, X>::finalize_end()
+{
+  int status = MPI_Wait(&_request, MPI_STATUS_IGNORE);
+  assert(status == MPI_SUCCESS);
+
+  // Add to local rows
+  assert(_ghost_value_data_in.size() == _unpack_pos.size());
+  for (std::size_t i = 0; i < _ghost_value_data_in.size(); ++i)
+    _data[_unpack_pos[i]] += _ghost_value_data_in[i];
+
+  // Set ghost row data to zero
+  const std::int32_t local_size0 = _index_maps[0]->size_local();
+  std::fill(std::next(_data.begin(), _row_ptr[local_size0]), _data.end(), 0);
+}
+//-----------------------------------------------------------------------------
+template <class U, class V, class W, class X>
+double MatrixCSR<U, V, W, X>::norm_squared() const
+{
+  const std::size_t num_owned_rows = _index_maps[0]->size_local();
+  assert(num_owned_rows < _row_ptr.size());
+  double norm_sq_local = std::accumulate(
+      _data.cbegin(), std::next(_data.cbegin(), _row_ptr[num_owned_rows]),
+      double(0), [](auto norm, value_type y) { return norm + std::norm(y); });
+  double norm_sq;
+  MPI_Allreduce(&norm_sq_local, &norm_sq, 1, MPI_DOUBLE, MPI_SUM, _comm.comm());
+  return norm_sq;
+}
+//-----------------------------------------------------------------------------
+
 } // namespace dolfinx::la
diff --git a/cpp/dolfinx/la/Vector.h b/cpp/dolfinx/la/Vector.h
index 20df13a2b0a..db1dd6a1b7d 100644
--- a/cpp/dolfinx/la/Vector.h
+++ b/cpp/dolfinx/la/Vector.h
@@ -14,30 +14,39 @@
 #include <limits>
 #include <memory>
 #include <numeric>
+#include <type_traits>
 #include <vector>
 
 namespace dolfinx::la
 {
 
 /// Distributed vector
-
-template <typename T, class Allocator = std::allocator<T>>
+///
+/// @tparam Scalar Scalar type
+/// @tparam Container Data container type
+///
+template <typename Scalar, typename Container = std::vector<Scalar>>
 class Vector
 {
+  static_assert(std::is_same_v<typename Container::value_type, Scalar>);
+
 public:
-  /// The value type
-  using value_type = T;
+  /// Scalar type
+  using value_type = Scalar;
+
+  /// Container type
+  using container_type = Container;
 
-  /// The allocator type
-  using allocator_type = Allocator;
+  static_assert(
+      std::is_same_v<value_type, typename container_type::value_type>,
+      "Scalar type and container value type must be the same.");
 
   /// Create a distributed vector
-  Vector(std::shared_ptr<const common::IndexMap> map, int bs,
-         const Allocator& alloc = Allocator())
+  /// @param map IndexMap for parallel distribution of the data
+  /// @param bs Block size
+  Vector(std::shared_ptr<const common::IndexMap> map, int bs)
       : _map(map), _scatterer(std::make_shared<common::Scatterer<>>(*_map, bs)),
-        _bs(bs), _buffer_local(_scatterer->local_buffer_size(), alloc),
-        _buffer_remote(_scatterer->remote_buffer_size(), alloc),
-        _x(bs * (map->size_local() + map->num_ghosts()), alloc)
+        _bs(bs), _buffer_local(_scatterer->local_buffer_size()),
+        _buffer_remote(_scatterer->remote_buffer_size()),
+        _x(bs * (map->size_local() + map->num_ghosts()))
   {
   }
 
@@ -67,14 +76,14 @@ class Vector
 
   /// Set all entries (including ghosts)
   /// @param[in] v The value to set all entries to (on calling rank)
-  void set(T v) { std::fill(_x.begin(), _x.end(), v); }
+  void set(value_type v) { std::fill(_x.begin(), _x.end(), v); }
 
   /// Begin scatter of local data from owner to ghosts on other ranks
   /// @note Collective MPI operation
   void scatter_fwd_begin()
   {
     const std::int32_t local_size = _bs * _map->size_local();
-    std::span<const T> x_local(_x.data(), local_size);
+    std::span<const value_type> x_local(_x.data(), local_size);
 
     auto pack = [](const auto& in, const auto& idx, auto& out)
     {
@@ -83,8 +92,8 @@ class Vector
     };
     pack(x_local, _scatterer->local_indices(), _buffer_local);
 
-    _scatterer->scatter_fwd_begin(std::span<const T>(_buffer_local),
-                                  std::span<T>(_buffer_remote),
+    _scatterer->scatter_fwd_begin(std::span<const value_type>(_buffer_local),
+                                  std::span<value_type>(_buffer_remote),
                                   std::span<MPI_Request>(_request));
   }
 
@@ -94,7 +103,7 @@ class Vector
   {
     const std::int32_t local_size = _bs * _map->size_local();
     const std::int32_t num_ghosts = _bs * _map->num_ghosts();
-    std::span<T> x_remote(_x.data() + local_size, num_ghosts);
+    std::span<value_type> x_remote(_x.data() + local_size, num_ghosts);
     _scatterer->scatter_fwd_end(std::span<MPI_Request>(_request));
 
     auto unpack = [](const auto& in, const auto& idx, auto& out, auto op)
@@ -121,7 +130,7 @@ class Vector
   {
     const std::int32_t local_size = _bs * _map->size_local();
     const std::int32_t num_ghosts = _bs * _map->num_ghosts();
-    std::span<const T> x_remote(_x.data() + local_size, num_ghosts);
+    std::span<const value_type> x_remote(_x.data() + local_size, num_ghosts);
 
     auto pack = [](const auto& in, const auto& idx, auto& out)
     {
@@ -130,8 +139,9 @@ class Vector
     };
     pack(x_remote, _scatterer->remote_indices(), _buffer_remote);
 
-    _scatterer->scatter_rev_begin(std::span<const T>(_buffer_remote),
-                                  std::span<T>(_buffer_local), _request);
+    _scatterer->scatter_rev_begin(std::span<const value_type>(_buffer_remote),
+                                  std::span<value_type>(_buffer_local),
+                                  _request);
   }
 
   /// End scatter of ghost data to owner. This process may receive data
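The `pack`/`unpack` helpers used by the scatters are plain gather/scatter loops over an index list. A self-contained equivalent of `pack`:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
  std::vector<double> in{10, 20, 30, 40};
  std::vector<std::size_t> idx{3, 1};
  std::vector<double> out(idx.size());
  // pack: gather the values at idx into a contiguous send buffer
  for (std::size_t i = 0; i < idx.size(); ++i)
    out[i] = in[idx[i]];
  assert(out[0] == 40 and out[1] == 20);
}
```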
@@ -144,7 +154,7 @@ class Vector
   void scatter_rev_end(BinaryOperation op)
   {
     const std::int32_t local_size = _bs * _map->size_local();
-    std::span<T> x_local(_x.data(), local_size);
+    std::span<value_type> x_local(_x.data(), local_size);
     _scatterer->scatter_rev_end(_request);
 
     auto unpack = [](const auto& in, const auto& idx, auto& out, auto op)
@@ -174,13 +184,13 @@ class Vector
   constexpr int bs() const { return _bs; }
 
   /// Get local part of the vector (const version)
-  std::span<const T> array() const { return std::span(_x); }
+  std::span<const value_type> array() const
+  {
+    return std::span<const value_type>(_x);
+  }
 
   /// Get local part of the vector
-  std::span<T> mutable_array() { return std::span(_x); }
-
-  /// Get the allocator associated with the container
-  constexpr allocator_type allocator() const { return _x.get_allocator(); }
+  std::span<value_type> mutable_array() { return std::span(_x); }
 
 private:
   // Map describing the data layout
@@ -196,10 +206,10 @@ class Vector
   std::vector<MPI_Request> _request = {MPI_REQUEST_NULL};
 
   // Buffers for ghost scatters
-  std::vector<T, Allocator> _buffer_local, _buffer_remote;
+  container_type _buffer_local, _buffer_remote;
 
   // Vector data
-  std::vector<T, Allocator> _x;
+  container_type _x;
 };
 
 /// Compute the inner product of two vectors. The two vectors must have
@@ -208,9 +218,10 @@ class Vector
 /// @param a A vector
 /// @param b A vector
 /// @return Returns `a^{H} b` (`a^{T} b` if `a` and `b` are real)
-template <typename T, class Allocator>
-T inner_product(const Vector<T, Allocator>& a, const Vector<T, Allocator>& b)
+template <class V>
+auto inner_product(const V& a, const V& b)
 {
+  using T = typename V::value_type;
   const std::int32_t local_size = a.bs() * a.map()->size_local();
   if (local_size != b.bs() * b.map()->size_local())
     throw std::runtime_error("Incompatible vector sizes");
@@ -238,9 +249,10 @@ T inner_product(const Vector<T, Allocator>& a, const Vector<T, Allocator>& b)
 
 /// Compute the squared L2 norm of vector
 /// @note Collective MPI operation
-template <typename T, class Allocator>
-auto squared_norm(const Vector<T, Allocator>& a)
+template <class V>
+auto squared_norm(const V& a)
 {
+  using T = typename V::value_type;
   T result = inner_product(a, a);
   return std::real(result);
 }
@@ -249,9 +261,10 @@ auto squared_norm(const Vector<T, Allocator>& a)
 /// @note Collective MPI operation
 /// @param a A vector
 /// @param type Norm type (supported types are \f$L^2\f$ and \f$L^\infty\f$)
-template <typename T, class Allocator>
-auto norm(const Vector<T, Allocator>& a, Norm type = Norm::l2)
+template <class V>
+auto norm(const V& a, Norm type = Norm::l2)
 {
+  using T = typename V::value_type;
   switch (type)
   {
   case Norm::l2:
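`inner_product` returns `a^H b`, i.e. the first argument is conjugated for complex scalars. A minimal illustration of that convention:

```cpp
#include <cassert>
#include <complex>
#include <numeric>
#include <vector>

int main()
{
  using C = std::complex<double>;
  std::vector<C> a{{0, 1}}, b{{0, 1}}; // a = b = i
  C s = std::transform_reduce(a.begin(), a.end(), b.begin(), C(0),
                              std::plus<>(),
                              [](C x, C y) { return std::conj(x) * y; });
  assert(s == C(1, 0)); // conj(i) * i = 1
}
```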
@@ -279,9 +292,10 @@ auto norm(const Vector<T, Allocator>& a, Norm type = Norm::l2)
 /// vectors must have identical parallel layouts. The vectors are
 /// modified in-place.
 /// @param[in] tol The tolerance used to detect a linear dependency
-template <typename T, class Allocator>
-void orthonormalize(std::span<Vector<T, Allocator>> basis, double tol = 1.0e-10)
+template <class V>
+void orthonormalize(std::span<V> basis, double tol = 1.0e-10)
 {
+  using T = typename V::value_type;
   // Loop over each vector in basis
   for (std::size_t i = 0; i < basis.size(); ++i)
   {
@@ -313,10 +327,10 @@ void orthonormalize(std::span<Vector<T, Allocator>> basis, double tol = 1.0e-10)
 /// @param[in] basis The set of vectors to check
 /// @param[in] tol The tolerance used to test for orthonormality
 /// @return True if basis is orthonormal, otherwise false
-template <typename T, class Allocator>
-bool is_orthonormal(std::span<Vector<T, Allocator>> basis,
-                    double tol = 1.0e-10)
+template <class V>
+bool is_orthonormal(std::span<V> basis, double tol = 1.0e-10)
 {
+  using T = typename V::value_type;
   for (std::size_t i = 0; i < basis.size(); i++)
   {
     for (std::size_t j = i; j < basis.size(); j++)
diff --git a/cpp/dolfinx/la/petsc.h b/cpp/dolfinx/la/petsc.h
index 7d8dd78f389..ab42a02c3a2 100644
--- a/cpp/dolfinx/la/petsc.h
+++ b/cpp/dolfinx/la/petsc.h
@@ -78,8 +78,8 @@ Vec create_vector_wrap(const common::IndexMap& map, int bs,
 /// Create a PETSc Vec that wraps the data in an array
 /// @param[in] x The vector to be wrapped
 /// @return A PETSc Vec object that shares the data in @p x
-template <typename T, class Allocator>
-Vec create_vector_wrap(const la::Vector<T, Allocator>& x)
+template <typename T, typename Container>
+Vec create_vector_wrap(const la::Vector<T, Container>& x)
 {
   assert(x.map());
   return create_vector_wrap(*x.map(), x.bs(), x.array());
diff --git a/cpp/test/matrix.cpp b/cpp/test/matrix.cpp
index a234cb0fe5b..97a884bb8b4 100644
--- a/cpp/test/matrix.cpp
+++ b/cpp/test/matrix.cpp
@@ -29,8 +29,8 @@ namespace
 /// @param[in, out] x Output vector
 template <typename T>
 void spmv_impl(std::span<const T> values,
-               std::span<const std::int32_t> row_begin,
-               std::span<const std::int32_t> row_end,
+               std::span<const std::int64_t> row_begin,
+               std::span<const std::int64_t> row_end,
                std::span<const std::int32_t> indices,
                std::span<const T> x, std::span<T> y)
 {
@@ -72,23 +72,22 @@ void spmv_impl(std::span<const T> values,
 /// @param[in, out] y Output vector
 template <typename T>
 void spmv(la::MatrixCSR<T>& A, la::Vector<T>& x, la::Vector<T>& y)
-
 {
   // start communication (update ghosts)
   x.scatter_fwd_begin();
 
   const std::int32_t nrowslocal = A.num_owned_rows();
-  std::span<const std::int32_t> row_ptr(A.row_ptr().data(), nrowslocal + 1);
+  std::span<const std::int64_t> row_ptr(A.row_ptr().data(), nrowslocal + 1);
   std::span<const std::int32_t> cols(A.cols().data(), row_ptr[nrowslocal]);
-  std::span<const std::int32_t> off_diag_offset(A.off_diag_offset().data(),
+  std::span<const std::int64_t> off_diag_offset(A.off_diag_offset().data(),
                                                 nrowslocal);
   std::span<const T> values(A.values().data(), row_ptr[nrowslocal]);
 
   std::span<const T> _x = x.array();
   std::span<T> _y = y.mutable_array();
 
-  std::span<const std::int32_t> row_begin(row_ptr.data(), nrowslocal);
-  std::span<const std::int32_t> row_end(row_ptr.data() + 1, nrowslocal);
+  std::span<const std::int64_t> row_begin(row_ptr.data(), nrowslocal);
+  std::span<const std::int64_t> row_end(row_ptr.data() + 1, nrowslocal);
 
   // First stage: spmv - diagonal
   // yi[0] += Ai[0] * xi[0]
@@ -131,8 +130,8 @@ la::MatrixCSR<T> create_operator(MPI_Comm comm)
 [[maybe_unused]] void test_matrix_norm()
 {
-  la::MatrixCSR<float> A0 = create_operator<float>(MPI_COMM_SELF);
-  la::MatrixCSR<float> A1 = create_operator<float>(MPI_COMM_WORLD);
+  la::MatrixCSR A0 = create_operator<float>(MPI_COMM_SELF);
+  la::MatrixCSR A1 = create_operator<float>(MPI_COMM_WORLD);
   CHECK(A1.norm_squared() == Approx(A0.norm_squared()).epsilon(1e-8));
 }
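The test's `spmv`/`spmv_impl` pair is an ordinary CSR kernel applied twice: once to owned columns while the forward scatter is in flight, and once to ghost columns after it completes. The core kernel, self-contained:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
  // A = [[1, 0, 2], [0, 3, 0]] in CSR form
  std::vector<double> values{1, 2, 3};
  std::vector<std::int32_t> cols{0, 2, 1};
  std::vector<std::int64_t> row_ptr{0, 2, 3};
  std::vector<double> x{1, 1, 1}, y{0, 0};

  // y += A x over the half-open column range [row_ptr[r], row_ptr[r+1])
  for (std::size_t r = 0; r < y.size(); ++r)
    for (std::int64_t j = row_ptr[r]; j < row_ptr[r + 1]; ++j)
      y[r] += values[j] * x[cols[j]];

  assert(y[0] == 3 and y[1] == 3);
}
```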
diff --git a/python/dolfinx/wrappers/la.cpp b/python/dolfinx/wrappers/la.cpp
index 408e00aaf71..a44ba365895 100644
--- a/python/dolfinx/wrappers/la.cpp
+++ b/python/dolfinx/wrappers/la.cpp
@@ -120,18 +120,16 @@ void declare_objects(py::module& m, const std::string& type)
       .def_property_readonly("indices",
                              [](dolfinx::la::MatrixCSR<T>& self)
                              {
-                               std::span<const std::int32_t> array
-                                   = self.cols();
-                               return py::array_t<std::int32_t>(
-                                   array.size(), array.data(), py::cast(self));
+                               auto& array = self.cols();
+                               return py::array_t<std::int32_t>(
+                                   array.size(), array.data(), py::cast(self));
                              })
       .def_property_readonly("indptr",
                              [](dolfinx::la::MatrixCSR<T>& self)
                              {
-                               std::span<const std::int32_t> array
-                                   = self.row_ptr();
-                               return py::array_t<std::int32_t>(
-                                   array.size(), array.data(), py::cast(self));
+                               auto& array = self.row_ptr();
+                               return py::array_t<std::int64_t>(
+                                   array.size(), array.data(), py::cast(self));
                              })
       .def("finalize_begin", &dolfinx::la::MatrixCSR<T>::finalize_begin)
       .def("finalize_end", &dolfinx::la::MatrixCSR<T>::finalize_end);
@@ -146,7 +144,7 @@ void petsc_module(py::module& m)
         py::arg("bs"), "Create a ghosted PETSc Vec for index map.");
   m.def(
       "create_vector_wrap",
-      [](dolfinx::la::Vector<PetscScalar, std::allocator<PetscScalar>>& x)
+      [](dolfinx::la::Vector<PetscScalar>& x)
       { return dolfinx::la::petsc::create_vector_wrap(x); },
      py::return_value_policy::take_ownership, py::arg("x"),
      "Create a ghosted PETSc Vec that wraps a DOLFINx Vector");