diff --git a/cpp/demo/poisson_matrix_free/main.cpp b/cpp/demo/poisson_matrix_free/main.cpp
index 4ecbf9b3122..0c2512f429a 100644
--- a/cpp/demo/poisson_matrix_free/main.cpp
+++ b/cpp/demo/poisson_matrix_free/main.cpp
@@ -41,9 +41,7 @@ namespace linalg
 /// @param[in] alpha
 /// @param[in] x
 /// @param[in] y
-template <typename U>
-void axpy(la::Vector<U>& r, U alpha, const la::Vector<U>& x,
-          const la::Vector<U>& y)
+void axpy(auto& r, auto alpha, const auto& x, const auto& y)
 {
   std::transform(x.array().begin(), x.array().end(), y.array().begin(),
                  r.mutable_array().begin(),
@@ -61,19 +59,21 @@ void axpy(la::Vector<U>& r, U alpha, const la::Vector<U>& x,
 /// @return The number of iterations
 /// @pre It is required that the ghost values of `x` and `b` have been
 /// updated before this function is called
-template <typename U, typename ApplyFunction>
-int cg(la::Vector<U>& x, const la::Vector<U>& b, ApplyFunction&& action,
-       int kmax = 50, double rtol = 1e-8)
+template <typename ApplyFunction>
+int cg(auto& x, auto& b, ApplyFunction&& action, int kmax = 50,
+       double rtol = 1e-8)
 {
+  using T = typename std::decay_t<decltype(x)>::value_type;
+
   // Create working vectors
-  la::Vector<U> r(b), y(b);
+  la::Vector<T> r(b), y(b);
 
   // Compute initial residual r0 = b - Ax0
   action(x, y);
-  axpy(r, U(-1), y, b);
+  axpy(r, T(-1), y, b);
 
   // Create p work vector
-  la::Vector<U> p(r);
+  la::Vector<T> p(r);
 
   // Iterations of CG
   auto rnorm0 = la::squared_norm(r);
@@ -88,7 +88,7 @@ int cg(la::Vector<U>& x, const la::Vector<U>& b, ApplyFunction&& action,
     action(p, y);
 
     // Compute alpha = r.r/p.y
-    const U alpha = rnorm / la::inner_product(p, y);
+    const T alpha = rnorm / la::inner_product(p, y);
 
     // Update x (x <- x + alpha*p)
     axpy(x, alpha, p, x);
@@ -98,7 +98,7 @@ int cg(la::Vector<U>& x, const la::Vector<U>& b, ApplyFunction&& action,
 
     // Update residual norm
     const auto rnorm_new = la::squared_norm(r);
-    const U beta = rnorm_new / rnorm;
+    const T beta = rnorm_new / rnorm;
     rnorm = rnorm_new;
 
     if (rnorm / rnorm0 < rtol2)
@@ -182,8 +182,7 @@ int main(int argc, char* argv[])
     const std::vector<T> constants = fem::pack_constants(*M);
 
     // Create function for computing the action of A on x (y = Ax)
-    std::function<void(la::Vector<T>&, la::Vector<T>&)> action
-        = [&M, &ui, &bc, &coeff, &constants](la::Vector<T>& x, la::Vector<T>& y)
+    auto action = [&M, &ui, &bc, &coeff, &constants](auto& x, auto& y)
     {
       // Zero y
       y.set(0.0);
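A note on the new style in this demo: `axpy` and the `action` lambda now use C++20 abbreviated function templates, where each `auto` parameter introduces an invented template parameter, so the functions are still templates. A minimal self-contained sketch (plain `std::vector` stand-ins, not `la::Vector`):

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// r = alpha*x + y; each `auto` parameter is a separate template parameter
void axpy(auto& r, auto alpha, const auto& x, const auto& y)
{
  std::transform(x.begin(), x.end(), y.begin(), r.begin(),
                 [alpha](auto a, auto b) { return alpha * a + b; });
}

int main()
{
  std::vector<double> x{1, 2}, y{3, 4}, r(2);
  axpy(r, 2.0, x, y);
  assert(r[0] == 5 and r[1] == 8);
}
```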
diff --git a/cpp/dolfinx/fem/utils.h b/cpp/dolfinx/fem/utils.h
index 1739166dc5e..1f4476190b7 100644
--- a/cpp/dolfinx/fem/utils.h
+++ b/cpp/dolfinx/fem/utils.h
@@ -814,12 +814,13 @@ void pack(std::span<T> coeffs, std::int32_t cell, int bs, std::span<const T> v,
 /// @private
 /// @brief Concept for a function that returns a cell index
 template <typename F>
-concept FetchCells = requires(F&& f, std::span<const std::int32_t> v) {
-  requires std::invocable<F, std::span<const std::int32_t>>;
-  {
-    f(v)
-  } -> std::convertible_to<std::int32_t>;
-};
+concept FetchCells
+    = requires(F&& f, std::span<const std::int32_t> v) {
+        requires std::invocable<F, std::span<const std::int32_t>>;
+        {
+          f(v)
+        } -> std::convertible_to<std::int32_t>;
+      };
 
 /// @brief Pack a single coefficient for a set of active entities.
 ///
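For readers new to C++20 concepts: the `FetchCells` requires-expression accepts any callable that takes a span of cell indices and returns something convertible to `std::int32_t`. A compilable sketch of the same constraint:

```cpp
#include <concepts>
#include <cstdint>
#include <span>

template <typename F>
concept FetchCells = requires(F&& f, std::span<const std::int32_t> v) {
  requires std::invocable<F, std::span<const std::int32_t>>;
  { f(v) } -> std::convertible_to<std::int32_t>;
};

// A lambda picking the first entry satisfies the concept
auto fetch = [](std::span<const std::int32_t> cells) { return cells.front(); };
static_assert(FetchCells<decltype(fetch)>);

int main() {}
```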
diff --git a/cpp/dolfinx/graph/AdjacencyList.h b/cpp/dolfinx/graph/AdjacencyList.h
index a79055c5801..30a47d78fdf 100644
--- a/cpp/dolfinx/graph/AdjacencyList.h
+++ b/cpp/dolfinx/graph/AdjacencyList.h
@@ -172,10 +172,10 @@ AdjacencyList(T, U) -> AdjacencyList<typename T::value_type>;
 /// @return An adjacency list
 template <typename U>
   requires requires {
-  typename std::decay_t<U>::value_type;
-  requires std::convertible_to<
-      U, std::vector<typename std::decay_t<U>::value_type>>;
-}
+    typename std::decay_t<U>::value_type;
+    requires std::convertible_to<
+        U, std::vector<typename std::decay_t<U>::value_type>>;
+  }
 AdjacencyList<typename std::decay_t<U>::value_type>
 regular_adjacency_list(U&& data, int degree)
 {
diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h
index a324e02099c..d88268837a0 100644
--- a/cpp/dolfinx/la/MatrixCSR.h
+++ b/cpp/dolfinx/la/MatrixCSR.h
@@ -18,10 +18,9 @@
 namespace dolfinx::la
 {
-
 namespace impl
 {
-/// @brief Set data in a CSR matrix
+/// @brief Set data in a CSR matrix.
 ///
 /// @param[out] data The CSR matrix data
 /// @param[in] cols The CSR column indices
 ///
@@ -33,38 +32,9 @@ namespace impl
 /// @param[in] local_size The maximum row index that can be set. Used
 /// when debugging is on to check that rows beyond a permitted range
 /// are not being set.
-template <typename U, typename V, typename W, typename X>
-void set_csr(U&& data, const V& cols, const V& row_ptr, const W& x,
-             const X& xrows, const X& xcols,
-             [[maybe_unused]] typename X::value_type local_size)
-{
-  assert(x.size() == xrows.size() * xcols.size());
-  for (std::size_t r = 0; r < xrows.size(); ++r)
-  {
-    // Row index and current data row
-    auto row = xrows[r];
-    using T = typename W::value_type;
-    const T* xr = x.data() + r * xcols.size();
-
-#ifndef NDEBUG
-    if (row >= local_size)
-      throw std::runtime_error("Local row out of range");
-#endif
-
-    // Columns indices for row
-    auto cit0 = std::next(cols.begin(), row_ptr[row]);
-    auto cit1 = std::next(cols.begin(), row_ptr[row + 1]);
-    for (std::size_t c = 0; c < xcols.size(); ++c)
-    {
-      // Find position of column index
-      auto it = std::lower_bound(cit0, cit1, xcols[c]);
-      assert(it != cit1);
-      std::size_t d = std::distance(cols.begin(), it);
-      assert(d < data.size());
-      data[d] = xr[c];
-    }
-  }
-}
+template <typename U, typename V, typename W, typename X, typename Y>
+void set_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
+             const Y& xrows, const Y& xcols, typename Y::value_type local_size);
 
 /// @brief Add data to a CSR matrix
 ///
 /// @param[out] data The CSR matrix data
 /// @param[in] cols The CSR column indices
 /// @param[in] row_ptr The pointer to the ith row in the CSR data
 /// @param[in] x The `m` by `n` dense block of values (row-major) to add
 /// to the matrix
 /// @param[in] xrows The row indices of `x`
 /// @param[in] xcols The column indices of `x`
-template <typename U, typename V, typename W, typename X>
-void add_csr(U&& data, const V& cols, const V& row_ptr, const W& x,
-             const X& xrows, const X& xcols)
-{
-  assert(x.size() == xrows.size() * xcols.size());
-  for (std::size_t r = 0; r < xrows.size(); ++r)
-  {
-    // Row index and current data row
-    auto row = xrows[r];
-    using T = typename W::value_type;
-    const T* xr = x.data() + r * xcols.size();
-
-#ifndef NDEBUG
-    if (row >= (int)row_ptr.size())
-      throw std::runtime_error("Local row out of range");
-#endif
-
-    // Columns indices for row
-    auto cit0 = std::next(cols.begin(), row_ptr[row]);
-    auto cit1 = std::next(cols.begin(), row_ptr[row + 1]);
-    for (std::size_t c = 0; c < xcols.size(); ++c)
-    {
-      // Find position of column index
-      auto it = std::lower_bound(cit0, cit1, xcols[c]);
-      assert(it != cit1);
-      std::size_t d = std::distance(cols.begin(), it);
-      assert(d < data.size());
-      data[d] += xr[c];
-    }
-  }
-}
+template <typename U, typename V, typename W, typename X, typename Y>
+void add_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
+             const Y& xrows, const Y& xcols);
 } // namespace impl
 
 /// Distributed sparse matrix
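The declarations above keep the insertion scheme unchanged: each target column is located by binary search in the sorted column indices of its row. A self-contained model of that lookup, with hypothetical toy data rather than the DOLFINx API:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
  // 2x3 matrix with nonzeros at (0,0), (0,2), (1,1)
  std::vector<double> data{0, 0, 0};
  std::vector<std::int32_t> cols{0, 2, 1};
  std::vector<std::int64_t> row_ptr{0, 2, 3};

  auto set_entry = [&](std::int32_t r, std::int32_t c, double v)
  {
    auto it0 = std::next(cols.begin(), row_ptr[r]);
    auto it1 = std::next(cols.begin(), row_ptr[r + 1]);
    auto it = std::lower_bound(it0, it1, c); // cols are sorted within a row
    assert(it != it1 and *it == c);          // entry must be in the pattern
    data[std::distance(cols.begin(), it)] = v;
  };

  set_entry(0, 2, 3.5);
  assert(data[1] == 3.5);
}
```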
@@ -113,232 +55,83 @@ void add_csr(U&& data, const V& cols, const V& row_ptr, const W& x,
 /// The matrix storage format is compressed sparse row. The matrix is
 /// partitioned row-wise across MPI ranks.
 ///
-/// @tparam T The data type for the matrix
-/// @tparam Allocator The memory allocator type for the data storage
-///
-/// @note Highly "experimental" storage of a matrix in CSR format which
-/// can be assembled into using the usual dolfinx assembly routines
+/// @warning Highly experimental storage of a matrix in CSR format which
+/// can be assembled into using the usual DOLFINx assembly routines.
 /// Matrix internal data can be accessed for interfacing with other
 /// code.
-///
 /// @todo Handle block sizes
-template <typename T, class Allocator = std::allocator<T>>
+///
+/// @tparam Scalar Scalar type of matrix entries
+/// @tparam Container Sequence container type to store matrix entries
+/// @tparam ColContainer Column index container type
+/// @tparam RowPtrContainer Row pointer container type
+template <class Scalar, class Container = std::vector<Scalar>,
+          class ColContainer = std::vector<std::int32_t>,
+          class RowPtrContainer = std::vector<std::int64_t>>
 class MatrixCSR
 {
+  static_assert(std::is_same_v<typename Container::value_type, Scalar>);
+
 public:
-  /// The value type
-  using value_type = T;
+  /// Scalar type
+  using value_type = Scalar;
 
-  /// The allocator type
-  using allocator_type = Allocator;
+  /// Matrix entries container type
+  using container_type = Container;
 
-  /// Insertion functor for setting values in matrix. It is typically
-  /// used in finite element assembly functions.
+  /// Column index container type
+  using column_container_type = ColContainer;
+
+  /// Row pointer container type
+  using rowptr_container_type = RowPtrContainer;
+
+  static_assert(
+      std::is_same_v<value_type, typename container_type::value_type>,
+      "Scalar type and container value type must be the same.");
+
+  /// @brief Insertion functor for setting values in matrix. It is
+  /// typically used in finite element assembly functions.
   /// @return Function for inserting values into `A`
   /// @todo clarify setting on non-owned entries
   auto mat_set_values()
   {
-    return [&](const std::span<const std::int32_t>& rows,
-               const std::span<const std::int32_t>& cols,
-               const std::span<const T>& data) -> int
+    return [&](std::span<const std::int32_t> rows,
+               std::span<const std::int32_t> cols,
+               std::span<const value_type> data) -> int
     {
       this->set(data, rows, cols);
       return 0;
     };
   }
 
-  /// Insertion functor for accumulating values in matrix. It is
+  /// @brief Insertion functor for accumulating values in matrix. It is
   /// typically used in finite element assembly functions.
   /// @return Function for inserting values into `A`
   auto mat_add_values()
   {
-    return [&](const std::span<const std::int32_t>& rows,
-               const std::span<const std::int32_t>& cols,
-               const std::span<const T>& data) -> int
+    return [&](std::span<const std::int32_t> rows,
+               std::span<const std::int32_t> cols,
+               std::span<const value_type> data) -> int
     {
       this->add(data, rows, cols);
       return 0;
     };
   }
 
-  /// Create a distributed matrix
-  /// @param[in] p The sparsty pattern the describes the parallel
-  /// distribution and the non-zero structure
-  /// @param[in] alloc The memory allocator for the data storafe
-  MatrixCSR(const SparsityPattern& p, const Allocator& alloc = Allocator())
-      : _index_maps({p.index_map(0),
-                     std::make_shared<common::IndexMap>(p.column_index_map())}),
-        _bs({p.block_size(0), p.block_size(1)}),
-        _data(p.num_nonzeros(), 0, alloc),
-        _cols(p.graph().array().begin(), p.graph().array().end()),
-        _row_ptr(p.graph().offsets().begin(), p.graph().offsets().end()),
-        _comm(MPI_COMM_NULL)
-  {
-    // TODO: handle block sizes
-    if (_bs[0] > 1 or _bs[1] > 1)
-      throw std::runtime_error("Block size not yet supported");
-
-    // Compute off-diagonal offset for each row
-    std::span<const std::int32_t> num_diag_nnz = p.off_diagonal_offset();
-    _off_diagonal_offset.reserve(num_diag_nnz.size());
-    std::transform(num_diag_nnz.begin(), num_diag_nnz.end(), _row_ptr.begin(),
-                   std::back_inserter(_off_diagonal_offset), std::plus{});
-
-    // Some short-hand
-    const std::array<std::int32_t, 2> local_size
-        = {_index_maps[0]->size_local(), _index_maps[1]->size_local()};
-    const std::array<std::array<std::int64_t, 2>, 2> local_range
-        = {_index_maps[0]->local_range(), _index_maps[1]->local_range()};
-    const std::vector<std::int64_t>& ghosts1 = _index_maps[1]->ghosts();
-
-    const std::vector<std::int64_t>& ghosts0 = _index_maps[0]->ghosts();
-    const std::vector<int>& src_ranks = _index_maps[0]->src();
-    const std::vector<int>& dest_ranks = _index_maps[0]->dest();
-
-    // Create neighbourhood communicator (owner <- ghost)
-    MPI_Comm comm;
-    MPI_Dist_graph_create_adjacent(_index_maps[0]->comm(), dest_ranks.size(),
-                                   dest_ranks.data(), MPI_UNWEIGHTED,
-                                   src_ranks.size(), src_ranks.data(),
-                                   MPI_UNWEIGHTED, MPI_INFO_NULL, false, &comm);
-    _comm = dolfinx::MPI::Comm(comm, false);
-
-    // Build map from ghost row index position to owning (neighborhood)
-    // rank
-    _ghost_row_to_rank.reserve(_index_maps[0]->owners().size());
-    for (int r : _index_maps[0]->owners())
-    {
-      auto it = std::lower_bound(src_ranks.begin(), src_ranks.end(), r);
-      assert(it != src_ranks.end() and *it == r);
-      int pos = std::distance(src_ranks.begin(), it);
-      _ghost_row_to_rank.push_back(pos);
-    }
-
-    // Compute size of data to send to each neighbor
-    std::vector<int> data_per_proc(src_ranks.size(), 0);
-    for (std::size_t i = 0; i < _ghost_row_to_rank.size(); ++i)
-    {
-      assert(_ghost_row_to_rank[i] < data_per_proc.size());
-      std::size_t pos = local_size[0] + i;
-      data_per_proc[_ghost_row_to_rank[i]] += _row_ptr[pos + 1] - _row_ptr[pos];
-    }
-
-    // Compute send displacements
-    _val_send_disp.resize(src_ranks.size() + 1, 0);
-    std::partial_sum(data_per_proc.begin(), data_per_proc.end(),
-                     std::next(_val_send_disp.begin()));
-
-    // For each ghost row, pack and send indices to neighborhood
-    std::vector<std::int64_t> ghost_index_data(2 * _val_send_disp.back());
-    {
-      std::vector<int> insert_pos = _val_send_disp;
-      for (std::size_t i = 0; i < _ghost_row_to_rank.size(); ++i)
-      {
-        const int rank = _ghost_row_to_rank[i];
-        int row_id = local_size[0] + i;
-        for (int j = _row_ptr[row_id]; j < _row_ptr[row_id + 1]; ++j)
-        {
-          // Get position in send buffer
-          const std::int32_t idx_pos = 2 * insert_pos[rank];
-
-          // Pack send data (row, col) as global indices
-          ghost_index_data[idx_pos] = ghosts0[i];
-          if (std::int32_t col_local = _cols[j]; col_local < local_size[1])
-            ghost_index_data[idx_pos + 1] = col_local + local_range[1][0];
-          else
-            ghost_index_data[idx_pos + 1] = ghosts1[col_local - local_size[1]];
-
-          insert_pos[rank] += 1;
-        }
-      }
-    }
-
-    // Communicate data with neighborhood
-    std::vector<std::int64_t> ghost_index_array;
-    std::vector<int> recv_disp;
-    {
-      std::vector<int> send_sizes;
-      std::transform(data_per_proc.begin(), data_per_proc.end(),
-                     std::back_inserter(send_sizes),
-                     [](auto x) { return 2 * x; });
-
-      std::vector<int> recv_sizes(dest_ranks.size());
-      send_sizes.reserve(1);
-      recv_sizes.reserve(1);
-      MPI_Neighbor_alltoall(send_sizes.data(), 1, MPI_INT, recv_sizes.data(), 1,
-                            MPI_INT, _comm.comm());
-
-      // Build send/recv displacement
-      std::vector<int> send_disp = {0};
-      std::partial_sum(send_sizes.begin(), send_sizes.end(),
-                       std::back_inserter(send_disp));
-      recv_disp = {0};
-      std::partial_sum(recv_sizes.begin(), recv_sizes.end(),
-                       std::back_inserter(recv_disp));
-
-      ghost_index_array.resize(recv_disp.back());
-      MPI_Neighbor_alltoallv(ghost_index_data.data(), send_sizes.data(),
-                             send_disp.data(), MPI_INT64_T,
-                             ghost_index_array.data(), recv_sizes.data(),
-                             recv_disp.data(), MPI_INT64_T, _comm.comm());
-    }
-
-    // Store receive displacements for future use, when transferring
-    // data values
-    _val_recv_disp.resize(recv_disp.size());
-    std::transform(recv_disp.begin(), recv_disp.end(), _val_recv_disp.begin(),
-                   [](int d) { return d / 2; });
-
-    // Global-to-local map for ghost columns
-    std::vector<std::pair<std::int64_t, std::int32_t>> global_to_local;
-    global_to_local.reserve(ghosts1.size());
-    std::int32_t local_i = local_size[1];
-    for (std::int64_t idx : ghosts1)
-      global_to_local.push_back({idx, global_to_local.size() + local_size[1]});
-    std::sort(global_to_local.begin(), global_to_local.end());
-
-    // Compute location in which data for each index should be stored
-    // when received
-    for (std::size_t i = 0; i < ghost_index_array.size(); i += 2)
-    {
-      // Row must be on this process
-      const std::int32_t local_row = ghost_index_array[i] - local_range[0][0];
-      assert(local_row >= 0 and local_row < local_size[0]);
-
-      // Column may be owned or unowned
-      std::int32_t local_col = ghost_index_array[i + 1] - local_range[1][0];
-      if (local_col < 0 or local_col >= local_size[1])
-      {
-        auto it = std::lower_bound(
-            global_to_local.begin(), global_to_local.end(),
-            std::pair(ghost_index_array[i + 1], -1),
-            [](auto& a, auto& b) { return a.first < b.first; });
-        assert(it != global_to_local.end()
-               and it->first == ghost_index_array[i + 1]);
-        local_col = it->second;
-      }
-      auto cit0 = std::next(_cols.begin(), _row_ptr[local_row]);
-      auto cit1 = std::next(_cols.begin(), _row_ptr[local_row + 1]);
-
-      // Find position of column index and insert data
-      auto cit = std::lower_bound(cit0, cit1, local_col);
-      assert(cit != cit1);
-      assert(*cit == local_col);
-      std::size_t d = std::distance(_cols.begin(), cit);
-      _unpack_pos.push_back(d);
-    }
-  }
+  /// @brief Create a distributed matrix.
+  /// @param[in] p The sparsity pattern that describes the parallel
+  /// distribution and the non-zero structure.
+  MatrixCSR(const SparsityPattern& p);
 
   /// Move constructor
   /// @todo Check handling of MPI_Request
   MatrixCSR(MatrixCSR&& A) = default;
 
-  /// Set all non-zero local entries to a value
-  /// including entries in ghost rows
+  /// @brief Set all non-zero local entries to a value including entries
+  /// in ghost rows.
   /// @param[in] x The value to set non-zero matrix entries to
-  void set(T x) { std::fill(_data.begin(), _data.end(), x); }
+  void set(value_type x) { std::fill(_data.begin(), _data.end(), x); }
 
-  /// Set values in the matrix
+  /// @brief Set values in the matrix.
   /// @note Only entries included in the sparsity pattern used to
   /// initialize the matrix can be set
   /// @note All indices are local to the calling MPI rank and entries
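Assuming the defaults shown above (`std::vector` storage, `std::int32_t` column indices, `std::int64_t` row pointers), `MatrixCSR<double>` is shorthand for the fully spelled-out instantiation. A stand-in sketch (not the real class) that type-checks the equivalence:

```cpp
#include <cstdint>
#include <type_traits>
#include <vector>

template <class Scalar, class Container = std::vector<Scalar>,
          class ColContainer = std::vector<std::int32_t>,
          class RowPtrContainer = std::vector<std::int64_t>>
class MatrixCSR
{
  // Same guard as in the real class: container must hold the scalar type
  static_assert(std::is_same_v<typename Container::value_type, Scalar>);
};

static_assert(
    std::is_same_v<MatrixCSR<double>,
                   MatrixCSR<double, std::vector<double>,
                             std::vector<std::int32_t>,
                             std::vector<std::int64_t>>>);

int main() {}
```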
@@ -350,15 +143,14 @@ class MatrixCSR
   /// set in the matrix
   /// @param[in] rows The row indices of `x`
   /// @param[in] cols The column indices of `x`
-  void set(const std::span<const T>& x,
-           const std::span<const std::int32_t>& rows,
-           const std::span<const std::int32_t>& cols)
+  void set(std::span<const value_type> x, std::span<const std::int32_t> rows,
+           std::span<const std::int32_t> cols)
   {
     impl::set_csr(_data, _cols, _row_ptr, x, rows, cols,
                   _index_maps[0]->size_local());
   }
 
-  /// Accumulate values in the matrix
+  /// @brief Accumulate values in the matrix
   /// @note Only entries included in the sparsity pattern used to
   /// initialize the matrix can be accumulated in to
   /// @note All indices are local to the calling MPI rank and entries
@@ -370,9 +162,8 @@ class MatrixCSR
   /// add to the matrix
   /// @param[in] rows The row indices of `x`
   /// @param[in] cols The column indices of `x`
-  void add(const std::span<const T>& x,
-           const std::span<const std::int32_t>& rows,
-           const std::span<const std::int32_t>& cols)
+  void add(std::span<const value_type> x, std::span<const std::int32_t> rows,
+           std::span<const std::int32_t> cols)
   {
     impl::add_csr(_data, _cols, _row_ptr, x, rows, cols);
   }
@@ -390,115 +181,41 @@ class MatrixCSR
   /// manually by using num_owned_rows() if required.
   /// @return Dense copy of the part of the matrix on the calling rank.
   /// Storage is row-major.
-  std::vector<T> to_dense() const
-  {
-    const std::size_t nrows = num_all_rows();
-    const std::size_t ncols
-        = _index_maps[1]->size_local() + _index_maps[1]->num_ghosts();
-    std::vector<T> A(nrows * ncols);
-    for (std::size_t r = 0; r < nrows; ++r)
-      for (std::int32_t j = _row_ptr[r]; j < _row_ptr[r + 1]; ++j)
-        A[r * ncols + _cols[j]] = _data[j];
-
-    return A;
-  }
+  std::vector<value_type> to_dense() const;
 
-  /// Transfer ghost row data to the owning ranks
-  /// accumulating received values on the owned rows, and zeroing any existing
-  /// data in ghost rows.
+  /// @brief Transfer ghost row data to the owning ranks accumulating
+  /// received values on the owned rows, and zeroing any existing data
+  /// in ghost rows.
   void finalize()
   {
     finalize_begin();
     finalize_end();
   }
 
-  /// Begin transfer of ghost row data to owning ranks, where it will be
-  /// accumulated into existing owned rows.
+  /// @brief Begin transfer of ghost row data to owning ranks, where it
+  /// will be accumulated into existing owned rows.
   /// @note Calls to this function must be followed by
   /// MatrixCSR::finalize_end(). Between the two calls matrix values
   /// must not be changed.
   /// @note This function does not change the matrix data. Data update only
   /// occurs with `finalize_end()`.
-  void finalize_begin()
-  {
-    const std::int32_t local_size0 = _index_maps[0]->size_local();
-    const std::int32_t num_ghosts0 = _index_maps[0]->num_ghosts();
-
-    // For each ghost row, pack and send values to send to neighborhood
-    std::vector<int> insert_pos = _val_send_disp;
-    std::vector<T> ghost_value_data(_val_send_disp.back());
-    for (int i = 0; i < num_ghosts0; ++i)
-    {
-      const int rank = _ghost_row_to_rank[i];
-
-      // Get position in send buffer to place data to send to this
-      // neighbour
-      const std::int32_t val_pos = insert_pos[rank];
-      std::copy(std::next(_data.data(), _row_ptr[local_size0 + i]),
-                std::next(_data.data(), _row_ptr[local_size0 + i + 1]),
-                std::next(ghost_value_data.begin(), val_pos));
-      insert_pos[rank]
-          += _row_ptr[local_size0 + i + 1] - _row_ptr[local_size0 + i];
-    }
-
-    _ghost_value_data_in.resize(_val_recv_disp.back());
-
-    // Compute data sizes for send and receive from displacements
-    std::vector<int> val_send_count(_val_send_disp.size() - 1);
-    std::adjacent_difference(std::next(_val_send_disp.begin()),
-                             _val_send_disp.end(), val_send_count.begin());
-
-    std::vector<int> val_recv_count(_val_recv_disp.size() - 1);
-    std::adjacent_difference(std::next(_val_recv_disp.begin()),
-                             _val_recv_disp.end(), val_recv_count.begin());
-
-    int status = MPI_Ineighbor_alltoallv(
-        ghost_value_data.data(), val_send_count.data(), _val_send_disp.data(),
-        dolfinx::MPI::mpi_type<T>(), _ghost_value_data_in.data(),
-        val_recv_count.data(), _val_recv_disp.data(),
-        dolfinx::MPI::mpi_type<T>(), _comm.comm(), &_request);
-    assert(status == MPI_SUCCESS);
-  }
+  void finalize_begin();
 
-  /// End transfer of ghost row data to owning ranks
+  /// @brief End transfer of ghost row data to owning ranks.
   /// @note Must be preceded by MatrixCSR::finalize_begin()
   /// @note Matrix data received from other processes will be
   /// accumulated into locally owned rows, and ghost rows will be
   /// zeroed.
-  void finalize_end()
-  {
-    int status = MPI_Wait(&_request, MPI_STATUS_IGNORE);
-    assert(status == MPI_SUCCESS);
-
-    // Add to local rows
-    assert(_ghost_value_data_in.size() == _unpack_pos.size());
-    for (std::size_t i = 0; i < _ghost_value_data_in.size(); ++i)
-      _data[_unpack_pos[i]] += _ghost_value_data_in[i];
-
-    // Set ghost row data to zero
-    const std::int32_t local_size0 = _index_maps[0]->size_local();
-    std::fill(std::next(_data.begin(), _row_ptr[local_size0]), _data.end(), 0);
-  }
+  void finalize_end();
 
   /// Compute the Frobenius norm squared
-  double norm_squared() const
-  {
-    const std::size_t num_owned_rows = _index_maps[0]->size_local();
-    assert(num_owned_rows < _row_ptr.size());
-
-    const double norm_sq_local = std::accumulate(
-        _data.cbegin(), std::next(_data.cbegin(), _row_ptr[num_owned_rows]),
-        double(0), [](auto norm, T y) { return norm + std::norm(y); });
-    double norm_sq;
-    MPI_Allreduce(&norm_sq_local, &norm_sq, 1, MPI_DOUBLE, MPI_SUM,
-                  _comm.comm());
-    return norm_sq;
-  }
+  double norm_squared() const;
 
-  /// Index maps for the row and column space. The row IndexMap contains
-  /// ghost entries for rows which may be inserted into and the column
-  /// IndexMap contains all local and ghost columns that may exist in
-  /// the owned rows.
+  /// @brief Index maps for the row and column space.
+  ///
+  /// The row IndexMap contains ghost entries for rows which may be
+  /// inserted into and the column IndexMap contains all local and ghost
+  /// columns that may exist in the owned rows.
 
  ///
  /// @return Row (0) and column (1) index maps
  const std::array<std::shared_ptr<const common::IndexMap>, 2>&
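The begin/end split exists so the non-blocking ghost-row reduction can overlap local work. A toy stand-in showing the intended calling pattern (not the real class or its MPI internals):

```cpp
#include <iostream>

struct Matrix // stand-in for la::MatrixCSR
{
  void finalize_begin() { std::cout << "start ghost-row send\n"; }
  void finalize_end() { std::cout << "wait, accumulate, zero ghost rows\n"; }
};

int main()
{
  Matrix A;
  A.finalize_begin(); // non-blocking communication is started
  // ... unrelated local computation can run here ...
  A.finalize_end();   // wait, then add received values into owned rows
}
```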
@@ -509,19 +226,19 @@ class MatrixCSR
   index_maps() const
   {
     return _index_maps;
   }
 
   /// Get local data values
   /// @note Includes ghost values
-  std::vector<T>& values() { return _data; }
+  container_type& values() { return _data; }
 
   /// Get local values (const version)
   /// @note Includes ghost values
-  const std::vector<T>& values() const { return _data; }
+  const container_type& values() const { return _data; }
 
   /// Get local row pointers
   /// @note Includes pointers to ghost rows
-  const std::vector<std::int32_t>& row_ptr() const { return _row_ptr; }
+  const rowptr_container_type& row_ptr() const { return _row_ptr; }
 
   /// Get local column indices
   /// @note Includes columns in ghost rows
-  const std::vector<std::int32_t>& cols() const { return _cols; }
+  const column_container_type& cols() const { return _cols; }
 
   /// Get the start of off-diagonal (unowned columns) on each row,
   /// allowing the matrix to be split (virtually) into two parts.
@@ -530,7 +247,7 @@ class MatrixCSR
   /// from operations on the unowned parts.
   /// @note Includes ghost rows, which should be truncated manually if
   /// not required.
-  const std::vector<std::int32_t>& off_diag_offset() const
+  const rowptr_container_type& off_diag_offset() const
   {
     return _off_diagonal_offset;
   }
@@ -543,11 +260,12 @@ class MatrixCSR
   std::array<int, 2> _bs;
 
   // Matrix data
-  std::vector<T, Allocator> _data;
-  std::vector<std::int32_t> _cols, _row_ptr;
+  container_type _data;
+  column_container_type _cols;
+  rowptr_container_type _row_ptr;
 
   // Start of off-diagonal (unowned columns) on each row
-  std::vector<std::int32_t> _off_diagonal_offset;
+  rowptr_container_type _off_diagonal_offset;
 
   // Neighborhood communicator (ghost->owner communicator for rows)
   dolfinx::MPI::Comm _comm;
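The `norm_squared()` definition in the final hunk below reduces a per-rank partial sum over the owned rows with `MPI_Allreduce`. A stripped-down equivalent of that pattern, assuming an MPI environment:

```cpp
#include <mpi.h>
#include <numeric>
#include <vector>

int main(int argc, char* argv[])
{
  MPI_Init(&argc, &argv);
  std::vector<double> owned{1.0, 2.0}; // stand-in for the owned matrix values
  // Local sum of squares (inner product of the range with itself)
  double local = std::transform_reduce(owned.begin(), owned.end(),
                                       owned.begin(), 0.0);
  double global = 0.0;
  MPI_Allreduce(&local, &global, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Finalize();
}
```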
@@ -569,7 +287,324 @@ class MatrixCSR
   std::vector<int> _ghost_row_to_rank;
 
   // Temporary store for finalize data during non-blocking communication
-  std::vector<T, Allocator> _ghost_value_data_in;
+  container_type _ghost_value_data_in;
 };
+//-----------------------------------------------------------------------------
+template <typename U, typename V, typename W, typename X, typename Y>
+void impl::set_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
+                   const Y& xrows, const Y& xcols,
+                   [[maybe_unused]] typename Y::value_type local_size)
+{
+  assert(x.size() == xrows.size() * xcols.size());
+  for (std::size_t r = 0; r < xrows.size(); ++r)
+  {
+    // Row index and current data row
+    auto row = xrows[r];
+    using T = typename X::value_type;
+    const T* xr = x.data() + r * xcols.size();
+
+#ifndef NDEBUG
+    if (row >= local_size)
+      throw std::runtime_error("Local row out of range");
+#endif
+
+    // Column indices for row
+    auto cit0 = std::next(cols.begin(), row_ptr[row]);
+    auto cit1 = std::next(cols.begin(), row_ptr[row + 1]);
+    for (std::size_t c = 0; c < xcols.size(); ++c)
+    {
+      // Find position of column index
+      auto it = std::lower_bound(cit0, cit1, xcols[c]);
+      assert(it != cit1);
+      std::size_t d = std::distance(cols.begin(), it);
+      assert(d < data.size());
+      data[d] = xr[c];
+    }
+  }
+}
+//-----------------------------------------------------------------------------
+template <typename U, typename V, typename W, typename X, typename Y>
+void impl::add_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
+                   const Y& xrows, const Y& xcols)
+{
+  assert(x.size() == xrows.size() * xcols.size());
+  for (std::size_t r = 0; r < xrows.size(); ++r)
+  {
+    // Row index and current data row
+    auto row = xrows[r];
+    using T = typename X::value_type;
+    const T* xr = x.data() + r * xcols.size();
+
+#ifndef NDEBUG
+    if (row >= (int)row_ptr.size())
+      throw std::runtime_error("Local row out of range");
+#endif
+
+    // Column indices for row
+    auto cit0 = std::next(cols.begin(), row_ptr[row]);
+    auto cit1 = std::next(cols.begin(), row_ptr[row + 1]);
+    for (std::size_t c = 0; c < xcols.size(); ++c)
+    {
+      // Find position of column index
+      auto it = std::lower_bound(cit0, cit1, xcols[c]);
+      assert(it != cit1);
+      std::size_t d = std::distance(cols.begin(), it);
+      assert(d < data.size());
+      data[d] += xr[c];
+    }
+  }
+}
+//-----------------------------------------------------------------------------
+template <class U, class V, class W, class X>
+MatrixCSR<U, V, W, X>::MatrixCSR(const SparsityPattern& p)
+    : _index_maps({p.index_map(0),
+                   std::make_shared<common::IndexMap>(p.column_index_map())}),
+      _bs({p.block_size(0), p.block_size(1)}), _data(p.num_nonzeros(), 0),
+      _cols(p.graph().array().begin(), p.graph().array().end()),
+      _row_ptr(p.graph().offsets().begin(), p.graph().offsets().end()),
+      _comm(MPI_COMM_NULL)
+{
+  // TODO: handle block sizes
+  if (_bs[0] > 1 or _bs[1] > 1)
+    throw std::runtime_error("Block size not yet supported");
+
+  // Compute off-diagonal offset for each row
+  std::span<const std::int32_t> num_diag_nnz = p.off_diagonal_offset();
+  _off_diagonal_offset.reserve(num_diag_nnz.size());
+  std::transform(num_diag_nnz.begin(), num_diag_nnz.end(), _row_ptr.begin(),
+                 std::back_inserter(_off_diagonal_offset), std::plus{});
+
+  // Some short-hand
+  const std::array<std::int32_t, 2> local_size
+      = {_index_maps[0]->size_local(), _index_maps[1]->size_local()};
+  const std::array<std::array<std::int64_t, 2>, 2> local_range
+      = {_index_maps[0]->local_range(), _index_maps[1]->local_range()};
+  const std::vector<std::int64_t>& ghosts1 = _index_maps[1]->ghosts();
+
+  const std::vector<std::int64_t>& ghosts0 = _index_maps[0]->ghosts();
+  const std::vector<int>& src_ranks = _index_maps[0]->src();
+  const std::vector<int>& dest_ranks = _index_maps[0]->dest();
+
+  // Create neighbourhood communicator (owner <- ghost)
+  MPI_Comm comm;
+  MPI_Dist_graph_create_adjacent(_index_maps[0]->comm(), dest_ranks.size(),
+                                 dest_ranks.data(), MPI_UNWEIGHTED,
+                                 src_ranks.size(), src_ranks.data(),
+                                 MPI_UNWEIGHTED, MPI_INFO_NULL, false, &comm);
+  _comm = dolfinx::MPI::Comm(comm, false);
+
+  // Build map from ghost row index position to owning (neighborhood)
+  // rank
+  _ghost_row_to_rank.reserve(_index_maps[0]->owners().size());
+  for (int r : _index_maps[0]->owners())
+  {
+    auto it = std::lower_bound(src_ranks.begin(), src_ranks.end(), r);
+    assert(it != src_ranks.end() and *it == r);
+    int pos = std::distance(src_ranks.begin(), it);
+    _ghost_row_to_rank.push_back(pos);
+  }
+
+  // Compute size of data to send to each neighbor
+  std::vector<int> data_per_proc(src_ranks.size(), 0);
+  for (std::size_t i = 0; i < _ghost_row_to_rank.size(); ++i)
+  {
+    assert(_ghost_row_to_rank[i] < data_per_proc.size());
+    std::size_t pos = local_size[0] + i;
+    data_per_proc[_ghost_row_to_rank[i]] += _row_ptr[pos + 1] - _row_ptr[pos];
+  }
+
+  // Compute send displacements
+  _val_send_disp.resize(src_ranks.size() + 1, 0);
+  std::partial_sum(data_per_proc.begin(), data_per_proc.end(),
+                   std::next(_val_send_disp.begin()));
+
+  // For each ghost row, pack and send indices to neighborhood
+  std::vector<std::int64_t> ghost_index_data(2 * _val_send_disp.back());
+  {
+    std::vector<int> insert_pos = _val_send_disp;
+    for (std::size_t i = 0; i < _ghost_row_to_rank.size(); ++i)
+    {
+      const int rank = _ghost_row_to_rank[i];
+      int row_id = local_size[0] + i;
+      for (std::int64_t j = _row_ptr[row_id]; j < _row_ptr[row_id + 1]; ++j)
+      {
+        // Get position in send buffer
+        const std::int32_t idx_pos = 2 * insert_pos[rank];
+
+        // Pack send data (row, col) as global indices
+        ghost_index_data[idx_pos] = ghosts0[i];
+        if (std::int32_t col_local = _cols[j]; col_local < local_size[1])
+          ghost_index_data[idx_pos + 1] = col_local + local_range[1][0];
+        else
+          ghost_index_data[idx_pos + 1] = ghosts1[col_local - local_size[1]];
+
+        insert_pos[rank] += 1;
+      }
+    }
+  }
+
+  // Communicate data with neighborhood
+  std::vector<std::int64_t> ghost_index_array;
+  std::vector<int> recv_disp;
+  {
+    std::vector<int> send_sizes;
+    std::transform(data_per_proc.begin(), data_per_proc.end(),
+                   std::back_inserter(send_sizes),
+                   [](auto x) { return 2 * x; });
+
+    std::vector<int> recv_sizes(dest_ranks.size());
+    send_sizes.reserve(1);
+    recv_sizes.reserve(1);
+    MPI_Neighbor_alltoall(send_sizes.data(), 1, MPI_INT, recv_sizes.data(), 1,
+                          MPI_INT, _comm.comm());
+
+    // Build send/recv displacement
+    std::vector<int> send_disp = {0};
+    std::partial_sum(send_sizes.begin(), send_sizes.end(),
+                     std::back_inserter(send_disp));
+    recv_disp = {0};
+    std::partial_sum(recv_sizes.begin(), recv_sizes.end(),
+                     std::back_inserter(recv_disp));
+
+    ghost_index_array.resize(recv_disp.back());
+    MPI_Neighbor_alltoallv(ghost_index_data.data(), send_sizes.data(),
+                           send_disp.data(), MPI_INT64_T,
+                           ghost_index_array.data(), recv_sizes.data(),
+                           recv_disp.data(), MPI_INT64_T, _comm.comm());
+  }
+
+  // Store receive displacements for future use, when transferring
+  // data values
+  _val_recv_disp.resize(recv_disp.size());
+  std::transform(recv_disp.begin(), recv_disp.end(), _val_recv_disp.begin(),
+                 [](int d) { return d / 2; });
+
+  // Global-to-local map for ghost columns
+  std::vector<std::pair<std::int64_t, std::int32_t>> global_to_local;
+  global_to_local.reserve(ghosts1.size());
+  std::int32_t local_i = local_size[1];
+  for (std::int64_t idx : ghosts1)
+    global_to_local.push_back({idx, local_i++});
+  std::sort(global_to_local.begin(), global_to_local.end());
+
+  // Compute location in which data for each index should be stored
+  // when received
+  for (std::size_t i = 0; i < ghost_index_array.size(); i += 2)
+  {
+    // Row must be on this process
+    const std::int32_t local_row = ghost_index_array[i] - local_range[0][0];
+    assert(local_row >= 0 and local_row < local_size[0]);
+
+    // Column may be owned or unowned
+    std::int32_t local_col = ghost_index_array[i + 1] - local_range[1][0];
+    if (local_col < 0 or local_col >= local_size[1])
+    {
+      auto it = std::lower_bound(global_to_local.begin(), global_to_local.end(),
+                                 std::pair(ghost_index_array[i + 1], -1),
+                                 [](auto& a, auto& b)
+                                 { return a.first < b.first; });
+      assert(it != global_to_local.end()
+             and it->first == ghost_index_array[i + 1]);
+      local_col = it->second;
+    }
+    auto cit0 = std::next(_cols.begin(), _row_ptr[local_row]);
+    auto cit1 = std::next(_cols.begin(), _row_ptr[local_row + 1]);
+
+    // Find position of column index and insert data
+    auto cit = std::lower_bound(cit0, cit1, local_col);
+    assert(cit != cit1);
+    assert(*cit == local_col);
+    std::size_t d = std::distance(_cols.begin(), cit);
+    _unpack_pos.push_back(d);
+  }
+}
+//-----------------------------------------------------------------------------
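The constructor above resolves received global column indices through a sorted `(global, local)` table queried with `std::lower_bound`. The lookup in isolation, with hypothetical toy values:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

int main()
{
  // Sorted (global index, local index) pairs for ghost columns
  std::vector<std::pair<std::int64_t, std::int32_t>> g2l{{5, 10}, {42, 11}};

  std::int64_t global = 42;
  auto it = std::lower_bound(g2l.begin(), g2l.end(),
                             std::pair<std::int64_t, std::int32_t>(global, -1),
                             [](auto& a, auto& b) { return a.first < b.first; });
  assert(it != g2l.end() and it->first == global);
  std::int32_t local = it->second; // 11
  (void)local;
}
```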
+template <class U, class V, class W, class X>
+std::vector<typename MatrixCSR<U, V, W, X>::value_type>
+MatrixCSR<U, V, W, X>::to_dense() const
+{
+  const std::size_t nrows = num_all_rows();
+  const std::size_t ncols
+      = _index_maps[1]->size_local() + _index_maps[1]->num_ghosts();
+  std::vector<value_type> A(nrows * ncols);
+  for (std::size_t r = 0; r < nrows; ++r)
+    for (std::int64_t j = _row_ptr[r]; j < _row_ptr[r + 1]; ++j)
+      A[r * ncols + _cols[j]] = _data[j];
+
+  return A;
+}
+//-----------------------------------------------------------------------------
+template <class U, class V, class W, class X>
+void MatrixCSR<U, V, W, X>::finalize_begin()
+{
+  const std::int32_t local_size0 = _index_maps[0]->size_local();
+  const std::int32_t num_ghosts0 = _index_maps[0]->num_ghosts();
+
+  // For each ghost row, pack and send values to send to neighborhood
+  std::vector<int> insert_pos = _val_send_disp;
+  std::vector<value_type> ghost_value_data(_val_send_disp.back());
+  for (int i = 0; i < num_ghosts0; ++i)
+  {
+    const int rank = _ghost_row_to_rank[i];
+
+    // Get position in send buffer to place data to send to this
+    // neighbour
+    const std::int32_t val_pos = insert_pos[rank];
+    std::copy(std::next(_data.data(), _row_ptr[local_size0 + i]),
+              std::next(_data.data(), _row_ptr[local_size0 + i + 1]),
+              std::next(ghost_value_data.begin(), val_pos));
+    insert_pos[rank]
+        += _row_ptr[local_size0 + i + 1] - _row_ptr[local_size0 + i];
+  }
+
+  _ghost_value_data_in.resize(_val_recv_disp.back());
+
+  // Compute data sizes for send and receive from displacements
+  std::vector<int> val_send_count(_val_send_disp.size() - 1);
+  std::adjacent_difference(std::next(_val_send_disp.begin()),
+                           _val_send_disp.end(), val_send_count.begin());
+
+  std::vector<int> val_recv_count(_val_recv_disp.size() - 1);
+  std::adjacent_difference(std::next(_val_recv_disp.begin()),
+                           _val_recv_disp.end(), val_recv_count.begin());
+
+  int status = MPI_Ineighbor_alltoallv(
+      ghost_value_data.data(), val_send_count.data(), _val_send_disp.data(),
+      dolfinx::MPI::mpi_type<value_type>(), _ghost_value_data_in.data(),
+      val_recv_count.data(), _val_recv_disp.data(),
+      dolfinx::MPI::mpi_type<value_type>(), _comm.comm(), &_request);
+  assert(status == MPI_SUCCESS);
+}
+//-----------------------------------------------------------------------------
+template <class U, class V, class W, class X>
+void MatrixCSR<U, V, W, X>::finalize_end()
+{
+  int status = MPI_Wait(&_request, MPI_STATUS_IGNORE);
+  assert(status == MPI_SUCCESS);
+
+  // Add to local rows
+  assert(_ghost_value_data_in.size() == _unpack_pos.size());
+  for (std::size_t i = 0; i < _ghost_value_data_in.size(); ++i)
+    _data[_unpack_pos[i]] += _ghost_value_data_in[i];
+
+  // Set ghost row data to zero
+  const std::int32_t local_size0 = _index_maps[0]->size_local();
+  std::fill(std::next(_data.begin(), _row_ptr[local_size0]), _data.end(), 0);
+}
+//-----------------------------------------------------------------------------
+template <class U, class V, class W, class X>
+double MatrixCSR<U, V, W, X>::norm_squared() const
+{
+  const std::size_t num_owned_rows = _index_maps[0]->size_local();
+  assert(num_owned_rows < _row_ptr.size());
+  double norm_sq_local = std::accumulate(
+      _data.cbegin(), std::next(_data.cbegin(), _row_ptr[num_owned_rows]),
+      double(0), [](auto norm, value_type y) { return norm + std::norm(y); });
+  double norm_sq;
+  MPI_Allreduce(&norm_sq_local, &norm_sq, 1, MPI_DOUBLE, MPI_SUM, _comm.comm());
+  return norm_sq;
+}
+//-----------------------------------------------------------------------------
+
 } // namespace dolfinx::la
diff --git a/cpp/dolfinx/la/Vector.h b/cpp/dolfinx/la/Vector.h
index 20df13a2b0a..db1dd6a1b7d 100644
--- a/cpp/dolfinx/la/Vector.h
+++ b/cpp/dolfinx/la/Vector.h
@@ -14,30 +14,39 @@
 #include <limits>
 #include <memory>
 #include <numeric>
+#include <type_traits>
 #include <vector>
 
 namespace dolfinx::la
 {
 
 /// Distributed vector
-
-template <typename T, class Allocator = std::allocator<T>>
+///
+/// @tparam Scalar Scalar type
+/// @tparam Container Data container type
+///
+template <typename Scalar, typename Container = std::vector<Scalar>>
 class Vector
 {
+  static_assert(std::is_same_v<typename Container::value_type, Scalar>);
+
 public:
-  /// The value type
-  using value_type = T;
+  /// Scalar type
+  using value_type = Scalar;
+
+  /// Container type
+  using container_type = Container;
 
-  /// The allocator type
-  using allocator_type = Allocator;
+  static_assert(
+      std::is_same_v<value_type, typename container_type::value_type>,
+      "Scalar type and container value type must be the same.");
 
   /// Create a distributed vector
-  Vector(std::shared_ptr<const common::IndexMap> map, int bs,
-         const Allocator& alloc = Allocator())
+  /// @param map IndexMap for parallel distribution of the data
+  /// @param bs Block size
+  Vector(std::shared_ptr<const common::IndexMap> map, int bs)
       : _map(map), _scatterer(std::make_shared<common::Scatterer<>>(*_map, bs)),
-        _bs(bs), _buffer_local(_scatterer->local_buffer_size(), alloc),
-        _buffer_remote(_scatterer->remote_buffer_size(), alloc),
-        _x(bs * (map->size_local() + map->num_ghosts()), alloc)
+        _bs(bs), _buffer_local(_scatterer->local_buffer_size()),
+        _buffer_remote(_scatterer->remote_buffer_size()),
+        _x(bs * (map->size_local() + map->num_ghosts()))
   {
   }
 
@@ -67,14 +76,14 @@ class Vector
 
   /// Set all entries (including ghosts)
   /// @param[in] v The value to set all entries to (on calling rank)
-  void set(T v) { std::fill(_x.begin(), _x.end(), v); }
+  void set(value_type v) { std::fill(_x.begin(), _x.end(), v); }
 
   /// Begin scatter of local data from owner to ghosts on other ranks
   /// @note Collective MPI operation
   void scatter_fwd_begin()
   {
     const std::int32_t local_size = _bs * _map->size_local();
-    std::span<const T> x_local(_x.data(), local_size);
+    std::span<const value_type> x_local(_x.data(), local_size);
 
     auto pack = [](const auto& in, const auto& idx, auto& out)
     {
@@ -83,8 +92,8 @@ class Vector
     };
     pack(x_local, _scatterer->local_indices(), _buffer_local);
 
-    _scatterer->scatter_fwd_begin(std::span<const T>(_buffer_local),
-                                  std::span<T>(_buffer_remote),
+    _scatterer->scatter_fwd_begin(std::span<const value_type>(_buffer_local),
+                                  std::span<value_type>(_buffer_remote),
                                   std::span<MPI_Request>(_request));
   }
 
@@ -94,7 +103,7 @@ class Vector
   {
     const std::int32_t local_size = _bs * _map->size_local();
     const std::int32_t num_ghosts = _bs * _map->num_ghosts();
-    std::span<T> x_remote(_x.data() + local_size, num_ghosts);
+    std::span<value_type> x_remote(_x.data() + local_size, num_ghosts);
     _scatterer->scatter_fwd_end(std::span<MPI_Request>(_request));
 
     auto unpack = [](const auto& in, const auto& idx, auto& out, auto op)
@@ -121,7 +130,7 @@ class Vector
   {
     const std::int32_t local_size = _bs * _map->size_local();
     const std::int32_t num_ghosts = _bs * _map->num_ghosts();
-    std::span<const T> x_remote(_x.data() + local_size, num_ghosts);
+    std::span<const value_type> x_remote(_x.data() + local_size, num_ghosts);
 
     auto pack = [](const auto& in, const auto& idx, auto& out)
     {
@@ -130,8 +139,9 @@ class Vector
     };
     pack(x_remote, _scatterer->remote_indices(), _buffer_remote);
 
-    _scatterer->scatter_rev_begin(std::span<const T>(_buffer_remote),
-                                  std::span<T>(_buffer_local), _request);
+    _scatterer->scatter_rev_begin(std::span<const value_type>(_buffer_remote),
+                                  std::span<value_type>(_buffer_local),
+                                  _request);
   }
 
   /// End scatter of ghost data to owner. This process may receive data
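The `pack`/`unpack` helpers used by the scatters are plain gather/scatter loops over an index list. A self-contained equivalent of `pack`:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
  std::vector<double> in{10, 20, 30, 40};
  std::vector<std::size_t> idx{3, 1};
  std::vector<double> out(idx.size());
  // pack: gather the values at idx into a contiguous send buffer
  for (std::size_t i = 0; i < idx.size(); ++i)
    out[i] = in[idx[i]];
  assert(out[0] == 40 and out[1] == 20);
}
```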
@@ -144,7 +154,7 @@ class Vector
   void scatter_rev_end(BinaryOperation op)
   {
     const std::int32_t local_size = _bs * _map->size_local();
-    std::span<T> x_local(_x.data(), local_size);
+    std::span<value_type> x_local(_x.data(), local_size);
     _scatterer->scatter_rev_end(_request);
 
     auto unpack = [](const auto& in, const auto& idx, auto& out, auto op)
@@ -174,13 +184,13 @@ class Vector
   constexpr int bs() const { return _bs; }
 
   /// Get local part of the vector (const version)
-  std::span<const T> array() const { return std::span(_x); }
+  std::span<const value_type> array() const
+  {
+    return std::span<const value_type>(_x);
+  }
 
   /// Get local part of the vector
-  std::span<T> mutable_array() { return std::span(_x); }
-
-  /// Get the allocator associated with the container
-  constexpr allocator_type allocator() const { return _x.get_allocator(); }
+  std::span<value_type> mutable_array() { return std::span(_x); }
 
 private:
   // Map describing the data layout
@@ -196,10 +206,10 @@ class Vector
   std::vector<MPI_Request> _request = {MPI_REQUEST_NULL};
 
   // Buffers for ghost scatters
-  std::vector<T, Allocator> _buffer_local, _buffer_remote;
+  container_type _buffer_local, _buffer_remote;
 
   // Vector data
-  std::vector<T, Allocator> _x;
+  container_type _x;
 };
 
 /// Compute the inner product of two vectors. The two vectors must have
@@ -208,9 +218,10 @@ class Vector
 /// @param a A vector
 /// @param b A vector
 /// @return Returns `a^{H} b` (`a^{T} b` if `a` and `b` are real)
-template <typename T, class Allocator>
-T inner_product(const Vector<T, Allocator>& a, const Vector<T, Allocator>& b)
+template <class V>
+auto inner_product(const V& a, const V& b)
 {
+  using T = typename V::value_type;
   const std::int32_t local_size = a.bs() * a.map()->size_local();
   if (local_size != b.bs() * b.map()->size_local())
     throw std::runtime_error("Incompatible vector sizes");
@@ -238,9 +249,10 @@ T inner_product(const Vector<T, Allocator>& a, const Vector<T, Allocator>& b)
 
 /// Compute the squared L2 norm of vector
 /// @note Collective MPI operation
-template <typename T, class Allocator>
-auto squared_norm(const Vector<T, Allocator>& a)
+template <class V>
+auto squared_norm(const V& a)
 {
+  using T = typename V::value_type;
   T result = inner_product(a, a);
   return std::real(result);
 }
@@ -249,9 +261,10 @@ auto squared_norm(const Vector<T, Allocator>& a)
 /// @note Collective MPI operation
 /// @param a A vector
 /// @param type Norm type (supported types are \f$L^2\f$ and \f$L^\infty\f$)
-template <typename T, class Allocator>
-auto norm(const Vector<T, Allocator>& a, Norm type = Norm::l2)
+template <class V>
+auto norm(const V& a, Norm type = Norm::l2)
 {
+  using T = typename V::value_type;
   switch (type)
   {
   case Norm::l2:
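`inner_product` returns `a^H b`, i.e. the first argument is conjugated for complex scalars. A minimal illustration of that convention:

```cpp
#include <cassert>
#include <complex>
#include <numeric>
#include <vector>

int main()
{
  using C = std::complex<double>;
  std::vector<C> a{{0, 1}}, b{{0, 1}}; // a = b = i
  C s = std::transform_reduce(a.begin(), a.end(), b.begin(), C(0),
                              std::plus<>(),
                              [](C x, C y) { return std::conj(x) * y; });
  assert(s == C(1, 0)); // conj(i) * i = 1
}
```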
@@ -279,9 +292,10 @@ auto norm(const Vector<T, Allocator>& a, Norm type = Norm::l2)
 /// vectors must have identical parallel layouts. The vectors are
 /// modified in-place.
 /// @param[in] tol The tolerance used to detect a linear dependency
-template <typename T, class Allocator>
-void orthonormalize(std::span<Vector<T, Allocator>> basis, double tol = 1.0e-10)
+template <class V>
+void orthonormalize(std::span<V> basis, double tol = 1.0e-10)
 {
+  using T = typename V::value_type;
   // Loop over each vector in basis
   for (std::size_t i = 0; i < basis.size(); ++i)
   {
@@ -313,10 +327,10 @@ void orthonormalize(std::span<Vector<T, Allocator>> basis, double tol = 1.0e-10)
 /// @param[in] basis The set of vectors to check
 /// @param[in] tol The tolerance used to test for orthonormality
 /// @return True if basis is orthonormal, otherwise false
-template <typename T, class Allocator>
-bool is_orthonormal(std::span<Vector<T, Allocator>> basis,
-                    double tol = 1.0e-10)
+template <class V>
+bool is_orthonormal(std::span<V> basis, double tol = 1.0e-10)
 {
+  using T = typename V::value_type;
   for (std::size_t i = 0; i < basis.size(); i++)
   {
     for (std::size_t j = i; j < basis.size(); j++)
diff --git a/cpp/dolfinx/la/petsc.h b/cpp/dolfinx/la/petsc.h
index 7d8dd78f389..ab42a02c3a2 100644
--- a/cpp/dolfinx/la/petsc.h
+++ b/cpp/dolfinx/la/petsc.h
@@ -78,8 +78,8 @@ Vec create_vector_wrap(const common::IndexMap& map, int bs,
 /// Create a PETSc Vec that wraps the data in an array
 /// @param[in] x The vector to be wrapped
 /// @return A PETSc Vec object that shares the data in @p x
-template <typename T, class Allocator>
-Vec create_vector_wrap(const la::Vector<T, Allocator>& x)
+template <typename T, typename Container>
+Vec create_vector_wrap(const la::Vector<T, Container>& x)
 {
   assert(x.map());
   return create_vector_wrap(*x.map(), x.bs(), x.array());
diff --git a/cpp/test/matrix.cpp b/cpp/test/matrix.cpp
index a234cb0fe5b..97a884bb8b4 100644
--- a/cpp/test/matrix.cpp
+++ b/cpp/test/matrix.cpp
@@ -29,8 +29,8 @@ namespace
 /// @param[in, out] x Output vector
 template <typename T>
 void spmv_impl(std::span<const T> values,
-               std::span<const std::int32_t> row_begin,
-               std::span<const std::int32_t> row_end,
+               std::span<const std::int64_t> row_begin,
+               std::span<const std::int64_t> row_end,
                std::span<const std::int32_t> indices,
                std::span<const T> x, std::span<T> y)
 {
@@ -72,23 +72,22 @@ void spmv_impl(std::span<const T> values,
 /// @param[in, out] y Output vector
 template <typename T>
 void spmv(la::MatrixCSR<T>& A, la::Vector<T>& x, la::Vector<T>& y)
-
 {
   // start communication (update ghosts)
   x.scatter_fwd_begin();
 
   const std::int32_t nrowslocal = A.num_owned_rows();
-  std::span<const std::int32_t> row_ptr(A.row_ptr().data(), nrowslocal + 1);
+  std::span<const std::int64_t> row_ptr(A.row_ptr().data(), nrowslocal + 1);
   std::span<const std::int32_t> cols(A.cols().data(), row_ptr[nrowslocal]);
-  std::span<const std::int32_t> off_diag_offset(A.off_diag_offset().data(),
+  std::span<const std::int64_t> off_diag_offset(A.off_diag_offset().data(),
                                                 nrowslocal);
   std::span<const T> values(A.values().data(), row_ptr[nrowslocal]);
 
   std::span<const T> _x = x.array();
   std::span<T> _y = y.mutable_array();
 
-  std::span<const std::int32_t> row_begin(row_ptr.data(), nrowslocal);
-  std::span<const std::int32_t> row_end(row_ptr.data() + 1, nrowslocal);
+  std::span<const std::int64_t> row_begin(row_ptr.data(), nrowslocal);
+  std::span<const std::int64_t> row_end(row_ptr.data() + 1, nrowslocal);
 
   // First stage: spmv - diagonal
   // yi[0] += Ai[0] * xi[0]
@@ -131,8 +130,8 @@ la::MatrixCSR<T> create_operator(MPI_Comm comm)
 [[maybe_unused]] void test_matrix_norm()
 {
-  la::MatrixCSR<float> A0 = create_operator<float>(MPI_COMM_SELF);
-  la::MatrixCSR<float> A1 = create_operator<float>(MPI_COMM_WORLD);
+  la::MatrixCSR A0 = create_operator<float>(MPI_COMM_SELF);
+  la::MatrixCSR A1 = create_operator<float>(MPI_COMM_WORLD);
   CHECK(A1.norm_squared() == Approx(A0.norm_squared()).epsilon(1e-8));
 }
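The test's `spmv`/`spmv_impl` pair is an ordinary CSR kernel applied twice: once to owned columns while the forward scatter is in flight, and once to ghost columns after it completes. The core kernel, self-contained:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
  // A = [[1, 0, 2], [0, 3, 0]] in CSR form
  std::vector<double> values{1, 2, 3};
  std::vector<std::int32_t> cols{0, 2, 1};
  std::vector<std::int64_t> row_ptr{0, 2, 3};
  std::vector<double> x{1, 1, 1}, y{0, 0};

  // y += A x over the half-open column range [row_ptr[r], row_ptr[r+1])
  for (std::size_t r = 0; r < y.size(); ++r)
    for (std::int64_t j = row_ptr[r]; j < row_ptr[r + 1]; ++j)
      y[r] += values[j] * x[cols[j]];

  assert(y[0] == 3 and y[1] == 3);
}
```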
diff --git a/python/dolfinx/wrappers/la.cpp b/python/dolfinx/wrappers/la.cpp
index 408e00aaf71..a44ba365895 100644
--- a/python/dolfinx/wrappers/la.cpp
+++ b/python/dolfinx/wrappers/la.cpp
@@ -120,18 +120,16 @@ void declare_objects(py::module& m, const std::string& type)
       .def_property_readonly("indices",
                              [](dolfinx::la::MatrixCSR<T>& self)
                              {
-                               std::span<const std::int32_t> array
-                                   = self.cols();
-                               return py::array_t<std::int32_t>(
-                                   array.size(), array.data(), py::cast(self));
+                               auto& array = self.cols();
+                               return py::array_t<std::int32_t>(
+                                   array.size(), array.data(), py::cast(self));
                              })
       .def_property_readonly("indptr",
                              [](dolfinx::la::MatrixCSR<T>& self)
                              {
-                               std::span<const std::int32_t> array
-                                   = self.row_ptr();
-                               return py::array_t<std::int32_t>(
-                                   array.size(), array.data(), py::cast(self));
+                               auto& array = self.row_ptr();
+                               return py::array_t<std::int64_t>(
+                                   array.size(), array.data(), py::cast(self));
                              })
       .def("finalize_begin", &dolfinx::la::MatrixCSR<T>::finalize_begin)
       .def("finalize_end", &dolfinx::la::MatrixCSR<T>::finalize_end);
@@ -146,7 +144,7 @@ void petsc_module(py::module& m)
         py::arg("bs"), "Create a ghosted PETSc Vec for index map.");
   m.def(
       "create_vector_wrap",
-      [](dolfinx::la::Vector<PetscScalar, std::allocator<PetscScalar>>& x)
+      [](dolfinx::la::Vector<PetscScalar>& x)
       { return dolfinx::la::petsc::create_vector_wrap(x); },
      py::return_value_policy::take_ownership, py::arg("x"),
      "Create a ghosted PETSc Vec that wraps a DOLFINx Vector");