diff --git a/src/container/srv_target.c b/src/container/srv_target.c index dd6fe8008ad..c0c6c0cfdd6 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -1621,6 +1621,8 @@ ds_cont_tgt_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid, uint64_t flags, uint64_t sec_capas, uint32_t status_pm_ver) { + int *exclude_tgts = NULL; + uint32_t exclude_tgt_nr = 0; struct cont_tgt_open_arg arg = { 0 }; struct dss_coll_ops coll_ops = { 0 }; struct dss_coll_args coll_args = { 0 }; @@ -1657,18 +1659,22 @@ ds_cont_tgt_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, coll_args.ca_func_args = &arg; /* setting aggregator args */ - rc = ds_pool_get_failed_tgt_idx(pool_uuid, &coll_args.ca_exclude_tgts, - &coll_args.ca_exclude_tgts_cnt); - if (rc) { + rc = ds_pool_get_failed_tgt_idx(pool_uuid, &exclude_tgts, &exclude_tgt_nr); + if (rc != 0) { D_ERROR(DF_UUID "failed to get index : rc "DF_RC"\n", DP_UUID(pool_uuid), DP_RC(rc)); - return rc; + goto out; } - rc = dss_thread_collective_reduce(&coll_ops, &coll_args, 0); - D_FREE(coll_args.ca_exclude_tgts); + if (exclude_tgts != NULL) { + rc = dss_build_coll_bitmap(exclude_tgts, exclude_tgt_nr, &coll_args.ca_tgt_bitmap, + &coll_args.ca_tgt_bitmap_sz); + if (rc != 0) + goto out; + } - if (rc != 0) { + rc = dss_thread_collective_reduce(&coll_ops, &coll_args, 0); + if (rc != 0) /* Once it exclude the target from the pool, since the target * might still in the cart group, so IV cont open might still * come to this target, especially if cont open/close will be @@ -1678,9 +1684,10 @@ ds_cont_tgt_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, D_ERROR("open "DF_UUID"/"DF_UUID"/"DF_UUID":"DF_RC"\n", DP_UUID(pool_uuid), DP_UUID(cont_uuid), DP_UUID(cont_hdl_uuid), DP_RC(rc)); - return rc; - } +out: + D_FREE(coll_args.ca_tgt_bitmap); + D_FREE(exclude_tgts); return rc; } diff --git a/src/dtx/SConscript b/src/dtx/SConscript index 5a6849671de..4d0f1f2dcb3 100644 --- a/src/dtx/SConscript +++ b/src/dtx/SConscript @@ -18,7 +18,8 @@ def scons(): # dtx denv.Append(CPPDEFINES=['-DDAOS_PMEM_BUILD']) dtx = denv.d_library('dtx', - ['dtx_srv.c', 'dtx_rpc.c', 'dtx_resync.c', 'dtx_common.c', 'dtx_cos.c'], + ['dtx_srv.c', 'dtx_rpc.c', 'dtx_resync.c', 'dtx_common.c', 'dtx_cos.c', + 'dtx_coll.c'], install_off="../..") denv.Install('$PREFIX/lib64/daos_srv', dtx) diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c new file mode 100644 index 00000000000..a8975c0b3f4 --- /dev/null +++ b/src/dtx/dtx_coll.c @@ -0,0 +1,350 @@ +/** + * (C) Copyright 2023 Intel Corporation. + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ +/** + * dtx: DTX collective RPC logic + */ +#define D_LOGFAC DD_FAC(dtx) + +#include +#include +#include +#include +#include +#include +#include +#include "dtx_internal.h" + +/* + * For collective DTX, when we commit/abort/check the DTX on the system XS (on a non-leader), we cannot + * directly locate the DTX entry since no VOS target is attached to the system XS. In such a case, + * we have two options: + * + * 1. The DTX leader (on IO XS) knows on which VOS target the non-leader can find the DTX, + * so the DTX leader can send the related information (IO XS index) to the non-leader. + * + * 2. The non-leader can start a ULT on every local XS collectively to find the DTX by force, + * regardless of whether the related DTX entry really exists on the VOS target or not. + * + * Usually, the 2nd option causes more overhead and should be avoided, so the 1st one is the + * relatively better choice.
On the other hand, if there are a lot of VOS targets in the system, then it + * may be inefficient to send all VOS targets' information to all related non-leaders via bcast. + * Instead, we only send one VOS target's information to each non-leader; the non-leader can + * then load the mbs (dtx_memberships) from the DTX entry and calculate the other VOS targets' + * information by itself. + */ + +struct dtx_coll_local_args { + uuid_t dcla_po_uuid; + uuid_t dcla_co_uuid; + struct dtx_id dcla_xid; + daos_epoch_t dcla_epoch; + uint32_t dcla_opc; + int *dcla_results; +}; + +void +dtx_coll_load_mbs_ult(void *arg) +{ + struct dtx_coll_load_mbs_args *dclma = arg; + struct dtx_coll_in *dci = dclma->dclma_params; + struct ds_cont_child *cont = NULL; + int rc = 0; + + rc = ds_cont_child_lookup(dci->dci_po_uuid, dci->dci_co_uuid, &cont); + if (rc != 0) { + D_ERROR("Failed to locate pool="DF_UUID" cont="DF_UUID" for DTX " + DF_DTI" with opc %u: "DF_RC"\n", + DP_UUID(dci->dci_po_uuid), DP_UUID(dci->dci_co_uuid), + DP_DTI(&dci->dci_xid), dclma->dclma_opc, DP_RC(rc)); + /* + * Convert the container-nonexistent case into -DER_IO to distinguish it from + * the case that the DTX entry does not exist. The latter is normal. + */ + if (rc == -DER_NONEXIST) + rc = -DER_IO; + dclma->dclma_result = rc; + } else { + rc = vos_dtx_load_mbs(cont->sc_hdl, &dci->dci_xid, &dclma->dclma_oid, + &dclma->dclma_mbs); + dclma->dclma_result = rc; + if (rc == -DER_INPROGRESS && !dtx_cont_opened(cont) && + dclma->dclma_opc == DTX_COLL_CHECK) { + rc = start_dtx_reindex_ult(cont); + if (rc != 0) + D_ERROR(DF_UUID": Failed to trigger DTX reindex: "DF_RC"\n", + DP_UUID(cont->sc_uuid), DP_RC(rc)); + } + ds_cont_child_put(cont); + } + + rc = ABT_future_set(dclma->dclma_future, NULL); + D_ASSERT(rc == ABT_SUCCESS); +} + +static int +dtx_coll_dtg_cmp(const void *m1, const void *m2) +{ + const struct dtx_target_group *dtg1 = m1; + const struct dtx_target_group *dtg2 = m2; + + if (dtg1->dtg_rank > dtg2->dtg_rank) + return 1; + + if (dtg1->dtg_rank < dtg2->dtg_rank) + return -1; + + return 0; +} + +int +dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_memberships *mbs, d_rank_t my_rank, + uint32_t my_tgtid, uint32_t version, uint8_t **p_hints, uint32_t *hint_sz, + uint8_t **p_bitmap, uint32_t *bitmap_sz, d_rank_list_t **p_ranks) +{ + struct pl_map *map = NULL; + struct pool_target *target; + struct dtx_daos_target *ddt; + struct dtx_target_group *base; + struct dtx_target_group *dtg = NULL; + struct dtx_target_group key = { 0 }; + uint8_t *hints = NULL; + uint8_t *bitmap = NULL; + size_t size = ((dss_tgt_nr - 1) >> 3) + 1; + uint32_t node_nr; + d_rank_t max_rank; + int count; + int rc = 0; + int i; + int j; + int k; + + D_ASSERT(mbs->dm_flags & DMF_CONTAIN_TARGET_GRP); + + *p_bitmap = NULL; + *bitmap_sz = 0; + + ddt = &mbs->dm_tgts[0]; + base = (struct dtx_target_group *)(ddt + mbs->dm_tgt_cnt); + count = (mbs->dm_data_size - sizeof(*ddt) * mbs->dm_tgt_cnt) / sizeof(*dtg); + + /* + * The first dtg is for the original leader group. The other groups are sorted by + * rank ID.
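 + * This sorted layout is what allows the lookup below to locate the dtg entry for the current rank via bsearch().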
+ */ + + if (base->dtg_rank == my_rank) { + dtg = base; + } else { + key.dtg_rank = my_rank; + dtg = bsearch(&key, base + 1, count - 1, sizeof(*dtg), dtx_coll_dtg_cmp); + if (dtg == NULL) { + D_ERROR("Cannot locate rank %u in the mbs\n", my_rank); + D_GOTO(out, rc = -DER_IO); + } + } + + D_ALLOC_ARRAY(bitmap, size); + if (bitmap == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + map = pl_map_find(po_uuid, oid.id_pub); + if (map == NULL) { + D_ERROR("Failed to find valid placement map for "DF_OID"\n", DP_OID(oid.id_pub)); + D_GOTO(out, rc = -DER_INVAL); + } + + for (i = dtg->dtg_start_idx; i < dtg->dtg_start_idx + dtg->dtg_tgt_nr; i++) { + rc = pool_map_find_target(map->pl_poolmap, ddt[i].ddt_id, &target); + D_ASSERT(rc == 1); + + /* Skip the targets that reside on other engines. */ + if (unlikely(my_rank != target->ta_comp.co_rank)) + continue; + + /* Skip the target that (re-)joined the system after the DTX. */ + if (target->ta_comp.co_ver > version) + continue; + + /* Skip non-healthy one. */ + if (target->ta_comp.co_status != PO_COMP_ST_UP && + target->ta_comp.co_status != PO_COMP_ST_UPIN && + target->ta_comp.co_status != PO_COMP_ST_NEW && + target->ta_comp.co_status != PO_COMP_ST_DRAIN) + continue; + + /* Skip current (new) leader target. */ + if (my_tgtid != target->ta_comp.co_index) + setbit(bitmap, target->ta_comp.co_index); + } + + if (p_hints == NULL) + D_GOTO(out, rc = 0); + + D_ASSERT(hint_sz != NULL); + D_ASSERT(p_ranks != NULL); + + if (unlikely(count == 1)) { + *p_ranks = NULL; + *p_hints = NULL; + *hint_sz = 0; + goto out; + } + + *p_ranks = d_rank_list_alloc(count - 1); + if (*p_ranks == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + node_nr = pool_map_node_nr(map->pl_poolmap); + D_ALLOC_ARRAY(hints, node_nr); + if (hints == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + for (i = 0, j = 0, max_rank = 0, dtg = base; i < count; i++, dtg++) { + /* Skip current leader rank. */ + if (my_rank == dtg->dtg_rank) + continue; + + for (k = dtg->dtg_start_idx; k < dtg->dtg_start_idx + dtg->dtg_tgt_nr; k++) { + rc = pool_map_find_target(map->pl_poolmap, ddt[k].ddt_id, &target); + D_ASSERT(rc == 1); + + if ((target->ta_comp.co_ver <= version) && + (target->ta_comp.co_status == PO_COMP_ST_UP || + target->ta_comp.co_status == PO_COMP_ST_UPIN || + target->ta_comp.co_status == PO_COMP_ST_NEW || + target->ta_comp.co_status == PO_COMP_ST_DRAIN)) { + if (max_rank < dtg->dtg_rank) + max_rank = dtg->dtg_rank; + + (*p_ranks)->rl_ranks[j++] = dtg->dtg_rank; + hints[dtg->dtg_rank] = target->ta_comp.co_index; + break; + } + } + } + + /* + * It is no matter that the real size of rl_ranks array is larger than rl_nr. + * Then reduce rl_nr to skip those non-defined ranks at the tail in rl_ranks. 
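 + * (rl_ranks was allocated above with count - 1 slots, but only the first j slots were filled.)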
+ */ + (*p_ranks)->rl_nr = j; + + *p_hints = hints; + *hint_sz = max_rank + 1; + +out: + if (map != NULL) + pl_map_decref(map); + + if (rc != 0) { + D_FREE(bitmap); + if (p_ranks != NULL) { + d_rank_list_free(*p_ranks); + *p_ranks = NULL; + } + D_FREE(hints); + if (p_hints != NULL) { + *p_hints = NULL; + *hint_sz = 0; + } + } else { + *p_bitmap = bitmap; + *bitmap_sz = size; + } + + return rc; +} + +static int +dtx_coll_local_one(void *args) +{ + struct dss_module_info *dmi = dss_get_module_info(); + struct dtx_coll_local_args *dcla = args; + struct ds_cont_child *cont = NULL; + uint32_t opc = dcla->dcla_opc; + int rc; + int rc1; + + rc = ds_cont_child_lookup(dcla->dcla_po_uuid, dcla->dcla_co_uuid, &cont); + if (rc != 0) { + D_ERROR("Failed to locate "DF_UUID"/"DF_UUID" for collective DTX " + DF_DTI" rpc %u: "DF_RC"\n", DP_UUID(dcla->dcla_po_uuid), + DP_UUID(dcla->dcla_co_uuid), DP_DTI(&dcla->dcla_xid), opc, DP_RC(rc)); + goto out; + } + + switch (opc) { + case DTX_COLL_COMMIT: + rc = vos_dtx_commit(cont->sc_hdl, &dcla->dcla_xid, 1, NULL); + break; + case DTX_COLL_ABORT: + rc = vos_dtx_abort(cont->sc_hdl, &dcla->dcla_xid, dcla->dcla_epoch); + break; + case DTX_COLL_CHECK: + rc = vos_dtx_check(cont->sc_hdl, &dcla->dcla_xid, NULL, NULL, NULL, NULL, false); + if (rc == DTX_ST_INITED) { + /* + * For DTX_CHECK, non-ready one is equal to non-exist. Do not directly + * return 'DTX_ST_INITED' to avoid interoperability trouble if related + * request is from old server. + */ + rc = -DER_NONEXIST; + } else if (rc == -DER_INPROGRESS && !dtx_cont_opened(cont)) { + /* Trigger DTX re-index for subsequent (retry) DTX_CHECK. */ + rc1 = start_dtx_reindex_ult(cont); + if (rc1 != 0) + D_ERROR("Failed to trigger DTX reindex for "DF_UUID"/"DF_UUID + " on target %u/%u: "DF_RC"\n", + DP_UUID(dcla->dcla_po_uuid), DP_UUID(dcla->dcla_co_uuid), + dss_self_rank(), dmi->dmi_tgt_id, DP_RC(rc1)); + } + break; + default: + D_ASSERTF(0, "Unknown collective DTX opc %u\n", opc); + D_GOTO(out, rc = -DER_NOTSUPPORTED); + } + +out: + dcla->dcla_results[dmi->dmi_tgt_id] = rc; + if (cont != NULL) + ds_cont_child_put(cont); + + return 0; +} + +int +dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epoch_t epoch, + uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, int **p_results) +{ + struct dtx_coll_local_args dcla = { 0 }; + struct dss_coll_ops coll_ops = { 0 }; + struct dss_coll_args coll_args = { 0 }; + int rc; + + D_ALLOC_ARRAY(dcla.dcla_results, dss_tgt_nr); + if (dcla.dcla_results == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + uuid_copy(dcla.dcla_po_uuid, po_uuid); + uuid_copy(dcla.dcla_co_uuid, co_uuid); + dcla.dcla_xid = *xid; + dcla.dcla_epoch = epoch; + dcla.dcla_opc = opc; + + coll_ops.co_func = dtx_coll_local_one; + coll_args.ca_func_args = &dcla; + coll_args.ca_tgt_bitmap_sz = bitmap_sz; + coll_args.ca_tgt_bitmap = bitmap; + + rc = dss_thread_collective_reduce(&coll_ops, &coll_args, 0); + D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, + "Locally exec collective DTX PRC %u for "DF_DTI": "DF_RC"\n", + opc, DP_DTI(xid), DP_RC(rc)); + +out: + *p_results = dcla.dcla_results; + return rc < 0 ? 
rc : dss_tgt_nr; +} diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index 77aec06240e..dde62aaaec9 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -23,6 +23,11 @@ uint32_t dtx_agg_thd_cnt_lo; uint32_t dtx_agg_thd_age_up; uint32_t dtx_agg_thd_age_lo; uint32_t dtx_batched_ult_max; +/* + * Smaller bcast RPC tree width for collective transaction makes related RPC load to be distributed + * among more engines, but it may increase single transaction latency from the client perspective. + */ +uint32_t dtx_coll_tree_width; struct dtx_batched_pool_args { @@ -303,12 +308,14 @@ dtx_dpci_free(struct dtx_partial_cmt_item *dpci) static void dtx_cleanup(void *arg) { + struct dss_module_info *dmi = dss_get_module_info(); struct dtx_batched_cont_args *dbca = arg; struct ds_cont_child *cont = dbca->dbca_cont; struct dtx_share_peer *dsp; struct dtx_partial_cmt_item *dpci; struct dtx_entry *dte; struct dtx_cleanup_cb_args dcca; + daos_unit_oid_t oid; d_list_t cmt_list; d_list_t abt_list; d_list_t act_list; @@ -366,9 +373,28 @@ dtx_cleanup(void *arg) dte = &dpci->dpci_dte; if (dte->dte_mbs == NULL) - rc = vos_dtx_load_mbs(cont->sc_hdl, &dte->dte_xid, &dte->dte_mbs); - if (dte->dte_mbs != NULL) - rc = dtx_commit(cont, &dte, NULL, 1); + rc = vos_dtx_load_mbs(cont->sc_hdl, &dte->dte_xid, &oid, &dte->dte_mbs); + if (dte->dte_mbs != NULL) { + if (dte->dte_mbs->dm_flags & DMF_CONTAIN_TARGET_GRP) { + d_rank_list_t *ranks = NULL; + uint8_t *hints = NULL; + uint8_t *bitmap = NULL; + uint32_t hint_sz = 0; + uint32_t bitmap_sz = 0; + + rc = dtx_coll_prep(cont->sc_pool_uuid, oid, dte->dte_mbs, + dss_self_rank(), dmi->dmi_tgt_id, dte->dte_ver, + &hints, &hint_sz, &bitmap, &bitmap_sz, &ranks); + if (rc == 0) + rc = dtx_coll_commit(cont, &dte->dte_xid, ranks, hints, + hint_sz, bitmap, bitmap_sz, dte->dte_ver); + d_rank_list_free(ranks); + D_FREE(hints); + D_FREE(bitmap); + } else { + rc = dtx_commit(cont, &dte, NULL, 1); + } + } D_DEBUG(DB_IO, "Cleanup partial committed DTX "DF_DTI", left %d: %d\n", DP_DTI(&dte->dte_xid), dcca.dcca_pc_count, rc); @@ -846,11 +872,9 @@ dtx_handle_reinit(struct dtx_handle *dth) */ static int dtx_handle_init(struct dtx_id *dti, daos_handle_t coh, struct dtx_epoch *epoch, - uint16_t sub_modification_cnt, uint32_t pm_ver, - daos_unit_oid_t *leader_oid, struct dtx_id *dti_cos, - int dti_cos_cnt, struct dtx_memberships *mbs, bool leader, - bool solo, bool sync, bool dist, bool migration, bool ignore_uncommitted, - bool resent, bool prepared, bool drop_cmt, struct dtx_handle *dth) + bool leader, uint16_t sub_modification_cnt, uint32_t pm_ver, + daos_unit_oid_t *leader_oid, struct dtx_id *dti_cos, int dti_cos_cnt, + uint32_t flags, struct dtx_memberships *mbs, struct dtx_handle *dth) { if (sub_modification_cnt > DTX_SUB_MOD_MAX) { D_ERROR("Too many modifications in a single transaction:" @@ -871,17 +895,16 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t coh, struct dtx_epoch *epoch, dth->dth_pinned = 0; dth->dth_cos_done = 0; - dth->dth_resent = resent ? 1 : 0; - dth->dth_solo = solo ? 1 : 0; - dth->dth_drop_cmt = drop_cmt ? 1 : 0; dth->dth_modify_shared = 0; dth->dth_active = 0; dth->dth_touched_leader_oid = 0; dth->dth_local_tx_started = 0; - dth->dth_dist = dist ? 1 : 0; - dth->dth_for_migration = migration ? 1 : 0; - dth->dth_ignore_uncommitted = ignore_uncommitted ? 1 : 0; - dth->dth_prepared = prepared ? 1 : 0; + dth->dth_solo = (flags & DTX_SOLO) ? 1 : 0; + dth->dth_drop_cmt = (flags & DTX_DROP_CMT) ? 1 : 0; + dth->dth_dist = (flags & DTX_DIST) ? 
1 : 0; + dth->dth_for_migration = (flags & DTX_FOR_MIGRATION) ? 1 : 0; + dth->dth_ignore_uncommitted = (flags & DTX_IGNORE_UNCOMMITTED) ? 1 : 0; + dth->dth_prepared = (flags & DTX_PREPARED) ? 1 : 0; dth->dth_aborted = 0; dth->dth_already = 0; dth->dth_need_validation = 0; @@ -891,7 +914,7 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t coh, struct dtx_epoch *epoch, dth->dth_ent = NULL; dth->dth_flags = leader ? DTE_LEADER : 0; - if (sync) { + if (flags & DTX_SYNC) { dth->dth_flags |= DTE_BLOCK; dth->dth_sync = 1; } else { @@ -1097,58 +1120,84 @@ dtx_sub_init(struct dtx_handle *dth, daos_unit_oid_t *oid, uint64_t dkey_hash) * \param leader_oid [IN] The object ID is used to elect the DTX leader. * \param dti_cos [IN] The DTX array to be committed because of shared. * \param dti_cos_cnt [IN] The @dti_cos array size. + * \param hints [IN] VOS targets hint for collective modification. + * \param hint_sz [IN] The size of hints array. + * \param bitmap [IN] Bitmap for collective modification on local VOS targets. + * \param bitmap_sz [IN] The size of bitmap for local VOS targets. * \param tgts [IN] targets for distribute transaction. * \param tgt_cnt [IN] number of targets (not count the leader itself). * \param flags [IN] See dtx_flags. + * \param ranks [IN] Ranks list for collective modification. * \param mbs [IN] DTX participants information. * \param p_dlh [OUT] Pointer to the DTX handle. * * \return Zero on success, negative value if error. */ int -dtx_leader_begin(daos_handle_t coh, struct dtx_id *dti, - struct dtx_epoch *epoch, uint16_t sub_modification_cnt, - uint32_t pm_ver, daos_unit_oid_t *leader_oid, - struct dtx_id *dti_cos, int dti_cos_cnt, - struct daos_shard_tgt *tgts, int tgt_cnt, uint32_t flags, - struct dtx_memberships *mbs, struct dtx_leader_handle **p_dlh) +dtx_leader_begin(daos_handle_t coh, struct dtx_id *dti, struct dtx_epoch *epoch, + uint16_t sub_modification_cnt, uint32_t pm_ver, daos_unit_oid_t *leader_oid, + struct dtx_id *dti_cos, int dti_cos_cnt, uint8_t *hints, uint32_t hint_sz, + uint8_t *bitmap, uint32_t bitmap_sz, struct daos_shard_tgt *tgts, int tgt_cnt, + uint32_t flags, d_rank_list_t *ranks, struct dtx_memberships *mbs, + struct dtx_leader_handle **p_dlh) { struct dtx_leader_handle *dlh; struct dtx_tls *tls = dtx_tls_get(); struct dtx_handle *dth; + int cnt; int rc; int i; - D_ALLOC(dlh, sizeof(*dlh) + sizeof(struct dtx_sub_status) * tgt_cnt); + if (flags & DTX_COLL) + /* For collective RPC, the leader just need at most one bcast request. */ + cnt = (ranks != NULL) ? 1 : 0; + else + cnt = tgt_cnt; + + D_ALLOC(dlh, sizeof(*dlh) + sizeof(struct dtx_sub_status) * cnt); if (dlh == NULL) return -DER_NOMEM; - if (tgt_cnt > 0) { - dlh->dlh_future = ABT_FUTURE_NULL; + dlh->dlh_future = ABT_FUTURE_NULL; + + if (cnt > 0) { dlh->dlh_subs = (struct dtx_sub_status *)(dlh + 1); - for (i = 0; i < tgt_cnt; i++) { - dlh->dlh_subs[i].dss_tgt = tgts[i]; - if (unlikely(tgts[i].st_flags & DTF_DELAY_FORWARD)) - dlh->dlh_delay_sub_cnt++; + + if (ranks != NULL) { + /* NOTE: do not support DTF_DELAY_FORWARD for collective DTX. 
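 + * The single collective bcast is tracked as one normal sub-request, with no delayed sub-requests.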
*/ + dlh->dlh_delay_sub_cnt = 0; + dlh->dlh_normal_sub_cnt = cnt; + } else { + for (i = 0; i < cnt; i++) { + dlh->dlh_subs[i].dss_tgt = tgts[i]; + if (unlikely(tgts[i].st_flags & DTF_DELAY_FORWARD)) + dlh->dlh_delay_sub_cnt++; + } + + dlh->dlh_normal_sub_cnt = cnt - dlh->dlh_delay_sub_cnt; } - dlh->dlh_normal_sub_cnt = tgt_cnt - dlh->dlh_delay_sub_cnt; } + if (flags & DTX_COLL) { + dlh->dlh_coll = 1; + dlh->dlh_coll_tree_width = dtx_coll_tree_width; + } + + dlh->dlh_coll_ranks = ranks; + dlh->dlh_coll_hints = hints; + dlh->dlh_coll_hint_sz = hint_sz; + dlh->dlh_coll_bitmap = bitmap; + dlh->dlh_coll_bitmap_sz = bitmap_sz; + dth = &dlh->dlh_handle; - rc = dtx_handle_init(dti, coh, epoch, sub_modification_cnt, pm_ver, - leader_oid, dti_cos, dti_cos_cnt, mbs, true, - (flags & DTX_SOLO) ? true : false, - (flags & DTX_SYNC) ? true : false, - (flags & DTX_DIST) ? true : false, - (flags & DTX_FOR_MIGRATION) ? true : false, false, - (flags & DTX_RESEND) ? true : false, - (flags & DTX_PREPARED) ? true : false, - (flags & DTX_DROP_CMT) ? true : false, dth); + rc = dtx_handle_init(dti, coh, epoch, true, sub_modification_cnt, pm_ver, + leader_oid, dti_cos, dti_cos_cnt, flags, mbs, dth); if (rc == 0 && sub_modification_cnt > 0) rc = vos_dtx_attach(dth, false, (flags & DTX_PREPARED) ? true : false); - D_DEBUG(DB_IO, "Start DTX "DF_DTI" sub modification %d, ver %u, leader " + D_DEBUG(DB_IO, "Start (%s) DTX "DF_DTI" sub modification %d, ver %u, leader " DF_UOID", dti_cos_cnt %d, tgt_cnt %d, flags %x: "DF_RC"\n", + (flags & DTX_COLL) ? "collective" : "regular", DP_DTI(dti), sub_modification_cnt, dth->dth_ver, DP_UOID(*leader_oid), dti_cos_cnt, tgt_cnt, flags, DP_RC(rc)); @@ -1301,6 +1350,10 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul if (DAOS_FAIL_CHECK(DAOS_DTX_MISS_COMMIT)) dth->dth_sync = 1; + /* Currently, we synchronously commit collective DTX. */ + if (dlh->dlh_coll) + dth->dth_sync = 1; + /* For synchronous DTX, do not add it into CoS cache, otherwise, * we may have no way to remove it from the cache. */ @@ -1361,11 +1414,21 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul * batched commit. */ vos_dtx_mark_committable(dth); - dte = &dth->dth_dte; - rc = dtx_commit(cont, &dte, NULL, 1); + + if (dlh->dlh_coll) { + rc = dtx_coll_commit(cont, &dth->dth_xid, dlh->dlh_coll_ranks, + dlh->dlh_coll_hints, dlh->dlh_coll_hint_sz, + dlh->dlh_coll_bitmap, dlh->dlh_coll_bitmap_sz, + dth->dth_ver); + } else { + dte = &dth->dth_dte; + rc = dtx_commit(cont, &dte, NULL, 1); + } + if (rc != 0) - D_WARN(DF_UUID": Fail to sync commit DTX "DF_DTI": "DF_RC"\n", - DP_UUID(cont->sc_uuid), DP_DTI(&dth->dth_xid), DP_RC(rc)); + D_WARN(DF_UUID": Fail to sync %s commit DTX "DF_DTI": "DF_RC"\n", + DP_UUID(cont->sc_uuid), dlh->dlh_coll ? "collective" : "regular", + DP_DTI(&dth->dth_xid), DP_RC(rc)); /* * NOTE: The semantics of 'sync' commit does not guarantee that all @@ -1390,7 +1453,13 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul * 2. Remove the pinned DTX entry. 
*/ vos_dtx_cleanup(dth, true); - dtx_abort(cont, &dth->dth_dte, dth->dth_epoch); + if (dlh->dlh_coll) + dtx_coll_abort(cont, &dth->dth_xid, dlh->dlh_coll_ranks, + dlh->dlh_coll_hints, dlh->dlh_coll_hint_sz, + dlh->dlh_coll_bitmap, dlh->dlh_coll_bitmap_sz, + dth->dth_ver, dth->dth_epoch); + else + dtx_abort(cont, &dth->dth_dte, dth->dth_epoch); aborted = true; } @@ -1472,13 +1541,8 @@ dtx_begin(daos_handle_t coh, struct dtx_id *dti, if (dth == NULL) return -DER_NOMEM; - rc = dtx_handle_init(dti, coh, epoch, sub_modification_cnt, - pm_ver, leader_oid, dti_cos, dti_cos_cnt, mbs, - false, false, false, - (flags & DTX_DIST) ? true : false, - (flags & DTX_FOR_MIGRATION) ? true : false, - (flags & DTX_IGNORE_UNCOMMITTED) ? true : false, - (flags & DTX_RESEND) ? true : false, false, false, dth); + rc = dtx_handle_init(dti, coh, epoch, false, sub_modification_cnt, pm_ver, + leader_oid, dti_cos, dti_cos_cnt, flags, mbs, dth); if (rc == 0 && sub_modification_cnt > 0) rc = vos_dtx_attach(dth, false, false); @@ -1938,8 +2002,12 @@ dtx_comp_cb(void **arg) sub->dss_result == dlh->dlh_allow_failure) continue; - /* Ignore DER_INPROGRESS if there is other failure. */ - if (dlh->dlh_result == 0 || dlh->dlh_result == -DER_INPROGRESS) + if (dlh->dlh_rmt_ver < sub->dss_version) + dlh->dlh_rmt_ver = sub->dss_version; + + /* Ignore DER_INPROGRESS and DER_AGAIN if there is other failure. */ + if (dlh->dlh_result == 0 || dlh->dlh_result == -DER_INPROGRESS || + dlh->dlh_result == -DER_AGAIN) dlh->dlh_result = sub->dss_result; } } @@ -2229,3 +2297,44 @@ dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid, return rc; } + +void +dtx_merge_check_result(int *tgt, int src) +{ + /* As long as one target has committed, then the DTX is committable on all targets. */ + if (*tgt != DTX_ST_COMMITTED && *tgt != DTX_ST_COMMITTABLE) { + switch (src) { + case DTX_ST_COMMITTED: + case DTX_ST_COMMITTABLE: + *tgt = src; + break; + case -DER_EXCLUDED: + /* + * If non-leader is excluded, handle it as 'prepared'. If other + * non-leaders are also 'prepared' then related DTX maybe still + * committable or 'corrupted'. The subsequent DTX resync logic + * will handle related things, see dtx_verify_groups(). + * + * Fall through. + */ + case DTX_ST_PREPARED: + if (*tgt == 0 || *tgt == DTX_ST_CORRUPTED) + *tgt = src; + break; + case DTX_ST_CORRUPTED: + if (*tgt == 0) + *tgt = src; + break; + default: + if (src >= 0) { + if (*tgt != -DER_NONEXIST) + *tgt = -DER_IO; + } else { + if (src == -DER_NONEXIST || *tgt >= 0 || + (*tgt != -DER_IO && *tgt != -DER_NONEXIST)) + *tgt = src; + } + break; + } + } +} diff --git a/src/dtx/dtx_internal.h b/src/dtx/dtx_internal.h index a38c747a61d..43233462fd0 100644 --- a/src/dtx/dtx_internal.h +++ b/src/dtx/dtx_internal.h @@ -22,16 +22,26 @@ * These are for daos_rpc::dr_opc and DAOS_RPC_OPCODE(opc, ...) rather than * crt_req_create(..., opc, ...). See src/include/daos/rpc.h. 
*/ -#define DAOS_DTX_VERSION 3 +#define DAOS_DTX_VERSION 4 /* LIST of internal RPCS in form of: * OPCODE, flags, FMT, handler, corpc_hdlr, */ -#define DTX_PROTO_SRV_RPC_LIST \ - X(DTX_COMMIT, 0, &CQF_dtx, dtx_handler, NULL, "dtx_commit") \ - X(DTX_ABORT, 0, &CQF_dtx, dtx_handler, NULL, "dtx_abort") \ - X(DTX_CHECK, 0, &CQF_dtx, dtx_handler, NULL, "dtx_check") \ - X(DTX_REFRESH, 0, &CQF_dtx, dtx_handler, NULL, "dtx_refresh") +#define DTX_PROTO_SRV_RPC_LIST \ + X(DTX_COMMIT, 0, &CQF_dtx, dtx_handler, \ + NULL, "dtx_commit") \ + X(DTX_ABORT, 0, &CQF_dtx, dtx_handler, \ + NULL, "dtx_abort") \ + X(DTX_CHECK, 0, &CQF_dtx, dtx_handler, \ + NULL, "dtx_check") \ + X(DTX_REFRESH, 0, &CQF_dtx, dtx_handler, \ + NULL, "dtx_refresh") \ + X(DTX_COLL_COMMIT, 0, &CQF_dtx_coll, dtx_coll_handler, \ + &dtx_coll_commit_co_ops, "dtx_coll_commit") \ + X(DTX_COLL_ABORT, 0, &CQF_dtx_coll, dtx_coll_handler, \ + &dtx_coll_abort_co_ops, "dtx_coll_abort") \ + X(DTX_COLL_CHECK, 0, &CQF_dtx_coll, dtx_coll_handler, \ + &dtx_coll_check_co_ops, "dtx_coll_check") #define X(a, b, c, d, e, f) a, enum dtx_operation { @@ -56,6 +66,27 @@ enum dtx_operation { CRT_RPC_DECLARE(dtx, DAOS_ISEQ_DTX, DAOS_OSEQ_DTX); +/* + * DTX collective RPC input fields + * dci_hints is sparse array, one per engine, sorted against the rank ID. + * It can hold more than 19K engines inline RPC body. + */ +#define DAOS_ISEQ_COLL_DTX \ + ((uuid_t) (dci_po_uuid) CRT_VAR) \ + ((uuid_t) (dci_co_uuid) CRT_VAR) \ + ((struct dtx_id) (dci_xid) CRT_VAR) \ + ((uint32_t) (dci_version) CRT_VAR) \ + ((uint32_t) (dci_padding) CRT_VAR) \ + ((uint64_t) (dci_epoch) CRT_VAR) \ + ((uint8_t) (dci_hints) CRT_ARRAY) + +/* DTX collective RPC output fields */ +#define DAOS_OSEQ_COLL_DTX \ + ((int32_t) (dco_status) CRT_VAR) \ + ((uint32_t) (dco_misc) CRT_VAR) + +CRT_RPC_DECLARE(dtx_coll, DAOS_ISEQ_COLL_DTX, DAOS_OSEQ_COLL_DTX); + #define DTX_YIELD_CYCLE (DTX_THRESHOLD_COUNT >> 3) /* The time threshold for triggering DTX cleanup of stale entries. @@ -131,6 +162,13 @@ extern uint32_t dtx_agg_thd_age_lo; /* The default count of DTX batched commit ULTs. */ #define DTX_BATCHED_ULT_DEF 32 +/* The bcast RPC tree width for collective transaction. */ +#define DTX_COLL_TREE_WIDTH_MAX 64 +#define DTX_COLL_TREE_WIDTH_DEF 16 +#define DTX_COLL_TREE_WIDTH_MIN 4 + +extern uint32_t dtx_coll_tree_width; + /* * Ideally, dedicated DXT batched commit ULT for each opened container is the most simple model. * But it may be burden for the engine if opened containers become more and more on the target. 
@@ -149,6 +187,19 @@ extern uint32_t dtx_batched_ult_max; */ #define DTX_INLINE_MBS_SIZE 512 +extern struct crt_corpc_ops dtx_coll_commit_co_ops; +extern struct crt_corpc_ops dtx_coll_abort_co_ops; +extern struct crt_corpc_ops dtx_coll_check_co_ops; + +struct dtx_coll_load_mbs_args { + struct dtx_coll_in *dclma_params; + struct dtx_memberships *dclma_mbs; + daos_unit_oid_t dclma_oid; + ABT_future dclma_future; + uint32_t dclma_opc; + int dclma_result; +}; + struct dtx_pool_metrics { struct d_tm_node_t *dpm_batched_degree; struct d_tm_node_t *dpm_batched_total; @@ -196,6 +247,7 @@ void dtx_batched_commit(void *arg); void dtx_aggregation_main(void *arg); int start_dtx_reindex_ult(struct ds_cont_child *cont); void stop_dtx_reindex_ult(struct ds_cont_child *cont); +void dtx_merge_check_result(int *tgt, int src); /* dtx_cos.c */ int dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, @@ -209,19 +261,27 @@ int dtx_del_cos(struct ds_cont_child *cont, struct dtx_id *xid, uint64_t dtx_cos_oldest(struct ds_cont_child *cont); /* dtx_rpc.c */ -int dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, - struct dtx_cos_key *dcks, int count); int dtx_check(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch); - +int dtx_coll_check(struct ds_cont_child *cont, struct dtx_id *xid, d_rank_list_t *ranks, + uint8_t *hints, uint32_t hint_sz, uint8_t *bitmap, uint32_t bitmap_sz, + uint32_t version, daos_epoch_t epoch); int dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *check_list, d_list_t *cmt_list, d_list_t *abt_list, d_list_t *act_list, bool for_io); -int dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, +int dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, daos_unit_oid_t oid, daos_epoch_t epoch, int *tgt_array, int *err); - int dtx_leader_get(struct ds_pool *pool, struct dtx_memberships *mbs, struct pool_target **p_tgt); +/* dtx_coll.c */ +void dtx_coll_load_mbs_ult(void *arg); +int dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_memberships *mbs, + d_rank_t my_rank, uint32_t my_tgtid, uint32_t version, + uint8_t **p_hints, uint32_t *hint_sz, uint8_t **p_bitmap, uint32_t *bitmap_sz, + d_rank_list_t **p_ranks); +int dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epoch_t epoch, + uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, int **p_results); + enum dtx_status_handle_result { DSHR_NEED_COMMIT = 1, DSHR_NEED_RETRY = 2, diff --git a/src/dtx/dtx_resync.c b/src/dtx/dtx_resync.c index 02f94319c6a..418b23f7205 100644 --- a/src/dtx/dtx_resync.c +++ b/src/dtx/dtx_resync.c @@ -261,28 +261,46 @@ dtx_verify_groups(struct ds_pool *pool, struct dtx_memberships *mbs, } int -dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, +dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, daos_unit_oid_t oid, daos_epoch_t epoch, int *tgt_array, int *err) { - int rc = 0; + struct dtx_memberships *mbs = dte->dte_mbs; + d_rank_list_t *ranks = NULL; + uint8_t *hints = NULL; + uint8_t *bitmap = NULL; + uint32_t hint_sz = 0; + uint32_t bitmap_sz = 0; + int rc = 0; + + if (mbs->dm_flags & DMF_CONTAIN_TARGET_GRP) { + rc = dtx_coll_prep(cont->sc_pool_uuid, oid, mbs, dss_self_rank(), + dss_get_module_info()->dmi_tgt_id, dte->dte_ver, + &hints, &hint_sz, &bitmap, &bitmap_sz, &ranks); + if (rc != 0) { + D_ERROR("Failed to prepare the bitmap (and hints) for collective DTX " + DF_DTI": "DF_RC"\n", DP_DTI(&dte->dte_xid), DP_RC(rc)); + goto 
out; + } - rc = dtx_check(cont, dte, epoch); + rc = dtx_coll_check(cont, &dte->dte_xid, ranks, hints, hint_sz, bitmap, bitmap_sz, + dte->dte_ver, epoch); + } else { + rc = dtx_check(cont, dte, epoch); + } switch (rc) { case DTX_ST_COMMITTED: case DTX_ST_COMMITTABLE: /* The DTX has been committed on some remote replica(s), * let's commit the DTX globally. */ - return DSHR_NEED_COMMIT; + D_GOTO(out, rc = DSHR_NEED_COMMIT); case -DER_INPROGRESS: case -DER_TIMEDOUT: D_WARN("Other participants not sure about whether the " "DTX "DF_DTI" is committed or not, need retry.\n", DP_DTI(&dte->dte_xid)); - return DSHR_NEED_RETRY; + D_GOTO(out, rc = DSHR_NEED_RETRY); case DTX_ST_PREPARED: { - struct dtx_memberships *mbs = dte->dte_mbs; - /* If the transaction across multiple redundancy groups, * need to check whether there are enough alive targets. */ @@ -293,7 +311,7 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, goto out; if (rc > 0) - return DSHR_NEED_COMMIT; + D_GOTO(out, rc = DSHR_NEED_COMMIT); /* XXX: For the distributed transaction that lose too * many particiants (the whole redundancy group), @@ -304,14 +322,18 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, * Then we mark the TX as corrupted via special * dtx_abort() with 0 @epoch. */ - rc = dtx_abort(cont, dte, 0); + if (mbs->dm_flags & DMF_CONTAIN_TARGET_GRP) + rc = dtx_coll_abort(cont, &dte->dte_xid, ranks, hints, hint_sz, + bitmap, bitmap_sz, dte->dte_ver, 0); + else + rc = dtx_abort(cont, dte, 0); if (rc < 0 && err != NULL) *err = rc; - return DSHR_CORRUPT; + D_GOTO(out, rc = DSHR_CORRUPT); } - return DSHR_NEED_COMMIT; + D_GOTO(out, rc = DSHR_NEED_COMMIT); } case -DER_NONEXIST: /* Someone (the DTX owner or batched commit ULT) may have @@ -345,7 +367,11 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, * some other DTX(s). To avoid complex rollback logic, let's * abort the DTXs one by one, not batched. 
*/ - rc = dtx_abort(cont, dte, epoch); + if (mbs->dm_flags & DMF_CONTAIN_TARGET_GRP) + rc = dtx_coll_abort(cont, &dte->dte_xid, ranks, hints, hint_sz, bitmap, + bitmap_sz, dte->dte_ver, epoch); + else + rc = dtx_abort(cont, dte, epoch); D_DEBUG(DB_TRACE, "As new leader for DTX "DF_DTI", abort it (2): "DF_RC"\n", DP_DTI(&dte->dte_xid), DP_RC(rc)); @@ -354,10 +380,10 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, if (err != NULL) *err = rc; - return DSHR_ABORT_FAILED; + D_GOTO(out, rc = DSHR_ABORT_FAILED); } - return DSHR_IGNORE; + D_GOTO(out, rc = DSHR_IGNORE); default: D_WARN("Not sure about whether the DTX "DF_DTI " can be committed or not: %d, skip it.\n", @@ -368,6 +394,13 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, } out: + if (rc == DSHR_NEED_COMMIT && mbs->dm_flags & DMF_CONTAIN_TARGET_GRP) + rc = dtx_coll_commit(cont, &dte->dte_xid, ranks, hints, hint_sz, bitmap, bitmap_sz, + dte->dte_ver); + + d_rank_list_free(ranks); + D_FREE(hints); + D_FREE(bitmap); return rc; } @@ -412,9 +445,10 @@ dtx_status_handle(struct dtx_resync_args *dra) } if (dre->dre_dte.dte_mbs == NULL) { - rc = vos_dtx_load_mbs(cont->sc_hdl, &dre->dre_xid, &dre->dre_dte.dte_mbs); + rc = vos_dtx_load_mbs(cont->sc_hdl, &dre->dre_xid, NULL, + &dre->dre_dte.dte_mbs); if (rc != 0) { - if (rc != -DER_NONEXIST) + if (rc < 0 && rc != -DER_NONEXIST) D_WARN("Failed to load mbs, do not know the leader for DTX " DF_DTI" (ver = %u/%u/%u): rc = %d, skip it.\n", DP_DTI(&dre->dre_xid), dra->resync_version, @@ -446,7 +480,7 @@ dtx_status_handle(struct dtx_resync_args *dra) continue; } - rc = dtx_status_handle_one(cont, &dre->dre_dte, dre->dre_epoch, + rc = dtx_status_handle_one(cont, &dre->dre_dte, dre->dre_oid, dre->dre_epoch, tgt_array, &err); switch (rc) { case DSHR_NEED_COMMIT: diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index 5c4c44c9035..efc4b75f0e3 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -20,6 +20,7 @@ #include "dtx_internal.h" CRT_RPC_DEFINE(dtx, DAOS_ISEQ_DTX, DAOS_OSEQ_DTX); +CRT_RPC_DEFINE(dtx_coll, DAOS_ISEQ_COLL_DTX, DAOS_OSEQ_COLL_DTX); #define X(a, b, c, d, e, f) \ { \ @@ -206,18 +207,16 @@ dtx_req_cb(const struct crt_cb_info *cb_info) } out: + D_DEBUG(DB_TRACE, "DTX req for opc %x (req %p future %p) got reply from %d/%d: " + "epoch :"DF_X64", result %d\n", dra->dra_opc, req, dra->dra_future, + drr->drr_rank, drr->drr_tag, din != NULL ? din->di_epoch : 0, rc); + drr->drr_comp = 1; drr->drr_result = rc; rc = ABT_future_set(dra->dra_future, drr); D_ASSERTF(rc == ABT_SUCCESS, "ABT_future_set failed for opc %x to %d/%d: rc = %d.\n", dra->dra_opc, drr->drr_rank, drr->drr_tag, rc); - - D_DEBUG(DB_TRACE, - "DTX req for opc %x (req %p future %p) got reply from %d/%d: " - "epoch :"DF_X64", rc %d.\n", dra->dra_opc, req, - dra->dra_future, drr->drr_rank, drr->drr_tag, - din != NULL ? din->di_epoch : 0, drr->drr_result); } static int @@ -291,41 +290,7 @@ dtx_req_list_cb(void **args) if (dra->dra_opc == DTX_CHECK) { for (i = 0; i < dra->dra_length; i++) { drr = args[i]; - switch (drr->drr_result) { - case DTX_ST_COMMITTED: - case DTX_ST_COMMITTABLE: - dra->dra_result = DTX_ST_COMMITTED; - /* As long as one target has committed the DTX, - * then the DTX is committable on all targets. - */ - D_DEBUG(DB_TRACE, - "The DTX "DF_DTI" has been committed on %d/%d.\n", - DP_DTI(&drr->drr_dti[0]), drr->drr_rank, drr->drr_tag); - return; - case -DER_EXCLUDED: - /* - * If non-leader is excluded, handle it as 'prepared'. 
If other - * non-leaders are also 'prepared' then related DTX maybe still - * committable or 'corrupted'. The subsequent DTX resync logic - * will handle related things, see dtx_verify_groups(). - * - * Fall through. - */ - case DTX_ST_PREPARED: - if (dra->dra_result == 0 || - dra->dra_result == DTX_ST_CORRUPTED) - dra->dra_result = DTX_ST_PREPARED; - break; - case DTX_ST_CORRUPTED: - if (dra->dra_result == 0) - dra->dra_result = drr->drr_result; - break; - default: - dra->dra_result = drr->drr_result >= 0 ? - -DER_IO : drr->drr_result; - break; - } - + dtx_merge_check_result(&dra->dra_result, drr->drr_result); D_DEBUG(DB_TRACE, "The DTX "DF_DTI" RPC req result %d, status is %d.\n", DP_DTI(&drr->drr_dti[0]), drr->drr_result, dra->dra_result); } @@ -608,7 +573,7 @@ dtx_rpc_internal(struct dtx_common_args *dca) int rc; int i; - if (dca->dca_dra.dra_opc != DTX_REFRESH) { + if (dca->dca_dtes != NULL) { D_ASSERT(dca->dca_dtis != NULL); if (dca->dca_count > 1) { @@ -778,7 +743,7 @@ dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, * Some RPC may has been sent, so need to wait even if dtx_rpc_prep hit failure. */ rc = dtx_rpc_post(&dca, rc, false); - if (rc > 0 || rc == -DER_NONEXIST || rc == -DER_EXCLUDED) + if (rc > 0 || rc == -DER_NONEXIST || rc == -DER_EXCLUDED || rc == -DER_OOG) rc = 0; if (rc != 0) { @@ -833,7 +798,7 @@ dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, DP_DTI(&dtes[0]->dte_xid), count, dra->dra_committed > 0 ? "partial" : "nothing", rc, rc1); else - D_DEBUG(DB_IO, "Commit DTXs " DF_DTI", count %d\n", + D_DEBUG(DB_TRACE, "Commit DTXs " DF_DTI", count %d\n", DP_DTI(&dtes[0]->dte_xid), count); return rc != 0 ? rc : rc1; @@ -870,7 +835,7 @@ dtx_abort(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch) if (rc1 > 0 || rc1 == -DER_NONEXIST) rc1 = 0; - D_CDEBUG(rc1 != 0 || rc2 != 0, DLOG_ERR, DB_IO, "Abort DTX "DF_DTI": rc %d %d %d\n", + D_CDEBUG(rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE, "Abort DTX "DF_DTI": rc %d %d %d\n", DP_DTI(&dte->dte_xid), rc, rc1, rc2); return rc1 != 0 ? 
rc1 : rc2; @@ -893,8 +858,8 @@ dtx_check(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch) rc1 = dtx_rpc_post(&dca, rc, false); - D_CDEBUG(rc1 < 0, DLOG_ERR, DB_IO, "Check DTX "DF_DTI": rc %d %d\n", - DP_DTI(&dte->dte_xid), rc, rc1); + D_CDEBUG(rc1 < 0 && rc1 != -DER_NONEXIST, DLOG_ERR, DB_TRACE, + "Check DTX "DF_DTI": rc %d %d\n", DP_DTI(&dte->dte_xid), rc, rc1); return rc1; } @@ -929,9 +894,9 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che drop = false; if (dsp->dsp_mbs == NULL) { - rc = vos_dtx_load_mbs(cont->sc_hdl, &dsp->dsp_xid, &dsp->dsp_mbs); + rc = vos_dtx_load_mbs(cont->sc_hdl, &dsp->dsp_xid, NULL, &dsp->dsp_mbs); if (rc != 0) { - if (rc != -DER_NONEXIST && for_io) + if (rc < 0 && rc != -DER_NONEXIST && for_io) goto out; drop = true; @@ -1166,8 +1131,7 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che dte.dte_refs = 1; dte.dte_mbs = dsp->dsp_mbs; - rc = dtx_status_handle_one(cont, &dte, dsp->dsp_epoch, - NULL, NULL); + rc = dtx_status_handle_one(cont, &dte, dsp->dsp_oid, dsp->dsp_epoch, NULL, NULL); switch (rc) { case DSHR_NEED_COMMIT: { struct dtx_entry *pdte = &dte; @@ -1187,6 +1151,7 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che if (for_io) D_GOTO(out, rc = -DER_INPROGRESS); continue; + case 0: case DSHR_IGNORE: dtx_dsp_free(dsp); continue; @@ -1297,3 +1262,363 @@ dtx_refresh(struct dtx_handle *dth, struct ds_cont_child *cont) return rc; } + +static int +dtx_coll_commit_aggregator(crt_rpc_t *source, crt_rpc_t *target, void *priv) +{ + struct dtx_coll_out *out_source = crt_reply_get(source); + struct dtx_coll_out *out_target = crt_reply_get(target); + + out_target->dco_misc += out_source->dco_misc; + if (out_target->dco_status == 0) + out_target->dco_status = out_source->dco_status; + + return 0; +} + +static int +dtx_coll_abort_aggregator(crt_rpc_t *source, crt_rpc_t *target, void *priv) +{ + struct dtx_coll_out *out_source = crt_reply_get(source); + struct dtx_coll_out *out_target = crt_reply_get(target); + + if (out_source->dco_status != 0 && + (out_target->dco_status == 0 || out_target->dco_status == -DER_NONEXIST)) + out_target->dco_status = out_source->dco_status; + + return 0; +} + +static int +dtx_coll_check_aggregator(crt_rpc_t *source, crt_rpc_t *target, void *priv) +{ + struct dtx_coll_out *out_source = crt_reply_get(source); + struct dtx_coll_out *out_target = crt_reply_get(target); + + dtx_merge_check_result(&out_target->dco_status, out_source->dco_status); + + return 0; +} + +struct crt_corpc_ops dtx_coll_commit_co_ops = { + .co_aggregate = dtx_coll_commit_aggregator, + .co_pre_forward = NULL, + .co_post_reply = NULL, +}; + +struct crt_corpc_ops dtx_coll_abort_co_ops = { + .co_aggregate = dtx_coll_abort_aggregator, + .co_pre_forward = NULL, + .co_post_reply = NULL, +}; + +struct crt_corpc_ops dtx_coll_check_co_ops = { + .co_aggregate = dtx_coll_check_aggregator, + .co_pre_forward = NULL, + .co_post_reply = NULL, +}; + +struct dtx_coll_rpc_args { + struct ds_cont_child *dcra_cont; + struct dtx_id dcra_xid; + uint32_t dcra_opc; + uint32_t dcra_ver; + daos_epoch_t dcra_epoch; + d_rank_list_t *dcra_ranks; + uint8_t *dcra_hints; + uint32_t dcra_hint_sz; + uint32_t dcra_committed; + uint32_t dcra_completed:1; + int dcra_result; + ABT_thread dcra_helper; + ABT_future dcra_future; +}; + +static void +dtx_coll_rpc_cb(const struct crt_cb_info *cb_info) +{ + struct dtx_coll_rpc_args *dcra = cb_info->cci_arg; + crt_rpc_t *req = cb_info->cci_rpc; + 
struct dtx_coll_out *dco; + int rc = cb_info->cci_rc; + + if (rc != 0) { + dcra->dcra_result = rc; + } else { + dco = crt_reply_get(req); + dcra->dcra_result = dco->dco_status; + dcra->dcra_committed = dco->dco_misc; + } + + dcra->dcra_completed = 1; + rc = ABT_future_set(dcra->dcra_future, NULL); + D_ASSERTF(rc == ABT_SUCCESS, + "ABT_future_set failed for opc %u: rc = %d\n", dcra->dcra_opc, rc); +} + +static int +dtx_coll_rpc(struct dtx_coll_rpc_args *dcra) +{ + crt_rpc_t *req = NULL; + struct dtx_coll_in *dci; + int rc; + + rc = ABT_future_create(1, NULL, &dcra->dcra_future); + if (rc != ABT_SUCCESS) { + D_ERROR("ABT_future_create failed for coll DTX ("DF_DTI") RPC %u: rc = %d\n", + DP_DTI(&dcra->dcra_xid), dcra->dcra_opc, rc); + D_GOTO(out, rc = dss_abterr2der(rc)); + } + + rc = crt_corpc_req_create(dss_get_module_info()->dmi_ctx, NULL, dcra->dcra_ranks, + DAOS_RPC_OPCODE(dcra->dcra_opc, DAOS_DTX_MODULE, + DAOS_DTX_VERSION), + NULL, NULL, CRT_RPC_FLAG_FILTER_INVERT, + crt_tree_topo(CRT_TREE_KNOMIAL, dtx_coll_tree_width), &req); + if (rc != 0) { + D_ERROR("crt_corpc_req_create failed for coll DTX ("DF_DTI") RPC %u: "DF_RC"\n", + DP_DTI(&dcra->dcra_xid), dcra->dcra_opc, DP_RC(rc)); + D_GOTO(out, rc); + } + + dci = crt_req_get(req); + + uuid_copy(dci->dci_po_uuid, dcra->dcra_cont->sc_pool->spc_pool->sp_uuid); + uuid_copy(dci->dci_co_uuid, dcra->dcra_cont->sc_uuid); + dci->dci_xid = dcra->dcra_xid; + dci->dci_version = dcra->dcra_ver; + dci->dci_epoch = dcra->dcra_epoch; + dci->dci_hints.ca_count = dcra->dcra_hint_sz; + dci->dci_hints.ca_arrays = dcra->dcra_hints; + + rc = crt_req_send(req, dtx_coll_rpc_cb, dcra); + if (rc != 0) + D_ERROR("crt_req_send failed for coll DTX ("DF_DTI") RPC %u: "DF_RC"\n", + DP_DTI(&dcra->dcra_xid), dcra->dcra_opc, DP_RC(rc)); + +out: + if (rc != 0 && !dcra->dcra_completed) { + dcra->dcra_result = rc; + dcra->dcra_completed = 1; + if (dcra->dcra_future != ABT_FUTURE_NULL) + ABT_future_set(dcra->dcra_future, NULL); + } + + return rc; +} + +static void +dtx_coll_rpc_helper(void *arg) +{ + struct dtx_coll_rpc_args *dcra = arg; + int rc; + + rc = dtx_coll_rpc(dcra); + + D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, + "Collective DTX helper ULT for %u exit: %d\n", dcra->dcra_opc, rc); +} + +static int +dtx_coll_rpc_prep(struct ds_cont_child *cont, struct dtx_id *xid, uint32_t opc, uint32_t version, + daos_epoch_t epoch, uint8_t *hints, uint32_t hint_sz, + d_rank_list_t *ranks, struct dtx_coll_rpc_args *dcra) +{ + int rc; + + dcra->dcra_cont = cont; + dcra->dcra_xid = *xid; + dcra->dcra_opc = opc; + dcra->dcra_ver = version; + dcra->dcra_epoch = epoch; + dcra->dcra_ranks = ranks; + dcra->dcra_hints = hints; + dcra->dcra_hint_sz = hint_sz; + dcra->dcra_future = ABT_FUTURE_NULL; + dcra->dcra_helper = ABT_THREAD_NULL; + + if (dss_has_enough_helper()) + rc = dss_ult_create(dtx_coll_rpc_helper, dcra, DSS_XS_IOFW, + dss_get_module_info()->dmi_tgt_id, 0, &dcra->dcra_helper); + else + rc = dtx_coll_rpc(dcra); + + return rc; +} + +static int +dtx_coll_rpc_post(struct dtx_coll_rpc_args *dcra, int ret) +{ + int rc; + + if (dcra->dcra_helper != ABT_THREAD_NULL) + ABT_thread_free(&dcra->dcra_helper); + + if (dcra->dcra_future != ABT_FUTURE_NULL) { + rc = ABT_future_wait(dcra->dcra_future); + D_CDEBUG(rc != ABT_SUCCESS, DLOG_ERR, DB_TRACE, + "Collective DTX wait req for opc %u, future %p done, rc %d, result %d\n", + dcra->dcra_opc, dcra->dcra_future, rc, dcra->dcra_result); + ABT_future_free(&dcra->dcra_future); + } + + return ret != 0 ? 
ret : dcra->dcra_result; +} + +int +dtx_coll_commit(struct ds_cont_child *cont, struct dtx_id *xid, d_rank_list_t *ranks, + uint8_t *hints, uint32_t hint_sz, uint8_t *bitmap, uint32_t bitmap_sz, + uint32_t version) +{ + struct dtx_coll_rpc_args dcra = { 0 }; + int *results = NULL; + uint32_t committed = 0; + int len; + int rc = 0; + int rc1 = 0; + int rc2 = 0; + int i; + + if (ranks != NULL) + rc = dtx_coll_rpc_prep(cont, xid, DTX_COLL_COMMIT, version, 0, hints, hint_sz, + ranks, &dcra); + + if (bitmap != NULL) { + len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, xid, 0, + DTX_COLL_COMMIT, bitmap_sz, bitmap, &results); + if (len < 0) { + rc1 = len; + } else { + D_ASSERT(results != NULL); + for (i = 0; i < len; i++) { + if (results[i] > 0) + committed += results[i]; + else if (results[i] < 0 && results[i] != -DER_NONEXIST && rc1 == 0) + rc1 = results[i]; + } + } + D_FREE(results); + } + + if (ranks != NULL) { + rc = dtx_coll_rpc_post(&dcra, rc); + if (rc > 0 || rc == -DER_NONEXIST || rc == -DER_EXCLUDED || rc == -DER_OOG) + rc = 0; + + committed += dcra.dcra_committed; + } + + if (rc == 0 && rc1 == 0) + rc2 = vos_dtx_commit(cont->sc_hdl, xid, 1, NULL); + else if (committed > 0) + /* Mark the DTX as "PARTIAL_COMMITTED" and re-commit it later. */ + rc2 = vos_dtx_set_flags(cont->sc_hdl, xid, 1, DTE_PARTIAL_COMMITTED); + if (rc2 > 0 || rc2 == -DER_NONEXIST) + rc2 = 0; + + D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE, + "Collectively commit DTX "DF_DTI": %d/%d/%d\n", DP_DTI(xid), rc, rc1, rc2); + + return rc != 0 ? rc : rc1 != 0 ? rc1 : rc2; +} + +int +dtx_coll_abort(struct ds_cont_child *cont, struct dtx_id *xid, d_rank_list_t *ranks, + uint8_t *hints, uint32_t hint_sz, uint8_t *bitmap, uint32_t bitmap_sz, + uint32_t version, daos_epoch_t epoch) +{ + struct dtx_coll_rpc_args dcra = { 0 }; + int *results = NULL; + int len; + int rc = 0; + int rc1 = 0; + int rc2 = 0; + int i; + + if (ranks != NULL) + rc = dtx_coll_rpc_prep(cont, xid, DTX_COLL_ABORT, version, epoch, hints, hint_sz, + ranks, &dcra); + + if (bitmap != NULL) { + len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, xid, epoch, + DTX_COLL_ABORT, bitmap_sz, bitmap, &results); + if (len < 0) { + rc1 = len; + } else { + D_ASSERT(results != NULL); + for (i = 0; i < len; i++) { + if (results[i] < 0 && results[i] != -DER_NONEXIST && rc1 == 0) + rc1 = results[i]; + } + } + D_FREE(results); + } + + if (ranks != NULL) { + rc = dtx_coll_rpc_post(&dcra, rc); + if (rc > 0 || rc == -DER_NONEXIST || rc == -DER_EXCLUDED || rc == -DER_OOG) + rc = 0; + } + + if (epoch != 0) + rc2 = vos_dtx_abort(cont->sc_hdl, xid, epoch); + else + rc2 = vos_dtx_set_flags(cont->sc_hdl, xid, 1, DTE_CORRUPTED); + if (rc2 > 0 || rc2 == -DER_NONEXIST) + rc2 = 0; + + D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE, + "Collectively abort DTX "DF_DTI": %d/%d/%d\n", DP_DTI(xid), rc, rc1, rc2); + + return rc != 0 ? rc : rc1 != 0 ? rc1 : rc2; +} + +int +dtx_coll_check(struct ds_cont_child *cont, struct dtx_id *xid, d_rank_list_t *ranks, + uint8_t *hints, uint32_t hint_sz, uint8_t *bitmap, uint32_t bitmap_sz, + uint32_t version, daos_epoch_t epoch) +{ + struct dtx_coll_rpc_args dcra = { 0 }; + int *results = NULL; + int len; + int rc = 0; + int rc1 = 0; + int i; + + /* + * If no other target, then current target is the unique + * one and 'prepared', then related DTX can be committed. 
+ */ + if (unlikely(ranks == NULL && bitmap == NULL)) + return DTX_ST_PREPARED; + + if (ranks != NULL) + rc = dtx_coll_rpc_prep(cont, xid, DTX_COLL_CHECK, version, epoch, hints, hint_sz, + ranks, &dcra); + + if (bitmap != NULL) { + len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, xid, epoch, + DTX_COLL_CHECK, bitmap_sz, bitmap, &results); + if (len < 0) { + rc1 = len; + } else { + D_ASSERT(results != NULL); + for (i = 0; i < len; i++) { + if (isset(bitmap, i)) + dtx_merge_check_result(&rc1, results[i]); + } + } + D_FREE(results); + } + + if (ranks != NULL) { + rc = dtx_coll_rpc_post(&dcra, rc); + if (bitmap != NULL) + dtx_merge_check_result(&rc, rc1); + } + + D_CDEBUG((rc < 0 && rc != -DER_NONEXIST) || (rc1 < 0 && rc1 != -DER_NONEXIST), DLOG_ERR, + DB_TRACE, "Collectively check DTX "DF_DTI": %d/%d/\n", DP_DTI(xid), rc, rc1); + + return ranks != NULL ? rc : rc1; +} diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c index 9ea25a9dcd0..086b4b566c3 100644 --- a/src/dtx/dtx_srv.c +++ b/src/dtx/dtx_srv.c @@ -247,7 +247,7 @@ dtx_handler(crt_rpc_t *rpc) rc1 = start_dtx_reindex_ult(cont); if (rc1 != 0) D_ERROR(DF_UUID": Failed to trigger DTX reindex: "DF_RC"\n", - DP_UUID(cont->sc_uuid), DP_RC(rc)); + DP_UUID(cont->sc_uuid), DP_RC(rc1)); } break; @@ -341,9 +341,14 @@ dtx_handler(crt_rpc_t *rpc) if (mbs[i] == NULL) continue; + /* For collective DTX, it will be synchronously committed soon. */ + if (mbs[i]->dm_flags & DMF_CONTAIN_TARGET_GRP) { + D_FREE(mbs[i]); + continue; + } + daos_dti_copy(&dtes[j].dte_xid, - (struct dtx_id *) - din->di_dtx_array.ca_arrays + i); + (struct dtx_id *)din->di_dtx_array.ca_arrays + i); dtes[j].dte_ver = vers[i]; dtes[j].dte_refs = 1; dtes[j].dte_mbs = mbs[i]; @@ -353,19 +358,19 @@ dtx_handler(crt_rpc_t *rpc) j++; } - D_ASSERT(j == rc1); + if (j > 0) { + /* + * Commit the DTX after replied the original refresh request to + * avoid further query the same DTX. + */ + rc = dtx_commit(cont, pdte, dcks, j); + if (rc < 0) + D_WARN("Failed to commit DTX "DF_DTI", count %d: " + DF_RC"\n", DP_DTI(&dtes[0].dte_xid), j, DP_RC(rc)); - /* Commit the DTX after replied the original refresh request to - * avoid further query the same DTX. 
- */ - rc = dtx_commit(cont, pdte, dcks, j); - if (rc < 0) - D_WARN("Failed to commit DTX "DF_DTI", count %d: " - DF_RC"\n", DP_DTI(&dtes[0].dte_xid), j, - DP_RC(rc)); - - for (i = 0; i < j; i++) - D_FREE(pdte[i]->dte_mbs); + for (i = 0; i < j; i++) + D_FREE(pdte[i]->dte_mbs); + } } D_FREE(dout->do_sub_rets.ca_arrays); @@ -375,11 +380,154 @@ dtx_handler(crt_rpc_t *rpc) ds_cont_child_put(cont); } +static void +dtx_coll_handler(crt_rpc_t *rpc) +{ + struct dtx_coll_in *dci = crt_req_get(rpc); + struct dtx_coll_out *dco = crt_reply_get(rpc); + struct dtx_coll_load_mbs_args dclma = { 0 }; + d_rank_t myrank = dss_self_rank(); + uint32_t bitmap_sz = 0; + uint32_t opc = opc_get(rpc->cr_opc); + uint8_t *hints = dci->dci_hints.ca_arrays; + uint8_t *bitmap = NULL; + int *results = NULL; + bool force_check = false; + int len; + int rc; + int i; + + D_DEBUG(DB_TRACE, "Handling collective DTX PRC %u on rank %d for "DF_DTI"\n", + opc, myrank, DP_DTI(&dci->dci_xid)); + + D_ASSERT(hints != NULL); + D_ASSERT(dci->dci_hints.ca_count > myrank); + + dclma.dclma_params = dci; + dclma.dclma_opc = opc; + rc = ABT_future_create(1, NULL, &dclma.dclma_future); + if (rc != ABT_SUCCESS) { + D_ERROR("ABT_future_create failed: rc = %d\n", rc); + D_GOTO(out, rc = dss_abterr2der(rc)); + } + + rc = dss_ult_create(dtx_coll_load_mbs_ult, &dclma, DSS_XS_VOS, hints[myrank], 0, NULL); + if (rc != 0) { + ABT_future_free(&dclma.dclma_future); + D_ERROR("Failed to create ult on XS %u: "DF_RC"\n", hints[myrank], DP_RC(rc)); + goto out; + } + + rc = ABT_future_wait(dclma.dclma_future); + D_ASSERT(rc == ABT_SUCCESS); + + ABT_future_free(&dclma.dclma_future); + + switch (dclma.dclma_result) { + case 0: + rc = dtx_coll_prep(dci->dci_po_uuid, dclma.dclma_oid, dclma.dclma_mbs, myrank, -1, + dci->dci_version, NULL /* p_hints */, NULL /* hint_sz */, + &bitmap, &bitmap_sz, NULL /* p_ranks */); + if (rc != 0) { + D_ERROR("Failed to prepare the bitmap (and hints) for collective DTX " + DF_DTI" opc %u: "DF_RC"\n", DP_DTI(&dci->dci_xid), opc, DP_RC(rc)); + goto out; + } + break; + case 1: + /* The DTX has been committed, then depends on the RPC type. */ + if (opc == DTX_COLL_ABORT) { + D_ERROR("NOT allow to abort committed DTX "DF_DTI"\n", + DP_DTI(&dci->dci_xid)); + D_GOTO(out, rc = -DER_NO_PERM); + } + + if (opc == DTX_COLL_CHECK) + D_GOTO(out, rc = DTX_ST_COMMITTED); + + D_ASSERT(opc == DTX_COLL_COMMIT); + /* + * We do not know whether the DTX on the other VOS targets has been committed + * or not, let's continue the commit on the other local VOS targets by force. + */ + break; + case -DER_INPROGRESS: + /* Fall through. */ + case -DER_NONEXIST: + /* The shard on the hint VOS target may not exist, then depends on the RPC type. */ + if (opc == DTX_COLL_CHECK) + force_check = true; + + /* + * It is unknown whether the DTX on the other VOS targets has been committed/aborted + * or not, let's continue related operation on the other local VOS targets by force. 
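 + * That is harmless, because the result handling below tolerates -DER_NONEXIST from VOS targets that do not hold the DTX entry.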
+ */ + break; + default: + D_ASSERTF(dclma.dclma_result < 0, "Unexpected result when load MBS for DTX " + DF_DTI": "DF_RC"\n", DP_DTI(&dci->dci_xid), DP_RC(dclma.dclma_result)); + D_GOTO(out, rc = dclma.dclma_result); + } + + len = dtx_coll_local_exec(dci->dci_po_uuid, dci->dci_co_uuid, &dci->dci_xid, dci->dci_epoch, + opc, bitmap_sz, bitmap, &results); + if (len < 0) + D_GOTO(out, rc = len); + + if (opc == DTX_COLL_CHECK) { + for (i = 0; i < len; i++) { + if (bitmap == NULL || isset(bitmap, i)) + dtx_merge_check_result(&rc, results[i]); + } + + /* + * For force check case, if no shard has been committed, we cannot trust the result + * of -DER_NONEXIST, instead, returning -DER_INPROGRESS to make the leader to retry. + */ + if (force_check && rc == -DER_NONEXIST) + D_GOTO(out, rc = -DER_INPROGRESS); + } else { + for (i = 0; i < len; i++) { + if (bitmap == NULL || isset(bitmap, i)) { + if (results[i] >= 0) + dco->dco_misc += results[i]; + else if (results[i] != -DER_NONEXIST && rc == 0) + rc = results[i]; + } + } + } + +out: + D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, + "Handled collective DTX PRC %u on rank %u for "DF_DTI": "DF_RC"\n", + opc, myrank, DP_DTI(&dci->dci_xid), DP_RC(rc)); + + dco->dco_status = rc; + rc = crt_reply_send(rpc); + if (rc < 0) + D_ERROR("Failed to send collective RPC %p reply: "DF_RC"\n", rpc, DP_RC(rc)); + + D_FREE(dclma.dclma_mbs); + D_FREE(bitmap); + D_FREE(results); +} + static int dtx_init(void) { int rc; + dtx_coll_tree_width = DTX_COLL_TREE_WIDTH_DEF; + d_getenv_int("DTX_COLL_TREE_WIDTH", &dtx_coll_tree_width); + if (dtx_coll_tree_width < DTX_COLL_TREE_WIDTH_MIN || + dtx_coll_tree_width > DTX_COLL_TREE_WIDTH_MAX) { + D_WARN("Invalid bcast RPC tree width %u, the valid range is [%u, %u], " + "use the default value %u\n", dtx_coll_tree_width, + DTX_COLL_TREE_WIDTH_MIN, DTX_COLL_TREE_WIDTH_MAX, DTX_COLL_TREE_WIDTH_DEF); + dtx_coll_tree_width = DTX_COLL_TREE_WIDTH_DEF; + } + D_INFO("Set bcast RPC tree width for collective transaction as %u\n", dtx_coll_tree_width); + dtx_agg_thd_cnt_up = DTX_AGG_THD_CNT_DEF; d_getenv_int("DAOS_DTX_AGG_THD_CNT", &dtx_agg_thd_cnt_up); if (dtx_agg_thd_cnt_up < DTX_AGG_THD_CNT_MIN || dtx_agg_thd_cnt_up > DTX_AGG_THD_CNT_MAX) { diff --git a/src/engine/ult.c b/src/engine/ult.c index 204381755fb..f666c499074 100644 --- a/src/engine/ult.c +++ b/src/engine/ult.c @@ -97,6 +97,8 @@ dss_collective_reduce_internal(struct dss_coll_ops *ops, int xs_nr; int rc; int tid; + uint32_t tgt_id = dss_get_module_info()->dmi_tgt_id; + bool self = false; if (ops == NULL || args == NULL || ops->co_func == NULL) { D_DEBUG(DB_MD, "mandatory args missing dss_collective_reduce"); @@ -156,19 +158,19 @@ dss_collective_reduce_internal(struct dss_coll_ops *ops, stream = &stream_args->csa_streams[tid]; stream->st_coll_args = &carg; - if (args->ca_exclude_tgts_cnt) { - int i; - - for (i = 0; i < args->ca_exclude_tgts_cnt; i++) - if (args->ca_exclude_tgts[i] == tid) - break; - - if (i < args->ca_exclude_tgts_cnt) { + if (args->ca_tgt_bitmap != NULL) { + if (tid >= args->ca_tgt_bitmap_sz << 3 || + isclr(args->ca_tgt_bitmap, tid)) { D_DEBUG(DB_TRACE, "Skip tgt %d\n", tid); rc = ABT_future_set(future, (void *)stream); D_ASSERTF(rc == ABT_SUCCESS, "%d\n", rc); continue; } + + if (tgt_id == tid && flags & DSS_USE_CURRENT_ULT) { + self = true; + continue; + } } dx = dss_get_xstream(DSS_MAIN_XS_ID(tid)); @@ -209,6 +211,9 @@ dss_collective_reduce_internal(struct dss_coll_ops *ops, } } + if (self) + collective_func(&stream_args->csa_streams[tgt_id]); + ABT_future_wait(future); rc = 
aggregator.at_rc; @@ -322,6 +327,44 @@ dss_thread_collective(int (*func)(void *), void *arg, unsigned int flags) return dss_collective_internal(func, arg, true, flags); } +int +dss_build_coll_bitmap(int *exclude_tgts, uint32_t exclude_cnt, uint8_t **p_bitmap, + uint32_t *bitmap_sz) +{ + uint8_t *bitmap = NULL; + uint32_t size = ((dss_tgt_nr - 1) >> 3) + 1; + int rc = 0; + int i; + + D_ALLOC(bitmap, size); + if (bitmap == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + for (i = 0; i < size; i++) + bitmap[i] = 0xff; + + for (i = dss_tgt_nr; i < (size << 3); i++) + clrbit(bitmap, i); + + if (exclude_tgts == NULL) + goto out; + + for (i = 0; i < exclude_cnt; i++) { + D_ASSERT(exclude_tgts[i] < dss_tgt_nr); + clrbit(bitmap, exclude_tgts[i]); + } + +out: + if (rc == 0) { + *p_bitmap = bitmap; + *bitmap_sz = size; + } else { + D_ERROR("Failed to build bitmap for collective task: "DF_RC"\n", DP_RC(rc)); + } + + return rc; +} + /* ============== ULT create functions =================================== */ static inline int diff --git a/src/include/daos/dtx.h b/src/include/daos/dtx.h index 14b2337ea0f..41fa5ffbf48 100644 --- a/src/include/daos/dtx.h +++ b/src/include/daos/dtx.h @@ -62,6 +62,8 @@ enum dtx_mbs_flags { * shard index to sort the dtx_memberships::dm_tgts. Obsolete. */ DMF_SORTED_SAD_IDX = (1 << 3), + /* The dtx_target_group information is appended after dtx_daos_target in dm_tgts. */ + DMF_CONTAIN_TARGET_GRP = (1 << 4), }; /** @@ -128,6 +130,20 @@ struct dtx_redundancy_group { uint32_t drg_ids[0]; }; +/** + * Classify the shards that are described in dtx_daos_target based on the rank. + * With these information, the caller can easily know which shard(s) reside on + * the given daos engine (rank). + */ +struct dtx_target_group { + uint32_t dtg_rank; + /* The index for the first shard on the given rank in dtx_memberships::dm_tgts. */ + uint32_t dtg_start_idx; + /* How many shards on the given rank that take part in the transaction. */ + uint32_t dtg_tgt_nr; + uint32_t dtg_padding; +}; + struct dtx_memberships { /* How many touched shards in the DTX. */ uint32_t dm_tgt_cnt; @@ -153,7 +169,8 @@ struct dtx_memberships { }; /* The first 'sizeof(struct dtx_daos_target) * dm_tgt_cnt' is the - * dtx_daos_target array. The subsequent are modification groups. + * dtx_daos_target array. The subsequent are redundancy groups or + * dtx_target_group, depends on dm_flags. */ union { char dm_data[0]; diff --git a/src/include/daos/object.h b/src/include/daos/object.h index 71d37facac0..9540d04cb5a 100644 --- a/src/include/daos/object.h +++ b/src/include/daos/object.h @@ -206,6 +206,73 @@ struct daos_shard_tgt { uint8_t st_flags; /* see daos_tgt_flags */ }; +struct daos_coll_shard { + uint16_t dcs_nr; + uint16_t dcs_cap; + uint32_t dcs_inline; + /* The shards in the buffer locate on the same VOS target. */ + uint32_t *dcs_buf; +}; + +struct daos_coll_target { + uint32_t dct_rank; + /* + * The size (in byte) of dct_bitmap. It (s << 3) may be smaller than dss_tgt_nr if only + * some VOS targets are involved. It also maybe larger than dss_tgt_nr if dss_tgt_nr is + * not 2 ^ n aligned. + */ + uint8_t dct_bitmap_sz; + uint8_t dct_padding; + /* How many valid items in dct_shards, it may be smaller than the sparse array length. */ + uint16_t dct_shard_nr; + /* Bitmap for the vos targets (on the rank) that are involved in the operation. */ + uint8_t *dct_bitmap; + /* Sparse array for object shards' identifiers, sorted with vos targets index. 
*/ + struct daos_coll_shard *dct_shards; + + /* The following fields are only used on server side, not transferred on-wire. */ + + /* How many valid shards ID in dct_tgt_ids array. */ + uint16_t dct_tgt_nr; + /* The capacity for the dct_tgt_ids array. */ + uint16_t dct_tgt_cap; + /* ID array for shards on the engine, in spite of on which VOS target. */ + uint32_t *dct_tgt_ids; +}; + +static inline void +daos_coll_shard_cleanup(struct daos_coll_shard *shards, uint32_t count) +{ + struct daos_coll_shard *shard; + int i; + + if (shards != NULL) { + for (i = 0; i < count; i++) { + shard = &shards[i]; + if (shard->dcs_buf != &shard->dcs_inline) + D_FREE(shard->dcs_buf); + } + D_FREE(shards); + } +} + +static inline void +daos_coll_target_cleanup(struct daos_coll_target *dcts, uint32_t count) +{ + struct daos_coll_target *dct; + int i; + + if (dcts != NULL) { + for (i = 0; i < count; i++) { + dct = &dcts[i]; + daos_coll_shard_cleanup(dct->dct_shards, dct->dct_bitmap_sz << 3); + D_FREE(dct->dct_bitmap); + D_FREE(dct->dct_tgt_ids); + } + D_FREE(dcts); + } +} + static inline bool daos_oid_is_null(daos_obj_id_t oid) { diff --git a/src/include/daos_srv/daos_engine.h b/src/include/daos_srv/daos_engine.h index 498715b3d12..114e66ae1e9 100644 --- a/src/include/daos_srv/daos_engine.h +++ b/src/include/daos_srv/daos_engine.h @@ -512,6 +512,8 @@ enum dss_ult_flags { DSS_ULT_FL_PERIODIC = (1 << 0), /* Use DSS_DEEP_STACK_SZ as the stack size */ DSS_ULT_DEEP_STACK = (1 << 1), + /* Use current ULT (instead of creating new one) for the task. */ + DSS_USE_CURRENT_ULT = (1 << 2), }; int dss_ult_create(void (*func)(void *), void *arg, int xs_type, int tgt_id, @@ -581,8 +583,14 @@ struct dss_coll_args { /** Arguments for dss_collective func (Mandatory) */ void *ca_func_args; void *ca_aggregator; - int *ca_exclude_tgts; - unsigned int ca_exclude_tgts_cnt; + /* Specify on which targets to execute the task. */ + uint8_t *ca_tgt_bitmap; + /* + * The size (in byte) of ca_tgt_bitmap. It may be smaller than dss_tgt_nr if only some + * VOS targets are involved. It also may be larger than dss_tgt_nr if dss_tgt_nr is not + * 2 ^ n aligned. + */ + uint32_t ca_tgt_bitmap_sz; /** Stream arguments for all streams */ struct dss_coll_stream_args ca_stream_args; }; @@ -604,6 +612,8 @@ dss_thread_collective_reduce(struct dss_coll_ops *ops, unsigned int flags); int dss_task_collective(int (*func)(void *), void *arg, unsigned int flags); int dss_thread_collective(int (*func)(void *), void *arg, unsigned int flags); +int dss_build_coll_bitmap(int *exclude_tgts, uint32_t exclude_cnt, uint8_t **p_bitmap, + uint32_t *bitmap_sz); /** * Loaded module management metholds diff --git a/src/include/daos_srv/dtx_srv.h b/src/include/daos_srv/dtx_srv.h index d0b2352783a..bcac6f965f3 100644 --- a/src/include/daos_srv/dtx_srv.h +++ b/src/include/daos_srv/dtx_srv.h @@ -64,7 +64,6 @@ struct dtx_handle { dth_pinned:1, /* DTXs in CoS list are committed. */ dth_cos_done:1, - dth_resent:1, /* For resent case. */ /* Only one participator in the DTX. */ dth_solo:1, /* Do not keep committed entry. 
*/ @@ -140,6 +139,7 @@ struct dtx_handle { struct dtx_sub_status { struct daos_shard_tgt dss_tgt; int dss_result; + uint32_t dss_version; uint32_t dss_comp:1; }; @@ -152,6 +152,7 @@ struct dtx_leader_handle { struct dtx_handle dlh_handle; /* result for the distribute transaction */ int dlh_result; + uint32_t dlh_rmt_ver; /* The array of the DTX COS entries */ uint32_t dlh_dti_cos_count; @@ -164,12 +165,26 @@ struct dtx_leader_handle { int32_t dlh_allow_failure; /* Normal sub requests have been processed. */ uint32_t dlh_normal_sub_done:1, + /* Collective DTX. */ + dlh_coll:1, /* Drop conditional flags when forward RPC. */ dlh_drop_cond:1; + /* Ranks list for collective modification. */ + d_rank_list_t *dlh_coll_ranks; + /* VOS targets hint for collective modification. */ + uint8_t *dlh_coll_hints; + /* Bitmap for collective modification on local VOS targets. */ + uint8_t *dlh_coll_bitmap; + /* The size of dlh_coll_hints array. */ + uint32_t dlh_coll_hint_sz; + /* The size of dlh_coll_bitmap in bytes. */ + uint32_t dlh_coll_bitmap_sz; + /* The bcast RPC tree width for collective transaction */ + uint16_t dlh_coll_tree_width; + /* How many delay forward sub request. */ + uint16_t dlh_delay_sub_cnt; /* How many normal sub request. */ uint32_t dlh_normal_sub_cnt; - /* How many delay forward sub request. */ - uint32_t dlh_delay_sub_cnt; /* The index of the first target that forward sub-request to. */ uint32_t dlh_forward_idx; /* The count of the targets that forward sub-request to. */ @@ -205,7 +220,7 @@ enum dtx_flags { DTX_FOR_MIGRATION = (1 << 3), /** Ignore other uncommitted DTXs. */ DTX_IGNORE_UNCOMMITTED = (1 << 4), - /** Resent request. */ + /** Resent request. Out-of-date. */ DTX_RESEND = (1 << 5), /** Force DTX refresh if hit non-committed DTX on non-leader. Out-of-date DAOS-7878. */ DTX_FORCE_REFRESH = (1 << 6), @@ -213,6 +228,8 @@ enum dtx_flags { DTX_PREPARED = (1 << 7), /** Do not keep committed entry. */ DTX_DROP_CMT = (1 << 8), + /** Collective DTX. 
*/ + DTX_COLL = (1 << 9), }; void @@ -220,12 +237,12 @@ dtx_renew_epoch(struct dtx_epoch *epoch, struct dtx_handle *dth); int dtx_sub_init(struct dtx_handle *dth, daos_unit_oid_t *oid, uint64_t dkey_hash); int -dtx_leader_begin(daos_handle_t coh, struct dtx_id *dti, - struct dtx_epoch *epoch, uint16_t sub_modification_cnt, - uint32_t pm_ver, daos_unit_oid_t *leader_oid, - struct dtx_id *dti_cos, int dti_cos_cnt, - struct daos_shard_tgt *tgts, int tgt_cnt, uint32_t flags, - struct dtx_memberships *mbs, struct dtx_leader_handle **p_dlh); +dtx_leader_begin(daos_handle_t coh, struct dtx_id *dti, struct dtx_epoch *epoch, + uint16_t sub_modification_cnt, uint32_t pm_ver, daos_unit_oid_t *leader_oid, + struct dtx_id *dti_cos, int dti_cos_cnt, uint8_t *hints, uint32_t hint_sz, + uint8_t *bitmap, uint32_t bitmap_sz, struct daos_shard_tgt *tgts, int tgt_cnt, + uint32_t flags, d_rank_list_t *ranks, struct dtx_memberships *mbs, + struct dtx_leader_handle **p_dlh); int dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int result); @@ -260,10 +277,21 @@ void dtx_cont_deregister(struct ds_cont_child *cont); int dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid, daos_epoch_t epoch); +int dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, + struct dtx_cos_key *dcks, int count); + int dtx_abort(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch); int dtx_refresh(struct dtx_handle *dth, struct ds_cont_child *cont); +int dtx_coll_commit(struct ds_cont_child *cont, struct dtx_id *xid, d_rank_list_t *ranks, + uint8_t *hints, uint32_t hint_sz, uint8_t *bitmap, uint32_t bitmap_sz, + uint32_t version); + +int dtx_coll_abort(struct ds_cont_child *cont, struct dtx_id *xid, d_rank_list_t *ranks, + uint8_t *hints, uint32_t hint_sz, uint8_t *bitmap, uint32_t bitmap_sz, + uint32_t version, daos_epoch_t epoch); + /** * Check whether the given DTX is resent one or not. * diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index b1ad14ce4f6..9b56ce47a4d 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -289,9 +289,9 @@ int ds_pool_svc_term_get(uuid_t uuid, uint64_t *term); int ds_pool_svc_global_map_version_get(uuid_t uuid, uint32_t *global_ver); int -ds_pool_child_map_refresh_sync(struct ds_pool_child *dpc); +ds_pool_child_map_refresh_sync(uuid_t uuid, uint32_t version); int -ds_pool_child_map_refresh_async(struct ds_pool_child *dpc); +ds_pool_child_map_refresh_async(uuid_t uuid, uint32_t version); int map_ranks_init(const struct pool_map *map, unsigned int status, d_rank_list_t *ranks); diff --git a/src/include/daos_srv/vos.h b/src/include/daos_srv/vos.h index 160b044a3f0..a17872b72ae 100644 --- a/src/include/daos_srv/vos.h +++ b/src/include/daos_srv/vos.h @@ -103,12 +103,16 @@ vos_dtx_check(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, * * \param coh [IN] Container open handle. * \param dti [IN] Pointer to the DTX identifier. + * \param oid [OUT] Pointer to the ID for the DTX leader object shard. * \param mbs [OUT] Pointer to the DTX participants information. * - * \return Zero on success, negative value if error. + * \return Zero on success. + * Positive if DTX has been committed. + * Negative value if error. */ int -vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, struct dtx_memberships **mbs); +vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t *oid, + struct dtx_memberships **mbs); /** * Commit the specified DTXs. 
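A minimal caller sketch for the updated vos_dtx_load_mbs() prototype above. The wrapper name and the surrounding control flow are illustrative assumptions; only the new oid output parameter and the tri-state return contract (zero on success, positive when the DTX is already committed, negative on error) come from the header:

static int
coll_load_mbs_sketch(daos_handle_t coh, struct dtx_id *dti)
{
	daos_unit_oid_t		 oid = { 0 };
	struct dtx_memberships	*mbs = NULL;
	int			 rc;

	rc = vos_dtx_load_mbs(coh, dti, &oid, &mbs);
	if (rc > 0)
		return 0;	/* Already committed, nothing left to resolve. */
	if (rc < 0)
		return rc;	/* Including -DER_NONEXIST for an unknown DTX. */

	/* Derive the remaining participants from oid and mbs, then release mbs. */
	D_FREE(mbs);
	return 0;
}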
diff --git a/src/object/cli_mod.c b/src/object/cli_mod.c index 79c13fee948..4439564ca37 100644 --- a/src/object/cli_mod.c +++ b/src/object/cli_mod.c @@ -16,7 +16,10 @@ #include "obj_rpc.h" #include "obj_internal.h" +#define OBJ_COLL_PUNCH_THRESHOLD_MIN 16 + unsigned int srv_io_mode = DIM_DTX_FULL_ENABLED; +unsigned int obj_coll_punch_thd; int dc_obj_proto_version; /** @@ -68,6 +71,16 @@ dc_obj_init(void) D_GOTO(out_class, rc); } + obj_coll_punch_thd = OBJ_COLL_PUNCH_THRESHOLD_MIN; + d_getenv_int("OBJ_COLL_PUNCH_THRESHOLD", &obj_coll_punch_thd); + if (obj_coll_punch_thd < OBJ_COLL_PUNCH_THRESHOLD_MIN) { + D_WARN("Invalid collective punch threshold %u, it cannot be smaller than %u, " + "use the default value %u\n", obj_coll_punch_thd, + OBJ_COLL_PUNCH_THRESHOLD_MIN, OBJ_COLL_PUNCH_THRESHOLD_MIN); + obj_coll_punch_thd = OBJ_COLL_PUNCH_THRESHOLD_MIN; + } + D_INFO("Set object collective punch threshold as %u\n", obj_coll_punch_thd); + tx_verify_rdg = false; d_getenv_bool("DAOS_TX_VERIFY_RDG", &tx_verify_rdg); D_INFO("%s TX redundancy group verification\n", tx_verify_rdg ? "Enable" : "Disable"); diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index 710b94d18a7..fd4aae196c1 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "obj_rpc.h" #include "obj_internal.h" @@ -2843,6 +2844,7 @@ obj_embedded_shard_arg(struct obj_auxi_args *obj_auxi) case DAOS_OBJ_RPC_SYNC: return &obj_auxi->s_args.sa_auxi; case DAOS_OBJ_RPC_QUERY_KEY: + case DAOS_OBJ_RPC_COLL_PUNCH: /* * called from obj_comp_cb_internal() and * checked in obj_shard_comp_cb() correctly @@ -4868,6 +4870,7 @@ obj_comp_cb(tse_task_t *task, void *data) dc_tx_attach(obj_auxi->th, obj, DAOS_OBJ_RPC_FETCH, task, 0, false); break; } + case DAOS_OBJ_RPC_COLL_PUNCH: case DAOS_OBJ_RPC_PUNCH: case DAOS_OBJ_RPC_PUNCH_DKEYS: case DAOS_OBJ_RPC_PUNCH_AKEYS: @@ -6663,23 +6666,76 @@ shard_punch_prep(struct shard_auxi_args *shard_auxi, struct dc_object *obj, return 0; } +static int +dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epoch, + uint32_t map_ver, daos_obj_punch_t *args, struct obj_auxi_args *auxi) +{ + struct shard_punch_args *spa = &auxi->p_args; + struct dc_obj_shard *shard = NULL; + uint32_t flags = ORF_LEADER; + uint32_t off; + int rc; + int i; + + for (i = 0, off = obj->cob_md.omd_id.lo % obj->cob_shards_nr; i < obj->cob_shards_nr; + i++, off = (off + 1) % obj->cob_shards_nr) { + rc = obj_shard_open(obj, off, map_ver, &shard); + if (rc == 0) { + if (!shard->do_rebuilding && !shard->do_reintegrating) + break; + + obj_shard_close(shard); + } + + if (rc != -DER_NONEXIST) + goto out; + } + + /* If all shards are NONEXIST, then need not send collective punch RPC. */ + if (unlikely(i == obj->cob_shards_nr)) + D_GOTO(out, rc = 0); + + if (auxi->io_retry) { + flags |= ORF_RESEND; + /* Reset @enqueue_id if resend to new leader. */ + if (spa->pa_auxi.target != shard->do_target_id) + spa->pa_auxi.enqueue_id = 0; + } else { + spa->pa_auxi.obj_auxi = auxi; + daos_dti_gen(&spa->pa_dti, false); + } + + spa->pa_auxi.target = shard->do_target_id; + + if (obj_is_ec(obj)) + flags |= ORF_EC; + + /* The shard will be closed via RPC callback in dc_obj_shard_coll_punch(). 
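The close is done via obj_shard_decref(): on success the completion callback obj_shard_coll_punch_cb() drops the reference after consuming the reply, while on failure dc_obj_shard_coll_punch() drops it itself before completing the task.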
*/ + return dc_obj_shard_coll_punch(shard, spa, epoch, args->flags, flags, map_ver, + &auxi->map_ver_reply, task); + +out: + DL_CDEBUG(rc == 0, DB_IO, DLOG_ERR, rc, + "DAOS_OBJ_RPC_COLL_PUNCH for "DF_OID" map_ver %u, task %p", + DP_OID(obj->cob_md.omd_id), map_ver, task); + + obj_task_complete(task, rc); + + return rc; +} + static int dc_obj_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epoch, uint32_t map_ver, enum obj_rpc_opc opc, daos_obj_punch_t *api_args) { struct obj_auxi_args *obj_auxi; + struct dc_pool *pool; uint32_t shard; uint32_t shard_cnt; uint32_t grp_cnt; + uint32_t node_cnt; int rc; - if (opc == DAOS_OBJ_RPC_PUNCH && obj->cob_grp_nr > 1) - /* The object have multiple redundancy groups, use DAOS - * internal transaction to handle that to guarantee the - * atomicity of punch object. - */ - return dc_tx_convert(obj, opc, task); - rc = obj_task_init(task, opc, map_ver, api_args->th, &obj_auxi, obj); if (rc != 0) { obj_decref(obj); @@ -6693,6 +6749,40 @@ dc_obj_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epoch, if (opc == DAOS_OBJ_RPC_PUNCH) { obj_ptr2shards(obj, &shard, &shard_cnt, &grp_cnt); + + if (grp_cnt > 1) { + /* + * We support object collective punch since release-2.6 (and may 2.4.x) + * (version 10). The conditions to trigger object collective punch are: + * + * 1. The shards count exceeds the engines count. Means that there are + * some shards reside on the same engine. Collectively punch object + * will save some RPCs. Or + * + * 2. The shards count exceeds the threshold for collective punch (16 + * by default). Collectively punch object will distribute the RPCs + * load among more engines even if the total RPCs count may be not + * decreased too much. + * + * If the object has multiple redundancy groups, but cannot match any + * above condition, then we will use internal distributed transaction + * to guarantee the atomicity of punch all object shards. 
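For example, with the default threshold of 16: punching a 24-shard object stored on a 16-engine pool takes the collective path (24 exceeds both the threshold and the engine count), while punching an 8-shard object on the same pool falls back to the internal transaction (8 is within both limits).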
+ */ + if (dc_obj_proto_version <= 9) + D_GOTO(out_task, rc = -DER_NEED_TX); + + pool = obj->cob_pool; + D_RWLOCK_RDLOCK(&pool->dp_map_lock); + node_cnt = pool_map_node_nr(pool->dp_map); + D_RWLOCK_UNLOCK(&pool->dp_map_lock); + + if (shard_cnt <= obj_coll_punch_thd && shard_cnt <= node_cnt) + D_GOTO(out_task, rc = -DER_NEED_TX); + + obj_auxi->opc = DAOS_OBJ_RPC_COLL_PUNCH; + + return dc_obj_coll_punch(task, obj, epoch, map_ver, api_args, obj_auxi); + } } else { grp_cnt = 1; obj_auxi->dkey_hash = obj_dkey2hash(obj->cob_md.omd_id, api_args->dkey); diff --git a/src/object/cli_shard.c b/src/object/cli_shard.c index 2dd9ef9ac39..0187c3d950b 100644 --- a/src/object/cli_shard.c +++ b/src/object/cli_shard.c @@ -1307,6 +1307,120 @@ dc_obj_shard_punch(struct dc_obj_shard *shard, enum obj_rpc_opc opc, return rc; } +struct obj_coll_punch_cb_args { + crt_rpc_t *ocpca_rpc; + uint32_t *ocpca_ver; + struct dc_obj_shard *ocpca_shard; + struct shard_punch_args *ocpca_shard_args; +}; + +static int +obj_shard_coll_punch_cb(tse_task_t *task, void *data) +{ + struct obj_coll_punch_cb_args *cb_args = data; + crt_rpc_t *rpc = cb_args->ocpca_rpc; + struct obj_coll_punch_in *ocpi = crt_req_get(rpc); + + if (task->dt_result == 0) { + task->dt_result = obj_reply_get_status(rpc); + *cb_args->ocpca_ver = obj_reply_map_version_get(rpc); + } + + if (task->dt_result == -DER_OVERLOAD_RETRY) { + struct obj_coll_punch_out *ocpo = crt_reply_get(rpc); + struct shard_punch_args *shard_args = cb_args->ocpca_shard_args; + uint32_t timeout = 0; + + if (shard_args->pa_auxi.enqueue_id == 0) + shard_args->pa_auxi.enqueue_id = ocpo->ocpo_comm_out.req_out_enqueue_id; + crt_req_get_timeout(rpc, &timeout); + if (timeout > shard_args->pa_auxi.obj_auxi->max_delay) + shard_args->pa_auxi.obj_auxi->max_delay = timeout; + } + + DL_CDEBUG(task->dt_result < 0, DLOG_ERR, DB_IO, task->dt_result, + "DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" on leader %u with DTX " + DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x\n", + rpc, DP_UOID(ocpi->ocpi_oid), ocpi->ocpi_leader_id, DP_DTI(&ocpi->ocpi_xid), + task, ocpi->ocpi_map_ver, *cb_args->ocpca_ver, + (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags); + + crt_req_decref(rpc); + obj_shard_decref(cb_args->ocpca_shard); + + return task->dt_result; +} + +int +dc_obj_shard_coll_punch(struct dc_obj_shard *shard, struct shard_punch_args *args, + struct dtx_epoch *epoch, uint64_t api_flags, uint32_t rpc_flags, + uint32_t map_ver, uint32_t *rep_ver, tse_task_t *task) +{ + struct dc_pool *pool = obj_shard_ptr2pool(shard); + crt_rpc_t *req = NULL; + struct obj_coll_punch_in *ocpi = NULL; + struct obj_coll_punch_cb_args cb_args = { 0 }; + crt_endpoint_t tgt_ep = { 0 }; + int rc = 0; + + D_ASSERT(pool != NULL); + + tgt_ep.ep_grp = pool->dp_sys->sy_group; + tgt_ep.ep_rank = shard->do_target_rank; + tgt_ep.ep_tag = shard->do_target_idx; + + rc = obj_req_create(daos_task2ctx(task), &tgt_ep, DAOS_OBJ_RPC_COLL_PUNCH, &req); + if (rc != 0) + goto out; + + ocpi = crt_req_get(req); + D_ASSERT(ocpi != NULL); + + uuid_copy(ocpi->ocpi_po_uuid, pool->dp_pool); + uuid_copy(ocpi->ocpi_co_hdl, shard->do_co->dc_cont_hdl); + uuid_copy(ocpi->ocpi_co_uuid, shard->do_co->dc_uuid); + ocpi->ocpi_oid = shard->do_id; + ocpi->ocpi_epoch = epoch->oe_value; + ocpi->ocpi_api_flags = api_flags; + ocpi->ocpi_map_ver = map_ver; + ocpi->ocpi_leader_id = shard->do_target_id; + ocpi->ocpi_flags = rpc_flags; + daos_dti_copy(&ocpi->ocpi_xid, &args->pa_dti); + ocpi->ocpi_comm_in.req_in_enqueue_id = args->pa_auxi.enqueue_id; + + 
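	/*
	 * Take an extra reference on the RPC for the completion callback; it is
	 * released by the crt_req_decref() in obj_shard_coll_punch_cb() once the
	 * reply has been consumed, or by the first crt_req_decref() on the
	 * out_req error path below.
	 */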
crt_req_addref(req); + cb_args.ocpca_rpc = req; + cb_args.ocpca_ver = rep_ver; + cb_args.ocpca_shard = shard; + cb_args.ocpca_shard_args = args; + + rc = tse_task_register_comp_cb(task, obj_shard_coll_punch_cb, &cb_args, sizeof(cb_args)); + if (rc != 0) + D_GOTO(out_req, rc); + + D_DEBUG(DB_IO, "Sending DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" with DTX " + DF_DTI" for task %p, map_ver %u, flags %lx/%x, leader %u/%u\n", + req, DP_UOID(shard->do_id), DP_DTI(&args->pa_dti), task, map_ver, + (unsigned long)api_flags, rpc_flags, tgt_ep.ep_rank, tgt_ep.ep_tag); + + return daos_rpc_send(req, task); + +out_req: + /* -1 for crt_req_addref(). */ + crt_req_decref(req); + /* -1 for obj_req_create(). */ + crt_req_decref(req); +out: + D_ERROR("DAOS_OBJ_RPC_COLL_PUNCH RPC failed for "DF_UOID" with DTX " + DF_DTI" for task %p, map_ver %u, flags %lx/%x, leader %u/%u: "DF_RC"\n", + DP_UOID(shard->do_id), DP_DTI(&args->pa_dti), task, map_ver, + (unsigned long)api_flags, rpc_flags, tgt_ep.ep_rank, tgt_ep.ep_tag, DP_RC(rc)); + + obj_shard_decref(shard); + tse_task_complete(task, rc); + return rc; +} + struct obj_enum_args { crt_rpc_t *rpc; daos_handle_t *hdlp; diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h index 8a2b12fff55..de6f16c3929 100644 --- a/src/object/obj_internal.h +++ b/src/object/obj_internal.h @@ -41,6 +41,7 @@ struct obj_io_context; extern bool cli_bypass_rpc; /** Switch of server-side IO dispatch */ extern unsigned int srv_io_mode; +extern unsigned int obj_coll_punch_thd; /* Whether check redundancy group validation when DTX resync. */ extern bool tx_verify_rdg; @@ -572,6 +573,10 @@ int dc_obj_shard_punch(struct dc_obj_shard *shard, enum obj_rpc_opc opc, void *shard_args, struct daos_shard_tgt *fw_shard_tgts, uint32_t fw_cnt, tse_task_t *task); +int dc_obj_shard_coll_punch(struct dc_obj_shard *shard, struct shard_punch_args *args, + struct dtx_epoch *epoch, uint64_t api_flags, uint32_t rpc_flags, + uint32_t map_ver, uint32_t *rep_ver, tse_task_t *task); + int dc_obj_shard_list(struct dc_obj_shard *shard, enum obj_rpc_opc opc, void *shard_args, struct daos_shard_tgt *fw_shard_tgts, uint32_t fw_cnt, tse_task_t *task); diff --git a/src/object/obj_rpc.c b/src/object/obj_rpc.c index e7f4e43960b..876c1d8f5ad 100644 --- a/src/object/obj_rpc.c +++ b/src/object/obj_rpc.c @@ -546,13 +546,10 @@ crt_proc_struct_daos_cpd_sub_head(crt_proc_t proc, crt_proc_op_t proc_op, } rc = crt_proc_memcpy(proc, proc_op, dcsh->dcsh_mbs, size); - if (unlikely(rc)) { - if (DECODING(proc_op)) - D_FREE(dcsh->dcsh_mbs); - return rc; - } + if (unlikely(rc) && DECODING(proc_op)) + D_FREE(dcsh->dcsh_mbs); - return 0; + return rc; } static int @@ -848,11 +845,6 @@ crt_proc_struct_daos_cpd_bulk(crt_proc_t proc, crt_proc_op_t proc_op, return rc; } - if (FREEING(proc_op)) { - D_FREE(dcb->dcb_bulk); - return 0; - } - rc = crt_proc_uint32_t(proc, proc_op, &dcb->dcb_size); if (unlikely(rc)) return rc; @@ -871,6 +863,9 @@ crt_proc_struct_daos_cpd_bulk(crt_proc_t proc, crt_proc_op_t proc_op, if (unlikely(rc)) return rc; + if (FREEING(proc_op)) + D_FREE(dcb->dcb_bulk); + /* The other fields will not be packed on-wire. 
*/ return 0; @@ -1098,6 +1093,7 @@ CRT_RPC_DEFINE(obj_cpd, DAOS_ISEQ_OBJ_CPD, DAOS_OSEQ_OBJ_CPD) CRT_RPC_DEFINE(obj_ec_rep, DAOS_ISEQ_OBJ_EC_REP, DAOS_OSEQ_OBJ_EC_REP) CRT_RPC_DEFINE(obj_key2anchor, DAOS_ISEQ_OBJ_KEY2ANCHOR, DAOS_OSEQ_OBJ_KEY2ANCHOR) CRT_RPC_DEFINE(obj_key2anchor_v10, DAOS_ISEQ_OBJ_KEY2ANCHOR_V10, DAOS_OSEQ_OBJ_KEY2ANCHOR_V10) +CRT_RPC_DEFINE(obj_coll_punch, DAOS_ISEQ_OBJ_COLL_PUNCH, DAOS_OSEQ_OBJ_COLL_PUNCH) /* Define for obj_proto_rpc_fmt[] array population below. * See OBJ_PROTO_*_RPC_LIST macro definition @@ -1179,6 +1175,9 @@ obj_reply_set_status(crt_rpc_t *rpc, int status) case DAOS_OBJ_RPC_EC_REPLICATE: ((struct obj_ec_rep_out *)reply)->er_status = status; break; + case DAOS_OBJ_RPC_COLL_PUNCH: + ((struct obj_coll_punch_out *)reply)->ocpo_ret = status; + break; default: D_ASSERT(0); } @@ -1218,6 +1217,8 @@ obj_reply_get_status(crt_rpc_t *rpc) return ((struct obj_cpd_out *)reply)->oco_ret; case DAOS_OBJ_RPC_EC_REPLICATE: return ((struct obj_ec_rep_out *)reply)->er_status; + case DAOS_OBJ_RPC_COLL_PUNCH: + return ((struct obj_coll_punch_out *)reply)->ocpo_ret; default: D_ASSERT(0); } @@ -1267,6 +1268,9 @@ obj_reply_map_version_set(crt_rpc_t *rpc, uint32_t map_version) case DAOS_OBJ_RPC_EC_REPLICATE: ((struct obj_ec_rep_out *)reply)->er_map_ver = map_version; break; + case DAOS_OBJ_RPC_COLL_PUNCH: + ((struct obj_coll_punch_out *)reply)->ocpo_map_version = map_version; + break; default: D_ASSERT(0); } @@ -1302,6 +1306,8 @@ obj_reply_map_version_get(crt_rpc_t *rpc) return ((struct obj_sync_out *)reply)->oso_map_version; case DAOS_OBJ_RPC_CPD: return ((struct obj_cpd_out *)reply)->oco_map_version; + case DAOS_OBJ_RPC_COLL_PUNCH: + return ((struct obj_coll_punch_out *)reply)->ocpo_map_version; default: D_ASSERT(0); } diff --git a/src/object/obj_rpc.h b/src/object/obj_rpc.h index dba1b31ca74..ccb4db351ff 100644 --- a/src/object/obj_rpc.h +++ b/src/object/obj_rpc.h @@ -98,7 +98,10 @@ X(DAOS_OBJ_RPC_KEY2ANCHOR, \ 0, ver == 9 ? &CQF_obj_key2anchor : \ &CQF_obj_key2anchor_v10, \ - ds_obj_key2anchor_handler, NULL, "key2anchor") + ds_obj_key2anchor_handler, NULL, "key2anchor") \ + X(DAOS_OBJ_RPC_COLL_PUNCH, \ + 0, &CQF_obj_coll_punch, ds_obj_coll_punch_handler, \ + &obj_coll_punch_co_ops, "obj_coll_punch") /* Define for RPC enum population below */ #define X(a, b, c, d, e, f) a, @@ -109,6 +112,7 @@ enum obj_rpc_opc { }; #undef X +extern struct crt_corpc_ops obj_coll_punch_co_ops; extern struct crt_proto_format obj_proto_fmt_v9; extern struct crt_proto_format obj_proto_fmt_v10; extern int dc_obj_proto_version; @@ -149,8 +153,8 @@ enum obj_rpc_flags { * oei_epr.epr_hi is epoch. */ ORF_ENUM_WITHOUT_EPR = (1 << 8), - /* CPD RPC leader */ - ORF_CPD_LEADER = (1 << 9), + /* RPC leader */ + ORF_LEADER = (1 << 9), /* Bulk data transfer for CPD RPC. */ ORF_CPD_BULK = (1 << 10), /* Contain EC split req, only used on CPD leader locally. Obsolete - DAOS-10348. 
*/ @@ -707,6 +711,29 @@ struct daos_cpd_sg { CRT_RPC_DECLARE(obj_cpd, DAOS_ISEQ_OBJ_CPD, DAOS_OSEQ_OBJ_CPD) +#define DAOS_ISEQ_OBJ_COLL_PUNCH /* input fields */ \ + ((struct dtx_id) (ocpi_xid) CRT_VAR) \ + ((uuid_t) (ocpi_po_uuid) CRT_VAR) \ + ((uuid_t) (ocpi_co_hdl) CRT_VAR) \ + ((uuid_t) (ocpi_co_uuid) CRT_VAR) \ + ((daos_unit_oid_t) (ocpi_oid) CRT_RAW) \ + ((uint64_t) (ocpi_epoch) CRT_VAR) \ + ((uint64_t) (ocpi_api_flags) CRT_VAR) \ + ((uint32_t) (ocpi_map_ver) CRT_VAR) \ + ((uint32_t) (ocpi_flags) CRT_VAR) \ + ((uint32_t) (ocpi_fdom_lvl) CRT_VAR) \ + ((uint32_t) (ocpi_pdom_lvl) CRT_VAR) \ + ((uint32_t) (ocpi_pda) CRT_VAR) \ + ((uint32_t) (ocpi_leader_id) CRT_VAR) \ + ((struct daos_req_comm_in) (ocpi_comm_in) CRT_VAR) + +#define DAOS_OSEQ_OBJ_COLL_PUNCH /* output fields */ \ + ((int32_t) (ocpo_ret) CRT_VAR) \ + ((uint32_t) (ocpo_map_version) CRT_VAR) \ + ((struct daos_req_comm_out) (ocpo_comm_out) CRT_VAR) + +CRT_RPC_DECLARE(obj_coll_punch, DAOS_ISEQ_OBJ_COLL_PUNCH, DAOS_OSEQ_OBJ_COLL_PUNCH) + static inline int obj_req_create(crt_context_t crt_ctx, crt_endpoint_t *tgt_ep, crt_opcode_t opc, crt_rpc_t **req) @@ -739,7 +766,7 @@ obj_is_modification_opc(uint32_t opc) opc == DAOS_OBJ_RPC_PUNCH_DKEYS || opc == DAOS_OBJ_RPC_TGT_PUNCH_DKEYS || opc == DAOS_OBJ_RPC_PUNCH_AKEYS || - opc == DAOS_OBJ_RPC_TGT_PUNCH_AKEYS; + opc == DAOS_OBJ_RPC_TGT_PUNCH_AKEYS || opc == DAOS_OBJ_RPC_COLL_PUNCH; } #define DAOS_OBJ_UPDATE_MODE_MASK (DAOS_OO_RW | DAOS_OO_EXCL | \ @@ -751,43 +778,6 @@ obj_is_fetch_opc(uint32_t opc) return opc == DAOS_OBJ_RPC_FETCH; } -static inline bool -obj_is_ec_agg_opc(uint32_t opc) -{ - return opc == DAOS_OBJ_RPC_EC_AGGREGATE || - opc == DAOS_OBJ_RPC_EC_REPLICATE; -} - -static inline bool -obj_rpc_is_update(crt_rpc_t *rpc) -{ - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_UPDATE || - opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_TGT_UPDATE; -} - -static inline bool -obj_rpc_is_fetch(crt_rpc_t *rpc) -{ - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_FETCH; -} - -static inline bool -obj_rpc_is_punch(crt_rpc_t *rpc) -{ - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_PUNCH || - opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_PUNCH_DKEYS || - opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_PUNCH_AKEYS || - opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_TGT_PUNCH || - opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_TGT_PUNCH_DKEYS || - opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_TGT_PUNCH_AKEYS; -} - -static inline bool -obj_rpc_is_migrate(crt_rpc_t *rpc) -{ - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_MIGRATE; -} - static inline bool obj_is_enum_opc(uint32_t opc) { @@ -798,40 +788,23 @@ obj_is_enum_opc(uint32_t opc) } static inline bool -obj_rpc_is_query(crt_rpc_t *rpc) -{ - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_QUERY_KEY; -} - -static inline bool -obj_rpc_is_sync(crt_rpc_t *rpc) -{ - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_SYNC; -} - -static inline bool -obj_rpc_is_key2anchor(crt_rpc_t *rpc) -{ - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_KEY2ANCHOR; -} - -static inline bool -obj_rpc_is_ec_agg(crt_rpc_t *rpc) +obj_is_ec_agg_opc(uint32_t opc) { - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_EC_AGGREGATE; - + return opc == DAOS_OBJ_RPC_EC_AGGREGATE || + opc == DAOS_OBJ_RPC_EC_REPLICATE; } static inline bool -obj_rpc_is_ec_rep(crt_rpc_t *rpc) +obj_rpc_is_update(crt_rpc_t *rpc) { - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_EC_REPLICATE; + return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_UPDATE || + opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_TGT_UPDATE; } static inline bool -obj_rpc_is_cpd(crt_rpc_t *rpc) +obj_rpc_is_fetch(crt_rpc_t *rpc) { - return 
opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_CPD; + return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_FETCH; } #endif /* __DAOS_OBJ_RPC_H__ */ diff --git a/src/object/obj_tx.c b/src/object/obj_tx.c index 6e56cce82e3..f2ddfd34cb1 100644 --- a/src/object/obj_tx.c +++ b/src/object/obj_tx.c @@ -2305,7 +2305,7 @@ dc_tx_commit_trigger(tse_task_t *task, struct dc_tx *tx, daos_tx_commit_t *args) uuid_copy(oci->oci_pool_uuid, tx->tx_pool->dp_pool); oci->oci_map_ver = tx->tx_pm_ver; - oci->oci_flags = ORF_CPD_LEADER; + oci->oci_flags = ORF_LEADER; if (tx->tx_set_resend && !tx->tx_renew) oci->oci_flags |= ORF_RESEND; tx->tx_renew = 0; diff --git a/src/object/srv_internal.h b/src/object/srv_internal.h index e673b15f7a4..b38af3e5e28 100644 --- a/src/object/srv_internal.h +++ b/src/object/srv_internal.h @@ -238,6 +238,7 @@ struct ds_obj_exec_arg { crt_rpc_t *rpc; struct obj_io_context *ioc; void *args; + struct daos_coll_shard *shards; uint32_t flags; uint32_t start; /* The start shard for EC obj. */ }; @@ -251,6 +252,9 @@ ds_obj_remote_punch(struct dtx_leader_handle *dth, void *arg, int idx, int ds_obj_cpd_dispatch(struct dtx_leader_handle *dth, void *arg, int idx, dtx_sub_comp_cb_t comp_cb); +int +ds_obj_coll_punch_remote(struct dtx_leader_handle *dth, void *arg, int idx, + dtx_sub_comp_cb_t comp_cb); /* srv_obj.c */ void ds_obj_rw_handler(crt_rpc_t *rpc); @@ -265,6 +269,7 @@ void ds_obj_migrate_handler(crt_rpc_t *rpc); void ds_obj_ec_agg_handler(crt_rpc_t *rpc); void ds_obj_ec_rep_handler(crt_rpc_t *rpc); void ds_obj_cpd_handler(crt_rpc_t *rpc); +void ds_obj_coll_punch_handler(crt_rpc_t *rpc); typedef int (*ds_iofw_cb_t)(crt_rpc_t *req, void *arg); struct daos_cpd_args { diff --git a/src/object/srv_mod.c b/src/object/srv_mod.c index 72a25ba97de..94099dc3f02 100644 --- a/src/object/srv_mod.c +++ b/src/object/srv_mod.c @@ -213,7 +213,9 @@ struct dss_module_key obj_module_key = { static int obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) { - int proto_ver = crt_req_get_proto_ver(rpc); + int opc = opc_get(rpc->cr_opc); + int proto_ver = crt_req_get_proto_ver(rpc); + int rc = 0; D_ASSERT(proto_ver == DAOS_OBJ_VERSION || proto_ver == DAOS_OBJ_VERSION - 1); @@ -226,7 +228,11 @@ obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) /* Extract hint from RPC */ attr->sra_enqueue_id = 0; - if (obj_rpc_is_update(rpc) || obj_rpc_is_fetch(rpc)) { + + switch (opc) { + case DAOS_OBJ_RPC_UPDATE: + case DAOS_OBJ_RPC_TGT_UPDATE: + case DAOS_OBJ_RPC_FETCH: { struct obj_rw_in *orw = crt_req_get(rpc); if (proto_ver >= 10) { @@ -237,12 +243,19 @@ obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) sched_req_attr_init(attr, obj_rpc_is_update(rpc) ? 
SCHED_REQ_UPDATE : SCHED_REQ_FETCH, &orw->orw_pool_uuid); - } else if (obj_rpc_is_migrate(rpc)) { + break; + } + case DAOS_OBJ_RPC_MIGRATE: { struct obj_migrate_in *omi = crt_req_get(rpc); attr->sra_enqueue_id = omi->om_comm_in.req_in_enqueue_id; sched_req_attr_init(attr, SCHED_REQ_MIGRATE, &omi->om_pool_uuid); - } else if (obj_is_enum_opc(rpc->cr_opc)) { + break; + } + case DAOS_OBJ_DKEY_RPC_ENUMERATE: + case DAOS_OBJ_RPC_ENUMERATE: + case DAOS_OBJ_AKEY_RPC_ENUMERATE: + case DAOS_OBJ_RECX_RPC_ENUMERATE: { struct obj_key_enum_in *oei = crt_req_get(rpc); if (proto_ver >= 10) { @@ -251,7 +264,14 @@ obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) attr->sra_enqueue_id = oei_v10->oei_comm_in.req_in_enqueue_id; } sched_req_attr_init(attr, SCHED_REQ_FETCH, &oei->oei_pool_uuid); - } else if (obj_rpc_is_punch(rpc)) { + break; + } + case DAOS_OBJ_RPC_PUNCH: + case DAOS_OBJ_RPC_PUNCH_DKEYS: + case DAOS_OBJ_RPC_PUNCH_AKEYS: + case DAOS_OBJ_RPC_TGT_PUNCH: + case DAOS_OBJ_RPC_TGT_PUNCH_DKEYS: + case DAOS_OBJ_RPC_TGT_PUNCH_AKEYS: { struct obj_punch_in *opi = crt_req_get(rpc); if (proto_ver >= 10) { @@ -260,7 +280,9 @@ obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) attr->sra_enqueue_id = opi_v10->opi_comm_in.req_in_enqueue_id; } sched_req_attr_init(attr, SCHED_REQ_UPDATE, &opi->opi_pool_uuid); - } else if (obj_rpc_is_query(rpc)) { + break; + } + case DAOS_OBJ_RPC_QUERY_KEY: { struct obj_query_key_in *okqi = crt_req_get(rpc); if (proto_ver >= 10) { @@ -269,7 +291,9 @@ obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) attr->sra_enqueue_id = okqi_v10->okqi_comm_in.req_in_enqueue_id; } sched_req_attr_init(attr, SCHED_REQ_FETCH, &okqi->okqi_pool_uuid); - } else if (obj_rpc_is_sync(rpc)) { + break; + } + case DAOS_OBJ_RPC_SYNC: { struct obj_sync_in *osi = crt_req_get(rpc); if (proto_ver >= 10) { @@ -278,7 +302,9 @@ obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) attr->sra_enqueue_id = osi_v10->osi_comm_in.req_in_enqueue_id; } sched_req_attr_init(attr, SCHED_REQ_UPDATE, &osi->osi_pool_uuid); - } else if (obj_rpc_is_key2anchor(rpc)) { + break; + } + case DAOS_OBJ_RPC_KEY2ANCHOR: { struct obj_key2anchor_in *oki = crt_req_get(rpc); if (proto_ver >= 10) { @@ -287,102 +313,146 @@ obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) attr->sra_enqueue_id = oki_v10->oki_comm_in.req_in_enqueue_id; } sched_req_attr_init(attr, SCHED_REQ_FETCH, &oki->oki_pool_uuid); - } else if (obj_rpc_is_ec_agg(rpc)) { + break; + } + case DAOS_OBJ_RPC_EC_AGGREGATE: { struct obj_ec_agg_in *ea = crt_req_get(rpc); attr->sra_enqueue_id = ea->ea_comm_in.req_in_enqueue_id; sched_req_attr_init(attr, SCHED_REQ_MIGRATE, &ea->ea_pool_uuid); - } else if (obj_rpc_is_ec_rep(rpc)) { + break; + } + case DAOS_OBJ_RPC_EC_REPLICATE: { struct obj_ec_rep_in *er = crt_req_get(rpc); attr->sra_enqueue_id = er->er_comm_in.req_in_enqueue_id; sched_req_attr_init(attr, SCHED_REQ_MIGRATE, &er->er_pool_uuid); - } else if (obj_rpc_is_cpd(rpc)) { + break; + } + case DAOS_OBJ_RPC_CPD: { struct obj_cpd_in *oci = crt_req_get(rpc); - sched_req_attr_init(attr, SCHED_REQ_MIGRATE, &oci->oci_pool_uuid); - } else { + sched_req_attr_init(attr, SCHED_REQ_UPDATE, &oci->oci_pool_uuid); + break; + } + case DAOS_OBJ_RPC_COLL_PUNCH: { + struct obj_coll_punch_in *ocpi = crt_req_get(rpc); + + attr->sra_enqueue_id = ocpi->ocpi_comm_in.req_in_enqueue_id; + sched_req_attr_init(attr, SCHED_REQ_UPDATE, &ocpi->ocpi_po_uuid); + break; + } + default: /* Other requests will not be queued, see dss_rpc_hdlr() */ - return -DER_NOSYS; + 
rc = -DER_NOSYS; + break; } - return 0; + return rc; } static int obj_set_req(crt_rpc_t *rpc, struct sched_req_attr *attr) { - int proto_ver = crt_req_get_proto_ver(rpc); + int opc = opc_get(rpc->cr_opc); + int proto_ver = crt_req_get_proto_ver(rpc); + int rc = -DER_OVERLOAD_RETRY; /* Old protocol RPCs won't be rejected. */ D_ASSERT(proto_ver == DAOS_OBJ_VERSION); - if (obj_rpc_is_update(rpc) || obj_rpc_is_fetch(rpc)) { + + switch (opc) { + case DAOS_OBJ_RPC_UPDATE: + case DAOS_OBJ_RPC_TGT_UPDATE: + case DAOS_OBJ_RPC_FETCH: { struct obj_rw_v10_out *orwo_v10 = crt_reply_get(rpc); orwo_v10->orw_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; orwo_v10->orw_ret = -DER_OVERLOAD_RETRY; - - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_migrate(rpc)) { + break; + } + case DAOS_OBJ_RPC_MIGRATE: { struct obj_migrate_out *om = crt_reply_get(rpc); om->om_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; om->om_status = -DER_OVERLOAD_RETRY; - - return -DER_OVERLOAD_RETRY; - } else if (obj_is_enum_opc(rpc->cr_opc)) { + break; + } + case DAOS_OBJ_DKEY_RPC_ENUMERATE: + case DAOS_OBJ_RPC_ENUMERATE: + case DAOS_OBJ_AKEY_RPC_ENUMERATE: + case DAOS_OBJ_RECX_RPC_ENUMERATE: { struct obj_key_enum_v10_out *oeo_v10 = crt_reply_get(rpc); oeo_v10->oeo_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; oeo_v10->oeo_ret = -DER_OVERLOAD_RETRY; - - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_punch(rpc)) { + break; + } + case DAOS_OBJ_RPC_PUNCH: + case DAOS_OBJ_RPC_PUNCH_DKEYS: + case DAOS_OBJ_RPC_PUNCH_AKEYS: + case DAOS_OBJ_RPC_TGT_PUNCH: + case DAOS_OBJ_RPC_TGT_PUNCH_DKEYS: + case DAOS_OBJ_RPC_TGT_PUNCH_AKEYS: { struct obj_punch_v10_out *opo_v10 = crt_reply_get(rpc); opo_v10->opo_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; opo_v10->opo_ret = -DER_OVERLOAD_RETRY; - - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_query(rpc)) { + break; + } + case DAOS_OBJ_RPC_QUERY_KEY: { struct obj_query_key_v10_out *okqo_v10 = crt_reply_get(rpc); okqo_v10->okqo_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; okqo_v10->okqo_ret = -DER_OVERLOAD_RETRY; - - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_sync(rpc)) { + break; + } + case DAOS_OBJ_RPC_SYNC: { struct obj_sync_v10_out *oso_v10 = crt_reply_get(rpc); oso_v10->oso_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; oso_v10->oso_ret = -DER_OVERLOAD_RETRY; - - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_key2anchor(rpc)) { + break; + } + case DAOS_OBJ_RPC_KEY2ANCHOR: { struct obj_key2anchor_v10_out *oko_v10 = crt_reply_get(rpc); oko_v10->oko_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; oko_v10->oko_ret = -DER_OVERLOAD_RETRY; - - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_ec_agg(rpc)) { + break; + } + case DAOS_OBJ_RPC_EC_AGGREGATE: { struct obj_ec_agg_out *ea_out = crt_reply_get(rpc); ea_out->ea_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; ea_out->ea_status = -DER_OVERLOAD_RETRY; - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_ec_rep(rpc)) { + break; + } + case DAOS_OBJ_RPC_EC_REPLICATE: { struct obj_ec_rep_out *er_out = crt_reply_get(rpc); er_out->er_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; er_out->er_status = -DER_OVERLOAD_RETRY; - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_cpd(rpc)) { - /* No RPC retry for DTX, client will retry anyway. */ - return -DER_TIMEDOUT; + break; + } + case DAOS_OBJ_RPC_CPD: + /* NOTE: It needs to be enhanced. Currently, just let client retry anyway. 
*/ + rc = -DER_TIMEDOUT; + break; + case DAOS_OBJ_RPC_COLL_PUNCH: { + struct obj_coll_punch_out *ocpo = crt_reply_get(rpc); + + ocpo->ocpo_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; + ocpo->ocpo_ret = -DER_OVERLOAD_RETRY; + break; } - /* Other requests will not be queued, see dss_rpc_hdlr() */ - return -DER_TIMEDOUT; + default: + /* Other requests will not be queued, see dss_rpc_hdlr() */ + rc = -DER_TIMEDOUT; + break; + } + + return rc; } static struct dss_module_ops ds_obj_mod_ops = { diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index a977ef7ce3e..1d1b1d668e3 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -15,6 +15,8 @@ #include #include +#include +#include #include #include #include @@ -2143,6 +2145,7 @@ obj_ioc_begin_lite(uint32_t rpc_map_ver, uuid_t pool_uuid, struct obj_tls *tls; struct ds_pool_child *poc; int rc; + bool once = false; rc = obj_ioc_init(pool_uuid, coh_uuid, cont_uuid, rpc, ioc); if (rc) @@ -2151,6 +2154,7 @@ obj_ioc_begin_lite(uint32_t rpc_map_ver, uuid_t pool_uuid, poc = ioc->ioc_coc->sc_pool; D_ASSERT(poc != NULL); +again: if (unlikely(poc->spc_pool->sp_map == NULL || DAOS_FAIL_CHECK(DAOS_FORCE_REFRESH_POOL_MAP))) { /* XXX: Client (or leader replica) has newer pool map than @@ -2176,7 +2180,7 @@ obj_ioc_begin_lite(uint32_t rpc_map_ver, uuid_t pool_uuid, */ D_DEBUG(DB_IO, "stale server map_version %d req %d\n", ioc->ioc_map_ver, rpc_map_ver); - rc = ds_pool_child_map_refresh_async(poc); + rc = ds_pool_child_map_refresh_async(poc->spc_uuid, poc->spc_map_version); if (rc == 0) { ioc->ioc_map_ver = poc->spc_map_version; rc = -DER_STALE; @@ -2194,6 +2198,28 @@ obj_ioc_begin_lite(uint32_t rpc_map_ver, uuid_t pool_uuid, D_GOTO(out, rc = -DER_TX_RESTART); D_GOTO(out, rc = -DER_STALE); + } else if (rpc_map_ver > ioc->ioc_map_ver && + opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_COLL_PUNCH) { + + if (unlikely(once)) { + D_WARN("Still hold stale map %u vs %u for pool "DF_UUID" after refresh. " + "Please check whether client offers version is correct or not.\n", + rpc_map_ver, ioc->ioc_map_ver, DP_UUID(poc->spc_uuid)); + D_GOTO(out, rc = -DER_INVAL); + } + + /* + * For collective punch, the map version must be matched among client and + * engines, otherwise, different engines may get different object layouts. 
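An engine that computed the layout from a stale map could drive the punch against a different set of local VOS targets than the leader selected, which is why the request is rejected with -DER_INVAL if a single synchronous refresh does not catch the map up to the client's version.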
+ */ + rc = ds_pool_child_map_refresh_sync(poc->spc_uuid, rpc_map_ver); + if (rc != 0) + goto out; + + ioc->ioc_map_ver = poc->spc_map_version; + once = true; + + goto again; } else if (DAOS_FAIL_CHECK(DAOS_DTX_STALE_PM)) { D_GOTO(out, rc = -DER_STALE); } @@ -2596,8 +2622,6 @@ ds_obj_tgt_update_handler(crt_rpc_t *rpc) if (rc < 0 && rc != -DER_NONEXIST) D_GOTO(out, rc); - - dtx_flags |= DTX_RESEND; } /* Inject failure for test to simulate the case of lost some @@ -2787,6 +2811,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc) int dti_cos_cnt; uint32_t tgt_cnt; uint32_t version = 0; + uint32_t max_ver = 0; struct dtx_epoch epoch = {0}; int rc; bool need_abort = false; @@ -2857,6 +2882,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc) } version = orw->orw_map_ver; + max_ver = orw->orw_map_ver; if (tgt_cnt == 0) { if (!(orw->orw_api_flags & DAOS_COND_MASK)) @@ -2873,7 +2899,6 @@ ds_obj_rw_handler(crt_rpc_t *rpc) if (orw->orw_flags & ORF_RESEND) { daos_epoch_t e; - dtx_flags |= DTX_RESEND; d_tm_inc_counter(opm->opm_update_resent, 1); again1: @@ -2934,9 +2959,10 @@ ds_obj_rw_handler(crt_rpc_t *rpc) else dtx_flags &= ~DTX_PREPARED; - rc = dtx_leader_begin(ioc.ioc_vos_coh, &orw->orw_dti, &epoch, 1, - version, &orw->orw_oid, dti_cos, dti_cos_cnt, - tgts, tgt_cnt, dtx_flags, mbs, &dlh); + rc = dtx_leader_begin(ioc.ioc_vos_coh, &orw->orw_dti, &epoch, 1, version, &orw->orw_oid, + dti_cos, dti_cos_cnt, NULL /* hints */, 0 /* hint_sz */, + NULL /* bitmap */, 0 /* bitmap_sz */, tgts, tgt_cnt, dtx_flags, + NULL /* ranks */, mbs, &dlh); if (rc != 0) { D_ERROR(DF_UOID ": Failed to start DTX for update " DF_RC "\n", DP_UOID(orw->orw_oid), DP_RC(rc)); @@ -2951,6 +2977,9 @@ ds_obj_rw_handler(crt_rpc_t *rpc) /* Execute the operation on all targets */ rc = dtx_leader_exec_ops(dlh, obj_tgt_update, NULL, 0, &exec_arg); + if (max_ver < dlh->dlh_rmt_ver) + max_ver = dlh->dlh_rmt_ver; + /* Stop the distributed transaction */ rc = dtx_leader_end(dlh, ioc.ioc_coh, rc); switch (rc) { @@ -3004,6 +3033,9 @@ ds_obj_rw_handler(crt_rpc_t *rpc) DP_DTI(&orw->orw_dti), DP_RC(rc1)); } + if (ioc.ioc_map_ver < max_ver) + ioc.ioc_map_ver = max_ver; + obj_rw_reply(rpc, rc, epoch.oe_value, &ioc); D_FREE(mbs); D_FREE(dti_cos); @@ -3450,6 +3482,7 @@ obj_local_punch(struct obj_punch_in *opi, crt_opcode_t opc, switch (opc) { case DAOS_OBJ_RPC_PUNCH: case DAOS_OBJ_RPC_TGT_PUNCH: + case DAOS_OBJ_RPC_COLL_PUNCH: rc = vos_obj_punch(cont->sc_hdl, opi->opi_oid, opi->opi_epoch, opi->opi_map_ver, 0, NULL, 0, NULL, dth); @@ -3487,59 +3520,56 @@ obj_local_punch(struct obj_punch_in *opi, crt_opcode_t opc, return rc; } -/* Handle the punch requests on non-leader */ -void -ds_obj_tgt_punch_handler(crt_rpc_t *rpc) +struct obj_tgt_punch_args { + uint32_t opc; + struct obj_io_context *sponsor_ioc; + struct dtx_handle *sponsor_dth; + struct obj_punch_in *opi; + struct dtx_memberships *mbs; + uint32_t *ver; + void *data; +}; + +static int +obj_tgt_punch(struct obj_tgt_punch_args *otpa, uint32_t *shards, uint32_t count) { - struct dtx_handle *dth = NULL; - struct obj_io_context ioc; - struct obj_punch_in *opi; - struct dtx_memberships *mbs = NULL; - struct daos_shard_tgt *tgts = NULL; - uint32_t dtx_flags = 0; - uint32_t tgt_cnt; - struct dtx_epoch epoch; - int rc; + struct obj_io_context ioc = { 0 }; + struct obj_io_context *p_ioc = &ioc; + struct obj_punch_in *opi = otpa->opi; + struct dtx_handle *dth = NULL; + struct dtx_epoch epoch; + daos_epoch_t tmp; + uint32_t dtx_flags = 0; + int rc = 0; + int i; - opi = crt_req_get(rpc); - D_ASSERT(opi != NULL); - rc = 
obj_ioc_begin(opi->opi_oid.id_pub, opi->opi_map_ver, - opi->opi_pool_uuid, opi->opi_co_hdl, - opi->opi_co_uuid, rpc, opi->opi_flags, &ioc); - if (rc) + if (otpa->sponsor_ioc != NULL) { + p_ioc = otpa->sponsor_ioc; + dth = otpa->sponsor_dth; + goto exec; + } + + rc = obj_ioc_begin(opi->opi_oid.id_pub, opi->opi_map_ver, opi->opi_pool_uuid, + opi->opi_co_hdl, opi->opi_co_uuid, otpa->data, opi->opi_flags, &ioc); + if (rc != 0) goto out; - /* Handle resend. */ if (opi->opi_flags & ORF_RESEND) { - daos_epoch_t e = opi->opi_epoch; - - rc = dtx_handle_resend(ioc.ioc_vos_coh, &opi->opi_dti, &e, NULL); + tmp = opi->opi_epoch; + rc = dtx_handle_resend(ioc.ioc_vos_coh, &opi->opi_dti, &tmp, NULL); /* Do nothing if 'prepared' or 'committed'. */ if (rc == -DER_ALREADY || rc == 0) D_GOTO(out, rc = 0); - /* Abort it firstly if exist but with different epoch, - * then re-execute with new epoch. - */ + /* Abort old one with different epoch, then re-execute with new epoch. */ if (rc == -DER_MISMATCH) /* Abort it by force with MAX epoch to guarantee * that it can be aborted. */ - rc = vos_dtx_abort(ioc.ioc_vos_coh, &opi->opi_dti, e); + rc = vos_dtx_abort(ioc.ioc_vos_coh, &opi->opi_dti, tmp); if (rc < 0 && rc != -DER_NONEXIST) D_GOTO(out, rc); - - dtx_flags |= DTX_RESEND; - } - - tgts = opi->opi_shard_tgts.ca_arrays; - tgt_cnt = opi->opi_shard_tgts.ca_count; - - if (!daos_is_zero_dti(&opi->opi_dti) && tgt_cnt != 0) { - rc = obj_gen_dtx_mbs(opi->opi_flags, &tgt_cnt, &tgts, &mbs); - if (rc != 0) - D_GOTO(out, rc); } epoch.oe_value = opi->opi_epoch; @@ -3550,10 +3580,9 @@ ds_obj_tgt_punch_handler(crt_rpc_t *rpc) dtx_flags |= DTX_SYNC; /* Start the local transaction */ - rc = dtx_begin(ioc.ioc_vos_coh, &opi->opi_dti, &epoch, 1, - opi->opi_map_ver, &opi->opi_oid, - opi->opi_dti_cos.ca_arrays, - opi->opi_dti_cos.ca_count, dtx_flags, mbs, &dth); + rc = dtx_begin(ioc.ioc_vos_coh, &opi->opi_dti, &epoch, count, opi->opi_map_ver, + &opi->opi_oid, opi->opi_dti_cos.ca_arrays, opi->opi_dti_cos.ca_count, + dtx_flags, otpa->mbs, &dth); if (rc != 0) { D_ERROR(DF_UOID ": Failed to start DTX for punch " DF_RC "\n", DP_UOID(opi->opi_oid), DP_RC(rc)); @@ -3563,19 +3592,58 @@ ds_obj_tgt_punch_handler(crt_rpc_t *rpc) if (DAOS_FAIL_CHECK(DAOS_DTX_NONLEADER_ERROR)) D_GOTO(out, rc = -DER_IO); - rc = obj_local_punch(opi, opc_get(rpc->cr_opc), &ioc, dth); - if (rc != 0) - DL_CDEBUG(rc == -DER_INPROGRESS || rc == -DER_TX_RESTART || - (rc == -DER_NONEXIST && (opi->opi_api_flags & DAOS_COND_PUNCH)), - DB_IO, DLOG_ERR, rc, DF_UOID, DP_UOID(opi->opi_oid)); +exec: + /* There may be multiple shards reside on the same VOS target. 
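For instance, an object with more shards than the pool has VOS targets will have several of them collocated; each entry of the passed-in shard array is therefore punched in turn under the same local transaction handle.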
*/ + for (i = 0; i < count; i++) { + opi->opi_oid.id_shard = shards[i]; + rc = obj_local_punch(opi, otpa->opc, p_ioc, dth); + if (rc != 0) { + DL_CDEBUG(rc == -DER_INPROGRESS || rc == -DER_TX_RESTART || + (rc == -DER_NONEXIST && (opi->opi_api_flags & DAOS_COND_PUNCH)), + DB_IO, DLOG_ERR, rc, DF_UOID, DP_UOID(opi->opi_oid)); + goto out; + } + } out: - /* Stop the local transaction */ - if (dth != NULL) - rc = dtx_end(dth, ioc.ioc_coc, rc); - obj_punch_complete(rpc, rc, ioc.ioc_map_ver); - D_FREE(mbs); - obj_ioc_end(&ioc, rc); + if (otpa->ver != NULL) + *otpa->ver = p_ioc->ioc_map_ver; + if (p_ioc == &ioc) { + if (dth != NULL) + rc = dtx_end(dth, p_ioc->ioc_coc, rc); + obj_ioc_end(p_ioc, rc); + } + + return rc; +} + +/* Handle the punch requests on non-leader */ +void +ds_obj_tgt_punch_handler(crt_rpc_t *rpc) +{ + struct obj_tgt_punch_args otpa = { 0 }; + struct obj_punch_in *opi = crt_req_get(rpc); + struct daos_shard_tgt *tgts = opi->opi_shard_tgts.ca_arrays; + uint32_t tgt_cnt = opi->opi_shard_tgts.ca_count; + uint32_t version = 0; + int rc; + + if (!daos_is_zero_dti(&opi->opi_dti) && tgt_cnt != 0) { + rc = obj_gen_dtx_mbs(opi->opi_flags, &tgt_cnt, &tgts, &otpa.mbs); + if (rc != 0) + D_GOTO(out, rc); + } + + otpa.opc = opc_get(rpc->cr_opc); + otpa.opi = opi; + otpa.ver = &version; + otpa.data = rpc; + + rc = obj_tgt_punch(&otpa, &opi->opi_oid.id_shard, 1); + +out: + obj_punch_complete(rpc, rc, version); + D_FREE(otpa.mbs); } static int @@ -3599,13 +3667,18 @@ obj_punch_agg_cb(struct dtx_leader_handle *dlh, int allow_failure) for (i = 0; i < sub_cnt; i++) { sub = &dlh->dlh_subs[i]; if (sub->dss_tgt.st_rank != DAOS_TGT_IGNORE && sub->dss_comp) { - if (sub->dss_result == 0) + if (sub->dss_result == 0) { succeeds++; - else if (sub->dss_result == allow_failure) + } else if (sub->dss_result == allow_failure) { allow_failure_cnt++; - else if (result == -DER_INPROGRESS || result == 0) - /* Ignore INPROGRESS if there is other failure. */ + } else if (result == -DER_INPROGRESS || result == -DER_AGAIN || + result == 0) { + /* Ignore INPROGRESS and AGAIN if there is other failure. 
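While recording the new failure, also capture the highest pool map version reported by a sub-request in dlh_rmt_ver, so the leader can return it to the client for a pool map refresh.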
*/ result = sub->dss_result; + + if (dlh->dlh_rmt_ver < sub->dss_version) + dlh->dlh_rmt_ver = sub->dss_version; + } } } @@ -3620,8 +3693,7 @@ obj_punch_agg_cb(struct dtx_leader_handle *dlh, int allow_failure) } static int -obj_tgt_punch(struct dtx_leader_handle *dlh, void *arg, int idx, - dtx_sub_comp_cb_t comp_cb) +obj_tgt_punch_disp(struct dtx_leader_handle *dlh, void *arg, int idx, dtx_sub_comp_cb_t comp_cb) { struct ds_obj_exec_arg *exec_arg = arg; @@ -3639,10 +3711,9 @@ obj_tgt_punch(struct dtx_leader_handle *dlh, void *arg, int idx, rc = obj_local_punch(opi, opc_get(rpc->cr_opc), exec_arg->ioc, &dlh->dlh_handle); if (rc != 0) - DL_CDEBUG( - rc == -DER_INPROGRESS || rc == -DER_TX_RESTART || - (rc == -DER_NONEXIST && (opi->opi_api_flags & DAOS_COND_PUNCH)), - DB_IO, DLOG_ERR, rc, DF_UOID, DP_UOID(opi->opi_oid)); + DL_CDEBUG(rc == -DER_INPROGRESS || rc == -DER_TX_RESTART || + (rc == -DER_NONEXIST && (opi->opi_api_flags & DAOS_COND_PUNCH)), + DB_IO, DLOG_ERR, rc, DF_UOID, DP_UOID(opi->opi_oid)); comp: if (comp_cb != NULL) @@ -3671,6 +3742,7 @@ ds_obj_punch_handler(crt_rpc_t *rpc) uint32_t flags = 0; uint32_t dtx_flags = 0; uint32_t version = 0; + uint32_t max_ver = 0; struct dtx_epoch epoch; int rc; bool need_abort = false; @@ -3710,6 +3782,7 @@ ds_obj_punch_handler(crt_rpc_t *rpc) opi->opi_flags &= ~ORF_EPOCH_UNCERTAIN; version = opi->opi_map_ver; + max_ver = opi->opi_map_ver; tgts = opi->opi_shard_tgts.ca_arrays; tgt_cnt = opi->opi_shard_tgts.ca_count; @@ -3731,8 +3804,6 @@ ds_obj_punch_handler(crt_rpc_t *rpc) if (opi->opi_flags & ORF_RESEND) { daos_epoch_t e; - dtx_flags |= DTX_RESEND; - again1: e = 0; rc = dtx_handle_resend(ioc.ioc_vos_coh, &opi->opi_dti, @@ -3791,9 +3862,10 @@ ds_obj_punch_handler(crt_rpc_t *rpc) else dtx_flags &= ~DTX_PREPARED; - rc = dtx_leader_begin(ioc.ioc_vos_coh, &opi->opi_dti, &epoch, 1, - version, &opi->opi_oid, dti_cos, dti_cos_cnt, - tgts, tgt_cnt, dtx_flags, mbs, &dlh); + rc = dtx_leader_begin(ioc.ioc_vos_coh, &opi->opi_dti, &epoch, 1, version, &opi->opi_oid, + dti_cos, dti_cos_cnt, NULL /* hints */, 0 /* hint_sz */, + NULL /* bitmap */, 0 /* bitmap_sz */, tgts, tgt_cnt, dtx_flags, + NULL /* rank */, mbs, &dlh); if (rc != 0) { D_ERROR(DF_UOID ": Failed to start DTX for punch " DF_RC "\n", DP_UOID(opi->opi_oid), DP_RC(rc)); @@ -3805,10 +3877,13 @@ ds_obj_punch_handler(crt_rpc_t *rpc) exec_arg.flags = flags; /* Execute the operation on all shards */ - rc = dtx_leader_exec_ops(dlh, obj_tgt_punch, obj_punch_agg_cb, + rc = dtx_leader_exec_ops(dlh, obj_tgt_punch_disp, obj_punch_agg_cb, (opi->opi_api_flags & DAOS_COND_PUNCH) ? -DER_NONEXIST : 0, &exec_arg); + if (max_ver < dlh->dlh_rmt_ver) + max_ver = dlh->dlh_rmt_ver; + /* Stop the distribute transaction */ rc = dtx_leader_end(dlh, ioc.ioc_coh, rc); switch (rc) { @@ -3849,7 +3924,7 @@ ds_obj_punch_handler(crt_rpc_t *rpc) DP_DTI(&opi->opi_dti), DP_RC(rc1)); } - obj_punch_complete(rpc, rc, ioc.ioc_map_ver); + obj_punch_complete(rpc, rc, max_ver); cleanup: D_FREE(mbs); @@ -4593,8 +4668,6 @@ ds_obj_dtx_follower(crt_rpc_t *rpc, struct obj_io_context *ioc) /* Do nothing if 'prepared' or 'committed'. */ if (rc1 == -DER_ALREADY || rc1 == 0) D_GOTO(out, rc = 0); - - dtx_flags |= DTX_RESEND; } /* Refuse any modification with old epoch. 
*/ @@ -4766,8 +4839,6 @@ ds_obj_dtx_leader(struct daos_cpd_args *dca) D_ASSERT(dcsh->dcsh_epoch.oe_value != DAOS_EPOCH_MAX); if (oci->oci_flags & ORF_RESEND) { - dtx_flags |= DTX_RESEND; - again: /* For distributed transaction, the 'ORF_RESEND' may means * that the DTX has been restarted with newer epoch. @@ -4844,11 +4915,11 @@ ds_obj_dtx_leader(struct daos_cpd_args *dca) else dtx_flags &= ~DTX_PREPARED; - rc = dtx_leader_begin(dca->dca_ioc->ioc_vos_coh, &dcsh->dcsh_xid, - &dcsh->dcsh_epoch, dcde->dcde_write_cnt, - oci->oci_map_ver, &dcsh->dcsh_leader_oid, - NULL, 0, tgts, tgt_cnt - 1, dtx_flags, - dcsh->dcsh_mbs, &dlh); + rc = dtx_leader_begin(dca->dca_ioc->ioc_vos_coh, &dcsh->dcsh_xid, &dcsh->dcsh_epoch, + dcde->dcde_write_cnt, oci->oci_map_ver, &dcsh->dcsh_leader_oid, + NULL /* dti_cos */, 0 /* dti_cos_cnt */, NULL /* hints */, + 0 /* hint_sz */, NULL /* bitmap */, 0 /* bitmap_sz */, tgts, + tgt_cnt - 1, dtx_flags, NULL /* ranks */, dcsh->dcsh_mbs, &dlh); if (rc != 0) goto out; @@ -5107,7 +5178,7 @@ ds_obj_cpd_handler(crt_rpc_t *rpc) D_ASSERT(oci != NULL); - if (oci->oci_flags & ORF_CPD_LEADER) + if (oci->oci_flags & ORF_LEADER) leader = true; else leader = false; @@ -5296,3 +5367,650 @@ ds_obj_key2anchor_handler(crt_rpc_t *rpc) if (rc != 0) D_ERROR("send reply failed: "DF_RC"\n", DP_RC(rc)); } + +struct obj_coll_tgt_args { + crt_rpc_t *octa_rpc; + struct daos_coll_shard *octa_shards; + uint32_t *octa_versions; + int octa_sponsor_tgt; + struct obj_io_context *octa_sponsor_ioc; + struct dtx_handle *octa_sponsor_dth; + union { + void *octa_misc; + /* Different collective operations may need different parameters. */ + struct dtx_memberships *octa_mbs; + }; +}; + +static int +obj_coll_tgt_punch(void *args) +{ + struct obj_coll_tgt_args *octa = args; + crt_rpc_t *rpc = octa->octa_rpc; + struct obj_coll_punch_in *ocpi = crt_req_get(rpc); + struct obj_punch_in opi = { 0 }; + struct obj_tgt_punch_args otpa = { 0 }; + uint32_t tgt_id = dss_get_module_info()->dmi_tgt_id; + int rc; + + opi.opi_dti = ocpi->ocpi_xid; + uuid_copy(opi.opi_pool_uuid, ocpi->ocpi_po_uuid); + uuid_copy(opi.opi_co_hdl, ocpi->ocpi_co_hdl); + uuid_copy(opi.opi_co_uuid, ocpi->ocpi_co_uuid); + opi.opi_oid = ocpi->ocpi_oid; + opi.opi_oid.id_shard = octa->octa_shards[tgt_id].dcs_buf[0]; + opi.opi_epoch = ocpi->ocpi_epoch; + opi.opi_api_flags = ocpi->ocpi_api_flags; + opi.opi_map_ver = ocpi->ocpi_map_ver; + opi.opi_flags = ocpi->ocpi_flags & ~ORF_LEADER; + + otpa.opc = opc_get(rpc->cr_opc); + if (tgt_id == octa->octa_sponsor_tgt) { + otpa.sponsor_ioc = octa->octa_sponsor_ioc; + otpa.sponsor_dth = octa->octa_sponsor_dth; + } + otpa.opi = &opi; + otpa.mbs = octa->octa_mbs; + if (octa->octa_versions != NULL) + otpa.ver = &octa->octa_versions[tgt_id]; + otpa.data = rpc; + + rc = obj_tgt_punch(&otpa, octa->octa_shards[tgt_id].dcs_buf, + octa->octa_shards[tgt_id].dcs_nr); + + DL_CDEBUG(rc == 0 || rc == -DER_INPROGRESS || rc == -DER_TX_RESTART, DB_IO, DLOG_ERR, rc, + "Collective punch obj shard "DF_UOID" with "DF_DTI" on tgt %u", + DP_UOID(opi.opi_oid), DP_DTI(&opi.opi_dti), tgt_id); + + return rc; +} + +typedef int (*obj_coll_func_t)(void *args); + +static int +obj_coll_local(crt_rpc_t *rpc, struct daos_coll_shard *shards, uint8_t *bitmap, uint32_t bitmap_sz, + uint32_t *version, struct obj_io_context *ioc, struct dtx_handle *dth, void *args, + obj_coll_func_t func) +{ + struct obj_coll_tgt_args octa = { 0 }; + struct dss_coll_ops coll_ops = { 0 }; + struct dss_coll_args coll_args = { 0 }; + uint32_t size = bitmap_sz << 3; + int rc = 
0; + int i; + + D_ASSERT(bitmap != NULL); + + if (version != NULL) { + if (size > dss_tgt_nr) + size = dss_tgt_nr; + D_ALLOC_ARRAY(octa.octa_versions, size); + if (octa.octa_versions == NULL) + D_GOTO(out, rc = -DER_NOMEM); + } + + octa.octa_rpc = rpc; + octa.octa_shards = shards; + octa.octa_misc = args; + octa.octa_sponsor_ioc = ioc; + octa.octa_sponsor_dth = dth; + if (ioc != NULL) + octa.octa_sponsor_tgt = dss_get_module_info()->dmi_tgt_id; + else + octa.octa_sponsor_tgt = -1; + + coll_ops.co_func = func; + coll_args.ca_func_args = &octa; + coll_args.ca_tgt_bitmap = bitmap; + coll_args.ca_tgt_bitmap_sz = bitmap_sz; + + rc = dss_thread_collective_reduce(&coll_ops, &coll_args, DSS_USE_CURRENT_ULT); + +out: + if (octa.octa_versions != NULL) { + for (i = 0, *version = 0; i < size; i++) { + if (isset(bitmap, i) && *version < octa.octa_versions[i]) + *version = octa.octa_versions[i]; + } + D_FREE(octa.octa_versions); + } + + return rc; +} + +static int +obj_coll_punch_disp(struct dtx_leader_handle *dlh, void *arg, int idx, dtx_sub_comp_cb_t comp_cb) +{ + struct ds_obj_exec_arg *exec_arg = arg; + crt_rpc_t *rpc = exec_arg->rpc; + struct obj_coll_punch_in *ocpi = crt_req_get(rpc); + int rc; + + if (idx != -1) + return ds_obj_coll_punch_remote(dlh, arg, idx, comp_cb); + + rc = obj_coll_local(rpc, exec_arg->shards, dlh->dlh_coll_bitmap, dlh->dlh_coll_bitmap_sz, + NULL, exec_arg->ioc, &dlh->dlh_handle, dlh->dlh_handle.dth_mbs, + obj_coll_tgt_punch); + + DL_CDEBUG(rc == 0 || rc == -DER_INPROGRESS || rc == -DER_TX_RESTART, DB_IO, DLOG_ERR, rc, + "Collective punch obj "DF_UOID" with "DF_DTI" on rank (leader) %u", + DP_UOID(ocpi->ocpi_oid), DP_DTI(&ocpi->ocpi_xid), dss_self_rank()); + + if (comp_cb != NULL) + comp_cb(dlh, idx, rc); + + return rc; +} + +static int +obj_coll_punch_prep(struct obj_coll_punch_in *ocpi, struct daos_coll_shard **p_shards, + uint8_t **p_hints, uint32_t *hint_sz, uint8_t **p_bitmap, uint32_t *bitmap_sz, + struct dtx_memberships **p_mbs, d_rank_list_t **p_ranks) +{ + struct pl_map *map = NULL; + struct pl_obj_layout *layout = NULL; + struct dtx_memberships *mbs = NULL; + struct daos_coll_target *dcts = NULL; + struct daos_coll_target *dct; + struct daos_coll_shard *dcs; + struct dtx_daos_target *ddt; + struct dtx_target_group *dtg; + struct pool_target *tgt; + struct daos_obj_md md = { 0 }; + uint8_t *hints = NULL; + int leader_rank = -1; + int length = -1; + uint32_t *tmp; + uint32_t rank_nr = 0; + uint32_t tgt_nr; + uint32_t size; + d_rank_t myrank = dss_self_rank(); + d_rank_t max_rank = 0; + int rc = 0; + int i; + int j; + int k; + int m; + + D_ASSERT(p_shards != NULL); + D_ASSERT(p_hints != NULL); + D_ASSERT(p_bitmap != NULL); + D_ASSERT(p_mbs != NULL); + D_ASSERT(p_ranks != NULL); + + map = pl_map_find(ocpi->ocpi_po_uuid, ocpi->ocpi_oid.id_pub); + if (map == NULL) { + D_ERROR("Failed to find valid placement map for "DF_UUID"\n", + DP_UUID(ocpi->ocpi_po_uuid)); + D_GOTO(out, rc = -DER_INVAL); + } + + md.omd_id = ocpi->ocpi_oid.id_pub; + md.omd_ver = ocpi->ocpi_map_ver; + md.omd_fdom_lvl = ocpi->ocpi_fdom_lvl; + md.omd_pdom_lvl = ocpi->ocpi_pdom_lvl; + md.omd_pda = ocpi->ocpi_pda; + + rc = pl_obj_place(map, ocpi->ocpi_oid.id_layout_ver, &md, DAOS_OO_RW, NULL, &layout); + if (rc != 0) { + D_ERROR("Failed to load object layout for "DF_OID" in pool "DF_UUID"\n", + DP_OID(ocpi->ocpi_oid.id_pub), DP_UUID(ocpi->ocpi_po_uuid)); + goto out; + } + + length = pool_map_node_nr(map->pl_poolmap); + + D_ALLOC_ARRAY(dcts, length + 1); + if (dcts == NULL) + D_GOTO(out, rc = -DER_NOMEM); 
+ + if (ocpi->ocpi_flags & ORF_LEADER) { + D_ALLOC_ARRAY(hints, length); + if (hints == NULL) + D_GOTO(out, rc = -DER_NOMEM); + } + + for (i = 0, rank_nr = 0, tgt_nr = 0; i < layout->ol_nr; i++) { + if (layout->ol_shards[i].po_target == -1 || layout->ol_shards[i].po_shard == -1) + continue; + + rc = pool_map_find_target(map->pl_poolmap, layout->ol_shards[i].po_target, &tgt); + D_ASSERT(rc == 1); + + dct = &dcts[tgt->ta_comp.co_rank]; + dct->dct_rank = tgt->ta_comp.co_rank; + + if (max_rank < dct->dct_rank) + max_rank = dct->dct_rank; + + /* + * Because of rebuild/reintegration, an engine may hold more shards than its VOS + * target count, so the size of dct->dct_tgt_ids may be larger than dss_tgt_nr. + */ + if (dct->dct_tgt_nr >= dct->dct_tgt_cap) { + if (dct->dct_tgt_nr == 0) + m = dss_tgt_nr; + else + m = dct->dct_tgt_nr << 1; + D_ALLOC_ARRAY(tmp, m); + if (tmp == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + if (dct->dct_tgt_ids != NULL) { + memcpy(tmp, dct->dct_tgt_ids, sizeof(*tmp) * dct->dct_tgt_nr); + D_FREE(dct->dct_tgt_ids); + } + + dct->dct_tgt_ids = tmp; + dct->dct_tgt_cap = m; + } + + if (dct->dct_tgt_nr == 0) { + /* Assign the first available shard to the hint for this engine. */ + if (hints != NULL) + hints[dct->dct_rank] = tgt->ta_comp.co_index; + rank_nr++; + } + + if (tgt->ta_comp.co_id == ocpi->ocpi_leader_id && + !layout->ol_shards[i].po_rebuilding && !layout->ol_shards[i].po_reintegrating) { + if (ocpi->ocpi_flags & ORF_LEADER) + D_ASSERTF(myrank == dct->dct_rank, + "Unmatched leader rank %u vs %u\n", + myrank, dct->dct_rank); + else + D_ASSERTF(myrank != dct->dct_rank, + "Unexpected target, rank %u, tgt_id %u\n", + myrank, layout->ol_shards[i].po_target); + + /* The leader target must be unique. */ + D_ASSERT(leader_rank == -1); + D_ASSERT(ocpi->ocpi_leader_id == layout->ol_shards[i].po_target); + + leader_rank = dct->dct_rank; + if (dct->dct_tgt_nr > 0) + memmove(&dct->dct_tgt_ids[1], &dct->dct_tgt_ids[0], + sizeof(dct->dct_tgt_ids[0]) * dct->dct_tgt_nr); + dct->dct_tgt_ids[0] = layout->ol_shards[i].po_target; + } else { + dct->dct_tgt_ids[dct->dct_tgt_nr] = layout->ol_shards[i].po_target; + } + + dct->dct_tgt_nr++; + tgt_nr++; + + /* Only collect the targets bitmap and shards for the current engine. */ + if (tgt->ta_comp.co_rank != myrank) + continue; + + if (dct->dct_bitmap == NULL) { + size = ((dss_tgt_nr - 1) >> 3) + 1; + D_ALLOC_ARRAY(dct->dct_bitmap, size); + if (dct->dct_bitmap == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + dct->dct_bitmap_sz = size; + } + + if (dct->dct_shards == NULL) { + D_ASSERT(dct->dct_bitmap_sz != 0); + + D_ALLOC_ARRAY(dct->dct_shards, dct->dct_bitmap_sz << 3); + if (dct->dct_shards == NULL) + D_GOTO(out, rc = -DER_NOMEM); + } + + dcs = &dct->dct_shards[tgt->ta_comp.co_index]; + + if (unlikely(isset(dct->dct_bitmap, tgt->ta_comp.co_index))) { + /* More than one shard resides on the same VOS target.
*/ + D_ASSERT(dcs->dcs_nr >= 1); + + if (dcs->dcs_nr >= dcs->dcs_cap) { + D_ALLOC_ARRAY(tmp, dcs->dcs_nr << 1); + if (tmp == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + memcpy(tmp, dcs->dcs_buf, sizeof(*tmp) * dcs->dcs_nr); + if (dcs->dcs_buf != &dcs->dcs_inline) + D_FREE(dcs->dcs_buf); + dcs->dcs_buf = tmp; + dcs->dcs_cap = dcs->dcs_nr << 1; + } + } else { + D_ASSERT(dcs->dcs_nr == 0); + + dcs->dcs_buf = &dcs->dcs_inline; + setbit(dct->dct_bitmap, tgt->ta_comp.co_index); + } + + dcs->dcs_buf[dcs->dcs_nr++] = layout->ol_shards[i].po_shard; + } + + D_ASSERT(leader_rank != -1); + D_ASSERT(rank_nr >= 1); + + if (leader_rank != 0) { + memcpy(&dcts[length], &dcts[leader_rank], sizeof(*dct)); + memmove(&dcts[1], &dcts[0], sizeof(*dct) * leader_rank); + memcpy(&dcts[0], &dcts[length], sizeof(*dct)); + memset(&dcts[length], 0, sizeof(*dct)); + } + + size = sizeof(*ddt) * tgt_nr + sizeof(*dtg) * rank_nr; + D_ALLOC(mbs, sizeof(*mbs) + size); + if (mbs == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + /* + * For object collective punch, we always commit the related DTX synchronously. Even if we + * lose some redundancy groups during DTX resync, we still continue to punch the remaining + * shards. So set dm_grp_cnt to 1 to bypass the redundancy group check. + */ + mbs->dm_grp_cnt = 1; + mbs->dm_tgt_cnt = tgt_nr; + mbs->dm_data_size = size; + mbs->dm_flags = DMF_CONTAIN_LEADER | DMF_CONTAIN_TARGET_GRP; + + ddt = &mbs->dm_tgts[0]; + dtg = (struct dtx_target_group *)(ddt + tgt_nr); + + for (i = 0, j = 0, k = 0; i < length; i++) { + dct = &dcts[i]; + if (dct->dct_tgt_ids == NULL) + continue; + + dtg[k].dtg_start_idx = j; + + for (m = 0; m < dct->dct_tgt_nr; m++) + ddt[j++].ddt_id = dct->dct_tgt_ids[m]; + + dtg[k].dtg_tgt_nr = dct->dct_tgt_nr; + dtg[k++].dtg_rank = dct->dct_rank; + } + + /* ddt[0] is always the leader target. */ + D_ASSERTF(ddt[0].ddt_id == ocpi->ocpi_leader_id, + "Invalid leader target %u vs %u on rank %u vs %u\n", + ddt[0].ddt_id, ocpi->ocpi_leader_id, myrank, leader_rank); + + if (ocpi->ocpi_flags & ORF_LEADER) { + if (unlikely(rank_nr == 1)) { + /* Only one engine is involved in the collective punch. */ + *p_ranks = NULL; + *p_hints = NULL; + *hint_sz = 0; + } else { + *p_ranks = d_rank_list_alloc(rank_nr - 1); + if (*p_ranks == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + /* Set i = 1 to skip leader_rank. */ + for (i = 1, j = 0; i < length; i++) { + if (dcts[i].dct_tgt_ids != NULL) + (*p_ranks)->rl_ranks[j++] = dcts[i].dct_rank; + } + + *p_hints = hints; + *hint_sz = max_rank + 1; + } + } else { + *p_ranks = NULL; + *p_hints = NULL; + *hint_sz = 0; + } + + *p_mbs = mbs; + +out: + if (rc < 0) { + d_rank_list_free(*p_ranks); + D_FREE(mbs); + *p_ranks = NULL; + *p_hints = NULL; + *hint_sz = 0; + *p_bitmap = NULL; + *bitmap_sz = 0; + *p_shards = NULL; + } else { + if (myrank == leader_rank) + dct = &dcts[0]; + else if (myrank > leader_rank) + dct = &dcts[myrank]; + else + dct = &dcts[myrank + 1]; + + D_ASSERT(dct->dct_rank == myrank); + + *p_shards = dct->dct_shards; + *p_bitmap = dct->dct_bitmap; + *bitmap_sz = dct->dct_bitmap_sz; + + /* + * We have already checked the pool map version, so it is impossible that the + * current pool map version mismatches the version given in the RPC such that + * the expected object shard would have been migrated to another engine.
+ */ + D_ASSERT(*p_bitmap != NULL); + + dct->dct_shards = NULL; + dct->dct_bitmap = NULL; + dct->dct_bitmap_sz = 0; + dct->dct_shard_nr = 0; + } + + daos_coll_target_cleanup(dcts, length + 1); + + if (*p_hints != hints) + D_FREE(hints); + + if (layout != NULL) + pl_obj_layout_free(layout); + + if (map != NULL) + pl_map_decref(map); + + return rc > 0 ? 0 : rc; +} + +void +ds_obj_coll_punch_handler(crt_rpc_t *rpc) +{ + struct dss_module_info *dmi = dss_get_module_info(); + struct ds_pool *pool = NULL; + struct dtx_leader_handle *dlh = NULL; + struct obj_coll_punch_in *ocpi = crt_req_get(rpc); + struct ds_obj_exec_arg exec_arg = { 0 }; + struct obj_io_context ioc = { 0 }; + struct cont_props co_props = { 0 }; + d_rank_list_t *ranks = NULL; + struct dtx_memberships *mbs = NULL; + struct daos_coll_shard *shards = NULL; + uint8_t *hints = NULL; + uint8_t *bitmap = NULL; + uint32_t bitmap_sz = 0; + uint32_t hint_sz = 0; + uint32_t flags = 0; + uint32_t dtx_flags = DTX_COLL; + uint32_t version = 0; + uint32_t max_ver = 0; + struct dtx_epoch epoch; + daos_epoch_t tmp; + int rc; + int rc1; + bool need_abort = false; + + D_DEBUG(DB_IO, "(%s) handling collective punch RPC %p for obj " + DF_UOID" on XS %u/%u epc "DF_X64" pmv %u, with dti "DF_DTI"\n", + (ocpi->ocpi_flags & ORF_LEADER) ? "leader" : "non-leader", rpc, + DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, + ocpi->ocpi_epoch, ocpi->ocpi_map_ver, DP_DTI(&ocpi->ocpi_xid)); + + if (ocpi->ocpi_flags & ORF_LEADER) { + rc = obj_ioc_begin(ocpi->ocpi_oid.id_pub, ocpi->ocpi_map_ver, ocpi->ocpi_po_uuid, + ocpi->ocpi_co_hdl, ocpi->ocpi_co_uuid, rpc, ocpi->ocpi_flags, + &ioc); + if (rc != 0) + goto out; + + rc = ds_cont_get_props(&co_props, ocpi->ocpi_po_uuid, ocpi->ocpi_co_uuid); + if (rc != 0) + goto out; + + ocpi->ocpi_fdom_lvl = co_props.dcp_redun_lvl; + ocpi->ocpi_pdom_lvl = co_props.dcp_perf_domain; + ocpi->ocpi_pda = daos_cont_props2pda(&co_props, ocpi->ocpi_flags & ORF_EC); + } else { + D_ASSERT(dmi->dmi_xs_id == 0); + + /* + * For collective punch, the map version must be matched among client and + * engines, otherwise, different engines may get different object layouts. 
+ */ + + rc = ds_pool_lookup(ocpi->ocpi_po_uuid, &pool); + if (rc != 0) { + D_ERROR("Failed to locate pool "DF_UUID": "DF_RC"\n", + DP_UUID(ocpi->ocpi_po_uuid), DP_RC(rc)); + goto out; + } + + if (pool->sp_map_version > ocpi->ocpi_map_ver) + D_GOTO(out, rc = -DER_STALE); + + if (pool->sp_map_version < ocpi->ocpi_map_ver) { + rc = ds_pool_child_map_refresh_sync(ocpi->ocpi_po_uuid, ocpi->ocpi_map_ver); + if (rc != 0) + goto out; + + if (pool->sp_map_version > ocpi->ocpi_map_ver) + D_GOTO(out, rc = -DER_STALE); + } + } + + rc = obj_coll_punch_prep(ocpi, &shards, &hints, &hint_sz, &bitmap, &bitmap_sz, &mbs, + &ranks); + if (rc != 0) + goto out; + + if (!(ocpi->ocpi_flags & ORF_LEADER)) { + rc = obj_coll_local(rpc, shards, bitmap, bitmap_sz, &version, NULL, NULL, mbs, + obj_coll_tgt_punch); + goto out; + } + + version = ocpi->ocpi_map_ver; + max_ver = ocpi->ocpi_map_ver; + + rc = process_epoch(&ocpi->ocpi_epoch, NULL /* epoch_first */, &ocpi->ocpi_flags); + if (rc == PE_OK_LOCAL) + ocpi->ocpi_flags &= ~ORF_EPOCH_UNCERTAIN; + + if (ocpi->ocpi_flags & ORF_DTX_SYNC) + dtx_flags |= DTX_SYNC; + + if (ocpi->ocpi_flags & ORF_RESEND) { + +again1: + tmp = 0; + rc = dtx_handle_resend(ioc.ioc_vos_coh, &ocpi->ocpi_xid, &tmp, &version); + switch (rc) { + case -DER_ALREADY: + D_GOTO(out, rc = 0); + case 0: + ocpi->ocpi_epoch = tmp; + flags |= ORF_RESEND; + /* TODO: Also recovery the epoch uncertainty. */ + break; + case -DER_NONEXIST: + rc = 0; + break; + default: + D_GOTO(out, rc); + } + } + +again2: + epoch.oe_value = ocpi->ocpi_epoch; + epoch.oe_first = epoch.oe_value; + epoch.oe_flags = orf_to_dtx_epoch_flags(ocpi->ocpi_flags); + + if (flags & ORF_RESEND) + dtx_flags |= DTX_PREPARED; + else + dtx_flags &= ~DTX_PREPARED; + + rc = dtx_leader_begin(ioc.ioc_vos_coh, &ocpi->ocpi_xid, &epoch, 1, version, &ocpi->ocpi_oid, + NULL /* dti_cos */, 0 /* dti_cos_cnt */, hints, hint_sz, bitmap, + bitmap_sz, NULL /* tgts */, 0 /* tgt_cnt */, dtx_flags, ranks, + mbs, &dlh); + if (rc != 0) { + D_ERROR(DF_UOID ": Failed to start DTX for collective punch: "DF_RC"\n", + DP_UOID(ocpi->ocpi_oid), DP_RC(rc)); + D_GOTO(out, rc); + } + + exec_arg.rpc = rpc; + exec_arg.ioc = &ioc; + exec_arg.shards = shards; + exec_arg.flags = flags; + + /* Execute the operation on all shards */ + rc = dtx_leader_exec_ops(dlh, obj_coll_punch_disp, NULL, 0, &exec_arg); + + if (max_ver < dlh->dlh_rmt_ver) + max_ver = dlh->dlh_rmt_ver; + + rc = dtx_leader_end(dlh, ioc.ioc_coh, rc); + switch (rc) { + case -DER_TX_RESTART: + ocpi->ocpi_epoch = d_hlc_get(); + ocpi->ocpi_flags &= ~ORF_RESEND; + flags = 0; + goto again2; + case -DER_AGAIN: + ocpi->ocpi_flags |= ORF_RESEND; + need_abort = true; + ABT_thread_yield(); + goto again1; + default: + break; + } + +out: + if (rc != 0 && need_abort) { + rc1 = dtx_coll_abort(ioc.ioc_coc, &ocpi->ocpi_xid, ranks, hints, hint_sz, bitmap, + bitmap_sz, version, ocpi->ocpi_epoch); + if (rc1 != 0 && rc1 != -DER_NONEXIST) + D_WARN("Failed to collective abort DTX "DF_DTI": "DF_RC"\n", + DP_DTI(&ocpi->ocpi_xid), DP_RC(rc1)); + } + + if (max_ver < ioc.ioc_map_ver) + max_ver = ioc.ioc_map_ver; + + if (pool != NULL && max_ver < pool->sp_map_version) + max_ver = pool->sp_map_version; + + DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR, DB_IO, rc, + "(%s) handled collective punch RPC %p for obj " + DF_UOID" on XS %u/%u epc "DF_X64" pmv %u/%u, with dti "DF_DTI, + (ocpi->ocpi_flags & ORF_LEADER) ? 
"leader" : "non-leader", rpc, + DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, ocpi->ocpi_epoch, + ocpi->ocpi_map_ver, version, DP_DTI(&ocpi->ocpi_xid)); + + obj_punch_complete(rpc, rc, max_ver); + + d_rank_list_free(ranks); + daos_coll_shard_cleanup(shards, bitmap_sz << 3); + D_FREE(bitmap); + D_FREE(hints); + D_FREE(mbs); + + /* It is no matter even if obj_ioc_begin() was not called. */ + obj_ioc_end(&ioc, rc); + + if (pool != NULL) + ds_pool_put(pool); +} diff --git a/src/object/srv_obj_remote.c b/src/object/srv_obj_remote.c index 16a661c1b58..ea298ba40fd 100644 --- a/src/object/srv_obj_remote.c +++ b/src/object/srv_obj_remote.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2022 Intel Corporation. + * (C) Copyright 2019-2023 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -35,18 +35,23 @@ struct obj_remote_cb_arg { }; static void -do_shard_update_req_cb(crt_rpc_t *req, struct obj_remote_cb_arg *arg, int rc) +shard_update_req_cb(const struct crt_cb_info *cb_info) { + struct obj_remote_cb_arg *arg = cb_info->cci_arg; + crt_rpc_t *req = cb_info->cci_rpc; crt_rpc_t *parent_req = arg->parent_req; struct obj_rw_out *orwo = crt_reply_get(req); struct obj_rw_in *orw_parent = crt_req_get(parent_req); struct dtx_leader_handle *dlh = arg->dlh; - int rc1 = 0; + struct dtx_sub_status *sub = &dlh->dlh_subs[arg->idx]; + int rc = cb_info->cci_rc; + int rc1; if (orw_parent->orw_map_ver < orwo->orw_map_version) { D_DEBUG(DB_IO, DF_UOID": map_ver stale (%d < %d).\n", DP_UOID(orw_parent->orw_oid), orw_parent->orw_map_ver, orwo->orw_map_version); + sub->dss_version = orwo->orw_map_version; rc1 = -DER_STALE; } else { rc1 = orwo->orw_ret; @@ -60,12 +65,6 @@ do_shard_update_req_cb(crt_rpc_t *req, struct obj_remote_cb_arg *arg, int rc) D_FREE(arg); } -static inline void -shard_update_req_cb(const struct crt_cb_info *cb_info) -{ - do_shard_update_req_cb(cb_info->cci_rpc, cb_info->cci_arg, cb_info->cci_rc); -} - /* Execute update on the remote target */ int ds_obj_remote_update(struct dtx_leader_handle *dlh, void *data, int idx, @@ -122,14 +121,13 @@ ds_obj_remote_update(struct dtx_leader_handle *dlh, void *data, int idx, orw_parent = crt_req_get(parent_req); orw = crt_req_get(req); *orw = *orw_parent; + orw->orw_oid.id_shard = shard_tgt->st_shard_id; - uuid_copy(orw->orw_co_hdl, orw_parent->orw_co_hdl); - uuid_copy(orw->orw_co_uuid, orw_parent->orw_co_uuid); orw->orw_flags |= ORF_BULK_BIND | obj_exec_arg->flags; if (shard_tgt->st_flags & DTF_DELAY_FORWARD && dlh->dlh_drop_cond) orw->orw_api_flags &= ~DAOS_COND_MASK; - orw->orw_dti_cos.ca_count = dth->dth_dti_cos_count; - orw->orw_dti_cos.ca_arrays = dth->dth_dti_cos; + orw->orw_dti_cos.ca_count = dth->dth_dti_cos_count; + orw->orw_dti_cos.ca_arrays = dth->dth_dti_cos; D_DEBUG(DB_TRACE, DF_UOID" forwarding to rank:%d tag:%d.\n", DP_UOID(orw->orw_oid), tgt_ep.ep_rank, tgt_ep.ep_tag); @@ -152,18 +150,23 @@ ds_obj_remote_update(struct dtx_leader_handle *dlh, void *data, int idx, } static void -do_shard_punch_req_cb(crt_rpc_t *req, struct obj_remote_cb_arg *arg, int rc) +shard_punch_req_cb(const struct crt_cb_info *cb_info) { + struct obj_remote_cb_arg *arg = cb_info->cci_arg; + crt_rpc_t *req = cb_info->cci_rpc; crt_rpc_t *parent_req = arg->parent_req; struct obj_punch_out *opo = crt_reply_get(req); - struct obj_punch_in *opi_parent = crt_req_get(req); + struct obj_punch_in *opi_parent = crt_req_get(parent_req); struct dtx_leader_handle *dlh = arg->dlh; - int rc1 = 0; + struct dtx_sub_status *sub = &dlh->dlh_subs[arg->idx]; + int rc 
= cb_info->cci_rc; + int rc1; if (opi_parent->opi_map_ver < opo->opo_map_version) { D_DEBUG(DB_IO, DF_UOID": map_ver stale (%d < %d).\n", DP_UOID(opi_parent->opi_oid), opi_parent->opi_map_ver, opo->opo_map_version); + sub->dss_version = opo->opo_map_version; rc1 = -DER_STALE; } else { rc1 = opo->opo_ret; @@ -177,12 +180,6 @@ do_shard_punch_req_cb(crt_rpc_t *req, struct obj_remote_cb_arg *arg, int rc) D_FREE(arg); } -static inline void -shard_punch_req_cb(const struct crt_cb_info *cb_info) -{ - do_shard_punch_req_cb(cb_info->cci_rpc, cb_info->cci_arg, cb_info->cci_rc); -} - /* Execute punch on the remote target */ int ds_obj_remote_punch(struct dtx_leader_handle *dlh, void *data, int idx, @@ -200,6 +197,7 @@ ds_obj_remote_punch(struct dtx_leader_handle *dlh, void *data, int idx, struct obj_punch_in *opi_parent; crt_opcode_t opc; int rc = 0; + bool sent_rpc = false; D_ASSERT(idx < dlh->dlh_normal_sub_cnt + dlh->dlh_delay_sub_cnt); sub = &dlh->dlh_subs[idx]; @@ -234,11 +232,8 @@ ds_obj_remote_punch(struct dtx_leader_handle *dlh, void *data, int idx, opi_parent = crt_req_get(parent_req); opi = crt_req_get(req); *opi = *opi_parent; + opi->opi_oid.id_shard = shard_tgt->st_shard_id; - uuid_copy(opi->opi_co_hdl, opi_parent->opi_co_hdl); - uuid_copy(opi->opi_co_uuid, opi_parent->opi_co_uuid); - opi->opi_shard_tgts.ca_count = opi_parent->opi_shard_tgts.ca_count; - opi->opi_shard_tgts.ca_arrays = opi_parent->opi_shard_tgts.ca_arrays; opi->opi_flags |= obj_exec_arg->flags; if (shard_tgt->st_flags & DTF_DELAY_FORWARD && dlh->dlh_drop_cond) opi->opi_api_flags &= ~DAOS_COND_PUNCH; @@ -254,10 +249,11 @@ ds_obj_remote_punch(struct dtx_leader_handle *dlh, void *data, int idx, D_ASSERT(sub->dss_comp == 1); D_ERROR("crt_req_send failed, rc "DF_RC"\n", DP_RC(rc)); } - return rc; + + sent_rpc = true; out: - if (rc) { + if (!sent_rpc) { sub->dss_result = rc; comp_cb(dlh, idx, rc); if (remote_arg) { @@ -269,9 +265,12 @@ ds_obj_remote_punch(struct dtx_leader_handle *dlh, void *data, int idx, } static void -do_shard_cpd_req_cb(crt_rpc_t *req, struct obj_remote_cb_arg *arg, int rc) +shard_cpd_req_cb(const struct crt_cb_info *cb_info) { - struct obj_cpd_out *oco = crt_reply_get(req); + struct obj_remote_cb_arg *arg = cb_info->cci_arg; + crt_rpc_t *req = cb_info->cci_rpc; + struct obj_cpd_out *oco = crt_reply_get(req); + int rc = cb_info->cci_rc; if (rc >= 0) rc = oco->oco_ret; @@ -284,12 +283,6 @@ do_shard_cpd_req_cb(crt_rpc_t *req, struct obj_remote_cb_arg *arg, int rc) D_FREE(arg); } -static inline void -shard_cpd_req_cb(const struct crt_cb_info *cb_info) -{ - do_shard_cpd_req_cb(cb_info->cci_rpc, cb_info->cci_arg, cb_info->cci_rc); -} - /* Dispatch CPD RPC and handle sub requests remotely */ int ds_obj_cpd_dispatch(struct dtx_leader_handle *dlh, void *arg, int idx, @@ -361,7 +354,7 @@ ds_obj_cpd_dispatch(struct dtx_leader_handle *dlh, void *arg, int idx, uuid_copy(oci->oci_co_hdl, oci_parent->oci_co_hdl); uuid_copy(oci->oci_co_uuid, oci_parent->oci_co_uuid); oci->oci_map_ver = oci_parent->oci_map_ver; - oci->oci_flags = (oci_parent->oci_flags | exec_arg->flags) & ~ORF_CPD_LEADER; + oci->oci_flags = (oci_parent->oci_flags | exec_arg->flags) & ~ORF_LEADER; oci->oci_disp_tgts.ca_arrays = NULL; oci->oci_disp_tgts.ca_count = 0; @@ -447,3 +440,124 @@ ds_obj_cpd_dispatch(struct dtx_leader_handle *dlh, void *arg, int idx, return rc; } + +static void +shard_coll_punch_req_cb(const struct crt_cb_info *cb_info) +{ + struct obj_remote_cb_arg *arg = cb_info->cci_arg; + crt_rpc_t *req = cb_info->cci_rpc; + crt_rpc_t *parent_req 
= arg->parent_req; + struct obj_coll_punch_out *ocpo = crt_reply_get(req); + struct obj_coll_punch_in *ocpi_parent = crt_req_get(parent_req); + struct dtx_leader_handle *dlh = arg->dlh; + struct dtx_sub_status *sub = &dlh->dlh_subs[arg->idx]; + int rc = cb_info->cci_rc; + int rc1; + + if (ocpi_parent->ocpi_map_ver < ocpo->ocpo_map_version) { + D_DEBUG(DB_IO, DF_UOID": map_ver stale (%d < %d).\n", + DP_UOID(ocpi_parent->ocpi_oid), ocpi_parent->ocpi_map_ver, + ocpo->ocpo_map_version); + sub->dss_version = ocpo->ocpo_map_version; + rc1 = -DER_STALE; + } else { + rc1 = ocpo->ocpo_ret; + } + + if (rc >= 0) + rc = rc1; + + arg->comp_cb(dlh, arg->idx, rc); + crt_req_decref(parent_req); + D_FREE(arg); +} + +static int +obj_coll_punch_aggregator(crt_rpc_t *source, crt_rpc_t *result, void *arg) +{ + struct obj_coll_punch_out *out_source = crt_reply_get(source); + struct obj_coll_punch_out *out_result = crt_reply_get(result); + + if (out_result->ocpo_ret == 0) + out_result->ocpo_ret = out_source->ocpo_ret; + + if (out_result->ocpo_map_version < out_source->ocpo_map_version) + out_result->ocpo_map_version = out_source->ocpo_map_version; + + return 0; +} + +struct crt_corpc_ops obj_coll_punch_co_ops = { + .co_aggregate = obj_coll_punch_aggregator, + .co_pre_forward = NULL, + .co_post_reply = NULL, +}; + +int +ds_obj_coll_punch_remote(struct dtx_leader_handle *dlh, void *data, int idx, + dtx_sub_comp_cb_t comp_cb) +{ + struct ds_obj_exec_arg *exec_arg = data; + struct obj_remote_cb_arg *remote_arg; + struct dtx_sub_status *sub; + crt_rpc_t *parent_req = exec_arg->rpc; + crt_rpc_t *req; + struct obj_coll_punch_in *ocpi_parent; + struct obj_coll_punch_in *ocpi; + int rc = 0; + bool sent_rpc = false; + + /* For collective punch, only need one bcast RPC. */ + D_ASSERT(idx == 0); + D_ASSERT(dlh->dlh_coll_ranks != NULL); + + sub = &dlh->dlh_subs[idx]; + D_ALLOC_PTR(remote_arg); + if (remote_arg == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + crt_req_addref(parent_req); + remote_arg->parent_req = parent_req; + remote_arg->dlh = dlh; + remote_arg->comp_cb = comp_cb; + remote_arg->idx = idx; + + rc = crt_corpc_req_create(dss_get_module_info()->dmi_ctx, NULL, dlh->dlh_coll_ranks, + DAOS_RPC_OPCODE(DAOS_OBJ_RPC_COLL_PUNCH, DAOS_OBJ_MODULE, + DAOS_OBJ_VERSION), + NULL, NULL, CRT_RPC_FLAG_FILTER_INVERT, + crt_tree_topo(CRT_TREE_KNOMIAL, dlh->dlh_coll_tree_width), &req); + if (rc != 0) { + D_ERROR("crt_corpc_req_create failed for collective punch remote: "DF_RC"\n", + DP_RC(rc)); + D_GOTO(out, rc); + } + + ocpi_parent = crt_req_get(parent_req); + ocpi = crt_req_get(req); + *ocpi = *ocpi_parent; + + ocpi->ocpi_flags = (exec_arg->flags | ocpi_parent->ocpi_flags) & ~ORF_LEADER; + + D_DEBUG(DB_IO, DF_UOID" broadcast collective punch RPC with flags %x/"DF_X64"\n", + DP_UOID(ocpi->ocpi_oid), ocpi->ocpi_flags, ocpi->ocpi_api_flags); + + rc = crt_req_send(req, shard_coll_punch_req_cb, remote_arg); + if (rc != 0) { + D_ASSERT(sub->dss_comp == 1); + D_ERROR("crt_req_send failed for collective punch remote: "DF_RC"\n", DP_RC(rc)); + } + + sent_rpc = true; + +out: + if (!sent_rpc) { + sub->dss_result = rc; + comp_cb(dlh, idx, rc); + if (remote_arg != NULL) { + crt_req_decref(parent_req); + D_FREE(remote_arg); + } + } + return rc; +} diff --git a/src/pool/srv_internal.h b/src/pool/srv_internal.h index e0370d1ee68..7f4e977f4d3 100644 --- a/src/pool/srv_internal.h +++ b/src/pool/srv_internal.h @@ -242,6 +242,7 @@ extern struct bio_reaction_ops nvme_reaction_ops; uint32_t pool_iv_map_ent_size(int nr); int ds_pool_iv_init(void); int 
ds_pool_iv_fini(void); +int ds_pool_map_refresh_internal(uuid_t uuid, uint32_t version); void ds_pool_map_refresh_ult(void *arg); int ds_pool_iv_conn_hdl_update(struct ds_pool *pool, uuid_t hdl_uuid, diff --git a/src/pool/srv_iv.c b/src/pool/srv_iv.c index 55a0141d7cc..75e40aa52c1 100644 --- a/src/pool/srv_iv.c +++ b/src/pool/srv_iv.c @@ -1355,20 +1355,19 @@ pool_iv_map_invalidate(void *ns, unsigned int shortcut, unsigned int sync_mode) return rc; } -/* ULT to refresh pool map version */ -void -ds_pool_map_refresh_ult(void *arg) +int +ds_pool_map_refresh_internal(uuid_t uuid, uint32_t version) { - struct pool_map_refresh_ult_arg *iv_arg = arg; - struct ds_pool *pool; - d_rank_t rank; - int rc = 0; + struct ds_pool *pool; + d_rank_t rank; + int rc = 0; /* Pool IV fetch should only be done in xstream 0 */ D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); - rc = ds_pool_lookup(iv_arg->iua_pool_uuid, &pool); + + rc = ds_pool_lookup(uuid, &pool); if (rc != 0) { - D_WARN(DF_UUID" refresh pool map: %d\n", DP_UUID(iv_arg->iua_pool_uuid), rc); + D_WARN(DF_UUID" refresh pool map: %d\n", DP_UUID(uuid), rc); goto out; } @@ -1385,12 +1384,10 @@ ds_pool_map_refresh_ult(void *arg) * until the refresh is done. */ ABT_mutex_lock(pool->sp_mutex); - if (pool->sp_map_version >= iv_arg->iua_pool_version && - pool->sp_map != NULL && + if (pool->sp_map_version >= version && pool->sp_map != NULL && !DAOS_FAIL_CHECK(DAOS_FORCE_REFRESH_POOL_MAP)) { D_DEBUG(DB_TRACE, "current pool version %u >= %u\n", - pool_map_get_version(pool->sp_map), - iv_arg->iua_pool_version); + pool_map_get_version(pool->sp_map), version); goto unlock; } @@ -1410,9 +1407,22 @@ ds_pool_map_refresh_ult(void *arg) out: if (pool != NULL) ds_pool_put(pool); - if (iv_arg->iua_eventual) + + return rc; +} + +/* ULT to refresh pool map version */ +void +ds_pool_map_refresh_ult(void *arg) +{ + struct pool_map_refresh_ult_arg *iv_arg = arg; + int rc; + + rc = ds_pool_map_refresh_internal(iv_arg->iua_pool_uuid, iv_arg->iua_pool_version); + if (iv_arg->iua_eventual != NULL) ABT_eventual_set(iv_arg->iua_eventual, (void *)&rc, sizeof(rc)); - D_FREE(iv_arg); + else + D_FREE(iv_arg); } int diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index 0d2a2a4bc3d..7d2190cd245 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ -7650,39 +7650,44 @@ ds_pool_replicas_update_handler(crt_rpc_t *rpc) /* Update pool map version for current xstream. 
*/ int -ds_pool_child_map_refresh_sync(struct ds_pool_child *dpc) +ds_pool_child_map_refresh_sync(uuid_t uuid, uint32_t version) { struct pool_map_refresh_ult_arg arg; ABT_eventual eventual; int *status; int rc; - rc = ABT_eventual_create(sizeof(*status), &eventual); - if (rc != ABT_SUCCESS) - return dss_abterr2der(rc); + if (dss_get_module_info()->dmi_xs_id != 0) { + rc = ABT_eventual_create(sizeof(*status), &eventual); + if (rc != ABT_SUCCESS) + return dss_abterr2der(rc); - arg.iua_pool_version = dpc->spc_map_version; - uuid_copy(arg.iua_pool_uuid, dpc->spc_uuid); - arg.iua_eventual = eventual; + arg.iua_pool_version = version; + uuid_copy(arg.iua_pool_uuid, uuid); + arg.iua_eventual = eventual; - rc = dss_ult_create(ds_pool_map_refresh_ult, &arg, DSS_XS_SYS, - 0, 0, NULL); - if (rc) - D_GOTO(out_eventual, rc); + rc = dss_ult_create(ds_pool_map_refresh_ult, &arg, DSS_XS_SYS, + 0, 0, NULL); + if (rc != 0) + D_GOTO(out_eventual, rc); - rc = ABT_eventual_wait(eventual, (void **)&status); - if (rc != ABT_SUCCESS) - D_GOTO(out_eventual, rc = dss_abterr2der(rc)); - if (*status != 0) - D_GOTO(out_eventual, rc = *status); + rc = ABT_eventual_wait(eventual, (void **)&status); + if (rc != ABT_SUCCESS) + rc = dss_abterr2der(rc); + else + rc = *status; out_eventual: - ABT_eventual_free(&eventual); + ABT_eventual_free(&eventual); + } else { + rc = ds_pool_map_refresh_internal(uuid, version); + } + return rc; } int -ds_pool_child_map_refresh_async(struct ds_pool_child *dpc) +ds_pool_child_map_refresh_async(uuid_t uuid, uint32_t version) { struct pool_map_refresh_ult_arg *arg; int rc; @@ -7690,8 +7695,8 @@ ds_pool_child_map_refresh_async(struct ds_pool_child *dpc) D_ALLOC_PTR(arg); if (arg == NULL) return -DER_NOMEM; - arg->iua_pool_version = dpc->spc_map_version; - uuid_copy(arg->iua_pool_uuid, dpc->spc_uuid); + arg->iua_pool_version = version; + uuid_copy(arg->iua_pool_uuid, uuid); rc = dss_ult_create(ds_pool_map_refresh_ult, arg, DSS_XS_SYS, 0, 0, NULL); diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index 6a82ff4a5bf..ee67ef86ae6 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -1338,10 +1338,12 @@ pool_query_one(void *vin) static int pool_tgt_query(struct ds_pool *pool, struct daos_pool_space *ps) { - struct dss_coll_ops coll_ops; - struct dss_coll_args coll_args = { 0 }; - struct pool_query_xs_arg agg_arg = { 0 }; - int rc; + struct dss_coll_ops coll_ops; + struct dss_coll_args coll_args = { 0 }; + struct pool_query_xs_arg agg_arg = { 0 }; + int *exclude_tgts = NULL; + uint32_t exclude_tgt_nr = 0; + int rc = 0; D_ASSERT(ps != NULL); memset(ps, 0, sizeof(*ps)); @@ -1359,24 +1361,32 @@ pool_tgt_query(struct ds_pool *pool, struct daos_pool_space *ps) coll_args.ca_aggregator = &agg_arg; coll_args.ca_func_args = &coll_args.ca_stream_args; - rc = ds_pool_get_failed_tgt_idx(pool->sp_uuid, - &coll_args.ca_exclude_tgts, - &coll_args.ca_exclude_tgts_cnt); - if (rc) { + rc = ds_pool_get_failed_tgt_idx(pool->sp_uuid, &exclude_tgts, &exclude_tgt_nr); + if (rc != 0) { D_ERROR(DF_UUID": failed to get index : rc "DF_RC"\n", DP_UUID(pool->sp_uuid), DP_RC(rc)); - return rc; + goto out; + } + + if (exclude_tgts != NULL) { + rc = dss_build_coll_bitmap(exclude_tgts, exclude_tgt_nr, &coll_args.ca_tgt_bitmap, + &coll_args.ca_tgt_bitmap_sz); + if (rc != 0) + goto out; } rc = dss_thread_collective_reduce(&coll_ops, &coll_args, 0); - D_FREE(coll_args.ca_exclude_tgts); - if (rc) { + if (rc != 0) { D_ERROR("Pool query on pool "DF_UUID" failed, "DF_RC"\n", DP_UUID(pool->sp_uuid), DP_RC(rc)); - 
return rc; + goto out; } *ps = agg_arg.qxa_space; + +out: + D_FREE(coll_args.ca_tgt_bitmap); + D_FREE(exclude_tgts); return rc; } @@ -2100,9 +2110,11 @@ ds_pool_tgt_discard_ult(void *data) { struct ds_pool *pool; struct tgt_discard_arg *arg = data; - struct dss_coll_ops coll_ops = { 0 }; - struct dss_coll_args coll_args = { 0 }; - int rc; + struct dss_coll_ops coll_ops = { 0 }; + struct dss_coll_args coll_args = { 0 }; + int *exclude_tgts = NULL; + uint32_t exclude_tgt_nr = 0; + int rc = 0; /* If discard failed, let's still go ahead, since reintegration might * still succeed, though it might leave some garbage on the reintegration @@ -2125,21 +2137,28 @@ ds_pool_tgt_discard_ult(void *data) */ status = PO_COMP_ST_UP | PO_COMP_ST_UPIN | PO_COMP_ST_DRAIN | PO_COMP_ST_DOWN | PO_COMP_ST_NEW; - rc = ds_pool_get_tgt_idx_by_state(arg->pool_uuid, status, - &coll_args.ca_exclude_tgts, - &coll_args.ca_exclude_tgts_cnt); - if (rc) { + rc = ds_pool_get_tgt_idx_by_state(arg->pool_uuid, status, &exclude_tgts, + &exclude_tgt_nr); + if (rc != 0) { D_ERROR(DF_UUID "failed to get index : rc "DF_RC"\n", DP_UUID(arg->pool_uuid), DP_RC(rc)); D_GOTO(put, rc); } + + if (exclude_tgts != NULL) { + rc = dss_build_coll_bitmap(exclude_tgts, exclude_tgt_nr, + &coll_args.ca_tgt_bitmap, &coll_args.ca_tgt_bitmap_sz); + if (rc != 0) + goto put; + } } rc = dss_thread_collective_reduce(&coll_ops, &coll_args, DSS_ULT_DEEP_STACK); - if (coll_args.ca_exclude_tgts) - D_FREE(coll_args.ca_exclude_tgts); DL_CDEBUG(rc == 0, DB_MD, DLOG_ERR, rc, DF_UUID " tgt discard", DP_UUID(arg->pool_uuid)); + put: + D_FREE(coll_args.ca_tgt_bitmap); + D_FREE(exclude_tgts); pool->sp_need_discard = 0; pool->sp_discard_status = rc; diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py index f93f9b7986d..e7dd31882ec 100644 --- a/src/tests/ftest/util/telemetry_utils.py +++ b/src/tests/ftest/util/telemetry_utils.py @@ -34,6 +34,9 @@ class TelemetryUtils(): "engine_pool_ops_dkey_punch", "engine_pool_ops_dtx_abort", "engine_pool_ops_dtx_check", + "engine_pool_ops_dtx_coll_abort", + "engine_pool_ops_dtx_coll_check", + "engine_pool_ops_dtx_coll_commit", "engine_pool_ops_dtx_commit", "engine_pool_ops_dtx_refresh", "engine_pool_ops_ec_agg", @@ -353,6 +356,18 @@ class TelemetryUtils(): "engine_io_ops_migrate_latency_mean", "engine_io_ops_migrate_latency_min", "engine_io_ops_migrate_latency_stddev"] + ENGINE_IO_OPS_OBJ_COLL_PUNCH_ACTIVE_METRICS = [ + "engine_io_ops_obj_coll_punch_active", + "engine_io_ops_obj_coll_punch_active_max", + "engine_io_ops_obj_coll_punch_active_mean", + "engine_io_ops_obj_coll_punch_active_min", + "engine_io_ops_obj_coll_punch_active_stddev"] + ENGINE_IO_OPS_OBJ_COLL_PUNCH_LATENCY_METRICS = [ + "engine_io_ops_obj_coll_punch_latency", + "engine_io_ops_obj_coll_punch_latency_max", + "engine_io_ops_obj_coll_punch_latency_mean", + "engine_io_ops_obj_coll_punch_latency_min", + "engine_io_ops_obj_coll_punch_latency_stddev"] ENGINE_IO_OPS_OBJ_ENUM_ACTIVE_METRICS = [ "engine_io_ops_obj_enum_active", "engine_io_ops_obj_enum_active_max", @@ -481,6 +496,8 @@ class TelemetryUtils(): ENGINE_IO_OPS_KEY2ANCHOR_LATENCY_METRICS +\ ENGINE_IO_OPS_MIGRATE_ACTIVE_METRICS +\ ENGINE_IO_OPS_MIGRATE_LATENCY_METRICS +\ + ENGINE_IO_OPS_OBJ_COLL_PUNCH_ACTIVE_METRICS +\ + ENGINE_IO_OPS_OBJ_COLL_PUNCH_LATENCY_METRICS +\ ENGINE_IO_OPS_OBJ_ENUM_ACTIVE_METRICS +\ ENGINE_IO_OPS_OBJ_ENUM_LATENCY_METRICS +\ ENGINE_IO_OPS_OBJ_PUNCH_ACTIVE_METRICS +\ @@ -563,7 +580,7 @@ class TelemetryUtils(): "engine_mem_vos_dtx_cmt_ent_48", 
"engine_mem_vos_vos_obj_360", "engine_mem_vos_vos_lru_size", - "engine_mem_dtx_dtx_leader_handle_336", + "engine_mem_dtx_dtx_leader_handle_376", "engine_mem_dtx_dtx_entry_40"] ENGINE_MEM_TOTAL_USAGE_METRICS = [ "engine_mem_total_mem"] diff --git a/src/tests/suite/daos_obj.c b/src/tests/suite/daos_obj.c index 5415d3fa9fa..0cabff2be2e 100644 --- a/src/tests/suite/daos_obj.c +++ b/src/tests/suite/daos_obj.c @@ -5115,6 +5115,79 @@ oit_list_filter(void **state) test_teardown((void **)&arg); } +#define DTS_DKEY_CNT 8 +#define DTS_DKEY_SIZE 16 +#define DTS_IOSIZE 64 + +static void +obj_coll_punch(test_arg_t *arg, daos_oclass_id_t oclass) +{ + char buf[DTS_IOSIZE]; + char dkeys[DTS_DKEY_CNT][DTS_DKEY_SIZE]; + const char *akey = "daos_io_akey"; + daos_obj_id_t oid; + struct ioreq req; + int i; + + oid = daos_test_oid_gen(arg->coh, oclass, 0, 0, arg->myrank); + ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); + + for (i = 0; i < DTS_DKEY_CNT; i++) { + dts_buf_render(dkeys[i], DTS_DKEY_SIZE); + dts_buf_render(buf, DTS_IOSIZE); + insert_single(dkeys[i], akey, 0, buf, DTS_IOSIZE, DAOS_TX_NONE, &req); + } + + print_message("Collective punch object\n"); + punch_obj(DAOS_TX_NONE, &req); + + print_message("Fetch after punch\n"); + arg->expect_result = -DER_NONEXIST; + for (i = 0; i < DTS_DKEY_CNT; i++) + lookup_empty_single(dkeys[i], akey, 0, buf, DTS_IOSIZE, DAOS_TX_NONE, &req); + + ioreq_fini(&req); +} + +static void +io_50(void **state) +{ + test_arg_t *arg = *state; + + print_message("Collective punch object - OC_SX\n"); + + if (!test_runable(arg, 2)) + return; + + obj_coll_punch(arg, OC_SX); +} + +static void +io_51(void **state) +{ + test_arg_t *arg = *state; + + print_message("Collective punch object - OC_EC_2P1G2\n"); + + if (!test_runable(arg, 3)) + return; + + obj_coll_punch(arg, OC_EC_2P1G2); +} + +static void +io_52(void **state) +{ + test_arg_t *arg = *state; + + print_message("Collective punch object - OC_EC_4P1GX\n"); + + if (!test_runable(arg, 5)) + return; + + obj_coll_punch(arg, OC_EC_4P1GX); +} + static const struct CMUnitTest io_tests[] = { { "IO1: simple update/fetch/verify", io_simple, async_disable, test_case_teardown}, @@ -5213,6 +5286,12 @@ static const struct CMUnitTest io_tests[] = { { "IO47: obj_open perf", obj_open_perf, async_disable, test_case_teardown}, { "IO48: oit_list_filter", oit_list_filter, async_disable, test_case_teardown}, { "IO49: oit_list_filter async", oit_list_filter, async_enable, test_case_teardown}, + { "IO50: collective punch object - OC_SX", + io_50, NULL, test_case_teardown}, + { "IO51: collective punch object - OC_EC_2P1G2", + io_51, NULL, test_case_teardown}, + { "IO52: collective punch object - OC_EC_4P1GX", + io_52, NULL, test_case_teardown}, }; int diff --git a/src/vos/tests/vts_dtx.c b/src/vos/tests/vts_dtx.c index f24ef4fa820..5024e3e2bd8 100644 --- a/src/vos/tests/vts_dtx.c +++ b/src/vos/tests/vts_dtx.c @@ -56,7 +56,6 @@ vts_dtx_begin(const daos_unit_oid_t *oid, daos_handle_t coh, daos_epoch_t epoch, dth->dth_pinned = 0; dth->dth_sync = 0; dth->dth_cos_done = 0; - dth->dth_resent = 0; dth->dth_touched_leader_oid = 0; dth->dth_local_tx_started = 0; dth->dth_solo = 0; diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index 4eefa622b7a..ad05a497990 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -1902,10 +1902,12 @@ vos_dtx_check(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, } int -vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, struct dtx_memberships **mbs) +vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, 
daos_unit_oid_t *oid, + struct dtx_memberships **mbs) { struct vos_container *cont; struct dtx_memberships *tmp; + struct vos_dtx_act_ent *dae; d_iov_t kiov; d_iov_t riov; int rc; @@ -1917,14 +1919,24 @@ vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, struct dtx_memberships * d_iov_set(&riov, NULL, 0); rc = dbtree_lookup(cont->vc_dtx_active_hdl, &kiov, &riov); if (rc == 0) { - tmp = vos_dtx_pack_mbs(vos_cont2umm(cont), riov.iov_buf); - if (tmp == NULL) + dae = riov.iov_buf; + tmp = vos_dtx_pack_mbs(vos_cont2umm(cont), dae); + if (tmp == NULL) { rc = -DER_NOMEM; - else + } else { + if (oid != NULL) + *oid = DAE_OID(dae); *mbs = tmp; + } + } else if (rc == -DER_NONEXIST) { + rc = dbtree_lookup(cont->vc_dtx_committed_hdl, &kiov, &riov); + if (rc == 0) + rc = 1; + else if (rc == -DER_NONEXIST && !cont->vc_cmt_dtx_indexed) + rc = -DER_INPROGRESS; } - if (rc != 0) + if (rc < 0) D_ERROR("Failed to load mbs for "DF_DTI": "DF_RC"\n", DP_DTI(dti), DP_RC(rc)); return rc;