diff --git a/src/container/srv_target.c b/src/container/srv_target.c
index 78e714954c9a..7648da42a675 100644
--- a/src/container/srv_target.c
+++ b/src/container/srv_target.c
@@ -1653,6 +1653,8 @@ ds_cont_tgt_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid,
 	struct dss_coll_ops	 coll_ops = { 0 };
 	struct dss_coll_args	 coll_args = { 0 };
 	struct ds_pool		*pool;
+	int			*exclude_tgts = NULL;
+	uint32_t		 exclude_tgt_nr = 0;
 	int			 rc;
 
 	/* Only for debugging purpose to compare srv_cont_hdl with cont_hdl_uuid */
@@ -1685,18 +1687,22 @@ ds_cont_tgt_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid,
 	coll_args.ca_func_args	= &arg;
 
 	/* setting aggregator args */
-	rc = ds_pool_get_failed_tgt_idx(pool_uuid, &coll_args.ca_exclude_tgts,
-					&coll_args.ca_exclude_tgts_cnt);
-	if (rc) {
+	rc = ds_pool_get_failed_tgt_idx(pool_uuid, &exclude_tgts, &exclude_tgt_nr);
+	if (rc != 0) {
 		D_ERROR(DF_UUID "failed to get index : rc "DF_RC"\n",
 			DP_UUID(pool_uuid), DP_RC(rc));
-		return rc;
+		goto out;
 	}
 
-	rc = dss_thread_collective_reduce(&coll_ops, &coll_args, 0);
-	D_FREE(coll_args.ca_exclude_tgts);
+	if (exclude_tgts != NULL) {
+		rc = dss_build_coll_bitmap(exclude_tgts, exclude_tgt_nr, &coll_args.ca_tgt_bitmap,
+					   &coll_args.ca_tgt_bitmap_sz);
+		if (rc != 0)
+			goto out;
+	}
 
-	if (rc != 0) {
+	rc = dss_thread_collective_reduce(&coll_ops, &coll_args, 0);
+	if (rc != 0)
 		/* Once it exclude the target from the pool, since the target
 		 * might still in the cart group, so IV cont open might still
 		 * come to this target, especially if cont open/close will be
@@ -1706,9 +1712,10 @@ ds_cont_tgt_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid,
 		D_ERROR("open "DF_UUID"/"DF_UUID"/"DF_UUID":"DF_RC"\n",
 			DP_UUID(pool_uuid), DP_UUID(cont_uuid),
 			DP_UUID(cont_hdl_uuid), DP_RC(rc));
-		return rc;
-	}
 
+out:
+	D_FREE(coll_args.ca_tgt_bitmap);
+	D_FREE(exclude_tgts);
 	return rc;
 }
 
diff --git a/src/dtx/SConscript b/src/dtx/SConscript
index 5a6849671de9..4d0f1f2dcb37 100644
--- a/src/dtx/SConscript
+++ b/src/dtx/SConscript
@@ -18,7 +18,8 @@ def scons():
     # dtx
     denv.Append(CPPDEFINES=['-DDAOS_PMEM_BUILD'])
     dtx = denv.d_library('dtx',
-                         ['dtx_srv.c', 'dtx_rpc.c', 'dtx_resync.c', 'dtx_common.c', 'dtx_cos.c'],
+                         ['dtx_srv.c', 'dtx_rpc.c', 'dtx_resync.c', 'dtx_common.c', 'dtx_cos.c',
+                          'dtx_coll.c'],
                          install_off="../..")
 
     denv.Install('$PREFIX/lib64/daos_srv', dtx)
diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c
new file mode 100644
index 000000000000..b3149ffcea24
--- /dev/null
+++ b/src/dtx/dtx_coll.c
@@ -0,0 +1,373 @@
+/**
+ * (C) Copyright 2023 Intel Corporation.
+ *
+ * SPDX-License-Identifier: BSD-2-Clause-Patent
+ */
+/**
+ * dtx: DTX collective RPC logic
+ */
+#define D_LOGFAC	DD_FAC(dtx)
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "dtx_internal.h"
+
+/*
+ * For collective DTX, when we commit/abort/check the DTX on the system XS (on a non-leader),
+ * we cannot directly locate the DTX entry since no VOS target is attached to the system XS.
+ * In such a case, we have two options:
+ *
+ * 1. The DTX leader (on an IO XS) knows on which VOS target the non-leader can find out the DTX
+ *    entry. So the DTX leader can send the related information (IO XS index) to the non-leader.
+ *
+ * 2. The non-leader can start a ULT on every local XS collectively to find the DTX by force,
+ *    regardless of whether the related DTX entry really exists on the VOS target or not.
+ *
+ * Usually, the 2nd option causes more overhead and should be avoided, so the 1st one is the
+ * relatively better choice.
+ * On the other hand, if there are a lot of VOS targets in the system, it may be inefficient
+ * to send all VOS targets' information to all related non-leaders via bcast. Instead, we will
+ * only send one VOS target's information to each non-leader; the non-leader can then load the
+ * mbs (dtx_memberships) from the DTX entry and calculate the other VOS targets' information by
+ * itself.
+ */
+
+struct dtx_coll_local_args {
+	uuid_t			 dcla_po_uuid;
+	uuid_t			 dcla_co_uuid;
+	struct dtx_id		 dcla_xid;
+	daos_epoch_t		 dcla_epoch;
+	uint32_t		 dcla_opc;
+	int			*dcla_results;
+};
+
+void
+dtx_coll_prep_ult(void *arg)
+{
+	struct dtx_coll_prep_args	*dcpa = arg;
+	struct dtx_coll_in		*dci = crt_req_get(dcpa->dcpa_rpc);
+	struct dtx_memberships		*mbs = NULL;
+	struct ds_cont_child		*cont = NULL;
+	uint32_t			 opc = opc_get(dcpa->dcpa_rpc->cr_opc);
+	int				 rc = 0;
+
+	dcpa->dcpa_result = ds_cont_child_lookup(dci->dci_po_uuid, dci->dci_co_uuid, &cont);
+	if (dcpa->dcpa_result != 0) {
+		D_ERROR("Failed to locate pool="DF_UUID" cont="DF_UUID" for DTX "
+			DF_DTI" with opc %u: "DF_RC"\n",
+			DP_UUID(dci->dci_po_uuid), DP_UUID(dci->dci_co_uuid),
+			DP_DTI(&dci->dci_xid), opc, DP_RC(dcpa->dcpa_result));
+		/*
+		 * Convert the case of a non-existent container to -DER_IO to distinguish it
+		 * from the case where the DTX entry does not exist. The latter one is normal.
+		 */
+		if (dcpa->dcpa_result == -DER_NONEXIST)
+			dcpa->dcpa_result = -DER_IO;
+
+		goto out;
+	}
+
+	dcpa->dcpa_result = vos_dtx_load_mbs(cont->sc_hdl, &dci->dci_xid, &dcpa->dcpa_oid, &mbs);
+	if (dcpa->dcpa_result == -DER_INPROGRESS && !dtx_cont_opened(cont) &&
+	    opc == DTX_COLL_CHECK) {
+		rc = start_dtx_reindex_ult(cont);
+		if (rc != 0)
+			D_ERROR(DF_UUID": Failed to trigger DTX reindex: "DF_RC"\n",
+				DP_UUID(cont->sc_uuid), DP_RC(rc));
+	}
+
+	if (dcpa->dcpa_result != 0)
+		goto out;
+
+	dcpa->dcpa_result = dtx_coll_prep(dci->dci_po_uuid, dcpa->dcpa_oid, &dci->dci_xid, mbs, -1,
+					  dci->dci_version, cont->sc_pool->spc_map_version,
+					  opc == DTX_COLL_CHECK, false, &dcpa->dcpa_dce);
+	if (dcpa->dcpa_result != 0)
+		D_ERROR("Failed to prepare the bitmap (and hints) for collective DTX "
+			DF_DTI" opc %u: "DF_RC"\n", DP_DTI(&dci->dci_xid), opc,
+			DP_RC(dcpa->dcpa_result));
+
+out:
+	if (cont != NULL)
+		ds_cont_child_put(cont);
+
+	rc = ABT_future_set(dcpa->dcpa_future, NULL);
+	D_ASSERT(rc == ABT_SUCCESS);
+}
+
+int
+dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dtx_memberships *mbs,
+	      uint32_t my_tgtid, uint32_t dtx_ver, uint32_t pm_ver, bool for_check, bool need_hint,
+	      struct dtx_coll_entry **p_dce)
+{
+	struct pl_map		*map = NULL;
+	struct pl_obj_layout	*layout = NULL;
+	struct pool_target	*target;
+	struct dtx_daos_target	*ddt;
+	struct dtx_coll_target	*dct;
+	struct dtx_coll_entry	*dce = NULL;
+	struct daos_obj_md	 md = { 0 };
+	uint32_t		 node_nr;
+	d_rank_t		 my_rank = dss_self_rank();
+	d_rank_t		 max_rank = 0;
+	int			 rc = 0;
+	int			 i;
+	int			 j;
+
+	D_ASSERT(mbs->dm_flags & DMF_COLL_TARGET);
+
+	D_ALLOC_PTR(dce);
+	if (dce == NULL)
+		D_GOTO(out, rc = -DER_NOMEM);
+
+	dce->dce_xid = *xid;
+	dce->dce_ver = dtx_ver;
+	dce->dce_refs = 1;
+
+	ddt = &mbs->dm_tgts[0];
+	dct = (struct dtx_coll_target *)(ddt + mbs->dm_tgt_cnt);
+	D_ALLOC(dce->dce_bitmap, dct->dct_bitmap_sz);
+	if (dce->dce_bitmap == NULL)
+		D_GOTO(out, rc = -DER_NOMEM);
+
+	dce->dce_bitmap_sz = dct->dct_bitmap_sz;
+
+	if (!for_check) {
+		memcpy(dce->dce_bitmap, dct->dct_tgts + dct->dct_tgt_nr, dct->dct_bitmap_sz);
+	} else {
+		map = pl_map_find(po_uuid, oid.id_pub);
+		if (map == NULL) {
+			D_ERROR("Failed to
find valid placement map in pool "DF_UUID"\n", + DP_UUID(po_uuid)); + D_GOTO(out, rc = -DER_INVAL); + } + + for (i = 0, j = 0; i < dct->dct_tgt_nr; i++) { + rc = pool_map_find_target(map->pl_poolmap, dct->dct_tgts[i], &target); + D_ASSERT(rc == 1); + + /* Skip the targets that reside on other engines. */ + if (unlikely(target->ta_comp.co_rank != my_rank)) + continue; + + /* Skip the target that (re-)joined the system after the DTX. */ + if (target->ta_comp.co_ver > dtx_ver) + continue; + + /* Skip non-healthy one. */ + if (target->ta_comp.co_status != PO_COMP_ST_UP && + target->ta_comp.co_status != PO_COMP_ST_UPIN && + target->ta_comp.co_status != PO_COMP_ST_NEW && + target->ta_comp.co_status != PO_COMP_ST_DRAIN) + continue; + + /* Skip current (new) leader target. */ + if (my_tgtid != target->ta_comp.co_index) { + setbit(dce->dce_bitmap, target->ta_comp.co_index); + j++; + } + } + + rc = 0; + + if (unlikely(j == 0)) { + D_FREE(dce->dce_bitmap); + dce->dce_bitmap_sz = 0; + } + } + + if (!need_hint) + goto out; + + if (map == NULL) { + map = pl_map_find(po_uuid, oid.id_pub); + if (map == NULL) { + D_ERROR("Failed to find valid placement map in pool "DF_UUID"\n", + DP_UUID(po_uuid)); + D_GOTO(out, rc = -DER_INVAL); + } + } + + node_nr = pool_map_node_nr(map->pl_poolmap); + if (unlikely(node_nr == 1)) + D_GOTO(out, rc = 0); + + dce->dce_ranks = d_rank_list_alloc(node_nr - 1); + if (dce->dce_ranks == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + D_ALLOC_ARRAY(dce->dce_hints, node_nr); + if (dce->dce_hints == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + for (i = 0; i < node_nr; i++) + dce->dce_hints[i] = (uint8_t)(-1); + + md.omd_id = oid.id_pub; + md.omd_ver = pm_ver; + md.omd_fdom_lvl = dct->dct_fdom_lvl; + md.omd_pda = dct->dct_pda; + md.omd_pdom_lvl = dct->dct_pdom_lvl; + + rc = pl_obj_place(map, oid.id_layout_ver, &md, DAOS_OO_RW, NULL, &layout); + if (rc != 0) { + D_ERROR("Failed to load object layout for "DF_OID" in pool "DF_UUID"\n", + DP_OID(oid.id_pub), DP_UUID(po_uuid)); + goto out; + } + + for (i = 0, j = 0; i < layout->ol_nr && j < node_nr - 1; i++) { + if (layout->ol_shards[i].po_target == -1 || layout->ol_shards[i].po_shard == -1) + continue; + + rc = pool_map_find_target(map->pl_poolmap, layout->ol_shards[i].po_target, &target); + D_ASSERT(rc == 1); + + /* Skip current leader rank. */ + if (target->ta_comp.co_rank == my_rank) + continue; + + /* Skip the target that (re-)joined the system after the DTX. */ + if (target->ta_comp.co_ver > dtx_ver) + continue; + + /* Skip non-healthy one. */ + if (target->ta_comp.co_status != PO_COMP_ST_UP && + target->ta_comp.co_status != PO_COMP_ST_UPIN && + target->ta_comp.co_status != PO_COMP_ST_NEW && + target->ta_comp.co_status != PO_COMP_ST_DRAIN) + continue; + + if (dce->dce_hints[target->ta_comp.co_rank] == (uint8_t)(-1)) { + dce->dce_hints[target->ta_comp.co_rank] = target->ta_comp.co_index; + dce->dce_ranks->rl_ranks[j++] = target->ta_comp.co_rank; + if (max_rank < target->ta_comp.co_rank) + max_rank = target->ta_comp.co_rank; + } + } + + rc = 0; + + /* + * It is no matter that the real size of rl_ranks array is larger than rl_nr. + * Then reduce rl_nr to skip those non-defined ranks at the tail in rl_ranks. 
+ */ + if (unlikely(j == 0)) { + d_rank_list_free(dce->dce_ranks); + dce->dce_ranks = NULL; + D_FREE(dce->dce_hints); + dce->dce_hint_sz = 0; + } else { + dce->dce_ranks->rl_nr = j; + dce->dce_hint_sz = max_rank + 1; + } + +out: + if (layout != NULL) + pl_obj_layout_free(layout); + + if (map != NULL) + pl_map_decref(map); + + if (rc != 0) + dtx_coll_entry_put(dce); + else + *p_dce = dce; + + return rc; +} + +static int +dtx_coll_local_one(void *args) +{ + struct dss_module_info *dmi = dss_get_module_info(); + struct dtx_coll_local_args *dcla = args; + struct ds_cont_child *cont = NULL; + uint32_t opc = dcla->dcla_opc; + int rc; + int rc1; + + rc = ds_cont_child_lookup(dcla->dcla_po_uuid, dcla->dcla_co_uuid, &cont); + if (rc != 0) { + D_ERROR("Failed to locate "DF_UUID"/"DF_UUID" for collective DTX " + DF_DTI" rpc %u: "DF_RC"\n", DP_UUID(dcla->dcla_po_uuid), + DP_UUID(dcla->dcla_co_uuid), DP_DTI(&dcla->dcla_xid), opc, DP_RC(rc)); + goto out; + } + + switch (opc) { + case DTX_COLL_COMMIT: + rc = vos_dtx_commit(cont->sc_hdl, &dcla->dcla_xid, 1, NULL); + break; + case DTX_COLL_ABORT: + rc = vos_dtx_abort(cont->sc_hdl, &dcla->dcla_xid, dcla->dcla_epoch); + break; + case DTX_COLL_CHECK: + rc = vos_dtx_check(cont->sc_hdl, &dcla->dcla_xid, NULL, NULL, NULL, NULL, false); + if (rc == DTX_ST_INITED) { + /* + * For DTX_CHECK, non-ready one is equal to non-exist. Do not directly + * return 'DTX_ST_INITED' to avoid interoperability trouble if related + * request is from old server. + */ + rc = -DER_NONEXIST; + } else if (rc == -DER_INPROGRESS && !dtx_cont_opened(cont)) { + /* Trigger DTX re-index for subsequent (retry) DTX_CHECK. */ + rc1 = start_dtx_reindex_ult(cont); + if (rc1 != 0) + D_ERROR("Failed to trigger DTX reindex for "DF_UUID"/"DF_UUID + " on target %u/%u: "DF_RC"\n", + DP_UUID(dcla->dcla_po_uuid), DP_UUID(dcla->dcla_co_uuid), + dss_self_rank(), dmi->dmi_tgt_id, DP_RC(rc1)); + } + break; + default: + D_ASSERTF(0, "Unknown collective DTX opc %u\n", opc); + D_GOTO(out, rc = -DER_NOTSUPPORTED); + } + +out: + dcla->dcla_results[dmi->dmi_tgt_id] = rc; + if (cont != NULL) + ds_cont_child_put(cont); + + return 0; +} + +int +dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epoch_t epoch, + uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, int **p_results) +{ + struct dtx_coll_local_args dcla = { 0 }; + struct dss_coll_ops coll_ops = { 0 }; + struct dss_coll_args coll_args = { 0 }; + int rc; + + D_ALLOC_ARRAY(dcla.dcla_results, dss_tgt_nr); + if (dcla.dcla_results == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + uuid_copy(dcla.dcla_po_uuid, po_uuid); + uuid_copy(dcla.dcla_co_uuid, co_uuid); + dcla.dcla_xid = *xid; + dcla.dcla_epoch = epoch; + dcla.dcla_opc = opc; + + coll_ops.co_func = dtx_coll_local_one; + coll_args.ca_func_args = &dcla; + coll_args.ca_tgt_bitmap_sz = bitmap_sz; + coll_args.ca_tgt_bitmap = bitmap; + + rc = dss_thread_collective_reduce(&coll_ops, &coll_args, 0); + D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, + "Locally exec collective DTX PRC %u for "DF_DTI": "DF_RC"\n", + opc, DP_DTI(xid), DP_RC(rc)); + +out: + *p_results = dcla.dcla_results; + return rc < 0 ? rc : dss_tgt_nr; +} diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index 2b549c06c8f5..0a3d2b193a75 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -24,7 +24,6 @@ uint32_t dtx_agg_thd_age_up; uint32_t dtx_agg_thd_age_lo; uint32_t dtx_batched_ult_max; - struct dtx_batched_pool_args { /* Link to dss_module_info::dmi_dtx_batched_pool_list. 
*/ d_list_t dbpa_sys_link; @@ -73,13 +72,19 @@ struct dtx_cleanup_cb_args { static inline void dtx_free_committable(struct dtx_entry **dtes, struct dtx_cos_key *dcks, - int count) + struct dtx_coll_entry *dce, int count) { int i; - for (i = 0; i < count; i++) - dtx_entry_put(dtes[i]); - D_FREE(dtes); + if (dce != NULL) { + D_ASSERT(count == 1); + + dtx_coll_entry_put(dce); + } else { + for (i = 0; i < count; i++) + dtx_entry_put(dtes[i]); + D_FREE(dtes); + } D_FREE(dcks); } @@ -109,7 +114,9 @@ dtx_free_dbca(struct dtx_batched_cont_args *dbca) } D_ASSERT(cont->sc_dtx_committable_count == 0); + D_ASSERT(cont->sc_dtx_committable_coll_count == 0); D_ASSERT(d_list_empty(&cont->sc_dtx_cos_list)); + D_ASSERT(d_list_empty(&cont->sc_dtx_coll_list)); /* Even if the container is reopened during current deregister, the * reopen will use new dbca, so current dbca needs to be cleanup. @@ -184,6 +191,7 @@ dtx_stat(struct ds_cont_child *cont, struct dtx_stat *stat) vos_dtx_stat(cont->sc_hdl, stat, DSF_SKIP_BAD); stat->dtx_committable_count = cont->sc_dtx_committable_count; + stat->dtx_committable_coll_count = cont->sc_dtx_committable_coll_count; stat->dtx_oldest_committable_time = dtx_cos_oldest(cont); } @@ -263,6 +271,7 @@ dtx_cleanup_iter_cb(uuid_t co_uuid, vos_iter_entry_t *ent, void *args) dsp->dsp_xid = ent->ie_dtx_xid; dsp->dsp_oid = ent->ie_dtx_oid; dsp->dsp_epoch = ent->ie_epoch; + dsp->dsp_version = ent->ie_dtx_ver; if (ent->ie_dtx_mbs_dsize > DTX_INLINE_MBS_SIZE) goto add; @@ -303,12 +312,14 @@ dtx_dpci_free(struct dtx_partial_cmt_item *dpci) static void dtx_cleanup(void *arg) { + struct dss_module_info *dmi = dss_get_module_info(); struct dtx_batched_cont_args *dbca = arg; struct ds_cont_child *cont = dbca->dbca_cont; struct dtx_share_peer *dsp; struct dtx_partial_cmt_item *dpci; struct dtx_entry *dte; struct dtx_cleanup_cb_args dcca; + daos_unit_oid_t oid; d_list_t cmt_list; d_list_t abt_list; d_list_t act_list; @@ -366,9 +377,24 @@ dtx_cleanup(void *arg) dte = &dpci->dpci_dte; if (dte->dte_mbs == NULL) - rc = vos_dtx_load_mbs(cont->sc_hdl, &dte->dte_xid, &dte->dte_mbs); - if (dte->dte_mbs != NULL) - rc = dtx_commit(cont, &dte, NULL, 1); + rc = vos_dtx_load_mbs(cont->sc_hdl, &dte->dte_xid, &oid, &dte->dte_mbs); + if (dte->dte_mbs != NULL) { + if (dte->dte_mbs->dm_flags & DMF_COLL_TARGET) { + struct dtx_coll_entry *dce = NULL; + + rc = dtx_coll_prep(cont->sc_pool_uuid, oid, &dte->dte_xid, + dte->dte_mbs, dmi->dmi_tgt_id, dte->dte_ver, + cont->sc_pool->spc_map_version, false, true, &dce); + if (rc == 0) { + D_ASSERT(dce != NULL); + + rc = dtx_coll_commit(cont, dce, NULL); + dtx_coll_entry_put(dce); + } + } else { + rc = dtx_commit(cont, &dte, NULL, 1); + } + } D_DEBUG(DB_IO, "Cleanup partial committed DTX "DF_DTI", left %d: %d\n", DP_DTI(&dte->dte_xid), dcca.dcca_pc_count, rc); @@ -594,12 +620,13 @@ dtx_batched_commit_one(void *arg) dbca->dbca_reg_gen == cont->sc_dtx_batched_gen) { struct dtx_entry **dtes = NULL; struct dtx_cos_key *dcks = NULL; + struct dtx_coll_entry *dce = NULL; struct dtx_stat stat = { 0 }; int cnt; int rc; cnt = dtx_fetch_committable(cont, DTX_THRESHOLD_COUNT, NULL, - DAOS_EPOCH_MAX, &dtes, &dcks); + DAOS_EPOCH_MAX, &dtes, &dcks, &dce); if (cnt == 0) break; @@ -609,8 +636,15 @@ dtx_batched_commit_one(void *arg) break; } - rc = dtx_commit(cont, dtes, dcks, cnt); - dtx_free_committable(dtes, dcks, cnt); + if (dce != NULL) { + /* Currently, commit collective DTX one by one. 
*/ + D_ASSERT(cnt == 1); + + rc = dtx_coll_commit(cont, dce, dcks); + } else { + rc = dtx_commit(cont, dtes, dcks, cnt); + } + dtx_free_committable(dtes, dcks, dce, cnt); if (rc != 0) { D_WARN("Fail to batched commit %d entries for "DF_UUID": "DF_RC"\n", cnt, DP_UUID(cont->sc_uuid), DP_RC(rc)); @@ -624,6 +658,7 @@ dtx_batched_commit_one(void *arg) sched_req_wakeup(dmi->dmi_dtx_agg_req); if ((stat.dtx_committable_count <= DTX_THRESHOLD_COUNT) && + (stat.dtx_committable_coll_count == 0) && (stat.dtx_oldest_committable_time == 0 || d_hlc_age2sec(stat.dtx_oldest_committable_time) < DTX_COMMIT_THRESHOLD_AGE)) @@ -689,6 +724,7 @@ dtx_batched_commit(void *arg) if (dtx_cont_opened(cont) && dbca->dbca_commit_req == NULL && (dtx_batched_ult_max != 0 && tls->dt_batched_ult_cnt < dtx_batched_ult_max) && ((stat.dtx_committable_count > DTX_THRESHOLD_COUNT) || + (stat.dtx_committable_coll_count > 0) || (stat.dtx_oldest_committable_time != 0 && d_hlc_age2sec(stat.dtx_oldest_committable_time) >= DTX_COMMIT_THRESHOLD_AGE))) { @@ -846,11 +882,9 @@ dtx_handle_reinit(struct dtx_handle *dth) */ static int dtx_handle_init(struct dtx_id *dti, daos_handle_t coh, struct dtx_epoch *epoch, - uint16_t sub_modification_cnt, uint32_t pm_ver, - daos_unit_oid_t *leader_oid, struct dtx_id *dti_cos, - int dti_cos_cnt, struct dtx_memberships *mbs, bool leader, - bool solo, bool sync, bool dist, bool migration, bool ignore_uncommitted, - bool resent, bool prepared, bool drop_cmt, struct dtx_handle *dth) + bool leader, uint16_t sub_modification_cnt, uint32_t pm_ver, + daos_unit_oid_t *leader_oid, struct dtx_id *dti_cos, int dti_cos_cnt, + uint32_t flags, struct dtx_memberships *mbs, struct dtx_handle *dth) { if (sub_modification_cnt > DTX_SUB_MOD_MAX) { D_ERROR("Too many modifications in a single transaction:" @@ -871,17 +905,16 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t coh, struct dtx_epoch *epoch, dth->dth_pinned = 0; dth->dth_cos_done = 0; - dth->dth_resent = resent ? 1 : 0; - dth->dth_solo = solo ? 1 : 0; - dth->dth_drop_cmt = drop_cmt ? 1 : 0; dth->dth_modify_shared = 0; dth->dth_active = 0; dth->dth_touched_leader_oid = 0; dth->dth_local_tx_started = 0; - dth->dth_dist = dist ? 1 : 0; - dth->dth_for_migration = migration ? 1 : 0; - dth->dth_ignore_uncommitted = ignore_uncommitted ? 1 : 0; - dth->dth_prepared = prepared ? 1 : 0; + dth->dth_solo = (flags & DTX_SOLO) ? 1 : 0; + dth->dth_drop_cmt = (flags & DTX_DROP_CMT) ? 1 : 0; + dth->dth_dist = (flags & DTX_DIST) ? 1 : 0; + dth->dth_for_migration = (flags & DTX_FOR_MIGRATION) ? 1 : 0; + dth->dth_ignore_uncommitted = (flags & DTX_IGNORE_UNCOMMITTED) ? 1 : 0; + dth->dth_prepared = (flags & DTX_PREPARED) ? 1 : 0; dth->dth_aborted = 0; dth->dth_already = 0; dth->dth_need_validation = 0; @@ -891,7 +924,7 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t coh, struct dtx_epoch *epoch, dth->dth_ent = NULL; dth->dth_flags = leader ? DTE_LEADER : 0; - if (sync) { + if (flags & DTX_SYNC) { dth->dth_flags |= DTE_BLOCK; dth->dth_sync = 1; } else { @@ -1102,20 +1135,19 @@ dtx_sub_init(struct dtx_handle *dth, daos_unit_oid_t *oid, uint64_t dkey_hash) * \param tgt_cnt [IN] number of targets (not count the leader itself). * \param flags [IN] See dtx_flags. * \param mbs [IN] DTX participants information. + * \param dce [IN] The pointer to collective DTX entry. * \param p_dlh [OUT] Pointer to the DTX handle. * * \return Zero on success, negative value if error. 
*/ int -dtx_leader_begin(daos_handle_t coh, struct dtx_id *dti, - struct dtx_epoch *epoch, uint16_t sub_modification_cnt, - uint32_t pm_ver, daos_unit_oid_t *leader_oid, - struct dtx_id *dti_cos, int dti_cos_cnt, - struct daos_shard_tgt *tgts, int tgt_cnt, uint32_t flags, - struct dtx_memberships *mbs, struct dtx_leader_handle **p_dlh) +dtx_leader_begin(daos_handle_t coh, struct dtx_id *dti, struct dtx_epoch *epoch, + uint16_t sub_modification_cnt, uint32_t pm_ver, daos_unit_oid_t *leader_oid, + struct dtx_id *dti_cos, int dti_cos_cnt, struct daos_shard_tgt *tgts, int tgt_cnt, + uint32_t flags, struct dtx_memberships *mbs, struct dtx_coll_entry *dce, + struct dtx_leader_handle **p_dlh) { struct dtx_leader_handle *dlh; - struct dtx_tls *tls = dtx_tls_get(); struct dtx_handle *dth; int rc; int i; @@ -1124,32 +1156,45 @@ dtx_leader_begin(daos_handle_t coh, struct dtx_id *dti, if (dlh == NULL) return -DER_NOMEM; + dlh->dlh_future = ABT_FUTURE_NULL; + dlh->dlh_coll_entry = dce; + if (flags & DTX_TGT_COLL) + dlh->dlh_coll = 1; + if (tgt_cnt > 0) { - dlh->dlh_future = ABT_FUTURE_NULL; dlh->dlh_subs = (struct dtx_sub_status *)(dlh + 1); - for (i = 0; i < tgt_cnt; i++) { - dlh->dlh_subs[i].dss_tgt = tgts[i]; - if (unlikely(tgts[i].st_flags & DTF_DELAY_FORWARD)) - dlh->dlh_delay_sub_cnt++; + + if (flags & DTX_TGT_COLL) { + /* + * NOTE: Do not support DTF_DELAY_FORWARD for collective DTX. + * The target information will be filled sometime later + * when dispatch related IO request. + */ + dlh->dlh_delay_sub_cnt = 0; + dlh->dlh_normal_sub_cnt = tgt_cnt; + } else { + for (i = 0; i < tgt_cnt; i++) { + dlh->dlh_subs[i].dss_tgt = tgts[i]; + if (unlikely(tgts[i].st_flags & DTF_DELAY_FORWARD)) + dlh->dlh_delay_sub_cnt++; + } + + dlh->dlh_normal_sub_cnt = tgt_cnt - dlh->dlh_delay_sub_cnt; } - dlh->dlh_normal_sub_cnt = tgt_cnt - dlh->dlh_delay_sub_cnt; } + if (flags & DTX_RELAY) + dlh->dlh_relay = 1; + dth = &dlh->dlh_handle; - rc = dtx_handle_init(dti, coh, epoch, sub_modification_cnt, pm_ver, - leader_oid, dti_cos, dti_cos_cnt, mbs, true, - (flags & DTX_SOLO) ? true : false, - (flags & DTX_SYNC) ? true : false, - (flags & DTX_DIST) ? true : false, - (flags & DTX_FOR_MIGRATION) ? true : false, false, - (flags & DTX_RESEND) ? true : false, - (flags & DTX_PREPARED) ? true : false, - (flags & DTX_DROP_CMT) ? true : false, dth); + rc = dtx_handle_init(dti, coh, epoch, dlh->dlh_relay ? false : true, sub_modification_cnt, + pm_ver, leader_oid, dti_cos, dti_cos_cnt, flags, mbs, dth); if (rc == 0 && sub_modification_cnt > 0) rc = vos_dtx_attach(dth, false, (flags & DTX_PREPARED) ? true : false); - D_DEBUG(DB_IO, "Start DTX "DF_DTI" sub modification %d, ver %u, epoch "DF_X64", leader " - DF_UOID", dti_cos_cnt %d, tgt_cnt %d, flags %x: "DF_RC"\n", + D_DEBUG(DB_IO, "Start (%s) DTX "DF_DTI" sub modification %d, ver %u, epoch " + DF_X64", leader "DF_UOID", dti_cos_cnt %d, tgt_cnt %d, flags %x: "DF_RC"\n", + dlh->dlh_coll ? (dlh->dlh_relay ? 
"relay" : "collective") : "regular", DP_DTI(dti), sub_modification_cnt, dth->dth_ver, epoch->oe_value, DP_UOID(*leader_oid), dti_cos_cnt, tgt_cnt, flags, DP_RC(rc)); @@ -1157,7 +1202,7 @@ dtx_leader_begin(daos_handle_t coh, struct dtx_id *dti, D_FREE(dlh); } else { *p_dlh = dlh; - d_tm_inc_gauge(tls->dt_dtx_leader_total, 1); + d_tm_inc_gauge(dtx_tls_get()->dt_dtx_leader_total, 1); } return rc; @@ -1182,17 +1227,6 @@ dtx_leader_wait(struct dtx_leader_handle *dlh) return dlh->dlh_result; }; -void -dtx_entry_put(struct dtx_entry *dte) -{ - if (--(dte->dte_refs) == 0) { - struct dtx_tls *tls = dtx_tls_get(); - - d_tm_dec_gauge(tls->dt_dtx_entry_total, 1); - D_FREE(dte); - } -} - /** * Stop the leader thandle. * @@ -1207,7 +1241,6 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul { struct ds_cont_child *cont = coh->sch_cont; struct dtx_handle *dth = &dlh->dlh_handle; - struct dtx_tls *tls = dtx_tls_get(); struct dtx_entry *dte; struct dtx_memberships *mbs; size_t size; @@ -1221,7 +1254,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul dtx_shares_fini(dth); - if (daos_is_zero_dti(&dth->dth_xid) || unlikely(result == -DER_ALREADY)) + if (daos_is_zero_dti(&dth->dth_xid) || unlikely(result == -DER_ALREADY) || dlh->dlh_relay) goto out; if (unlikely(coh->sch_closed)) { @@ -1275,24 +1308,11 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul D_ASSERTF(0, "Unexpected DTX "DF_DTI" status %d\n", DP_DTI(&dth->dth_xid), status); } - if ((!dth->dth_active && dth->dth_dist) || dth->dth_prepared || dtx_batched_ult_max == 0) { - /* We do not know whether some other participants have - * some active entry for this DTX, consider distributed - * transaction case, the other participants may execute - * different operations. Sync commit the DTX for safe. - */ + if (dth->dth_prepared || dtx_batched_ult_max == 0) { dth->dth_sync = 1; goto sync; } - /* For standalone modification, if leader modified nothing, then - * non-leader(s) must be the same, unpin the DTX via dtx_abort(). - */ - if (!dth->dth_active) { - unpin = true; - D_GOTO(abort, result = 0); - } - if (DAOS_FAIL_CHECK(DAOS_DTX_SKIP_PREPARE)) D_GOTO(abort, result = 0); @@ -1310,45 +1330,42 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul D_ASSERT(dth->dth_mbs != NULL); - size = sizeof(*dte) + sizeof(*mbs) + dth->dth_mbs->dm_data_size; - D_ALLOC(dte, size); - if (dte == NULL) { - dth->dth_sync = 1; - goto sync; - } + if (dlh->dlh_coll) { + rc = dtx_add_cos(cont, dlh->dlh_coll_entry, &dth->dth_leader_oid, + dth->dth_dkey_hash, dth->dth_epoch, DCF_EXP_CMT | DCF_COLL); + } else { + size = sizeof(*dte) + sizeof(*mbs) + dth->dth_mbs->dm_data_size; + D_ALLOC(dte, size); + if (dte == NULL) { + dth->dth_sync = 1; + goto sync; + } - mbs = (struct dtx_memberships *)(dte + 1); - memcpy(mbs, dth->dth_mbs, size - sizeof(*dte)); + mbs = (struct dtx_memberships *)(dte + 1); + memcpy(mbs, dth->dth_mbs, size - sizeof(*dte)); - dte->dte_xid = dth->dth_xid; - dte->dte_ver = dth->dth_ver; - dte->dte_refs = 1; - dte->dte_mbs = mbs; - d_tm_inc_gauge(tls->dt_dtx_entry_total, 1); + dte->dte_xid = dth->dth_xid; + dte->dte_ver = dth->dth_ver; + dte->dte_refs = 1; + dte->dte_mbs = mbs; - /* Use the new created @dte instead of dth->dth_dte that will be - * released after dtx_leader_end(). 
- */ + if (!(mbs->dm_flags & DMF_SRDG_REP)) + flags = DCF_EXP_CMT; + else if (dth->dth_modify_shared) + flags = DCF_SHARED; + else + flags = 0; + + rc = dtx_add_cos(cont, dte, &dth->dth_leader_oid, dth->dth_dkey_hash, + dth->dth_epoch, flags); + dtx_entry_put(dte); + } - if (!(mbs->dm_flags & DMF_SRDG_REP)) - flags = DCF_EXP_CMT; - else if (dth->dth_modify_shared) - flags = DCF_SHARED; - else - flags = 0; - rc = dtx_add_cos(cont, dte, &dth->dth_leader_oid, - dth->dth_dkey_hash, dth->dth_epoch, flags); - dtx_entry_put(dte); if (rc == 0) { if (!DAOS_FAIL_CHECK(DAOS_DTX_NO_COMMITTABLE)) { vos_dtx_mark_committable(dth); - if (cont->sc_dtx_committable_count > - DTX_THRESHOLD_COUNT) { - struct dss_module_info *dmi; - - dmi = dss_get_module_info(); - sched_req_wakeup(dmi->dmi_dtx_cmt_req); - } + if (cont->sc_dtx_committable_count > DTX_THRESHOLD_COUNT || dlh->dlh_coll) + sched_req_wakeup(dss_get_module_info()->dmi_dtx_cmt_req); } } else { dth->dth_sync = 1; @@ -1362,11 +1379,18 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul * batched commit. */ vos_dtx_mark_committable(dth); - dte = &dth->dth_dte; - rc = dtx_commit(cont, &dte, NULL, 1); + + if (dlh->dlh_coll) { + rc = dtx_coll_commit(cont, dlh->dlh_coll_entry, NULL); + } else { + dte = &dth->dth_dte; + rc = dtx_commit(cont, &dte, NULL, 1); + } + if (rc != 0) - D_WARN(DF_UUID": Fail to sync commit DTX "DF_DTI": "DF_RC"\n", - DP_UUID(cont->sc_uuid), DP_DTI(&dth->dth_xid), DP_RC(rc)); + D_WARN(DF_UUID": Fail to sync %s commit DTX "DF_DTI": "DF_RC"\n", + DP_UUID(cont->sc_uuid), dlh->dlh_coll ? "collective" : "regular", + DP_DTI(&dth->dth_xid), DP_RC(rc)); /* * NOTE: The semantics of 'sync' commit does not guarantee that all @@ -1391,7 +1415,10 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul * 2. Remove the pinned DTX entry. */ vos_dtx_cleanup(dth, true); - dtx_abort(cont, &dth->dth_dte, dth->dth_epoch); + if (dlh->dlh_coll) + dtx_coll_abort(cont, dlh->dlh_coll_entry, dth->dth_epoch); + else + dtx_abort(cont, &dth->dth_dte, dth->dth_epoch); aborted = true; } @@ -1436,7 +1463,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul D_FREE(dth->dth_oid_array); D_FREE(dlh); - d_tm_dec_gauge(tls->dt_dtx_leader_total, 1); + d_tm_dec_gauge(dtx_tls_get()->dt_dtx_leader_total, 1); return result; } @@ -1473,13 +1500,8 @@ dtx_begin(daos_handle_t coh, struct dtx_id *dti, if (dth == NULL) return -DER_NOMEM; - rc = dtx_handle_init(dti, coh, epoch, sub_modification_cnt, - pm_ver, leader_oid, dti_cos, dti_cos_cnt, mbs, - false, false, false, - (flags & DTX_DIST) ? true : false, - (flags & DTX_FOR_MIGRATION) ? true : false, - (flags & DTX_IGNORE_UNCOMMITTED) ? true : false, - (flags & DTX_RESEND) ? 
true : false, false, false, dth); + rc = dtx_handle_init(dti, coh, epoch, false, sub_modification_cnt, pm_ver, + leader_oid, dti_cos, dti_cos_cnt, flags, mbs, dth); if (rc == 0 && sub_modification_cnt > 0) rc = vos_dtx_attach(dth, false, false); @@ -1567,9 +1589,10 @@ dtx_flush_on_close(struct dss_module_info *dmi, struct dtx_batched_cont_args *db while (dbca->dbca_reg_gen == cont->sc_dtx_batched_gen && rc >= 0) { struct dtx_entry **dtes = NULL; struct dtx_cos_key *dcks = NULL; + struct dtx_coll_entry *dce = NULL; cnt = dtx_fetch_committable(cont, DTX_THRESHOLD_COUNT, - NULL, DAOS_EPOCH_MAX, &dtes, &dcks); + NULL, DAOS_EPOCH_MAX, &dtes, &dcks, &dce); if (cnt <= 0) D_GOTO(out, rc = cnt); @@ -1586,8 +1609,14 @@ dtx_flush_on_close(struct dss_module_info *dmi, struct dtx_batched_cont_args *db D_GOTO(out, rc = -DER_MISC); } - rc = dtx_commit(cont, dtes, dcks, cnt); - dtx_free_committable(dtes, dcks, cnt); + if (dce != NULL) { + D_ASSERT(cnt == 1); + + rc = dtx_coll_commit(cont, dce, dcks); + } else { + rc = dtx_commit(cont, dtes, dcks, cnt); + } + dtx_free_committable(dtes, dcks, dce, cnt); } out: @@ -1734,7 +1763,9 @@ dtx_cont_register(struct ds_cont_child *cont) } cont->sc_dtx_committable_count = 0; + cont->sc_dtx_committable_coll_count = 0; D_INIT_LIST_HEAD(&cont->sc_dtx_cos_list); + D_INIT_LIST_HEAD(&cont->sc_dtx_coll_list); ds_cont_child_get(cont); dbca->dbca_refs = 0; dbca->dbca_cont = cont; @@ -1939,8 +1970,12 @@ dtx_comp_cb(void **arg) sub->dss_result == dlh->dlh_allow_failure) continue; - /* Ignore DER_INPROGRESS if there is other failure. */ - if (dlh->dlh_result == 0 || dlh->dlh_result == -DER_INPROGRESS) + if (dlh->dlh_rmt_ver < sub->dss_version) + dlh->dlh_rmt_ver = sub->dss_version; + + /* Ignore DER_INPROGRESS and DER_AGAIN if there is other failure. */ + if (dlh->dlh_result == 0 || dlh->dlh_result == -DER_INPROGRESS || + dlh->dlh_result == -DER_AGAIN) dlh->dlh_result = sub->dss_result; } } @@ -2206,9 +2241,10 @@ dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid, while (dtx_cont_opened(cont)) { struct dtx_entry **dtes = NULL; struct dtx_cos_key *dcks = NULL; + struct dtx_coll_entry *dce = NULL; cnt = dtx_fetch_committable(cont, DTX_THRESHOLD_COUNT, oid, - epoch, &dtes, &dcks); + epoch, &dtes, &dcks, &dce); if (cnt <= 0) { rc = cnt; if (rc < 0) @@ -2217,8 +2253,14 @@ dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid, break; } - rc = dtx_commit(cont, dtes, dcks, cnt); - dtx_free_committable(dtes, dcks, cnt); + if (dce != NULL) { + D_ASSERT(cnt == 1); + + rc = dtx_coll_commit(cont, dce, dcks); + } else { + rc = dtx_commit(cont, dtes, dcks, cnt); + } + dtx_free_committable(dtes, dcks, dce, cnt); if (rc < 0) { D_ERROR("Fail to commit dtx: "DF_RC"\n", DP_RC(rc)); break; @@ -2230,3 +2272,117 @@ dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid, return rc; } + +void +dtx_merge_check_result(int *tgt, int src) +{ + /* As long as one target has committed, then the DTX is committable on all targets. */ + if (*tgt != DTX_ST_COMMITTED && *tgt != DTX_ST_COMMITTABLE) { + switch (src) { + case DTX_ST_COMMITTED: + case DTX_ST_COMMITTABLE: + *tgt = src; + break; + case -DER_EXCLUDED: + /* + * If non-leader is excluded, handle it as 'prepared'. If other + * non-leaders are also 'prepared' then related DTX maybe still + * committable or 'corrupted'. The subsequent DTX resync logic + * will handle related things, see dtx_verify_groups(). + * + * Fall through. 
+ */ + case DTX_ST_PREPARED: + if (*tgt == 0 || *tgt == DTX_ST_CORRUPTED) + *tgt = src; + break; + case DTX_ST_CORRUPTED: + if (*tgt == 0) + *tgt = src; + break; + default: + if (src >= 0) { + if (*tgt != -DER_NONEXIST) + *tgt = -DER_IO; + } else { + if (src == -DER_NONEXIST || *tgt >= 0 || + (*tgt != -DER_IO && *tgt != -DER_NONEXIST)) + *tgt = src; + } + break; + } + } +} + +int +dtx_leader_get(struct ds_pool *pool, struct dtx_memberships *mbs, daos_unit_oid_t *oid, + uint32_t version, struct pool_target **p_tgt) +{ + struct pl_map *map = NULL; + struct pl_obj_layout *layout = NULL; + struct dtx_coll_target *dct; + struct daos_obj_md md = { 0 }; + int rc = 0; + int i; + + D_ASSERT(mbs != NULL); + + /* The first UPIN (and join before DTX) target is the (new) leader of the DTX. */ + for (i = 0; i < mbs->dm_tgt_cnt; i++) { + rc = ds_pool_target_status_check(pool, mbs->dm_tgts[i].ddt_id, + (uint8_t)PO_COMP_ST_UPIN, p_tgt); + if (rc < 0) + D_GOTO(out, rc); + + /* The target that (re-)joined the system after DTX cannot be the leader. */ + if (rc == 1 && (*p_tgt)->ta_comp.co_ver <= version) + D_GOTO(out, rc = 0); + } + + if (!(mbs->dm_flags & DMF_COLL_TARGET)) + D_GOTO(out, rc = -DER_NONEXIST); + + map = pl_map_find(pool->sp_uuid, oid->id_pub); + if (map == NULL) { + D_ERROR("Failed to find valid placement map in pool "DF_UUID"\n", + DP_UUID(pool->sp_uuid)); + D_GOTO(out, rc = -DER_INVAL); + } + + dct = (struct dtx_coll_target *)(mbs->dm_tgts + mbs->dm_tgt_cnt); + md.omd_id = oid->id_pub; + md.omd_ver = pool->sp_map_version; + md.omd_fdom_lvl = dct->dct_fdom_lvl; + md.omd_pda = dct->dct_pda; + md.omd_pdom_lvl = dct->dct_pdom_lvl; + + rc = pl_obj_place(map, oid->id_layout_ver, &md, DAOS_OO_RW, NULL, &layout); + if (rc != 0) { + D_ERROR("Failed to load object layout for "DF_OID" in pool "DF_UUID"\n", + DP_OID(oid->id_pub), DP_UUID(pool->sp_uuid)); + goto out; + } + + for (i = 0; i < layout->ol_nr; i++) { + if (layout->ol_shards[i].po_target == -1 || layout->ol_shards[i].po_shard == -1) + continue; + + rc = pool_map_find_target(map->pl_poolmap, layout->ol_shards[i].po_target, p_tgt); + D_ASSERT(rc == 1); + + /* The target that (re-)joined the system after DTX cannot be the leader. */ + if ((*p_tgt)->ta_comp.co_ver <= version) + D_GOTO(out, rc = 0); + } + + rc = -DER_NONEXIST; + +out: + if (layout != NULL) + pl_obj_layout_free(layout); + + if (map != NULL) + pl_map_decref(map); + + return rc; +} diff --git a/src/dtx/dtx_cos.c b/src/dtx/dtx_cos.c index 36d8dee3de92..9442adf2248e 100644 --- a/src/dtx/dtx_cos.c +++ b/src/dtx/dtx_cos.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2022 Intel Corporation. + * (C) Copyright 2019-2023 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -50,22 +50,24 @@ struct dtx_cos_rec { * related object and dkey (that attached to the dtx_cos_rec). */ struct dtx_cos_rec_child { - /* Link into the container::sc_dtx_cos_list. */ - d_list_t dcrc_gl_committable; + /* Link into the container::sc_dtx_cos_list or container::sc_dtx_coll_list. */ + d_list_t dcrc_gl_committable; /* Link into related dcr_{reg,prio}_list. */ - d_list_t dcrc_lo_link; - /* The DTX identifier. */ - struct dtx_entry *dcrc_dte; + d_list_t dcrc_lo_link; + union { + struct dtx_entry *dcrc_dte; + struct dtx_coll_entry *dcrc_dce; + }; /* The DTX epoch. */ - daos_epoch_t dcrc_epoch; - /* Pointer to the dtx_cos_rec. */ - struct dtx_cos_rec *dcrc_ptr; + daos_epoch_t dcrc_epoch; + /* For non-collective DTX, it points to the dtx_cos_rec. 
*/ + struct dtx_cos_rec *dcrc_ptr; }; struct dtx_cos_rec_bundle { - struct dtx_entry *dte; - daos_epoch_t epoch; - uint32_t flags; + void *entry; + daos_epoch_t epoch; + uint32_t flags; }; static int @@ -126,12 +128,18 @@ dtx_cos_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, return -DER_NOMEM; } - dcrc->dcrc_dte = dtx_entry_get(rbund->dte); dcrc->dcrc_epoch = rbund->epoch; - dcrc->dcrc_ptr = dcr; - - d_list_add_tail(&dcrc->dcrc_gl_committable, - &cont->sc_dtx_cos_list); + if (rbund->flags & DCF_COLL) { + /* Set dcrc_ptr as NULL to indicate that it is collective DTX. */ + dcrc->dcrc_ptr = NULL; + dcrc->dcrc_dce = dtx_coll_entry_get(rbund->entry); + d_list_add_tail(&dcrc->dcrc_gl_committable, &cont->sc_dtx_coll_list); + cont->sc_dtx_committable_coll_count++; + } else { + dcrc->dcrc_ptr = dcr; + dcrc->dcrc_dte = dtx_entry_get(rbund->entry); + d_list_add_tail(&dcrc->dcrc_gl_committable, &cont->sc_dtx_cos_list); + } cont->sc_dtx_committable_count++; d_tm_inc_gauge(tls->dt_committable, 1); @@ -159,6 +167,7 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) struct dtx_cos_rec_child *dcrc; struct dtx_cos_rec_child *next; int dec = 0; + int coll = 0; struct dtx_tls *tls = dtx_tls_get(); D_ASSERT(tins->ti_umm.umm_id == UMEM_CLASS_VMEM); @@ -168,7 +177,12 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) dcrc_lo_link) { d_list_del(&dcrc->dcrc_lo_link); d_list_del(&dcrc->dcrc_gl_committable); - dtx_entry_put(dcrc->dcrc_dte); + if (dcrc->dcrc_ptr != NULL) { + dtx_entry_put(dcrc->dcrc_dte); + } else { + dtx_coll_entry_put(dcrc->dcrc_dce); + coll++; + } D_FREE(dcrc); dec++; } @@ -176,7 +190,12 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) dcrc_lo_link) { d_list_del(&dcrc->dcrc_lo_link); d_list_del(&dcrc->dcrc_gl_committable); - dtx_entry_put(dcrc->dcrc_dte); + if (dcrc->dcrc_ptr != NULL) { + dtx_entry_put(dcrc->dcrc_dte); + } else { + dtx_coll_entry_put(dcrc->dcrc_dce); + coll++; + } D_FREE(dcrc); dec++; } @@ -184,13 +203,19 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) dcrc_lo_link) { d_list_del(&dcrc->dcrc_lo_link); d_list_del(&dcrc->dcrc_gl_committable); - dtx_entry_put(dcrc->dcrc_dte); + if (dcrc->dcrc_ptr != NULL) { + dtx_entry_put(dcrc->dcrc_dte); + } else { + dtx_coll_entry_put(dcrc->dcrc_dce); + coll++; + } D_FREE(dcrc); dec++; } D_FREE(dcr); cont->sc_dtx_committable_count -= dec; + cont->sc_dtx_committable_coll_count -= coll; /** adjust per-pool counter */ d_tm_dec_gauge(tls->dt_committable, dec); @@ -231,12 +256,18 @@ dtx_cos_rec_update(struct btr_instance *tins, struct btr_record *rec, if (dcrc == NULL) return -DER_NOMEM; - dcrc->dcrc_dte = dtx_entry_get(rbund->dte); dcrc->dcrc_epoch = rbund->epoch; - dcrc->dcrc_ptr = dcr; - - d_list_add_tail(&dcrc->dcrc_gl_committable, - &cont->sc_dtx_cos_list); + if (rbund->flags & DCF_COLL) { + /* Set dcrc_ptr as NULL to indicate that it is collective DTX. 
*/ + dcrc->dcrc_ptr = NULL; + dcrc->dcrc_dce = dtx_coll_entry_get(rbund->entry); + d_list_add_tail(&dcrc->dcrc_gl_committable, &cont->sc_dtx_coll_list); + cont->sc_dtx_committable_coll_count++; + } else { + dcrc->dcrc_ptr = dcr; + dcrc->dcrc_dte = dtx_entry_get(rbund->entry); + d_list_add_tail(&dcrc->dcrc_gl_committable, &cont->sc_dtx_cos_list); + } cont->sc_dtx_committable_count++; d_tm_inc_gauge(tls->dt_committable, 1); @@ -267,7 +298,8 @@ btr_ops_t dtx_btr_cos_ops = { int dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, daos_unit_oid_t *oid, daos_epoch_t epoch, - struct dtx_entry ***dtes, struct dtx_cos_key **dcks) + struct dtx_entry ***dtes, struct dtx_cos_key **dcks, + struct dtx_coll_entry **p_dce) { struct dtx_entry **dte_buf = NULL; struct dtx_cos_key *dck_buf = NULL; @@ -275,6 +307,23 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, uint32_t count; uint32_t i = 0; + if (!d_list_empty(&cont->sc_dtx_coll_list) && oid == NULL) { + d_list_for_each_entry(dcrc, &cont->sc_dtx_coll_list, dcrc_gl_committable) { + if (epoch >= dcrc->dcrc_epoch) { + D_ALLOC_PTR(dck_buf); + if (dck_buf == NULL) + return -DER_NOMEM; + + dck_buf->oid = dcrc->dcrc_ptr->dcr_oid; + dck_buf->dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; + *dcks = dck_buf; + *p_dce = dtx_coll_entry_get(dcrc->dcrc_dce); + + return 1; + } + } + } + count = min(cont->sc_dtx_committable_count, max_cnt); if (count == 0) { *dtes = NULL; @@ -300,9 +349,21 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, if (epoch < dcrc->dcrc_epoch) continue; - dte_buf[i] = dtx_entry_get(dcrc->dcrc_dte); dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid; dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; + + if (unlikely(oid != NULL && dcrc->dcrc_ptr == NULL)) { + if (i > 0) + continue; + + D_FREE(dte_buf); + *dcks = dck_buf; + *p_dce = dtx_coll_entry_get(dcrc->dcrc_dce); + + return 1; + } + + dte_buf[i] = dtx_entry_get(dcrc->dcrc_dte); if (++i >= count) break; } @@ -373,9 +434,8 @@ dtx_list_cos(struct ds_cont_child *cont, daos_unit_oid_t *oid, } int -dtx_add_cos(struct ds_cont_child *cont, struct dtx_entry *dte, - daos_unit_oid_t *oid, uint64_t dkey_hash, - daos_epoch_t epoch, uint32_t flags) +dtx_add_cos(struct ds_cont_child *cont, void *entry, daos_unit_oid_t *oid, + uint64_t dkey_hash, daos_epoch_t epoch, uint32_t flags) { struct dtx_cos_key key; struct dtx_cos_rec_bundle rbund; @@ -386,14 +446,13 @@ dtx_add_cos(struct ds_cont_child *cont, struct dtx_entry *dte, if (!dtx_cont_opened(cont)) return -DER_SHUTDOWN; - D_ASSERT(dte->dte_mbs != NULL); D_ASSERT(epoch != DAOS_EPOCH_MAX); key.oid = *oid; key.dkey_hash = dkey_hash; d_iov_set(&kiov, &key, sizeof(key)); - rbund.dte = dte; + rbund.entry = entry; rbund.epoch = epoch; rbund.flags = flags; d_iov_set(&riov, &rbund, sizeof(rbund)); @@ -401,10 +460,16 @@ dtx_add_cos(struct ds_cont_child *cont, struct dtx_entry *dte, rc = dbtree_upsert(cont->sc_dtx_cos_hdl, BTR_PROBE_EQ, DAOS_INTENT_UPDATE, &kiov, &riov, NULL); - D_CDEBUG(rc != 0, DLOG_ERR, DB_IO, "Insert DTX "DF_DTI" to CoS " - "cache, "DF_UOID", key %lu, flags %x: rc = "DF_RC"\n", - DP_DTI(&dte->dte_xid), DP_UOID(*oid), (unsigned long)dkey_hash, - flags, DP_RC(rc)); + if (flags & DCF_COLL) + D_CDEBUG(rc != 0, DLOG_ERR, DB_IO, "Insert coll DTX "DF_DTI" to CoS cache, " + DF_UOID", key %lu, flags %x: "DF_RC"\n", + DP_DTI(&((struct dtx_coll_entry *)entry)->dce_xid), DP_UOID(*oid), + (unsigned long)dkey_hash, flags, DP_RC(rc)); + else + D_CDEBUG(rc != 0, DLOG_ERR, DB_IO, "Insert reg DTX "DF_DTI" to CoS 
cache, " + DF_UOID", key %lu, flags %x: "DF_RC"\n", + DP_DTI(&((struct dtx_entry *)entry)->dte_xid), DP_UOID(*oid), + (unsigned long)dkey_hash, flags, DP_RC(rc)); return rc; } @@ -413,7 +478,6 @@ int dtx_del_cos(struct ds_cont_child *cont, struct dtx_id *xid, daos_unit_oid_t *oid, uint64_t dkey_hash) { - struct dtx_tls *tls = dtx_tls_get(); struct dtx_cos_key key; d_iov_t kiov; d_iov_t riov; @@ -439,12 +503,16 @@ dtx_del_cos(struct ds_cont_child *cont, struct dtx_id *xid, d_list_del(&dcrc->dcrc_gl_committable); d_list_del(&dcrc->dcrc_lo_link); - dtx_entry_put(dcrc->dcrc_dte); + if (dcrc->dcrc_ptr != NULL) { + dtx_entry_put(dcrc->dcrc_dte); + } else { + dtx_coll_entry_put(dcrc->dcrc_dce); + cont->sc_dtx_committable_coll_count--; + } D_FREE(dcrc); cont->sc_dtx_committable_count--; dcr->dcr_prio_count--; - d_tm_dec_gauge(tls->dt_committable, 1); D_GOTO(out, found = 1); } @@ -455,12 +523,16 @@ dtx_del_cos(struct ds_cont_child *cont, struct dtx_id *xid, d_list_del(&dcrc->dcrc_gl_committable); d_list_del(&dcrc->dcrc_lo_link); - dtx_entry_put(dcrc->dcrc_dte); + if (dcrc->dcrc_ptr != NULL) { + dtx_entry_put(dcrc->dcrc_dte); + } else { + dtx_coll_entry_put(dcrc->dcrc_dce); + cont->sc_dtx_committable_coll_count--; + } D_FREE(dcrc); cont->sc_dtx_committable_count--; dcr->dcr_reg_count--; - d_tm_dec_gauge(tls->dt_committable, 1); D_GOTO(out, found = 2); } @@ -471,21 +543,28 @@ dtx_del_cos(struct ds_cont_child *cont, struct dtx_id *xid, d_list_del(&dcrc->dcrc_gl_committable); d_list_del(&dcrc->dcrc_lo_link); - dtx_entry_put(dcrc->dcrc_dte); + if (dcrc->dcrc_ptr != NULL) { + dtx_entry_put(dcrc->dcrc_dte); + } else { + dtx_coll_entry_put(dcrc->dcrc_dce); + cont->sc_dtx_committable_coll_count--; + } D_FREE(dcrc); cont->sc_dtx_committable_count--; dcr->dcr_expcmt_count--; - d_tm_dec_gauge(tls->dt_committable, 1); D_GOTO(out, found = 3); } out: - if (found > 0 && dcr->dcr_reg_count == 0 && dcr->dcr_prio_count == 0 && - dcr->dcr_expcmt_count == 0) - rc = dbtree_delete(cont->sc_dtx_cos_hdl, BTR_PROBE_EQ, - &kiov, NULL); + if (found > 0) { + d_tm_dec_gauge(dtx_tls_get()->dt_committable, 1); + + if (dcr->dcr_reg_count == 0 && dcr->dcr_prio_count == 0 && + dcr->dcr_expcmt_count == 0) + rc = dbtree_delete(cont->sc_dtx_cos_hdl, BTR_PROBE_EQ, &kiov, NULL); + } if (rc == 0 && found == 0) rc = -DER_NONEXIST; diff --git a/src/dtx/dtx_internal.h b/src/dtx/dtx_internal.h index a38c747a61d2..134782a8b904 100644 --- a/src/dtx/dtx_internal.h +++ b/src/dtx/dtx_internal.h @@ -22,16 +22,26 @@ * These are for daos_rpc::dr_opc and DAOS_RPC_OPCODE(opc, ...) rather than * crt_req_create(..., opc, ...). See src/include/daos/rpc.h. 
 */
-#define DAOS_DTX_VERSION	3
+#define DAOS_DTX_VERSION	4
 
 /* LIST of internal RPCS in form of:
  * OPCODE, flags, FMT, handler, corpc_hdlr,
  */
-#define DTX_PROTO_SRV_RPC_LIST						\
-	X(DTX_COMMIT, 0, &CQF_dtx, dtx_handler, NULL, "dtx_commit")	\
-	X(DTX_ABORT, 0, &CQF_dtx, dtx_handler, NULL, "dtx_abort")	\
-	X(DTX_CHECK, 0, &CQF_dtx, dtx_handler, NULL, "dtx_check")	\
-	X(DTX_REFRESH, 0, &CQF_dtx, dtx_handler, NULL, "dtx_refresh")
+#define DTX_PROTO_SRV_RPC_LIST						\
+	X(DTX_COMMIT, 0, &CQF_dtx, dtx_handler,				\
+	  NULL, "dtx_commit")						\
+	X(DTX_ABORT, 0, &CQF_dtx, dtx_handler,				\
+	  NULL, "dtx_abort")						\
+	X(DTX_CHECK, 0, &CQF_dtx, dtx_handler,				\
+	  NULL, "dtx_check")						\
+	X(DTX_REFRESH, 0, &CQF_dtx, dtx_handler,			\
+	  NULL, "dtx_refresh")						\
+	X(DTX_COLL_COMMIT, 0, &CQF_dtx_coll, dtx_coll_handler,		\
+	  &dtx_coll_commit_co_ops, "dtx_coll_commit")			\
+	X(DTX_COLL_ABORT, 0, &CQF_dtx_coll, dtx_coll_handler,		\
+	  &dtx_coll_abort_co_ops, "dtx_coll_abort")			\
+	X(DTX_COLL_CHECK, 0, &CQF_dtx_coll, dtx_coll_handler,		\
+	  &dtx_coll_check_co_ops, "dtx_coll_check")
 
 #define X(a, b, c, d, e, f) a,
 
 enum dtx_operation {
@@ -56,6 +66,27 @@ enum dtx_operation {
 
 CRT_RPC_DECLARE(dtx, DAOS_ISEQ_DTX, DAOS_OSEQ_DTX);
 
+/*
+ * DTX collective RPC input fields.
+ * dci_hints is a sparse array, one entry per engine, sorted by rank ID.
+ * It can hold more than 19K engines within the inline RPC body.
+ */
+#define DAOS_ISEQ_COLL_DTX						\
+	((uuid_t)		(dci_po_uuid)		CRT_VAR)	\
+	((uuid_t)		(dci_co_uuid)		CRT_VAR)	\
+	((struct dtx_id)	(dci_xid)		CRT_VAR)	\
+	((uint32_t)		(dci_version)		CRT_VAR)	\
+	((uint32_t)		(dci_padding)		CRT_VAR)	\
+	((uint64_t)		(dci_epoch)		CRT_VAR)	\
+	((uint8_t)		(dci_hints)		CRT_ARRAY)
+
+/* DTX collective RPC output fields */
+#define DAOS_OSEQ_COLL_DTX						\
+	((int32_t)		(dco_status)		CRT_VAR)	\
+	((uint32_t)		(dco_misc)		CRT_VAR)
+
+CRT_RPC_DECLARE(dtx_coll, DAOS_ISEQ_COLL_DTX, DAOS_OSEQ_COLL_DTX);
+
 #define DTX_YIELD_CYCLE		(DTX_THRESHOLD_COUNT >> 3)
 
 /* The time threshold for triggering DTX cleanup of stale entries.
@@ -149,6 +180,20 @@ extern uint32_t dtx_batched_ult_max; */ #define DTX_INLINE_MBS_SIZE 512 +#define DTX_COLL_TREE_WIDTH 16 + +extern struct crt_corpc_ops dtx_coll_commit_co_ops; +extern struct crt_corpc_ops dtx_coll_abort_co_ops; +extern struct crt_corpc_ops dtx_coll_check_co_ops; + +struct dtx_coll_prep_args { + struct dtx_coll_entry *dcpa_dce; + crt_rpc_t *dcpa_rpc; + daos_unit_oid_t dcpa_oid; + ABT_future dcpa_future; + int dcpa_result; +}; + struct dtx_pool_metrics { struct d_tm_node_t *dpm_batched_degree; struct d_tm_node_t *dpm_batched_total; @@ -161,7 +206,6 @@ struct dtx_pool_metrics { struct dtx_tls { struct d_tm_node_t *dt_committable; struct d_tm_node_t *dt_dtx_leader_total; - struct d_tm_node_t *dt_dtx_entry_total; uint64_t dt_agg_gen; uint32_t dt_batched_ult_cnt; }; @@ -196,31 +240,37 @@ void dtx_batched_commit(void *arg); void dtx_aggregation_main(void *arg); int start_dtx_reindex_ult(struct ds_cont_child *cont); void stop_dtx_reindex_ult(struct ds_cont_child *cont); +void dtx_merge_check_result(int *tgt, int src); +int dtx_leader_get(struct ds_pool *pool, struct dtx_memberships *mbs, + daos_unit_oid_t *oid, uint32_t version, struct pool_target **p_tgt); /* dtx_cos.c */ int dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, daos_unit_oid_t *oid, daos_epoch_t epoch, - struct dtx_entry ***dtes, struct dtx_cos_key **dcks); -int dtx_add_cos(struct ds_cont_child *cont, struct dtx_entry *dte, - daos_unit_oid_t *oid, uint64_t dkey_hash, - daos_epoch_t epoch, uint32_t flags); + struct dtx_entry ***dtes, struct dtx_cos_key **dcks, + struct dtx_coll_entry **p_dce); +int dtx_add_cos(struct ds_cont_child *cont, void *entry, daos_unit_oid_t *oid, + uint64_t dkey_hash, daos_epoch_t epoch, uint32_t flags); int dtx_del_cos(struct ds_cont_child *cont, struct dtx_id *xid, daos_unit_oid_t *oid, uint64_t dkey_hash); uint64_t dtx_cos_oldest(struct ds_cont_child *cont); /* dtx_rpc.c */ -int dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, - struct dtx_cos_key *dcks, int count); int dtx_check(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch); - +int dtx_coll_check(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoch_t epoch); int dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *check_list, d_list_t *cmt_list, d_list_t *abt_list, d_list_t *act_list, bool for_io); -int dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, - daos_epoch_t epoch, int *tgt_array, int *err); +int dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, daos_unit_oid_t oid, + uint64_t dkey_hash, daos_epoch_t epoch, int *tgt_array, int *err); -int dtx_leader_get(struct ds_pool *pool, struct dtx_memberships *mbs, - struct pool_target **p_tgt); +/* dtx_coll.c */ +void dtx_coll_prep_ult(void *arg); +int dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, + struct dtx_memberships *mbs, uint32_t my_tgtid, uint32_t dtx_ver, + uint32_t pm_ver, bool for_check, bool need_hint, struct dtx_coll_entry **p_dce); +int dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epoch_t epoch, + uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, int **p_results); enum dtx_status_handle_result { DSHR_NEED_COMMIT = 1, @@ -234,4 +284,15 @@ enum dtx_rpc_flags { DRF_INITIAL_LEADER = (1 << 0), }; +enum dtx_cos_flags { + DCF_SHARED = (1 << 0), + /* Some DTX (such as for the distributed transaction across multiple + * RDGs, or for EC object modification) need to be committed via DTX 
+ * RPC instead of piggyback via other dispatched update/punch RPC. + */ + DCF_EXP_CMT = (1 << 1), + /* For collective DTX. */ + DCF_COLL = (1 << 2), +}; + #endif /* __DTX_INTERNAL_H__ */ diff --git a/src/dtx/dtx_resync.c b/src/dtx/dtx_resync.c index 02f94319c6a7..4a7661a51674 100644 --- a/src/dtx/dtx_resync.c +++ b/src/dtx/dtx_resync.c @@ -138,55 +138,22 @@ dtx_resync_commit(struct ds_cont_child *cont, return rc; } -/* Get leader from dtx */ -int -dtx_leader_get(struct ds_pool *pool, struct dtx_memberships *mbs, struct pool_target **p_tgt) -{ - int i; - int rc = 0; - - D_ASSERT(mbs != NULL); - /* The first UPIN target is the leader of the DTX */ - for (i = 0; i < mbs->dm_tgt_cnt; i++) { - rc = ds_pool_target_status_check(pool, mbs->dm_tgts[i].ddt_id, - (uint8_t)PO_COMP_ST_UPIN, p_tgt); - if (rc < 0) - D_GOTO(out, rc); - - if (rc == 1) { - rc = 0; - break; - } - } - - if (i == mbs->dm_tgt_cnt) - rc = -DER_NONEXIST; -out: - return rc; -} - static int dtx_is_leader(struct ds_pool *pool, struct dtx_resync_args *dra, struct dtx_resync_entry *dre) { struct dtx_memberships *mbs = dre->dre_dte.dte_mbs; struct pool_target *target = NULL; - d_rank_t myrank; int rc; if (mbs == NULL) return 1; - rc = dtx_leader_get(pool, mbs, &target); - if (rc < 0) - D_GOTO(out, rc); - - D_ASSERT(target != NULL); - rc = crt_group_rank(NULL, &myrank); + rc = dtx_leader_get(pool, mbs, &dre->dre_oid, dre->dre_dte.dte_ver, &target); if (rc < 0) D_GOTO(out, rc); - if (myrank != target->ta_comp.co_rank || + if (dss_self_rank() != target->ta_comp.co_rank || dss_get_module_info()->dmi_tgt_id != target->ta_comp.co_index) return 0; @@ -261,28 +228,41 @@ dtx_verify_groups(struct ds_pool *pool, struct dtx_memberships *mbs, } int -dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, - daos_epoch_t epoch, int *tgt_array, int *err) +dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, daos_unit_oid_t oid, + uint64_t dkey_hash, daos_epoch_t epoch, int *tgt_array, int *err) { - int rc = 0; + struct dtx_memberships *mbs = dte->dte_mbs; + struct dtx_coll_entry *dce = NULL; + int rc = 0; + + if (mbs->dm_flags & DMF_COLL_TARGET) { + rc = dtx_coll_prep(cont->sc_pool_uuid, oid, &dte->dte_xid, mbs, + dss_get_module_info()->dmi_tgt_id, dte->dte_ver, + cont->sc_pool->spc_map_version, true, true, &dce); + if (rc != 0) { + D_ERROR("Failed to prepare the bitmap (and hints) for collective DTX " + DF_DTI": "DF_RC"\n", DP_DTI(&dte->dte_xid), DP_RC(rc)); + goto out; + } - rc = dtx_check(cont, dte, epoch); + rc = dtx_coll_check(cont, dce, epoch); + } else { + rc = dtx_check(cont, dte, epoch); + } switch (rc) { case DTX_ST_COMMITTED: case DTX_ST_COMMITTABLE: /* The DTX has been committed on some remote replica(s), * let's commit the DTX globally. */ - return DSHR_NEED_COMMIT; + D_GOTO(out, rc = DSHR_NEED_COMMIT); case -DER_INPROGRESS: case -DER_TIMEDOUT: D_WARN("Other participants not sure about whether the " "DTX "DF_DTI" is committed or not, need retry.\n", DP_DTI(&dte->dte_xid)); - return DSHR_NEED_RETRY; + D_GOTO(out, rc = DSHR_NEED_RETRY); case DTX_ST_PREPARED: { - struct dtx_memberships *mbs = dte->dte_mbs; - /* If the transaction across multiple redundancy groups, * need to check whether there are enough alive targets. 
*/ @@ -293,7 +273,7 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, goto out; if (rc > 0) - return DSHR_NEED_COMMIT; + D_GOTO(out, rc = DSHR_NEED_COMMIT); /* XXX: For the distributed transaction that lose too * many particiants (the whole redundancy group), @@ -304,14 +284,17 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, * Then we mark the TX as corrupted via special * dtx_abort() with 0 @epoch. */ - rc = dtx_abort(cont, dte, 0); + if (mbs->dm_flags & DMF_COLL_TARGET) + rc = dtx_coll_abort(cont, dce, 0); + else + rc = dtx_abort(cont, dte, 0); if (rc < 0 && err != NULL) *err = rc; - return DSHR_CORRUPT; + D_GOTO(out, rc = DSHR_CORRUPT); } - return DSHR_NEED_COMMIT; + D_GOTO(out, rc = DSHR_NEED_COMMIT); } case -DER_NONEXIST: /* Someone (the DTX owner or batched commit ULT) may have @@ -345,7 +328,10 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, * some other DTX(s). To avoid complex rollback logic, let's * abort the DTXs one by one, not batched. */ - rc = dtx_abort(cont, dte, epoch); + if (mbs->dm_flags & DMF_COLL_TARGET) + rc = dtx_coll_abort(cont, dce, epoch); + else + rc = dtx_abort(cont, dte, epoch); D_DEBUG(DB_TRACE, "As new leader for DTX "DF_DTI", abort it (2): "DF_RC"\n", DP_DTI(&dte->dte_xid), DP_RC(rc)); @@ -354,10 +340,10 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, if (err != NULL) *err = rc; - return DSHR_ABORT_FAILED; + D_GOTO(out, rc = DSHR_ABORT_FAILED); } - return DSHR_IGNORE; + D_GOTO(out, rc = DSHR_IGNORE); default: D_WARN("Not sure about whether the DTX "DF_DTI " can be committed or not: %d, skip it.\n", @@ -368,6 +354,15 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, } out: + if (rc == DSHR_NEED_COMMIT && mbs->dm_flags & DMF_COLL_TARGET) { + struct dtx_cos_key dck; + + dck.oid = oid; + dck.dkey_hash = dkey_hash; + rc = dtx_coll_commit(cont, dce, &dck); + } + + dtx_coll_entry_put(dce); return rc; } @@ -412,9 +407,10 @@ dtx_status_handle(struct dtx_resync_args *dra) } if (dre->dre_dte.dte_mbs == NULL) { - rc = vos_dtx_load_mbs(cont->sc_hdl, &dre->dre_xid, &dre->dre_dte.dte_mbs); + rc = vos_dtx_load_mbs(cont->sc_hdl, &dre->dre_xid, NULL, + &dre->dre_dte.dte_mbs); if (rc != 0) { - if (rc != -DER_NONEXIST) + if (rc < 0 && rc != -DER_NONEXIST) D_WARN("Failed to load mbs, do not know the leader for DTX " DF_DTI" (ver = %u/%u/%u): rc = %d, skip it.\n", DP_DTI(&dre->dre_xid), dra->resync_version, @@ -446,8 +442,8 @@ dtx_status_handle(struct dtx_resync_args *dra) continue; } - rc = dtx_status_handle_one(cont, &dre->dre_dte, dre->dre_epoch, - tgt_array, &err); + rc = dtx_status_handle_one(cont, &dre->dre_dte, dre->dre_oid, dre->dre_dkey_hash, + dre->dre_epoch, tgt_array, &err); switch (rc) { case DSHR_NEED_COMMIT: goto commit; diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index 5c4c44c90359..fdcda4abd3fc 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -20,6 +20,7 @@ #include "dtx_internal.h" CRT_RPC_DEFINE(dtx, DAOS_ISEQ_DTX, DAOS_OSEQ_DTX); +CRT_RPC_DEFINE(dtx_coll, DAOS_ISEQ_COLL_DTX, DAOS_OSEQ_COLL_DTX); #define X(a, b, c, d, e, f) \ { \ @@ -206,18 +207,16 @@ dtx_req_cb(const struct crt_cb_info *cb_info) } out: + D_DEBUG(DB_TRACE, "DTX req for opc %x (req %p future %p) got reply from %d/%d: " + "epoch :"DF_X64", result %d\n", dra->dra_opc, req, dra->dra_future, + drr->drr_rank, drr->drr_tag, din != NULL ? 
din->di_epoch : 0, rc); + drr->drr_comp = 1; drr->drr_result = rc; rc = ABT_future_set(dra->dra_future, drr); D_ASSERTF(rc == ABT_SUCCESS, "ABT_future_set failed for opc %x to %d/%d: rc = %d.\n", dra->dra_opc, drr->drr_rank, drr->drr_tag, rc); - - D_DEBUG(DB_TRACE, - "DTX req for opc %x (req %p future %p) got reply from %d/%d: " - "epoch :"DF_X64", rc %d.\n", dra->dra_opc, req, - dra->dra_future, drr->drr_rank, drr->drr_tag, - din != NULL ? din->di_epoch : 0, drr->drr_result); } static int @@ -291,41 +290,7 @@ dtx_req_list_cb(void **args) if (dra->dra_opc == DTX_CHECK) { for (i = 0; i < dra->dra_length; i++) { drr = args[i]; - switch (drr->drr_result) { - case DTX_ST_COMMITTED: - case DTX_ST_COMMITTABLE: - dra->dra_result = DTX_ST_COMMITTED; - /* As long as one target has committed the DTX, - * then the DTX is committable on all targets. - */ - D_DEBUG(DB_TRACE, - "The DTX "DF_DTI" has been committed on %d/%d.\n", - DP_DTI(&drr->drr_dti[0]), drr->drr_rank, drr->drr_tag); - return; - case -DER_EXCLUDED: - /* - * If non-leader is excluded, handle it as 'prepared'. If other - * non-leaders are also 'prepared' then related DTX maybe still - * committable or 'corrupted'. The subsequent DTX resync logic - * will handle related things, see dtx_verify_groups(). - * - * Fall through. - */ - case DTX_ST_PREPARED: - if (dra->dra_result == 0 || - dra->dra_result == DTX_ST_CORRUPTED) - dra->dra_result = DTX_ST_PREPARED; - break; - case DTX_ST_CORRUPTED: - if (dra->dra_result == 0) - dra->dra_result = drr->drr_result; - break; - default: - dra->dra_result = drr->drr_result >= 0 ? - -DER_IO : drr->drr_result; - break; - } - + dtx_merge_check_result(&dra->dra_result, drr->drr_result); D_DEBUG(DB_TRACE, "The DTX "DF_DTI" RPC req result %d, status is %d.\n", DP_DTI(&drr->drr_dti[0]), drr->drr_result, dra->dra_result); } @@ -608,7 +573,7 @@ dtx_rpc_internal(struct dtx_common_args *dca) int rc; int i; - if (dca->dca_dra.dra_opc != DTX_REFRESH) { + if (dca->dca_dtes != NULL) { D_ASSERT(dca->dca_dtis != NULL); if (dca->dca_count > 1) { @@ -778,7 +743,7 @@ dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, * Some RPC may has been sent, so need to wait even if dtx_rpc_prep hit failure. */ rc = dtx_rpc_post(&dca, rc, false); - if (rc > 0 || rc == -DER_NONEXIST || rc == -DER_EXCLUDED) + if (rc > 0 || rc == -DER_NONEXIST || rc == -DER_EXCLUDED || rc == -DER_OOG) rc = 0; if (rc != 0) { @@ -833,7 +798,7 @@ dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, DP_DTI(&dtes[0]->dte_xid), count, dra->dra_committed > 0 ? "partial" : "nothing", rc, rc1); else - D_DEBUG(DB_IO, "Commit DTXs " DF_DTI", count %d\n", + D_DEBUG(DB_TRACE, "Commit DTXs " DF_DTI", count %d\n", DP_DTI(&dtes[0]->dte_xid), count); return rc != 0 ? rc : rc1; @@ -870,7 +835,7 @@ dtx_abort(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch) if (rc1 > 0 || rc1 == -DER_NONEXIST) rc1 = 0; - D_CDEBUG(rc1 != 0 || rc2 != 0, DLOG_ERR, DB_IO, "Abort DTX "DF_DTI": rc %d %d %d\n", + D_CDEBUG(rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE, "Abort DTX "DF_DTI": rc %d %d %d\n", DP_DTI(&dte->dte_xid), rc, rc1, rc2); return rc1 != 0 ? 
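The per-reply switch removed above is now centralized in dtx_merge_check_result(), which the new collective check paths reuse as well. The helper body is not part of this hunk; a minimal sketch, assuming it keeps the precedence of the removed switch (committed wins, then prepared, with -DER_EXCLUDED folded into prepared, then corrupted, then a generic error), could look like:

/* Sketch only: assumed to mirror the precedence of the switch removed above. */
static void
dtx_merge_check_result_sketch(int *tgt, int src)
{
	/* Once any target reports committed, the DTX is committable on all targets. */
	if (*tgt == DTX_ST_COMMITTED || *tgt == DTX_ST_COMMITTABLE)
		return;

	switch (src) {
	case DTX_ST_COMMITTED:
	case DTX_ST_COMMITTABLE:
		*tgt = DTX_ST_COMMITTED;
		break;
	case -DER_EXCLUDED:
		/* An excluded target is handled as 'prepared'; dtx_verify_groups()
		 * decides later whether the DTX is still committable or corrupted.
		 */
	case DTX_ST_PREPARED:
		if (*tgt == 0 || *tgt == DTX_ST_CORRUPTED)
			*tgt = DTX_ST_PREPARED;
		break;
	case DTX_ST_CORRUPTED:
		if (*tgt == 0)
			*tgt = src;
		break;
	default:
		*tgt = src >= 0 ? -DER_IO : src;
		break;
	}
}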
rc1 : rc2; @@ -893,8 +858,8 @@ dtx_check(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch) rc1 = dtx_rpc_post(&dca, rc, false); - D_CDEBUG(rc1 < 0, DLOG_ERR, DB_IO, "Check DTX "DF_DTI": rc %d %d\n", - DP_DTI(&dte->dte_xid), rc, rc1); + D_CDEBUG(rc1 < 0 && rc1 != -DER_NONEXIST, DLOG_ERR, DB_TRACE, + "Check DTX "DF_DTI": rc %d %d\n", DP_DTI(&dte->dte_xid), rc, rc1); return rc1; } @@ -929,9 +894,9 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che drop = false; if (dsp->dsp_mbs == NULL) { - rc = vos_dtx_load_mbs(cont->sc_hdl, &dsp->dsp_xid, &dsp->dsp_mbs); + rc = vos_dtx_load_mbs(cont->sc_hdl, &dsp->dsp_xid, NULL, &dsp->dsp_mbs); if (rc != 0) { - if (rc != -DER_NONEXIST && for_io) + if (rc < 0 && rc != -DER_NONEXIST && for_io) goto out; drop = true; @@ -940,7 +905,7 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che } again: - rc = dtx_leader_get(pool, dsp->dsp_mbs, &target); + rc = dtx_leader_get(pool, dsp->dsp_mbs, &dsp->dsp_oid, dsp->dsp_version, &target); if (rc < 0) { /** * Currently, for EC object, if parity node is @@ -1166,8 +1131,8 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che dte.dte_refs = 1; dte.dte_mbs = dsp->dsp_mbs; - rc = dtx_status_handle_one(cont, &dte, dsp->dsp_epoch, - NULL, NULL); + rc = dtx_status_handle_one(cont, &dte, dsp->dsp_oid, dsp->dsp_dkey_hash, + dsp->dsp_epoch, NULL, NULL); switch (rc) { case DSHR_NEED_COMMIT: { struct dtx_entry *pdte = &dte; @@ -1187,6 +1152,7 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che if (for_io) D_GOTO(out, rc = -DER_INPROGRESS); continue; + case 0: case DSHR_IGNORE: dtx_dsp_free(dsp); continue; @@ -1297,3 +1263,367 @@ dtx_refresh(struct dtx_handle *dth, struct ds_cont_child *cont) return rc; } + +static int +dtx_coll_commit_aggregator(crt_rpc_t *source, crt_rpc_t *target, void *priv) +{ + struct dtx_coll_out *out_source = crt_reply_get(source); + struct dtx_coll_out *out_target = crt_reply_get(target); + + out_target->dco_misc += out_source->dco_misc; + if (out_target->dco_status == 0) + out_target->dco_status = out_source->dco_status; + + return 0; +} + +static int +dtx_coll_abort_aggregator(crt_rpc_t *source, crt_rpc_t *target, void *priv) +{ + struct dtx_coll_out *out_source = crt_reply_get(source); + struct dtx_coll_out *out_target = crt_reply_get(target); + + if (out_source->dco_status != 0 && + (out_target->dco_status == 0 || out_target->dco_status == -DER_NONEXIST)) + out_target->dco_status = out_source->dco_status; + + return 0; +} + +static int +dtx_coll_check_aggregator(crt_rpc_t *source, crt_rpc_t *target, void *priv) +{ + struct dtx_coll_out *out_source = crt_reply_get(source); + struct dtx_coll_out *out_target = crt_reply_get(target); + + dtx_merge_check_result(&out_target->dco_status, out_source->dco_status); + + return 0; +} + +struct crt_corpc_ops dtx_coll_commit_co_ops = { + .co_aggregate = dtx_coll_commit_aggregator, + .co_pre_forward = NULL, + .co_post_reply = NULL, +}; + +struct crt_corpc_ops dtx_coll_abort_co_ops = { + .co_aggregate = dtx_coll_abort_aggregator, + .co_pre_forward = NULL, + .co_post_reply = NULL, +}; + +struct crt_corpc_ops dtx_coll_check_co_ops = { + .co_aggregate = dtx_coll_check_aggregator, + .co_pre_forward = NULL, + .co_post_reply = NULL, +}; + +struct dtx_coll_rpc_args { + struct ds_cont_child *dcra_cont; + struct dtx_id dcra_xid; + uint32_t dcra_opc; + uint32_t dcra_ver; + daos_epoch_t dcra_epoch; + d_rank_list_t *dcra_ranks; + uint8_t 
*dcra_hints; + uint32_t dcra_hint_sz; + uint32_t dcra_committed; + uint32_t dcra_completed:1; + int dcra_result; + ABT_thread dcra_helper; + ABT_future dcra_future; +}; + +static void +dtx_coll_rpc_cb(const struct crt_cb_info *cb_info) +{ + struct dtx_coll_rpc_args *dcra = cb_info->cci_arg; + crt_rpc_t *req = cb_info->cci_rpc; + struct dtx_coll_out *dco; + int rc = cb_info->cci_rc; + + if (rc != 0) { + dcra->dcra_result = rc; + } else { + dco = crt_reply_get(req); + dcra->dcra_result = dco->dco_status; + dcra->dcra_committed = dco->dco_misc; + } + + dcra->dcra_completed = 1; + rc = ABT_future_set(dcra->dcra_future, NULL); + D_ASSERTF(rc == ABT_SUCCESS, + "ABT_future_set failed for opc %u: rc = %d\n", dcra->dcra_opc, rc); +} + +static int +dtx_coll_rpc(struct dtx_coll_rpc_args *dcra) +{ + crt_rpc_t *req = NULL; + struct dtx_coll_in *dci; + int rc; + + rc = ABT_future_create(1, NULL, &dcra->dcra_future); + if (rc != ABT_SUCCESS) { + D_ERROR("ABT_future_create failed for coll DTX ("DF_DTI") RPC %u: rc = %d\n", + DP_DTI(&dcra->dcra_xid), dcra->dcra_opc, rc); + D_GOTO(out, rc = dss_abterr2der(rc)); + } + + rc = crt_corpc_req_create(dss_get_module_info()->dmi_ctx, NULL, dcra->dcra_ranks, + DAOS_RPC_OPCODE(dcra->dcra_opc, DAOS_DTX_MODULE, + DAOS_DTX_VERSION), + NULL, NULL, CRT_RPC_FLAG_FILTER_INVERT, + crt_tree_topo(CRT_TREE_KNOMIAL, DTX_COLL_TREE_WIDTH), &req); + if (rc != 0) { + D_ERROR("crt_corpc_req_create failed for coll DTX ("DF_DTI") RPC %u: "DF_RC"\n", + DP_DTI(&dcra->dcra_xid), dcra->dcra_opc, DP_RC(rc)); + D_GOTO(out, rc); + } + + dci = crt_req_get(req); + + uuid_copy(dci->dci_po_uuid, dcra->dcra_cont->sc_pool_uuid); + uuid_copy(dci->dci_co_uuid, dcra->dcra_cont->sc_uuid); + dci->dci_xid = dcra->dcra_xid; + dci->dci_version = dcra->dcra_ver; + dci->dci_epoch = dcra->dcra_epoch; + dci->dci_hints.ca_count = dcra->dcra_hint_sz; + dci->dci_hints.ca_arrays = dcra->dcra_hints; + + rc = crt_req_send(req, dtx_coll_rpc_cb, dcra); + if (rc != 0) + D_ERROR("crt_req_send failed for coll DTX ("DF_DTI") RPC %u: "DF_RC"\n", + DP_DTI(&dcra->dcra_xid), dcra->dcra_opc, DP_RC(rc)); + +out: + if (rc != 0 && !dcra->dcra_completed) { + dcra->dcra_result = rc; + dcra->dcra_completed = 1; + if (dcra->dcra_future != ABT_FUTURE_NULL) + ABT_future_set(dcra->dcra_future, NULL); + } + + return rc; +} + +static void +dtx_coll_rpc_helper(void *arg) +{ + struct dtx_coll_rpc_args *dcra = arg; + int rc; + + rc = dtx_coll_rpc(dcra); + + D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, + "Collective DTX helper ULT for %u exit: %d\n", dcra->dcra_opc, rc); +} + +static int +dtx_coll_rpc_prep(struct ds_cont_child *cont, struct dtx_coll_entry *dce, uint32_t opc, + daos_epoch_t epoch, struct dtx_coll_rpc_args *dcra) +{ + int rc; + + dcra->dcra_cont = cont; + dcra->dcra_xid = dce->dce_xid; + dcra->dcra_opc = opc; + dcra->dcra_ver = dce->dce_ver; + dcra->dcra_epoch = epoch; + dcra->dcra_ranks = dce->dce_ranks; + dcra->dcra_hints = dce->dce_hints; + dcra->dcra_hint_sz = dce->dce_hint_sz; + dcra->dcra_future = ABT_FUTURE_NULL; + dcra->dcra_helper = ABT_THREAD_NULL; + + if (dss_has_enough_helper()) + rc = dss_ult_create(dtx_coll_rpc_helper, dcra, DSS_XS_IOFW, + dss_get_module_info()->dmi_tgt_id, 0, &dcra->dcra_helper); + else + rc = dtx_coll_rpc(dcra); + + return rc; +} + +static int +dtx_coll_rpc_post(struct dtx_coll_rpc_args *dcra, int ret) +{ + int rc; + + if (dcra->dcra_helper != ABT_THREAD_NULL) + ABT_thread_free(&dcra->dcra_helper); + + if (dcra->dcra_future != ABT_FUTURE_NULL) { + rc = ABT_future_wait(dcra->dcra_future); + D_CDEBUG(rc 
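crt_corpc_req_create() above passes dce_ranks together with CRT_RPC_FLAG_FILTER_INVERT, so the rank list acts as an inclusion filter: only the listed engines receive the collective RPC, fanned out over a k-nomial tree of width DTX_COLL_TREE_WIDTH. A hedged sketch of populating such a list on the sender side; participant_ranks and rank_nr are illustrative inputs, not DAOS APIs:

/* Sketch only: build the inclusion rank list later consumed by dtx_coll_rpc(). */
static int
fill_coll_ranks(struct dtx_coll_entry *dce, const d_rank_t *participant_ranks, uint32_t rank_nr)
{
	d_rank_list_t	*ranks;
	uint32_t	 i;

	ranks = d_rank_list_alloc(rank_nr);
	if (ranks == NULL)
		return -DER_NOMEM;

	for (i = 0; i < rank_nr; i++)
		ranks->rl_ranks[i] = participant_ranks[i];

	dce->dce_ranks = ranks;
	return 0;
}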
!= ABT_SUCCESS, DLOG_ERR, DB_TRACE, + "Collective DTX wait req for opc %u, future %p done, rc %d, result %d\n", + dcra->dcra_opc, dcra->dcra_future, rc, dcra->dcra_result); + ABT_future_free(&dcra->dcra_future); + } + + return ret != 0 ? ret : dcra->dcra_result; +} + +int +dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck) +{ + struct dtx_coll_rpc_args dcra = { 0 }; + int *results = NULL; + uint32_t committed = 0; + int len; + int rc = 0; + int rc1 = 0; + int rc2 = 0; + int i; + + if (dce->dce_ranks != NULL) + rc = dtx_coll_rpc_prep(cont, dce, DTX_COLL_COMMIT, 0, &dcra); + + if (dce->dce_bitmap != NULL) { + len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, 0, + DTX_COLL_COMMIT, dce->dce_bitmap_sz, dce->dce_bitmap, + &results); + if (len < 0) { + rc1 = len; + } else { + D_ASSERT(results != NULL); + for (i = 0; i < len; i++) { + if (results[i] > 0) + committed += results[i]; + else if (results[i] < 0 && results[i] != -DER_NONEXIST && rc1 == 0) + rc1 = results[i]; + } + } + D_FREE(results); + } + + if (dce->dce_ranks != NULL) { + rc = dtx_coll_rpc_post(&dcra, rc); + if (rc > 0 || rc == -DER_NONEXIST || rc == -DER_EXCLUDED || rc == -DER_OOG) + rc = 0; + + committed += dcra.dcra_committed; + } + + if (rc == 0 && rc1 == 0) + rc2 = vos_dtx_commit(cont->sc_hdl, &dce->dce_xid, 1, NULL); + else if (committed > 0) + /* Mark the DTX as "PARTIAL_COMMITTED" and re-commit it later via cleanup logic. */ + rc2 = vos_dtx_set_flags(cont->sc_hdl, &dce->dce_xid, 1, DTE_PARTIAL_COMMITTED); + if (rc2 > 0 || rc2 == -DER_NONEXIST) + rc2 = 0; + + /* + * NOTE: Currently, we commit collective DTX one by one with high priority. So here we have + * to remove the collective DTX entry from the CoS even if the commit failed remotely. + * Otherwise, the batched commit ULT may be blocked by such "bad" entry. + */ + if (rc2 == 0 && dck != NULL) + dtx_del_cos(cont, &dce->dce_xid, &dck->oid, dck->dkey_hash); + + D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE, + "Collectively commit DTX "DF_DTI": %d/%d/%d\n", + DP_DTI(&dce->dce_xid), rc, rc1, rc2); + + return rc != 0 ? rc : rc1 != 0 ? rc1 : rc2; +} + +int +dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoch_t epoch) +{ + struct dtx_coll_rpc_args dcra = { 0 }; + int *results = NULL; + int len; + int rc = 0; + int rc1 = 0; + int rc2 = 0; + int i; + + if (dce->dce_ranks != NULL) + rc = dtx_coll_rpc_prep(cont, dce, DTX_COLL_ABORT, epoch, &dcra); + + if (dce->dce_bitmap != NULL) { + len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, epoch, + DTX_COLL_ABORT, dce->dce_bitmap_sz, dce->dce_bitmap, + &results); + if (len < 0) { + rc1 = len; + } else { + D_ASSERT(results != NULL); + for (i = 0; i < len; i++) { + if (results[i] < 0 && results[i] != -DER_NONEXIST && rc1 == 0) + rc1 = results[i]; + } + } + D_FREE(results); + } + + if (dce->dce_ranks != NULL) { + rc = dtx_coll_rpc_post(&dcra, rc); + if (rc > 0 || rc == -DER_NONEXIST || rc == -DER_EXCLUDED || rc == -DER_OOG) + rc = 0; + } + + if (epoch != 0) + rc2 = vos_dtx_abort(cont->sc_hdl, &dce->dce_xid, epoch); + else + rc2 = vos_dtx_set_flags(cont->sc_hdl, &dce->dce_xid, 1, DTE_CORRUPTED); + if (rc2 > 0 || rc2 == -DER_NONEXIST) + rc2 = 0; + + D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE, + "Collectively abort DTX "DF_DTI": %d/%d/%d\n", + DP_DTI(&dce->dce_xid), rc, rc1, rc2); + + return rc != 0 ? rc : rc1 != 0 ? 
rc1 : rc2; +} + +int +dtx_coll_check(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoch_t epoch) +{ + struct dtx_coll_rpc_args dcra = { 0 }; + int *results = NULL; + int len; + int rc = 0; + int rc1 = 0; + int i; + + /* + * If no other target, then current target is the unique + * one and 'prepared', then related DTX can be committed. + */ + if (unlikely(dce->dce_ranks == NULL && dce->dce_bitmap == NULL)) + return DTX_ST_PREPARED; + + if (dce->dce_ranks != NULL) + rc = dtx_coll_rpc_prep(cont, dce, DTX_COLL_CHECK, epoch, &dcra); + + if (dce->dce_bitmap != NULL) { + len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, epoch, + DTX_COLL_CHECK, dce->dce_bitmap_sz, dce->dce_bitmap, + &results); + if (len < 0) { + rc1 = len; + } else { + D_ASSERT(results != NULL); + for (i = 0; i < len; i++) { + if (isset(dce->dce_bitmap, i)) + dtx_merge_check_result(&rc1, results[i]); + } + } + D_FREE(results); + } + + if (dce->dce_ranks != NULL) { + rc = dtx_coll_rpc_post(&dcra, rc); + if (dce->dce_bitmap != NULL) + dtx_merge_check_result(&rc, rc1); + } + + D_CDEBUG((rc < 0 && rc != -DER_NONEXIST) || (rc1 < 0 && rc1 != -DER_NONEXIST), DLOG_ERR, + DB_TRACE, "Collectively check DTX "DF_DTI": %d/%d/\n", + DP_DTI(&dce->dce_xid), rc, rc1); + + return dce->dce_ranks != NULL ? rc : rc1; +} diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c index 9ea25a9dcd03..9cc5a5183352 100644 --- a/src/dtx/dtx_srv.c +++ b/src/dtx/dtx_srv.c @@ -47,14 +47,6 @@ dtx_tls_init(int tags, int xs_id, int tgt_id) D_WARN("Failed to create DTX leader metric: " DF_RC"\n", DP_RC(rc)); - rc = d_tm_add_metric(&tls->dt_dtx_entry_total, D_TM_GAUGE, - "total number of dtx entry in cache", "entry", - "mem/dtx/dtx_entry_%u/tgt_%u", - sizeof(struct dtx_entry), tgt_id); - if (rc != DER_SUCCESS) - D_WARN("Failed to create DTX entry metric: " DF_RC"\n", - DP_RC(rc)); - return tls; } @@ -247,7 +239,7 @@ dtx_handler(crt_rpc_t *rpc) rc1 = start_dtx_reindex_ult(cont); if (rc1 != 0) D_ERROR(DF_UUID": Failed to trigger DTX reindex: "DF_RC"\n", - DP_UUID(cont->sc_uuid), DP_RC(rc)); + DP_UUID(cont->sc_uuid), DP_RC(rc1)); } break; @@ -341,9 +333,14 @@ dtx_handler(crt_rpc_t *rpc) if (mbs[i] == NULL) continue; + /* For collective DTX, it will be committed soon. */ + if (mbs[i]->dm_flags & DMF_COLL_TARGET) { + D_FREE(mbs[i]); + continue; + } + daos_dti_copy(&dtes[j].dte_xid, - (struct dtx_id *) - din->di_dtx_array.ca_arrays + i); + (struct dtx_id *)din->di_dtx_array.ca_arrays + i); dtes[j].dte_ver = vers[i]; dtes[j].dte_refs = 1; dtes[j].dte_mbs = mbs[i]; @@ -353,19 +350,19 @@ dtx_handler(crt_rpc_t *rpc) j++; } - D_ASSERT(j == rc1); + if (j > 0) { + /* + * Commit the DTX after replied the original refresh request to + * avoid further query the same DTX. + */ + rc = dtx_commit(cont, pdte, dcks, j); + if (rc < 0) + D_WARN("Failed to commit DTX "DF_DTI", count %d: " + DF_RC"\n", DP_DTI(&dtes[0].dte_xid), j, DP_RC(rc)); - /* Commit the DTX after replied the original refresh request to - * avoid further query the same DTX. 
- */ - rc = dtx_commit(cont, pdte, dcks, j); - if (rc < 0) - D_WARN("Failed to commit DTX "DF_DTI", count %d: " - DF_RC"\n", DP_DTI(&dtes[0].dte_xid), j, - DP_RC(rc)); - - for (i = 0; i < j; i++) - D_FREE(pdte[i]->dte_mbs); + for (i = 0; i < j; i++) + D_FREE(pdte[i]->dte_mbs); + } } D_FREE(dout->do_sub_rets.ca_arrays); @@ -375,6 +372,140 @@ dtx_handler(crt_rpc_t *rpc) ds_cont_child_put(cont); } +static void +dtx_coll_handler(crt_rpc_t *rpc) +{ + struct dtx_coll_in *dci = crt_req_get(rpc); + struct dtx_coll_out *dco = crt_reply_get(rpc); + struct dtx_coll_prep_args dcpa = { 0 }; + d_rank_t myrank = dss_self_rank(); + uint32_t bitmap_sz = 0; + uint32_t opc = opc_get(rpc->cr_opc); + uint8_t *hints = dci->dci_hints.ca_arrays; + uint8_t *bitmap = NULL; + int *results = NULL; + bool force_check = false; + int len; + int rc; + int i; + + D_ASSERT(hints != NULL); + D_ASSERT(dci->dci_hints.ca_count > myrank); + + D_DEBUG(DB_TRACE, "Handling collective DTX PRC %u on rank %d for "DF_DTI" with hint %d\n", + opc, myrank, DP_DTI(&dci->dci_xid), (int)hints[myrank]); + + dcpa.dcpa_rpc = rpc; + rc = ABT_future_create(1, NULL, &dcpa.dcpa_future); + if (rc != ABT_SUCCESS) { + D_ERROR("ABT_future_create failed: rc = %d\n", rc); + D_GOTO(out, rc = dss_abterr2der(rc)); + } + + rc = dss_ult_create(dtx_coll_prep_ult, &dcpa, DSS_XS_VOS, hints[myrank], 0, NULL); + if (rc != 0) { + ABT_future_free(&dcpa.dcpa_future); + D_ERROR("Failed to create ult on XS %u: "DF_RC"\n", hints[myrank], DP_RC(rc)); + goto out; + } + + rc = ABT_future_wait(dcpa.dcpa_future); + D_ASSERT(rc == ABT_SUCCESS); + + ABT_future_free(&dcpa.dcpa_future); + + switch (dcpa.dcpa_result) { + case 0: + D_ASSERT(dcpa.dcpa_dce != NULL); + + if (unlikely(dcpa.dcpa_dce->dce_bitmap == NULL)) + /* + * For DTX check, if all local shards are either migrated or + * not suitable for check, then assume that they are prepared. + * For other cases, DTX commit or abort, the bitmap should not + * be empty, so there must be some data corruption if empty. + */ + D_GOTO(out, rc = (opc == DTX_COLL_CHECK) ? DTX_ST_PREPARED : -DER_IO); + + bitmap = dcpa.dcpa_dce->dce_bitmap; + bitmap_sz = dcpa.dcpa_dce->dce_bitmap_sz; + break; + case 1: + /* The DTX has been committed, then depends on the RPC type. */ + if (opc == DTX_COLL_ABORT) { + D_ERROR("NOT allow to abort committed DTX "DF_DTI"\n", + DP_DTI(&dci->dci_xid)); + D_GOTO(out, rc = -DER_NO_PERM); + } + + if (opc == DTX_COLL_CHECK) + D_GOTO(out, rc = DTX_ST_COMMITTED); + + D_ASSERT(opc == DTX_COLL_COMMIT); + /* + * We do not know whether the DTX on the other VOS targets has been committed + * or not, let's continue the commit on the other local VOS targets by force. + */ + break; + case -DER_INPROGRESS: + /* Fall through. */ + case -DER_NONEXIST: + /* The shard on the hint VOS target may not exist, then depends on the RPC type. */ + if (opc == DTX_COLL_CHECK) + force_check = true; + /* + * It is unknown whether the DTX on the other VOS targets has been committed/aborted + * or not, let's continue related operation on the other local VOS targets by force. 
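dtx_coll_handler() above indexes dci_hints by the local rank to pick the VOS XS that can load the DTX entry, so the sender must supply at least one hint byte per participating rank. A sketch of filling that array; tgt_idx_per_rank is an illustrative, precomputed input rather than an existing DAOS structure:

/* Sketch only: one byte per engine rank, where hints[rank] names a VOS target
 * index on that rank which holds the DTX entry.
 */
static int
fill_coll_hints(struct dtx_coll_in *dci, const uint8_t *tgt_idx_per_rank, uint32_t rank_nr)
{
	uint8_t	*hints;

	D_ALLOC_ARRAY(hints, rank_nr);
	if (hints == NULL)
		return -DER_NOMEM;

	memcpy(hints, tgt_idx_per_rank, rank_nr);

	dci->dci_hints.ca_count = rank_nr;
	dci->dci_hints.ca_arrays = hints;
	return 0;
}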
+ */ + break; + default: + D_ASSERTF(dcpa.dcpa_result < 0, "Unexpected result when load MBS for DTX " + DF_DTI": "DF_RC"\n", DP_DTI(&dci->dci_xid), DP_RC(dcpa.dcpa_result)); + D_GOTO(out, rc = dcpa.dcpa_result); + } + + len = dtx_coll_local_exec(dci->dci_po_uuid, dci->dci_co_uuid, &dci->dci_xid, dci->dci_epoch, + opc, bitmap_sz, bitmap, &results); + if (len < 0) + D_GOTO(out, rc = len); + + if (opc == DTX_COLL_CHECK) { + for (i = 0; i < len; i++) { + if (bitmap == NULL || isset(bitmap, i)) + dtx_merge_check_result(&rc, results[i]); + } + + /* + * For force check case, if no shard has been committed, we cannot trust the result + * of -DER_NONEXIST, instead, returning -DER_INPROGRESS to make the leader to retry. + */ + if (force_check && rc == -DER_NONEXIST) + D_GOTO(out, rc = -DER_INPROGRESS); + } else { + for (i = 0; i < len; i++) { + if (bitmap == NULL || isset(bitmap, i)) { + if (results[i] >= 0) + dco->dco_misc += results[i]; + else if (results[i] != -DER_NONEXIST && rc == 0) + rc = results[i]; + } + } + } + +out: + D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, + "Handled collective DTX PRC %u on rank %u for "DF_DTI": "DF_RC"\n", + opc, myrank, DP_DTI(&dci->dci_xid), DP_RC(rc)); + + dco->dco_status = rc; + rc = crt_reply_send(rpc); + if (rc < 0) + D_ERROR("Failed to send collective RPC %p reply: "DF_RC"\n", rpc, DP_RC(rc)); + + dtx_coll_entry_put(dcpa.dcpa_dce); + D_FREE(results); +} + static int dtx_init(void) { diff --git a/src/engine/ult.c b/src/engine/ult.c index 204381755fb1..6a31aef3e4ea 100644 --- a/src/engine/ult.c +++ b/src/engine/ult.c @@ -97,6 +97,9 @@ dss_collective_reduce_internal(struct dss_coll_ops *ops, int xs_nr; int rc; int tid; + uint32_t bm_len; + uint32_t tgt_id = dss_get_module_info()->dmi_tgt_id; + bool self = false; if (ops == NULL || args == NULL || ops->co_func == NULL) { D_DEBUG(DB_MD, "mandatory args missing dss_collective_reduce"); @@ -115,6 +118,7 @@ dss_collective_reduce_internal(struct dss_coll_ops *ops, return -DER_CANCELED; } + bm_len = args->ca_tgt_bitmap_sz << 3; xs_nr = dss_tgt_nr; stream_args = &args->ca_stream_args; D_ALLOC_ARRAY(stream_args->csa_streams, xs_nr); @@ -156,19 +160,18 @@ dss_collective_reduce_internal(struct dss_coll_ops *ops, stream = &stream_args->csa_streams[tid]; stream->st_coll_args = &carg; - if (args->ca_exclude_tgts_cnt) { - int i; - - for (i = 0; i < args->ca_exclude_tgts_cnt; i++) - if (args->ca_exclude_tgts[i] == tid) - break; - - if (i < args->ca_exclude_tgts_cnt) { + if (args->ca_tgt_bitmap != NULL) { + if (tid >= bm_len || isclr(args->ca_tgt_bitmap, tid)) { D_DEBUG(DB_TRACE, "Skip tgt %d\n", tid); rc = ABT_future_set(future, (void *)stream); D_ASSERTF(rc == ABT_SUCCESS, "%d\n", rc); continue; } + + if (tgt_id == tid && flags & DSS_USE_CURRENT_ULT) { + self = true; + continue; + } } dx = dss_get_xstream(DSS_MAIN_XS_ID(tid)); @@ -209,6 +212,12 @@ dss_collective_reduce_internal(struct dss_coll_ops *ops, } } + if (self) { + stream = &stream_args->csa_streams[tid]; + stream->st_coll_args = &carg; + collective_func(stream); + } + ABT_future_wait(future); rc = aggregator.at_rc; @@ -322,6 +331,45 @@ dss_thread_collective(int (*func)(void *), void *arg, unsigned int flags) return dss_collective_internal(func, arg, true, flags); } +int +dss_build_coll_bitmap(int *exclude_tgts, uint32_t exclude_cnt, uint8_t **p_bitmap, + uint32_t *bitmap_sz) +{ + uint8_t *bitmap = NULL; + uint32_t size = ((dss_tgt_nr - 1) >> 3) + 1; + uint32_t bits = size << 3; + int rc = 0; + int i; + + D_ALLOC(bitmap, size); + if (bitmap == NULL) + D_GOTO(out, rc = 
-DER_NOMEM); + + for (i = 0; i < size; i++) + bitmap[i] = 0xff; + + for (i = dss_tgt_nr; i < bits; i++) + clrbit(bitmap, i); + + if (exclude_tgts == NULL) + goto out; + + for (i = 0; i < exclude_cnt; i++) { + D_ASSERT(exclude_tgts[i] < dss_tgt_nr); + clrbit(bitmap, exclude_tgts[i]); + } + +out: + if (rc == 0) { + *p_bitmap = bitmap; + *bitmap_sz = size; + } else { + D_ERROR("Failed to build bitmap for collective task: "DF_RC"\n", DP_RC(rc)); + } + + return rc; +} + /* ============== ULT create functions =================================== */ static inline int diff --git a/src/include/daos/dtx.h b/src/include/daos/dtx.h index 14b2337ea0fe..765fa15e1372 100644 --- a/src/include/daos/dtx.h +++ b/src/include/daos/dtx.h @@ -62,6 +62,8 @@ enum dtx_mbs_flags { * shard index to sort the dtx_memberships::dm_tgts. Obsolete. */ DMF_SORTED_SAD_IDX = (1 << 3), + /* The dtx target information are organized as dtx_coll_target. */ + DMF_COLL_TARGET = (1 << 4), }; /** @@ -128,6 +130,64 @@ struct dtx_redundancy_group { uint32_t drg_ids[0]; }; +/* + * How many targets are recorded in dtx_memberships::dm_tgts for collective DTX. The first one is + * current leader, the others are for new leader candicates in order when leader switched. + * + * For most of cases, when DTX leader switch happens, DTX resync will commit or abort related DTX. + * After that, related DTX dtx_memberships will become useless any longer and discarded. So unless + * the new leader is dead and excluded during current DTX resync, one new leader candidate will be + * enough. We record three new leader candidates, that can resolve the leader election trouble for + * twice when leader switch during DTX resync. + */ +#define DTX_COLL_INLINE_TARGETS 4 + +/** + * A collective transaction may contains a lot of participants. If we store all of them one by one + * in the dtx_memberships (MBS) structure, then the MBS body will be very large. Transferring such + * large MBS on network is inconvenient and may have to via RDAM instead of directly packed inside + * related RPC body. + * + * To avoid such bad situation, collective DTX will use dtx_coll_target. Instead of recording all + * the DTX participants information in MBS, the dtx_coll_target will record the targets reside on + * current engine, that can be used for local DTX operation (commit, abort, check). + * + * Please note that collective DTX only can be used for single object based stand alone operation. + * If current user is the collective DTX leader, and wants to operate the collective DTX on other + * DAOS engines, then it needs to re-calculate related participants based on related object layout. + * For most of commit/abort cases, the collective DTX leader has already prepared the paraticipants + * information in DRAM before starting the DTX, it is unnecessary to re-calculate the paraticipants. + * The re-calculation DTX paraticipants will happen when resync or cleanup the collective DTX. Such + * two cases are relative rare, so even if the overhead for such re-calculation would be quite high, + * it will not affect the whole system too much. + * + * On the other hand, DTX refresh is frequently used DTX logic. Efficiently find out the DTX leader + * is crucial for that. Consider DTX leader switch, we will record several new leader candidates in + * the MBS in front of the collective targets information. Then for most of cases, DTX refresh does + * not need to re-calculation DTX paraticipants. 
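dss_build_coll_bitmap() replaces the old exclude-target array in dss_coll_args: it starts from an all-ones bitmap over dss_tgt_nr targets, clears the padding bits and the excluded targets, and dss_collective_reduce_internal() then skips every target whose bit is clear. A minimal caller-side sketch; failed_tgts/failed_nr stand in for whatever exclusion source the caller has:

/* Sketch only: turn an exclusion list into the target bitmap consumed by
 * dss_thread_collective_reduce().
 */
static int
run_collective_skip_failed(struct dss_coll_ops *ops, struct dss_coll_args *args,
			   int *failed_tgts, uint32_t failed_nr)
{
	int	rc;

	rc = dss_build_coll_bitmap(failed_tgts, failed_nr, &args->ca_tgt_bitmap,
				   &args->ca_tgt_bitmap_sz);
	if (rc != 0)
		return rc;

	rc = dss_thread_collective_reduce(ops, args, 0);

	D_FREE(args->ca_tgt_bitmap);
	return rc;
}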
+ */ +struct dtx_coll_target { + /* Fault domain level - used for generating related object layout. */ + uint32_t dct_fdom_lvl; + /* Performance domain affinity - used for generating related object layout. */ + uint32_t dct_pda; + /* Performance domain level - used for generating related object layout. */ + uint32_t dct_pdom_lvl; + /* The object layout version - used for generating related object layout. */ + uint16_t dct_layout_ver; + /* How many shards on current engine that participant in the collective DTX. */ + uint8_t dct_tgt_nr; + /* The size of dct_bitmap. */ + uint8_t dct_bitmap_sz; + /* + * The ID (pool_component::co_id) array for targets on current engine, used for DTX check. + * The bitmap for local object shards on current engine is appended after the ID array. The + * bitmap is used for DTX commit and abort. In fact, we can re-calculate such bitmap based + * on the taregets ID, but directly store the bitmap is more efficient since it is not big. + */ + uint32_t dct_tgts[0]; +}; + struct dtx_memberships { /* How many touched shards in the DTX. */ uint32_t dm_tgt_cnt; @@ -153,7 +213,8 @@ struct dtx_memberships { }; /* The first 'sizeof(struct dtx_daos_target) * dm_tgt_cnt' is the - * dtx_daos_target array. The subsequent are modification groups. + * dtx_daos_target array. The subsequent can be redundancy groups + * or dtx_coll_target, depends on dm_flags. */ union { char dm_data[0]; diff --git a/src/include/daos/object.h b/src/include/daos/object.h index 71d37facac05..60e7df575f8d 100644 --- a/src/include/daos/object.h +++ b/src/include/daos/object.h @@ -206,6 +206,84 @@ struct daos_shard_tgt { uint8_t st_flags; /* see daos_tgt_flags */ }; +struct daos_coll_shard { + uint16_t dcs_nr; + uint16_t dcs_cap; + uint32_t dcs_inline; + /* The shards (ID) in the buffer locate on the same VOS target. */ + uint32_t *dcs_buf; + + /* + * Index (in layout) of the first shard corresponding to "dcs_buf[0]" on this target, + * do not pack on-wire. + */ + uint32_t dcs_idx; +}; + +struct daos_coll_target { + uint32_t dct_rank; + /* + * The size (in byte) of dct_bitmap. It (s << 3) may be smaller than dss_tgt_nr if only + * some VOS targets are involved. It also maybe larger than dss_tgt_nr if dss_tgt_nr is + * not 2 ^ n aligned. + */ + uint8_t dct_bitmap_sz; + /* The max shard in dct_shards, it may be smaller than the sparse array length. */ + uint8_t dct_max_shard; + /* + * How many valid object shards reside on the engine. If the real count exceeds the + * max capacity of sizeof(uint8_t) can hold, just set as the max. That is no matter. + */ + uint8_t dct_tgt_nr; + /* + * The capacity for the dct_tgt_ids array. + * For non-modification case, it is always zero to avoid sending dct_tgt_ids on wire. + */ + uint8_t dct_tgt_cap; + /* Bitmap for the VOS targets (on the rank) that are involved in the operation. */ + uint8_t *dct_bitmap; + /* Sparse array for object shards' identifiers, sorted with VOS targets index. */ + struct daos_coll_shard *dct_shards; + /* + * It stores the identifiers of shards on the engine, in spite of on which VOS target, + * only for modification case. 
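With DMF_COLL_TARGET set, the payload behind dtx_memberships::dm_tgts is the dtx_daos_target array, followed by one dtx_coll_target, followed by dct_tgt_nr target IDs and then the local shard bitmap. A small sketch of walking that layout, mirroring the pointer arithmetic used by dc_obj_coll_punch_mbs() further below:

/* Sketch only: locate the collective target info and its local shard bitmap
 * inside a DMF_COLL_TARGET dtx_memberships payload.
 */
static void
coll_mbs_walk(struct dtx_memberships *mbs)
{
	struct dtx_daos_target	*ddt = &mbs->dm_tgts[0];
	struct dtx_coll_target	*dct;
	uint8_t			*bitmap;

	D_ASSERT(mbs->dm_flags & DMF_COLL_TARGET);

	/* The dtx_coll_target follows the dtx_daos_target array ... */
	dct = (struct dtx_coll_target *)(ddt + mbs->dm_tgt_cnt);
	/* ... and the shard bitmap follows the dct_tgt_nr target IDs. */
	bitmap = (uint8_t *)(dct->dct_tgts + dct->dct_tgt_nr);

	D_DEBUG(DB_TRACE, "leader target %u, %u local targets, bitmap %u bytes at %p\n",
		ddt[0].ddt_id, dct->dct_tgt_nr, dct->dct_bitmap_sz, bitmap);
}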
+ */ + uint32_t *dct_tgt_ids; +}; + +static inline void +daos_coll_shard_cleanup(struct daos_coll_shard *shards, uint32_t count) +{ + struct daos_coll_shard *shard; + int i; + + if (shards != NULL) { + for (i = 0; i < count; i++) { + shard = &shards[i]; + if (shard->dcs_buf != &shard->dcs_inline) + D_FREE(shard->dcs_buf); + } + D_FREE(shards); + } +} + +static inline void +daos_coll_target_cleanup(struct daos_coll_target *dcts, uint32_t count) +{ + struct daos_coll_target *dct; + int i; + + if (dcts != NULL) { + for (i = 0; i < count; i++) { + dct = &dcts[i]; + daos_coll_shard_cleanup(dct->dct_shards, dct->dct_max_shard + 1); + D_FREE(dct->dct_bitmap); + D_FREE(dct->dct_tgt_ids); + } + D_FREE(dcts); + } +} + static inline bool daos_oid_is_null(daos_obj_id_t oid) { diff --git a/src/include/daos/placement.h b/src/include/daos/placement.h index d48be639f8cb..72c3d4254d9e 100644 --- a/src/include/daos/placement.h +++ b/src/include/daos/placement.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2022 Intel Corporation. + * (C) Copyright 2016-2023 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -63,7 +63,9 @@ struct pl_obj_shard { uint32_t po_shard; /* shard identifier */ uint32_t po_target; /* target id */ uint32_t po_fseq; /* The latest failure sequence */ - uint32_t po_rebuilding:1, /* rebuilding status */ + uint16_t po_rank; /* The rank on which the shard exists */ + uint8_t po_index; /* The target index inside the node */ + uint8_t po_rebuilding:1, /* rebuilding status */ po_reintegrating:1; /* reintegrating status */ }; diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h index d13cb1d67338..bbbb848d4443 100644 --- a/src/include/daos_srv/container.h +++ b/src/include/daos_srv/container.h @@ -99,7 +99,8 @@ struct ds_cont_child { uint32_t sc_snapshots_nr; uint32_t sc_open; - uint64_t sc_dtx_committable_count; + uint32_t sc_dtx_committable_count; + uint32_t sc_dtx_committable_coll_count; /* The global minimum EC aggregation epoch, which will be upper * limit for VOS aggregation, i.e. EC object VOS aggregation can @@ -123,8 +124,10 @@ struct ds_cont_child { daos_handle_t sc_dtx_cos_hdl; /* The DTX COS-btree. */ struct btr_root sc_dtx_cos_btr; - /* The global list for committable DTXs. */ + /* The global list for committable non-collective DTXs. */ d_list_t sc_dtx_cos_list; + /* The global list for committable collective DTXs. */ + d_list_t sc_dtx_coll_list; /* the pool map version of updating DAOS_PROP_CO_STATUS prop */ uint32_t sc_status_pm_ver; /* flag of CONT_CAPA_READ_DATA/_WRITE_DATA disabled */ diff --git a/src/include/daos_srv/daos_engine.h b/src/include/daos_srv/daos_engine.h index be491483fbcf..1b715f91b188 100644 --- a/src/include/daos_srv/daos_engine.h +++ b/src/include/daos_srv/daos_engine.h @@ -512,6 +512,8 @@ enum dss_ult_flags { DSS_ULT_FL_PERIODIC = (1 << 0), /* Use DSS_DEEP_STACK_SZ as the stack size */ DSS_ULT_DEEP_STACK = (1 << 1), + /* Use current ULT (instead of creating new one) for the task. */ + DSS_USE_CURRENT_ULT = (1 << 2), }; int dss_ult_create(void (*func)(void *), void *arg, int xs_type, int tgt_id, @@ -581,8 +583,14 @@ struct dss_coll_args { /** Arguments for dss_collective func (Mandatory) */ void *ca_func_args; void *ca_aggregator; - int *ca_exclude_tgts; - unsigned int ca_exclude_tgts_cnt; + /* Specify on which targets to execute the task. */ + uint8_t *ca_tgt_bitmap; + /* + * The size (in byte) of ca_tgt_bitmap. It may be smaller than dss_tgt_nr if only some + * VOS targets are involved. 
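Each daos_coll_target describes one engine: dct_bitmap marks the involved VOS targets on that rank, and dct_shards is a sparse per-VOS-target array whose dcs_buf holds the shard IDs living there (with inline storage for the common single-shard case). The leader-selection loop in dc_obj_coll_punch() later in this patch walks the structure the same way; an isolated sketch:

/* Sketch only: visit every shard recorded in one daos_coll_target. */
static void
coll_target_walk(struct daos_coll_target *dct)
{
	struct daos_coll_shard	*dcs;
	uint32_t		 i;
	uint16_t		 j;

	for (i = 0; i <= dct->dct_max_shard; i++) {
		if (!isset(dct->dct_bitmap, i))
			continue;

		dcs = &dct->dct_shards[i];
		for (j = 0; j < dcs->dcs_nr; j++)
			D_DEBUG(DB_TRACE, "rank %u, VOS target %u, shard %u\n",
				dct->dct_rank, i, dcs->dcs_buf[j]);
	}
}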
It also may be larger than dss_tgt_nr if dss_tgt_nr is not + * 2 ^ n aligned. + */ + uint32_t ca_tgt_bitmap_sz; /** Stream arguments for all streams */ struct dss_coll_stream_args ca_stream_args; }; @@ -604,6 +612,8 @@ dss_thread_collective_reduce(struct dss_coll_ops *ops, unsigned int flags); int dss_task_collective(int (*func)(void *), void *arg, unsigned int flags); int dss_thread_collective(int (*func)(void *), void *arg, unsigned int flags); +int dss_build_coll_bitmap(int *exclude_tgts, uint32_t exclude_cnt, uint8_t **p_bitmap, + uint32_t *bitmap_sz); /** * Loaded module management metholds diff --git a/src/include/daos_srv/dtx_srv.h b/src/include/daos_srv/dtx_srv.h index ee78ffe3ec95..f1ee94ff2e4e 100644 --- a/src/include/daos_srv/dtx_srv.h +++ b/src/include/daos_srv/dtx_srv.h @@ -23,6 +23,7 @@ struct dtx_share_peer { daos_epoch_t dsp_epoch; uint64_t dsp_dkey_hash; int dsp_status; + uint32_t dsp_version; uint32_t dsp_inline_mbs:1; struct dtx_memberships *dsp_mbs; }; @@ -64,7 +65,6 @@ struct dtx_handle { dth_pinned:1, /* DTXs in CoS list are committed. */ dth_cos_done:1, - dth_resent:1, /* For resent case. */ /* Only one participator in the DTX. */ dth_solo:1, /* Do not keep committed entry. */ @@ -141,9 +141,21 @@ struct dtx_handle { struct dtx_sub_status { struct daos_shard_tgt dss_tgt; int dss_result; + uint32_t dss_version; uint32_t dss_comp:1; }; +struct dtx_coll_entry { + struct dtx_id dce_xid; + uint32_t dce_ver; + uint32_t dce_refs; + d_rank_list_t *dce_ranks; + uint8_t *dce_hints; + uint8_t *dce_bitmap; + uint32_t dce_hint_sz; + uint32_t dce_bitmap_sz; +}; + struct dtx_leader_handle; typedef int (*dtx_agg_cb_t)(struct dtx_leader_handle *dlh, int allow_failure); @@ -153,7 +165,10 @@ struct dtx_leader_handle { struct dtx_handle dlh_handle; /* result for the distribute transaction */ int dlh_result; - + /* The known latest pool map version from remote targets. */ + uint32_t dlh_rmt_ver; + /* For 64-bits alignment. */ + uint32_t dlh_padding; /* The array of the DTX COS entries */ uint32_t dlh_dti_cos_count; struct dtx_id *dlh_dti_cos; @@ -165,8 +180,14 @@ struct dtx_leader_handle { int32_t dlh_allow_failure; /* Normal sub requests have been processed. */ uint32_t dlh_normal_sub_done:1, + /* For collective DTX. */ + dlh_coll:1, + /* Only forward RPC, but neither commit nor abort DTX. */ + dlh_relay:1, /* Drop conditional flags when forward RPC. */ dlh_drop_cond:1; + /* Elements for collective DTX. */ + struct dtx_coll_entry *dlh_coll_entry; /* How many normal sub request. */ uint32_t dlh_normal_sub_cnt; /* How many delay forward sub request. */ @@ -180,7 +201,8 @@ struct dtx_leader_handle { }; struct dtx_stat { - uint64_t dtx_committable_count; + uint32_t dtx_committable_count; + uint32_t dtx_committable_coll_count; uint64_t dtx_oldest_committable_time; uint64_t dtx_oldest_active_time; /* The epoch for the oldest entry in the 1st committed blob. */ @@ -206,7 +228,7 @@ enum dtx_flags { DTX_FOR_MIGRATION = (1 << 3), /** Ignore other uncommitted DTXs. */ DTX_IGNORE_UNCOMMITTED = (1 << 4), - /** Resent request. */ + /** Resent request. Out-of-date. */ DTX_RESEND = (1 << 5), /** Force DTX refresh if hit non-committed DTX on non-leader. Out-of-date DAOS-7878. */ DTX_FORCE_REFRESH = (1 << 6), @@ -214,6 +236,10 @@ enum dtx_flags { DTX_PREPARED = (1 << 7), /** Do not keep committed entry. */ DTX_DROP_CMT = (1 << 8), + /* The non-leader targets are collective. */ + DTX_TGT_COLL = (1 << 9), + /* Not real DTX leader, Only forward IO to others, but neither commit nor abort DTX. 
*/ + DTX_RELAY = (1 << 10), }; void @@ -221,12 +247,11 @@ dtx_renew_epoch(struct dtx_epoch *epoch, struct dtx_handle *dth); int dtx_sub_init(struct dtx_handle *dth, daos_unit_oid_t *oid, uint64_t dkey_hash); int -dtx_leader_begin(daos_handle_t coh, struct dtx_id *dti, - struct dtx_epoch *epoch, uint16_t sub_modification_cnt, - uint32_t pm_ver, daos_unit_oid_t *leader_oid, - struct dtx_id *dti_cos, int dti_cos_cnt, - struct daos_shard_tgt *tgts, int tgt_cnt, uint32_t flags, - struct dtx_memberships *mbs, struct dtx_leader_handle **p_dlh); +dtx_leader_begin(daos_handle_t coh, struct dtx_id *dti, struct dtx_epoch *epoch, + uint16_t sub_modification_cnt, uint32_t pm_ver, daos_unit_oid_t *leader_oid, + struct dtx_id *dti_cos, int dti_cos_cnt, struct daos_shard_tgt *tgts, int tgt_cnt, + uint32_t flags, struct dtx_memberships *mbs, struct dtx_coll_entry *dce, + struct dtx_leader_handle **p_dlh); int dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int result); @@ -261,10 +286,19 @@ void dtx_cont_deregister(struct ds_cont_child *cont); int dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid, daos_epoch_t epoch); +int dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, + struct dtx_cos_key *dcks, int count); + int dtx_abort(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch); int dtx_refresh(struct dtx_handle *dth, struct ds_cont_child *cont); +int +dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck); + +int +dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoch_t epoch); + /** * Check whether the given DTX is resent one or not. * @@ -290,6 +324,24 @@ int dtx_refresh(struct dtx_handle *dth, struct ds_cont_child *cont); int dtx_handle_resend(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, uint32_t *pm_ver); +static inline struct dtx_coll_entry * +dtx_coll_entry_get(struct dtx_coll_entry *dce) +{ + dce->dce_refs++; + return dce; +} + +static inline void +dtx_coll_entry_put(struct dtx_coll_entry *dce) +{ + if (dce != NULL && --(dce->dce_refs) == 0) { + d_rank_list_free(dce->dce_ranks); + D_FREE(dce->dce_bitmap); + D_FREE(dce->dce_hints); + D_FREE(dce); + } +} + static inline void dtx_dsp_free(struct dtx_share_peer *dsp) { @@ -306,7 +358,12 @@ dtx_entry_get(struct dtx_entry *dte) return dte; } -void dtx_entry_put(struct dtx_entry *dte); +static inline void +dtx_entry_put(struct dtx_entry *dte) +{ + if (--(dte->dte_refs) == 0) + D_FREE(dte); +} static inline bool dtx_is_valid_handle(const struct dtx_handle *dth) diff --git a/src/include/daos_srv/vos.h b/src/include/daos_srv/vos.h index 9d6035f66aa3..390f4f4e5ec5 100644 --- a/src/include/daos_srv/vos.h +++ b/src/include/daos_srv/vos.h @@ -103,12 +103,16 @@ vos_dtx_check(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, * * \param coh [IN] Container open handle. * \param dti [IN] Pointer to the DTX identifier. + * \param oid [OUT] Pointer to the ID for the DTX leader object shard. * \param mbs [OUT] Pointer to the DTX participants information. * - * \return Zero on success, negative value if error. + * \return Zero on success. + * Positive if DTX has been committed. + * Negative value if error. */ int -vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, struct dtx_memberships **mbs); +vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t *oid, + struct dtx_memberships **mbs); /** * Commit the specified DTXs. 
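Note the new contract documented above: vos_dtx_load_mbs() now also reports the leader object ID and returns a positive value when the DTX has already been committed, so callers must distinguish "already committed", "entry gone" and real errors, as the resync and refresh changes earlier in this patch do. A compact sketch of that handling; freeing the returned mbs with D_FREE() follows the existing callers:

/* Sketch only: the three-way outcome of the updated vos_dtx_load_mbs(). */
static int
load_mbs_example(daos_handle_t coh, struct dtx_id *xid)
{
	struct dtx_memberships	*mbs = NULL;
	daos_unit_oid_t		 oid = { 0 };
	int			 rc;

	rc = vos_dtx_load_mbs(coh, xid, &oid, &mbs);
	if (rc > 0)			/* Already committed, nothing left to resolve. */
		return 0;
	if (rc == -DER_NONEXIST)	/* The DTX entry is already gone. */
		return 0;
	if (rc < 0)			/* Real failure. */
		return rc;

	/* rc == 0: mbs (and oid) are valid and owned by the caller. */
	D_FREE(mbs);
	return 0;
}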
diff --git a/src/include/daos_srv/vos_types.h b/src/include/daos_srv/vos_types.h index 4e80bacae4e5..9c9999ba97cf 100644 --- a/src/include/daos_srv/vos_types.h +++ b/src/include/daos_srv/vos_types.h @@ -26,15 +26,6 @@ struct dtx_rsrvd_uint { d_list_t dru_nvme; }; -enum dtx_cos_flags { - DCF_SHARED = (1 << 0), - /* Some DTX (such as for the distributed transaction across multiple - * RDGs, or for EC object modification) need to be committed via DTX - * RPC instead of piggyback via other dispatched update/punch RPC. - */ - DCF_EXP_CMT = (1 << 1), -}; - enum dtx_stat_flags { /* Skip bad DTX entries (such as corruptted ones) when stat. */ DSF_SKIP_BAD = (1 << 1), diff --git a/src/object/cli_mod.c b/src/object/cli_mod.c index 79c13fee9489..a66979dd3ce2 100644 --- a/src/object/cli_mod.c +++ b/src/object/cli_mod.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2022 Intel Corporation. + * (C) Copyright 2016-2023 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -16,6 +16,9 @@ #include "obj_rpc.h" #include "obj_internal.h" +#define OBJ_COLL_PUNCH_THD_MIN 31 + +unsigned int obj_coll_punch_thd; unsigned int srv_io_mode = DIM_DTX_FULL_ENABLED; int dc_obj_proto_version; @@ -68,6 +71,16 @@ dc_obj_init(void) D_GOTO(out_class, rc); } + obj_coll_punch_thd = OBJ_COLL_PUNCH_THD_MIN; + d_getenv_int("DAOS_OBJ_COLL_PUNCH_THD", &obj_coll_punch_thd); + if (obj_coll_punch_thd < OBJ_COLL_PUNCH_THD_MIN) { + D_WARN("Invalid collective punch threshold %u, it cannot be smaller than %u, " + "use the default value %u\n", obj_coll_punch_thd, + OBJ_COLL_PUNCH_THD_MIN, OBJ_COLL_PUNCH_THD_MIN); + obj_coll_punch_thd = OBJ_COLL_PUNCH_THD_MIN; + } + D_INFO("Set object collective punch threshold as %u\n", obj_coll_punch_thd); + tx_verify_rdg = false; d_getenv_bool("DAOS_TX_VERIFY_RDG", &tx_verify_rdg); D_INFO("%s TX redundancy group verification\n", tx_verify_rdg ? "Enable" : "Disable"); diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index 710b94d18a7d..088e87067c47 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -356,6 +356,8 @@ obj_layout_create(struct dc_object *obj, unsigned int mode, bool refresh) obj_shard->do_fseq = layout->ol_shards[i].po_fseq; obj_shard->do_rebuilding = layout->ol_shards[i].po_rebuilding; obj_shard->do_reintegrating = layout->ol_shards[i].po_reintegrating; + obj_shard->do_target_rank = layout->ol_shards[i].po_rank; + obj_shard->do_target_idx = layout->ol_shards[i].po_index; } out: if (layout) @@ -2344,6 +2346,98 @@ check_query_flags(daos_obj_id_t oid, uint32_t flags, daos_key_t *dkey, return 0; } +static int +obj_coll_oper_args_init(struct coll_oper_args *coa, struct dc_object *obj, bool for_modify) +{ + struct dc_pool *pool = obj->cob_pool; + uint32_t node_nr; + int rc = 0; + + D_ASSERT(pool != NULL); + D_ASSERT(coa->coa_dcts == NULL); + + D_RWLOCK_RDLOCK(&pool->dp_map_lock); + node_nr = pool_map_node_nr(pool->dp_map); + D_RWLOCK_UNLOCK(&pool->dp_map_lock); + + D_ALLOC_ARRAY(coa->coa_dcts, node_nr); + if (coa->coa_dcts == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + /* + * Set coa_dct_nr as -1 to indicate that the coa_dcts array may be sparse until + * obj_coll_oper_args_collapse(). That is useful for obj_coll_oper_args_fini(). + */ + coa->coa_dct_nr = -1; + coa->coa_dct_cap = node_nr; + coa->coa_max_dct_sz = 0; + coa->coa_max_shard_nr = 0; + coa->coa_max_bitmap_sz = 0; + coa->coa_target_nr = 0; + coa->coa_for_modify = for_modify ? 
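The collective punch threshold can be tuned per client through the DAOS_OBJ_COLL_PUNCH_THD environment variable read above; values below the built-in minimum fall back to 31. A sketch of overriding it programmatically before library initialization (setting it in the launching shell works equally well); the value 64 is only an example:

/* Sketch only: must be set before daos_init(), since dc_obj_init() reads the
 * variable via d_getenv_int() during initialization.
 */
static int
client_init_with_coll_punch_thd(void)
{
	setenv("DAOS_OBJ_COLL_PUNCH_THD", "64", 1);
	return daos_init();
}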
1 : 0; + +out: + return rc; +} + +static void +obj_coll_oper_args_fini(struct coll_oper_args *coa) +{ + daos_coll_target_cleanup(coa->coa_dcts, + coa->coa_dct_nr < 0 ? coa->coa_dct_cap : coa->coa_dct_nr); + coa->coa_dcts = NULL; + coa->coa_dct_cap = 0; + coa->coa_dct_nr = 0; +} + +static int +obj_coll_oper_args_collapse(struct coll_oper_args *coa, uint32_t *size) +{ + struct daos_coll_target *dct; + struct daos_coll_shard *dcs; + uint32_t dct_size; + int rc = 0; + int i; + int j; + + for (i = 0, *size = 0, coa->coa_dct_nr = 0; i < coa->coa_dct_cap; i++) { + dct = &coa->coa_dcts[i]; + if (dct->dct_bitmap != NULL) { + /* The size may be over estimated, no matter. */ + dct_size = sizeof(*dct) + dct->dct_bitmap_sz + + sizeof(dct->dct_shards[0]) * (dct->dct_max_shard + 1); + + for (j = 0; j <= dct->dct_max_shard; j++) { + dcs = &dct->dct_shards[j]; + if (dcs->dcs_nr > 1) + dct_size += sizeof(dcs->dcs_buf[0]) * dcs->dcs_nr; + } + + if (coa->coa_for_modify) + dct_size += sizeof(dct->dct_tgt_ids[0]) * dct->dct_tgt_nr; + + if (coa->coa_max_dct_sz < dct_size) + coa->coa_max_dct_sz = dct_size; + + if (coa->coa_dct_nr < i) + memcpy(&coa->coa_dcts[coa->coa_dct_nr], dct, sizeof(*dct)); + + coa->coa_dct_nr++; + *size += dct_size; + } + } + + if (unlikely(coa->coa_dct_nr == 0)) + /* If all shards are NONEXIST, then need not to send RPC(s). */ + rc = 1; + else if (coa->coa_dct_cap > coa->coa_dct_nr) + /* Reset the other dct slots to avoid double free during cleanup. */ + memset(&coa->coa_dcts[coa->coa_dct_nr], 0, + sizeof(*dct) * (coa->coa_dct_cap - coa->coa_dct_nr)); + + return rc; +} + static inline bool obj_key_valid(daos_obj_id_t oid, daos_key_t *key, bool check_dkey) { @@ -2843,6 +2937,7 @@ obj_embedded_shard_arg(struct obj_auxi_args *obj_auxi) case DAOS_OBJ_RPC_SYNC: return &obj_auxi->s_args.sa_auxi; case DAOS_OBJ_RPC_QUERY_KEY: + case DAOS_OBJ_RPC_COLL_PUNCH: /* * called from obj_comp_cb_internal() and * checked in obj_shard_comp_cb() correctly @@ -4823,6 +4918,9 @@ obj_comp_cb(tse_task_t *task, void *data) } } + if (obj_auxi->opc == DAOS_OBJ_RPC_COLL_PUNCH) + obj_coll_oper_args_fini(&obj_auxi->p_args.pa_coa); + if ((!obj_auxi->no_retry || task->dt_result == -DER_FETCH_AGAIN) && (pm_stale || obj_auxi->io_retry)) { rc = obj_retry_cb(task, obj, obj_auxi, pm_stale, &io_task_reinited); @@ -4868,6 +4966,7 @@ obj_comp_cb(tse_task_t *task, void *data) dc_tx_attach(obj_auxi->th, obj, DAOS_OBJ_RPC_FETCH, task, 0, false); break; } + case DAOS_OBJ_RPC_COLL_PUNCH: case DAOS_OBJ_RPC_PUNCH: case DAOS_OBJ_RPC_PUNCH_DKEYS: case DAOS_OBJ_RPC_PUNCH_AKEYS: @@ -6641,18 +6740,9 @@ shard_punch_prep(struct shard_auxi_args *shard_auxi, struct dc_object *obj, struct obj_auxi_args *obj_auxi, uint32_t grp_idx) { struct shard_punch_args *shard_arg; - uuid_t coh_uuid; - uuid_t cont_uuid; - int rc; - - rc = dc_cont2uuid(obj->cob_co, &coh_uuid, &cont_uuid); - if (rc != 0) - return rc; shard_arg = container_of(shard_auxi, struct shard_punch_args, pa_auxi); - shard_arg->pa_opc = obj_auxi->opc; - uuid_copy(shard_arg->pa_coh_uuid, coh_uuid); - uuid_copy(shard_arg->pa_cont_uuid, cont_uuid); + shard_arg->pa_opc = obj_auxi->opc; if (daos_handle_is_inval(obj_auxi->th)) daos_dti_gen(&shard_arg->pa_dti, @@ -6663,6 +6753,434 @@ shard_punch_prep(struct shard_auxi_args *shard_auxi, struct dc_object *obj, return 0; } +static int +obj_coll_prep_one(struct coll_oper_args *coa, struct dc_object *obj, + uint32_t map_ver, uint32_t idx) +{ + struct dc_obj_shard *shard = NULL; + struct daos_coll_target *dct; + struct daos_coll_shard *dcs; + uint32_t 
*tmp; + uint8_t *new_bm; + int size; + int rc = 0; + int i; + + rc = obj_shard_open(obj, idx, map_ver, &shard); + if (rc == -DER_NONEXIST) + D_GOTO(out, rc = 0); + + if (rc != 0 || (shard->do_rebuilding && !coa->coa_for_modify)) + goto out; + + /* More ranks joined after obj_coll_oper_args_init(). */ + if (unlikely(shard->do_target_rank >= coa->coa_dct_cap)) { + D_REALLOC_ARRAY(dct, coa->coa_dcts, coa->coa_dct_cap, shard->do_target_rank + 2); + if (dct == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + coa->coa_dcts = dct; + coa->coa_dct_cap = shard->do_target_rank + 2; + } + + dct = &coa->coa_dcts[shard->do_target_rank]; + dct->dct_rank = shard->do_target_rank; + + if (shard->do_target_idx >= dct->dct_bitmap_sz << 3) { + size = (shard->do_target_idx >> 3) + 1; + + D_ALLOC_ARRAY(dcs, size << 3); + if (dcs == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + if (dct->dct_shards != NULL) { + memcpy(dcs, dct->dct_shards, sizeof(*dcs) * (dct->dct_max_shard + 1)); + for (i = 0; i <= dct->dct_max_shard; i++) { + if (dcs[i].dcs_nr == 1) + dcs[i].dcs_buf = &dcs[i].dcs_inline; + } + D_FREE(dct->dct_shards); + } + dct->dct_shards = dcs; + + D_REALLOC(new_bm, dct->dct_bitmap, dct->dct_bitmap_sz, size); + if (new_bm == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + dct->dct_bitmap = new_bm; + dct->dct_bitmap_sz = size; + } + + dcs = &dct->dct_shards[shard->do_target_idx]; + + if (unlikely(isset(dct->dct_bitmap, shard->do_target_idx))) { + /* More than one shards reside on the same VOS target. */ + D_ASSERT(dcs->dcs_nr >= 1); + + if (dcs->dcs_nr >= dcs->dcs_cap) { + D_ALLOC_ARRAY(tmp, dcs->dcs_nr << 1); + if (tmp == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + memcpy(tmp, dcs->dcs_buf, sizeof(*tmp) * dcs->dcs_nr); + if (dcs->dcs_buf != &dcs->dcs_inline) + D_FREE(dcs->dcs_buf); + dcs->dcs_buf = tmp; + dcs->dcs_cap = dcs->dcs_nr << 1; + } + } else { + D_ASSERT(dcs->dcs_nr == 0); + + dcs->dcs_idx = idx; + dcs->dcs_buf = &dcs->dcs_inline; + setbit(dct->dct_bitmap, shard->do_target_idx); + if (dct->dct_max_shard < shard->do_target_idx) + dct->dct_max_shard = shard->do_target_idx; + } + + dcs->dcs_buf[dcs->dcs_nr++] = shard->do_id.id_shard; + + if (unlikely(dct->dct_tgt_nr == (uint8_t)(-1))) + goto out; + + if (coa->coa_for_modify) { + if (dct->dct_tgt_nr >= dct->dct_tgt_cap) { + if (dct->dct_tgt_cap == 0) + size = 4; + else if (dct->dct_tgt_cap <= 8) + size = dct->dct_tgt_cap << 1; + else + size = dct->dct_tgt_cap + 8; + + D_REALLOC_ARRAY(tmp, dct->dct_tgt_ids, dct->dct_tgt_cap, size); + if (tmp == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + dct->dct_tgt_ids = tmp; + dct->dct_tgt_cap = size; + } + + /* + * There may be repeated elements in the dct->dct_tgt_ids array because multiple + * object shards reside on the same VOS target. It is no matter to store them in + * DTX MBS. Related DTX check logic will handle that. + */ + dct->dct_tgt_ids[dct->dct_tgt_nr++] = shard->do_target_id; + if (coa->coa_max_shard_nr < dct->dct_tgt_nr) + coa->coa_max_shard_nr = dct->dct_tgt_nr; + + if (coa->coa_target_nr < DTX_COLL_INLINE_TARGETS && + !shard->do_rebuilding && !shard->do_reintegrating) + coa->coa_targets[coa->coa_target_nr++] = shard->do_target_id; + + if (coa->coa_max_bitmap_sz < dct->dct_bitmap_sz) + coa->coa_max_bitmap_sz = dct->dct_bitmap_sz; + } else { + /* "dct_tgt_cap" is zero, then will not send dct_tgt_ids to server. 
*/ + dct->dct_tgt_nr++; + } + +out: + if (shard != NULL) + obj_shard_close(shard); + + return rc; +} + +struct obj_coll_punch_cb_args { + unsigned char *cpca_buf; + struct dtx_memberships *cpca_mbs; + struct dc_obj_shard *cpca_shard; + crt_bulk_t *cpca_bulks; + crt_proc_t cpca_proc; + d_sg_list_t cpca_sgl; + d_iov_t cpca_iov; +}; + +static int +dc_obj_coll_punch_cb(tse_task_t *task, void *data) +{ + struct obj_coll_punch_cb_args *cpca = data; + + if (cpca->cpca_bulks != NULL) { + if (cpca->cpca_bulks[0] != CRT_BULK_NULL) + crt_bulk_free(cpca->cpca_bulks[0]); + D_FREE(cpca->cpca_bulks); + } + + if (cpca->cpca_proc != NULL) + crt_proc_destroy(cpca->cpca_proc); + + D_FREE(cpca->cpca_mbs); + D_FREE(cpca->cpca_buf); + obj_shard_close(cpca->cpca_shard); + + return 0; +} + +static int +dc_obj_coll_punch_mbs(struct coll_oper_args *coa, struct dc_object *obj, uint32_t leader_id, + struct dtx_memberships **p_mbs) +{ + struct dtx_memberships *mbs; + struct dtx_daos_target *ddt; + struct dtx_coll_target *dct; + int rc = 0; + int i; + int j; + + D_ALLOC(mbs, sizeof(*mbs) + sizeof(*ddt) * coa->coa_target_nr + sizeof(*dct)); + if (mbs == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + /* + * For object collective punch, even if we lost some redundancy groups when DTX resync, + * we still continue to punch remaining shards. So let's set dm_grp_cnt as 1 to bypass + * redundancy group check. + */ + mbs->dm_grp_cnt = 1; + mbs->dm_tgt_cnt = coa->coa_target_nr; + mbs->dm_data_size = sizeof(*ddt) * coa->coa_target_nr + sizeof(*dct); + mbs->dm_flags = DMF_CONTAIN_LEADER | DMF_COLL_TARGET; + + /* ddt[0] will be the lead target. */ + ddt = &mbs->dm_tgts[0]; + ddt[0].ddt_id = leader_id; + + for (i = 0, j = 1; i < coa->coa_target_nr && j < coa->coa_target_nr; i++) { + if (coa->coa_targets[i] != ddt[0].ddt_id) + ddt[j++].ddt_id = coa->coa_targets[i]; + } + + dct = (struct dtx_coll_target *)(ddt + coa->coa_target_nr); + dct->dct_fdom_lvl = obj->cob_md.omd_fdom_lvl; + dct->dct_pda = obj->cob_md.omd_pda; + dct->dct_pdom_lvl = obj->cob_md.omd_pdom_lvl; + dct->dct_layout_ver = obj->cob_layout_version; + + /* The other fields will not be packed on-wire. Related engine will fill them in future. */ + + *p_mbs = mbs; + +out: + return rc; +} + +static int +dc_obj_coll_punch_bulk(tse_task_t *task, struct coll_oper_args *coa, + struct obj_coll_punch_cb_args *cpca, uint32_t *p_size) +{ + /* The proc function may pack more information inside the buffer, enlarge the size a bit. 
*/ + uint32_t size = (*p_size * 9) >> 3; + uint32_t used = 0; + int rc = 0; + int i; + +again: + D_ALLOC(cpca->cpca_buf, size); + if (cpca->cpca_buf == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + rc = crt_proc_create(daos_task2ctx(task), cpca->cpca_buf, size, CRT_PROC_ENCODE, + &cpca->cpca_proc); + if (rc != 0) + goto out; + + for (i = 0; i < coa->coa_dct_nr; i++) { + rc = crt_proc_struct_daos_coll_target(cpca->cpca_proc, CRT_PROC_ENCODE, + &coa->coa_dcts[i]); + if (rc != 0) + goto out; + } + + used = crp_proc_get_size_used(cpca->cpca_proc); + if (unlikely(used > size)) { + crt_proc_destroy(cpca->cpca_proc); + cpca->cpca_proc = NULL; + D_FREE(cpca->cpca_buf); + size = used; + goto again; + } + + cpca->cpca_iov.iov_buf = cpca->cpca_buf; + cpca->cpca_iov.iov_buf_len = used; + cpca->cpca_iov.iov_len = used; + + cpca->cpca_sgl.sg_nr = 1; + cpca->cpca_sgl.sg_nr_out = 1; + cpca->cpca_sgl.sg_iovs = &cpca->cpca_iov; + + rc = obj_bulk_prep(&cpca->cpca_sgl, 1, false, CRT_BULK_RO, task, &cpca->cpca_bulks); + +out: + if (rc != 0) { + if (cpca->cpca_proc != NULL) { + crt_proc_destroy(cpca->cpca_proc); + cpca->cpca_proc = NULL; + } + D_FREE(cpca->cpca_buf); + } else { + *p_size = used; + } + + return rc; +} + +static int +dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epoch, + uint32_t map_ver, daos_obj_punch_t *args, struct obj_auxi_args *auxi) +{ + struct shard_punch_args *spa = &auxi->p_args; + struct coll_oper_args *coa = &spa->pa_coa; + struct dc_obj_shard *shard = NULL; + struct dtx_memberships *mbs = NULL; + struct daos_coll_target *dct; + struct daos_coll_target tmp_tgt; + struct obj_coll_punch_cb_args cpca = { 0 }; + uint32_t tgt_size = 0; + uint32_t mbs_max_size; + uint32_t inline_size; + uint32_t flags = ORF_LEADER; + uint32_t leader; + uint32_t len; + int rc; + int i; + + rc = obj_coll_oper_args_init(coa, obj, true); + if (rc != 0) + goto out; + + for (i = 0; i < obj->cob_shards_nr; i++) { + rc = obj_coll_prep_one(coa, obj, map_ver, i); + if (rc != 0) + goto out; + } + + rc = obj_coll_oper_args_collapse(coa, &tgt_size); + if (rc != 0) + goto out; + + if (auxi->io_retry) { + /* Try to reuse the same leader. */ + rc = obj_shard_open(obj, spa->pa_auxi.shard, map_ver, &shard); + if (rc == 0) { + if (!shard->do_rebuilding && !shard->do_reintegrating) { + leader = shard->do_target_rank; + goto gen_mbs; + } + + obj_shard_close(shard); + shard = NULL; + } else if (rc != -DER_NONEXIST) { + goto out; + } + + /* Then change to new leader for retry. */ + } + + /* Randomly select a rank as the leader. */ + leader = d_rand() % coa->coa_dct_nr; + +new_leader: + dct = &coa->coa_dcts[leader]; + len = dct->dct_bitmap_sz << 3; + + for (i = 0; i < len; i++) { + if (isset(dct->dct_bitmap, i)) { + rc = obj_shard_open(obj, dct->dct_shards[i].dcs_idx, map_ver, &shard); + D_ASSERT(rc == 0); + + if (!shard->do_rebuilding && !shard->do_reintegrating) + goto gen_mbs; + + obj_shard_close(shard); + shard = NULL; + } + } + + /* Try another for leader. 
*/ + leader = (leader + 1) % coa->coa_dct_nr; + goto new_leader; + +gen_mbs: + if (leader != 0) { + memcpy(&tmp_tgt, &coa->coa_dcts[0], sizeof(tmp_tgt)); + memcpy(&coa->coa_dcts[0], &coa->coa_dcts[leader], sizeof(tmp_tgt)); + memcpy(&coa->coa_dcts[leader], &tmp_tgt, sizeof(tmp_tgt)); + } + + rc = dc_obj_coll_punch_mbs(coa, obj, shard->do_target_id, &mbs); + if (rc < 0) + goto out; + + inline_size = sizeof(*mbs) + mbs->dm_data_size + sizeof(struct obj_coll_punch_in); + D_ASSERTF(inline_size < DAOS_BULK_LIMIT, + "Too much data to be held inside coll punch RPC body: %u vs %u\n", + inline_size, DAOS_BULK_LIMIT); + + if (inline_size + tgt_size >= DAOS_BULK_LIMIT) { + rc = dc_obj_coll_punch_bulk(task, coa, &cpca, &tgt_size); + if (rc != 0) + goto out; + } + + cpca.cpca_shard = shard; + cpca.cpca_mbs = mbs; + rc = tse_task_register_comp_cb(task, dc_obj_coll_punch_cb, &cpca, sizeof(cpca)); + if (rc != 0) + goto out; + + if (auxi->io_retry) { + flags |= ORF_RESEND; + /* Reset @enqueue_id if resend to new leader. */ + if (spa->pa_auxi.target != shard->do_target_id) + spa->pa_auxi.enqueue_id = 0; + } else { + spa->pa_auxi.obj_auxi = auxi; + daos_dti_gen(&spa->pa_dti, false); + } + + spa->pa_auxi.target = shard->do_target_id; + spa->pa_auxi.shard = shard->do_shard_idx; + + if (obj_is_ec(obj)) + flags |= ORF_EC; + + mbs_max_size = sizeof(*mbs) + mbs->dm_data_size + + sizeof(coa->coa_targets[0]) * coa->coa_max_shard_nr + coa->coa_max_bitmap_sz; + + return dc_obj_shard_coll_punch(shard, spa, mbs, mbs_max_size, cpca.cpca_bulks, tgt_size, + coa->coa_dcts, coa->coa_dct_nr, coa->coa_max_dct_sz, epoch, + args->flags, flags, map_ver, &auxi->map_ver_reply, task); + +out: + if (rc > 0) + rc = 0; + + DL_CDEBUG(rc == 0, DB_IO, DLOG_ERR, rc, + "DAOS_OBJ_RPC_COLL_PUNCH for "DF_OID" map_ver %u, task %p", + DP_OID(obj->cob_md.omd_id), map_ver, task); + + if (cpca.cpca_bulks != NULL) { + if (cpca.cpca_bulks[0] != CRT_BULK_NULL) + crt_bulk_free(cpca.cpca_bulks[0]); + D_FREE(cpca.cpca_bulks); + } + + if (cpca.cpca_proc != NULL) + crt_proc_destroy(cpca.cpca_proc); + D_FREE(cpca.cpca_buf); + + if (shard != NULL) + obj_shard_close(shard); + D_FREE(mbs); + + /* obj_coll_oper_args_fini() will be triggered via complete callback. */ + obj_task_complete(task, rc); + + return rc; +} + static int dc_obj_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epoch, uint32_t map_ver, enum obj_rpc_opc opc, daos_obj_punch_t *api_args) @@ -6673,13 +7191,6 @@ dc_obj_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epoch, uint32_t grp_cnt; int rc; - if (opc == DAOS_OBJ_RPC_PUNCH && obj->cob_grp_nr > 1) - /* The object have multiple redundancy groups, use DAOS - * internal transaction to handle that to guarantee the - * atomicity of punch object. - */ - return dc_tx_convert(obj, opc, task); - rc = obj_task_init(task, opc, map_ver, api_args->th, &obj_auxi, obj); if (rc != 0) { obj_decref(obj); @@ -6693,6 +7204,46 @@ dc_obj_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epoch, if (opc == DAOS_OBJ_RPC_PUNCH) { obj_ptr2shards(obj, &shard, &shard_cnt, &grp_cnt); + + if (grp_cnt > 1) { + /* + * We support object collective punch since release-2.6 (version 10). + * The conditions to trigger object collective punch are: + * + * 1. The shards count reaches the threshold for collective punch (31 + * by default). Collectively punch object will distribute the RPCs + * load among more engines even if the total RPCs count may be not + * decreased too much. Or + * + * 2. 
The shards count is twice (or even more) of the engines count. + * Means that there are some shards reside on the same engine(s). + * Collectively punch object will save some RPCs. + * + * If the object has multiple redundancy groups, but cannot match any + * above condition, then we will use internal distributed transaction. + */ + if (dc_obj_proto_version < 10) + D_GOTO(out_task, rc = -DER_NEED_TX); + + if (shard_cnt < 4) + D_GOTO(out_task, rc = -DER_NEED_TX); + + if (shard_cnt < obj_coll_punch_thd) { + struct dc_pool *pool = obj->cob_pool; + + D_RWLOCK_RDLOCK(&pool->dp_map_lock); + if (shard_cnt < pool_map_node_nr(pool->dp_map) << 1) + rc = -DER_NEED_TX; + D_RWLOCK_UNLOCK(&pool->dp_map_lock); + + if (rc != 0) + goto out_task; + } + + obj_auxi->opc = DAOS_OBJ_RPC_COLL_PUNCH; + + return dc_obj_coll_punch(task, obj, epoch, map_ver, api_args, obj_auxi); + } } else { grp_cnt = 1; obj_auxi->dkey_hash = obj_dkey2hash(obj->cob_md.omd_id, api_args->dkey); diff --git a/src/object/cli_shard.c b/src/object/cli_shard.c index 2dd9ef9ac398..696aedb212df 100644 --- a/src/object/cli_shard.c +++ b/src/object/cli_shard.c @@ -61,29 +61,25 @@ obj_shard_addref(struct dc_obj_shard *shard) D_SPIN_UNLOCK(&shard->do_obj->cob_spin); } +static inline void +obj_shard_addref_locked(struct dc_obj_shard *shard) +{ + shard->do_ref++; +} + int dc_obj_shard_open(struct dc_object *obj, daos_unit_oid_t oid, unsigned int mode, struct dc_obj_shard *shard) { - struct pool_target *map_tgt; - int rc; - D_ASSERT(obj != NULL && shard != NULL); D_ASSERT(shard->do_obj == NULL); - rc = dc_pool_tgt_idx2ptr(obj->cob_pool, shard->do_target_id, - &map_tgt); - if (rc) - return rc; - shard->do_id = oid; - shard->do_target_rank = map_tgt->ta_comp.co_rank; - shard->do_target_idx = map_tgt->ta_comp.co_index; shard->do_obj = obj; shard->do_co = obj->cob_co; - obj_shard_addref(shard); /* release this until obj_layout_free */ D_SPIN_LOCK(&obj->cob_spin); + obj_shard_addref_locked(shard); /* release this until obj_layout_free */ obj->cob_shards->do_open_count++; D_SPIN_UNLOCK(&obj->cob_spin); @@ -1288,8 +1284,8 @@ dc_obj_shard_punch(struct dc_obj_shard *shard, enum obj_rpc_opc opc, opi->opi_shard_tgts.ca_arrays = NULL; } uuid_copy(opi->opi_pool_uuid, pool->dp_pool); - uuid_copy(opi->opi_co_hdl, args->pa_coh_uuid); - uuid_copy(opi->opi_co_uuid, args->pa_cont_uuid); + uuid_copy(opi->opi_co_hdl, shard->do_co->dc_cont_hdl); + uuid_copy(opi->opi_co_uuid, shard->do_co->dc_uuid); daos_dti_copy(&opi->opi_dti, &args->pa_dti); opi->opi_flags = args->pa_auxi.flags; opi->opi_dti_cos.ca_count = 0; @@ -1307,6 +1303,142 @@ dc_obj_shard_punch(struct dc_obj_shard *shard, enum obj_rpc_opc opc, return rc; } +struct shard_coll_punch_cb_args { + crt_rpc_t *cpca_rpc; + uint32_t *cpca_ver; + struct shard_punch_args *cpca_shard_args; +}; + +static int +obj_shard_coll_punch_cb(tse_task_t *task, void *data) +{ + struct shard_coll_punch_cb_args *cb_args = data; + crt_rpc_t *rpc = cb_args->cpca_rpc; + struct obj_coll_punch_in *ocpi = crt_req_get(rpc); + + if (task->dt_result == 0) { + task->dt_result = obj_reply_get_status(rpc); + *cb_args->cpca_ver = obj_reply_map_version_get(rpc); + } + + if (task->dt_result == -DER_OVERLOAD_RETRY) { + struct obj_coll_punch_out *ocpo = crt_reply_get(rpc); + struct shard_punch_args *shard_args = cb_args->cpca_shard_args; + uint32_t timeout = 0; + + if (shard_args->pa_auxi.enqueue_id == 0) + shard_args->pa_auxi.enqueue_id = ocpo->ocpo_comm_out.req_out_enqueue_id; + crt_req_get_timeout(rpc, &timeout); + if (timeout > 
shard_args->pa_auxi.obj_auxi->max_delay) + shard_args->pa_auxi.obj_auxi->max_delay = timeout; + } + + DL_CDEBUG(task->dt_result < 0, DLOG_ERR, DB_IO, task->dt_result, + "DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" with DTX " + DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x", rpc, DP_UOID(ocpi->ocpi_oid), + DP_DTI(&ocpi->ocpi_xid), task, ocpi->ocpi_map_ver, *cb_args->cpca_ver, + (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags); + + crt_req_decref(rpc); + + return task->dt_result; +} + +int +dc_obj_shard_coll_punch(struct dc_obj_shard *shard, struct shard_punch_args *args, + struct dtx_memberships *mbs, uint32_t mbs_max_size, crt_bulk_t *bulks, + uint32_t bulk_sz, struct daos_coll_target *tgts, uint32_t tgt_nr, + uint32_t max_tgt_size, struct dtx_epoch *epoch, uint64_t api_flags, + uint32_t rpc_flags, uint32_t map_ver, uint32_t *rep_ver, tse_task_t *task) +{ + struct dc_pool *pool = obj_shard_ptr2pool(shard); + crt_rpc_t *req = NULL; + struct obj_coll_punch_in *ocpi = NULL; + struct shard_coll_punch_cb_args cb_args = { 0 }; + crt_endpoint_t tgt_ep = { 0 }; + int rc = 0; + + D_ASSERT(pool != NULL); + + tgt_ep.ep_grp = pool->dp_sys->sy_group; + tgt_ep.ep_rank = shard->do_target_rank; + tgt_ep.ep_tag = shard->do_target_idx; + + rc = obj_req_create(daos_task2ctx(task), &tgt_ep, DAOS_OBJ_RPC_COLL_PUNCH, &req); + if (rc != 0) + goto out; + + ocpi = crt_req_get(req); + D_ASSERT(ocpi != NULL); + + ocpi->ocpi_xid = args->pa_dti; + ocpi->ocpi_mbs = mbs; + ocpi->ocpi_odm.odm_mbs_max_sz = mbs_max_size; + uuid_copy(ocpi->ocpi_po_uuid, pool->dp_pool); + uuid_copy(ocpi->ocpi_co_hdl, shard->do_co->dc_cont_hdl); + uuid_copy(ocpi->ocpi_co_uuid, shard->do_co->dc_uuid); + ocpi->ocpi_oid = shard->do_id; + ocpi->ocpi_epoch = epoch->oe_value; + ocpi->ocpi_api_flags = api_flags; + ocpi->ocpi_map_ver = map_ver; + ocpi->ocpi_flags = rpc_flags; + + if (bulks != NULL) { + D_ASSERT(bulk_sz != 0); + + ocpi->ocpi_bulk_tgt_sz = bulk_sz; + ocpi->ocpi_bulk_tgt_nr = tgt_nr; + ocpi->ocpi_tgt_bulk = bulks[0]; + ocpi->ocpi_tgts.ca_count = 0; + ocpi->ocpi_tgts.ca_arrays = NULL; + } else { + D_ASSERT(tgts != NULL); + + ocpi->ocpi_bulk_tgt_sz = 0; + ocpi->ocpi_bulk_tgt_nr = 0; + ocpi->ocpi_tgt_bulk = NULL; + ocpi->ocpi_tgts.ca_count = tgt_nr; + ocpi->ocpi_tgts.ca_arrays = tgts; + } + + ocpi->ocpi_max_tgt_sz = max_tgt_size; + ocpi->ocpi_disp_width = 0; + ocpi->ocpi_disp_depth = 0; + + ocpi->ocpi_comm_in.req_in_enqueue_id = args->pa_auxi.enqueue_id; + + crt_req_addref(req); + cb_args.cpca_rpc = req; + cb_args.cpca_ver = rep_ver; + cb_args.cpca_shard_args = args; + + rc = tse_task_register_comp_cb(task, obj_shard_coll_punch_cb, &cb_args, sizeof(cb_args)); + if (rc != 0) + D_GOTO(out_req, rc); + + D_DEBUG(DB_IO, "Sending DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" with DTX " + DF_DTI" for task %p, map_ver %u, flags %lx/%x, leader %u/%u, bulk_sz %u\n", + req, DP_UOID(shard->do_id), DP_DTI(&args->pa_dti), task, map_ver, + (unsigned long)api_flags, rpc_flags, tgt_ep.ep_rank, tgt_ep.ep_tag, bulk_sz); + + return daos_rpc_send(req, task); + +out_req: + /* -1 for crt_req_addref(). */ + crt_req_decref(req); + /* -1 for obj_req_create(). 
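* The RPC has not been sent on this error path, so both references taken above
* must be released here.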
*/ + crt_req_decref(req); +out: + D_ERROR("DAOS_OBJ_RPC_COLL_PUNCH RPC failed for "DF_UOID" with DTX " + DF_DTI" for task %p, map_ver %u, flags %lx/%x, leader %u/%u: "DF_RC"\n", + DP_UOID(shard->do_id), DP_DTI(&args->pa_dti), task, map_ver, + (unsigned long)api_flags, rpc_flags, tgt_ep.ep_rank, tgt_ep.ep_tag, DP_RC(rc)); + + obj_shard_decref(shard); + tse_task_complete(task, rc); + return rc; +} + struct obj_enum_args { crt_rpc_t *rpc; daos_handle_t *hdlp; diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h index 8a2b12fff55a..4950dfd84d22 100644 --- a/src/object/obj_internal.h +++ b/src/object/obj_internal.h @@ -41,6 +41,7 @@ struct obj_io_context; extern bool cli_bypass_rpc; /** Switch of server-side IO dispatch */ extern unsigned int srv_io_mode; +extern unsigned int obj_coll_punch_thd; /* Whether check redundancy group validation when DTX resync. */ extern bool tx_verify_rdg; @@ -215,6 +216,32 @@ typedef int (*shard_io_cb_t)(struct dc_obj_shard *shard, enum obj_rpc_opc opc, struct daos_shard_tgt *fw_shard_tgts, uint32_t fw_cnt, tse_task_t *task); +struct obj_coll_disp_cursor { + /* + * The length of daos_coll_target array. The obj_coll_disp_cursor may be inside some + * {obj,shard}_auxi_xxx structure, has some size limitation. So the daos_coll_target + * array is not contained inside the obj_coll_disp_cursor. + */ + uint32_t tgt_nr; + /* + * The "grp" is not object redundancy group, instead, it is the set of some engine(s). + * If there is only one engine in the group, then send RPC to such engine. Otherwise, + * choose a relay engine from such group and send RPC to such relay engine that will + * help to forward the RPC to other engines in such group. + */ + uint16_t grp_nr; + /* The count of engine groups that the RPC will be dispatched to. */ + uint16_t pending_grps; + /* Current position in the daos_coll_target array. */ + uint32_t cur_pos; + /* How many engines in the group corresponding to cur_pos. As the process going, the + * count of engines in current group may be smaller than the engines in former group + * unless fixed_step is set. + */ + uint16_t cur_step; + uint16_t fixed_step:1; +}; + /* shard update/punch auxiliary args, must be the first field of * shard_rw_args and shard_punch_args. */ @@ -248,12 +275,30 @@ struct shard_rw_args { struct obj_reasb_req *reasb_req; }; +struct coll_oper_args { + struct shard_auxi_args coa_auxi; + int coa_dct_nr; + uint32_t coa_dct_cap; + uint32_t coa_max_dct_sz; + uint8_t coa_max_shard_nr; + uint8_t coa_max_bitmap_sz; + uint8_t coa_for_modify:1; + uint8_t coa_target_nr; + /* + * The target ID for the top four healthy shards. + * Please check comment for DTX_COLL_INLINE_TARGETS. 
+ */ + uint32_t coa_targets[DTX_COLL_INLINE_TARGETS]; + struct daos_coll_target *coa_dcts; +}; + struct shard_punch_args { - struct shard_auxi_args pa_auxi; - uuid_t pa_coh_uuid; - uuid_t pa_cont_uuid; - struct dtx_id pa_dti; - uint32_t pa_opc; + union { + struct shard_auxi_args pa_auxi; + struct coll_oper_args pa_coa; + }; + struct dtx_id pa_dti; + uint32_t pa_opc; }; struct shard_sub_anchor { @@ -572,6 +617,13 @@ int dc_obj_shard_punch(struct dc_obj_shard *shard, enum obj_rpc_opc opc, void *shard_args, struct daos_shard_tgt *fw_shard_tgts, uint32_t fw_cnt, tse_task_t *task); +int dc_obj_shard_coll_punch(struct dc_obj_shard *shard, struct shard_punch_args *args, + struct dtx_memberships *mbs, uint32_t mbs_max_size, crt_bulk_t *bulks, + uint32_t bulk_sz, struct daos_coll_target *tgts, uint32_t tgt_nr, + uint32_t max_tgt_size, struct dtx_epoch *epoch, uint64_t api_flags, + uint32_t rpc_flags, uint32_t map_ver, uint32_t *rep_ver, + tse_task_t *task); + int dc_obj_shard_list(struct dc_obj_shard *shard, enum obj_rpc_opc opc, void *shard_args, struct daos_shard_tgt *fw_shard_tgts, uint32_t fw_cnt, tse_task_t *task); @@ -846,9 +898,32 @@ daos_recx_ep_list_ep_valid(struct daos_recx_ep_list *list) return (list->re_ep_valid == 1); } -int obj_class_init(void); +int obj_class_init(void); void obj_class_fini(void); -int obj_utils_init(void); + +/* + * Consider efficiency, we will not make one leader (or relay) engine to forward + * too many collective requests to other engines. But it also needs to guarantee + * that the payload size for each dispatch group is small enough to be packed in + * RPC body to avoid transferring via RDAM. + * + * On the other hand, parent engine may need to children's feedback before reply + * parent's upper level engine. So making parent engine to forward more requests + * than each child engine does is more efficient because current collective task + * on parent engine is scheduled earlier than on child engine. Otherwise, parent + * engine may wait more time. + */ +#define COLL_DISP_WIDTH_DEF 20 +#define COLL_DISP_WIDTH_MIN 8 +#define COLL_DISP_WIDTH_DIF 4 + +/* obj_utils.c */ +void obj_coll_disp_init(uint32_t tgt_nr, uint32_t max_tgt_size, uint32_t inline_size, + uint32_t start, uint32_t max_width, struct obj_coll_disp_cursor *ocdc); +void obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *tgts, + crt_endpoint_t *tgt_ep); +void obj_coll_disp_move(struct obj_coll_disp_cursor *ocdc); +int obj_utils_init(void); void obj_utils_fini(void); /* obj_tx.c */ diff --git a/src/object/obj_rpc.c b/src/object/obj_rpc.c index e7f4e43960b2..b42b71a84c4d 100644 --- a/src/object/obj_rpc.c +++ b/src/object/obj_rpc.c @@ -500,12 +500,56 @@ crt_proc_struct_daos_shard_tgt(crt_proc_t proc, crt_proc_op_t proc_op, /* For compounded RPC. */ +static int +crt_proc_struct_dtx_mbs(crt_proc_t proc, crt_proc_op_t proc_op, + uint32_t mbs_max_size, struct dtx_memberships **p_mbs) +{ + struct dtx_memberships *mbs = NULL; + uint32_t size = 0; + int rc; + + if (FREEING(proc_op)) { + D_FREE(*p_mbs); + return 0; + } + + if (ENCODING(proc_op)) { + mbs = *p_mbs; + size = sizeof(*mbs) + mbs->dm_data_size; + } + + /* Pack the size of mbs to help decode case. */ + rc = crt_proc_uint32_t(proc, proc_op, &size); + if (unlikely(rc)) + return rc; + + D_ASSERT(size != 0); + + if (DECODING(proc_op)) { + /* Allocate enough buffer to hold delay filled bitmap and targets information. */ + D_ALLOC(mbs, size < mbs_max_size ? 
mbs_max_size : size); + if (mbs == NULL) + return -DER_NOMEM; + } + + rc = crt_proc_memcpy(proc, proc_op, mbs, size); + if (unlikely(rc)) { + if (DECODING(proc_op)) + D_FREE(mbs); + return rc; + } + + if (DECODING(proc_op)) + *p_mbs = mbs; + + return 0; +} + static int crt_proc_struct_daos_cpd_sub_head(crt_proc_t proc, crt_proc_op_t proc_op, struct daos_cpd_sub_head *dcsh, bool mbs) { - uint32_t size = 0; - int rc; + int rc; if (FREEING(proc_op)) { if (mbs) @@ -529,30 +573,7 @@ crt_proc_struct_daos_cpd_sub_head(crt_proc_t proc, crt_proc_op_t proc_op, if (!mbs) return 0; - if (ENCODING(proc_op)) - /* Pack the size of dcsh->dcsh_mbs to help decode case. */ - size = sizeof(*dcsh->dcsh_mbs) + dcsh->dcsh_mbs->dm_data_size; - - rc = crt_proc_uint32_t(proc, proc_op, &size); - if (unlikely(rc)) - return rc; - - D_ASSERT(size != 0); - - if (DECODING(proc_op)) { - D_ALLOC(dcsh->dcsh_mbs, size); - if (dcsh->dcsh_mbs == NULL) - return -DER_NOMEM; - } - - rc = crt_proc_memcpy(proc, proc_op, dcsh->dcsh_mbs, size); - if (unlikely(rc)) { - if (DECODING(proc_op)) - D_FREE(dcsh->dcsh_mbs); - return rc; - } - - return 0; + return crt_proc_struct_dtx_mbs(proc, proc_op, 0, &dcsh->dcsh_mbs); } static int @@ -848,11 +869,6 @@ crt_proc_struct_daos_cpd_bulk(crt_proc_t proc, crt_proc_op_t proc_op, return rc; } - if (FREEING(proc_op)) { - D_FREE(dcb->dcb_bulk); - return 0; - } - rc = crt_proc_uint32_t(proc, proc_op, &dcb->dcb_size); if (unlikely(rc)) return rc; @@ -871,6 +887,9 @@ crt_proc_struct_daos_cpd_bulk(crt_proc_t proc, crt_proc_op_t proc_op, if (unlikely(rc)) return rc; + if (FREEING(proc_op)) + D_FREE(dcb->dcb_bulk); + /* The other fields will not be packed on-wire. */ return 0; @@ -1082,6 +1101,154 @@ crt_proc_struct_daos_req_comm_out(crt_proc_t proc, crt_proc_op_t proc_op, return 0; } +static int +crt_proc_struct_obj_dtx_mbs(crt_proc_t proc, crt_proc_op_t proc_op, + struct obj_dtx_mbs *odm) +{ + int rc; + + rc = crt_proc_struct_dtx_id(proc, proc_op, &odm->odm_xid); + if (unlikely(rc)) + return rc; + + rc = crt_proc_uint32_t(proc, proc_op, &odm->odm_mbs_max_sz); + if (unlikely(rc)) + return rc; + + rc = crt_proc_uint32_t(proc, proc_op, &odm->odm_padding); + if (unlikely(rc)) + return rc; + + return crt_proc_struct_dtx_mbs(proc, proc_op, odm->odm_mbs_max_sz, &odm->odm_mbs); +} + +static int +crt_proc_struct_daos_coll_shard(crt_proc_t proc, crt_proc_op_t proc_op, struct daos_coll_shard *dcs) +{ + int rc = 0; + int i; + + if (FREEING(proc_op)) { + if (dcs->dcs_buf != &dcs->dcs_inline) + D_FREE(dcs->dcs_buf); + return 0; + } + + rc = crt_proc_uint16_t(proc, proc_op, &dcs->dcs_nr); + if (unlikely(rc)) + return rc; + + rc = crt_proc_uint16_t(proc, proc_op, &dcs->dcs_cap); + if (unlikely(rc)) + return rc; + + rc = crt_proc_uint32_t(proc, proc_op, &dcs->dcs_inline); + if (unlikely(rc)) + return rc; + + if (DECODING(proc_op)) + dcs->dcs_cap = dcs->dcs_nr; + + if (dcs->dcs_nr <= 1) { + if (DECODING(proc_op)) + dcs->dcs_buf = &dcs->dcs_inline; + return 0; + } + + if (DECODING(proc_op)) { + D_ALLOC_ARRAY(dcs->dcs_buf, dcs->dcs_nr); + if (dcs->dcs_buf == NULL) + return -DER_NOMEM; + } + + for (i = 0; i < dcs->dcs_nr; i++) { + rc = crt_proc_uint32_t(proc, proc_op, &dcs->dcs_buf[i]); + if (unlikely(rc)) + goto out; + } + +out: + if (unlikely(rc) && DECODING(proc_op) && dcs->dcs_buf != &dcs->dcs_inline) + D_FREE(dcs->dcs_buf); + return rc; +} + +int +crt_proc_struct_daos_coll_target(crt_proc_t proc, crt_proc_op_t proc_op, struct daos_coll_target *dct) +{ + int rc; + int i; + + rc = crt_proc_uint32_t(proc, proc_op, 
&dct->dct_rank); + if (unlikely(rc)) + return rc; + + rc = crt_proc_uint8_t(proc, proc_op, &dct->dct_bitmap_sz); + if (unlikely(rc)) + return rc; + + rc = crt_proc_uint8_t(proc, proc_op, &dct->dct_max_shard); + if (unlikely(rc)) + return rc; + + rc = crt_proc_uint8_t(proc, proc_op, &dct->dct_tgt_nr); + if (unlikely(rc)) + return rc; + + rc = crt_proc_uint8_t(proc, proc_op, &dct->dct_tgt_cap); + if (unlikely(rc)) + return rc; + + if (DECODING(proc_op)) { + D_ALLOC(dct->dct_bitmap, dct->dct_bitmap_sz); + if (dct->dct_bitmap == NULL) + return -DER_NOMEM; + + /* When decode, allocate enough buffer to avoid some XS accessing invalid DRAM. */ + D_ALLOC_ARRAY(dct->dct_shards, dct->dct_bitmap_sz << 3); + if (dct->dct_shards == NULL) + goto out; + } + + rc = crt_proc_memcpy(proc, proc_op, dct->dct_bitmap, dct->dct_bitmap_sz); + if (unlikely(rc)) + goto out; + + for (i = 0; i <= dct->dct_max_shard; i++) { + rc = crt_proc_struct_daos_coll_shard(proc, proc_op, &dct->dct_shards[i]); + if (unlikely(rc)) + goto out; + } + + /* Skip empty dct_tgt_ids. */ + if (unlikely(dct->dct_tgt_cap == 0 || dct->dct_tgt_nr == 0)) + goto out; + + if (FREEING(proc_op)) + goto out; + + if (DECODING(proc_op)) { + D_ALLOC_ARRAY(dct->dct_tgt_ids, dct->dct_tgt_nr); + if (dct->dct_tgt_ids == NULL) + D_GOTO(out, rc = -DER_NOMEM); + } + + for (i = 0; i < dct->dct_tgt_nr; i++) { + rc = crt_proc_uint32_t(proc, proc_op, &dct->dct_tgt_ids[i]); + if (unlikely(rc)) + goto out; + } + +out: + if (FREEING(proc_op) || (unlikely(rc) && DECODING(proc_op))) { + D_FREE(dct->dct_bitmap); + D_FREE(dct->dct_shards); + D_FREE(dct->dct_tgt_ids); + } + + return rc; +} + CRT_RPC_DEFINE(obj_rw, DAOS_ISEQ_OBJ_RW, DAOS_OSEQ_OBJ_RW) CRT_RPC_DEFINE(obj_rw_v10, DAOS_ISEQ_OBJ_RW_V10, DAOS_OSEQ_OBJ_RW_V10) CRT_RPC_DEFINE(obj_key_enum, DAOS_ISEQ_OBJ_KEY_ENUM, DAOS_OSEQ_OBJ_KEY_ENUM) @@ -1098,6 +1265,7 @@ CRT_RPC_DEFINE(obj_cpd, DAOS_ISEQ_OBJ_CPD, DAOS_OSEQ_OBJ_CPD) CRT_RPC_DEFINE(obj_ec_rep, DAOS_ISEQ_OBJ_EC_REP, DAOS_OSEQ_OBJ_EC_REP) CRT_RPC_DEFINE(obj_key2anchor, DAOS_ISEQ_OBJ_KEY2ANCHOR, DAOS_OSEQ_OBJ_KEY2ANCHOR) CRT_RPC_DEFINE(obj_key2anchor_v10, DAOS_ISEQ_OBJ_KEY2ANCHOR_V10, DAOS_OSEQ_OBJ_KEY2ANCHOR_V10) +CRT_RPC_DEFINE(obj_coll_punch, DAOS_ISEQ_OBJ_COLL_PUNCH, DAOS_OSEQ_OBJ_COLL_PUNCH) /* Define for obj_proto_rpc_fmt[] array population below. 
* See OBJ_PROTO_*_RPC_LIST macro definition @@ -1179,6 +1347,9 @@ obj_reply_set_status(crt_rpc_t *rpc, int status) case DAOS_OBJ_RPC_EC_REPLICATE: ((struct obj_ec_rep_out *)reply)->er_status = status; break; + case DAOS_OBJ_RPC_COLL_PUNCH: + ((struct obj_coll_punch_out *)reply)->ocpo_ret = status; + break; default: D_ASSERT(0); } @@ -1218,6 +1389,8 @@ obj_reply_get_status(crt_rpc_t *rpc) return ((struct obj_cpd_out *)reply)->oco_ret; case DAOS_OBJ_RPC_EC_REPLICATE: return ((struct obj_ec_rep_out *)reply)->er_status; + case DAOS_OBJ_RPC_COLL_PUNCH: + return ((struct obj_coll_punch_out *)reply)->ocpo_ret; default: D_ASSERT(0); } @@ -1267,6 +1440,9 @@ obj_reply_map_version_set(crt_rpc_t *rpc, uint32_t map_version) case DAOS_OBJ_RPC_EC_REPLICATE: ((struct obj_ec_rep_out *)reply)->er_map_ver = map_version; break; + case DAOS_OBJ_RPC_COLL_PUNCH: + ((struct obj_coll_punch_out *)reply)->ocpo_map_version = map_version; + break; default: D_ASSERT(0); } @@ -1302,6 +1478,8 @@ obj_reply_map_version_get(crt_rpc_t *rpc) return ((struct obj_sync_out *)reply)->oso_map_version; case DAOS_OBJ_RPC_CPD: return ((struct obj_cpd_out *)reply)->oco_map_version; + case DAOS_OBJ_RPC_COLL_PUNCH: + return ((struct obj_coll_punch_out *)reply)->ocpo_map_version; default: D_ASSERT(0); } diff --git a/src/object/obj_rpc.h b/src/object/obj_rpc.h index dba1b31ca748..86a90fbe4d1a 100644 --- a/src/object/obj_rpc.h +++ b/src/object/obj_rpc.h @@ -98,7 +98,10 @@ X(DAOS_OBJ_RPC_KEY2ANCHOR, \ 0, ver == 9 ? &CQF_obj_key2anchor : \ &CQF_obj_key2anchor_v10, \ - ds_obj_key2anchor_handler, NULL, "key2anchor") + ds_obj_key2anchor_handler, NULL, "key2anchor") \ + X(DAOS_OBJ_RPC_COLL_PUNCH, \ + 0, &CQF_obj_coll_punch, ds_obj_coll_punch_handler, \ + NULL, "obj_coll_punch") /* Define for RPC enum population below */ #define X(a, b, c, d, e, f) a, @@ -149,8 +152,8 @@ enum obj_rpc_flags { * oei_epr.epr_hi is epoch. */ ORF_ENUM_WITHOUT_EPR = (1 << 8), - /* CPD RPC leader */ - ORF_CPD_LEADER = (1 << 9), + /* RPC leader */ + ORF_LEADER = (1 << 9), /* Bulk data transfer for CPD RPC. */ ORF_CPD_BULK = (1 << 10), /* Contain EC split req, only used on CPD leader locally. Obsolete - DAOS-10348. 
*/ @@ -707,6 +710,42 @@ struct daos_cpd_sg { CRT_RPC_DECLARE(obj_cpd, DAOS_ISEQ_OBJ_CPD, DAOS_OSEQ_OBJ_CPD) +struct obj_dtx_mbs { + struct dtx_id odm_xid; + uint32_t odm_mbs_max_sz; + uint32_t odm_padding; + struct dtx_memberships *odm_mbs; +}; + +#define DAOS_ISEQ_OBJ_COLL_PUNCH /* input fields */ \ + ((struct obj_dtx_mbs) (ocpi_odm) CRT_VAR) \ + ((uuid_t) (ocpi_po_uuid) CRT_VAR) \ + ((uuid_t) (ocpi_co_hdl) CRT_VAR) \ + ((uuid_t) (ocpi_co_uuid) CRT_VAR) \ + ((daos_unit_oid_t) (ocpi_oid) CRT_RAW) \ + ((uint64_t) (ocpi_epoch) CRT_VAR) \ + ((uint64_t) (ocpi_api_flags) CRT_VAR) \ + ((uint32_t) (ocpi_map_ver) CRT_VAR) \ + ((uint32_t) (ocpi_flags) CRT_VAR) \ + ((uint32_t) (ocpi_bulk_tgt_sz) CRT_VAR) \ + ((uint32_t) (ocpi_bulk_tgt_nr) CRT_VAR) \ + ((crt_bulk_t) (ocpi_tgt_bulk) CRT_VAR) \ + ((uint32_t) (ocpi_max_tgt_sz) CRT_VAR) \ + ((uint16_t) (ocpi_disp_width) CRT_VAR) \ + ((uint16_t) (ocpi_disp_depth) CRT_VAR) \ + ((struct daos_coll_target) (ocpi_tgts) CRT_ARRAY) \ + ((struct daos_req_comm_in) (ocpi_comm_in) CRT_VAR) + +#define DAOS_OSEQ_OBJ_COLL_PUNCH /* output fields */ \ + ((int32_t) (ocpo_ret) CRT_VAR) \ + ((uint32_t) (ocpo_map_version) CRT_VAR) \ + ((struct daos_req_comm_out) (ocpo_comm_out) CRT_VAR) + +CRT_RPC_DECLARE(obj_coll_punch, DAOS_ISEQ_OBJ_COLL_PUNCH, DAOS_OSEQ_OBJ_COLL_PUNCH) + +#define ocpi_xid ocpi_odm.odm_xid +#define ocpi_mbs ocpi_odm.odm_mbs + static inline int obj_req_create(crt_context_t crt_ctx, crt_endpoint_t *tgt_ep, crt_opcode_t opc, crt_rpc_t **req) @@ -730,6 +769,8 @@ uint32_t obj_reply_map_version_get(crt_rpc_t *rpc); int crt_proc_struct_daos_cpd_sub_req(crt_proc_t proc, crt_proc_op_t proc_op, struct daos_cpd_sub_req *dcsr, bool with_oid); +int crt_proc_struct_daos_coll_target(crt_proc_t proc, crt_proc_op_t proc_op, + struct daos_coll_target *dct); static inline bool obj_is_modification_opc(uint32_t opc) @@ -739,7 +780,7 @@ obj_is_modification_opc(uint32_t opc) opc == DAOS_OBJ_RPC_PUNCH_DKEYS || opc == DAOS_OBJ_RPC_TGT_PUNCH_DKEYS || opc == DAOS_OBJ_RPC_PUNCH_AKEYS || - opc == DAOS_OBJ_RPC_TGT_PUNCH_AKEYS; + opc == DAOS_OBJ_RPC_TGT_PUNCH_AKEYS || opc == DAOS_OBJ_RPC_COLL_PUNCH; } #define DAOS_OBJ_UPDATE_MODE_MASK (DAOS_OO_RW | DAOS_OO_EXCL | \ @@ -751,43 +792,6 @@ obj_is_fetch_opc(uint32_t opc) return opc == DAOS_OBJ_RPC_FETCH; } -static inline bool -obj_is_ec_agg_opc(uint32_t opc) -{ - return opc == DAOS_OBJ_RPC_EC_AGGREGATE || - opc == DAOS_OBJ_RPC_EC_REPLICATE; -} - -static inline bool -obj_rpc_is_update(crt_rpc_t *rpc) -{ - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_UPDATE || - opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_TGT_UPDATE; -} - -static inline bool -obj_rpc_is_fetch(crt_rpc_t *rpc) -{ - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_FETCH; -} - -static inline bool -obj_rpc_is_punch(crt_rpc_t *rpc) -{ - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_PUNCH || - opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_PUNCH_DKEYS || - opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_PUNCH_AKEYS || - opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_TGT_PUNCH || - opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_TGT_PUNCH_DKEYS || - opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_TGT_PUNCH_AKEYS; -} - -static inline bool -obj_rpc_is_migrate(crt_rpc_t *rpc) -{ - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_MIGRATE; -} - static inline bool obj_is_enum_opc(uint32_t opc) { @@ -798,40 +802,23 @@ obj_is_enum_opc(uint32_t opc) } static inline bool -obj_rpc_is_query(crt_rpc_t *rpc) -{ - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_QUERY_KEY; -} - -static inline bool -obj_rpc_is_sync(crt_rpc_t *rpc) -{ - return opc_get(rpc->cr_opc) == 
DAOS_OBJ_RPC_SYNC; -} - -static inline bool -obj_rpc_is_key2anchor(crt_rpc_t *rpc) -{ - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_KEY2ANCHOR; -} - -static inline bool -obj_rpc_is_ec_agg(crt_rpc_t *rpc) +obj_is_ec_agg_opc(uint32_t opc) { - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_EC_AGGREGATE; - + return opc == DAOS_OBJ_RPC_EC_AGGREGATE || + opc == DAOS_OBJ_RPC_EC_REPLICATE; } static inline bool -obj_rpc_is_ec_rep(crt_rpc_t *rpc) +obj_rpc_is_update(crt_rpc_t *rpc) { - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_EC_REPLICATE; + return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_UPDATE || + opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_TGT_UPDATE; } static inline bool -obj_rpc_is_cpd(crt_rpc_t *rpc) +obj_rpc_is_fetch(crt_rpc_t *rpc) { - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_CPD; + return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_FETCH; } #endif /* __DAOS_OBJ_RPC_H__ */ diff --git a/src/object/obj_tx.c b/src/object/obj_tx.c index 6e56cce82e3a..f2ddfd34cb18 100644 --- a/src/object/obj_tx.c +++ b/src/object/obj_tx.c @@ -2305,7 +2305,7 @@ dc_tx_commit_trigger(tse_task_t *task, struct dc_tx *tx, daos_tx_commit_t *args) uuid_copy(oci->oci_pool_uuid, tx->tx_pool->dp_pool); oci->oci_map_ver = tx->tx_pm_ver; - oci->oci_flags = ORF_CPD_LEADER; + oci->oci_flags = ORF_LEADER; if (tx->tx_set_resend && !tx->tx_renew) oci->oci_flags |= ORF_RESEND; tx->tx_renew = 0; diff --git a/src/object/obj_utils.c b/src/object/obj_utils.c index 8312c6719d89..ed28b260cf62 100644 --- a/src/object/obj_utils.c +++ b/src/object/obj_utils.c @@ -204,6 +204,116 @@ static btr_ops_t recx_btr_ops = { .to_key_decode = recx_key_decode }; +void +obj_coll_disp_init(uint32_t tgt_nr, uint32_t max_tgt_size, uint32_t inline_size, + uint32_t start, uint32_t max_width, struct obj_coll_disp_cursor *ocdc) +{ + if (max_width == 0) { + /* + * Guarantee that the targets information (to be dispatched) can be packed + * inside the RPC body instead of via bulk transfer. + */ + max_width = (inline_size + max_tgt_size) / DAOS_BULK_LIMIT + 1; + if (max_width < COLL_DISP_WIDTH_DEF) + max_width = COLL_DISP_WIDTH_DEF; + } + + if (tgt_nr - start > max_width) { + ocdc->grp_nr = max_width; + ocdc->cur_step = (tgt_nr - start) / max_width; + if ((tgt_nr - start) % max_width != 0) { + ocdc->cur_step++; + ocdc->fixed_step = 0; + } else { + ocdc->fixed_step = 1; + } + } else { + ocdc->grp_nr = tgt_nr - start; + ocdc->cur_step = 1; + ocdc->fixed_step = 1; + } + + ocdc->pending_grps = ocdc->grp_nr; + ocdc->tgt_nr = tgt_nr; + ocdc->cur_pos = start; +} + +void +obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *tgts, + crt_endpoint_t *tgt_ep) +{ + struct daos_coll_target *dct = &tgts[ocdc->cur_pos]; + struct daos_coll_target tmp; + unsigned long rand = 0; + uint32_t size; + int pos; + int i; + + if (ocdc->cur_step > 2) { + rand = d_rand(); + /* Randomly choose an engine as the relay one for load balance. */ + pos = rand % (ocdc->tgt_nr - ocdc->cur_pos) + ocdc->cur_pos; + if (pos != ocdc->cur_pos) { + memcpy(&tmp, &tgts[pos], sizeof(tmp)); + memcpy(&tgts[pos], dct, sizeof(tmp)); + memcpy(dct, &tmp, sizeof(tmp)); + } + } + + size = dct->dct_bitmap_sz << 3; + + /* Randomly choose a XS as the local leader on target engine for load balance. */ + for (i = 0, pos = (rand != 0 ? 
rand : d_rand()) % dct->dct_tgt_nr; i < size; i++) { + if (isset(dct->dct_bitmap, i)) { + pos -= dct->dct_shards[i].dcs_nr; + if (pos < 0) + break; + } + } + + D_ASSERT(i < size); + + tgt_ep->ep_tag = i; + tgt_ep->ep_rank = dct->dct_rank; +} + +void +obj_coll_disp_move(struct obj_coll_disp_cursor *ocdc) +{ + ocdc->cur_pos += ocdc->cur_step; + + /* The last one. */ + if (--(ocdc->pending_grps) == 0) { + D_ASSERTF(ocdc->cur_pos == ocdc->tgt_nr, + "COLL disp cursor trouble (1): " + "grp_nr %u, pos %u, step %u (%s), tgt_nr %u\n", + ocdc->grp_nr, ocdc->cur_pos, ocdc->cur_step, + ocdc->fixed_step ? "fixed" : "vary", ocdc->tgt_nr); + return; + } + + D_ASSERTF(ocdc->tgt_nr - ocdc->cur_pos >= ocdc->pending_grps, + "COLL disp cursor trouble (2): " + "pos %u, step %u (%s), tgt_nr %u, grp_nr %u, pending_grps %u\n", + ocdc->cur_pos, ocdc->cur_step, ocdc->fixed_step ? "fixed" : "vary", + ocdc->tgt_nr, ocdc->grp_nr, ocdc->pending_grps); + + if (ocdc->fixed_step) { + D_ASSERTF(ocdc->cur_pos + ocdc->cur_step <= ocdc->tgt_nr, + "COLL disp cursor trouble (3): " + "pos %u, step %u (%s), tgt_nr %u, grp_nr %u, pending_grps %u\n", + ocdc->cur_pos, ocdc->cur_step, ocdc->fixed_step ? "fixed" : "vary", + ocdc->tgt_nr, ocdc->grp_nr, ocdc->pending_grps); + return; + } + + ocdc->cur_step = (ocdc->tgt_nr - ocdc->cur_pos) / ocdc->pending_grps; + if ((ocdc->tgt_nr - ocdc->cur_pos) % ocdc->pending_grps != 0) + ocdc->cur_step++; + else + ocdc->fixed_step = 1; +} + int obj_utils_init(void) { diff --git a/src/object/srv_internal.h b/src/object/srv_internal.h index 4452e0404861..4bb9b086fb7e 100644 --- a/src/object/srv_internal.h +++ b/src/object/srv_internal.h @@ -236,11 +236,14 @@ obj_update_latency(uint32_t opc, uint32_t type, uint64_t latency, uint64_t io_si } struct ds_obj_exec_arg { - crt_rpc_t *rpc; - struct obj_io_context *ioc; - void *args; - uint32_t flags; - uint32_t start; /* The start shard for EC obj. */ + crt_rpc_t *rpc; + struct obj_io_context *ioc; + void *args; + uint32_t flags; + uint32_t start; /* The start shard for EC obj. 
*/ + struct daos_coll_shard *coll_shards; + struct daos_coll_target *coll_tgts; + struct obj_coll_disp_cursor coll_cur; }; int @@ -252,6 +255,9 @@ ds_obj_remote_punch(struct dtx_leader_handle *dth, void *arg, int idx, int ds_obj_cpd_dispatch(struct dtx_leader_handle *dth, void *arg, int idx, dtx_sub_comp_cb_t comp_cb); +int +ds_obj_coll_punch_remote(struct dtx_leader_handle *dth, void *arg, int idx, + dtx_sub_comp_cb_t comp_cb); /* srv_obj.c */ void ds_obj_rw_handler(crt_rpc_t *rpc); @@ -266,6 +272,7 @@ void ds_obj_migrate_handler(crt_rpc_t *rpc); void ds_obj_ec_agg_handler(crt_rpc_t *rpc); void ds_obj_ec_rep_handler(crt_rpc_t *rpc); void ds_obj_cpd_handler(crt_rpc_t *rpc); +void ds_obj_coll_punch_handler(crt_rpc_t *rpc); typedef int (*ds_iofw_cb_t)(crt_rpc_t *req, void *arg); struct daos_cpd_args { diff --git a/src/object/srv_mod.c b/src/object/srv_mod.c index 72a25ba97de1..94099dc3f02c 100644 --- a/src/object/srv_mod.c +++ b/src/object/srv_mod.c @@ -213,7 +213,9 @@ struct dss_module_key obj_module_key = { static int obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) { - int proto_ver = crt_req_get_proto_ver(rpc); + int opc = opc_get(rpc->cr_opc); + int proto_ver = crt_req_get_proto_ver(rpc); + int rc = 0; D_ASSERT(proto_ver == DAOS_OBJ_VERSION || proto_ver == DAOS_OBJ_VERSION - 1); @@ -226,7 +228,11 @@ obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) /* Extract hint from RPC */ attr->sra_enqueue_id = 0; - if (obj_rpc_is_update(rpc) || obj_rpc_is_fetch(rpc)) { + + switch (opc) { + case DAOS_OBJ_RPC_UPDATE: + case DAOS_OBJ_RPC_TGT_UPDATE: + case DAOS_OBJ_RPC_FETCH: { struct obj_rw_in *orw = crt_req_get(rpc); if (proto_ver >= 10) { @@ -237,12 +243,19 @@ obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) sched_req_attr_init(attr, obj_rpc_is_update(rpc) ? 
SCHED_REQ_UPDATE : SCHED_REQ_FETCH, &orw->orw_pool_uuid); - } else if (obj_rpc_is_migrate(rpc)) { + break; + } + case DAOS_OBJ_RPC_MIGRATE: { struct obj_migrate_in *omi = crt_req_get(rpc); attr->sra_enqueue_id = omi->om_comm_in.req_in_enqueue_id; sched_req_attr_init(attr, SCHED_REQ_MIGRATE, &omi->om_pool_uuid); - } else if (obj_is_enum_opc(rpc->cr_opc)) { + break; + } + case DAOS_OBJ_DKEY_RPC_ENUMERATE: + case DAOS_OBJ_RPC_ENUMERATE: + case DAOS_OBJ_AKEY_RPC_ENUMERATE: + case DAOS_OBJ_RECX_RPC_ENUMERATE: { struct obj_key_enum_in *oei = crt_req_get(rpc); if (proto_ver >= 10) { @@ -251,7 +264,14 @@ obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) attr->sra_enqueue_id = oei_v10->oei_comm_in.req_in_enqueue_id; } sched_req_attr_init(attr, SCHED_REQ_FETCH, &oei->oei_pool_uuid); - } else if (obj_rpc_is_punch(rpc)) { + break; + } + case DAOS_OBJ_RPC_PUNCH: + case DAOS_OBJ_RPC_PUNCH_DKEYS: + case DAOS_OBJ_RPC_PUNCH_AKEYS: + case DAOS_OBJ_RPC_TGT_PUNCH: + case DAOS_OBJ_RPC_TGT_PUNCH_DKEYS: + case DAOS_OBJ_RPC_TGT_PUNCH_AKEYS: { struct obj_punch_in *opi = crt_req_get(rpc); if (proto_ver >= 10) { @@ -260,7 +280,9 @@ obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) attr->sra_enqueue_id = opi_v10->opi_comm_in.req_in_enqueue_id; } sched_req_attr_init(attr, SCHED_REQ_UPDATE, &opi->opi_pool_uuid); - } else if (obj_rpc_is_query(rpc)) { + break; + } + case DAOS_OBJ_RPC_QUERY_KEY: { struct obj_query_key_in *okqi = crt_req_get(rpc); if (proto_ver >= 10) { @@ -269,7 +291,9 @@ obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) attr->sra_enqueue_id = okqi_v10->okqi_comm_in.req_in_enqueue_id; } sched_req_attr_init(attr, SCHED_REQ_FETCH, &okqi->okqi_pool_uuid); - } else if (obj_rpc_is_sync(rpc)) { + break; + } + case DAOS_OBJ_RPC_SYNC: { struct obj_sync_in *osi = crt_req_get(rpc); if (proto_ver >= 10) { @@ -278,7 +302,9 @@ obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) attr->sra_enqueue_id = osi_v10->osi_comm_in.req_in_enqueue_id; } sched_req_attr_init(attr, SCHED_REQ_UPDATE, &osi->osi_pool_uuid); - } else if (obj_rpc_is_key2anchor(rpc)) { + break; + } + case DAOS_OBJ_RPC_KEY2ANCHOR: { struct obj_key2anchor_in *oki = crt_req_get(rpc); if (proto_ver >= 10) { @@ -287,102 +313,146 @@ obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) attr->sra_enqueue_id = oki_v10->oki_comm_in.req_in_enqueue_id; } sched_req_attr_init(attr, SCHED_REQ_FETCH, &oki->oki_pool_uuid); - } else if (obj_rpc_is_ec_agg(rpc)) { + break; + } + case DAOS_OBJ_RPC_EC_AGGREGATE: { struct obj_ec_agg_in *ea = crt_req_get(rpc); attr->sra_enqueue_id = ea->ea_comm_in.req_in_enqueue_id; sched_req_attr_init(attr, SCHED_REQ_MIGRATE, &ea->ea_pool_uuid); - } else if (obj_rpc_is_ec_rep(rpc)) { + break; + } + case DAOS_OBJ_RPC_EC_REPLICATE: { struct obj_ec_rep_in *er = crt_req_get(rpc); attr->sra_enqueue_id = er->er_comm_in.req_in_enqueue_id; sched_req_attr_init(attr, SCHED_REQ_MIGRATE, &er->er_pool_uuid); - } else if (obj_rpc_is_cpd(rpc)) { + break; + } + case DAOS_OBJ_RPC_CPD: { struct obj_cpd_in *oci = crt_req_get(rpc); - sched_req_attr_init(attr, SCHED_REQ_MIGRATE, &oci->oci_pool_uuid); - } else { + sched_req_attr_init(attr, SCHED_REQ_UPDATE, &oci->oci_pool_uuid); + break; + } + case DAOS_OBJ_RPC_COLL_PUNCH: { + struct obj_coll_punch_in *ocpi = crt_req_get(rpc); + + attr->sra_enqueue_id = ocpi->ocpi_comm_in.req_in_enqueue_id; + sched_req_attr_init(attr, SCHED_REQ_UPDATE, &ocpi->ocpi_po_uuid); + break; + } + default: /* Other requests will not be queued, see dss_rpc_hdlr() */ - return -DER_NOSYS; + 
rc = -DER_NOSYS; + break; } - return 0; + return rc; } static int obj_set_req(crt_rpc_t *rpc, struct sched_req_attr *attr) { - int proto_ver = crt_req_get_proto_ver(rpc); + int opc = opc_get(rpc->cr_opc); + int proto_ver = crt_req_get_proto_ver(rpc); + int rc = -DER_OVERLOAD_RETRY; /* Old protocol RPCs won't be rejected. */ D_ASSERT(proto_ver == DAOS_OBJ_VERSION); - if (obj_rpc_is_update(rpc) || obj_rpc_is_fetch(rpc)) { + + switch (opc) { + case DAOS_OBJ_RPC_UPDATE: + case DAOS_OBJ_RPC_TGT_UPDATE: + case DAOS_OBJ_RPC_FETCH: { struct obj_rw_v10_out *orwo_v10 = crt_reply_get(rpc); orwo_v10->orw_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; orwo_v10->orw_ret = -DER_OVERLOAD_RETRY; - - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_migrate(rpc)) { + break; + } + case DAOS_OBJ_RPC_MIGRATE: { struct obj_migrate_out *om = crt_reply_get(rpc); om->om_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; om->om_status = -DER_OVERLOAD_RETRY; - - return -DER_OVERLOAD_RETRY; - } else if (obj_is_enum_opc(rpc->cr_opc)) { + break; + } + case DAOS_OBJ_DKEY_RPC_ENUMERATE: + case DAOS_OBJ_RPC_ENUMERATE: + case DAOS_OBJ_AKEY_RPC_ENUMERATE: + case DAOS_OBJ_RECX_RPC_ENUMERATE: { struct obj_key_enum_v10_out *oeo_v10 = crt_reply_get(rpc); oeo_v10->oeo_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; oeo_v10->oeo_ret = -DER_OVERLOAD_RETRY; - - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_punch(rpc)) { + break; + } + case DAOS_OBJ_RPC_PUNCH: + case DAOS_OBJ_RPC_PUNCH_DKEYS: + case DAOS_OBJ_RPC_PUNCH_AKEYS: + case DAOS_OBJ_RPC_TGT_PUNCH: + case DAOS_OBJ_RPC_TGT_PUNCH_DKEYS: + case DAOS_OBJ_RPC_TGT_PUNCH_AKEYS: { struct obj_punch_v10_out *opo_v10 = crt_reply_get(rpc); opo_v10->opo_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; opo_v10->opo_ret = -DER_OVERLOAD_RETRY; - - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_query(rpc)) { + break; + } + case DAOS_OBJ_RPC_QUERY_KEY: { struct obj_query_key_v10_out *okqo_v10 = crt_reply_get(rpc); okqo_v10->okqo_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; okqo_v10->okqo_ret = -DER_OVERLOAD_RETRY; - - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_sync(rpc)) { + break; + } + case DAOS_OBJ_RPC_SYNC: { struct obj_sync_v10_out *oso_v10 = crt_reply_get(rpc); oso_v10->oso_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; oso_v10->oso_ret = -DER_OVERLOAD_RETRY; - - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_key2anchor(rpc)) { + break; + } + case DAOS_OBJ_RPC_KEY2ANCHOR: { struct obj_key2anchor_v10_out *oko_v10 = crt_reply_get(rpc); oko_v10->oko_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; oko_v10->oko_ret = -DER_OVERLOAD_RETRY; - - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_ec_agg(rpc)) { + break; + } + case DAOS_OBJ_RPC_EC_AGGREGATE: { struct obj_ec_agg_out *ea_out = crt_reply_get(rpc); ea_out->ea_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; ea_out->ea_status = -DER_OVERLOAD_RETRY; - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_ec_rep(rpc)) { + break; + } + case DAOS_OBJ_RPC_EC_REPLICATE: { struct obj_ec_rep_out *er_out = crt_reply_get(rpc); er_out->er_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; er_out->er_status = -DER_OVERLOAD_RETRY; - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_cpd(rpc)) { - /* No RPC retry for DTX, client will retry anyway. */ - return -DER_TIMEDOUT; + break; + } + case DAOS_OBJ_RPC_CPD: + /* NOTE: It needs to be enhanced. Currently, just let client retry anyway. 
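* Unlike the other opcodes above, no enqueue ID is passed back for CPD, so return
* -DER_TIMEDOUT instead of -DER_OVERLOAD_RETRY.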
*/ + rc = -DER_TIMEDOUT; + break; + case DAOS_OBJ_RPC_COLL_PUNCH: { + struct obj_coll_punch_out *ocpo = crt_reply_get(rpc); + + ocpo->ocpo_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; + ocpo->ocpo_ret = -DER_OVERLOAD_RETRY; + break; } - /* Other requests will not be queued, see dss_rpc_hdlr() */ - return -DER_TIMEDOUT; + default: + /* Other requests will not be queued, see dss_rpc_hdlr() */ + rc = -DER_TIMEDOUT; + break; + } + + return rc; } static struct dss_module_ops ds_obj_mod_ops = { diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index ceca7728b249..f9c6565ebde4 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2596,8 +2596,6 @@ ds_obj_tgt_update_handler(crt_rpc_t *rpc) if (rc < 0 && rc != -DER_NONEXIST) D_GOTO(out, rc); - - dtx_flags |= DTX_RESEND; } /* Inject failure for test to simulate the case of lost some @@ -2787,6 +2785,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc) int dti_cos_cnt; uint32_t tgt_cnt; uint32_t version = 0; + uint32_t max_ver = 0; struct dtx_epoch epoch = {0}; int rc; bool need_abort = false; @@ -2857,6 +2856,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc) } version = orw->orw_map_ver; + max_ver = orw->orw_map_ver; if (tgt_cnt == 0) { if (!(orw->orw_api_flags & DAOS_COND_MASK)) @@ -2873,7 +2873,6 @@ ds_obj_rw_handler(crt_rpc_t *rpc) if (orw->orw_flags & ORF_RESEND) { daos_epoch_t e; - dtx_flags |= DTX_RESEND; d_tm_inc_counter(opm->opm_update_resent, 1); again1: @@ -2936,7 +2935,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc) rc = dtx_leader_begin(ioc.ioc_vos_coh, &orw->orw_dti, &epoch, 1, version, &orw->orw_oid, dti_cos, dti_cos_cnt, - tgts, tgt_cnt, dtx_flags, mbs, &dlh); + tgts, tgt_cnt, dtx_flags, mbs, NULL /* dce */, &dlh); if (rc != 0) { D_ERROR(DF_UOID ": Failed to start DTX for update " DF_RC "\n", DP_UOID(orw->orw_oid), DP_RC(rc)); @@ -2951,6 +2950,9 @@ ds_obj_rw_handler(crt_rpc_t *rpc) /* Execute the operation on all targets */ rc = dtx_leader_exec_ops(dlh, obj_tgt_update, NULL, 0, &exec_arg); + if (max_ver < dlh->dlh_rmt_ver) + max_ver = dlh->dlh_rmt_ver; + /* Stop the distributed transaction */ rc = dtx_leader_end(dlh, ioc.ioc_coh, rc); switch (rc) { @@ -3004,6 +3006,9 @@ ds_obj_rw_handler(crt_rpc_t *rpc) DP_DTI(&orw->orw_dti), DP_RC(rc1)); } + if (ioc.ioc_map_ver < max_ver) + ioc.ioc_map_ver = max_ver; + obj_rw_reply(rpc, rc, epoch.oe_value, &ioc); D_FREE(mbs); D_FREE(dti_cos); @@ -3453,6 +3458,7 @@ obj_local_punch(struct obj_punch_in *opi, crt_opcode_t opc, switch (opc) { case DAOS_OBJ_RPC_PUNCH: case DAOS_OBJ_RPC_TGT_PUNCH: + case DAOS_OBJ_RPC_COLL_PUNCH: rc = vos_obj_punch(cont->sc_hdl, opi->opi_oid, opi->opi_epoch, opi->opi_map_ver, 0, NULL, 0, NULL, dth); @@ -3542,59 +3548,57 @@ obj_local_punch(struct obj_punch_in *opi, crt_opcode_t opc, return rc; } -/* Handle the punch requests on non-leader */ -void -ds_obj_tgt_punch_handler(crt_rpc_t *rpc) +struct obj_tgt_punch_args { + uint32_t opc; + struct obj_io_context *sponsor_ioc; + struct dtx_handle *sponsor_dth; + struct obj_punch_in *opi; + struct dtx_memberships *mbs; + uint32_t *ver; + void *data; +}; + +static int +obj_tgt_punch(struct obj_tgt_punch_args *otpa, uint32_t *shards, uint32_t count) { - struct dtx_handle *dth = NULL; - struct obj_io_context ioc; - struct obj_punch_in *opi; - struct dtx_memberships *mbs = NULL; - struct daos_shard_tgt *tgts = NULL; - uint32_t dtx_flags = 0; - uint32_t tgt_cnt; - struct dtx_epoch epoch; - int rc; + struct obj_io_context ioc = { 0 }; + struct obj_io_context *p_ioc = otpa->sponsor_ioc; + struct dtx_handle *dth = otpa->sponsor_dth; + struct 
obj_punch_in *opi = otpa->opi; + struct dtx_epoch epoch; + daos_epoch_t tmp; + uint32_t dtx_flags = 0; + int rc = 0; + int i; - opi = crt_req_get(rpc); - D_ASSERT(opi != NULL); - rc = obj_ioc_begin(opi->opi_oid.id_pub, opi->opi_map_ver, - opi->opi_pool_uuid, opi->opi_co_hdl, - opi->opi_co_uuid, rpc, opi->opi_flags, &ioc); - if (rc) - goto out; + if (p_ioc == NULL) { + p_ioc = &ioc; + rc = obj_ioc_begin(opi->opi_oid.id_pub, opi->opi_map_ver, opi->opi_pool_uuid, + opi->opi_co_hdl, opi->opi_co_uuid, otpa->data, opi->opi_flags, + p_ioc); + if (rc != 0) + goto out; + } - /* Handle resend. */ - if (opi->opi_flags & ORF_RESEND) { - daos_epoch_t e = opi->opi_epoch; + if (dth != NULL) + goto exec; - rc = dtx_handle_resend(ioc.ioc_vos_coh, &opi->opi_dti, &e, NULL); + if (opi->opi_flags & ORF_RESEND) { + tmp = opi->opi_epoch; + rc = dtx_handle_resend(p_ioc->ioc_vos_coh, &opi->opi_dti, &tmp, NULL); /* Do nothing if 'prepared' or 'committed'. */ if (rc == -DER_ALREADY || rc == 0) D_GOTO(out, rc = 0); - /* Abort it firstly if exist but with different epoch, - * then re-execute with new epoch. - */ + /* Abort old one with different epoch, then re-execute with new epoch. */ if (rc == -DER_MISMATCH) /* Abort it by force with MAX epoch to guarantee * that it can be aborted. */ - rc = vos_dtx_abort(ioc.ioc_vos_coh, &opi->opi_dti, e); + rc = vos_dtx_abort(p_ioc->ioc_vos_coh, &opi->opi_dti, tmp); if (rc < 0 && rc != -DER_NONEXIST) D_GOTO(out, rc); - - dtx_flags |= DTX_RESEND; - } - - tgts = opi->opi_shard_tgts.ca_arrays; - tgt_cnt = opi->opi_shard_tgts.ca_count; - - if (!daos_is_zero_dti(&opi->opi_dti) && tgt_cnt != 0) { - rc = obj_gen_dtx_mbs(opi->opi_flags, &tgt_cnt, &tgts, &mbs); - if (rc != 0) - D_GOTO(out, rc); } epoch.oe_value = opi->opi_epoch; @@ -3605,10 +3609,9 @@ ds_obj_tgt_punch_handler(crt_rpc_t *rpc) dtx_flags |= DTX_SYNC; /* Start the local transaction */ - rc = dtx_begin(ioc.ioc_vos_coh, &opi->opi_dti, &epoch, 1, - opi->opi_map_ver, &opi->opi_oid, - opi->opi_dti_cos.ca_arrays, - opi->opi_dti_cos.ca_count, dtx_flags, mbs, &dth); + rc = dtx_begin(p_ioc->ioc_vos_coh, &opi->opi_dti, &epoch, count, opi->opi_map_ver, + &opi->opi_oid, opi->opi_dti_cos.ca_arrays, opi->opi_dti_cos.ca_count, + dtx_flags, otpa->mbs, &dth); if (rc != 0) { D_ERROR(DF_UOID ": Failed to start DTX for punch " DF_RC "\n", DP_UOID(opi->opi_oid), DP_RC(rc)); @@ -3618,19 +3621,59 @@ ds_obj_tgt_punch_handler(crt_rpc_t *rpc) if (DAOS_FAIL_CHECK(DAOS_DTX_NONLEADER_ERROR)) D_GOTO(out, rc = -DER_IO); - rc = obj_local_punch(opi, opc_get(rpc->cr_opc), &ioc, dth); - if (rc != 0) - DL_CDEBUG(rc == -DER_INPROGRESS || rc == -DER_TX_RESTART || - (rc == -DER_NONEXIST && (opi->opi_api_flags & DAOS_COND_PUNCH)), - DB_IO, DLOG_ERR, rc, DF_UOID, DP_UOID(opi->opi_oid)); +exec: + /* There may be multiple shards reside on the same VOS target. 
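* Punch them one by one with the shard ID adjusted, all under the same DTX handle.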
*/ + for (i = 0; i < count; i++) { + opi->opi_oid.id_shard = shards[i]; + rc = obj_local_punch(opi, otpa->opc, p_ioc, dth); + if (rc != 0) { + DL_CDEBUG(rc == -DER_INPROGRESS || rc == -DER_TX_RESTART || + (rc == -DER_NONEXIST && (opi->opi_api_flags & DAOS_COND_PUNCH)), + DB_IO, DLOG_ERR, rc, DF_UOID, DP_UOID(opi->opi_oid)); + goto out; + } + } out: - /* Stop the local transaction */ - if (dth != NULL) - rc = dtx_end(dth, ioc.ioc_coc, rc); - obj_punch_complete(rpc, rc, ioc.ioc_map_ver); - D_FREE(mbs); - obj_ioc_end(&ioc, rc); + if (otpa->ver != NULL) + *otpa->ver = p_ioc->ioc_map_ver; + + if (dth != NULL && dth != otpa->sponsor_dth) + rc = dtx_end(dth, p_ioc->ioc_coc, rc); + + if (p_ioc == &ioc) + obj_ioc_end(p_ioc, rc); + + return rc; +} + +/* Handle the punch requests on non-leader */ +void +ds_obj_tgt_punch_handler(crt_rpc_t *rpc) +{ + struct obj_tgt_punch_args otpa = { 0 }; + struct obj_punch_in *opi = crt_req_get(rpc); + struct daos_shard_tgt *tgts = opi->opi_shard_tgts.ca_arrays; + uint32_t tgt_cnt = opi->opi_shard_tgts.ca_count; + uint32_t version = 0; + int rc; + + if (!daos_is_zero_dti(&opi->opi_dti) && tgt_cnt != 0) { + rc = obj_gen_dtx_mbs(opi->opi_flags, &tgt_cnt, &tgts, &otpa.mbs); + if (rc != 0) + D_GOTO(out, rc); + } + + otpa.opc = opc_get(rpc->cr_opc); + otpa.opi = opi; + otpa.ver = &version; + otpa.data = rpc; + + rc = obj_tgt_punch(&otpa, &opi->opi_oid.id_shard, 1); + +out: + obj_punch_complete(rpc, rc, version); + D_FREE(otpa.mbs); } static int @@ -3654,13 +3697,18 @@ obj_punch_agg_cb(struct dtx_leader_handle *dlh, int allow_failure) for (i = 0; i < sub_cnt; i++) { sub = &dlh->dlh_subs[i]; if (sub->dss_tgt.st_rank != DAOS_TGT_IGNORE && sub->dss_comp) { - if (sub->dss_result == 0) + if (sub->dss_result == 0) { succeeds++; - else if (sub->dss_result == allow_failure) + } else if (sub->dss_result == allow_failure) { allow_failure_cnt++; - else if (result == -DER_INPROGRESS || result == 0) - /* Ignore INPROGRESS if there is other failure. */ + } else if (result == -DER_INPROGRESS || result == -DER_AGAIN || + result == 0) { + /* Ignore INPROGRESS and AGAIN if there is other failure. 
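* Meanwhile track the highest pool map version reported by any sub request, so that
* the leader can return it to the client.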
*/ result = sub->dss_result; + + if (dlh->dlh_rmt_ver < sub->dss_version) + dlh->dlh_rmt_ver = sub->dss_version; + } } } @@ -3675,8 +3723,7 @@ obj_punch_agg_cb(struct dtx_leader_handle *dlh, int allow_failure) } static int -obj_tgt_punch(struct dtx_leader_handle *dlh, void *arg, int idx, - dtx_sub_comp_cb_t comp_cb) +obj_tgt_punch_disp(struct dtx_leader_handle *dlh, void *arg, int idx, dtx_sub_comp_cb_t comp_cb) { struct ds_obj_exec_arg *exec_arg = arg; @@ -3694,10 +3741,9 @@ obj_tgt_punch(struct dtx_leader_handle *dlh, void *arg, int idx, rc = obj_local_punch(opi, opc_get(rpc->cr_opc), exec_arg->ioc, &dlh->dlh_handle); if (rc != 0) - DL_CDEBUG( - rc == -DER_INPROGRESS || rc == -DER_TX_RESTART || - (rc == -DER_NONEXIST && (opi->opi_api_flags & DAOS_COND_PUNCH)), - DB_IO, DLOG_ERR, rc, DF_UOID, DP_UOID(opi->opi_oid)); + DL_CDEBUG(rc == -DER_INPROGRESS || rc == -DER_TX_RESTART || + (rc == -DER_NONEXIST && (opi->opi_api_flags & DAOS_COND_PUNCH)), + DB_IO, DLOG_ERR, rc, DF_UOID, DP_UOID(opi->opi_oid)); comp: if (comp_cb != NULL) @@ -3726,6 +3772,7 @@ ds_obj_punch_handler(crt_rpc_t *rpc) uint32_t flags = 0; uint32_t dtx_flags = 0; uint32_t version = 0; + uint32_t max_ver = 0; struct dtx_epoch epoch; int rc; bool need_abort = false; @@ -3765,6 +3812,7 @@ ds_obj_punch_handler(crt_rpc_t *rpc) opi->opi_flags &= ~ORF_EPOCH_UNCERTAIN; version = opi->opi_map_ver; + max_ver = opi->opi_map_ver; tgts = opi->opi_shard_tgts.ca_arrays; tgt_cnt = opi->opi_shard_tgts.ca_count; @@ -3786,8 +3834,6 @@ ds_obj_punch_handler(crt_rpc_t *rpc) if (opi->opi_flags & ORF_RESEND) { daos_epoch_t e; - dtx_flags |= DTX_RESEND; - again1: e = 0; rc = dtx_handle_resend(ioc.ioc_vos_coh, &opi->opi_dti, @@ -3848,7 +3894,7 @@ ds_obj_punch_handler(crt_rpc_t *rpc) rc = dtx_leader_begin(ioc.ioc_vos_coh, &opi->opi_dti, &epoch, 1, version, &opi->opi_oid, dti_cos, dti_cos_cnt, - tgts, tgt_cnt, dtx_flags, mbs, &dlh); + tgts, tgt_cnt, dtx_flags, mbs, NULL /* dce */, &dlh); if (rc != 0) { D_ERROR(DF_UOID ": Failed to start DTX for punch " DF_RC "\n", DP_UOID(opi->opi_oid), DP_RC(rc)); @@ -3860,10 +3906,13 @@ ds_obj_punch_handler(crt_rpc_t *rpc) exec_arg.flags = flags; /* Execute the operation on all shards */ - rc = dtx_leader_exec_ops(dlh, obj_tgt_punch, obj_punch_agg_cb, + rc = dtx_leader_exec_ops(dlh, obj_tgt_punch_disp, obj_punch_agg_cb, (opi->opi_api_flags & DAOS_COND_PUNCH) ? -DER_NONEXIST : 0, &exec_arg); + if (max_ver < dlh->dlh_rmt_ver) + max_ver = dlh->dlh_rmt_ver; + /* Stop the distribute transaction */ rc = dtx_leader_end(dlh, ioc.ioc_coh, rc); switch (rc) { @@ -3904,7 +3953,7 @@ ds_obj_punch_handler(crt_rpc_t *rpc) DP_DTI(&opi->opi_dti), DP_RC(rc1)); } - obj_punch_complete(rpc, rc, ioc.ioc_map_ver); + obj_punch_complete(rpc, rc, max_ver); cleanup: D_FREE(mbs); @@ -4649,8 +4698,6 @@ ds_obj_dtx_follower(crt_rpc_t *rpc, struct obj_io_context *ioc) /* Do nothing if 'prepared' or 'committed'. */ if (rc1 == -DER_ALREADY || rc1 == 0) D_GOTO(out, rc = 0); - - dtx_flags |= DTX_RESEND; } /* Refuse any modification with old epoch. */ @@ -4697,9 +4744,9 @@ ds_obj_dtx_follower(crt_rpc_t *rpc, struct obj_io_context *ioc) rc = ds_cpd_handle_one_wrap(rpc, dcsh, dcde, dcsr, ioc, dth); /* For the case of only containing read sub operations, we will - * generate DTX entry for DTX recovery. Similarly for noop case. + * generate DTX entry for DTX recovery. 
*/ - if (rc == 0 && (dth->dth_modification_cnt == 0 || !dth->dth_active)) + if (rc == 0 && dth->dth_modification_cnt == 0) rc = vos_dtx_attach(dth, true, false); rc = dtx_end(dth, ioc->ioc_coc, rc); @@ -4822,8 +4869,6 @@ ds_obj_dtx_leader(struct daos_cpd_args *dca) D_ASSERT(dcsh->dcsh_epoch.oe_value != DAOS_EPOCH_MAX); if (oci->oci_flags & ORF_RESEND) { - dtx_flags |= DTX_RESEND; - again: /* For distributed transaction, the 'ORF_RESEND' may means * that the DTX has been restarted with newer epoch. @@ -4900,11 +4945,10 @@ ds_obj_dtx_leader(struct daos_cpd_args *dca) else dtx_flags &= ~DTX_PREPARED; - rc = dtx_leader_begin(dca->dca_ioc->ioc_vos_coh, &dcsh->dcsh_xid, - &dcsh->dcsh_epoch, dcde->dcde_write_cnt, - oci->oci_map_ver, &dcsh->dcsh_leader_oid, - NULL, 0, tgts, tgt_cnt - 1, dtx_flags, - dcsh->dcsh_mbs, &dlh); + rc = dtx_leader_begin(dca->dca_ioc->ioc_vos_coh, &dcsh->dcsh_xid, &dcsh->dcsh_epoch, + dcde->dcde_write_cnt, oci->oci_map_ver, &dcsh->dcsh_leader_oid, + NULL /* dti_cos */, 0 /* dti_cos_cnt */, tgts, tgt_cnt - 1, + dtx_flags, dcsh->dcsh_mbs, NULL /* dce */, &dlh); if (rc != 0) goto out; @@ -5163,7 +5207,7 @@ ds_obj_cpd_handler(crt_rpc_t *rpc) D_ASSERT(oci != NULL); - if (oci->oci_flags & ORF_CPD_LEADER) + if (oci->oci_flags & ORF_LEADER) leader = true; else leader = false; @@ -5352,3 +5396,504 @@ ds_obj_key2anchor_handler(crt_rpc_t *rpc) if (rc != 0) D_ERROR("send reply failed: "DF_RC"\n", DP_RC(rc)); } + +struct obj_coll_tgt_args { + crt_rpc_t *octa_rpc; + struct daos_coll_shard *octa_shards; + uint32_t *octa_versions; + uint32_t octa_sponsor_tgt; + struct obj_io_context *octa_sponsor_ioc; + struct dtx_handle *octa_sponsor_dth; + union { + void *octa_misc; + /* Different collective operations may need different parameters. */ + struct dtx_memberships *octa_mbs; + }; +}; + +static int +obj_coll_tgt_punch(void *args) +{ + struct obj_coll_tgt_args *octa = args; + crt_rpc_t *rpc = octa->octa_rpc; + struct obj_coll_punch_in *ocpi = crt_req_get(rpc); + struct obj_punch_in *opi = NULL; + struct obj_tgt_punch_args otpa = { 0 }; + uint32_t tgt_id = dss_get_module_info()->dmi_tgt_id; + int rc; + + D_ALLOC_PTR(opi); + if (opi == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + opi->opi_dti = ocpi->ocpi_xid; + uuid_copy(opi->opi_pool_uuid, ocpi->ocpi_po_uuid); + uuid_copy(opi->opi_co_hdl, ocpi->ocpi_co_hdl); + uuid_copy(opi->opi_co_uuid, ocpi->ocpi_co_uuid); + opi->opi_oid = ocpi->ocpi_oid; + opi->opi_oid.id_shard = octa->octa_shards[tgt_id].dcs_buf[0]; + opi->opi_epoch = ocpi->ocpi_epoch; + opi->opi_api_flags = ocpi->ocpi_api_flags; + opi->opi_map_ver = ocpi->ocpi_map_ver; + opi->opi_flags = ocpi->ocpi_flags & ~ORF_LEADER; + + otpa.opi = opi; + otpa.opc = opc_get(rpc->cr_opc); + if (tgt_id == octa->octa_sponsor_tgt) { + otpa.sponsor_ioc = octa->octa_sponsor_ioc; + otpa.sponsor_dth = octa->octa_sponsor_dth; + } + otpa.mbs = octa->octa_mbs; + if (octa->octa_versions != NULL) + otpa.ver = &octa->octa_versions[tgt_id]; + otpa.data = rpc; + + rc = obj_tgt_punch(&otpa, octa->octa_shards[tgt_id].dcs_buf, + octa->octa_shards[tgt_id].dcs_nr); + D_FREE(opi); + +out: + DL_CDEBUG(rc == 0 || rc == -DER_INPROGRESS || rc == -DER_TX_RESTART, DB_IO, DLOG_ERR, rc, + "Collective punch obj shard "DF_UOID" with "DF_DTI" on tgt %u", + DP_OID(ocpi->ocpi_oid.id_pub), octa->octa_shards[tgt_id].dcs_buf[0], + ocpi->ocpi_oid.id_layout_ver, DP_DTI(&ocpi->ocpi_xid), tgt_id); + + return rc; +} + +typedef int (*obj_coll_func_t)(void *args); + +static int +obj_coll_local(crt_rpc_t *rpc, struct daos_coll_shard *shards, struct 
dtx_coll_entry *dce, + uint32_t *version, struct obj_io_context *ioc, struct dtx_handle *dth, void *args, + obj_coll_func_t func) +{ + struct obj_coll_tgt_args octa = { 0 }; + struct dss_coll_ops coll_ops = { 0 }; + struct dss_coll_args coll_args = { 0 }; + uint32_t size = dce->dce_bitmap_sz << 3; + int rc = 0; + int i; + + D_ASSERT(dce->dce_bitmap != NULL); + D_ASSERT(ioc != NULL); + + if (version != NULL) { + if (size > dss_tgt_nr) + size = dss_tgt_nr; + D_ALLOC_ARRAY(octa.octa_versions, size); + if (octa.octa_versions == NULL) + D_GOTO(out, rc = -DER_NOMEM); + } + + octa.octa_rpc = rpc; + octa.octa_shards = shards; + octa.octa_misc = args; + octa.octa_sponsor_ioc = ioc; + octa.octa_sponsor_dth = dth; + octa.octa_sponsor_tgt = dss_get_module_info()->dmi_tgt_id; + + coll_ops.co_func = func; + coll_args.ca_func_args = &octa; + coll_args.ca_tgt_bitmap = dce->dce_bitmap; + coll_args.ca_tgt_bitmap_sz = dce->dce_bitmap_sz; + + rc = dss_thread_collective_reduce(&coll_ops, &coll_args, DSS_USE_CURRENT_ULT); + +out: + if (octa.octa_versions != NULL) { + for (i = 0, *version = 0; i < size; i++) { + if (isset(dce->dce_bitmap, i) && *version < octa.octa_versions[i]) + *version = octa.octa_versions[i]; + } + D_FREE(octa.octa_versions); + } + + return rc; +} + +static int +obj_coll_punch_disp(struct dtx_leader_handle *dlh, void *arg, int idx, dtx_sub_comp_cb_t comp_cb) +{ + struct ds_obj_exec_arg *exec_arg = arg; + crt_rpc_t *rpc = exec_arg->rpc; + struct obj_coll_punch_in *ocpi = crt_req_get(rpc); + int rc; + + if (idx != -1) + return ds_obj_coll_punch_remote(dlh, arg, idx, comp_cb); + + /* Local punch on current rank, including the leader target. */ + rc = obj_coll_local(rpc, exec_arg->coll_shards, dlh->dlh_coll_entry, NULL, exec_arg->ioc, + &dlh->dlh_handle, dlh->dlh_handle.dth_mbs, obj_coll_tgt_punch); + + DL_CDEBUG(rc == 0 || rc == -DER_INPROGRESS || rc == -DER_TX_RESTART, DB_IO, DLOG_ERR, rc, + "Collective punch obj "DF_UOID" with "DF_DTI" on rank %u", + DP_UOID(ocpi->ocpi_oid), DP_DTI(&ocpi->ocpi_xid), dss_self_rank()); + + if (comp_cb != NULL) + comp_cb(dlh, idx, rc); + + return rc; +} + +static int +obj_coll_punch_bulk(crt_rpc_t *rpc, d_iov_t *iov, crt_proc_t *p_proc, + struct daos_coll_target **p_dcts, uint32_t *dct_nr) +{ + struct obj_coll_punch_in *ocpi = crt_req_get(rpc); + struct daos_coll_target *dcts = NULL; + crt_proc_t proc = NULL; + d_sg_list_t sgl; + d_sg_list_t *sgls = &sgl; + int rc = 0; + int i; + int j; + + D_ALLOC(iov->iov_buf, ocpi->ocpi_bulk_tgt_sz); + if (iov->iov_buf == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + iov->iov_buf_len = ocpi->ocpi_bulk_tgt_sz; + iov->iov_len = ocpi->ocpi_bulk_tgt_sz; + + sgl.sg_nr = 1; + sgl.sg_nr_out = 1; + sgl.sg_iovs = iov; + + rc = obj_bulk_transfer(rpc, CRT_BULK_GET, false, &ocpi->ocpi_tgt_bulk, NULL, NULL, + DAOS_HDL_INVAL, &sgls, 1, NULL, NULL); + if (rc != 0) + goto out; + + rc = crt_proc_create(dss_get_module_info()->dmi_ctx, iov->iov_buf, iov->iov_len, + CRT_PROC_DECODE, &proc); + if (rc != 0) + goto out; + + D_ALLOC_ARRAY(dcts, ocpi->ocpi_bulk_tgt_nr); + if (dcts == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + for (i = 0; i < ocpi->ocpi_bulk_tgt_nr; i++) { + rc = crt_proc_struct_daos_coll_target(proc, CRT_PROC_DECODE, &dcts[i]); + if (rc != 0) { + crt_proc_reset(proc, iov->iov_buf, iov->iov_len, CRT_PROC_FREE); + for (j = 0; j < i; j++) + crt_proc_struct_daos_coll_target(proc, CRT_PROC_FREE, &dcts[j]); + goto out; + } + } + +out: + if (rc != 0) { + D_FREE(dcts); + if (proc != NULL) + crt_proc_destroy(proc); + daos_iov_free(iov); + } else { 
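+		/* Decode succeeded: pass ownership of the proc handle and the decoded targets to the caller. */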
+ *p_proc = proc; + *p_dcts = dcts; + *dct_nr = ocpi->ocpi_bulk_tgt_nr; + } + + return rc; +} + +static int +obj_coll_punch_prep(struct obj_coll_punch_in *ocpi, struct daos_coll_target *dcts, uint32_t dct_nr, + struct dtx_coll_entry **p_dce) +{ + struct pl_map *map = NULL; + struct dtx_memberships *mbs = ocpi->ocpi_mbs; + struct dtx_daos_target *ddt = mbs->dm_tgts; + struct dtx_coll_entry *dce = NULL; + struct dtx_coll_target *target; + d_rank_t max_rank = 0; + uint32_t size; + int rc = 0; + int i; + int j; + + /* dcts[0] is for current engine. */ + if (dcts[0].dct_bitmap == NULL || dcts[0].dct_bitmap_sz == 0 || + dcts[0].dct_shards == NULL) + D_GOTO(out, rc = -DER_INVAL); + + /* Already allocated enough space in MBS when decode to hold the targets and bitmap. */ + target = (struct dtx_coll_target *)(ddt + mbs->dm_tgt_cnt); + + size = sizeof(*ddt) * mbs->dm_tgt_cnt + sizeof(*target) + + sizeof(dcts[0].dct_tgt_ids[0]) * dcts[0].dct_tgt_nr + dcts[0].dct_bitmap_sz; + if (unlikely(ocpi->ocpi_odm.odm_mbs_max_sz < sizeof(*mbs) + size)) { + D_ERROR("Pre-allocated MBS buffer is too small: %u vs %ld + %u\n", + ocpi->ocpi_odm.odm_mbs_max_sz, sizeof(*mbs), size); + D_GOTO(out, rc = -DER_INVAL); + } + + target->dct_tgt_nr = dcts[0].dct_tgt_nr; + memcpy(target->dct_tgts, dcts[0].dct_tgt_ids, + sizeof(dcts[0].dct_tgt_ids[0]) * dcts[0].dct_tgt_nr); + target->dct_bitmap_sz = dcts[0].dct_bitmap_sz; + memcpy(target->dct_tgts + target->dct_tgt_nr, dcts[0].dct_bitmap, dcts[0].dct_bitmap_sz); + mbs->dm_data_size = size; + + D_ALLOC_PTR(dce); + if (dce == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + dce->dce_xid = ocpi->ocpi_xid; + dce->dce_ver = ocpi->ocpi_map_ver; + dce->dce_refs = 1; + + D_ALLOC(dce->dce_bitmap, dcts[0].dct_bitmap_sz); + if (dce->dce_bitmap == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + dce->dce_bitmap_sz = dcts[0].dct_bitmap_sz; + memcpy(dce->dce_bitmap, dcts[0].dct_bitmap, dcts[0].dct_bitmap_sz); + + if (!(ocpi->ocpi_flags & ORF_LEADER) || unlikely(dct_nr <= 1)) + D_GOTO(out, rc = 0); + + map = pl_map_find(ocpi->ocpi_po_uuid, ocpi->ocpi_oid.id_pub); + if (map == NULL) { + D_ERROR("Failed to find valid placement map in pool "DF_UUID"\n", + DP_UUID(ocpi->ocpi_po_uuid)); + D_GOTO(out, rc = -DER_INVAL); + } + + size = pool_map_node_nr(map->pl_poolmap); + D_ALLOC_ARRAY(dce->dce_hints, size); + if (dce->dce_hints == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + dce->dce_ranks = d_rank_list_alloc(dct_nr - 1); + if (dce->dce_ranks == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + /* Set i = 1 to skip leader_rank. 
*/ + for (i = 1; i < dct_nr; i++) { + dce->dce_ranks->rl_ranks[i - 1] = dcts[i].dct_rank; + if (max_rank < dcts[i].dct_rank) + max_rank = dcts[i].dct_rank; + + size = dcts[i].dct_bitmap_sz << 3; + if (size > dss_tgt_nr) + size = dss_tgt_nr; + + for (j = 0; j < size; j++) { + if (isset(dcts[i].dct_bitmap, j)) { + dce->dce_hints[dcts[i].dct_rank] = j; + break; + } + } + } + + dce->dce_hint_sz = max_rank + 1; + +out: + if (map != NULL) + pl_map_decref(map); + + if (rc != 0 && dce != NULL) + dtx_coll_entry_put(dce); + else + *p_dce = dce; + + return rc; +} + +void +ds_obj_coll_punch_handler(crt_rpc_t *rpc) +{ + struct dss_module_info *dmi = dss_get_module_info(); + struct dtx_leader_handle *dlh = NULL; + struct obj_coll_punch_in *ocpi = crt_req_get(rpc); + struct obj_dtx_mbs *odm = &ocpi->ocpi_odm; + struct ds_obj_exec_arg exec_arg = { 0 }; + struct obj_io_context ioc = { 0 }; + struct dtx_coll_entry *dce = NULL; + struct daos_coll_target *dcts = NULL; + d_iov_t iov = { 0 }; + crt_proc_t proc = NULL; + uint32_t dct_nr = 0; + uint32_t flags = 0; + uint32_t dtx_flags = DTX_TGT_COLL; + uint32_t version = 0; + uint32_t max_ver = 0; + struct dtx_epoch epoch; + daos_epoch_t tmp; + int rc; + int rc1; + int i; + bool need_abort = false; + + D_DEBUG(DB_IO, "(%s) handling collective punch RPC %p for obj " + DF_UOID" on XS %u/%u epc "DF_X64" pmv %u, with dti " + DF_DTI", forward width %u, forward depth %u\n", + (ocpi->ocpi_flags & ORF_LEADER) ? "leader" : + (ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"), + rpc, DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, + ocpi->ocpi_epoch, ocpi->ocpi_map_ver, DP_DTI(&ocpi->ocpi_xid), + ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth); + + D_ASSERT(dmi->dmi_xs_id != 0); + + rc = obj_ioc_begin(ocpi->ocpi_oid.id_pub, ocpi->ocpi_map_ver, ocpi->ocpi_po_uuid, + ocpi->ocpi_co_hdl, ocpi->ocpi_co_uuid, rpc, ocpi->ocpi_flags, &ioc); + if (rc != 0) + goto out; + + if (ocpi->ocpi_flags & ORF_LEADER && ocpi->ocpi_bulk_tgt_sz > 0) { + rc = obj_coll_punch_bulk(rpc, &iov, &proc, &dcts, &dct_nr); + if (rc != 0) + goto out; + } else { + dcts = ocpi->ocpi_tgts.ca_arrays; + dct_nr = ocpi->ocpi_tgts.ca_count; + } + + rc = obj_coll_punch_prep(ocpi, dcts, dct_nr, &dce); + if (rc != 0) + goto out; + + if (ocpi->ocpi_flags & ORF_LEADER) { + rc = process_epoch(&ocpi->ocpi_epoch, NULL /* epoch_first */, &ocpi->ocpi_flags); + if (rc == PE_OK_LOCAL) + ocpi->ocpi_flags &= ~ORF_EPOCH_UNCERTAIN; + } else if (dct_nr == 1) { + rc = obj_coll_local(rpc, dcts[0].dct_shards, dce, &version, &ioc, NULL, + odm->odm_mbs, obj_coll_tgt_punch); + goto out; + } + + version = ocpi->ocpi_map_ver; + max_ver = ocpi->ocpi_map_ver; + + if (ocpi->ocpi_flags & ORF_DTX_SYNC) + dtx_flags |= DTX_SYNC; + + if (!(ocpi->ocpi_flags & ORF_LEADER)) + dtx_flags |= DTX_RELAY; + + if (ocpi->ocpi_flags & ORF_RESEND) { + +again1: + tmp = 0; + rc = dtx_handle_resend(ioc.ioc_vos_coh, &ocpi->ocpi_xid, &tmp, &version); + switch (rc) { + case -DER_ALREADY: + D_GOTO(out, rc = 0); + case 0: + ocpi->ocpi_epoch = tmp; + flags |= ORF_RESEND; + /* TODO: Also recovery the epoch uncertainty. 
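[Reviewer note] The loop at the top of this hunk fills the hint array in obj_coll_punch_prep(): for each remote engine, the hint is the index of the first set bit in that engine's target bitmap, stored in an array indexed by rank and sized max_rank + 1. Below is a self-contained sketch of that rule; the struct and helper names are hypothetical.

```c
/* Standalone sketch of the hint rule used in the patch: for every remote
 * engine, record the index of the first set bit in its target bitmap,
 * keyed by rank in an array of size max_rank + 1.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct engine_tgts {
	uint32_t       rank;
	const uint8_t *bitmap;
	uint32_t       bitmap_sz;	/* bytes */
};

static int first_set_bit(const uint8_t *bitmap, uint32_t nbits)
{
	uint32_t i;

	for (i = 0; i < nbits; i++) {
		if ((bitmap[i >> 3] >> (i & 7)) & 1)
			return (int)i;
	}
	return -1;	/* no local target selected on this engine */
}

int main(void)
{
	uint8_t            bm1[]  = { 0x08 };	/* rank 3: first selected target is 3 */
	uint8_t            bm2[]  = { 0x30 };	/* rank 5: first selected target is 4 */
	struct engine_tgts tgts[] = { { 3, bm1, 1 }, { 5, bm2, 1 } };
	uint32_t           max_rank = 5;
	uint8_t           *hints;
	int                i;

	hints = calloc(max_rank + 1, sizeof(*hints));	/* hint_sz = max_rank + 1 */
	if (hints == NULL)
		return 1;

	for (i = 0; i < 2; i++)
		hints[tgts[i].rank] = (uint8_t)first_set_bit(tgts[i].bitmap,
							     tgts[i].bitmap_sz << 3);

	for (i = 0; i <= (int)max_rank; i++)
		printf("rank %d -> hint %u\n", i, (unsigned)hints[i]);

	free(hints);
	return 0;
}
```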
*/ + break; + case -DER_NONEXIST: + rc = 0; + break; + default: + D_GOTO(out, rc); + } + + dce->dce_ver = version; + } + +again2: + epoch.oe_value = ocpi->ocpi_epoch; + epoch.oe_first = epoch.oe_value; + epoch.oe_flags = orf_to_dtx_epoch_flags(ocpi->ocpi_flags); + + if (flags & ORF_RESEND) + dtx_flags |= DTX_PREPARED; + else + dtx_flags &= ~DTX_PREPARED; + + exec_arg.rpc = rpc; + exec_arg.ioc = &ioc; + exec_arg.flags = flags; + exec_arg.coll_shards = dcts[0].dct_shards; + exec_arg.coll_tgts = dcts; + obj_coll_disp_init(dct_nr, ocpi->ocpi_max_tgt_sz, + sizeof(*ocpi) + sizeof(*odm->odm_mbs) + odm->odm_mbs->dm_data_size, + 1 /* start, [0] is for current engine */, ocpi->ocpi_disp_width, + &exec_arg.coll_cur); + + rc = dtx_leader_begin(ioc.ioc_vos_coh, &odm->odm_xid, &epoch, 1, version, + &ocpi->ocpi_oid, NULL /* dti_cos */, 0 /* dti_cos_cnt */, + NULL /* tgts */, exec_arg.coll_cur.grp_nr /* tgt_cnt */, + dtx_flags, odm->odm_mbs, dce, &dlh); + if (rc != 0) { + D_ERROR(DF_UOID ": Failed to start DTX for collective punch: "DF_RC"\n", + DP_UOID(ocpi->ocpi_oid), DP_RC(rc)); + D_GOTO(out, rc); + } + + /* Execute the operation on all shards */ + rc = dtx_leader_exec_ops(dlh, obj_coll_punch_disp, NULL, 0, &exec_arg); + + if (max_ver < dlh->dlh_rmt_ver) + max_ver = dlh->dlh_rmt_ver; + + rc = dtx_leader_end(dlh, ioc.ioc_coh, rc); + + if (dtx_flags & DTX_RELAY) + goto out; + + switch (rc) { + case -DER_TX_RESTART: + ocpi->ocpi_epoch = d_hlc_get(); + ocpi->ocpi_flags &= ~ORF_RESEND; + flags = 0; + goto again2; + case -DER_AGAIN: + ocpi->ocpi_flags |= ORF_RESEND; + need_abort = true; + ABT_thread_yield(); + goto again1; + default: + break; + } + +out: + if (rc != 0 && need_abort) { + rc1 = dtx_coll_abort(ioc.ioc_coc, dce, ocpi->ocpi_epoch); + if (rc1 != 0 && rc1 != -DER_NONEXIST) + D_WARN("Failed to collective abort DTX "DF_DTI": "DF_RC"\n", + DP_DTI(&ocpi->ocpi_xid), DP_RC(rc1)); + } + + if (max_ver < ioc.ioc_map_ver) + max_ver = ioc.ioc_map_ver; + + if (max_ver < version) + max_ver = version; + + DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR, DB_IO, rc, + "(%s) handled collective punch RPC %p for obj " + DF_UOID" on XS %u/%u epc "DF_X64" pmv %u/%u, with dti " + DF_DTI", forward width %u, forward depth %u", + (ocpi->ocpi_flags & ORF_LEADER) ? "leader" : + (ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"), rpc, + DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, ocpi->ocpi_epoch, + ocpi->ocpi_map_ver, max_ver, DP_DTI(&ocpi->ocpi_xid), ocpi->ocpi_disp_width, + ocpi->ocpi_disp_depth); + + obj_punch_complete(rpc, rc, max_ver); + + dtx_coll_entry_put(dce); + if (proc != NULL) { + D_ASSERT(dcts != NULL); + + crt_proc_reset(proc, iov.iov_buf, iov.iov_len, CRT_PROC_FREE); + for (i = 0; i < dct_nr; i++) + crt_proc_struct_daos_coll_target(proc, CRT_PROC_FREE, &dcts[i]); + crt_proc_destroy(proc); + D_FREE(dcts); + daos_iov_free(&iov); + } + + /* It is no matter even if obj_ioc_begin() was not called. */ + obj_ioc_end(&ioc, rc); +} diff --git a/src/object/srv_obj_remote.c b/src/object/srv_obj_remote.c index f78cc03b07cb..66d36ec08853 100644 --- a/src/object/srv_obj_remote.c +++ b/src/object/srv_obj_remote.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2022 Intel Corporation. + * (C) Copyright 2019-2023 Intel Corporation. 
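[Reviewer note] The leader handler above retries through the `again1`/`again2` labels: a transaction restart takes a fresh epoch and re-executes, while `-DER_AGAIN` turns the request into a resend that first rechecks the earlier attempt. The sketch below is a simplified, self-contained illustration of that control flow only; the error codes and helper are stand-ins, not the DAOS definitions.

```c
/* Simplified sketch of the leader retry flow: restart refreshes the epoch
 * and re-runs the punch; "again" replays it as a resend of the same DTX.
 */
#include <stdint.h>
#include <stdio.h>

#define ERR_TX_RESTART	(-1)
#define ERR_AGAIN	(-2)

/* Stub: pretend the first attempt restarts and the second succeeds. */
static int run_collective_punch(uint64_t epoch, int attempt)
{
	(void)epoch;
	return attempt == 0 ? ERR_TX_RESTART : 0;
}

int main(void)
{
	uint64_t epoch   = 100;
	int      resend  = 0;
	int      attempt = 0;
	int      rc;

	for (;;) {
		if (resend)
			printf("checking previous attempt before re-executing\n");

		rc = run_collective_punch(epoch, attempt++);
		if (rc == ERR_TX_RESTART) {
			epoch += 1;	/* stand-in for taking a fresh HLC epoch */
			resend = 0;
			continue;
		}
		if (rc == ERR_AGAIN) {
			resend = 1;	/* replay as a resend of the same DTX */
			continue;
		}
		break;
	}

	printf("finished with rc=%d at epoch %lu after %d attempt(s)\n",
	       rc, (unsigned long)epoch, attempt);
	return rc == 0 ? 0 : 1;
}
```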
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -35,18 +35,23 @@ struct obj_remote_cb_arg { }; static void -do_shard_update_req_cb(crt_rpc_t *req, struct obj_remote_cb_arg *arg, int rc) +shard_update_req_cb(const struct crt_cb_info *cb_info) { + struct obj_remote_cb_arg *arg = cb_info->cci_arg; + crt_rpc_t *req = cb_info->cci_rpc; crt_rpc_t *parent_req = arg->parent_req; struct obj_rw_out *orwo = crt_reply_get(req); struct obj_rw_in *orw_parent = crt_req_get(parent_req); struct dtx_leader_handle *dlh = arg->dlh; - int rc1 = 0; + struct dtx_sub_status *sub = &dlh->dlh_subs[arg->idx]; + int rc = cb_info->cci_rc; + int rc1; if (orw_parent->orw_map_ver < orwo->orw_map_version) { D_DEBUG(DB_IO, DF_UOID": map_ver stale (%d < %d).\n", DP_UOID(orw_parent->orw_oid), orw_parent->orw_map_ver, orwo->orw_map_version); + sub->dss_version = orwo->orw_map_version; rc1 = -DER_STALE; } else { rc1 = orwo->orw_ret; @@ -60,12 +65,6 @@ do_shard_update_req_cb(crt_rpc_t *req, struct obj_remote_cb_arg *arg, int rc) D_FREE(arg); } -static inline void -shard_update_req_cb(const struct crt_cb_info *cb_info) -{ - do_shard_update_req_cb(cb_info->cci_rpc, cb_info->cci_arg, cb_info->cci_rc); -} - static void obj_inherit_timeout(crt_rpc_t *parent, crt_rpc_t *child) { @@ -135,14 +134,13 @@ ds_obj_remote_update(struct dtx_leader_handle *dlh, void *data, int idx, orw_parent = crt_req_get(parent_req); orw = crt_req_get(req); *orw = *orw_parent; + orw->orw_oid.id_shard = shard_tgt->st_shard_id; - uuid_copy(orw->orw_co_hdl, orw_parent->orw_co_hdl); - uuid_copy(orw->orw_co_uuid, orw_parent->orw_co_uuid); orw->orw_flags |= ORF_BULK_BIND | obj_exec_arg->flags; if (shard_tgt->st_flags & DTF_DELAY_FORWARD && dlh->dlh_drop_cond) orw->orw_api_flags &= ~DAOS_COND_MASK; - orw->orw_dti_cos.ca_count = dth->dth_dti_cos_count; - orw->orw_dti_cos.ca_arrays = dth->dth_dti_cos; + orw->orw_dti_cos.ca_count = dth->dth_dti_cos_count; + orw->orw_dti_cos.ca_arrays = dth->dth_dti_cos; D_DEBUG(DB_TRACE, DF_UOID" forwarding to rank:%d tag:%d.\n", DP_UOID(orw->orw_oid), tgt_ep.ep_rank, tgt_ep.ep_tag); @@ -165,18 +163,23 @@ ds_obj_remote_update(struct dtx_leader_handle *dlh, void *data, int idx, } static void -do_shard_punch_req_cb(crt_rpc_t *req, struct obj_remote_cb_arg *arg, int rc) +shard_punch_req_cb(const struct crt_cb_info *cb_info) { + struct obj_remote_cb_arg *arg = cb_info->cci_arg; + crt_rpc_t *req = cb_info->cci_rpc; crt_rpc_t *parent_req = arg->parent_req; struct obj_punch_out *opo = crt_reply_get(req); - struct obj_punch_in *opi_parent = crt_req_get(req); + struct obj_punch_in *opi_parent = crt_req_get(parent_req); struct dtx_leader_handle *dlh = arg->dlh; - int rc1 = 0; + struct dtx_sub_status *sub = &dlh->dlh_subs[arg->idx]; + int rc = cb_info->cci_rc; + int rc1; if (opi_parent->opi_map_ver < opo->opo_map_version) { D_DEBUG(DB_IO, DF_UOID": map_ver stale (%d < %d).\n", DP_UOID(opi_parent->opi_oid), opi_parent->opi_map_ver, opo->opo_map_version); + sub->dss_version = opo->opo_map_version; rc1 = -DER_STALE; } else { rc1 = opo->opo_ret; @@ -190,12 +193,6 @@ do_shard_punch_req_cb(crt_rpc_t *req, struct obj_remote_cb_arg *arg, int rc) D_FREE(arg); } -static inline void -shard_punch_req_cb(const struct crt_cb_info *cb_info) -{ - do_shard_punch_req_cb(cb_info->cci_rpc, cb_info->cci_arg, cb_info->cci_rc); -} - /* Execute punch on the remote target */ int ds_obj_remote_punch(struct dtx_leader_handle *dlh, void *data, int idx, @@ -213,6 +210,7 @@ ds_obj_remote_punch(struct dtx_leader_handle *dlh, void *data, int idx, struct obj_punch_in 
*opi_parent; crt_opcode_t opc; int rc = 0; + bool sent_rpc = false; D_ASSERT(idx < dlh->dlh_normal_sub_cnt + dlh->dlh_delay_sub_cnt); sub = &dlh->dlh_subs[idx]; @@ -248,11 +246,8 @@ ds_obj_remote_punch(struct dtx_leader_handle *dlh, void *data, int idx, opi_parent = crt_req_get(parent_req); opi = crt_req_get(req); *opi = *opi_parent; + opi->opi_oid.id_shard = shard_tgt->st_shard_id; - uuid_copy(opi->opi_co_hdl, opi_parent->opi_co_hdl); - uuid_copy(opi->opi_co_uuid, opi_parent->opi_co_uuid); - opi->opi_shard_tgts.ca_count = opi_parent->opi_shard_tgts.ca_count; - opi->opi_shard_tgts.ca_arrays = opi_parent->opi_shard_tgts.ca_arrays; opi->opi_flags |= obj_exec_arg->flags; if (shard_tgt->st_flags & DTF_DELAY_FORWARD && dlh->dlh_drop_cond) opi->opi_api_flags &= ~DAOS_COND_PUNCH; @@ -268,10 +263,11 @@ ds_obj_remote_punch(struct dtx_leader_handle *dlh, void *data, int idx, D_ASSERT(sub->dss_comp == 1); D_ERROR("crt_req_send failed, rc "DF_RC"\n", DP_RC(rc)); } - return rc; + + sent_rpc = true; out: - if (rc) { + if (!sent_rpc) { sub->dss_result = rc; comp_cb(dlh, idx, rc); if (remote_arg) { @@ -283,9 +279,12 @@ ds_obj_remote_punch(struct dtx_leader_handle *dlh, void *data, int idx, } static void -do_shard_cpd_req_cb(crt_rpc_t *req, struct obj_remote_cb_arg *arg, int rc) +shard_cpd_req_cb(const struct crt_cb_info *cb_info) { - struct obj_cpd_out *oco = crt_reply_get(req); + struct obj_remote_cb_arg *arg = cb_info->cci_arg; + crt_rpc_t *req = cb_info->cci_rpc; + struct obj_cpd_out *oco = crt_reply_get(req); + int rc = cb_info->cci_rc; if (rc >= 0) rc = oco->oco_ret; @@ -298,12 +297,6 @@ do_shard_cpd_req_cb(crt_rpc_t *req, struct obj_remote_cb_arg *arg, int rc) D_FREE(arg); } -static inline void -shard_cpd_req_cb(const struct crt_cb_info *cb_info) -{ - do_shard_cpd_req_cb(cb_info->cci_rpc, cb_info->cci_arg, cb_info->cci_rc); -} - /* Dispatch CPD RPC and handle sub requests remotely */ int ds_obj_cpd_dispatch(struct dtx_leader_handle *dlh, void *arg, int idx, @@ -375,7 +368,7 @@ ds_obj_cpd_dispatch(struct dtx_leader_handle *dlh, void *arg, int idx, uuid_copy(oci->oci_co_hdl, oci_parent->oci_co_hdl); uuid_copy(oci->oci_co_uuid, oci_parent->oci_co_uuid); oci->oci_map_ver = oci_parent->oci_map_ver; - oci->oci_flags = (oci_parent->oci_flags | exec_arg->flags) & ~ORF_CPD_LEADER; + oci->oci_flags = (oci_parent->oci_flags | exec_arg->flags) & ~ORF_LEADER; oci->oci_disp_tgts.ca_arrays = NULL; oci->oci_disp_tgts.ca_count = 0; @@ -461,3 +454,127 @@ ds_obj_cpd_dispatch(struct dtx_leader_handle *dlh, void *arg, int idx, return rc; } + +static void +shard_coll_punch_req_cb(const struct crt_cb_info *cb_info) +{ + struct obj_remote_cb_arg *arg = cb_info->cci_arg; + crt_rpc_t *req = cb_info->cci_rpc; + crt_rpc_t *parent_req = arg->parent_req; + struct obj_coll_punch_out *ocpo = crt_reply_get(req); + struct obj_coll_punch_in *ocpi_parent = crt_req_get(parent_req); + struct dtx_leader_handle *dlh = arg->dlh; + struct dtx_sub_status *sub = &dlh->dlh_subs[arg->idx]; + int rc = cb_info->cci_rc; + int rc1; + + if (ocpi_parent->ocpi_map_ver < ocpo->ocpo_map_version) { + D_DEBUG(DB_IO, DF_UOID": map_ver stale (%d < %d).\n", + DP_UOID(ocpi_parent->ocpi_oid), ocpi_parent->ocpi_map_ver, + ocpo->ocpo_map_version); + sub->dss_version = ocpo->ocpo_map_version; + rc1 = -DER_STALE; + } else { + rc1 = ocpo->ocpo_ret; + } + + if (rc >= 0) + rc = rc1; + + arg->comp_cb(dlh, arg->idx, rc); + crt_req_decref(parent_req); + D_FREE(arg); +} + +int +ds_obj_coll_punch_remote(struct dtx_leader_handle *dlh, void *data, int idx, + 
dtx_sub_comp_cb_t comp_cb) +{ + struct ds_obj_exec_arg *exec_arg = data; + struct obj_coll_disp_cursor *cursor = &exec_arg->coll_cur; + struct obj_remote_cb_arg *remote_arg; + struct dtx_sub_status *sub; + crt_endpoint_t tgt_ep = { 0 }; + crt_rpc_t *parent_req = exec_arg->rpc; + crt_rpc_t *req; + struct obj_coll_punch_in *ocpi_parent; + struct obj_coll_punch_in *ocpi; + int tag; + int rc = 0; + bool sent_rpc = false; + + D_ASSERT(idx < dlh->dlh_normal_sub_cnt); + + sub = &dlh->dlh_subs[idx]; + + D_ALLOC_PTR(remote_arg); + if (remote_arg == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep); + tag = tgt_ep.ep_tag; + + crt_req_addref(parent_req); + remote_arg->parent_req = parent_req; + remote_arg->dlh = dlh; + remote_arg->comp_cb = comp_cb; + remote_arg->idx = idx; + + rc = obj_req_create(dss_get_module_info()->dmi_ctx, &tgt_ep, DAOS_OBJ_RPC_COLL_PUNCH, &req); + if (rc != 0) { + D_ERROR("crt_req_create failed for coll punch: "DF_RC"\n", DP_RC(rc)); + D_GOTO(out, rc); + } + + ocpi_parent = crt_req_get(parent_req); + ocpi = crt_req_get(req); + + ocpi->ocpi_odm = ocpi_parent->ocpi_odm; + uuid_copy(ocpi->ocpi_po_uuid, ocpi_parent->ocpi_po_uuid); + uuid_copy(ocpi->ocpi_co_hdl, ocpi_parent->ocpi_co_hdl); + uuid_copy(ocpi->ocpi_co_uuid, ocpi_parent->ocpi_co_uuid); + ocpi->ocpi_oid = ocpi_parent->ocpi_oid; + ocpi->ocpi_oid.id_shard = exec_arg->coll_tgts[cursor->cur_pos].dct_shards[tag].dcs_buf[0]; + ocpi->ocpi_epoch = ocpi_parent->ocpi_epoch; + ocpi->ocpi_api_flags = ocpi_parent->ocpi_api_flags; + ocpi->ocpi_map_ver = ocpi_parent->ocpi_map_ver; + ocpi->ocpi_flags = (exec_arg->flags | ocpi_parent->ocpi_flags) & ~ORF_LEADER; + ocpi->ocpi_bulk_tgt_sz = 0; + ocpi->ocpi_bulk_tgt_nr = 0; + ocpi->ocpi_tgt_bulk = NULL; + ocpi->ocpi_max_tgt_sz = ocpi_parent->ocpi_max_tgt_sz; + if (cursor->grp_nr < COLL_DISP_WIDTH_MIN) { + ocpi->ocpi_disp_width = cursor->grp_nr; + } else { + ocpi->ocpi_disp_width = cursor->grp_nr - COLL_DISP_WIDTH_DIF; + if (ocpi->ocpi_disp_width < COLL_DISP_WIDTH_MIN) + ocpi->ocpi_disp_width = COLL_DISP_WIDTH_MIN; + } + ocpi->ocpi_disp_depth = ocpi_parent->ocpi_disp_depth + 1; + ocpi->ocpi_tgts.ca_count = cursor->cur_step; + ocpi->ocpi_tgts.ca_arrays = &exec_arg->coll_tgts[cursor->cur_pos]; + + D_DEBUG(DB_IO, DF_UOID" broadcast collective punch RPC with flags %x/"DF_X64"\n", + DP_UOID(ocpi->ocpi_oid), ocpi->ocpi_flags, ocpi->ocpi_api_flags); + + obj_coll_disp_move(cursor); + + rc = crt_req_send(req, shard_coll_punch_req_cb, remote_arg); + if (rc != 0) { + D_ASSERT(sub->dss_comp == 1); + D_ERROR("crt_req_send failed for collective punch remote: "DF_RC"\n", DP_RC(rc)); + } + + sent_rpc = true; + +out: + if (!sent_rpc) { + sub->dss_result = rc; + comp_cb(dlh, idx, rc); + if (remote_arg != NULL) { + crt_req_decref(parent_req); + D_FREE(remote_arg); + } + } + return rc; +} diff --git a/src/placement/jump_map.c b/src/placement/jump_map.c index 86052df05a52..3ef280ab8fc0 100644 --- a/src/placement/jump_map.c +++ b/src/placement/jump_map.c @@ -706,6 +706,8 @@ get_object_layout(struct pl_jump_map *jmap, uint32_t layout_ver, struct pl_obj_l layout->ol_shards[k].po_target = target->ta_comp.co_id; layout->ol_shards[k].po_fseq = target->ta_comp.co_fseq; layout->ol_shards[k].po_shard = k; + layout->ol_shards[k].po_rank = target->ta_comp.co_rank; + layout->ol_shards[k].po_index = target->ta_comp.co_index; /** If target is failed queue it for remap*/ if (need_remap_comp(&target->ta_comp, allow_status)) { diff --git a/src/placement/pl_map.h 
b/src/placement/pl_map.h index 5803c87d58e1..7c1335e64bdd 100644 --- a/src/placement/pl_map.h +++ b/src/placement/pl_map.h @@ -85,6 +85,8 @@ struct failed_shard { uint32_t fs_shard_idx; uint32_t fs_fseq; uint32_t fs_tgt_id; + uint16_t fs_rank; + uint8_t fs_index; uint8_t fs_status; }; diff --git a/src/placement/pl_map_common.c b/src/placement/pl_map_common.c index 37743b74fc34..9cbfadd6d18a 100644 --- a/src/placement/pl_map_common.c +++ b/src/placement/pl_map_common.c @@ -74,6 +74,8 @@ remap_alloc_one(d_list_t *remap_list, unsigned int shard_idx, D_INIT_LIST_HEAD(&f_new->fs_list); f_new->fs_shard_idx = shard_idx; f_new->fs_fseq = tgt->ta_comp.co_fseq; + f_new->fs_rank = tgt->ta_comp.co_rank; + f_new->fs_index = tgt->ta_comp.co_index; f_new->fs_status = tgt->ta_comp.co_status; f_new->fs_data = data; @@ -321,6 +323,8 @@ determine_valid_spares(struct pool_target *spare_tgt, struct daos_obj_md *md, /* The selected spare target is up and ready */ l_shard->po_target = spare_tgt->ta_comp.co_id; l_shard->po_fseq = f_shard->fs_fseq; + l_shard->po_rank = spare_tgt->ta_comp.co_rank; + l_shard->po_index = spare_tgt->ta_comp.co_index; /* * Mark the shard as 'rebuilding' so that read will @@ -421,6 +425,8 @@ pl_map_extend(struct pl_obj_layout *layout, d_list_t *extended_list) new_shards[grp_idx].po_fseq = f_shard->fs_fseq; new_shards[grp_idx].po_shard = f_shard->fs_shard_idx; new_shards[grp_idx].po_target = f_shard->fs_tgt_id; + new_shards[grp_idx].po_rank = f_shard->fs_rank; + new_shards[grp_idx].po_index = f_shard->fs_index; if (f_shard->fs_status != PO_COMP_ST_DRAIN) new_shards[grp_idx].po_rebuilding = 1; diff --git a/src/placement/ring_map.c b/src/placement/ring_map.c index 48b1247b3579..d123ef982b90 100644 --- a/src/placement/ring_map.c +++ b/src/placement/ring_map.c @@ -1076,9 +1076,11 @@ ring_obj_layout_fill(struct pl_map *map, struct daos_obj_md *md, pos = plts[idx].pt_pos; tgt = &tgts[pos]; - layout->ol_shards[k].po_shard = rop->rop_shard_id + k; + layout->ol_shards[k].po_shard = rop->rop_shard_id + k; layout->ol_shards[k].po_target = tgt->ta_comp.co_id; - layout->ol_shards[k].po_fseq = tgt->ta_comp.co_fseq; + layout->ol_shards[k].po_fseq = tgt->ta_comp.co_fseq; + layout->ol_shards[k].po_rank = tgt->ta_comp.co_rank; + layout->ol_shards[k].po_index = tgt->ta_comp.co_index; if (pool_target_unavail(tgt, for_reint)) { rc = remap_alloc_one(remap_list, k, tgt, for_reint, NULL); diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index a04021d9fb88..1253d54830ac 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -1404,10 +1404,12 @@ pool_query_one(void *vin) static int pool_tgt_query(struct ds_pool *pool, struct daos_pool_space *ps) { - struct dss_coll_ops coll_ops; - struct dss_coll_args coll_args = { 0 }; - struct pool_query_xs_arg agg_arg = { 0 }; - int rc; + struct dss_coll_ops coll_ops; + struct dss_coll_args coll_args = { 0 }; + struct pool_query_xs_arg agg_arg = { 0 }; + int *exclude_tgts = NULL; + uint32_t exclude_tgt_nr = 0; + int rc = 0; D_ASSERT(ps != NULL); memset(ps, 0, sizeof(*ps)); @@ -1425,24 +1427,32 @@ pool_tgt_query(struct ds_pool *pool, struct daos_pool_space *ps) coll_args.ca_aggregator = &agg_arg; coll_args.ca_func_args = &coll_args.ca_stream_args; - rc = ds_pool_get_failed_tgt_idx(pool->sp_uuid, - &coll_args.ca_exclude_tgts, - &coll_args.ca_exclude_tgts_cnt); - if (rc) { + rc = ds_pool_get_failed_tgt_idx(pool->sp_uuid, &exclude_tgts, &exclude_tgt_nr); + if (rc != 0) { D_ERROR(DF_UUID": failed to get index : rc "DF_RC"\n", DP_UUID(pool->sp_uuid), DP_RC(rc)); 
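[Reviewer note] The failed-target indexes fetched here are converted just below into a per-XS target bitmap for the thread collective (via dss_build_coll_bitmap). The sketch that follows shows one plausible conversion under the convention "selected targets set, excluded targets cleared"; the real helper's exact polarity and signature are defined by the DAOS sources, and `build_tgt_bitmap` is a hypothetical name.

```c
/* Standalone sketch (hypothetical helper, not dss_build_coll_bitmap itself):
 * turn a list of excluded VOS target indexes into a bitmap in which every
 * target is set except the excluded ones, so a collective can skip them.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int
build_tgt_bitmap(const int *exclude, uint32_t exclude_nr, uint32_t tgt_nr,
		 uint8_t **bitmap_out, uint32_t *bitmap_sz_out)
{
	uint32_t sz = (tgt_nr + 7) >> 3;	/* bytes needed for tgt_nr bits */
	uint8_t *bm;
	uint32_t i;

	bm = malloc(sz);
	if (bm == NULL)
		return -1;

	memset(bm, 0xff, sz);			/* start with every target selected */
	for (i = 0; i < exclude_nr; i++)
		bm[exclude[i] >> 3] &= ~(1 << (exclude[i] & 7));

	*bitmap_out = bm;
	*bitmap_sz_out = sz;
	return 0;
}

int main(void)
{
	int       excluded[] = { 1, 5 };
	uint8_t  *bm;
	uint32_t  sz;
	uint32_t  i;

	if (build_tgt_bitmap(excluded, 2, 8, &bm, &sz) != 0)
		return 1;

	for (i = 0; i < 8; i++)
		printf("tgt %u: %s\n", i, (bm[i >> 3] >> (i & 7)) & 1 ? "run" : "skip");

	free(bm);
	return 0;
}
```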
- return rc; + goto out; + } + + if (exclude_tgts != NULL) { + rc = dss_build_coll_bitmap(exclude_tgts, exclude_tgt_nr, &coll_args.ca_tgt_bitmap, + &coll_args.ca_tgt_bitmap_sz); + if (rc != 0) + goto out; } rc = dss_thread_collective_reduce(&coll_ops, &coll_args, 0); - D_FREE(coll_args.ca_exclude_tgts); - if (rc) { + if (rc != 0) { D_ERROR("Pool query on pool "DF_UUID" failed, "DF_RC"\n", DP_UUID(pool->sp_uuid), DP_RC(rc)); - return rc; + goto out; } *ps = agg_arg.qxa_space; + +out: + D_FREE(coll_args.ca_tgt_bitmap); + D_FREE(exclude_tgts); return rc; } @@ -2169,9 +2179,11 @@ ds_pool_tgt_discard_ult(void *data) { struct ds_pool *pool; struct tgt_discard_arg *arg = data; - struct dss_coll_ops coll_ops = { 0 }; - struct dss_coll_args coll_args = { 0 }; - int rc; + struct dss_coll_ops coll_ops = { 0 }; + struct dss_coll_args coll_args = { 0 }; + int *exclude_tgts = NULL; + uint32_t exclude_tgt_nr = 0; + int rc = 0; /* If discard failed, let's still go ahead, since reintegration might * still succeed, though it might leave some garbage on the reintegration @@ -2194,21 +2206,28 @@ ds_pool_tgt_discard_ult(void *data) */ status = PO_COMP_ST_UP | PO_COMP_ST_UPIN | PO_COMP_ST_DRAIN | PO_COMP_ST_DOWN | PO_COMP_ST_NEW; - rc = ds_pool_get_tgt_idx_by_state(arg->pool_uuid, status, - &coll_args.ca_exclude_tgts, - &coll_args.ca_exclude_tgts_cnt); - if (rc) { + rc = ds_pool_get_tgt_idx_by_state(arg->pool_uuid, status, &exclude_tgts, + &exclude_tgt_nr); + if (rc != 0) { D_ERROR(DF_UUID "failed to get index : rc "DF_RC"\n", DP_UUID(arg->pool_uuid), DP_RC(rc)); D_GOTO(put, rc); } + + if (exclude_tgts != NULL) { + rc = dss_build_coll_bitmap(exclude_tgts, exclude_tgt_nr, + &coll_args.ca_tgt_bitmap, &coll_args.ca_tgt_bitmap_sz); + if (rc != 0) + goto put; + } } rc = dss_thread_collective_reduce(&coll_ops, &coll_args, DSS_ULT_DEEP_STACK); - if (coll_args.ca_exclude_tgts) - D_FREE(coll_args.ca_exclude_tgts); DL_CDEBUG(rc == 0, DB_MD, DLOG_ERR, rc, DF_UUID " tgt discard", DP_UUID(arg->pool_uuid)); + put: + D_FREE(coll_args.ca_tgt_bitmap); + D_FREE(exclude_tgts); pool->sp_need_discard = 0; pool->sp_discard_status = rc; diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py index 4fef0130aa78..abccc976a919 100644 --- a/src/tests/ftest/util/telemetry_utils.py +++ b/src/tests/ftest/util/telemetry_utils.py @@ -34,6 +34,9 @@ class TelemetryUtils(): "engine_pool_ops_dkey_punch", "engine_pool_ops_dtx_abort", "engine_pool_ops_dtx_check", + "engine_pool_ops_dtx_coll_abort", + "engine_pool_ops_dtx_coll_check", + "engine_pool_ops_dtx_coll_commit", "engine_pool_ops_dtx_commit", "engine_pool_ops_dtx_refresh", "engine_pool_ops_ec_agg", @@ -353,6 +356,18 @@ class TelemetryUtils(): "engine_io_ops_migrate_latency_mean", "engine_io_ops_migrate_latency_min", "engine_io_ops_migrate_latency_stddev"] + ENGINE_IO_OPS_OBJ_COLL_PUNCH_ACTIVE_METRICS = [ + "engine_io_ops_obj_coll_punch_active", + "engine_io_ops_obj_coll_punch_active_max", + "engine_io_ops_obj_coll_punch_active_mean", + "engine_io_ops_obj_coll_punch_active_min", + "engine_io_ops_obj_coll_punch_active_stddev"] + ENGINE_IO_OPS_OBJ_COLL_PUNCH_LATENCY_METRICS = [ + "engine_io_ops_obj_coll_punch_latency", + "engine_io_ops_obj_coll_punch_latency_max", + "engine_io_ops_obj_coll_punch_latency_mean", + "engine_io_ops_obj_coll_punch_latency_min", + "engine_io_ops_obj_coll_punch_latency_stddev"] ENGINE_IO_OPS_OBJ_ENUM_ACTIVE_METRICS = [ "engine_io_ops_obj_enum_active", "engine_io_ops_obj_enum_active_max", @@ -481,6 +496,8 @@ class TelemetryUtils(): 
ENGINE_IO_OPS_KEY2ANCHOR_LATENCY_METRICS +\ ENGINE_IO_OPS_MIGRATE_ACTIVE_METRICS +\ ENGINE_IO_OPS_MIGRATE_LATENCY_METRICS +\ + ENGINE_IO_OPS_OBJ_COLL_PUNCH_ACTIVE_METRICS +\ + ENGINE_IO_OPS_OBJ_COLL_PUNCH_LATENCY_METRICS +\ ENGINE_IO_OPS_OBJ_ENUM_ACTIVE_METRICS +\ ENGINE_IO_OPS_OBJ_ENUM_LATENCY_METRICS +\ ENGINE_IO_OPS_OBJ_PUNCH_ACTIVE_METRICS +\ @@ -563,8 +580,7 @@ class TelemetryUtils(): "engine_mem_vos_dtx_cmt_ent_48", "engine_mem_vos_vos_obj_360", "engine_mem_vos_vos_lru_size", - "engine_mem_dtx_dtx_leader_handle_344", - "engine_mem_dtx_dtx_entry_40"] + "engine_mem_dtx_dtx_leader_handle_360"] ENGINE_MEM_TOTAL_USAGE_METRICS = [ "engine_mem_total_mem"] diff --git a/src/tests/suite/daos_obj.c b/src/tests/suite/daos_obj.c index 5415d3fa9fa7..0cabff2be2ef 100644 --- a/src/tests/suite/daos_obj.c +++ b/src/tests/suite/daos_obj.c @@ -5115,6 +5115,79 @@ oit_list_filter(void **state) test_teardown((void **)&arg); } +#define DTS_DKEY_CNT 8 +#define DTS_DKEY_SIZE 16 +#define DTS_IOSIZE 64 + +static void +obj_coll_punch(test_arg_t *arg, daos_oclass_id_t oclass) +{ + char buf[DTS_IOSIZE]; + char dkeys[DTS_DKEY_CNT][DTS_DKEY_SIZE]; + const char *akey = "daos_io_akey"; + daos_obj_id_t oid; + struct ioreq req; + int i; + + oid = daos_test_oid_gen(arg->coh, oclass, 0, 0, arg->myrank); + ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); + + for (i = 0; i < DTS_DKEY_CNT; i++) { + dts_buf_render(dkeys[i], DTS_DKEY_SIZE); + dts_buf_render(buf, DTS_IOSIZE); + insert_single(dkeys[i], akey, 0, buf, DTS_IOSIZE, DAOS_TX_NONE, &req); + } + + print_message("Collective punch object\n"); + punch_obj(DAOS_TX_NONE, &req); + + print_message("Fetch after punch\n"); + arg->expect_result = -DER_NONEXIST; + for (i = 0; i < DTS_DKEY_CNT; i++) + lookup_empty_single(dkeys[i], akey, 0, buf, DTS_IOSIZE, DAOS_TX_NONE, &req); + + ioreq_fini(&req); +} + +static void +io_50(void **state) +{ + test_arg_t *arg = *state; + + print_message("Collective punch object - OC_SX\n"); + + if (!test_runable(arg, 2)) + return; + + obj_coll_punch(arg, OC_SX); +} + +static void +io_51(void **state) +{ + test_arg_t *arg = *state; + + print_message("Collective punch object - OC_EC_2P1G2\n"); + + if (!test_runable(arg, 3)) + return; + + obj_coll_punch(arg, OC_EC_2P1G2); +} + +static void +io_52(void **state) +{ + test_arg_t *arg = *state; + + print_message("Collective punch object - OC_EC_4P1GX\n"); + + if (!test_runable(arg, 5)) + return; + + obj_coll_punch(arg, OC_EC_4P1GX); +} + static const struct CMUnitTest io_tests[] = { { "IO1: simple update/fetch/verify", io_simple, async_disable, test_case_teardown}, @@ -5213,6 +5286,12 @@ static const struct CMUnitTest io_tests[] = { { "IO47: obj_open perf", obj_open_perf, async_disable, test_case_teardown}, { "IO48: oit_list_filter", oit_list_filter, async_disable, test_case_teardown}, { "IO49: oit_list_filter async", oit_list_filter, async_enable, test_case_teardown}, + { "IO50: collective punch object - OC_SX", + io_50, NULL, test_case_teardown}, + { "IO51: collective punch object - OC_EC_2P1G2", + io_51, NULL, test_case_teardown}, + { "IO52: collective punch object - OC_EC_4P1GX", + io_52, NULL, test_case_teardown}, }; int diff --git a/src/vos/ilog.c b/src/vos/ilog.c index e7dd2bff532f..08528e40c42b 100644 --- a/src/vos/ilog.c +++ b/src/vos/ilog.c @@ -587,7 +587,7 @@ check_equal(struct ilog_context *lctx, struct ilog_id *id_out, const struct ilog D_DEBUG(DB_IO, "No entry found, done\n"); return 0; } - if (dtx_is_committed(id_in->id_tx_id, ilog_ctx2cont(lctx), id_in->id_epoch)) { + if 
(dtx_is_committed(id_out->id_tx_id, ilog_ctx2cont(lctx), id_out->id_epoch)) { /** Need to differentiate between updates that are * overwrites and others that are conflicts. Return * a different error code in this case if the result diff --git a/src/vos/tests/vts_dtx.c b/src/vos/tests/vts_dtx.c index f24ef4fa8201..5024e3e2bd82 100644 --- a/src/vos/tests/vts_dtx.c +++ b/src/vos/tests/vts_dtx.c @@ -56,7 +56,6 @@ vts_dtx_begin(const daos_unit_oid_t *oid, daos_handle_t coh, daos_epoch_t epoch, dth->dth_pinned = 0; dth->dth_sync = 0; dth->dth_cos_done = 0; - dth->dth_resent = 0; dth->dth_touched_leader_oid = 0; dth->dth_local_tx_started = 0; dth->dth_solo = 0; diff --git a/src/vos/tests/vts_ilog.c b/src/vos/tests/vts_ilog.c index 1b85d2cb6838..39caad1113c0 100644 --- a/src/vos/tests/vts_ilog.c +++ b/src/vos/tests/vts_ilog.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2022 Intel Corporation. + * (C) Copyright 2019-2023 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -482,6 +482,7 @@ ilog_test_update(void **state) struct umem_instance *umm; struct entries *entries = args->custom; struct ilog_df *ilog; + struct ilog_id id; struct version_cache version_cache; daos_epoch_t epoch; daos_handle_t loh; @@ -529,6 +530,14 @@ ilog_test_update(void **state) rc = entries_check(umm, ilog, &ilog_callbacks, NULL, 0, entries); assert_rc_equal(rc, 0); + /* Commit the punch ilog. */ + id.id_epoch = epoch; + id.id_tx_id = current_tx_id.id_tx_id; + rc = ilog_persist(loh, &id); + assert_rc_equal(rc, 0); + + version_cache_fetch(&version_cache, loh, true); + /** Same epoch, different transaction, same operation. In other * words, both the existing entry and this one are punches so * we get back -DER_ALREADY because the existing entry covers diff --git a/src/vos/vos_common.c b/src/vos/vos_common.c index 14ac64071176..10d433ff1d56 100644 --- a/src/vos/vos_common.c +++ b/src/vos/vos_common.c @@ -294,7 +294,7 @@ vos_tx_end(struct vos_container *cont, struct dtx_handle *dth_in, dae->dae_preparing = 0; } - if (unlikely(dth->dth_need_validation && dth->dth_active)) { + if (err == 0 && unlikely(dth->dth_need_validation && dth->dth_active)) { /* Aborted by race during the yield for local TX commit. 
*/ rc = vos_dtx_validation(dth); switch (rc) { diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index cb87ba798663..bda2af0cf77f 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -138,6 +138,7 @@ dtx_inprogress(struct vos_dtx_act_ent *dae, struct dtx_handle *dth, dsp->dsp_xid = DAE_XID(dae); dsp->dsp_oid = DAE_OID(dae); dsp->dsp_epoch = DAE_EPOCH(dae); + dsp->dsp_version = DAE_VER(dae); dsp->dsp_dkey_hash = DAE_DKEY_HASH(dae); mbs = (struct dtx_memberships *)(dsp + 1); @@ -1425,6 +1426,46 @@ vos_dtx_validation(struct dtx_handle *dth) return vos_dtx_status(dae); } +static int +vos_dtx_active(struct dtx_handle *dth) +{ + struct vos_dtx_act_ent *dae = dth->dth_ent; + struct vos_container *cont; + struct vos_cont_df *cont_df; + struct umem_instance *umm; + struct vos_dtx_blob_df *dbd; + int rc = 0; + + if (dae->dae_dbd != NULL) + goto out; + + cont = vos_hdl2cont(dth->dth_coh); + cont_df = cont->vc_cont_df; + umm = vos_cont2umm(cont); + dbd = umem_off2ptr(umm, cont_df->cd_dtx_active_tail); + + if (dbd == NULL || dbd->dbd_index >= dbd->dbd_cap) { + rc = vos_dtx_extend_act_table(cont); + if (rc != 0) + goto out; + + dbd = umem_off2ptr(umm, cont_df->cd_dtx_active_tail); + } + + D_ASSERT(dbd->dbd_magic == DTX_ACT_BLOB_MAGIC); + + dae->dae_df_off = cont_df->cd_dtx_active_tail + + offsetof(struct vos_dtx_blob_df, dbd_active_data) + + sizeof(struct vos_dtx_act_ent_df) * dbd->dbd_index; + dae->dae_dbd = dbd; + +out: + if (rc == 0) + dth->dth_active = 1; + + return rc; +} + /* The caller has started local transaction. */ int vos_dtx_register_record(struct umem_instance *umm, umem_off_t record, @@ -1439,6 +1480,10 @@ vos_dtx_register_record(struct umem_instance *umm, umem_off_t record, return 0; } + /* + * Check whether someone touched the DTX before we registering modification + * for the first time (during the prepare, such as bulk data transferring). + */ if (unlikely(dth->dth_need_validation && !dth->dth_active)) { rc = vos_dtx_validation(dth); switch (rc) { @@ -1489,32 +1534,9 @@ vos_dtx_register_record(struct umem_instance *umm, umem_off_t record, } if (!dth->dth_active) { - struct vos_container *cont; - struct vos_cont_df *cont_df; - struct vos_dtx_blob_df *dbd; - - cont = vos_hdl2cont(dth->dth_coh); - D_ASSERT(cont != NULL); - - umm = vos_cont2umm(cont); - cont_df = cont->vc_cont_df; - - dbd = umem_off2ptr(umm, cont_df->cd_dtx_active_tail); - if (dbd == NULL || dbd->dbd_index >= dbd->dbd_cap) { - rc = vos_dtx_extend_act_table(cont); - if (rc != 0) - goto out; - - dbd = umem_off2ptr(umm, cont_df->cd_dtx_active_tail); - } - - D_ASSERT(dbd->dbd_magic == DTX_ACT_BLOB_MAGIC); - - dae->dae_df_off = cont_df->cd_dtx_active_tail + - offsetof(struct vos_dtx_blob_df, dbd_active_data) + - sizeof(struct vos_dtx_act_ent_df) * dbd->dbd_index; - dae->dae_dbd = dbd; - dth->dth_active = 1; + rc = vos_dtx_active(dth); + if (rc != 0) + goto out; } rc = vos_dtx_append(dth, record, type); @@ -1526,10 +1548,11 @@ vos_dtx_register_record(struct umem_instance *umm, umem_off_t record, } out: - D_DEBUG(DB_TRACE, "Register DTX record for "DF_DTI - ": lid=%d entry %p, type %d, %s ilog entry, rc %d\n", DP_DTI(&dth->dth_xid), - dth->dth_ent == NULL ? 0 : DAE_LID((struct vos_dtx_act_ent *)dth->dth_ent), - dth->dth_ent, type, dth->dth_modify_shared ? "has" : "has not", rc); + DL_CDEBUG(rc == 0 || rc == -DER_ALREADY, DB_TRACE, DLOG_ERR, rc, + "Register DTX record for "DF_DTI": lid=%d entry %p, type %d, %s ilog entry", + DP_DTI(&dth->dth_xid), + dth->dth_ent == NULL ? 
0 : DAE_LID((struct vos_dtx_act_ent *)dth->dth_ent), + dth->dth_ent, type, dth->dth_modify_shared ? "has" : "has not"); return rc; } @@ -1635,8 +1658,24 @@ vos_dtx_prepared(struct dtx_handle *dth, struct vos_dtx_cmt_ent **dce_p) int count; int rc = 0; - if (!dth->dth_active) - return 0; + if (!dth->dth_active) { + /* For resend case, do nothing. */ + if (likely(dth->dth_prepared)) + return 0; + + /* + * Even if the transaction modifies nothing locally, we still need to store + * it persistently. Otherwise, the subsequent DTX resync may not find it as + * to regard it as failed transaction and abort it. + */ + rc = vos_dtx_active(dth); + + DL_CDEBUG(rc != 0, DLOG_ERR, DB_IO, rc, + "Active empty transaction " DF_DTI, DP_DTI(&dth->dth_xid)); + + if (rc != 0) + return rc; + } cont = vos_hdl2cont(dth->dth_coh); D_ASSERT(cont != NULL); @@ -1770,6 +1809,7 @@ vos_dtx_prepared(struct dtx_handle *dth, struct vos_dtx_cmt_ent **dce_p) static struct dtx_memberships * vos_dtx_pack_mbs(struct umem_instance *umm, struct vos_dtx_act_ent *dae) { + struct dtx_handle *dth = dae->dae_dth; struct dtx_memberships *tmp; size_t size; @@ -1783,7 +1823,11 @@ vos_dtx_pack_mbs(struct umem_instance *umm, struct vos_dtx_act_ent *dae) tmp->dm_data_size = DAE_MBS_DSIZE(dae); tmp->dm_flags = DAE_MBS_FLAGS(dae); tmp->dm_dte_flags = DAE_FLAGS(dae); - if (tmp->dm_data_size <= sizeof(DAE_MBS_INLINE(dae))) + + /* The DTX is not prepared yet, copy the MBS from DTX handle. */ + if (dth != NULL) + memcpy(tmp->dm_data, dth->dth_mbs->dm_data, tmp->dm_data_size); + else if (tmp->dm_data_size <= sizeof(DAE_MBS_INLINE(dae))) memcpy(tmp->dm_data, DAE_MBS_INLINE(dae), tmp->dm_data_size); else memcpy(tmp->dm_data, umem_off2ptr(umm, DAE_MBS_OFF(dae)), @@ -1905,10 +1949,12 @@ vos_dtx_check(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, } int -vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, struct dtx_memberships **mbs) +vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t *oid, + struct dtx_memberships **mbs) { struct vos_container *cont; struct dtx_memberships *tmp; + struct vos_dtx_act_ent *dae; d_iov_t kiov; d_iov_t riov; int rc; @@ -1920,14 +1966,24 @@ vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, struct dtx_memberships * d_iov_set(&riov, NULL, 0); rc = dbtree_lookup(cont->vc_dtx_active_hdl, &kiov, &riov); if (rc == 0) { - tmp = vos_dtx_pack_mbs(vos_cont2umm(cont), riov.iov_buf); - if (tmp == NULL) + dae = riov.iov_buf; + tmp = vos_dtx_pack_mbs(vos_cont2umm(cont), dae); + if (tmp == NULL) { rc = -DER_NOMEM; - else + } else { + if (oid != NULL) + *oid = DAE_OID(dae); *mbs = tmp; + } + } else if (rc == -DER_NONEXIST) { + rc = dbtree_lookup(cont->vc_dtx_committed_hdl, &kiov, &riov); + if (rc == 0) + rc = 1; + else if (rc == -DER_NONEXIST && !cont->vc_cmt_dtx_indexed) + rc = -DER_INPROGRESS; } - if (rc != 0) + if (rc < 0) D_ERROR("Failed to load mbs for "DF_DTI": "DF_RC"\n", DP_DTI(dti), DP_RC(rc)); return rc; @@ -2623,9 +2679,7 @@ vos_dtx_mark_committable(struct dtx_handle *dth) { struct vos_dtx_act_ent *dae = dth->dth_ent; - if (dth->dth_active) { - D_ASSERT(dae != NULL); - + if (dae != NULL) { dae->dae_committable = 1; DAE_FLAGS(dae) &= ~(DTE_CORRUPTED | DTE_ORPHAN); }
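[Reviewer note] With this change, vos_dtx_load_mbs() gains a small return-code convention: 0 means the active DTX entry was found and its membership packed, 1 means the DTX is already committed, -DER_INPROGRESS means the committed-DTX index has not been rebuilt yet, and other negatives are failures. The sketch below only illustrates how a caller might branch on that convention; the lookup is a stub and the error values are stand-ins.

```c
/* Standalone sketch of a caller branching on the load-mbs return convention
 * introduced by the patch.  The lookup below is a stub, not the VOS call.
 */
#include <stdio.h>

#define ERR_INPROGRESS	(-11)
#define ERR_NONEXIST	(-5)

/* Stub standing in for the VOS lookup; just echoes the requested scenario. */
static int load_mbs_stub(int scenario)
{
	return scenario;
}

static const char *interpret(int rc)
{
	if (rc == 0)
		return "active entry found, membership loaded";
	if (rc == 1)
		return "already committed, nothing left to do";
	if (rc == ERR_INPROGRESS)
		return "committed index not rebuilt yet, retry later";
	if (rc == ERR_NONEXIST)
		return "unknown DTX, handle per caller policy";
	return "hard failure";
}

int main(void)
{
	int          scenarios[] = { 0, 1, ERR_INPROGRESS, ERR_NONEXIST, -1 };
	unsigned int i;

	for (i = 0; i < sizeof(scenarios) / sizeof(scenarios[0]); i++) {
		int rc = load_mbs_stub(scenarios[i]);

		printf("rc=%3d: %s\n", rc, interpret(rc));
	}
	return 0;
}
```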