DAOS-14679 pool: Report on stopping sp_stopping (daos-stack#14374)
When destroying a pool on an engine (as part of a pool destroy command),
we may block in

  ds_pool_stop
    pool_put_sync // putting the last reference
      pool_free_ref // called after the hash rec deletion
        pool_child_delete_one
          pool_child_stop

waiting for some ds_pool_child references. If the user retries the pool
destroy command, the new ds_pool_stop call on this engine cannot find the
ds_pool object, because the object is in the sp_stopping state, so the
retried pool destroy command usually succeeds even though the storage
capacity allocated to the pool has not been released yet.
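
The retry can "succeed" because the old ds_pool_stop looks the pool up
with ds_pool_lookup, which skips any pool in the sp_stopping state. A
minimal sketch of the pre-patch flow (condensed from the old code in
src/pool/srv_target.c, not verbatim):

  void
  ds_pool_stop(uuid_t uuid)
  {
          struct ds_pool *pool;

          ds_pool_lookup(uuid, &pool);    /* NULL once sp_stopping is set */
          if (pool == NULL)
                  return;                 /* retried destroy carries on */
          D_ASSERT(!pool->sp_stopping);
          /* ... disconnect, abort rebuild, drop references ... */
  }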

This patch makes the following changes (the resulting ds_pool_stop flow is
sketched after the list):

  - Move the pool_child_delete_one collective call from pool_free_ref to
    before the ds_pool_put call in ds_pool_stop. This makes sure that
    the ds_pool object remains in the LRU cache in the sp_stopping state
    while we wait for ds_pool_child references (or some other upper
    layer resource).

  - Remove the pool_put_sync trick, which was introduced because of the
    pool_child_delete_one collective call in pool_free_ref.

  - Return an error from ds_pool_stop when the ds_pool object is in the
    sp_stopping state, so that the new pool destroy command in the
    aforementioned scenario will return an explicit error.

  - Register a reply aggregation callback for MGMT_TGT_DESTROY so that
    an error can reach the control plane.
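
A condensed sketch of the resulting ds_pool_stop flow (simplified from
the new code in src/pool/srv_target.c; logging and some teardown steps
omitted):

  int
  ds_pool_stop(uuid_t uuid)
  {
          struct ds_pool *pool;

          ds_pool_lookup_internal(uuid, &pool);   /* also finds stopping pools */
          if (pool == NULL)
                  return 0;                       /* nothing to stop */
          if (pool->sp_stopping) {
                  ds_pool_put(pool);
                  return -DER_AGAIN;              /* surfaced via the new
                                                     MGMT_TGT_DESTROY aggregator */
          }
          pool->sp_stopping = 1;

          /* disconnect handles, abort rebuild and migration, ... */

          ds_pool_put(pool);                      /* reference held by ds_pool_start */

          while (!daos_lru_is_last_user(&pool->sp_entry))
                  dss_sleep(1000 /* ms */);       /* wait for ds_pool_child users */

          pool_child_delete_all(pool);            /* moved out of pool_free_ref */
          ds_pool_put(pool);                      /* our lookup reference */
          return 0;
  }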

Signed-off-by: Li Wei <wei.g.li@intel.com>
Required-githooks: true
liw committed Jul 25, 2024
1 parent 67ad1d0 commit f38ddad
Showing 6 changed files with 95 additions and 69 deletions.
2 changes: 1 addition & 1 deletion src/include/daos_srv/pool.h
@@ -278,7 +278,7 @@ int ds_pool_tgt_map_update(struct ds_pool *pool, struct pool_buf *buf,
bool ds_pool_skip_for_check(struct ds_pool *pool);
int ds_pool_start_after_check(uuid_t uuid);
int ds_pool_start(uuid_t uuid, bool aft_chk);
void ds_pool_stop(uuid_t uuid);
int ds_pool_stop(uuid_t uuid);
int dsc_pool_svc_extend(uuid_t pool_uuid, d_rank_list_t *svc_ranks, uint64_t deadline, int ntargets,
const d_rank_list_t *rank_list, int ndomains, const uint32_t *domains);
int dsc_pool_svc_update_target_state(uuid_t pool_uuid, d_rank_list_t *ranks, uint64_t deadline,
3 changes: 2 additions & 1 deletion src/mgmt/rpc.h
@@ -34,7 +34,8 @@
#define MGMT_PROTO_SRV_RPC_LIST \
X(MGMT_TGT_CREATE, 0, &CQF_mgmt_tgt_create, ds_mgmt_hdlr_tgt_create, \
&ds_mgmt_hdlr_tgt_create_co_ops) \
X(MGMT_TGT_DESTROY, 0, &CQF_mgmt_tgt_destroy, ds_mgmt_hdlr_tgt_destroy, NULL) \
X(MGMT_TGT_DESTROY, 0, &CQF_mgmt_tgt_destroy, ds_mgmt_hdlr_tgt_destroy, \
&ds_mgmt_hdlr_tgt_destroy_co_ops) \
X(MGMT_TGT_PARAMS_SET, 0, &CQF_mgmt_tgt_params_set, ds_mgmt_tgt_params_set_hdlr, NULL) \
X(MGMT_TGT_PROFILE, 0, &CQF_mgmt_profile, ds_mgmt_tgt_profile_hdlr, NULL) \
X(MGMT_TGT_MAP_UPDATE, 0, &CQF_mgmt_tgt_map_update, ds_mgmt_hdlr_tgt_map_update, \
4 changes: 4 additions & 0 deletions src/mgmt/srv.c
@@ -33,6 +33,10 @@ static struct crt_corpc_ops ds_mgmt_hdlr_tgt_create_co_ops = {
.co_post_reply = ds_mgmt_tgt_create_post_reply,
};

static struct crt_corpc_ops ds_mgmt_hdlr_tgt_destroy_co_ops = {
        .co_aggregate = ds_mgmt_tgt_destroy_aggregator
};

static struct crt_corpc_ops ds_mgmt_hdlr_tgt_map_update_co_ops = {
.co_aggregate = ds_mgmt_tgt_map_update_aggregator,
.co_pre_forward = ds_mgmt_tgt_map_update_pre_forward,
1 change: 1 addition & 0 deletions src/mgmt/srv_internal.h
@@ -158,6 +158,7 @@ int ds_mgmt_tgt_setup(void);
void ds_mgmt_tgt_cleanup(void);
void ds_mgmt_hdlr_tgt_create(crt_rpc_t *rpc_req);
void ds_mgmt_hdlr_tgt_destroy(crt_rpc_t *rpc_req);
int ds_mgmt_tgt_destroy_aggregator(crt_rpc_t *source, crt_rpc_t *result, void *priv);
void ds_mgmt_hdlr_tgt_shard_destroy(crt_rpc_t *rpc_req);
int ds_mgmt_tgt_create_aggregator(crt_rpc_t *source, crt_rpc_t *result,
void *priv);
15 changes: 14 additions & 1 deletion src/mgmt/srv_target.c
@@ -1404,7 +1404,9 @@ ds_mgmt_hdlr_tgt_destroy(crt_rpc_t *td_req)
goto out;
}

ds_pool_stop(td_in->td_pool_uuid);
rc = ds_pool_stop(td_in->td_pool_uuid);
if (rc != 0)
goto out;

/** generate path to the target directory */
rc = ds_mgmt_tgt_file(td_in->td_pool_uuid, NULL, NULL, &path);
@@ -1449,6 +1451,17 @@ ds_mgmt_hdlr_tgt_destroy(crt_rpc_t *td_req)
crt_reply_send(td_req);
}

int
ds_mgmt_tgt_destroy_aggregator(crt_rpc_t *source, crt_rpc_t *result, void *priv)
{
        struct mgmt_tgt_destroy_out *out_source = crt_reply_get(source);
        struct mgmt_tgt_destroy_out *out_result = crt_reply_get(result);

        if (out_source->td_rc != 0)
                out_result->td_rc = out_source->td_rc;
        return 0;
}

/**
* Set parameter on a single target.
*/
139 changes: 73 additions & 66 deletions src/pool/srv_target.c
@@ -715,6 +715,42 @@ pool_child_delete_one(void *uuid)
return 0;
}

static void
pool_child_delete_all(struct ds_pool *pool)
{
        int rc;

        rc = ds_pool_thread_collective(pool->sp_uuid, 0, pool_child_delete_one, pool->sp_uuid, 0);
        if (rc == 0)
                D_INFO(DF_UUID ": deleted\n", DP_UUID(pool->sp_uuid));
        else if (rc == -DER_CANCELED)
                D_INFO(DF_UUID ": no ESs\n", DP_UUID(pool->sp_uuid));
        else
                DL_ERROR(rc, DF_UUID ": failed to delete ES pool caches", DP_UUID(pool->sp_uuid));
}

static int
pool_child_add_all(struct ds_pool *pool)
{
        struct pool_child_lookup_arg collective_arg = {
                .pla_pool = pool,
                .pla_uuid = pool->sp_uuid,
                .pla_map_version = pool->sp_map_version
        };
        int rc;

        rc = ds_pool_thread_collective(pool->sp_uuid, 0, pool_child_add_one, &collective_arg,
                                       DSS_ULT_DEEP_STACK);
        if (rc == 0) {
                D_INFO(DF_UUID ": added\n", DP_UUID(pool->sp_uuid));
        } else {
                DL_ERROR(rc, DF_UUID ": failed to add ES pool caches", DP_UUID(pool->sp_uuid));
                pool_child_delete_all(pool);
                return rc;
        }
        return 0;
}

/* ds_pool ********************************************************************/

static struct daos_lru_cache *pool_cache;
@@ -725,16 +761,6 @@ pool_obj(struct daos_llink *llink)
return container_of(llink, struct ds_pool, sp_entry);
}

static inline void
pool_put_sync(void *args)
{
struct ds_pool *pool = args;

D_ASSERT(pool != NULL);
D_ASSERT(dss_get_module_info()->dmi_xs_id == 0);
daos_lru_ref_release(pool_cache, &pool->sp_entry);
}

struct ds_pool_create_arg {
uint32_t pca_map_version;
};
Expand All @@ -745,7 +771,6 @@ pool_alloc_ref(void *key, unsigned int ksize, void *varg,
{
struct ds_pool_create_arg *arg = varg;
struct ds_pool *pool;
struct pool_child_lookup_arg collective_arg;
char group_id[DAOS_UUID_STR_SIZE];
struct dss_module_info *info = dss_get_module_info();
unsigned int iv_ns_id;
@@ -817,22 +842,9 @@ pool_alloc_ref(void *key, unsigned int ksize, void *varg,
goto err_group;
}

collective_arg.pla_pool = pool;
collective_arg.pla_uuid = key;
collective_arg.pla_map_version = arg->pca_map_version;
rc = ds_pool_thread_collective(pool->sp_uuid, 0, pool_child_add_one,
&collective_arg, DSS_ULT_DEEP_STACK);
if (rc != 0) {
D_ERROR(DF_UUID": failed to add ES pool caches: "DF_RC"\n",
DP_UUID(key), DP_RC(rc));
goto err_iv_ns;
}

*link = &pool->sp_entry;
return 0;

err_iv_ns:
ds_iv_ns_put(pool->sp_iv_ns);
err_group:
rc_tmp = crt_group_secondary_destroy(pool->sp_group);
if (rc_tmp != 0)
@@ -865,13 +877,6 @@ pool_free_ref(struct daos_llink *llink)
D_DEBUG(DB_MGMT, DF_UUID": freeing\n", DP_UUID(pool->sp_uuid));

D_ASSERT(d_list_empty(&pool->sp_hdls));
rc = ds_pool_thread_collective(pool->sp_uuid, 0, pool_child_delete_one,
pool->sp_uuid, 0);
if (rc == -DER_CANCELED)
D_DEBUG(DB_MD, DF_UUID": no ESs\n", DP_UUID(pool->sp_uuid));
else if (rc != 0)
D_ERROR(DF_UUID": failed to delete ES pool caches: "DF_RC"\n",
DP_UUID(pool->sp_uuid), DP_RC(rc));

ds_cont_ec_eph_free(pool);

@@ -970,7 +975,7 @@ ds_pool_lookup(const uuid_t uuid, struct ds_pool **pool)

if ((*pool)->sp_stopping) {
D_DEBUG(DB_MD, DF_UUID": is in stopping\n", DP_UUID(uuid));
pool_put_sync(*pool);
ds_pool_put(*pool);
*pool = NULL;
return -DER_SHUTDOWN;
}
@@ -989,31 +994,9 @@ ds_pool_get(struct ds_pool *pool)
void
ds_pool_put(struct ds_pool *pool)
{
int rc;

/*
* Someone has stopped the pool. Current user may be the one that is holding the last
* reference on the pool, then drop such reference will trigger pool_free_ref() as to
* stop related container that may wait current user (ULT) to exit. To avoid deadlock,
* let's use independent ULT to drop the reference asynchronously and make current ULT
* to go ahead.
*
* An example of the deadlock scenarios is something like that:
*
* cont_iv_prop_fetch_ult => ds_pool_put => pool_free_ref [WAIT]=> cont_child_stop =>
* cont_stop_agg [WAIT]=> cont_agg_ult => ds_cont_csummer_init => ds_cont_get_props =>
* cont_iv_prop_fetch [WAIT]=> cont_iv_prop_fetch_ult
*/
if (unlikely(pool->sp_stopping) && daos_lru_is_last_user(&pool->sp_entry)) {
rc = dss_ult_create(pool_put_sync, pool, DSS_XS_SELF, 0, 0, NULL);
if (unlikely(rc != 0)) {
D_ERROR("Failed to create ULT to async put ref on the pool "DF_UUID"\n",
DP_UUID(pool->sp_uuid));
pool_put_sync(pool);
}
} else {
pool_put_sync(pool);
}
D_ASSERT(pool != NULL);
D_ASSERT(dss_get_module_info()->dmi_xs_id == 0);
daos_lru_ref_release(pool_cache, &pool->sp_entry);
}

void
@@ -1187,13 +1170,17 @@ ds_pool_start(uuid_t uuid, bool aft_chk)
else
pool->sp_cr_checked = 0;

rc = pool_child_add_all(pool);
if (rc != 0)
goto failure_pool;

if (!ds_pool_skip_for_check(pool)) {
rc = dss_ult_create(pool_fetch_hdls_ult, pool, DSS_XS_SYS, 0,
DSS_DEEP_STACK_SZ, NULL);
if (rc != 0) {
D_ERROR(DF_UUID": failed to create fetch ult: "DF_RC"\n",
DP_UUID(uuid), DP_RC(rc));
D_GOTO(failure_pool, rc);
goto failure_children;
}

pool->sp_fetch_hdls = 1;
@@ -1212,8 +1199,10 @@

failure_ult:
pool_fetch_hdls_ult_abort(pool);
failure_children:
pool_child_delete_all(pool);
failure_pool:
pool_put_sync(pool);
ds_pool_put(pool);
return rc;
}

@@ -1242,18 +1231,27 @@ pool_tgt_disconnect_all(struct ds_pool *pool)
* Stop a pool. Must be called on the system xstream. Release the ds_pool
* object reference held by ds_pool_start. Only for mgmt and pool modules.
*/
void
int
ds_pool_stop(uuid_t uuid)
{
struct ds_pool *pool;

ds_pool_failed_remove(uuid);

ds_pool_lookup(uuid, &pool);
if (pool == NULL)
return;
D_ASSERT(!pool->sp_stopping);
ds_pool_lookup_internal(uuid, &pool);
if (pool == NULL) {
D_INFO(DF_UUID ": not found\n", DP_UUID(uuid));
return 0;
}
if (pool->sp_stopping) {
int rc = -DER_AGAIN;

DL_INFO(rc, DF_UUID ": already stopping", DP_UUID(uuid));
ds_pool_put(pool);
return rc;
}
pool->sp_stopping = 1;
D_INFO(DF_UUID ": stopping\n", DP_UUID(uuid));

pool_tgt_disconnect_all(pool);

@@ -1263,9 +1261,18 @@ ds_pool_stop(uuid_t uuid)

ds_rebuild_abort(pool->sp_uuid, -1, -1, -1);
ds_migrate_stop(pool, -1, -1);

ds_pool_put(pool); /* held by ds_pool_start */
pool_put_sync(pool);
D_INFO(DF_UUID": pool stopped\n", DP_UUID(uuid));

while (!daos_lru_is_last_user(&pool->sp_entry))
dss_sleep(1000 /* ms */);
D_INFO(DF_UUID ": completed reference wait\n", DP_UUID(uuid));

pool_child_delete_all(pool);

ds_pool_put(pool);
D_INFO(DF_UUID ": stopped\n", DP_UUID(uuid));
return 0;
}

/* ds_pool_hdl ****************************************************************/
Expand Down

0 comments on commit f38ddad

Please sign in to comment.