Skip to content

Commit

Permalink
DAOS-14105 object: latency metrics for collective punch
Browse files Browse the repository at this point in the history
For locating the performance bottleneck.

Signed-off-by: Fan Yong <fan.yong@intel.com>
  • Loading branch information
Nasf-Fan committed Dec 12, 2023
1 parent 8c9c87b commit 15704d4
Show file tree
Hide file tree
Showing 7 changed files with 249 additions and 67 deletions.
3 changes: 2 additions & 1 deletion src/include/daos_srv/dtx_srv.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,8 @@ struct dtx_coll_entry {
uint8_t *dce_hints;
uint8_t *dce_bitmap;
uint32_t dce_hint_sz;
uint32_t dce_bitmap_sz;
uint16_t dce_bitmap_sz;
uint16_t dce_tgt_nr;
};

struct dtx_leader_handle;
Expand Down
2 changes: 2 additions & 0 deletions src/object/obj_rpc.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ enum obj_rpc_flags {
ORF_REBUILDING_IO = (1 << 23),
/* 'sgls' is NULL, for update sub-request of CPD RPC. */
ORF_EMPTY_SGL = (1 << 24),
/* Internal transaction for punch object. */
ORF_INTERNAL_PUNCH = (1 << 25),
};

/* common for update/fetch */
Expand Down
6 changes: 6 additions & 0 deletions src/object/obj_tx.c
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ struct dc_tx {
tx_retry:1, /** Retry the commit RPC. */
tx_set_resend:1, /** Set 'resend' flag. */
tx_for_convert:1,
tx_internal_punch:1,
tx_has_cond:1,
tx_renew:1,
tx_closed:1,
Expand Down Expand Up @@ -1918,6 +1919,9 @@ dc_tx_commit_prepare(struct dc_tx *tx, tse_task_t *task)
if (rc != 0)
goto out;
}

if (tx->tx_for_convert)
tx->tx_internal_punch = 1;
} else {
grp_idx = obj_dkey2grpidx(obj, dcsr->dcsr_dkey_hash,
tx->tx_pm_ver);
Expand Down Expand Up @@ -2311,6 +2315,8 @@ dc_tx_commit_trigger(tse_task_t *task, struct dc_tx *tx, daos_tx_commit_t *args)
tx->tx_renew = 0;
if (tx->tx_reintegrating)
oci->oci_flags |= ORF_REINTEGRATING_IO;
if (tx->tx_internal_punch)
oci->oci_flags |= ORF_INTERNAL_PUNCH;

oci->oci_sub_heads.ca_arrays = &tx->tx_head;
oci->oci_sub_heads.ca_count = 1;
Expand Down
69 changes: 26 additions & 43 deletions src/object/srv_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,16 @@ struct obj_tls {

struct d_tm_node_t *ot_update_bio_lat[NR_LATENCY_BUCKETS];
struct d_tm_node_t *ot_fetch_bio_lat[NR_LATENCY_BUCKETS];

struct d_tm_node_t *ot_coll_punch_map_lat[NR_LATENCY_BUCKETS];
struct d_tm_node_t *ot_coll_punch_layout_lat[NR_LATENCY_BUCKETS];
struct d_tm_node_t *ot_coll_punch_parse_lat[NR_LATENCY_BUCKETS];
struct d_tm_node_t *ot_coll_punch_lexec_lat[NR_LATENCY_BUCKETS];
struct d_tm_node_t *ot_coll_punch_texec_lat[NR_LATENCY_BUCKETS];

struct d_tm_node_t *ot_cpd_punch_lexec_lat[NR_LATENCY_BUCKETS];
struct d_tm_node_t *ot_cpd_punch_texec_lat[NR_LATENCY_BUCKETS];
struct d_tm_node_t *ot_cpd_others_lat[NR_LATENCY_BUCKETS];
};

static inline struct obj_tls *
Expand All @@ -186,55 +196,28 @@ lat_bucket(uint64_t size)
return 56 - nr;
}

/**
 * Map a value to a logarithmic bucket index.
 *
 * The index is the number of one-bit right shifts needed to reduce \a size
 * to zero (0 for an input of 0, 1 for an input of 1, 2 for 2..3, and so on),
 * capped at NR_LATENCY_BUCKETS - 1.
 *
 * \param[in] size	Value to classify.
 *
 * \return		Bucket index in the range [0, NR_LATENCY_BUCKETS - 1].
 */
static inline int
lat_log_bucket(uint64_t size)
{
	int	nbits = 0;

	/* Count the significant bits of the input. */
	while (size != 0) {
		size >>= 1;
		nbits++;
	}

	/* Everything at or past the last bucket collapses into the last bucket. */
	if (nbits >= NR_LATENCY_BUCKETS)
		return NR_LATENCY_BUCKETS - 1;

	return nbits;
}

/**
 * Kinds of latency samples.
 *
 * The numeric values are the same ones the compiler would assign implicitly
 * (0, 1, 2, ...); they are spelled out for readability only.
 */
enum latency_type {
	/* Selects the ot_{fetch,update}_bulk_lat sensors in obj_update_latency(). */
	BULK_LATENCY	= 0,
	/* Selects the ot_{fetch,update}_bio_lat sensors in obj_update_latency(). */
	BIO_LATENCY	= 1,
	/* Selects the ot_{fetch,update}_vos_lat sensors in obj_update_latency(). */
	VOS_LATENCY	= 2,
	/*
	 * NOTE(review): the five types below are not consumed by any code visible
	 * here; presumably they pair, in order, with the "coll_punch_map",
	 * "coll_punch_layout", "coll_punch_parse", "coll_punch_lexec" and
	 * "coll_punch_texec" sensors - confirm against the callers.
	 */
	FIND_MAP	= 3,
	GEN_LAYOUT	= 4,
	COLL_PARSE	= 5,
	LOCAL_EXEC	= 6,
	TOTAL_EXEC	= 7,
};

/**
 * Record one latency sample for a fetch or update RPC.
 *
 * The sample is stored with d_tm_set_gauge() into the sensor chosen by the
 * opcode (fetch vs. update), the latency type and the lat_bucket() of the
 * I/O size, all taken from the obj_tls returned by obj_tls_get().
 *
 * \param[in] opc	RPC opcode. Only DAOS_OBJ_RPC_FETCH, DAOS_OBJ_RPC_UPDATE
 *			and DAOS_OBJ_RPC_TGT_UPDATE are recorded; every other
 *			opcode is silently ignored.
 * \param[in] type	BULK_LATENCY, BIO_LATENCY or VOS_LATENCY. Any other
 *			value trips D_ASSERT(0) and nothing is recorded.
 * \param[in] latency	Elapsed time. It is shifted right by 10 bits (divided
 *			by 1024) before being stored, as an approximate
 *			conversion to microseconds; presumably the input is in
 *			nanoseconds - confirm with the callers.
 * \param[in] io_size	I/O size handed to lat_bucket() to pick the bucket.
 */
static inline void
obj_update_latency(uint32_t opc, uint32_t type, uint64_t latency, uint64_t io_size)
{
	struct obj_tls		*tls = obj_tls_get();
	struct d_tm_node_t	*lat;

	latency >>= 10; /* convert to micro seconds */

	if (opc == DAOS_OBJ_RPC_FETCH) {
		switch (type) {
		case BULK_LATENCY:
			lat = tls->ot_fetch_bulk_lat[lat_bucket(io_size)];
			break;
		case BIO_LATENCY:
			lat = tls->ot_fetch_bio_lat[lat_bucket(io_size)];
			break;
		case VOS_LATENCY:
			lat = tls->ot_fetch_vos_lat[lat_bucket(io_size)];
			break;
		default:
			D_ASSERT(0);
			/*
			 * If the assertion does not abort (e.g. NDEBUG build),
			 * 'lat' was never assigned: do not hand it to the
			 * telemetry layer.
			 */
			return;
		}
	} else if (opc == DAOS_OBJ_RPC_UPDATE || opc == DAOS_OBJ_RPC_TGT_UPDATE) {
		switch (type) {
		case BULK_LATENCY:
			lat = tls->ot_update_bulk_lat[lat_bucket(io_size)];
			break;
		case BIO_LATENCY:
			lat = tls->ot_update_bio_lat[lat_bucket(io_size)];
			break;
		case VOS_LATENCY:
			lat = tls->ot_update_vos_lat[lat_bucket(io_size)];
			break;
		default:
			D_ASSERT(0);
			/* Same as above: never use an unassigned 'lat'. */
			return;
		}
	} else {
		/* Ignore other ops for the moment */
		return;
	}

	d_tm_set_gauge(lat, latency);
}

struct ds_obj_exec_arg {
crt_rpc_t *rpc;
struct obj_io_context *ioc;
Expand Down
27 changes: 20 additions & 7 deletions src/object/srv_mod.c
Original file line number Diff line number Diff line change
Expand Up @@ -125,10 +125,6 @@ obj_tls_init(int tags, int xs_id, int tgt_id)

D_INIT_LIST_HEAD(&tls->ot_pool_list);

if (tgt_id < 0)
/** skip sensor setup on system xstreams */
return tls;

/** register different per-opcode sensors */
for (opc = 0; opc < OBJ_PROTO_CLI_COUNT; opc++) {
/** Start with number of active requests, of type gauge */
Expand All @@ -140,9 +136,8 @@ obj_tls_init(int tags, int xs_id, int tgt_id)
D_WARN("Failed to create active counter: "DF_RC"\n",
DP_RC(rc));

if (opc == DAOS_OBJ_RPC_UPDATE ||
opc == DAOS_OBJ_RPC_TGT_UPDATE ||
opc == DAOS_OBJ_RPC_FETCH)
if (opc == DAOS_OBJ_RPC_UPDATE || opc == DAOS_OBJ_RPC_TGT_UPDATE ||
opc == DAOS_OBJ_RPC_FETCH || opc == DAOS_OBJ_RPC_COLL_PUNCH)
/** See below, latency reported per size for those */
continue;

Expand Down Expand Up @@ -184,6 +179,24 @@ obj_tls_init(int tags, int xs_id, int tgt_id)
obj_latency_tm_init(DAOS_OBJ_RPC_FETCH, tgt_id, tls->ot_fetch_bio_lat,
"bio_fetch", "BIO fetch processing time");

obj_latency_tm_init(DAOS_OBJ_RPC_COLL_PUNCH, tgt_id, tls->ot_coll_punch_map_lat,
"coll_punch_map", "Find pool map processing time");
obj_latency_tm_init(DAOS_OBJ_RPC_COLL_PUNCH, tgt_id, tls->ot_coll_punch_layout_lat,
"coll_punch_layout", "Generate object layout processing time");
obj_latency_tm_init(DAOS_OBJ_RPC_COLL_PUNCH, tgt_id, tls->ot_coll_punch_parse_lat,
"coll_punch_parse", "Parse layout processing time");
obj_latency_tm_init(DAOS_OBJ_RPC_COLL_PUNCH, tgt_id, tls->ot_coll_punch_lexec_lat,
"coll_punch_lexec", "Local collective exec processing time");
obj_latency_tm_init(DAOS_OBJ_RPC_COLL_PUNCH, tgt_id, tls->ot_coll_punch_texec_lat,
"coll_punch_texec", "Total collective exec processing time");

obj_latency_tm_init(DAOS_OBJ_RPC_CPD, tgt_id, tls->ot_cpd_punch_lexec_lat,
"cpd_punch_lexec", "Local cpd punch processing time");
obj_latency_tm_init(DAOS_OBJ_RPC_CPD, tgt_id, tls->ot_cpd_punch_texec_lat,
"cpd_punch_texec", "Total cpd punch processing time");
obj_latency_tm_init(DAOS_OBJ_RPC_CPD, tgt_id, tls->ot_cpd_others_lat,
"cpd_others", "Other CPD operation processing time");

return tls;
}

Expand Down
Loading

0 comments on commit 15704d4

Please sign in to comment.