From a09100956633c8a94bd5bed4fe68ab15427b66b0 Mon Sep 17 00:00:00 2001
From: evan-forbes
Date: Wed, 23 Dec 2020 15:00:09 -0600
Subject: [PATCH 1/3] plotting: added a script to generate a report for the results

---
 README.md          |  22 +++++
 generate_report.py | 221 ++++++++++++++++++++++++++++++++++++++++++++++
 simple_plotter.py  |   8 --
 3 files changed, 243 insertions(+), 8 deletions(-)
 create mode 100644 generate_report.py
 delete mode 100644 simple_plotter.py

diff --git a/README.md b/README.md
index bd001e0..b32613c 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,28 @@ To update the go-ipfs, run:
 > make go.mod IPFS_VERSION=version
 ```
 
+## Generating a Report
+Make sure you have matplotlib and pandas installed:
+```bash
+> pip3 install matplotlib pandas
+```
+Then use the Python script to create a PDF of the results:
+```bash
+> python3 generate_report.py /path/to/results/measurements
+```
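+
+The script expects each node's measurements to live in its own
+`dag-experiments-node-<i>` directory. A sketch of the assumed layout,
+derived from `file_pattern` in `generate_report.py`:
+```
+measurements/
+├── dag-experiments-node-0/
+│   ├── sample_latencies.json
+│   └── da_proof_latencies.json
+├── dag-experiments-node-1/
+│   └── ...
+└── ...
+```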
 ## License
 
 MIT
diff --git a/generate_report.py b/generate_report.py
new file mode 100644
index 0000000..7cc8219
--- /dev/null
+++ b/generate_report.py
@@ -0,0 +1,221 @@
+import sys
+import os
+from collections import Counter
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.ticker as tick
+import matplotlib.mlab as mlab
+import matplotlib.backends.backend_pdf
+
+regions = ["LON1", "AMS3", "FRA1", "NYC3", "BLR1", "SFO2", "NYC3-2", "SFO2-2", "SGP1", "TOR1", "AMS3-2", "FRA1", "LON1", "NYC3", "SFO2", "SGP1", "TOR1", "AMS3", "FRA1", "LON1", "NYC3", "SFO2", "SGP1", "TOR1"]
+file_pattern = "{}/dag-experiments-node-{}/{}_latencies.json"
+
+path = sys.argv[1]
+# group_num = int(sys.argv[2])
+# assume that each node has its own directory
+num_entries = 375
+num_nodes = 0
+for _, dirs, _ in os.walk(path):
+    num_nodes = len(dirs)
+    break  # only count the top-level directories, one per node
+
+################################
+# Utilities
+################################
+
+# divide each latency by one million to convert nanoseconds to milliseconds
+def scale_to_ms(df, col):
+    df[col] = df[col] / 1000000
+    return df
+
+def scale_df_to_ms(df):
+    for col in df:
+        df = scale_to_ms(df, col)
+    return df
+
+# combine each column of a df into a single column
+# (pass in df.copy() to avoid mutating the original)
+def flatten_df(df):
+    return df.stack().reset_index()
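+
+# e.g. flatten_df(pd.DataFrame({"LON1": [1.0, 2.0], "AMS3": [3.0, 4.0]}))
+# yields a 4-row frame whose column 0 holds all four latency values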
+
+# opens the files into the base data processing dataframes
+def open_files():
+    samples = pd.DataFrame()
+    da_proofs = pd.DataFrame()
+    for i in range(num_nodes):
+        # add each region's sample data
+        samples[regions[i]] = pd.read_json(file_pattern.format(path, i, "sample"))[0]
+        # add each region's proof data
+        da_proofs[regions[i]] = pd.read_json(file_pattern.format(path, i, "da_proof"))[0]
+
+    return samples, da_proofs
+
+def save_figures():
+    pdf = matplotlib.backends.backend_pdf.PdfPages(os.path.join(path, "plots.pdf"))
+    for fig in range(1, plt.gcf().number + 1):  # the final plt.figure() call leaves an empty extra figure
+        pdf.savefig(fig)
+    pdf.close()
+
+################################
+# Plotting
+################################
+
+def plot_average_da_proof(df):
+    # consolidate data into a single series
+    consolidated = flatten_df(df.copy())
+    global_mean, global_std = consolidated.mean(), consolidated.std()
+
+    # process data
+    processed_df = pd.DataFrame(
+        [
+            {
+                "mean": row.mean(),
+                "std": row.std(),
+            }
+            for _, row in df.iterrows()
+        ]
+    )
+
+    # plot errorbars
+    plt.errorbar(
+        x=[x for x in processed_df.index],
+        y=processed_df["mean"],
+        yerr=processed_df["std"],
+        fmt="o",
+    )
+
+    # plot horizontal global lines
+    plt.axhline(y=global_mean[0], linestyle=(0, (5, 1)), color="green", label="Global Average")
+    plt.axhline(y=global_mean[0]-global_std[0], linestyle=(0, (5, 1)), color="orange", label="Global StdDev")
+    plt.axhline(y=global_mean[0]+global_std[0], linestyle=(0, (5, 1)), color="orange")
+
+    # format plot
+    plt.legend(loc="best")
+    plt.title("Average Latency per DA Proof Across All Nodes")
+    plt.xlabel("DA Proof")
+    plt.ylabel("Latency (ms)")
+    plt.figure()
+    return global_mean, global_std
+
+# group the sampling df into batches of 15 samples, one batch per proof
+# pass in a column of the samples_df provided by open_files
+def plot_region_da_proof(df, region, global_mean, global_std, num_proofs):
+    # get the average for the entire region
+    region_mean = df.mean()
+
+    # manually process instead of using groupby
+    processed_df = pd.DataFrame(
+        [
+            {
+                "mean": batch.mean(),
+                "spread": batch.max() - batch.min(),  # min-max range shown as the error bar
+            }
+            for batch in np.split(df, num_proofs)
+        ],
+    )
+
+    # plot horizontal global lines
+    plt.axhline(y=region_mean, linestyle="-", color="fuchsia", label="Region Average")
+    plt.axhline(y=global_mean, linestyle=(0, (5, 1)), color="green", label="Global Average")
+    plt.axhline(y=global_mean-global_std, linestyle=(0, (5, 1)), color="orange", label="Global StdDev")
+    plt.axhline(y=global_mean+global_std, linestyle=(0, (5, 1)), color="orange")
+
+    # plot errorbars
+    plt.errorbar(
+        x=[x for x in processed_df.index],
+        y=processed_df["mean"],
+        yerr=processed_df["spread"],
+        fmt="o",
+    )
+
+    # format plot
+    plt.legend(loc="best")
+    plt.title("Average Latency per DA Proof for {}".format(region))
+    plt.xlabel("DA Proof")
+    plt.ylabel("Latency (ms)")
+    plt.figure()
+
+def plot_latency_hist(df, plot_type, bin_size):
+    # combine all latency data
+    # combined = pd.DataFrame()
+    # for col in df:
+    #     combined = pd.concat([combined, df[col]], ignore_index=True)
+    combined = flatten_df(df)
+
+    # calculate the number of bins needed
+    lo, hi = combined[0].min(), combined[0].max()
+    num_bins = (hi - lo) / bin_size
+
+    # formatting
+    plt.title("{} Latency Distribution for All Nodes".format(plot_type))
+    plt.ylabel("Number of Requests")
+    plt.xlabel("Latency (ms ({}/bar))".format(bin_size))
+    plt.hist(combined[0], bins=int(num_bins))
+    plt.figure()
+
+
+def plot_region_comparisons(df, plot_type, w=0.15):
+    # process the input data
+    processed_df = pd.DataFrame(
+        [
+            {
+                "min": df[region].min(),
+                "max": df[region].max(),
+                "mean": df[region].mean(),
+                "median": df[region].median(),
+            }
+            for region in df
+        ],
+        index=[region for region in df]
+    )
+    ind = np.arange(len(processed_df.index))
+
+    # add bars to plot
+    for i, col in enumerate(processed_df):
+        extra_width = w * i
+        plt.bar(ind + extra_width, processed_df[col], w, label=col)
+
+    # format plot: center the tick labels across the four bars
+    plt.xticks(ind + ((3/2) * w), [row for row in processed_df.index], fontsize=8)
+    plt.legend(loc="best")
+    plt.title("Region {} Latency Statistics".format(plot_type))
+    plt.xlabel("Regions")
+    plt.ylabel("Latency (ms)")
+    plt.figure()
+    return
+
+################################
+# Main
+################################
+
+# todo(evan):
+# - combine data from nodes in the same regions
+
+
+def main():
+    # open the files
+    samples, da_proofs = open_files()
+    samples, da_proofs = scale_df_to_ms(samples), scale_df_to_ms(da_proofs)
+
+    # plot the latency distribution in ~200 ms windows
+    plot_latency_hist(samples.copy(), "Sampling", 200)
+
+    # plot the comparisons between each region's sampling latency
+    plot_region_comparisons(samples.copy(), "Sampling")
+
+    # plot the global average for DA proofs
+    global_mean, global_std = plot_average_da_proof(da_proofs)
+
+    # plot DA proof latencies for each region
+    for region in samples:
+        plot_region_da_proof(samples[region], region, global_mean[0], global_std[0], 25)
+
+    # plt.show()
+    save_figures()
+
+if __name__ == "__main__":
+    main()
diff --git a/simple_plotter.py b/simple_plotter.py
deleted file mode 100644
index de51a2f..0000000
--- a/simple_plotter.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import sys
-
-import pandas as pd
-from matplotlib.pylab import show
-
-latencies_df = pd.read_json(sys.argv[1])
-latencies_df.plot.bar()
-show()

From f317368e91d96e99ab6930258d5cd2396db3fbfa Mon Sep 17 00:00:00 2001
From: evan-forbes
Date: Tue, 29 Dec 2020 13:45:57 -0600
Subject: [PATCH 2/3] review feedback: remove unused variables

---
 generate_report.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/generate_report.py b/generate_report.py
index 7cc8219..5bec657 100644
--- a/generate_report.py
+++ b/generate_report.py
@@ -11,10 +11,10 @@
 regions = ["LON1", "AMS3", "FRA1", "NYC3", "BLR1", "SFO2", "NYC3-2", "SFO2-2", "SGP1", "TOR1", "AMS3-2", "FRA1", "LON1", "NYC3", "SFO2", "SGP1", "TOR1", "AMS3", "FRA1", "LON1", "NYC3", "SFO2", "SGP1", "TOR1"]
 file_pattern = "{}/dag-experiments-node-{}/{}_latencies.json"
 
+# use the first provided arg as the path
 path = sys.argv[1]
-# group_num = int(sys.argv[2])
+
 # assume that each node has its own directory
-num_entries = 375
 num_nodes = 0
 for _, dirs, _ in os.walk(path):
     num_nodes = len(dirs)
@@ -140,10 +140,6 @@ def plot_region_da_proof(df, region, global_mean, global_std, num_proofs):
     plt.figure()
 
 def plot_latency_hist(df, plot_type, bin_size):
-    # combine all latency data
-    # combined = pd.DataFrame()
-    # for col in df:
-    #     combined = pd.concat([combined, df[col]], ignore_index=True)
     combined = flatten_df(df)
 
     # calculate the number of bins needed

From 3ca8d9a887eacfc285354afe377dea838fcaab1a Mon Sep 17 00:00:00 2001
From: evan-forbes
Date: Tue, 29 Dec 2020 14:15:33 -0600
Subject: [PATCH 3/3] review feedback: clarify default regions and remove
 unused imports

---
 generate_report.py | 28 ++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/generate_report.py b/generate_report.py
index 5bec657..825b095 100644
--- a/generate_report.py
+++ b/generate_report.py
@@ -1,14 +1,34 @@
 import sys
 import os
-from collections import Counter
+
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
-import matplotlib.ticker as tick
-import matplotlib.mlab as mlab
 import matplotlib.backends.backend_pdf
 
-regions = ["LON1", "AMS3", "FRA1", "NYC3", "BLR1", "SFO2", "NYC3-2", "SFO2-2", "SGP1", "TOR1", "AMS3-2", "FRA1", "LON1", "NYC3", "SFO2", "SGP1", "TOR1", "AMS3", "FRA1", "LON1", "NYC3", "SFO2", "SGP1", "TOR1"]
+# preserve the order of the list, but make each member unique by adding an index
+def parse_default_regions(regions):
+    out = []
+    counts = {}
+    for region in regions:
+        if region in counts:
+            counts[region] += 1
+        else:
+            counts[region] = 1
+        out.append("{}-{}".format(region, counts[region]))
+    return out
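+
+# e.g. parse_default_regions(["AMS3", "FRA1", "AMS3"])
+# yields ["AMS3-1", "FRA1-1", "AMS3-2"]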
+
+# todo(evan): don't hard-code regions
+# use the default regions found in ./terraform/cluster/variables.tf
+default_regions = ["LON1", "AMS3", "FRA1", "NYC3", "BLR1", "SFO2", "NYC3", "SFO2", "SGP1", "TOR1", "AMS3", "FRA1", "LON1", "NYC3", "SFO2", "SGP1", "TOR1", "AMS3", "FRA1", "LON1", "NYC3", "SFO2", "SGP1", "TOR1"]
+
+# make each instance of a region unique by adding an index
+regions = parse_default_regions(default_regions)
+
+
 file_pattern = "{}/dag-experiments-node-{}/{}_latencies.json"
 
 # use the first provided arg as the path