From a09100956633c8a94bd5bed4fe68ab15427b66b0 Mon Sep 17 00:00:00 2001
From: evan-forbes
Date: Wed, 23 Dec 2020 15:00:09 -0600
Subject: [PATCH 1/3] plotting: added a script to generate a report for the results

---
 README.md          |  22 +++++
 generate_report.py | 221 ++++++++++++++++++++++++++++++++++++++++++++++
 simple_plotter.py  |   8 --
 3 files changed, 243 insertions(+), 8 deletions(-)
 create mode 100644 generate_report.py
 delete mode 100644 simple_plotter.py

diff --git a/README.md b/README.md
index bd001e0..b32613c 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,28 @@ To update the go-ipfs, run:
 > make go.mod IPFS_VERSION=version
 ```
 
+## Generating a Report
+Make sure you have matplotlib and pandas installed:
+```bash
+> pip3 install matplotlib pandas
+```
+Then use the Python script to create a PDF of the results:
+```bash
+> python3 generate_report.py /path/to/results/measurements
+```
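+
+The script expects each node's measurements to live in its own
+`dag-experiments-node-<i>` directory. A sketch of the assumed layout,
+derived from `file_pattern` in `generate_report.py`:
+```
+measurements/
+├── dag-experiments-node-0/
+│   ├── sample_latencies.json
+│   └── da_proof_latencies.json
+├── dag-experiments-node-1/
+│   └── ...
+└── ...
+```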
 ## License
 
 MIT
diff --git a/generate_report.py b/generate_report.py
new file mode 100644
index 0000000..7cc8219
--- /dev/null
+++ b/generate_report.py
@@ -0,0 +1,221 @@
+import sys
+import os
+from collections import Counter
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.ticker as tick
+import matplotlib.mlab as mlab
+import matplotlib.backends.backend_pdf
+
+regions = ["LON1", "AMS3", "FRA1", "NYC3", "BLR1", "SFO2", "NYC3-2", "SFO2-2", "SGP1", "TOR1", "AMS3-2", "FRA1", "LON1", "NYC3", "SFO2", "SGP1", "TOR1", "AMS3", "FRA1", "LON1", "NYC3", "SFO2", "SGP1", "TOR1"]
+file_pattern = "{}/dag-experiments-node-{}/{}_latencies.json"
+
+path = sys.argv[1]
+# group_num = int(sys.argv[2])
+# assume that each node has its own directory
+num_entries = 375
+num_nodes = 0
+for _, dirs, _ in os.walk(path):
+    num_nodes = len(dirs)
+    break  # only count the top-level directories, one per node
+
+################################
+# Utilities
+################################
+
+# divide each latency by one million to convert nanoseconds to milliseconds
+def scale_to_ms(df, col):
+    df[col] = df[col] / 1000000
+    return df
+
+def scale_df_to_ms(df):
+    for col in df:
+        df = scale_to_ms(df, col)
+    return df
+
+# combine each column of a df into a single column
+# (pass in df.copy() to avoid mutating the original)
+def flatten_df(df):
+    return df.stack().reset_index()
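+
+# e.g. flatten_df(pd.DataFrame({"LON1": [1.0, 2.0], "AMS3": [3.0, 4.0]}))
+# yields a 4-row frame whose column 0 holds all four latency values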
+
+# opens the files into the base data processing dataframes
+def open_files():
+    samples = pd.DataFrame()
+    da_proofs = pd.DataFrame()
+    for i in range(num_nodes):
+        # add each region's sample data
+        samples[regions[i]] = pd.read_json(file_pattern.format(path, i, "sample"))[0]
+        # add each region's proof data
+        da_proofs[regions[i]] = pd.read_json(file_pattern.format(path, i, "da_proof"))[0]
+
+    return samples, da_proofs
+
+def save_figures():
+    pdf = matplotlib.backends.backend_pdf.PdfPages(os.path.join(path, "plots.pdf"))
+    for fig in range(1, plt.gcf().number + 1):  # the final plt.figure() call leaves an empty extra figure
+        pdf.savefig(fig)
+    pdf.close()
+
+################################
+# Plotting
+################################
+
+def plot_average_da_proof(df):
+    # consolidate data into a single series
+    consolidated = flatten_df(df.copy())
+    global_mean, global_std = consolidated.mean(), consolidated.std()
+
+    # process data
+    processed_df = pd.DataFrame(
+        [
+            {
+                "mean": row.mean(),
+                "std": row.std(),
+            }
+            for _, row in df.iterrows()
+        ]
+    )
+
+    # plot errorbars
+    plt.errorbar(
+        x=[x for x in processed_df.index],
+        y=processed_df["mean"],
+        yerr=processed_df["std"],
+        fmt="o",
+    )
+
+    # plot horizontal global lines
+    plt.axhline(y=global_mean[0], linestyle=(0, (5, 1)), color="green", label="Global Average")
+    plt.axhline(y=global_mean[0]-global_std[0], linestyle=(0, (5, 1)), color="orange", label="Global StdDev")
+    plt.axhline(y=global_mean[0]+global_std[0], linestyle=(0, (5, 1)), color="orange")
+
+    # format plot
+    plt.legend(loc="best")
+    plt.title("Average Latency per DA Proof Across All Nodes")
+    plt.xlabel("DA Proof")
+    plt.ylabel("Latency (ms)")
+    plt.figure()
+    return global_mean, global_std
+
+# group the sampling df into batches of 15 samples, one batch per proof
+# pass in a column of the samples_df provided by open_files
+def plot_region_da_proof(df, region, global_mean, global_std, num_proofs):
+    # get the average for the entire region
+    region_mean = df.mean()
+
+    # manually process instead of using groupby
+    processed_df = pd.DataFrame(
+        [
+            {
+                "mean": batch.mean(),
+                "spread": batch.max() - batch.min(),  # min-max range shown as the error bar
+            }
+            for batch in np.split(df, num_proofs)
+        ],
+    )
+
+    # plot horizontal global lines
+    plt.axhline(y=region_mean, linestyle="-", color="fuchsia", label="Region Average")
+    plt.axhline(y=global_mean, linestyle=(0, (5, 1)), color="green", label="Global Average")
+    plt.axhline(y=global_mean-global_std, linestyle=(0, (5, 1)), color="orange", label="Global StdDev")
+    plt.axhline(y=global_mean+global_std, linestyle=(0, (5, 1)), color="orange")
+
+    # plot errorbars
+    plt.errorbar(
+        x=[x for x in processed_df.index],
+        y=processed_df["mean"],
+        yerr=processed_df["spread"],
+        fmt="o",
+    )
+
+    # format plot
+    plt.legend(loc="best")
+    plt.title("Average Latency per DA Proof for {}".format(region))
+    plt.xlabel("DA Proof")
+    plt.ylabel("Latency (ms)")
+    plt.figure()
+
+def plot_latency_hist(df, plot_type, bin_size):
+    # combine all latency data
+    # combined = pd.DataFrame()
+    # for col in df:
+    #     combined = pd.concat([combined, df[col]], ignore_index=True)
+    combined = flatten_df(df)
+
+    # calculate the number of bins needed
+    lo, hi = combined[0].min(), combined[0].max()
+    num_bins = (hi - lo) / bin_size
+
+    # formatting
+    plt.title("{} Latency Distribution for All Nodes".format(plot_type))
+    plt.ylabel("Number of Requests")
+    plt.xlabel("Latency (ms ({}/bar))".format(bin_size))
+    plt.hist(combined[0], bins=int(num_bins))
+    plt.figure()
+
+
+def plot_region_comparisons(df, plot_type, w=0.15):
+    # process the input data
+    processed_df = pd.DataFrame(
+        [
+            {
+                "min": df[region].min(),
+                "max": df[region].max(),
+                "mean": df[region].mean(),
+                "median": df[region].median(),
+            }
+            for region in df
+        ],
+        index=[region for region in df]
+    )
+    ind = np.arange(len(processed_df.index))
+
+    # add bars to plot
+    for i, col in enumerate(processed_df):
+        extra_width = w * i
+        plt.bar(ind + extra_width, processed_df[col], w, label=col)
+
+    # format plot: center the tick labels across the four bars
+    plt.xticks(ind + ((3/2) * w), [row for row in processed_df.index], fontsize=8)
+    plt.legend(loc="best")
+    plt.title("Region {} Latency Statistics".format(plot_type))
+    plt.xlabel("Regions")
+    plt.ylabel("Latency (ms)")
+    plt.figure()
+    return
+
+################################
+# Main
+################################
+
+# todo(evan):
+# - combine data from nodes in the same regions
+
+
+def main():
+    # open the files
+    samples, da_proofs = open_files()
+    samples, da_proofs = scale_df_to_ms(samples), scale_df_to_ms(da_proofs)
+
+    # plot the latency distribution in ~200 ms windows
+    plot_latency_hist(samples.copy(), "Sampling", 200)
+
+    # plot the comparisons between each region's sampling latency
+    plot_region_comparisons(samples.copy(), "Sampling")
+
+    # plot the global average for DA proofs
+    global_mean, global_std = plot_average_da_proof(da_proofs)
+
+    # plot DA proof latencies for each region
+    for region in samples:
+        plot_region_da_proof(samples[region], region, global_mean[0], global_std[0], 25)
+
+    # plt.show()
+    save_figures()
+
+if __name__ == "__main__":
+    main()
diff --git a/simple_plotter.py b/simple_plotter.py
deleted file mode 100644
index de51a2f..0000000
--- a/simple_plotter.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import sys
-
-import pandas as pd
-from matplotlib.pylab import show
-
-latencies_df = pd.read_json(sys.argv[1])
-latencies_df.plot.bar()
-show()

From f317368e91d96e99ab6930258d5cd2396db3fbfa Mon Sep 17 00:00:00 2001
From: evan-forbes
Date: Tue, 29 Dec 2020 13:45:57 -0600
Subject: [PATCH 2/3] review feedback: remove unused variables

---
 generate_report.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/generate_report.py b/generate_report.py
index 7cc8219..5bec657 100644
--- a/generate_report.py
+++ b/generate_report.py
@@ -11,10 +11,10 @@
 regions = ["LON1", "AMS3", "FRA1", "NYC3", "BLR1", "SFO2", "NYC3-2", "SFO2-2", "SGP1", "TOR1", "AMS3-2", "FRA1", "LON1", "NYC3", "SFO2", "SGP1", "TOR1", "AMS3", "FRA1", "LON1", "NYC3", "SFO2", "SGP1", "TOR1"]
 file_pattern = "{}/dag-experiments-node-{}/{}_latencies.json"
 
+# use the first provided arg as the path
 path = sys.argv[1]
-# group_num = int(sys.argv[2])
+
 # assume that each node has its own directory
-num_entries = 375
 num_nodes = 0
 for _, dirs, _ in os.walk(path):
     num_nodes = len(dirs)
@@ -140,10 +140,6 @@ def plot_region_da_proof(df, region, global_mean, global_std, num_proofs):
     plt.figure()
 
 def plot_latency_hist(df, plot_type, bin_size):
-    # combine all latency data
-    # combined = pd.DataFrame()
-    # for col in df:
-    #     combined = pd.concat([combined, df[col]], ignore_index=True)
     combined = flatten_df(df)
 
     # calculate the number of bins needed

From 3ca8d9a887eacfc285354afe377dea838fcaab1a Mon Sep 17 00:00:00 2001
From: evan-forbes
Date: Tue, 29 Dec 2020 14:15:33 -0600
Subject: [PATCH 3/3] review feedback: clarify default regions and remove
 unused imports

---
 generate_report.py | 28 ++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/generate_report.py b/generate_report.py
index 5bec657..825b095 100644
--- a/generate_report.py
+++ b/generate_report.py
@@ -1,14 +1,34 @@
 import sys
 import os
-from collections import Counter
+
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
-import matplotlib.ticker as tick
-import matplotlib.mlab as mlab
 import matplotlib.backends.backend_pdf
 
-regions = ["LON1", "AMS3", "FRA1", "NYC3", "BLR1", "SFO2", "NYC3-2", "SFO2-2", "SGP1", "TOR1", "AMS3-2", "FRA1", "LON1", "NYC3", "SFO2", "SGP1", "TOR1", "AMS3", "FRA1", "LON1", "NYC3", "SFO2", "SGP1", "TOR1"]
+# preserve the order of the list, but make each member unique by adding an index
+def parse_default_regions(regions):
+    out = []
+    counts = {}
+    for region in regions:
+        if region in counts:
+            counts[region] += 1
+        else:
+            counts[region] = 1
+        out.append("{}-{}".format(region, counts[region]))
+    return out
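+
+# e.g. parse_default_regions(["AMS3", "FRA1", "AMS3"])
+# yields ["AMS3-1", "FRA1-1", "AMS3-2"]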
+
+# todo(evan): don't hard-code regions
+# use the default regions found in ./terraform/cluster/variables.tf
+default_regions = ["LON1", "AMS3", "FRA1", "NYC3", "BLR1", "SFO2", "NYC3", "SFO2", "SGP1", "TOR1", "AMS3", "FRA1", "LON1", "NYC3", "SFO2", "SGP1", "TOR1", "AMS3", "FRA1", "LON1", "NYC3", "SFO2", "SGP1", "TOR1"]
+
+# make each instance of a region unique by adding an index
+regions = parse_default_regions(default_regions)
+
+
 file_pattern = "{}/dag-experiments-node-{}/{}_latencies.json"
 
 # use the first provided arg as the path