Merge pull request #4 from drudilorenzo/feature-json-logging

Feat: json logging
drudilorenzo · Apr 14, 2024 · 9563b67 · 9563b67
2 parents 8111a05 + 21eb4b5
commit 9563b67
Show file tree

Hide file tree

Showing 7 changed files with 156 additions and 63 deletions.
diff --git a/.gitignore b/.gitignore
@@ -19,4 +19,7 @@ cost-logs/
 *.egg-info/ 
 
 build/
-dist/
+dist/
+
+# demo internal
+demo_internal.ipynb
diff --git a/README.rst b/README.rst
@@ -26,10 +26,26 @@ How to install:
 
 Key Features:
 -------------
-* Track the cost of every request you make to OpenAI and save them in a csv file.
-* Visualize the cost of all the requests you have made.
+* Track the cost of every request you make and save it in a JSON file.
+* Choose the feature you want to track (prompt_tokens, completion_tokens, completion, prompt, etc.).
+* Check the cost of your requests, filtered by model or aggregated by a strftime format (see the docs).
 
 Endpoint supported:
 -------------------
 * Chat completion.
-* Every endpoint which response contains the field "*usage.prompt_tokens*" and "*usage.completion_tokens*".
+* Every response passed to *OpenAICostLogger* must contain the fields "*usage.prompt_tokens*" and "*usage.completion_tokens*".
+  This is the only strict requirement of the library; the way you call the OpenAI API is entirely up to you. If needed, you can
+  find a simple example in the demo file.
+
+Viz examples:
+-------------
+.. image:: images/viz_prints.png
+   :alt: Viz prints examples.
+   :align: center
+   :width: 500px
+
+.. image:: images/strftime_agg.png
+   :alt: Strftime aggregation example.
+   :align: center
+   :width: 500px   
+
diff --git a/changes_proposal.md b/changes_proposal.md
@@ -1,4 +1,12 @@
-1. ✅ cost tracker handles completion creation - PR ready
+1. ⌛ model has to be provided in the form of an enum - important, hard to juggle with all the 0xxx versions
+
+Change:
+    - we can just infer it from `response.model`
+    - removes possible problems with choosing the right enum or forgetting to change it while changing the model for experiment
+
+2. ⌛ allow for experiment/subexperiment stats
+
+3. ✅ cost tracker handles completion creation - Merged
 
 Change: separating completion and cost tracking, by changing the main functionality from `chat_completion` to `update_cost`
 
@@ -7,16 +15,7 @@ Motivation:
  - allows easier integration, user only has to initialize tracker object and call `update_cost(response)`, 
    otherwise each chat completion call would have to be rewritten 
 
-2. ⌛ costs are calculated across all log files 
-
-Change: 
- - static `total_cost` that will calculate total spending from logs
- - static `experiment_cost(experiment_name=self.experiment_name)` gets you total cost of specific experiment
-  - defaulting to current experiment_name in tracker object
-  - if object not initialized, experiment_name has to be provided
- - `cost` that gets you costs for current run of this tracker object
-
-3. ⌛ log file just acumulates total cost
+4. ✅ log file just accumulates total cost
 
 Change: 
  - add breakdown of responses/input token per response/output token per response/cost per response
@@ -40,13 +39,7 @@ Change:
     }
     ```
 
-4. ⌛ model has to be provided in form of enum
-
-Change:
-    - we can just infer it from `response.model`
-    - removes possible problems with choosing the right enum or forgetting to change it while changing the model for experiment
-
-5. ✅ datetime strftime format - PR ready
+5. ✅ datetime strftime format - Merged
 
 Change: 
  - change strftime format to `strftime("%Y-%m-%d_%H:%M:%S")`, makes it more readable

diff --git a/images/strftime_agg.png b/images/strftime_agg.png
diff --git a/images/viz_prints.png b/images/viz_prints.png
diff --git a/openai_cost_logger/openai_cost_logger.py b/openai_cost_logger/openai_cost_logger.py
@@ -1,4 +1,5 @@
 import csv
+import json
 from typing import Dict
 from pathlib import Path
 from time import strftime
@@ -16,7 +17,7 @@
     "cost"
 ]
 
-"""OpenAI cost logger"""
+"""OpenAI cost logger."""
 class OpenAICostLogger:
     def __init__(
         self,
@@ -26,6 +27,7 @@ def __init__(
         experiment_name: str,
         cost_upperbound: float = float('inf'),
         log_folder: str = DEFAULT_LOG_PATH,
+        log_level: str = "detail"
     ):
         """Initialize the cost logger.
 
@@ -40,13 +42,21 @@ def __init__(
             client_args (Dict, optional): The parameters to pass to the client. Defaults to {}.
         """
         self.cost = 0
+        self.n_responses = 0
         self.model = model
         self.input_cost = input_cost
         self.log_folder = log_folder
         self.output_cost = output_cost
         self.experiment_name = experiment_name
         self.cost_upperbound = cost_upperbound
-        self.filename = f"{experiment_name}_cost_" + strftime("%Y-%m-%d_%H:%M:%S") + ".csv"
+        self.log_level = log_level
+        self.creation_datetime = strftime("%Y-%m-%d_%H:%M:%S")
+        self.filename = f"{experiment_name}_{self.creation_datetime}.json"
+        self.filepath = Path(self.log_folder, self.filename)
+
+        self.__check_existance_log_folder()
+        self.__build_log_file()
+
 
     def update_cost(self, response: ChatCompletion) -> None:
         """Extract the number of input and output tokens from a chat completion response
@@ -56,15 +66,10 @@ def update_cost(self, response: ChatCompletion) -> None:
             response: ChatCompletion object from the model.
         """
         self.cost += self.__get_answer_cost(response)
+        self.n_responses += 1
+        self.__write_cost_to_json(response)
         self.__validate_cost()
-        path = Path(self.log_folder, self.filename)
-        path.parent.mkdir(parents=True, exist_ok=True)
-
-        # Be careful, it overwrites the file if it already exists
-        with open(path, mode='w') as file:
-            csvwriter = csv.writer(file)
-            csvwriter.writerow(FILE_HEADER)
-            csvwriter.writerow([self.experiment_name, self.model, self.cost])
+
 
     def get_current_cost(self) -> float:
         """Get the current cost of the cost tracker.
@@ -74,6 +79,7 @@ def get_current_cost(self) -> float:
         """
         return self.cost
 
+
     def __get_answer_cost(self, answer: Dict) -> float:
         """Calculate the cost of the answer based on the input and output tokens.
 
@@ -85,11 +91,65 @@ def __get_answer_cost(self, answer: Dict) -> float:
         return (self.input_cost * answer.usage.prompt_tokens) / COST_UNIT + \
                     (self.output_cost * answer.usage.completion_tokens) / COST_UNIT
 
+
     def __validate_cost(self):
         """Check if the cost exceeds the upperbound and raise an exception if it does.
 
         Raises:
             Exception: If the cost exceeds the upperbound.
         """
         if self.cost > self.cost_upperbound:
-            raise Exception(f"Cost exceeded upperbound: {self.cost} > {self.cost_upperbound}")
+            raise Exception(f"Cost exceeded upperbound: {self.cost} > {self.cost_upperbound}")
+
+
+    def __write_cost_to_json(self, response: ChatCompletion) -> None:
+        """Write the cost to a json file. 
+
+        Args:
+            response (ChatCompletion): The response from the model.
+        """
+        with open(self.filepath, 'r') as file:
+            data = json.load(file)
+            data["total_cost"] = self.cost
+            data["total_responses"] = self.n_responses
+            data["breakdown"].append(self.__build_log_breadown_entry(response))
+        with open(self.filepath, 'w') as file:
+            json.dump(data, file, indent=4)
+
+
+    def __check_existance_log_folder(self) -> None:
+        """Check if the log folder exists and create it if it does not."""
+        self.filepath.parent.mkdir(parents=True, exist_ok=True)
+
+
+    def __build_log_file(self) -> None:
+        """Create the log file with the header."""
+        log_file_template = {
+            "experiment_name": self.experiment_name,
+            "creation_datetime": strftime("%Y-%m-%d %H:%M:%S"),
+            "model": self.model,
+            "total_cost": self.cost,
+            "total_responses": 0,
+            "breakdown": []
+        }
+        with open(self.filepath, 'w') as file:
+            json.dump(log_file_template, file, indent=4)
+
+
+    def __build_log_breadown_entry(self, response: ChatCompletion) -> Dict:
+        """Build a json log entry for the breakdown of the cost.
+
+        Args:
+            response (ChatCompletion): The response from the model.
+
+        Returns:
+            Dict: The json log entry.
+        """
+        return {
+            "cost": self.__get_answer_cost(response),
+            "input_tokens": response.usage.prompt_tokens,
+            "output_tokens": response.usage.completion_tokens,
+            "content": response.choices[0].message.content,
+            "inferred_model": response.model,
+            "datetime": strftime("%Y-%m-%d %H:%M:%S"),
+        }
diff --git a/openai_cost_logger/openai_cost_logger_viz.py b/openai_cost_logger/openai_cost_logger_viz.py
@@ -1,12 +1,14 @@
 import os
-import csv
+import json
+from datetime import datetime
 from typing import Dict
 from pathlib import Path
 import matplotlib.pyplot as plt
 from collections import defaultdict
 
 from openai_cost_logger.constants import DEFAULT_LOG_PATH
 
+"""Cost logger visualizer."""
 class OpenAICostLoggerViz:
 
     @staticmethod
@@ -21,13 +23,13 @@ def get_total_cost(path: str = DEFAULT_LOG_PATH) -> float:
         """
         cost = 0
         for filename in os.listdir(path):
-            with open(Path(path, filename), mode='r') as file:
-                csvreader = csv.reader(file)
-                next(csvreader)
-                for row in csvreader:
-                    cost += float(row[2])
+            if filename.endswith(".json"):
+                with open(Path(path, filename), mode='r') as file:
+                    data = json.load(file)
+                    cost += data["total_cost"]
         return cost
 
+
     @staticmethod
     def print_total_cost(path: str = DEFAULT_LOG_PATH) -> None:
         """Print the total cost of all the logs in the directory.
@@ -36,9 +38,9 @@ def print_total_cost(path: str = DEFAULT_LOG_PATH) -> None:
             log_folder (str, optional): Cost logs directory. Defaults to DEFAULT_LOG_PATH.
                                         This method reads all the files in the specified directory.
         """
-
         print(f"Total cost: {round(OpenAICostLoggerViz.get_total_cost(path), 6)} (USD)")
-
+
+
     @staticmethod
     def get_total_cost_by_model(path: str = DEFAULT_LOG_PATH) -> Dict[str, float]:
         """Return the total cost by model of all the logs in the directory.
@@ -52,15 +54,15 @@ def get_total_cost_by_model(path: str = DEFAULT_LOG_PATH) -> Dict[str, float]:
         """
         cost_by_model = defaultdict(float)
         for filename in os.listdir(path):
-            with open(Path(path, filename), mode='r') as file:
-                csvreader = csv.reader(file)
-                next(csvreader)
-                for row in csvreader:
-                    if row[1] not in cost_by_model:
-                        cost_by_model[row[1]] = 0
-                    cost_by_model[row[1]] += float(row[2])
+            if filename.endswith(".json"):
+                with open(Path(path, filename), mode='r') as file:
+                    data = json.load(file)
+                    if data["model"] not in cost_by_model:
+                        cost_by_model[data["model"]] = 0
+                    cost_by_model[data["model"]] += data["total_cost"]
         return cost_by_model
-
+
+
     def print_total_cost_by_model(path: str = DEFAULT_LOG_PATH) -> None:
         """Print the total cost by model of all the logs in the directory.
 
@@ -71,31 +73,50 @@ def print_total_cost_by_model(path: str = DEFAULT_LOG_PATH) -> None:
         cost_by_model = OpenAICostLoggerViz.get_total_cost_by_model(path)
         for model, cost in cost_by_model.items():
             print(f"{model}: {round(cost, 6)} (USD)")
-
+
+
     @staticmethod
-    def plot_cost_by_day(path: str = DEFAULT_LOG_PATH, last_n_days: int = None) -> None:
-        """Plot the cost by day of all the logs in the directory.
+    def plot_cost_by_strftime(path: str = DEFAULT_LOG_PATH, strftime_aggregator: str = "%Y-%m-%d", last_n_days: int = None) -> None:
+        """Plot the cost by day of all the logs in the directory aggregated using strftime_aggregator.
 
         Args:
             path (str, optional): Cost logs directory. Defaults to DEFAULT_LOG_PATH.
                                   This method reads all the files in the specified directory.
             last_n_days (int, optional): The number of last days to plot. Defaults to None.
         """
-        cost_by_day = defaultdict(float)
+        cost_by_aggregation_key = defaultdict(float)
         for filename in os.listdir(path):
-            with open(Path(path, filename), mode='r') as file:
-                csvreader = csv.reader(file)
-                next(csvreader)
-                for row in csvreader:
-                    day = filename.split("_")[2]
-                    cost_by_day[day] += float(row[2])
+            if filename.endswith(".json"):
+                with open(Path(path, filename), mode='r') as file:
+                    data = json.load(file)
+                    creation_datetime = datetime.strptime(data["creation_datetime"], "%Y-%m-%d %H:%M:%S")
+                    aggregation_key = creation_datetime.strftime(strftime_aggregator)
+                    cost_by_aggregation_key[aggregation_key] += data["total_cost"]
 
-        cost_by_day = dict(sorted(cost_by_day.items(), key=lambda x: x[0]))
+        cost_by_aggregation_key = dict(sorted(cost_by_aggregation_key.items(), key=lambda x: x[0]))
         if last_n_days:
-            cost_by_day = dict(list(cost_by_day.items())[-last_n_days:])
+            cost_by_aggregation_key = dict(list(cost_by_aggregation_key.items())[-last_n_days:])
 
-        plt.bar(cost_by_day.keys(), cost_by_day.values(), width=0.5)
+        plt.bar(cost_by_aggregation_key.keys(), cost_by_aggregation_key.values(), width=0.5)
+        plt.xticks(rotation=30, fontsize=8)
         plt.xlabel('Day')
         plt.ylabel('Cost [$]')
         plt.title('Cost by day')
-        plt.show()
+        plt.tight_layout()
+        plt.show()
+
+
+    @staticmethod
+    def plot_cost_by_day(path: str = DEFAULT_LOG_PATH, last_n_days: int = None) -> None:
+        """Plot the cost by day of all the logs in the directory.
+
+        Args:
+            path (str, optional): Cost logs directory. Defaults to DEFAULT_LOG_PATH.
+                                  This method reads all the files in the specified directory.
+            last_n_days (int, optional): The number of last days to plot. Defaults to None.
+        """
+        OpenAICostLoggerViz.plot_cost_by_strftime(
+            path=path,
+            strftime_aggregator="%Y-%m-%d", 
+            last_n_days=last_n_days
+        )