feat: adding support for langsmith #1062

Open · wants to merge 4 commits into base: main
11 changes: 0 additions & 11 deletions docs/howtos/integrations/llamaindex.ipynb
@@ -1,16 +1,5 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "fd8d6ad7",
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "markdown",
"id": "d2451aff",
2 changes: 1 addition & 1 deletion src/ragas/evaluation.py
@@ -154,7 +154,7 @@ def evaluate(
# validation
dataset = handle_deprecated_ground_truths(dataset)
validate_evaluation_modes(dataset, metrics)
validate_column_dtypes(dataset)
validate_column_dtypes(dataset, column_map)

# set the llm and embeddings
if isinstance(llm, LangchainLLM):
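For context, a hedged sketch (not part of this PR; the dataset contents and the mapping direction are assumptions inferred from the diff) of the call this one-line change supports: `evaluate()` forwards its `column_map` so that dtype validation can find remapped columns.

```python
# Sketch only: assumes ragas' evaluate() accepts a column_map keyword and that
# the mapping direction is ragas-column-name -> your-dataset-column-name.
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness

ds = Dataset.from_dict(
    {
        "query": ["What is ragas?"],  # "query" stands in for the required "question"
        "answer": ["An evaluation framework for RAG pipelines."],
        "contexts": [["ragas is a framework for evaluating RAG pipelines."]],
    }
)

# Before this change, validate_column_dtypes looked for the literal "question"
# column and could not see the remapped "query" column.
result = evaluate(ds, metrics=[faithfulness], column_map={"question": "query"})
```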
101 changes: 76 additions & 25 deletions src/ragas/integrations/langchain.py
@@ -34,14 +34,26 @@ class EvaluatorChain(Chain, RunEvaluator):
"""

metric: Metric
column_map: dict[str, str]

def __init__(self, metric: Metric, **kwargs: t.Any):
kwargs["metric"] = metric

# check if column_map is provided
if "column_map" in kwargs:
_column_map = kwargs["column_map"]
else:
_column_map = {}
kwargs["column_map"] = _column_map

super().__init__(**kwargs)

# set up the run config
if "run_config" in kwargs:
run_config = kwargs["run_config"]
else:
run_config = RunConfig()

if isinstance(self.metric, MetricWithLLM):
llm = kwargs.get("llm", ChatOpenAI())
t.cast(MetricWithLLM, self.metric).llm = LangchainLLMWrapper(llm)
@@ -54,7 +66,10 @@ def __init__(self, metric: Metric, **kwargs: t.Any):

@property
def input_keys(self) -> list[str]:
return get_required_columns(self.metric.evaluation_mode)
return [
self.column_map.get(column_name, column_name)
for column_name in get_required_columns(self.metric.evaluation_mode)
]

@property
def output_keys(self) -> list[str]:
@@ -68,14 +83,10 @@ def _call(
"""
Call the evaluation chain.
"""
self._validate(inputs)
q, c, a, g = self._validate(inputs)
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
callbacks = _run_manager.get_child()

c = inputs.get("contexts", [""])
g = inputs.get("ground_truth", "")
q = inputs.get("question", "")
a = inputs.get("answer", "")
score = self.metric.score(
{
"question": q,
@@ -95,15 +106,11 @@ async def _acall(
"""
Call the evaluation chain.
"""
self._validate(inputs)
q, c, a, g = self._validate(inputs)
_run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
# TODO: currently AsyncCallbacks are not supported in ragas
_run_manager.get_child()

c = inputs.get("contexts", [""])
g = inputs.get("ground_truth", "")
q = inputs.get("question", "")
a = inputs.get("answer", "")
score = await self.metric.ascore(
{
"question": q,
@@ -118,11 +125,14 @@ def _validate(
def _validate(
self,
input: dict[str, t.Any],
question_key: str = "question",
prediction_key: str = "answer",
context_key: str = "contexts",
) -> None:
# validate each example
) -> tuple[str, list[str], str, str]:
# remap the keys
question_key = self.column_map.get("question", "question")
prediction_key = self.column_map.get("answer", "answer")
context_key = self.column_map.get("contexts", "contexts")
ground_truth_key = self.column_map.get("ground_truth", "ground_truth")

# validate if the required columns are present
required_columns = EVALMODE_TO_COLUMNS[self.metric.evaluation_mode]
if "question" in required_columns and question_key not in input:
raise ValueError(
@@ -139,17 +149,45 @@ def _validate(
f'"{context_key}" is required in each prediction for the '
f"metric[{self.metric.name}] you have chosen."
)
if "ground_truth" in required_columns and "ground_truth" not in input:
if "ground_truth" in required_columns and ground_truth_key not in input:
raise ValueError(
f'"ground_truth" is required in each prediction for the '
f"metric[{self.metric.name}] you have chosen."
)

# validate if the columns are of the correct datatype
for column_name in [question_key, prediction_key, ground_truth_key]:
if column_name in input:
if not isinstance(input[column_name], str):
raise ValueError(
f'Input feature "{column_name}" should be of type string'
)

for column_name in [context_key]:
if column_name in input:
if not isinstance(input[column_name], list):
raise ValueError(
f'Input feature "{column_name}" should be of type'
f" list[str], got {type(input[column_name])}"
)

q = input.get(question_key, "")
c = input.get(context_key, [""])
a = input.get(prediction_key, "")
g = input.get(ground_truth_key, "")
return q, c, a, g

@staticmethod
def _keys_are_present(keys_to_check: list, dict_to_check: dict) -> list[str]:
return [k for k in keys_to_check if k not in dict_to_check]

def _validate_langsmith_eval(self, run: Run, example: t.Optional[Example]) -> None:
# remap column names
question_key = self.column_map.get("question", "question")
ground_truth_key = self.column_map.get("ground_truth", "ground_truth")
answer_key = self.column_map.get("answer", "answer")
context_key = self.column_map.get("contexts", "contexts")

if example is None:
raise ValueError(
"expected example to be provided. Please check langsmith dataset and ensure valid dataset is uploaded."
@@ -162,21 +200,30 @@ def _validate_langsmith_eval(self, run: Run, example: t.Optional[Example]) -> None:
raise ValueError(
"expected example.inputs to be provided. Please check langsmith dataset and ensure valid dataset is uploaded."
)
if "question" not in example.inputs or "ground_truth" not in example.outputs:
if (
question_key not in example.inputs
or ground_truth_key not in example.outputs
):
raise ValueError(
"Expected 'question' and 'ground_truth' in example."
f"Expected '{question_key}' and {ground_truth_key} in example."
f"Got: {[k for k in example.inputs.keys()]}"
)
assert (
run.outputs is not None
), "the current run has no outputs. The chain should output 'answer' and 'contexts' keys."
output_keys = get_required_columns(
self.metric.evaluation_mode, ["question", "ground_truth"]
eval_mod=self.metric.evaluation_mode,
ignore_columns=["question", "ground_truth"],
)
# remap output keys with column_map
output_keys = [
self.column_map.get(column_name, column_name) for column_name in output_keys
]
# check for missing keys
missing_keys = self._keys_are_present(output_keys, run.outputs)
if missing_keys:
raise ValueError(
"Expected 'answer' and 'contexts' in run.outputs."
f"Expected {answer_key} and {context_key} in run.outputs."
f"Got: {[k for k in run.outputs.keys()]}"
)

@@ -195,17 +242,21 @@ def evaluate_run(
assert example.inputs is not None
assert example.outputs is not None

# remap column key
ground_truth_key = self.column_map.get("ground_truth", "ground_truth")
question_key = self.column_map.get("question", "question")

chain_eval = run.outputs
chain_eval["question"] = example.inputs["question"]
chain_eval[question_key] = example.inputs[question_key]
if self.metric.evaluation_mode in [
EvaluationMode.gc,
EvaluationMode.ga,
EvaluationMode.qcg,
EvaluationMode.qga,
]:
if example.outputs is None or "ground_truth" not in example.outputs:
raise ValueError("expected `ground_truth` in example outputs.")
chain_eval["ground_truth"] = example.outputs["ground_truth"]
if example.outputs is None or ground_truth_key not in example.outputs:
raise ValueError(f"expected `{ground_truth_key}` in example outputs.")
chain_eval[ground_truth_key] = example.outputs[ground_truth_key]
eval_output = self.invoke(chain_eval, include_run_info=True)

evaluation_result = EvaluationResult(
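A hedged usage sketch of the remapping added above (the `query` key and the score key are assumptions; the default `ChatOpenAI` judge needs an `OPENAI_API_KEY`):

```python
# Sketch: an EvaluatorChain whose column_map renames "question" to "query".
# input_keys then becomes ["query", "answer", "contexts"] for a q-a-c metric.
from ragas.integrations.langchain import EvaluatorChain
from ragas.metrics import faithfulness

faithfulness_chain = EvaluatorChain(
    metric=faithfulness,
    column_map={"question": "query"},
)

result = faithfulness_chain.invoke(
    {
        "query": "What is the capital of France?",
        "answer": "Paris is the capital of France.",
        "contexts": ["Paris is the capital and largest city of France."],
    }
)
print(result["faithfulness"])  # score key assumed to be the metric name
```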
6 changes: 5 additions & 1 deletion src/ragas/integrations/langsmith.py
@@ -85,6 +85,7 @@ def evaluate(
experiment_name: t.Optional[str] = None,
metrics: t.Optional[list] = None,
verbose: bool = False,
column_map: t.Optional[dict[str, str]] = None,
) -> t.Dict[str, t.Any]:
"""
Evaluates a language model or a chain factory on a specified dataset using
@@ -144,6 +145,9 @@ def evaluate(
returning the results. Custom evaluation metrics can be specified, or a default set
will be used if none are provided.
"""
# setup column_map
column_map = column_map or {}

# init client and validate dataset
client = Client()
try:
@@ -164,7 +168,7 @@

metrics = [answer_relevancy, context_precision, faithfulness, context_recall]

metrics = [EvaluatorChain(m) for m in metrics]
metrics = [EvaluatorChain(m, column_map=column_map) for m in metrics]
eval_config = RunEvalConfig(
custom_evaluators=metrics,
)
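A hedged sketch of calling the LangSmith integration with the new parameter. The dataset name, experiment name, and chain factory below are placeholders; parameter names not visible in the diff (e.g. `dataset_name`, `llm_or_chain_factory`) are assumptions, and LangSmith plus OpenAI credentials are assumed to be configured in the environment.

```python
# Sketch: column_map is passed through to every EvaluatorChain so a LangSmith
# dataset keyed by "query"/"reference" can still be scored with ragas metrics.
from langchain_core.runnables import RunnableLambda

from ragas.integrations.langsmith import evaluate


def my_rag_chain_factory():
    # Hypothetical stand-in for a real RAG chain; it must return the keys the
    # evaluators expect after remapping ("answer" and "contexts" here).
    return RunnableLambda(
        lambda inputs: {
            "answer": "Paris is the capital of France.",
            "contexts": ["Paris is the capital and largest city of France."],
        }
    )


results = evaluate(
    dataset_name="my-langsmith-dataset",        # placeholder dataset name
    llm_or_chain_factory=my_rag_chain_factory,  # placeholder chain factory
    experiment_name="ragas-with-column-map",
    column_map={"question": "query", "ground_truth": "reference"},
)
```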
4 changes: 3 additions & 1 deletion src/ragas/metrics/base.py
@@ -38,6 +38,9 @@
def get_required_columns(
eval_mod: EvaluationMode, ignore_columns: t.Optional[t.List[str]] = None
) -> t.List[str]:
keys = []
ignore_columns = ignore_columns or []

if eval_mod == EvaluationMode.qac:
keys = ["question", "answer", "contexts"]
elif eval_mod == EvaluationMode.qa:
@@ -52,7 +55,6 @@ def get_required_columns(
keys = ["question", "contexts", "answer", "ground_truth"]
elif eval_mod == EvaluationMode.qcg:
keys = ["question", "contexts", "ground_truth"]
ignore_columns = ignore_columns or []

return [k for k in keys if k not in ignore_columns]

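For illustration (not part of the diff): hoisting the `keys = []` and `ignore_columns` defaults to the top means an evaluation mode with no matching branch now returns an empty list instead of leaving `keys` unbound, and `ignore_columns` is honoured for every mode. A small sketch, assuming both names are importable from `ragas.metrics.base`:

```python
from ragas.metrics.base import EvaluationMode, get_required_columns

print(get_required_columns(EvaluationMode.qcg))
# expected: ['question', 'contexts', 'ground_truth']

print(get_required_columns(EvaluationMode.qcg, ignore_columns=["question"]))
# expected: ['contexts', 'ground_truth']
```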