feat: adding support for langsmith #1062

Open · wants to merge 4 commits into base: main
11 changes: 0 additions & 11 deletions docs/howtos/integrations/llamaindex.ipynb
@@ -1,16 +1,5 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "fd8d6ad7",
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "markdown",
"id": "d2451aff",
2 changes: 1 addition & 1 deletion src/ragas/evaluation.py
@@ -154,7 +154,7 @@ def evaluate(
# validation
dataset = handle_deprecated_ground_truths(dataset)
validate_evaluation_modes(dataset, metrics)
validate_column_dtypes(dataset)
validate_column_dtypes(dataset, column_map)

# set the llm and embeddings
if isinstance(llm, LangchainLLM):
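For context, a hedged sketch (not part of this PR; the dataset contents and the mapping direction are assumptions inferred from the diff) of the call this one-line change supports: `evaluate()` forwards its `column_map` so that dtype validation can find remapped columns.

```python
# Sketch only: assumes ragas' evaluate() accepts a column_map keyword and that
# the mapping direction is ragas-column-name -> your-dataset-column-name.
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness

ds = Dataset.from_dict(
    {
        "query": ["What is ragas?"],  # "query" stands in for the required "question"
        "answer": ["An evaluation framework for RAG pipelines."],
        "contexts": [["ragas is a framework for evaluating RAG pipelines."]],
    }
)

# Before this change, validate_column_dtypes looked for the literal "question"
# column and could not see the remapped "query" column.
result = evaluate(ds, metrics=[faithfulness], column_map={"question": "query"})
```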
101 changes: 76 additions & 25 deletions src/ragas/integrations/langchain.py
@@ -34,14 +34,26 @@ class EvaluatorChain(Chain, RunEvaluator):
"""

metric: Metric
column_map: dict[str, str]

def __init__(self, metric: Metric, **kwargs: t.Any):
kwargs["metric"] = metric

# check if column_map is provided
if "column_map" in kwargs:
_column_map = kwargs["column_map"]
else:
_column_map = {}
kwargs["column_map"] = _column_map

super().__init__(**kwargs)

# set up the run config
if "run_config" in kwargs:
run_config = kwargs["run_config"]
else:
run_config = RunConfig()

if isinstance(self.metric, MetricWithLLM):
llm = kwargs.get("llm", ChatOpenAI())
t.cast(MetricWithLLM, self.metric).llm = LangchainLLMWrapper(llm)
@@ -54,7 +66,10 @@ def __init__(self, metric: Metric, **kwargs: t.Any):

@property
def input_keys(self) -> list[str]:
return get_required_columns(self.metric.evaluation_mode)
return [
self.column_map.get(column_name, column_name)
for column_name in get_required_columns(self.metric.evaluation_mode)
]

@property
def output_keys(self) -> list[str]:
@@ -68,14 +83,10 @@ def _call(
"""
Call the evaluation chain.
"""
self._validate(inputs)
q, c, a, g = self._validate(inputs)
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
callbacks = _run_manager.get_child()

c = inputs.get("contexts", [""])
g = inputs.get("ground_truth", "")
q = inputs.get("question", "")
a = inputs.get("answer", "")
score = self.metric.score(
{
"question": q,
@@ -95,15 +106,11 @@ async def _acall(
"""
Call the evaluation chain.
"""
self._validate(inputs)
q, c, a, g = self._validate(inputs)
_run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
# TODO: currently AsyncCallbacks are not supported in ragas
_run_manager.get_child()

c = inputs.get("contexts", [""])
g = inputs.get("ground_truth", "")
q = inputs.get("question", "")
a = inputs.get("answer", "")
score = await self.metric.ascore(
{
"question": q,
@@ -118,11 +125,14 @@ def _validate(
def _validate(
self,
input: dict[str, t.Any],
question_key: str = "question",
prediction_key: str = "answer",
context_key: str = "contexts",
) -> None:
# validate each example
) -> tuple[str, list[str], str, str]:
# remap the keys
question_key = self.column_map.get("question", "question")
prediction_key = self.column_map.get("answer", "answer")
context_key = self.column_map.get("contexts", "contexts")
ground_truth_key = self.column_map.get("ground_truth", "ground_truth")

# validate if the required columns are present
required_columns = EVALMODE_TO_COLUMNS[self.metric.evaluation_mode]
if "question" in required_columns and question_key not in input:
raise ValueError(
@@ -139,17 +149,45 @@ def _validate(
f'"{context_key}" is required in each prediction for the '
f"metric[{self.metric.name}] you have chosen."
)
if "ground_truth" in required_columns and "ground_truth" not in input:
if "ground_truth" in required_columns and ground_truth_key not in input:
raise ValueError(
f'"ground_truth" is required in each prediction for the '
f"metric[{self.metric.name}] you have chosen."
)

# validate if the columns are of the correct datatype
for column_name in [question_key, prediction_key, ground_truth_key]:
if column_name in input:
if not isinstance(input[column_name], str):
raise ValueError(
f'Input feature "{column_name}" should be of type string'
)

for column_name in [context_key]:
if column_name in input:
if not isinstance(input[column_name], list):
raise ValueError(
f'Input feature "{column_name}" should be of type'
f" list[str], got {type(input[column_name])}"
)

q = input.get(question_key, "")
c = input.get(context_key, [""])
a = input.get(prediction_key, "")
g = input.get(ground_truth_key, "")
return q, c, a, g

@staticmethod
def _keys_are_present(keys_to_check: list, dict_to_check: dict) -> list[str]:
return [k for k in keys_to_check if k not in dict_to_check]

def _validate_langsmith_eval(self, run: Run, example: t.Optional[Example]) -> None:
# remap column names
question_key = self.column_map.get("question", "question")
ground_truth_key = self.column_map.get("ground_truth", "ground_truth")
answer_key = self.column_map.get("answer", "answer")
context_key = self.column_map.get("contexts", "contexts")

if example is None:
raise ValueError(
"expected example to be provided. Please check langsmith dataset and ensure valid dataset is uploaded."
@@ -162,21 +200,30 @@ def _validate_langsmith_eval(self, run: Run, example: t.Optional[Example]) -> None:
raise ValueError(
"expected example.inputs to be provided. Please check langsmith dataset and ensure valid dataset is uploaded."
)
if "question" not in example.inputs or "ground_truth" not in example.outputs:
if (
question_key not in example.inputs
or ground_truth_key not in example.outputs
):
raise ValueError(
"Expected 'question' and 'ground_truth' in example."
f"Expected '{question_key}' and {ground_truth_key} in example."
f"Got: {[k for k in example.inputs.keys()]}"
)
assert (
run.outputs is not None
), "the current run has no outputs. The chain should output 'answer' and 'contexts' keys."
output_keys = get_required_columns(
self.metric.evaluation_mode, ["question", "ground_truth"]
eval_mod=self.metric.evaluation_mode,
ignore_columns=["question", "ground_truth"],
)
# remap output keys with column_map
output_keys = [
self.column_map.get(column_name, column_name) for column_name in output_keys
]
# check for missing keys
missing_keys = self._keys_are_present(output_keys, run.outputs)
if missing_keys:
raise ValueError(
"Expected 'answer' and 'contexts' in run.outputs."
f"Expected {answer_key} and {context_key} in run.outputs."
f"Got: {[k for k in run.outputs.keys()]}"
)

@@ -195,17 +242,21 @@ def evaluate_run(
assert example.inputs is not None
assert example.outputs is not None

# remap column key
ground_truth_key = self.column_map.get("ground_truth", "ground_truth")
question_key = self.column_map.get("question", "question")

chain_eval = run.outputs
chain_eval["question"] = example.inputs["question"]
chain_eval[question_key] = example.inputs[question_key]
if self.metric.evaluation_mode in [
EvaluationMode.gc,
EvaluationMode.ga,
EvaluationMode.qcg,
EvaluationMode.qga,
]:
if example.outputs is None or "ground_truth" not in example.outputs:
raise ValueError("expected `ground_truth` in example outputs.")
chain_eval["ground_truth"] = example.outputs["ground_truth"]
if example.outputs is None or ground_truth_key not in example.outputs:
raise ValueError(f"expected `{ground_truth_key}` in example outputs.")
chain_eval[ground_truth_key] = example.outputs[ground_truth_key]
eval_output = self.invoke(chain_eval, include_run_info=True)

evaluation_result = EvaluationResult(
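A hedged usage sketch of the remapping added above (the `query` key and the score key are assumptions; the default `ChatOpenAI` judge needs an `OPENAI_API_KEY`):

```python
# Sketch: an EvaluatorChain whose column_map renames "question" to "query".
# input_keys then becomes ["query", "answer", "contexts"] for a q-a-c metric.
from ragas.integrations.langchain import EvaluatorChain
from ragas.metrics import faithfulness

faithfulness_chain = EvaluatorChain(
    metric=faithfulness,
    column_map={"question": "query"},
)

result = faithfulness_chain.invoke(
    {
        "query": "What is the capital of France?",
        "answer": "Paris is the capital of France.",
        "contexts": ["Paris is the capital and largest city of France."],
    }
)
print(result["faithfulness"])  # score key assumed to be the metric name
```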
6 changes: 5 additions & 1 deletion src/ragas/integrations/langsmith.py
@@ -85,6 +85,7 @@ def evaluate(
experiment_name: t.Optional[str] = None,
metrics: t.Optional[list] = None,
verbose: bool = False,
column_map: t.Optional[dict[str, str]] = None,
) -> t.Dict[str, t.Any]:
"""
Evaluates a language model or a chain factory on a specified dataset using
@@ -144,6 +145,9 @@ def evaluate(
returning the results. Custom evaluation metrics can be specified, or a default set
will be used if none are provided.
"""
# setup column_map
column_map = column_map or {}

# init client and validate dataset
client = Client()
try:
@@ -164,7 +168,7 @@

metrics = [answer_relevancy, context_precision, faithfulness, context_recall]

metrics = [EvaluatorChain(m) for m in metrics]
metrics = [EvaluatorChain(m, column_map=column_map) for m in metrics]
eval_config = RunEvalConfig(
custom_evaluators=metrics,
)
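A hedged sketch of calling the LangSmith integration with the new parameter. The dataset name, experiment name, and chain factory below are placeholders; parameter names not visible in the diff (e.g. `dataset_name`, `llm_or_chain_factory`) are assumptions, and LangSmith plus OpenAI credentials are assumed to be configured in the environment.

```python
# Sketch: column_map is passed through to every EvaluatorChain so a LangSmith
# dataset keyed by "query"/"reference" can still be scored with ragas metrics.
from langchain_core.runnables import RunnableLambda

from ragas.integrations.langsmith import evaluate


def my_rag_chain_factory():
    # Hypothetical stand-in for a real RAG chain; it must return the keys the
    # evaluators expect after remapping ("answer" and "contexts" here).
    return RunnableLambda(
        lambda inputs: {
            "answer": "Paris is the capital of France.",
            "contexts": ["Paris is the capital and largest city of France."],
        }
    )


results = evaluate(
    dataset_name="my-langsmith-dataset",        # placeholder dataset name
    llm_or_chain_factory=my_rag_chain_factory,  # placeholder chain factory
    experiment_name="ragas-with-column-map",
    column_map={"question": "query", "ground_truth": "reference"},
)
```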
4 changes: 3 additions & 1 deletion src/ragas/metrics/base.py
@@ -38,6 +38,9 @@
def get_required_columns(
eval_mod: EvaluationMode, ignore_columns: t.Optional[t.List[str]] = None
) -> t.List[str]:
keys = []
ignore_columns = ignore_columns or []

if eval_mod == EvaluationMode.qac:
keys = ["question", "answer", "contexts"]
elif eval_mod == EvaluationMode.qa:
@@ -52,7 +55,6 @@ def get_required_columns(
keys = ["question", "contexts", "answer", "ground_truth"]
elif eval_mod == EvaluationMode.qcg:
keys = ["question", "contexts", "ground_truth"]
ignore_columns = ignore_columns or []

return [k for k in keys if k not in ignore_columns]

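For illustration (not part of the diff): hoisting the `keys = []` and `ignore_columns` defaults to the top means an evaluation mode with no matching branch now returns an empty list instead of leaving `keys` unbound, and `ignore_columns` is honoured for every mode. A small sketch, assuming both names are importable from `ragas.metrics.base`:

```python
from ragas.metrics.base import EvaluationMode, get_required_columns

print(get_required_columns(EvaluationMode.qcg))
# expected: ['question', 'contexts', 'ground_truth']

print(get_required_columns(EvaluationMode.qcg, ignore_columns=["question"]))
# expected: ['contexts', 'ground_truth']
```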