Skip to content

Commit

Permalink
Merge branch 'main' into feat/adding_noise_metric
Browse files Browse the repository at this point in the history
  • Loading branch information
sahusiddharth committed Aug 13, 2024
2 parents 99e4848 + 7946480 commit a5b2c28
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 1 deletion.
21 changes: 21 additions & 0 deletions docs/concepts/metrics/faithfulness.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,24 @@ Let's examine how faithfulness was calculated using the low faithfulness answer:
```


## Faithfulness with HHEM 2.1 Model

[Vectara's HHEM 2.1](https://vectara.com/blog/hhem-2-1-a-better-hallucination-detection-model/) is a classifier model (T5) that is trained to detect hallucinations in LLM-generated text. It can be used in the second step of calculating faithfulness, i.e. when claims are cross-checked against the given context to determine whether they can be inferred from it. The model is free, small, and open-source, making it very efficient to use in production. To use the model to calculate faithfulness, you can use the following code snippet:

```{code-block} python
from datasets import Dataset
from ragas.metrics import FaithulnesswithHHEM
from ragas import evaluate
faithfulness_with_hhem = FaithulnesswithHHEM()
data_samples = {
'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'],
['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
}
dataset = Dataset.from_dict(data_samples)
score = evaluate(dataset,metrics=[faithfulness_with_hhem])
score.to_pandas()
```
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ dynamic = ["version", "readme"]
[project.optional-dependencies]
all = [
"sentence-transformers",
"transformers",
]

[tool.setuptools]
Expand Down
1 change: 1 addition & 0 deletions requirements/dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ llama_index
notebook
sphinx-autobuild
sentence-transformers
transformers
fastembed
graphene
fuzzywuzzy
3 changes: 2 additions & 1 deletion src/ragas/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
context_utilization,
)
from ragas.metrics._context_recall import ContextRecall, context_recall
from ragas.metrics._faithfulness import Faithfulness, faithfulness
from ragas.metrics._faithfulness import Faithfulness, FaithulnesswithHHEM, faithfulness
from ragas.metrics._noise_sensitivity import noise_sensitivity
from ragas.metrics._summarization import SummarizationScore, summarization_score
from ragas.metrics.critique import AspectCritique
Expand All @@ -22,6 +22,7 @@
"answer_correctness",
"Faithfulness",
"faithfulness",
"FaithulnesswithHHEM",
"AnswerSimilarity",
"answer_similarity",
"ContextPrecision",
Expand Down
54 changes: 54 additions & 0 deletions src/ragas/metrics/_faithfulness.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,4 +312,58 @@ def save(self, cache_dir: t.Optional[str] = None) -> None:
self.statement_prompt.save(cache_dir)


@dataclass
class FaithulnesswithHHEM(Faithfulness):
    """Faithfulness metric that verifies extracted claims with Vectara's
    HHEM 2.1 hallucination-detection classifier instead of an LLM-based
    NLI call.

    Statement extraction is inherited from ``Faithfulness``; only the
    verification step differs. NOTE(review): the class name (missing an
    "f") is part of the public API (re-exported from
    ``ragas.metrics``), so it is intentionally left as-is.
    """

    name: str = "faithfulness_with_hhem"  # type: ignore

    def __post_init__(self):
        # Fail early with an actionable message when the optional
        # ``transformers`` dependency is missing.
        try:
            from transformers import AutoModelForSequenceClassification
        except ImportError:
            raise ImportError(
                "Huggingface transformers must be installed to use this feature, try `pip install transformers`"
            )
        # ``trust_remote_code=True`` is required: the model repository
        # ships the custom code that provides the ``predict`` method
        # used in ``_ascore``.
        self.nli_classifier = AutoModelForSequenceClassification.from_pretrained(
            "vectara/hallucination_evaluation_model", trust_remote_code=True
        )
        super().__post_init__()

    def _create_pairs(
        self, row: t.Dict, statements: t.List[str]
    ) -> t.List[t.Tuple[str, str]]:
        """
        Build (premise, hypothesis) pairs for the HHEM classifier: the
        joined retrieved contexts form the premise, and each statement
        is a hypothesis to verify against it.
        """
        premise = "\n".join(row["contexts"])
        pairs = [(premise, statement) for statement in statements]
        return pairs

    async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float:
        """
        Return the fraction of answer statements entailed by the contexts.

        Statements are extracted from the answer via the LLM (as in the
        parent metric), each (contexts, statement) pair is classified by
        HHEM, and the rounded entailment predictions are averaged.
        Returns ``np.nan`` when statement extraction fails or yields no
        statements.
        """
        assert self.llm is not None, "LLM is not set"

        p_value = self._create_statements_prompt(row)
        statements = await self.llm.generate(
            p_value,
            callbacks=callbacks,
        )
        statements = await _statements_output_parser.aparse(
            statements.generations[0][0].text, p_value, self.llm, self.max_retries
        )

        if statements is None:
            return np.nan

        statements = [item["simpler_statements"] for item in statements.dicts()]
        statements = [item for sublist in statements for item in sublist]

        assert isinstance(statements, t.List), "statements must be a list"

        if not statements:
            # No verifiable claims were extracted: score is undefined
            # rather than a ZeroDivisionError below.
            return np.nan

        pairs = self._create_pairs(row, statements)
        scores = self.nli_classifier.predict(pairs).detach().numpy().round()
        return scores.sum() / len(scores)


# Module-level default instance of the LLM-based faithfulness metric,
# ready to be passed to `evaluate(..., metrics=[faithfulness])`.
faithfulness = Faithfulness()

0 comments on commit a5b2c28

Please sign in to comment.