检索增强生成(RAG)评测体系
原创
灵阙教研团队
推荐 · 进阶
约 9 分钟阅读
更新于 2026-02-28 AI 导读
检索增强生成(RAG)评测体系 RAGAS指标体系、评测流水线设计与合成测试数据生成实战 引言 RAG系统的质量评估是一个系统工程问题。仅评估最终答案是不够的——检索质量、上下文相关性、答案忠实度和响应速度都需要独立度量。RAGAS(Retrieval Augmented Generation Assessment)框架为这一问题提供了结构化的解决方案。
检索增强生成(RAG)评测体系
RAGAS指标体系、评测流水线设计与合成测试数据生成实战
引言
RAG系统的质量评估是一个系统工程问题。仅评估最终答案是不够的——检索质量、上下文相关性、答案忠实度和响应速度都需要独立度量。RAGAS(Retrieval Augmented Generation Assessment)框架为这一问题提供了结构化的解决方案。本文将从评测指标体系、评测流水线设计、合成测试数据生成和A/B测试四个维度展开。
RAGAS核心指标
四维评测框架
RAGAS评测维度
检索质量 生成质量
┌─────────┐ ┌─────────┐
│Context │ │Faithful-│
│Precision│ │ness │
│ │ │ │
│检索的内容│ │答案是否 │
│有多相关?│ │忠于检索?│
└────┬────┘ └────┬────┘
│ │
Query ─────────────┼──────────────────────┼──── Answer
│ │
┌────┴────┐ ┌────┴────┐
│Context │ │Answer │
│Recall │ │Relevancy│
│ │ │ │
│是否检索 │ │答案是否 │
│到足够信息│ │回答了问题│
└─────────┘ └─────────┘
指标详解与计算
import numpy as np
from dataclasses import dataclass
@dataclass
class RAGEvalSample:
    """One evaluation record: a query plus the RAG system's retrieval and generation output.

    Attributes:
        query: The user question fed to the RAG pipeline.
        contexts: Retrieved context passages, in ranking order.
        answer: The answer generated from the contexts.
        ground_truth: Optional reference answer; only context_recall requires it.
    """
    query: str
    contexts: list[str]
    answer: str
    # Fix: original annotation was `str` with a None default, which is a
    # type contradiction; the field is genuinely optional.
    ground_truth: "str | None" = None
class RAGASMetrics:
    """Implementation of core RAGAS metrics.

    Dependencies are injected:
        llm_judge: object exposing ``generate(prompt: str) -> str``.
        embed_fn: callable mapping ``list[str]`` to a list of embedding vectors.

    All metric methods take a sample object with ``query``, ``contexts``,
    ``answer`` and (optionally) ``ground_truth`` attributes.
    """

    def __init__(self, llm_judge, embed_fn):
        self.llm = llm_judge
        self.embed = embed_fn

    def faithfulness(self, sample: "RAGEvalSample") -> float:
        """
        Measures if the answer is grounded in retrieved contexts.
        Score: 0 (hallucinated) to 1 (fully faithful)
        Method:
        1. Extract claims from the answer
        2. Check each claim against contexts
        3. Score = supported_claims / total_claims
        """
        claims = self._extract_claims(sample.answer)
        if not claims:
            # An answer with no extractable factual claims cannot hallucinate.
            return 1.0
        context_str = "\n\n".join(sample.contexts)
        supported = sum(
            1 for claim in claims if self._verify_claim(claim, context_str)
        )
        return supported / len(claims)

    def answer_relevancy(self, sample: "RAGEvalSample") -> float:
        """
        Measures if the answer addresses the question.
        Score: 0 (irrelevant) to 1 (perfectly relevant)
        Method:
        1. Generate N questions from the answer
        2. Compute similarity between generated Qs and original Q
        3. Score = average similarity
        """
        generated_questions = self._generate_questions(sample.answer, n=3)
        if not generated_questions:
            # Fix: np.mean([]) would return nan; treat "no questions
            # generated" as zero relevancy instead.
            return 0.0
        q_emb = self.embed([sample.query])[0]
        gen_embs = self.embed(generated_questions)
        similarities = [self._cosine_sim(q_emb, ge) for ge in gen_embs]
        return float(np.mean(similarities))

    def context_precision(self, sample: "RAGEvalSample") -> float:
        """
        Measures if relevant contexts are ranked higher.
        Score: 0 (relevant contexts ranked low) to 1 (ranked high)
        Method: Average Precision of relevant contexts in ranking
        """
        relevant_mask = [
            self._judge_relevance(sample.query, ctx) for ctx in sample.contexts
        ]
        if not any(relevant_mask):
            return 0.0
        precision_sum = 0.0
        relevant_count = 0
        for i, is_rel in enumerate(relevant_mask):
            if is_rel:
                relevant_count += 1
                # Precision@k at each position that holds a relevant context.
                precision_sum += relevant_count / (i + 1)
        return precision_sum / sum(relevant_mask)

    def context_recall(self, sample: "RAGEvalSample") -> "float | None":
        """
        Measures if all necessary information was retrieved.
        Requires ground_truth reference answer.
        Score: 0 (critical info missing) to 1 (all info present)
        Returns None when the sample carries no ground_truth.
        Method:
        1. Extract claims from ground_truth
        2. Check if each claim can be found in contexts
        3. Score = found_claims / total_claims
        """
        if not sample.ground_truth:
            # Fix: annotated as float | None — callers must handle None.
            return None
        gt_claims = self._extract_claims(sample.ground_truth)
        if not gt_claims:
            return 1.0
        context_str = "\n\n".join(sample.contexts)
        found = sum(1 for c in gt_claims if self._verify_claim(c, context_str))
        return found / len(gt_claims)

    # --- Helper methods ---

    def _extract_claims(self, text: str) -> list[str]:
        # One claim per non-empty line of the judge's response.
        prompt = f"Extract all atomic factual claims from this text. Return one claim per line.\n\nText: {text}"
        response = self.llm.generate(prompt)
        return [c.strip() for c in response.strip().split("\n") if c.strip()]

    def _verify_claim(self, claim: str, context: str) -> bool:
        prompt = f"Can this claim be supported by the context?\nClaim: {claim}\nContext: {context}\nAnswer: yes or no"
        # Fix: substring matching ("yes" in ...) misfires when the judge
        # echoes the prompt ("yes or no") or answers verbosely; require the
        # reply to start with "yes".
        return self.llm.generate(prompt).strip().lower().startswith("yes")

    def _generate_questions(self, answer: str, n: int = 3) -> list[str]:
        prompt = f"Generate {n} questions that this text could be answering:\n{answer}"
        response = self.llm.generate(prompt)
        return [q.strip() for q in response.strip().split("\n") if q.strip()][:n]

    def _judge_relevance(self, query: str, context: str) -> bool:
        prompt = f"Is this context relevant to the query?\nQuery: {query}\nContext: {context}\nAnswer: yes or no"
        # Same first-token fix as _verify_claim.
        return self.llm.generate(prompt).strip().lower().startswith("yes")

    def _cosine_sim(self, a, b) -> float:
        a, b = np.array(a), np.array(b)
        # Epsilon guards against division by zero for zero-norm vectors.
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))
指标基线与目标
| 指标 | 差 | 一般 | 好 | 优秀 | 目标 |
|---|---|---|---|---|---|
| Faithfulness | <0.5 | 0.5-0.7 | 0.7-0.85 | >0.85 | >0.85 |
| Answer Relevancy | <0.5 | 0.5-0.7 | 0.7-0.85 | >0.85 | >0.80 |
| Context Precision | <0.3 | 0.3-0.6 | 0.6-0.8 | >0.8 | >0.75 |
| Context Recall | <0.4 | 0.4-0.65 | 0.65-0.85 | >0.85 | >0.80 |
评测流水线设计
自动化评测架构
评测流水线
┌──────────────┐
│ 测试数据集 │ 合成 + 人工标注 + 生产采样
│ (Q, A_ref, │
│ contexts) │
└──────┬───────┘
│
▼
┌──────────────┐
│ RAG Pipeline │ 被评测的系统
│ (被测对象) │
└──────┬───────┘
│ 输出: (contexts_retrieved, answer_generated)
▼
┌──────────────┐
│ 评测引擎 │
│ ├── RAGAS │ 四维指标
│ ├── 延迟 │ TTFT, 总延迟
│ ├── 成本 │ Token消耗
│ └── 自定义 │ 业务特定指标
└──────┬───────┘
│
▼
┌──────────────┐
│ 报告 & CI │ 仪表盘 + 回归检测 + 告警
└──────────────┘
CI/CD集成
import json
from pathlib import Path
class RAGEvalPipeline:
    """Automated RAG evaluation pipeline for CI/CD.

    Runs a test set through the RAG system under test, scores each sample
    with RAGAS metrics, aggregates the scores, and checks them against
    quality-gate thresholds.
    """

    def __init__(self, rag_system, metrics: "RAGASMetrics",
                 test_data_path: str):
        # rag_system must expose query(q) -> {"contexts": [...], "answer": "..."}
        # (this is how run_evaluation consumes it below).
        self.rag = rag_system
        self.metrics = metrics
        self.test_data = self._load_test_data(test_data_path)

    def run_evaluation(self) -> dict:
        """Run the full evaluation suite.

        Returns:
            {"samples": per-sample scores, "aggregate": mean per metric}.
        """
        results = []
        for sample in self.test_data:
            # Run the system under test.
            rag_output = self.rag.query(sample["query"])
            eval_sample = RAGEvalSample(
                query=sample["query"],
                contexts=rag_output["contexts"],
                answer=rag_output["answer"],
                ground_truth=sample.get("ground_truth"),
            )
            scores = {
                "faithfulness": self.metrics.faithfulness(eval_sample),
                "answer_relevancy": self.metrics.answer_relevancy(eval_sample),
                "context_precision": self.metrics.context_precision(eval_sample),
            }
            # context_recall needs a reference answer.
            if eval_sample.ground_truth:
                scores["context_recall"] = self.metrics.context_recall(eval_sample)
            results.append({
                "query": sample["query"],
                "scores": scores,
                # Truncate for report readability.
                "answer": rag_output["answer"][:200],
            })
        aggregate = self._aggregate(results)
        return {"samples": results, "aggregate": aggregate}

    def check_thresholds(self, results: dict,
                         thresholds: dict = None) -> bool:
        """Check if evaluation meets quality thresholds.

        Args:
            results: output of run_evaluation (only "aggregate" is read).
            thresholds: metric -> minimum mean score; defaults below.
        Returns:
            True when every computed metric meets its threshold.
        """
        defaults = {
            "faithfulness": 0.85,
            "answer_relevancy": 0.80,
            "context_precision": 0.75,
            "context_recall": 0.80,
        }
        thresholds = thresholds or defaults
        agg = results["aggregate"]
        passed = True
        for metric, threshold in thresholds.items():
            if metric not in agg:
                # Metric was never computed (e.g. no ground truth) — skip.
                continue
            if agg[metric] < threshold:
                print(f"FAIL: {metric} = {agg[metric]:.3f} < {threshold}")
                passed = False
            else:
                print(f"PASS: {metric} = {agg[metric]:.3f} >= {threshold}")
        return passed

    def _aggregate(self, results: list) -> dict:
        """Mean per metric over all samples, ignoring missing/None scores."""
        metrics = {}
        for key in ["faithfulness", "answer_relevancy",
                    "context_precision", "context_recall"]:
            values = [r["scores"][key] for r in results
                      if key in r["scores"] and r["scores"][key] is not None]
            if values:
                metrics[key] = float(np.mean(values))
        return metrics

    def _load_test_data(self, path: str) -> list:
        # Fix: explicit encoding — JSON test sets are UTF-8 regardless of
        # the platform's default locale encoding.
        with open(path, encoding="utf-8") as f:
            return json.load(f)
合成测试数据生成
自动化测试集构建
class SyntheticTestGenerator:
    """Generate synthetic QA pairs for RAG evaluation.

    Every generator method returns either a normalized sample dict with
    "query", "ground_truth" and "difficulty" keys, or None when the LLM
    output could not be parsed.
    """

    def __init__(self, llm, documents: list[str]):
        self.llm = llm
        self.documents = documents

    def generate_test_set(self, n_samples: int = 100,
                          difficulty_mix: dict = None) -> list[dict]:
        """Generate diverse test samples across difficulty levels.

        Note: the final count can be below n_samples, both from per-level
        rounding (int truncation) and from samples dropped on parse failure.
        """
        if difficulty_mix is None:
            difficulty_mix = {
                "simple": 0.3,     # Single-document, factoid
                "reasoning": 0.3,  # Requires inference
                "multi_hop": 0.2,  # Needs multiple documents
                "negative": 0.2,   # No answer in corpus
            }
        samples = []
        for difficulty, ratio in difficulty_mix.items():
            count = int(n_samples * ratio)
            for _ in range(count):
                sample = self._generate_sample(difficulty)
                if sample:
                    samples.append(sample)
        return samples

    def _generate_sample(self, difficulty: str) -> "dict | None":
        """Dispatch to the generator for the given difficulty level."""
        generators = {
            "simple": self._gen_simple,
            "reasoning": self._gen_reasoning,
            "multi_hop": self._gen_multi_hop,
            "negative": self._gen_negative,
        }
        gen = generators.get(difficulty)
        # Fix: unknown difficulty now falls through explicitly, not silently.
        return gen() if gen else None

    def _gen_simple(self) -> "dict | None":
        """Generate a simple factoid question from a single document."""
        import random
        doc = random.choice(self.documents)
        prompt = (
            f"Based on this document, generate a factoid question "
            f"and its answer.\n\nDocument: {doc[:2000]}\n\n"
            f"Return JSON: {{\"question\": \"...\", \"answer\": \"...\"}}"
        )
        result = self.llm.generate(prompt)
        try:
            parsed = json.loads(result)
            return {
                "query": parsed["question"],
                "ground_truth": parsed["answer"],
                "difficulty": "simple",
                # Keep a snippet of the source for traceability.
                "source_doc": doc[:500],
            }
        except (json.JSONDecodeError, KeyError):
            return None

    def _gen_reasoning(self) -> "dict | None":
        """Generate a question requiring inference/reasoning."""
        import random
        doc = random.choice(self.documents)
        prompt = (
            f"Based on this document, generate a question that requires "
            f"reasoning or inference (not just fact lookup).\n\n"
            f"Document: {doc[:2000]}\n\n"
            f"Return JSON: {{\"question\": \"...\", \"answer\": \"...\"}}"
        )
        result = self.llm.generate(prompt)
        try:
            parsed = json.loads(result)
            # Fix: build a normalized dict instead of mutating and returning
            # the raw parse — the original leaked any extra keys the LLM
            # emitted, making its shape inconsistent with the other levels.
            return {
                "query": parsed["question"],
                "ground_truth": parsed["answer"],
                "difficulty": "reasoning",
            }
        except (json.JSONDecodeError, KeyError):
            return None

    def _gen_multi_hop(self) -> "dict | None":
        """Generate a question needing info from multiple documents."""
        import random
        # Degrades gracefully to a single document when the corpus has one.
        docs = random.sample(self.documents, min(2, len(self.documents)))
        prompt = (
            f"Generate a question that can only be answered by combining "
            f"information from BOTH documents.\n\n"
            f"Document 1: {docs[0][:1000]}\n\n"
            f"Document 2: {docs[1][:1000] if len(docs) > 1 else docs[0][:1000]}\n\n"
            f"Return JSON: {{\"question\": \"...\", \"answer\": \"...\"}}"
        )
        result = self.llm.generate(prompt)
        try:
            parsed = json.loads(result)
            return {
                "query": parsed["question"],
                "ground_truth": parsed["answer"],
                "difficulty": "multi_hop",
            }
        except (json.JSONDecodeError, KeyError):
            return None

    def _gen_negative(self) -> "dict | None":
        """Generate a question that cannot be answered from the corpus."""
        prompt = (
            "Generate a realistic but specific question about a topic "
            "that would NOT be answerable from a typical knowledge base. "
            "Return JSON: {\"question\": \"...\"}"
        )
        result = self.llm.generate(prompt)
        try:
            parsed = json.loads(result)
            return {
                "query": parsed["question"],
                "ground_truth": "This question cannot be answered from the available documents.",
                "difficulty": "negative",
            }
        except (json.JSONDecodeError, KeyError):
            return None
A/B测试RAG
实验设计
| 变量 | A组(基线) | B组(实验) | 度量 |
|---|---|---|---|
| 分块大小 | 512 tokens | 256 tokens | Precision/Recall |
| 检索数量 | Top-5 | Top-10 | Faithfulness |
| 重排序 | 无 | BGE-reranker | Relevancy |
| 模型 | GPT-4o-mini | GPT-4o | 质量+成本 |
class RAGABTest:
    """A/B testing framework for RAG configurations.

    Evaluates two pipeline configurations on the same test set and
    compares their mean RAGAS scores per metric.
    """

    def __init__(self, config_a: dict, config_b: dict,
                 test_data: list, metrics: "RAGASMetrics"):
        self.config_a = config_a
        self.config_b = config_b
        self.test_data = test_data
        self.metrics = metrics

    def run_experiment(self) -> dict:
        """Evaluate both configurations and return a per-metric comparison."""
        results_a = self._evaluate_config(self.config_a, "A")
        results_b = self._evaluate_config(self.config_b, "B")
        return self._compare(results_a, results_b)

    def _compare(self, results_a: list, results_b: list) -> dict:
        """Compare per-sample score dicts from two arms.

        Uses a fixed ±0.02 practical-significance margin on the delta of
        means. NOTE(review): this is a point-estimate comparison, not a
        statistical significance test — consider a bootstrap or t-test for
        noisy metrics.
        """
        comparison = {}
        for metric in ["faithfulness", "answer_relevancy",
                       "context_precision"]:
            a_vals = [r[metric] for r in results_a if metric in r]
            b_vals = [r[metric] for r in results_b if metric in r]
            if not a_vals or not b_vals:
                # Fix: np.mean([]) yields nan (with a warning); skip metrics
                # that were never computed on one of the arms.
                continue
            a_mean = float(np.mean(a_vals))
            b_mean = float(np.mean(b_vals))
            delta = b_mean - a_mean
            # Epsilon avoids division by zero when the baseline mean is 0.
            relative = delta / (a_mean + 1e-8) * 100
            comparison[metric] = {
                "A": round(a_mean, 3),
                "B": round(b_mean, 3),
                "delta": round(delta, 3),
                "relative_pct": round(relative, 1),
                "winner": "B" if delta > 0.02 else ("A" if delta < -0.02 else "tie"),
            }
        return comparison

    def _evaluate_config(self, config: dict, label: str) -> list:
        """Score every test sample under one configuration.

        The label parameter identifies the arm ("A"/"B") for callers/logs.
        """
        # NOTE(review): build_rag_pipeline is not defined in this file —
        # presumably supplied by the surrounding project; confirm the import.
        rag = build_rag_pipeline(config)
        results = []
        for sample in self.test_data:
            output = rag.query(sample["query"])
            eval_sample = RAGEvalSample(
                query=sample["query"],
                contexts=output["contexts"],
                answer=output["answer"],
                ground_truth=sample.get("ground_truth"),
            )
            scores = {
                "faithfulness": self.metrics.faithfulness(eval_sample),
                "answer_relevancy": self.metrics.answer_relevancy(eval_sample),
                "context_precision": self.metrics.context_precision(eval_sample),
            }
            results.append(scores)
        return results
结论
RAG评测体系的建设是RAG系统从"实验室"走向"生产环境"的关键一步。RAGAS提供了从检索质量到生成质量的四维评测框架,合成测试数据生成解决了评测数据集的冷启动问题,A/B测试为配置优化提供了数据驱动的决策依据。建议将RAG评测纳入CI/CD流水线,设置质量门禁(Faithfulness > 0.85, Relevancy > 0.80),并通过持续的A/B测试推动系统逐步优化。
Maurice | maurice_wen@proton.me