检索增强生成(RAG)评测体系

RAGAS指标体系、评测流水线设计与合成测试数据生成实战

引言

RAG系统的质量评估是一个系统工程问题。仅评估最终答案是不够的——检索质量、上下文相关性、答案忠实度和响应速度都需要独立度量。RAGAS(Retrieval Augmented Generation Assessment)框架为这一问题提供了结构化的解决方案。本文将从评测指标体系、评测流水线设计、合成测试数据生成和A/B测试四个维度展开。

RAGAS核心指标

四维评测框架

RAGAS评测维度

                    检索质量                生成质量
                  ┌─────────┐            ┌─────────┐
                  │Context  │            │Faithful-│
                  │Precision│            │ness     │
                  │         │            │         │
                  │检索的内容│            │答案是否 │
                  │有多相关?│            │忠于检索?│
                  └────┬────┘            └────┬────┘
                       │                      │
    Query ─────────────┼──────────────────────┼──── Answer
                       │                      │
                  ┌────┴────┐            ┌────┴────┐
                  │Context  │            │Answer   │
                  │Recall   │            │Relevancy│
                  │         │            │         │
                  │是否检索 │            │答案是否 │
                  │到足够信息│            │回答了问题│
                  └─────────┘            └─────────┘

指标详解与计算

import numpy as np
from dataclasses import dataclass

@dataclass
class RAGEvalSample:
    """One RAG evaluation sample: the query, what was retrieved, and the answer."""
    query: str                       # User question posed to the RAG system
    contexts: list[str]              # Retrieved context passages, in ranked order
    answer: str                      # Generated answer under evaluation
    ground_truth: str | None = None  # Optional reference answer (needed for context_recall)

class RAGASMetrics:
    """Implementation of core RAGAS metrics.

    All metrics score in [0, 1] (higher is better) unless noted otherwise.
    Claim extraction/verification and relevance judging are delegated to an
    LLM judge; answer relevancy uses embedding cosine similarity.
    """

    def __init__(self, llm_judge, embed_fn):
        # llm_judge: object exposing .generate(prompt: str) -> str
        # embed_fn: callable mapping list[str] -> list of embedding vectors
        self.llm = llm_judge
        self.embed = embed_fn

    def faithfulness(self, sample: RAGEvalSample) -> float:
        """
        Measures if the answer is grounded in retrieved contexts.
        Score: 0 (hallucinated) to 1 (fully faithful)

        Method:
        1. Extract claims from the answer
        2. Check each claim against contexts
        3. Score = supported_claims / total_claims
        """
        claims = self._extract_claims(sample.answer)
        if not claims:
            # No verifiable claims -> vacuously faithful.
            return 1.0

        context_str = "\n\n".join(sample.contexts)
        supported = sum(
            1 for claim in claims if self._verify_claim(claim, context_str)
        )
        return supported / len(claims)

    def answer_relevancy(self, sample: RAGEvalSample) -> float:
        """
        Measures if the answer addresses the question.
        Score: 0 (irrelevant) to 1 (perfectly relevant)

        Method:
        1. Generate N questions from the answer
        2. Compute similarity between generated Qs and original Q
        3. Score = average similarity
        """
        generated_questions = self._generate_questions(sample.answer, n=3)
        if not generated_questions:
            # Nothing to compare against (would otherwise yield nan from
            # np.mean([])); treat as fully irrelevant.
            return 0.0

        q_emb = self.embed([sample.query])[0]
        gen_embs = self.embed(generated_questions)

        similarities = [self._cosine_sim(q_emb, ge) for ge in gen_embs]
        return float(np.mean(similarities))

    def context_precision(self, sample: RAGEvalSample) -> float:
        """
        Measures if relevant contexts are ranked higher.
        Score: 0 (relevant contexts ranked low) to 1 (ranked high)

        Method: Average Precision of relevant contexts in ranking
        """
        # Judge each context's relevance, preserving retrieval rank order.
        relevant_mask = [
            self._judge_relevance(sample.query, ctx) for ctx in sample.contexts
        ]

        if not any(relevant_mask):
            # Covers both "no contexts" and "no relevant contexts".
            return 0.0

        # Average Precision: mean of precision@k over the relevant positions.
        precision_sum = 0.0
        relevant_count = 0
        for i, is_rel in enumerate(relevant_mask):
            if is_rel:
                relevant_count += 1
                precision_sum += relevant_count / (i + 1)

        return precision_sum / sum(relevant_mask)

    def context_recall(self, sample: RAGEvalSample) -> float | None:
        """
        Measures if all necessary information was retrieved.
        Requires ground_truth reference answer; returns None without one.
        Score: 0 (critical info missing) to 1 (all info present)

        Method:
        1. Extract claims from ground_truth
        2. Check if each claim can be found in contexts
        3. Score = found_claims / total_claims
        """
        if not sample.ground_truth:
            # Not computable without a reference answer.
            return None

        gt_claims = self._extract_claims(sample.ground_truth)
        if not gt_claims:
            return 1.0

        context_str = "\n\n".join(sample.contexts)
        found = sum(1 for c in gt_claims if self._verify_claim(c, context_str))
        return found / len(gt_claims)

    # --- Helper methods ---
    def _extract_claims(self, text: str) -> list[str]:
        """Ask the LLM to split *text* into atomic claims, one per line."""
        prompt = f"Extract all atomic factual claims from this text. Return one claim per line.\n\nText: {text}"
        response = self.llm.generate(prompt)
        return [c.strip() for c in response.strip().split("\n") if c.strip()]

    def _verify_claim(self, claim: str, context: str) -> bool:
        """LLM yes/no check: is *claim* supported by *context*?"""
        prompt = f"Can this claim be supported by the context?\nClaim: {claim}\nContext: {context}\nAnswer: yes or no"
        # Substring match on "yes" is deliberately lenient about judge phrasing.
        return "yes" in self.llm.generate(prompt).lower()

    def _generate_questions(self, answer: str, n: int = 3) -> list[str]:
        """LLM-generate up to *n* questions that *answer* could be answering."""
        prompt = f"Generate {n} questions that this text could be answering:\n{answer}"
        response = self.llm.generate(prompt)
        return [q.strip() for q in response.strip().split("\n") if q.strip()][:n]

    def _judge_relevance(self, query: str, context: str) -> bool:
        """LLM yes/no check: is *context* relevant to *query*?"""
        prompt = f"Is this context relevant to the query?\nQuery: {query}\nContext: {context}\nAnswer: yes or no"
        return "yes" in self.llm.generate(prompt).lower()

    def _cosine_sim(self, a, b) -> float:
        """Cosine similarity; the epsilon guards against zero-norm vectors."""
        a, b = np.array(a), np.array(b)
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))

指标基线与目标

指标 较差 一般 良好 优秀 目标
Faithfulness <0.5 0.5-0.7 0.7-0.85 >0.85 >0.85
Answer Relevancy <0.5 0.5-0.7 0.7-0.85 >0.85 >0.80
Context Precision <0.3 0.3-0.6 0.6-0.8 >0.8 >0.75
Context Recall <0.4 0.4-0.65 0.65-0.85 >0.85 >0.80

评测流水线设计

自动化评测架构

评测流水线

┌──────────────┐
│ 测试数据集    │  合成 + 人工标注 + 生产采样
│ (Q, A_ref,   │
│  contexts)   │
└──────┬───────┘
       │
       ▼
┌──────────────┐
│ RAG Pipeline  │  被评测的系统
│ (被测对象)    │
└──────┬───────┘
       │  输出: (contexts_retrieved, answer_generated)
       ▼
┌──────────────┐
│ 评测引擎      │
│ ├── RAGAS    │  四维指标
│ ├── 延迟     │  TTFT, 总延迟
│ ├── 成本     │  Token消耗
│ └── 自定义   │  业务特定指标
└──────┬───────┘
       │
       ▼
┌──────────────┐
│ 报告 & CI    │  仪表盘 + 回归检测 + 告警
└──────────────┘

CI/CD集成

import json
from pathlib import Path

class RAGEvalPipeline:
    """Automated RAG evaluation pipeline for CI/CD.

    Runs the system under test over a JSON test set, scores every sample
    with RAGAS metrics, and gates on aggregate thresholds.
    """

    def __init__(self, rag_system, metrics: RAGASMetrics,
                 test_data_path: str):
        # rag_system: object exposing .query(q) -> {"contexts": [...], "answer": str}
        self.rag = rag_system
        self.metrics = metrics
        self.test_data = self._load_test_data(test_data_path)

    def run_evaluation(self) -> dict:
        """Run the full suite; returns {"samples": [...], "aggregate": {...}}."""
        results = []

        for sample in self.test_data:
            # Run the RAG pipeline under test.
            rag_output = self.rag.query(sample["query"])

            eval_sample = RAGEvalSample(
                query=sample["query"],
                contexts=rag_output["contexts"],
                answer=rag_output["answer"],
                ground_truth=sample.get("ground_truth"),
            )

            scores = {
                "faithfulness": self.metrics.faithfulness(eval_sample),
                "answer_relevancy": self.metrics.answer_relevancy(eval_sample),
                "context_precision": self.metrics.context_precision(eval_sample),
            }
            if eval_sample.ground_truth:
                # Recall needs a reference answer, so it is conditional.
                scores["context_recall"] = self.metrics.context_recall(eval_sample)

            results.append({
                "query": sample["query"],
                "scores": scores,
                # Truncated answer kept only for human-readable reports.
                "answer": rag_output["answer"][:200],
            })

        return {"samples": results, "aggregate": self._aggregate(results)}

    def check_thresholds(self, results: dict,
                          thresholds: dict | None = None) -> bool:
        """Return True iff every aggregate metric meets its threshold.

        Metrics absent from the aggregate (e.g. context_recall when no
        sample had a ground truth) are skipped, not failed.
        """
        defaults = {
            "faithfulness": 0.85,
            "answer_relevancy": 0.80,
            "context_precision": 0.75,
            "context_recall": 0.80,
        }
        thresholds = thresholds or defaults
        agg = results["aggregate"]

        passed = True
        for metric, threshold in thresholds.items():
            if metric not in agg:
                continue
            if agg[metric] < threshold:
                print(f"FAIL: {metric} = {agg[metric]:.3f} < {threshold}")
                passed = False
            else:
                print(f"PASS: {metric} = {agg[metric]:.3f} >= {threshold}")

        return passed

    def _aggregate(self, results: list) -> dict:
        """Mean of each metric over all samples; None scores are excluded."""
        metrics = {}
        for key in ("faithfulness", "answer_relevancy",
                    "context_precision", "context_recall"):
            values = [r["scores"][key] for r in results
                      if r["scores"].get(key) is not None]
            if values:
                metrics[key] = float(np.mean(values))
        return metrics

    def _load_test_data(self, path: str) -> list:
        """Load the JSON test set (a list of sample dicts) from *path*."""
        # Explicit UTF-8 avoids platform-default-encoding surprises in CI.
        return json.loads(Path(path).read_text(encoding="utf-8"))

合成测试数据生成

自动化测试集构建

class SyntheticTestGenerator:
    """Generate synthetic QA pairs for RAG evaluation.

    Produces samples at four difficulty levels; generators return None when
    the corpus is empty or the LLM output cannot be parsed as JSON.
    """

    def __init__(self, llm, documents: list[str]):
        # llm: object exposing .generate(prompt: str) -> str (expected to emit JSON)
        self.llm = llm
        self.documents = documents

    def generate_test_set(self, n_samples: int = 100,
                           difficulty_mix: dict | None = None) -> list[dict]:
        """Generate diverse test samples across difficulty levels.

        Note: per-level counts are truncated via int(), and failed
        generations are dropped, so fewer than n_samples may be returned.
        """
        if difficulty_mix is None:
            difficulty_mix = {
                "simple": 0.3,    # Single-document, factoid
                "reasoning": 0.3, # Requires inference
                "multi_hop": 0.2, # Needs multiple documents
                "negative": 0.2,  # No answer in corpus
            }

        samples = []
        for difficulty, ratio in difficulty_mix.items():
            count = int(n_samples * ratio)
            for _ in range(count):
                sample = self._generate_sample(difficulty)
                if sample:
                    samples.append(sample)

        return samples

    def _generate_sample(self, difficulty: str) -> dict | None:
        """Dispatch to the generator for *difficulty*; None if unknown/failed."""
        generators = {
            "simple": self._gen_simple,
            "reasoning": self._gen_reasoning,
            "multi_hop": self._gen_multi_hop,
            "negative": self._gen_negative,
        }
        gen = generators.get(difficulty)
        return gen() if gen else None

    def _gen_simple(self) -> dict | None:
        """Generate simple factoid question from a single document."""
        import random
        if not self.documents:
            # random.choice would raise on an empty corpus.
            return None
        doc = random.choice(self.documents)
        prompt = (
            f"Based on this document, generate a factoid question "
            f"and its answer.\n\nDocument: {doc[:2000]}\n\n"
            f"Return JSON: {{\"question\": \"...\", \"answer\": \"...\"}}"
        )
        result = self.llm.generate(prompt)
        try:
            parsed = json.loads(result)
            return {
                "query": parsed["question"],
                "ground_truth": parsed["answer"],
                "difficulty": "simple",
                "source_doc": doc[:500],  # provenance snippet for audit
            }
        except (json.JSONDecodeError, KeyError):
            return None

    def _gen_reasoning(self) -> dict | None:
        """Generate question requiring inference/reasoning."""
        import random
        if not self.documents:
            return None
        doc = random.choice(self.documents)
        prompt = (
            f"Based on this document, generate a question that requires "
            f"reasoning or inference (not just fact lookup).\n\n"
            f"Document: {doc[:2000]}\n\n"
            f"Return JSON: {{\"question\": \"...\", \"answer\": \"...\"}}"
        )
        result = self.llm.generate(prompt)
        try:
            parsed = json.loads(result)
            # Rename LLM keys to the test-set schema used elsewhere.
            parsed["difficulty"] = "reasoning"
            parsed["query"] = parsed.pop("question")
            parsed["ground_truth"] = parsed.pop("answer")
            return parsed
        except (json.JSONDecodeError, KeyError):
            return None

    def _gen_multi_hop(self) -> dict | None:
        """Generate question needing info from multiple documents."""
        import random
        if not self.documents:
            return None
        # With a single-document corpus this degrades to one doc used twice.
        docs = random.sample(self.documents, min(2, len(self.documents)))
        prompt = (
            f"Generate a question that can only be answered by combining "
            f"information from BOTH documents.\n\n"
            f"Document 1: {docs[0][:1000]}\n\n"
            f"Document 2: {docs[1][:1000] if len(docs) > 1 else docs[0][:1000]}\n\n"
            f"Return JSON: {{\"question\": \"...\", \"answer\": \"...\"}}"
        )
        result = self.llm.generate(prompt)
        try:
            parsed = json.loads(result)
            return {
                "query": parsed["question"],
                "ground_truth": parsed["answer"],
                "difficulty": "multi_hop",
            }
        except (json.JSONDecodeError, KeyError):
            return None

    def _gen_negative(self) -> dict | None:
        """Generate question that cannot be answered from the corpus."""
        prompt = (
            "Generate a realistic but specific question about a topic "
            "that would NOT be answerable from a typical knowledge base. "
            "Return JSON: {\"question\": \"...\"}"
        )
        result = self.llm.generate(prompt)
        try:
            parsed = json.loads(result)
            return {
                "query": parsed["question"],
                "ground_truth": "This question cannot be answered from the available documents.",
                "difficulty": "negative",
            }
        except (json.JSONDecodeError, KeyError):
            return None

A/B测试RAG

实验设计

变量 A组(基线) B组(实验) 度量
分块大小 512 tokens 256 tokens Precision/Recall
检索数量 Top-5 Top-10 Faithfulness
重排序 无 BGE-reranker Relevancy
模型 GPT-4o-mini GPT-4o 质量+成本
class RAGABTest:
    """A/B testing framework for RAG configurations.

    Evaluates two pipeline configurations over the same test set and
    reports per-metric deltas with a simple winner rule.
    """

    def __init__(self, config_a: dict, config_b: dict,
                 test_data: list[dict], metrics: RAGASMetrics):
        self.config_a = config_a
        self.config_b = config_b
        self.test_data = test_data
        self.metrics = metrics

    def run_experiment(self) -> dict:
        """Evaluate both configurations and return a per-metric comparison."""
        results_a = self._evaluate_config(self.config_a, "A")
        results_b = self._evaluate_config(self.config_b, "B")
        return self._compare(results_a, results_b)

    def _compare(self, results_a: list, results_b: list) -> dict:
        """Compare per-sample score dicts from the two arms.

        A metric is skipped if absent from either arm. Winner requires
        |delta| > 0.02; otherwise the result is a tie.
        """
        comparison = {}
        for metric in ("faithfulness", "answer_relevancy",
                       "context_precision"):
            a_vals = [r[metric] for r in results_a if metric in r]
            b_vals = [r[metric] for r in results_b if metric in r]
            if not a_vals or not b_vals:
                # Avoid np.mean([]) -> nan when a metric is missing.
                continue

            a_mean = float(np.mean(a_vals))
            b_mean = float(np.mean(b_vals))
            delta = b_mean - a_mean
            # NOTE: this is an effect-size comparison, not a statistical
            # significance test; the 0.02 margin is a practical noise floor.
            relative = delta / (a_mean + 1e-8) * 100

            comparison[metric] = {
                "A": round(a_mean, 3),
                "B": round(b_mean, 3),
                "delta": round(delta, 3),
                "relative_pct": round(relative, 1),
                "winner": "B" if delta > 0.02 else ("A" if delta < -0.02 else "tie"),
            }

        return comparison

    def _evaluate_config(self, config: dict, label: str) -> list:
        """Run the pipeline built from *config* over the test set.

        build_rag_pipeline is expected to be provided by the surrounding
        project; *label* ("A"/"B") is kept for logging/extension hooks.
        """
        rag = build_rag_pipeline(config)
        results = []
        for sample in self.test_data:
            output = rag.query(sample["query"])
            eval_sample = RAGEvalSample(
                query=sample["query"],
                contexts=output["contexts"],
                answer=output["answer"],
                ground_truth=sample.get("ground_truth"),
            )
            scores = {
                "faithfulness": self.metrics.faithfulness(eval_sample),
                "answer_relevancy": self.metrics.answer_relevancy(eval_sample),
                "context_precision": self.metrics.context_precision(eval_sample),
            }
            results.append(scores)
        return results

结论

RAG评测体系的建设是RAG系统从"实验室"走向"生产环境"的关键一步。RAGAS提供了从检索质量到生成质量的四维评测框架,合成测试数据生成解决了评测数据集的冷启动问题,A/B测试为配置优化提供了数据驱动的决策依据。建议将RAG评测纳入CI/CD流水线,设置质量门禁(Faithfulness > 0.85, Relevancy > 0.80),并通过持续的A/B测试推动系统逐步优化。


Maurice | maurice_wen@proton.me