知识图谱与大模型融合实践
原创
灵阙教研团队
S 精选 进阶 |
约 8 分钟阅读
更新于 2026-02-28 AI 导读
知识图谱与大模型融合实践 KG增强LLM、Graph-based RAG、实体链接与知识锚定:减少幻觉的工程路径 引言 大语言模型的"幻觉"问题——即自信地生成与事实不符的内容——是阻碍其在高可靠性场景落地的核心障碍。知识图谱作为结构化的事实存储,天然具备"可验证性"和"可追溯性",是对抗幻觉的有力武器。本文将系统阐述知识图谱与大模型融合的四种核心模式及其工程实现。 融合模式概览 四种融合范式...
知识图谱与大模型融合实践
KG增强LLM、Graph-based RAG、实体链接与知识锚定:减少幻觉的工程路径
引言
大语言模型的"幻觉"问题——即自信地生成与事实不符的内容——是阻碍其在高可靠性场景落地的核心障碍。知识图谱作为结构化的事实存储,天然具备"可验证性"和"可追溯性",是对抗幻觉的有力武器。本文将系统阐述知识图谱与大模型融合的四种核心模式及其工程实现。
融合模式概览
四种融合范式
KG-LLM融合范式
范式1: KG-Enhanced Retrieval(检索增强)
Query → [KG检索] → 结构化上下文 → LLM → Answer
特点: LLM不变,KG提供精确上下文
适用: 事实性问答、数据查询
范式2: KG-Grounded Generation(知识锚定)
Query → LLM生成初稿 → [KG验证] → 修正 → Answer
特点: LLM先生成,KG后验证
适用: 长文本生成、报告撰写
范式3: KG-Augmented Reasoning(推理增强)
Query → [KG子图检索] → 图推理路径 → LLM推理 → Answer
特点: KG提供推理链路,LLM做自然语言推理
适用: 多跳问答、因果分析
范式4: LLM-Powered KG(LLM驱动KG)
Text → LLM抽取 → KG构建/更新
Query → LLM生成Cypher → KG执行 → Answer
特点: LLM负责KG的构建和查询
适用: 知识图谱维护、Text-to-Cypher
范式1:KG增强检索(GraphRAG)
架构设计
from dataclasses import dataclass
@dataclass
class GraphRAGResult:
answer: str
entities: list[dict]
subgraph: list[dict] # Triples used
confidence: float
sources: list[str]
class GraphRAGPipeline:
"""Knowledge Graph enhanced RAG pipeline."""
def __init__(self, kg_client, vector_store, llm):
self.kg = kg_client
self.vectors = vector_store
self.llm = llm
def query(self, question: str) -> GraphRAGResult:
# Step 1: Entity extraction from question
entities = self._extract_entities(question)
# Step 2: KG subgraph retrieval
subgraph = self._retrieve_subgraph(entities, depth=2)
# Step 3: Vector retrieval for additional context
vector_results = self.vectors.similarity_search(question, k=3)
# Step 4: Combine structured + unstructured context
context = self._build_context(subgraph, vector_results)
# Step 5: Generate answer with grounding
answer = self._generate_grounded_answer(question, context)
return GraphRAGResult(
answer=answer,
entities=entities,
subgraph=subgraph,
confidence=self._assess_confidence(answer, subgraph),
sources=[t["source"] for t in subgraph if "source" in t],
)
def _extract_entities(self, text: str) -> list[dict]:
"""Extract entities using LLM for KG lookup."""
prompt = f"Extract named entities from: {text}\nReturn JSON array."
response = self.llm.generate(prompt, temperature=0)
import json
try:
return json.loads(response)
except:
return []
def _retrieve_subgraph(self, entities: list[dict],
depth: int = 2) -> list[dict]:
"""Retrieve relevant subgraph from KG."""
triples = []
for entity in entities:
name = entity.get("text", entity.get("name", ""))
# Query KG for entity and its neighbors
results = self.kg.query(f"""
MATCH (e {{name: $name}})-[r]-(neighbor)
RETURN e.name AS subject, type(r) AS predicate,
neighbor.name AS object, labels(neighbor) AS types
LIMIT 50
""", name=name)
triples.extend(results)
return triples
def _build_context(self, subgraph: list[dict],
vector_results: list) -> str:
"""Combine graph and vector contexts."""
# Structured context from KG
kg_context = "Structured Knowledge:\n"
for triple in subgraph[:20]:
kg_context += f"- {triple['subject']} --[{triple['predicate']}]--> {triple['object']}\n"
# Unstructured context from vectors
text_context = "Related Documents:\n"
for doc in vector_results:
text_context += f"- {doc.page_content[:300]}\n"
return f"{kg_context}\n{text_context}"
def _generate_grounded_answer(self, question: str,
context: str) -> str:
prompt = f"""Answer the question based ONLY on the provided knowledge.
If the knowledge is insufficient, say so.
{context}
Question: {question}
Requirements:
- Cite specific facts from the structured knowledge
- Do not invent information not present in the context
- If uncertain, express uncertainty explicitly
"""
return self.llm.generate(prompt, temperature=0)
def _assess_confidence(self, answer: str,
subgraph: list[dict]) -> float:
"""Estimate answer confidence based on KG coverage."""
if not subgraph:
return 0.3
return min(0.95, 0.5 + len(subgraph) * 0.05)
范式2:知识锚定生成
事实验证流水线
class KGGroundedGenerator:
"""Generate text grounded in knowledge graph facts."""
def __init__(self, kg_client, llm):
self.kg = kg_client
self.llm = llm
def generate_with_verification(self, prompt: str) -> dict:
# Phase 1: Generate initial draft
draft = self.llm.generate(prompt, temperature=0.7)
# Phase 2: Extract claims from draft
claims = self._extract_claims(draft)
# Phase 3: Verify each claim against KG
verified_claims = []
for claim in claims:
verification = self._verify_claim(claim)
verified_claims.append({
"claim": claim,
"status": verification["status"],
"evidence": verification.get("evidence"),
"correction": verification.get("correction"),
})
# Phase 4: Correct if needed
unsupported = [c for c in verified_claims if c["status"] != "supported"]
if unsupported:
corrected = self._correct_draft(draft, unsupported)
else:
corrected = draft
return {
"draft": draft,
"final": corrected,
"claims": verified_claims,
"accuracy": sum(1 for c in verified_claims
if c["status"] == "supported") / max(len(verified_claims), 1),
}
def _extract_claims(self, text: str) -> list[str]:
prompt = f"Extract all factual claims from this text, one per line:\n{text}"
response = self.llm.generate(prompt, temperature=0)
return [c.strip() for c in response.split("\n") if c.strip()]
def _verify_claim(self, claim: str) -> dict:
"""Verify a claim against the knowledge graph."""
# Extract entities from claim
entities = self._quick_ner(claim)
# Search KG for relevant facts
kg_facts = []
for entity in entities:
facts = self.kg.query(f"""
MATCH (e {{name: $name}})-[r]->(t)
RETURN e.name + ' ' + type(r) + ' ' + t.name AS fact
LIMIT 10
""", name=entity)
kg_facts.extend([f["fact"] for f in facts])
if not kg_facts:
return {"status": "unverifiable", "reason": "No matching KG facts"}
# Use LLM to compare claim against KG facts
facts_str = "\n".join(kg_facts)
prompt = f"""Compare this claim against known facts.
Claim: {claim}
Known facts:
{facts_str}
Is the claim: supported, contradicted, or unverifiable?
If contradicted, provide correction."""
response = self.llm.generate(prompt, temperature=0)
if "supported" in response.lower():
return {"status": "supported", "evidence": kg_facts[:3]}
elif "contradicted" in response.lower():
return {"status": "contradicted", "evidence": kg_facts[:3],
"correction": response}
else:
return {"status": "unverifiable"}
def _quick_ner(self, text: str) -> list[str]:
prompt = f"Extract entity names from: {text}\nReturn comma-separated list."
return [e.strip() for e in self.llm.generate(prompt).split(",")]
def _correct_draft(self, draft: str, unsupported: list[dict]) -> str:
corrections = "\n".join([
f"- Claim: {c['claim']}\n Issue: {c['status']}\n Correction: {c.get('correction', 'Remove or rephrase')}"
for c in unsupported
])
prompt = f"""Revise this text to fix the following issues:
{corrections}
Original text:
{draft}
Return the corrected text."""
return self.llm.generate(prompt, temperature=0)
实体链接
从文本到KG节点
class EntityLinker:
"""Link text mentions to knowledge graph entities."""
def __init__(self, kg_client, embed_fn, threshold: float = 0.8):
self.kg = kg_client
self.embed = embed_fn
self.threshold = threshold
def link(self, mention: str, context: str = "",
entity_type: str = None) -> dict:
"""Link a text mention to a KG entity."""
# Strategy 1: Exact match
exact = self._exact_match(mention, entity_type)
if exact:
return {"entity": exact, "method": "exact", "score": 1.0}
# Strategy 2: Alias match
alias = self._alias_match(mention, entity_type)
if alias:
return {"entity": alias, "method": "alias", "score": 0.95}
# Strategy 3: Embedding similarity
candidates = self._get_candidates(mention, entity_type)
if candidates:
best = self._rank_by_embedding(mention, context, candidates)
if best and best["score"] >= self.threshold:
return {"entity": best, "method": "embedding", "score": best["score"]}
# Strategy 4: LLM disambiguation
if candidates:
disambiguated = self._llm_disambiguate(mention, context, candidates)
if disambiguated:
return {"entity": disambiguated, "method": "llm", "score": 0.85}
return {"entity": None, "method": "not_found", "score": 0.0}
def _exact_match(self, mention: str, entity_type: str = None):
type_filter = f":{entity_type}" if entity_type else ""
results = self.kg.query(f"""
MATCH (e{type_filter} {{name: $name}})
RETURN e.name AS name, labels(e) AS types, id(e) AS id
LIMIT 1
""", name=mention)
return results[0] if results else None
def _alias_match(self, mention: str, entity_type: str = None):
results = self.kg.query("""
MATCH (e) WHERE $mention IN e.aliases
RETURN e.name AS name, labels(e) AS types, id(e) AS id
LIMIT 5
""", mention=mention)
if entity_type:
results = [r for r in results if entity_type in r["types"]]
return results[0] if results else None
def _get_candidates(self, mention: str, entity_type: str = None, limit: int = 20):
type_filter = f":{entity_type}" if entity_type else ""
return self.kg.query(f"""
CALL db.index.fulltext.queryNodes('entity_names', $query)
YIELD node, score
WHERE score > 0.5
RETURN node.name AS name, labels(node) AS types, score
LIMIT $limit
""", query=mention, limit=limit)
def _rank_by_embedding(self, mention, context, candidates):
query_text = f"{mention} {context}" if context else mention
query_emb = self.embed([query_text])[0]
cand_embs = self.embed([c["name"] for c in candidates])
import numpy as np
scores = [float(np.dot(query_emb, ce) /
(np.linalg.norm(query_emb) * np.linalg.norm(ce) + 1e-8))
for ce in cand_embs]
best_idx = int(np.argmax(scores))
return {**candidates[best_idx], "score": scores[best_idx]}
def _llm_disambiguate(self, mention, context, candidates):
# Use LLM for complex disambiguation
pass
幻觉减少效果评估
对比实验
| 方法 | 事实准确率 | 幻觉率 | 延迟增加 | 成本增加 |
|---|---|---|---|---|
| 纯LLM | 72% | 28% | 基准 | 基准 |
| LLM + Vector RAG | 85% | 15% | +200ms | +30% |
| LLM + KG检索 | 89% | 11% | +300ms | +40% |
| LLM + GraphRAG | 92% | 8% | +500ms | +60% |
| LLM + KG验证 | 94% | 6% | +800ms | +80% |
结论
知识图谱与大模型的融合是解决LLM幻觉问题最有前景的工程路径。四种融合范式各有适用场景:KG增强检索适用于事实性问答,知识锚定适用于内容生成,推理增强适用于复杂分析,LLM驱动KG则形成了知识的自动维护闭环。在工程实践中,建议从最简单的KG增强检索开始,逐步引入验证和推理能力,同时建立完善的实体链接基础设施。
Maurice | maurice_wen@proton.me