大语言模型驱动的知识图谱补全

问题定义

知识图谱天然是不完整的。即使经过大规模自动抽取,图谱中仍存在大量缺失的实体、关系和属性。知识图谱补全(Knowledge Graph Completion, KGC)的目标是推断出图谱中应该存在但尚未被记录的事实。

大语言模型(LLM)为这一问题带来了范式转换:从传统的基于嵌入的链接预测,转向基于语言理解的推理补全。


传统方法 vs LLM 方法

技术路线对比

传统路线(Embedding-based):
  实体/关系 ──→ 嵌入学习 ──→ 评分函数 ──→ 排序预测
  代表:TransE / RotatE / CompGCN
  优势:快速、可扩展
  劣势:冷启动差、不理解语义

LLM 路线(Language-based):
  三元组 ──→ 自然语言描述 ──→ LLM 推理 ──→ 补全结果
  代表:GPT-4 / Claude / LLaMA + Fine-tuning
  优势:语义理解强、零样本能力
  劣势:成本高、幻觉风险

性能对比

方法类别 Hits@1 (FB15k-237) MRR 推理速度 冷启动
TransE 0.243 0.339 极快 差
RotatE 0.281 0.368 极快 差
CompGCN 0.299 0.382 快 差
KG-BERT 0.312 0.401 慢 中
GPT-4 (few-shot) 0.345 0.432 极慢 极好
LLM + KGE 混合 0.371 0.458 慢 好

LLM 补全的三大任务

任务一:链接预测(Link Prediction)

给定头实体和关系,预测尾实体;或给定尾实体和关系,预测头实体。

(华为, 创始人, ?) ──→ 任正非
(?, 总部位于, 深圳) ──→ 腾讯、华为、大疆...

任务二:实体类型推断(Entity Typing)

给定实体及其上下文,推断缺失的类型标签。

(OpenAI, ?) ──→ [AI公司, 技术公司, 美国企业]

任务三:关系预测(Relation Prediction)

给定两个实体,预测它们之间应该存在的关系。

(张三, ?, 阿里巴巴) ──→ 任职于
(谷歌, ?, DeepMind) ──→ 收购

链接预测实现

方案一:LLM 直接推理

from openai import OpenAI
import json

# Module-level OpenAI client shared by the completion helpers below.
# NOTE(review): reads OPENAI_API_KEY from the environment — confirm deployment config.
client = OpenAI()

def link_prediction_direct(
    head: str,
    relation: str,
    known_context: list[dict],
    top_k: int = 5
) -> list[dict]:
    """Predict the missing tail entity with a single direct LLM call.

    Args:
        head: Head entity name of the incomplete triple.
        relation: Relation name of the incomplete triple.
        known_context: Known triples (dicts with ``subject``/``predicate``/
            ``object`` keys) given to the model as grounding facts.
        top_k: Number of candidate entities to request.

    Returns:
        The model's ``predictions`` list: dicts with ``entity``,
        ``confidence`` and ``reasoning`` keys.
    """
    # Render each known triple as one "- s --[p]--> o" bullet line.
    context_text = "\n".join(
        f"- {t['subject']} --[{t['predicate']}]--> {t['object']}"
        for t in known_context
    )

    prompt = f"""
基于以下知识图谱中的已知事实,预测缺失的实体。

已知事实:
{context_text}

待预测:
{head} --[{relation}]--> ?

请给出最可能的 {top_k} 个实体,按置信度从高到低排列。
输出 JSON 格式:
{{
  "predictions": [
    {{"entity": "实体名", "confidence": 0.95, "reasoning": "推理依据"}}
  ]
}}
"""

    messages = [
        {"role": "system", "content": "你是知识图谱补全专家。基于已知事实做逻辑推理。"},
        {"role": "user", "content": prompt},
    ]
    # temperature=0 + json_object response format → deterministic, parseable output.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        response_format={"type": "json_object"},
        temperature=0.0
    )
    payload = json.loads(response.choices[0].message.content)
    return payload["predictions"]

方案二:检索增强链接预测(RAG-KGC)

先从图谱中检索相关子图作为上下文,再让 LLM 推理。

class RAGLinkPredictor:
    """Retrieval-augmented link predictor (RAG-KGC).

    Retrieves a local subgraph around the query entity plus typical usage
    patterns of the target relation, then asks the LLM to reason over that
    retrieved context only.
    """

    def __init__(self, graph_session, llm_client):
        # graph_session: Neo4j-style session exposing .run(query, **params).
        # llm_client: OpenAI-compatible chat-completions client.
        self.session = graph_session
        self.client = llm_client

    def retrieve_context(
        self,
        entity: str,
        max_hops: int = 2,
        max_triples: int = 50
    ) -> list[dict]:
        """Return up to *max_triples* triples from the *max_hops*-hop
        neighbourhood of *entity*, as dicts with ``subject``/``predicate``/
        ``object`` keys.

        Bug fix: Cypher does not accept a query parameter inside the bounds
        of a variable-length relationship pattern — ``[*1..$hops]`` is a
        syntax error in Neo4j. The hop count is therefore validated as an
        integer and interpolated into the query text; the other values stay
        as proper parameters.
        """
        hops = int(max_hops)
        if hops < 1:
            raise ValueError(f"max_hops must be >= 1, got {max_hops!r}")
        query = f"""
            MATCH path = (e:Entity {{name: $name}})-[*1..{hops}]-(neighbor)
            UNWIND relationships(path) AS r
            WITH startNode(r) AS s, type(r) AS rel, endNode(r) AS o
            RETURN DISTINCT s.name AS subject, rel AS predicate, o.name AS object
            LIMIT $limit
            """
        result = self.session.run(query, name=entity, limit=max_triples)
        return result.data()

    def predict(
        self,
        head: str,
        relation: str,
        top_k: int = 5
    ) -> list[dict]:
        """Retrieval-augmented link prediction.

        Returns the model's ``predictions`` list (dicts with ``entity``,
        ``confidence``, ``reasoning``).
        """
        # 1. Retrieve the head entity's known neighbourhood as grounding.
        context = self.retrieve_context(head)

        # 2. Retrieve a few typical (subject, object) examples of the target
        #    relation so the model sees its usage pattern.
        relation_patterns = self.session.run(
            """
            MATCH (s)-[r]->(o)
            WHERE type(r) = $rel
            RETURN s.name, s.type, o.name, o.type
            LIMIT 10
            """,
            rel=relation
        ).data()

        # 3. Build the augmented prompt from both retrieval results.
        context_text = "\n".join(
            f"  {t['subject']} --[{t['predicate']}]--> {t['object']}"
            for t in context
        )
        pattern_text = "\n".join(
            f"  {p['s.name']}({p['s.type']}) --[{relation}]--> {p['o.name']}({p['o.type']})"
            for p in relation_patterns
        )

        prompt = f"""
你需要预测知识图谱中的缺失链接。

目标实体 "{head}" 的已知关系:
{context_text}

关系 "{relation}" 的典型模式:
{pattern_text}

预测:{head} --[{relation}]--> ?

给出 {top_k} 个最可能的候选实体,说明推理依据。
输出JSON: {{"predictions": [{{"entity": "名称", "confidence": 0.0-1.0, "reasoning": "依据"}}]}}
"""

        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "你是知识图谱推理专家。只基于给定事实推理。"},
                {"role": "user", "content": prompt}
            ],
            response_format={"type": "json_object"},
            temperature=0.0
        )
        return json.loads(response.choices[0].message.content)["predictions"]

实体类型推断

def entity_typing(
    entity_name: str,
    context_triples: list[dict],
    type_taxonomy: list[str]
) -> list[dict]:
    """Infer the type label(s) of an entity from its graph context.

    Args:
        entity_name: The entity whose type is unknown.
        context_triples: Known triples involving the entity (dicts with
            ``subject``/``predicate``/``object`` keys).
        type_taxonomy: Closed list of allowed type labels.

    Returns:
        The model's ``types`` list: dicts with ``type``, ``confidence``
        and ``evidence`` keys.
    """
    # Format each known triple on its own bullet line.
    lines = []
    for t in context_triples:
        lines.append(f"- {t['subject']} --[{t['predicate']}]--> {t['object']}")
    context_text = "\n".join(lines)

    type_list = ", ".join(type_taxonomy)

    prompt = f"""
根据以下实体的关系上下文,推断实体 "{entity_name}" 的类型。

可选类型列表:{type_list}

已知关系:
{context_text}

为实体选择最匹配的类型(可多选),并给出推理依据。
输出JSON: {{
  "types": [
    {{"type": "类型名", "confidence": 0.95, "evidence": "关键证据"}}
  ]
}}
"""

    messages = [
        {"role": "system", "content": "你是本体分类专家。"},
        {"role": "user", "content": prompt},
    ]
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        response_format={"type": "json_object"},
        temperature=0.0
    )
    parsed = json.loads(response.choices[0].message.content)
    return parsed["types"]

关系预测

def relation_prediction(
    entity_a: str,
    entity_b: str,
    context_a: list[dict],
    context_b: list[dict],
    relation_schema: list[str]
) -> list[dict]:
    """Predict the relation(s) holding between two entities.

    Each entity's context is truncated to its first 15 triples to keep the
    prompt bounded.

    Returns:
        The model's ``relations`` list: full triples with ``confidence``
        and ``reasoning`` attached.
    """
    def _render(triples: list[dict]) -> str:
        # One "  s--[p]-->o" line per triple, capped at 15 triples.
        return "\n".join(
            f"  {t['subject']}--[{t['predicate']}]-->{t['object']}"
            for t in triples[:15]
        )

    ctx_a = _render(context_a)
    ctx_b = _render(context_b)
    rels = ", ".join(relation_schema)

    prompt = f"""
预测实体 "{entity_a}" 和 "{entity_b}" 之间最可能存在的关系。

可选关系类型:{rels}

实体A "{entity_a}" 的已知关系:
{ctx_a}

实体B "{entity_b}" 的已知关系:
{ctx_b}

预测 {entity_a} 与 {entity_b} 之间的关系(可能有多个或没有关系)。
输出JSON: {{
  "relations": [
    {{
      "subject": "{entity_a}",
      "predicate": "关系类型",
      "object": "{entity_b}",
      "confidence": 0.9,
      "reasoning": "推理依据"
    }}
  ]
}}
"""

    messages = [
        {"role": "system", "content": "你是知识图谱关系推理专家。只输出有充分证据支撑的关系。"},
        {"role": "user", "content": prompt},
    ]
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        response_format={"type": "json_object"},
        temperature=0.0
    )
    return json.loads(response.choices[0].message.content)["relations"]

幻觉抑制策略

LLM 最大的风险是幻觉(Hallucination)——生成看似合理但实际不存在的事实。

多重验证框架

LLM 预测结果
    │
    ├── 验证层 1:图谱约束检查
    │     类型约束、基数约束、时序约束
    │
    ├── 验证层 2:多模型投票
    │     3+ 个模型独立预测,取交集
    │
    ├── 验证层 3:外部知识验证
    │     Web 搜索验证、权威知识库交叉核对
    │
    └── 验证层 4:人工抽样审核
          定期抽样验证,更新可信度

实现代码

class HallucinationFilter:
    """幻觉过滤器"""

    def __init__(self, graph_session, ontology: dict):
        self.session = graph_session
        self.ontology = ontology

    def check_type_constraint(self, triple: dict) -> bool:
        """检查类型约束"""
        rel = triple["predicate"]
        constraints = self.ontology.get("relation_constraints", {}).get(rel, {})

        if constraints:
            # 检查主语类型
            subject_type = self._get_entity_type(triple["subject"])
            if subject_type and constraints.get("domain"):
                if subject_type not in constraints["domain"]:
                    return False

            # 检查宾语类型
            object_type = self._get_entity_type(triple["object"])
            if object_type and constraints.get("range"):
                if object_type not in constraints["range"]:
                    return False

        return True

    def check_cardinality(self, triple: dict) -> bool:
        """检查基数约束(如 CEO 只能有一个)"""
        functional_relations = self.ontology.get("functional_relations", [])

        if triple["predicate"] in functional_relations:
            existing = self.session.run(
                """
                MATCH (s:Entity {name: $subject})-[r]->(o)
                WHERE type(r) = $rel
                RETURN count(o) AS cnt
                """,
                subject=triple["subject"],
                rel=triple["predicate"]
            ).single()["cnt"]

            if existing >= 1:
                return False  # 函数关系已有值,不应再添加

        return True

    def multi_model_vote(
        self,
        triple: dict,
        models: list[str],
        threshold: int = 2
    ) -> bool:
        """多模型投票验证"""
        votes = 0
        for model_name in models:
            verification = verify_with_model(triple, model_name)
            if verification.get("is_valid"):
                votes += 1

        return votes >= threshold

    def filter(self, predictions: list[dict]) -> list[dict]:
        """综合过滤"""
        validated = []
        for pred in predictions:
            triple = {
                "subject": pred.get("subject", ""),
                "predicate": pred.get("predicate", ""),
                "object": pred.get("entity", pred.get("object", ""))
            }

            checks = {
                "type_constraint": self.check_type_constraint(triple),
                "cardinality": self.check_cardinality(triple),
            }

            # 只有通过所有约束检查的才保留
            if all(checks.values()):
                pred["validation"] = checks
                validated.append(pred)

        return validated

    def _get_entity_type(self, name: str) -> str | None:
        result = self.session.run(
            "MATCH (e:Entity {name: $name}) RETURN e.type AS type",
            name=name
        ).single()
        return result["type"] if result else None

批量补全管线

端到端补全流程

class KGCompletionPipeline:
    """End-to-end LLM-driven knowledge-graph completion pipeline:
    find candidate gaps → predict → filter hallucinations → ingest."""

    def __init__(self, graph_session, llm_client, ontology: dict):
        self.session = graph_session
        self.client = llm_client
        self.predictor = RAGLinkPredictor(graph_session, llm_client)
        self.filter = HallucinationFilter(graph_session, ontology)

    def find_completion_candidates(self) -> list[dict]:
        """Discover positions in the graph that likely need completion.

        Two heuristics: (1) entities with very low degree, which probably
        miss relations; (2) Company nodes without the ontology-expected
        总部位于 relation.
        """
        candidates = []

        # 1. Low-degree entities (likely missing relations).
        # NOTE(review): size((e)-[]-()) was removed in Neo4j 5 — on 5.x this
        # must become COUNT { (e)--() }; confirm the deployed server version.
        sparse_entities = self.session.run(
            """
            MATCH (e:Entity)
            WITH e, size((e)-[]-()) AS degree
            WHERE degree < 3
            RETURN e.name, e.type, degree
            ORDER BY degree ASC
            LIMIT 100
            """
        ).data()

        for entity in sparse_entities:
            candidates.append({
                "type": "sparse_entity",
                "entity": entity["e.name"],
                "entity_type": entity["e.type"],
                "current_degree": entity["degree"]
            })

        # 2. Relations the ontology expects but the graph lacks.
        expected_but_missing = self.session.run(
            """
            MATCH (a:Company)
            WHERE NOT EXISTS((a)-[:总部位于]->(:City))
            RETURN a.name
            LIMIT 50
            """
        ).data()

        for entity in expected_but_missing:
            candidates.append({
                "type": "missing_expected_relation",
                "entity": entity["a.name"],
                "missing_relation": "总部位于"
            })

        return candidates

    def run_completion(self, max_candidates: int = 50) -> dict:
        """Run completion over at most *max_candidates* candidates.

        Returns counters: attempted / predicted / validated / ingested.

        Fix: the old code pre-fetched ``retrieve_context`` for each sparse
        entity and never used the result (``predict`` retrieves its own
        context internally) — one redundant graph round-trip per candidate;
        that call is removed.
        """
        candidates = self.find_completion_candidates()[:max_candidates]
        stats = {"attempted": 0, "predicted": 0, "validated": 0, "ingested": 0}

        for cand in candidates:
            stats["attempted"] += 1

            if cand["type"] == "sparse_entity":
                # Probe a fixed set of common relations for low-degree entities.
                for rel in ["属于", "位于", "创建者"]:
                    predictions = self.predictor.predict(
                        head=cand["entity"],
                        relation=rel,
                        top_k=3
                    )
                    stats["predicted"] += len(predictions)

                    # Drop predictions that violate ontology constraints.
                    validated = self.filter.filter(predictions)
                    stats["validated"] += len(validated)

                    # Only high-confidence survivors are written to the graph.
                    for pred in validated:
                        if pred.get("confidence", 0) >= 0.8:
                            self._ingest_prediction(cand["entity"], rel, pred)
                            stats["ingested"] += 1

        return stats

    def _ingest_prediction(self, head: str, relation: str, prediction: dict):
        """Write a validated prediction to the graph as a provenance-tagged
        PREDICTED_REL edge (kept separate from curated relations)."""
        self.session.run(
            """
            MATCH (h:Entity {name: $head})
            MERGE (t:Entity {name: $tail})
            MERGE (h)-[r:PREDICTED_REL {type: $rel}]->(t)
            SET r.confidence = $conf,
                r.source = "llm_completion",
                r.reasoning = $reasoning,
                r.predicted_at = datetime()
            """,
            head=head,
            tail=prediction["entity"],
            rel=relation,
            conf=prediction["confidence"],
            reasoning=prediction.get("reasoning", "")
        )

评估方法

标准评估指标

def evaluate_completion(
    predictions: list[dict],
    ground_truth: list[dict]
) -> dict:
    """Score completion quality with Hits@K and MRR@K for K in {1,3,5,10}.

    Args:
        predictions: One dict per query, with ``head``, ``relation`` and a
            ranked ``candidates`` list (each candidate has an ``entity`` key).
        ground_truth: True triples as ``subject``/``predicate``/``object`` dicts.

    Returns:
        Mapping like ``{"Hits@1": ..., "MRR@1": ..., ...}``.
    """
    # Truth as a set of (s, p, o) tuples for O(1) membership tests.
    truth = {(g["subject"], g["predicate"], g["object"]) for g in ground_truth}

    total = len(predictions)
    metrics = {}
    for k in [1, 3, 5, 10]:
        hit_count = 0
        reciprocal_sum = 0.0

        for group in predictions:
            # Scan the top-k candidates; credit only the first hit per query.
            for rank, cand in enumerate(group["candidates"][:k], start=1):
                candidate_triple = (group["head"], group["relation"], cand["entity"])
                if candidate_triple in truth:
                    hit_count += 1
                    reciprocal_sum += 1.0 / rank
                    break

        metrics[f"Hits@{k}"] = hit_count / total if total > 0 else 0
        metrics[f"MRR@{k}"] = reciprocal_sum / total if total > 0 else 0

    return metrics

成本优化策略

策略 描述 成本节省
候选预筛选 先用嵌入模型粗排,再用 LLM 精排 80-90%
批量推理 多个预测请求合并为一个 prompt 30-50%
模型分级 简单预测用小模型,复杂推理用大模型 50-70%
缓存 相似查询结果缓存复用 20-40%
增量补全 只对新增/变更实体做补全 因场景而异

候选预筛选实现

def prefilter_candidates(
    head: str,
    relation: str,
    all_entities: list[dict],
    top_n: int = 20
) -> list[dict]:
    """Pre-rank candidate entities with a cheap embedding model so the LLM
    only re-ranks *top_n* candidates instead of the whole entity set.

    Args:
        head: Head entity of the query triple.
        relation: Relation of the query triple.
        all_entities: Candidate entities; dicts with ``name`` and optional
            ``description`` keys.
        top_n: How many candidates to keep.

    Returns:
        The *top_n* entities most similar to "head relation", best first.
    """
    # Bug fix: `np` was used below but numpy was never imported anywhere in
    # this module, so every call raised NameError.
    import numpy as np
    from sentence_transformers import SentenceTransformer

    # NOTE(review): the model is re-loaded on every call; cache it at module
    # level (or inject it) if this sits on a hot path.
    model = SentenceTransformer("shibing624/text2vec-base-chinese")

    query = f"{head} {relation}"
    query_emb = model.encode(query)

    entity_texts = [f"{e['name']} {e.get('description', '')}" for e in all_entities]
    entity_embs = model.encode(entity_texts)

    # Raw dot-product similarity — equivalent to cosine only if the encoder
    # returns normalized vectors; presumably it does — TODO confirm.
    similarities = np.dot(entity_embs, query_emb)
    top_indices = np.argsort(similarities)[-top_n:][::-1]

    return [all_entities[i] for i in top_indices]

总结

LLM 驱动的知识图谱补全核心要点:

  1. RAG 范式:先从图谱检索上下文,再让 LLM 推理,比直接推理准确率高 15-25%
  2. 幻觉是最大风险:必须多层验证(类型约束 + 多模型投票 + 外部验证)
  3. 成本控制:候选预筛选 + 模型分级 + 批量推理,可将成本降低 80%+
  4. 增量补全:生产环境用事件驱动的增量补全,不做全量重算
  5. 人机协同:高置信预测自动入库,低置信预测交给人工审核

Maurice | maurice_wen@proton.me