知识图谱自动构建:从文本到三元组

NER+关系抽取、LLM驱动的知识提取、Schema设计与质量控制全流程

引言

知识图谱的价值取决于其覆盖度和准确度,而这两者都依赖于高效的知识获取能力。传统的人工构建方式成本高昂且难以扩展,自动化构建已成为知识图谱工程的核心挑战。本文将从命名实体识别(NER)、关系抽取、LLM驱动的知识提取、Schema设计和质量控制五个方面系统阐述知识图谱自动构建的工程实践。

自动构建流水线

端到端架构

知识图谱自动构建流水线

原始文本
    │
    ▼
┌──────────────┐
│ 预处理       │  分句、分段、语言检测、编码统一
└──────┬───────┘
       │
       ▼
┌──────────────┐
│ 命名实体识别  │  识别文本中的实体(人、组织、地点、产品...)
│ (NER)        │
└──────┬───────┘
       │
       ▼
┌──────────────┐
│ 关系抽取      │  识别实体间的关系(属于、位于、创建...)
│ (RE)         │
└──────┬───────┘
       │
       ▼
┌──────────────┐
│ 三元组生成    │  (主体, 关系, 客体) + 属性
└──────┬───────┘
       │
       ▼
┌──────────────┐
│ 实体消解      │  合并指代同一实体的不同表述
└──────┬───────┘
       │
       ▼
┌──────────────┐
│ 质量控制      │  一致性校验、置信度过滤、人工审核
└──────┬───────┘
       │
       ▼
┌──────────────┐
│ 图数据库写入  │  Neo4j / Nebula / TigerGraph
└──────────────┘

命名实体识别(NER)

NER方法对比

方法 精度 召回 速度 扩展性 适用场景
规则+词典 高(领域内) 极快 固定领域
CRF/BiLSTM 中高 传统NLP
BERT-NER 通用NER
LLM零样本 中高 极好 新领域冷启动
LLM少样本 极好 领域适配

LLM驱动的NER

import json
from typing import Optional

class LLMEntityExtractor:
    """Extract named entities using LLM with structured output.

    Prompts the model with the schema's allowed entity types, parses the
    JSON reply, and keeps only entities that can be verified against the
    source text and the schema.
    """

    def __init__(self, llm_client, schema: dict):
        """
        Args:
            llm_client: Client exposing ``generate(prompt, **kwargs) -> str``.
            schema: Dict whose ``entity_types`` lists allowed type names.
        """
        self.llm = llm_client
        self.schema = schema

    def extract_entities(self, text: str) -> list[dict]:
        """Extract entities from text using LLM.

        Returns a list of validated entity dicts; empty on blank input or
        when the model reply is not parseable JSON.
        """
        if not text:
            return []  # nothing to extract; avoid a pointless LLM call
        entity_types = ", ".join(self.schema.get("entity_types", []))

        prompt = f"""Extract all named entities from the following text.

Entity types to extract: {entity_types}

Text:
{text}

Return a JSON array of entities:
[{{"text": "entity mention", "type": "entity_type", "start": char_offset, "end": char_offset}}]

Rules:
- Extract all instances, including pronouns resolved to their referent
- Use exact text spans from the original text
- Classify each entity into exactly one type
"""
        response = self.llm.generate(
            prompt,
            response_format={"type": "json_object"},
            temperature=0,  # deterministic output for reproducible extraction
        )

        try:
            result = json.loads(response)
        except json.JSONDecodeError:
            return []
        # JSON-object mode may wrap the array as {"entities": [...]}.
        entities = result if isinstance(result, list) else result.get("entities", [])
        return self._validate_entities(entities, text)

    def _validate_entities(self, entities: list[dict], text: str) -> list[dict]:
        """Validate extracted entities against source text.

        Drops entities whose mention is absent from ``text`` or whose type
        is not declared in the schema, and repairs ``start``/``end`` offsets
        when they do not actually delimit the mention (LLMs frequently
        miscount character offsets).
        """
        allowed = set(self.schema.get("entity_types", []))
        valid = []
        for ent in entities:
            if not isinstance(ent, dict):
                continue  # tolerate malformed array items
            mention = ent.get("text")
            if not mention or mention not in text:
                continue
            if ent.get("type") not in allowed:
                continue
            start, end = ent.get("start"), ent.get("end")
            if not (isinstance(start, int) and isinstance(end, int)
                    and text[start:end] == mention):
                # Repair offsets using the first occurrence of the mention.
                start = text.find(mention)
                ent["start"], ent["end"] = start, start + len(mention)
            valid.append(ent)
        return valid


# Usage example: a general-purpose schema with seven common entity types.
schema = {
    "entity_types": [
        "Person",
        "Organization",
        "Location",
        "Product",
        "Technology",
        "Date",
        "Money",
    ],
}

# extractor = LLMEntityExtractor(llm_client, schema)
# entities = extractor.extract_entities(
#     "2025年,阿里巴巴发布了通义千问2.5模型,总部位于杭州。"
# )

关系抽取

LLM驱动的关系抽取

class LLMRelationExtractor:
    """Extract relations between entities using LLM.

    Given a text and its already-extracted entities, prompts the model with
    the schema's relation types and validates the returned triples against
    both the entity list and the schema.
    """

    def __init__(self, llm_client, schema: dict):
        """
        Args:
            llm_client: Client exposing ``generate(prompt, **kwargs) -> str``.
            schema: Dict whose ``relation_types`` is a list of
                ``{"name": ..., "description": ...}`` dicts.
        """
        self.llm = llm_client
        self.schema = schema

    def extract_relations(self, text: str, entities: list[dict]) -> list[dict]:
        """Extract relations between identified entities.

        Returns validated relation dicts; empty when there are no entities
        or when the model reply cannot be parsed as JSON.
        """
        if not entities:
            return []  # no entities means no possible relations; skip the LLM call
        entity_list = "\n".join([
            f"- {e['text']} ({e['type']})" for e in entities
        ])
        relation_types = "\n".join([
            f"- {r['name']}: {r['description']}"
            for r in self.schema.get("relation_types", [])
        ])

        prompt = f"""Given the text and entities, extract all relations.

Text:
{text}

Entities:
{entity_list}

Possible relation types:
{relation_types}

Return JSON array of relations:
[{{"subject": "entity1", "predicate": "relation_type", "object": "entity2", "confidence": 0.0-1.0}}]

Rules:
- Only extract relations explicitly stated or strongly implied
- Subject and object must be from the entity list
- Assign confidence based on how explicit the relation is
"""
        # response_format matches LLMEntityExtractor for consistent JSON replies.
        response = self.llm.generate(
            prompt,
            response_format={"type": "json_object"},
            temperature=0,
        )

        try:
            parsed = json.loads(response)
        except json.JSONDecodeError:
            return []
        if isinstance(parsed, dict):
            # JSON-object mode may wrap the array as {"relations": [...]}.
            parsed = parsed.get("relations", [])
        if not isinstance(parsed, list):
            return []  # guard: a scalar/string reply would crash downstream
        return self._validate_relations(parsed, entities)

    def _validate_relations(self, relations: list[dict],
                             entities: list[dict]) -> list[dict]:
        """Keep relations whose subject and object are known entity mentions
        and whose predicate is declared in the schema."""
        entity_texts = {e["text"] for e in entities}
        valid_rels = {r["name"] for r in self.schema.get("relation_types", [])}
        valid = []
        for rel in relations:
            if not isinstance(rel, dict):
                continue  # tolerate malformed array items
            if (rel.get("subject") in entity_texts and
                rel.get("object") in entity_texts and
                rel.get("predicate") in valid_rels):
                valid.append(rel)
        return valid

三元组端到端抽取

传统管道先做NER再做RE,而端到端方法同时输出实体和关系:

class EndToEndKGExtractor:
    """One-shot entity and relation extraction.

    Sends the full schema and the text in a single prompt and expects the
    model to return entities and relations together.
    """

    def __init__(self, llm_client, schema: dict):
        """
        Args:
            llm_client: Client exposing ``generate(prompt, **kwargs) -> str``.
            schema: Full schema dict, serialized verbatim into the prompt.
        """
        self.llm = llm_client
        self.schema = schema

    def extract(self, text: str) -> dict:
        """Extract entities and relations in one pass.

        Always returns a dict containing at least the ``entities`` and
        ``relations`` keys (each a list), even when the model output is
        malformed — downstream code can index them without KeyError.
        """
        schema_desc = json.dumps(self.schema, ensure_ascii=False, indent=2)

        prompt = f"""Extract a knowledge graph from the following text.

Schema:
{schema_desc}

Text:
{text}

Return JSON with two arrays:
{{
  "entities": [{{"id": "e1", "text": "...", "type": "..."}}],
  "relations": [{{"subject_id": "e1", "predicate": "...", "object_id": "e2"}}]
}}
"""
        response = self.llm.generate(prompt, temperature=0)
        empty = {"entities": [], "relations": []}
        try:
            result = json.loads(response)
        except json.JSONDecodeError:
            return empty
        if not isinstance(result, dict):
            return empty  # model returned an array/scalar instead of an object
        # Guarantee both keys; preserve any extra keys the model produced.
        result.setdefault("entities", [])
        result.setdefault("relations", [])
        return result

Schema设计

本体设计原则

Schema设计方法论

Step 1: 确定领域范围
  ├── 核心问题: 图谱要回答什么问题?
  ├── 用例驱动: 列出top-10查询场景
  └── 边界定义: 明确不覆盖的范围

Step 2: 实体类型建模
  ├── 顶层类型: Person, Organization, Location, Event, Concept
  ├── 领域类型: Product, Technology, Policy, Standard
  ├── 属性定义: 每个类型的核心属性
  └── 唯一标识: 如何唯一区分同类型实体

Step 3: 关系类型建模
  ├── 通用关系: IS_A, PART_OF, RELATED_TO
  ├── 领域关系: DEVELOPED, REGULATED_BY, COMPETES_WITH
  ├── 时序关系: PRECEDED_BY, SUCCEEDED_BY
  └── 属性关系: 关系本身的属性(时间、强度、来源)

Step 4: 约束与规则
  ├── 基数约束: Person -[BORN_IN]-> Location (1:1)
  ├── 类型约束: FOUNDED_BY 主体必须是 Organization
  ├── 完整性约束: 必填属性
  └── 推理规则: A PART_OF B, B PART_OF C => A PART_OF C

Schema定义示例

from dataclasses import dataclass, field

@dataclass
class EntityType:
    """Declares one node type in the knowledge-graph schema."""
    name: str  # unique type name, e.g. "Person"
    description: str  # human-readable meaning of the type
    properties: dict[str, str]  # property name -> description/type hint
    required_properties: list[str] = field(default_factory=list)  # must be set on every node of this type

@dataclass
class RelationType:
    """Declares one edge type in the knowledge-graph schema."""
    name: str  # unique relation name, e.g. "FOUNDED_BY"
    description: str  # human-readable meaning of the relation
    source_type: str  # entity type required of the subject
    target_type: str  # entity type required of the object
    properties: dict[str, str] = field(default_factory=dict)  # edge attributes (time, strength, source, ...)
    cardinality: str = "many_to_many"  # e.g. "one_to_one"; default allows any multiplicity

class KGSchema:
    """Knowledge Graph schema definition and validation.

    Holds registries of entity and relation types (keyed by name) and
    offers triple validation plus Neo4j constraint generation.
    """

    def __init__(self):
        # Type registries, keyed by type name.
        self.entity_types: dict[str, EntityType] = {}
        self.relation_types: dict[str, RelationType] = {}

    def add_entity_type(self, et: EntityType):
        """Register an entity type under its name (replaces any existing)."""
        self.entity_types[et.name] = et

    def add_relation_type(self, rt: RelationType):
        """Register a relation type under its name (replaces any existing)."""
        self.relation_types[rt.name] = rt

    def validate_triple(self, subject_type: str, predicate: str,
                         object_type: str) -> bool:
        """Return True when the predicate exists and both endpoint types
        match its declared source/target types."""
        declared = self.relation_types.get(predicate)
        if declared is None:
            return False
        return (subject_type == declared.source_type
                and object_type == declared.target_type)

    def to_cypher_constraints(self) -> str:
        """Generate Neo4j constraint statements.

        One NOT NULL existence constraint per required property, joined
        by ";\\n" so the result can be split and executed statement-wise.
        """
        stmts = [
            f"CREATE CONSTRAINT IF NOT EXISTS "
            f"FOR (n:{label}) REQUIRE n.{prop} IS NOT NULL"
            for label, et in self.entity_types.items()
            for prop in et.required_properties
        ]
        return ";\n".join(stmts)

实体消解

消解策略

策略 精度 速度 适用
精确匹配 完美 极快 标准化名称
编辑距离 拼写变体
别名词典 已知别名
嵌入相似度 中高 语义相似
LLM判断 复杂歧义

质量控制

质量指标

维度 指标 计算方法 目标
准确性 三元组正确率 抽样人工验证 >90%
完整性 已知实体覆盖率 对比种子列表 >80%
一致性 Schema合规率 自动校验 100%
时效性 信息新鲜度 来源时间分布 90%在1年内
去重率 重复三元组比例 自动检测 <5%

自动化质量检查

class KGQualityChecker:
    """Automated quality checks for knowledge graph.

    Each ``check_*`` method issues one or more read-only queries against
    the graph database and returns a small result dict.
    """

    def __init__(self, graph_db, schema: KGSchema):
        self.db = graph_db
        self.schema = schema

    def run_all_checks(self) -> dict:
        """Run every check and collect results under stable report keys."""
        checks = {
            "schema_compliance": self.check_schema_compliance,
            "orphan_nodes": self.check_orphan_nodes,
            "duplicate_check": self.check_duplicates,
            "completeness": self.check_required_properties,
        }
        return {name: run for name, run in
                ((key, fn()) for key, fn in checks.items())}

    def check_orphan_nodes(self) -> dict:
        """Count nodes with no relationships in either direction."""
        rows = self.db.query(
            "MATCH (n) WHERE NOT (n)--() RETURN count(n) AS orphans"
        )
        return {"orphan_count": rows[0]["orphans"]}

    def check_duplicates(self) -> dict:
        """Count pairs of same-label nodes that share the same name."""
        rows = self.db.query("""
            MATCH (a), (b)
            WHERE id(a) < id(b) AND a.name = b.name AND labels(a) = labels(b)
            RETURN count(*) AS exact_duplicates
        """)
        return {"exact_duplicates": rows[0]["exact_duplicates"]}

    def check_schema_compliance(self) -> dict:
        """Report graph labels that the schema does not declare."""
        known = self.schema.entity_types
        labels = self.db.query("CALL db.labels()")
        # NOTE(review): assumes db.query yields label strings directly —
        # confirm against the driver's actual row format.
        unknown = [f"Unknown label: {lbl}" for lbl in labels if lbl not in known]
        return {"violations": len(unknown), "details": unknown[:10]}

    def check_required_properties(self) -> dict:
        """List (type, property) pairs with nodes missing a required value."""
        gaps = []
        for label, et in self.schema.entity_types.items():
            for prop in et.required_properties:
                rows = self.db.query(f"""
                    MATCH (n:{label})
                    WHERE n.{prop} IS NULL
                    RETURN count(n) AS missing_count
                """)
                hit = rows[0]["missing_count"]
                if hit > 0:
                    gaps.append({"type": label, "property": prop,
                                 "count": hit})
        return {"missing_properties": gaps}

结论

知识图谱自动构建已从传统的NLP管道升级为LLM驱动的智能抽取。LLM在实体识别和关系抽取上展现出强大的零样本和少样本能力,显著降低了冷启动成本。然而,自动构建的质量控制仍是核心挑战——Schema设计决定了图谱的结构质量,实体消解决定了数据质量,而自动化质量检查则保障了持续运营中的数据治理。建议采用"LLM抽取+规则校验+人工抽检"的三层质控体系,在效率和质量之间找到平衡。


Maurice | maurice_wen@proton.me