Skip to content

Stage 1: Vault Operations

obsidian_export.pipeline.stage1_vault

Stage 1: Vault operations — frontmatter, embed resolution, Obsidian syntax stripping.

EmbedContext dataclass

Immutable context for recursive embed resolution.

Source code in obsidian_export/pipeline/stage1_vault.py
@dataclass(frozen=True)
class EmbedContext:
    """Immutable context for recursive embed resolution.

    ``frozen=True`` makes instances read-only after construction, so a changed
    context must be a new instance rather than a mutation of a shared one.
    ``resolve_embeds`` builds the initial instance (empty ``visited``,
    ``depth`` 0) and passes it to ``_resolve_embeds_recursive``.
    """

    # Root directory of the vault, as supplied by the caller of resolve_embeds.
    vault_root: Path
    # File whose content is currently being processed.
    current_file: Path
    # Seeded empty by resolve_embeds; presumably the files already on the
    # current embed chain, used for cycle detection — confirm in
    # _resolve_embeds_recursive.
    visited: frozenset[Path]
    # Seeded with 0 by resolve_embeds; presumably the current nesting level.
    depth: int
    # Cap on embed nesting, passed through unchanged from resolve_embeds.
    max_embed_depth: int

parse_frontmatter

parse_frontmatter(text: str) -> tuple[dict[str, Any], str]

Extract YAML frontmatter. Returns (metadata_dict, body_text).

Source code in obsidian_export/pipeline/stage1_vault.py
def parse_frontmatter(text: str) -> tuple[dict[str, Any], str]:
    """Extract YAML frontmatter. Returns (metadata_dict, body_text).

    When ``text`` does not match ``_FRONTMATTER_RE`` the result is
    ``({}, text)``. Otherwise the first capture group is parsed with
    ``yaml.safe_load``; on ``yaml.YAMLError`` a warning is logged and the parse
    is retried once on the output of ``_quote_yaml_values``. A failure of the
    retry propagates to the caller.

    Frontmatter that parses to anything other than a mapping (empty document,
    scalar, list) yields an empty metadata dict, so the declared return type
    always holds and callers can safely use dict methods on the result.
    """
    m = _FRONTMATTER_RE.match(text)
    if not m:
        return {}, text
    raw = m.group(1)
    try:
        fm = yaml.safe_load(raw)
    except yaml.YAMLError:
        _log.warning("YAML frontmatter parse failed, retrying with auto-quoted colons:\n%s", raw)
        fm = yaml.safe_load(_quote_yaml_values(raw))
    if not isinstance(fm, dict):
        # safe_load returns None for an empty document and a str/list/number
        # for non-mapping YAML; none of those are usable as metadata.
        if fm is not None:
            _log.warning(
                "YAML frontmatter is not a mapping (got %s); ignoring it",
                type(fm).__name__,
            )
        fm = {}
    body = text[m.end() :]
    return fm, body

clean_frontmatter

clean_frontmatter(fm: dict[str, Any]) -> dict[str, Any]

Strip Obsidian-only keys; convert tags list to keywords string for Pandoc.

Source code in obsidian_export/pipeline/stage1_vault.py
def clean_frontmatter(fm: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of ``fm`` without Obsidian-only keys, adding Pandoc keywords.

    Keys listed in ``_OBSIDIAN_KEYS`` are dropped. If ``fm`` carries ``tags``,
    a ``keywords`` entry is set: a list is joined with ``", "`` (each item
    passed through ``str``), a string is used verbatim. Any other ``tags``
    type leaves ``keywords`` untouched. The input dict is not modified.
    """
    result: dict[str, Any] = {}
    for key, value in fm.items():
        if key in _OBSIDIAN_KEYS:
            continue
        result[key] = value

    # A missing "tags" key yields None, which matches neither branch below.
    tag_value = fm.get("tags")
    if isinstance(tag_value, list):
        result["keywords"] = ", ".join(map(str, tag_value))
    elif isinstance(tag_value, str):
        result["keywords"] = tag_value
    return result

resolve_embeds

resolve_embeds(content: str, vault_root: Path, current_file: Path, max_embed_depth: int) -> str

Recursively resolve ![[embed]] blocks with cycle detection.

  • Text embeds: resolved inline (with depth cap)
  • Section embeds (![[note#Heading]]): extract section
  • Image embeds (extensions in IMAGE_EXTENSIONS): converted to ![]() refs
  • Missing embeds: raise EmbedNotFoundError
  • Circular embeds: raise CircularEmbedError
Source code in obsidian_export/pipeline/stage1_vault.py
def resolve_embeds(
    content: str,
    vault_root: Path,
    current_file: Path,
    max_embed_depth: int,
) -> str:
    """Expand ``![[embed]]`` blocks in ``content``, recursively and cycle-safe.

    Public entry point: it seeds a fresh ``EmbedContext`` (nothing visited,
    depth 0) and delegates the actual work to ``_resolve_embeds_recursive``.

    Documented handling of the different embed kinds:

    - Text embeds: resolved inline (with depth cap)
    - Section embeds (![[note#Heading]]): extract section
    - Image embeds (extensions in IMAGE_EXTENSIONS): converted to ![]() refs
    - Missing embeds: raise EmbedNotFoundError
    - Circular embeds: raise CircularEmbedError
    """
    return _resolve_embeds_recursive(
        content,
        EmbedContext(
            vault_root=vault_root,
            current_file=current_file,
            visited=frozenset(),
            depth=0,
            max_embed_depth=max_embed_depth,
        ),
    )

strip_leading_title

strip_leading_title(body: str, title: str) -> str

Remove the first h1 heading if it matches the document title.

Source code in obsidian_export/pipeline/stage1_vault.py
def strip_leading_title(body: str, title: str) -> str:
    """Remove the first h1 heading if it matches the document title.

    The heading must be the first non-whitespace content of ``body``. A
    trailing ``{...}`` attribute block on the heading line is ignored when
    comparing, and both sides are compared after ``strip()``. On a match the
    heading line and any blank lines directly after it are removed; otherwise
    ``body`` is returned unchanged.
    """
    # "[ \t]+" (not "\s+") keeps the heading text on the same line as the "#",
    # so an empty "#" line followed by a paragraph is not mistaken for an h1.
    # "(?:\n|\Z)" also accepts a heading that ends the string without a
    # trailing newline, e.g. a body that consists of the title only.
    m = re.match(r"^\s*#[ \t]+(.+?)(?:\s*\{[^}]*\})?\s*(?:\n|\Z)", body)
    if m and m.group(1).strip() == title.strip():
        return body[m.end() :]
    return body

strip_obsidian_syntax

strip_obsidian_syntax(text: str) -> str

Remove/simplify Obsidian-specific syntax for export.

  • ![[embed]] → removed (use resolve_embeds first for inline resolution)
  • [[Entity|Display]] → Display
  • [[Entity]] → Entity
  • ## Relations section → removed with all content below

Callout syntax (> [!type]) is preserved for downstream processing by stage2.

Source code in obsidian_export/pipeline/stage1_vault.py
def strip_obsidian_syntax(text: str) -> str:
    """Strip or simplify Obsidian-only markup so the text can be exported.

    Substitutions are applied in this order:

    - ![[embed]] → removed (use resolve_embeds first for inline resolution)
    - [[Entity|Display]] → Display
    - [[Entity]] → Entity
    - ## Relations section → removed with all content below

    Callout syntax (> [!type]) is preserved for downstream processing by stage2.
    """
    # Order matters: embeds go first so the wikilink patterns never see them,
    # and piped links are handled before bare ones.
    substitutions = (
        (_EMBED_RE, ""),  # leftover embeds that were not resolved
        (_WIKILINK_PIPE_RE, lambda match: match.group(2)),  # keep display text
        (_WIKILINK_BARE_RE, lambda match: match.group(1)),  # keep entity name
        (_RELATIONS_RE, ""),  # drop the Relations section
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text