Skip to content

Stage 2: Text Preprocessing

obsidian_export.pipeline.stage2_preprocess

Stage 2: Text-level pre-processing before Pandoc parses the document.

escape_dollar_signs

escape_dollar_signs(text: str) -> str

Escape currency dollar signs to \$.

Skips fenced code blocks, display math ($$...$$), and patterns that look like intentional inline math (no digit immediately after $). Belt-and-suspenders alongside --from=gfm-tex_math_dollars.

Source code in obsidian_export/pipeline/stage2_preprocess.py
def escape_dollar_signs(text: str) -> str:
    """Rewrite currency-style dollar signs as \\$.

    The text is cut into ``(is_code, segment)`` pairs by
    ``_split_preserve_code``. Code segments pass through untouched; in every
    other segment each ``_CURRENCY_RE`` match is replaced with an escaped
    dollar sign. The pattern is meant to leave display math ($$...$$) and
    inline math (no digit immediately after $) alone, which makes this a
    second line of defence next to --from=gfm-tex_math_dollars.
    """
    return "".join(
        # Only non-code chunks get the currency escape applied.
        chunk if in_code else _CURRENCY_RE.sub(r"\\$", chunk)
        for in_code, chunk in _split_preserve_code(text)
    )

convert_callouts

convert_callouts(text: str) -> str

Transform Obsidian callout blocks to Pandoc fenced divs.

Input:

    > [!NOTE] Title
    > content line

Output:

    :::{.note title="Title"}
    content line
    :::

Source code in obsidian_export/pipeline/stage2_preprocess.py
def convert_callouts(text: str) -> str:
    """Rewrite Obsidian callout blocks as Pandoc fenced divs.

    Example input::

        > [!NOTE] Title
        > content line

    Resulting output::

        :::{.note title="Title"}
        content line
        :::

    Each ``_CALLOUT_HEADER_RE`` match in *text* is replaced by whatever
    ``_callout_replacement`` returns for that match.
    """
    converted = _CALLOUT_HEADER_RE.sub(_callout_replacement, text)
    return converted

process_urls

process_urls(text: str, strategy: str, threshold: int) -> str

Handle bare URLs in text according to strategy.

Strategies

- keep — leave as-is
- footnote_long — move URLs longer than threshold to footnotes
- footnote_all — move all URLs to footnotes
- strip — remove bare URLs entirely

Source code in obsidian_export/pipeline/stage2_preprocess.py
def process_urls(text: str, strategy: str, threshold: int) -> str:
    """Handle bare URLs in text according to strategy.

    Strategies:
      keep          — leave as-is
      footnote_long — move URLs longer than threshold to footnotes
      footnote_all  — move all URLs to footnotes
      strip         — remove bare URLs entirely

    For the footnote strategies each affected URL is replaced in place by
    ``[link](URL)[^url-N]`` and the matching ``[^url-N]: <URL>`` definitions
    are appended after the text, one per distinct label. Segments flagged as
    code by ``_split_preserve_code`` are never rewritten by the footnote
    strategies.

    Args:
        text: Markdown source to process.
        strategy: One of the strategy names listed above.
        threshold: URL length above which ``footnote_long`` converts a URL.

    Returns:
        The processed text.

    Raises:
        ConfigValueError: If ``strategy`` is not a known strategy name.
    """
    if strategy == "keep":
        return text

    if strategy == "strip":
        # NOTE(review): unlike the footnote strategies below, this runs over
        # the whole text, code segments included — confirm that is intended.
        return _BARE_URL_RE.sub("", text)

    if strategy not in ("footnote_all", "footnote_long"):
        raise ConfigValueError(f"Unknown url_strategy: {strategy!r}")

    import zlib

    # label -> URL, in first-seen order; doubles as de-duplication so a URL
    # that occurs several times gets exactly one footnote definition.
    footnotes: dict[str, str] = {}

    def replace_url(m: re.Match) -> str:
        """Replace a bare URL match according to the active footnote strategy.

        Receives a match whose group(1) is the URL. Returns either the original
        match unchanged or a link followed by a footnote reference; the
        footnote definition itself is recorded in ``footnotes``.
        """
        url = m.group(1)
        if strategy == "footnote_long" and len(url) <= threshold:
            return m.group(0)
        # The built-in hash() is salted per process for str, which made the
        # labels differ between runs; CRC32 gives a stable, reproducible label.
        label = f"url-{zlib.crc32(url.encode('utf-8')) % 100000}"
        footnotes.setdefault(label, url)
        return f"[link]({url})[^{label}]"

    body = "".join(
        segment if is_code else _BARE_URL_RE.sub(replace_url, segment)
        for is_code, segment in _split_preserve_code(text)
    )
    if not footnotes:
        return body

    # Definitions go after the text rather than at the URL's position: an
    # inline definition would end the current paragraph and absorb the rest of
    # the line into the footnote body.
    definitions = "\n\n".join(
        f"[^{label}]: <{url}>" for label, url in footnotes.items()
    )
    return f"{body}\n\n{definitions}"

normalize_line_endings

normalize_line_endings(text: str) -> str

Normalize line endings to LF and strip trailing whitespace per line.

Source code in obsidian_export/pipeline/stage2_preprocess.py
def normalize_line_endings(text: str) -> str:
    """Convert CRLF and lone CR to LF, then drop trailing whitespace on each line."""
    # CRLF must be collapsed before lone CR, otherwise each CRLF would turn
    # into two line breaks.
    unified = text.replace("\r\n", "\n")
    unified = unified.replace("\r", "\n")
    trimmed = map(str.rstrip, unified.split("\n"))
    return "\n".join(trimmed)

strip_variation_selectors

strip_variation_selectors(text: str) -> str

Remove Unicode variation selectors (U+FE0F) that TeX cannot render.

Source code in obsidian_export/pipeline/stage2_preprocess.py
def strip_variation_selectors(text: str) -> str:
    """Delete Unicode variation selectors (U+FE0F), which TeX cannot render.

    Every ``_VARIATION_SELECTOR_RE`` match in *text* is replaced with the
    empty string.
    """
    cleaned = _VARIATION_SELECTOR_RE.sub("", text)
    return cleaned

preprocess

preprocess(text: str, config: ObsidianConfig) -> str

Apply all Stage 2 transforms in order.

Source code in obsidian_export/pipeline/stage2_preprocess.py
def preprocess(text: str, config: ObsidianConfig) -> str:
    """Run every Stage 2 transform over *text*, in a fixed order.

    Order: line-ending normalisation, variation-selector removal, dollar-sign
    escaping, callout conversion, and finally URL handling driven by
    ``config.url_strategy`` and ``config.url_length_threshold``.
    """
    # The transforms that need nothing but the text run first, in sequence.
    text_only_steps = (
        normalize_line_endings,
        strip_variation_selectors,
        escape_dollar_signs,
        convert_callouts,
    )
    for step in text_only_steps:
        text = step(text)
    return process_urls(text, config.url_strategy, config.url_length_threshold)