"""HTML transformation and injection utilities for SPA output.
Regex patterns are compiled once at import time for performance.
"""
import re
from functools import lru_cache, partial
from typing import Any
from litestar.serialization import encode_json
# Structural tag boundaries. All are case-insensitive; closing tags tolerate
# whitespace before the final ``>``.
_HEAD_END_PATTERN = re.compile(r"</head\s*>", flags=re.I)
_BODY_END_PATTERN = re.compile(r"</body\s*>", flags=re.I)
_BODY_START_PATTERN = re.compile(r"<body[^>]*>", flags=re.I)
_HTML_END_PATTERN = re.compile(r"</html\s*>", flags=re.I)

# URL-bearing tags. Group 1 is everything up to and including the opening
# quote, group 2 is the URL, group 3 is the closing quote through ``>``.
_SCRIPT_SRC_PATTERN = re.compile(
    r'(<script[^>]*\s+src\s*=\s*["\'])([^"\']+)(["\'][^>]*>)',
    flags=re.I,
)
_LINK_HREF_PATTERN = re.compile(
    r'(<link[^>]*\s+href\s*=\s*["\'])([^"\']+)(["\'][^>]*>)',
    flags=re.I,
)
@lru_cache(maxsize=128)
def _get_id_selector_pattern(element_id: str) -> re.Pattern[str]:
    """Return a compiled regex matching the opening tag of an element by ID.

    The ``id`` attribute must be a standalone attribute (not the tail of another
    attribute name such as ``data-id``) and its value must equal ``element_id``
    exactly: either quoted with matching quotes, or unquoted and followed by
    whitespace, ``/`` or ``>``. Matching is case-insensitive.

    Args:
        element_id: The ID to look for (without the leading ``#``).

    Returns:
        Pattern whose group 1 is the opening tag up to (but excluding) the
        closing ``>`` and whose group 2 is that ``>``.
    """
    escaped_id = re.escape(element_id)
    # Require the full value: matching quotes, or an unquoted value that ends
    # at a delimiter. This stops "#app" from matching id="application".
    id_value = rf"""(?:"{escaped_id}"|'{escaped_id}'|{escaped_id}(?=[\s/>]))"""
    return re.compile(
        # The lookbehind rejects attribute names that merely end in "id".
        rf"(<[a-zA-Z][a-zA-Z0-9]*\s+[^>]*?(?<![\w:.-])id\s*=\s*{id_value}[^>]*)(>)",
        re.IGNORECASE,
    )
@lru_cache(maxsize=128)
def _get_element_selector_pattern(element_name: str) -> re.Pattern[str]:
    """Return a compiled regex matching the opening tag of a named element.

    The tag name must be followed by whitespace, ``/`` or ``>`` so that a
    selector such as ``a`` does not match ``<article>`` and ``b`` does not
    match ``<body>``. Matching is case-insensitive.

    Args:
        element_name: The tag name to match.

    Returns:
        Pattern whose group 1 is the opening tag up to (but excluding) the
        closing ``>`` and whose group 2 is that ``>``.
    """
    return re.compile(
        rf"(<{re.escape(element_name)}(?=[\s/>])[^>]*)(>)",
        re.IGNORECASE,
    )
@lru_cache(maxsize=128)
def _get_attr_pattern(attr: str) -> re.Pattern[str]:
    """Return a compiled regex matching a quoted attribute assignment.

    The value is matched up to the *same* quote character that opened it, so a
    single-quoted value containing double quotes (for example inline JSON) is
    matched in full. The attribute name must not be the tail of a longer
    attribute name. Matching is case-insensitive.

    Args:
        attr: The attribute name (e.g. ``"data-page"``).

    Returns:
        Pattern matching ``attr="..."`` or ``attr='...'`` including the value.
    """
    return re.compile(
        rf"""(?<![\w:.-]){re.escape(attr)}\s*=\s*(?:"[^"]*"|'[^']*')""",
        re.IGNORECASE,
    )
@lru_cache(maxsize=128)
def _get_id_element_with_content_pattern(element_id: str) -> re.Pattern[str]:
    """Return a compiled regex matching an element by ID and capturing its inner HTML.

    The pattern matches ``<tag ... id="element_id" ...> ... </tag>``. The ``id``
    attribute must be a standalone attribute (``data-id`` does not count), must be
    quoted with matching quotes, and may have whitespace around ``=``.

    The inner content is matched non-greedily up to the first closing tag with
    the same name, so nested elements of the same tag name are not supported.

    Args:
        element_id: The ID to look for (without the leading ``#``).

    Returns:
        Pattern with group 1 = opening tag, named groups ``tag``, ``attrs`` and
        ``inner`` (groups 2-4), and group 5 = closing tag.
    """
    escaped_id = re.escape(element_id)
    # Non-capturing alternation keeps the group numbering (1..5) stable.
    id_attr = rf"""(?<![\w:.-])id\s*=\s*(?:"{escaped_id}"|'{escaped_id}')"""
    return re.compile(
        rf"(<(?P<tag>[a-zA-Z][a-zA-Z0-9-]*)(?P<attrs>\s[^>]*?{id_attr}[^>]*)>)(?P<inner>.*?)(</(?P=tag)\s*>)",
        flags=re.IGNORECASE | re.DOTALL,
    )
def _escape_script(script: str) -> str:
    r"""Escape script content to prevent breaking out of script tags.

    Replaces every ``</script`` (in any letter case) with ``<\/script`` so the
    HTML parser cannot see a premature end tag. The match is case-insensitive
    and does not require the closing ``>``, because HTML parsers also treat
    ``</SCRIPT>`` and ``</script >`` as end tags.

    Args:
        script: The script content to escape.

    Returns:
        The escaped script content safe for embedding in ``<script>`` tags.
    """
    # Group 1 preserves the original letter case of "script".
    return re.sub(r"</(script)", r"<\\/\1", script, flags=re.IGNORECASE)
def _escape_attr(value: str) -> str:
    """Escape a string for safe embedding in a quoted HTML attribute value.

    Replaces ``&``, ``"``, ``'``, ``<`` and ``>`` with the corresponding HTML
    character references. ``&`` is replaced first so the ampersands introduced
    by the other replacements are not escaped a second time.

    Args:
        value: The attribute value to escape.

    Returns:
        The escaped value safe for use in HTML attribute values.
    """
    return (
        value
        .replace("&", "&amp;")
        .replace('"', "&quot;")
        .replace("'", "&#x27;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
    )
def _set_attribute_replacer(
    match: re.Match[str], *, attr_pattern: re.Pattern[str], attr_name: str, escaped_val: str
) -> str:
    """Replace or add an attribute on an opening tag match.

    Args:
        match: Regex match whose group 1 is the opening tag without its final
            ``>`` and whose group 2 is that closing delimiter.
        attr_pattern: Compiled pattern that matches the attribute assignment.
        attr_name: Attribute name to set.
        escaped_val: Already HTML-escaped attribute value.

    Returns:
        Updated tag string with ``attr_name`` set to ``escaped_val``.
    """
    opening, closing = match.group(1), match.group(2)
    new_attr = f'{attr_name}="{escaped_val}"'
    # Use a callable replacement: a plain string would be treated as a regex
    # template, so backslashes in the value (common in JSON, e.g. ``\n`` or
    # ``\u00e9``) would be rewritten or raise ``re.error``.
    updated, replaced = attr_pattern.subn(lambda _m: new_attr, opening)
    if not replaced:
        updated = f"{opening.rstrip()} {new_attr}"
    return updated + closing
def _set_inner_html_replacer(match: re.Match[str], *, content: str) -> str:
    """Build the replacement for an ID-targeted element with new inner HTML.

    Args:
        match: Regex match from ``_get_id_element_with_content_pattern``;
            group 1 is the opening tag and group 5 is the closing tag.
        content: Raw HTML to place between the opening and closing tags.

    Returns:
        The opening tag, ``content``, and the closing tag concatenated.
    """
    opening_tag, closing_tag = match.group(1, 5)
    return f"{opening_tag}{content}{closing_tag}"
def inject_head_script(html: str, script: str, *, escape: bool = True, nonce: str | None = None) -> str:
    """Insert an inline ``<script>`` element just before ``</head>``.

    Args:
        html: The HTML document.
        script: JavaScript source to embed (without surrounding ``<script>`` tags).
        escape: When true (the default), pass ``script`` through ``_escape_script``.
        nonce: Optional CSP nonce; when given it is attribute-escaped and added
            to the injected ``<script>`` tag.

    Returns:
        The HTML with the script inserted before ``</head>``. When there is no
        ``</head>``, the script goes before ``</html>``; when neither exists it
        is appended after a newline. An empty ``script`` returns ``html`` unchanged.

    Example:
        html = inject_head_script(html, "window.__DATA__ = {foo: 1};")
    """
    if not script:
        return html

    body = _escape_script(script) if escape else script
    nonce_attr = f' nonce="{_escape_attr(nonce)}"' if nonce else ""
    tag = f"<script{nonce_attr}>{body}</script>\n"

    # Try the preferred insertion points in order; first hit wins.
    for boundary in (_HEAD_END_PATTERN, _HTML_END_PATTERN):
        found = boundary.search(html)
        if found is not None:
            cut = found.start()
            return "".join((html[:cut], tag, html[cut:]))

    return f"{html}\n{tag}"
def inject_head_html(html: str, content: str) -> str:
    """Insert raw HTML just before ``</head>``.

    Used for Inertia SSR, where the SSR server returns HTML fragments
    (typically ``<title>``, ``<meta>``, etc.) that belong in the document head.

    Args:
        html: The HTML document.
        content: Raw HTML to insert. It is inserted verbatim, without escaping.

    Returns:
        The HTML with ``content`` plus a newline inserted before ``</head>``,
        or before ``</html>`` when there is no ``</head>``. When neither tag is
        present, ``content`` is appended after a newline. An empty ``content``
        returns ``html`` unchanged.
    """
    if not content:
        return html

    # Try the preferred insertion points in order; first hit wins.
    for boundary in (_HEAD_END_PATTERN, _HTML_END_PATTERN):
        found = boundary.search(html)
        if found is not None:
            cut = found.start()
            return "".join((html[:cut], content, "\n", html[cut:]))

    return f"{html}\n{content}"
def inject_body_content(html: str, content: str, *, position: str = "end") -> str:
    """Insert content at the start or end of the ``<body>`` element.

    Args:
        html: The HTML document.
        content: The content to insert (may include HTML tags).
        position: ``"end"`` inserts before ``</body>``; ``"start"`` inserts
            right after the opening ``<body ...>`` tag.

    Returns:
        The HTML with the content inserted. ``html`` is returned unchanged when
        ``content`` is empty, when the relevant body tag is not found, or when
        ``position`` is neither ``"start"`` nor ``"end"``.

    Example:
        html = inject_body_content(html, '<div id="portal"></div>', position="end")
    """
    if not content:
        return html

    if position == "end":
        found = _BODY_END_PATTERN.search(html)
        if found is None:
            return html
        cut = found.start()
        return f"{html[:cut]}{content}\n{html[cut:]}"

    if position == "start":
        found = _BODY_START_PATTERN.search(html)
        if found is None:
            return html
        cut = found.end()
        return f"{html[:cut]}\n{content}{html[cut:]}"

    return html
def set_data_attribute(html: str, selector: str, attr: str, value: str) -> str:
    """Set an attribute on the first element matching a simple selector.

    Two selector forms are understood: ``#id`` and a bare element name
    (lower-cased before matching). Anything more complex needs a real HTML parser.

    Args:
        html: The HTML document.
        selector: ``#id`` or an element name.
        attr: The attribute name (e.g., "data-page").
        value: The attribute value; it is HTML-escaped before insertion.

    Returns:
        The HTML with the attribute set on the first matching element. An
        existing attribute of the same name is replaced. ``html`` is returned
        unchanged when ``selector`` or ``attr`` is empty or nothing matches.

    Example:
        html = set_data_attribute(html, "#app", "data-page", '{"component":"Home"}')
    """
    if not (selector and attr):
        return html

    apply_attr = partial(
        _set_attribute_replacer,
        attr_pattern=_get_attr_pattern(attr),
        attr_name=attr,
        escaped_val=_escape_attr(value),
    )

    if selector.startswith("#"):
        target = _get_id_selector_pattern(selector[1:])
    else:
        target = _get_element_selector_pattern(selector.lower())

    # count=1: only the first matching element is touched.
    return target.sub(apply_attr, html, count=1)
def set_element_inner_html(html: str, selector: str, content: str) -> str:
    """Replace the inner HTML of the first element matching an ``#id`` selector.

    Only simple ID selectors (``#app``) are supported; this deliberately avoids
    the cost and edge cases of a full HTML parser.

    Args:
        html: The HTML document.
        selector: The selector; must start with ``#``.
        content: Raw HTML that becomes the element's inner HTML.

    Returns:
        The updated HTML. ``html`` is returned unchanged when the selector is
        empty, is not an ``#id`` selector, or matches no element.
    """
    is_id_selector = bool(selector) and selector.startswith("#")
    if not is_id_selector:
        return html

    target = _get_id_element_with_content_pattern(selector[1:])
    swap_inner = partial(_set_inner_html_replacer, content=content)
    return target.sub(swap_inner, html, count=1)
def inject_page_script(html: str, json_data: str, *, nonce: str | None = None, script_id: str = "app_page") -> str:
    r"""Inject page data as a JSON script element before ``</body>``.

    This is the Inertia.js script-element mode for initial page data: the page
    object is embedded in a ``<script type="application/json">`` element instead
    of a ``data-page`` attribute, which avoids HTML entity escaping of the JSON.

    The script element is inserted before ``</body>`` with:

    - ``type="application/json"`` (non-executable, just data)
    - ``id`` set to ``script_id`` (attribute-escaped)
    - Optional ``nonce`` for CSP compliance (attribute-escaped)

    Args:
        html: The HTML document.
        json_data: Pre-serialized JSON string (page props).
        nonce: Optional CSP nonce to add to the script element.
        script_id: The script element ID (default "app_page").

    Returns:
        The HTML with the script element injected before ``</body>``.
        Falls back to appending at the end if no ``</body>`` tag is found.
        Returns ``html`` unchanged when ``json_data`` is empty.

    Note:
        The JSON content is escaped so it cannot terminate or confuse the
        enclosing script element: ``</`` becomes ``<\/`` and ``<!--`` becomes
        ``\u003c!--``. Both sequences can only occur inside JSON strings, where
        these escapes are valid JSON and decode to the original text.

    Example:
        html = inject_page_script(html, '{"component":"Home","props":{}}')
    """
    if not json_data:
        return html

    # "</" could close the element early; "<!--" can switch the HTML parser into
    # the script-data escaped state, after which a later "</script>" may not
    # close the element as intended.
    escaped_json = json_data.replace("</", r"<\/").replace("<!--", r"\u003c!--")

    # Escape the id the same way the nonce is escaped so neither can break out
    # of its attribute.
    id_attr = _escape_attr(script_id)
    nonce_attr = f' nonce="{_escape_attr(nonce)}"' if nonce else ""
    script_tag = f'<script type="application/json" id="{id_attr}"{nonce_attr}>{escaped_json}</script>\n'

    body_end_match = _BODY_END_PATTERN.search(html)
    if body_end_match is None:
        return html + "\n" + script_tag
    pos = body_end_match.start()
    return html[:pos] + script_tag + html[pos:]
def inject_json_script(html: str, var_name: str, data: dict[str, Any], *, nonce: str | None = None) -> str:
    r"""Inject a script that sets a global JavaScript variable to JSON data.

    The data is serialized with ``encode_json`` and assigned to
    ``window.<var_name>`` in an inline script placed via ``inject_head_script``.

    Args:
        html: The HTML document.
        var_name: The global variable name (e.g., "__LITESTAR_ROUTES__"). It is
            interpolated into the script as-is and must be a trusted identifier.
        data: The data to serialize as JSON.
        nonce: Optional CSP nonce to add to the injected ``<script>`` tag.

    Returns:
        The HTML with the injected script in the ``<head>`` section. Falls back
        to injecting before ``</html>`` or at the end if no ``</head>`` is found.

    Note:
        In the serialized JSON, ``</`` is rewritten to ``<\/`` and ``<!--`` to
        ``\u003c!--``. These sequences can only occur inside JSON strings, where
        the escaped forms are valid in both JSON and JavaScript and evaluate to
        the original text, so string values cannot close the ``<script>`` element.

    Example:
        html = inject_json_script(html, "__ROUTES__", {"home": "/", "about": "/about"})
    """
    json_data = encode_json(data).decode("utf-8")
    # Neutralise sequences the HTML parser would act on inside a script element.
    safe_json = json_data.replace("</", r"<\/").replace("<!--", r"\u003c!--")
    script = f"window.{var_name} = {safe_json};"
    # escape=False: the JSON payload is already escaped above.
    return inject_head_script(html, script, escape=False, nonce=nonce)
def inject_vite_dev_scripts(
    html: str,
    vite_url: str,
    *,
    asset_url: str = "/static/",
    is_react: bool = False,
    csp_nonce: str | None = None,
    resource_dir: str | None = None,
) -> str:
    """Inject Vite dev server scripts for HMR support.

    Adds the Vite client module script (and, for React, the Fast Refresh
    preamble before it) to the ``<head>`` section. Script URLs are relative and
    prefixed with ``asset_url`` rather than pointing at ``vite_url``.

    When ``resource_dir`` is provided, existing ``<script src=...>`` URLs that
    start with ``/{resource_dir}/`` are rewritten to carry the asset URL prefix
    (e.g., ``/resources/main.tsx`` becomes ``/static/resources/main.tsx``).

    Args:
        html: The HTML document.
        vite_url: The Vite dev server URL (kept for backward compatibility, unused).
        asset_url: The asset URL prefix (e.g., "/static/"). Scripts are served
            at ``{asset_url}@vite/client`` etc.
        is_react: Whether to inject the React Fast Refresh preamble.
        csp_nonce: Optional CSP nonce to add to injected ``<script>`` tags.
        resource_dir: Optional resource directory name (e.g., "resources", "src").
            When provided, script sources starting with ``/{resource_dir}/`` are
            prefixed with ``asset_url``.

    Returns:
        The HTML with Vite dev scripts injected. Scripts are inserted before
        ``</head>`` when present, otherwise before ``</html>`` or at the end.

    Example:
        html = inject_vite_dev_scripts(html, "", asset_url="/static/", is_react=True)
    """
    base = asset_url.rstrip("/")
    nonce_attr = f' nonce="{_escape_attr(csp_nonce)}"' if csp_nonce else ""

    if resource_dir:
        resource_prefix = f"/{resource_dir.strip('/')}/"
        # Compare against a full path segment: a bare ``startswith(base)`` would
        # treat "/resources/x" as already prefixed when base is e.g. "/res".
        already_prefixed = f"{base}/"

        def transform_entry_script(match: re.Match[str]) -> str:
            prefix, src, suffix = match.group(1, 2, 3)
            if src.startswith(resource_prefix) and not src.startswith(already_prefixed):
                return prefix + base + src + suffix
            return match.group(0)

        html = _SCRIPT_SRC_PATTERN.sub(transform_entry_script, html)

    scripts: list[str] = []
    if is_react:
        # The preamble must come before the Vite client script.
        react_preamble = "\n".join(
            (
                f"import RefreshRuntime from '{base}/@react-refresh'",
                "RefreshRuntime.injectIntoGlobalHook(window)",
                "window.$RefreshReg$ = () => {}",
                "window.$RefreshSig$ = () => (type) => type",
                "window.__vite_plugin_react_preamble_installed__ = true",
            )
        )
        scripts.append(f'<script type="module"{nonce_attr}>{react_preamble}</script>')
    scripts.append(f'<script type="module" src="{base}/@vite/client"{nonce_attr}></script>')
    script_content = "\n".join(scripts) + "\n"

    # Try the preferred insertion points in order; first hit wins.
    for boundary in (_HEAD_END_PATTERN, _HTML_END_PATTERN):
        found = boundary.search(html)
        if found is not None:
            pos = found.start()
            return html[:pos] + script_content + html[pos:]
    return html + "\n" + script_content