"""HTML transformation and injection utilities for SPA output.
Regex patterns are compiled once at import time for performance.
"""
import re
from functools import lru_cache, partial
from typing import Any
from litestar.serialization import encode_json
_HEAD_END_PATTERN = re.compile(r"</head\s*>", re.IGNORECASE)
_BODY_END_PATTERN = re.compile(r"</body\s*>", re.IGNORECASE)
_BODY_START_PATTERN = re.compile(r"<body[^>]*>", re.IGNORECASE)
_HTML_END_PATTERN = re.compile(r"</html\s*>", re.IGNORECASE)
_SCRIPT_SRC_PATTERN = re.compile(r'(<script[^>]*\s+src\s*=\s*["\'])([^"\']+)(["\'][^>]*>)', re.IGNORECASE)
_LINK_HREF_PATTERN = re.compile(r'(<link[^>]*\s+href\s*=\s*["\'])([^"\']+)(["\'][^>]*>)', re.IGNORECASE)
@lru_cache(maxsize=128)
def _get_id_selector_pattern(element_id: str) -> re.Pattern[str]:
"""Return a compiled regex pattern for an ID selector.
Returns:
Pattern matching an element with the given ID.
"""
return re.compile(
rf'(<[a-zA-Z][a-zA-Z0-9]*\s+[^>]*id\s*=\s*["\']?{re.escape(element_id)}["\']?[^>]*)(>)', re.IGNORECASE
)
@lru_cache(maxsize=128)
def _get_element_selector_pattern(element_name: str) -> re.Pattern[str]:
"""Return a compiled regex pattern for an element selector.
Returns:
Pattern matching elements with the given tag name.
"""
return re.compile(rf"(<{re.escape(element_name)}[^>]*)(>)", re.IGNORECASE)
@lru_cache(maxsize=128)
def _get_attr_pattern(attr: str) -> re.Pattern[str]:
"""Return a compiled regex pattern for an attribute.
Returns:
Pattern matching the attribute with its value.
"""
return re.compile(rf'{re.escape(attr)}\s*=\s*["\'][^"\']*["\']', re.IGNORECASE)
@lru_cache(maxsize=128)
def _get_id_element_with_content_pattern(element_id: str) -> re.Pattern[str]:
"""Return a compiled regex pattern to match an element by ID and capture its inner HTML.
The pattern matches: <tag ... id="element_id" ...> ... </tag>
and captures the opening tag, the inner content, and the closing tag.
Returns:
Pattern matching an element with the given ID, capturing its inner HTML.
"""
return re.compile(
rf"(<(?P<tag>[a-zA-Z0-9]+)(?P<attrs>[^>]*\bid=[\"']{re.escape(element_id)}[\"'][^>]*)>)(?P<inner>.*?)(</(?P=tag)\s*>)",
flags=re.IGNORECASE | re.DOTALL,
)
def _escape_script(script: str) -> str:
r"""Escape script content to prevent breaking out of script tags.
Replaces ``</script>`` with ``<\/script>`` to prevent premature tag closure.
Args:
script: The script content to escape.
Returns:
The escaped script content safe for embedding in ``<script>`` tags.
"""
return script.replace("</script>", r"<\/script>")
def _escape_attr(value: str) -> str:
"""Escape attribute value for safe HTML embedding.
Escapes special HTML characters: ``&``, ``"``, ``'``, ``<``, ``>``.
Args:
value: The attribute value to escape.
Returns:
The escaped value safe for use in HTML attribute values.
"""
return (
value.replace("&", "&")
.replace('"', """)
.replace("'", "'")
.replace("<", "<")
.replace(">", ">")
)
def _set_attribute_replacer(
match: re.Match[str], *, attr_pattern: re.Pattern[str], attr_name: str, escaped_val: str
) -> str:
"""Replace or add an attribute on an opening tag match.
Args:
match: Regex match capturing the opening portion and closing delimiter.
attr_pattern: Compiled pattern that matches the attribute assignment.
attr_name: Attribute name to set.
escaped_val: Escaped attribute value.
Returns:
Updated tag string with ``attr_name`` set to ``escaped_val``.
"""
opening = match.group(1)
closing = match.group(2)
if attr_pattern.search(opening):
opening = attr_pattern.sub(f'{attr_name}="{escaped_val}"', opening)
else:
opening = opening.rstrip() + f' {attr_name}="{escaped_val}"'
return opening + closing
def _set_inner_html_replacer(match: re.Match[str], *, content: str) -> str:
"""Replace inner HTML for an ID-targeted element match.
Args:
match: Regex match from ``_get_id_element_with_content_pattern``.
content: Raw HTML to inject as the element's inner HTML.
Returns:
Updated HTML fragment with replaced inner content.
"""
return match.group(1) + content + match.group(5)
[docs]
def inject_head_script(html: str, script: str, *, escape: bool = True, nonce: str | None = None) -> str:
"""Inject a script tag before the closing </head> tag.
Args:
html: The HTML document.
script: The JavaScript code to inject (without <script> tags).
escape: Whether to escape the script content. Default True.
nonce: Optional CSP nonce to add to the injected ``<script>`` tag.
Returns:
The HTML with the injected script. If ``</head>`` is not found,
falls back to injecting before ``</html>``. If neither is found,
appends the script at the end. Returns the original HTML unchanged
if ``script`` is empty.
Example:
html = inject_head_script(html, "window.__DATA__ = {foo: 1};")
"""
if not script:
return html
if escape:
script = _escape_script(script)
nonce_attr = f' nonce="{_escape_attr(nonce)}"' if nonce else ""
script_tag = f"<script{nonce_attr}>{script}</script>\n"
head_end_match = _HEAD_END_PATTERN.search(html)
if head_end_match:
pos = head_end_match.start()
return html[:pos] + script_tag + html[pos:]
html_end_match = _HTML_END_PATTERN.search(html)
if html_end_match:
pos = html_end_match.start()
return html[:pos] + script_tag + html[pos:]
return html + "\n" + script_tag
[docs]
def inject_head_html(html: str, content: str) -> str:
"""Inject raw HTML into the ``<head>`` section.
This is used for Inertia SSR, where the SSR server returns an array of HTML strings
(typically ``<title>``, ``<meta>``, etc.) that must be placed in the final HTML response.
Args:
html: The HTML document.
content: Raw HTML to inject. This is inserted as-is.
Returns:
The HTML with the content injected before ``</head>`` when present.
Falls back to injecting before ``</html>`` or appending at the end.
"""
if not content:
return html
head_end_match = _HEAD_END_PATTERN.search(html)
if head_end_match:
pos = head_end_match.start()
return html[:pos] + content + "\n" + html[pos:]
html_end_match = _HTML_END_PATTERN.search(html)
if html_end_match:
pos = html_end_match.start()
return html[:pos] + content + "\n" + html[pos:]
return html + "\n" + content
[docs]
def inject_body_content(html: str, content: str, *, position: str = "end") -> str:
"""Inject content into the body element.
Args:
html: The HTML document.
content: The content to inject (can include HTML tags).
position: Where to inject - "start" (after <body>) or "end" (before </body>).
Returns:
The HTML with the injected content. Returns the original HTML unchanged
if ``content`` is empty or if no ``<body>`` tag is found.
Example:
html = inject_body_content(html, '<div id="portal"></div>', position="end")
"""
if not content:
return html
if position == "end":
body_end_match = _BODY_END_PATTERN.search(html)
if body_end_match:
pos = body_end_match.start()
return html[:pos] + content + "\n" + html[pos:]
elif position == "start":
body_start_match = _BODY_START_PATTERN.search(html)
if body_start_match:
pos = body_start_match.end()
return html[:pos] + "\n" + content + html[pos:]
return html
[docs]
def set_data_attribute(html: str, selector: str, attr: str, value: str) -> str:
"""Set a data attribute on an element matching the selector.
This function supports simple ID selectors (#id) and element selectors (div).
For complex selectors, consider using a proper HTML parser.
Args:
html: The HTML document.
selector: CSS-like selector (currently supports #id and element names).
attr: The attribute name (e.g., "data-page").
value: The attribute value (will be HTML-escaped automatically).
Returns:
The HTML with the attribute set. If the attribute already exists, it is
replaced. Returns the original HTML unchanged if ``selector`` or ``attr``
is empty, or if no matching element is found.
Note:
Only the first matching element is modified. The value is automatically
escaped to prevent XSS vulnerabilities.
Example:
html = set_data_attribute(html, "#app", "data-page", '{"component":"Home"}')
"""
if not selector or not attr:
return html
escaped_value = _escape_attr(value)
attr_pattern = _get_attr_pattern(attr)
replacer = partial(_set_attribute_replacer, attr_pattern=attr_pattern, attr_name=attr, escaped_val=escaped_value)
if selector.startswith("#"):
element_id = selector[1:]
pattern = _get_id_selector_pattern(element_id)
return pattern.sub(replacer, html, count=1)
element_name = selector.lower()
pattern = _get_element_selector_pattern(element_name)
return pattern.sub(replacer, html, count=1)
[docs]
def set_element_inner_html(html: str, selector: str, content: str) -> str:
"""Replace the inner HTML of an element matching the selector.
Supports only simple ID selectors (``#app``). This is intentionally limited to avoid
the overhead and edge cases of a full HTML parser.
Args:
html: The HTML document.
selector: The selector (only ``#id`` supported).
content: The raw HTML to set as the element's innerHTML.
Returns:
Updated HTML. If no matching element is found, returns the original HTML.
"""
if not selector or not selector.startswith("#"):
return html
element_id = selector[1:]
pattern = _get_id_element_with_content_pattern(element_id)
replacer = partial(_set_inner_html_replacer, content=content)
return pattern.sub(replacer, html, count=1)
[docs]
def inject_json_script(html: str, var_name: str, data: dict[str, Any], *, nonce: str | None = None) -> str:
"""Inject a script that sets a global JavaScript variable to JSON data.
This is a convenience function for injecting structured data into the page.
The data is serialized with compact JSON (no extra whitespace) and non-ASCII
characters are preserved.
Args:
html: The HTML document.
var_name: The global variable name (e.g., "__LITESTAR_ROUTES__").
data: The data to serialize as JSON.
nonce: Optional CSP nonce to add to the injected ``<script>`` tag.
Returns:
The HTML with the injected script in the ``<head>`` section. Falls back
to injecting before ``</html>`` or at the end if no ``</head>`` is found.
Note:
The script content is NOT escaped to preserve valid JSON. Ensure that
``data`` does not contain user-controlled content that could include
malicious ``</script>`` sequences.
Example:
html = inject_json_script(html, "__ROUTES__", {"home": "/", "about": "/about"})
"""
json_data = encode_json(data).decode("utf-8")
script = f"window.{var_name} = {json_data};"
return inject_head_script(html, script, escape=False, nonce=nonce)