1 files changed, 69 insertions, 1 deletions
diff --git a/pysite/rst/__init__.py b/pysite/rst/__init__.py
index e0fc973e..0c069615 100644
--- a/pysite/rst/__init__.py
+++ b/pysite/rst/__init__.py
@@ -1,15 +1,83 @@
 # coding=utf-8
+import re
+
 from docutils.core import publish_parts
 from docutils.parsers.rst.roles import register_canonical_role
 
 from pysite.rst.roles import icon_role, page_role, url_for_role
 
+RST_TEMPLATE = """.. contents::
+
+{0}"""
+
+CONTENTS_REGEX = re.compile(r"""<div class=\"contents topic\" id=\"contents\">(.*?)</div>""", re.DOTALL)
+HREF_REGEX = re.compile(r"""<a class=\"reference internal\" href=\"(.*?)\".*?>(.*?)</a>""")
+
 
 def render(rst: str):
-    return publish_parts(
+    rst = RST_TEMPLATE.format(rst)
+    html = publish_parts(
         source=rst, writer_name="html5", settings_overrides={"halt_level": 2, "syntax_highlight": "short"}
     )["html_body"]
 
+    data = {
+        "html": html,
+        "headers": []
+    }
+
+    match = CONTENTS_REGEX.search(html)  # Find the contents HTML
+
+    if match:
+        data["html"] = html.replace(match.group(0), "")  # Remove the contents from the document HTML
+
+        depth = 0
+        headers = []
+        current_header = {}
+
+        group = match.group(1)
+
+        # Sanitize the output so we can more easily parse it
+        group = group.replace("<li>", "<li>\n")
+        group = group.replace("</li>", "\n</li>")
+        group = group.replace("<p>", "<p>\n")
+        group = group.replace("</p>", "\n</p>")
+
+        for line in group.split("\n"):
+            line = line.strip()  # Remove excess whitespace
+
+            if not line:  # Nothing to process
+                continue
+
+            if line.startswith("<li>") and depth <= 2:
+                #  We've found a header, or the start of a header group
+                depth += 1
+            elif line.startswith("</li>") and depth >= 0:
+                # That's the end of a header or header group
+
+                if depth == 1:
+                    # We just dealt with an entire header group, so store it
+                    headers.append(current_header.copy())  # Store a copy, since we're clearing the dict
+                    current_header.clear()
+
+                depth -= 1
+            elif line.startswith("<a") and depth <= 2:
+                # We've found an actual URL
+                match = HREF_REGEX.match(line)  # Parse the line for the ID and header title
+
+                if depth == 1:  # Top-level header, so just store it in the current header
+                    current_header["id"] = match.group(1)
+                    current_header["title"] = match.group(2)
+                else:  # Second-level (or deeper) header, should be stored in a list of sub-headers under the current
+                    sub_headers = current_header.get("sub_headers", [])
+                    sub_headers.append({
+                        "id": match.group(1),
+                        "title": match.group(2)
+                    })
+                    current_header["sub_headers"] = sub_headers
+
+        data["headers"] = headers
+    return data
+
 
 register_canonical_role("icon", icon_role)
 register_canonical_role("page", page_role)