With the help of Claude, I created a Vikunja-to-Markdown report generator.
One tricky aspect was formatting lists properly in Markdown.
See: Markdown as first class citizen.
def _preprocess_html_lists(html_content: str) -> str:
"""Preprocess HTML content to handle task lists and regular lists."""
# Convert Vikunja checklist HTML to regular list HTML
html_content = re.sub(r'<ul data-type="taskList">', '<ul>', html_content)
html_content = re.sub(r'<li[^>]*data-checked="false" data-type="taskItem"[^>]*>', '<li>[ ] ', html_content)
html_content = re.sub(r'<li[^>]*data-checked="true" data-type="taskItem"[^>]*>', '<li>[x] ', html_content)
html_content = re.sub(r'<label><input[^>]*><span></span></label>', '', html_content)
return html_content
def _clean_list_paragraphs(html_content: str) -> str:
"""Clean paragraph and div tags within lists to improve markdown conversion."""
if not re.search(r'<(ul|ol)', html_content):
return html_content
soup = BeautifulSoup(html_content, "html.parser")
# Process all ul/ol lists
for lst in soup.find_all(["ul", "ol"]):
# Step 1: Replace </p><p> with </p><br><p> within list items
for tag in lst.find_all(["p", "div"]):
tag_str = str(tag)
tag_str = re.sub(r'</p><p>', '</p><br><p>', tag_str)
new_soup = BeautifulSoup(tag_str, "html.parser")
tag.replace_with(new_soup)
# Step 2: Remove p and div tags while keeping content
for tag in lst.find_all(["p", "div"]):
tag.unwrap()
return str(soup)
def _convert_with_pandoc(html_content: str) -> str:
    """Convert HTML to GitHub Flavored Markdown using pandoc.

    Runs ``pandoc --from=html --to=gfm --wrap=none`` with ``html_content`` on
    stdin and a 30 second timeout.

    Args:
        html_content: HTML document or fragment to convert.

    Returns:
        The Markdown text pandoc wrote to stdout.

    Raises:
        RuntimeError: If pandoc exits with a non-zero status, does not finish
            within the timeout, or the ``pandoc`` executable cannot be found.
            The underlying exception is attached as ``__cause__``.
    """
    try:
        result = subprocess.run(
            ['pandoc', '--from=html', '--to=gfm', '--wrap=none'],
            input=html_content,
            text=True,
            # pandoc reads and writes UTF-8; do not fall back to the locale
            # encoding, which would corrupt non-ASCII text on some platforms.
            encoding='utf-8',
            capture_output=True,
            check=True,
            timeout=30,
        )
    except subprocess.CalledProcessError as e:
        # stderr was captured, so include it: it carries pandoc's own
        # explanation of why the conversion failed.
        logger.error(
            "Pandoc conversion failed: %s (stderr: %s)",
            e,
            (e.stderr or "").strip(),
        )
        raise RuntimeError(f"Pandoc conversion failed: {e}") from e
    except subprocess.TimeoutExpired as e:
        logger.error("Pandoc conversion timed out")
        raise RuntimeError("Pandoc conversion timed out") from e
    except FileNotFoundError as e:
        logger.error("Pandoc not found. Please install pandoc.")
        raise RuntimeError("Pandoc not found. Please install pandoc.") from e
    return result.stdout
def _postprocess_markdown(markdown_content: str) -> str:
"""Post-process markdown to clean up links and checkboxes."""
# Convert HTML links to cleaner format
markdown_content = re.sub(r'<a href="([^"]*)"[^>]*>([^<]*)</a>', r'<\1>', markdown_content)
# Fix escaped checkboxes
markdown_content = re.sub(r"\\\[(x| )\\\]", r"[\1]", markdown_content)
return markdown_content