Auto-generate weekly report

With the help of Claude, I created a Vikunja-to-Markdown report generator.

One tricky aspect was formatting lists properly in Markdown.
See: Markdown as first class citizen.

def _preprocess_html_lists(html_content: str) -> str:
    """Preprocess HTML content to handle task lists and regular lists."""
    # Convert Vikunja checklist HTML to regular list HTML
    html_content = re.sub(r'<ul data-type="taskList">', '<ul>', html_content)
    html_content = re.sub(r'<li[^>]*data-checked="false" data-type="taskItem"[^>]*>', '<li>[ ] ', html_content)
    html_content = re.sub(r'<li[^>]*data-checked="true" data-type="taskItem"[^>]*>', '<li>[x] ', html_content)
    html_content = re.sub(r'<label><input[^>]*><span></span></label>', '', html_content)
    
    return html_content


def _clean_list_paragraphs(html_content: str) -> str:
    """Clean paragraph and div tags within lists to improve markdown conversion."""
    if not re.search(r'<(ul|ol)', html_content):
        return html_content
        
    soup = BeautifulSoup(html_content, "html.parser")
    
    # Process all ul/ol lists
    for lst in soup.find_all(["ul", "ol"]):
        # Step 1: Replace </p><p> with </p><br><p> within list items
        for tag in lst.find_all(["p", "div"]):
            tag_str = str(tag)
            tag_str = re.sub(r'</p><p>', '</p><br><p>', tag_str)
            new_soup = BeautifulSoup(tag_str, "html.parser")
            tag.replace_with(new_soup)
        
        # Step 2: Remove p and div tags while keeping content
        for tag in lst.find_all(["p", "div"]):
            tag.unwrap()
    
    return str(soup)


def _convert_with_pandoc(html_content: str) -> str:
    """Convert HTML to GitHub Flavored Markdown using pandoc.

    Runs ``pandoc --from=html --to=gfm --wrap=none`` with ``html_content`` on
    stdin and returns what pandoc writes to stdout.

    Args:
        html_content: HTML to convert.

    Returns:
        The markdown text produced by pandoc.

    Raises:
        RuntimeError: If pandoc exits with a non-zero status, does not finish
            within 30 seconds, or the executable cannot be found. The
            original exception is attached as ``__cause__``.
    """
    try:
        result = subprocess.run(
            ['pandoc', '--from=html', '--to=gfm', '--wrap=none'],
            input=html_content,
            text=True,
            # pandoc reads and writes UTF-8 regardless of the system locale;
            # without this, non-ASCII text breaks on non-UTF-8 locales.
            encoding='utf-8',
            capture_output=True,
            check=True,
            timeout=30,
        )
    except subprocess.CalledProcessError as e:
        # The exception text only carries the exit status; pandoc's own
        # diagnostics are in the captured stderr.
        stderr = (e.stderr or "").strip()
        if stderr:
            logger.error("Pandoc stderr: %s", stderr)
        logger.error(f"Pandoc conversion failed: {e}")
        raise RuntimeError(f"Pandoc conversion failed: {e}") from e
    except subprocess.TimeoutExpired as e:
        logger.error("Pandoc conversion timed out")
        raise RuntimeError("Pandoc conversion timed out") from e
    except FileNotFoundError as e:
        logger.error("Pandoc not found. Please install pandoc.")
        raise RuntimeError("Pandoc not found. Please install pandoc.") from e
    return result.stdout


def _postprocess_markdown(markdown_content: str) -> str:
    """Post-process markdown to clean up links and checkboxes."""
    # Convert HTML links to cleaner format
    markdown_content = re.sub(r'<a href="([^"]*)"[^>]*>([^<]*)</a>', r'<\1>', markdown_content)
    
    # Fix escaped checkboxes
    markdown_content = re.sub(r"\\\[(x| )\\\]", r"[\1]", markdown_content)
    
    return markdown_content