"""
PDF Conversion Service

Provides Word-to-PDF and Markdown-to-PDF conversion functionality
"""
# NOTE(review): Dict, io, tempfile and DocxDocument are not referenced by any
# code in this module as reviewed; they are kept in case other code relies on
# them being importable from here.
from pathlib import Path
from typing import Optional, Dict, Tuple
import io
import tempfile
import logging

from docx import Document as DocxDocument

from app.services.word2markdown import convert_any as convert_word_to_md
from app.services.docling_adapter import md_to_pdf_bytes_with_renderer

logger = logging.getLogger(__name__)

# Renderer name passed to md_to_pdf_bytes_with_renderer by every conversion
# in this module.
_PDF_RENDERER = "weasyprint"

# File signatures used by read_file_content when the extension is not one of
# the recognised ones.
_ZIP_MAGIC = b"PK\x03\x04"  # ZIP local file header (.docx is a ZIP container)
_OLE2_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"  # OLE2 compound file (legacy .doc)


def word_to_pdf_bytes(
    file_path: str | Path,
    toc: bool = False,
    header_text: Optional[str] = None,
    footer_text: Optional[str] = None,
    logo_url: Optional[str] = None,
    copyright_text: Optional[str] = None,
    filename_text: Optional[str] = None,
    cover_src: Optional[str] = None,
    product_name: Optional[str] = None,
    document_name: Optional[str] = None,
    product_version: Optional[str] = None,
    document_version: Optional[str] = None,
) -> bytes:
    """
    Convert Word document (.doc, .docx) to PDF

    The document is first converted to Markdown with ``convert_word_to_md``;
    the resulting Markdown is then rendered to PDF through
    ``md_to_pdf_bytes_with_renderer``. All layout options are forwarded to
    the renderer unchanged, except ``filename_text`` (see below).

    Args:
        file_path: Path to Word file
        toc: Enable table of contents
        header_text: Custom header text
        footer_text: Custom footer text
        logo_url: URL to logo image
        copyright_text: Copyright notice
        filename_text: Filename to display; when empty or None the stem of
            ``file_path`` is used instead
        cover_src: Cover image source
        product_name: Product name for cover
        document_name: Document name for cover
        product_version: Product version
        document_version: Document version

    Returns:
        PDF file as bytes
    """
    logger.info("Converting Word to PDF: %s", file_path)

    # Convert Word to Markdown first. The converter returns a pair; only the
    # second element (the Markdown text) is needed here.
    path = Path(file_path)
    _, markdown_content = convert_word_to_md(path)

    # Then convert Markdown to PDF
    pdf_bytes = md_to_pdf_bytes_with_renderer(
        md=markdown_content,
        renderer=_PDF_RENDERER,
        toc=toc,
        header_text=header_text,
        footer_text=footer_text,
        logo_url=logo_url,
        copyright_text=copyright_text,
        filename_text=filename_text or path.stem,
        cover_src=cover_src,
        product_name=product_name,
        document_name=document_name,
        product_version=product_version,
        document_version=document_version,
    )

    logger.info("Word to PDF conversion complete: %d bytes", len(pdf_bytes))
    return pdf_bytes


def markdown_to_pdf_bytes(
    markdown_content: str,
    toc: bool = False,
    header_text: Optional[str] = None,
    footer_text: Optional[str] = None,
    logo_url: Optional[str] = None,
    copyright_text: Optional[str] = None,
    filename_text: Optional[str] = None,
    cover_src: Optional[str] = None,
    product_name: Optional[str] = None,
    document_name: Optional[str] = None,
    product_version: Optional[str] = None,
    document_version: Optional[str] = None,
    css_name: Optional[str] = None,
    css_text: Optional[str] = None,
) -> bytes:
    """
    Convert Markdown content to PDF

    Thin wrapper around ``md_to_pdf_bytes_with_renderer``: every argument is
    forwarded unchanged.

    Args:
        markdown_content: Markdown text content
        toc: Enable table of contents
        header_text: Custom header text
        footer_text: Custom footer text
        logo_url: URL to logo image
        copyright_text: Copyright notice
        filename_text: Filename to display
        cover_src: Cover image source
        product_name: Product name for cover
        document_name: Document name for cover
        product_version: Product version
        document_version: Document version
        css_name: Name of CSS file in configs/styles
        css_text: Custom CSS as string

    Returns:
        PDF file as bytes
    """
    logger.info("Converting Markdown to PDF")

    pdf_bytes = md_to_pdf_bytes_with_renderer(
        md=markdown_content,
        renderer=_PDF_RENDERER,
        css_name=css_name,
        css_text=css_text,
        toc=toc,
        header_text=header_text,
        footer_text=footer_text,
        logo_url=logo_url,
        copyright_text=copyright_text,
        filename_text=filename_text,
        cover_src=cover_src,
        product_name=product_name,
        document_name=document_name,
        product_version=product_version,
        document_version=document_version,
    )

    logger.info("Markdown to PDF conversion complete: %d bytes", len(pdf_bytes))
    return pdf_bytes


def markdown_file_to_pdf_bytes(
    file_path: str | Path,
    encoding: str = "utf-8",
    **kwargs
) -> bytes:
    """
    Convert Markdown file to PDF

    Args:
        file_path: Path to Markdown file
        encoding: File encoding (default: utf-8)
        **kwargs: Additional arguments passed to markdown_to_pdf_bytes. If
            ``filename_text`` is absent, None or empty, the stem of
            ``file_path`` is used.

    Returns:
        PDF file as bytes
    """
    path = Path(file_path)
    markdown_content = path.read_text(encoding=encoding)

    # Set default filename from file path if not provided. ``.get`` returns
    # None for a missing key, so one falsy check covers both "absent" and
    # "present but empty".
    if not kwargs.get("filename_text"):
        kwargs["filename_text"] = path.stem

    return markdown_to_pdf_bytes(markdown_content, **kwargs)


def read_file_content(file_path: str | Path) -> Tuple[str, bytes]:
    """
    Read file content and detect content type

    The type is decided by file extension first. For any other extension the
    leading bytes are inspected: a ZIP signature (the container used by
    .docx) or an OLE2 compound-file signature (the container used by legacy
    .doc) is reported as "word"; everything else is reported as "text".
    The signature check is a heuristic: other formats share these containers.

    Args:
        file_path: Path to file

    Returns:
        Tuple of (detected_type, content_bytes), where detected_type is one
        of "markdown", "word" or "text"

    Raises:
        FileNotFoundError: If ``file_path`` does not exist
    """
    path = Path(file_path)

    if not path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    content_bytes = path.read_bytes()

    # Detect by extension
    ext = path.suffix.lower()
    if ext in {".md", ".markdown"}:
        return "markdown", content_bytes
    if ext in {".doc", ".docx"}:
        return "word", content_bytes
    if ext == ".txt":
        return "text", content_bytes

    # Unknown extension: try to detect by content. Both Word containers are
    # checked so the fallback agrees with the extension branch above, which
    # accepts .doc as well as .docx.
    if content_bytes.startswith((_ZIP_MAGIC, _OLE2_MAGIC)):
        return "word", content_bytes

    return "text", content_bytes