199 lines
5.7 KiB
Python
199 lines
5.7 KiB
Python
|
|
"""
|
||
|
|
PDF Conversion Service
|
||
|
|
Provides Word-to-PDF and Markdown-to-PDF conversion functionality
|
||
|
|
"""
|
||
|
|
from pathlib import Path
|
||
|
|
from typing import Optional, Dict, Tuple
|
||
|
|
import io
|
||
|
|
import tempfile
|
||
|
|
import logging
|
||
|
|
|
||
|
|
from docx import Document as DocxDocument
|
||
|
|
|
||
|
|
from app.services.word2markdown import convert_any as convert_word_to_md
|
||
|
|
from app.services.docling_adapter import md_to_pdf_bytes_with_renderer
|
||
|
|
|
||
|
|
|
||
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
||
|
|
|
||
|
|
|
||
|
|
def word_to_pdf_bytes(
    file_path: str | Path,
    toc: bool = False,
    header_text: Optional[str] = None,
    footer_text: Optional[str] = None,
    logo_url: Optional[str] = None,
    copyright_text: Optional[str] = None,
    filename_text: Optional[str] = None,
    cover_src: Optional[str] = None,
    product_name: Optional[str] = None,
    document_name: Optional[str] = None,
    product_version: Optional[str] = None,
    document_version: Optional[str] = None,
) -> bytes:
    """
    Render a Word document to PDF, using Markdown as the intermediate format.

    The file is first converted to Markdown with ``convert_word_to_md`` and
    the resulting Markdown is then rendered with the "weasyprint" renderer.

    Args:
        file_path: Location of the Word file to convert
        toc: Whether to enable the table of contents
        header_text: Custom text for the page header
        footer_text: Custom text for the page footer
        logo_url: URL of the logo image
        copyright_text: Copyright notice
        filename_text: Filename to display; when empty or omitted, the stem
            of ``file_path`` is used instead
        cover_src: Source of the cover image
        product_name: Product name shown on the cover
        document_name: Document name shown on the cover
        product_version: Product version
        document_version: Document version

    Returns:
        The rendered PDF as bytes
    """
    logger.info(f"Converting Word to PDF: {file_path}")

    # Step 1: Word -> Markdown (the first element of the result is not used).
    source = Path(file_path)
    _, markdown = convert_word_to_md(source)

    # Step 2: Markdown -> PDF. The page/cover decoration options are collected
    # once and forwarded to the renderer as keyword arguments.
    decoration = {
        "toc": toc,
        "header_text": header_text,
        "footer_text": footer_text,
        "logo_url": logo_url,
        "copyright_text": copyright_text,
        "filename_text": filename_text if filename_text else source.stem,
        "cover_src": cover_src,
        "product_name": product_name,
        "document_name": document_name,
        "product_version": product_version,
        "document_version": document_version,
    }
    rendered = md_to_pdf_bytes_with_renderer(
        md=markdown,
        renderer="weasyprint",
        **decoration,
    )

    logger.info(f"Word to PDF conversion complete: {len(rendered)} bytes")
    return rendered
|
||
|
|
|
||
|
|
|
||
|
|
def markdown_to_pdf_bytes(
    markdown_content: str,
    toc: bool = False,
    header_text: Optional[str] = None,
    footer_text: Optional[str] = None,
    logo_url: Optional[str] = None,
    copyright_text: Optional[str] = None,
    filename_text: Optional[str] = None,
    cover_src: Optional[str] = None,
    product_name: Optional[str] = None,
    document_name: Optional[str] = None,
    product_version: Optional[str] = None,
    document_version: Optional[str] = None,
    css_name: Optional[str] = None,
    css_text: Optional[str] = None,
) -> bytes:
    """
    Render Markdown text to PDF with the "weasyprint" renderer.

    Every option is forwarded unchanged to ``md_to_pdf_bytes_with_renderer``.

    Args:
        markdown_content: The Markdown text to render
        toc: Whether to enable the table of contents
        header_text: Custom text for the page header
        footer_text: Custom text for the page footer
        logo_url: URL of the logo image
        copyright_text: Copyright notice
        filename_text: Filename to display
        cover_src: Source of the cover image
        product_name: Product name shown on the cover
        document_name: Document name shown on the cover
        product_version: Product version
        document_version: Document version
        css_name: Name of a CSS file in configs/styles
        css_text: Custom CSS given as a string

    Returns:
        The rendered PDF as bytes
    """
    logger.info("Converting Markdown to PDF")

    # Styling and decoration options, forwarded verbatim to the renderer.
    render_options = {
        "css_name": css_name,
        "css_text": css_text,
        "toc": toc,
        "header_text": header_text,
        "footer_text": footer_text,
        "logo_url": logo_url,
        "copyright_text": copyright_text,
        "filename_text": filename_text,
        "cover_src": cover_src,
        "product_name": product_name,
        "document_name": document_name,
        "product_version": product_version,
        "document_version": document_version,
    }
    rendered = md_to_pdf_bytes_with_renderer(
        md=markdown_content,
        renderer="weasyprint",
        **render_options,
    )

    logger.info(f"Markdown to PDF conversion complete: {len(rendered)} bytes")
    return rendered
|
||
|
|
|
||
|
|
|
||
|
|
def markdown_file_to_pdf_bytes(
    file_path: str | Path,
    encoding: str = "utf-8",
    **kwargs
) -> bytes:
    """
    Read a Markdown file from disk and render it to PDF.

    Args:
        file_path: Location of the Markdown file
        encoding: Text encoding used to read the file (default: utf-8)
        **kwargs: Extra options forwarded to markdown_to_pdf_bytes; when
            "filename_text" is missing or empty, the stem of ``file_path``
            is used for it

    Returns:
        The rendered PDF as bytes
    """
    source = Path(file_path)
    text = source.read_text(encoding=encoding)

    # A missing key and an empty/None value are treated alike: both fall back
    # to the file's stem.
    if not kwargs.get("filename_text"):
        kwargs["filename_text"] = source.stem

    return markdown_to_pdf_bytes(text, **kwargs)
|
||
|
|
|
||
|
|
|
||
|
|
def read_file_content(file_path: str | Path) -> Tuple[str, bytes]:
|
||
|
|
"""
|
||
|
|
Read file content and detect content type
|
||
|
|
|
||
|
|
Args:
|
||
|
|
file_path: Path to file
|
||
|
|
|
||
|
|
Returns:
|
||
|
|
Tuple of (detected_type, content_bytes)
|
||
|
|
"""
|
||
|
|
path = Path(file_path)
|
||
|
|
if not path.exists():
|
||
|
|
raise FileNotFoundError(f"File not found: {file_path}")
|
||
|
|
|
||
|
|
content_bytes = path.read_bytes()
|
||
|
|
|
||
|
|
# Detect by extension
|
||
|
|
ext = path.suffix.lower()
|
||
|
|
if ext in {".md", ".markdown"}:
|
||
|
|
return "markdown", content_bytes
|
||
|
|
elif ext in {".doc", ".docx"}:
|
||
|
|
return "word", content_bytes
|
||
|
|
elif ext in {".txt"}:
|
||
|
|
return "text", content_bytes
|
||
|
|
else:
|
||
|
|
# Try to detect by content
|
||
|
|
content_start = content_bytes[:8]
|
||
|
|
if content_start.startswith(b"PK\x03\x04"):
|
||
|
|
return "word", content_bytes
|
||
|
|
return "text", content_bytes
|