add:markdown 转 pdf

This commit is contained in:
2026-01-13 22:56:22 +08:00
parent 0b07e63b76
commit cecc8c65be
3 changed files with 816 additions and 0 deletions

View File

@@ -58,6 +58,12 @@ from app.services.docling_adapter import (
)
from app.services.unified_converter import FormatConverter
from app.services.minio_utils import minio_current, join_prefix, presigned_read
from app.services.pdf_converter import (
word_to_pdf_bytes,
markdown_to_pdf_bytes,
markdown_file_to_pdf_bytes,
read_file_content,
)
"""
@api Server Application
@@ -2561,6 +2567,193 @@ async def api_convert(
except Exception as e:
return _err(str(e))
# Extensions accepted by /api/pdf/convert, grouped by the converter that handles them.
_PDF_WORD_SUFFIXES = frozenset({".doc", ".docx"})
_PDF_MARKDOWN_SUFFIXES = frozenset({".md", ".markdown"})


def _pdf_content_disposition(filename: str) -> str:
    """Build an ``attachment`` Content-Disposition value for *filename*.

    Control characters, double quotes and backslashes are dropped so a
    caller-supplied name cannot break out of the quoted header value.
    Non-ASCII names are sent via the RFC 5987 ``filename*`` parameter with a
    plain ASCII fallback, because the raw characters cannot be carried in a
    header value directly.
    """
    from urllib.parse import quote

    cleaned = "".join(
        ch for ch in filename if ord(ch) >= 32 and ord(ch) != 127 and ch not in '"\\'
    ) or "document.pdf"
    if cleaned.isascii():
        # Same shape as before for the common ASCII case.
        return f"attachment; filename=\"{cleaned}\""
    return (
        "attachment; filename=\"document.pdf\"; "
        f"filename*=UTF-8''{quote(cleaned, safe='')}"
    )


async def _convert_path_to_pdf(path: Path, suffix: str, options: dict, css_options: dict) -> bytes:
    """Run the converter matching *suffix* on *path* in a worker thread.

    *suffix* must already be known to be a Word or Markdown extension. The CSS
    options are only forwarded to the Markdown converter; the Word converter
    does not accept them.
    """
    if suffix in _PDF_WORD_SUFFIXES:
        return await asyncio.to_thread(word_to_pdf_bytes, path, **options)
    return await asyncio.to_thread(
        markdown_file_to_pdf_bytes, path, **options, **css_options
    )


@app.post("/api/pdf/convert")
async def api_pdf_convert(
    file: Optional[UploadFile] = File(None),
    file_path: Optional[str] = Form(None),
    markdown_content: Optional[str] = Form(None),
    toc: bool = Form(False),
    header_text: Optional[str] = Form(None),
    footer_text: Optional[str] = Form(None),
    logo_url: Optional[str] = Form(None),
    copyright_text: Optional[str] = Form(None),
    filename_text: Optional[str] = Form(None),
    cover_src: Optional[str] = Form(None),
    product_name: Optional[str] = Form(None),
    document_name: Optional[str] = Form(None),
    product_version: Optional[str] = Form(None),
    document_version: Optional[str] = Form(None),
    css_name: Optional[str] = Form(None),
    css_text: Optional[str] = Form(None),
    download: bool = Form(True),
):
    """
    Convert Word or Markdown to PDF.

    Exactly one input source is used, checked in this order:
      1. ``file``: an uploaded Word (.doc/.docx) or Markdown (.md/.markdown) file
      2. ``file_path``: a path on the server's local filesystem
      3. ``markdown_content``: Markdown text supplied directly

    The remaining form fields (toc, header/footer text, logo, cover and version
    fields) are forwarded unchanged to the converter. ``css_name`` and
    ``css_text`` are forwarded only for Markdown input.

    Returns:
        With ``download`` true (the default), a streaming ``application/pdf``
        attachment. Otherwise the ``_ok`` payload containing ``pdf_base64``,
        ``filename`` and ``size``. Every failure is reported through ``_err``.
    """
    try:
        pdf_bytes: bytes = b""
        output_filename: str = "document.pdf"

        # Options shared by every converter; filename_text is filled in per branch.
        options = {
            "toc": toc,
            "header_text": header_text,
            "footer_text": footer_text,
            "logo_url": logo_url,
            "copyright_text": copyright_text,
            "filename_text": filename_text,
            "cover_src": cover_src,
            "product_name": product_name,
            "document_name": document_name,
            "product_version": product_version,
            "document_version": document_version,
        }
        css_options = {"css_name": css_name, "css_text": css_text}
        supported_suffixes = _PDF_WORD_SUFFIXES | _PDF_MARKDOWN_SUFFIXES

        if file:
            # Uploaded file: validate the extension before touching the disk.
            filename = file.filename or "upload"
            suffix = Path(filename).suffix.lower()
            if suffix not in supported_suffixes:
                return _err(f"不支持的文件格式: {suffix}。支持的格式: .doc, .docx, .md")

            stem = Path(filename).stem
            output_filename = f"{stem}.pdf"
            # The temp file has a random name, so the display name must come
            # from the original upload name.
            options["filename_text"] = filename_text or stem

            content = await file.read()
            tmp_path: Optional[Path] = None
            try:
                # NamedTemporaryFile creates the file atomically, unlike the
                # deprecated, race-prone tempfile.mktemp().
                with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
                    tmp_path = Path(tmp.name)
                    tmp.write(content)
                pdf_bytes = await _convert_path_to_pdf(
                    tmp_path, suffix, options, css_options
                )
            finally:
                if tmp_path is not None:
                    try:
                        tmp_path.unlink(missing_ok=True)
                    except OSError:
                        # Cleanup is best-effort; never mask the conversion result.
                        logging.warning("Failed to remove temp file %s", tmp_path)
        elif file_path:
            # NOTE(security): file_path is opened on the server exactly as the
            # caller supplied it. If this endpoint is reachable by untrusted
            # clients, restrict it to an allow-listed directory.
            path = Path(file_path).expanduser()
            if not path.exists():
                return _err(f"文件不存在: {file_path}")
            suffix = path.suffix.lower()
            if suffix not in supported_suffixes:
                return _err(f"不支持的文件格式: {suffix}。支持的格式: .doc, .docx, .md")

            output_filename = f"{path.stem}.pdf"
            options["filename_text"] = filename_text or path.stem
            pdf_bytes = await _convert_path_to_pdf(path, suffix, options, css_options)
        elif markdown_content:
            # Raw Markdown text: there is no file name to fall back on.
            output_filename = f"{filename_text or 'document'}.pdf"
            pdf_bytes = await asyncio.to_thread(
                markdown_to_pdf_bytes, markdown_content, **options, **css_options
            )
        else:
            return _err("必须提供 file、file_path 或 markdown_content 中的一个")

        if not pdf_bytes:
            return _err("PDF 转换失败,未生成内容")

        if download:
            from fastapi.responses import StreamingResponse

            return StreamingResponse(
                io.BytesIO(pdf_bytes),
                media_type="application/pdf",
                headers={
                    "Content-Disposition": _pdf_content_disposition(output_filename)
                },
            )

        # JSON mode: return the PDF base64-encoded alongside its metadata.
        import base64

        return _ok({
            "pdf_base64": base64.b64encode(pdf_bytes).decode("ascii"),
            "filename": output_filename,
            "size": len(pdf_bytes),
        })
    except Exception as e:
        # Top-level boundary: log the traceback and report a uniform error payload.
        logging.exception("PDF conversion error")
        return _err(f"PDF 转换失败: {str(e)}")
@app.post("/api/import/convert")
async def api_import_convert(json_file: UploadFile = File(None), json_text: Optional[str] = Form(None), path: Optional[str] = Form(None), versionId: Optional[int] = Form(1001), download: Optional[bool] = Form(False)):
try:

View File

@@ -0,0 +1,198 @@
"""
PDF Conversion Service
Provides Word-to-PDF and Markdown-to-PDF conversion functionality
"""
from pathlib import Path
from typing import Optional, Dict, Tuple
import io
import tempfile
import logging
from docx import Document as DocxDocument
from app.services.word2markdown import convert_any as convert_word_to_md
from app.services.docling_adapter import md_to_pdf_bytes_with_renderer
logger = logging.getLogger(__name__)
def word_to_pdf_bytes(
    file_path: str | Path,
    toc: bool = False,
    header_text: Optional[str] = None,
    footer_text: Optional[str] = None,
    logo_url: Optional[str] = None,
    copyright_text: Optional[str] = None,
    filename_text: Optional[str] = None,
    cover_src: Optional[str] = None,
    product_name: Optional[str] = None,
    document_name: Optional[str] = None,
    product_version: Optional[str] = None,
    document_version: Optional[str] = None,
) -> bytes:
    """
    Render a Word document (.doc, .docx) as PDF.

    The document is first converted to Markdown with ``convert_word_to_md``;
    that Markdown is then rendered by ``md_to_pdf_bytes_with_renderer`` using
    the "weasyprint" renderer.

    Args:
        file_path: Location of the Word file.
        toc: Whether to enable the table of contents.
        header_text: Custom header text.
        footer_text: Custom footer text.
        logo_url: URL of the logo image.
        copyright_text: Copyright notice.
        filename_text: Filename to display; defaults to the stem of
            ``file_path`` when empty or omitted.
        cover_src: Cover image source.
        product_name: Product name for the cover.
        document_name: Document name for the cover.
        product_version: Product version.
        document_version: Document version.

    Returns:
        The PDF document as bytes.
    """
    logger.info(f"Converting Word to PDF: {file_path}")

    source = Path(file_path)
    # Stage 1: Word -> Markdown. Only the second item of the returned pair is used.
    _, md_text = convert_word_to_md(source)

    # Stage 2: Markdown -> PDF, forwarding every layout option unchanged.
    render_options = {
        "toc": toc,
        "header_text": header_text,
        "footer_text": footer_text,
        "logo_url": logo_url,
        "copyright_text": copyright_text,
        "filename_text": filename_text or source.stem,
        "cover_src": cover_src,
        "product_name": product_name,
        "document_name": document_name,
        "product_version": product_version,
        "document_version": document_version,
    }
    rendered = md_to_pdf_bytes_with_renderer(
        md=md_text,
        renderer="weasyprint",
        **render_options,
    )

    logger.info(f"Word to PDF conversion complete: {len(rendered)} bytes")
    return rendered
def markdown_to_pdf_bytes(
    markdown_content: str,
    toc: bool = False,
    header_text: Optional[str] = None,
    footer_text: Optional[str] = None,
    logo_url: Optional[str] = None,
    copyright_text: Optional[str] = None,
    filename_text: Optional[str] = None,
    cover_src: Optional[str] = None,
    product_name: Optional[str] = None,
    document_name: Optional[str] = None,
    product_version: Optional[str] = None,
    document_version: Optional[str] = None,
    css_name: Optional[str] = None,
    css_text: Optional[str] = None,
) -> bytes:
    """
    Render Markdown text as PDF.

    All arguments are handed to ``md_to_pdf_bytes_with_renderer`` with the
    renderer fixed to "weasyprint".

    Args:
        markdown_content: Markdown source text.
        toc: Whether to enable the table of contents.
        header_text: Custom header text.
        footer_text: Custom footer text.
        logo_url: URL of the logo image.
        copyright_text: Copyright notice.
        filename_text: Filename to display.
        cover_src: Cover image source.
        product_name: Product name for the cover.
        document_name: Document name for the cover.
        product_version: Product version.
        document_version: Document version.
        css_name: Name of a CSS file in configs/styles.
        css_text: Custom CSS supplied as a string.

    Returns:
        The PDF document as bytes.
    """
    logger.info("Converting Markdown to PDF")

    # Collect the pass-through options once, then delegate to the renderer.
    render_options = {
        "css_name": css_name,
        "css_text": css_text,
        "toc": toc,
        "header_text": header_text,
        "footer_text": footer_text,
        "logo_url": logo_url,
        "copyright_text": copyright_text,
        "filename_text": filename_text,
        "cover_src": cover_src,
        "product_name": product_name,
        "document_name": document_name,
        "product_version": product_version,
        "document_version": document_version,
    }
    rendered = md_to_pdf_bytes_with_renderer(
        md=markdown_content,
        renderer="weasyprint",
        **render_options,
    )

    logger.info(f"Markdown to PDF conversion complete: {len(rendered)} bytes")
    return rendered
def markdown_file_to_pdf_bytes(
    file_path: str | Path,
    encoding: str = "utf-8",
    **kwargs
) -> bytes:
    """
    Read a Markdown file and render it as PDF.

    Args:
        file_path: Location of the Markdown file.
        encoding: Text encoding used to read the file (default: utf-8).
        **kwargs: Extra options forwarded to ``markdown_to_pdf_bytes``. When
            ``filename_text`` is missing or empty it is set to the file's stem.

    Returns:
        The PDF document as bytes.
    """
    source = Path(file_path)
    md_text = source.read_text(encoding=encoding)

    # A missing key and a falsy value are treated alike: fall back to the stem.
    if not kwargs.get("filename_text"):
        kwargs["filename_text"] = source.stem

    return markdown_to_pdf_bytes(md_text, **kwargs)
def read_file_content(file_path: str | Path) -> Tuple[str, bytes]:
"""
Read file content and detect content type
Args:
file_path: Path to file
Returns:
Tuple of (detected_type, content_bytes)
"""
path = Path(file_path)
if not path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
content_bytes = path.read_bytes()
# Detect by extension
ext = path.suffix.lower()
if ext in {".md", ".markdown"}:
return "markdown", content_bytes
elif ext in {".doc", ".docx"}:
return "word", content_bytes
elif ext in {".txt"}:
return "text", content_bytes
else:
# Try to detect by content
content_start = content_bytes[:8]
if content_start.startswith(b"PK\x03\x04"):
return "word", content_bytes
return "text", content_bytes