diff --git a/docling/PDF_API_USAGE.md b/docling/PDF_API_USAGE.md
new file mode 100644
index 0000000..bc89ab7
--- /dev/null
+++ b/docling/PDF_API_USAGE.md
@@ -0,0 +1,425 @@
+# Word/Markdown 转 PDF API 使用指南
+
+## API 端点
+
+```
+POST /api/pdf/convert
+```
+
+## 支持的输入方式
+
+### 1. 上传文件
+
+支持上传 `.doc`、`.docx`、`.md`(或 `.markdown`)文件:
+
+```javascript
+// 上传 Word 文件
+const formData = new FormData();
+formData.append('file', fileInput.files[0]); // .doc 或 .docx
+
+// 可选参数
+formData.append('toc', 'true'); // 生成目录
+formData.append('header_text', '文档标题|页码'); // 页眉
+formData.append('footer_text', '版权信息'); // 页脚
+formData.append('filename_text', '我的文档'); // 文件名
+
+const response = await fetch('/api/pdf/convert', {
+ method: 'POST',
+ body: formData
+});
+
+const blob = await response.blob();
+const url = URL.createObjectURL(blob);
+```
+
+### 2. 指定本地文件路径
+
+```javascript
+const formData = new FormData();
+formData.append('file_path', '/path/to/document.docx');
+formData.append('toc', 'true');
+
+const response = await fetch('/api/pdf/convert', {
+ method: 'POST',
+ body: formData
+});
+```
+
+### 3. 直接提交 Markdown 内容
+
+```javascript
+const formData = new FormData();
+formData.append('markdown_content', '# 标题\n\n这是内容');
+formData.append('filename_text', '我的文档');
+
+const response = await fetch('/api/pdf/convert', {
+ method: 'POST',
+ body: formData
+});
+```
+
+## 完整参数列表
+
+| 参数 | 类型 | 必填 | 说明 |
+|------|------|------|------|
+| file | File | 否* | 上传的文件 |
+| file_path | string | 否* | 服务器上的本地文件路径(由服务端读取,而不是客户端机器上的路径) |
+| markdown_content | string | 否* | Markdown 内容 |
+| toc | boolean | 否 | 是否生成目录,默认 false |
+| header_text | string | 否 | 页眉文本,可用 `\|` 分隔左右 |
+| footer_text | string | 否 | 页脚文本 |
+| logo_url | string | 否 | Logo 图片 URL |
+| copyright_text | string | 否 | 版权声明 |
+| filename_text | string | 否 | 显示的文件名 |
+| cover_src | string | 否 | 封面图片 URL |
+| product_name | string | 否 | 产品名称(封面) |
+| document_name | string | 否 | 文档名称(封面) |
+| product_version | string | 否 | 产品版本 |
+| document_version | string | 否 | 文档版本 |
+| css_name | string | 否 | CSS 样式名称(仅对 Markdown 输入生效,Word 输入会忽略) |
+| css_text | string | 否 | 自定义 CSS(仅对 Markdown 输入生效,Word 输入会忽略) |
+| download | boolean | 否 | 是否直接下载,默认 true |
+
+*注:file、file_path、markdown_content 三者至少提供其一;同时提供多个时,服务端按 file → file_path → markdown_content 的顺序只使用第一个。
+
+## 完整示例代码
+
+### React 示例
+
+```jsx
+import { useState, useRef } from 'react';
+
+function PdfConverter() {
+ const [loading, setLoading] = useState(false);
+ const fileInput = useRef(null);
+
+ const convertToPdf = async () => {
+ const file = fileInput.current.files[0];
+ if (!file) return;
+
+ setLoading(true);
+
+ const formData = new FormData();
+ formData.append('file', file);
+ formData.append('toc', 'true');
+ formData.append('header_text', '我的文档|第 {page} 页');
+ formData.append('footer_text', '© 2024 公司名称');
+ formData.append('filename_text', file.name.replace(/\.[^/.]+$/, ''));
+
+ try {
+ const response = await fetch('/api/pdf/convert', {
+ method: 'POST',
+ body: formData
+ });
+
+ if (!response.ok) {
+ const error = await response.json();
+ throw new Error(error.detail || '转换失败');
+ }
+
+ const blob = await response.blob();
+ const url = URL.createObjectURL(blob);
+
+ // 下载 PDF
+ const a = document.createElement('a');
+ a.href = url;
+ a.download = file.name.replace(/\.[^/.]+$/, '') + '.pdf';
+ document.body.appendChild(a);
+ a.click();
+ document.body.removeChild(a);
+ URL.revokeObjectURL(url);
+ } catch (error) {
+ console.error('转换失败:', error);
+ alert('转换失败: ' + error.message);
+ } finally {
+ setLoading(false);
+ }
+ };
+
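+  return (
+    <div>
+      <input type="file" ref={fileInput} accept=".doc,.docx,.md" />
+      <button onClick={convertToPdf} disabled={loading}>
+        {loading ? '转换中...' : '转换为 PDF'}
+      </button>
+    </div>
+  );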
+}
+```
+
+### Vue 3 示例
+
+```vue
+
+
+
+
+
+
+
+
+```
+
+### 原生 JavaScript 示例
+
+```html
+
+
+
+ Word/Markdown 转 PDF
+
+
+ 文档转 PDF
+
+
+
+
+
+
+
+
+
+```
+
+### Markdown 内容转 PDF 示例
+
+```javascript
+async function markdownToPdf() {
+ const markdownContent = `
+# 我的文档
+
+## 第一章
+
+这是第一章的内容。
+
+### 小节
+
+- 列表项 1
+- 列表项 2
+
+| 列1 | 列2 |
+|-----|-----|
+| A | B |
+ `;
+
+ const formData = new FormData();
+ formData.append('markdown_content', markdownContent);
+ formData.append('filename_text', '我的Markdown文档');
+ formData.append('toc', 'true');
+ formData.append('header_text', 'Markdown文档');
+ formData.append('footer_text', '© 2024');
+
+ const response = await fetch('/api/pdf/convert', {
+ method: 'POST',
+ body: formData
+ });
+
+ const blob = await response.blob();
+  // 保存 PDF(saveAs 不是浏览器内置函数,需要由 FileSaver.js 等库提供)
+ saveAs(blob, 'document.pdf');
+}
+```
+
+## Python 调用示例
+
+```python
+import requests
+
+def convert_word_to_pdf(file_path, output_path):
+ """将 Word 文件转换为 PDF"""
+ with open(file_path, 'rb') as f:
+ files = {'file': f}
+ data = {
+ 'toc': 'true',
+ 'header_text': '我的文档',
+ 'footer_text': '© 2024',
+ 'filename_text': '文档名称'
+ }
+
+ response = requests.post(
+ 'http://localhost:8000/api/pdf/convert',
+ files=files,
+ data=data
+ )
+
+ if response.status_code == 200:
+ with open(output_path, 'wb') as out:
+ out.write(response.content)
+ print(f"PDF 已保存到: {output_path}")
+ else:
+ print(f"转换失败: {response.text}")
+
+# 使用示例
+convert_word_to_pdf('document.docx', 'output.pdf')
+```
+
+## cURL 示例
+
+```bash
+# 上传 Word 文件转 PDF
+curl -X POST http://localhost:8000/api/pdf/convert \
+ -F "file=@document.docx" \
+ -F "toc=true" \
+ -F "header_text=我的文档" \
+ -F "footer_text=© 2024" \
+ -o output.pdf
+
+# Markdown 内容转 PDF
+curl -X POST http://localhost:8000/api/pdf/convert \
+  -F $'markdown_content=# 标题\n\n这是内容' \
+ -F "filename_text=文档" \
+ -o output.pdf
+```
+
+## 错误处理
+
+API 返回的错误格式:
+
+```json
+{
+ "detail": "错误信息"
+}
+```
+
+常见错误:
+
+| 错误信息 | 原因 | 解决方法 |
+|---------|------|----------|
+| 必须提供 file、file_path 或 markdown_content 中的一个 | 未提供输入 | 检查请求参数 |
+| 不支持的文件格式 | 文件格式错误 | 确保是 .doc/.docx/.md/.markdown |
+| 文件不存在 | 本地文件路径无效 | 检查 file_path 参数 |
+| PDF 转换失败 | 转换过程出错 | 查看服务器日志 |
+
+## 返回格式
+
+### download=true (默认)
+
+直接返回 PDF 文件流:
+
+```
+Content-Type: application/pdf
+Content-Disposition: attachment; filename="document.pdf"
+```
+
+### download=false
+
+返回 JSON,包含 base64 编码的 PDF:
+
+```json
+{
+ "ok": true,
+ "pdf_base64": "JVBERi0xLjQK...",
+ "filename": "document.pdf",
+ "size": 12345
+}
+```
diff --git a/docling/app/server.py b/docling/app/server.py
index 8ab2c08..1860d14 100644
--- a/docling/app/server.py
+++ b/docling/app/server.py
@@ -58,6 +58,12 @@ from app.services.docling_adapter import (
)
from app.services.unified_converter import FormatConverter
from app.services.minio_utils import minio_current, join_prefix, presigned_read
+from app.services.pdf_converter import (
+ word_to_pdf_bytes,
+ markdown_to_pdf_bytes,
+ markdown_file_to_pdf_bytes,
+ read_file_content,
+)
"""
@api Server Application
@@ -2561,6 +2567,193 @@ async def api_convert(
except Exception as e:
return _err(str(e))
+@app.post("/api/pdf/convert")
+async def api_pdf_convert(
+ file: Optional[UploadFile] = File(None),
+ file_path: Optional[str] = Form(None),
+ markdown_content: Optional[str] = Form(None),
+ toc: bool = Form(False),
+ header_text: Optional[str] = Form(None),
+ footer_text: Optional[str] = Form(None),
+ logo_url: Optional[str] = Form(None),
+ copyright_text: Optional[str] = Form(None),
+ filename_text: Optional[str] = Form(None),
+ cover_src: Optional[str] = Form(None),
+ product_name: Optional[str] = Form(None),
+ document_name: Optional[str] = Form(None),
+ product_version: Optional[str] = Form(None),
+ document_version: Optional[str] = Form(None),
+ css_name: Optional[str] = Form(None),
+ css_text: Optional[str] = Form(None),
+ download: bool = Form(True),
+):
+ """
+ Convert Word or Markdown to PDF
+
+ Supports three input methods:
+ 1. Upload file (Word .doc/.docx or Markdown .md)
+ 2. Specify file_path (local file path)
+ 3. Provide markdown_content directly
+
+ Returns PDF file as download by default
+ """
+ try:
+ pdf_bytes: bytes = b""
+ output_filename: str = "document.pdf"
+
+ # Determine input source
+ if file:
+ # Handle uploaded file
+ filename = file.filename or "upload"
+ suffix = Path(filename).suffix.lower()
+
+ # Save uploaded file to temp
+ tmp_path = Path(tempfile.mktemp(suffix=suffix))
+ try:
+ content = await file.read()
+ tmp_path.write_bytes(content)
+
+ if suffix in {".doc", ".docx"}:
+ # Convert Word to PDF
+ output_filename = f"{Path(filename).stem}.pdf"
+ pdf_bytes = await asyncio.to_thread(
+ word_to_pdf_bytes,
+ tmp_path,
+ toc=toc,
+ header_text=header_text,
+ footer_text=footer_text,
+ logo_url=logo_url,
+ copyright_text=copyright_text,
+ filename_text=filename_text or Path(filename).stem,
+ cover_src=cover_src,
+ product_name=product_name,
+ document_name=document_name,
+ product_version=product_version,
+ document_version=document_version,
+ )
+ elif suffix in {".md", ".markdown"}:
+ # Convert Markdown file to PDF
+ output_filename = f"{Path(filename).stem}.pdf"
+ pdf_bytes = await asyncio.to_thread(
+ markdown_file_to_pdf_bytes,
+ tmp_path,
+ toc=toc,
+ header_text=header_text,
+ footer_text=footer_text,
+ logo_url=logo_url,
+ copyright_text=copyright_text,
+ filename_text=filename_text or Path(filename).stem,
+ cover_src=cover_src,
+ product_name=product_name,
+ document_name=document_name,
+ product_version=product_version,
+ document_version=document_version,
+ css_name=css_name,
+ css_text=css_text,
+ )
+ else:
+ return _err(f"不支持的文件格式: {suffix}。支持的格式: .doc, .docx, .md")
+ finally:
+ try:
+ tmp_path.unlink(missing_ok=True)
+ except Exception:
+ pass
+
+ elif file_path:
+ # Handle local file path
+ path = Path(file_path).expanduser()
+ if not path.exists():
+ return _err(f"文件不存在: {file_path}")
+
+ suffix = path.suffix.lower()
+ output_filename = f"{path.stem}.pdf"
+
+ if suffix in {".doc", ".docx"}:
+ pdf_bytes = await asyncio.to_thread(
+ word_to_pdf_bytes,
+ path,
+ toc=toc,
+ header_text=header_text,
+ footer_text=footer_text,
+ logo_url=logo_url,
+ copyright_text=copyright_text,
+ filename_text=filename_text or path.stem,
+ cover_src=cover_src,
+ product_name=product_name,
+ document_name=document_name,
+ product_version=product_version,
+ document_version=document_version,
+ )
+ elif suffix in {".md", ".markdown"}:
+ pdf_bytes = await asyncio.to_thread(
+ markdown_file_to_pdf_bytes,
+ path,
+ toc=toc,
+ header_text=header_text,
+ footer_text=footer_text,
+ logo_url=logo_url,
+ copyright_text=copyright_text,
+ filename_text=filename_text or path.stem,
+ cover_src=cover_src,
+ product_name=product_name,
+ document_name=document_name,
+ product_version=product_version,
+ document_version=document_version,
+ css_name=css_name,
+ css_text=css_text,
+ )
+ else:
+ return _err(f"不支持的文件格式: {suffix}。支持的格式: .doc, .docx, .md")
+
+ elif markdown_content:
+ # Handle direct markdown content
+ output_filename = f"{filename_text or 'document'}.pdf"
+ pdf_bytes = await asyncio.to_thread(
+ markdown_to_pdf_bytes,
+ markdown_content,
+ toc=toc,
+ header_text=header_text,
+ footer_text=footer_text,
+ logo_url=logo_url,
+ copyright_text=copyright_text,
+ filename_text=filename_text,
+ cover_src=cover_src,
+ product_name=product_name,
+ document_name=document_name,
+ product_version=product_version,
+ document_version=document_version,
+ css_name=css_name,
+ css_text=css_text,
+ )
+ else:
+ return _err("必须提供 file、file_path 或 markdown_content 中的一个")
+
+ if not pdf_bytes:
+ return _err("PDF 转换失败,未生成内容")
+
+ # Return PDF file
+ if download:
+ from fastapi.responses import StreamingResponse
+ return StreamingResponse(
+ io.BytesIO(pdf_bytes),
+ media_type="application/pdf",
+ headers={
+ "Content-Disposition": f"attachment; filename=\"{output_filename}\""
+ }
+ )
+ else:
+ # Return as base64 in JSON
+ import base64
+ return _ok({
+ "pdf_base64": base64.b64encode(pdf_bytes).decode("ascii"),
+ "filename": output_filename,
+ "size": len(pdf_bytes)
+ })
+
+ except Exception as e:
+ logging.exception("PDF conversion error")
+ return _err(f"PDF 转换失败: {str(e)}")
+
@app.post("/api/import/convert")
async def api_import_convert(json_file: UploadFile = File(None), json_text: Optional[str] = Form(None), path: Optional[str] = Form(None), versionId: Optional[int] = Form(1001), download: Optional[bool] = Form(False)):
try:
diff --git a/docling/app/services/pdf_converter.py b/docling/app/services/pdf_converter.py
new file mode 100644
index 0000000..d6c65df
--- /dev/null
+++ b/docling/app/services/pdf_converter.py
@@ -0,0 +1,198 @@
+"""
+PDF Conversion Service
+Provides Word-to-PDF and Markdown-to-PDF conversion functionality
+"""
+from pathlib import Path
+from typing import Optional, Dict, Tuple
+import io
+import tempfile
+import logging
+
+from docx import Document as DocxDocument
+
+from app.services.word2markdown import convert_any as convert_word_to_md
+from app.services.docling_adapter import md_to_pdf_bytes_with_renderer
+
+
+logger = logging.getLogger(__name__)
+
+
+def word_to_pdf_bytes(
+ file_path: str | Path,
+ toc: bool = False,
+ header_text: Optional[str] = None,
+ footer_text: Optional[str] = None,
+ logo_url: Optional[str] = None,
+ copyright_text: Optional[str] = None,
+ filename_text: Optional[str] = None,
+ cover_src: Optional[str] = None,
+ product_name: Optional[str] = None,
+ document_name: Optional[str] = None,
+ product_version: Optional[str] = None,
+ document_version: Optional[str] = None,
+) -> bytes:
+ """
+ Convert Word document (.doc, .docx) to PDF
+
+ Args:
+ file_path: Path to Word file
+ toc: Enable table of contents
+ header_text: Custom header text
+ footer_text: Custom footer text
+ logo_url: URL to logo image
+ copyright_text: Copyright notice
+ filename_text: Filename to display
+ cover_src: Cover image source
+ product_name: Product name for cover
+ document_name: Document name for cover
+ product_version: Product version
+ document_version: Document version
+
+ Returns:
+ PDF file as bytes
+ """
+ logger.info(f"Converting Word to PDF: {file_path}")
+
+ # Convert Word to Markdown first
+ path = Path(file_path)
+ _, markdown_content = convert_word_to_md(path)
+
+ # Then convert Markdown to PDF
+ pdf_bytes = md_to_pdf_bytes_with_renderer(
+ md=markdown_content,
+ renderer="weasyprint",
+ toc=toc,
+ header_text=header_text,
+ footer_text=footer_text,
+ logo_url=logo_url,
+ copyright_text=copyright_text,
+ filename_text=filename_text or path.stem,
+ cover_src=cover_src,
+ product_name=product_name,
+ document_name=document_name,
+ product_version=product_version,
+ document_version=document_version,
+ )
+
+ logger.info(f"Word to PDF conversion complete: {len(pdf_bytes)} bytes")
+ return pdf_bytes
+
+
+def markdown_to_pdf_bytes(
+ markdown_content: str,
+ toc: bool = False,
+ header_text: Optional[str] = None,
+ footer_text: Optional[str] = None,
+ logo_url: Optional[str] = None,
+ copyright_text: Optional[str] = None,
+ filename_text: Optional[str] = None,
+ cover_src: Optional[str] = None,
+ product_name: Optional[str] = None,
+ document_name: Optional[str] = None,
+ product_version: Optional[str] = None,
+ document_version: Optional[str] = None,
+ css_name: Optional[str] = None,
+ css_text: Optional[str] = None,
+) -> bytes:
+ """
+ Convert Markdown content to PDF
+
+ Args:
+ markdown_content: Markdown text content
+ toc: Enable table of contents
+ header_text: Custom header text
+ footer_text: Custom footer text
+ logo_url: URL to logo image
+ copyright_text: Copyright notice
+ filename_text: Filename to display
+ cover_src: Cover image source
+ product_name: Product name for cover
+ document_name: Document name for cover
+ product_version: Product version
+ document_version: Document version
+ css_name: Name of CSS file in configs/styles
+ css_text: Custom CSS as string
+
+ Returns:
+ PDF file as bytes
+ """
+ logger.info("Converting Markdown to PDF")
+
+ pdf_bytes = md_to_pdf_bytes_with_renderer(
+ md=markdown_content,
+ renderer="weasyprint",
+ css_name=css_name,
+ css_text=css_text,
+ toc=toc,
+ header_text=header_text,
+ footer_text=footer_text,
+ logo_url=logo_url,
+ copyright_text=copyright_text,
+ filename_text=filename_text,
+ cover_src=cover_src,
+ product_name=product_name,
+ document_name=document_name,
+ product_version=product_version,
+ document_version=document_version,
+ )
+
+ logger.info(f"Markdown to PDF conversion complete: {len(pdf_bytes)} bytes")
+ return pdf_bytes
+
+
+def markdown_file_to_pdf_bytes(
+ file_path: str | Path,
+ encoding: str = "utf-8",
+ **kwargs
+) -> bytes:
+ """
+ Convert Markdown file to PDF
+
+ Args:
+ file_path: Path to Markdown file
+ encoding: File encoding (default: utf-8)
+ **kwargs: Additional arguments passed to markdown_to_pdf_bytes
+
+ Returns:
+ PDF file as bytes
+ """
+ path = Path(file_path)
+ markdown_content = path.read_text(encoding=encoding)
+
+ # Set default filename from file path if not provided
+ if "filename_text" not in kwargs or not kwargs.get("filename_text"):
+ kwargs["filename_text"] = path.stem
+
+ return markdown_to_pdf_bytes(markdown_content, **kwargs)
+
+
+def read_file_content(file_path: str | Path) -> Tuple[str, bytes]:
+ """
+ Read file content and detect content type
+
+ Args:
+ file_path: Path to file
+
+ Returns:
+ Tuple of (detected_type, content_bytes)
+ """
+ path = Path(file_path)
+ if not path.exists():
+ raise FileNotFoundError(f"File not found: {file_path}")
+
+ content_bytes = path.read_bytes()
+
+ # Detect by extension
+ ext = path.suffix.lower()
+ if ext in {".md", ".markdown"}:
+ return "markdown", content_bytes
+ elif ext in {".doc", ".docx"}:
+ return "word", content_bytes
+ elif ext in {".txt"}:
+ return "text", content_bytes
+ else:
+ # Try to detect by content
+ content_start = content_bytes[:8]
+ if content_start.startswith(b"PK\x03\x04"):
+ return "word", content_bytes
+ return "text", content_bytes