diff --git a/docling/PDF_API_USAGE.md b/docling/PDF_API_USAGE.md new file mode 100644 index 0000000..bc89ab7 --- /dev/null +++ b/docling/PDF_API_USAGE.md @@ -0,0 +1,425 @@ +# Word/Markdown 转 PDF API 使用指南 + +## API 端点 + +``` +POST /api/pdf/convert +``` + +## 支持的输入方式 + +### 1. 上传文件 + +支持上传 `.doc`, `.docx`, `.md` 文件: + +```javascript +// 上传 Word 文件 +const formData = new FormData(); +formData.append('file', fileInput.files[0]); // .doc 或 .docx + +// 可选参数 +formData.append('toc', 'true'); // 生成目录 +formData.append('header_text', '文档标题|页码'); // 页眉 +formData.append('footer_text', '版权信息'); // 页脚 +formData.append('filename_text', '我的文档'); // 文件名 + +const response = await fetch('/api/pdf/convert', { + method: 'POST', + body: formData +}); + +const blob = await response.blob(); +const url = URL.createObjectURL(blob); +``` + +### 2. 指定本地文件路径 + +```javascript +const formData = new FormData(); +formData.append('file_path', '/path/to/document.docx'); +formData.append('toc', 'true'); + +const response = await fetch('/api/pdf/convert', { + method: 'POST', + body: formData +}); +``` + +### 3. 直接提交 Markdown 内容 + +```javascript +const formData = new FormData(); +formData.append('markdown_content', '# 标题\n\n这是内容'); +formData.append('filename_text', '我的文档'); + +const response = await fetch('/api/pdf/convert', { + method: 'POST', + body: formData +}); +``` + +## 完整参数列表 + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| file | File | 否* | 上传的文件 | +| file_path | string | 否* | 本地文件路径 | +| markdown_content | string | 否* | Markdown 内容 | +| toc | boolean | 否 | 是否生成目录,默认 false | +| header_text | string | 否 | 页眉文本,可用 `\|` 分隔左右 | +| footer_text | string | 否 | 页脚文本 | +| logo_url | string | 否 | Logo 图片 URL | +| copyright_text | string | 否 | 版权声明 | +| filename_text | string | 否 | 显示的文件名 | +| cover_src | string | 否 | 封面图片 URL | +| product_name | string | 否 | 产品名称(封面) | +| document_name | string | 否 | 文档名称(封面) | +| product_version | string | 否 | 产品版本 | +| document_version | string | 否 | 文档版本 | +| css_name | string | 否 | CSS 样式名称 | +| css_text | string | 否 | 自定义 CSS | +| download | boolean | 否 | 是否直接下载,默认 true | + +*注:file、file_path、markdown_content 三者必选其一 + +## 完整示例代码 + +### React 示例 + +```jsx +import { useState, useRef } from 'react'; + +function PdfConverter() { + const [loading, setLoading] = useState(false); + const fileInput = useRef(null); + + const convertToPdf = async () => { + const file = fileInput.current.files[0]; + if (!file) return; + + setLoading(true); + + const formData = new FormData(); + formData.append('file', file); + formData.append('toc', 'true'); + formData.append('header_text', '我的文档|第 {page} 页'); + formData.append('footer_text', '© 2024 公司名称'); + formData.append('filename_text', file.name.replace(/\.[^/.]+$/, '')); + + try { + const response = await fetch('/api/pdf/convert', { + method: 'POST', + body: formData + }); + + if (!response.ok) { + const error = await response.json(); + throw new Error(error.detail || '转换失败'); + } + + const blob = await response.blob(); + const url = URL.createObjectURL(blob); + + // 下载 PDF + const a = document.createElement('a'); + a.href = url; + a.download = file.name.replace(/\.[^/.]+$/, '') + '.pdf'; + document.body.appendChild(a); + a.click(); + document.body.removeChild(a); + URL.revokeObjectURL(url); + } catch (error) { + console.error('转换失败:', error); + alert('转换失败: ' + error.message); + } finally { + setLoading(false); + } + }; + + return ( +
+ + +
+ ); +} +``` + +### Vue 3 示例 + +```vue + + + +``` + +### 原生 JavaScript 示例 + +```html + + + + Word/Markdown 转 PDF + + +

文档转 PDF

+ + + + +
+ + + + +``` + +### Markdown 内容转 PDF 示例 + +```javascript +async function markdownToPdf() { + const markdownContent = ` +# 我的文档 + +## 第一章 + +这是第一章的内容。 + +### 小节 + +- 列表项 1 +- 列表项 2 + +| 列1 | 列2 | +|-----|-----| +| A | B | + `; + + const formData = new FormData(); + formData.append('markdown_content', markdownContent); + formData.append('filename_text', '我的Markdown文档'); + formData.append('toc', 'true'); + formData.append('header_text', 'Markdown文档'); + formData.append('footer_text', '© 2024'); + + const response = await fetch('/api/pdf/convert', { + method: 'POST', + body: formData + }); + + const blob = await response.blob(); + // 保存 PDF + saveAs(blob, 'document.pdf'); +} +``` + +## Python 调用示例 + +```python +import requests + +def convert_word_to_pdf(file_path, output_path): + """将 Word 文件转换为 PDF""" + with open(file_path, 'rb') as f: + files = {'file': f} + data = { + 'toc': 'true', + 'header_text': '我的文档', + 'footer_text': '© 2024', + 'filename_text': '文档名称' + } + + response = requests.post( + 'http://localhost:8000/api/pdf/convert', + files=files, + data=data + ) + + if response.status_code == 200: + with open(output_path, 'wb') as out: + out.write(response.content) + print(f"PDF 已保存到: {output_path}") + else: + print(f"转换失败: {response.text}") + +# 使用示例 +convert_word_to_pdf('document.docx', 'output.pdf') +``` + +## cURL 示例 + +```bash +# 上传 Word 文件转 PDF +curl -X POST http://localhost:8000/api/pdf/convert \ + -F "file=@document.docx" \ + -F "toc=true" \ + -F "header_text=我的文档" \ + -F "footer_text=© 2024" \ + -o output.pdf + +# Markdown 内容转 PDF +curl -X POST http://localhost:8000/api/pdf/convert \ + -F "markdown_content=# 标题\n\n这是内容" \ + -F "filename_text=文档" \ + -o output.pdf +``` + +## 错误处理 + +API 返回的错误格式: + +```json +{ + "detail": "错误信息" +} +``` + +常见错误: + +| 错误信息 | 原因 | 解决方法 | +|---------|------|----------| +| 必须提供 file、file_path 或 markdown_content 中的一个 | 未提供输入 | 检查请求参数 | +| 不支持的文件格式 | 文件格式错误 | 确保是 .doc/.docx/.md | +| 文件不存在 | 本地文件路径无效 | 检查 file_path 参数 | +| PDF 转换失败 | 转换过程出错 | 查看服务器日志 | + +## 返回格式 + +### download=true (默认) + +直接返回 PDF 文件流: + +``` +Content-Type: application/pdf +Content-Disposition: attachment; filename="document.pdf" +``` + +### download=false + +返回 JSON,包含 base64 编码的 PDF: + +```json +{ + "ok": true, + "pdf_base64": "JVBERi0xLjQK...", + "filename": "document.pdf", + "size": 12345 +} +``` diff --git a/docling/app/server.py b/docling/app/server.py index 8ab2c08..1860d14 100644 --- a/docling/app/server.py +++ b/docling/app/server.py @@ -58,6 +58,12 @@ from app.services.docling_adapter import ( ) from app.services.unified_converter import FormatConverter from app.services.minio_utils import minio_current, join_prefix, presigned_read +from app.services.pdf_converter import ( + word_to_pdf_bytes, + markdown_to_pdf_bytes, + markdown_file_to_pdf_bytes, + read_file_content, +) """ @api Server Application @@ -2561,6 +2567,193 @@ async def api_convert( except Exception as e: return _err(str(e)) +@app.post("/api/pdf/convert") +async def api_pdf_convert( + file: Optional[UploadFile] = File(None), + file_path: Optional[str] = Form(None), + markdown_content: Optional[str] = Form(None), + toc: bool = Form(False), + header_text: Optional[str] = Form(None), + footer_text: Optional[str] = Form(None), + logo_url: Optional[str] = Form(None), + copyright_text: Optional[str] = Form(None), + filename_text: Optional[str] = Form(None), + cover_src: Optional[str] = Form(None), + product_name: Optional[str] = Form(None), + document_name: Optional[str] = Form(None), + product_version: Optional[str] = Form(None), + document_version: Optional[str] = Form(None), + css_name: Optional[str] = Form(None), + css_text: Optional[str] = Form(None), + download: bool = Form(True), +): + """ + Convert Word or Markdown to PDF + + Supports three input methods: + 1. Upload file (Word .doc/.docx or Markdown .md) + 2. Specify file_path (local file path) + 3. Provide markdown_content directly + + Returns PDF file as download by default + """ + try: + pdf_bytes: bytes = b"" + output_filename: str = "document.pdf" + + # Determine input source + if file: + # Handle uploaded file + filename = file.filename or "upload" + suffix = Path(filename).suffix.lower() + + # Save uploaded file to temp + tmp_path = Path(tempfile.mktemp(suffix=suffix)) + try: + content = await file.read() + tmp_path.write_bytes(content) + + if suffix in {".doc", ".docx"}: + # Convert Word to PDF + output_filename = f"{Path(filename).stem}.pdf" + pdf_bytes = await asyncio.to_thread( + word_to_pdf_bytes, + tmp_path, + toc=toc, + header_text=header_text, + footer_text=footer_text, + logo_url=logo_url, + copyright_text=copyright_text, + filename_text=filename_text or Path(filename).stem, + cover_src=cover_src, + product_name=product_name, + document_name=document_name, + product_version=product_version, + document_version=document_version, + ) + elif suffix in {".md", ".markdown"}: + # Convert Markdown file to PDF + output_filename = f"{Path(filename).stem}.pdf" + pdf_bytes = await asyncio.to_thread( + markdown_file_to_pdf_bytes, + tmp_path, + toc=toc, + header_text=header_text, + footer_text=footer_text, + logo_url=logo_url, + copyright_text=copyright_text, + filename_text=filename_text or Path(filename).stem, + cover_src=cover_src, + product_name=product_name, + document_name=document_name, + product_version=product_version, + document_version=document_version, + css_name=css_name, + css_text=css_text, + ) + else: + return _err(f"不支持的文件格式: {suffix}。支持的格式: .doc, .docx, .md") + finally: + try: + tmp_path.unlink(missing_ok=True) + except Exception: + pass + + elif file_path: + # Handle local file path + path = Path(file_path).expanduser() + if not path.exists(): + return _err(f"文件不存在: {file_path}") + + suffix = path.suffix.lower() + output_filename = f"{path.stem}.pdf" + + if suffix in {".doc", ".docx"}: + pdf_bytes = await asyncio.to_thread( + word_to_pdf_bytes, + path, + toc=toc, + header_text=header_text, + footer_text=footer_text, + logo_url=logo_url, + copyright_text=copyright_text, + filename_text=filename_text or path.stem, + cover_src=cover_src, + product_name=product_name, + document_name=document_name, + product_version=product_version, + document_version=document_version, + ) + elif suffix in {".md", ".markdown"}: + pdf_bytes = await asyncio.to_thread( + markdown_file_to_pdf_bytes, + path, + toc=toc, + header_text=header_text, + footer_text=footer_text, + logo_url=logo_url, + copyright_text=copyright_text, + filename_text=filename_text or path.stem, + cover_src=cover_src, + product_name=product_name, + document_name=document_name, + product_version=product_version, + document_version=document_version, + css_name=css_name, + css_text=css_text, + ) + else: + return _err(f"不支持的文件格式: {suffix}。支持的格式: .doc, .docx, .md") + + elif markdown_content: + # Handle direct markdown content + output_filename = f"{filename_text or 'document'}.pdf" + pdf_bytes = await asyncio.to_thread( + markdown_to_pdf_bytes, + markdown_content, + toc=toc, + header_text=header_text, + footer_text=footer_text, + logo_url=logo_url, + copyright_text=copyright_text, + filename_text=filename_text, + cover_src=cover_src, + product_name=product_name, + document_name=document_name, + product_version=product_version, + document_version=document_version, + css_name=css_name, + css_text=css_text, + ) + else: + return _err("必须提供 file、file_path 或 markdown_content 中的一个") + + if not pdf_bytes: + return _err("PDF 转换失败,未生成内容") + + # Return PDF file + if download: + from fastapi.responses import StreamingResponse + return StreamingResponse( + io.BytesIO(pdf_bytes), + media_type="application/pdf", + headers={ + "Content-Disposition": f"attachment; filename=\"{output_filename}\"" + } + ) + else: + # Return as base64 in JSON + import base64 + return _ok({ + "pdf_base64": base64.b64encode(pdf_bytes).decode("ascii"), + "filename": output_filename, + "size": len(pdf_bytes) + }) + + except Exception as e: + logging.exception("PDF conversion error") + return _err(f"PDF 转换失败: {str(e)}") + @app.post("/api/import/convert") async def api_import_convert(json_file: UploadFile = File(None), json_text: Optional[str] = Form(None), path: Optional[str] = Form(None), versionId: Optional[int] = Form(1001), download: Optional[bool] = Form(False)): try: diff --git a/docling/app/services/pdf_converter.py b/docling/app/services/pdf_converter.py new file mode 100644 index 0000000..d6c65df --- /dev/null +++ b/docling/app/services/pdf_converter.py @@ -0,0 +1,198 @@ +""" +PDF Conversion Service +Provides Word-to-PDF and Markdown-to-PDF conversion functionality +""" +from pathlib import Path +from typing import Optional, Dict, Tuple +import io +import tempfile +import logging + +from docx import Document as DocxDocument + +from app.services.word2markdown import convert_any as convert_word_to_md +from app.services.docling_adapter import md_to_pdf_bytes_with_renderer + + +logger = logging.getLogger(__name__) + + +def word_to_pdf_bytes( + file_path: str | Path, + toc: bool = False, + header_text: Optional[str] = None, + footer_text: Optional[str] = None, + logo_url: Optional[str] = None, + copyright_text: Optional[str] = None, + filename_text: Optional[str] = None, + cover_src: Optional[str] = None, + product_name: Optional[str] = None, + document_name: Optional[str] = None, + product_version: Optional[str] = None, + document_version: Optional[str] = None, +) -> bytes: + """ + Convert Word document (.doc, .docx) to PDF + + Args: + file_path: Path to Word file + toc: Enable table of contents + header_text: Custom header text + footer_text: Custom footer text + logo_url: URL to logo image + copyright_text: Copyright notice + filename_text: Filename to display + cover_src: Cover image source + product_name: Product name for cover + document_name: Document name for cover + product_version: Product version + document_version: Document version + + Returns: + PDF file as bytes + """ + logger.info(f"Converting Word to PDF: {file_path}") + + # Convert Word to Markdown first + path = Path(file_path) + _, markdown_content = convert_word_to_md(path) + + # Then convert Markdown to PDF + pdf_bytes = md_to_pdf_bytes_with_renderer( + md=markdown_content, + renderer="weasyprint", + toc=toc, + header_text=header_text, + footer_text=footer_text, + logo_url=logo_url, + copyright_text=copyright_text, + filename_text=filename_text or path.stem, + cover_src=cover_src, + product_name=product_name, + document_name=document_name, + product_version=product_version, + document_version=document_version, + ) + + logger.info(f"Word to PDF conversion complete: {len(pdf_bytes)} bytes") + return pdf_bytes + + +def markdown_to_pdf_bytes( + markdown_content: str, + toc: bool = False, + header_text: Optional[str] = None, + footer_text: Optional[str] = None, + logo_url: Optional[str] = None, + copyright_text: Optional[str] = None, + filename_text: Optional[str] = None, + cover_src: Optional[str] = None, + product_name: Optional[str] = None, + document_name: Optional[str] = None, + product_version: Optional[str] = None, + document_version: Optional[str] = None, + css_name: Optional[str] = None, + css_text: Optional[str] = None, +) -> bytes: + """ + Convert Markdown content to PDF + + Args: + markdown_content: Markdown text content + toc: Enable table of contents + header_text: Custom header text + footer_text: Custom footer text + logo_url: URL to logo image + copyright_text: Copyright notice + filename_text: Filename to display + cover_src: Cover image source + product_name: Product name for cover + document_name: Document name for cover + product_version: Product version + document_version: Document version + css_name: Name of CSS file in configs/styles + css_text: Custom CSS as string + + Returns: + PDF file as bytes + """ + logger.info("Converting Markdown to PDF") + + pdf_bytes = md_to_pdf_bytes_with_renderer( + md=markdown_content, + renderer="weasyprint", + css_name=css_name, + css_text=css_text, + toc=toc, + header_text=header_text, + footer_text=footer_text, + logo_url=logo_url, + copyright_text=copyright_text, + filename_text=filename_text, + cover_src=cover_src, + product_name=product_name, + document_name=document_name, + product_version=product_version, + document_version=document_version, + ) + + logger.info(f"Markdown to PDF conversion complete: {len(pdf_bytes)} bytes") + return pdf_bytes + + +def markdown_file_to_pdf_bytes( + file_path: str | Path, + encoding: str = "utf-8", + **kwargs +) -> bytes: + """ + Convert Markdown file to PDF + + Args: + file_path: Path to Markdown file + encoding: File encoding (default: utf-8) + **kwargs: Additional arguments passed to markdown_to_pdf_bytes + + Returns: + PDF file as bytes + """ + path = Path(file_path) + markdown_content = path.read_text(encoding=encoding) + + # Set default filename from file path if not provided + if "filename_text" not in kwargs or not kwargs.get("filename_text"): + kwargs["filename_text"] = path.stem + + return markdown_to_pdf_bytes(markdown_content, **kwargs) + + +def read_file_content(file_path: str | Path) -> Tuple[str, bytes]: + """ + Read file content and detect content type + + Args: + file_path: Path to file + + Returns: + Tuple of (detected_type, content_bytes) + """ + path = Path(file_path) + if not path.exists(): + raise FileNotFoundError(f"File not found: {file_path}") + + content_bytes = path.read_bytes() + + # Detect by extension + ext = path.suffix.lower() + if ext in {".md", ".markdown"}: + return "markdown", content_bytes + elif ext in {".doc", ".docx"}: + return "word", content_bytes + elif ext in {".txt"}: + return "text", content_bytes + else: + # Try to detect by content + content_start = content_bytes[:8] + if content_start.startswith(b"PK\x03\x04"): + return "word", content_bytes + return "text", content_bytes