add:markdown 转 pdf

2026-01-13 22:56:22 +08:00
parent 0b07e63b76
commit cecc8c65be
3 changed files with 816 additions and 0 deletions
--- a/docling/PDF_API_USAGE.md
+++ b/docling/PDF_API_USAGE.md
@@ -0,0 +1,425 @@
 # Word/Markdown 转 PDF API 使用指南
 ## API 端点
 ```
 POST /api/pdf/convert
 ```
 ## 支持的输入方式
 ### 1. 上传文件
 支持上传 `.doc`、`.docx`、`.md`（或 `.markdown`）文件：
 ```javascript
 // 上传 Word 文件
 const formData = new FormData();
 formData.append('file', fileInput.files[0]); // .doc 或 .docx
 // 可选参数
 formData.append('toc', 'true');              // 生成目录
 formData.append('header_text', '文档标题|页码');  // 页眉
 formData.append('footer_text', '版权信息');   // 页脚
 formData.append('filename_text', '我的文档');  // 文件名
 const response = await fetch('/api/pdf/convert', {
  method: 'POST',
  body: formData
 });
 const blob = await response.blob();
 const url = URL.createObjectURL(blob);
 ```
 ### 2. 指定本地文件路径
 ```javascript
 const formData = new FormData();
 formData.append('file_path', '/path/to/document.docx');
 formData.append('toc', 'true');
 const response = await fetch('/api/pdf/convert', {
  method: 'POST',
  body: formData
 });
 ```
 ### 3. 直接提交 Markdown 内容
 ```javascript
 const formData = new FormData();
 formData.append('markdown_content', '# 标题\n\n这是内容');
 formData.append('filename_text', '我的文档');
 const response = await fetch('/api/pdf/convert', {
  method: 'POST',
  body: formData
 });
 ```
 ## 完整参数列表
 | 参数 | 类型 | 必填 | 说明 |
 |------|------|------|------|
 | file | File | 否* | 上传的文件 |
 | file_path | string | 否* | 服务器上的本地文件路径（由服务端读取，而非客户端本机路径） |
 | markdown_content | string | 否* | Markdown 内容 |
 | toc | boolean | 否 | 是否生成目录，默认 false |
 | header_text | string | 否 | 页眉文本，可用 `\|` 分隔左右 |
 | footer_text | string | 否 | 页脚文本 |
 | logo_url | string | 否 | Logo 图片 URL |
 | copyright_text | string | 否 | 版权声明 |
 | filename_text | string | 否 | 显示的文件名 |
 | cover_src | string | 否 | 封面图片 URL |
 | product_name | string | 否 | 产品名称（封面） |
 | document_name | string | 否 | 文档名称（封面） |
 | product_version | string | 否 | 产品版本 |
 | document_version | string | 否 | 文档版本 |
 | css_name | string | 否 | CSS 样式名称（仅对 Markdown 输入生效） |
 | css_text | string | 否 | 自定义 CSS（仅对 Markdown 输入生效） |
 | download | boolean | 否 | 是否直接下载，默认 true |
 *注：file、file_path、markdown_content 三者必选其一；若同时提供多个，按 file > file_path > markdown_content 的优先级只处理第一个
 ## 完整示例代码
 ### React 示例
 ```jsx
 import { useState, useRef } from 'react';
 function PdfConverter() {
  const [loading, setLoading] = useState(false);
  const fileInput = useRef(null);
  const convertToPdf = async () => {
    const file = fileInput.current.files[0];
    if (!file) return;
    setLoading(true);
    const formData = new FormData();
    formData.append('file', file);
    formData.append('toc', 'true');
    formData.append('header_text', '我的文档|第 {page} 页');
    formData.append('footer_text', '© 2024 公司名称');
    formData.append('filename_text', file.name.replace(/\.[^/.]+$/, ''));
    try {
      const response = await fetch('/api/pdf/convert', {
        method: 'POST',
        body: formData
      });
      if (!response.ok) {
        const error = await response.json();
        throw new Error(error.detail || '转换失败');
      }
      const blob = await response.blob();
      const url = URL.createObjectURL(blob);
      // 下载 PDF
      const a = document.createElement('a');
      a.href = url;
      a.download = file.name.replace(/\.[^/.]+$/, '') + '.pdf';
      document.body.appendChild(a);
      a.click();
      document.body.removeChild(a);
      URL.revokeObjectURL(url);
    } catch (error) {
      console.error('转换失败:', error);
      alert('转换失败: ' + error.message);
    } finally {
      setLoading(false);
    }
  };
  return (
    <div>
      <input
        type="file"
        ref={fileInput}
        accept=".doc,.docx,.md"
      />
      <button
        onClick={convertToPdf}
        disabled={loading}
      >
        {loading ? '转换中...' : '转换为 PDF'}
      </button>
    </div>
  );
 }
 ```
 ### Vue 3 示例
 ```vue
 <template>
  <div>
    <input
      ref="fileInput"
      type="file"
      accept=".doc,.docx,.md"
    />
    <button
      @click="convertToPdf"
      :disabled="loading"
    >
      {{ loading ? '转换中...' : '转换为 PDF' }}
    </button>
  </div>
 </template>
 <script setup>
 import { ref } from 'vue';
 const fileInput = ref(null);
 const loading = ref(false);
 const convertToPdf = async () => {
  const file = fileInput.value.files[0];
  if (!file) return;
  loading.value = true;
  const formData = new FormData();
  formData.append('file', file);
  formData.append('toc', 'true');
  formData.append('header_text', '我的文档');
  formData.append('filename_text', file.name.replace(/\.[^/.]+$/, ''));
  try {
    const response = await fetch('/api/pdf/convert', {
      method: 'POST',
      body: formData
    });
    if (!response.ok) {
      const error = await response.json();
      throw new Error(error.detail || '转换失败');
    }
    const blob = await response.blob();
    const url = URL.createObjectURL(blob);
    const a = document.createElement('a');
    a.href = url;
    a.download = file.name.replace(/\.[^/.]+$/, '') + '.pdf';
    document.body.appendChild(a);
    a.click();
    document.body.removeChild(a);
    URL.revokeObjectURL(url);
  } catch (error) {
    console.error('转换失败:', error);
    alert('转换失败: ' + error.message);
  } finally {
    loading.value = false;
  }
 };
 </script>
 ```
 ### 原生 JavaScript 示例
 ```html
 <!DOCTYPE html>
 <html>
 <head>
  <title>Word/Markdown 转 PDF</title>
 </head>
 <body>
  <h1>文档转 PDF</h1>
  <input type="file" id="fileInput" accept=".doc,.docx,.md">
  <button id="convertBtn">转换为 PDF</button>
  <div id="status" style="margin-top: 10px;"></div>
  <script>
    document.getElementById('convertBtn').addEventListener('click', async () => {
      const fileInput = document.getElementById('fileInput');
      const status = document.getElementById('status');
      const file = fileInput.files[0];
      if (!file) {
        status.textContent = '请选择文件';
        return;
      }
      status.textContent = '转换中...';
      const formData = new FormData();
      formData.append('file', file);
      formData.append('toc', 'true');
      formData.append('header_text', '我的文档|{page}');
      formData.append('footer_text', '© 2024');
      formData.append('filename_text', file.name.replace(/\.[^/.]+$/, ''));
      try {
        const response = await fetch('/api/pdf/convert', {
          method: 'POST',
          body: formData
        });
        if (!response.ok) {
          const error = await response.json();
          throw new Error(error.detail || '转换失败');
        }
        const blob = await response.blob();
        const url = URL.createObjectURL(blob);
        const a = document.createElement('a');
        a.href = url;
        a.download = file.name.replace(/\.[^/.]+$/, '') + '.pdf';
        document.body.appendChild(a);
        a.click();
        document.body.removeChild(a);
        URL.revokeObjectURL(url);
        status.textContent = '转换成功！';
      } catch (error) {
        console.error(error);
        status.textContent = '转换失败: ' + error.message;
      }
    });
  </script>
 </body>
 </html>
 ```
 ### Markdown 内容转 PDF 示例
 ```javascript
 async function markdownToPdf() {
  const markdownContent = `
 # 我的文档
 ## 第一章
 这是第一章的内容。
 ### 小节
 - 列表项 1
 - 列表项 2
 | 列1 | 列2 |
 |-----|-----|
 | A   | B   |
  `;
  const formData = new FormData();
  formData.append('markdown_content', markdownContent);
  formData.append('filename_text', '我的Markdown文档');
  formData.append('toc', 'true');
  formData.append('header_text', 'Markdown文档');
  formData.append('footer_text', '© 2024');
  const response = await fetch('/api/pdf/convert', {
    method: 'POST',
    body: formData
  });
  const blob = await response.blob();
  // 保存 PDF（saveAs 并非浏览器内置函数，需引入 FileSaver.js 等库）
  saveAs(blob, 'document.pdf');
 }
 ```
 ## Python 调用示例
 ```python
 import requests
 def convert_word_to_pdf(file_path, output_path):
    """将 Word 文件转换为 PDF"""
    with open(file_path, 'rb') as f:
        files = {'file': f}
        data = {
            'toc': 'true',
            'header_text': '我的文档',
            'footer_text': '© 2024',
            'filename_text': '文档名称'
        }
        response = requests.post(
            'http://localhost:8000/api/pdf/convert',
            files=files,
            data=data
        )
        if response.status_code == 200:
            with open(output_path, 'wb') as out:
                out.write(response.content)
            print(f"PDF 已保存到: {output_path}")
        else:
            print(f"转换失败: {response.text}")
 # 使用示例
 convert_word_to_pdf('document.docx', 'output.pdf')
 ```
 ## cURL 示例
 ```bash
 # 上传 Word 文件转 PDF
 curl -X POST http://localhost:8000/api/pdf/convert \
  -F "file=@document.docx" \
  -F "toc=true" \
  -F "header_text=我的文档" \
  -F "footer_text=© 2024" \
  -o output.pdf
 # Markdown 内容转 PDF
 curl -X POST http://localhost:8000/api/pdf/convert \
  -F $'markdown_content=# 标题\n\n这是内容' \
  -F "filename_text=文档" \
  -o output.pdf
 ```
 ## 错误处理
 API 返回的错误格式：
 ```json
 {
  "detail": "错误信息"
 }
 ```
 常见错误：
 | 错误信息 | 原因 | 解决方法 |
 |---------|------|----------|
 | 必须提供 file、file_path 或 markdown_content 中的一个 | 未提供输入 | 检查请求参数 |
 | 不支持的文件格式 | 文件格式错误 | 确保是 .doc/.docx/.md/.markdown |
 | 文件不存在 | 本地文件路径无效 | 检查 file_path 参数 |
 | PDF 转换失败 | 转换过程出错 | 查看服务器日志 |
 ## 返回格式
 ### download=true (默认)
 直接返回 PDF 文件流：
 ```
 Content-Type: application/pdf
 Content-Disposition: attachment; filename="document.pdf"
 ```
 ### download=false
 返回 JSON，包含 base64 编码的 PDF：
 ```json
 {
  "ok": true,
  "pdf_base64": "JVBERi0xLjQK...",
  "filename": "document.pdf",
  "size": 12345
 }
 ```
--- a/docling/app/server.py
+++ b/docling/app/server.py
@@ -58,6 +58,12 @@ from app.services.docling_adapter import (
 )
 from app.services.unified_converter import FormatConverter
 from app.services.minio_utils import minio_current, join_prefix, presigned_read
 from app.services.pdf_converter import (
    word_to_pdf_bytes,
    markdown_to_pdf_bytes,
    markdown_file_to_pdf_bytes,
    read_file_content,
 )
 """
@api Server Application
@@ -2561,6 +2567,193 @@ async def api_convert(
    except Exception as e:
        return _err(str(e))
_PDF_WORD_SUFFIXES = frozenset({".doc", ".docx"})
_PDF_MARKDOWN_SUFFIXES = frozenset({".md", ".markdown"})


def _pdf_content_disposition(filename: str) -> str:
    """Build an ``attachment`` Content-Disposition value for *filename*.

    Header values are sent as latin-1, so a non-ASCII name (for example a
    Chinese document title) cannot be placed inside ``filename="..."``
    directly. The returned value carries an ASCII-only fallback plus the
    RFC 5987 ``filename*`` form holding the percent-encoded UTF-8 name.
    """
    from urllib.parse import quote

    # Drop control characters, quotes and path separators so the name can
    # neither terminate the quoted string nor inject additional headers.
    cleaned = "".join(
        ch for ch in filename if ch.isprintable() and ch not in '"\\/'
    ).strip()
    if not cleaned:
        cleaned = "document.pdf"
    stem = cleaned[:-4] if cleaned.lower().endswith(".pdf") else cleaned
    ascii_stem = stem.encode("ascii", "ignore").decode("ascii").strip(" .")
    fallback = f"{ascii_stem or 'document'}.pdf"
    encoded = quote(cleaned, safe="")
    return f"attachment; filename=\"{fallback}\"; filename*=UTF-8''{encoded}"


async def _render_pdf_from_path(
    source: Path,
    suffix: str,
    render_opts: dict,
    css_opts: dict,
) -> bytes:
    """Convert the Word or Markdown file at *source* to PDF bytes in a worker thread.

    *suffix* must already have been checked against the supported suffix sets.
    CSS options are forwarded for Markdown sources only; the Word call passes
    just the shared rendering options.
    """
    if suffix in _PDF_WORD_SUFFIXES:
        return await asyncio.to_thread(word_to_pdf_bytes, source, **render_opts)
    return await asyncio.to_thread(
        markdown_file_to_pdf_bytes, source, **render_opts, **css_opts
    )


@app.post("/api/pdf/convert")
async def api_pdf_convert(
    file: Optional[UploadFile] = File(None),
    file_path: Optional[str] = Form(None),
    markdown_content: Optional[str] = Form(None),
    toc: bool = Form(False),
    header_text: Optional[str] = Form(None),
    footer_text: Optional[str] = Form(None),
    logo_url: Optional[str] = Form(None),
    copyright_text: Optional[str] = Form(None),
    filename_text: Optional[str] = Form(None),
    cover_src: Optional[str] = Form(None),
    product_name: Optional[str] = Form(None),
    document_name: Optional[str] = Form(None),
    product_version: Optional[str] = Form(None),
    document_version: Optional[str] = Form(None),
    css_name: Optional[str] = Form(None),
    css_text: Optional[str] = Form(None),
    download: bool = Form(True),
):
    """
    Convert Word or Markdown to PDF.

    Exactly one input source is used, checked in this order:
    1. ``file``: uploaded Word (.doc/.docx) or Markdown (.md/.markdown) file
    2. ``file_path``: path of a file on the server's filesystem
    3. ``markdown_content``: Markdown text supplied directly

    The remaining form fields are rendering options passed through to the
    converters; ``css_name`` and ``css_text`` are forwarded for Markdown
    input only.

    Returns:
        With ``download`` true (the default), a streaming ``application/pdf``
        attachment. Otherwise the result of ``_ok`` carrying the
        base64-encoded PDF, its filename and its size. Any failure is
        reported through ``_err``.
    """
    try:
        pdf_bytes: bytes = b""
        output_filename: str = "document.pdf"
        supported = _PDF_WORD_SUFFIXES | _PDF_MARKDOWN_SUFFIXES

        # Rendering options shared by every converter.
        render_opts = {
            "toc": toc,
            "header_text": header_text,
            "footer_text": footer_text,
            "logo_url": logo_url,
            "copyright_text": copyright_text,
            "filename_text": filename_text,
            "cover_src": cover_src,
            "product_name": product_name,
            "document_name": document_name,
            "product_version": product_version,
            "document_version": document_version,
        }
        css_opts = {"css_name": css_name, "css_text": css_text}

        if file:
            # Uploaded file: validate the extension before touching the disk.
            filename = file.filename or "upload"
            suffix = Path(filename).suffix.lower()
            if suffix not in supported:
                return _err(f"不支持的文件格式: {suffix}。支持的格式: .doc, .docx, .md")
            stem = Path(filename).stem
            output_filename = f"{stem}.pdf"
            render_opts["filename_text"] = filename_text or stem

            content = await file.read()
            tmp_path: Optional[Path] = None
            try:
                # NamedTemporaryFile creates the file atomically, unlike the
                # deprecated, race-prone tempfile.mktemp. delete=False because
                # the converters reopen the file by path after it is closed.
                with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
                    tmp_path = Path(tmp.name)
                    tmp.write(content)
                pdf_bytes = await _render_pdf_from_path(
                    tmp_path, suffix, render_opts, css_opts
                )
            finally:
                if tmp_path is not None:
                    try:
                        tmp_path.unlink(missing_ok=True)
                    except OSError:
                        # Cleanup is best effort; never mask the conversion result.
                        logging.warning(
                            "Failed to remove temporary upload %s",
                            tmp_path,
                            exc_info=True,
                        )
        elif file_path:
            # SECURITY NOTE(review): file_path is caller-supplied and is read
            # from the server's filesystem without any directory restriction.
            # Confirm this endpoint is only reachable by trusted callers, or
            # confine the path to an allowed base directory.
            path = Path(file_path).expanduser()
            if not path.is_file():
                return _err(f"文件不存在: {file_path}")
            suffix = path.suffix.lower()
            if suffix not in supported:
                return _err(f"不支持的文件格式: {suffix}。支持的格式: .doc, .docx, .md")
            output_filename = f"{path.stem}.pdf"
            render_opts["filename_text"] = filename_text or path.stem
            pdf_bytes = await _render_pdf_from_path(path, suffix, render_opts, css_opts)
        elif markdown_content:
            # Markdown text supplied directly in the form.
            output_filename = f"{filename_text or 'document'}.pdf"
            pdf_bytes = await asyncio.to_thread(
                markdown_to_pdf_bytes,
                markdown_content,
                **render_opts,
                **css_opts,
            )
        else:
            return _err("必须提供 file、file_path 或 markdown_content 中的一个")

        if not pdf_bytes:
            return _err("PDF 转换失败，未生成内容")

        if download:
            from fastapi.responses import StreamingResponse

            return StreamingResponse(
                io.BytesIO(pdf_bytes),
                media_type="application/pdf",
                headers={
                    "Content-Disposition": _pdf_content_disposition(output_filename)
                },
            )

        # download is false: return the PDF base64-encoded inside JSON.
        import base64

        return _ok({
            "pdf_base64": base64.b64encode(pdf_bytes).decode("ascii"),
            "filename": output_filename,
            "size": len(pdf_bytes),
        })
    except Exception as e:
        logging.exception("PDF conversion error")
        return _err(f"PDF 转换失败: {str(e)}")
@app.post("/api/import/convert")
 async def api_import_convert(json_file: UploadFile = File(None), json_text: Optional[str] = Form(None), path: Optional[str] = Form(None), versionId: Optional[int] = Form(1001), download: Optional[bool] = Form(False)):
    try:
--- a/docling/app/services/pdf_converter.py
+++ b/docling/app/services/pdf_converter.py
@@ -0,0 +1,198 @@
 """
 PDF Conversion Service
 Provides Word-to-PDF and Markdown-to-PDF conversion functionality
 """
 from pathlib import Path
 from typing import Optional, Dict, Tuple
 import io
 import tempfile
 import logging
 from docx import Document as DocxDocument
 from app.services.word2markdown import convert_any as convert_word_to_md
 from app.services.docling_adapter import md_to_pdf_bytes_with_renderer
 logger = logging.getLogger(__name__)
 def word_to_pdf_bytes(
     file_path: str | Path,
     toc: bool = False,
     header_text: Optional[str] = None,
     footer_text: Optional[str] = None,
     logo_url: Optional[str] = None,
     copyright_text: Optional[str] = None,
     filename_text: Optional[str] = None,
     cover_src: Optional[str] = None,
     product_name: Optional[str] = None,
     document_name: Optional[str] = None,
     product_version: Optional[str] = None,
     document_version: Optional[str] = None,
     css_name: Optional[str] = None,
     css_text: Optional[str] = None,
 ) -> bytes:
     """
     Convert a Word document (.doc, .docx) to PDF.

     The document is first converted to Markdown, and that Markdown is then
     rendered to PDF with the "weasyprint" renderer.

     Args:
         file_path: Path to Word file
         toc: Enable table of contents
         header_text: Custom header text
         footer_text: Custom footer text
         logo_url: URL to logo image
         copyright_text: Copyright notice
         filename_text: Filename to display; defaults to the file's stem
         cover_src: Cover image source
         product_name: Product name for cover
         document_name: Document name for cover
         product_version: Product version
         document_version: Document version
         css_name: Optional CSS name, forwarded to the renderer only when given
         css_text: Optional custom CSS, forwarded to the renderer only when given
     Returns:
         PDF file as bytes
     """
     logger.info("Converting Word to PDF: %s", file_path)

     path = Path(file_path)
     # The converter returns a pair; only its second item (the Markdown text)
     # is needed here.
     _, markdown_content = convert_word_to_md(path)

     render_kwargs = {
         "md": markdown_content,
         "renderer": "weasyprint",
         "toc": toc,
         "header_text": header_text,
         "footer_text": footer_text,
         "logo_url": logo_url,
         "copyright_text": copyright_text,
         "filename_text": filename_text or path.stem,
         "cover_src": cover_src,
         "product_name": product_name,
         "document_name": document_name,
         "product_version": product_version,
         "document_version": document_version,
     }
     # Forward CSS overrides only when supplied, so the renderer's own
     # defaults apply whenever the caller omits them.
     if css_name is not None:
         render_kwargs["css_name"] = css_name
     if css_text is not None:
         render_kwargs["css_text"] = css_text

     pdf_bytes = md_to_pdf_bytes_with_renderer(**render_kwargs)
     logger.info("Word to PDF conversion complete: %d bytes", len(pdf_bytes))
     return pdf_bytes
 def markdown_to_pdf_bytes(
     markdown_content: str,
     toc: bool = False,
     header_text: Optional[str] = None,
     footer_text: Optional[str] = None,
     logo_url: Optional[str] = None,
     copyright_text: Optional[str] = None,
     filename_text: Optional[str] = None,
     cover_src: Optional[str] = None,
     product_name: Optional[str] = None,
     document_name: Optional[str] = None,
     product_version: Optional[str] = None,
     document_version: Optional[str] = None,
     css_name: Optional[str] = None,
     css_text: Optional[str] = None,
 ) -> bytes:
     """
     Render Markdown text to PDF using the "weasyprint" renderer.

     Every option is handed to the renderer unchanged.

     Args:
         markdown_content: Markdown text content
         toc: Enable table of contents
         header_text: Custom header text
         footer_text: Custom footer text
         logo_url: URL to logo image
         copyright_text: Copyright notice
         filename_text: Filename to display
         cover_src: Cover image source
         product_name: Product name for cover
         document_name: Document name for cover
         product_version: Product version
         document_version: Document version
         css_name: Name of CSS file in configs/styles
         css_text: Custom CSS as string
     Returns:
         PDF file as bytes
     """
     logger.info("Converting Markdown to PDF")

     # Collect the renderer arguments first so the call itself stays short.
     renderer_args = dict(
         md=markdown_content,
         renderer="weasyprint",
         css_name=css_name,
         css_text=css_text,
         toc=toc,
         header_text=header_text,
         footer_text=footer_text,
         logo_url=logo_url,
         copyright_text=copyright_text,
         filename_text=filename_text,
         cover_src=cover_src,
         product_name=product_name,
         document_name=document_name,
         product_version=product_version,
         document_version=document_version,
     )
     rendered = md_to_pdf_bytes_with_renderer(**renderer_args)

     logger.info(f"Markdown to PDF conversion complete: {len(rendered)} bytes")
     return rendered
 def markdown_file_to_pdf_bytes(
     file_path: str | Path,
     encoding: str = "utf-8",
     **kwargs
 ) -> bytes:
     """
     Convert a Markdown file to PDF.

     Args:
         file_path: Path to Markdown file
         encoding: File encoding (default: utf-8)
         **kwargs: Additional arguments passed to markdown_to_pdf_bytes;
             a missing or empty ``filename_text`` defaults to the file's stem
     Returns:
         PDF file as bytes
     """
     path = Path(file_path)
     markdown_content = path.read_text(encoding=encoding)
     # Plain "utf-8" decoding keeps a byte-order mark as a leading U+FEFF,
     # which would sit in front of the document's first Markdown construct.
     if markdown_content.startswith("\ufeff"):
         markdown_content = markdown_content[1:]

     # Fall back to the file's stem when no display name was supplied.
     if not kwargs.get("filename_text"):
         kwargs["filename_text"] = path.stem

     return markdown_to_pdf_bytes(markdown_content, **kwargs)
 def read_file_content(file_path: str | Path) -> Tuple[str, bytes]:
    """
    Read file content and detect content type
    Args:
        file_path: Path to file
    Returns:
        Tuple of (detected_type, content_bytes)
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")
    content_bytes = path.read_bytes()
    # Detect by extension
    ext = path.suffix.lower()
    if ext in {".md", ".markdown"}:
        return "markdown", content_bytes
    elif ext in {".doc", ".docx"}:
        return "word", content_bytes
    elif ext in {".txt"}:
        return "text", content_bytes
    else:
        # Try to detect by content
        content_start = content_bytes[:8]
        if content_start.startswith(b"PK\x03\x04"):
            return "word", content_bytes
        return "text", content_bytes