add:修改使用reportlab完成md转pdf

2026-01-15 23:45:46 +08:00
parent cecc8c65be
commit 0cc1a9484e
7 changed files with 436 additions and 299 deletions
--- a/docling/app/server.py
+++ b/docling/app/server.py
@@ -2734,11 +2734,17 @@ async def api_pdf_convert(
        # Return PDF file
        if download:
            from fastapi.responses import StreamingResponse
+            import urllib.parse
+
+            # 处理中文文件名 - 使用 URL 编码确保只包含 ASCII 字符
+            # 先将中文文件名进行百分比编码
+            safe_filename = urllib.parse.quote(output_filename, safe='')
+
            return StreamingResponse(
                io.BytesIO(pdf_bytes),
                media_type="application/pdf",
                headers={
-                    "Content-Disposition": f"attachment; filename=\"{output_filename}\""
+                    "Content-Disposition": f"attachment; filename={safe_filename}"
                }
            )
        else:
--- a/docling/app/services/docling_adapter.py
+++ b/docling/app/services/docling_adapter.py
@@ -38,6 +38,28 @@ except Exception:
    HTML = None
    CSS = None

+try:
+    from xhtml2pdf import pisa as _pisa  # type: ignore
+    _HAS_XHTML2PDF: bool = True
+except Exception:  # any import-time failure just marks xhtml2pdf as unavailable
+    _pisa = None  # type: ignore
+    _HAS_XHTML2PDF: bool = False
+
+# reportlab is used to generate PDFs that support Chinese text
+try:
+    from reportlab.lib.pagesizes import A4
+    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+    from reportlab.lib.units import mm
+    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
+    from reportlab.lib import colors
+    from reportlab.pdfbase import pdfmetrics
+    from reportlab.pdfbase.ttfonts import TTFont
+    from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_RIGHT
+    _HAS_REPORTLAB: bool = True
+except Exception:  # any import-time failure just marks reportlab as unavailable
+    A4 = None  # NOTE(review): only A4 gets a placeholder; the other reportlab names stay undefined, so check _HAS_REPORTLAB before using them
+    _HAS_REPORTLAB: bool = False
+
 _mdit: Any = None
 _tasklists_plugin: Any = None
 _deflist_plugin: Any = None
@@ -688,6 +710,214 @@ def _stylesheets_for(css_name: Optional[str], css_text: Optional[str]):
            sheets.append(CSS(filename=str(css_path)))
    return sheets

+def _reportlab_escape(text: str) -> str:
+    """Escape ``&``, ``<`` and ``>`` so *text* is safe inside reportlab paragraph markup."""
+    import html  # function-scope import: stdlib helper needed only by this renderer
+
+    return html.escape(text, quote=False)
+
+def _reportlab_inline_markup(text: str) -> str:
+    """Convert inline Markdown (bold, italic, code spans) to reportlab paragraph markup."""
+    # Escape first so literal '<', '>' and '&' coming from the document cannot
+    # be mistaken for markup; the tags inserted below are the only real ones.
+    markup = _reportlab_escape(text)
+    markup = re.sub(r'\*\*(.+?)\*\*', r'<b>\1</b>', markup)
+    markup = re.sub(r'\*(.+?)\*', r'<i>\1</i>', markup)
+    markup = re.sub(r'`(.+?)`', r'<font face="Courier">\1</font>', markup)
+    return markup
+
+def _reportlab_paragraph(text: str, style, prefix: str = ''):
+    """Build a Paragraph from one Markdown line, degrading to plain text on bad markup."""
+    try:
+        return Paragraph(prefix + _reportlab_inline_markup(text), style)
+    except ValueError:
+        # Unbalanced emphasis (e.g. "**a *b** c*") produces improperly nested
+        # tags that the paragraph parser rejects; render the line verbatim
+        # instead of failing the whole document.
+        return Paragraph(prefix + _reportlab_escape(text), style)
+
+def _reportlab_cjk_font() -> str:
+    """Return the name of a registered font that can render Chinese text.
+
+    Well-known system font files are tried in order and the first one that
+    registers successfully is used under the name ``'ChineseFont'``. When none
+    is usable the built-in ``'Helvetica'`` is returned (no CJK glyphs).
+    """
+    font_name = 'ChineseFont'
+    # Registering parses the whole font file, so reuse an earlier registration.
+    if font_name in pdfmetrics.getRegisteredFontNames():
+        return font_name
+
+    candidates = [
+        r"C:\Windows\Fonts\msyh.ttc",  # Windows: Microsoft YaHei
+        r"C:\Windows\Fonts\simhei.ttf",  # Windows: SimHei
+        r"C:\Windows\Fonts\simsun.ttc",  # Windows: SimSun
+        "/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc",  # Linux
+        "/System/Library/Fonts/PingFang.ttc",  # macOS
+    ]
+    for font_path in candidates:
+        if not Path(font_path).exists():
+            continue
+        try:
+            pdfmetrics.registerFont(TTFont(font_name, font_path, subfontIndex=0))
+        except Exception:
+            # Unreadable or unsupported font file: keep looking rather than
+            # giving up on every remaining candidate.
+            continue
+        return font_name
+    return 'Helvetica'
+
+def _render_pdf_with_reportlab(md: str) -> bytes:
+    """
+    Render Markdown to PDF bytes with reportlab (pure Python, Chinese capable).
+
+    Only a line-oriented subset of Markdown is understood: ATX headings
+    (``#`` .. ``######``), fenced code blocks, ``-``/``*`` bullets, numbered
+    list items, blank lines, and paragraphs with ``**bold**``, ``*italic*``
+    and inline code spans. Anything else is emitted as a plain paragraph.
+
+    Args:
+        md: Markdown source text.
+
+    Returns:
+        The generated PDF document as bytes (A4 pages, 20 mm margins).
+
+    Raises:
+        Exception: whatever reportlab raises while building the document.
+    """
+    # Not part of the file-level reportlab import; keeps whitespace and line
+    # breaks of code blocks while still honouring the style's background.
+    from reportlab.platypus import XPreformatted
+
+    print(f"[DEBUG] _render_pdf_with_reportlab 被调用, md 长度: {len(md)}")
+    bio = io.BytesIO()
+
+    doc = SimpleDocTemplate(
+        bio,
+        pagesize=A4,
+        rightMargin=20*mm,
+        leftMargin=20*mm,
+        topMargin=20*mm,
+        bottomMargin=20*mm,
+    )
+
+    styles = getSampleStyleSheet()
+    chinese_font = _reportlab_cjk_font()
+
+    # Styles that use the Chinese-capable font
+    title_style = ParagraphStyle(
+        'ChineseTitle',
+        parent=styles['Heading1'],
+        fontName=chinese_font,
+        fontSize=18,
+        textColor=colors.black,
+        spaceAfter=12,
+        spaceBefore=12,
+    )
+
+    heading2_style = ParagraphStyle(
+        'ChineseHeading2',
+        parent=styles['Heading2'],
+        fontName=chinese_font,
+        fontSize=14,
+        textColor=colors.black,
+        spaceAfter=10,
+        spaceBefore=10,
+    )
+
+    normal_style = ParagraphStyle(
+        'ChineseNormal',
+        parent=styles['Normal'],
+        fontName=chinese_font,
+        fontSize=10,
+        textColor=colors.black,
+        spaceAfter=8,
+        wordWrap='CJK',  # enables line breaking inside CJK text
+    )
+
+    code_style = ParagraphStyle(
+        'ChineseCode',
+        parent=styles['Code'],
+        fontName='Courier',
+        fontSize=9,
+        textColor=colors.black,
+        backColor=colors.lightgrey,
+        leftIndent=10,
+    )
+
+    story = []  # flowables, in document order
+    code_lines = []  # lines collected inside the currently open code fence
+    in_code_block = False
+
+    def flush_code_block() -> None:
+        """Emit the collected code lines as one preformatted block."""
+        if code_lines:
+            code_text = _reportlab_escape('\n'.join(code_lines))
+            story.append(XPreformatted(code_text, code_style))
+        story.append(Spacer(1, 6*mm))
+        code_lines.clear()
+
+    # splitlines() also copes with CRLF input, which split('\n') would not.
+    for line in md.splitlines():
+        stripped = line.strip()
+
+        # Fenced code block start / end
+        if stripped.startswith('```'):
+            if in_code_block:
+                flush_code_block()
+            in_code_block = not in_code_block
+            continue
+
+        if in_code_block:
+            code_lines.append(line)
+            continue
+
+        # Blank line
+        if not stripped:
+            story.append(Spacer(1, 3*mm))
+            continue
+
+        # Headings: level 1 uses the title style, deeper levels share one style
+        heading = re.match(r'^(#{1,6})\s+(.*)$', line)
+        if heading:
+            style = title_style if len(heading.group(1)) == 1 else heading2_style
+            story.append(_reportlab_paragraph(heading.group(2).strip(), style))
+            continue
+
+        # Bullet list item
+        if stripped.startswith(('- ', '* ')):
+            story.append(_reportlab_paragraph(stripped[2:], normal_style, prefix='• '))
+            continue
+
+        # Numbered list item: keep the number so ordering information survives
+        ordered = re.match(r'^(\d+)\.\s+(.*)$', stripped)
+        if ordered:
+            story.append(
+                _reportlab_paragraph(ordered.group(2), normal_style, prefix=f'{ordered.group(1)}. ')
+            )
+            continue
+
+        # Ordinary paragraph
+        story.append(_reportlab_paragraph(stripped, normal_style))
+
+    # A fence left open at end of input must not swallow its content.
+    if in_code_block and code_lines:
+        flush_code_block()
+
+    doc.build(story)
+    return bio.getvalue()
+
+def _render_pdf_with_xhtml2pdf(md: str, html: str, css_name: Optional[str], css_text: Optional[str]) -> bytes:
+    """
+    Render Markdown to PDF bytes with xhtml2pdf (pure Python).
+
+    Args:
+        md: Markdown source; converted to HTML with ``_render_markdown_html``.
+        html: Not used by this implementation; the HTML is rebuilt from ``md``.
+        css_name: Not used by this implementation.
+        css_text: Not used by this implementation; a fixed stylesheet is embedded.
+
+    Returns:
+        The generated PDF document as bytes.
+
+    Raises:
+        RuntimeError: if xhtml2pdf is not importable, or if it reports errors
+            while rendering (previously such output was returned silently).
+    """
+    if _pisa is None:
+        raise RuntimeError("xhtml2pdf is not available")
+
+    # Use the simple markdown-to-HTML conversion and avoid the heavier normalize_html
+    simple_html = _render_markdown_html(md)
+
+    # Wrap the fragment in a complete, well-formed HTML document
+    full_html = f'''<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <style>
+    @page {{
+        margin: 20mm;
+    }}
+    body {{
+        font-family: "Microsoft YaHei", "SimSun", Arial, sans-serif;
+        font-size: 12pt;
+        line-height: 1.6;
+    }}
+    h1, h2, h3, h4, h5, h6 {{
+        color: #333;
+        margin-top: 1em;
+        margin-bottom: 0.5em;
+    }}
+    h1 {{ font-size: 24pt; font-weight: bold; }}
+    h2 {{ font-size: 20pt; font-weight: bold; }}
+    h3 {{ font-size: 16pt; font-weight: bold; }}
+    p {{ margin-bottom: 1em; }}
+    ul, ol {{ margin-left: 2em; }}
+    table {{
+        border-collapse: collapse;
+        width: 100%;
+        margin: 1em 0;
+    }}
+    th, td {{
+        border: 1px solid #ddd;
+        padding: 8px;
+    }}
+    th {{
+        background-color: #f2f2f2;
+    }}
+    a {{ color: #1d4ed8; text-decoration: underline; }}
+    </style>
+</head>
+<body>
+{simple_html}
+</body>
+</html>'''
+
+    # Collect the PDF output in memory
+    bio = io.BytesIO()
+
+    status = _pisa.CreatePDF(
+        full_html,
+        dest=bio,
+        encoding='utf-8'
+    )
+    # CreatePDF does not raise on rendering problems; it counts them in
+    # ``err``, so a non-zero value means the bytes in ``bio`` are unreliable.
+    if status.err:
+        raise RuntimeError(f"xhtml2pdf reported {status.err} error(s) while rendering the PDF")
+
+    return bio.getvalue()
+
 def md_to_pdf_bytes_with_renderer(md: str, renderer: str = "weasyprint", css_name: Optional[str] = None, css_text: Optional[str] = None, toc: bool = False, header_text: Optional[str] = None, footer_text: Optional[str] = None, logo_url: Optional[str] = None, copyright_text: Optional[str] = None, filename_text: Optional[str] = None, cover_src: Optional[str] = None, product_name: Optional[str] = None, document_name: Optional[str] = None, product_version: Optional[str] = None, document_version: Optional[str] = None) -> bytes:
    html = normalize_html(md, options={
        "toc": "1" if toc else "",
@@ -702,8 +932,38 @@ def md_to_pdf_bytes_with_renderer(md: str, renderer: str = "weasyprint", css_nam
        "product_version": product_version,
        "document_version": document_version,
    })
+
+    # ========== PDF 渲染优先级 ==========
+    # 1. reportlab (首选) - 纯 Python，支持中文，跨平台兼容
+    # 2. WeasyPrint - 需要 GTK 系统库，Windows 上安装复杂
+    # =====================================
+
+    print(f"[DEBUG] 开始 PDF 转换, _HAS_REPORTLAB={_HAS_REPORTLAB}, HTML is None={HTML is None}")
+
+    # 首选：reportlab（纯 Python，支持中文，无需外部依赖）
+    if _HAS_REPORTLAB:
+        try:
+            print(f"[DEBUG] 尝试使用 reportlab...")
+            return _render_pdf_with_reportlab(md)
+        except Exception as e:
+            # reportlab 失败，记录错误并继续尝试下一个方案
+            import traceback
+            error_detail = traceback.format_exc()
+            print(f"[DEBUG] reportlab 失败: {str(e)}")
+            print(f"[DEBUG] 错误详情:\n{error_detail}")
+
+    # 备选：WeasyPrint（需要系统库支持）
    if HTML is not None:
-        stylesheets = _stylesheets_for(css_name, css_text)
-        pdf_bytes = HTML(string=html).write_pdf(stylesheets=stylesheets or None)
-        return pdf_bytes
-    raise RuntimeError("WeasyPrint is not available")
+        try:
+            print(f"[DEBUG] 尝试使用 WeasyPrint...")
+            stylesheets = _stylesheets_for(css_name, css_text)
+            pdf_bytes = HTML(string=html).write_pdf(stylesheets=stylesheets or None)
+            return pdf_bytes
+        except Exception as e:
+            # WeasyPrint 失败，记录错误
+            import traceback
+            error_detail = traceback.format_exc()
+            print(f"[DEBUG] WeasyPrint 失败: {str(e)}")
+            print(f"[DEBUG] 错误详情:\n{error_detail}")
+
+    raise RuntimeError("PDF 转换失败。reportlab 已安装但转换失败，请检查 markdown 格式")