add:增加标题、列表、代码块、表格、引用等样式的处理

2026-01-19 22:30:29 +08:00
parent 0cc1a9484e
commit 57cd7e7c3e
2 changed files with 312 additions and 34 deletions
--- a/docling/app/services/docling_adapter.py
+++ b/docling/app/services/docling_adapter.py
@@ -49,14 +49,20 @@ except Exception:
 try:
    from reportlab.lib.pagesizes import A4
    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
-    from reportlab.lib.units import mm
-    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
+    from reportlab.lib.units import mm, cm
+    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, HRFlowable
    from reportlab.lib import colors
    from reportlab.pdfbase import pdfmetrics
    from reportlab.pdfbase.ttfonts import TTFont
    from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_RIGHT
+    from reportlab.platypus import KeepInFrame
+    from reportlab.pdfgen import canvas
+    from reportlab.lib.colors import HexColor
    _HAS_REPORTLAB: bool = True
-except Exception:
+except Exception as e:
+    import traceback
+    print(f"[ERROR] reportlab import failed: {e}")
+    traceback.print_exc()
    A4 = None
    _HAS_REPORTLAB: bool = False

@@ -713,6 +719,7 @@ def _stylesheets_for(css_name: Optional[str], css_text: Optional[str]):
 def _render_pdf_with_reportlab(md: str) -> bytes:
    """
    使用 reportlab 生成支持中文的 PDF（纯 Python，无外部依赖）
+    完整支持 markdown 格式：标题、列表、代码块、表格、引用等
    """
    print(f"[DEBUG] _render_pdf_with_reportlab 被调用, md 长度: {len(md)}")
    bio = io.BytesIO()
@@ -767,6 +774,7 @@ def _render_pdf_with_reportlab(md: str) -> bytes:
        textColor=colors.black,
        spaceAfter=12,
        spaceBefore=12,
+        leading=22,
    )

    heading2_style = ParagraphStyle(
@@ -777,6 +785,18 @@ def _render_pdf_with_reportlab(md: str) -> bytes:
        textColor=colors.black,
        spaceAfter=10,
        spaceBefore=10,
+        leading=18,
+    )
+
+    heading3_style = ParagraphStyle(
+        'ChineseHeading3',
+        parent=styles['Heading3'],
+        fontName=chinese_font,
+        fontSize=12,
+        textColor=colors.black,
+        spaceAfter=8,
+        spaceBefore=8,
+        leading=16,
    )

    normal_style = ParagraphStyle(
@@ -787,69 +807,328 @@ def _render_pdf_with_reportlab(md: str) -> bytes:
        textColor=colors.black,
        spaceAfter=8,
        wordWrap='CJK',  # 中文换行支持
+        leading=14,
    )

-    code_style = ParagraphStyle(
-        'ChineseCode',
-        parent=styles['Code'],
+    blockquote_style = ParagraphStyle(
+        'ChineseBlockquote',
+        parent=normal_style,
+        fontName=chinese_font,
+        leftIndent=10*mm,
+        textColor=colors.Color(0.4, 0.4, 0.4),
+        spaceAfter=8,
+        backColor=colors.Color(0.95, 0.95, 0.95),
+    )
+
+    code_block_style = ParagraphStyle(
+        'ChineseCodeBlock',
+        parent=normal_style,
        fontName='Courier',
-        fontSize=9,
+        fontSize=8,
        textColor=colors.black,
-        backColor=colors.lightgrey,
-        leftIndent=10,
+        backColor=colors.Color(0.98, 0.98, 0.98),
+        leftIndent=5*mm,
+        rightIndent=5*mm,
+        spaceAfter=10,
+        spaceBefore=10,
+        leading=12,
    )

    # 解析 markdown
    lines = md.split('\n')
+    i = 0
    in_code_block = False
+    code_lang = ''
    code_lines = []

-    for line in lines:
+    def process_inline_markdown(text: str) -> str:
+        """Convert inline markdown to reportlab Paragraph markup.
+
+        Handles inline code, **bold**, *italic* and [text](url) links and
+        returns a string using reportlab's <font>/<b>/<i>/<a> tags.
+        """
+        # Generated tags are parked behind placeholder keys so later regex
+        # passes cannot re-interpret them.
+        placeholders = {}
+
+        def save_placeholder(content):
+            key = f"__PLACEHOLDER_{len(placeholders)}__"
+            placeholders[key] = content
+            return key
+
+        # Escape user text first: bare '&' (not already an entity) and angle
+        # brackets would otherwise be parsed as markup by Paragraph.
+        text = re.sub(r'&(?!#?\w+;)', '&amp;', text)
+        text = text.replace('<', '&lt;').replace('>', '&gt;')
+
+        # Inline code goes first so '*' or '[' inside backticks stay literal.
+        def replace_code(match):
+            code_text = match.group(1)  # already escaped above
+            html = f'<font face="Courier" color="#d63384">{code_text}</font>'
+            return save_placeholder(html)
+        text = re.sub(r'`([^`]+)`', replace_code, text)
+
+        # Bold must run before italic: '**x**' also matches the '*x*' pattern.
+        def replace_bold(match):
+            return save_placeholder(f'<b>{match.group(1)}</b>')
+        text = re.sub(r'\*\*([^*]+)\*\*', replace_bold, text)
+
+        def replace_italic(match):
+            return save_placeholder(f'<i>{match.group(1)}</i>')
+        text = re.sub(r'\*([^*]+)\*', replace_italic, text)
+
+        # Links: [text](url) becomes a clickable, blue, underlined <a> tag.
+        def replace_link(match):
+            link_text = match.group(1)
+            # A raw '"' would terminate the href attribute early.
+            url = match.group(2).replace('"', '&quot;')
+            html = f'<a href="{url}" color="blue"><u>{link_text}</u></a>'
+            return save_placeholder(html)
+        text = re.sub(r'\[([^\]]+)\]\(([^\)]+)\)', replace_link, text)
+
+        # Restore newest-first: a placeholder's value may embed keys created
+        # earlier (e.g. code inside bold); restoring in creation order would
+        # leave those inner keys in the output.
+        for key in reversed(list(placeholders)):
+            text = text.replace(key, placeholders[key])
+
+        return text
+
+    def parse_table(table_lines: list) -> None:
+        """Render a markdown pipe table (header, separator, data rows) into ``story``."""
+        if not table_lines:
+            return
+
+        def split_row(row: str) -> list:
+            # Outer pipes are optional in markdown tables, so strip them only
+            # when present instead of blindly dropping the first/last field.
+            row = row.strip()
+            if row.startswith('|'):
+                row = row[1:]
+            if row.endswith('|'):
+                row = row[:-1]
+            return [cell.strip() for cell in row.split('|')]
+
+        header_cells = split_row(table_lines[0])
+        num_cols = len(header_cells)
+        # Column alignment comes from the colons in the separator row.
+        alignments = []
+        if len(table_lines) > 1:
+            for part in split_row(table_lines[1]):
+                if part.startswith(':') and part.endswith(':'):
+                    alignments.append(TA_CENTER)
+                elif part.endswith(':'):
+                    alignments.append(TA_RIGHT)
+                else:
+                    alignments.append(TA_LEFT)
+        # Pad/trim so every header column has exactly one alignment.
+        alignments = (alignments + [TA_LEFT] * num_cols)[:num_cols]
+
+        # Paragraph cells ignore TEXTCOLOR/ALIGN table commands, so alignment
+        # and the white header text have to live in the paragraph styles.
+        body_styles = [
+            ParagraphStyle(f'TableCell{idx}', parent=normal_style, alignment=align)
+            for idx, align in enumerate(alignments)
+        ]
+        header_styles = [
+            ParagraphStyle(f'TableHeader{idx}', parent=style, textColor=colors.white)
+            for idx, style in enumerate(body_styles)
+        ]
+
+        def build_row(cells: list, cell_styles: list) -> list:
+            # Pad short rows and drop surplus cells so every row matches the
+            # header width; Table rejects rows wider than colWidths.
+            cells = (cells + [''] * num_cols)[:num_cols]
+            return [
+                Paragraph(process_inline_markdown(cell), style)
+                for cell, style in zip(cells, cell_styles)
+            ]
+
+        row_data = [build_row(header_cells, header_styles)]
+        for line in table_lines[2:]:
+            if '|' in line:
+                row_data.append(build_row(split_row(line), body_styles))
+
+        # Equal-width columns over A4 width minus 40 mm (presumably the page margins).
+        col_widths = [(A4[0] - 40*mm) / num_cols] * num_cols
+
+        table_style = TableStyle([
+            # Header row: fill, font, padding and borders
+            ('BACKGROUND', (0, 0), (-1, 0), colors.Color(0.4, 0.6, 0.9)),
+            ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
+            ('FONTNAME', (0, 0), (-1, 0), chinese_font),
+            ('FONTSIZE', (0, 0), (-1, 0), 10),
+            ('BOTTOMPADDING', (0, 0), (-1, 0), 8),
+            ('TOPPADDING', (0, 0), (-1, 0), 8),
+            ('LEFTPADDING', (0, 0), (-1, -1), 6),
+            ('RIGHTPADDING', (0, 0), (-1, -1), 6),
+            ('LINEABOVE', (0, 0), (-1, 0), 1, colors.black),
+            ('LINEBELOW', (0, 0), (-1, 0), 1, colors.black),
+            ('LINEBEFORE', (0, 0), (0, -1), 0.5, colors.grey),
+            ('LINEAFTER', (-1, 0), (-1, -1), 0.5, colors.grey),
+            ('BACKGROUND', (0, 1), (-1, -1), colors.white),  # data rows
+            ('FONTNAME', (0, 1), (-1, -1), chinese_font),
+            ('FONTSIZE', (0, 1), (-1, -1), 9),
+            ('TOPPADDING', (0, 1), (-1, -1), 6),
+            ('BOTTOMPADDING', (0, 1), (-1, -1), 6),
+            # Zebra striping on data rows
+            ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.Color(0.95, 0.95, 0.98)]),
+            ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
+            ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
+        ])
+
+        t = Table(row_data, colWidths=col_widths)
+        t.setStyle(table_style)
+        story.append(t)
+        story.append(Spacer(1, 8*mm))
+
+    while i < len(lines):
+        line = lines[i]
+
        # 代码块处理
        if line.strip().startswith('```'):
            if in_code_block:
                # 代码块结束
                code_text = '\n'.join(code_lines)
-                story.append(Paragraph(code_text.replace('<', '&lt;').replace('>', '&gt;'), code_style))
-                story.append(Spacer(1, 6*mm))
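+                # NOTE: no <pre> tag is emitted here; the code is only wrapped in a Courier <font>, and Paragraph reflows whitespace, so line breaks/indentation are likely lost (consider Preformatted)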
+                escaped_code = code_text.replace('<', '&lt;').replace('>', '&gt;')
+                story.append(Paragraph(f'<font face="Courier" size="8">{escaped_code}</font>', code_block_style))
+                story.append(Spacer(1, 3*mm))
                code_lines = []
                in_code_block = False
            else:
                in_code_block = True
+                code_lang = line.strip()[3:]  # 获取语言标识
+            i += 1
            continue

        if in_code_block:
            code_lines.append(line)
+            i += 1
+            continue
+
+        # 表格处理
+        if '|' in line and i + 1 < len(lines) and '|' in lines[i + 1]:
+            # 检查是否是分隔行
+            next_line = lines[i + 1].strip()
+            if re.match(r'^\|?\s*:?-+:?\s*(\|:?-+:?\s*)*\|?$', next_line):
+                table_lines = [line, next_line]  # 包含表头和分隔行
+                i += 2
+                # 收集所有表格数据行
+                while i < len(lines) and '|' in lines[i] and not lines[i].strip().startswith('```'):
+                    table_lines.append(lines[i])
+                    i += 1
+                parse_table(table_lines)
                continue

        # 标题处理
-        if line.startswith('# '):
-            text = line[2:].strip()
-            story.append(Paragraph(text, title_style))
-        elif line.startswith('## '):
-            text = line[3:].strip()
-            story.append(Paragraph(text, heading2_style))
+        if line.startswith('#### '):
+            text = process_inline_markdown(line[5:].strip())
+            h4_style = ParagraphStyle(
+                'ChineseHeading4',
+                parent=heading3_style,
+                fontSize=11,
+            )
+            story.append(Paragraph(text, h4_style))
        elif line.startswith('### '):
-            text = line[4:].strip()
+            text = process_inline_markdown(line[4:].strip())
+            story.append(Paragraph(text, heading3_style))
+        elif line.startswith('## '):
+            text = process_inline_markdown(line[3:].strip())
            story.append(Paragraph(text, heading2_style))
-        # 列表处理
+        elif line.startswith('# '):
+            text = process_inline_markdown(line[2:].strip())
+            story.append(Paragraph(text, title_style))
+
+        # 引用块处理
+        elif line.strip().startswith('>'):
+            quote_text = line.strip()[1:].strip()
+            processed = process_inline_markdown(quote_text)
+            story.append(Paragraph(processed, blockquote_style))
+
+        # 无序列表处理（包括任务列表）
        elif line.strip().startswith('- ') or line.strip().startswith('* '):
-            text = line.strip()[2:]
+            content = line.strip()[2:].strip()
+
+            # 检查是否是任务列表 [ ] 或 [x]
+            task_checked = None
+            if content.startswith('[ ]'):
+                # 未完成的任务
+                task_text = content[2:].strip()
+                task_checked = False
+            elif content.startswith('[x]') or content.startswith('[X]'):
+                # 已完成的任务
+                task_text = content[2:].strip()
+                task_checked = True
+            else:
+                # 普通列表项
+                task_text = content
+                task_checked = None
+
+            text = process_inline_markdown(task_text)
+
+            if task_checked is True:
+                # 使用复选框符号表示已完成
+                story.append(Paragraph(f'☑ {text}', normal_style))
+            elif task_checked is False:
+                # 使用复选框符号表示未完成
+                story.append(Paragraph(f'☐ {text}', normal_style))
+            else:
+                # 普通列表项
                story.append(Paragraph(f'• {text}', normal_style))
-        elif re.match(r'^\d+\.\s', line.strip()):
-            text = re.sub(r'^\d+\.\s', '', line.strip())
-            story.append(Paragraph(text, normal_style))
+
+        # 有序列表处理
+        elif re.match(r'^\s*\d+\.\s', line.strip()):
+            match = re.match(r'^\s*(\d+)\.\s(.*)$', line.strip())
+            if match:
+                num = match.group(1)
+                text = process_inline_markdown(match.group(2))
+                story.append(Paragraph(f'{num}. {text}', normal_style))
+
+        # 分隔线
+        elif line.strip() in ['---', '***', '___']:
+            # 使用 HRFlowable 绘制水平分割线
+            story.append(Spacer(1, 3*mm))
+            story.append(HRFlowable(
+                width="100%",
+                thickness=0.5,
+                lineCap='round',
+                color=colors.grey,
+                spaceBefore=1*mm,
+                spaceAfter=3*mm,
+            ))
+
        # 空行
        elif not line.strip():
-            story.append(Spacer(1, 3*mm))
-        # 普通段落
+            story.append(Spacer(1, 2*mm))
+
+        # 普通段落（可能跨多行）
        elif line.strip():
-            # 处理粗体和斜体
-            text = line.strip()
-            text = re.sub(r'\*\*(.+?)\*\*', r'<b>\1</b>', text)
-            text = re.sub(r'\*(.+?)\*', r'<i>\1</i>', text)
-            text = re.sub(r'`(.+?)`', r'<font face="Courier">\1</font>', text)
-            story.append(Paragraph(text, normal_style))
+            # 收集连续的非空行作为段落
+            paragraph_lines = [line.strip()]
+            i += 1
+            while i < len(lines):
+                next_line = lines[i].strip()
+                # 遇到空行、标题、列表等特殊行时停止
+                if (not next_line or
+                    next_line.startswith('#') or
+                    next_line.startswith('>') or
+                    next_line.startswith('-') or
+                    next_line.startswith('*') or
+                    next_line.startswith('```') or
+                    re.match(r'^\d+\.\s', next_line) or
+                    (next_line.startswith('---') or next_line.startswith('***')) or
+                    ('|' in next_line and i + 1 < len(lines) and '|' in lines[i + 1])):
+                    break
+                paragraph_lines.append(next_line)
+                i += 1
+
+            paragraph_text = ' '.join(paragraph_lines)
+            processed = process_inline_markdown(paragraph_text)
+            story.append(Paragraph(processed, normal_style))
+            i -= 1  # 回退一行，因为外层会 i += 1
+
+        i += 1

    # 生成 PDF
    doc.build(story)
--- a/docling/docling
+++ b/docling/docling