add:增加标题、列表、代码块、表格、引用等样式的处理

2026-01-19 22:30:29 +08:00
parent 0cc1a9484e
commit 57cd7e7c3e
2 changed files with 312 additions and 34 deletions
--- a/docling/app/services/docling_adapter.py
+++ b/docling/app/services/docling_adapter.py
@@ -49,14 +49,20 @@ except Exception:
 try:
    from reportlab.lib.pagesizes import A4
    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
-    from reportlab.lib.units import mm
+    from reportlab.lib.units import mm, cm
-    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
+    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, HRFlowable
    from reportlab.lib import colors
    from reportlab.pdfbase import pdfmetrics
    from reportlab.pdfbase.ttfonts import TTFont
    from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_RIGHT
    from reportlab.platypus import KeepInFrame
    from reportlab.pdfgen import canvas
    from reportlab.lib.colors import HexColor
    _HAS_REPORTLAB: bool = True
-except Exception:
+except Exception as e:
    import traceback
    print(f"[ERROR] reportlab import failed: {e}")
    traceback.print_exc()
    A4 = None
    _HAS_REPORTLAB: bool = False
@@ -713,6 +719,7 @@ def _stylesheets_for(css_name: Optional[str], css_text: Optional[str]):
 def _render_pdf_with_reportlab(md: str) -> bytes:
    """
    使用 reportlab 生成支持中文的 PDF（纯 Python，无外部依赖）
    完整支持 markdown 格式：标题、列表、代码块、表格、引用等
    """
    print(f"[DEBUG] _render_pdf_with_reportlab 被调用, md 长度: {len(md)}")
    bio = io.BytesIO()
@@ -767,6 +774,7 @@ def _render_pdf_with_reportlab(md: str) -> bytes:
        textColor=colors.black,
        spaceAfter=12,
        spaceBefore=12,
        leading=22,
    )
    heading2_style = ParagraphStyle(
@@ -777,6 +785,18 @@ def _render_pdf_with_reportlab(md: str) -> bytes:
        textColor=colors.black,
        spaceAfter=10,
        spaceBefore=10,
        leading=18,
    )
    heading3_style = ParagraphStyle(
        'ChineseHeading3',
        parent=styles['Heading3'],
        fontName=chinese_font,
        fontSize=12,
        textColor=colors.black,
        spaceAfter=8,
        spaceBefore=8,
        leading=16,
    )
    normal_style = ParagraphStyle(
@@ -787,69 +807,328 @@ def _render_pdf_with_reportlab(md: str) -> bytes:
        textColor=colors.black,
        spaceAfter=8,
        wordWrap='CJK',  # 中文换行支持
        leading=14,
    )
-    code_style = ParagraphStyle(
+    blockquote_style = ParagraphStyle(
-        'ChineseCode',
+        'ChineseBlockquote',
-        parent=styles['Code'],
+        parent=normal_style,
        fontName=chinese_font,
        leftIndent=10*mm,
        textColor=colors.Color(0.4, 0.4, 0.4),
        spaceAfter=8,
        backColor=colors.Color(0.95, 0.95, 0.95),
    )
    code_block_style = ParagraphStyle(
        'ChineseCodeBlock',
        parent=normal_style,
        fontName='Courier',
-        fontSize=9,
+        fontSize=8,
        textColor=colors.black,
-        backColor=colors.lightgrey,
+        backColor=colors.Color(0.98, 0.98, 0.98),
-        leftIndent=10,
+        leftIndent=5*mm,
        rightIndent=5*mm,
        spaceAfter=10,
        spaceBefore=10,
        leading=12,
    )
    # 解析 markdown
    lines = md.split('\n')
    i = 0
    in_code_block = False
    code_lang = ''
    code_lines = []
-    for line in lines:
+    def process_inline_markdown(text: str) -> str:
        """处理行内 markdown 格式：粗体、斜体、行内代码、链接"""
        # 使用占位符来保护我们生成的 HTML 标签
        placeholders = {}
        placeholder_idx = 0
        def save_placeholder(content):
            nonlocal placeholder_idx
            key = f"__PLACEHOLDER_{placeholder_idx}__"
            placeholder_idx += 1
            placeholders[key] = content
            return key
        # 先进行 HTML 转义（处理用户输入中的特殊字符）
        text = text.replace('<', '&lt;').replace('>', '&gt;')
        # 处理行内代码（避免和其他标记冲突）
        def replace_code(match):
            code_text = match.group(1)
            # 代码内容不需要转义，直接使用
            html = f'<font face="Courier" color="#d63384">{code_text}</font>'
            return save_placeholder(html)
        text = re.sub(r'`([^`]+)`', replace_code, text)
        # 处理粗体
        def replace_bold(match):
            content = match.group(1)
            html = f'<b>{content}</b>'
            return save_placeholder(html)
        text = re.sub(r'\*\*([^*]+)\*\*', replace_bold, text)
        # 处理斜体
        def replace_italic(match):
            content = match.group(1)
            html = f'<i>{content}</i>'
            return save_placeholder(html)
        text = re.sub(r'\*([^*]+)\*', replace_italic, text)
        # 处理链接 [text](url) - 使用 reportlab 的 link 标签创建可点击的超链接
        def replace_link(match):
            link_text = match.group(1)
            url = match.group(2)
            # 使用蓝色下划线样式，link href 属性使链接可点击
            html = f'<a href="{url}" color="blue"><u>{link_text}</u></a>'
            return save_placeholder(html)
        text = re.sub(r'\[([^\]]+)\]\(([^\)]+)\)', replace_link, text)
        # 还原占位符为实际的 HTML 标签
        for key, value in placeholders.items():
            text = text.replace(key, value)
        return text
    def parse_table(table_lines: list) -> None:
        """Parse a markdown table and append it to ``story``.

        Args:
            table_lines: The header row, then the separator row, then zero or
                more data rows, each as a raw markdown line. Leading and
                trailing pipes on a row are optional.

        Returns:
            None. A ``Table`` followed by a ``Spacer`` is appended to the
            enclosing function's ``story``. Nothing is appended when
            ``table_lines`` is empty.
        """
        if not table_lines:
            return

        def split_row(row_line: str) -> list:
            """Split one table row into stripped cell strings."""
            row_text = row_line.strip()
            # Outer pipes are optional in markdown tables; drop them only
            # when present so that pipe-less rows keep their edge cells.
            if row_text.startswith('|'):
                row_text = row_text[1:]
            if row_text.endswith('|'):
                row_text = row_text[:-1]
            return [cell.strip() for cell in row_text.split('|')]

        # Header row: it fixes the column count for the whole table.
        header_cells = split_row(table_lines[0])
        num_cols = len(header_cells)

        # Separator row: derive per-column alignment from the colon markers.
        separator_line = table_lines[1] if len(table_lines) > 1 else ""
        alignments = []
        if separator_line:
            for part in split_row(separator_line):
                if part.startswith(':') and part.endswith(':'):
                    alignments.append('CENTER')
                elif part.endswith(':'):
                    alignments.append('RIGHT')
                else:
                    alignments.append('LEFT')
        # Never address columns that the header does not define.
        alignments = alignments[:num_cols]

        # Header cells, with inline markdown converted to paragraph markup.
        row_data = [
            [Paragraph(process_inline_markdown(cell), normal_style) for cell in header_cells]
        ]

        # Data rows (the separator row at index 1 is skipped).
        for line in table_lines[2:]:
            if '|' not in line:
                continue
            cells = split_row(line)
            # Pad short rows with empty cells and drop surplus cells so that
            # every row has exactly as many cells as the header.
            cells = cells[:num_cols] + [''] * (num_cols - len(cells))
            row_data.append(
                [Paragraph(process_inline_markdown(cell), normal_style) for cell in cells]
            )

        # Equal-width columns across the page width minus 40 mm of margin.
        col_widths = []
        if num_cols > 0:
            max_content_width = (A4[0] - 40*mm) / num_cols
            col_widths = [max_content_width] * num_cols

        table_style = TableStyle([
            # Header row appearance.
            ('BACKGROUND', (0, 0), (-1, 0), colors.Color(0.4, 0.6, 0.9)),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
            ('FONTNAME', (0, 0), (-1, 0), chinese_font),
            ('FONTSIZE', (0, 0), (-1, 0), 10),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 8),
            ('TOPPADDING', (0, 0), (-1, 0), 8),
            ('LEFTPADDING', (0, 0), (-1, -1), 6),
            ('RIGHTPADDING', (0, 0), (-1, -1), 6),
            # Header and outer borders.
            ('LINEABOVE', (0, 0), (-1, 0), 1, colors.black),
            ('LINEBELOW', (0, 0), (-1, 0), 1, colors.black),
            ('LINEBEFORE', (0, 0), (0, -1), 0.5, colors.grey),
            ('LINEAFTER', (-1, 0), (-1, -1), 0.5, colors.grey),
            # Data row appearance.
            ('BACKGROUND', (0, 1), (-1, -1), colors.white),
            ('FONTNAME', (0, 1), (-1, -1), chinese_font),
            ('FONTSIZE', (0, 1), (-1, -1), 9),
            ('TOPPADDING', (0, 1), (-1, -1), 6),
            ('BOTTOMPADDING', (0, 1), (-1, -1), 6),
            # Zebra striping on data rows.
            ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.Color(0.95, 0.95, 0.98)]),
            # Grid lines.
            ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
            # Vertical alignment.
            ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
        ])

        # Per-column horizontal alignment from the separator row.
        # NOTE(review): every cell is a Paragraph, so the text colour, font
        # and alignment that are visible may come from normal_style rather
        # than from these table commands - confirm in rendered output.
        for col_idx, align in enumerate(alignments):
            table_style.add('ALIGN', (col_idx, 0), (col_idx, -1), align)

        t = Table(row_data, colWidths=col_widths)
        t.setStyle(table_style)
        story.append(t)
        story.append(Spacer(1, 8*mm))
    while i < len(lines):
        line = lines[i]
        # 代码块处理
        if line.strip().startswith('```'):
            if in_code_block:
                # 代码块结束
                code_text = '\n'.join(code_lines)
-                story.append(Paragraph(code_text.replace('<', '&lt;').replace('>', '&gt;'), code_style))
+                # 使用 pre 标签保留格式
-                story.append(Spacer(1, 6*mm))
+                escaped_code = code_text.replace('<', '&lt;').replace('>', '&gt;')
                story.append(Paragraph(f'<font face="Courier" size="8">{escaped_code}</font>', code_block_style))
                story.append(Spacer(1, 3*mm))
                code_lines = []
                in_code_block = False
            else:
                in_code_block = True
                code_lang = line.strip()[3:]  # 获取语言标识
            i += 1
            continue
        if in_code_block:
            code_lines.append(line)
            i += 1
            continue
        # 表格处理
        if '|' in line and i + 1 < len(lines) and '|' in lines[i + 1]:
            # 检查是否是分隔行
            next_line = lines[i + 1].strip()
            if re.match(r'^\|?\s*:?-+:?\s*(\|:?-+:?\s*)*\|?$', next_line):
                table_lines = [line, next_line]  # 包含表头和分隔行
                i += 2
                # 收集所有表格数据行
                while i < len(lines) and '|' in lines[i] and not lines[i].strip().startswith('```'):
                    table_lines.append(lines[i])
                    i += 1
                parse_table(table_lines)
                continue
        # 标题处理
-        if line.startswith('# '):
+        if line.startswith('#### '):
-            text = line[2:].strip()
+            text = process_inline_markdown(line[5:].strip())
-            story.append(Paragraph(text, title_style))
+            h4_style = ParagraphStyle(
-        elif line.startswith('## '):
+                'ChineseHeading4',
-            text = line[3:].strip()
+                parent=heading3_style,
-            story.append(Paragraph(text, heading2_style))
+                fontSize=11,
            )
            story.append(Paragraph(text, h4_style))
        elif line.startswith('### '):
-            text = line[4:].strip()
+            text = process_inline_markdown(line[4:].strip())
            story.append(Paragraph(text, heading3_style))
        elif line.startswith('## '):
            text = process_inline_markdown(line[3:].strip())
            story.append(Paragraph(text, heading2_style))
-        # 列表处理
+        elif line.startswith('# '):
            text = process_inline_markdown(line[2:].strip())
            story.append(Paragraph(text, title_style))
        # 引用块处理
        elif line.strip().startswith('>'):
            quote_text = line.strip()[1:].strip()
            processed = process_inline_markdown(quote_text)
            story.append(Paragraph(processed, blockquote_style))
        # 无序列表处理（包括任务列表）
        elif line.strip().startswith('- ') or line.strip().startswith('* '):
-            text = line.strip()[2:]
+            content = line.strip()[2:].strip()
            # 检查是否是任务列表 [ ] 或 [x]
            task_checked = None
            if content.startswith('[ ]'):
                # 未完成的任务
                task_text = content[2:].strip()
                task_checked = False
            elif content.startswith('[x]') or content.startswith('[X]'):
                # 已完成的任务
                task_text = content[2:].strip()
                task_checked = True
            else:
                # 普通列表项
                task_text = content
                task_checked = None
            text = process_inline_markdown(task_text)
            if task_checked is True:
                # 使用复选框符号表示已完成
                story.append(Paragraph(f'☑ {text}', normal_style))
            elif task_checked is False:
                # 使用复选框符号表示未完成
                story.append(Paragraph(f'☐ {text}', normal_style))
            else:
                # 普通列表项
                story.append(Paragraph(f'• {text}', normal_style))
-        elif re.match(r'^\d+\.\s', line.strip()):
+
-            text = re.sub(r'^\d+\.\s', '', line.strip())
+        # 有序列表处理
-            story.append(Paragraph(text, normal_style))
+        elif re.match(r'^\s*\d+\.\s', line.strip()):
            match = re.match(r'^\s*(\d+)\.\s(.*)$', line.strip())
            if match:
                num = match.group(1)
                text = process_inline_markdown(match.group(2))
                story.append(Paragraph(f'{num}. {text}', normal_style))
        # 分隔线
        elif line.strip() in ['---', '***', '___']:
            # 使用 HRFlowable 绘制水平分割线
            story.append(Spacer(1, 3*mm))
            story.append(HRFlowable(
                width="100%",
                thickness=0.5,
                lineCap='round',
                color=colors.grey,
                spaceBefore=1*mm,
                spaceAfter=3*mm,
            ))
        # 空行
        elif not line.strip():
-            story.append(Spacer(1, 3*mm))
+            story.append(Spacer(1, 2*mm))
-        # 普通段落
+
        # 普通段落（可能跨多行）
        elif line.strip():
-            # 处理粗体和斜体
+            # 收集连续的非空行作为段落
-            text = line.strip()
+            paragraph_lines = [line.strip()]
-            text = re.sub(r'\*\*(.+?)\*\*', r'<b>\1</b>', text)
+            i += 1
-            text = re.sub(r'\*(.+?)\*', r'<i>\1</i>', text)
+            while i < len(lines):
-            text = re.sub(r'`(.+?)`', r'<font face="Courier">\1</font>', text)
+                next_line = lines[i].strip()
-            story.append(Paragraph(text, normal_style))
+                # 遇到空行、标题、列表等特殊行时停止
                if (not next_line or
                    next_line.startswith('#') or
                    next_line.startswith('>') or
                    next_line.startswith('-') or
                    next_line.startswith('*') or
                    next_line.startswith('```') or
                    re.match(r'^\d+\.\s', next_line) or
                    (next_line.startswith('---') or next_line.startswith('***')) or
                    ('|' in next_line and i + 1 < len(lines) and '|' in lines[i + 1])):
                    break
                paragraph_lines.append(next_line)
                i += 1
            paragraph_text = ' '.join(paragraph_lines)
            processed = process_inline_markdown(paragraph_text)
            story.append(Paragraph(processed, normal_style))
            i -= 1  # 回退一行，因为外层会 i += 1
        i += 1
    # 生成 PDF
    doc.build(story)
--- a/docling/docling
+++ b/docling/docling