add:增加标题、列表、代码块、表格、引用等样式的处理

This commit is contained in:
2026-01-19 22:30:29 +08:00
parent 0cc1a9484e
commit 57cd7e7c3e
2 changed files with 312 additions and 34 deletions

View File

@@ -49,14 +49,20 @@ except Exception:
try:
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import mm
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
from reportlab.lib.units import mm, cm
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, HRFlowable
from reportlab.lib import colors
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_RIGHT
from reportlab.platypus import KeepInFrame
from reportlab.pdfgen import canvas
from reportlab.lib.colors import HexColor
_HAS_REPORTLAB: bool = True
except Exception:
except Exception as e:
import traceback
print(f"[ERROR] reportlab import failed: {e}")
traceback.print_exc()
A4 = None
_HAS_REPORTLAB: bool = False
@@ -713,6 +719,7 @@ def _stylesheets_for(css_name: Optional[str], css_text: Optional[str]):
def _render_pdf_with_reportlab(md: str) -> bytes:
"""
使用 reportlab 生成支持中文的 PDF纯 Python无外部依赖
完整支持 markdown 格式:标题、列表、代码块、表格、引用等
"""
print(f"[DEBUG] _render_pdf_with_reportlab 被调用, md 长度: {len(md)}")
bio = io.BytesIO()
@@ -767,6 +774,7 @@ def _render_pdf_with_reportlab(md: str) -> bytes:
textColor=colors.black,
spaceAfter=12,
spaceBefore=12,
leading=22,
)
heading2_style = ParagraphStyle(
@@ -777,6 +785,18 @@ def _render_pdf_with_reportlab(md: str) -> bytes:
textColor=colors.black,
spaceAfter=10,
spaceBefore=10,
leading=18,
)
heading3_style = ParagraphStyle(
'ChineseHeading3',
parent=styles['Heading3'],
fontName=chinese_font,
fontSize=12,
textColor=colors.black,
spaceAfter=8,
spaceBefore=8,
leading=16,
)
normal_style = ParagraphStyle(
@@ -787,69 +807,328 @@ def _render_pdf_with_reportlab(md: str) -> bytes:
textColor=colors.black,
spaceAfter=8,
wordWrap='CJK', # 中文换行支持
leading=14,
)
code_style = ParagraphStyle(
'ChineseCode',
parent=styles['Code'],
blockquote_style = ParagraphStyle(
'ChineseBlockquote',
parent=normal_style,
fontName=chinese_font,
leftIndent=10*mm,
textColor=colors.Color(0.4, 0.4, 0.4),
spaceAfter=8,
backColor=colors.Color(0.95, 0.95, 0.95),
)
code_block_style = ParagraphStyle(
'ChineseCodeBlock',
parent=normal_style,
fontName='Courier',
fontSize=9,
fontSize=8,
textColor=colors.black,
backColor=colors.lightgrey,
leftIndent=10,
backColor=colors.Color(0.98, 0.98, 0.98),
leftIndent=5*mm,
rightIndent=5*mm,
spaceAfter=10,
spaceBefore=10,
leading=12,
)
# 解析 markdown
lines = md.split('\n')
i = 0
in_code_block = False
code_lang = ''
code_lines = []
for line in lines:
def process_inline_markdown(text: str) -> str:
    """Convert inline markdown in *text* to ReportLab paragraph markup.

    Handles, in this order: inline code (`code`), bold (**text**),
    italic (*text*) and links ([text](url)).  ``<`` and ``>`` in the input
    are escaped first so user text cannot be mistaken for markup tags.

    Every generated tag is parked behind a placeholder token so that the
    patterns applied later do not re-process markup produced earlier.

    Args:
        text: One line (or one joined paragraph) of markdown source.

    Returns:
        The text with inline markdown replaced by ``<font>``, ``<b>``,
        ``<i>`` and ``<a>`` tags.
    """
    # Maps placeholder token -> generated markup, in creation order.
    placeholders = {}

    def save_placeholder(content):
        """Store *content* and return the token that stands in for it."""
        key = f"__PLACEHOLDER_{len(placeholders)}__"
        placeholders[key] = content
        return key

    # Escape angle brackets coming from the user's text.
    text = text.replace('<', '&lt;').replace('>', '&gt;')

    # Inline code goes first so that '*' inside code is not read as emphasis.
    def replace_code(match):
        # The content was already escaped above; use it as is.
        code_text = match.group(1)
        html = f'<font face="Courier" color="#d63384">{code_text}</font>'
        return save_placeholder(html)

    text = re.sub(r'`([^`]+)`', replace_code, text)

    # Bold.
    def replace_bold(match):
        content = match.group(1)
        html = f'<b>{content}</b>'
        return save_placeholder(html)

    text = re.sub(r'\*\*([^*]+)\*\*', replace_bold, text)

    # Italic.
    def replace_italic(match):
        content = match.group(1)
        html = f'<i>{content}</i>'
        return save_placeholder(html)

    text = re.sub(r'\*([^*]+)\*', replace_italic, text)

    # Links [text](url): emitted as a blue, underlined <a href="..."> tag.
    def replace_link(match):
        link_text = match.group(1)
        # A raw double quote (e.g. from `[t](url "title")`) would terminate
        # the href attribute early and produce malformed markup.
        url = match.group(2).replace('"', '&quot;')
        html = f'<a href="{url}" color="blue"><u>{link_text}</u></a>'
        return save_placeholder(html)

    text = re.sub(r'\[([^\]]+)\]\(([^\)]+)\)', replace_link, text)

    # Restore placeholders newest-first: a placeholder can only contain
    # tokens created before it (e.g. bold wrapping inline code), so going
    # in reverse creation order guarantees nested tokens are expanded too.
    for key in reversed(list(placeholders)):
        text = text.replace(key, placeholders[key])

    return text
def parse_table(table_lines: list) -> None:
    """Render a markdown pipe table and append it to ``story``.

    ``table_lines[0]`` is the header row, ``table_lines[1]`` (when present)
    the separator row whose colons select per-column alignment, and the
    remaining lines are data rows.  Every cell is run through
    ``process_inline_markdown`` and wrapped in a ``Paragraph``.

    Data rows are padded with empty cells or truncated so that each row
    has exactly as many cells as the header; the column widths are sized
    from the header, so ragged rows would otherwise not match them.

    Args:
        table_lines: Raw markdown lines of one table, header first.

    Returns:
        None.  The table and a trailing spacer are appended to ``story``.
    """
    if not table_lines:
        return

    def split_row(row: str) -> list:
        """Split one table row into stripped cell strings.

        The leading and trailing pipes are optional in markdown, so they
        are removed only when actually present instead of blindly
        discarding the first and last split elements.
        """
        row = row.strip()
        if row.startswith('|'):
            row = row[1:]
        if row.endswith('|'):
            row = row[:-1]
        return [cell.strip() for cell in row.split('|')]

    def to_paragraphs(cells: list) -> list:
        """Turn raw cell strings into Paragraph flowables."""
        return [Paragraph(process_inline_markdown(cell), normal_style) for cell in cells]

    # Header row decides the number of columns.
    header_cells = split_row(table_lines[0])
    num_cols = len(header_cells)

    # Separator row: ':--:' -> CENTER, '--:' -> RIGHT, anything else LEFT.
    alignments = []
    separator_line = table_lines[1] if len(table_lines) > 1 else ""
    if separator_line:
        for part in split_row(separator_line):
            if part.startswith(':') and part.endswith(':'):
                alignments.append('CENTER')
            elif part.endswith(':'):
                alignments.append('RIGHT')
            else:
                alignments.append('LEFT')

    # Data rows (everything after the separator row).
    row_data = [to_paragraphs(header_cells)]
    for line in table_lines[2:]:
        if '|' not in line:
            continue
        cells = split_row(line)
        # Normalise to the header width: drop extras, pad what is missing.
        cells = cells[:num_cols] + [''] * (num_cols - len(cells))
        row_data.append(to_paragraphs(cells))

    # Equal-width columns across the page width minus 40 mm of margins.
    max_content_width = (A4[0] - 40*mm) / num_cols
    col_widths = [max_content_width] * num_cols

    # NOTE(review): the cells are Paragraph flowables styled by normal_style;
    # the TEXTCOLOR / FONTNAME / FONTSIZE / ALIGN commands below presumably
    # only affect plain-string cells - confirm against the ReportLab docs.
    table_style = TableStyle([
        # Header row
        ('BACKGROUND', (0, 0), (-1, 0), colors.Color(0.4, 0.6, 0.9)),  # blue background
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
        ('FONTNAME', (0, 0), (-1, 0), chinese_font),
        ('FONTSIZE', (0, 0), (-1, 0), 10),
        ('BOTTOMPADDING', (0, 0), (-1, 0), 8),
        ('TOPPADDING', (0, 0), (-1, 0), 8),
        ('LEFTPADDING', (0, 0), (-1, -1), 6),
        ('RIGHTPADDING', (0, 0), (-1, -1), 6),
        # Header borders
        ('LINEABOVE', (0, 0), (-1, 0), 1, colors.black),
        ('LINEBELOW', (0, 0), (-1, 0), 1, colors.black),
        ('LINEBEFORE', (0, 0), (0, -1), 0.5, colors.grey),
        ('LINEAFTER', (-1, 0), (-1, -1), 0.5, colors.grey),
        # Data rows
        ('BACKGROUND', (0, 1), (-1, -1), colors.white),
        ('FONTNAME', (0, 1), (-1, -1), chinese_font),
        ('FONTSIZE', (0, 1), (-1, -1), 9),
        ('TOPPADDING', (0, 1), (-1, -1), 6),
        ('BOTTOMPADDING', (0, 1), (-1, -1), 6),
        # Zebra striping (alternating row backgrounds)
        ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.Color(0.95, 0.95, 0.98)]),
        # Grid lines
        ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
        # Vertical alignment
        ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
    ])

    # Per-column alignment; ignore separator entries beyond the real columns.
    for col_idx, align in enumerate(alignments[:num_cols]):
        table_style.add('ALIGN', (col_idx, 0), (col_idx, -1), align)

    # Build the table and add it to the document flow.
    t = Table(row_data, colWidths=col_widths)
    t.setStyle(table_style)
    story.append(t)
    story.append(Spacer(1, 8*mm))
while i < len(lines):
line = lines[i]
# 代码块处理
if line.strip().startswith('```'):
if in_code_block:
# 代码块结束
code_text = '\n'.join(code_lines)
story.append(Paragraph(code_text.replace('<', '&lt;').replace('>', '&gt;'), code_style))
story.append(Spacer(1, 6*mm))
# 使用 pre 标签保留格式
escaped_code = code_text.replace('<', '&lt;').replace('>', '&gt;')
story.append(Paragraph(f'<font face="Courier" size="8">{escaped_code}</font>', code_block_style))
story.append(Spacer(1, 3*mm))
code_lines = []
in_code_block = False
else:
in_code_block = True
code_lang = line.strip()[3:] # 获取语言标识
i += 1
continue
if in_code_block:
code_lines.append(line)
i += 1
continue
# 表格处理
if '|' in line and i + 1 < len(lines) and '|' in lines[i + 1]:
# 检查是否是分隔行
next_line = lines[i + 1].strip()
if re.match(r'^\|?\s*:?-+:?\s*(\|:?-+:?\s*)*\|?$', next_line):
table_lines = [line, next_line] # 包含表头和分隔行
i += 2
# 收集所有表格数据行
while i < len(lines) and '|' in lines[i] and not lines[i].strip().startswith('```'):
table_lines.append(lines[i])
i += 1
parse_table(table_lines)
continue
# 标题处理
if line.startswith('# '):
text = line[2:].strip()
story.append(Paragraph(text, title_style))
elif line.startswith('## '):
text = line[3:].strip()
story.append(Paragraph(text, heading2_style))
if line.startswith('#### '):
text = process_inline_markdown(line[5:].strip())
h4_style = ParagraphStyle(
'ChineseHeading4',
parent=heading3_style,
fontSize=11,
)
story.append(Paragraph(text, h4_style))
elif line.startswith('### '):
text = line[4:].strip()
text = process_inline_markdown(line[4:].strip())
story.append(Paragraph(text, heading3_style))
elif line.startswith('## '):
text = process_inline_markdown(line[3:].strip())
story.append(Paragraph(text, heading2_style))
# 列表处理
elif line.startswith('# '):
text = process_inline_markdown(line[2:].strip())
story.append(Paragraph(text, title_style))
# 引用块处理
elif line.strip().startswith('>'):
quote_text = line.strip()[1:].strip()
processed = process_inline_markdown(quote_text)
story.append(Paragraph(processed, blockquote_style))
# 无序列表处理(包括任务列表)
elif line.strip().startswith('- ') or line.strip().startswith('* '):
text = line.strip()[2:]
content = line.strip()[2:].strip()
# 检查是否是任务列表 [ ] 或 [x]
task_checked = None
if content.startswith('[ ]'):
# 未完成的任务
task_text = content[2:].strip()
task_checked = False
elif content.startswith('[x]') or content.startswith('[X]'):
# 已完成的任务
task_text = content[2:].strip()
task_checked = True
else:
# 普通列表项
task_text = content
task_checked = None
text = process_inline_markdown(task_text)
if task_checked is True:
# 使用复选框符号表示已完成
story.append(Paragraph(f'{text}', normal_style))
elif task_checked is False:
# 使用复选框符号表示未完成
story.append(Paragraph(f'{text}', normal_style))
else:
# 普通列表项
story.append(Paragraph(f'{text}', normal_style))
elif re.match(r'^\d+\.\s', line.strip()):
text = re.sub(r'^\d+\.\s', '', line.strip())
story.append(Paragraph(text, normal_style))
# 有序列表处理
elif re.match(r'^\s*\d+\.\s', line.strip()):
match = re.match(r'^\s*(\d+)\.\s(.*)$', line.strip())
if match:
num = match.group(1)
text = process_inline_markdown(match.group(2))
story.append(Paragraph(f'{num}. {text}', normal_style))
# 分隔线
elif line.strip() in ['---', '***', '___']:
# 使用 HRFlowable 绘制水平分割线
story.append(Spacer(1, 3*mm))
story.append(HRFlowable(
width="100%",
thickness=0.5,
lineCap='round',
color=colors.grey,
spaceBefore=1*mm,
spaceAfter=3*mm,
))
# 空行
elif not line.strip():
story.append(Spacer(1, 3*mm))
# 普通段落
story.append(Spacer(1, 2*mm))
# 普通段落(可能跨多行)
elif line.strip():
# 处理粗体和斜体
text = line.strip()
text = re.sub(r'\*\*(.+?)\*\*', r'<b>\1</b>', text)
text = re.sub(r'\*(.+?)\*', r'<i>\1</i>', text)
text = re.sub(r'`(.+?)`', r'<font face="Courier">\1</font>', text)
story.append(Paragraph(text, normal_style))
# 收集连续的非空行作为段落
paragraph_lines = [line.strip()]
i += 1
while i < len(lines):
next_line = lines[i].strip()
# 遇到空行、标题、列表等特殊行时停止
if (not next_line or
next_line.startswith('#') or
next_line.startswith('>') or
next_line.startswith('-') or
next_line.startswith('*') or
next_line.startswith('```') or
re.match(r'^\d+\.\s', next_line) or
(next_line.startswith('---') or next_line.startswith('***')) or
('|' in next_line and i + 1 < len(lines) and '|' in lines[i + 1])):
break
paragraph_lines.append(next_line)
i += 1
paragraph_text = ' '.join(paragraph_lines)
processed = process_inline_markdown(paragraph_text)
story.append(Paragraph(processed, normal_style))
i -= 1 # 回退一行,因为外层会 i += 1
i += 1
# 生成 PDF
doc.build(story)

Submodule docling/docling deleted from ad97e52851