diff --git a/docling/app/services/docling_adapter.py b/docling/app/services/docling_adapter.py
index 08cf8e2..8e22089 100644
--- a/docling/app/services/docling_adapter.py
+++ b/docling/app/services/docling_adapter.py
@@ -49,14 +49,20 @@ except Exception:
try:
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
- from reportlab.lib.units import mm
- from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
+ from reportlab.lib.units import mm, cm
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, HRFlowable
from reportlab.lib import colors
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_RIGHT
+ from reportlab.platypus import KeepInFrame
+ from reportlab.pdfgen import canvas
+ from reportlab.lib.colors import HexColor
_HAS_REPORTLAB: bool = True
-except Exception:
+except Exception as e:
+ import traceback
+ print(f"[ERROR] reportlab import failed: {e}")
+ traceback.print_exc()
A4 = None
_HAS_REPORTLAB: bool = False
@@ -713,6 +719,7 @@ def _stylesheets_for(css_name: Optional[str], css_text: Optional[str]):
def _render_pdf_with_reportlab(md: str) -> bytes:
"""
使用 reportlab 生成支持中文的 PDF(纯 Python,无外部依赖)
+ 完整支持 markdown 格式:标题、列表、代码块、表格、引用等
"""
print(f"[DEBUG] _render_pdf_with_reportlab 被调用, md 长度: {len(md)}")
bio = io.BytesIO()
@@ -767,6 +774,7 @@ def _render_pdf_with_reportlab(md: str) -> bytes:
textColor=colors.black,
spaceAfter=12,
spaceBefore=12,
+ leading=22,
)
heading2_style = ParagraphStyle(
@@ -777,6 +785,18 @@ def _render_pdf_with_reportlab(md: str) -> bytes:
textColor=colors.black,
spaceAfter=10,
spaceBefore=10,
+ leading=18,
+ )
+
+ heading3_style = ParagraphStyle(
+ 'ChineseHeading3',
+ parent=styles['Heading3'],
+ fontName=chinese_font,
+ fontSize=12,
+ textColor=colors.black,
+ spaceAfter=8,
+ spaceBefore=8,
+ leading=16,
)
normal_style = ParagraphStyle(
@@ -787,69 +807,328 @@ def _render_pdf_with_reportlab(md: str) -> bytes:
textColor=colors.black,
spaceAfter=8,
wordWrap='CJK', # 中文换行支持
+ leading=14,
)
- code_style = ParagraphStyle(
- 'ChineseCode',
- parent=styles['Code'],
+ blockquote_style = ParagraphStyle(
+ 'ChineseBlockquote',
+ parent=normal_style,
+ fontName=chinese_font,
+ leftIndent=10*mm,
+ textColor=colors.Color(0.4, 0.4, 0.4),
+ spaceAfter=8,
+ backColor=colors.Color(0.95, 0.95, 0.95),
+ )
+
+ code_block_style = ParagraphStyle(
+ 'ChineseCodeBlock',
+ parent=normal_style,
fontName='Courier',
- fontSize=9,
+ fontSize=8,
textColor=colors.black,
- backColor=colors.lightgrey,
- leftIndent=10,
+ backColor=colors.Color(0.98, 0.98, 0.98),
+ leftIndent=5*mm,
+ rightIndent=5*mm,
+ spaceAfter=10,
+ spaceBefore=10,
+ leading=12,
)
# 解析 markdown
lines = md.split('\n')
+ i = 0
in_code_block = False
+ code_lang = ''
code_lines = []
- for line in lines:
+    def process_inline_markdown(text: str) -> str:
+        """Convert inline markdown in ``text`` to reportlab paragraph markup.
+
+        Handles inline code, bold, italic and ``[text](url)`` links. The input
+        is XML-escaped first because ``Paragraph`` parses its text as markup,
+        so a stray ``<`` or ``&`` from the document must not reach it raw.
+
+        Args:
+            text: Raw markdown for one line or one joined paragraph.
+
+        Returns:
+            The text with markdown markers replaced by inline markup tags.
+        """
+        # Generated tags are parked behind placeholder keys so that a later
+        # pattern cannot match inside markup produced by an earlier one.
+        placeholders = {}
+
+        def save_placeholder(content: str) -> str:
+            key = f"__PLACEHOLDER_{len(placeholders)}__"
+            placeholders[key] = content
+            return key
+
+        # Escape '&' first so the entities written for '<' and '>' are not
+        # escaped a second time.
+        text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
+
+        # Inline code goes first so markdown markers inside backticks stay
+        # literal. The content is already escaped by the step above.
+        text = re.sub(
+            r'`([^`]+)`',
+            lambda m: save_placeholder(f'<font name="Courier">{m.group(1)}</font>'),
+            text,
+        )
+
+        # Bold before italic: otherwise "**x**" would be consumed as "*...*".
+        text = re.sub(
+            r'\*\*([^*]+)\*\*',
+            lambda m: save_placeholder(f'<b>{m.group(1)}</b>'),
+            text,
+        )
+        text = re.sub(
+            r'\*([^*]+)\*',
+            lambda m: save_placeholder(f'<i>{m.group(1)}</i>'),
+            text,
+        )
+
+        def replace_link(match):
+            link_text = match.group(1)
+            # The URL is written into a double-quoted attribute value.
+            url = match.group(2).replace('"', '&quot;')
+            return save_placeholder(
+                f'<a href="{url}" color="blue"><u>{link_text}</u></a>'
+            )
+        text = re.sub(r'\[([^\]]+)\]\(([^\)]+)\)', replace_link, text)
+
+        # Expand newest-first: a later placeholder's value may embed earlier
+        # keys (e.g. bold text containing inline code), never the reverse.
+        for key, value in reversed(list(placeholders.items())):
+            text = text.replace(key, value)
+
+        return text
+
+
+    def parse_table(table_lines: list) -> None:
+        """Render a markdown table and append it to ``story``.
+
+        Args:
+            table_lines: Header row, delimiter row, then zero or more data
+                rows. Leading and trailing pipes are optional on every row.
+        """
+        if not table_lines:
+            return
+
+        def split_row(row: str) -> list:
+            """Split one row into stripped cell strings, ignoring outer pipes."""
+            row = row.strip()
+            if row.startswith('|'):
+                row = row[1:]
+            if row.endswith('|'):
+                row = row[:-1]
+            return [cell.strip() for cell in row.split('|')]
+
+        header_cells = split_row(table_lines[0])
+        num_cols = len(header_cells)
+
+        # Column alignment is encoded by the colons in the delimiter row.
+        alignments = []
+        if len(table_lines) > 1:
+            for part in split_row(table_lines[1]):
+                if part.startswith(':') and part.endswith(':'):
+                    alignments.append(TA_CENTER)
+                elif part.endswith(':'):
+                    alignments.append(TA_RIGHT)
+                else:
+                    alignments.append(TA_LEFT)
+        # Exactly one alignment per header column, defaulting to left.
+        alignments = (alignments + [TA_LEFT] * num_cols)[:num_cols]
+
+        # Cells are Paragraph flowables; their alignment and text colour are
+        # set on per-column ParagraphStyles rather than via TableStyle.
+        # NOTE(review): assumes TableStyle text commands do not reach flowable
+        # cells -- confirm against the reportlab user guide.
+        header_styles = [
+            ParagraphStyle(
+                f'TableHeaderCol{idx}',
+                parent=normal_style,
+                alignment=align,
+                textColor=colors.white,
+            )
+            for idx, align in enumerate(alignments)
+        ]
+        body_styles = [
+            ParagraphStyle(f'TableBodyCol{idx}', parent=normal_style, alignment=align)
+            for idx, align in enumerate(alignments)
+        ]
+
+        def make_row(cells: list, cell_styles: list) -> list:
+            # Table needs rectangular data: pad short rows, drop surplus cells.
+            cells = (cells + [''] * num_cols)[:num_cols]
+            return [
+                Paragraph(process_inline_markdown(cell), style)
+                for cell, style in zip(cells, cell_styles)
+            ]
+
+        # Header first, then every data row (index 1 is the delimiter row).
+        row_data = [make_row(header_cells, header_styles)]
+        row_data.extend(
+            make_row(split_row(line), body_styles)
+            for line in table_lines[2:]
+            if '|' in line
+        )
+
+        # Equal-width columns across the page width minus 40 mm of margins.
+        col_widths = [(A4[0] - 40*mm) / num_cols] * num_cols
+
+        table_style = TableStyle([
+            # Header row
+            ('BACKGROUND', (0, 0), (-1, 0), colors.Color(0.4, 0.6, 0.9)),
+            ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
+            ('FONTNAME', (0, 0), (-1, 0), chinese_font),
+            ('FONTSIZE', (0, 0), (-1, 0), 10),
+            ('BOTTOMPADDING', (0, 0), (-1, 0), 8),
+            ('TOPPADDING', (0, 0), (-1, 0), 8),
+            ('LEFTPADDING', (0, 0), (-1, -1), 6),
+            ('RIGHTPADDING', (0, 0), (-1, -1), 6),
+            # Header borders
+            ('LINEABOVE', (0, 0), (-1, 0), 1, colors.black),
+            ('LINEBELOW', (0, 0), (-1, 0), 1, colors.black),
+            ('LINEBEFORE', (0, 0), (0, -1), 0.5, colors.grey),
+            ('LINEAFTER', (-1, 0), (-1, -1), 0.5, colors.grey),
+            # Data rows
+            ('BACKGROUND', (0, 1), (-1, -1), colors.white),
+            ('FONTNAME', (0, 1), (-1, -1), chinese_font),
+            ('FONTSIZE', (0, 1), (-1, -1), 9),
+            ('TOPPADDING', (0, 1), (-1, -1), 6),
+            ('BOTTOMPADDING', (0, 1), (-1, -1), 6),
+            # Zebra striping on data rows
+            ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.Color(0.95, 0.95, 0.98)]),
+            # Grid lines
+            ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
+            ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
+        ])
+
+        t = Table(row_data, colWidths=col_widths)
+        t.setStyle(table_style)
+        story.append(t)
+        story.append(Spacer(1, 8*mm))
+
+ while i < len(lines):
+ line = lines[i]
+
# 代码块处理
if line.strip().startswith('```'):
if in_code_block:
# 代码块结束
code_text = '\n'.join(code_lines)
- story.append(Paragraph(code_text.replace('<', '<').replace('>', '>'), code_style))
- story.append(Spacer(1, 6*mm))
+                # Paragraph reflows its text, which would collapse the line
+                # breaks and indentation of a code block. Preformatted draws
+                # the text as written and does not parse markup, so the raw
+                # code is passed through without escaping.
+                from reportlab.platypus import Preformatted
+                # ~90 Courier characters at 8 pt fit the frame (page width
+                # minus 40 mm margins and 2 x 5 mm indents); longer lines are
+                # split instead of running off the page.
+                story.append(Preformatted(code_text, code_block_style, maxLineLength=90))
+                story.append(Spacer(1, 3*mm))
code_lines = []
in_code_block = False
else:
in_code_block = True
+ code_lang = line.strip()[3:] # 获取语言标识
+ i += 1
continue
if in_code_block:
code_lines.append(line)
+ i += 1
continue
+        # Table handling: a header row followed by a delimiter row.
+        if '|' in line and i + 1 < len(lines) and '|' in lines[i + 1]:
+            next_line = lines[i + 1].strip()
+            # Delimiter row such as "| --- | :---: | ---: |". Whitespace is
+            # allowed on both sides of every pipe; outer pipes are optional.
+            if re.match(r'^\|?\s*:?-+:?\s*(\|\s*:?-+:?\s*)*\|?$', next_line):
+                table_lines = [line, next_line]  # header + delimiter
+                i += 2
+                # Collect the data rows: consecutive lines that contain a pipe.
+                while i < len(lines) and '|' in lines[i] and not lines[i].strip().startswith('```'):
+                    table_lines.append(lines[i])
+                    i += 1
+                parse_table(table_lines)
+                continue
+
# 标题处理
- if line.startswith('# '):
- text = line[2:].strip()
- story.append(Paragraph(text, title_style))
- elif line.startswith('## '):
- text = line[3:].strip()
- story.append(Paragraph(text, heading2_style))
+ if line.startswith('#### '):
+ text = process_inline_markdown(line[5:].strip())
+ h4_style = ParagraphStyle(
+ 'ChineseHeading4',
+ parent=heading3_style,
+ fontSize=11,
+ )
+ story.append(Paragraph(text, h4_style))
elif line.startswith('### '):
- text = line[4:].strip()
+ text = process_inline_markdown(line[4:].strip())
+ story.append(Paragraph(text, heading3_style))
+ elif line.startswith('## '):
+ text = process_inline_markdown(line[3:].strip())
story.append(Paragraph(text, heading2_style))
- # 列表处理
+ elif line.startswith('# '):
+ text = process_inline_markdown(line[2:].strip())
+ story.append(Paragraph(text, title_style))
+
+ # 引用块处理
+ elif line.strip().startswith('>'):
+ quote_text = line.strip()[1:].strip()
+ processed = process_inline_markdown(quote_text)
+ story.append(Paragraph(processed, blockquote_style))
+
+ # 无序列表处理(包括任务列表)
elif line.strip().startswith('- ') or line.strip().startswith('* '):
- text = line.strip()[2:]
- story.append(Paragraph(f'• {text}', normal_style))
- elif re.match(r'^\d+\.\s', line.strip()):
- text = re.sub(r'^\d+\.\s', '', line.strip())
- story.append(Paragraph(text, normal_style))
+            content = line.strip()[2:].strip()
+
+            # A task-list item opens with a 3-character checkbox ("[ ]", "[x]"
+            # or "[X]") followed by whitespace or the end of the line. All
+            # three characters must be skipped, otherwise the closing "]"
+            # leaks into the rendered text. Requiring the whitespace keeps a
+            # bullet like "- [x](url)" an ordinary list item with a link.
+            marker, rest = content[:3], content[3:]
+            is_task = marker in ('[ ]', '[x]', '[X]') and (not rest or rest[0].isspace())
+            if is_task:
+                bullet = '☐' if marker == '[ ]' else '☑'
+                task_text = rest.strip()
+            else:
+                bullet = '•'
+                task_text = content
+
+            text = process_inline_markdown(task_text)
+            story.append(Paragraph(f'{bullet} {text}', normal_style))
+
+ # 有序列表处理
+ elif re.match(r'^\s*\d+\.\s', line.strip()):
+ match = re.match(r'^\s*(\d+)\.\s(.*)$', line.strip())
+ if match:
+ num = match.group(1)
+ text = process_inline_markdown(match.group(2))
+ story.append(Paragraph(f'{num}. {text}', normal_style))
+
+ # 分隔线
+ elif line.strip() in ['---', '***', '___']:
+ # 使用 HRFlowable 绘制水平分割线
+ story.append(Spacer(1, 3*mm))
+ story.append(HRFlowable(
+ width="100%",
+ thickness=0.5,
+ lineCap='round',
+ color=colors.grey,
+ spaceBefore=1*mm,
+ spaceAfter=3*mm,
+ ))
+
# 空行
elif not line.strip():
- story.append(Spacer(1, 3*mm))
- # 普通段落
+ story.append(Spacer(1, 2*mm))
+
+ # 普通段落(可能跨多行)
elif line.strip():
- # 处理粗体和斜体
- text = line.strip()
- text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
- text = re.sub(r'\*(.+?)\*', r'\1', text)
- text = re.sub(r'`(.+?)`', r'\1', text)
- story.append(Paragraph(text, normal_style))
+ # 收集连续的非空行作为段落
+ paragraph_lines = [line.strip()]
+ i += 1
+ while i < len(lines):
+ next_line = lines[i].strip()
+ # 遇到空行、标题、列表等特殊行时停止
+ if (not next_line or
+ next_line.startswith('#') or
+ next_line.startswith('>') or
+ next_line.startswith('-') or
+ next_line.startswith('*') or
+ next_line.startswith('```') or
+ re.match(r'^\d+\.\s', next_line) or
+ (next_line.startswith('---') or next_line.startswith('***')) or
+ ('|' in next_line and i + 1 < len(lines) and '|' in lines[i + 1])):
+ break
+ paragraph_lines.append(next_line)
+ i += 1
+
+ paragraph_text = ' '.join(paragraph_lines)
+ processed = process_inline_markdown(paragraph_text)
+ story.append(Paragraph(processed, normal_style))
+ i -= 1 # 回退一行,因为外层会 i += 1
+
+ i += 1
# 生成 PDF
doc.build(story)
diff --git a/docling/docling b/docling/docling
deleted file mode 160000
index ad97e52..0000000
--- a/docling/docling
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit ad97e5285126388847ba9a219ac73f006c759f09