diff --git a/docling/app/services/docling_adapter.py b/docling/app/services/docling_adapter.py index 08cf8e2..8e22089 100644 --- a/docling/app/services/docling_adapter.py +++ b/docling/app/services/docling_adapter.py @@ -49,14 +49,20 @@ except Exception: try: from reportlab.lib.pagesizes import A4 from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle - from reportlab.lib.units import mm - from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak + from reportlab.lib.units import mm, cm + from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, HRFlowable from reportlab.lib import colors from reportlab.pdfbase import pdfmetrics from reportlab.pdfbase.ttfonts import TTFont from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_RIGHT + from reportlab.platypus import KeepInFrame + from reportlab.pdfgen import canvas + from reportlab.lib.colors import HexColor _HAS_REPORTLAB: bool = True -except Exception: +except Exception as e: + import traceback + print(f"[ERROR] reportlab import failed: {e}") + traceback.print_exc() A4 = None _HAS_REPORTLAB: bool = False @@ -713,6 +719,7 @@ def _stylesheets_for(css_name: Optional[str], css_text: Optional[str]): def _render_pdf_with_reportlab(md: str) -> bytes: """ 使用 reportlab 生成支持中文的 PDF(纯 Python,无外部依赖) + 完整支持 markdown 格式:标题、列表、代码块、表格、引用等 """ print(f"[DEBUG] _render_pdf_with_reportlab 被调用, md 长度: {len(md)}") bio = io.BytesIO() @@ -767,6 +774,7 @@ def _render_pdf_with_reportlab(md: str) -> bytes: textColor=colors.black, spaceAfter=12, spaceBefore=12, + leading=22, ) heading2_style = ParagraphStyle( @@ -777,6 +785,18 @@ def _render_pdf_with_reportlab(md: str) -> bytes: textColor=colors.black, spaceAfter=10, spaceBefore=10, + leading=18, + ) + + heading3_style = ParagraphStyle( + 'ChineseHeading3', + parent=styles['Heading3'], + fontName=chinese_font, + fontSize=12, + textColor=colors.black, + spaceAfter=8, + spaceBefore=8, + leading=16, ) 
def process_inline_markdown(text: str) -> str:
    """Convert inline Markdown in *text* to ReportLab paragraph markup.

    Supported constructs are inline code (`` `x` ``), bold (``**x**``),
    italic (``*x*``) and links (``[label](url)``).  The result is meant to be
    passed to ``reportlab.platypus.Paragraph``, whose mini-XML dialect
    understands ``<b>``, ``<i>``, ``<u>``, ``<font>`` and ``<link>``.

    Args:
        text: One logical line (or joined paragraph) of Markdown source.

    Returns:
        The text with ``&``, ``<`` and ``>`` escaped and the inline Markdown
        replaced by the corresponding markup tags.
    """
    # Generated markup, indexed by placeholder number.  Every converted
    # construct is swapped for an opaque token so that later regexes cannot
    # re-interpret characters inside markup that has already been produced.
    fragments = []

    def stash(markup: str) -> str:
        fragments.append(markup)
        # Distinct start/end control characters keep tokens unambiguous even
        # when two of them are separated only by digits.
        return f"\x02{len(fragments) - 1}\x03"

    # The token delimiters must never come from user input.
    text = text.translate({0x02: None, 0x03: None})

    # Escape the source text.  '&' has to go first, otherwise the entities
    # produced for '<' and '>' would be escaped a second time.
    text = (
        text.replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
    )

    # Inline code is extracted first so that '*' or '[' inside a code span is
    # not mistaken for emphasis or a link.  Its content is already escaped.
    text = re.sub(
        r'`([^`]+)`',
        lambda m: stash(f'<font name="Courier">{m.group(1)}</font>'),
        text,
    )

    # Emphasis delimiters must hug their content ("* x *" is not emphasis),
    # which keeps arithmetic such as "2 * 3 * 4" untouched.
    text = re.sub(
        r'\*\*(?!\s)([^*]+?)(?<!\s)\*\*',
        lambda m: stash(f'<b>{m.group(1)}</b>'),
        text,
    )
    text = re.sub(
        r'\*(?!\s)([^*]+?)(?<!\s)\*',
        lambda m: stash(f'<i>{m.group(1)}</i>'),
        text,
    )

    def replace_link(match):
        label, url = match.group(1), match.group(2)
        # The URL lands inside a double-quoted attribute value.
        href = url.replace('"', '&quot;')
        # Blue + underline for the usual hyperlink look; <link href> makes
        # the span clickable in the generated PDF.
        return stash(f'<link href="{href}" color="blue"><u>{label}</u></link>')

    text = re.sub(r'\[([^\]]+)\]\(([^\)]+)\)', replace_link, text)

    # A fragment can only contain tokens created *before* it (e.g. bold that
    # wraps a code span), so expanding from newest to oldest resolves every
    # nesting level in a single pass.
    for idx in range(len(fragments) - 1, -1, -1):
        text = text.replace(f"\x02{idx}\x03", fragments[idx])

    return text


def parse_table(table_lines: list) -> None:
    """Render a Markdown pipe table and append it to ``story``.

    Args:
        table_lines: Raw Markdown lines of one table: the header row, the
            ``---`` separator row, then zero or more data rows.

    The function returns nothing; it appends a ``Table`` followed by a
    ``Spacer`` to the enclosing ``story`` list.  It relies on ``story``,
    ``normal_style`` and ``chinese_font`` from the enclosing scope.
    """
    if not table_lines:
        return

    def split_row(row: str) -> list:
        # Outer pipes are optional in Markdown tables ("a | b" is valid), so
        # strip them only when present instead of blindly dropping the first
        # and last split element.
        body = row.strip()
        if body.startswith('|'):
            body = body[1:]
        if body.endswith('|'):
            body = body[:-1]
        return [cell.strip() for cell in body.split('|')]

    header_cells = split_row(table_lines[0])
    num_cols = len(header_cells)

    # Column alignment comes from the separator row: ":--" / "--" is left,
    # ":-:" is centred, "--:" is right.
    alignments = []
    if len(table_lines) > 1:
        for spec in split_row(table_lines[1]):
            if spec.startswith(':') and spec.endswith(':'):
                alignments.append('CENTER')
            elif spec.endswith(':'):
                alignments.append('RIGHT')
            else:
                alignments.append('LEFT')
    # Exactly one alignment per header column.
    alignments = (alignments + ['LEFT'] * num_cols)[:num_cols]

    # Cells are Paragraph flowables, which are drawn with their own
    # ParagraphStyle; TableStyle ALIGN / TEXTCOLOR do not restyle the text
    # inside them.  Alignment and the white header text therefore have to be
    # carried by per-column paragraph styles.
    align_codes = {'LEFT': TA_LEFT, 'CENTER': TA_CENTER, 'RIGHT': TA_RIGHT}
    body_styles = [
        ParagraphStyle(
            f'ChineseTableCell{col}',
            parent=normal_style,
            alignment=align_codes[align],
        )
        for col, align in enumerate(alignments)
    ]
    header_styles = [
        ParagraphStyle(
            f'ChineseTableHeader{col}',
            parent=cell_style,
            textColor=colors.white,
        )
        for col, cell_style in enumerate(body_styles)
    ]

    row_data = [[
        Paragraph(process_inline_markdown(cell), header_styles[col])
        for col, cell in enumerate(header_cells)
    ]]
    # Data rows start after the separator row.
    for raw in table_lines[2:]:
        if '|' not in raw:
            continue
        cells = split_row(raw)
        # Pad or truncate ragged rows so every row has as many cells as
        # there are column widths.
        cells = (cells + [''] * num_cols)[:num_cols]
        row_data.append([
            Paragraph(process_inline_markdown(cell), body_styles[col])
            for col, cell in enumerate(cells)
        ])

    # Equal-width columns across the page, minus 20 mm margin on each side.
    col_widths = [(A4[0] - 40*mm) / num_cols] * num_cols

    table_style = TableStyle([
        # Header row
        ('BACKGROUND', (0, 0), (-1, 0), colors.Color(0.4, 0.6, 0.9)),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
        ('FONTNAME', (0, 0), (-1, 0), chinese_font),
        ('FONTSIZE', (0, 0), (-1, 0), 10),
        ('BOTTOMPADDING', (0, 0), (-1, 0), 8),
        ('TOPPADDING', (0, 0), (-1, 0), 8),
        ('LEFTPADDING', (0, 0), (-1, -1), 6),
        ('RIGHTPADDING', (0, 0), (-1, -1), 6),
        # Header borders
        ('LINEABOVE', (0, 0), (-1, 0), 1, colors.black),
        ('LINEBELOW', (0, 0), (-1, 0), 1, colors.black),
        ('LINEBEFORE', (0, 0), (0, -1), 0.5, colors.grey),
        ('LINEAFTER', (-1, 0), (-1, -1), 0.5, colors.grey),
        # Data rows
        ('BACKGROUND', (0, 1), (-1, -1), colors.white),
        ('FONTNAME', (0, 1), (-1, -1), chinese_font),
        ('FONTSIZE', (0, 1), (-1, -1), 9),
        ('TOPPADDING', (0, 1), (-1, -1), 6),
        ('BOTTOMPADDING', (0, 1), (-1, -1), 6),
        # Zebra striping (alternating row backgrounds)
        ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.Color(0.95, 0.95, 0.98)]),
        # Grid lines
        ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
        ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
    ])

    # Kept for any plain-string cells; Paragraph cells use the styles above.
    for col_idx, align in enumerate(alignments):
        table_style.add('ALIGN', (col_idx, 0), (col_idx, -1), align)

    t = Table(row_data, colWidths=col_widths)
    t.setStyle(table_style)
    story.append(t)
    story.append(Spacer(1, 8*mm))
table_lines.append(lines[i]) + i += 1 + parse_table(table_lines) + continue + # 标题处理 - if line.startswith('# '): - text = line[2:].strip() - story.append(Paragraph(text, title_style)) - elif line.startswith('## '): - text = line[3:].strip() - story.append(Paragraph(text, heading2_style)) + if line.startswith('#### '): + text = process_inline_markdown(line[5:].strip()) + h4_style = ParagraphStyle( + 'ChineseHeading4', + parent=heading3_style, + fontSize=11, + ) + story.append(Paragraph(text, h4_style)) elif line.startswith('### '): - text = line[4:].strip() + text = process_inline_markdown(line[4:].strip()) + story.append(Paragraph(text, heading3_style)) + elif line.startswith('## '): + text = process_inline_markdown(line[3:].strip()) story.append(Paragraph(text, heading2_style)) - # 列表处理 + elif line.startswith('# '): + text = process_inline_markdown(line[2:].strip()) + story.append(Paragraph(text, title_style)) + + # 引用块处理 + elif line.strip().startswith('>'): + quote_text = line.strip()[1:].strip() + processed = process_inline_markdown(quote_text) + story.append(Paragraph(processed, blockquote_style)) + + # 无序列表处理(包括任务列表) elif line.strip().startswith('- ') or line.strip().startswith('* '): - text = line.strip()[2:] - story.append(Paragraph(f'• {text}', normal_style)) - elif re.match(r'^\d+\.\s', line.strip()): - text = re.sub(r'^\d+\.\s', '', line.strip()) - story.append(Paragraph(text, normal_style)) + content = line.strip()[2:].strip() + + # 检查是否是任务列表 [ ] 或 [x] + task_checked = None + if content.startswith('[ ]'): + # 未完成的任务 + task_text = content[2:].strip() + task_checked = False + elif content.startswith('[x]') or content.startswith('[X]'): + # 已完成的任务 + task_text = content[2:].strip() + task_checked = True + else: + # 普通列表项 + task_text = content + task_checked = None + + text = process_inline_markdown(task_text) + + if task_checked is True: + # 使用复选框符号表示已完成 + story.append(Paragraph(f'☑ {text}', normal_style)) + elif task_checked is False: + # 使用复选框符号表示未完成 + 
story.append(Paragraph(f'☐ {text}', normal_style)) + else: + # 普通列表项 + story.append(Paragraph(f'• {text}', normal_style)) + + # 有序列表处理 + elif re.match(r'^\s*\d+\.\s', line.strip()): + match = re.match(r'^\s*(\d+)\.\s(.*)$', line.strip()) + if match: + num = match.group(1) + text = process_inline_markdown(match.group(2)) + story.append(Paragraph(f'{num}. {text}', normal_style)) + + # 分隔线 + elif line.strip() in ['---', '***', '___']: + # 使用 HRFlowable 绘制水平分割线 + story.append(Spacer(1, 3*mm)) + story.append(HRFlowable( + width="100%", + thickness=0.5, + lineCap='round', + color=colors.grey, + spaceBefore=1*mm, + spaceAfter=3*mm, + )) + # 空行 elif not line.strip(): - story.append(Spacer(1, 3*mm)) - # 普通段落 + story.append(Spacer(1, 2*mm)) + + # 普通段落(可能跨多行) elif line.strip(): - # 处理粗体和斜体 - text = line.strip() - text = re.sub(r'\*\*(.+?)\*\*', r'\1', text) - text = re.sub(r'\*(.+?)\*', r'\1', text) - text = re.sub(r'`(.+?)`', r'\1', text) - story.append(Paragraph(text, normal_style)) + # 收集连续的非空行作为段落 + paragraph_lines = [line.strip()] + i += 1 + while i < len(lines): + next_line = lines[i].strip() + # 遇到空行、标题、列表等特殊行时停止 + if (not next_line or + next_line.startswith('#') or + next_line.startswith('>') or + next_line.startswith('-') or + next_line.startswith('*') or + next_line.startswith('```') or + re.match(r'^\d+\.\s', next_line) or + (next_line.startswith('---') or next_line.startswith('***')) or + ('|' in next_line and i + 1 < len(lines) and '|' in lines[i + 1])): + break + paragraph_lines.append(next_line) + i += 1 + + paragraph_text = ' '.join(paragraph_lines) + processed = process_inline_markdown(paragraph_text) + story.append(Paragraph(processed, normal_style)) + i -= 1 # 回退一行,因为外层会 i += 1 + + i += 1 # 生成 PDF doc.build(story) diff --git a/docling/docling b/docling/docling deleted file mode 160000 index ad97e52..0000000 --- a/docling/docling +++ /dev/null @@ -1 +0,0 @@ -Subproject commit ad97e5285126388847ba9a219ac73f006c759f09