from pathlib import Path from typing import Optional, Tuple, Dict, List, Any from urllib.parse import urlparse, unquote import os import re import io from bs4 import BeautifulSoup from bs4.element import PageElement import marko import sys try: _DOC_BASE = Path(__file__).resolve().parents[2] / "docling" p = str(_DOC_BASE) if p not in sys.path: sys.path.insert(0, p) except Exception: pass try: from docling.document_converter import DocumentConverter except Exception: class DocumentConverter: # type: ignore def __init__(self, *args, **kwargs): pass def convert(self, source): raise RuntimeError("docling not available") from docx import Document from docx.shared import Mm, Pt from docx.enum.section import WD_SECTION from docx.enum.text import WD_PARAGRAPH_ALIGNMENT from docx.oxml import OxmlElement from docx.oxml.ns import qn from urllib.request import urlopen import json try: from weasyprint import HTML, CSS # type: ignore except Exception: HTML = None CSS = None try: from xhtml2pdf import pisa as _pisa # type: ignore _HAS_XHTML2PDF: bool = True except Exception: _pisa = None # type: ignore _HAS_XHTML2PDF: bool = False # reportlab 用于生成支持中文的 PDF try: from reportlab.lib.pagesizes import A4 from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle from reportlab.lib.units import mm from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak from reportlab.lib import colors from reportlab.pdfbase import pdfmetrics from reportlab.pdfbase.ttfonts import TTFont from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_RIGHT _HAS_REPORTLAB: bool = True except Exception: A4 = None _HAS_REPORTLAB: bool = False _mdit: Any = None _tasklists_plugin: Any = None _deflist_plugin: Any = None _footnote_plugin: Any = None _attrs_plugin: Any = None _HAS_MD_IT: bool = False try: import markdown_it as _mdit # type: ignore from mdit_py_plugins.tasklists import tasklists_plugin as _tasklists_plugin # type: ignore from mdit_py_plugins.deflist import deflist_plugin as _deflist_plugin # type: ignore from mdit_py_plugins.footnote import footnote_plugin as _footnote_plugin # type: ignore from mdit_py_plugins.attrs import attrs_plugin as _attrs_plugin # type: ignore _HAS_MD_IT = True except Exception: pass converter = DocumentConverter() LINKMAP_PATH = Path(__file__).resolve().parent.parent / "configs" / "linkmap" / "linkmap.json" _LINKMAP: Dict[str, str] = {} def load_linkmap() -> Dict[str, str]: global _LINKMAP try: if LINKMAP_PATH.exists(): _LINKMAP = json.loads(LINKMAP_PATH.read_text("utf-8")) or {} except Exception: _LINKMAP = {} return _LINKMAP def save_linkmap(mapping: Dict[str, str]) -> None: LINKMAP_PATH.parent.mkdir(parents=True, exist_ok=True) LINKMAP_PATH.write_text(json.dumps(mapping, ensure_ascii=False, indent=2), "utf-8") load_linkmap() def resolve_link(href: Optional[str], data_doc: Optional[str]) -> Optional[str]: if href: return href if not _LINKMAP: load_linkmap() if data_doc and data_doc in _LINKMAP: return _LINKMAP[data_doc] return None def export_payload(doc, fmt: str) -> Tuple[str, str]: f = fmt.lower() if f == "markdown": return doc.export_to_markdown(), "text/markdown" if f == "html": return doc.export_to_html(), "text/html" if f == "json": return doc.export_to_json(), "application/json" if f == "doctags": return doc.export_to_doctags(), "application/json" raise ValueError("unsupported export") def infer_basename(source_url: Optional[str], upload_name: Optional[str]) -> str: if source_url: path = urlparse(source_url).path name = os.path.basename(path) or "document" name = unquote(name) return os.path.splitext(name)[0] or "document" if upload_name: name = os.path.splitext(os.path.basename(upload_name))[0] or "document" return name return "document" def sanitize_filename(name: Optional[str]) -> str: if not name: return "document" name = name.strip()[:128] name = re.sub(r'[<>:"/\\|?*\x00-\x1F]', "_", name) or "document" return name def convert_source(source: str, export: str) -> Tuple[str, str]: result = converter.convert(source) return export_payload(result.document, export) def md_to_docx_bytes(md: str, toc: bool = False, header_text: Optional[str] = None, footer_text: Optional[str] = None, logo_url: Optional[str] = None, copyright_text: Optional[str] = None, filename_text: Optional[str] = None, cover_src: Optional[str] = None, product_name: Optional[str] = None, document_name: Optional[str] = None, product_version: Optional[str] = None, document_version: Optional[str] = None) -> bytes: try: import logging as _log _log.info(f"md_to_docx_bytes start toc={toc} header={bool(header_text)} footer={bool(footer_text)} logo={bool(logo_url)} cover={bool(cover_src)}") except Exception: pass def _add_field(paragraph, instr: str): r1 = paragraph.add_run() b = OxmlElement('w:fldChar') b.set(qn('w:fldCharType'), 'begin') r1._r.append(b) r2 = paragraph.add_run() t = OxmlElement('w:instrText') t.set(qn('xml:space'), 'preserve') t.text = instr r2._r.append(t) r3 = paragraph.add_run() e = OxmlElement('w:fldChar') e.set(qn('w:fldCharType'), 'end') r3._r.append(e) def _available_width(section) -> int: return section.page_width - section.left_margin - section.right_margin def _fetch_bytes(u: str) -> Optional[bytes]: try: if u.lower().startswith('http://') or u.lower().startswith('https://'): with urlopen(u, timeout=10) as r: return r.read() p = Path(u) if p.exists() and p.is_file(): return p.read_bytes() except Exception: return None return None html = normalize_html(md, options={ "toc": "1" if toc else "", "header_text": header_text, "footer_text": footer_text, "logo_url": logo_url, "copyright_text": copyright_text, "filename_text": filename_text, "cover_src": cover_src, "product_name": product_name, "document_name": document_name, "product_version": product_version, "document_version": document_version, }) try: import logging as _log _log.info(f"md_to_docx_bytes normalize_html length={len(html)}") except Exception: pass soup = BeautifulSoup(html, "html.parser") doc = Document() sec0 = doc.sections[0] sec0.page_width = Mm(210) sec0.page_height = Mm(297) sec0.left_margin = Mm(15) sec0.right_margin = Mm(15) sec0.top_margin = Mm(20) sec0.bottom_margin = Mm(20) has_cover = bool(cover_src or (soup.find('section', class_='cover') is not None)) if has_cover: sec0.left_margin = Mm(0) sec0.right_margin = Mm(0) sec0.top_margin = Mm(0) sec0.bottom_margin = Mm(0) if cover_src: b = _fetch_bytes(cover_src) if b: bio = io.BytesIO(b) doc.add_picture(bio, width=_available_width(sec0)) if product_name: p = doc.add_paragraph() r = p.add_run(product_name) r.font.size = Pt(18) r.bold = True t = document_name or None if not t: h1 = soup.body.find('h1') if soup.body else soup.find('h1') t = h1.get_text(strip=True) if h1 else '文档' p2 = doc.add_paragraph() r2 = p2.add_run(t or '文档') r2.font.size = Pt(24) r2.bold = True if filename_text: p3 = doc.add_paragraph() r3 = p3.add_run(filename_text) r3.font.size = Pt(13) meta_parts = [] if product_version: meta_parts.append("产品版本:" + product_version) if document_version: meta_parts.append("文档版本:" + document_version) if meta_parts: pm = doc.add_paragraph(" ".join(meta_parts)) pm.font = None doc.add_section(WD_SECTION.NEW_PAGE) sec = doc.sections[-1] sec.page_width = Mm(210) sec.page_height = Mm(297) sec.left_margin = Mm(15) sec.right_margin = Mm(15) sec.top_margin = Mm(20) sec.bottom_margin = Mm(20) else: sec = sec0 if header_text or logo_url or filename_text: hp = sec.header.add_paragraph() left = header_text or '' right = '' if '||' in left: parts = left.split('||', 1) left, right = parts[0], parts[1] elif '|' in left: parts = left.split('|', 1) left, right = parts[0], parts[1] if left.strip(): hp.add_run(left.strip()) if right.strip(): rp = sec.header.add_paragraph() rp.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT rp.add_run(right.strip()) elif filename_text: rp = sec.header.add_paragraph() rp.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT rp.add_run(filename_text) if footer_text or copyright_text: fp = sec.footer.add_paragraph() if footer_text: fp.add_run(footer_text) if copyright_text: cp = sec.footer.add_paragraph() cp.add_run(copyright_text) pn = sec.footer.add_paragraph() pn.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT _add_field(pn, 'PAGE') if toc: doc.add_paragraph('目录') _add_field(doc.add_paragraph(), 'TOC \\o "1-3" \\h \\z \\u') doc.add_page_break() def add_inline(p, node): if isinstance(node, str): p.add_run(node) return if node.name in ['strong', 'b']: r = p.add_run(node.get_text()) r.bold = True return if node.name in ['em', 'i']: r = p.add_run(node.get_text()) r.italic = True return if node.name == 'code': r = p.add_run(node.get_text()) r.font.name = 'Courier New' return if node.name == 'a': text = node.get_text() href = node.get('href') extra = node.get('data-doc') resolved = resolve_link(href, extra) if resolved: p.add_run(text + ' [' + resolved + ']') else: p.add_run(text) return if node.name == 'img': src = node.get('src') or '' b = _fetch_bytes(src) if b: bio = io.BytesIO(b) try: doc.add_picture(bio, width=_available_width(sec)) except Exception: pass return for c in getattr(node, 'children', []): add_inline(p, c) def process_block(el): name = getattr(el, 'name', None) if name is None: return cls = el.get('class') or [] if name == 'div' and 'doc-meta' in cls: return if name == 'section' and 'cover' in cls: return if name == 'nav' and 'toc' in cls: return if name == 'div': for child in el.children: process_block(child) return if name == 'h1': doc.add_heading(el.get_text(), level=1) return if name == 'h2' or (name == 'strong' and 'subtitle' in cls): doc.add_heading(el.get_text(), level=2) return if name == 'h3': doc.add_heading(el.get_text(), level=3) return if name == 'p': p = doc.add_paragraph() for c in el.children: add_inline(p, c) return if name in ['ul', 'ol']: for li in el.find_all('li', recursive=False): p = doc.add_paragraph(style='List Bullet') for c in li.children: add_inline(p, c) return if name == 'pre': code = el.get_text() or '' p = doc.add_paragraph() run = p.add_run(code) run.font.name = 'Courier New' return if name == 'blockquote': p = doc.add_paragraph(el.get_text()) p.paragraph_format.left_indent = Mm(10) return if name == 'table': rows = [] thead = el.find('thead') tbody = el.find('tbody') if thead: hdrs = [th.get_text(strip=True) for th in thead.find_all('th')] else: hdrs = [cell.get_text(strip=True) for cell in el.find_all('tr')[0].find_all(['th','td'])] if el.find_all('tr') else [] trs = tbody.find_all('tr') if tbody else el.find_all('tr')[1:] for tr in trs: tds = [td.get_text(strip=True) for td in tr.find_all('td')] rows.append(tds) tbl = doc.add_table(rows=1 + len(rows), cols=len(hdrs) or 1) hdr = tbl.rows[0].cells for k, h in enumerate(hdrs or ['']): hdr[k].text = h for r_idx, row in enumerate(rows): cells = tbl.rows[1 + r_idx].cells for c_idx in range(len(hdrs) or 1): cells[c_idx].text = (row[c_idx] if c_idx < len(row) else '') return if name == 'img': src = el.get('src') or '' b = _fetch_bytes(src) if b: bio = io.BytesIO(b) try: doc.add_picture(bio, width=_available_width(sec)) except Exception: pass return body = soup.body or soup for el in body.children: process_block(el) bio = io.BytesIO() try: import logging as _log _log.info("md_to_docx_bytes saving doc") except Exception: pass doc.save(bio) try: import logging as _log _log.info(f"md_to_docx_bytes done size={bio.tell()}") except Exception: pass return bio.getvalue() def md_to_pdf_bytes(md: str) -> bytes: return md_to_pdf_bytes_with_renderer(md, renderer="weasyprint") def _md_with_tables_to_html(md_text: str) -> str: lines = md_text.splitlines() out = [] i = 0 while i < len(lines): line = lines[i] def is_sep(s: str) -> bool: s = s.strip() if "|" not in s: return False s = s.strip("|") return all(set(seg.strip()) <= set("-: ") and len(seg.strip()) >= 1 for seg in s.split("|")) if "|" in line and i + 1 < len(lines) and is_sep(lines[i + 1]): headers = [c.strip() for c in line.strip().strip("|").split("|")] j = i + 2 rows = [] while j < len(lines) and "|" in lines[j]: rows.append([c.strip() for c in lines[j].strip().strip("|").split("|")]) j += 1 tbl = ["", ""] for h in headers: tbl.append(f"") tbl.append("") for row in rows: tbl.append("") for idx in range(len(headers)): cell = row[idx] if idx < len(row) else "" tbl.append(f"") tbl.append("") tbl.append("
{h}
{cell}
") out.append("".join(tbl)) i = j continue out.append(line) i += 1 return marko.convert("\n".join(out)) def _render_markdown_html(md_text: str) -> str: if _HAS_MD_IT and _mdit is not None: try: md = _mdit.MarkdownIt("commonmark").enable(["table", "strikethrough"]) if _tasklists_plugin: md.use(_tasklists_plugin) if _deflist_plugin: md.use(_deflist_plugin) if _footnote_plugin: md.use(_footnote_plugin) if _attrs_plugin: md.use(_attrs_plugin) return md.render(md_text) except Exception: pass return _md_with_tables_to_html(md_text) def normalize_html(md_or_html: str, options: Optional[Dict[str, Optional[str]]] = None) -> str: html = _render_markdown_html(md_or_html) soup = BeautifulSoup(html, "html.parser") for s in soup.find_all("strong", class_="subtitle"): s.name = "h2" s.attrs = {"data-origin": "subtitle"} for a in soup.find_all("a"): href_val = a.get("href") extra_val = a.get("data-doc") href = href_val if isinstance(href_val, str) else None extra = extra_val if isinstance(extra_val, str) else None resolved = resolve_link(href, extra) if resolved: a["href"] = resolved elif not href and extra: a.replace_with(a.get_text() + " [" + extra + "]") opts = options or {} header_text = opts.get("header_text") or None footer_text = opts.get("footer_text") or None logo_url = opts.get("logo_url") or None copyright_text = opts.get("copyright_text") or None cover_src = opts.get("cover_src") or None product_name_opt = opts.get("product_name") or None document_name_opt = opts.get("document_name") or None product_version_opt = opts.get("product_version") or None document_version_opt = opts.get("document_version") or None toc_flag = bool(opts.get("toc")) meta = soup.new_tag("div", attrs={"class": "doc-meta"}) if header_text: ht = soup.new_tag("div", attrs={"class": "doc-header-text"}) text = header_text left = text right = "" if "||" in text: parts = text.split("||", 1) left, right = parts[0], parts[1] elif "|" in text: parts = text.split("|", 1) left, right = parts[0], parts[1] if logo_url: img = soup.new_tag("img", attrs={"class": "logo-inline", "src": logo_url}) ht.append(img) hl = soup.new_tag("span", attrs={"class": "doc-header-left"}) hl.string = left ht.append(hl) if right.strip(): hr = soup.new_tag("span", attrs={"class": "doc-header-right"}) hr.string = right ht.append(hr) meta.append(ht) else: first_h1 = None if soup.body: first_h1 = soup.body.find("h1") else: first_h1 = soup.find("h1") left = (first_h1.get_text(strip=True) if first_h1 else "文档") right = opts.get("filename_text") or "" ht = soup.new_tag("div", attrs={"class": "doc-header-text"}) if logo_url: img = soup.new_tag("img", attrs={"class": "logo-inline", "src": logo_url}) ht.append(img) hl = soup.new_tag("span", attrs={"class": "doc-header-left"}) hl.string = left ht.append(hl) if right: hr = soup.new_tag("span", attrs={"class": "doc-header-right"}) hr.string = right ht.append(hr) meta.append(ht) if footer_text: ft = soup.new_tag("div", attrs={"class": "doc-footer-text"}) ft.string = footer_text meta.append(ft) page_header_val = (header_text or (document_name_opt or None)) if not page_header_val: first_h1_for_header = None if soup.body: first_h1_for_header = soup.body.find("h1") else: first_h1_for_header = soup.find("h1") page_header_val = (first_h1_for_header.get_text(strip=True) if first_h1_for_header else "文档") page_footer_val = (footer_text or "FunMD") ph = soup.new_tag("div", attrs={"class": "doc-page-header"}) if logo_url: logo_inline = soup.new_tag("img", attrs={"src": logo_url, "class": "doc-page-header-logo"}) ph.append(logo_inline) ht_inline = soup.new_tag("span", attrs={"class": "doc-page-header-text"}) ht_inline.string = page_header_val ph.append(ht_inline) meta.append(ph) pf = soup.new_tag("div", attrs={"class": "doc-page-footer"}) pf.string = page_footer_val meta.append(pf) if copyright_text: cp = soup.new_tag("div", attrs={"class": "doc-copyright"}) cp.string = copyright_text meta.append(cp) # brand logo is rendered inline within header; no separate top-left element if soup.body: soup.body.insert(0, meta) else: soup.insert(0, meta) if not soup.head: head = soup.new_tag("head") soup.insert(0, head) else: head = soup.head style_run = soup.new_tag("style") style_run.string = "@page{margin:20mm}@page{\n @top-center{content: element(page-header)}\n @bottom-center{content: element(page-footer)}\n}\n.doc-page-header{position: running(page-header); font-size:10pt; color:#666; display:block; text-align:center; width:100%}\n.doc-page-header::after{content:''; display:block; width:80%; border-bottom:1px solid #d9d9d9; margin:4px auto 0}\n.doc-page-header-logo{height:20px; vertical-align:middle; margin-right:4px}\n.doc-page-header-text{vertical-align:middle}\n.doc-page-footer{position: running(page-footer); font-size:10pt; color:#666}\n.doc-page-footer::before{content:''; display:block; width:80%; border-top:1px solid #d9d9d9; margin:0 auto 4px}" head.append(style_run) # Fallback inline styles for cover to ensure visibility even if external CSS isn't loaded if (cover_src or product_name_opt or document_name_opt or product_version_opt or document_version_opt): if not soup.head: head = soup.new_tag("head") soup.insert(0, head) else: head = soup.head style = soup.new_tag("style") style.string = "@page:first{margin:0} html,body{margin:0;padding:0}.cover{position:relative;width:210mm;height:297mm;overflow:hidden;page-break-after:always}.cover .cover-bg{position:absolute;left:0;top:0;right:0;bottom:0;width:100%;height:100%;object-fit:cover;display:block}.cover .cover-brand{position:absolute;top:20mm;left:20mm;font-size:18pt;font-weight:700;color:#1d4ed8}.cover .cover-footer{position:absolute;left:0;right:0;bottom:0;background:#1d4ed8;color:#fff;padding:12mm 20mm}.cover .cover-title{font-size:24pt;font-weight:700;margin:0}.cover .cover-subtitle{font-size:13pt;margin-top:4pt}.cover .cover-meta{margin-top:8pt;font-size:11pt;display:flex;gap:20mm}" head.append(style) if cover_src or product_name_opt or document_name_opt or product_version_opt or document_version_opt: cov = soup.new_tag("section", attrs={"class": "cover"}) if cover_src: bg = soup.new_tag("img", attrs={"class": "cover-bg", "src": cover_src}) cov.append(bg) if product_name_opt: brand_el = soup.new_tag("div", attrs={"class": "cover-brand"}) brand_el.string = product_name_opt cov.append(brand_el) footer = soup.new_tag("div", attrs={"class": "cover-footer"}) title_text = document_name_opt or None if not title_text: first_h1 = soup.body.find("h1") if soup.body else soup.find("h1") if first_h1: title_text = first_h1.get_text(strip=True) title_el = soup.new_tag("div", attrs={"class": "cover-title"}) title_el.string = title_text or "文档" footer.append(title_el) subtitle_val = opts.get("filename_text") or "" if subtitle_val: subtitle_el = soup.new_tag("div", attrs={"class": "cover-subtitle"}) subtitle_el.string = subtitle_val footer.append(subtitle_el) meta_el = soup.new_tag("div", attrs={"class": "cover-meta"}) if product_version_opt: pv = soup.new_tag("span") pv.string = f"产品版本:{product_version_opt}" meta_el.append(pv) if document_version_opt: dv = soup.new_tag("span") dv.string = f"文档版本:{document_version_opt}" meta_el.append(dv) footer.append(meta_el) cov.append(footer) if soup.body: soup.body.insert(1, cov) else: soup.insert(1, cov) if toc_flag: headings = [ el for el in (soup.find_all(["h1", "h2", "h3"]) or []) if el.get("data-origin") != "subtitle" ] if headings: ul = soup.new_tag("ul") idx = 1 for el in headings: text = el.get_text(strip=True) if not text: continue hid = el.get("id") if not hid: hid = f"sec-{idx}" el["id"] = hid idx += 1 li = soup.new_tag("li", attrs={"class": f"toc-{el.name}"}) a = soup.new_tag("a", attrs={"href": f"#{hid}", "class": "toc-text"}) a.string = text dots = soup.new_tag("span", attrs={"class": "toc-dots"}) page = soup.new_tag("span", attrs={"class": "toc-page", "data-target": f"#{hid}"}) li.append(a) li.append(dots) li.append(page) ul.append(li) nav = soup.new_tag("nav", attrs={"class": "toc"}) h = soup.new_tag("h1") h.string = "目录" nav.append(h) nav.append(ul) if soup.body: soup.body.insert(2, nav) else: soup.insert(2, nav) if soup.body: for h in soup.body.find_all(["h1", "h2", "h3"]): sib: Optional[PageElement] = h.find_next_sibling() blocks: List[Any] = [] first_table: Optional[Any] = None while sib is not None: # Skip pure whitespace nodes if getattr(sib, "name", None) is None: try: if str(sib).strip() == "": sib = sib.next_sibling continue except Exception: break # Stop if next heading encountered name = getattr(sib, "name", None) if name in ["h1", "h2", "h3"]: break # Collect explanatory blocks until first table if name == "table": first_table = sib break if name in ["p", "blockquote", "ul", "ol"]: blocks.append(sib) sib = sib.next_sibling continue # Unknown block: stop grouping to avoid wrapping unrelated content break if first_table is not None: wrap = soup.new_tag("div", attrs={"class": "table-block"}) h.insert_before(wrap) wrap.append(h.extract()) for el in blocks: wrap.append(el.extract()) wrap.append(first_table.extract()) return str(soup) def _stylesheets_for(css_name: Optional[str], css_text: Optional[str]): sheets: List[Any] = [] if CSS is None: return sheets if css_text: sheets.append(CSS(string=css_text)) if css_name: css_path = Path(__file__).resolve().parent.parent / "configs" / "styles" / f"{css_name}.css" if css_path.exists(): sheets.append(CSS(filename=str(css_path))) return sheets def _render_pdf_with_reportlab(md: str) -> bytes: """ 使用 reportlab 生成支持中文的 PDF(纯 Python,无外部依赖) """ print(f"[DEBUG] _render_pdf_with_reportlab 被调用, md 长度: {len(md)}") bio = io.BytesIO() # 创建 PDF 文档 doc = SimpleDocTemplate( bio, pagesize=A4, rightMargin=20*mm, leftMargin=20*mm, topMargin=20*mm, bottomMargin=20*mm, ) # 存放 PDF 元素的列表 story = [] styles = getSampleStyleSheet() # 尝试注册中文字体 try: # Windows 系统字体 font_path = r"C:\Windows\Fonts\msyh.ttc" # 微软雅黑 if Path(font_path).exists(): pdfmetrics.registerFont(TTFont('ChineseFont', font_path, subfontIndex=0)) chinese_font = 'ChineseFont' else: # 尝试其他常见字体路径 alternative_fonts = [ r"C:\Windows\Fonts\simhei.ttf", # 黑体 r"C:\Windows\Fonts\simsun.ttc", # 宋体 "/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc", # Linux "/System/Library/Fonts/PingFang.ttc", # macOS ] chinese_font = 'Helvetica' # 默认 for font in alternative_fonts: if Path(font).exists(): try: pdfmetrics.registerFont(TTFont('ChineseFont', font)) chinese_font = 'ChineseFont' break except: continue except Exception: chinese_font = 'Helvetica' # 创建支持中文的样式 title_style = ParagraphStyle( 'ChineseTitle', parent=styles['Heading1'], fontName=chinese_font, fontSize=18, textColor=colors.black, spaceAfter=12, spaceBefore=12, ) heading2_style = ParagraphStyle( 'ChineseHeading2', parent=styles['Heading2'], fontName=chinese_font, fontSize=14, textColor=colors.black, spaceAfter=10, spaceBefore=10, ) normal_style = ParagraphStyle( 'ChineseNormal', parent=styles['Normal'], fontName=chinese_font, fontSize=10, textColor=colors.black, spaceAfter=8, wordWrap='CJK', # 中文换行支持 ) code_style = ParagraphStyle( 'ChineseCode', parent=styles['Code'], fontName='Courier', fontSize=9, textColor=colors.black, backColor=colors.lightgrey, leftIndent=10, ) # 解析 markdown lines = md.split('\n') in_code_block = False code_lines = [] for line in lines: # 代码块处理 if line.strip().startswith('```'): if in_code_block: # 代码块结束 code_text = '\n'.join(code_lines) story.append(Paragraph(code_text.replace('<', '<').replace('>', '>'), code_style)) story.append(Spacer(1, 6*mm)) code_lines = [] in_code_block = False else: in_code_block = True continue if in_code_block: code_lines.append(line) continue # 标题处理 if line.startswith('# '): text = line[2:].strip() story.append(Paragraph(text, title_style)) elif line.startswith('## '): text = line[3:].strip() story.append(Paragraph(text, heading2_style)) elif line.startswith('### '): text = line[4:].strip() story.append(Paragraph(text, heading2_style)) # 列表处理 elif line.strip().startswith('- ') or line.strip().startswith('* '): text = line.strip()[2:] story.append(Paragraph(f'• {text}', normal_style)) elif re.match(r'^\d+\.\s', line.strip()): text = re.sub(r'^\d+\.\s', '', line.strip()) story.append(Paragraph(text, normal_style)) # 空行 elif not line.strip(): story.append(Spacer(1, 3*mm)) # 普通段落 elif line.strip(): # 处理粗体和斜体 text = line.strip() text = re.sub(r'\*\*(.+?)\*\*', r'\1', text) text = re.sub(r'\*(.+?)\*', r'\1', text) text = re.sub(r'`(.+?)`', r'\1', text) story.append(Paragraph(text, normal_style)) # 生成 PDF doc.build(story) return bio.getvalue() def _render_pdf_with_xhtml2pdf(md: str, html: str, css_name: Optional[str], css_text: Optional[str]) -> bytes: """ 使用 xhtml2pdf 渲染 PDF(纯 Python,无外部依赖) """ # 使用简单的 markdown 转 HTML,避免复杂的 normalize_html simple_html = _render_markdown_html(md) # 构建完整的 HTML 文档,确保格式正确 full_html = f''' {simple_html} ''' # 使用 BytesIO 接收 PDF 输出 bio = io.BytesIO() # 调用 pisa.CreatePDF _pisa.CreatePDF( full_html, dest=bio, encoding='utf-8' ) return bio.getvalue() def md_to_pdf_bytes_with_renderer(md: str, renderer: str = "weasyprint", css_name: Optional[str] = None, css_text: Optional[str] = None, toc: bool = False, header_text: Optional[str] = None, footer_text: Optional[str] = None, logo_url: Optional[str] = None, copyright_text: Optional[str] = None, filename_text: Optional[str] = None, cover_src: Optional[str] = None, product_name: Optional[str] = None, document_name: Optional[str] = None, product_version: Optional[str] = None, document_version: Optional[str] = None) -> bytes: html = normalize_html(md, options={ "toc": "1" if toc else "", "header_text": header_text, "footer_text": footer_text, "logo_url": logo_url, "copyright_text": copyright_text, "filename_text": filename_text, "cover_src": cover_src, "product_name": product_name, "document_name": document_name, "product_version": product_version, "document_version": document_version, }) # ========== PDF 渲染优先级 ========== # 1. reportlab (首选) - 纯 Python,支持中文,跨平台兼容 # 2. WeasyPrint - 需要 GTK 系统库,Windows 上安装复杂 # ===================================== print(f"[DEBUG] 开始 PDF 转换, _HAS_REPORTLAB={_HAS_REPORTLAB}, HTML is None={HTML is None}") # 首选:reportlab(纯 Python,支持中文,无需外部依赖) if _HAS_REPORTLAB: try: print(f"[DEBUG] 尝试使用 reportlab...") return _render_pdf_with_reportlab(md) except Exception as e: # reportlab 失败,记录错误并继续尝试下一个方案 import traceback error_detail = traceback.format_exc() print(f"[DEBUG] reportlab 失败: {str(e)}") print(f"[DEBUG] 错误详情:\n{error_detail}") # 备选:WeasyPrint(需要系统库支持) if HTML is not None: try: print(f"[DEBUG] 尝试使用 WeasyPrint...") stylesheets = _stylesheets_for(css_name, css_text) pdf_bytes = HTML(string=html).write_pdf(stylesheets=stylesheets or None) return pdf_bytes except Exception as e: # WeasyPrint 失败,记录错误 import traceback error_detail = traceback.format_exc() print(f"[DEBUG] WeasyPrint 失败: {str(e)}") print(f"[DEBUG] 错误详情:\n{error_detail}") raise RuntimeError("PDF 转换失败。reportlab 已安装但转换失败,请检查 markdown 格式")