from pathlib import Path
from typing import Optional, Tuple, Dict, List, Any
from urllib.parse import urlparse, unquote
import os
import re
import io
from bs4 import BeautifulSoup
from bs4.element import PageElement
import marko
import sys
# Make a local copy of docling importable, if present.
try:
    _DOC_BASE = Path(__file__).resolve().parents[2] / "docling"
    p = str(_DOC_BASE)
    if p not in sys.path:
        sys.path.insert(0, p)
except Exception:
    pass
try:
    from docling.document_converter import DocumentConverter
except Exception:
    # Fallback stub so the module still imports when docling is missing.
    class DocumentConverter:  # type: ignore
        def __init__(self, *args, **kwargs):
            pass

        def convert(self, source):
            raise RuntimeError("docling not available")
from docx import Document
from docx.shared import Mm, Pt
from docx.enum.section import WD_SECTION
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from urllib.request import urlopen
import json

try:
    from weasyprint import HTML, CSS  # type: ignore
except Exception:
    HTML = None
    CSS = None

try:
    from xhtml2pdf import pisa as _pisa  # type: ignore
    _HAS_XHTML2PDF: bool = True
except Exception:
    _pisa = None  # type: ignore
    _HAS_XHTML2PDF = False

# reportlab is used to generate PDFs with Chinese (CJK) font support.
try:
    from reportlab.lib.pagesizes import A4
    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
    from reportlab.lib.units import mm
    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
    from reportlab.lib import colors
    from reportlab.pdfbase import pdfmetrics
    from reportlab.pdfbase.ttfonts import TTFont
    from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_RIGHT
    _HAS_REPORTLAB: bool = True
except Exception:
    A4 = None
    _HAS_REPORTLAB = False

_mdit: Any = None
_tasklists_plugin: Any = None
_deflist_plugin: Any = None
_footnote_plugin: Any = None
_attrs_plugin: Any = None
_HAS_MD_IT: bool = False
try:
    import markdown_it as _mdit  # type: ignore
    from mdit_py_plugins.tasklists import tasklists_plugin as _tasklists_plugin  # type: ignore
    from mdit_py_plugins.deflist import deflist_plugin as _deflist_plugin  # type: ignore
    from mdit_py_plugins.footnote import footnote_plugin as _footnote_plugin  # type: ignore
    from mdit_py_plugins.attrs import attrs_plugin as _attrs_plugin  # type: ignore
    _HAS_MD_IT = True
except Exception:
    pass

converter = DocumentConverter()
LINKMAP_PATH = Path(__file__).resolve().parent.parent / "configs" / "linkmap" / "linkmap.json"
_LINKMAP: Dict[str, str] = {}

def load_linkmap() -> Dict[str, str]:
    global _LINKMAP
    try:
        if LINKMAP_PATH.exists():
            _LINKMAP = json.loads(LINKMAP_PATH.read_text("utf-8")) or {}
    except Exception:
        _LINKMAP = {}
    return _LINKMAP

def save_linkmap(mapping: Dict[str, str]) -> None:
    LINKMAP_PATH.parent.mkdir(parents=True, exist_ok=True)
    LINKMAP_PATH.write_text(json.dumps(mapping, ensure_ascii=False, indent=2), "utf-8")
    load_linkmap()  # refresh the in-memory cache after writing

def resolve_link(href: Optional[str], data_doc: Optional[str]) -> Optional[str]:
    if href:
        return href
    if not _LINKMAP:
        load_linkmap()
    if data_doc and data_doc in _LINKMAP:
        return _LINKMAP[data_doc]
    return None

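# Illustrative only: the linkmap JSON is assumed to map document keys (the
# values of `data-doc` attributes) to concrete URLs, e.g.
#
#   {"user-guide": "https://example.com/docs/user-guide.html"}
#
# With that file in place, resolve_link(None, "user-guide") returns the mapped
# URL, while an explicit href always wins:
#
#   resolve_link("https://a.example/x", "user-guide")  # -> "https://a.example/x"
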
def export_payload(doc, fmt: str) -> Tuple[str, str]:
    f = fmt.lower()
    if f == "markdown":
        return doc.export_to_markdown(), "text/markdown"
    if f == "html":
        return doc.export_to_html(), "text/html"
    if f == "json":
        return doc.export_to_json(), "application/json"
    if f == "doctags":
        return doc.export_to_doctags(), "application/json"
    raise ValueError("unsupported export")

def infer_basename(source_url: Optional[str], upload_name: Optional[str]) -> str:
    if source_url:
        path = urlparse(source_url).path
        name = os.path.basename(path) or "document"
        name = unquote(name)
        return os.path.splitext(name)[0] or "document"
    if upload_name:
        return os.path.splitext(os.path.basename(upload_name))[0] or "document"
    return "document"

def sanitize_filename(name: Optional[str]) -> str:
    if not name:
        return "document"
    name = name.strip()[:128]
    name = re.sub(r'[<>:"/\\|?*\x00-\x1F]', "_", name) or "document"
    return name

def convert_source(source: str, export: str) -> Tuple[str, str]:
    result = converter.convert(source)
    return export_payload(result.document, export)

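# A minimal usage sketch (assuming docling is installed); the URL below is a
# placeholder, not a real endpoint:
#
#   payload, mime = convert_source("https://example.com/spec.pdf", "markdown")
#   base = sanitize_filename(infer_basename("https://example.com/spec.pdf", None))
#   Path(f"{base}.md").write_text(payload, "utf-8")
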
def md_to_docx_bytes(md: str, toc: bool = False, header_text: Optional[str] = None, footer_text: Optional[str] = None, logo_url: Optional[str] = None, copyright_text: Optional[str] = None, filename_text: Optional[str] = None, cover_src: Optional[str] = None, product_name: Optional[str] = None, document_name: Optional[str] = None, product_version: Optional[str] = None, document_version: Optional[str] = None) -> bytes:
    try:
        import logging as _log
        _log.info(f"md_to_docx_bytes start toc={toc} header={bool(header_text)} footer={bool(footer_text)} logo={bool(logo_url)} cover={bool(cover_src)}")
    except Exception:
        pass
    def _add_field(paragraph, instr: str):
        # Build a Word field as three runs: fldChar begin, instrText, fldChar end.
        r1 = paragraph.add_run()
        b = OxmlElement('w:fldChar')
        b.set(qn('w:fldCharType'), 'begin')
        r1._r.append(b)
        r2 = paragraph.add_run()
        t = OxmlElement('w:instrText')
        t.set(qn('xml:space'), 'preserve')
        t.text = instr
        r2._r.append(t)
        r3 = paragraph.add_run()
        e = OxmlElement('w:fldChar')
        e.set(qn('w:fldCharType'), 'end')
        r3._r.append(e)
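    # Note: field instructions such as 'PAGE' and 'TOC \o "1-3" \h \z \u' are
    # stored as instructions, not cached results; Word computes them when the
    # fields are updated (e.g. select all, then F9), so a freshly generated
    # file shows an empty TOC until then.
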
    def _available_width(section) -> int:
        return section.page_width - section.left_margin - section.right_margin

    def _fetch_bytes(u: str) -> Optional[bytes]:
        # Fetch image bytes from an http(s) URL or a local file path.
        try:
            if u.lower().startswith('http://') or u.lower().startswith('https://'):
                with urlopen(u, timeout=10) as r:
                    return r.read()
            p = Path(u)
            if p.exists() and p.is_file():
                return p.read_bytes()
        except Exception:
            return None
        return None
    html = normalize_html(md, options={
        "toc": "1" if toc else "",
        "header_text": header_text,
        "footer_text": footer_text,
        "logo_url": logo_url,
        "copyright_text": copyright_text,
        "filename_text": filename_text,
        "cover_src": cover_src,
        "product_name": product_name,
        "document_name": document_name,
        "product_version": product_version,
        "document_version": document_version,
    })
    try:
        import logging as _log
        _log.info(f"md_to_docx_bytes normalize_html length={len(html)}")
    except Exception:
        pass
    soup = BeautifulSoup(html, "html.parser")
    doc = Document()
    sec0 = doc.sections[0]
    sec0.page_width = Mm(210)
    sec0.page_height = Mm(297)
    sec0.left_margin = Mm(15)
    sec0.right_margin = Mm(15)
    sec0.top_margin = Mm(20)
    sec0.bottom_margin = Mm(20)
    has_cover = bool(cover_src or (soup.find('section', class_='cover') is not None))
    if has_cover:
        # Full-bleed cover page: drop the margins on the first section.
        sec0.left_margin = Mm(0)
        sec0.right_margin = Mm(0)
        sec0.top_margin = Mm(0)
        sec0.bottom_margin = Mm(0)
        if cover_src:
            b = _fetch_bytes(cover_src)
            if b:
                bio = io.BytesIO(b)
                doc.add_picture(bio, width=_available_width(sec0))
        if product_name:
            p = doc.add_paragraph()
            r = p.add_run(product_name)
            r.font.size = Pt(18)
            r.bold = True
        t = document_name or None
        if not t:
            h1 = soup.body.find('h1') if soup.body else soup.find('h1')
            t = h1.get_text(strip=True) if h1 else '文档'
        p2 = doc.add_paragraph()
        r2 = p2.add_run(t or '文档')
        r2.font.size = Pt(24)
        r2.bold = True
        if filename_text:
            p3 = doc.add_paragraph()
            r3 = p3.add_run(filename_text)
            r3.font.size = Pt(13)
        meta_parts = []
        if product_version:
            meta_parts.append("产品版本:" + product_version)
        if document_version:
            meta_parts.append("文档版本:" + document_version)
        if meta_parts:
            doc.add_paragraph(" ".join(meta_parts))
        # Start the document body in a fresh section with normal margins.
        doc.add_section(WD_SECTION.NEW_PAGE)
        sec = doc.sections[-1]
        sec.page_width = Mm(210)
        sec.page_height = Mm(297)
        sec.left_margin = Mm(15)
        sec.right_margin = Mm(15)
        sec.top_margin = Mm(20)
        sec.bottom_margin = Mm(20)
    else:
        sec = sec0
    if header_text or logo_url or filename_text:
        hp = sec.header.add_paragraph()
        # "left||right" (or "left|right") splits the header into two parts.
        left = header_text or ''
        right = ''
        if '||' in left:
            parts = left.split('||', 1)
            left, right = parts[0], parts[1]
        elif '|' in left:
            parts = left.split('|', 1)
            left, right = parts[0], parts[1]
        if left.strip():
            hp.add_run(left.strip())
        if right.strip():
            rp = sec.header.add_paragraph()
            rp.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
            rp.add_run(right.strip())
        elif filename_text:
            rp = sec.header.add_paragraph()
            rp.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
            rp.add_run(filename_text)
    if footer_text or copyright_text:
        fp = sec.footer.add_paragraph()
        if footer_text:
            fp.add_run(footer_text)
        if copyright_text:
            cp = sec.footer.add_paragraph()
            cp.add_run(copyright_text)
    pn = sec.footer.add_paragraph()
    pn.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
    _add_field(pn, 'PAGE')
    if toc:
        doc.add_paragraph('目录')
        _add_field(doc.add_paragraph(), 'TOC \\o "1-3" \\h \\z \\u')
        doc.add_page_break()
    def add_inline(p, node):
        if isinstance(node, str):
            p.add_run(node)
            return
        if node.name in ['strong', 'b']:
            r = p.add_run(node.get_text())
            r.bold = True
            return
        if node.name in ['em', 'i']:
            r = p.add_run(node.get_text())
            r.italic = True
            return
        if node.name == 'code':
            r = p.add_run(node.get_text())
            r.font.name = 'Courier New'
            return
        if node.name == 'a':
            text = node.get_text()
            href = node.get('href')
            extra = node.get('data-doc')
            resolved = resolve_link(href, extra)
            if resolved:
                p.add_run(text + ' [' + resolved + ']')
            else:
                p.add_run(text)
            return
        if node.name == 'img':
            # Images are appended to the document as block-level pictures
            # rather than inline runs.
            src = node.get('src') or ''
            b = _fetch_bytes(src)
            if b:
                bio = io.BytesIO(b)
                try:
                    doc.add_picture(bio, width=_available_width(sec))
                except Exception:
                    pass
            return
        for c in getattr(node, 'children', []):
            add_inline(p, c)
    def process_block(el):
        name = getattr(el, 'name', None)
        if name is None:
            return
        cls = el.get('class') or []
        # Skip helper containers injected by normalize_html.
        if name == 'div' and 'doc-meta' in cls:
            return
        if name == 'section' and 'cover' in cls:
            return
        if name == 'nav' and 'toc' in cls:
            return
        if name == 'div':
            for child in el.children:
                process_block(child)
            return
        if name == 'h1':
            doc.add_heading(el.get_text(), level=1)
            return
        if name == 'h2' or (name == 'strong' and 'subtitle' in cls):
            doc.add_heading(el.get_text(), level=2)
            return
        if name == 'h3':
            doc.add_heading(el.get_text(), level=3)
            return
        if name == 'p':
            p = doc.add_paragraph()
            for c in el.children:
                add_inline(p, c)
            return
        if name in ['ul', 'ol']:
            # Ordered lists get the numbered style; everything else is bulleted.
            style = 'List Number' if name == 'ol' else 'List Bullet'
            for li in el.find_all('li', recursive=False):
                p = doc.add_paragraph(style=style)
                for c in li.children:
                    add_inline(p, c)
            return
        if name == 'pre':
            code = el.get_text() or ''
            p = doc.add_paragraph()
            run = p.add_run(code)
            run.font.name = 'Courier New'
            return
        if name == 'blockquote':
            p = doc.add_paragraph(el.get_text())
            p.paragraph_format.left_indent = Mm(10)
            return
        if name == 'table':
            rows = []
            thead = el.find('thead')
            tbody = el.find('tbody')
            if thead:
                hdrs = [th.get_text(strip=True) for th in thead.find_all('th')]
            else:
                hdrs = [cell.get_text(strip=True) for cell in el.find_all('tr')[0].find_all(['th', 'td'])] if el.find_all('tr') else []
            trs = tbody.find_all('tr') if tbody else el.find_all('tr')[1:]
            for tr in trs:
                tds = [td.get_text(strip=True) for td in tr.find_all('td')]
                rows.append(tds)
            tbl = doc.add_table(rows=1 + len(rows), cols=len(hdrs) or 1)
            hdr = tbl.rows[0].cells
            for k, h in enumerate(hdrs or ['']):
                hdr[k].text = h
            for r_idx, row in enumerate(rows):
                cells = tbl.rows[1 + r_idx].cells
                for c_idx in range(len(hdrs) or 1):
                    cells[c_idx].text = (row[c_idx] if c_idx < len(row) else '')
            return
        if name == 'img':
            src = el.get('src') or ''
            b = _fetch_bytes(src)
            if b:
                bio = io.BytesIO(b)
                try:
                    doc.add_picture(bio, width=_available_width(sec))
                except Exception:
                    pass
            return
    body = soup.body or soup
    for el in body.children:
        process_block(el)
    bio = io.BytesIO()
    try:
        import logging as _log
        _log.info("md_to_docx_bytes saving doc")
    except Exception:
        pass
    doc.save(bio)
    try:
        import logging as _log
        _log.info(f"md_to_docx_bytes done size={bio.tell()}")
    except Exception:
        pass
    return bio.getvalue()

def md_to_pdf_bytes(md: str) -> bytes:
    return md_to_pdf_bytes_with_renderer(md, renderer="weasyprint")

def _md_with_tables_to_html(md_text: str) -> str:
    def is_sep(s: str) -> bool:
        # A separator row looks like "| --- | :---: |": only -, : and spaces.
        s = s.strip()
        if "|" not in s:
            return False
        s = s.strip("|")
        return all(set(seg.strip()) <= set("-: ") and len(seg.strip()) >= 1 for seg in s.split("|"))

    lines = md_text.splitlines()
    out = []
    i = 0
    while i < len(lines):
        line = lines[i]
        if "|" in line and i + 1 < len(lines) and is_sep(lines[i + 1]):
            headers = [c.strip() for c in line.strip().strip("|").split("|")]
            j = i + 2
            rows = []
            while j < len(lines) and "|" in lines[j]:
                rows.append([c.strip() for c in lines[j].strip().strip("|").split("|")])
                j += 1
            tbl = ["<table>", "<thead><tr>"]
            for h in headers:
                tbl.append(f"<th>{h}</th>")
            tbl.append("</tr></thead><tbody>")
            for row in rows:
                tbl.append("<tr>")
                for idx in range(len(headers)):
                    cell = row[idx] if idx < len(row) else ""
                    tbl.append(f"<td>{cell}</td>")
                tbl.append("</tr>")
            tbl.append("</tbody></table>")
            out.append("".join(tbl))
            i = j
            continue
        out.append(line)
        i += 1
    return marko.convert("\n".join(out))

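# Example of the pipe-table fallback above (illustrative input): a block like
#
#   | Name | Value |
#   | ---- | ----- |
#   | a    | 1     |
#
# is pre-rendered to a <table> element here because marko's CommonMark core
# does not handle GFM-style tables by itself.
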
def _render_markdown_html(md_text: str) -> str:
    # Prefer markdown-it (with plugins) when available; otherwise fall back to
    # the pipe-table shim above plus marko.
    if _HAS_MD_IT and _mdit is not None:
        try:
            md = _mdit.MarkdownIt("commonmark").enable(["table", "strikethrough"])
            if _tasklists_plugin:
                md.use(_tasklists_plugin)
            if _deflist_plugin:
                md.use(_deflist_plugin)
            if _footnote_plugin:
                md.use(_footnote_plugin)
            if _attrs_plugin:
                md.use(_attrs_plugin)
            return md.render(md_text)
        except Exception:
            pass
    return _md_with_tables_to_html(md_text)

def normalize_html(md_or_html: str, options: Optional[Dict[str, Optional[str]]] = None) -> str:
    html = _render_markdown_html(md_or_html)
    soup = BeautifulSoup(html, "html.parser")
    # Promote <strong class="subtitle"> to h2 so it joins the heading logic,
    # but mark its origin so the TOC can skip it.
    for s in soup.find_all("strong", class_="subtitle"):
        s.name = "h2"
        s.attrs = {"data-origin": "subtitle"}
    for a in soup.find_all("a"):
        href_val = a.get("href")
        extra_val = a.get("data-doc")
        href = href_val if isinstance(href_val, str) else None
        extra = extra_val if isinstance(extra_val, str) else None
        resolved = resolve_link(href, extra)
        if resolved:
            a["href"] = resolved
        elif not href and extra:
            a.replace_with(a.get_text() + " [" + extra + "]")
    opts = options or {}
    header_text = opts.get("header_text") or None
    footer_text = opts.get("footer_text") or None
    logo_url = opts.get("logo_url") or None
    copyright_text = opts.get("copyright_text") or None
    cover_src = opts.get("cover_src") or None
    product_name_opt = opts.get("product_name") or None
    document_name_opt = opts.get("document_name") or None
    product_version_opt = opts.get("product_version") or None
    document_version_opt = opts.get("document_version") or None
    toc_flag = bool(opts.get("toc"))
    meta = soup.new_tag("div", attrs={"class": "doc-meta"})
    if header_text:
        ht = soup.new_tag("div", attrs={"class": "doc-header-text"})
        text = header_text
        left = text
        right = ""
        if "||" in text:
            parts = text.split("||", 1)
            left, right = parts[0], parts[1]
        elif "|" in text:
            parts = text.split("|", 1)
            left, right = parts[0], parts[1]
        if logo_url:
            img = soup.new_tag("img", attrs={"class": "logo-inline", "src": logo_url})
            ht.append(img)
        hl = soup.new_tag("span", attrs={"class": "doc-header-left"})
        hl.string = left
        ht.append(hl)
        if right.strip():
            hr = soup.new_tag("span", attrs={"class": "doc-header-right"})
            hr.string = right
            ht.append(hr)
        meta.append(ht)
    else:
        # No explicit header text: fall back to the first h1 (or "文档") on
        # the left and the filename on the right.
        first_h1 = soup.body.find("h1") if soup.body else soup.find("h1")
        left = (first_h1.get_text(strip=True) if first_h1 else "文档")
        right = opts.get("filename_text") or ""
        ht = soup.new_tag("div", attrs={"class": "doc-header-text"})
        if logo_url:
            img = soup.new_tag("img", attrs={"class": "logo-inline", "src": logo_url})
            ht.append(img)
        hl = soup.new_tag("span", attrs={"class": "doc-header-left"})
        hl.string = left
        ht.append(hl)
        if right:
            hr = soup.new_tag("span", attrs={"class": "doc-header-right"})
            hr.string = right
            ht.append(hr)
        meta.append(ht)
    if footer_text:
        ft = soup.new_tag("div", attrs={"class": "doc-footer-text"})
        ft.string = footer_text
        meta.append(ft)
    page_header_val = (header_text or (document_name_opt or None))
    if not page_header_val:
        first_h1_for_header = soup.body.find("h1") if soup.body else soup.find("h1")
        page_header_val = (first_h1_for_header.get_text(strip=True) if first_h1_for_header else "文档")
    page_footer_val = (footer_text or "FunMD")
    # Running page header/footer elements consumed by the @page CSS below.
    ph = soup.new_tag("div", attrs={"class": "doc-page-header"})
    if logo_url:
        logo_inline = soup.new_tag("img", attrs={"src": logo_url, "class": "doc-page-header-logo"})
        ph.append(logo_inline)
    ht_inline = soup.new_tag("span", attrs={"class": "doc-page-header-text"})
    ht_inline.string = page_header_val
    ph.append(ht_inline)
    meta.append(ph)
    pf = soup.new_tag("div", attrs={"class": "doc-page-footer"})
    pf.string = page_footer_val
    meta.append(pf)
    if copyright_text:
        cp = soup.new_tag("div", attrs={"class": "doc-copyright"})
        cp.string = copyright_text
        meta.append(cp)
    # The brand logo is rendered inline within the header; no separate
    # top-left element.
    if soup.body:
        soup.body.insert(0, meta)
    else:
        soup.insert(0, meta)
    if not soup.head:
        head = soup.new_tag("head")
        soup.insert(0, head)
    else:
        head = soup.head
    style_run = soup.new_tag("style")
    style_run.string = "@page{margin:20mm}@page{\n @top-center{content: element(page-header)}\n @bottom-center{content: element(page-footer)}\n}\n.doc-page-header{position: running(page-header); font-size:10pt; color:#666; display:block; text-align:center; width:100%}\n.doc-page-header::after{content:''; display:block; width:80%; border-bottom:1px solid #d9d9d9; margin:4px auto 0}\n.doc-page-header-logo{height:20px; vertical-align:middle; margin-right:4px}\n.doc-page-header-text{vertical-align:middle}\n.doc-page-footer{position: running(page-footer); font-size:10pt; color:#666}\n.doc-page-footer::before{content:''; display:block; width:80%; border-top:1px solid #d9d9d9; margin:0 auto 4px}"
    head.append(style_run)
    # Fallback inline styles for the cover, to ensure visibility even if
    # external CSS isn't loaded.
    if cover_src or product_name_opt or document_name_opt or product_version_opt or document_version_opt:
        if not soup.head:
            head = soup.new_tag("head")
            soup.insert(0, head)
        else:
            head = soup.head
        style = soup.new_tag("style")
        style.string = "@page:first{margin:0} html,body{margin:0;padding:0}.cover{position:relative;width:210mm;height:297mm;overflow:hidden;page-break-after:always}.cover .cover-bg{position:absolute;left:0;top:0;right:0;bottom:0;width:100%;height:100%;object-fit:cover;display:block}.cover .cover-brand{position:absolute;top:20mm;left:20mm;font-size:18pt;font-weight:700;color:#1d4ed8}.cover .cover-footer{position:absolute;left:0;right:0;bottom:0;background:#1d4ed8;color:#fff;padding:12mm 20mm}.cover .cover-title{font-size:24pt;font-weight:700;margin:0}.cover .cover-subtitle{font-size:13pt;margin-top:4pt}.cover .cover-meta{margin-top:8pt;font-size:11pt;display:flex;gap:20mm}"
        head.append(style)
    if cover_src or product_name_opt or document_name_opt or product_version_opt or document_version_opt:
        cov = soup.new_tag("section", attrs={"class": "cover"})
        if cover_src:
            bg = soup.new_tag("img", attrs={"class": "cover-bg", "src": cover_src})
            cov.append(bg)
        if product_name_opt:
            brand_el = soup.new_tag("div", attrs={"class": "cover-brand"})
            brand_el.string = product_name_opt
            cov.append(brand_el)
        footer = soup.new_tag("div", attrs={"class": "cover-footer"})
        title_text = document_name_opt or None
        if not title_text:
            first_h1 = soup.body.find("h1") if soup.body else soup.find("h1")
            if first_h1:
                title_text = first_h1.get_text(strip=True)
        title_el = soup.new_tag("div", attrs={"class": "cover-title"})
        title_el.string = title_text or "文档"
        footer.append(title_el)
        subtitle_val = opts.get("filename_text") or ""
        if subtitle_val:
            subtitle_el = soup.new_tag("div", attrs={"class": "cover-subtitle"})
            subtitle_el.string = subtitle_val
            footer.append(subtitle_el)
        meta_el = soup.new_tag("div", attrs={"class": "cover-meta"})
        if product_version_opt:
            pv = soup.new_tag("span")
            pv.string = f"产品版本:{product_version_opt}"
            meta_el.append(pv)
        if document_version_opt:
            dv = soup.new_tag("span")
            dv.string = f"文档版本:{document_version_opt}"
            meta_el.append(dv)
        footer.append(meta_el)
        cov.append(footer)
        # Insert right after the doc-meta block.
        if soup.body:
            soup.body.insert(1, cov)
        else:
            soup.insert(1, cov)
    if toc_flag:
        headings = [
            el for el in (soup.find_all(["h1", "h2", "h3"]) or [])
            if el.get("data-origin") != "subtitle"
        ]
        if headings:
            ul = soup.new_tag("ul")
            idx = 1
            for el in headings:
                text = el.get_text(strip=True)
                if not text:
                    continue
                hid = el.get("id")
                if not hid:
                    hid = f"sec-{idx}"
                    el["id"] = hid
                    idx += 1
                li = soup.new_tag("li", attrs={"class": f"toc-{el.name}"})
                a = soup.new_tag("a", attrs={"href": f"#{hid}", "class": "toc-text"})
                a.string = text
                dots = soup.new_tag("span", attrs={"class": "toc-dots"})
                page = soup.new_tag("span", attrs={"class": "toc-page", "data-target": f"#{hid}"})
                li.append(a)
                li.append(dots)
                li.append(page)
                ul.append(li)
            nav = soup.new_tag("nav", attrs={"class": "toc"})
            h = soup.new_tag("h1")
            h.string = "目录"
            nav.append(h)
            nav.append(ul)
            if soup.body:
                soup.body.insert(2, nav)
            else:
                soup.insert(2, nav)
    if soup.body:
        # Group each heading with its lead-in blocks and the first following
        # table, so the whole unit can avoid page breaks via CSS.
        for h in soup.body.find_all(["h1", "h2", "h3"]):
            sib: Optional[PageElement] = h.find_next_sibling()
            blocks: List[Any] = []
            first_table: Optional[Any] = None
            while sib is not None:
                # Skip pure whitespace nodes
                if getattr(sib, "name", None) is None:
                    try:
                        if str(sib).strip() == "":
                            sib = sib.next_sibling
                            continue
                    except Exception:
                        break
                # Stop if the next heading is encountered
                name = getattr(sib, "name", None)
                if name in ["h1", "h2", "h3"]:
                    break
                # Collect explanatory blocks until the first table
                if name == "table":
                    first_table = sib
                    break
                if name in ["p", "blockquote", "ul", "ol"]:
                    blocks.append(sib)
                    sib = sib.next_sibling
                    continue
                # Unknown block: stop grouping to avoid wrapping unrelated content
                break
            if first_table is not None:
                wrap = soup.new_tag("div", attrs={"class": "table-block"})
                h.insert_before(wrap)
                wrap.append(h.extract())
                for el in blocks:
                    wrap.append(el.extract())
                wrap.append(first_table.extract())
    return str(soup)

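# A minimal usage sketch of normalize_html; the option values here are
# illustrative placeholders:
#
#   html = normalize_html(
#       "# Release Notes\n\nHello **world**",
#       options={"toc": "1", "header_text": "Acme||v1.0", "footer_text": "Acme Corp"},
#   )
#   # `html` now contains the doc-meta block, a generated <nav class="toc">,
#   # and the running header/footer elements targeted by the print CSS.
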
def _stylesheets_for(css_name: Optional[str], css_text: Optional[str]):
    sheets: List[Any] = []
    if CSS is None:
        return sheets
    if css_text:
        sheets.append(CSS(string=css_text))
    if css_name:
        css_path = Path(__file__).resolve().parent.parent / "configs" / "styles" / f"{css_name}.css"
        if css_path.exists():
            sheets.append(CSS(filename=str(css_path)))
    return sheets

def _render_pdf_with_reportlab(md: str) -> bytes:
    """
    Generate a PDF with Chinese (CJK) support using reportlab (pure Python,
    no external system dependencies).
    """
    print(f"[DEBUG] _render_pdf_with_reportlab called, md length: {len(md)}")
    bio = io.BytesIO()

    # Create the PDF document
    doc = SimpleDocTemplate(
        bio,
        pagesize=A4,
        rightMargin=20*mm,
        leftMargin=20*mm,
        topMargin=20*mm,
        bottomMargin=20*mm,
    )

    # Flowables that make up the PDF
    story = []
    styles = getSampleStyleSheet()

    # Try to register a Chinese font
    try:
        # Windows system font
        font_path = r"C:\Windows\Fonts\msyh.ttc"  # Microsoft YaHei
        if Path(font_path).exists():
            pdfmetrics.registerFont(TTFont('ChineseFont', font_path, subfontIndex=0))
            chinese_font = 'ChineseFont'
        else:
            # Try other common font locations
            alternative_fonts = [
                r"C:\Windows\Fonts\simhei.ttf",  # SimHei
                r"C:\Windows\Fonts\simsun.ttc",  # SimSun
                "/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc",  # Linux
                "/System/Library/Fonts/PingFang.ttc",  # macOS
            ]
            chinese_font = 'Helvetica'  # default
            for font in alternative_fonts:
                if Path(font).exists():
                    try:
                        pdfmetrics.registerFont(TTFont('ChineseFont', font))
                        chinese_font = 'ChineseFont'
                        break
                    except Exception:
                        continue
    except Exception:
        chinese_font = 'Helvetica'

    # Styles that support Chinese text
    title_style = ParagraphStyle(
        'ChineseTitle',
        parent=styles['Heading1'],
        fontName=chinese_font,
        fontSize=18,
        textColor=colors.black,
        spaceAfter=12,
        spaceBefore=12,
    )

    heading2_style = ParagraphStyle(
        'ChineseHeading2',
        parent=styles['Heading2'],
        fontName=chinese_font,
        fontSize=14,
        textColor=colors.black,
        spaceAfter=10,
        spaceBefore=10,
    )

    normal_style = ParagraphStyle(
        'ChineseNormal',
        parent=styles['Normal'],
        fontName=chinese_font,
        fontSize=10,
        textColor=colors.black,
        spaceAfter=8,
        wordWrap='CJK',  # CJK line-wrapping support
    )

    code_style = ParagraphStyle(
        'ChineseCode',
        parent=styles['Code'],
        fontName='Courier',
        fontSize=9,
        textColor=colors.black,
        backColor=colors.lightgrey,
        leftIndent=10,
    )

    # Parse the markdown line by line
    lines = md.split('\n')
    in_code_block = False
    code_lines = []

    for line in lines:
        # Fenced code blocks
        if line.strip().startswith('```'):
            if in_code_block:
                # End of a code block: escape HTML-sensitive characters for
                # reportlab's XML-like Paragraph markup
                code_text = '\n'.join(code_lines)
                code_text = code_text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
                story.append(Paragraph(code_text, code_style))
                story.append(Spacer(1, 6*mm))
                code_lines = []
                in_code_block = False
            else:
                in_code_block = True
            continue

        if in_code_block:
            code_lines.append(line)
            continue

        # Headings
        if line.startswith('# '):
            text = line[2:].strip()
            story.append(Paragraph(text, title_style))
        elif line.startswith('## '):
            text = line[3:].strip()
            story.append(Paragraph(text, heading2_style))
        elif line.startswith('### '):
            text = line[4:].strip()
            story.append(Paragraph(text, heading2_style))
        # Lists
        elif line.strip().startswith('- ') or line.strip().startswith('* '):
            text = line.strip()[2:]
            story.append(Paragraph(f'• {text}', normal_style))
        elif re.match(r'^\d+\.\s', line.strip()):
            text = re.sub(r'^\d+\.\s', '', line.strip())
            story.append(Paragraph(text, normal_style))
        # Blank lines
        elif not line.strip():
            story.append(Spacer(1, 3*mm))
        # Plain paragraphs
        elif line.strip():
            # Convert bold, italic and inline code to reportlab markup
            text = line.strip()
            text = re.sub(r'\*\*(.+?)\*\*', r'<b>\1</b>', text)
            text = re.sub(r'\*(.+?)\*', r'<i>\1</i>', text)
            text = re.sub(r'`(.+?)`', r'<font face="Courier">\1</font>', text)
            story.append(Paragraph(text, normal_style))

    # Build the PDF
    doc.build(story)
    return bio.getvalue()

def _render_pdf_with_xhtml2pdf(md: str, html: str, css_name: Optional[str], css_text: Optional[str]) -> bytes:
    """
    Render a PDF with xhtml2pdf (pure Python, no external system dependencies).
    """
    if not _HAS_XHTML2PDF or _pisa is None:
        raise RuntimeError("xhtml2pdf not available")
    # Use the simple markdown-to-HTML conversion and skip the heavier
    # normalize_html pipeline.
    simple_html = _render_markdown_html(md)

    # Build a complete, well-formed HTML document
    full_html = f'''<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>
@page {{
    margin: 20mm;
}}
body {{
    font-family: "Microsoft YaHei", "SimSun", Arial, sans-serif;
    font-size: 12pt;
    line-height: 1.6;
}}
h1, h2, h3, h4, h5, h6 {{
    color: #333;
    margin-top: 1em;
    margin-bottom: 0.5em;
}}
h1 {{ font-size: 24pt; font-weight: bold; }}
h2 {{ font-size: 20pt; font-weight: bold; }}
h3 {{ font-size: 16pt; font-weight: bold; }}
p {{ margin-bottom: 1em; }}
ul, ol {{ margin-left: 2em; }}
table {{
    border-collapse: collapse;
    width: 100%;
    margin: 1em 0;
}}
th, td {{
    border: 1px solid #ddd;
    padding: 8px;
}}
th {{
    background-color: #f2f2f2;
}}
a {{ color: #1d4ed8; text-decoration: underline; }}
</style>
</head>
<body>
{simple_html}
</body>
</html>'''

    # Receive the PDF output in a BytesIO buffer
    bio = io.BytesIO()

    # Call pisa.CreatePDF
    _pisa.CreatePDF(
        full_html,
        dest=bio,
        encoding='utf-8'
    )

    return bio.getvalue()

def md_to_pdf_bytes_with_renderer(md: str, renderer: str = "weasyprint", css_name: Optional[str] = None, css_text: Optional[str] = None, toc: bool = False, header_text: Optional[str] = None, footer_text: Optional[str] = None, logo_url: Optional[str] = None, copyright_text: Optional[str] = None, filename_text: Optional[str] = None, cover_src: Optional[str] = None, product_name: Optional[str] = None, document_name: Optional[str] = None, product_version: Optional[str] = None, document_version: Optional[str] = None) -> bytes:
    html = normalize_html(md, options={
        "toc": "1" if toc else "",
        "header_text": header_text,
        "footer_text": footer_text,
        "logo_url": logo_url,
        "copyright_text": copyright_text,
        "filename_text": filename_text,
        "cover_src": cover_src,
        "product_name": product_name,
        "document_name": document_name,
        "product_version": product_version,
        "document_version": document_version,
    })

    # ========== PDF renderer priority ==========
    # 1. reportlab (preferred): pure Python, CJK support, portable
    # 2. WeasyPrint: needs GTK system libraries, which are awkward on Windows
    # NOTE: the `renderer` argument is currently advisory; the order above is
    # hard-coded.
    # ===========================================

    print(f"[DEBUG] starting PDF conversion, _HAS_REPORTLAB={_HAS_REPORTLAB}, HTML is None={HTML is None}")

    # Preferred: reportlab (pure Python, CJK support, no external dependencies)
    if _HAS_REPORTLAB:
        try:
            print("[DEBUG] trying reportlab...")
            return _render_pdf_with_reportlab(md)
        except Exception as e:
            # reportlab failed: log the error and fall through to the next renderer
            import traceback
            error_detail = traceback.format_exc()
            print(f"[DEBUG] reportlab failed: {str(e)}")
            print(f"[DEBUG] details:\n{error_detail}")

    # Fallback: WeasyPrint (needs system library support)
    if HTML is not None:
        try:
            print("[DEBUG] trying WeasyPrint...")
            stylesheets = _stylesheets_for(css_name, css_text)
            pdf_bytes = HTML(string=html).write_pdf(stylesheets=stylesheets or None)
            return pdf_bytes
        except Exception as e:
            # WeasyPrint failed: log the error
            import traceback
            error_detail = traceback.format_exc()
            print(f"[DEBUG] WeasyPrint failed: {str(e)}")
            print(f"[DEBUG] details:\n{error_detail}")

    raise RuntimeError("PDF conversion failed: no available renderer (reportlab/WeasyPrint) produced output; check the markdown input and the renderer installs")
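

# Minimal ad-hoc smoke test; the output file names are illustrative
# placeholders. Requires python-docx; the PDF step additionally needs
# reportlab or WeasyPrint to be installed.
if __name__ == "__main__":
    _sample = "# Title\n\nSome **bold** text.\n\n| A | B |\n| - | - |\n| 1 | 2 |\n"
    Path("out.docx").write_bytes(md_to_docx_bytes(_sample, toc=True))
    Path("out.pdf").write_bytes(md_to_pdf_bytes(_sample))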