Import project files

2026-01-07 17:18:26 +08:00
parent 7d9fff2c34
commit 0b07e63b76
66 changed files with 11497 additions and 0 deletions


@@ -0,0 +1 @@


@@ -0,0 +1,709 @@
from pathlib import Path
from typing import Optional, Tuple, Dict, List, Any
from urllib.parse import urlparse, unquote
import os
import re
import io
from bs4 import BeautifulSoup
from bs4.element import PageElement
import marko
import sys
try:
_DOC_BASE = Path(__file__).resolve().parents[2] / "docling"
p = str(_DOC_BASE)
if p not in sys.path:
sys.path.insert(0, p)
except Exception:
pass
try:
from docling.document_converter import DocumentConverter
except Exception:
class DocumentConverter: # type: ignore
def __init__(self, *args, **kwargs):
pass
def convert(self, source):
raise RuntimeError("docling not available")
from docx import Document
from docx.shared import Mm, Pt
from docx.enum.section import WD_SECTION
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from urllib.request import urlopen
import json
try:
from weasyprint import HTML, CSS # type: ignore
except Exception:
HTML = None
CSS = None
_mdit: Any = None
_tasklists_plugin: Any = None
_deflist_plugin: Any = None
_footnote_plugin: Any = None
_attrs_plugin: Any = None
_HAS_MD_IT: bool = False
try:
import markdown_it as _mdit # type: ignore
from mdit_py_plugins.tasklists import tasklists_plugin as _tasklists_plugin # type: ignore
from mdit_py_plugins.deflist import deflist_plugin as _deflist_plugin # type: ignore
from mdit_py_plugins.footnote import footnote_plugin as _footnote_plugin # type: ignore
from mdit_py_plugins.attrs import attrs_plugin as _attrs_plugin # type: ignore
_HAS_MD_IT = True
except Exception:
pass
converter = DocumentConverter()
LINKMAP_PATH = Path(__file__).resolve().parent.parent / "configs" / "linkmap" / "linkmap.json"
_LINKMAP: Dict[str, str] = {}
def load_linkmap() -> Dict[str, str]:
global _LINKMAP
try:
if LINKMAP_PATH.exists():
_LINKMAP = json.loads(LINKMAP_PATH.read_text("utf-8")) or {}
except Exception:
_LINKMAP = {}
return _LINKMAP
def save_linkmap(mapping: Dict[str, str]) -> None:
LINKMAP_PATH.parent.mkdir(parents=True, exist_ok=True)
LINKMAP_PATH.write_text(json.dumps(mapping, ensure_ascii=False, indent=2), "utf-8")
load_linkmap()
def resolve_link(href: Optional[str], data_doc: Optional[str]) -> Optional[str]:
if href:
return href
if not _LINKMAP:
load_linkmap()
if data_doc and data_doc in _LINKMAP:
return _LINKMAP[data_doc]
return None
def export_payload(doc, fmt: str) -> Tuple[str, str]:
f = fmt.lower()
if f == "markdown":
return doc.export_to_markdown(), "text/markdown"
if f == "html":
return doc.export_to_html(), "text/html"
if f == "json":
return doc.export_to_json(), "application/json"
if f == "doctags":
return doc.export_to_doctags(), "application/json"
raise ValueError("unsupported export")
def infer_basename(source_url: Optional[str], upload_name: Optional[str]) -> str:
if source_url:
path = urlparse(source_url).path
name = os.path.basename(path) or "document"
name = unquote(name)
return os.path.splitext(name)[0] or "document"
if upload_name:
name = os.path.splitext(os.path.basename(upload_name))[0] or "document"
return name
return "document"
def sanitize_filename(name: Optional[str]) -> str:
if not name:
return "document"
name = name.strip()[:128]
name = re.sub(r'[<>:"/\\|?*\x00-\x1F]', "_", name) or "document"
return name
def convert_source(source: str, export: str) -> Tuple[str, str]:
result = converter.convert(source)
return export_payload(result.document, export)
def md_to_docx_bytes(md: str, toc: bool = False, header_text: Optional[str] = None, footer_text: Optional[str] = None, logo_url: Optional[str] = None, copyright_text: Optional[str] = None, filename_text: Optional[str] = None, cover_src: Optional[str] = None, product_name: Optional[str] = None, document_name: Optional[str] = None, product_version: Optional[str] = None, document_version: Optional[str] = None) -> bytes:
try:
import logging as _log
_log.info(f"md_to_docx_bytes start toc={toc} header={bool(header_text)} footer={bool(footer_text)} logo={bool(logo_url)} cover={bool(cover_src)}")
except Exception:
pass
def _add_field(paragraph, instr: str):
r1 = paragraph.add_run()
b = OxmlElement('w:fldChar')
b.set(qn('w:fldCharType'), 'begin')
r1._r.append(b)
r2 = paragraph.add_run()
t = OxmlElement('w:instrText')
t.set(qn('xml:space'), 'preserve')
t.text = instr
r2._r.append(t)
r3 = paragraph.add_run()
e = OxmlElement('w:fldChar')
e.set(qn('w:fldCharType'), 'end')
r3._r.append(e)
def _available_width(section) -> int:
return section.page_width - section.left_margin - section.right_margin
def _fetch_bytes(u: str) -> Optional[bytes]:
try:
if u.lower().startswith('http://') or u.lower().startswith('https://'):
with urlopen(u, timeout=10) as r:
return r.read()
p = Path(u)
if p.exists() and p.is_file():
return p.read_bytes()
except Exception:
return None
return None
html = normalize_html(md, options={
"toc": "1" if toc else "",
"header_text": header_text,
"footer_text": footer_text,
"logo_url": logo_url,
"copyright_text": copyright_text,
"filename_text": filename_text,
"cover_src": cover_src,
"product_name": product_name,
"document_name": document_name,
"product_version": product_version,
"document_version": document_version,
})
try:
import logging as _log
_log.info(f"md_to_docx_bytes normalize_html length={len(html)}")
except Exception:
pass
soup = BeautifulSoup(html, "html.parser")
doc = Document()
sec0 = doc.sections[0]
sec0.page_width = Mm(210)
sec0.page_height = Mm(297)
sec0.left_margin = Mm(15)
sec0.right_margin = Mm(15)
sec0.top_margin = Mm(20)
sec0.bottom_margin = Mm(20)
has_cover = bool(cover_src or (soup.find('section', class_='cover') is not None))
if has_cover:
sec0.left_margin = Mm(0)
sec0.right_margin = Mm(0)
sec0.top_margin = Mm(0)
sec0.bottom_margin = Mm(0)
if cover_src:
b = _fetch_bytes(cover_src)
if b:
bio = io.BytesIO(b)
doc.add_picture(bio, width=_available_width(sec0))
if product_name:
p = doc.add_paragraph()
r = p.add_run(product_name)
r.font.size = Pt(18)
r.bold = True
t = document_name or None
if not t:
h1 = soup.body.find('h1') if soup.body else soup.find('h1')
t = h1.get_text(strip=True) if h1 else '文档'
p2 = doc.add_paragraph()
r2 = p2.add_run(t or '文档')
r2.font.size = Pt(24)
r2.bold = True
if filename_text:
p3 = doc.add_paragraph()
r3 = p3.add_run(filename_text)
r3.font.size = Pt(13)
meta_parts = []
if product_version:
meta_parts.append("产品版本:" + product_version)
if document_version:
meta_parts.append("文档版本:" + document_version)
if meta_parts:
pm = doc.add_paragraph(" ".join(meta_parts))
doc.add_section(WD_SECTION.NEW_PAGE)
sec = doc.sections[-1]
sec.page_width = Mm(210)
sec.page_height = Mm(297)
sec.left_margin = Mm(15)
sec.right_margin = Mm(15)
sec.top_margin = Mm(20)
sec.bottom_margin = Mm(20)
else:
sec = sec0
if header_text or logo_url or filename_text:
hp = sec.header.add_paragraph()
left = header_text or ''
right = ''
if '||' in left:
parts = left.split('||', 1)
left, right = parts[0], parts[1]
elif '|' in left:
parts = left.split('|', 1)
left, right = parts[0], parts[1]
if left.strip():
hp.add_run(left.strip())
if right.strip():
rp = sec.header.add_paragraph()
rp.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
rp.add_run(right.strip())
elif filename_text:
rp = sec.header.add_paragraph()
rp.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
rp.add_run(filename_text)
if footer_text or copyright_text:
fp = sec.footer.add_paragraph()
if footer_text:
fp.add_run(footer_text)
if copyright_text:
cp = sec.footer.add_paragraph()
cp.add_run(copyright_text)
pn = sec.footer.add_paragraph()
pn.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
_add_field(pn, 'PAGE')
if toc:
doc.add_paragraph('目录')
_add_field(doc.add_paragraph(), 'TOC \\o "1-3" \\h \\z \\u')
doc.add_page_break()
def add_inline(p, node):
if isinstance(node, str):
p.add_run(node)
return
if node.name in ['strong', 'b']:
r = p.add_run(node.get_text())
r.bold = True
return
if node.name in ['em', 'i']:
r = p.add_run(node.get_text())
r.italic = True
return
if node.name == 'code':
r = p.add_run(node.get_text())
r.font.name = 'Courier New'
return
if node.name == 'a':
text = node.get_text()
href = node.get('href')
extra = node.get('data-doc')
resolved = resolve_link(href, extra)
if resolved:
p.add_run(text + ' [' + resolved + ']')
else:
p.add_run(text)
return
if node.name == 'img':
src = node.get('src') or ''
b = _fetch_bytes(src)
if b:
bio = io.BytesIO(b)
try:
doc.add_picture(bio, width=_available_width(sec))
except Exception:
pass
return
for c in getattr(node, 'children', []):
add_inline(p, c)
def process_block(el):
name = getattr(el, 'name', None)
if name is None:
return
cls = el.get('class') or []
if name == 'div' and 'doc-meta' in cls:
return
if name == 'section' and 'cover' in cls:
return
if name == 'nav' and 'toc' in cls:
return
if name == 'div':
for child in el.children:
process_block(child)
return
if name == 'h1':
doc.add_heading(el.get_text(), level=1)
return
if name == 'h2' or (name == 'strong' and 'subtitle' in cls):
doc.add_heading(el.get_text(), level=2)
return
if name == 'h3':
doc.add_heading(el.get_text(), level=3)
return
if name == 'p':
p = doc.add_paragraph()
for c in el.children:
add_inline(p, c)
return
if name in ['ul', 'ol']:
for li in el.find_all('li', recursive=False):
p = doc.add_paragraph(style='List Number' if name == 'ol' else 'List Bullet')
for c in li.children:
add_inline(p, c)
return
if name == 'pre':
code = el.get_text() or ''
p = doc.add_paragraph()
run = p.add_run(code)
run.font.name = 'Courier New'
return
if name == 'blockquote':
p = doc.add_paragraph(el.get_text())
p.paragraph_format.left_indent = Mm(10)
return
if name == 'table':
rows = []
thead = el.find('thead')
tbody = el.find('tbody')
if thead:
hdrs = [th.get_text(strip=True) for th in thead.find_all('th')]
else:
trs_all = el.find_all('tr')
hdrs = [cell.get_text(strip=True) for cell in trs_all[0].find_all(['th', 'td'])] if trs_all else []
trs = tbody.find_all('tr') if tbody else el.find_all('tr')[1:]
for tr in trs:
tds = [td.get_text(strip=True) for td in tr.find_all('td')]
rows.append(tds)
tbl = doc.add_table(rows=1 + len(rows), cols=len(hdrs) or 1)
hdr = tbl.rows[0].cells
for k, h in enumerate(hdrs or ['']):
hdr[k].text = h
for r_idx, row in enumerate(rows):
cells = tbl.rows[1 + r_idx].cells
for c_idx in range(len(hdrs) or 1):
cells[c_idx].text = (row[c_idx] if c_idx < len(row) else '')
return
if name == 'img':
src = el.get('src') or ''
b = _fetch_bytes(src)
if b:
bio = io.BytesIO(b)
try:
doc.add_picture(bio, width=_available_width(sec))
except Exception:
pass
return
body = soup.body or soup
for el in body.children:
process_block(el)
bio = io.BytesIO()
try:
import logging as _log
_log.info("md_to_docx_bytes saving doc")
except Exception:
pass
doc.save(bio)
try:
import logging as _log
_log.info(f"md_to_docx_bytes done size={bio.tell()}")
except Exception:
pass
return bio.getvalue()
def md_to_pdf_bytes(md: str) -> bytes:
return md_to_pdf_bytes_with_renderer(md, renderer="weasyprint")
def _md_with_tables_to_html(md_text: str) -> str:
lines = md_text.splitlines()
out = []
i = 0
while i < len(lines):
line = lines[i]
def is_sep(s: str) -> bool:
s = s.strip()
if "|" not in s:
return False
s = s.strip("|")
return all(set(seg.strip()) <= set("-: ") and len(seg.strip()) >= 1 for seg in s.split("|"))
if "|" in line and i + 1 < len(lines) and is_sep(lines[i + 1]):
headers = [c.strip() for c in line.strip().strip("|").split("|")]
j = i + 2
rows = []
while j < len(lines) and "|" in lines[j]:
rows.append([c.strip() for c in lines[j].strip().strip("|").split("|")])
j += 1
tbl = ["<table>", "<thead><tr>"]
for h in headers:
tbl.append(f"<th>{h}</th>")
tbl.append("</tr></thead><tbody>")
for row in rows:
tbl.append("<tr>")
for idx in range(len(headers)):
cell = row[idx] if idx < len(row) else ""
tbl.append(f"<td>{cell}</td>")
tbl.append("</tr>")
tbl.append("</tbody></table>")
out.append("".join(tbl))
i = j
continue
out.append(line)
i += 1
return marko.convert("\n".join(out))
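# Illustrative usage sketch for the pipe-table fallback above; the sample text is
# made up and the helper below is not wired into any code path.
def _example_md_with_tables() -> str:
    sample = "\n".join([
        "| Name | Value |",
        "| --- | --- |",
        "| a | 1 |",
        "| b | 2 |",
    ])
    # The pipe rows become a raw <table>; any remaining lines go through marko.
    return _md_with_tables_to_html(sample)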
def _render_markdown_html(md_text: str) -> str:
if _HAS_MD_IT and _mdit is not None:
try:
md = _mdit.MarkdownIt("commonmark").enable(["table", "strikethrough"])
if _tasklists_plugin:
md.use(_tasklists_plugin)
if _deflist_plugin:
md.use(_deflist_plugin)
if _footnote_plugin:
md.use(_footnote_plugin)
if _attrs_plugin:
md.use(_attrs_plugin)
return md.render(md_text)
except Exception:
pass
return _md_with_tables_to_html(md_text)
def normalize_html(md_or_html: str, options: Optional[Dict[str, Optional[str]]] = None) -> str:
html = _render_markdown_html(md_or_html)
soup = BeautifulSoup(html, "html.parser")
for s in soup.find_all("strong", class_="subtitle"):
s.name = "h2"
s.attrs = {"data-origin": "subtitle"}
for a in soup.find_all("a"):
href_val = a.get("href")
extra_val = a.get("data-doc")
href = href_val if isinstance(href_val, str) else None
extra = extra_val if isinstance(extra_val, str) else None
resolved = resolve_link(href, extra)
if resolved:
a["href"] = resolved
elif not href and extra:
a.replace_with(a.get_text() + " [" + extra + "]")
opts = options or {}
header_text = opts.get("header_text") or None
footer_text = opts.get("footer_text") or None
logo_url = opts.get("logo_url") or None
copyright_text = opts.get("copyright_text") or None
cover_src = opts.get("cover_src") or None
product_name_opt = opts.get("product_name") or None
document_name_opt = opts.get("document_name") or None
product_version_opt = opts.get("product_version") or None
document_version_opt = opts.get("document_version") or None
toc_flag = bool(opts.get("toc"))
meta = soup.new_tag("div", attrs={"class": "doc-meta"})
if header_text:
ht = soup.new_tag("div", attrs={"class": "doc-header-text"})
text = header_text
left = text
right = ""
if "||" in text:
parts = text.split("||", 1)
left, right = parts[0], parts[1]
elif "|" in text:
parts = text.split("|", 1)
left, right = parts[0], parts[1]
if logo_url:
img = soup.new_tag("img", attrs={"class": "logo-inline", "src": logo_url})
ht.append(img)
hl = soup.new_tag("span", attrs={"class": "doc-header-left"})
hl.string = left
ht.append(hl)
if right.strip():
hr = soup.new_tag("span", attrs={"class": "doc-header-right"})
hr.string = right
ht.append(hr)
meta.append(ht)
else:
first_h1 = None
if soup.body:
first_h1 = soup.body.find("h1")
else:
first_h1 = soup.find("h1")
left = (first_h1.get_text(strip=True) if first_h1 else "文档")
right = opts.get("filename_text") or ""
ht = soup.new_tag("div", attrs={"class": "doc-header-text"})
if logo_url:
img = soup.new_tag("img", attrs={"class": "logo-inline", "src": logo_url})
ht.append(img)
hl = soup.new_tag("span", attrs={"class": "doc-header-left"})
hl.string = left
ht.append(hl)
if right:
hr = soup.new_tag("span", attrs={"class": "doc-header-right"})
hr.string = right
ht.append(hr)
meta.append(ht)
if footer_text:
ft = soup.new_tag("div", attrs={"class": "doc-footer-text"})
ft.string = footer_text
meta.append(ft)
page_header_val = (header_text or (document_name_opt or None))
if not page_header_val:
first_h1_for_header = None
if soup.body:
first_h1_for_header = soup.body.find("h1")
else:
first_h1_for_header = soup.find("h1")
page_header_val = (first_h1_for_header.get_text(strip=True) if first_h1_for_header else "文档")
page_footer_val = (footer_text or "FunMD")
ph = soup.new_tag("div", attrs={"class": "doc-page-header"})
if logo_url:
logo_inline = soup.new_tag("img", attrs={"src": logo_url, "class": "doc-page-header-logo"})
ph.append(logo_inline)
ht_inline = soup.new_tag("span", attrs={"class": "doc-page-header-text"})
ht_inline.string = page_header_val
ph.append(ht_inline)
meta.append(ph)
pf = soup.new_tag("div", attrs={"class": "doc-page-footer"})
pf.string = page_footer_val
meta.append(pf)
if copyright_text:
cp = soup.new_tag("div", attrs={"class": "doc-copyright"})
cp.string = copyright_text
meta.append(cp)
# brand logo is rendered inline within header; no separate top-left element
if soup.body:
soup.body.insert(0, meta)
else:
soup.insert(0, meta)
if not soup.head:
head = soup.new_tag("head")
soup.insert(0, head)
else:
head = soup.head
style_run = soup.new_tag("style")
style_run.string = "@page{margin:20mm}@page{\n @top-center{content: element(page-header)}\n @bottom-center{content: element(page-footer)}\n}\n.doc-page-header{position: running(page-header); font-size:10pt; color:#666; display:block; text-align:center; width:100%}\n.doc-page-header::after{content:''; display:block; width:80%; border-bottom:1px solid #d9d9d9; margin:4px auto 0}\n.doc-page-header-logo{height:20px; vertical-align:middle; margin-right:4px}\n.doc-page-header-text{vertical-align:middle}\n.doc-page-footer{position: running(page-footer); font-size:10pt; color:#666}\n.doc-page-footer::before{content:''; display:block; width:80%; border-top:1px solid #d9d9d9; margin:0 auto 4px}"
head.append(style_run)
# Fallback inline styles for cover to ensure visibility even if external CSS isn't loaded
if (cover_src or product_name_opt or document_name_opt or product_version_opt or document_version_opt):
if not soup.head:
head = soup.new_tag("head")
soup.insert(0, head)
else:
head = soup.head
style = soup.new_tag("style")
style.string = "@page:first{margin:0} html,body{margin:0;padding:0}.cover{position:relative;width:210mm;height:297mm;overflow:hidden;page-break-after:always}.cover .cover-bg{position:absolute;left:0;top:0;right:0;bottom:0;width:100%;height:100%;object-fit:cover;display:block}.cover .cover-brand{position:absolute;top:20mm;left:20mm;font-size:18pt;font-weight:700;color:#1d4ed8}.cover .cover-footer{position:absolute;left:0;right:0;bottom:0;background:#1d4ed8;color:#fff;padding:12mm 20mm}.cover .cover-title{font-size:24pt;font-weight:700;margin:0}.cover .cover-subtitle{font-size:13pt;margin-top:4pt}.cover .cover-meta{margin-top:8pt;font-size:11pt;display:flex;gap:20mm}"
head.append(style)
if cover_src or product_name_opt or document_name_opt or product_version_opt or document_version_opt:
cov = soup.new_tag("section", attrs={"class": "cover"})
if cover_src:
bg = soup.new_tag("img", attrs={"class": "cover-bg", "src": cover_src})
cov.append(bg)
if product_name_opt:
brand_el = soup.new_tag("div", attrs={"class": "cover-brand"})
brand_el.string = product_name_opt
cov.append(brand_el)
footer = soup.new_tag("div", attrs={"class": "cover-footer"})
title_text = document_name_opt or None
if not title_text:
first_h1 = soup.body.find("h1") if soup.body else soup.find("h1")
if first_h1:
title_text = first_h1.get_text(strip=True)
title_el = soup.new_tag("div", attrs={"class": "cover-title"})
title_el.string = title_text or "文档"
footer.append(title_el)
subtitle_val = opts.get("filename_text") or ""
if subtitle_val:
subtitle_el = soup.new_tag("div", attrs={"class": "cover-subtitle"})
subtitle_el.string = subtitle_val
footer.append(subtitle_el)
meta_el = soup.new_tag("div", attrs={"class": "cover-meta"})
if product_version_opt:
pv = soup.new_tag("span")
pv.string = f"产品版本:{product_version_opt}"
meta_el.append(pv)
if document_version_opt:
dv = soup.new_tag("span")
dv.string = f"文档版本:{document_version_opt}"
meta_el.append(dv)
footer.append(meta_el)
cov.append(footer)
if soup.body:
soup.body.insert(1, cov)
else:
soup.insert(1, cov)
if toc_flag:
headings = [
el for el in (soup.find_all(["h1", "h2", "h3"]) or [])
if el.get("data-origin") != "subtitle"
]
if headings:
ul = soup.new_tag("ul")
idx = 1
for el in headings:
text = el.get_text(strip=True)
if not text:
continue
hid = el.get("id")
if not hid:
hid = f"sec-{idx}"
el["id"] = hid
idx += 1
li = soup.new_tag("li", attrs={"class": f"toc-{el.name}"})
a = soup.new_tag("a", attrs={"href": f"#{hid}", "class": "toc-text"})
a.string = text
dots = soup.new_tag("span", attrs={"class": "toc-dots"})
page = soup.new_tag("span", attrs={"class": "toc-page", "data-target": f"#{hid}"})
li.append(a)
li.append(dots)
li.append(page)
ul.append(li)
nav = soup.new_tag("nav", attrs={"class": "toc"})
h = soup.new_tag("h1")
h.string = "目录"
nav.append(h)
nav.append(ul)
if soup.body:
soup.body.insert(2, nav)
else:
soup.insert(2, nav)
if soup.body:
for h in soup.body.find_all(["h1", "h2", "h3"]):
sib: Optional[PageElement] = h.find_next_sibling()
blocks: List[Any] = []
first_table: Optional[Any] = None
while sib is not None:
# Skip pure whitespace nodes
if getattr(sib, "name", None) is None:
try:
if str(sib).strip() == "":
sib = sib.next_sibling
continue
except Exception:
break
# Stop if next heading encountered
name = getattr(sib, "name", None)
if name in ["h1", "h2", "h3"]:
break
# Collect explanatory blocks until first table
if name == "table":
first_table = sib
break
if name in ["p", "blockquote", "ul", "ol"]:
blocks.append(sib)
sib = sib.next_sibling
continue
# Unknown block: stop grouping to avoid wrapping unrelated content
break
if first_table is not None:
wrap = soup.new_tag("div", attrs={"class": "table-block"})
h.insert_before(wrap)
wrap.append(h.extract())
for el in blocks:
wrap.append(el.extract())
wrap.append(first_table.extract())
return str(soup)
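# Minimal usage sketch for normalize_html, kept out of the runtime path; the
# markdown text and option values below are hypothetical examples.
def _example_normalize_html() -> str:
    md = "# Sample Document\n\nBody paragraph.\n"
    return normalize_html(md, options={
        "toc": "1",
        "header_text": "Left title||file.md",
        "footer_text": "Footer",
        "product_name": "FunMD",
        "document_version": "1.0",
    })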
def _stylesheets_for(css_name: Optional[str], css_text: Optional[str]):
sheets: List[Any] = []
if CSS is None:
return sheets
if css_text:
sheets.append(CSS(string=css_text))
if css_name:
css_path = Path(__file__).resolve().parent.parent / "configs" / "styles" / f"{css_name}.css"
if css_path.exists():
sheets.append(CSS(filename=str(css_path)))
return sheets
def md_to_pdf_bytes_with_renderer(md: str, renderer: str = "weasyprint", css_name: Optional[str] = None, css_text: Optional[str] = None, toc: bool = False, header_text: Optional[str] = None, footer_text: Optional[str] = None, logo_url: Optional[str] = None, copyright_text: Optional[str] = None, filename_text: Optional[str] = None, cover_src: Optional[str] = None, product_name: Optional[str] = None, document_name: Optional[str] = None, product_version: Optional[str] = None, document_version: Optional[str] = None) -> bytes:
html = normalize_html(md, options={
"toc": "1" if toc else "",
"header_text": header_text,
"footer_text": footer_text,
"logo_url": logo_url,
"copyright_text": copyright_text,
"filename_text": filename_text,
"cover_src": cover_src,
"product_name": product_name,
"document_name": document_name,
"product_version": product_version,
"document_version": document_version,
})
if HTML is not None:
stylesheets = _stylesheets_for(css_name, css_text)
pdf_bytes = HTML(string=html).write_pdf(stylesheets=stylesheets or None)
return pdf_bytes
raise RuntimeError("WeasyPrint is not available")
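# Usage sketch for the two exporters above; the output paths and markdown are
# hypothetical, and the PDF branch only runs when WeasyPrint is importable.
def _example_export(md: str = "# Title\n\nHello.\n") -> None:
    docx_bytes = md_to_docx_bytes(md, toc=True, header_text="Header||file.md")
    Path("example.docx").write_bytes(docx_bytes)
    if HTML is not None:
        pdf_bytes = md_to_pdf_bytes_with_renderer(md, toc=True)
        Path("example.pdf").write_bytes(pdf_bytes)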

View File

@@ -0,0 +1,190 @@
from typing import Optional, Tuple, Dict
import os
import logging
from urllib.request import urlopen
try:
from minio import Minio # type: ignore
import urllib3 # type: ignore
except Exception:
Minio = None
urllib3 = None # type: ignore
def minio_head_bucket(client: object, bucket: str) -> bool:
try:
if hasattr(client, "bucket_exists"):
try:
return bool(client.bucket_exists(bucket)) # type: ignore
except Exception:
pass
try:
region = client._get_region(bucket) # type: ignore
except Exception:
region = "us-east-1"
client._url_open(method="HEAD", region=region, bucket_name=bucket) # type: ignore
return True
except Exception:
try:
names = [getattr(b, "name", None) for b in client.list_buckets()] # type: ignore
return bucket in set(n for n in names if n)
except Exception:
return False
def minio_create_bucket(client: object, bucket: str) -> bool:
try:
if hasattr(client, "bucket_exists"):
try:
if client.bucket_exists(bucket): # type: ignore
return True
except Exception:
pass
if hasattr(client, "make_bucket"):
try:
client.make_bucket(bucket) # type: ignore
return True
except Exception:
try:
region = client._get_region(bucket) # type: ignore
except Exception:
region = "us-east-1"
try:
client.make_bucket(bucket, location=region) # type: ignore
return True
except Exception:
pass
try:
try:
region = client._get_region(bucket) # type: ignore
except Exception:
region = "us-east-1"
client._url_open(method="PUT", region=region, bucket_name=bucket) # type: ignore
return True
except Exception as ce:
if "BucketAlreadyOwnedByYou" in str(ce) or "BucketAlreadyExists" in str(ce):
return True
raise
except Exception as e:
raise e
def minio_client(endpoint: str, access: str, secret: str, secure: bool):
if urllib3 is not None:
try:
http = urllib3.PoolManager(timeout=urllib3.Timeout(connect=3.0, read=20.0))
return Minio(endpoint=endpoint, access_key=access, secret_key=secret, secure=secure, http_client=http) # type: ignore
except Exception:
return Minio(endpoint=endpoint, access_key=access, secret_key=secret, secure=secure) # type: ignore
return Minio(endpoint=endpoint, access_key=access, secret_key=secret, secure=secure) # type: ignore
def minio_time_hint(endpoint: str, secure: bool) -> Optional[str]:
try:
scheme = "https" if secure else "http"
r = urlopen(f"{scheme}://{endpoint}", timeout=3)
srv_date = r.headers.get("Date")
if not srv_date:
return None
from email.utils import parsedate_to_datetime
from datetime import datetime, timezone
dt = parsedate_to_datetime(srv_date)
now = datetime.now(timezone.utc)
diff = abs((now - dt).total_seconds())
return f"服务器时间与本机相差约 {int(diff)}"
except Exception:
return None
def join_prefix(prefix: str, rel: str) -> str:
pre = (prefix or "").strip("/")
r = rel.lstrip("/")
if pre and r.startswith(pre + "/"):
return r
return f"{pre}/{r}" if pre else r
def presigned_read(client: object, bucket: str, obj: str, expires_seconds: int) -> Optional[str]:
try:
from datetime import timedelta
exp = expires_seconds
try:
exp = int(exp)
except Exception:
pass
td = timedelta(seconds=exp)
try:
return client.get_presigned_url("GET", bucket, obj, expires=td) # type: ignore
except Exception:
return client.presigned_get_object(bucket, obj, expires=td) # type: ignore
except Exception:
return None
def minio_current(runtime_cfg: Dict[str, Dict[str, Optional[str]]]) -> Tuple[Optional[object], Optional[str], Optional[str], str]:
rc = runtime_cfg.get("minio", {})
endpoint_raw = rc.get("endpoint") or os.environ.get("MINIO_ENDPOINT")
access_raw = rc.get("access") or os.environ.get("MINIO_ACCESS_KEY")
secret_raw = rc.get("secret") or os.environ.get("MINIO_SECRET_KEY")
bucket_raw = rc.get("bucket") or os.environ.get("MINIO_BUCKET")
secure_flag = rc.get("secure") or os.environ.get("MINIO_SECURE", "false")
secure = str(secure_flag or "false").lower() in {"1","true","yes","on"}
public_raw = rc.get("public") or os.environ.get("MINIO_PUBLIC_ENDPOINT")
endpoint = (str(endpoint_raw).strip() if endpoint_raw else None)
try:
if isinstance(endpoint, str) and ":9001" in endpoint:
h = endpoint.split("/")[0]
if ":" in h:
parts = h.split(":")
endpoint = f"{parts[0]}:9000"
else:
endpoint = h
except Exception:
pass
access = (str(access_raw).strip() if access_raw else None)
secret = (str(secret_raw).strip() if secret_raw else None)
bucket = (str(bucket_raw).strip() if bucket_raw else None)
public_base = (str(public_raw).strip() if public_raw else None)
try:
if isinstance(public_base, str) and (":9001" in public_base or "/browser" in public_base or "/minio" in public_base):
host = public_base.strip().split("/")[0]
scheme = "https" if secure else "http"
if ":" in host:
host = host.split("/")[0]
base_host = host.split(":")[0]
public_base = f"{scheme}://{base_host}:9000"
else:
public_base = f"{scheme}://{host}:9000"
except Exception:
pass
if not public_base and endpoint:
public_base = f"https://{endpoint}" if secure else f"http://{endpoint}"
missing = []
if Minio is None:
missing.append("client")
if not endpoint:
missing.append("endpoint")
if not access:
missing.append("access")
if not secret:
missing.append("secret")
if not bucket:
missing.append("bucket")
if not public_base:
missing.append("public")
if missing:
try:
logging.error(f"minio config invalid: missing={missing}")
except Exception:
pass
return None, None, None, ""
client = minio_client(endpoint=endpoint, access=access, secret=secret, secure=secure)
try:
try:
client.list_buckets() # type: ignore
except Exception as e:
if secure and ("SSL" in str(e) or "HTTPSConnectionPool" in str(e) or "SSLError" in str(e)):
client = minio_client(endpoint=endpoint, access=access, secret=secret, secure=False)
except Exception:
pass
try:
exists = minio_head_bucket(client, bucket)
if not exists:
minio_create_bucket(client, bucket)
except Exception:
pass
prefix = rc.get("prefix") or os.environ.get("MINIO_PREFIX", "")
return client, bucket, public_base, prefix
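# Usage sketch: resolve a client from runtime config / environment variables and
# build a presigned download URL. The object key is a hypothetical example and
# this helper is not part of the service's call path.
def _example_presign(runtime_cfg: Dict[str, Dict[str, Optional[str]]]) -> Optional[str]:
    client, bucket, public_base, prefix = minio_current(runtime_cfg)
    if client is None or not bucket:
        return None
    obj = join_prefix(prefix, "exports/example.md")
    return presigned_read(client, bucket, obj, expires_seconds=3600)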


@@ -0,0 +1,492 @@
from pathlib import Path
from typing import Optional, Tuple
import re
import tempfile
import sys
from urllib.parse import urlsplit
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
import io
_DOC_AVAILABLE = True
try:
_DOC_BASE = Path(__file__).resolve().parents[2] / "docling"
p = str(_DOC_BASE)
if p not in sys.path:
sys.path.insert(0, p)
except Exception:
pass
try:
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.document_converter import PdfFormatOption
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_core.types.doc import ImageRefMode
except Exception:
_DOC_AVAILABLE = False
class DocumentConverter: # type: ignore
def __init__(self, *args, **kwargs):
pass
def convert(self, source):
raise RuntimeError("docling unavailable")
class InputFormat: # type: ignore
PDF = "pdf"
class PdfFormatOption: # type: ignore
def __init__(self, *args, **kwargs):
pass
class StandardPdfPipeline: # type: ignore
pass
class PdfPipelineOptions: # type: ignore
def __init__(self):
pass
class ImageRefMode: # type: ignore
EMBEDDED = None
"""
@api Unified Converter Service
@description Provides core document conversion logic unifying Docling and word2markdown engines
"""
_W2M_AVAILABLE = False
try:
from app.services.word2markdown import convert_any as _w2m_convert_any # type: ignore
_W2M_AVAILABLE = True
except Exception:
_W2M_AVAILABLE = False
try:
from bs4 import BeautifulSoup # type: ignore
except Exception:
BeautifulSoup = None # type: ignore
try:
from app.services.docling_adapter import normalize_html as _normalize_html # type: ignore
from app.services.docling_adapter import resolve_link as _resolve_link # type: ignore
from app.services.docling_adapter import _render_markdown_html as _render_md_html # type: ignore
except Exception:
_normalize_html = None # type: ignore
_resolve_link = None # type: ignore
_render_md_html = None # type: ignore
def _is_http(s: str) -> bool:
t = (s or "").lower()
return t.startswith("http://") or t.startswith("https://")
def _read_bytes(source: str) -> Tuple[bytes, str]:
ct = ""
try:
if _is_http(source):
from urllib.request import urlopen
with urlopen(source, timeout=10) as r:
ct = r.headers.get("Content-Type") or ""
return r.read() or b"", ct
p = Path(source)
if p.exists() and p.is_file():
return p.read_bytes(), ct
except Exception:
return b"", ct
return b"", ct
def _decode_to_utf8(raw: bytes, ct: str = "") -> str:
if not raw:
return ""
if raw.startswith(b"\xef\xbb\xbf"):
try:
return raw[3:].decode("utf-8")
except Exception:
pass
if raw.startswith(b"\xff\xfe"):
try:
return raw[2:].decode("utf-16le")
except Exception:
pass
if raw.startswith(b"\xfe\xff"):
try:
return raw[2:].decode("utf-16be")
except Exception:
pass
try:
m = re.search(r"charset=([\w-]+)", ct or "", re.IGNORECASE)
if m:
enc = m.group(1).strip().lower()
try:
return raw.decode(enc)
except Exception:
pass
except Exception:
pass
candidates = [
"utf-8", "gb18030", "gbk", "big5", "shift_jis", "iso-8859-1", "windows-1252",
]
for enc in candidates:
try:
return raw.decode(enc)
except Exception:
continue
return raw.decode("utf-8", errors="replace")
def _normalize_newlines(s: str) -> str:
return (s or "").replace("\r\n", "\n").replace("\r", "\n")
def _html_to_markdown(html: str) -> str:
if not html:
return ""
if BeautifulSoup is None:
return html
soup = BeautifulSoup(html, "html.parser")
out: list[str] = []
def txt(node) -> str:
return (getattr(node, "get_text", lambda **kwargs: str(node))(strip=True) if node else "")
def inline(node) -> str:
if isinstance(node, str):
return node
name = getattr(node, "name", None)
if name is None:
return str(node)
if name in {"strong", "b"}:
return "**" + txt(node) + "**"
if name in {"em", "i"}:
return "*" + txt(node) + "*"
if name == "code":
return "`" + txt(node) + "`"
if name == "a":
href_val = node.get("href")
extra_val = node.get("data-doc")
href = href_val if isinstance(href_val, str) else None
extra = extra_val if isinstance(extra_val, str) else None
resolved = _resolve_link(href, extra) if _resolve_link else (href or extra)
url = resolved or ""
text = txt(node)
if url:
return f"[{text}]({url})"
return text
if name == "img":
alt = node.get("alt") or "image"
src = node.get("src") or ""
return f"![{alt}]({src})"
res = []
for c in getattr(node, "children", []):
res.append(inline(c))
return "".join(res)
def block(node):
name = getattr(node, "name", None)
if name is None:
s = str(node).strip()
if s:
out.append(s)
return
if name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
lvl = int(name[1])
out.append("#" * lvl + " " + txt(node))
out.append("")
return
if name == "p":
segs = [inline(c) for c in node.children]
out.append("".join(segs))
out.append("")
return
if name == "br":
out.append("")
return
if name in {"ul", "ol"}:
is_ol = name == "ol"
idx = 1
for li in node.find_all("li", recursive=False):
text = "".join(inline(c) for c in li.children)
if is_ol:
out.append(f"{idx}. {text}")
idx += 1
else:
out.append(f"- {text}")
out.append("")
return
if name == "pre":
code_node = node.find("code")
code_text = code_node.get_text() if code_node else node.get_text()
lang = ""
cls = (code_node.get("class") if code_node else node.get("class")) or []
for c in cls:
s = str(c)
if s.startswith("language-"):
lang = s.split("-", 1)[-1]
break
out.append(f"```{lang}\n{code_text}\n```\n")
return
if name == "blockquote":
lines = [l for l in txt(node).splitlines() if l.strip()]
for l in lines:
out.append("> " + l)
out.append("")
return
if name == "table":
rows = node.find_all("tr")
if not rows:
return
headers = [h.get_text(strip=True) for h in (rows[0].find_all(["th","td"]) or [])]
if headers:
out.append("|" + "|".join(headers) + "|")
sep = "|" + "|".join(["---" for _ in headers]) + "|"
out.append(sep)
for tr in rows[1:]:
cells = [td.get_text(strip=True) for td in tr.find_all("td")]
if cells:
out.append("|" + "|".join(cells) + "|")
out.append("")
return
if name == "div":
for c in node.children:
block(c)
return
segs = [inline(c) for c in node.children]
if segs:
out.append("".join(segs))
out.append("")
root = soup.body or soup
for ch in getattr(root, "children", []):
block(ch)
return _normalize_newlines("\n".join(out)).strip()
def _lower_html_table_tags(html: str) -> str:
"""
@function _lower_html_table_tags
@description Normalizes HTML table tags to lowercase
@param html Input HTML string
@return Normalized HTML string
"""
if not html:
return html
tags = ["TABLE", "THEAD", "TBODY", "TFOOT", "TR", "TH", "TD"]
out = html
for t in tags:
out = re.sub(r"</?" + t + r"\b", lambda m: m.group(0).lower(), out)
out = re.sub(r">\s*\n+\s*", ">\n", out)
return out
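# Illustrative sketch of the tag normalisation above; the HTML snippet is made up
# and the function below is never called by the service.
def _example_lower_table_tags() -> str:
    raw = "<TABLE><TR><TD>x</TD></TR></TABLE>"
    # Expected result: "<table><tr><td>x</td></tr></table>"
    return _lower_html_table_tags(raw)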
def _replace_admonitions(md: str) -> str:
"""
@function _replace_admonitions
@description Replaces ::: style admonitions with !!! style
@param md Input markdown string
@return Processed markdown string
"""
if not md:
return md
lines = md.split("\n")
out = []
in_block = False
for raw in lines:
t = raw.strip()
if t.startswith(":::"):
if not in_block:
name = t[3:].strip()
if not name:
out.append("!!!")
else:
out.append("!!! " + name)
in_block = True
else:
out.append("!!!")
in_block = False
continue
out.append(raw)
return "\n".join(out)
def _enhance_codeblocks(md: str) -> str:
if not md:
return md
lines = md.split("\n")
res = []
in_fence = False
fence_lang = ""
i = 0
while i < len(lines):
line = lines[i]
t = line.strip()
if t.startswith("```"):
in_fence = not in_fence
try:
fence_lang = (t[3:] or "").strip() if in_fence else ""
except Exception:
fence_lang = ""
res.append(line)
i += 1
continue
if in_fence:
res.append(line)
i += 1
continue
if t.startswith("{") or t.startswith("["):
buf = [line]
j = i + 1
closed = False
depth = (t.count("{") - t.count("}")) + (t.count("[") - t.count("]"))
while j < len(lines):
buf.append(lines[j])
s = lines[j].strip()
depth += (s.count("{") - s.count("}")) + (s.count("[") - s.count("]"))
if depth <= 0 and (s.endswith("}") or s.endswith("]")):
closed = True
break
j += 1
if closed and len(buf) >= 3:
lang = "json"
res.append("```" + lang)
res.extend(buf)
res.append("```")
i = j + 1
continue
code_sig = (
("public static" in t) or ("private static" in t) or ("class " in t) or ("return " in t) or ("package " in t) or ("import " in t)
)
if code_sig:
buf = [line]
j = i + 1
while j < len(lines):
s = lines[j].strip()
if not s:
break
if s.startswith("# ") or s.startswith("## ") or s.startswith("### "):
break
buf.append(lines[j])
j += 1
if len(buf) >= 3:
res.append("```")
res.extend(buf)
res.append("```")
i = j + 1
continue
res.append(line)
i += 1
return "\n".join(res)
class FormatConverter:
"""
@class FormatConverter
@description Unified converter class wrapping Docling and word2markdown
"""
def __init__(self) -> None:
self._docling = DocumentConverter()
def convert(self, source: str, export: str = "markdown", engine: Optional[str] = None, mdx_safe_mode_enabled: bool = True) -> Tuple[str, str, Optional[str]]:
"""
@function convert
@description Convert a document source to specified format
@param source Path or URL to source document
@param export Output format (markdown, html, json, doctags)
@param engine Optional engine override (word2markdown/docling)
@param mdx_safe_mode_enabled Toggle safe mode for MDX
@return Tuple of (encoding, content, artifacts_dir)
"""
# Prefer custom word2markdown engine for DOC/DOCX when available
auto_engine = None
try:
from pathlib import Path as _P
suf = _P(source).suffix.lower()
if not engine and suf in {".doc", ".docx"} and _W2M_AVAILABLE:
auto_engine = "word2markdown"
except Exception:
auto_engine = None
use_engine = (engine or auto_engine or "").lower()
try:
from urllib.parse import urlsplit
path = source
if _is_http(source):
path = urlsplit(source).path or ""
ext = Path(path).suffix.lower()
except Exception:
ext = Path(source).suffix.lower()
if ext in {".txt"}:
raw, ct = _read_bytes(source)
text = _normalize_newlines(_decode_to_utf8(raw, ct))
if export.lower() == "html":
if _render_md_html is not None:
html = _render_md_html(text)
else:
try:
import marko
html = marko.convert(text)
except Exception:
html = f"<pre>{text}</pre>"
return "utf-8", _lower_html_table_tags(html), None
md = _enhance_codeblocks(text)
return "utf-8", md, None
if ext in {".md"}:
raw, ct = _read_bytes(source)
text = _normalize_newlines(_decode_to_utf8(raw, ct))
if export.lower() == "html":
if _render_md_html is not None:
html = _render_md_html(text)
else:
try:
import marko
html = marko.convert(text)
except Exception:
html = text
return "utf-8", _lower_html_table_tags(html), None
return "utf-8", text, None
if ext in {".html", ".htm"}:
try:
conv = DocumentConverter(allowed_formats=[InputFormat.HTML])
result = conv.convert(source)
if export.lower() == "html":
html = result.document.export_to_html()
html = _lower_html_table_tags(html)
return "utf-8", html, None
md = result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
md = _replace_admonitions(md)
md = _enhance_codeblocks(md)
return "utf-8", md, None
except Exception:
raw, ct = _read_bytes(source)
html_in = _normalize_newlines(_decode_to_utf8(raw, ct))
if export.lower() == "html":
html = _normalize_html(html_in) if _normalize_html is not None else html_in
return "utf-8", _lower_html_table_tags(html), None
md = _html_to_markdown(html_in)
md = _replace_admonitions(md)
md = _enhance_codeblocks(md)
return "utf-8", md, None
if use_engine in {"pandoc", "custom", "word2markdown"} and _W2M_AVAILABLE:
enc, md = _w2m_convert_any(Path(source), mdx_safe_mode_enabled=mdx_safe_mode_enabled)
md = _replace_admonitions(md)
md = _enhance_codeblocks(md)
return enc or "utf-8", md, None
# Configure PDF pipeline to generate picture images into a per-call artifacts directory
artifacts_dir = tempfile.mkdtemp(prefix="docling_artifacts_")
pdf_opts = PdfPipelineOptions()
pdf_opts.generate_picture_images = True
pdf_opts.generate_page_images = True
pdf_opts.images_scale = 2.0
pdf_opts.do_code_enrichment = True
pdf_opts.do_formula_enrichment = True
self._docling = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfPipeline,
pipeline_options=pdf_opts,
)
}
)
result = self._docling.convert(source)
if export.lower() == "markdown":
md = result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
md = _replace_admonitions(md)
md = _enhance_codeblocks(md)
return "utf-8", md, artifacts_dir
if export.lower() == "html":
html = result.document.export_to_html()
html = _lower_html_table_tags(html)
return "utf-8", html, artifacts_dir
if export.lower() == "json":
js = result.document.export_to_json()
return "utf-8", js, artifacts_dir
if export.lower() == "doctags":
dt = result.document.export_to_doctags()
return "utf-8", dt, artifacts_dir
raise RuntimeError("unsupported export")
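# Usage sketch for FormatConverter, kept separate from the class itself.
# "sample.docx" is a hypothetical path; docling (or word2markdown for DOC/DOCX)
# must be importable for the call to succeed.
def _example_convert() -> Tuple[str, str, Optional[str]]:
    conv = FormatConverter()
    # Returns (encoding, content, artifacts_dir); artifacts_dir is only produced
    # by the docling pipeline and is None for the lightweight text paths.
    return conv.convert("sample.docx", export="markdown")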


@@ -0,0 +1,429 @@
from pathlib import Path
from typing import Tuple, List
from docx import Document
from docx.table import Table
from docx.text.paragraph import Paragraph
import re
import base64
import hashlib
import tempfile
import subprocess
from lxml import etree
def _iter_blocks(doc: Document):
parent = doc
parent_elm = parent.element.body
for child in parent_elm.iterchildren():
tag = child.tag.split('}')[-1]
if tag == 'p':
yield Paragraph(child, parent)
elif tag == 'tbl':
yield Table(child, parent)
def _cell_text(cell) -> str:
parts = []
for p in cell.paragraphs:
t = p.text or ""
parts.append(t)
return "\n".join([s for s in parts if s is not None])
def _guess_lang(text: str) -> str:
t = (text or "").strip()
head = t[:512]
if re.search(r"\b(package|import\s+java\.|public\s+class|public\s+static|private\s+static|@Override)\b", head):
return "java"
if re.search(r"\b(def\s+\w+\(|import\s+\w+|print\(|from\s+\w+\s+import)\b", head):
return "python"
if re.search(r"\b(function\s+\w+\(|console\.log|let\s+\w+|const\s+\w+|=>)\b", head):
return "javascript"
if re.search(r"^#include|\bint\s+main\s*\(\)", head):
return "c"
if re.search(r"\busing\s+namespace\b|\bstd::\b|\btemplate\b", head):
return "cpp"
if re.search(r"\b(SELECT|INSERT|UPDATE|DELETE|CREATE\s+TABLE|DROP\s+TABLE|ALTER\s+TABLE)\b", head, re.IGNORECASE):
return "sql"
if head.startswith("{") or head.startswith("["):
return "json"
if re.search(r"<html|<div|<span|<table|<code|<pre", head, re.IGNORECASE):
return "html"
if re.search(r"<\?xml|</?[A-Za-z0-9:_-]+>", head):
return "xml"
return ""
def _table_to_md(tbl: Table) -> str:
rows = tbl.rows
cols = tbl.columns
if len(rows) == 1 and len(cols) == 1:
txt = _cell_text(rows[0].cells[0]).strip()
lang = _guess_lang(txt)
return f"```{lang}\n{txt}\n```\n"
def _cell_inline_md(doc: Document, paragraph: Paragraph) -> str:
el = paragraph._element
parts: List[str] = []
try:
for ch in el.iterchildren():
tag = ch.tag.split('}')[-1]
if tag == 'r':
for rc in ch.iterchildren():
rtag = rc.tag.split('}')[-1]
if rtag == 't':
s = rc.text or ''
if s:
parts.append(s)
elif rtag == 'br':
parts.append('\n')
elif rtag == 'drawing':
try:
for node in rc.iter():
local = node.tag.split('}')[-1]
rid = None
if local == 'blip':
rid = node.get(f"{{{NS['r']}}}embed") or node.get(f"{{{NS['r']}}}link")
elif local == 'imagedata':
rid = node.get(f"{{{NS['r']}}}id")
if not rid:
continue
try:
part = None
rp = getattr(doc.part, 'related_parts', None)
if isinstance(rp, dict) and rid in rp:
part = rp.get(rid)
if part is None:
rels = getattr(doc.part, 'rels', None)
if rels is not None and hasattr(rels, 'get'):
rel = rels.get(rid)
part = getattr(rel, 'target_part', None)
if part is None:
rel = getattr(doc.part, '_rels', {}).get(rid)
part = getattr(rel, 'target_part', None)
ct = getattr(part, 'content_type', '') if part is not None else ''
data = part.blob if part is not None and hasattr(part, 'blob') else None
if data:
b64 = base64.b64encode(data).decode('ascii')
parts.append(f"![Image](data:{ct};base64,{b64})")
except Exception:
pass
except Exception:
pass
except Exception:
pass
return ''.join(parts)
out = []
# resolve the owning document (the table's parent may be the document body) so image relationship parts can be looked up
doc = getattr(tbl, '_parent', None) or getattr(tbl, 'part', None)
for r_i, r in enumerate(rows):
vals = []
for c in r.cells:
segs: List[str] = []
for p in c.paragraphs:
s = _cell_inline_md(doc, p)
if s:
segs.append(s)
cell_text = '<br>'.join([x for x in segs if x is not None])
vals.append((cell_text or '').replace('|', '\\|').strip())
line = "| " + " | ".join(vals) + " |"
out.append(line)
if r_i == 0:
sep = "| " + " | ".join(["---" for _ in vals]) + " |"
out.append(sep)
return "\n".join(out) + "\n"
def _paragraph_to_md(p: Paragraph) -> str:
return (p.text or "").strip() + "\n\n"
def convert_any(path: Path, mdx_safe_mode_enabled: bool = True) -> Tuple[str, str]:
ext = path.suffix.lower()
use_path = path
if ext == ".doc":
use_path = _convert_doc_to_docx_cross_platform(path)
if use_path.suffix.lower() not in {".docx"}:
raise RuntimeError("unsupported input for word2markdown")
doc = Document(str(use_path))
out: List[str] = []
in_code = False
code_lines: List[str] = []
lang_hint: str = ''
for blk in _iter_blocks(doc):
if isinstance(blk, Table):
out.append(_table_to_md(blk))
elif isinstance(blk, Paragraph):
tboxes = _paragraph_textboxes(blk)
for tb in tboxes:
if tb.strip():
out.append(_md_code_block(tb.strip()))
sdts = _paragraph_sdts(blk)
for s in sdts:
if s.strip():
out.append(_md_code_block(s.strip()))
btx = _paragraph_bordered_text(blk)
for s in btx:
if s.strip():
out.append(_md_code_block(s.strip()))
ftx = _paragraph_framed(blk)
for s in ftx:
if s.strip():
out.append(_md_code_block(s.strip()))
raw = (blk.text or "")
sraw = raw.strip()
if _looks_like_code_paragraph(sraw) or (in_code and sraw == ""):
if not in_code:
in_code = True
lang_hint = _guess_lang(sraw)
code_lines = []
code_lines.append(raw)
continue
if in_code and code_lines:
text = "\n".join(code_lines)
use_lang = lang_hint or _guess_lang(text)
out.append(f"```{use_lang}\n{text}\n```\n")
in_code = False
code_lines = []
lang_hint = ''
def _paragraph_with_images(doc: Document, p: Paragraph) -> str:
el = p._element
parts: List[str] = []
try:
for ch in el.iterchildren():
tag = ch.tag.split('}')[-1]
if tag == 'r':
for rc in ch.iterchildren():
rtag = rc.tag.split('}')[-1]
if rtag == 't':
s = rc.text or ''
if s:
parts.append(s)
elif rtag == 'br':
parts.append('\n')
elif rtag == 'drawing':
for node in rc.iter():
local = node.tag.split('}')[-1]
rid = None
if local == 'blip':
rid = node.get(f"{{{NS['r']}}}embed") or node.get(f"{{{NS['r']}}}link")
elif local == 'imagedata':
rid = node.get(f"{{{NS['r']}}}id")
if not rid:
continue
try:
part = None
rp = getattr(doc.part, 'related_parts', None)
if isinstance(rp, dict) and rid in rp:
part = rp.get(rid)
if part is None:
rels = getattr(doc.part, 'rels', None)
if rels is not None and hasattr(rels, 'get'):
rel = rels.get(rid)
part = getattr(rel, 'target_part', None)
if part is None:
rel = getattr(doc.part, '_rels', {}).get(rid)
part = getattr(rel, 'target_part', None)
ct = getattr(part, 'content_type', '') if part is not None else ''
data = part.blob if part is not None and hasattr(part, 'blob') else None
if data:
b64 = base64.b64encode(data).decode('ascii')
parts.append(f"![Image](data:{ct};base64,{b64})")
except Exception:
pass
except Exception:
pass
s = ''.join(parts).strip()
return (s + '\n\n') if s else ''
txt = _paragraph_with_images(doc, blk)
if txt.strip():
out.append(txt)
if in_code and code_lines:
text = "\n".join(code_lines)
use_lang = lang_hint or _guess_lang(text)
out.append(f"```{use_lang}\n{text}\n```\n")
try:
boxes = _doclevel_textboxes(doc)
existing_texts = set()
try:
for seg in out:
if isinstance(seg, str):
ss = seg.strip()
if ss.startswith("```"):
m = re.search(r"^```[\w-]*\n([\s\S]*?)\n```\s*$", ss)
if m:
existing_texts.add(m.group(1).strip())
continue
existing_texts.add(ss)
except Exception:
pass
for tb in boxes:
s = (tb or '').strip()
if not s:
continue
if s in existing_texts:
continue
out.append(_md_code_block(s))
existing_texts.add(s)
except Exception:
pass
md = "".join(out)
return "utf-8", md
NS = {
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
"wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
"wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
"v": "urn:schemas-microsoft-com:vml",
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
"pic": "http://schemas.openxmlformats.org/drawingml/2006/picture",
}
def _paragraph_textboxes(p: Paragraph) -> List[str]:
try:
el = p._element
texts: List[str] = []
for tbox in el.xpath('.//wps:txbx/w:txbxContent', namespaces=NS):
paras = tbox.xpath('.//w:p', namespaces=NS)
buf: List[str] = []
for w_p in paras:
ts = w_p.xpath('.//w:t', namespaces=NS)
s = ''.join([t.text or '' for t in ts]).strip()
if s:
buf.append(s)
if buf:
texts.append('\n'.join(buf))
for tbox in el.xpath('.//v:textbox/w:txbxContent', namespaces=NS):
paras = tbox.xpath('.//w:p', namespaces=NS)
buf: List[str] = []
for w_p in paras:
ts = w_p.xpath('.//w:t', namespaces=NS)
s = ''.join([t.text or '' for t in ts]).strip()
if s:
buf.append(s)
if buf:
texts.append('\n'.join(buf))
return texts
except Exception:
return []
def _paragraph_sdts(p: Paragraph) -> List[str]:
try:
el = p._element
texts: List[str] = []
for sdt in el.xpath('.//w:sdt/w:sdtContent', namespaces=NS):
paras = sdt.xpath('.//w:p', namespaces=NS)
buf: List[str] = []
for w_p in paras:
ts = w_p.xpath('.//w:t', namespaces=NS)
s = ''.join([t.text or '' for t in ts]).strip()
if s:
buf.append(s)
if buf:
texts.append('\n'.join(buf))
return texts
except Exception:
return []
def _paragraph_bordered_text(p: Paragraph) -> List[str]:
try:
el = p._element
has_border = bool(el.xpath('./w:pPr/w:pBdr', namespaces=NS))
t = (p.text or '').strip()
if has_border and t:
return [t]
except Exception:
pass
return []
def _paragraph_framed(p: Paragraph) -> List[str]:
try:
el = p._element
has_frame = bool(el.xpath('./w:pPr/w:framePr', namespaces=NS))
t = (p.text or '').strip()
if has_frame and t:
return [t]
except Exception:
pass
return []
def _md_code_block(text: str) -> str:
lang = _guess_lang(text)
return f"```{lang}\n{text}\n```\n"
def _looks_like_code_paragraph(t: str) -> bool:
s = (t or '').strip()
if not s:
return False
if s.startswith('{') or s.startswith('[') or s.endswith('}'):
return True
if s.startswith(' ') or s.startswith('\t'):
return True
if ';' in s or '{' in s or '}' in s:
return True
keywords = ['public static', 'private static', 'class ', 'return ', 'import ', 'package ', 'byte[]', 'String ', 'Cipher', 'KeyFactory']
return any(k in s for k in keywords)
def _doclevel_textboxes(doc: Document) -> List[str]:
texts: List[str] = []
try:
el = doc.element.body
for tbox in el.xpath('.//wps:txbx/w:txbxContent', namespaces=NS):
paras = tbox.xpath('.//w:p', namespaces=NS)
buf: List[str] = []
for w_p in paras:
ts = w_p.xpath('.//w:t', namespaces=NS)
s = ''.join([(t.text or '') for t in ts]).strip()
if s:
buf.append(s)
if buf:
texts.append('\n'.join(buf))
for tbox in el.xpath('.//v:textbox/w:txbxContent', namespaces=NS):
paras = tbox.xpath('.//w:p', namespaces=NS)
buf: List[str] = []
for w_p in paras:
ts = w_p.xpath('.//w:t', namespaces=NS)
s = ''.join([(t.text or '') for t in ts]).strip()
if s:
buf.append(s)
if buf:
texts.append('\n'.join(buf))
except Exception:
pass
return texts
def _convert_doc_to_docx_cross_platform(path: Path) -> Path:
try:
with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
tmp.close()
subprocess.run(["textutil", "-convert", "docx", str(path), "-output", tmp.name], check=True)
return Path(tmp.name)
except Exception:
pass
try:
outdir = Path(tempfile.mkdtemp(prefix="doc2docx_"))
subprocess.run(["soffice", "--headless", "--convert-to", "docx", "--outdir", str(outdir), str(path)], check=True)
candidate = outdir / (path.stem + ".docx")
if candidate.exists():
return candidate
except Exception:
pass
try:
out = Path(tempfile.NamedTemporaryFile(delete=False, suffix=".docx").name)
subprocess.run(["unoconv", "-f", "docx", "-o", str(out), str(path)], check=True)
if out.exists():
return out
except Exception:
pass
raise RuntimeError("doc to docx conversion failed; please install 'soffice' or 'unoconv' or convert manually")
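# Usage sketch for convert_any; "manual.docx" is a hypothetical input file and
# this helper is not invoked by the module.
def _example_convert_any() -> str:
    encoding, markdown = convert_any(Path("manual.docx"))
    assert encoding == "utf-8"
    return markdown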