Import project files

docling/app/services/__init__.py (new file, 1 line)
@@ -0,0 +1 @@

docling/app/services/docling_adapter.py (new file, 709 lines)
@@ -0,0 +1,709 @@
from pathlib import Path
from typing import Optional, Tuple, Dict, List, Any
from urllib.parse import urlparse, unquote
import os
import re
import io
from bs4 import BeautifulSoup
from bs4.element import PageElement
import marko
import sys

try:
    _DOC_BASE = Path(__file__).resolve().parents[2] / "docling"
    p = str(_DOC_BASE)
    if p not in sys.path:
        sys.path.insert(0, p)
except Exception:
    pass

try:
    from docling.document_converter import DocumentConverter
except Exception:
    class DocumentConverter:  # type: ignore
        def __init__(self, *args, **kwargs):
            pass

        def convert(self, source):
            raise RuntimeError("docling not available")

from docx import Document
from docx.shared import Mm, Pt
from docx.enum.section import WD_SECTION
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from urllib.request import urlopen
import json

try:
    from weasyprint import HTML, CSS  # type: ignore
except Exception:
    HTML = None
    CSS = None

_mdit: Any = None
_tasklists_plugin: Any = None
_deflist_plugin: Any = None
_footnote_plugin: Any = None
_attrs_plugin: Any = None
_HAS_MD_IT: bool = False
try:
    import markdown_it as _mdit  # type: ignore
    from mdit_py_plugins.tasklists import tasklists_plugin as _tasklists_plugin  # type: ignore
    from mdit_py_plugins.deflist import deflist_plugin as _deflist_plugin  # type: ignore
    from mdit_py_plugins.footnote import footnote_plugin as _footnote_plugin  # type: ignore
    from mdit_py_plugins.attrs import attrs_plugin as _attrs_plugin  # type: ignore
    _HAS_MD_IT = True
except Exception:
    pass

converter = DocumentConverter()
LINKMAP_PATH = Path(__file__).resolve().parent.parent / "configs" / "linkmap" / "linkmap.json"
_LINKMAP: Dict[str, str] = {}


def load_linkmap() -> Dict[str, str]:
    global _LINKMAP
    try:
        if LINKMAP_PATH.exists():
            _LINKMAP = json.loads(LINKMAP_PATH.read_text("utf-8")) or {}
    except Exception:
        _LINKMAP = {}
    return _LINKMAP


def save_linkmap(mapping: Dict[str, str]) -> None:
    LINKMAP_PATH.parent.mkdir(parents=True, exist_ok=True)
    LINKMAP_PATH.write_text(json.dumps(mapping, ensure_ascii=False, indent=2), "utf-8")


load_linkmap()


def resolve_link(href: Optional[str], data_doc: Optional[str]) -> Optional[str]:
    if href:
        return href
    if not _LINKMAP:
        load_linkmap()
    if data_doc and data_doc in _LINKMAP:
        return _LINKMAP[data_doc]
    return None
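
# Illustrative note (not part of the original file): resolve_link() only consults the
# linkmap when a link has no href but carries a data-doc attribute. A hypothetical
# configs/linkmap/linkmap.json might look like:
#   {"user-guide": "https://docs.example.com/user-guide"}
# so resolve_link(None, "user-guide") would return the mapped URL, while
# resolve_link("https://a.example/x", "user-guide") returns the explicit href unchanged.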


def export_payload(doc, fmt: str) -> Tuple[str, str]:
    f = fmt.lower()
    if f == "markdown":
        return doc.export_to_markdown(), "text/markdown"
    if f == "html":
        return doc.export_to_html(), "text/html"
    if f == "json":
        return doc.export_to_json(), "application/json"
    if f == "doctags":
        return doc.export_to_doctags(), "application/json"
    raise ValueError("unsupported export")


def infer_basename(source_url: Optional[str], upload_name: Optional[str]) -> str:
    if source_url:
        path = urlparse(source_url).path
        name = os.path.basename(path) or "document"
        name = unquote(name)
        return os.path.splitext(name)[0] or "document"
    if upload_name:
        name = os.path.splitext(os.path.basename(upload_name))[0] or "document"
        return name
    return "document"


def sanitize_filename(name: Optional[str]) -> str:
    if not name:
        return "document"
    name = name.strip()[:128]
    name = re.sub(r'[<>:"/\\|?*\x00-\x1F]', "_", name) or "document"
    return name


def convert_source(source: str, export: str) -> Tuple[str, str]:
    result = converter.convert(source)
    return export_payload(result.document, export)


def md_to_docx_bytes(md: str, toc: bool = False, header_text: Optional[str] = None, footer_text: Optional[str] = None, logo_url: Optional[str] = None, copyright_text: Optional[str] = None, filename_text: Optional[str] = None, cover_src: Optional[str] = None, product_name: Optional[str] = None, document_name: Optional[str] = None, product_version: Optional[str] = None, document_version: Optional[str] = None) -> bytes:
    try:
        import logging as _log
        _log.info(f"md_to_docx_bytes start toc={toc} header={bool(header_text)} footer={bool(footer_text)} logo={bool(logo_url)} cover={bool(cover_src)}")
    except Exception:
        pass

    def _add_field(paragraph, instr: str):
        r1 = paragraph.add_run()
        b = OxmlElement('w:fldChar')
        b.set(qn('w:fldCharType'), 'begin')
        r1._r.append(b)
        r2 = paragraph.add_run()
        t = OxmlElement('w:instrText')
        t.set(qn('xml:space'), 'preserve')
        t.text = instr
        r2._r.append(t)
        r3 = paragraph.add_run()
        e = OxmlElement('w:fldChar')
        e.set(qn('w:fldCharType'), 'end')
        r3._r.append(e)

    def _available_width(section) -> int:
        return section.page_width - section.left_margin - section.right_margin

    def _fetch_bytes(u: str) -> Optional[bytes]:
        try:
            if u.lower().startswith('http://') or u.lower().startswith('https://'):
                with urlopen(u, timeout=10) as r:
                    return r.read()
            p = Path(u)
            if p.exists() and p.is_file():
                return p.read_bytes()
        except Exception:
            return None
        return None

    html = normalize_html(md, options={
        "toc": "1" if toc else "",
        "header_text": header_text,
        "footer_text": footer_text,
        "logo_url": logo_url,
        "copyright_text": copyright_text,
        "filename_text": filename_text,
        "cover_src": cover_src,
        "product_name": product_name,
        "document_name": document_name,
        "product_version": product_version,
        "document_version": document_version,
    })
    try:
        import logging as _log
        _log.info(f"md_to_docx_bytes normalize_html length={len(html)}")
    except Exception:
        pass
    soup = BeautifulSoup(html, "html.parser")
    doc = Document()
    sec0 = doc.sections[0]
    sec0.page_width = Mm(210)
    sec0.page_height = Mm(297)
    sec0.left_margin = Mm(15)
    sec0.right_margin = Mm(15)
    sec0.top_margin = Mm(20)
    sec0.bottom_margin = Mm(20)
    has_cover = bool(cover_src or (soup.find('section', class_='cover') is not None))
    if has_cover:
        sec0.left_margin = Mm(0)
        sec0.right_margin = Mm(0)
        sec0.top_margin = Mm(0)
        sec0.bottom_margin = Mm(0)
        if cover_src:
            b = _fetch_bytes(cover_src)
            if b:
                bio = io.BytesIO(b)
                doc.add_picture(bio, width=_available_width(sec0))
        if product_name:
            p = doc.add_paragraph()
            r = p.add_run(product_name)
            r.font.size = Pt(18)
            r.bold = True
        t = document_name or None
        if not t:
            h1 = soup.body.find('h1') if soup.body else soup.find('h1')
            t = h1.get_text(strip=True) if h1 else '文档'
        p2 = doc.add_paragraph()
        r2 = p2.add_run(t or '文档')
        r2.font.size = Pt(24)
        r2.bold = True
        if filename_text:
            p3 = doc.add_paragraph()
            r3 = p3.add_run(filename_text)
            r3.font.size = Pt(13)
        meta_parts = []
        if product_version:
            meta_parts.append("产品版本:" + product_version)
        if document_version:
            meta_parts.append("文档版本:" + document_version)
        if meta_parts:
            doc.add_paragraph(" ".join(meta_parts))
        doc.add_section(WD_SECTION.NEW_PAGE)
        sec = doc.sections[-1]
        sec.page_width = Mm(210)
        sec.page_height = Mm(297)
        sec.left_margin = Mm(15)
        sec.right_margin = Mm(15)
        sec.top_margin = Mm(20)
        sec.bottom_margin = Mm(20)
    else:
        sec = sec0
    if header_text or logo_url or filename_text:
        hp = sec.header.add_paragraph()
        left = header_text or ''
        right = ''
        if '||' in left:
            parts = left.split('||', 1)
            left, right = parts[0], parts[1]
        elif '|' in left:
            parts = left.split('|', 1)
            left, right = parts[0], parts[1]
        if left.strip():
            hp.add_run(left.strip())
        if right.strip():
            rp = sec.header.add_paragraph()
            rp.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
            rp.add_run(right.strip())
        elif filename_text:
            rp = sec.header.add_paragraph()
            rp.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
            rp.add_run(filename_text)
    if footer_text or copyright_text:
        fp = sec.footer.add_paragraph()
        if footer_text:
            fp.add_run(footer_text)
        if copyright_text:
            cp = sec.footer.add_paragraph()
            cp.add_run(copyright_text)
    pn = sec.footer.add_paragraph()
    pn.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
    _add_field(pn, 'PAGE')
    if toc:
        doc.add_paragraph('目录')
        _add_field(doc.add_paragraph(), 'TOC \\o "1-3" \\h \\z \\u')
        doc.add_page_break()

    def add_inline(p, node):
        if isinstance(node, str):
            p.add_run(node)
            return
        if node.name in ['strong', 'b']:
            r = p.add_run(node.get_text())
            r.bold = True
            return
        if node.name in ['em', 'i']:
            r = p.add_run(node.get_text())
            r.italic = True
            return
        if node.name == 'code':
            r = p.add_run(node.get_text())
            r.font.name = 'Courier New'
            return
        if node.name == 'a':
            text = node.get_text()
            href = node.get('href')
            extra = node.get('data-doc')
            resolved = resolve_link(href, extra)
            if resolved:
                p.add_run(text + ' [' + resolved + ']')
            else:
                p.add_run(text)
            return
        if node.name == 'img':
            src = node.get('src') or ''
            b = _fetch_bytes(src)
            if b:
                bio = io.BytesIO(b)
                try:
                    doc.add_picture(bio, width=_available_width(sec))
                except Exception:
                    pass
            return
        for c in getattr(node, 'children', []):
            add_inline(p, c)

    def process_block(el):
        name = getattr(el, 'name', None)
        if name is None:
            return
        cls = el.get('class') or []
        if name == 'div' and 'doc-meta' in cls:
            return
        if name == 'section' and 'cover' in cls:
            return
        if name == 'nav' and 'toc' in cls:
            return
        if name == 'div':
            for child in el.children:
                process_block(child)
            return
        if name == 'h1':
            doc.add_heading(el.get_text(), level=1)
            return
        if name == 'h2' or (name == 'strong' and 'subtitle' in cls):
            doc.add_heading(el.get_text(), level=2)
            return
        if name == 'h3':
            doc.add_heading(el.get_text(), level=3)
            return
        if name == 'p':
            p = doc.add_paragraph()
            for c in el.children:
                add_inline(p, c)
            return
        if name in ['ul', 'ol']:
            for li in el.find_all('li', recursive=False):
                p = doc.add_paragraph(style='List Bullet')
                for c in li.children:
                    add_inline(p, c)
            return
        if name == 'pre':
            code = el.get_text() or ''
            p = doc.add_paragraph()
            run = p.add_run(code)
            run.font.name = 'Courier New'
            return
        if name == 'blockquote':
            p = doc.add_paragraph(el.get_text())
            p.paragraph_format.left_indent = Mm(10)
            return
        if name == 'table':
            rows = []
            thead = el.find('thead')
            tbody = el.find('tbody')
            if thead:
                hdrs = [th.get_text(strip=True) for th in thead.find_all('th')]
            else:
                hdrs = [cell.get_text(strip=True) for cell in el.find_all('tr')[0].find_all(['th', 'td'])] if el.find_all('tr') else []
            trs = tbody.find_all('tr') if tbody else el.find_all('tr')[1:]
            for tr in trs:
                tds = [td.get_text(strip=True) for td in tr.find_all('td')]
                rows.append(tds)
            tbl = doc.add_table(rows=1 + len(rows), cols=len(hdrs) or 1)
            hdr = tbl.rows[0].cells
            for k, h in enumerate(hdrs or ['']):
                hdr[k].text = h
            for r_idx, row in enumerate(rows):
                cells = tbl.rows[1 + r_idx].cells
                for c_idx in range(len(hdrs) or 1):
                    cells[c_idx].text = (row[c_idx] if c_idx < len(row) else '')
            return
        if name == 'img':
            src = el.get('src') or ''
            b = _fetch_bytes(src)
            if b:
                bio = io.BytesIO(b)
                try:
                    doc.add_picture(bio, width=_available_width(sec))
                except Exception:
                    pass
            return

    body = soup.body or soup
    for el in body.children:
        process_block(el)
    bio = io.BytesIO()
    try:
        import logging as _log
        _log.info("md_to_docx_bytes saving doc")
    except Exception:
        pass
    doc.save(bio)
    try:
        import logging as _log
        _log.info(f"md_to_docx_bytes done size={bio.tell()}")
    except Exception:
        pass
    return bio.getvalue()


def md_to_pdf_bytes(md: str) -> bytes:
    return md_to_pdf_bytes_with_renderer(md, renderer="weasyprint")


def _md_with_tables_to_html(md_text: str) -> str:
    lines = md_text.splitlines()
    out = []
    i = 0
    while i < len(lines):
        line = lines[i]

        def is_sep(s: str) -> bool:
            s = s.strip()
            if "|" not in s:
                return False
            s = s.strip("|")
            return all(set(seg.strip()) <= set("-: ") and len(seg.strip()) >= 1 for seg in s.split("|"))

        if "|" in line and i + 1 < len(lines) and is_sep(lines[i + 1]):
            headers = [c.strip() for c in line.strip().strip("|").split("|")]
            j = i + 2
            rows = []
            while j < len(lines) and "|" in lines[j]:
                rows.append([c.strip() for c in lines[j].strip().strip("|").split("|")])
                j += 1
            tbl = ["<table>", "<thead><tr>"]
            for h in headers:
                tbl.append(f"<th>{h}</th>")
            tbl.append("</tr></thead><tbody>")
            for row in rows:
                tbl.append("<tr>")
                for idx in range(len(headers)):
                    cell = row[idx] if idx < len(row) else ""
                    tbl.append(f"<td>{cell}</td>")
                tbl.append("</tr>")
            tbl.append("</tbody></table>")
            out.append("".join(tbl))
            i = j
            continue
        out.append(line)
        i += 1
    return marko.convert("\n".join(out))
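
# Illustrative example (added comment, not in the original): the fallback above turns a
# GitHub-style pipe table into raw HTML before handing the rest to marko, e.g.
#   | Name | Value |
#   | ---- | ----- |
#   | a    | 1     |
# becomes "<table><thead><tr><th>Name</th><th>Value</th>..." on a single line, so marko
# passes it through as an HTML block instead of mangling the pipes.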


def _render_markdown_html(md_text: str) -> str:
    if _HAS_MD_IT and _mdit is not None:
        try:
            md = _mdit.MarkdownIt("commonmark").enable(["table", "strikethrough"])
            if _tasklists_plugin:
                md.use(_tasklists_plugin)
            if _deflist_plugin:
                md.use(_deflist_plugin)
            if _footnote_plugin:
                md.use(_footnote_plugin)
            if _attrs_plugin:
                md.use(_attrs_plugin)
            return md.render(md_text)
        except Exception:
            pass
    return _md_with_tables_to_html(md_text)


def normalize_html(md_or_html: str, options: Optional[Dict[str, Optional[str]]] = None) -> str:
    html = _render_markdown_html(md_or_html)
    soup = BeautifulSoup(html, "html.parser")
    for s in soup.find_all("strong", class_="subtitle"):
        s.name = "h2"
        s.attrs = {"data-origin": "subtitle"}
    for a in soup.find_all("a"):
        href_val = a.get("href")
        extra_val = a.get("data-doc")
        href = href_val if isinstance(href_val, str) else None
        extra = extra_val if isinstance(extra_val, str) else None
        resolved = resolve_link(href, extra)
        if resolved:
            a["href"] = resolved
        elif not href and extra:
            a.replace_with(a.get_text() + " [" + extra + "]")
    opts = options or {}
    header_text = opts.get("header_text") or None
    footer_text = opts.get("footer_text") or None
    logo_url = opts.get("logo_url") or None
    copyright_text = opts.get("copyright_text") or None
    cover_src = opts.get("cover_src") or None
    product_name_opt = opts.get("product_name") or None
    document_name_opt = opts.get("document_name") or None
    product_version_opt = opts.get("product_version") or None
    document_version_opt = opts.get("document_version") or None
    toc_flag = bool(opts.get("toc"))
    meta = soup.new_tag("div", attrs={"class": "doc-meta"})
    if header_text:
        ht = soup.new_tag("div", attrs={"class": "doc-header-text"})
        text = header_text
        left = text
        right = ""
        if "||" in text:
            parts = text.split("||", 1)
            left, right = parts[0], parts[1]
        elif "|" in text:
            parts = text.split("|", 1)
            left, right = parts[0], parts[1]
        if logo_url:
            img = soup.new_tag("img", attrs={"class": "logo-inline", "src": logo_url})
            ht.append(img)
        hl = soup.new_tag("span", attrs={"class": "doc-header-left"})
        hl.string = left
        ht.append(hl)
        if right.strip():
            hr = soup.new_tag("span", attrs={"class": "doc-header-right"})
            hr.string = right
            ht.append(hr)
        meta.append(ht)
    else:
        first_h1 = None
        if soup.body:
            first_h1 = soup.body.find("h1")
        else:
            first_h1 = soup.find("h1")
        left = (first_h1.get_text(strip=True) if first_h1 else "文档")
        right = opts.get("filename_text") or ""
        ht = soup.new_tag("div", attrs={"class": "doc-header-text"})
        if logo_url:
            img = soup.new_tag("img", attrs={"class": "logo-inline", "src": logo_url})
            ht.append(img)
        hl = soup.new_tag("span", attrs={"class": "doc-header-left"})
        hl.string = left
        ht.append(hl)
        if right:
            hr = soup.new_tag("span", attrs={"class": "doc-header-right"})
            hr.string = right
            ht.append(hr)
        meta.append(ht)
    if footer_text:
        ft = soup.new_tag("div", attrs={"class": "doc-footer-text"})
        ft.string = footer_text
        meta.append(ft)
    page_header_val = (header_text or (document_name_opt or None))
    if not page_header_val:
        first_h1_for_header = None
        if soup.body:
            first_h1_for_header = soup.body.find("h1")
        else:
            first_h1_for_header = soup.find("h1")
        page_header_val = (first_h1_for_header.get_text(strip=True) if first_h1_for_header else "文档")
    page_footer_val = (footer_text or "FunMD")
    ph = soup.new_tag("div", attrs={"class": "doc-page-header"})
    if logo_url:
        logo_inline = soup.new_tag("img", attrs={"src": logo_url, "class": "doc-page-header-logo"})
        ph.append(logo_inline)
    ht_inline = soup.new_tag("span", attrs={"class": "doc-page-header-text"})
    ht_inline.string = page_header_val
    ph.append(ht_inline)
    meta.append(ph)
    pf = soup.new_tag("div", attrs={"class": "doc-page-footer"})
    pf.string = page_footer_val
    meta.append(pf)
    if copyright_text:
        cp = soup.new_tag("div", attrs={"class": "doc-copyright"})
        cp.string = copyright_text
        meta.append(cp)
    # brand logo is rendered inline within header; no separate top-left element
    if soup.body:
        soup.body.insert(0, meta)
    else:
        soup.insert(0, meta)
    if not soup.head:
        head = soup.new_tag("head")
        soup.insert(0, head)
    else:
        head = soup.head
    style_run = soup.new_tag("style")
    style_run.string = "@page{margin:20mm}@page{\n @top-center{content: element(page-header)}\n @bottom-center{content: element(page-footer)}\n}\n.doc-page-header{position: running(page-header); font-size:10pt; color:#666; display:block; text-align:center; width:100%}\n.doc-page-header::after{content:''; display:block; width:80%; border-bottom:1px solid #d9d9d9; margin:4px auto 0}\n.doc-page-header-logo{height:20px; vertical-align:middle; margin-right:4px}\n.doc-page-header-text{vertical-align:middle}\n.doc-page-footer{position: running(page-footer); font-size:10pt; color:#666}\n.doc-page-footer::before{content:''; display:block; width:80%; border-top:1px solid #d9d9d9; margin:0 auto 4px}"
    head.append(style_run)
    # Fallback inline styles for cover to ensure visibility even if external CSS isn't loaded
    if (cover_src or product_name_opt or document_name_opt or product_version_opt or document_version_opt):
        if not soup.head:
            head = soup.new_tag("head")
            soup.insert(0, head)
        else:
            head = soup.head
        style = soup.new_tag("style")
        style.string = "@page:first{margin:0} html,body{margin:0;padding:0}.cover{position:relative;width:210mm;height:297mm;overflow:hidden;page-break-after:always}.cover .cover-bg{position:absolute;left:0;top:0;right:0;bottom:0;width:100%;height:100%;object-fit:cover;display:block}.cover .cover-brand{position:absolute;top:20mm;left:20mm;font-size:18pt;font-weight:700;color:#1d4ed8}.cover .cover-footer{position:absolute;left:0;right:0;bottom:0;background:#1d4ed8;color:#fff;padding:12mm 20mm}.cover .cover-title{font-size:24pt;font-weight:700;margin:0}.cover .cover-subtitle{font-size:13pt;margin-top:4pt}.cover .cover-meta{margin-top:8pt;font-size:11pt;display:flex;gap:20mm}"
        head.append(style)
    if cover_src or product_name_opt or document_name_opt or product_version_opt or document_version_opt:
        cov = soup.new_tag("section", attrs={"class": "cover"})
        if cover_src:
            bg = soup.new_tag("img", attrs={"class": "cover-bg", "src": cover_src})
            cov.append(bg)
        if product_name_opt:
            brand_el = soup.new_tag("div", attrs={"class": "cover-brand"})
            brand_el.string = product_name_opt
            cov.append(brand_el)
        footer = soup.new_tag("div", attrs={"class": "cover-footer"})
        title_text = document_name_opt or None
        if not title_text:
            first_h1 = soup.body.find("h1") if soup.body else soup.find("h1")
            if first_h1:
                title_text = first_h1.get_text(strip=True)
        title_el = soup.new_tag("div", attrs={"class": "cover-title"})
        title_el.string = title_text or "文档"
        footer.append(title_el)
        subtitle_val = opts.get("filename_text") or ""
        if subtitle_val:
            subtitle_el = soup.new_tag("div", attrs={"class": "cover-subtitle"})
            subtitle_el.string = subtitle_val
            footer.append(subtitle_el)
        meta_el = soup.new_tag("div", attrs={"class": "cover-meta"})
        if product_version_opt:
            pv = soup.new_tag("span")
            pv.string = f"产品版本:{product_version_opt}"
            meta_el.append(pv)
        if document_version_opt:
            dv = soup.new_tag("span")
            dv.string = f"文档版本:{document_version_opt}"
            meta_el.append(dv)
        footer.append(meta_el)
        cov.append(footer)
        if soup.body:
            soup.body.insert(1, cov)
        else:
            soup.insert(1, cov)
    if toc_flag:
        headings = [
            el for el in (soup.find_all(["h1", "h2", "h3"]) or [])
            if el.get("data-origin") != "subtitle"
        ]
        if headings:
            ul = soup.new_tag("ul")
            idx = 1
            for el in headings:
                text = el.get_text(strip=True)
                if not text:
                    continue
                hid = el.get("id")
                if not hid:
                    hid = f"sec-{idx}"
                    el["id"] = hid
                    idx += 1
                li = soup.new_tag("li", attrs={"class": f"toc-{el.name}"})
                a = soup.new_tag("a", attrs={"href": f"#{hid}", "class": "toc-text"})
                a.string = text
                dots = soup.new_tag("span", attrs={"class": "toc-dots"})
                page = soup.new_tag("span", attrs={"class": "toc-page", "data-target": f"#{hid}"})
                li.append(a)
                li.append(dots)
                li.append(page)
                ul.append(li)
            nav = soup.new_tag("nav", attrs={"class": "toc"})
            h = soup.new_tag("h1")
            h.string = "目录"
            nav.append(h)
            nav.append(ul)
            if soup.body:
                soup.body.insert(2, nav)
            else:
                soup.insert(2, nav)
    if soup.body:
        for h in soup.body.find_all(["h1", "h2", "h3"]):
            sib: Optional[PageElement] = h.find_next_sibling()
            blocks: List[Any] = []
            first_table: Optional[Any] = None
            while sib is not None:
                # Skip pure whitespace nodes
                if getattr(sib, "name", None) is None:
                    try:
                        if str(sib).strip() == "":
                            sib = sib.next_sibling
                            continue
                    except Exception:
                        break
                # Stop if next heading encountered
                name = getattr(sib, "name", None)
                if name in ["h1", "h2", "h3"]:
                    break
                # Collect explanatory blocks until first table
                if name == "table":
                    first_table = sib
                    break
                if name in ["p", "blockquote", "ul", "ol"]:
                    blocks.append(sib)
                    sib = sib.next_sibling
                    continue
                # Unknown block: stop grouping to avoid wrapping unrelated content
                break
            if first_table is not None:
                wrap = soup.new_tag("div", attrs={"class": "table-block"})
                h.insert_before(wrap)
                wrap.append(h.extract())
                for el in blocks:
                    wrap.append(el.extract())
                wrap.append(first_table.extract())
    return str(soup)


def _stylesheets_for(css_name: Optional[str], css_text: Optional[str]):
    sheets: List[Any] = []
    if CSS is None:
        return sheets
    if css_text:
        sheets.append(CSS(string=css_text))
    if css_name:
        css_path = Path(__file__).resolve().parent.parent / "configs" / "styles" / f"{css_name}.css"
        if css_path.exists():
            sheets.append(CSS(filename=str(css_path)))
    return sheets


def md_to_pdf_bytes_with_renderer(md: str, renderer: str = "weasyprint", css_name: Optional[str] = None, css_text: Optional[str] = None, toc: bool = False, header_text: Optional[str] = None, footer_text: Optional[str] = None, logo_url: Optional[str] = None, copyright_text: Optional[str] = None, filename_text: Optional[str] = None, cover_src: Optional[str] = None, product_name: Optional[str] = None, document_name: Optional[str] = None, product_version: Optional[str] = None, document_version: Optional[str] = None) -> bytes:
    html = normalize_html(md, options={
        "toc": "1" if toc else "",
        "header_text": header_text,
        "footer_text": footer_text,
        "logo_url": logo_url,
        "copyright_text": copyright_text,
        "filename_text": filename_text,
        "cover_src": cover_src,
        "product_name": product_name,
        "document_name": document_name,
        "product_version": product_version,
        "document_version": document_version,
    })
    if HTML is not None:
        stylesheets = _stylesheets_for(css_name, css_text)
        pdf_bytes = HTML(string=html).write_pdf(stylesheets=stylesheets or None)
        return pdf_bytes
    raise RuntimeError("WeasyPrint is not available")
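
A minimal usage sketch for this adapter (an editorial addition, assuming the module is importable as app.services.docling_adapter and that WeasyPrint is installed for the PDF path):

    from app.services.docling_adapter import convert_source, md_to_pdf_bytes_with_renderer, sanitize_filename

    # Convert a remote document to Markdown plus its MIME type via the module-level docling converter.
    md_text, mime = convert_source("https://example.com/sample.docx", "markdown")

    # Render that Markdown to a branded PDF with a generated table of contents;
    # "left||right" in header_text splits into left- and right-aligned header parts.
    pdf = md_to_pdf_bytes_with_renderer(md_text, toc=True, header_text="Acme Docs||v1.0", footer_text="Acme")
    with open(sanitize_filename("sample") + ".pdf", "wb") as fh:
        fh.write(pdf)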

docling/app/services/minio_utils.py (new file, 190 lines)
@@ -0,0 +1,190 @@
from typing import Optional, Tuple, Dict
import os
import logging
from urllib.request import urlopen

try:
    from minio import Minio  # type: ignore
    import urllib3  # type: ignore
except Exception:
    Minio = None
    urllib3 = None  # type: ignore


def minio_head_bucket(client: object, bucket: str) -> bool:
    try:
        if hasattr(client, "bucket_exists"):
            try:
                return bool(client.bucket_exists(bucket))  # type: ignore
            except Exception:
                pass
        try:
            region = client._get_region(bucket)  # type: ignore
        except Exception:
            region = "us-east-1"
        client._url_open(method="HEAD", region=region, bucket_name=bucket)  # type: ignore
        return True
    except Exception:
        try:
            names = [getattr(b, "name", None) for b in client.list_buckets()]  # type: ignore
            return bucket in set(n for n in names if n)
        except Exception:
            return False


def minio_create_bucket(client: object, bucket: str) -> bool:
    try:
        if hasattr(client, "bucket_exists"):
            try:
                if client.bucket_exists(bucket):  # type: ignore
                    return True
            except Exception:
                pass
        if hasattr(client, "make_bucket"):
            try:
                client.make_bucket(bucket)  # type: ignore
                return True
            except Exception:
                try:
                    region = client._get_region(bucket)  # type: ignore
                except Exception:
                    region = "us-east-1"
                try:
                    client.make_bucket(bucket, location=region)  # type: ignore
                    return True
                except Exception:
                    pass
        try:
            try:
                region = client._get_region(bucket)  # type: ignore
            except Exception:
                region = "us-east-1"
            client._url_open(method="PUT", region=region, bucket_name=bucket)  # type: ignore
            return True
        except Exception as ce:
            if "BucketAlreadyOwnedByYou" in str(ce) or "BucketAlreadyExists" in str(ce):
                return True
            raise
    except Exception as e:
        raise e


def minio_client(endpoint: str, access: str, secret: str, secure: bool):
    if urllib3 is not None:
        try:
            http = urllib3.PoolManager(timeout=urllib3.Timeout(connect=3.0, read=20.0))
            return Minio(endpoint=endpoint, access_key=access, secret_key=secret, secure=secure, http_client=http)  # type: ignore
        except Exception:
            return Minio(endpoint=endpoint, access_key=access, secret_key=secret, secure=secure)  # type: ignore
    return Minio(endpoint=endpoint, access_key=access, secret_key=secret, secure=secure)  # type: ignore


def minio_time_hint(endpoint: str, secure: bool) -> Optional[str]:
    try:
        scheme = "https" if secure else "http"
        r = urlopen(f"{scheme}://{endpoint}", timeout=3)
        srv_date = r.headers.get("Date")
        if not srv_date:
            return None
        from email.utils import parsedate_to_datetime
        from datetime import datetime, timezone
        dt = parsedate_to_datetime(srv_date)
        now = datetime.now(timezone.utc)
        diff = abs((now - dt).total_seconds())
        return f"服务器时间与本机相差约 {int(diff)} 秒"
    except Exception:
        return None


def join_prefix(prefix: str, rel: str) -> str:
    pre = (prefix or "").strip("/")
    r = rel.lstrip("/")
    if pre and r.startswith(pre + "/"):
        return r
    return f"{pre}/{r}" if pre else r


def presigned_read(client: object, bucket: str, obj: str, expires_seconds: int) -> Optional[str]:
    try:
        from datetime import timedelta
        exp = expires_seconds
        try:
            exp = int(exp)
        except Exception:
            pass
        td = timedelta(seconds=exp)
        try:
            return client.get_presigned_url("GET", bucket, obj, expires=td)  # type: ignore
        except Exception:
            return client.presigned_get_object(bucket, obj, expires=td)  # type: ignore
    except Exception:
        return None


def minio_current(runtime_cfg: Dict[str, Dict[str, Optional[str]]]) -> Tuple[Optional[object], Optional[str], Optional[str], str]:
    rc = runtime_cfg.get("minio", {})
    endpoint_raw = rc.get("endpoint") or os.environ.get("MINIO_ENDPOINT")
    access_raw = rc.get("access") or os.environ.get("MINIO_ACCESS_KEY")
    secret_raw = rc.get("secret") or os.environ.get("MINIO_SECRET_KEY")
    bucket_raw = rc.get("bucket") or os.environ.get("MINIO_BUCKET")
    secure_flag = rc.get("secure") or os.environ.get("MINIO_SECURE", "false")
    secure = str(secure_flag or "false").lower() in {"1", "true", "yes", "on"}
    public_raw = rc.get("public") or os.environ.get("MINIO_PUBLIC_ENDPOINT")
    endpoint = (str(endpoint_raw).strip() if endpoint_raw else None)
    try:
        if isinstance(endpoint, str) and ":9001" in endpoint:
            h = endpoint.split("/")[0]
            if ":" in h:
                parts = h.split(":")
                endpoint = f"{parts[0]}:9000"
            else:
                endpoint = h
    except Exception:
        endpoint = endpoint
    access = (str(access_raw).strip() if access_raw else None)
    secret = (str(secret_raw).strip() if secret_raw else None)
    bucket = (str(bucket_raw).strip() if bucket_raw else None)
    public_base = (str(public_raw).strip() if public_raw else None)
    try:
        if isinstance(public_base, str) and (":9001" in public_base or "/browser" in public_base or "/minio" in public_base):
            host = public_base.strip().split("/")[0]
            scheme = "https" if secure else "http"
            if ":" in host:
                host = host.split("/")[0]
                base_host = host.split(":")[0]
                public_base = f"{scheme}://{base_host}:9000"
            else:
                public_base = f"{scheme}://{host}:9000"
    except Exception:
        public_base = public_base
    if not public_base and endpoint:
        public_base = f"https://{endpoint}" if secure else f"http://{endpoint}"
    missing = []
    if Minio is None:
        missing.append("client")
    if not endpoint:
        missing.append("endpoint")
    if not access:
        missing.append("access")
    if not secret:
        missing.append("secret")
    if not bucket:
        missing.append("bucket")
    if not public_base:
        missing.append("public")
    if missing:
        try:
            logging.error(f"minio config invalid: missing={missing}")
        except Exception:
            pass
        return None, None, None, ""
    client = minio_client(endpoint=endpoint, access=access, secret=secret, secure=secure)
    try:
        try:
            client.list_buckets()  # type: ignore
        except Exception as e:
            if secure and ("SSL" in str(e) or "HTTPSConnectionPool" in str(e) or "SSLError" in str(e)):
                client = minio_client(endpoint=endpoint, access=access, secret=secret, secure=False)
    except Exception:
        pass
    try:
        exists = minio_head_bucket(client, bucket)
        if not exists:
            minio_create_bucket(client, bucket)
    except Exception:
        pass
    prefix = rc.get("prefix") or os.environ.get("MINIO_PREFIX", "")
    return client, bucket, public_base, prefix
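
A hedged usage sketch for these helpers (editorial addition; endpoint, credentials, and bucket are placeholders):

    runtime_cfg = {"minio": {"endpoint": "minio.local:9000", "access": "minio", "secret": "minio123",
                             "bucket": "docs", "prefix": "converted", "secure": "false",
                             "public": "http://minio.local:9000"}}
    client, bucket, public_base, prefix = minio_current(runtime_cfg)
    if client and bucket:
        key = join_prefix(prefix, "2024/report.pdf")     # -> "converted/2024/report.pdf"
        url = presigned_read(client, bucket, key, 3600)  # presigned GET URL valid about an hour, or None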

docling/app/services/unified_converter.py (new file, 492 lines)
@@ -0,0 +1,492 @@
from pathlib import Path
from typing import Optional, Tuple
import re

import tempfile
import sys
from urllib.parse import urlsplit
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
import io

_DOC_AVAILABLE = True
try:
    _DOC_BASE = Path(__file__).resolve().parents[2] / "docling"
    p = str(_DOC_BASE)
    if p not in sys.path:
        sys.path.insert(0, p)
except Exception:
    pass
try:
    from docling.document_converter import DocumentConverter
    from docling.datamodel.base_models import InputFormat
    from docling.document_converter import PdfFormatOption
    from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling_core.types.doc import ImageRefMode
except Exception:
    _DOC_AVAILABLE = False

    class DocumentConverter:  # type: ignore
        def __init__(self, *args, **kwargs):
            pass

        def convert(self, source):
            raise RuntimeError("docling unavailable")

    class InputFormat:  # type: ignore
        PDF = "pdf"

    class PdfFormatOption:  # type: ignore
        def __init__(self, *args, **kwargs):
            pass

    class StandardPdfPipeline:  # type: ignore
        pass

    class PdfPipelineOptions:  # type: ignore
        def __init__(self):
            pass

    class ImageRefMode:  # type: ignore
        EMBEDDED = None


"""
@api Unified Converter Service
@description Provides core document conversion logic unifying Docling and word2markdown engines
"""

_W2M_AVAILABLE = False
try:
    from app.services.word2markdown import convert_any as _w2m_convert_any  # type: ignore
    _W2M_AVAILABLE = True
except Exception:
    _W2M_AVAILABLE = False

try:
    from bs4 import BeautifulSoup  # type: ignore
except Exception:
    BeautifulSoup = None  # type: ignore
try:
    from app.services.docling_adapter import normalize_html as _normalize_html  # type: ignore
    from app.services.docling_adapter import resolve_link as _resolve_link  # type: ignore
    from app.services.docling_adapter import _render_markdown_html as _render_md_html  # type: ignore
except Exception:
    _normalize_html = None  # type: ignore
    _resolve_link = None  # type: ignore
    _render_md_html = None  # type: ignore


def _is_http(s: str) -> bool:
    t = (s or "").lower()
    return t.startswith("http://") or t.startswith("https://")


def _read_bytes(source: str) -> Tuple[bytes, str]:
    ct = ""
    try:
        if _is_http(source):
            from urllib.request import urlopen
            with urlopen(source, timeout=10) as r:
                ct = r.headers.get("Content-Type") or ""
                return r.read() or b"", ct
        p = Path(source)
        if p.exists() and p.is_file():
            return p.read_bytes(), ct
    except Exception:
        return b"", ct
    return b"", ct


def _decode_to_utf8(raw: bytes, ct: str = "") -> str:
    if not raw:
        return ""
    if raw.startswith(b"\xef\xbb\xbf"):
        try:
            return raw[3:].decode("utf-8")
        except Exception:
            pass
    if raw.startswith(b"\xff\xfe"):
        try:
            return raw[2:].decode("utf-16le")
        except Exception:
            pass
    if raw.startswith(b"\xfe\xff"):
        try:
            return raw[2:].decode("utf-16be")
        except Exception:
            pass
    try:
        m = re.search(r"charset=([\w-]+)", ct or "", re.IGNORECASE)
        if m:
            enc = m.group(1).strip().lower()
            try:
                return raw.decode(enc)
            except Exception:
                pass
    except Exception:
        pass
    candidates = [
        "utf-8", "gb18030", "gbk", "big5", "shift_jis", "iso-8859-1", "windows-1252",
    ]
    for enc in candidates:
        try:
            return raw.decode(enc)
        except Exception:
            continue
    return raw.decode("utf-8", errors="replace")
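
# Illustrative behaviour (added comment): decoding precedence is BOM first, then the
# HTTP charset hint, then a fixed candidate list, e.g.
#   _decode_to_utf8(b"\xef\xbb\xbfhi")                                           -> "hi"  (UTF-8 BOM stripped)
#   _decode_to_utf8("中文".encode("gb18030"), "text/html; charset=gb18030")      -> "中文"
#   _decode_to_utf8(b"\xff\xfe" + "ok".encode("utf-16le"))                       -> "ok"  (UTF-16 LE BOM)
# Undecodable input falls through to UTF-8 with errors="replace".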


def _normalize_newlines(s: str) -> str:
    return (s or "").replace("\r\n", "\n").replace("\r", "\n")


def _html_to_markdown(html: str) -> str:
    if not html:
        return ""
    if BeautifulSoup is None:
        return html
    soup = BeautifulSoup(html, "html.parser")
    out: list[str] = []

    def txt(node) -> str:
        return (getattr(node, "get_text", lambda **kwargs: str(node))(strip=True) if node else "")

    def inline(node) -> str:
        if isinstance(node, str):
            return node
        name = getattr(node, "name", None)
        if name in {None}:  # type: ignore
            return str(node)
        if name in {"strong", "b"}:
            return "**" + txt(node) + "**"
        if name in {"em", "i"}:
            return "*" + txt(node) + "*"
        if name == "code":
            return "`" + txt(node) + "`"
        if name == "a":
            href_val = node.get("href")
            extra_val = node.get("data-doc")
            href = href_val if isinstance(href_val, str) else None
            extra = extra_val if isinstance(extra_val, str) else None
            resolved = _resolve_link(href, extra) if _resolve_link else (href or extra)
            url = resolved or ""
            text = txt(node)
            if url:
                return f"[{text}]({url})"
            return text
        if name == "img":
            alt = node.get("alt") or "image"
            src = node.get("src") or ""
            # emit a Markdown image reference
            return f"![{alt}]({src})"
        res = []
        for c in getattr(node, "children", []):
            res.append(inline(c))
        return "".join(res)

    def block(node):
        name = getattr(node, "name", None)
        if name is None:
            s = str(node).strip()
            if s:
                out.append(s)
            return
        if name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
            lvl = int(name[1])
            out.append("#" * lvl + " " + txt(node))
            out.append("")
            return
        if name == "p":
            segs = [inline(c) for c in node.children]
            out.append("".join(segs))
            out.append("")
            return
        if name == "br":
            out.append("")
            return
        if name in {"ul", "ol"}:
            is_ol = name == "ol"
            idx = 1
            for li in node.find_all("li", recursive=False):
                text = "".join(inline(c) for c in li.children)
                if is_ol:
                    out.append(f"{idx}. {text}")
                    idx += 1
                else:
                    out.append(f"- {text}")
            out.append("")
            return
        if name == "pre":
            code_node = node.find("code")
            code_text = code_node.get_text() if code_node else node.get_text()
            lang = ""
            cls = (code_node.get("class") if code_node else node.get("class")) or []
            for c in cls:
                s = str(c)
                if s.startswith("language-"):
                    lang = s.split("-", 1)[-1]
                    break
            out.append(f"```{lang}\n{code_text}\n```\n")
            return
        if name == "blockquote":
            lines = [l for l in txt(node).splitlines() if l.strip()]
            for l in lines:
                out.append("> " + l)
            out.append("")
            return
        if name == "table":
            rows = node.find_all("tr")
            if not rows:
                return
            headers = [h.get_text(strip=True) for h in (rows[0].find_all(["th", "td"]) or [])]
            if headers:
                out.append("|" + "|".join(headers) + "|")
                sep = "|" + "|".join(["---" for _ in headers]) + "|"
                out.append(sep)
            for tr in rows[1:]:
                cells = [td.get_text(strip=True) for td in tr.find_all("td")]
                if cells:
                    out.append("|" + "|".join(cells) + "|")
            out.append("")
            return
        if name == "div":
            for c in node.children:
                block(c)
            return
        segs = [inline(c) for c in node.children]
        if segs:
            out.append("".join(segs))
            out.append("")

    root = soup.body or soup
    for ch in getattr(root, "children", []):
        block(ch)
    return _normalize_newlines("\n".join(out)).strip()


def _lower_html_table_tags(html: str) -> str:
    """
    @function _lower_html_table_tags
    @description Normalizes HTML table tags to lowercase
    @param html Input HTML string
    @return Normalized HTML string
    """
    if not html:
        return html
    tags = ["TABLE", "THEAD", "TBODY", "TFOOT", "TR", "TH", "TD"]
    out = html
    for t in tags:
        out = re.sub(r"</?" + t + r"\b", lambda m: m.group(0).lower(), out)
    out = re.sub(r">\s*\n+\s*", ">\n", out)
    return out


def _replace_admonitions(md: str) -> str:
    """
    @function _replace_admonitions
    @description Replaces ::: style admonitions with !!! style
    @param md Input markdown string
    @return Processed markdown string
    """
    if not md:
        return md
    lines = md.split("\n")
    out = []
    in_block = False
    for raw in lines:
        t = raw.strip()
        if t.startswith(":::"):
            if not in_block:
                name = t[3:].strip()
                if not name:
                    out.append("!!!")
                else:
                    out.append("!!! " + name)
                in_block = True
            else:
                out.append("!!!")
                in_block = False
            continue
        out.append(raw)
    return "\n".join(out)


def _enhance_codeblocks(md: str) -> str:
    if not md:
        return md
    lines = md.split("\n")
    res = []
    in_fence = False
    fence_lang = ""
    i = 0
    while i < len(lines):
        line = lines[i]
        t = line.strip()
        if t.startswith("```"):
            in_fence = not in_fence
            try:
                fence_lang = (t[3:] or "").strip() if in_fence else ""
            except Exception:
                fence_lang = ""
            res.append(line)
            i += 1
            continue
        if in_fence:
            res.append(line)
            i += 1
            continue
        if t.startswith("{") or t.startswith("["):
            buf = [line]
            j = i + 1
            closed = False
            depth = t.count("{") - t.count("}")
            while j < len(lines):
                buf.append(lines[j])
                s = lines[j].strip()
                depth += s.count("{") - s.count("}")
                if depth <= 0 and s.endswith("}"):
                    closed = True
                    break
                j += 1
            if closed and len(buf) >= 3:
                lang = "json"
                res.append("```" + lang)
                res.extend(buf)
                res.append("```")
                i = j + 1
                continue
        code_sig = (
            ("public static" in t) or ("private static" in t) or ("class " in t) or ("return " in t) or ("package " in t) or ("import " in t)
        )
        if code_sig:
            buf = [line]
            j = i + 1
            while j < len(lines):
                s = lines[j].strip()
                if not s:
                    break
                if s.startswith("# ") or s.startswith("## ") or s.startswith("### "):
                    break
                buf.append(lines[j])
                j += 1
            if len(buf) >= 3:
                res.append("```")
                res.extend(buf)
                res.append("```")
                i = j + 1
                continue
        res.append(line)
        i += 1
    return "\n".join(res)


class FormatConverter:
    """
    @class FormatConverter
    @description Unified converter class wrapping Docling and word2markdown
    """

    def __init__(self) -> None:
        self._docling = DocumentConverter()

    def convert(self, source: str, export: str = "markdown", engine: Optional[str] = None, mdx_safe_mode_enabled: bool = True) -> Tuple[str, str, Optional[str]]:
        """
        @function convert
        @description Convert a document source to the requested format
        @param source Path or URL to the source document
        @param export Output format (markdown, html, json, doctags)
        @param engine Optional engine override (word2markdown/docling)
        @param mdx_safe_mode_enabled Toggle safe mode for MDX
        @return Tuple of (encoding, content, artifacts_dir)
        """
        # Prefer custom word2markdown engine for DOC/DOCX when available
        auto_engine = None
        try:
            from pathlib import Path as _P
            suf = _P(source).suffix.lower()
            if not engine and suf in {".doc", ".docx"} and _W2M_AVAILABLE:
                auto_engine = "word2markdown"
        except Exception:
            auto_engine = None
        use_engine = (engine or auto_engine or "").lower()
        try:
            from urllib.parse import urlsplit
            path = source
            if _is_http(source):
                path = urlsplit(source).path or ""
            ext = Path(path).suffix.lower()
        except Exception:
            ext = Path(source).suffix.lower()
        if ext in {".txt"}:
            raw, ct = _read_bytes(source)
            text = _normalize_newlines(_decode_to_utf8(raw, ct))
            if export.lower() == "html":
                if _render_md_html is not None:
                    html = _render_md_html(text)
                else:
                    try:
                        import marko
                        html = marko.convert(text)
                    except Exception:
                        html = f"<pre>{text}</pre>"
                return "utf-8", _lower_html_table_tags(html), None
            md = _enhance_codeblocks(text)
            return "utf-8", md, None
        if ext in {".md"}:
            raw, ct = _read_bytes(source)
            text = _normalize_newlines(_decode_to_utf8(raw, ct))
            if export.lower() == "html":
                if _render_md_html is not None:
                    html = _render_md_html(text)
                else:
                    try:
                        import marko
                        html = marko.convert(text)
                    except Exception:
                        html = text
                return "utf-8", _lower_html_table_tags(html), None
            return "utf-8", text, None
        if ext in {".html", ".htm"}:
            try:
                conv = DocumentConverter(allowed_formats=[InputFormat.HTML])
                result = conv.convert(source)
                if export.lower() == "html":
                    html = result.document.export_to_html()
                    html = _lower_html_table_tags(html)
                    return "utf-8", html, None
                md = result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
                md = _replace_admonitions(md)
                md = _enhance_codeblocks(md)
                return "utf-8", md, None
            except Exception:
                raw, ct = _read_bytes(source)
                html_in = _normalize_newlines(_decode_to_utf8(raw, ct))
                if export.lower() == "html":
                    html = _normalize_html(html_in) if _normalize_html is not None else html_in
                    return "utf-8", _lower_html_table_tags(html), None
                md = _html_to_markdown(html_in)
                md = _replace_admonitions(md)
                md = _enhance_codeblocks(md)
                return "utf-8", md, None
        if use_engine in {"pandoc", "custom", "word2markdown"} and _W2M_AVAILABLE:
            enc, md = _w2m_convert_any(Path(source), mdx_safe_mode_enabled=mdx_safe_mode_enabled)
            md = _replace_admonitions(md)
            md = _enhance_codeblocks(md)
            return enc or "utf-8", md, None
        # Configure PDF pipeline to generate picture images into a per-call artifacts directory
        artifacts_dir = tempfile.mkdtemp(prefix="docling_artifacts_")
        pdf_opts = PdfPipelineOptions()
        pdf_opts.generate_picture_images = True
        pdf_opts.generate_page_images = True
        pdf_opts.images_scale = 2.0
        pdf_opts.do_code_enrichment = True
        pdf_opts.do_formula_enrichment = True
        self._docling = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_cls=StandardPdfPipeline,
                    pipeline_options=pdf_opts,
                )
            }
        )
        result = self._docling.convert(source)
        if export.lower() == "markdown":
            md = result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
            md = _replace_admonitions(md)
            md = _enhance_codeblocks(md)
            return "utf-8", md, artifacts_dir
        if export.lower() == "html":
            html = result.document.export_to_html()
            html = _lower_html_table_tags(html)
            return "utf-8", html, artifacts_dir
        if export.lower() == "json":
            js = result.document.export_to_json()
            return "utf-8", js, artifacts_dir
        if export.lower() == "doctags":
            dt = result.document.export_to_doctags()
            return "utf-8", dt, artifacts_dir
        raise RuntimeError("unsupported export")
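
A brief usage sketch for the unified converter (editorial addition; the PDF path assumes the optional docling dependencies are installed):

    from app.services.unified_converter import FormatConverter

    conv = FormatConverter()
    # .txt/.md/.html sources take the lightweight paths above; everything else goes through docling.
    encoding, markdown, artifacts_dir = conv.convert("/tmp/manual.pdf", export="markdown")
    # artifacts_dir is a temporary directory with extracted page/picture images, or None on the lightweight paths.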

docling/app/services/word2markdown.py (new file, 429 lines)
@@ -0,0 +1,429 @@
from pathlib import Path
from typing import Tuple, List

from docx import Document
from docx.table import Table
from docx.text.paragraph import Paragraph
import re
import base64
import hashlib
import tempfile
import subprocess
from lxml import etree


def _iter_blocks(doc: Document):
    parent = doc
    parent_elm = parent.element.body
    for child in parent_elm.iterchildren():
        tag = child.tag.split('}')[-1]
        if tag == 'p':
            yield Paragraph(child, parent)
        elif tag == 'tbl':
            yield Table(child, parent)


def _cell_text(cell) -> str:
    parts = []
    for p in cell.paragraphs:
        t = p.text or ""
        parts.append(t)
    return "\n".join([s for s in parts if s is not None])


def _guess_lang(text: str) -> str:
    t = (text or "").strip()
    head = t[:512]
    if re.search(r"\b(package|import\s+java\.|public\s+class|public\s+static|private\s+static|@Override)\b", head):
        return "java"
    if re.search(r"\b(def\s+\w+\(|import\s+\w+|print\(|from\s+\w+\s+import)\b", head):
        return "python"
    if re.search(r"\b(function\s+\w+\(|console\.log|let\s+\w+|const\s+\w+|=>)\b", head):
        return "javascript"
    if re.search(r"^#include|\bint\s+main\s*\(\)", head):
        return "c"
    if re.search(r"\busing\s+namespace\b|\bstd::\b|\btemplate\b", head):
        return "cpp"
    if re.search(r"\b(SELECT|INSERT|UPDATE|DELETE|CREATE\s+TABLE|DROP\s+TABLE|ALTER\s+TABLE)\b", head, re.IGNORECASE):
        return "sql"
    if head.startswith("{") or head.startswith("["):
        return "json"
    if re.search(r"<html|<div|<span|<table|<code|<pre", head, re.IGNORECASE):
        return "html"
    if re.search(r"<\?xml|</?[A-Za-z0-9:_-]+>", head):
        return "xml"
    return ""
|
||||
|
||||
|
||||
def _table_to_md(tbl: Table) -> str:
|
||||
rows = tbl.rows
|
||||
cols = tbl.columns
|
||||
if len(rows) == 1 and len(cols) == 1:
|
||||
txt = _cell_text(rows[0].cells[0]).strip()
|
||||
lang = _guess_lang(txt)
|
||||
return f"```{lang}\n{txt}\n```\n"
|
||||
|
||||
def _cell_inline_md(doc: Document, paragraph: Paragraph) -> str:
|
||||
el = paragraph._element
|
||||
parts: List[str] = []
|
||||
try:
|
||||
for ch in el.iterchildren():
|
||||
tag = ch.tag.split('}')[-1]
|
||||
if tag == 'r':
|
||||
for rc in ch.iterchildren():
|
||||
rtag = rc.tag.split('}')[-1]
|
||||
if rtag == 't':
|
||||
s = rc.text or ''
|
||||
if s:
|
||||
parts.append(s)
|
||||
elif rtag == 'br':
|
||||
parts.append('\n')
|
||||
elif rtag == 'drawing':
|
||||
try:
|
||||
for node in rc.iter():
|
||||
local = node.tag.split('}')[-1]
|
||||
rid = None
|
||||
if local == 'blip':
|
||||
rid = node.get(f"{{{NS['r']}}}embed") or node.get(f"{{{NS['r']}}}link")
|
||||
elif local == 'imagedata':
|
||||
rid = node.get(f"{{{NS['r']}}}id")
|
||||
if not rid:
|
||||
continue
|
||||
try:
|
||||
part = None
|
||||
rp = getattr(doc.part, 'related_parts', None)
|
||||
if isinstance(rp, dict) and rid in rp:
|
||||
part = rp.get(rid)
|
||||
if part is None:
|
||||
rels = getattr(doc.part, 'rels', None)
|
||||
if rels is not None and hasattr(rels, 'get'):
|
||||
rel = rels.get(rid)
|
||||
part = getattr(rel, 'target_part', None)
|
||||
if part is None:
|
||||
rel = getattr(doc.part, '_rels', {}).get(rid)
|
||||
part = getattr(rel, 'target_part', None)
|
||||
ct = getattr(part, 'content_type', '') if part is not None else ''
|
||||
data = part.blob if part is not None and hasattr(part, 'blob') else None
|
||||
if data:
|
||||
b64 = base64.b64encode(data).decode('ascii')
|
||||
parts.append(f"")
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
return ''.join(parts)
|
||||
|
||||
out = []
|
||||
# python-docx table parent is the Document
|
||||
doc = getattr(tbl, '_parent', None) or getattr(tbl, 'part', None)
|
||||
for r_i, r in enumerate(rows):
|
||||
vals = []
|
||||
for c in r.cells:
|
||||
segs: List[str] = []
|
||||
for p in c.paragraphs:
|
||||
s = _cell_inline_md(doc, p)
|
||||
if s:
|
||||
segs.append(s)
|
||||
cell_text = '<br>'.join([x for x in segs if x is not None])
|
||||
vals.append((cell_text or '').replace('|', '\\|').strip())
|
||||
line = "| " + " | ".join(vals) + " |"
|
||||
out.append(line)
|
||||
if r_i == 0:
|
||||
sep = "| " + " | ".join(["---" for _ in vals]) + " |"
|
||||
out.append(sep)
|
||||
return "\n".join(out) + "\n"
|
||||
|
||||
|
||||
def _paragraph_to_md(p: Paragraph) -> str:
|
||||
return (p.text or "").strip() + "\n\n"
|
||||
|
||||
|
||||
def convert_any(path: Path, mdx_safe_mode_enabled: bool = True) -> Tuple[str, str]:
    ext = path.suffix.lower()
    use_path = path
    if ext == ".doc":
        use_path = _convert_doc_to_docx_cross_platform(path)
    if use_path.suffix.lower() not in {".docx"}:
        raise RuntimeError("unsupported input for word2markdown")
    doc = Document(str(use_path))
    out: List[str] = []
    in_code = False
    code_lines: List[str] = []
    lang_hint: str = ''
    for blk in _iter_blocks(doc):
        if isinstance(blk, Table):
            out.append(_table_to_md(blk))
        elif isinstance(blk, Paragraph):
            tboxes = _paragraph_textboxes(blk)
            for tb in tboxes:
                if tb.strip():
                    out.append(_md_code_block(tb.strip()))
            sdts = _paragraph_sdts(blk)
            for s in sdts:
                if s.strip():
                    out.append(_md_code_block(s.strip()))
            btx = _paragraph_bordered_text(blk)
            for s in btx:
                if s.strip():
                    out.append(_md_code_block(s.strip()))
            ftx = _paragraph_framed(blk)
            for s in ftx:
                if s.strip():
                    out.append(_md_code_block(s.strip()))
            raw = (blk.text or "")
            sraw = raw.strip()
            if _looks_like_code_paragraph(sraw) or (in_code and sraw == ""):
                if not in_code:
                    in_code = True
                    lang_hint = _guess_lang(sraw)
                    code_lines = []
                code_lines.append(raw)
                continue
            if in_code and code_lines:
                text = "\n".join(code_lines)
                use_lang = lang_hint or _guess_lang(text)
                out.append(f"```{use_lang}\n{text}\n```\n")
                in_code = False
                code_lines = []
                lang_hint = ''

            def _paragraph_with_images(doc: Document, p: Paragraph) -> str:
                el = p._element
                parts: List[str] = []
                try:
                    for ch in el.iterchildren():
                        tag = ch.tag.split('}')[-1]
                        if tag == 'r':
                            for rc in ch.iterchildren():
                                rtag = rc.tag.split('}')[-1]
                                if rtag == 't':
                                    s = rc.text or ''
                                    if s:
                                        parts.append(s)
                                elif rtag == 'br':
                                    parts.append('\n')
                                elif rtag == 'drawing':
                                    for node in rc.iter():
                                        local = node.tag.split('}')[-1]
                                        rid = None
                                        if local == 'blip':
                                            rid = node.get(f"{{{NS['r']}}}embed") or node.get(f"{{{NS['r']}}}link")
                                        elif local == 'imagedata':
                                            rid = node.get(f"{{{NS['r']}}}id")
                                        if not rid:
                                            continue
                                        try:
                                            part = None
                                            rp = getattr(doc.part, 'related_parts', None)
                                            if isinstance(rp, dict) and rid in rp:
                                                part = rp.get(rid)
                                            if part is None:
                                                rels = getattr(doc.part, 'rels', None)
                                                if rels is not None and hasattr(rels, 'get'):
                                                    rel = rels.get(rid)
                                                    part = getattr(rel, 'target_part', None)
                                            if part is None:
                                                rel = getattr(doc.part, '_rels', {}).get(rid)
                                                part = getattr(rel, 'target_part', None)
                                            ct = getattr(part, 'content_type', '') if part is not None else ''
                                            data = part.blob if part is not None and hasattr(part, 'blob') else None
                                            if data:
                                                b64 = base64.b64encode(data).decode('ascii')
                                                # Inline the image as a base64 data URI; Markdown image syntax is assumed here.
                                                parts.append(f"![](data:{ct};base64,{b64})")
                                        except Exception:
                                            pass
                except Exception:
                    pass
                s = ''.join(parts).strip()
                return (s + '\n\n') if s else ''

            txt = _paragraph_with_images(doc, blk)
            if txt.strip():
                out.append(txt)
    if in_code and code_lines:
        text = "\n".join(code_lines)
        use_lang = lang_hint or _guess_lang(text)
        out.append(f"```{use_lang}\n{text}\n```\n")
    try:
        boxes = _doclevel_textboxes(doc)
        existing_texts = set()
        try:
            for seg in out:
                if isinstance(seg, str):
                    ss = seg.strip()
                    if ss.startswith("```"):
                        m = re.search(r"^```[\w-]*\n([\s\S]*?)\n```\s*$", ss)
                        if m:
                            existing_texts.add(m.group(1).strip())
                        continue
                    existing_texts.add(ss)
        except Exception:
            pass
        for tb in boxes:
            s = (tb or '').strip()
            if not s:
                continue
            if s in existing_texts:
                continue
            out.append(_md_code_block(s))
            existing_texts.add(s)
    except Exception:
        pass
    md = "".join(out)
    return "utf-8", md

NS = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
    "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
    "v": "urn:schemas-microsoft-com:vml",
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture",
}

def _paragraph_textboxes(p: Paragraph) -> List[str]:
    try:
        el = p._element
        texts: List[str] = []
        for tbox in el.xpath('.//wps:txbx/w:txbxContent', namespaces=NS):
            paras = tbox.xpath('.//w:p', namespaces=NS)
            buf: List[str] = []
            for w_p in paras:
                ts = w_p.xpath('.//w:t', namespaces=NS)
                s = ''.join([t.text or '' for t in ts]).strip()
                if s:
                    buf.append(s)
            if buf:
                texts.append('\n'.join(buf))
        for tbox in el.xpath('.//v:textbox/w:txbxContent', namespaces=NS):
            paras = tbox.xpath('.//w:p', namespaces=NS)
            buf = []
            for w_p in paras:
                ts = w_p.xpath('.//w:t', namespaces=NS)
                s = ''.join([t.text or '' for t in ts]).strip()
                if s:
                    buf.append(s)
            if buf:
                texts.append('\n'.join(buf))
        return texts
    except Exception:
        return []

def _paragraph_sdts(p: Paragraph) -> List[str]:
    try:
        el = p._element
        texts: List[str] = []
        for sdt in el.xpath('.//w:sdt/w:sdtContent', namespaces=NS):
            paras = sdt.xpath('.//w:p', namespaces=NS)
            buf: List[str] = []
            for w_p in paras:
                ts = w_p.xpath('.//w:t', namespaces=NS)
                s = ''.join([t.text or '' for t in ts]).strip()
                if s:
                    buf.append(s)
            if buf:
                texts.append('\n'.join(buf))
        return texts
    except Exception:
        return []

def _paragraph_bordered_text(p: Paragraph) -> List[str]:
    try:
        el = p._element
        has_border = bool(el.xpath('./w:pPr/w:pBdr', namespaces=NS))
        t = (p.text or '').strip()
        if has_border and t:
            return [t]
    except Exception:
        pass
    return []

def _paragraph_framed(p: Paragraph) -> List[str]:
    try:
        el = p._element
        has_frame = bool(el.xpath('./w:pPr/w:framePr', namespaces=NS))
        t = (p.text or '').strip()
        if has_frame and t:
            return [t]
    except Exception:
        pass
    return []

def _md_code_block(text: str) -> str:
    lang = _guess_lang(text)
    return f"```{lang}\n{text}\n```\n"

def _looks_like_code_paragraph(t: str) -> bool:
    # Heuristics for code-like paragraphs: braces/brackets, leading indentation,
    # statement punctuation, or common (mostly Java) keywords.
    s = (t or '').strip()
    if not s:
        return False
    if s.startswith('{') or s.startswith('[') or s.endswith('}'):
        return True
    # Check the unstripped text for leading indentation.
    if (t or '').startswith(' ') or (t or '').startswith('\t'):
        return True
    if ';' in s or '{' in s or '}' in s:
        return True
    keywords = ['public static', 'private static', 'class ', 'return ', 'import ', 'package ', 'byte[]', 'String ', 'Cipher', 'KeyFactory']
    return any(k in s for k in keywords)

def _doclevel_textboxes(doc: Document) -> List[str]:
    texts: List[str] = []
    try:
        el = doc.element.body
        for tbox in el.xpath('.//wps:txbx/w:txbxContent', namespaces=NS):
            paras = tbox.xpath('.//w:p', namespaces=NS)
            buf: List[str] = []
            for w_p in paras:
                ts = w_p.xpath('.//w:t', namespaces=NS)
                s = ''.join([(t.text or '') for t in ts]).strip()
                if s:
                    buf.append(s)
            if buf:
                texts.append('\n'.join(buf))
        for tbox in el.xpath('.//v:textbox/w:txbxContent', namespaces=NS):
            paras = tbox.xpath('.//w:p', namespaces=NS)
            buf = []
            for w_p in paras:
                ts = w_p.xpath('.//w:t', namespaces=NS)
                s = ''.join([(t.text or '') for t in ts]).strip()
                if s:
                    buf.append(s)
            if buf:
                texts.append('\n'.join(buf))
    except Exception:
        pass
    return texts

def _convert_doc_to_docx_cross_platform(path: Path) -> Path:
    # Try macOS textutil first, then LibreOffice (soffice --headless), then unoconv.
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
            tmp.close()
            subprocess.run(["textutil", "-convert", "docx", str(path), "-output", tmp.name], check=True)
            return Path(tmp.name)
    except Exception:
        pass
    try:
        outdir = Path(tempfile.mkdtemp(prefix="doc2docx_"))
        subprocess.run(["soffice", "--headless", "--convert-to", "docx", "--outdir", str(outdir), str(path)], check=True)
        candidate = outdir / (path.stem + ".docx")
        if candidate.exists():
            return candidate
    except Exception:
        pass
    try:
        out = Path(tempfile.NamedTemporaryFile(delete=False, suffix=".docx").name)
        subprocess.run(["unoconv", "-f", "docx", "-o", str(out), str(path)], check=True)
        if out.exists():
            return out
    except Exception:
        pass
    raise RuntimeError("doc to docx conversion failed; please install 'soffice' or 'unoconv' or convert manually")