FunMD_Convert/docling/app/services/unified_converter.py

from pathlib import Path
from typing import Optional, Tuple
import re

import tempfile
import sys
from urllib.parse import urlsplit
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
import io
_DOC_AVAILABLE = True
try:
    _DOC_BASE = Path(__file__).resolve().parents[2] / "docling"
    p = str(_DOC_BASE)
    if p not in sys.path:
        sys.path.insert(0, p)
except Exception:
    pass
try:
    from docling.document_converter import DocumentConverter
    from docling.datamodel.base_models import InputFormat
    from docling.document_converter import PdfFormatOption
    from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling_core.types.doc import ImageRefMode
except Exception:
    _DOC_AVAILABLE = False
    class DocumentConverter:  # type: ignore
        def __init__(self, *args, **kwargs):
            pass
        def convert(self, source):
            raise RuntimeError("docling unavailable")
    class InputFormat:  # type: ignore
        PDF = "pdf"
    class PdfFormatOption:  # type: ignore
        def __init__(self, *args, **kwargs):
            pass
    class StandardPdfPipeline:  # type: ignore
        pass
    class PdfPipelineOptions:  # type: ignore
        def __init__(self):
            pass
    class ImageRefMode:  # type: ignore
        EMBEDDED = None

"""
@api Unified Converter Service
@description Provides core document conversion logic unifying Docling and word2markdown engines
"""

_W2M_AVAILABLE = False
try:
    from app.services.word2markdown import convert_any as _w2m_convert_any  # type: ignore
    _W2M_AVAILABLE = True
except Exception:
    _W2M_AVAILABLE = False

try:
    from bs4 import BeautifulSoup  # type: ignore
except Exception:
    BeautifulSoup = None  # type: ignore
try:
    from app.services.docling_adapter import normalize_html as _normalize_html  # type: ignore
    from app.services.docling_adapter import resolve_link as _resolve_link  # type: ignore
    from app.services.docling_adapter import _render_markdown_html as _render_md_html  # type: ignore
except Exception:
    _normalize_html = None  # type: ignore
    _resolve_link = None  # type: ignore
    _render_md_html = None  # type: ignore

def _is_http(s: str) -> bool:
    t = (s or "").lower()
    return t.startswith("http://") or t.startswith("https://")

def _read_bytes(source: str) -> Tuple[bytes, str]:
    ct = ""
    try:
        if _is_http(source):
            from urllib.request import urlopen
            with urlopen(source, timeout=10) as r:
                ct = r.headers.get("Content-Type") or ""
                return r.read() or b"", ct
        p = Path(source)
        if p.exists() and p.is_file():
            return p.read_bytes(), ct
    except Exception:
        return b"", ct
    return b"", ct

def _decode_to_utf8(raw: bytes, ct: str = "") -> str:
    if not raw:
        return ""
    if raw.startswith(b"\xef\xbb\xbf"):
        try:
            return raw[3:].decode("utf-8")
        except Exception:
            pass
    if raw.startswith(b"\xff\xfe"):
        try:
            return raw[2:].decode("utf-16le")
        except Exception:
            pass
    if raw.startswith(b"\xfe\xff"):
        try:
            return raw[2:].decode("utf-16be")
        except Exception:
            pass
    try:
        m = re.search(r"charset=([\w-]+)", ct or "", re.IGNORECASE)
        if m:
            enc = m.group(1).strip().lower()
            try:
                return raw.decode(enc)
            except Exception:
                pass
    except Exception:
        pass
    candidates = [
        "utf-8", "gb18030", "gbk", "big5", "shift_jis", "iso-8859-1", "windows-1252",
    ]
    for enc in candidates:
        try:
            return raw.decode(enc)
        except Exception:
            continue
    return raw.decode("utf-8", errors="replace")

def _normalize_newlines(s: str) -> str:
    return (s or "").replace("\r\n", "\n").replace("\r", "\n")

def _html_to_markdown(html: str) -> str:
    if not html:
        return ""
    if BeautifulSoup is None:
        return html
    soup = BeautifulSoup(html, "html.parser")
    out: list[str] = []
    def txt(node) -> str:
        return (getattr(node, "get_text", lambda **kwargs: str(node))(strip=True) if node else "")
    def inline(node) -> str:
        if isinstance(node, str):
            return node
        name = getattr(node, "name", None)
        if name in {None}:  # type: ignore
            return str(node)
        if name in {"strong", "b"}:
            return "**" + txt(node) + "**"
        if name in {"em", "i"}:
            return "*" + txt(node) + "*"
        if name == "code":
            return "`" + txt(node) + "`"
        if name == "a":
            href_val = node.get("href")
            extra_val = node.get("data-doc")
            href = href_val if isinstance(href_val, str) else None
            extra = extra_val if isinstance(extra_val, str) else None
            resolved = _resolve_link(href, extra) if _resolve_link else (href or extra)
            url = resolved or ""
            text = txt(node)
            if url:
                return f"[{text}]({url})"
            return text
        if name == "img":
            alt = node.get("alt") or "image"
            src = node.get("src") or ""
            return f"![{alt}]({src})"
        res = []
        for c in getattr(node, "children", []):
            res.append(inline(c))
        return "".join(res)
    def block(node):
        name = getattr(node, "name", None)
        if name is None:
            s = str(node).strip()
            if s:
                out.append(s)
            return
        if name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
            lvl = int(name[1])
            out.append("#" * lvl + " " + txt(node))
            out.append("")
            return
        if name == "p":
            segs = [inline(c) for c in node.children]
            out.append("".join(segs))
            out.append("")
            return
        if name == "br":
            out.append("")
            return
        if name in {"ul", "ol"}:
            is_ol = name == "ol"
            idx = 1
            for li in node.find_all("li", recursive=False):
                text = "".join(inline(c) for c in li.children)
                if is_ol:
                    out.append(f"{idx}. {text}")
                    idx += 1
                else:
                    out.append(f"- {text}")
            out.append("")
            return
        if name == "pre":
            code_node = node.find("code")
            code_text = code_node.get_text() if code_node else node.get_text()
            lang = ""
            cls = (code_node.get("class") if code_node else node.get("class")) or []
            for c in cls:
                s = str(c)
                if s.startswith("language-"):
                    lang = s.split("-", 1)[-1]
                    break
            out.append(f"```{lang}\n{code_text}\n```\n")
            return
        if name == "blockquote":
            lines = [l for l in txt(node).splitlines() if l.strip()]
            for l in lines:
                out.append("> " + l)
            out.append("")
            return
        if name == "table":
            rows = node.find_all("tr")
            if not rows:
                return
            headers = [h.get_text(strip=True) for h in (rows[0].find_all(["th","td"]) or [])]
            if headers:
                out.append("|" + "|".join(headers) + "|")
                sep = "|" + "|".join(["---" for _ in headers]) + "|"
                out.append(sep)
            for tr in rows[1:]:
                cells = [td.get_text(strip=True) for td in tr.find_all("td")]
                if cells:
                    out.append("|" + "|".join(cells) + "|")
            out.append("")
            return
        if name == "div":
            for c in node.children:
                block(c)
            return
        segs = [inline(c) for c in node.children]
        if segs:
            out.append("".join(segs))
            out.append("")
    root = soup.body or soup
    for ch in getattr(root, "children", []):
        block(ch)
    return _normalize_newlines("\n".join(out)).strip()


def _lower_html_table_tags(html: str) -> str:
    """
    @function _lower_html_table_tags
    @description Normalizes HTML table tags to lowercase
    @param html Input HTML string
    @return Normalized HTML string
    """
    if not html:
        return html
    tags = ["TABLE", "THEAD", "TBODY", "TFOOT", "TR", "TH", "TD"]
    out = html
    for t in tags:
        out = re.sub(r"</?" + t + r"\b", lambda m: m.group(0).lower(), out)
    out = re.sub(r">\s*\n+\s*", ">\n", out)
    return out


def _replace_admonitions(md: str) -> str:
    """
    @function _replace_admonitions
    @description Replaces ::: style admonitions with !!! style
    @param md Input markdown string
    @return Processed markdown string
    """
    if not md:
        return md
    lines = md.split("\n")
    out = []
    in_block = False
    for raw in lines:
        t = raw.strip()
        if t.startswith(":::"):
            if not in_block:
                name = t[3:].strip()
                if not name:
                    out.append("!!!")
                else:
                    out.append("!!! " + name)
                in_block = True
            else:
                out.append("!!!")
                in_block = False
            continue
        out.append(raw)
    return "\n".join(out)


def _enhance_codeblocks(md: str) -> str:
    if not md:
        return md
    lines = md.split("\n")
    res = []
    in_fence = False
    fence_lang = ""
    i = 0
    while i < len(lines):
        line = lines[i]
        t = line.strip()
        if t.startswith("```"):
            in_fence = not in_fence
            try:
                fence_lang = (t[3:] or "").strip() if in_fence else ""
            except Exception:
                fence_lang = ""
            res.append(line)
            i += 1
            continue
        if in_fence:
            res.append(line)
            i += 1
            continue
        if t.startswith("{") or t.startswith("["):
            buf = [line]
            j = i + 1
            closed = False
            depth = t.count("{") - t.count("}")
            while j < len(lines):
                buf.append(lines[j])
                s = lines[j].strip()
                depth += s.count("{") - s.count("}")
                if depth <= 0 and s.endswith("}"):
                    closed = True
                    break
                j += 1
            if closed and len(buf) >= 3:
                lang = "json"
                res.append("```" + lang)
                res.extend(buf)
                res.append("```")
                i = j + 1
                continue
        code_sig = (
            ("public static" in t) or ("private static" in t) or ("class " in t) or ("return " in t) or ("package " in t) or ("import " in t)
        )
        if code_sig:
            buf = [line]
            j = i + 1
            while j < len(lines):
                s = lines[j].strip()
                if not s:
                    break
                if s.startswith("# ") or s.startswith("## ") or s.startswith("### "):
                    break
                buf.append(lines[j])
                j += 1
            if len(buf) >= 3:
                res.append("```")
                res.extend(buf)
                res.append("```")
                i = j + 1
                continue
        res.append(line)
        i += 1
    return "\n".join(res)


class FormatConverter:
    """
    @class FormatConverter
    @description Unified converter class wrapping Docling and word2markdown
    """
    def __init__(self) -> None:
        self._docling = DocumentConverter()

    def convert(self, source: str, export: str = "markdown", engine: Optional[str] = None, mdx_safe_mode_enabled: bool = True) -> Tuple[str, str, Optional[str]]:
        """
        @function convert
        @description Convert a document source to specified format
        @param source Path or URL to source document
        @param export Output format (markdown, html, json, doctags)
        @param engine Optional engine override (word2markdown/docling)
        @param mdx_safe_mode_enabled Toggle safe mode for MDX
        @return Tuple of (encoding, content)
        """
        

        # Prefer custom word2markdown engine for DOC/DOCX when available
        auto_engine = None
        try:
            from pathlib import Path as _P
            suf = _P(source).suffix.lower()
            if not engine and suf in {".doc", ".docx"} and _W2M_AVAILABLE:
                auto_engine = "word2markdown"
        except Exception:
            auto_engine = None
        use_engine = (engine or auto_engine or "").lower()
        try:
            from urllib.parse import urlsplit
            path = source
            if _is_http(source):
                path = urlsplit(source).path or ""
            ext = Path(path).suffix.lower()
        except Exception:
            ext = Path(source).suffix.lower()
        if ext in {".txt"}:
            raw, ct = _read_bytes(source)
            text = _normalize_newlines(_decode_to_utf8(raw, ct))
            if export.lower() == "html":
                if _render_md_html is not None:
                    html = _render_md_html(text)
                else:
                    try:
                        import marko
                        html = marko.convert(text)
                    except Exception:
                        html = f"<pre>{text}</pre>"
                return "utf-8", _lower_html_table_tags(html), None
            md = _enhance_codeblocks(text)
            return "utf-8", md, None
        if ext in {".md"}:
            raw, ct = _read_bytes(source)
            text = _normalize_newlines(_decode_to_utf8(raw, ct))
            if export.lower() == "html":
                if _render_md_html is not None:
                    html = _render_md_html(text)
                else:
                    try:
                        import marko
                        html = marko.convert(text)
                    except Exception:
                        html = text
                return "utf-8", _lower_html_table_tags(html), None
            return "utf-8", text, None
        if ext in {".html", ".htm"}:
            try:
                conv = DocumentConverter(allowed_formats=[InputFormat.HTML])
                result = conv.convert(source)
                if export.lower() == "html":
                    html = result.document.export_to_html()
                    html = _lower_html_table_tags(html)
                    return "utf-8", html, None
                md = result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
                md = _replace_admonitions(md)
                md = _enhance_codeblocks(md)
                return "utf-8", md, None
            except Exception:
                raw, ct = _read_bytes(source)
                html_in = _normalize_newlines(_decode_to_utf8(raw, ct))
                if export.lower() == "html":
                    html = _normalize_html(html_in) if _normalize_html is not None else html_in
                    return "utf-8", _lower_html_table_tags(html), None
                md = _html_to_markdown(html_in)
                md = _replace_admonitions(md)
                md = _enhance_codeblocks(md)
                return "utf-8", md, None
        if use_engine in {"pandoc", "custom", "word2markdown"} and _W2M_AVAILABLE:
            enc, md = _w2m_convert_any(Path(source), mdx_safe_mode_enabled=mdx_safe_mode_enabled)
            md = _replace_admonitions(md)
            md = _enhance_codeblocks(md)
            return enc or "utf-8", md, None
        # Configure PDF pipeline to generate picture images into a per-call artifacts directory
        artifacts_dir = tempfile.mkdtemp(prefix="docling_artifacts_")
        pdf_opts = PdfPipelineOptions()
        pdf_opts.generate_picture_images = True
        pdf_opts.generate_page_images = True
        pdf_opts.images_scale = 2.0
        pdf_opts.do_code_enrichment = True
        pdf_opts.do_formula_enrichment = True
        self._docling = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_cls=StandardPdfPipeline,
                    pipeline_options=pdf_opts,
                )
            }
        )
        result = self._docling.convert(source)
        if export.lower() == "markdown":
            md = result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
            md = _replace_admonitions(md)
            md = _enhance_codeblocks(md)
            return "utf-8", md, artifacts_dir
        if export.lower() == "html":
            html = result.document.export_to_html()
            html = _lower_html_table_tags(html)
            return "utf-8", html, artifacts_dir
        if export.lower() == "json":
            js = result.document.export_to_json()
            return "utf-8", js, artifacts_dir
        if export.lower() == "doctags":
            dt = result.document.export_to_doctags()
            return "utf-8", dt, artifacts_dir
        raise RuntimeError("unsupported export")
Import project files 2026-01-07 17:18:26 +08:00			`from pathlib import Path`
			`from typing import Optional, Tuple`
			`import re`

			`import tempfile`
			`import sys`
			`from urllib.parse import urlsplit`
			`from urllib.request import urlopen`
			`from urllib.error import HTTPError, URLError`
			`import io`
			`_DOC_AVAILABLE = True`
			`try:`
			`_DOC_BASE = Path(__file__).resolve().parents[2] / "docling"`
			`p = str(_DOC_BASE)`
			`if p not in sys.path:`
			`sys.path.insert(0, p)`
			`except Exception:`
			`pass`
			`try:`
			`from docling.document_converter import DocumentConverter`
			`from docling.datamodel.base_models import InputFormat`
			`from docling.document_converter import PdfFormatOption`
			`from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline`
			`from docling.datamodel.pipeline_options import PdfPipelineOptions`
			`from docling_core.types.doc import ImageRefMode`
			`except Exception:`
			`_DOC_AVAILABLE = False`
			`class DocumentConverter: # type: ignore`
			`def __init__(self, args, *kwargs):`
			`pass`
			`def convert(self, source):`
			`raise RuntimeError("docling unavailable")`
			`class InputFormat: # type: ignore`
			`PDF = "pdf"`
			`class PdfFormatOption: # type: ignore`
			`def __init__(self, args, *kwargs):`
			`pass`
			`class StandardPdfPipeline: # type: ignore`
			`pass`
			`class PdfPipelineOptions: # type: ignore`
			`def __init__(self):`
			`pass`
			`class ImageRefMode: # type: ignore`
			`EMBEDDED = None`

			`"""`
			`@api Unified Converter Service`
			`@description Provides core document conversion logic unifying Docling and word2markdown engines`
			`"""`

			`_W2M_AVAILABLE = False`
			`try:`
			`from app.services.word2markdown import convert_any as _w2m_convert_any # type: ignore`
			`_W2M_AVAILABLE = True`
			`except Exception:`
			`_W2M_AVAILABLE = False`

			`try:`
			`from bs4 import BeautifulSoup # type: ignore`
			`except Exception:`
			`BeautifulSoup = None # type: ignore`
			`try:`
			`from app.services.docling_adapter import normalize_html as _normalize_html # type: ignore`
			`from app.services.docling_adapter import resolve_link as _resolve_link # type: ignore`
			`from app.services.docling_adapter import _render_markdown_html as _render_md_html # type: ignore`
			`except Exception:`
			`_normalize_html = None # type: ignore`
			`_resolve_link = None # type: ignore`
			`_render_md_html = None # type: ignore`

			`def _is_http(s: str) -> bool:`
			`t = (s or "").lower()`
			`return t.startswith("http://") or t.startswith("https://")`

			`def _read_bytes(source: str) -> Tuple[bytes, str]:`
			`ct = ""`
			`try:`
			`if _is_http(source):`
			`from urllib.request import urlopen`
			`with urlopen(source, timeout=10) as r:`
			`ct = r.headers.get("Content-Type") or ""`
			`return r.read() or b"", ct`
			`p = Path(source)`
			`if p.exists() and p.is_file():`
			`return p.read_bytes(), ct`
			`except Exception:`
			`return b"", ct`
			`return b"", ct`

			`def _decode_to_utf8(raw: bytes, ct: str = "") -> str:`
			`if not raw:`
			`return ""`
			`if raw.startswith(b"\xef\xbb\xbf"):`
			`try:`
			`return raw[3:].decode("utf-8")`
			`except Exception:`
			`pass`
			`if raw.startswith(b"\xff\xfe"):`
			`try:`
			`return raw[2:].decode("utf-16le")`
			`except Exception:`
			`pass`
			`if raw.startswith(b"\xfe\xff"):`
			`try:`
			`return raw[2:].decode("utf-16be")`
			`except Exception:`
			`pass`
			`try:`
			`m = re.search(r"charset=([\w-]+)", ct or "", re.IGNORECASE)`
			`if m:`
			`enc = m.group(1).strip().lower()`
			`try:`
			`return raw.decode(enc)`
			`except Exception:`
			`pass`
			`except Exception:`
			`pass`
			`candidates = [`
			`"utf-8", "gb18030", "gbk", "big5", "shift_jis", "iso-8859-1", "windows-1252",`
			`]`
			`for enc in candidates:`
			`try:`
			`return raw.decode(enc)`
			`except Exception:`
			`continue`
			`return raw.decode("utf-8", errors="replace")`

			`def _normalize_newlines(s: str) -> str:`
			`return (s or "").replace("\r\n", "\n").replace("\r", "\n")`

			`def _html_to_markdown(html: str) -> str:`
			`if not html:`
			`return ""`
			`if BeautifulSoup is None:`
			`return html`
			`soup = BeautifulSoup(html, "html.parser")`
			`out: list[str] = []`
			`def txt(node) -> str:`
			`return (getattr(node, "get_text", lambda **kwargs: str(node))(strip=True) if node else "")`
			`def inline(node) -> str:`
			`if isinstance(node, str):`
			`return node`
			`name = getattr(node, "name", None)`
			`if name in {None}: # type: ignore`
			`return str(node)`
			`if name in {"strong", "b"}:`
			`return "" + txt(node) + ""`
			`if name in {"em", "i"}:`
			`return "" + txt(node) + ""`
			`if name == "code":`
			return "`" + txt(node) + "`"
			`if name == "a":`
			`href_val = node.get("href")`
			`extra_val = node.get("data-doc")`
			`href = href_val if isinstance(href_val, str) else None`
			`extra = extra_val if isinstance(extra_val, str) else None`
			`resolved = _resolve_link(href, extra) if _resolve_link else (href or extra)`
			`url = resolved or ""`
			`text = txt(node)`
			`if url:`
			`return f"[{text}]({url})"`
			`return text`
			`if name == "img":`
			`alt = node.get("alt") or "image"`
			`src = node.get("src") or ""`
			`return f"![{alt}]({src})"`
			`res = []`
			`for c in getattr(node, "children", []):`
			`res.append(inline(c))`
			`return "".join(res)`
			`def block(node):`
			`name = getattr(node, "name", None)`
			`if name is None:`
			`s = str(node).strip()`
			`if s:`
			`out.append(s)`
			`return`
			`if name in {"h1", "h2", "h3", "h4", "h5", "h6"}:`
			`lvl = int(name[1])`
			`out.append("#" * lvl + " " + txt(node))`
			`out.append("")`
			`return`
			`if name == "p":`
			`segs = [inline(c) for c in node.children]`
			`out.append("".join(segs))`
			`out.append("")`
			`return`
			`if name == "br":`
			`out.append("")`
			`return`
			`if name in {"ul", "ol"}:`
			`is_ol = name == "ol"`
			`idx = 1`
			`for li in node.find_all("li", recursive=False):`
			`text = "".join(inline(c) for c in li.children)`
			`if is_ol:`
			`out.append(f"{idx}. {text}")`
			`idx += 1`
			`else:`
			`out.append(f"- {text}")`
			`out.append("")`
			`return`
			`if name == "pre":`
			`code_node = node.find("code")`
			`code_text = code_node.get_text() if code_node else node.get_text()`
			`lang = ""`
			`cls = (code_node.get("class") if code_node else node.get("class")) or []`
			`for c in cls:`
			`s = str(c)`
			`if s.startswith("language-"):`
			`lang = s.split("-", 1)[-1]`
			`break`
			out.append(f"```{lang}\n{code_text}\n```\n")
			`return`
			`if name == "blockquote":`
			`lines = [l for l in txt(node).splitlines() if l.strip()]`
			`for l in lines:`
			`out.append("> " + l)`
			`out.append("")`
			`return`
			`if name == "table":`
			`rows = node.find_all("tr")`
			`if not rows:`
			`return`
			`headers = [h.get_text(strip=True) for h in (rows[0].find_all(["th","td"]) or [])]`
			`if headers:`
			`out.append("\|" + "\|".join(headers) + "\|")`
			`sep = "\|" + "\|".join(["---" for _ in headers]) + "\|"`
			`out.append(sep)`
			`for tr in rows[1:]:`
			`cells = [td.get_text(strip=True) for td in tr.find_all("td")]`
			`if cells:`
			`out.append("\|" + "\|".join(cells) + "\|")`
			`out.append("")`
			`return`
			`if name == "div":`
			`for c in node.children:`
			`block(c)`
			`return`
			`segs = [inline(c) for c in node.children]`
			`if segs:`
			`out.append("".join(segs))`
			`out.append("")`
			`root = soup.body or soup`
			`for ch in getattr(root, "children", []):`
			`block(ch)`
			`return _normalize_newlines("\n".join(out)).strip()`


			`def _lower_html_table_tags(html: str) -> str:`
			`"""`
			`@function _lower_html_table_tags`
			`@description Normalizes HTML table tags to lowercase`
			`@param html Input HTML string`
			`@return Normalized HTML string`
			`"""`
			`if not html:`
			`return html`
			`tags = ["TABLE", "THEAD", "TBODY", "TFOOT", "TR", "TH", "TD"]`
			`out = html`
			`for t in tags:`
			`out = re.sub(r"</?" + t + r"\b", lambda m: m.group(0).lower(), out)`
			`out = re.sub(r">\s\n+\s", ">\n", out)`
			`return out`


			`def _replace_admonitions(md: str) -> str:`
			`"""`
			`@function _replace_admonitions`
			`@description Replaces ::: style admonitions with !!! style`
			`@param md Input markdown string`
			`@return Processed markdown string`
			`"""`
			`if not md:`
			`return md`
			`lines = md.split("\n")`
			`out = []`
			`in_block = False`
			`for raw in lines:`
			`t = raw.strip()`
			`if t.startswith(":::"):`
			`if not in_block:`
			`name = t[3:].strip()`
			`if not name:`
			`out.append("!!!")`
			`else:`
			`out.append("!!! " + name)`
			`in_block = True`
			`else:`
			`out.append("!!!")`
			`in_block = False`
			`continue`
			`out.append(raw)`
			`return "\n".join(out)`


			`def _enhance_codeblocks(md: str) -> str:`
			`if not md:`
			`return md`
			`lines = md.split("\n")`
			`res = []`
			`in_fence = False`
			`fence_lang = ""`
			`i = 0`
			`while i < len(lines):`
			`line = lines[i]`
			`t = line.strip()`
			if t.startswith("```"):
			`in_fence = not in_fence`
			`try:`
			`fence_lang = (t[3:] or "").strip() if in_fence else ""`
			`except Exception:`
			`fence_lang = ""`
			`res.append(line)`
			`i += 1`
			`continue`
			`if in_fence:`
			`res.append(line)`
			`i += 1`
			`continue`
			`if t.startswith("{") or t.startswith("["):`
			`buf = [line]`
			`j = i + 1`
			`closed = False`
			`depth = t.count("{") - t.count("}")`
			`while j < len(lines):`
			`buf.append(lines[j])`
			`s = lines[j].strip()`
			`depth += s.count("{") - s.count("}")`
			`if depth <= 0 and s.endswith("}"):`
			`closed = True`
			`break`
			`j += 1`
			`if closed and len(buf) >= 3:`
			`lang = "json"`
			res.append("```" + lang)
			`res.extend(buf)`
			res.append("```")
			`i = j + 1`
			`continue`
			`code_sig = (`
			`("public static" in t) or ("private static" in t) or ("class " in t) or ("return " in t) or ("package " in t) or ("import " in t)`
			`)`
			`if code_sig:`
			`buf = [line]`
			`j = i + 1`
			`while j < len(lines):`
			`s = lines[j].strip()`
			`if not s:`
			`break`
			`if s.startswith("# ") or s.startswith("## ") or s.startswith("### "):`
			`break`
			`buf.append(lines[j])`
			`j += 1`
			`if len(buf) >= 3:`
			res.append("```")
			`res.extend(buf)`
			res.append("```")
			`i = j + 1`
			`continue`
			`res.append(line)`
			`i += 1`
			`return "\n".join(res)`


			`class FormatConverter:`
			`"""`
			`@class FormatConverter`
			`@description Unified converter class wrapping Docling and word2markdown`
			`"""`
			`def __init__(self) -> None:`
			`self._docling = DocumentConverter()`

			`def convert(self, source: str, export: str = "markdown", engine: Optional[str] = None, mdx_safe_mode_enabled: bool = True) -> Tuple[str, str, Optional[str]]:`
			`"""`
			`@function convert`
			`@description Convert a document source to specified format`
			`@param source Path or URL to source document`
			`@param export Output format (markdown, html, json, doctags)`
			`@param engine Optional engine override (word2markdown/docling)`
			`@param mdx_safe_mode_enabled Toggle safe mode for MDX`
			`@return Tuple of (encoding, content)`
			`"""`


			`# Prefer custom word2markdown engine for DOC/DOCX when available`
			`auto_engine = None`
			`try:`
			`from pathlib import Path as _P`
			`suf = _P(source).suffix.lower()`
			`if not engine and suf in {".doc", ".docx"} and _W2M_AVAILABLE:`
			`auto_engine = "word2markdown"`
			`except Exception:`
			`auto_engine = None`
			`use_engine = (engine or auto_engine or "").lower()`
			`try:`
			`from urllib.parse import urlsplit`
			`path = source`
			`if _is_http(source):`
			`path = urlsplit(source).path or ""`
			`ext = Path(path).suffix.lower()`
			`except Exception:`
			`ext = Path(source).suffix.lower()`
			`if ext in {".txt"}:`
			`raw, ct = _read_bytes(source)`
			`text = _normalize_newlines(_decode_to_utf8(raw, ct))`
			`if export.lower() == "html":`
			`if _render_md_html is not None:`
			`html = _render_md_html(text)`
			`else:`
			`try:`
			`import marko`
			`html = marko.convert(text)`
			`except Exception:`
			`html = f"<pre>{text}</pre>"`
			`return "utf-8", _lower_html_table_tags(html), None`
			`md = _enhance_codeblocks(text)`
			`return "utf-8", md, None`
			`if ext in {".md"}:`
			`raw, ct = _read_bytes(source)`
			`text = _normalize_newlines(_decode_to_utf8(raw, ct))`
			`if export.lower() == "html":`
			`if _render_md_html is not None:`
			`html = _render_md_html(text)`
			`else:`
			`try:`
			`import marko`
			`html = marko.convert(text)`
			`except Exception:`
			`html = text`
			`return "utf-8", _lower_html_table_tags(html), None`
			`return "utf-8", text, None`
			`if ext in {".html", ".htm"}:`
			`try:`
			`conv = DocumentConverter(allowed_formats=[InputFormat.HTML])`
			`result = conv.convert(source)`
			`if export.lower() == "html":`
			`html = result.document.export_to_html()`
			`html = _lower_html_table_tags(html)`
			`return "utf-8", html, None`
			`md = result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)`
			`md = _replace_admonitions(md)`
			`md = _enhance_codeblocks(md)`
			`return "utf-8", md, None`
			`except Exception:`
			`raw, ct = _read_bytes(source)`
			`html_in = _normalize_newlines(_decode_to_utf8(raw, ct))`
			`if export.lower() == "html":`
			`html = _normalize_html(html_in) if _normalize_html is not None else html_in`
			`return "utf-8", _lower_html_table_tags(html), None`
			`md = _html_to_markdown(html_in)`
			`md = _replace_admonitions(md)`
			`md = _enhance_codeblocks(md)`
			`return "utf-8", md, None`
			`if use_engine in {"pandoc", "custom", "word2markdown"} and _W2M_AVAILABLE:`
			`enc, md = _w2m_convert_any(Path(source), mdx_safe_mode_enabled=mdx_safe_mode_enabled)`
			`md = _replace_admonitions(md)`
			`md = _enhance_codeblocks(md)`
			`return enc or "utf-8", md, None`
			`# Configure PDF pipeline to generate picture images into a per-call artifacts directory`
			`artifacts_dir = tempfile.mkdtemp(prefix="docling_artifacts_")`
			`pdf_opts = PdfPipelineOptions()`
			`pdf_opts.generate_picture_images = True`
			`pdf_opts.generate_page_images = True`
			`pdf_opts.images_scale = 2.0`
			`pdf_opts.do_code_enrichment = True`
			`pdf_opts.do_formula_enrichment = True`
			`self._docling = DocumentConverter(`
			`format_options={`
			`InputFormat.PDF: PdfFormatOption(`
			`pipeline_cls=StandardPdfPipeline,`
			`pipeline_options=pdf_opts,`
			`)`
			`}`
			`)`
			`result = self._docling.convert(source)`
			`if export.lower() == "markdown":`
			`md = result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)`
			`md = _replace_admonitions(md)`
			`md = _enhance_codeblocks(md)`
			`return "utf-8", md, artifacts_dir`
			`if export.lower() == "html":`
			`html = result.document.export_to_html()`
			`html = _lower_html_table_tags(html)`
			`return "utf-8", html, artifacts_dir`
			`if export.lower() == "json":`
			`js = result.document.export_to_json()`
			`return "utf-8", js, artifacts_dir`
			`if export.lower() == "doctags":`
			`dt = result.document.export_to_doctags()`
			`return "utf-8", dt, artifacts_dir`
			`raise RuntimeError("unsupported export")`