"""
@api Unified Converter Service
@description Provides core document conversion logic unifying Docling and word2markdown engines
"""

from pathlib import Path
from typing import Optional, Tuple
import re
import tempfile
import sys
from urllib.parse import urlsplit
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
import io

_DOC_AVAILABLE = True

# Best effort: make a vendored docling checkout (two directories up) importable.
try:
    _DOC_BASE = Path(__file__).resolve().parents[2] / "docling"
    p = str(_DOC_BASE)
    if p not in sys.path:
        sys.path.insert(0, p)
except Exception:
    pass

try:
    from docling.document_converter import DocumentConverter
    from docling.datamodel.base_models import InputFormat
    from docling.document_converter import PdfFormatOption
    from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling_core.types.doc import ImageRefMode
except Exception:
    # Docling is optional.  Install no-op stand-ins so this module still
    # imports; the non-docling code paths (txt/md/html fallbacks,
    # word2markdown) keep working without it.
    _DOC_AVAILABLE = False

    class DocumentConverter:  # type: ignore
        def __init__(self, *args, **kwargs):
            pass

        def convert(self, source):
            raise RuntimeError("docling unavailable")

    class InputFormat:  # type: ignore
        PDF = "pdf"
        # Referenced by the .html branch of FormatConverter.convert; without
        # it the shim only "worked" by raising AttributeError into the
        # fallback handler.
        HTML = "html"

    class PdfFormatOption:  # type: ignore
        def __init__(self, *args, **kwargs):
            pass

    class StandardPdfPipeline:  # type: ignore
        pass

    class PdfPipelineOptions:  # type: ignore
        def __init__(self):
            pass

    class ImageRefMode:  # type: ignore
        EMBEDDED = None

_W2M_AVAILABLE = False
try:
    from app.services.word2markdown import convert_any as _w2m_convert_any  # type: ignore

    _W2M_AVAILABLE = True
except Exception:
    _W2M_AVAILABLE = False

try:
    from bs4 import BeautifulSoup  # type: ignore
except Exception:
    BeautifulSoup = None  # type: ignore

try:
    from app.services.docling_adapter import normalize_html as _normalize_html  # type: ignore
    from app.services.docling_adapter import resolve_link as _resolve_link  # type: ignore
    from app.services.docling_adapter import _render_markdown_html as _render_md_html  # type: ignore
except Exception:
    _normalize_html = None  # type: ignore
    _resolve_link = None  # type: ignore
    _render_md_html = None  # type: ignore


def _is_http(s: str) -> bool:
    """Return True when *s* looks like an http(s) URL."""
    t = (s or "").lower()
    return t.startswith("http://") or t.startswith("https://")


def _read_bytes(source: str) -> Tuple[bytes, str]:
    """
    @function _read_bytes
    @description Read raw bytes from a local path or http(s) URL, best effort
    @param source Local file path or URL
    @return Tuple of (raw bytes, Content-Type header or "");
            (b"", "") on any failure or when the path does not exist
    """
    ct = ""
    try:
        if _is_http(source):
            with urlopen(source, timeout=10) as r:
                ct = r.headers.get("Content-Type") or ""
                return r.read() or b"", ct
        p = Path(source)
        if p.exists() and p.is_file():
            return p.read_bytes(), ct
    except Exception:
        # Deliberately best-effort: callers treat empty bytes as "no content".
        return b"", ct
    return b"", ct


def _decode_to_utf8(raw: bytes, ct: str = "") -> str:
    """
    @function _decode_to_utf8
    @description Decode raw bytes to text, trying BOMs, the Content-Type
                 charset, then a list of common encodings
    @param raw Raw input bytes
    @param ct Optional Content-Type header used for a charset= hint
    @return Decoded string (lossy utf-8 replacement decode as last resort)
    """
    if not raw:
        return ""
    # Byte-order marks take priority over everything else.
    if raw.startswith(b"\xef\xbb\xbf"):
        try:
            return raw[3:].decode("utf-8")
        except Exception:
            pass
    if raw.startswith(b"\xff\xfe"):
        try:
            return raw[2:].decode("utf-16le")
        except Exception:
            pass
    if raw.startswith(b"\xfe\xff"):
        try:
            return raw[2:].decode("utf-16be")
        except Exception:
            pass
    # Honor an explicit charset from the Content-Type header when present.
    try:
        m = re.search(r"charset=([\w-]+)", ct or "", re.IGNORECASE)
        if m:
            enc = m.group(1).strip().lower()
            try:
                return raw.decode(enc)
            except Exception:
                pass
    except Exception:
        pass
    # Common encodings, most likely first (CJK codecs before latin fallbacks).
    candidates = [
        "utf-8",
        "gb18030",
        "gbk",
        "big5",
        "shift_jis",
        "iso-8859-1",
        "windows-1252",
    ]
    for enc in candidates:
        try:
            return raw.decode(enc)
        except Exception:
            continue
    return raw.decode("utf-8", errors="replace")


def _normalize_newlines(s: str) -> str:
    """Normalize CRLF and lone CR line endings to LF."""
    return (s or "").replace("\r\n", "\n").replace("\r", "\n")


def _html_to_markdown(html: str) -> str:
    """
    @function _html_to_markdown
    @description Convert an HTML fragment to Markdown using BeautifulSoup
    @param html Input HTML string
    @return Markdown text; the input unchanged when bs4 is unavailable
    """
    if not html:
        return ""
    if BeautifulSoup is None:
        # Without bs4 we cannot parse; pass the HTML through untouched.
        return html
    soup = BeautifulSoup(html, "html.parser")
    out: list[str] = []

    def txt(node) -> str:
        # Stripped text of an element; plain strings fall back to str().
        return (getattr(node, "get_text", lambda **kwargs: str(node))(strip=True) if node else "")

    def inline(node) -> str:
        # Render an inline node (emphasis, code, links, images) to Markdown.
        if isinstance(node, str):
            return node
        name = getattr(node, "name", None)
        if name is None:
            return str(node)
        if name in {"strong", "b"}:
            return "**" + txt(node) + "**"
        if name in {"em", "i"}:
            return "*" + txt(node) + "*"
        if name == "code":
            return "`" + txt(node) + "`"
        if name == "a":
            href_val = node.get("href")
            extra_val = node.get("data-doc")
            href = href_val if isinstance(href_val, str) else None
            extra = extra_val if isinstance(extra_val, str) else None
            # Prefer the project link resolver when available.
            resolved = _resolve_link(href, extra) if _resolve_link else (href or extra)
            url = resolved or ""
            text = txt(node)
            if url:
                return f"[{text}]({url})"
            return text
        if name == "img":
            alt = node.get("alt") or "image"
            src = node.get("src") or ""
            return f"![{alt}]({src})"
        res = []
        for c in getattr(node, "children", []):
            res.append(inline(c))
        return "".join(res)

    def block(node):
        # Render a block-level node into `out` (closure over the buffer).
        name = getattr(node, "name", None)
        if name is None:
            s = str(node).strip()
            if s:
                out.append(s)
            return
        if name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
            lvl = int(name[1])
            out.append("#" * lvl + " " + txt(node))
            out.append("")
            return
        if name == "p":
            segs = [inline(c) for c in node.children]
            out.append("".join(segs))
            out.append("")
            return
        if name == "br":
            out.append("")
            return
        if name in {"ul", "ol"}:
            is_ol = name == "ol"
            idx = 1
            for li in node.find_all("li", recursive=False):
                text = "".join(inline(c) for c in li.children)
                if is_ol:
                    out.append(f"{idx}. {text}")
                    idx += 1
                else:
                    out.append(f"- {text}")
            out.append("")
            return
        if name == "pre":
            code_node = node.find("code")
            code_text = code_node.get_text() if code_node else node.get_text()
            lang = ""
            # Pick the fence language from a "language-xxx" class if present.
            cls = (code_node.get("class") if code_node else node.get("class")) or []
            for c in cls:
                s = str(c)
                if s.startswith("language-"):
                    lang = s.split("-", 1)[-1]
                    break
            out.append(f"```{lang}\n{code_text}\n```\n")
            return
        if name == "blockquote":
            lines = [l for l in txt(node).splitlines() if l.strip()]
            for l in lines:
                out.append("> " + l)
            out.append("")
            return
        if name == "table":
            rows = node.find_all("tr")
            if not rows:
                return
            # First row supplies the header; th and td both accepted.
            headers = [h.get_text(strip=True) for h in (rows[0].find_all(["th", "td"]) or [])]
            if headers:
                out.append("|" + "|".join(headers) + "|")
                sep = "|" + "|".join(["---" for _ in headers]) + "|"
                out.append(sep)
            for tr in rows[1:]:
                cells = [td.get_text(strip=True) for td in tr.find_all("td")]
                if cells:
                    out.append("|" + "|".join(cells) + "|")
            out.append("")
            return
        if name == "div":
            # Transparent container: recurse into children.
            for c in node.children:
                block(c)
            return
        # Unknown block element: render its children inline.
        segs = [inline(c) for c in node.children]
        if segs:
            out.append("".join(segs))
            out.append("")

    root = soup.body or soup
    for ch in getattr(root, "children", []):
        block(ch)
    return _normalize_newlines("\n".join(out)).strip()


def _lower_html_table_tags(html: str) -> str:
    """
    @function _lower_html_table_tags
    @description Normalizes HTML table tags to lowercase
    @param html Input HTML string
    @return Normalized HTML string
    """
    if not html:
        return html
    tags = ["TABLE", "THEAD", "TBODY", "TFOOT", "TR", "TH", "TD"]
    out = html
    for t in tags:
        # Lowercase both the opening and the closing form of each tag.
        # NOTE: THEAD/TBODY/TFOOT are listed before TH/TD so the shorter
        # patterns never touch already-lowercased longer tags.
        out = re.sub(rf"<\s*{t}\b", "<" + t.lower(), out)
        out = re.sub(rf"<\s*/\s*{t}\s*>", "</" + t.lower() + ">", out)
    return out


def _replace_admonitions(md: str) -> str:
    """
    @function _replace_admonitions
    @description Replaces ::: style admonitions with !!! style
    @param md Input markdown string
    @return Processed markdown string
    """
    if not md:
        return md
    lines = md.split("\n")
    out = []
    in_block = False
    for raw in lines:
        t = raw.strip()
        if t.startswith(":::"):
            if not in_block:
                # Opening marker: carry over the admonition name, if any.
                name = t[3:].strip()
                if not name:
                    out.append("!!!")
                else:
                    out.append("!!! " + name)
                in_block = True
            else:
                # Closing marker.
                out.append("!!!")
                in_block = False
            continue
        out.append(raw)
    return "\n".join(out)


def _enhance_codeblocks(md: str) -> str:
    """
    @function _enhance_codeblocks
    @description Heuristically wraps bare JSON objects and code-looking runs
                 of lines in fenced code blocks; existing fences are kept as-is
    @param md Input markdown string
    @return Processed markdown string
    """
    if not md:
        return md
    lines = md.split("\n")
    res = []
    in_fence = False
    fence_lang = ""
    i = 0
    while i < len(lines):
        line = lines[i]
        t = line.strip()
        if t.startswith("```"):
            # Toggle fence state; never modify content inside existing fences.
            in_fence = not in_fence
            try:
                fence_lang = (t[3:] or "").strip() if in_fence else ""
            except Exception:
                fence_lang = ""
            res.append(line)
            i += 1
            continue
        if in_fence:
            res.append(line)
            i += 1
            continue
        if t.startswith("{") or t.startswith("["):
            # Possible bare JSON: scan forward tracking brace depth.
            buf = [line]
            j = i + 1
            closed = False
            depth = t.count("{") - t.count("}")
            while j < len(lines):
                buf.append(lines[j])
                s = lines[j].strip()
                depth += s.count("{") - s.count("}")
                if depth <= 0 and s.endswith("}"):
                    closed = True
                    break
                j += 1
            # Only fence multi-line (>= 3 lines) balanced objects.
            if closed and len(buf) >= 3:
                lang = "json"
                res.append("```" + lang)
                res.extend(buf)
                res.append("```")
                i = j + 1
                continue
        # Heuristic signals that a run of lines is source code.
        code_sig = (
            ("public static" in t)
            or ("private static" in t)
            or ("class " in t)
            or ("return " in t)
            or ("package " in t)
            or ("import " in t)
        )
        if code_sig:
            buf = [line]
            j = i + 1
            while j < len(lines):
                s = lines[j].strip()
                if not s:
                    break
                # Stop at headings so prose structure is preserved.
                if s.startswith("# ") or s.startswith("## ") or s.startswith("### "):
                    break
                buf.append(lines[j])
                j += 1
            if len(buf) >= 3:
                res.append("```")
                res.extend(buf)
                res.append("```")
                i = j + 1
                continue
        res.append(line)
        i += 1
    return "\n".join(res)


class FormatConverter:
    """
    @class FormatConverter
    @description Unified converter class wrapping Docling and word2markdown
    """

    def __init__(self) -> None:
        self._docling = DocumentConverter()

    def convert(
        self,
        source: str,
        export: str = "markdown",
        engine: Optional[str] = None,
        mdx_safe_mode_enabled: bool = True,
    ) -> Tuple[str, str, Optional[str]]:
        """
        @function convert
        @description Convert a document source to the requested export format
        @param source Path or URL to source document
        @param export Output format (markdown, html, json, doctags)
        @param engine Optional engine override (word2markdown/docling)
        @param mdx_safe_mode_enabled Toggle safe mode for MDX
        @return Tuple of (encoding, content, artifacts_dir); artifacts_dir is
                None except on the Docling pipeline path, where it names a
                temporary directory created for image artifacts
        @raises RuntimeError When the export format is unsupported
        """
        # Prefer custom word2markdown engine for DOC/DOCX when available
        auto_engine = None
        try:
            suf = Path(source).suffix.lower()
            if not engine and suf in {".doc", ".docx"} and _W2M_AVAILABLE:
                auto_engine = "word2markdown"
        except Exception:
            auto_engine = None
        use_engine = (engine or auto_engine or "").lower()

        # Derive the extension from the URL path for http(s) sources so query
        # strings and fragments do not confuse suffix detection.
        try:
            path = source
            if _is_http(source):
                path = urlsplit(source).path or ""
            ext = Path(path).suffix.lower()
        except Exception:
            ext = Path(source).suffix.lower()

        if ext in {".txt"}:
            raw, ct = _read_bytes(source)
            text = _normalize_newlines(_decode_to_utf8(raw, ct))
            if export.lower() == "html":
                if _render_md_html is not None:
                    html = _render_md_html(text)
                else:
                    try:
                        import marko

                        html = marko.convert(text)
                    except Exception:
                        # Last resort: present the raw text preformatted.
                        html = f"<pre>{text}</pre>"
                return "utf-8", _lower_html_table_tags(html), None
            md = _enhance_codeblocks(text)
            return "utf-8", md, None

        if ext in {".md"}:
            raw, ct = _read_bytes(source)
            text = _normalize_newlines(_decode_to_utf8(raw, ct))
            if export.lower() == "html":
                if _render_md_html is not None:
                    html = _render_md_html(text)
                else:
                    try:
                        import marko

                        html = marko.convert(text)
                    except Exception:
                        html = text
                return "utf-8", _lower_html_table_tags(html), None
            return "utf-8", text, None

        if ext in {".html", ".htm"}:
            try:
                conv = DocumentConverter(allowed_formats=[InputFormat.HTML])
                result = conv.convert(source)
                if export.lower() == "html":
                    html = result.document.export_to_html()
                    html = _lower_html_table_tags(html)
                    return "utf-8", html, None
                md = result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
                md = _replace_admonitions(md)
                md = _enhance_codeblocks(md)
                return "utf-8", md, None
            except Exception:
                # Docling failed or is unavailable: fall back to the local
                # bs4-based conversion pipeline.
                raw, ct = _read_bytes(source)
                html_in = _normalize_newlines(_decode_to_utf8(raw, ct))
                if export.lower() == "html":
                    html = _normalize_html(html_in) if _normalize_html is not None else html_in
                    return "utf-8", _lower_html_table_tags(html), None
                md = _html_to_markdown(html_in)
                md = _replace_admonitions(md)
                md = _enhance_codeblocks(md)
                return "utf-8", md, None

        if use_engine in {"pandoc", "custom", "word2markdown"} and _W2M_AVAILABLE:
            enc, md = _w2m_convert_any(Path(source), mdx_safe_mode_enabled=mdx_safe_mode_enabled)
            md = _replace_admonitions(md)
            md = _enhance_codeblocks(md)
            return enc or "utf-8", md, None

        # Configure PDF pipeline to generate picture images into a per-call artifacts directory
        artifacts_dir = tempfile.mkdtemp(prefix="docling_artifacts_")
        pdf_opts = PdfPipelineOptions()
        pdf_opts.generate_picture_images = True
        pdf_opts.generate_page_images = True
        pdf_opts.images_scale = 2.0
        pdf_opts.do_code_enrichment = True
        pdf_opts.do_formula_enrichment = True
        self._docling = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_cls=StandardPdfPipeline,
                    pipeline_options=pdf_opts,
                )
            }
        )
        result = self._docling.convert(source)
        if export.lower() == "markdown":
            md = result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
            md = _replace_admonitions(md)
            md = _enhance_codeblocks(md)
            return "utf-8", md, artifacts_dir
        if export.lower() == "html":
            html = result.document.export_to_html()
            html = _lower_html_table_tags(html)
            return "utf-8", html, artifacts_dir
        if export.lower() == "json":
            js = result.document.export_to_json()
            return "utf-8", js, artifacts_dir
        if export.lower() == "doctags":
            dt = result.document.export_to_doctags()
            return "utf-8", dt, artifacts_dir
        raise RuntimeError("unsupported export")