Import project files
This commit is contained in:

docling/app/services/unified_converter.py · 492 lines · new file
@@ -0,0 +1,492 @@
from pathlib import Path
from typing import Optional, Tuple
import re

import tempfile
import sys
from urllib.parse import urlsplit
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
import io

_DOC_AVAILABLE = True
try:
    # Make the vendored docling package importable when running from the repo checkout.
    _DOC_BASE = Path(__file__).resolve().parents[2] / "docling"
    p = str(_DOC_BASE)
    if p not in sys.path:
        sys.path.insert(0, p)
except Exception:
    pass
try:
    from docling.document_converter import DocumentConverter
    from docling.datamodel.base_models import InputFormat
    from docling.document_converter import PdfFormatOption
    from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling_core.types.doc import ImageRefMode
except Exception:
    _DOC_AVAILABLE = False

    # Minimal stand-ins so this module still imports when docling is absent.
    class DocumentConverter:  # type: ignore
        def __init__(self, *args, **kwargs):
            pass

        def convert(self, source):
            raise RuntimeError("docling unavailable")

    class InputFormat:  # type: ignore
        PDF = "pdf"
        HTML = "html"  # referenced below for .html/.htm sources

    class PdfFormatOption:  # type: ignore
        def __init__(self, *args, **kwargs):
            pass

    class StandardPdfPipeline:  # type: ignore
        pass

    class PdfPipelineOptions:  # type: ignore
        def __init__(self):
            pass

    class ImageRefMode:  # type: ignore
        EMBEDDED = None


"""
@api Unified Converter Service
@description Provides core document conversion logic unifying Docling and word2markdown engines
"""

_W2M_AVAILABLE = False
try:
    from app.services.word2markdown import convert_any as _w2m_convert_any  # type: ignore
    _W2M_AVAILABLE = True
except Exception:
    _W2M_AVAILABLE = False

try:
    from bs4 import BeautifulSoup  # type: ignore
except Exception:
    BeautifulSoup = None  # type: ignore
try:
    from app.services.docling_adapter import normalize_html as _normalize_html  # type: ignore
    from app.services.docling_adapter import resolve_link as _resolve_link  # type: ignore
    from app.services.docling_adapter import _render_markdown_html as _render_md_html  # type: ignore
except Exception:
    _normalize_html = None  # type: ignore
    _resolve_link = None  # type: ignore
    _render_md_html = None  # type: ignore


def _is_http(s: str) -> bool:
    t = (s or "").lower()
    return t.startswith("http://") or t.startswith("https://")


def _read_bytes(source: str) -> Tuple[bytes, str]:
    ct = ""
    try:
        if _is_http(source):
            with urlopen(source, timeout=10) as r:
                ct = r.headers.get("Content-Type") or ""
                return r.read() or b"", ct
        p = Path(source)
        if p.exists() and p.is_file():
            return p.read_bytes(), ct
    except Exception:
        return b"", ct
    return b"", ct
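
# Note: _read_bytes deliberately never raises; any network or filesystem error
# yields (b"", ct) so callers can treat "no bytes" uniformly. A sketch of the
# contract (the paths here are hypothetical):
#
#   data, ct = _read_bytes("/tmp/example.txt")          # (file bytes, "") on success
#   data, ct = _read_bytes("https://example.com/a.md")  # (body, Content-Type header)
#   data, ct = _read_bytes("/no/such/file")             # (b"", "")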


def _decode_to_utf8(raw: bytes, ct: str = "") -> str:
    if not raw:
        return ""
    # Honor BOMs first: UTF-8, then UTF-16 LE, then UTF-16 BE.
    if raw.startswith(b"\xef\xbb\xbf"):
        try:
            return raw[3:].decode("utf-8")
        except Exception:
            pass
    if raw.startswith(b"\xff\xfe"):
        try:
            return raw[2:].decode("utf-16le")
        except Exception:
            pass
    if raw.startswith(b"\xfe\xff"):
        try:
            return raw[2:].decode("utf-16be")
        except Exception:
            pass
    # Then any charset declared in the Content-Type header.
    try:
        m = re.search(r"charset=([\w-]+)", ct or "", re.IGNORECASE)
        if m:
            enc = m.group(1).strip().lower()
            try:
                return raw.decode(enc)
            except Exception:
                pass
    except Exception:
        pass
    # Finally, probe common encodings before falling back to lossy UTF-8.
    candidates = [
        "utf-8", "gb18030", "gbk", "big5", "shift_jis", "iso-8859-1", "windows-1252",
    ]
    for enc in candidates:
        try:
            return raw.decode(enc)
        except Exception:
            continue
    return raw.decode("utf-8", errors="replace")
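
# A couple of illustrative calls (the byte values are chosen for this sketch,
# not taken from real inputs):
#
#   _decode_to_utf8(b"\xef\xbb\xbfhi")                          -> "hi"    (UTF-8 BOM stripped)
#   _decode_to_utf8(b"caf\xe9")                                 -> "café"  (falls through to iso-8859-1)
#   _decode_to_utf8(b"ol\xe1", "text/html; charset=iso-8859-1") -> "olá"   (header charset wins)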


def _normalize_newlines(s: str) -> str:
    return (s or "").replace("\r\n", "\n").replace("\r", "\n")


def _html_to_markdown(html: str) -> str:
    if not html:
        return ""
    if BeautifulSoup is None:
        return html
    soup = BeautifulSoup(html, "html.parser")
    out: list[str] = []

    def txt(node) -> str:
        return (getattr(node, "get_text", lambda **kwargs: str(node))(strip=True) if node else "")

    def inline(node) -> str:
        if isinstance(node, str):
            return node
        name = getattr(node, "name", None)
        if name is None:
            return str(node)
        if name in {"strong", "b"}:
            return "**" + txt(node) + "**"
        if name in {"em", "i"}:
            return "*" + txt(node) + "*"
        if name == "code":
            return "`" + txt(node) + "`"
        if name == "a":
            href_val = node.get("href")
            extra_val = node.get("data-doc")
            href = href_val if isinstance(href_val, str) else None
            extra = extra_val if isinstance(extra_val, str) else None
            resolved = _resolve_link(href, extra) if _resolve_link else (href or extra)
            url = resolved or ""
            text = txt(node)
            if url:
                return f"[{text}]({url})"
            return text
        if name == "img":
            alt = node.get("alt") or "image"
            src = node.get("src") or ""
            return f"![{alt}]({src})"
        res = []
        for c in getattr(node, "children", []):
            res.append(inline(c))
        return "".join(res)

    def block(node):
        name = getattr(node, "name", None)
        if name is None:
            s = str(node).strip()
            if s:
                out.append(s)
            return
        if name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
            lvl = int(name[1])
            out.append("#" * lvl + " " + txt(node))
            out.append("")
            return
        if name == "p":
            segs = [inline(c) for c in node.children]
            out.append("".join(segs))
            out.append("")
            return
        if name == "br":
            out.append("")
            return
        if name in {"ul", "ol"}:
            is_ol = name == "ol"
            idx = 1
            for li in node.find_all("li", recursive=False):
                text = "".join(inline(c) for c in li.children)
                if is_ol:
                    out.append(f"{idx}. {text}")
                    idx += 1
                else:
                    out.append(f"- {text}")
            out.append("")
            return
        if name == "pre":
            code_node = node.find("code")
            code_text = code_node.get_text() if code_node else node.get_text()
            lang = ""
            cls = (code_node.get("class") if code_node else node.get("class")) or []
            for c in cls:
                s = str(c)
                if s.startswith("language-"):
                    lang = s.split("-", 1)[-1]
                    break
            out.append(f"```{lang}\n{code_text}\n```\n")
            return
        if name == "blockquote":
            lines = [l for l in txt(node).splitlines() if l.strip()]
            for l in lines:
                out.append("> " + l)
            out.append("")
            return
        if name == "table":
            rows = node.find_all("tr")
            if not rows:
                return
            headers = [h.get_text(strip=True) for h in (rows[0].find_all(["th", "td"]) or [])]
            if headers:
                out.append("|" + "|".join(headers) + "|")
                sep = "|" + "|".join(["---" for _ in headers]) + "|"
                out.append(sep)
            for tr in rows[1:]:
                cells = [td.get_text(strip=True) for td in tr.find_all("td")]
                if cells:
                    out.append("|" + "|".join(cells) + "|")
            out.append("")
            return
        if name == "div":
            for c in node.children:
                block(c)
            return
        segs = [inline(c) for c in node.children]
        if segs:
            out.append("".join(segs))
            out.append("")

    root = soup.body or soup
    for ch in getattr(root, "children", []):
        block(ch)
    return _normalize_newlines("\n".join(out)).strip()
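
# An illustrative conversion (the input HTML is invented for this example):
#
#   _html_to_markdown("<h2>Title</h2><p>Some <b>bold</b> text</p>")
#   -> "## Title\n\nSome **bold** text"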


def _lower_html_table_tags(html: str) -> str:
    """
    @function _lower_html_table_tags
    @description Normalizes HTML table tags to lowercase
    @param html Input HTML string
    @return Normalized HTML string
    """
    if not html:
        return html
    tags = ["TABLE", "THEAD", "TBODY", "TFOOT", "TR", "TH", "TD"]
    out = html
    for t in tags:
        out = re.sub(r"</?" + t + r"\b", lambda m: m.group(0).lower(), out)
    out = re.sub(r">\s*\n+\s*", ">\n", out)
    return out
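
# For example (input invented for this sketch):
#
#   _lower_html_table_tags("<TABLE><TR><TD>x</TD></TR></TABLE>")
#   -> "<table><tr><td>x</td></tr></table>"
#
# Only the tag names are lowercased; attributes and text are untouched, and
# whitespace runs containing newlines after ">" collapse to a single newline.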


def _replace_admonitions(md: str) -> str:
    """
    @function _replace_admonitions
    @description Replaces ::: style admonitions with !!! style
    @param md Input markdown string
    @return Processed markdown string
    """
    if not md:
        return md
    lines = md.split("\n")
    out = []
    in_block = False
    for raw in lines:
        t = raw.strip()
        if t.startswith(":::"):
            if not in_block:
                name = t[3:].strip()
                if not name:
                    out.append("!!!")
                else:
                    out.append("!!! " + name)
                in_block = True
            else:
                out.append("!!!")
                in_block = False
            continue
        out.append(raw)
    return "\n".join(out)
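
# For example (input invented for this sketch):
#
#   _replace_admonitions(":::note\nBe careful.\n:::")
#   -> "!!! note\nBe careful.\n!!!"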


def _enhance_codeblocks(md: str) -> str:
    # Heuristically wrap bare JSON blobs and code-looking runs in fenced blocks,
    # leaving anything already inside a fence untouched.
    if not md:
        return md
    lines = md.split("\n")
    res = []
    in_fence = False
    fence_lang = ""
    i = 0
    while i < len(lines):
        line = lines[i]
        t = line.strip()
        if t.startswith("```"):
            in_fence = not in_fence
            try:
                fence_lang = (t[3:] or "").strip() if in_fence else ""
            except Exception:
                fence_lang = ""
            res.append(line)
            i += 1
            continue
        if in_fence:
            res.append(line)
            i += 1
            continue
        # Bare JSON-ish block: scan forward until the braces balance out.
        if t.startswith("{") or t.startswith("["):
            buf = [line]
            j = i + 1
            closed = False
            depth = t.count("{") - t.count("}")
            while j < len(lines):
                buf.append(lines[j])
                s = lines[j].strip()
                depth += s.count("{") - s.count("}")
                if depth <= 0 and s.endswith("}"):
                    closed = True
                    break
                j += 1
            if closed and len(buf) >= 3:
                lang = "json"
                res.append("```" + lang)
                res.extend(buf)
                res.append("```")
                i = j + 1
                continue
        # Code-looking run: common Java/C-style keywords open the block; a blank
        # line or a markdown heading ends it.
        code_sig = (
            ("public static" in t) or ("private static" in t) or ("class " in t) or ("return " in t) or ("package " in t) or ("import " in t)
        )
        if code_sig:
            buf = [line]
            j = i + 1
            while j < len(lines):
                s = lines[j].strip()
                if not s:
                    break
                if s.startswith("# ") or s.startswith("## ") or s.startswith("### "):
                    break
                buf.append(lines[j])
                j += 1
            if len(buf) >= 3:
                res.append("```")
                res.extend(buf)
                res.append("```")
                i = j + 1
                continue
        res.append(line)
        i += 1
    return "\n".join(res)
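
# For example (input invented for this sketch), a bare three-line JSON object
# gets fenced as json:
#
#   _enhance_codeblocks('{\n  "a": 1\n}')
#   -> '```json\n{\n  "a": 1\n}\n```'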


class FormatConverter:
    """
    @class FormatConverter
    @description Unified converter class wrapping Docling and word2markdown
    """

    def __init__(self) -> None:
        self._docling = DocumentConverter()

    def convert(self, source: str, export: str = "markdown", engine: Optional[str] = None, mdx_safe_mode_enabled: bool = True) -> Tuple[str, str, Optional[str]]:
        """
        @function convert
        @description Convert a document source to the specified format
        @param source Path or URL to the source document
        @param export Output format (markdown, html, json, doctags)
        @param engine Optional engine override (word2markdown/docling)
        @param mdx_safe_mode_enabled Toggle safe mode for MDX
        @return Tuple of (encoding, content, artifacts_dir); artifacts_dir is None
                unless the Docling PDF pipeline ran
        """
        # Prefer the custom word2markdown engine for DOC/DOCX when available
        auto_engine = None
        try:
            suf = Path(source).suffix.lower()
            if not engine and suf in {".doc", ".docx"} and _W2M_AVAILABLE:
                auto_engine = "word2markdown"
        except Exception:
            auto_engine = None
        use_engine = (engine or auto_engine or "").lower()
        # Derive the extension from the URL path for HTTP sources.
        try:
            path = source
            if _is_http(source):
                path = urlsplit(source).path or ""
            ext = Path(path).suffix.lower()
        except Exception:
            ext = Path(source).suffix.lower()
        if ext in {".txt"}:
            raw, ct = _read_bytes(source)
            text = _normalize_newlines(_decode_to_utf8(raw, ct))
            if export.lower() == "html":
                if _render_md_html is not None:
                    html = _render_md_html(text)
                else:
                    try:
                        import marko
                        html = marko.convert(text)
                    except Exception:
                        html = f"<pre>{text}</pre>"
                return "utf-8", _lower_html_table_tags(html), None
            md = _enhance_codeblocks(text)
            return "utf-8", md, None
        if ext in {".md"}:
            raw, ct = _read_bytes(source)
            text = _normalize_newlines(_decode_to_utf8(raw, ct))
            if export.lower() == "html":
                if _render_md_html is not None:
                    html = _render_md_html(text)
                else:
                    try:
                        import marko
                        html = marko.convert(text)
                    except Exception:
                        html = text
                return "utf-8", _lower_html_table_tags(html), None
            return "utf-8", text, None
        if ext in {".html", ".htm"}:
            try:
                conv = DocumentConverter(allowed_formats=[InputFormat.HTML])
                result = conv.convert(source)
                if export.lower() == "html":
                    html = result.document.export_to_html()
                    html = _lower_html_table_tags(html)
                    return "utf-8", html, None
                md = result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
                md = _replace_admonitions(md)
                md = _enhance_codeblocks(md)
                return "utf-8", md, None
            except Exception:
                # Fall back to the lightweight BeautifulSoup-based converter.
                raw, ct = _read_bytes(source)
                html_in = _normalize_newlines(_decode_to_utf8(raw, ct))
                if export.lower() == "html":
                    html = _normalize_html(html_in) if _normalize_html is not None else html_in
                    return "utf-8", _lower_html_table_tags(html), None
                md = _html_to_markdown(html_in)
                md = _replace_admonitions(md)
                md = _enhance_codeblocks(md)
                return "utf-8", md, None
        if use_engine in {"pandoc", "custom", "word2markdown"} and _W2M_AVAILABLE:
            enc, md = _w2m_convert_any(Path(source), mdx_safe_mode_enabled=mdx_safe_mode_enabled)
            md = _replace_admonitions(md)
            md = _enhance_codeblocks(md)
            return enc or "utf-8", md, None
        # Configure the PDF pipeline to generate picture images into a per-call artifacts directory
        artifacts_dir = tempfile.mkdtemp(prefix="docling_artifacts_")
        pdf_opts = PdfPipelineOptions()
        pdf_opts.generate_picture_images = True
        pdf_opts.generate_page_images = True
        pdf_opts.images_scale = 2.0
        pdf_opts.do_code_enrichment = True
        pdf_opts.do_formula_enrichment = True
        self._docling = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_cls=StandardPdfPipeline,
                    pipeline_options=pdf_opts,
                )
            }
        )
        result = self._docling.convert(source)
        if export.lower() == "markdown":
            md = result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
            md = _replace_admonitions(md)
            md = _enhance_codeblocks(md)
            return "utf-8", md, artifacts_dir
        if export.lower() == "html":
            html = result.document.export_to_html()
            html = _lower_html_table_tags(html)
            return "utf-8", html, artifacts_dir
        if export.lower() == "json":
            js = result.document.export_to_json()
            return "utf-8", js, artifacts_dir
        if export.lower() == "doctags":
            dt = result.document.export_to_doctags()
            return "utf-8", dt, artifacts_dir
        raise RuntimeError("unsupported export")
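
# Minimal usage sketch (the file name is hypothetical; which branch runs depends
# on whether docling and word2markdown imported successfully above):
#
#   converter = FormatConverter()
#   encoding, content, artifacts_dir = converter.convert("report.pdf", export="markdown")
#   # encoding is "utf-8"; artifacts_dir points at a temp dir of extracted images
#   # on the Docling PDF path, and is None on the txt/md/html fast paths.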