Import project files
This commit is contained in:
429
docling/app/services/word2markdown.py
Normal file
429
docling/app/services/word2markdown.py
Normal file
@@ -0,0 +1,429 @@
|
||||
from pathlib import Path
|
||||
from typing import Tuple, List
|
||||
|
||||
from docx import Document
|
||||
from docx.table import Table
|
||||
from docx.text.paragraph import Paragraph
|
||||
import re
|
||||
import base64
|
||||
import hashlib
|
||||
import tempfile
|
||||
import subprocess
|
||||
from lxml import etree
|
||||
|
||||
|
||||
def _iter_blocks(doc: Document):
    """Yield Paragraph and Table objects in document-body order.

    Word stores paragraphs (<w:p>) and tables (<w:tbl>) as siblings in the
    body; python-docx exposes them separately, so we walk the raw XML.
    """
    body = doc.element.body
    for element in body.iterchildren():
        local_name = element.tag.rsplit('}', 1)[-1]
        if local_name == 'p':
            yield Paragraph(element, doc)
        elif local_name == 'tbl':
            yield Table(element, doc)
|
||||
|
||||
|
||||
def _cell_text(cell) -> str:
|
||||
parts = []
|
||||
for p in cell.paragraphs:
|
||||
t = p.text or ""
|
||||
parts.append(t)
|
||||
return "\n".join([s for s in parts if s is not None])
|
||||
|
||||
|
||||
def _guess_lang(text: str) -> str:
|
||||
t = (text or "").strip()
|
||||
head = t[:512]
|
||||
if re.search(r"\b(package|import\s+java\.|public\s+class|public\s+static|private\s+static|@Override)\b", head):
|
||||
return "java"
|
||||
if re.search(r"\b(def\s+\w+\(|import\s+\w+|print\(|from\s+\w+\s+import)\b", head):
|
||||
return "python"
|
||||
if re.search(r"\b(function\s+\w+\(|console\.log|let\s+\w+|const\s+\w+|=>)\b", head):
|
||||
return "javascript"
|
||||
if re.search(r"^#include|\bint\s+main\s*\(\)", head):
|
||||
return "c"
|
||||
if re.search(r"\busing\s+namespace\b|\bstd::\b|\btemplate\b", head):
|
||||
return "cpp"
|
||||
if re.search(r"\b(SELECT|INSERT|UPDATE|DELETE|CREATE\s+TABLE|DROP\s+TABLE|ALTER\s+TABLE)\b", head, re.IGNORECASE):
|
||||
return "sql"
|
||||
if head.startswith("{") or head.startswith("["):
|
||||
return "json"
|
||||
if re.search(r"<html|<div|<span|<table|<code|<pre", head, re.IGNORECASE):
|
||||
return "html"
|
||||
if re.search(r"<\?xml|</?[A-Za-z0-9:_-]+>", head):
|
||||
return "xml"
|
||||
return ""
|
||||
|
||||
|
||||
def _table_to_md(tbl: Table) -> str:
    """Render a python-docx Table as Markdown.

    A 1x1 table is treated as a pasted code snippet (a common Word
    convention) and emitted as a fenced code block; any other table becomes
    a pipe table whose first row is the header.  Inline images inside cells
    are embedded as base64 data URIs.
    """
    rows = tbl.rows
    cols = tbl.columns
    if len(rows) == 1 and len(cols) == 1:
        # Single-cell tables are almost always code snippets.
        txt = _cell_text(rows[0].cells[0]).strip()
        lang = _guess_lang(txt)
        return f"```{lang}\n{txt}\n```\n"

    def _cell_inline_md(doc: Document, paragraph: Paragraph) -> str:
        # Flatten one paragraph to inline Markdown: text runs, '\n' for
        # explicit breaks, embedded images as data URIs.  Best-effort: any
        # failure degrades to whatever was collected so far.
        el = paragraph._element
        parts: List[str] = []
        try:
            for ch in el.iterchildren():
                tag = ch.tag.split('}')[-1]
                if tag == 'r':
                    for rc in ch.iterchildren():
                        rtag = rc.tag.split('}')[-1]
                        if rtag == 't':
                            s = rc.text or ''
                            if s:
                                parts.append(s)
                        elif rtag == 'br':
                            parts.append('\n')
                        elif rtag == 'drawing':
                            try:
                                for node in rc.iter():
                                    local = node.tag.split('}')[-1]
                                    rid = None
                                    if local == 'blip':
                                        # DrawingML image reference.
                                        rid = node.get(f"{{{NS['r']}}}embed") or node.get(f"{{{NS['r']}}}link")
                                    elif local == 'imagedata':
                                        # Legacy VML image reference.
                                        rid = node.get(f"{{{NS['r']}}}id")
                                    if not rid:
                                        continue
                                    try:
                                        # Resolve the relationship id to an image
                                        # part, tolerating several python-docx
                                        # versions' relationship APIs.
                                        part = None
                                        rp = getattr(doc.part, 'related_parts', None)
                                        if isinstance(rp, dict) and rid in rp:
                                            part = rp.get(rid)
                                        if part is None:
                                            rels = getattr(doc.part, 'rels', None)
                                            if rels is not None and hasattr(rels, 'get'):
                                                rel = rels.get(rid)
                                                part = getattr(rel, 'target_part', None)
                                        if part is None:
                                            rel = getattr(doc.part, '_rels', {}).get(rid)
                                            part = getattr(rel, 'target_part', None)
                                        ct = getattr(part, 'content_type', '') if part is not None else ''
                                        data = part.blob if part is not None and hasattr(part, 'blob') else None
                                        if data:
                                            b64 = base64.b64encode(data).decode('ascii')
                                            # BUG FIX: previously an empty f-string
                                            # was appended here, so b64/ct were
                                            # computed and the image silently lost.
                                            parts.append(f"![](data:{ct};base64,{b64})")
                                    except Exception:
                                        pass
                            except Exception:
                                pass
        except Exception:
            pass
        return ''.join(parts)

    out = []
    # python-docx table parent is the Document
    doc = getattr(tbl, '_parent', None) or getattr(tbl, 'part', None)
    for r_i, r in enumerate(rows):
        vals = []
        for c in r.cells:
            segs: List[str] = []
            for p in c.paragraphs:
                s = _cell_inline_md(doc, p)
                if s:
                    segs.append(s)
            cell_text = '<br>'.join(segs)
            # Escape pipes so cell content cannot break the table syntax.
            vals.append((cell_text or '').replace('|', '\\|').strip())
        out.append("| " + " | ".join(vals) + " |")
        if r_i == 0:
            # First row acts as the header; emit the separator after it.
            out.append("| " + " | ".join("---" for _ in vals) + " |")
    return "\n".join(out) + "\n"
|
||||
|
||||
|
||||
def _paragraph_to_md(p: Paragraph) -> str:
    """Render a plain paragraph: stripped text followed by a blank line."""
    body = (p.text or "").strip()
    return body + "\n\n"
|
||||
|
||||
|
||||
def convert_any(path: Path, mdx_safe_mode_enabled: bool = True) -> Tuple[str, str]:
    """Convert a .doc/.docx file at *path* to Markdown.

    Returns an ``(encoding, markdown)`` tuple where encoding is always
    "utf-8".  ``mdx_safe_mode_enabled`` is currently unused and kept only
    for interface compatibility.  Raises RuntimeError when the input is not
    a Word document (after an optional .doc -> .docx conversion step).
    """
    ext = path.suffix.lower()
    use_path = path
    if ext == ".doc":
        # Legacy binary .doc: convert via an external tool first.
        use_path = _convert_doc_to_docx_cross_platform(path)
    if use_path.suffix.lower() not in {".docx"}:
        raise RuntimeError("unsupported input for word2markdown")
    doc = Document(str(use_path))

    def _paragraph_with_images(doc: Document, p: Paragraph) -> str:
        # Flatten a paragraph to Markdown, embedding inline images as base64
        # data URIs.  FIX: hoisted out of the block loop — it used to be
        # re-defined on every paragraph iteration.
        el = p._element
        parts: List[str] = []
        try:
            for ch in el.iterchildren():
                tag = ch.tag.split('}')[-1]
                if tag == 'r':
                    for rc in ch.iterchildren():
                        rtag = rc.tag.split('}')[-1]
                        if rtag == 't':
                            s = rc.text or ''
                            if s:
                                parts.append(s)
                        elif rtag == 'br':
                            parts.append('\n')
                        elif rtag == 'drawing':
                            for node in rc.iter():
                                local = node.tag.split('}')[-1]
                                rid = None
                                if local == 'blip':
                                    rid = node.get(f"{{{NS['r']}}}embed") or node.get(f"{{{NS['r']}}}link")
                                elif local == 'imagedata':
                                    rid = node.get(f"{{{NS['r']}}}id")
                                if not rid:
                                    continue
                                try:
                                    # Resolve the relationship id to an image part,
                                    # tolerating several python-docx versions.
                                    part = None
                                    rp = getattr(doc.part, 'related_parts', None)
                                    if isinstance(rp, dict) and rid in rp:
                                        part = rp.get(rid)
                                    if part is None:
                                        rels = getattr(doc.part, 'rels', None)
                                        if rels is not None and hasattr(rels, 'get'):
                                            rel = rels.get(rid)
                                            part = getattr(rel, 'target_part', None)
                                    if part is None:
                                        rel = getattr(doc.part, '_rels', {}).get(rid)
                                        part = getattr(rel, 'target_part', None)
                                    ct = getattr(part, 'content_type', '') if part is not None else ''
                                    data = part.blob if part is not None and hasattr(part, 'blob') else None
                                    if data:
                                        b64 = base64.b64encode(data).decode('ascii')
                                        # BUG FIX: an empty f-string was appended
                                        # here, silently dropping the image.
                                        parts.append(f"![](data:{ct};base64,{b64})")
                                except Exception:
                                    pass
        except Exception:
            pass
        s = ''.join(parts).strip()
        return (s + '\n\n') if s else ''

    out: List[str] = []
    in_code = False
    code_lines: List[str] = []
    lang_hint: str = ''
    for blk in _iter_blocks(doc):
        if isinstance(blk, Table):
            out.append(_table_to_md(blk))
        elif isinstance(blk, Paragraph):
            # Text boxes, content controls, bordered and framed paragraphs
            # frequently hold pasted source code; emit each as a fenced block.
            # Order matters for output stability: textboxes, sdts, bordered,
            # framed — same order as before.
            for extractor in (_paragraph_textboxes, _paragraph_sdts,
                              _paragraph_bordered_text, _paragraph_framed):
                for piece in extractor(blk):
                    if piece.strip():
                        out.append(_md_code_block(piece.strip()))
            raw = (blk.text or "")
            sraw = raw.strip()
            if _looks_like_code_paragraph(sraw) or (in_code and sraw == ""):
                # Accumulate consecutive code-looking paragraphs (including
                # blank lines inside a run) into a single fence.
                if not in_code:
                    in_code = True
                    lang_hint = _guess_lang(sraw)
                    code_lines = []
                code_lines.append(raw)
                continue
            if in_code and code_lines:
                # Flush the open code fence before emitting normal text.
                text = "\n".join(code_lines)
                use_lang = lang_hint or _guess_lang(text)
                out.append(f"```{use_lang}\n{text}\n```\n")
                in_code = False
                code_lines = []
                lang_hint = ''
            txt = _paragraph_with_images(doc, blk)
            if txt.strip():
                out.append(txt)
    if in_code and code_lines:
        # Document ended while still inside a code fence.
        text = "\n".join(code_lines)
        use_lang = lang_hint or _guess_lang(text)
        out.append(f"```{use_lang}\n{text}\n```\n")
    try:
        # Append document-level text boxes that were not already captured,
        # de-duplicating against everything emitted so far (comparing fenced
        # blocks by their inner text).
        boxes = _doclevel_textboxes(doc)
        existing_texts = set()
        try:
            for seg in out:
                if isinstance(seg, str):
                    ss = seg.strip()
                    if ss.startswith("```"):
                        m = re.search(r"^```[\w-]*\n([\s\S]*?)\n```\s*$", ss)
                        if m:
                            existing_texts.add(m.group(1).strip())
                        continue
                    existing_texts.add(ss)
        except Exception:
            pass
        for tb in boxes:
            s = (tb or '').strip()
            if not s:
                continue
            if s in existing_texts:
                continue
            out.append(_md_code_block(s))
            existing_texts.add(s)
    except Exception:
        pass
    md = "".join(out)
    return "utf-8", md
|
||||
|
||||
# XML namespace map for the WordprocessingML / DrawingML / VML XPath
# queries and qualified-attribute lookups used throughout this module.
NS = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",  # main WordprocessingML
    "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",  # drawing anchors
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",  # DrawingML core
    "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",  # Word 2010+ shapes/text boxes
    "v": "urn:schemas-microsoft-com:vml",  # legacy VML (old text boxes/images)
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",  # relationship ids
    "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture",  # embedded pictures
}
|
||||
|
||||
|
||||
def _paragraph_textboxes(p: Paragraph) -> List[str]:
    """Collect text from text boxes anchored in paragraph *p*.

    Both modern DrawingML shapes (wps:txbx) and legacy VML text boxes
    (v:textbox) are scanned, in that order.  Returns one string per text
    box (its paragraphs joined by newlines); [] on any failure.
    """
    collected: List[str] = []
    try:
        el = p._element
        for query in ('.//wps:txbx/w:txbxContent', './/v:textbox/w:txbxContent'):
            for content in el.xpath(query, namespaces=NS):
                lines: List[str] = []
                for w_p in content.xpath('.//w:p', namespaces=NS):
                    runs = w_p.xpath('.//w:t', namespaces=NS)
                    text = ''.join(t.text or '' for t in runs).strip()
                    if text:
                        lines.append(text)
                if lines:
                    collected.append('\n'.join(lines))
        return collected
    except Exception:
        return []
|
||||
|
||||
|
||||
def _paragraph_sdts(p: Paragraph) -> List[str]:
    """Collect text from structured document tags (content controls) in *p*.

    Returns one string per w:sdt block (its paragraphs joined by newlines);
    [] on any failure.
    """
    results: List[str] = []
    try:
        for content in p._element.xpath('.//w:sdt/w:sdtContent', namespaces=NS):
            chunk: List[str] = []
            for w_p in content.xpath('.//w:p', namespaces=NS):
                runs = w_p.xpath('.//w:t', namespaces=NS)
                text = ''.join(t.text or '' for t in runs).strip()
                if text:
                    chunk.append(text)
            if chunk:
                results.append('\n'.join(chunk))
        return results
    except Exception:
        return []
|
||||
|
||||
|
||||
def _paragraph_bordered_text(p: Paragraph) -> List[str]:
    """Return [text] when *p* has paragraph borders (w:pBdr) and text.

    Bordered paragraphs are a common Word convention for code snippets.
    Returns [] when there is no border, no text, or on any failure.
    """
    try:
        bordered = bool(p._element.xpath('./w:pPr/w:pBdr', namespaces=NS))
        stripped = (p.text or '').strip()
        if bordered and stripped:
            return [stripped]
    except Exception:
        pass
    return []
|
||||
|
||||
|
||||
def _paragraph_framed(p: Paragraph) -> List[str]:
    """Return [text] when *p* is a framed paragraph (w:framePr) with text.

    Frames, like borders, often mark code or callout boxes.  Returns [] when
    there is no frame, no text, or on any failure.
    """
    try:
        framed = bool(p._element.xpath('./w:pPr/w:framePr', namespaces=NS))
        stripped = (p.text or '').strip()
        if framed and stripped:
            return [stripped]
    except Exception:
        pass
    return []
|
||||
|
||||
|
||||
def _md_code_block(text: str) -> str:
    """Wrap *text* in a fenced Markdown code block tagged with a guessed language."""
    fence = "```"
    return f"{fence}{_guess_lang(text)}\n{text}\n{fence}\n"
|
||||
|
||||
|
||||
def _looks_like_code_paragraph(t: str) -> bool:
|
||||
s = (t or '').strip()
|
||||
if not s:
|
||||
return False
|
||||
if s.startswith('{') or s.startswith('[') or s.endswith('}'):
|
||||
return True
|
||||
if s.startswith(' ') or s.startswith('\t'):
|
||||
return True
|
||||
if ';' in s or '{' in s or '}' in s:
|
||||
return True
|
||||
keywords = ['public static', 'private static', 'class ', 'return ', 'import ', 'package ', 'byte[]', 'String ', 'Cipher', 'KeyFactory']
|
||||
return any(k in s for k in keywords)
|
||||
|
||||
|
||||
def _doclevel_textboxes(doc: Document) -> List[str]:
    """Collect text from every text box anywhere in the document body.

    Scans modern DrawingML shapes (wps:txbx) first, then legacy VML boxes
    (v:textbox).  On failure, whatever was collected so far is returned
    (note: unlike the paragraph-level helpers, partial results survive).
    """
    found: List[str] = []
    try:
        body = doc.element.body
        for query in ('.//wps:txbx/w:txbxContent', './/v:textbox/w:txbxContent'):
            for content in body.xpath(query, namespaces=NS):
                lines: List[str] = []
                for w_p in content.xpath('.//w:p', namespaces=NS):
                    runs = w_p.xpath('.//w:t', namespaces=NS)
                    text = ''.join((t.text or '') for t in runs).strip()
                    if text:
                        lines.append(text)
                if lines:
                    found.append('\n'.join(lines))
    except Exception:
        pass
    return found
|
||||
|
||||
|
||||
def _convert_doc_to_docx_cross_platform(path: Path) -> Path:
    """Convert a legacy .doc file to .docx with the first available tool.

    Tries macOS ``textutil``, then LibreOffice ``soffice``, then ``unoconv``.
    Returns the path of the converted file.  Raises RuntimeError when no
    converter succeeds.
    """
    # 1) macOS textutil.
    tmp_path = None
    try:
        # FIX: the original called tmp.close() inside the with-block (the
        # context manager already closes it) and leaked the delete=False
        # file whenever textutil was missing or failed.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
            tmp_path = Path(tmp.name)
        # File is closed here so the external tool can overwrite it.
        subprocess.run(["textutil", "-convert", "docx", str(path), "-output", str(tmp_path)], check=True)
        if tmp_path.exists() and tmp_path.stat().st_size > 0:
            return tmp_path
    except Exception:
        pass
    if tmp_path is not None:
        try:
            tmp_path.unlink()  # remove the unused placeholder
        except OSError:
            pass
    # 2) LibreOffice headless.
    try:
        outdir = Path(tempfile.mkdtemp(prefix="doc2docx_"))
        subprocess.run(["soffice", "--headless", "--convert-to", "docx", "--outdir", str(outdir), str(path)], check=True)
        candidate = outdir / (path.stem + ".docx")
        if candidate.exists():
            return candidate
    except Exception:
        pass
    # 3) unoconv.
    out = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
            out = Path(tmp.name)
        subprocess.run(["unoconv", "-f", "docx", "-o", str(out), str(path)], check=True)
        if out.exists() and out.stat().st_size > 0:
            return out
    except Exception:
        pass
    if out is not None:
        try:
            out.unlink()  # remove the unused placeholder
        except OSError:
            pass
    raise RuntimeError("doc to docx conversion failed; please install 'soffice' or 'unoconv' or convert manually")
|
||||
Reference in New Issue
Block a user