"""Convert Word documents (.docx, or .doc via an external converter) to Markdown."""
import base64
import hashlib
import logging
import re
import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import Iterator, List, Tuple, Union

from docx import Document
from docx.table import Table
from docx.text.paragraph import Paragraph
from lxml import etree

logger = logging.getLogger(__name__)

# XML namespace prefixes used by the XPath expressions and attribute lookups below.
NS = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
    "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
    "v": "urn:schemas-microsoft-com:vml",
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture",
}

# python-docx elements override ``xpath()`` so that it accepts ONLY the
# expression (it injects its own namespace map, which lacks ``wps`` and ``v``).
# Calling ``el.xpath(expr, namespaces=NS)`` on them raises TypeError, so the
# queries are compiled once here as standalone lxml XPath callables instead.
_XP_TEXTBOX_CONTENT = etree.XPath(
    ".//wps:txbx/w:txbxContent | .//v:textbox/w:txbxContent", namespaces=NS
)
_XP_SDT_CONTENT = etree.XPath(".//w:sdt/w:sdtContent", namespaces=NS)
_XP_PARAGRAPHS = etree.XPath(".//w:p", namespaces=NS)
_XP_TEXT_NODES = etree.XPath(".//w:t", namespaces=NS)
_XP_BORDER = etree.XPath("./w:pPr/w:pBdr", namespaces=NS)
_XP_FRAME = etree.XPath("./w:pPr/w:framePr", namespaces=NS)

# Prefix of the scratch directories created for .doc -> .docx conversion.
_TMP_PREFIX = "doc2docx_"

# Matches a complete fenced code block and captures its body.
_FENCE_RE = re.compile(r"^```[\w-]*\n([\s\S]*?)\n```\s*$")

# Ordered (language, pattern) heuristics; the first match wins.
# Word boundaries are placed only next to word characters: a trailing ``\b``
# after ``\(`` or a leading one before ``@`` / ``=>`` would reject common code
# such as ``print("x")``, ``def f():`` or ``(x) => y``.
_LANG_RULES = (
    ("java", re.compile(
        r"\b(?:package|public\s+class|public\s+static|private\s+static)\b"
        r"|\bimport\s+java\.|@Override\b")),
    ("python", re.compile(
        r"\bdef\s+\w+\(|\bimport\s+\w+|\bprint\(|\bfrom\s+[\w.]+\s+import\b")),
    ("javascript", re.compile(
        r"\bfunction\s+\w+\(|\bconsole\.log\b|\b(?:let|const)\s+\w+|=>")),
    # C++ is tested before C because '#include' alone cannot tell them apart.
    ("cpp", re.compile(r"\busing\s+namespace\b|\bstd::|\btemplate\b")),
    ("c", re.compile(r"^\s*#include\b|\bint\s+main\s*\(", re.MULTILINE)),
    ("sql", re.compile(
        r"\b(SELECT|INSERT|UPDATE|DELETE|CREATE\s+TABLE|DROP\s+TABLE|ALTER\s+TABLE)\b",
        re.IGNORECASE)),
)

# Snippet starts with an XML declaration, comment, doctype or element tag.
_XML_START_RE = re.compile(r"<(?:\?xml\b|!--|!DOCTYPE\b|[A-Za-z_][\w:.-]*(?:\s|/?>))")

# Substrings that mark a paragraph as source code.
_CODE_KEYWORDS = (
    'public static', 'private static', 'class ', 'return ', 'import ',
    'package ', 'byte[]', 'String ', 'Cipher', 'KeyFactory',
)


def _local_name(tag) -> str:
    """Return an element tag without its ``{namespace}`` prefix.

    Comments and processing instructions have a non-string tag; they yield ''.
    """
    return tag.rsplit('}', 1)[-1] if isinstance(tag, str) else ''


def _iter_blocks(doc: Document) -> Iterator[Union[Paragraph, Table]]:
    """Yield the body's top-level paragraphs and tables in document order.

    Other body children (e.g. section properties) are skipped.
    """
    for child in doc.element.body.iterchildren():
        tag = _local_name(child.tag)
        if tag == 'p':
            yield Paragraph(child, doc)
        elif tag == 'tbl':
            yield Table(child, doc)


def _cell_text(cell) -> str:
    """Return the plain text of a table cell, one line per paragraph."""
    return "\n".join(p.text or "" for p in cell.paragraphs)


def _guess_lang(text: str) -> str:
    """Guess a fenced-code language tag from the first 512 characters of *text*.

    Returns one of "java", "python", "javascript", "cpp", "c", "sql", "json",
    "xml", or "" when nothing matches.
    """
    head = (text or "").strip()[:512]
    for lang, pattern in _LANG_RULES:
        if pattern.search(head):
            return lang
    if head.startswith(("{", "[")):
        return "json"
    if _XML_START_RE.match(head):
        return "xml"
    return ""


def _md_code_block(text: str) -> str:
    """Wrap *text* in a fenced code block tagged with its guessed language."""
    lang = _guess_lang(text)
    return f"```{lang}\n{text}\n```\n"


def _resolve_image_part(part, rid):
    """Return the package part referenced by relationship id *rid*, or None.

    Looks in ``part.related_parts`` first, then in ``part.rels`` and
    ``part._rels``. External relationships are skipped because they have no
    embedded target part.
    """
    if part is None:
        return None
    related = getattr(part, 'related_parts', None)
    if isinstance(related, dict) and rid in related:
        return related[rid]
    for attr in ('rels', '_rels'):
        rels = getattr(part, attr, None)
        if rels is None or not hasattr(rels, 'get'):
            continue
        rel = rels.get(rid)
        if rel is None or getattr(rel, 'is_external', False):
            continue
        target = getattr(rel, 'target_part', None)
        if target is not None:
            return target
    return None


def _drawing_images_md(part, drawing) -> List[str]:
    """Return a Markdown data-URI image for each picture inside *drawing*.

    Best effort: an image that cannot be resolved is logged at debug level
    and skipped.
    """
    embed_attr = f"{{{NS['r']}}}embed"
    link_attr = f"{{{NS['r']}}}link"
    id_attr = f"{{{NS['r']}}}id"
    images: List[str] = []
    for node in drawing.iter():
        local = _local_name(node.tag)
        if local == 'blip':
            rid = node.get(embed_attr) or node.get(link_attr)
        elif local == 'imagedata':
            rid = node.get(id_attr)
        else:
            continue
        if not rid:
            continue
        try:
            target = _resolve_image_part(part, rid)
            data = getattr(target, 'blob', None)
            content_type = getattr(target, 'content_type', '')
        except Exception:  # best effort: never fail the conversion on one image
            logger.debug("could not resolve image %r", rid, exc_info=True)
            continue
        if data:
            b64 = base64.b64encode(data).decode('ascii')
            images.append(f"![Image](data:{content_type};base64,{b64})")
    return images


def _inline_md(part, paragraph: Paragraph) -> str:
    """Render a paragraph's runs as inline Markdown.

    Text nodes are copied, ``w:br`` becomes a newline and pictures in
    ``w:drawing`` become data-URI images resolved through *part*. Only direct
    ``w:r`` children of the paragraph are visited.
    """
    pieces: List[str] = []
    for run in paragraph._element.iterchildren():
        if _local_name(run.tag) != 'r':
            continue
        for node in run.iterchildren():
            kind = _local_name(node.tag)
            if kind == 't':
                if node.text:
                    pieces.append(node.text)
            elif kind == 'br':
                pieces.append('\n')
            elif kind == 'drawing':
                pieces.extend(_drawing_images_md(part, node))
    return ''.join(pieces)


def _cell_md(part, cell) -> str:
    """Render one table cell as a single-line Markdown table cell."""
    segments = [seg for seg in (_inline_md(part, p) for p in cell.paragraphs) if seg]
    # A raw newline would end the table row, so paragraph and line breaks are
    # written as <br>; literal pipes are escaped.
    text = "<br>".join(segments).strip().replace("\n", "<br>")
    return text.replace("|", "\\|").strip()


def _table_to_md(tbl: Table) -> str:
    """Convert a Word table to Markdown.

    A 1x1 table is emitted as a fenced code block. Any other table becomes a
    pipe table whose first row is the header.
    """
    rows = tbl.rows
    if len(rows) == 1 and len(tbl.columns) == 1:
        return _md_code_block(_cell_text(rows[0].cells[0]).strip())

    part = getattr(tbl, 'part', None)
    lines: List[str] = []
    for row_index, row in enumerate(rows):
        cells = [_cell_md(part, cell) for cell in row.cells]
        lines.append("| " + " | ".join(cells) + " |")
        if row_index == 0:
            lines.append("| " + " | ".join("---" for _ in cells) + " |")
    return "\n".join(lines) + "\n"


def _paragraph_to_md(p: Paragraph) -> str:
    """Return the paragraph's stripped plain text followed by a blank line."""
    return (p.text or "").strip() + "\n\n"


def _container_texts(root, find_containers) -> List[str]:
    """Collect the text of every container that *find_containers* finds under *root*.

    Each container yields one string: its non-empty paragraphs, stripped and
    joined with newlines. Containers without text are omitted.
    """
    texts: List[str] = []
    for container in find_containers(root):
        lines: List[str] = []
        for w_p in _XP_PARAGRAPHS(container):
            line = ''.join(t.text or '' for t in _XP_TEXT_NODES(w_p)).strip()
            if line:
                lines.append(line)
        if lines:
            texts.append('\n'.join(lines))
    return texts


def _textbox_texts(root) -> List[str]:
    """Return the distinct text-box texts found under *root*, in document order.

    Both DrawingML (``wps:txbx``) and VML (``v:textbox``) boxes are searched;
    identical texts are reported once so that a box and its fallback copy do
    not produce duplicates.
    """
    try:
        return list(dict.fromkeys(_container_texts(root, _XP_TEXTBOX_CONTENT)))
    except Exception:  # best effort: malformed XML must not abort the conversion
        logger.debug("text box extraction failed", exc_info=True)
        return []


def _paragraph_textboxes(p: Paragraph) -> List[str]:
    """Return the texts of the text boxes found inside paragraph *p*."""
    return _textbox_texts(p._element)


def _doclevel_textboxes(doc: Document) -> List[str]:
    """Return the texts of all text boxes anywhere in the document body."""
    return _textbox_texts(doc.element.body)


def _paragraph_sdts(p: Paragraph) -> List[str]:
    """Return the paragraph texts found in ``w:sdt`` content inside *p*."""
    try:
        return _container_texts(p._element, _XP_SDT_CONTENT)
    except Exception:  # best effort
        logger.debug("sdt extraction failed", exc_info=True)
        return []


def _paragraph_bordered_text(p: Paragraph) -> List[str]:
    """Return ``[text]`` when *p* has a ``w:pBdr`` border and text, else ``[]``."""
    try:
        text = (p.text or '').strip()
        if text and _XP_BORDER(p._element):
            return [text]
    except Exception:  # best effort
        logger.debug("border detection failed", exc_info=True)
    return []


def _paragraph_framed(p: Paragraph) -> List[str]:
    """Return ``[text]`` when *p* has ``w:framePr`` frame properties and text, else ``[]``."""
    try:
        text = (p.text or '').strip()
        if text and _XP_FRAME(p._element):
            return [text]
    except Exception:  # best effort
        logger.debug("frame detection failed", exc_info=True)
    return []


def _looks_like_code_paragraph(t: str) -> bool:
    """Heuristically decide whether a paragraph's text is source code.

    True when the stripped text starts with ``{`` or ``[``, ends with ``}``,
    contains ``;``, ``{`` or ``}``, or contains one of ``_CODE_KEYWORDS``.
    """
    s = (t or '').strip()
    if not s:
        return False
    # NOTE: an earlier leading-whitespace test was removed; it ran on the
    # already-stripped text and therefore could never be true.
    if s.startswith(('{', '[')) or s.endswith('}'):
        return True
    if any(ch in s for ch in ';{}'):
        return True
    return any(k in s for k in _CODE_KEYWORDS)


def _flush_code(out: List[str], code_lines: List[str], lang_hint: str) -> None:
    """Append the pending code lines to *out* as one fenced block and clear them.

    Trailing blank lines are dropped so blank paragraphs after the code do not
    pad the block. Does nothing when no code is pending.
    """
    while code_lines and not code_lines[-1].strip():
        code_lines.pop()
    if not code_lines:
        return
    text = "\n".join(code_lines)
    out.append(f"```{lang_hint or _guess_lang(text)}\n{text}\n```\n")
    code_lines.clear()


def _remove_conversion_dir(converted: Path) -> None:
    """Delete the scratch directory of a converted file.

    Only directories created by the .doc converter (recognised by their name
    prefix) are removed; any other location is left untouched.
    """
    if converted.parent.name.startswith(_TMP_PREFIX):
        shutil.rmtree(converted.parent, ignore_errors=True)


def convert_any(path: Path, mdx_safe_mode_enabled: bool = True) -> Tuple[str, str]:
    """Convert a .docx (or .doc) file to Markdown.

    Args:
        path: Input document. ``.doc`` files are first converted to ``.docx``
            with an external tool.
        mdx_safe_mode_enabled: Currently unused; accepted for interface
            compatibility.

    Returns:
        ``("utf-8", markdown_text)``.

    Raises:
        RuntimeError: The input is neither .doc nor .docx, or the .doc
            conversion failed.
    """
    use_path = path
    converted = None
    if path.suffix.lower() == ".doc":
        converted = _convert_doc_to_docx_cross_platform(path)
        use_path = converted
    try:
        if use_path.suffix.lower() != ".docx":
            raise RuntimeError("unsupported input for word2markdown")
        doc = Document(str(use_path))
    finally:
        # The document is loaded at this point, so the converted scratch copy
        # can be removed.
        if converted is not None:
            _remove_conversion_dir(converted)

    part = doc.part
    out: List[str] = []
    code_lines: List[str] = []  # consecutive code paragraphs awaiting a fence
    lang_hint = ''

    for blk in _iter_blocks(doc):
        if isinstance(blk, Table):
            # Emit pending code first so the output follows document order.
            _flush_code(out, code_lines, lang_hint)
            lang_hint = ''
            # The extra newline ends the table; without a blank line the next
            # paragraph would be parsed as another table row.
            out.append(_table_to_md(blk) + "\n")
            continue

        extras = [
            _md_code_block(text.strip())
            for text in (*_paragraph_textboxes(blk), *_paragraph_sdts(blk))
            if text.strip()
        ]
        if extras:
            _flush_code(out, code_lines, lang_hint)
            lang_hint = ''
            out.extend(extras)

        raw = blk.text or ""
        stripped = raw.strip()
        # Bordered or framed paragraphs are treated as code lines; this emits
        # their text exactly once and merges adjacent ones into one block.
        boxed = bool(_paragraph_bordered_text(blk) or _paragraph_framed(blk))
        continues_code = bool(code_lines) and not stripped
        if boxed or continues_code or _looks_like_code_paragraph(stripped):
            if not code_lines:
                lang_hint = _guess_lang(stripped)
            code_lines.append(raw)
            continue

        _flush_code(out, code_lines, lang_hint)
        lang_hint = ''
        text = _inline_md(part, blk).strip()
        if text:
            out.append(text + "\n\n")

    _flush_code(out, code_lines, lang_hint)

    # Text boxes not reachable from a top-level paragraph (e.g. inside tables)
    # are appended at the end, skipping any text that was already emitted.
    seen = set()
    for seg in out:
        body = seg.strip()
        match = _FENCE_RE.search(body) if body.startswith("```") else None
        seen.add(match.group(1).strip() if match else body)
    for box in _doclevel_textboxes(doc):
        text = (box or '').strip()
        if text and text not in seen:
            out.append(_md_code_block(text))
            seen.add(text)

    return "utf-8", "".join(out)


def _convert_doc_to_docx_cross_platform(path: Path) -> Path:
    """Convert a legacy .doc file to .docx with the first converter that works.

    Tries ``textutil``, then ``soffice``, then ``unoconv``. The result is
    written into a fresh temporary directory named ``doc2docx_*``; that
    directory is removed again if every converter fails.

    Returns:
        Path of the generated .docx file.

    Raises:
        RuntimeError: No converter produced an output file.
    """
    workdir = Path(tempfile.mkdtemp(prefix=_TMP_PREFIX))
    target = workdir / (path.stem + ".docx")
    commands = (
        ["textutil", "-convert", "docx", str(path), "-output", str(target)],
        ["soffice", "--headless", "--convert-to", "docx", "--outdir", str(workdir), str(path)],
        ["unoconv", "-f", "docx", "-o", str(target), str(path)],
    )
    for cmd in commands:
        try:
            subprocess.run(cmd, check=True)
        except (OSError, subprocess.SubprocessError):
            # Tool missing or conversion failed: fall through to the next one.
            logger.debug("%s could not convert %s", cmd[0], path, exc_info=True)
            continue
        if target.exists() and target.stat().st_size > 0:
            return target
    shutil.rmtree(workdir, ignore_errors=True)
    raise RuntimeError("doc to docx conversion failed; please install 'soffice' or 'unoconv' or convert manually")