from pathlib import Path
from typing import Tuple, List

from docx import Document
from docx.table import Table
from docx.text.paragraph import Paragraph
import re
import base64
import hashlib
import tempfile
import subprocess
from lxml import etree


def _iter_blocks(doc: Document):
    """Yield the body's direct children as python-docx proxies, in document order.

    ``w:p`` children are wrapped as ``Paragraph`` and ``w:tbl`` children as
    ``Table``; every other child element of the body is skipped.
    """
    wrappers = {'p': Paragraph, 'tbl': Table}
    for node in doc.element.body.iterchildren():
        # Tags are in Clark notation ("{namespace}local"); keep the local part.
        local_name = node.tag.rpartition('}')[2]
        wrap = wrappers.get(local_name)
        if wrap is not None:
            yield wrap(node, doc)


def _cell_text(cell) -> str:
    """Return the text of every paragraph in *cell*, one paragraph per line.

    A paragraph whose ``text`` is falsy contributes an empty line.
    """
    return "\n".join(paragraph.text or "" for paragraph in cell.paragraphs)


def _guess_lang(text: str) -> str:
    """Guess a Markdown fence language tag for *text*.

    Only the first 512 characters of the stripped text are inspected. The
    checks run in a fixed order (java, python, javascript, c, cpp, sql, json,
    html, xml) and the first hit wins.

    Returns:
        The language tag, or ``""`` when nothing matches.
    """
    head = (text or "").strip()[:512]

    # Word boundaries are attached only to alternatives that start or end on
    # a word character. A trailing ``\b`` after ``\(`` or a leading ``\b``
    # before ``@`` demands an adjacent word character, which made ordinary
    # code such as ``def foo():``, ``print("x")``, ``function f()`` and a
    # line-initial ``@Override`` impossible to match.
    if re.search(
        r"\b(?:package|public\s+class|public\s+static|private\s+static)\b"
        r"|\bimport\s+javax?\."
        r"|(?<!\w)@Override\b",
        head,
    ):
        return "java"
    if re.search(r"\b(?:def\s+\w+\(|import\s+\w+|print\(|from\s+\w+\s+import\b)", head):
        return "python"
    if re.search(r"\b(?:function\s+\w+\(|console\.log\b|let\s+\w+|const\s+\w+)|=>", head):
        return "javascript"
    # Accept ``int main(`` with any parameter list, not just ``int main()``.
    if re.search(r"^#include|\bint\s+main\s*\(", head):
        return "c"
    if re.search(r"\busing\s+namespace\b|\bstd::\b|\btemplate\b", head):
        return "cpp"
    if re.search(
        r"\b(SELECT|INSERT|UPDATE|DELETE|CREATE\s+TABLE|DROP\s+TABLE|ALTER\s+TABLE)\b",
        head,
        re.IGNORECASE,
    ):
        return "sql"
    if head.startswith(("{", "[")):
        return "json"
    if re.search(r"<html|<div|<span|<table|<code|<pre", head, re.IGNORECASE):
        return "html"
    if re.search(r"<\?xml|</?[A-Za-z0-9:_-]+>", head):
        return "xml"
    return ""


def _table_to_md(tbl: Table) -> str:
    """Render a python-docx table as Markdown.

    A table with exactly one row and one column is returned as a fenced code
    block whose language comes from ``_guess_lang``. Any other table becomes
    a pipe table: the first row is followed by a ``---`` separator row,
    paragraphs inside a cell are joined with ``<br>``, literal ``|`` is
    escaped, and embedded images are inlined as base64 data URIs.
    """
    rows = tbl.rows
    cols = tbl.columns
    if len(rows) == 1 and len(cols) == 1:
        txt = _cell_text(rows[0].cells[0]).strip()
        lang = _guess_lang(txt)
        return f"```{lang}\n{txt}\n```\n"

    embed_attr = f"{{{NS['r']}}}embed"
    link_attr = f"{{{NS['r']}}}link"
    id_attr = f"{{{NS['r']}}}id"

    def _image_md(owner, rid: str) -> str:
        """Return Markdown for the image behind relationship *rid*, or ''."""
        try:
            owner_part = owner.part
            part = None
            related = getattr(owner_part, 'related_parts', None)
            if isinstance(related, dict) and rid in related:
                part = related.get(rid)
            if part is None:
                rels = getattr(owner_part, 'rels', None)
                if rels is not None and hasattr(rels, 'get'):
                    part = getattr(rels.get(rid), 'target_part', None)
            if part is None:
                rel = getattr(owner_part, '_rels', {}).get(rid)
                part = getattr(rel, 'target_part', None)
            data = getattr(part, 'blob', None) if part is not None else None
            if not data:
                return ''
            ct = getattr(part, 'content_type', '')
            b64 = base64.b64encode(data).decode('ascii')
            return f"![Image](data:{ct};base64,{b64})"
        except Exception:
            # Best effort: an image that cannot be resolved (for example an
            # external link target) is omitted instead of failing the table.
            return ''

    def _run_md(owner, run_el) -> List[str]:
        """Collect text, line breaks and images from one ``w:r`` element."""
        pieces: List[str] = []
        for rc in run_el.iterchildren():
            if not isinstance(rc.tag, str):
                continue  # comment / processing instruction
            rtag = rc.tag.rpartition('}')[2]
            if rtag == 't':
                if rc.text:
                    pieces.append(rc.text)
            elif rtag == 'br':
                pieces.append('\n')
            elif rtag == 'drawing':
                for node in rc.iter():
                    if not isinstance(node.tag, str):
                        continue
                    local = node.tag.rpartition('}')[2]
                    if local == 'blip':
                        rid = node.get(embed_attr) or node.get(link_attr)
                    elif local == 'imagedata':
                        rid = node.get(id_attr)
                    else:
                        continue
                    if rid:
                        md = _image_md(owner, rid)
                        if md:
                            pieces.append(md)
        return pieces

    def _cell_inline_md(doc, paragraph) -> str:
        """Render one cell paragraph, including runs nested in hyperlinks."""
        parts: List[str] = []
        for ch in paragraph._element.iterchildren():
            if not isinstance(ch.tag, str):
                continue
            tag = ch.tag.rpartition('}')[2]
            if tag == 'r':
                parts.extend(_run_md(doc, ch))
            elif tag == 'hyperlink':
                # Hyperlink text lives in runs one level down; without this
                # branch linked text disappears from the cell.
                for run_el in ch.iterchildren():
                    if isinstance(run_el.tag, str) and run_el.tag.rpartition('}')[2] == 'r':
                        parts.extend(_run_md(doc, run_el))
        return ''.join(parts)

    out = []
    # python-docx table parent is the Document
    doc = getattr(tbl, '_parent', None) or getattr(tbl, 'part', None)
    for r_i, r in enumerate(rows):
        vals = []
        for c in r.cells:
            segs = [s for s in (_cell_inline_md(doc, p) for p in c.paragraphs) if s]
            cell_text = '<br>'.join(segs).replace('|', '\\|').strip()
            # A raw newline (from <w:br>) would end the Markdown table row.
            vals.append(cell_text.replace('\n', '<br>'))
        out.append("| " + " | ".join(vals) + " |")
        if r_i == 0:
            out.append("| " + " | ".join("---" for _ in vals) + " |")
    return "\n".join(out) + "\n"


def _paragraph_to_md(p: Paragraph) -> str:
    """Return the paragraph's stripped text followed by a blank line."""
    body = p.text or ""
    return f"{body.strip()}\n\n"


def _embedded_image_md(doc: Document, rid: str) -> str:
    """Return a Markdown data-URI image for relationship *rid*, or ``''``."""
    try:
        doc_part = doc.part
        part = None
        related = getattr(doc_part, 'related_parts', None)
        if isinstance(related, dict) and rid in related:
            part = related.get(rid)
        if part is None:
            rels = getattr(doc_part, 'rels', None)
            if rels is not None and hasattr(rels, 'get'):
                part = getattr(rels.get(rid), 'target_part', None)
        if part is None:
            rel = getattr(doc_part, '_rels', {}).get(rid)
            part = getattr(rel, 'target_part', None)
        data = getattr(part, 'blob', None) if part is not None else None
        if not data:
            return ''
        ct = getattr(part, 'content_type', '')
        b64 = base64.b64encode(data).decode('ascii')
        return f"![Image](data:{ct};base64,{b64})"
    except Exception:
        # Best effort: an unresolvable image (for example an external link
        # target) is left out instead of aborting the conversion.
        return ''


def _run_inline_md(doc: Document, run_el) -> List[str]:
    """Collect text, line breaks and embedded images from one ``w:r`` element."""
    embed_attr = f"{{{NS['r']}}}embed"
    link_attr = f"{{{NS['r']}}}link"
    id_attr = f"{{{NS['r']}}}id"
    pieces: List[str] = []
    for rc in run_el.iterchildren():
        if not isinstance(rc.tag, str):
            continue  # comment / processing instruction
        rtag = rc.tag.rpartition('}')[2]
        if rtag == 't':
            if rc.text:
                pieces.append(rc.text)
        elif rtag == 'br':
            pieces.append('\n')
        elif rtag == 'drawing':
            for node in rc.iter():
                if not isinstance(node.tag, str):
                    continue
                local = node.tag.rpartition('}')[2]
                if local == 'blip':
                    rid = node.get(embed_attr) or node.get(link_attr)
                elif local == 'imagedata':
                    rid = node.get(id_attr)
                else:
                    continue
                if rid:
                    md = _embedded_image_md(doc, rid)
                    if md:
                        pieces.append(md)
    return pieces


def _paragraph_with_images(doc: Document, p: Paragraph) -> str:
    """Render a paragraph's runs (including hyperlink runs) with inline images.

    Returns the stripped text followed by a blank line, or ``''`` when the
    paragraph renders to nothing.
    """
    parts: List[str] = []
    for ch in p._element.iterchildren():
        if not isinstance(ch.tag, str):
            continue
        tag = ch.tag.rpartition('}')[2]
        if tag == 'r':
            parts.extend(_run_inline_md(doc, ch))
        elif tag == 'hyperlink':
            # Linked text sits in runs one level down; skipping it drops the
            # visible text of every hyperlink.
            for run_el in ch.iterchildren():
                if isinstance(run_el.tag, str) and run_el.tag.rpartition('}')[2] == 'r':
                    parts.extend(_run_inline_md(doc, run_el))
    s = ''.join(parts).strip()
    return (s + '\n\n') if s else ''


def _discard_converted_docx(tmp_docx: Path) -> None:
    """Best-effort removal of a temporary .docx produced from a .doc input."""
    try:
        tmp_docx.unlink()
        # The soffice conversion path writes into its own mkdtemp directory.
        if tmp_docx.parent.name.startswith("doc2docx_"):
            tmp_docx.parent.rmdir()
    except OSError:
        pass


def convert_any(path: Path, mdx_safe_mode_enabled: bool = True) -> Tuple[str, str]:
    """Convert a ``.doc`` or ``.docx`` file to Markdown.

    Args:
        path: Input file. ``.doc`` inputs are first converted to ``.docx``
            via ``_convert_doc_to_docx_cross_platform``.
        mdx_safe_mode_enabled: Accepted for interface compatibility; this
            function does not read it.

    Returns:
        ``("utf-8", markdown)``.

    Raises:
        RuntimeError: If the input is not ``.docx`` after any conversion, or
            the ``.doc`` conversion itself fails.
    """
    use_path = path
    converted = False
    if path.suffix.lower() == ".doc":
        use_path = _convert_doc_to_docx_cross_platform(path)
        converted = use_path != path
    try:
        if use_path.suffix.lower() != ".docx":
            raise RuntimeError("unsupported input for word2markdown")
        doc = Document(str(use_path))
    finally:
        # The temporary file is only needed while python-docx loads it.
        if converted:
            _discard_converted_docx(use_path)

    out: List[str] = []
    in_code = False
    code_lines: List[str] = []
    lang_hint: str = ''

    def flush_code() -> None:
        """Emit the pending run of code-like paragraphs as one fenced block."""
        nonlocal in_code, code_lines, lang_hint
        if in_code and code_lines:
            text = "\n".join(code_lines)
            use_lang = lang_hint or _guess_lang(text)
            out.append(f"```{use_lang}\n{text}\n```\n")
        in_code = False
        code_lines = []
        lang_hint = ''

    boxed_extractors = (
        _paragraph_textboxes,
        _paragraph_sdts,
        _paragraph_bordered_text,
        _paragraph_framed,
    )

    for blk in _iter_blocks(doc):
        if isinstance(blk, Table):
            # Flush first so code that precedes the table stays before it.
            flush_code()
            out.append(_table_to_md(blk))
            continue
        if not isinstance(blk, Paragraph):
            continue

        raw = blk.text or ""
        sraw = raw.strip()

        # Text-box / content-control / bordered / framed text, de-duplicated
        # because several extractors can report the same string.
        boxed: List[str] = []
        for extract in boxed_extractors:
            for piece in extract(blk):
                piece = piece.strip()
                if piece and piece not in boxed:
                    boxed.append(piece)
        own_text_boxed = bool(sraw) and sraw in boxed

        # Pass the unstripped text so leading indentation stays visible to
        # the heuristic.
        continues_code = not own_text_boxed and (
            _looks_like_code_paragraph(raw) or (in_code and sraw == "")
        )
        if not continues_code:
            flush_code()
        out.extend(_md_code_block(piece) for piece in boxed)

        if continues_code:
            if not in_code:
                in_code = True
                lang_hint = _guess_lang(sraw)
                code_lines = []
            code_lines.append(raw)
            continue

        txt = _paragraph_with_images(doc, blk)
        body = txt.strip()
        # Skip the plain rendering when the same text was just emitted as a
        # boxed code block; keep it when it carries something extra (images).
        if body and not (own_text_boxed and body == sraw):
            out.append(txt)

    flush_code()

    # Append text boxes found anywhere in the body that are not already
    # present in the output (compared on the fenced block's inner text).
    existing_texts = set()
    for seg in out:
        ss = seg.strip()
        m = None
        if ss.startswith("```"):
            m = re.search(r"^```[\w-]*\n([\s\S]*?)\n```\s*$", ss)
        existing_texts.add(m.group(1).strip() if m else ss)
    for tb in _doclevel_textboxes(doc):
        s = (tb or '').strip()
        if not s or s in existing_texts:
            continue
        out.append(_md_code_block(s))
        existing_texts.add(s)

    md = "".join(out)
    return "utf-8", md

# XML namespace prefixes used when querying the document's XML and when
# building fully qualified attribute names such as f"{{{NS['r']}}}embed".
NS = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
    "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
    "v": "urn:schemas-microsoft-com:vml",
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture",
}


def _paragraph_textboxes(p: Paragraph) -> List[str]:
    """Return the text of each text box anchored in paragraph *p*.

    Both DrawingML (``wps:txbx``) and VML (``v:textbox``) text boxes are
    searched. Each box yields one string with its non-empty paragraphs
    joined by newlines. A box stored in both forms is reported once.

    Returns:
        The text-box strings in document order, or ``[]`` on any error.
    """
    try:
        el = p._element
        texts: List[str] = []
        seen = set()
        # findall() is used instead of xpath(): python-docx element classes
        # override xpath() with a signature that takes no ``namespaces``
        # keyword, so the previous calls raised TypeError and this function
        # always fell through to the empty result.
        for query in ('.//wps:txbx/w:txbxContent', './/v:textbox/w:txbxContent'):
            for content in el.findall(query, namespaces=NS):
                lines: List[str] = []
                for w_p in content.findall('.//w:p', namespaces=NS):
                    line = ''.join(
                        t.text or '' for t in w_p.findall('.//w:t', namespaces=NS)
                    ).strip()
                    if line:
                        lines.append(line)
                text = '\n'.join(lines)
                # Word commonly stores a text box twice (DrawingML plus a VML
                # fallback); report identical text only once.
                if text and text not in seen:
                    seen.add(text)
                    texts.append(text)
        return texts
    except Exception:
        # Best effort: a malformed paragraph must not abort the conversion.
        return []


def _paragraph_sdts(p: Paragraph) -> List[str]:
    """Return the text of content controls (``w:sdt``) found inside *p*.

    For every ``w:sdtContent`` below the paragraph, the non-empty paragraphs
    it contains are joined with newlines into one string.

    Returns:
        One string per content control with text, or ``[]`` on any error.
    """
    try:
        texts: List[str] = []
        # findall() rather than xpath(): python-docx element classes override
        # xpath() without a ``namespaces`` keyword, so the earlier calls
        # raised TypeError and were silently turned into an empty result.
        for content in p._element.findall('.//w:sdt/w:sdtContent', namespaces=NS):
            lines: List[str] = []
            for w_p in content.findall('.//w:p', namespaces=NS):
                line = ''.join(
                    t.text or '' for t in w_p.findall('.//w:t', namespaces=NS)
                ).strip()
                if line:
                    lines.append(line)
            if lines:
                texts.append('\n'.join(lines))
        return texts
    except Exception:
        # Best effort: a malformed paragraph must not abort the conversion.
        return []


def _paragraph_bordered_text(p: Paragraph) -> List[str]:
    """Return ``[text]`` when *p* has paragraph borders and non-empty text.

    A paragraph counts as bordered when its ``w:pPr`` contains ``w:pBdr``.

    Returns:
        A one-element list with the stripped paragraph text, otherwise ``[]``.
    """
    try:
        # find() rather than xpath(): python-docx element classes override
        # xpath() without a ``namespaces`` keyword, so the earlier call raised
        # TypeError and this function always returned [].
        has_border = p._element.find('./w:pPr/w:pBdr', namespaces=NS) is not None
        t = (p.text or '').strip()
        if has_border and t:
            return [t]
    except Exception:
        # Best effort: a malformed paragraph must not abort the conversion.
        pass
    return []


def _paragraph_framed(p: Paragraph) -> List[str]:
    """Return ``[text]`` when *p* is a framed paragraph with non-empty text.

    A paragraph counts as framed when its ``w:pPr`` contains ``w:framePr``.

    Returns:
        A one-element list with the stripped paragraph text, otherwise ``[]``.
    """
    try:
        # find() rather than xpath(): python-docx element classes override
        # xpath() without a ``namespaces`` keyword, so the earlier call raised
        # TypeError and this function always returned [].
        has_frame = p._element.find('./w:pPr/w:framePr', namespaces=NS) is not None
        t = (p.text or '').strip()
        if has_frame and t:
            return [t]
    except Exception:
        # Best effort: a malformed paragraph must not abort the conversion.
        pass
    return []


def _md_code_block(text: str) -> str:
    """Wrap *text* in a fenced Markdown code block tagged with its guessed language."""
    fence = "```"
    return "".join((fence, _guess_lang(text), "\n", text, "\n", fence, "\n"))


def _looks_like_code_paragraph(t: str) -> bool:
    """Heuristically decide whether paragraph text *t* is a line of code.

    Blank text is never code. Otherwise the text is treated as code when it
    is indented by four spaces or a tab, starts with ``[``, contains ``;``,
    ``{`` or ``}``, or contains one of a fixed list of keywords.
    """
    raw = t or ''
    s = raw.strip()
    if not s:
        return False
    # Indentation must be tested on the unstripped text; a stripped string
    # can never start with whitespace, so this check used to be unreachable.
    if raw.startswith(('    ', '\t')):
        return True
    # "starts with '{'" and "ends with '}'" are covered by the containment
    # test below, so only the '[' prefix needs its own check.
    if s.startswith('[') or any(ch in s for ch in ';{}'):
        return True
    keywords = ['public static', 'private static', 'class ', 'return ', 'import ', 'package ', 'byte[]', 'String ', 'Cipher', 'KeyFactory']
    return any(k in s for k in keywords)


def _doclevel_textboxes(doc: Document) -> List[str]:
    """Return the text of every text box found anywhere in the document body.

    DrawingML (``wps:txbx``) boxes are listed first, then VML
    (``v:textbox``) boxes. Each box yields one string with its non-empty
    paragraphs joined by newlines. On error, the boxes collected so far are
    returned.
    """
    texts: List[str] = []
    try:
        body = doc.element.body
        # findall() rather than xpath(): python-docx element classes override
        # xpath() without a ``namespaces`` keyword, so the earlier calls
        # raised TypeError and this function always returned [].
        for query in ('.//wps:txbx/w:txbxContent', './/v:textbox/w:txbxContent'):
            for content in body.findall(query, namespaces=NS):
                lines: List[str] = []
                for w_p in content.findall('.//w:p', namespaces=NS):
                    line = ''.join(
                        (t.text or '') for t in w_p.findall('.//w:t', namespaces=NS)
                    ).strip()
                    if line:
                        lines.append(line)
                if lines:
                    texts.append('\n'.join(lines))
    except Exception:
        # Best effort: keep whatever was collected before the failure.
        pass
    return texts


def _convert_doc_to_docx_cross_platform(path: Path) -> Path:
    """Convert a legacy ``.doc`` file to a temporary ``.docx`` file.

    Tries, in order, the ``textutil``, ``soffice`` and ``unoconv`` commands
    and returns the first output that was actually produced. Temporary files
    and directories belonging to a failed attempt are removed.

    Args:
        path: The ``.doc`` file to convert.

    Returns:
        Path of the generated ``.docx`` file in a temporary location.

    Raises:
        RuntimeError: If none of the converters produced an output file.
    """
    import shutil

    # Missing executable (OSError), non-zero exit status (SubprocessError),
    # or an argument the OS rejects (ValueError).
    tool_errors = (OSError, ValueError, subprocess.SubprocessError)

    def _new_temp_docx() -> Path:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
            return Path(tmp.name)

    def _has_content(candidate: Path) -> bool:
        # The temp file is pre-created empty, so existence alone proves nothing.
        try:
            return candidate.stat().st_size > 0
        except OSError:
            return False

    def _remove(candidate: Path) -> None:
        try:
            candidate.unlink()
        except OSError:
            pass

    target = _new_temp_docx()
    try:
        subprocess.run(["textutil", "-convert", "docx", str(path), "-output", str(target)], check=True)
        if _has_content(target):
            return target
    except tool_errors:
        pass
    _remove(target)

    outdir = Path(tempfile.mkdtemp(prefix="doc2docx_"))
    try:
        subprocess.run(["soffice", "--headless", "--convert-to", "docx", "--outdir", str(outdir), str(path)], check=True)
        candidate = outdir / (path.stem + ".docx")
        if candidate.exists():
            return candidate
    except tool_errors:
        pass
    shutil.rmtree(outdir, ignore_errors=True)

    target = _new_temp_docx()
    try:
        subprocess.run(["unoconv", "-f", "docx", "-o", str(target), str(path)], check=True)
        if _has_content(target):
            return target
    except tool_errors:
        pass
    _remove(target)

    raise RuntimeError("doc to docx conversion failed; please install 'soffice' or 'unoconv' or convert manually")