Import project files
This commit is contained in:
429
docling/app/services/word2markdown.py
Normal file
429
docling/app/services/word2markdown.py
Normal file
@@ -0,0 +1,429 @@
|
||||
from pathlib import Path
|
||||
from typing import Tuple, List
|
||||
|
||||
from docx import Document
|
||||
from docx.table import Table
|
||||
from docx.text.paragraph import Paragraph
|
||||
import re
|
||||
import base64
|
||||
import hashlib
|
||||
import tempfile
|
||||
import subprocess
|
||||
from lxml import etree
|
||||
|
||||
|
||||
def _iter_blocks(doc: Document):
    """Yield Paragraph and Table objects in document-body order.

    Word stores paragraphs (<w:p>) and tables (<w:tbl>) as siblings in the
    body; python-docx exposes them separately, so we walk the raw XML.
    """
    body = doc.element.body
    for element in body.iterchildren():
        local_name = element.tag.rsplit('}', 1)[-1]
        if local_name == 'p':
            yield Paragraph(element, doc)
        elif local_name == 'tbl':
            yield Table(element, doc)
|
||||
|
||||
|
||||
def _cell_text(cell) -> str:
|
||||
parts = []
|
||||
for p in cell.paragraphs:
|
||||
t = p.text or ""
|
||||
parts.append(t)
|
||||
return "\n".join([s for s in parts if s is not None])
|
||||
|
||||
|
||||
def _guess_lang(text: str) -> str:
|
||||
t = (text or "").strip()
|
||||
head = t[:512]
|
||||
if re.search(r"\b(package|import\s+java\.|public\s+class|public\s+static|private\s+static|@Override)\b", head):
|
||||
return "java"
|
||||
if re.search(r"\b(def\s+\w+\(|import\s+\w+|print\(|from\s+\w+\s+import)\b", head):
|
||||
return "python"
|
||||
if re.search(r"\b(function\s+\w+\(|console\.log|let\s+\w+|const\s+\w+|=>)\b", head):
|
||||
return "javascript"
|
||||
if re.search(r"^#include|\bint\s+main\s*\(\)", head):
|
||||
return "c"
|
||||
if re.search(r"\busing\s+namespace\b|\bstd::\b|\btemplate\b", head):
|
||||
return "cpp"
|
||||
if re.search(r"\b(SELECT|INSERT|UPDATE|DELETE|CREATE\s+TABLE|DROP\s+TABLE|ALTER\s+TABLE)\b", head, re.IGNORECASE):
|
||||
return "sql"
|
||||
if head.startswith("{") or head.startswith("["):
|
||||
return "json"
|
||||
if re.search(r"<html|<div|<span|<table|<code|<pre", head, re.IGNORECASE):
|
||||
return "html"
|
||||
if re.search(r"<\?xml|</?[A-Za-z0-9:_-]+>", head):
|
||||
return "xml"
|
||||
return ""
|
||||
|
||||
|
||||
def _table_to_md(tbl: Table) -> str:
    """Render a python-docx Table as Markdown.

    A 1x1 table is treated as a pasted code snippet (a common Word
    convention) and emitted as a fenced code block; any other table becomes
    a pipe table whose first row is the header.  Inline images inside cells
    are embedded as base64 data URIs.
    """
    rows = tbl.rows
    cols = tbl.columns
    if len(rows) == 1 and len(cols) == 1:
        # Single-cell tables are almost always code snippets.
        txt = _cell_text(rows[0].cells[0]).strip()
        lang = _guess_lang(txt)
        return f"```{lang}\n{txt}\n```\n"

    def _cell_inline_md(doc: Document, paragraph: Paragraph) -> str:
        # Flatten one paragraph to inline Markdown: text runs, '\n' for
        # explicit breaks, embedded images as data URIs.  Best-effort: any
        # failure degrades to whatever was collected so far.
        el = paragraph._element
        parts: List[str] = []
        try:
            for ch in el.iterchildren():
                tag = ch.tag.split('}')[-1]
                if tag == 'r':
                    for rc in ch.iterchildren():
                        rtag = rc.tag.split('}')[-1]
                        if rtag == 't':
                            s = rc.text or ''
                            if s:
                                parts.append(s)
                        elif rtag == 'br':
                            parts.append('\n')
                        elif rtag == 'drawing':
                            try:
                                for node in rc.iter():
                                    local = node.tag.split('}')[-1]
                                    rid = None
                                    if local == 'blip':
                                        # DrawingML image reference.
                                        rid = node.get(f"{{{NS['r']}}}embed") or node.get(f"{{{NS['r']}}}link")
                                    elif local == 'imagedata':
                                        # Legacy VML image reference.
                                        rid = node.get(f"{{{NS['r']}}}id")
                                    if not rid:
                                        continue
                                    try:
                                        # Resolve the relationship id to an image
                                        # part, tolerating several python-docx
                                        # versions' relationship APIs.
                                        part = None
                                        rp = getattr(doc.part, 'related_parts', None)
                                        if isinstance(rp, dict) and rid in rp:
                                            part = rp.get(rid)
                                        if part is None:
                                            rels = getattr(doc.part, 'rels', None)
                                            if rels is not None and hasattr(rels, 'get'):
                                                rel = rels.get(rid)
                                                part = getattr(rel, 'target_part', None)
                                        if part is None:
                                            rel = getattr(doc.part, '_rels', {}).get(rid)
                                            part = getattr(rel, 'target_part', None)
                                        ct = getattr(part, 'content_type', '') if part is not None else ''
                                        data = part.blob if part is not None and hasattr(part, 'blob') else None
                                        if data:
                                            b64 = base64.b64encode(data).decode('ascii')
                                            # BUG FIX: previously an empty f-string
                                            # was appended here, so b64/ct were
                                            # computed and the image silently lost.
                                            parts.append(f"![](data:{ct};base64,{b64})")
                                    except Exception:
                                        pass
                            except Exception:
                                pass
        except Exception:
            pass
        return ''.join(parts)

    out = []
    # python-docx table parent is the Document
    doc = getattr(tbl, '_parent', None) or getattr(tbl, 'part', None)
    for r_i, r in enumerate(rows):
        vals = []
        for c in r.cells:
            segs: List[str] = []
            for p in c.paragraphs:
                s = _cell_inline_md(doc, p)
                if s:
                    segs.append(s)
            cell_text = '<br>'.join(segs)
            # Escape pipes so cell content cannot break the table syntax.
            vals.append((cell_text or '').replace('|', '\\|').strip())
        out.append("| " + " | ".join(vals) + " |")
        if r_i == 0:
            # First row acts as the header; emit the separator after it.
            out.append("| " + " | ".join("---" for _ in vals) + " |")
    return "\n".join(out) + "\n"
|
||||
|
||||
|
||||
def _paragraph_to_md(p: Paragraph) -> str:
    """Render a plain paragraph: stripped text followed by a blank line."""
    body = (p.text or "").strip()
    return body + "\n\n"
|
||||
|
||||
|
||||
def convert_any(path: Path, mdx_safe_mode_enabled: bool = True) -> Tuple[str, str]:
    """Convert a .doc/.docx file at *path* to Markdown.

    Returns an ``(encoding, markdown)`` tuple where encoding is always
    "utf-8".  ``mdx_safe_mode_enabled`` is currently unused and kept only
    for interface compatibility.  Raises RuntimeError when the input is not
    a Word document (after an optional .doc -> .docx conversion step).
    """
    ext = path.suffix.lower()
    use_path = path
    if ext == ".doc":
        # Legacy binary .doc: convert via an external tool first.
        use_path = _convert_doc_to_docx_cross_platform(path)
    if use_path.suffix.lower() not in {".docx"}:
        raise RuntimeError("unsupported input for word2markdown")
    doc = Document(str(use_path))

    def _paragraph_with_images(doc: Document, p: Paragraph) -> str:
        # Flatten a paragraph to Markdown, embedding inline images as base64
        # data URIs.  FIX: hoisted out of the block loop — it used to be
        # re-defined on every paragraph iteration.
        el = p._element
        parts: List[str] = []
        try:
            for ch in el.iterchildren():
                tag = ch.tag.split('}')[-1]
                if tag == 'r':
                    for rc in ch.iterchildren():
                        rtag = rc.tag.split('}')[-1]
                        if rtag == 't':
                            s = rc.text or ''
                            if s:
                                parts.append(s)
                        elif rtag == 'br':
                            parts.append('\n')
                        elif rtag == 'drawing':
                            for node in rc.iter():
                                local = node.tag.split('}')[-1]
                                rid = None
                                if local == 'blip':
                                    rid = node.get(f"{{{NS['r']}}}embed") or node.get(f"{{{NS['r']}}}link")
                                elif local == 'imagedata':
                                    rid = node.get(f"{{{NS['r']}}}id")
                                if not rid:
                                    continue
                                try:
                                    # Resolve the relationship id to an image part,
                                    # tolerating several python-docx versions.
                                    part = None
                                    rp = getattr(doc.part, 'related_parts', None)
                                    if isinstance(rp, dict) and rid in rp:
                                        part = rp.get(rid)
                                    if part is None:
                                        rels = getattr(doc.part, 'rels', None)
                                        if rels is not None and hasattr(rels, 'get'):
                                            rel = rels.get(rid)
                                            part = getattr(rel, 'target_part', None)
                                    if part is None:
                                        rel = getattr(doc.part, '_rels', {}).get(rid)
                                        part = getattr(rel, 'target_part', None)
                                    ct = getattr(part, 'content_type', '') if part is not None else ''
                                    data = part.blob if part is not None and hasattr(part, 'blob') else None
                                    if data:
                                        b64 = base64.b64encode(data).decode('ascii')
                                        # BUG FIX: an empty f-string was appended
                                        # here, silently dropping the image.
                                        parts.append(f"![](data:{ct};base64,{b64})")
                                except Exception:
                                    pass
        except Exception:
            pass
        s = ''.join(parts).strip()
        return (s + '\n\n') if s else ''

    out: List[str] = []
    in_code = False
    code_lines: List[str] = []
    lang_hint: str = ''
    for blk in _iter_blocks(doc):
        if isinstance(blk, Table):
            out.append(_table_to_md(blk))
        elif isinstance(blk, Paragraph):
            # Text boxes, content controls, bordered and framed paragraphs
            # frequently hold pasted source code; emit each as a fenced block.
            # Order matters for output stability: textboxes, sdts, bordered,
            # framed — same order as before.
            for extractor in (_paragraph_textboxes, _paragraph_sdts,
                              _paragraph_bordered_text, _paragraph_framed):
                for piece in extractor(blk):
                    if piece.strip():
                        out.append(_md_code_block(piece.strip()))
            raw = (blk.text or "")
            sraw = raw.strip()
            if _looks_like_code_paragraph(sraw) or (in_code and sraw == ""):
                # Accumulate consecutive code-looking paragraphs (including
                # blank lines inside a run) into a single fence.
                if not in_code:
                    in_code = True
                    lang_hint = _guess_lang(sraw)
                    code_lines = []
                code_lines.append(raw)
                continue
            if in_code and code_lines:
                # Flush the open code fence before emitting normal text.
                text = "\n".join(code_lines)
                use_lang = lang_hint or _guess_lang(text)
                out.append(f"```{use_lang}\n{text}\n```\n")
                in_code = False
                code_lines = []
                lang_hint = ''
            txt = _paragraph_with_images(doc, blk)
            if txt.strip():
                out.append(txt)
    if in_code and code_lines:
        # Document ended while still inside a code fence.
        text = "\n".join(code_lines)
        use_lang = lang_hint or _guess_lang(text)
        out.append(f"```{use_lang}\n{text}\n```\n")
    try:
        # Append document-level text boxes that were not already captured,
        # de-duplicating against everything emitted so far (comparing fenced
        # blocks by their inner text).
        boxes = _doclevel_textboxes(doc)
        existing_texts = set()
        try:
            for seg in out:
                if isinstance(seg, str):
                    ss = seg.strip()
                    if ss.startswith("```"):
                        m = re.search(r"^```[\w-]*\n([\s\S]*?)\n```\s*$", ss)
                        if m:
                            existing_texts.add(m.group(1).strip())
                        continue
                    existing_texts.add(ss)
        except Exception:
            pass
        for tb in boxes:
            s = (tb or '').strip()
            if not s:
                continue
            if s in existing_texts:
                continue
            out.append(_md_code_block(s))
            existing_texts.add(s)
    except Exception:
        pass
    md = "".join(out)
    return "utf-8", md
|
||||
|
||||
# XML namespace map for the WordprocessingML / DrawingML / VML XPath
# queries and qualified-attribute lookups used throughout this module.
NS = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",  # main WordprocessingML
    "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",  # drawing anchors
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",  # DrawingML core
    "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",  # Word 2010+ shapes/text boxes
    "v": "urn:schemas-microsoft-com:vml",  # legacy VML (old text boxes/images)
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",  # relationship ids
    "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture",  # embedded pictures
}
|
||||
|
||||
|
||||
def _paragraph_textboxes(p: Paragraph) -> List[str]:
    """Collect text from text boxes anchored in paragraph *p*.

    Both modern DrawingML shapes (wps:txbx) and legacy VML text boxes
    (v:textbox) are scanned, in that order.  Returns one string per text
    box (its paragraphs joined by newlines); [] on any failure.
    """
    collected: List[str] = []
    try:
        el = p._element
        for query in ('.//wps:txbx/w:txbxContent', './/v:textbox/w:txbxContent'):
            for content in el.xpath(query, namespaces=NS):
                lines: List[str] = []
                for w_p in content.xpath('.//w:p', namespaces=NS):
                    runs = w_p.xpath('.//w:t', namespaces=NS)
                    text = ''.join(t.text or '' for t in runs).strip()
                    if text:
                        lines.append(text)
                if lines:
                    collected.append('\n'.join(lines))
        return collected
    except Exception:
        return []
|
||||
|
||||
|
||||
def _paragraph_sdts(p: Paragraph) -> List[str]:
    """Collect text from structured document tags (content controls) in *p*.

    Returns one string per w:sdt block (its paragraphs joined by newlines);
    [] on any failure.
    """
    results: List[str] = []
    try:
        for content in p._element.xpath('.//w:sdt/w:sdtContent', namespaces=NS):
            chunk: List[str] = []
            for w_p in content.xpath('.//w:p', namespaces=NS):
                runs = w_p.xpath('.//w:t', namespaces=NS)
                text = ''.join(t.text or '' for t in runs).strip()
                if text:
                    chunk.append(text)
            if chunk:
                results.append('\n'.join(chunk))
        return results
    except Exception:
        return []
|
||||
|
||||
|
||||
def _paragraph_bordered_text(p: Paragraph) -> List[str]:
    """Return [text] when *p* has paragraph borders (w:pBdr) and text.

    Bordered paragraphs are a common Word convention for code snippets.
    Returns [] when there is no border, no text, or on any failure.
    """
    try:
        bordered = bool(p._element.xpath('./w:pPr/w:pBdr', namespaces=NS))
        stripped = (p.text or '').strip()
        if bordered and stripped:
            return [stripped]
    except Exception:
        pass
    return []
|
||||
|
||||
|
||||
def _paragraph_framed(p: Paragraph) -> List[str]:
    """Return [text] when *p* is a framed paragraph (w:framePr) with text.

    Frames, like borders, often mark code or callout boxes.  Returns [] when
    there is no frame, no text, or on any failure.
    """
    try:
        framed = bool(p._element.xpath('./w:pPr/w:framePr', namespaces=NS))
        stripped = (p.text or '').strip()
        if framed and stripped:
            return [stripped]
    except Exception:
        pass
    return []
|
||||
|
||||
|
||||
def _md_code_block(text: str) -> str:
    """Wrap *text* in a fenced Markdown code block tagged with a guessed language."""
    fence = "```"
    return f"{fence}{_guess_lang(text)}\n{text}\n{fence}\n"
|
||||
|
||||
|
||||
def _looks_like_code_paragraph(t: str) -> bool:
|
||||
s = (t or '').strip()
|
||||
if not s:
|
||||
return False
|
||||
if s.startswith('{') or s.startswith('[') or s.endswith('}'):
|
||||
return True
|
||||
if s.startswith(' ') or s.startswith('\t'):
|
||||
return True
|
||||
if ';' in s or '{' in s or '}' in s:
|
||||
return True
|
||||
keywords = ['public static', 'private static', 'class ', 'return ', 'import ', 'package ', 'byte[]', 'String ', 'Cipher', 'KeyFactory']
|
||||
return any(k in s for k in keywords)
|
||||
|
||||
|
||||
def _doclevel_textboxes(doc: Document) -> List[str]:
    """Collect text from every text box anywhere in the document body.

    Scans modern DrawingML shapes (wps:txbx) first, then legacy VML boxes
    (v:textbox).  On failure, whatever was collected so far is returned
    (note: unlike the paragraph-level helpers, partial results survive).
    """
    found: List[str] = []
    try:
        body = doc.element.body
        for query in ('.//wps:txbx/w:txbxContent', './/v:textbox/w:txbxContent'):
            for content in body.xpath(query, namespaces=NS):
                lines: List[str] = []
                for w_p in content.xpath('.//w:p', namespaces=NS):
                    runs = w_p.xpath('.//w:t', namespaces=NS)
                    text = ''.join((t.text or '') for t in runs).strip()
                    if text:
                        lines.append(text)
                if lines:
                    found.append('\n'.join(lines))
    except Exception:
        pass
    return found
|
||||
|
||||
|
||||
def _convert_doc_to_docx_cross_platform(path: Path) -> Path:
    """Convert a legacy .doc file to .docx with the first available tool.

    Tries macOS ``textutil``, then LibreOffice ``soffice``, then ``unoconv``.
    Returns the path of the converted file.  Raises RuntimeError when no
    converter succeeds.
    """
    # 1) macOS textutil.
    tmp_path = None
    try:
        # FIX: the original called tmp.close() inside the with-block (the
        # context manager already closes it) and leaked the delete=False
        # file whenever textutil was missing or failed.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
            tmp_path = Path(tmp.name)
        # File is closed here so the external tool can overwrite it.
        subprocess.run(["textutil", "-convert", "docx", str(path), "-output", str(tmp_path)], check=True)
        if tmp_path.exists() and tmp_path.stat().st_size > 0:
            return tmp_path
    except Exception:
        pass
    if tmp_path is not None:
        try:
            tmp_path.unlink()  # remove the unused placeholder
        except OSError:
            pass
    # 2) LibreOffice headless.
    try:
        outdir = Path(tempfile.mkdtemp(prefix="doc2docx_"))
        subprocess.run(["soffice", "--headless", "--convert-to", "docx", "--outdir", str(outdir), str(path)], check=True)
        candidate = outdir / (path.stem + ".docx")
        if candidate.exists():
            return candidate
    except Exception:
        pass
    # 3) unoconv.
    out = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
            out = Path(tmp.name)
        subprocess.run(["unoconv", "-f", "docx", "-o", str(out), str(path)], check=True)
        if out.exists() and out.stat().st_size > 0:
            return out
    except Exception:
        pass
    if out is not None:
        try:
            out.unlink()  # remove the unused placeholder
        except OSError:
            pass
    raise RuntimeError("doc to docx conversion failed; please install 'soffice' or 'unoconv' or convert manually")
|
||||
Reference in New Issue
Block a user