Import project files

This commit is contained in:
2026-01-07 17:18:26 +08:00
parent 7d9fff2c34
commit 0b07e63b76
66 changed files with 11497 additions and 0 deletions

View File

@@ -0,0 +1,429 @@
from pathlib import Path
from typing import Tuple, List
from docx import Document
from docx.table import Table
from docx.text.paragraph import Paragraph
import re
import base64
import hashlib
import tempfile
import subprocess
from lxml import etree
def _iter_blocks(doc: Document):
    """Yield the document body's direct children in document order.

    ``w:p`` children are wrapped as ``Paragraph`` and ``w:tbl`` children as
    ``Table``, each with *doc* as parent. Any other child element is skipped.
    """
    wrappers = {'p': Paragraph, 'tbl': Table}
    for node in doc.element.body.iterchildren():
        # Tags look like "{namespace-uri}local"; keep only the local part.
        wrap = wrappers.get(node.tag.split('}')[-1])
        if wrap is not None:
            yield wrap(node, doc)
def _cell_text(cell) -> str:
    """Return the text of every paragraph in *cell*, joined with newlines.

    A paragraph whose ``text`` is empty or ``None`` contributes an empty line.
    """
    return "\n".join(paragraph.text or "" for paragraph in cell.paragraphs)
def _guess_lang(text: str) -> str:
    """Guess a Markdown code-fence language tag for *text*.

    Only the first 512 characters of the stripped text are inspected. The
    checks run in a fixed order and the first hit wins, so the result is a
    heuristic hint, not a reliable classification.

    Args:
        text: Candidate source code; ``None`` or empty input is allowed.

    Returns:
        One of ``"java"``, ``"python"``, ``"javascript"``, ``"cpp"``, ``"c"``,
        ``"sql"``, ``"json"``, ``"html"``, ``"xml"``, or ``""`` when nothing
        matches.
    """
    head = (text or "").strip()[:512]

    # NOTE: ``\b`` only matches next to a word character, so it must not be
    # placed after "(" or around "@"/"=>" -- doing so silently rejects
    # ordinary code such as ``def foo():`` or ``print("x")``.
    if re.search(
        r"\b(?:package|import\s+java\.|public\s+class|public\s+static|private\s+static)\b"
        r"|@Override\b",
        head,
    ):
        return "java"
    if re.search(
        r"\bdef\s+\w+\(|\bprint\(|\b(?:import\s+\w+|from\s+[\w.]+\s+import)\b",
        head,
    ):
        return "python"
    if re.search(
        r"\bfunction\s+\w+\(|\b(?:console\.log|let\s+\w+|const\s+\w+)\b|=>",
        head,
    ):
        return "javascript"
    # C++ markers are checked before the C ones: C++ sources also start with
    # ``#include`` and would otherwise always be labelled "c".
    if re.search(r"\busing\s+namespace\b|\bstd::\b|\btemplate\b", head):
        return "cpp"
    if re.search(r"^\s*#\s*include\b|\bint\s+main\s*\(", head, re.MULTILINE):
        return "c"
    if re.search(
        r"\b(SELECT|INSERT|UPDATE|DELETE|CREATE\s+TABLE|DROP\s+TABLE|ALTER\s+TABLE)\b",
        head,
        re.IGNORECASE,
    ):
        return "sql"
    if head.startswith(("{", "[")):
        return "json"
    if re.search(r"<html|<div|<span|<table|<code|<pre", head, re.IGNORECASE):
        return "html"
    if re.search(r"<\?xml|</?[A-Za-z0-9:_-]+>", head):
        return "xml"
    return ""
def _table_to_md(tbl: Table) -> str:
    """Render a python-docx table as Markdown.

    A 1x1 table is treated as a boxed code listing and returned as a fenced
    code block. Any other table becomes a pipe table whose first row is the
    header. Inside a cell, paragraphs and line breaks are joined with
    ``<br>`` (a raw newline would end the table row), ``|`` is escaped, and
    embedded images are inlined as base64 ``data:`` URIs.

    Args:
        tbl: The table to render.

    Returns:
        Markdown text ending with a newline.
    """
    rows = tbl.rows
    cols = tbl.columns
    if len(rows) == 1 and len(cols) == 1:
        txt = _cell_text(rows[0].cells[0]).strip()
        lang = _guess_lang(txt)
        return f"```{lang}\n{txt}\n```\n"

    rel_ns = NS['r']

    def _local(node) -> str:
        """Return the tag name without its namespace ('' for comments/PIs)."""
        tag = node.tag
        return tag.split('}')[-1] if isinstance(tag, str) else ''

    def _image_md(doc, rid: str) -> str:
        """Return a data-URI image link for relationship *rid*, or ''."""
        part = None
        related = getattr(doc.part, 'related_parts', None)
        if isinstance(related, dict) and rid in related:
            part = related.get(rid)
        if part is None:
            rels = getattr(doc.part, 'rels', None)
            if rels is not None and hasattr(rels, 'get'):
                part = getattr(rels.get(rid), 'target_part', None)
        if part is None:
            rel = getattr(doc.part, '_rels', {}).get(rid)
            part = getattr(rel, 'target_part', None)
        data = getattr(part, 'blob', None) if part is not None else None
        if not data:
            return ''
        ct = getattr(part, 'content_type', '')
        b64 = base64.b64encode(data).decode('ascii')
        return f"![Image](data:{ct};base64,{b64})"

    def _run_md(doc, run_el, parts: List[str]) -> None:
        """Append the text, breaks and images of one ``w:r`` to *parts*."""
        for rc in run_el.iterchildren():
            rtag = _local(rc)
            if rtag == 't':
                if rc.text:
                    parts.append(rc.text)
            elif rtag == 'br':
                parts.append('\n')
            elif rtag in ('drawing', 'pict'):
                # DrawingML pictures reference the image from a:blip; legacy
                # VML pictures (w:pict) reference it from v:imagedata.
                for node in rc.iter():
                    local = _local(node)
                    if local == 'blip':
                        rid = node.get("{%s}embed" % rel_ns) or node.get("{%s}link" % rel_ns)
                    elif local == 'imagedata':
                        rid = node.get("{%s}id" % rel_ns)
                    else:
                        continue
                    if not rid:
                        continue
                    try:
                        md = _image_md(doc, rid)
                    except Exception:
                        # Best effort: an unresolvable image (e.g. an external
                        # link) must not abort the table.
                        continue
                    if md:
                        parts.append(md)

    def _cell_inline_md(doc, paragraph: Paragraph) -> str:
        """Render one cell paragraph as inline Markdown."""
        parts: List[str] = []
        try:
            for ch in paragraph._element.iterchildren():
                tag = _local(ch)
                if tag == 'r':
                    _run_md(doc, ch, parts)
                elif tag == 'hyperlink':
                    # Runs nested in a hyperlink still carry visible text.
                    for run_el in ch.iterchildren():
                        if _local(run_el) == 'r':
                            _run_md(doc, run_el, parts)
        except Exception:
            # Best effort: keep whatever was collected before the failure.
            pass
        return ''.join(parts)

    out = []
    # When produced by _iter_blocks the table's parent is the Document.
    doc = getattr(tbl, '_parent', None) or getattr(tbl, 'part', None)
    for r_i, r in enumerate(rows):
        vals = []
        for c in r.cells:
            segs = [s for s in (_cell_inline_md(doc, p) for p in c.paragraphs) if s]
            cell_text = '<br>'.join(segs).replace('|', '\\|').strip()
            # A literal newline would terminate the Markdown table row.
            vals.append(cell_text.replace('\n', '<br>'))
        out.append("| " + " | ".join(vals) + " |")
        if r_i == 0:
            out.append("| " + " | ".join("---" for _ in vals) + " |")
    return "\n".join(out) + "\n"
def _paragraph_to_md(p: Paragraph) -> str:
    """Return the paragraph's stripped text followed by a blank line."""
    text = p.text or ""
    return f"{text.strip()}\n\n"
def convert_any(path: Path, mdx_safe_mode_enabled: bool = True) -> Tuple[str, str]:
    """Convert a Word document to Markdown.

    ``.doc`` input is first converted to a temporary ``.docx`` which is
    deleted again once it has been loaded. The body is then walked in order:
    tables go through ``_table_to_md``; text boxes, content controls and
    bordered/framed paragraphs become fenced code blocks; consecutive
    paragraphs that look like code are merged into one fenced block; every
    other paragraph is emitted as text with inline images. Finally, text
    boxes found anywhere in the body that were not emitted yet are appended.

    Args:
        path: Path to a ``.doc`` or ``.docx`` file.
        mdx_safe_mode_enabled: Accepted for interface compatibility; this
            function does not use it.

    Returns:
        A ``("utf-8", markdown)`` tuple.

    Raises:
        RuntimeError: If the input is not a Word document or the ``.doc``
            conversion fails.
    """
    ext = path.suffix.lower()
    use_path = path
    if ext == ".doc":
        use_path = _convert_doc_to_docx_cross_platform(path)
    if use_path.suffix.lower() not in {".docx"}:
        raise RuntimeError("unsupported input for word2markdown")
    try:
        doc = Document(str(use_path))
    finally:
        if use_path is not path:
            # The converted copy is a temp file that nobody else references.
            try:
                use_path.unlink()
                if use_path.parent.name.startswith("doc2docx_"):
                    use_path.parent.rmdir()
            except OSError:
                pass

    out: List[str] = []
    in_code = False
    code_lines: List[str] = []
    lang_hint: str = ''
    rel_ns = NS['r']

    def _flush_code() -> None:
        """Emit the pending run of code paragraphs, if any, and reset it."""
        nonlocal in_code, code_lines, lang_hint
        if in_code and code_lines:
            text = "\n".join(code_lines)
            use_lang = lang_hint or _guess_lang(text)
            out.append(f"```{use_lang}\n{text}\n```\n")
        in_code = False
        code_lines = []
        lang_hint = ''

    def _local(node) -> str:
        """Return the tag name without its namespace ('' for comments/PIs)."""
        tag = node.tag
        return tag.split('}')[-1] if isinstance(tag, str) else ''

    def _image_md(rid: str) -> str:
        """Return a data-URI image link for relationship *rid*, or ''."""
        part = None
        related = getattr(doc.part, 'related_parts', None)
        if isinstance(related, dict) and rid in related:
            part = related.get(rid)
        if part is None:
            rels = getattr(doc.part, 'rels', None)
            if rels is not None and hasattr(rels, 'get'):
                part = getattr(rels.get(rid), 'target_part', None)
        if part is None:
            rel = getattr(doc.part, '_rels', {}).get(rid)
            part = getattr(rel, 'target_part', None)
        data = getattr(part, 'blob', None) if part is not None else None
        if not data:
            return ''
        ct = getattr(part, 'content_type', '')
        b64 = base64.b64encode(data).decode('ascii')
        return f"![Image](data:{ct};base64,{b64})"

    def _run_md(run_el, parts: List[str], with_text: bool) -> None:
        """Append one run's text/breaks (optional) and images to *parts*."""
        for rc in run_el.iterchildren():
            rtag = _local(rc)
            if rtag == 't':
                if with_text and rc.text:
                    parts.append(rc.text)
            elif rtag == 'br':
                if with_text:
                    parts.append('\n')
            elif rtag in ('drawing', 'pict'):
                # a:blip belongs to DrawingML pictures, v:imagedata to legacy
                # VML pictures stored under w:pict.
                for node in rc.iter():
                    local = _local(node)
                    if local == 'blip':
                        rid = node.get("{%s}embed" % rel_ns) or node.get("{%s}link" % rel_ns)
                    elif local == 'imagedata':
                        rid = node.get("{%s}id" % rel_ns)
                    else:
                        continue
                    if not rid:
                        continue
                    try:
                        md = _image_md(rid)
                    except Exception:
                        # Best effort: skip images that cannot be resolved.
                        continue
                    if md:
                        parts.append(md)

    def _paragraph_with_images(p: Paragraph, with_text: bool = True) -> str:
        """Render a paragraph as Markdown text with inline images."""
        parts: List[str] = []
        try:
            for ch in p._element.iterchildren():
                tag = _local(ch)
                if tag == 'r':
                    _run_md(ch, parts, with_text)
                elif tag == 'hyperlink':
                    # Keep the visible text of hyperlinks.
                    for run_el in ch.iterchildren():
                        if _local(run_el) == 'r':
                            _run_md(run_el, parts, with_text)
        except Exception:
            # Best effort: keep whatever was collected before the failure.
            pass
        s = ''.join(parts).strip()
        return (s + '\n\n') if s else ''

    for blk in _iter_blocks(doc):
        if isinstance(blk, Table):
            # Emit pending code first so it stays ahead of the table.
            _flush_code()
            out.append(_table_to_md(blk))
            continue
        if not isinstance(blk, Paragraph):
            continue

        for extract in (_paragraph_textboxes, _paragraph_sdts):
            for s in extract(blk):
                if s.strip():
                    out.append(_md_code_block(s.strip()))

        # Bordered and framed paragraphs yield the paragraph's own text, so
        # it must not be emitted again through the normal path below.
        boxed: List[str] = []
        for extract in (_paragraph_bordered_text, _paragraph_framed):
            for s in extract(blk):
                s = s.strip()
                if s and s not in boxed:
                    boxed.append(s)
        if boxed:
            _flush_code()
            out.extend(_md_code_block(s) for s in boxed)
            images = _paragraph_with_images(blk, with_text=False)
            if images:
                out.append(images)
            continue

        raw = (blk.text or "")
        sraw = raw.strip()
        if _looks_like_code_paragraph(sraw) or (in_code and sraw == ""):
            if not in_code:
                in_code = True
                lang_hint = _guess_lang(sraw)
                code_lines = []
            code_lines.append(raw)
            continue

        _flush_code()
        txt = _paragraph_with_images(blk)
        if txt.strip():
            out.append(txt)
    _flush_code()

    # Append text boxes found anywhere in the body (e.g. inside tables) that
    # were not already emitted. Best effort: never fail the conversion here.
    try:
        boxes = _doclevel_textboxes(doc)
        existing_texts = set()
        for seg in out:
            ss = seg.strip()
            if ss.startswith("```"):
                m = re.search(r"^```[\w-]*\n([\s\S]*?)\n```\s*$", ss)
                if m:
                    existing_texts.add(m.group(1).strip())
                continue
            existing_texts.add(ss)
        for tb in boxes:
            s = (tb or '').strip()
            if not s or s in existing_texts:
                continue
            out.append(_md_code_block(s))
            existing_texts.add(s)
    except Exception:
        pass

    md = "".join(out)
    return "utf-8", md
# XML namespace prefixes used by the XPath queries and attribute lookups
# in this module.
NS = dict(
    # WordprocessingML and its drawing anchor
    w="http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    # DrawingML, Word 2010 shapes and legacy VML
    a="http://schemas.openxmlformats.org/drawingml/2006/main",
    wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
    v="urn:schemas-microsoft-com:vml",
    # Relationship ids and pictures
    r="http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    pic="http://schemas.openxmlformats.org/drawingml/2006/picture",
)
def _paragraph_textboxes(p: Paragraph) -> List[str]:
    """Return the text of every text box anchored in paragraph *p*.

    Both DrawingML (``wps:txbx``) and legacy VML (``v:textbox``) boxes are
    read. Each box yields one string with its non-empty paragraphs joined by
    newlines. A VML box whose text equals that of a DrawingML box in the same
    paragraph is skipped, since Word stores a VML fallback copy next to each
    DrawingML text box.

    Returns:
        The collected texts, or ``[]`` when the paragraph cannot be queried.
    """
    # python-docx overrides ``xpath()`` on its element classes and that
    # override takes no ``namespaces`` argument, so compiled lxml XPath
    # objects are used instead; they work on any element.
    find_paras = etree.XPath('.//w:p', namespaces=NS)
    find_texts = etree.XPath('.//w:t', namespaces=NS)

    def _collect(el, expr: str) -> List[str]:
        found: List[str] = []
        for tbox in etree.XPath(expr, namespaces=NS)(el):
            buf: List[str] = []
            for w_p in find_paras(tbox):
                s = ''.join(t.text or '' for t in find_texts(w_p)).strip()
                if s:
                    buf.append(s)
            if buf:
                found.append('\n'.join(buf))
        return found

    try:
        el = p._element
        texts = _collect(el, './/wps:txbx/w:txbxContent')
        modern = set(texts)
        texts.extend(
            s for s in _collect(el, './/v:textbox/w:txbxContent') if s not in modern
        )
        return texts
    except (etree.LxmlError, AttributeError):
        return []
def _paragraph_sdts(p: Paragraph) -> List[str]:
    """Return the text of paragraphs nested in content controls inside *p*.

    For every ``w:sdt/w:sdtContent`` below the paragraph, the non-empty
    ``w:p`` descendants are joined with newlines into one string.

    Returns:
        The collected texts, or ``[]`` when the paragraph cannot be queried.
    """
    # python-docx overrides ``xpath()`` on its element classes and that
    # override takes no ``namespaces`` argument, so compiled lxml XPath
    # objects are used instead; they work on any element.
    find_sdts = etree.XPath('.//w:sdt/w:sdtContent', namespaces=NS)
    find_paras = etree.XPath('.//w:p', namespaces=NS)
    find_texts = etree.XPath('.//w:t', namespaces=NS)
    try:
        texts: List[str] = []
        for sdt in find_sdts(p._element):
            buf: List[str] = []
            for w_p in find_paras(sdt):
                s = ''.join(t.text or '' for t in find_texts(w_p)).strip()
                if s:
                    buf.append(s)
            if buf:
                texts.append('\n'.join(buf))
        return texts
    except (etree.LxmlError, AttributeError):
        return []
def _paragraph_bordered_text(p: Paragraph) -> List[str]:
    """Return ``[text]`` if *p* has paragraph borders and non-empty text.

    A paragraph counts as bordered when its properties contain ``w:pBdr``.
    The returned text is the paragraph's own stripped text, so a caller that
    also emits the paragraph normally has to avoid printing it twice.

    Returns:
        A one-element list, or ``[]`` when there is no border, no text, or
        the paragraph cannot be queried.
    """
    try:
        # python-docx's ``xpath()`` override takes no ``namespaces`` argument,
        # so a compiled lxml XPath is used instead.
        has_border = bool(etree.XPath('./w:pPr/w:pBdr', namespaces=NS)(p._element))
        t = (p.text or '').strip()
    except (etree.LxmlError, AttributeError):
        return []
    return [t] if has_border and t else []
def _paragraph_framed(p: Paragraph) -> List[str]:
    """Return ``[text]`` if *p* is a framed paragraph with non-empty text.

    A paragraph counts as framed when its properties contain ``w:framePr``.
    The returned text is the paragraph's own stripped text, so a caller that
    also emits the paragraph normally has to avoid printing it twice.

    Returns:
        A one-element list, or ``[]`` when there is no frame, no text, or
        the paragraph cannot be queried.
    """
    try:
        # python-docx's ``xpath()`` override takes no ``namespaces`` argument,
        # so a compiled lxml XPath is used instead.
        has_frame = bool(etree.XPath('./w:pPr/w:framePr', namespaces=NS)(p._element))
        t = (p.text or '').strip()
    except (etree.LxmlError, AttributeError):
        return []
    return [t] if has_frame and t else []
def _md_code_block(text: str) -> str:
    """Wrap *text* in a fenced code block tagged with its guessed language."""
    opening = f"```{_guess_lang(text)}"
    return f"{opening}\n{text}\n```\n"
def _looks_like_code_paragraph(t: str) -> bool:
    """Heuristically decide whether a paragraph's text is source code.

    A non-blank paragraph counts as code when it is indented, starts with
    ``{`` or ``[``, ends with ``}``, contains ``;``, ``{`` or ``}``, or
    contains one of a few Java-flavoured keywords. The rules are deliberately
    loose, so ordinary prose containing e.g. a semicolon also matches.

    Args:
        t: Paragraph text; ``None`` or empty input is allowed.

    Returns:
        ``True`` if the text looks like code, ``False`` otherwise (always
        ``False`` for blank text).
    """
    raw = t or ''
    s = raw.strip()
    if not s:
        return False
    # Indentation must be tested on the unstripped text; after strip() a
    # leading space or tab can never be present.
    if raw.startswith((' ', '\t')):
        return True
    if s.startswith(('{', '[')) or s.endswith('}'):
        return True
    if ';' in s or '{' in s or '}' in s:
        return True
    keywords = ['public static', 'private static', 'class ', 'return ', 'import ', 'package ', 'byte[]', 'String ', 'Cipher', 'KeyFactory']
    return any(k in s for k in keywords)
def _doclevel_textboxes(doc: Document) -> List[str]:
    """Return the text of every text box found anywhere in the document body.

    Both DrawingML (``wps:txbx``) and legacy VML (``v:textbox``) boxes are
    read. Each box yields one string with its non-empty paragraphs joined by
    newlines. A VML box whose text equals that of a DrawingML box is skipped,
    since Word stores a VML fallback copy next to each DrawingML text box.

    Returns:
        The texts collected so far; a query failure stops collection and
        returns what was gathered up to that point.
    """
    # python-docx overrides ``xpath()`` on its element classes and that
    # override takes no ``namespaces`` argument, so compiled lxml XPath
    # objects are used instead; they work on any element.
    find_paras = etree.XPath('.//w:p', namespaces=NS)
    find_texts = etree.XPath('.//w:t', namespaces=NS)

    def _box_text(tbox) -> str:
        buf: List[str] = []
        for w_p in find_paras(tbox):
            s = ''.join((t.text or '') for t in find_texts(w_p)).strip()
            if s:
                buf.append(s)
        return '\n'.join(buf)

    texts: List[str] = []
    try:
        el = doc.element.body
        for tbox in etree.XPath('.//wps:txbx/w:txbxContent', namespaces=NS)(el):
            s = _box_text(tbox)
            if s:
                texts.append(s)
        modern = set(texts)
        for tbox in etree.XPath('.//v:textbox/w:txbxContent', namespaces=NS)(el):
            s = _box_text(tbox)
            if s and s not in modern:
                texts.append(s)
    except (etree.LxmlError, AttributeError):
        pass
    return texts
def _convert_doc_to_docx_cross_platform(path: Path) -> Path:
    """Convert a legacy ``.doc`` file to ``.docx`` with an external tool.

    The converters are tried in this order: ``textutil``, ``soffice``
    (headless), ``unoconv``. The first one that succeeds wins. Temporary
    files and directories created for a converter that failed are removed
    before the next one is tried.

    Args:
        path: Path to the ``.doc`` file.

    Returns:
        Path to the converted ``.docx`` in a temporary location; the caller
        owns that file.

    Raises:
        RuntimeError: If every converter fails. The last underlying error is
            attached as ``__cause__``.
    """
    import shutil  # local import: only needed to remove the soffice output dir

    def _discard(tmp_path: Path) -> None:
        """Delete a temp file, ignoring the case where it is already gone."""
        try:
            tmp_path.unlink()
        except OSError:
            pass

    last_error = None

    # 1) textutil
    target = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
            target = Path(tmp.name)
        subprocess.run(["textutil", "-convert", "docx", str(path), "-output", str(target)], check=True)
        return target
    except Exception as exc:
        last_error = exc
        if target is not None:
            _discard(target)

    # 2) LibreOffice; it names the output after the input file's stem.
    outdir = None
    try:
        outdir = Path(tempfile.mkdtemp(prefix="doc2docx_"))
        subprocess.run(["soffice", "--headless", "--convert-to", "docx", "--outdir", str(outdir), str(path)], check=True)
        candidate = outdir / (path.stem + ".docx")
        if candidate.exists():
            return candidate
        last_error = FileNotFoundError(str(candidate))
    except Exception as exc:
        last_error = exc
    if outdir is not None:
        shutil.rmtree(outdir, ignore_errors=True)

    # 3) unoconv
    target = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
            target = Path(tmp.name)
        subprocess.run(["unoconv", "-f", "docx", "-o", str(target), str(path)], check=True)
        # The temp file is created up front, so mere existence proves
        # nothing; require that something was actually written.
        if target.exists() and target.stat().st_size > 0:
            return target
        last_error = FileNotFoundError(str(target))
    except Exception as exc:
        last_error = exc
    if target is not None:
        _discard(target)

    raise RuntimeError("doc to docx conversion failed; please install 'soffice' or 'unoconv' or convert manually") from last_error