Files
FunMD_Convert/docling/app/server.py

3193 lines
139 KiB
Python
Raw Normal View History

2026-01-07 17:18:26 +08:00
from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request, Query
from fastapi.responses import Response, HTMLResponse, JSONResponse, FileResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from pathlib import Path
import tempfile
import os
import asyncio
from typing import Optional, List, Dict, Tuple
from datetime import timedelta
import mimetypes
from urllib.request import urlopen, Request
from urllib.error import HTTPError, URLError
from urllib.parse import urlsplit, urlunsplit, quote, unquote
import logging
import traceback
import time
import re
import io
import shutil
import uuid
import subprocess
import sys
import json
try:
from minio import Minio # type: ignore
import urllib3 # type: ignore
except Exception:
Minio = None
urllib3 = None # type: ignore
from pydantic import BaseModel
class ConvertResponse(BaseModel):
    """Schema for a conversion result that has been stored in MinIO.

    The field names match keys of the JSON payload built by the ``/convert``
    endpoint in this module.
    """

    # Public (unsigned) URL of the stored object, when one can be built.
    minio_url: Optional[str]
    # Pre-signed read URL of the stored object, when one can be issued.
    minio_presigned_url: Optional[str]
    # Output file name.
    name: str
    # MIME type of the converted output.
    media_type: str
class MinioPresignResponse(BaseModel):
    """Response schema of the ``/minio/presign`` endpoint."""

    # Bucket that holds the object.
    bucket: str
    # Object key inside the bucket.
    object: str
    # Public (unsigned) URL, or None when no public base is configured.
    minio_url: Optional[str]
    # Pre-signed read URL as returned by ``presigned_read``.
    minio_presigned_url: Optional[str]
    # Expiry value that was passed to ``presigned_read`` (presumably seconds — TODO confirm).
    expires: int
try:
import fitz # type: ignore
except Exception:
fitz = None # type: ignore
from app.services.docling_adapter import (
convert_source,
md_to_docx_bytes,
md_to_pdf_bytes_with_renderer,
infer_basename,
sanitize_filename,
load_linkmap,
save_linkmap,
)
from app.services.unified_converter import FormatConverter
from app.services.minio_utils import minio_current, join_prefix, presigned_read
2026-01-13 22:56:22 +08:00
from app.services.pdf_converter import (
word_to_pdf_bytes,
markdown_to_pdf_bytes,
markdown_file_to_pdf_bytes,
read_file_content,
)
2026-01-07 17:18:26 +08:00
"""
@api Server Application
@description FastAPI server providing document conversion endpoints and MinIO integration
"""
# Application instance. CORS is fully open: any origin, method and header is
# accepted.
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# Best-effort mounting of the built frontend (``<repo>/frontend/dist``):
#   /ui        -> the whole dist directory (html=True serves index.html)
#   /assets    -> dist/assets, if present
#   /vite.svg  -> dist/vite.svg, if present
# Every step is wrapped in try/except so a missing or broken frontend build
# never prevents the API from starting.
try:
    _ui_dir = Path(__file__).resolve().parents[2] / "frontend" / "dist"
    if _ui_dir.exists():
        app.mount("/ui", StaticFiles(directory=str(_ui_dir), html=True), name="ui")
        try:
            assets_dir = _ui_dir / "assets"
            if assets_dir.exists():
                app.mount("/assets", StaticFiles(directory=str(assets_dir)), name="assets")
        except Exception:
            pass
        try:
            svg_path = _ui_dir / "vite.svg"
            if svg_path.exists():
                # The route is only registered when the file exists at import time.
                @app.get("/vite.svg")
                def _vite_svg():
                    return FileResponse(str(svg_path), media_type="image/svg+xml")
        except Exception:
            pass
except Exception:
    pass
@app.get("/health")
def health():
    """
    @function health
    @description Liveness probe; performs no checks and always reports "ok"
    @return {"status": "ok"}
    """
    payload = {"status": "ok"}
    return payload
def _convert_upload_result(enc, content, artifacts_dir, export, base):
    """
    @function _convert_upload_result
    @description Shared tail of the /convert endpoint: rewrites Markdown asset
                 links to MinIO (best effort), uploads the converted text and
                 builds the JSON response. Temporary directories are removed
                 whether or not the upload succeeds.
    @param enc Text encoding reported by the converter (falls back to utf-8)
    @param content Converted document text
    @param artifacts_dir Directory with extracted assets, or None/empty
    @param export Normalized export format
    @param base Sanitized base name (no extension) for the stored object
    @return JSONResponse with minio_url, minio_presigned_url, name, export, media_type
    @raises HTTPException 400 when MinIO is not configured or storing is disabled
    """
    from urllib.parse import quote as _quote

    out_ext = _export_ext(export)
    ct = _media_type(export)
    scratch_dir = None  # only created when the converter produced no artifacts dir
    try:
        if export.lower() == "markdown":
            # Asset rewriting is best effort: a failure must not fail the conversion.
            try:
                client_rw, bucket_rw, public_rw, prefix_rw = minio_current(RUNTIME_CONFIG)
                if client_rw is not None and bucket_rw and public_rw:
                    if artifacts_dir:
                        base_dir = Path(artifacts_dir)
                    else:
                        scratch_dir = tempfile.mkdtemp(prefix="md_assets_")
                        base_dir = Path(scratch_dir)
                    new_text, _ms = _rewrite_md_assets_to_minio(
                        content,
                        base_dir,
                        client_rw,
                        bucket_rw,
                        public_rw,
                        prefix_rw,
                        search_root=(Path(artifacts_dir) if artifacts_dir else None),
                    )
                    content = new_text
                    try:
                        if artifacts_dir:
                            _bulk_upload_assets(Path(artifacts_dir), client_rw, bucket_rw, public_rw, prefix_rw)
                    except Exception:
                        pass
            except Exception:
                pass

        client, bucket, public_base, prefix = minio_current(RUNTIME_CONFIG)
        if client is None or not bucket or not public_base:
            raise HTTPException(status_code=400, detail="MinIO is not configured for save")
        rc_store_final = str(RUNTIME_CONFIG.get("minio", {}).get("store_final") or "true").lower() in {"1", "true", "yes", "on"}
        if not rc_store_final:
            raise HTTPException(status_code=400, detail="Saving to MinIO is disabled by configuration")

        out_name = f"{base}{out_ext}"
        obj = join_prefix(prefix, f"converted/{out_name}")
        raw = content.encode(enc or "utf-8")
        client.put_object(bucket_name=bucket, object_name=obj, data=io.BytesIO(raw), length=len(raw), content_type=ct)  # type: ignore
        # Percent-encode the key so names with spaces or non-ASCII characters
        # yield a valid URL for both the URL and the upload code paths.
        try:
            minio_url = f"{public_base}/{bucket}/{_quote(obj, safe='/')}"
        except Exception:
            minio_url = f"{public_base}/{bucket}/{obj}"
        exp = int(timedelta(hours=12).total_seconds())
        minio_presigned_url = presigned_read(client, bucket, obj, exp)
        return JSONResponse({
            "minio_url": minio_url,
            "minio_presigned_url": minio_presigned_url,
            "name": out_name,
            "export": export,
            "media_type": ct
        })
    finally:
        # Remove temporary directories on success and on failure alike.
        for tmp_dir in (scratch_dir, artifacts_dir):
            if tmp_dir:
                shutil.rmtree(tmp_dir, ignore_errors=True)


@app.post("/convert")
async def convert(
    file: Optional[UploadFile] = File(None),
    source_url: Optional[str] = Form(None),
    export: str = Form("markdown"),
    save: Optional[bool] = Form(False),
    filename: Optional[str] = Form(None),
):
    """
    @function convert
    @description Convert various document formats to Markdown/HTML/JSON
    @param file Uploaded file (optional)
    @param source_url URL of the source document (optional)
    @param export Target export format (default: markdown)
    @param save Accepted for compatibility; this handler does not read it and
                always stores the result in MinIO
    @param filename Custom filename for the output
    @return JSON response with the MinIO URLs of the stored result
    @raises HTTPException 400 when neither/both inputs are given or MinIO is unavailable;
            422 when the export format is unsupported
    """
    if (file is None and not source_url) or (file is not None and source_url):
        raise HTTPException(status_code=400, detail="provide exactly one of file or source_url")
    export = _normalize_export(export)

    if source_url:
        enc, content, artifacts_dir = await asyncio.to_thread(_converter_v2.convert, source_url, export=export)
        base = sanitize_filename(filename) if filename else sanitize_filename(infer_basename(source_url, None))
        return _convert_upload_result(enc, content, artifacts_dir, export, base)

    # The validation above guarantees an upload here; checked explicitly
    # because ``assert`` is stripped under ``python -O``.
    if file is None:
        raise HTTPException(status_code=400, detail="provide exactly one of file or source_url")
    suffix = ""
    if file.filename and "." in file.filename:
        suffix = "." + file.filename.rsplit(".", 1)[-1]
    # Spool the upload to disk: the converter is invoked with a path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(await file.read())
        tmp_path = tmp.name
    try:
        enc, content, artifacts_dir = await asyncio.to_thread(_converter_v2.convert, tmp_path, export=export)
        base = sanitize_filename(filename) if filename else sanitize_filename(infer_basename(None, file.filename))
        return _convert_upload_result(enc, content, artifacts_dir, export, base)
    finally:
        try:
            os.remove(tmp_path)
        except Exception:
            pass
# Directory holding saved configuration profiles (*.json), located next to
# this module; created at import time if it does not exist.
profiles_dir = Path(__file__).parent / "configs"
profiles_dir.mkdir(parents=True, exist_ok=True)
@app.get("/")
def index():
    """Return a small JSON banner identifying the service and its version."""
    banner = {"ok": True, "service": "docling-api", "version": "v2"}
    return JSONResponse(banner)
@app.get("/@vite/client")
def vite_client_stub():
    """Answer requests for the Vite dev client with a trivial JSON body."""
    body = {"ok": True}
    return JSONResponse(body)
@app.get("/refresh.js")
def refresh_js_stub():
    """Serve a no-op ``refresh.js`` that defines empty initClient/addRefresh hooks."""
    script = "window.initClient=function(){},window.addRefresh=function(){};"
    return Response(content=script, media_type="application/javascript")
# Process-wide, mutable runtime configuration. It is updated in place by the
# /config/* endpoints and by profile loading, and read via minio_current().
# All values are stored as strings (or None); boolean-like flags are parsed
# by their readers against {"1", "true", "yes", "on"}.
RUNTIME_CONFIG: Dict[str, Dict[str, Optional[str]]] = {
    "minio": {
        "endpoint": None,
        "public": None,
        "access": None,
        "secret": None,
        "bucket": None,
        "secure": None,
        "prefix": None,
        "store_final": "true",
        "public_read": "true",
    },
    "db": {
        "webhook_url": None,
        "token": None,
    },
}
def _normalize_export(export: str) -> str:
e = (export or "").strip().lower()
allowed = {"markdown", "html", "json", "doctags"}
if e not in allowed:
raise HTTPException(status_code=422, detail="unsupported export")
return e
def _normalize_engine(engine: Optional[str]) -> Optional[str]:
if engine is None:
return None
e = (engine or "").strip().lower()
allowed = {"docling", "word2markdown", "pandoc", "custom"}
if e not in allowed:
raise HTTPException(status_code=422, detail="unsupported engine")
return e
def _fix_garbled_name(name: str) -> str:
try:
s = name
t = s.strip()
# If pure ASCII, no fix needed
if all(ord(c) < 128 for c in t):
return name
# Try to reconstruct original bytes assuming CP437 (Zip default when UTF-8 flag not set)
try:
raw = s.encode("cp437", errors="strict")
except UnicodeEncodeError:
# Not CP437 mojibake, keep original
return name
encs = [
"gb18030",
"gbk",
"cp936",
"utf-8",
"big5",
"cp950",
"shift_jis",
"cp932",
"cp949",
"euc-kr",
"euc-jp",
]
for e in encs:
try:
fixed = raw.decode(e)
if fixed:
return fixed
except Exception:
continue
except Exception:
pass
return name
def _safe_target(base: Path, name: str) -> Optional[Path]:
try:
n = name.replace("\\", "/").lstrip("/")
parts = [p for p in n.split("/") if p and p not in {".", ".."}]
tgt = base / "/".join(parts)
rp = tgt.resolve()
rb = base.resolve()
try:
rp.relative_to(rb)
except Exception:
return None
return rp
except Exception:
return None
def _zip_extract_safely(zf: object, dest: Path) -> None:
    """Extract every member of an opened zip archive below ``dest``.

    Names without the UTF-8 flag (bit 0x800) go through
    ``_fix_garbled_name``; every name is mapped with ``_safe_target`` and
    members that do not map to a path are skipped. Extraction is best effort:
    a failing member is skipped and no error is raised.
    """
    try:
        for info in zf.infolist():  # type: ignore
            try:
                raw_name = str(getattr(info, "filename", ""))
                flags = int(getattr(info, "flag_bits", 0))
                utf8_flagged = (flags & 0x800) != 0
                member_name = raw_name if utf8_flagged else _fix_garbled_name(raw_name)
                target = _safe_target(dest, member_name)
                if target is None:
                    continue
                if hasattr(info, "is_dir") and info.is_dir():  # type: ignore
                    target.mkdir(parents=True, exist_ok=True)
                    continue
                target.parent.mkdir(parents=True, exist_ok=True)
                with zf.open(info, "r") as member:  # type: ignore
                    payload = member.read()
                with open(target, "wb") as sink:
                    sink.write(payload)
            except Exception:
                continue
    except Exception:
        pass
def _tar_extract_safely(tf: object, dest: Path) -> None:
    """Extract every member of an opened tar archive below ``dest``.

    Every name goes through ``_fix_garbled_name`` and ``_safe_target``.
    Directories are created; members for which ``extractfile`` returns
    ``None`` are skipped. Extraction is best effort: a failing member is
    skipped and no error is raised.
    """
    try:
        for member in tf.getmembers():  # type: ignore
            try:
                member_name = _fix_garbled_name(str(getattr(member, "name", "")))
                target = _safe_target(dest, member_name)
                if target is None:
                    continue
                is_dir = getattr(member, "isdir", lambda: False)
                if is_dir():
                    target.mkdir(parents=True, exist_ok=True)
                    continue
                target.parent.mkdir(parents=True, exist_ok=True)
                stream = tf.extractfile(member)  # type: ignore
                if stream is None:
                    continue
                payload = stream.read()
                with open(target, "wb") as sink:
                    sink.write(payload)
            except Exception:
                continue
    except Exception:
        pass
def _minio_head_bucket(client: object, bucket: str) -> bool:
try:
if hasattr(client, "bucket_exists"):
try:
return bool(client.bucket_exists(bucket)) # type: ignore
except Exception:
pass
try:
region = client._get_region(bucket) # type: ignore
except Exception:
region = "us-east-1"
client._url_open(method="HEAD", region=region, bucket_name=bucket) # type: ignore
return True
except Exception:
try:
names = [getattr(b, "name", None) for b in client.list_buckets()] # type: ignore
return bucket in set(n for n in names if n)
except Exception:
return False
def _minio_create_bucket(client: object, bucket: str) -> bool:
# Prefer SDK methods, fallback to low-level call
try:
if hasattr(client, "bucket_exists"):
try:
if client.bucket_exists(bucket): # type: ignore
return True
except Exception:
pass
if hasattr(client, "make_bucket"):
try:
client.make_bucket(bucket) # type: ignore
return True
except Exception:
try:
region = client._get_region(bucket) # type: ignore
except Exception:
region = "us-east-1"
try:
client.make_bucket(bucket, location=region) # type: ignore
return True
except Exception:
pass
try:
try:
region = client._get_region(bucket) # type: ignore
except Exception:
region = "us-east-1"
client._url_open(method="PUT", region=region, bucket_name=bucket) # type: ignore
return True
except Exception as ce:
if "BucketAlreadyOwnedByYou" in str(ce) or "BucketAlreadyExists" in str(ce):
return True
raise
except Exception as e:
raise e
def _minio_client(endpoint: str, access: str, secret: str, secure: bool):
    """Build a ``Minio`` client, with short timeouts when urllib3 is available.

    With urllib3 the client gets a ``PoolManager`` using a 3 s connect and
    20 s read timeout. If that construction fails, or urllib3 is missing, a
    plain client is returned.
    """
    common = dict(endpoint=endpoint, access_key=access, secret_key=secret, secure=secure)
    if urllib3 is not None:
        try:
            pool = urllib3.PoolManager(timeout=urllib3.Timeout(connect=3.0, read=20.0))
            return Minio(http_client=pool, **common)  # type: ignore
        except Exception:
            pass
    return Minio(**common)  # type: ignore
def _minio_time_hint(endpoint: str, secure: bool) -> Optional[str]:
try:
scheme = "https" if secure else "http"
r = urlopen(f"{scheme}://{endpoint}", timeout=3)
srv_date = r.headers.get("Date")
if not srv_date:
return None
from email.utils import parsedate_to_datetime
from datetime import datetime, timezone
dt = parsedate_to_datetime(srv_date)
now = datetime.now(timezone.utc)
diff = abs((now - dt).total_seconds())
return f"服务器时间与本机相差约 {int(diff)}"
except Exception:
return None
def _db_notify(payload: Dict[str, object]):
    """POST ``payload`` as JSON to the configured DB webhook (fire and forget).

    Does nothing when ``requests`` cannot be imported or no webhook URL is
    configured. A bearer token is attached when one is configured. Delivery
    errors are swallowed.
    """
    try:
        import requests  # type: ignore
    except Exception:
        return
    target = (RUNTIME_CONFIG.get("db", {}).get("webhook_url") or "").strip()
    if not target:
        return
    bearer = RUNTIME_CONFIG.get("db", {}).get("token") or ""
    request_headers = {"Content-Type": "application/json"}
    if bearer:
        request_headers["Authorization"] = f"Bearer {bearer}"
    try:
        requests.post(target, json=payload, headers=request_headers, timeout=5)
    except Exception:
        pass
@app.post("/config/minio")
async def set_minio_config(
    endpoint: str = Form(...),
    public: Optional[str] = Form(None),
    access: str = Form(...),
    secret: str = Form(...),
    bucket: str = Form(...),
    secure: Optional[str] = Form("false"),
    prefix: Optional[str] = Form(None),
    store_final: Optional[str] = Form("true"),
    public_read: Optional[str] = Form("true"),
):
    """
    @function set_minio_config
    @description Normalize the submitted MinIO settings, store them in
                 RUNTIME_CONFIG and apply or remove the public-read bucket policy
    @param endpoint MinIO API endpoint; a scheme and path are stripped
    @param public Public base URL; dropped when it points at the console
    @param access Access key
    @param secret Secret key
    @param bucket Bucket name
    @param secure Boolean-like string selecting https
    @param prefix Object key prefix
    @param store_final Boolean-like string, stored as given
    @param public_read Boolean-like string; true applies an anonymous read policy
    @return {"ok": True} on success, {"ok": False, "error": ...} otherwise
    """
    # Reduce the endpoint to host[:port].
    ep_raw = (endpoint or "").strip()
    ep_host = ep_raw
    try:
        from urllib.parse import urlsplit
        u = urlsplit(ep_raw)
        if u.scheme:
            ep_host = (u.netloc or ep_raw).split("/")[0]
        else:
            ep_host = ep_raw.split("/")[0]
    except Exception:
        ep_host = ep_raw.split("/")[0]
    # reject console port or console paths for endpoint
    try:
        if (":9001" in ep_host) or ("/browser" in ep_raw) or ("/minio" in ep_raw):
            return {"ok": False, "error": "请使用 MinIO API 端口 9000而非 9001 控制台)"}
    except Exception:
        pass
    # A public URL that points at the console is discarded, not rejected.
    pub_val = public
    try:
        from urllib.parse import urlsplit
        pu = urlsplit((public or "").strip())
        if (pu.netloc.endswith(":9001") or "/browser" in (public or "") or "/minio" in (public or "")):
            pub_val = None
    except Exception:
        if public and (":9001" in public or "/browser" in public or "/minio" in public):
            pub_val = None
    # ensure public has scheme
    try:
        if pub_val:
            from urllib.parse import urlsplit
            pu = urlsplit(pub_val.strip())
            scheme = pu.scheme or ("https" if str(secure or "false").lower() in {"1","true","yes","on"} else "http")
            # Only scheme and host are kept; any path on the public URL is dropped.
            host = pu.netloc or pu.path.split("/")[0]
            pub_val = f"{scheme}://{host}"
    except Exception:
        try:
            if pub_val:
                host = pub_val.strip().split("/")[0]
                scheme = "https" if str(secure or "false").lower() in {"1","true","yes","on"} else "http"
                pub_val = f"{scheme}://{host}"
        except Exception:
            pass
    # NOTE: the config is written before it is validated, so an invalid
    # submission stays in RUNTIME_CONFIG even though an error is returned.
    RUNTIME_CONFIG["minio"].update({
        "endpoint": ep_host,
        "public": pub_val,
        "access": access,
        "secret": secret,
        "bucket": bucket,
        "secure": secure,
        "prefix": prefix,
        "store_final": store_final,
        "public_read": public_read,
    })
    client, bkt, pub, _ = minio_current(RUNTIME_CONFIG)
    if client is None or not bkt or not pub:
        return {"ok": False, "error": "MinIO config invalid"}
    # Policy changes are best effort: failures are ignored and the call still
    # reports success.
    try:
        pr = str(public_read or "true").lower() in {"1","true","yes","on"}
        if pr:
            # Anonymous read policy: bucket listing/location plus object reads.
            policy = {
                "Version": "2012-10-17",
                "Statement": [
                    {"Effect": "Allow", "Principal": "*", "Action": ["s3:GetBucketLocation", "s3:ListBucket"], "Resource": [f"arn:aws:s3:::{bkt}"]},
                    {"Effect": "Allow", "Principal": "*", "Action": ["s3:GetObject"], "Resource": [f"arn:aws:s3:::{bkt}/*"]},
                ],
            }
            import json as _json
            client.set_bucket_policy(bucket_name=bkt, policy=_json.dumps(policy))  # type: ignore
        else:
            try:
                client.delete_bucket_policy(bkt)  # type: ignore
            except Exception:
                pass
    except Exception:
        pass
    return {"ok": True}
@app.post("/config/minio/test")
async def test_minio_config(
    endpoint: str = Form(...),
    public: Optional[str] = Form(None),
    access: str = Form(...),
    secret: str = Form(...),
    bucket: str = Form(...),
    secure: Optional[str] = Form("false"),
    create_if_missing: Optional[str] = Form("true"),
    public_read: Optional[str] = Form("false"),
):
    """
    @function test_minio_config
    @description Try the submitted MinIO settings without storing them in
                 RUNTIME_CONFIG. This is not read-only: it may create the bucket
                 and it applies or removes the bucket policy.
    @param endpoint MinIO API endpoint; a scheme and path are stripped
    @param public Public base URL, used only to build the hint
    @param access Access key
    @param secret Secret key
    @param bucket Bucket to probe
    @param secure Boolean-like string selecting https
    @param create_if_missing Boolean-like string; create the bucket when absent
    @param public_read Boolean-like string; true applies an anonymous read policy
    @return Dict with ok, connected, bucket_exists and either created/hint or error/hint
    """
    if Minio is None:
        return {"ok": False, "connected": False, "bucket_exists": False, "error": "minio client not available"}
    try:
        sec = str(secure or "false").lower() in {"1","true","yes","on"}
        # Reduce the endpoint to host[:port].
        ep_raw = (endpoint or "").strip()
        ep_host = ep_raw
        try:
            from urllib.parse import urlsplit
            u = urlsplit(ep_raw)
            if u.scheme:
                ep_host = (u.netloc or ep_raw).split("/")[0]
            else:
                ep_host = ep_raw.split("/")[0]
        except Exception:
            ep_host = ep_raw.split("/")[0]
        # Reject the console port and console paths.
        if ":9001" in ep_host or "/browser" in ep_raw or "/minio" in ep_raw:
            return {"ok": False, "connected": False, "bucket_exists": False, "error": "请使用 MinIO API 端口 9000而非 9001 控制台)"}
        client = _minio_client(endpoint=ep_host, access=access, secret=secret, secure=sec)
        # handshake fallback
        # An SSL-looking failure on https triggers a retry over plain http.
        # Any other list_buckets failure is ignored here.
        try:
            try:
                client.list_buckets()  # type: ignore
            except Exception as e:
                if sec and ("SSL" in str(e) or "HTTPSConnectionPool" in str(e) or "SSLError" in str(e)):
                    client = _minio_client(endpoint=ep_host, access=access, secret=secret, secure=False)
                    sec = False
        except Exception:
            pass
        exists = False
        created = False
        exists = _minio_head_bucket(client, bucket)
        if not exists and str(create_if_missing or "true").lower() in {"1","true","yes","on"}:
            if _minio_create_bucket(client, bucket):
                exists = True
                created = True
        # Always apply or remove the policy according to public_read, even
        # when the bucket already existed.
        try:
            import json as _json
            if str(public_read or "false").lower() in {"1","true","yes","on"}:
                policy = {
                    "Version": "2012-10-17",
                    "Statement": [
                        {"Effect": "Allow", "Principal": "*", "Action": ["s3:GetBucketLocation", "s3:ListBucket"], "Resource": [f"arn:aws:s3:::{bucket}"]},
                        {"Effect": "Allow", "Principal": "*", "Action": ["s3:GetObject"], "Resource": [f"arn:aws:s3:::{bucket}/*"]},
                    ],
                }
                client.set_bucket_policy(bucket_name=bucket, policy=_json.dumps(policy))  # type: ignore
            else:
                try:
                    client.delete_bucket_policy(bucket)  # type: ignore
                except Exception:
                    pass
        except Exception:
            pass
        # NOTE: "connected" is True whenever this point is reached, including
        # when the list_buckets probe above failed for a non-SSL reason.
        return {"ok": True, "connected": True, "bucket_exists": exists, "created": created, "hint": ("使用 HTTPS 访问 9000 端口可能失败,请确认启用 HTTPS 与证书配置匹配" if sec and (public or "").startswith("http://") else None)}
    except Exception as e:
        hint = None
        # Signature errors caused by clock skew get a time-difference hint.
        if "RequestTimeTooSkewed" in str(e):
            hint = _minio_time_hint(ep_host, sec)
        return {"ok": False, "connected": False, "bucket_exists": False, "error": str(e), "hint": hint}
@app.get("/config/profile/list")
async def list_profiles():
    """List the distinct stems of all ``*.json`` files below ``profiles_dir``.

    The search is recursive. Errors while scanning are ignored and whatever
    was collected so far is returned, sorted.
    """
    stems = set()
    try:
        for profile_path in profiles_dir.rglob("*.json"):
            stems.add(profile_path.stem)
    except Exception:
        pass
    return {"ok": True, "profiles": sorted(stems)}
@app.post("/config/profile/activate")
async def activate_profile(name: str = Form(...)):
    """
    @function activate_profile
    @description Load a saved profile (matched case-insensitively by file stem),
                 merge its "minio" and "db" sections into RUNTIME_CONFIG and
                 write the profile to configs/active.json
    @param name Profile name (file stem, without ".json")
    @return {"ok": True, "active": <stem>}
    @raises HTTPException 404 when no profile matches; 400 on any other failure
    """
    def _host_and_port(value: str):
        # Parse "host", "host:port" or "scheme://host:port/path" into
        # (host, port). Splitting on "/" alone would return "http:" for
        # values that carry a scheme.
        parsed = urlsplit(value if "://" in value else f"//{value}")
        host = parsed.hostname or ""
        if ":" in host:  # IPv6 literal: restore the brackets
            host = f"[{host}]"
        try:
            port = parsed.port
        except ValueError:
            port = None
        return host, port

    wanted = (name or "").strip().lower()
    target = None
    try:
        for p in profiles_dir.rglob("*.json"):
            if p.stem.lower() == wanted:
                target = p
                break
        if target is None:
            raise HTTPException(status_code=404, detail="profile not found")
        active_path = profiles_dir / "active.json"
        data = json.loads(target.read_text("utf-8"))
        # Apply the profile on top of the runtime configuration.
        try:
            minio_cfg = data.get("minio", {})
            if isinstance(minio_cfg, dict) and minio_cfg:
                sanitized = dict(minio_cfg)
                # An endpoint on the console port (9001) is moved to the API port.
                try:
                    ep = str(sanitized.get("endpoint") or "").strip()
                    if ep and ":9001" in ep:
                        host, port = _host_and_port(ep)
                        if host:
                            sanitized["endpoint"] = f"{host}:9000" if port is not None else host
                except Exception:
                    pass
                # A public URL pointing at the console is rewritten to the API port.
                try:
                    pub = str(sanitized.get("public") or "").strip()
                    if pub and (":9001" in pub or "/browser" in pub or "/minio" in pub):
                        host, _port = _host_and_port(pub)
                        if host:
                            sec = str(sanitized.get("secure") or RUNTIME_CONFIG.get("minio", {}).get("secure") or "false").lower() in {"1", "true", "yes", "on"}
                            scheme = "https" if sec else "http"
                            sanitized["public"] = f"{scheme}://{host}:9000"
                except Exception:
                    pass
                RUNTIME_CONFIG["minio"].update(sanitized)
        except Exception:
            pass
        try:
            db_cfg = data.get("db", {})
            if isinstance(db_cfg, dict) and db_cfg:
                RUNTIME_CONFIG["db"].update(db_cfg)
        except Exception:
            pass
        # Write active.json so that later observers can detect the change.
        active_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), "utf-8")
        return {"ok": True, "active": target.stem}
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
@app.get("/system/time/check")
def system_time_check(
    endpoint: Optional[str] = Query(None),
    public: Optional[str] = Query(None),
    secure: Optional[str] = Query(None),
):
    """
    @function system_time_check
    @description Compare the local clock with the Date header returned by the
                 MinIO server. The configured scheme is probed first, then the
                 opposite scheme.
    @param endpoint MinIO endpoint override (defaults to the runtime config)
    @param public Public base URL override; preferred over endpoint for the host
    @param secure Boolean-like string selecting https (defaults to the runtime config)
    @return {"ok": True, "server_time", "local_time", "diff_sec", "hint"} or
            {"ok": False, "error": ...}
    """
    try:
        from datetime import datetime, timezone
        from email.utils import parsedate_to_datetime

        # When this function is called directly from Python the parameters
        # still hold their Query(...) default objects rather than None, so
        # anything that is not a string is treated as "not provided".
        endpoint = endpoint if isinstance(endpoint, str) else None
        public = public if isinstance(public, str) else None
        secure = secure if isinstance(secure, str) else None

        rc = RUNTIME_CONFIG.get("minio", {})
        ep_raw = (endpoint or rc.get("endpoint") or "").strip()
        pub_raw = (public or rc.get("public") or "").strip()
        sec_flag = secure if secure is not None else (rc.get("secure") or "false")
        sec = str(sec_flag or "false").lower() in {"1", "true", "yes", "on"}
        scheme = "https" if sec else "http"

        # Resolve the host: prefer the public URL, fall back to the endpoint.
        def _host(s: str) -> str:
            try:
                from urllib.parse import urlsplit
                u = urlsplit(s)
                return (u.netloc or s).split("/")[0] if u.scheme else s.split("/")[0]
            except Exception:
                return s.split("/")[0]

        base_host = _host(pub_raw or ep_raw)
        if not base_host:
            now = datetime.now(timezone.utc)
            return {"ok": True, "server_time": None, "local_time": now.isoformat(), "diff_sec": None, "hint": "未配置 MinIO 端点"}

        def _date_header(target):
            # The context manager closes the response so sockets are not leaked.
            with urlopen(target, timeout=3) as r:
                return r.headers.get("Date")

        def _probe(probe_scheme: str) -> Optional[str]:
            # Try the root and the MinIO health endpoints: HEAD first, and a
            # plain GET only when the HEAD request itself fails.
            base = f"{probe_scheme}://{base_host}"
            for suffix in ("", "/minio/health/live", "/minio/health/ready", "/minio/health/version"):
                url = base + suffix
                try:
                    found = _date_header(Request(url, method="HEAD"))
                except Exception:
                    try:
                        found = _date_header(url)
                    except Exception:
                        found = None
                if found:
                    return found
            return None

        srv_date = _probe(scheme)
        if not srv_date:
            # Nothing answered on the configured scheme; try the other one once.
            srv_date = _probe("http" if scheme == "https" else "https")

        now = datetime.now(timezone.utc)
        diff = None
        if srv_date:
            try:
                dt = parsedate_to_datetime(srv_date)
                diff = int(abs((now - dt).total_seconds()))
            except Exception:
                diff = None
        hint = _minio_time_hint(base_host, sec)
        return {"ok": True, "server_time": srv_date, "local_time": now.isoformat(), "diff_sec": diff, "hint": hint}
    except Exception as e:
        return {"ok": False, "error": str(e)}
@app.post("/system/time/sync")
async def system_time_sync(method: Optional[str] = Form("auto"), ntp_server: Optional[str] = Form(None)):
    """
    @function system_time_sync
    @description Try to synchronize the system clock with sntp/ntpdate. On macOS
                 an elevated retry through osascript follows when the plain
                 commands fail. Stops at the first command that exits with 0.
    @param method "auto" (default), "sntp" or "ntpdate"
    @param ntp_server Optional extra NTP server, tried before the built-in ones
    @return {"ok": <bool>, "result": [per-command output], "check": <system_time_check result>}
    @raises HTTPException 400 when ntp_server is not a plain host name or IP address
    """
    mode = method if isinstance(method, str) and method else "auto"
    custom = ntp_server.strip() if isinstance(ntp_server, str) else ""
    # The server name is placed into an argv and, on macOS, into a shell
    # string run with administrator privileges. Only host-name/IP characters
    # are accepted, which rules out shell metacharacters and a leading "-".
    if custom and not re.fullmatch(r"[A-Za-z0-9](?:[A-Za-z0-9.:-]*[A-Za-z0-9])?", custom):
        raise HTTPException(status_code=400, detail="invalid ntp_server")

    servers = [s for s in [custom, "time.apple.com", "pool.ntp.org"] if s]
    cmds = []
    for srv in servers:
        if mode in {"auto", "sntp"}:
            cmds.append(["sntp", "-sS", srv])
        if mode in {"auto", "ntpdate"}:
            cmds.append(["ntpdate", "-u", srv])

    outputs = []

    def _run_until_success(commands, timeout) -> bool:
        # Run the commands in order, recording each outcome, until one exits with 0.
        for cmd in commands:
            try:
                p = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
                outputs.append({"cmd": " ".join(cmd), "code": p.returncode, "out": p.stdout, "err": p.stderr})
                if p.returncode == 0:
                    return True
            except Exception as e:
                outputs.append({"cmd": " ".join(cmd), "code": -1, "out": "", "err": str(e)})
        return False

    success = _run_until_success(cmds, 8)
    if not success and sys.platform == "darwin":
        elev_cmds = []
        for srv in servers:
            elev_cmds.append(["osascript", "-e", f'do shell script "sntp -sS {srv}" with administrator privileges'])
            elev_cmds.append(["osascript", "-e", f'do shell script "ntpdate -u {srv}" with administrator privileges'])
            elev_cmds.append(["osascript", "-e", f'do shell script "/usr/sbin/systemsetup -setnetworktimeserver {srv}" with administrator privileges'])
        elev_cmds.append(["osascript", "-e", 'do shell script "/usr/sbin/systemsetup -setusingnetworktime on" with administrator privileges'])
        success = _run_until_success(elev_cmds, 12)

    # Pass explicit values: a bare call would hand the Query(...) default
    # objects to system_time_check instead of None.
    chk = system_time_check(endpoint=None, public=None, secure=None)
    return {"ok": success, "result": outputs, "check": chk}
@app.get("/api/system/time/check")
def system_time_check_api(
    endpoint: Optional[str] = Query(None),
    public: Optional[str] = Query(None),
    secure: Optional[str] = Query(None),
):
    """Alias of ``/system/time/check`` under the ``/api`` prefix."""
    forwarded = {"endpoint": endpoint, "public": public, "secure": secure}
    return system_time_check(**forwarded)
@app.post("/api/system/time/sync")
async def system_time_sync_api(method: Optional[str] = Form("auto"), ntp_server: Optional[str] = Form(None)):
    """Alias of ``/system/time/sync`` under the ``/api`` prefix."""
    outcome = await system_time_sync(method=method, ntp_server=ntp_server)
    return outcome
async def _auto_time_calibration():
    """Check the clock skew after a short delay and sync when it exceeds 120 s.

    Best effort: every failure is swallowed so this never disturbs the caller.
    """
    try:
        await asyncio.sleep(1.0)
        # Pass explicit values: a bare call would hand the Query(...) default
        # objects to system_time_check, which then cannot read its arguments
        # and never reports a diff_sec.
        chk = system_time_check(endpoint=None, public=None, secure=None)
        try:
            diff = int((chk or {}).get("diff_sec") or 0)
        except Exception:
            diff = 0
        if diff > 120:
            try:
                await system_time_sync(method="auto", ntp_server=None)
            except Exception:
                pass
    except Exception:
        pass
@app.get("/config/minio/buckets")
def list_minio_buckets(
    endpoint: str,
    access: str,
    secret: str,
    secure: Optional[str] = "false",
):
    """List bucket names visible to the given credentials.

    The endpoint is passed to the client as given. Errors are reported in the
    payload instead of being raised.

    Returns:
        ``{"ok": True, "buckets": [...]}`` or
        ``{"ok": False, "error": ..., "buckets": []}``.
    """
    if Minio is None:
        return {"ok": False, "error": "minio client not available", "buckets": []}
    try:
        use_tls = str(secure or "false").lower() in {"1","true","yes","on"}
        client = _minio_client(endpoint=endpoint, access=access, secret=secret, secure=use_tls)
        bucket_names = []
        for entry in client.list_buckets():
            bucket_names.append(entry.name)
    except Exception as e:
        return {"ok": False, "error": str(e), "buckets": []}
    return {"ok": True, "buckets": bucket_names}
@app.post("/config/minio/create-bucket")
async def create_minio_bucket(
    endpoint: str = Form(...),
    access: str = Form(...),
    secret: str = Form(...),
    bucket: str = Form(...),
    secure: Optional[str] = Form("false"),
    public_read: Optional[str] = Form("false"),
):
    """
    @function create_minio_bucket
    @description Create a bucket (no-op when it already exists) and optionally
                 apply an anonymous read policy
    @param endpoint MinIO API endpoint; a scheme and path are stripped
    @param access Access key
    @param secret Secret key
    @param bucket Bucket to create
    @param secure Boolean-like string selecting https
    @param public_read Boolean-like string; true applies an anonymous read policy
    @return {"ok": True, "bucket_exists": True} or {"ok": False, "error": ..., "hint": ...}
    """
    if Minio is None:
        return {"ok": False, "error": "minio client not available"}
    try:
        sec = str(secure or "false").lower() in {"1","true","yes","on"}
        # Reduce the endpoint to host[:port].
        ep_raw = (endpoint or "").strip()
        ep_host = ep_raw
        try:
            from urllib.parse import urlsplit
            u = urlsplit(ep_raw)
            if u.scheme:
                ep_host = (u.netloc or ep_raw).split("/")[0]
            else:
                ep_host = ep_raw.split("/")[0]
        except Exception:
            ep_host = ep_raw.split("/")[0]
        # Reject the console port and console paths.
        if ":9001" in ep_host or "/browser" in ep_raw or "/minio" in ep_raw:
            return {"ok": False, "error": "请使用 MinIO API 端口 9000而非 9001 控制台)"}
        client = _minio_client(endpoint=ep_host, access=access, secret=secret, secure=sec)
        # An SSL-looking failure on https triggers a retry over plain http.
        # Any other list_buckets failure is ignored here.
        try:
            try:
                client.list_buckets()  # type: ignore
            except Exception as e:
                if sec and ("SSL" in str(e) or "HTTPSConnectionPool" in str(e) or "SSLError" in str(e)):
                    client = _minio_client(endpoint=ep_host, access=access, secret=secret, secure=False)
                    sec = False
        except Exception:
            pass
        # Raises when creation fails; that is handled by the outer except.
        _minio_create_bucket(client, bucket)
        # Applying the policy is best effort; failures are ignored.
        try:
            pr = str(public_read or "false").lower() in {"1","true","yes","on"}
            if pr:
                policy = {
                    "Version": "2012-10-17",
                    "Statement": [
                        {"Effect": "Allow", "Principal": {"AWS": ["*"]}, "Action": ["s3:GetBucketLocation", "s3:ListBucket"], "Resource": [f"arn:aws:s3:::{bucket}"]},
                        {"Effect": "Allow", "Principal": {"AWS": ["*"]}, "Action": ["s3:GetObject"], "Resource": [f"arn:aws:s3:::{bucket}/*"]},
                    ],
                }
                import json as _json
                client.set_bucket_policy(bucket, _json.dumps(policy))  # type: ignore
        except Exception:
            pass
        return {"ok": True, "bucket_exists": True}
    except Exception as e:
        hint = None
        # Signature errors caused by clock skew get a time-difference hint.
        if "RequestTimeTooSkewed" in str(e):
            hint = _minio_time_hint(ep_host, sec)
        return {"ok": False, "error": str(e), "hint": hint}
@app.post("/minio/presign", response_model=MinioPresignResponse)
async def minio_presign(
    url: Optional[str] = Form(None),
    object_name: Optional[str] = Form(None),
    bucket: Optional[str] = Form(None),
    expires: Optional[int] = Form(3600),
):
    """
    @function minio_presign
    @description Issue a pre-signed read URL for an object. The object is given
                 either by name or by a URL whose path is "/<bucket>/<key>".
    @param url Object URL, used only when object_name is empty
    @param object_name Object key
    @param bucket Bucket name (defaults to the configured bucket)
    @param expires Expiry passed to presigned_read (default 3600)
    @return MinioPresignResponse
    @raises HTTPException 400 when MinIO is not configured or bucket/object is missing
    """
    client, cfg_bucket, public_base, _ = minio_current(RUNTIME_CONFIG)
    if client is None:
        raise HTTPException(status_code=400, detail="MinIO 未配置")

    bucket_name = (bucket or cfg_bucket or "").strip()
    key = (object_name or "").strip()
    if url and not key:
        # Derive bucket and key from the URL path: first segment is the bucket.
        try:
            from urllib.parse import urlsplit, unquote
            segments = [seg for seg in (urlsplit((url or "").strip()).path or "").split("/") if seg]
            if segments:
                if not bucket_name:
                    bucket_name = segments[0]
                key = "/".join(segments[1:])
            key = unquote(key)
        except Exception:
            pass
    if not bucket_name or not key:
        raise HTTPException(status_code=400, detail="bucket 与 object_name/URL 不能为空")

    ttl = int(expires or 3600)
    signed = presigned_read(client, bucket_name, key, ttl)
    public_url = None
    try:
        from urllib.parse import quote as _quote
        if public_base:
            public_url = f"{public_base}/{bucket_name}/{_quote(key, safe='/')}"
    except Exception:
        public_url = None
    return MinioPresignResponse(
        bucket=bucket_name,
        object=key,
        minio_url=public_url,
        minio_presigned_url=signed,
        expires=ttl,
    )
@app.get("/minio/object")
def minio_object(bucket: Optional[str] = None, object: str = ""):
    """
    @function minio_object
    @description Proxy an object from MinIO to the client. The whole object is
                 read into memory before it is returned.
    @param bucket Bucket name (defaults to the configured bucket)
    @param object Object key; percent-encoding is decoded before use
    @return Response with the object bytes, served inline
    @raises HTTPException 400 when MinIO is not configured or bucket/object is missing;
            403 when the object cannot be read
    """
    client, cfg_bucket, public_base, _ = minio_current(RUNTIME_CONFIG)
    if client is None:
        raise HTTPException(status_code=400, detail="MinIO 未配置")
    bkt = (bucket or cfg_bucket or "").strip()
    obj_in = (object or "").strip()
    try:
        from urllib.parse import unquote as _unquote
        obj = _unquote(obj_in)
    except Exception:
        obj = obj_in
    if not bkt or not obj:
        raise HTTPException(status_code=400, detail="bucket 与 object 不能为空")

    # The stored content type is optional; a failing stat falls back to sniffing.
    ct = None
    try:
        try:
            st = client.stat_object(bucket_name=bkt, object_name=obj)  # type: ignore
        except TypeError:
            # Fall back to the positional call signature.
            st = client.stat_object(bkt, obj)  # type: ignore
        ct = getattr(st, "content_type", None)
    except Exception:
        ct = None

    data = b""
    try:
        try:
            resp = client.get_object(bucket_name=bkt, object_name=obj)  # type: ignore
        except TypeError:
            resp = client.get_object(bkt, obj)  # type: ignore
        try:
            data = resp.read()  # type: ignore
        finally:
            # Close the stream and hand the connection back to the pool;
            # closing alone leaves the pooled connection checked out.
            for finalizer in ("close", "release_conn"):
                try:
                    getattr(resp, finalizer)()  # type: ignore
                except Exception:
                    pass
    except Exception as e:
        raise HTTPException(status_code=403, detail=str(e))

    media = ct or detect_mime(obj, data)
    headers = {"Content-Disposition": "inline; filename*=UTF-8''" + quote(Path(obj).name)}
    return Response(content=data, media_type=media, headers=headers)
@app.post("/config/db")
async def set_db_config(webhook_url: Optional[str] = Form(None), token: Optional[str] = Form(None)):
    """Store the DB webhook URL and bearer token in ``RUNTIME_CONFIG["db"]``."""
    db_section = RUNTIME_CONFIG["db"]
    db_section.update({"webhook_url": webhook_url, "token": token})
    return {"ok": True}
@app.get("/config")
def get_config_snapshot():
    """Return the runtime configuration with credentials masked.

    The MinIO ``secret`` and the DB webhook ``token`` are replaced by
    ``"***"`` when set. Every other value is returned as stored.
    """
    def _masked(section: str, secret_key: str) -> dict:
        # Copy the section so RUNTIME_CONFIG itself is never altered.
        return {
            k: ("***" if k == secret_key and v else v)
            for k, v in RUNTIME_CONFIG.get(section, {}).items()
        }

    safe = {
        "minio": _masked("minio", "secret"),
        # The webhook bearer token is a credential too, so it is masked the
        # same way as the MinIO secret.
        "db": _masked("db", "token"),
    }
    return safe
@app.get("/config/profiles")
def list_profiles():
    """List the stems of ``*.json`` files directly inside ``profiles_dir``.

    The search is not recursive. Any scanning error yields an empty list.
    """
    try:
        stems = [profile_path.stem for profile_path in profiles_dir.glob("*.json")]
    except Exception:
        stems = []
    return {"ok": True, "profiles": sorted(stems)}
@app.post("/config/save_profile")
async def save_profile(name: str = Form(...)):
    """Write the current MinIO and DB runtime configuration to a profile file.

    The file is ``<profiles_dir>/<sanitized name>.json`` and holds the
    configuration values exactly as stored in ``RUNTIME_CONFIG``.

    Raises:
        HTTPException: 400 when the name is blank or the file cannot be written.
    """
    import json as _json

    if name.strip() == "":
        raise HTTPException(status_code=400, detail="name required")
    snapshot = {section: RUNTIME_CONFIG.get(section, {}) for section in ("minio", "db")}
    destination = profiles_dir / f"{sanitize_filename(name)}.json"
    try:
        serialized = _json.dumps(snapshot, ensure_ascii=False, indent=2)
        destination.write_text(serialized, "utf-8")
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
    return {"ok": True, "name": destination.stem}
@app.get("/config/load_profile")
def load_profile(name: str):
    """
    @function load_profile
    @description Merge a saved profile into the runtime config and verify MinIO is usable
    @param name Profile name; sanitized before being resolved to a file
    @return {"ok": True, "config": <profile contents>}
    @raises HTTPException 404 when the profile file is missing, 400 when it cannot be
            parsed or the resulting MinIO configuration is unusable
    """
    profile_path = profiles_dir / f"{sanitize_filename(name)}.json"
    if not profile_path.exists():
        raise HTTPException(status_code=404, detail="profile not found")
    try:
        profile = json.loads(profile_path.read_text("utf-8"))
        # The runtime config is updated first; validation below runs against the merged result.
        RUNTIME_CONFIG["minio"].update(profile.get("minio", {}))
        RUNTIME_CONFIG["db"].update(profile.get("db", {}))
        client, bkt, pub, _ = minio_current(RUNTIME_CONFIG)
        usable = client is not None and bkt and pub
        if not usable:
            raise HTTPException(status_code=400, detail="MinIO config invalid")
        return {"ok": True, "config": profile}
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
# ──────────────────────────────────────────────────────────────────────────────
# Auto-load DB and MinIO config from app/configs without restart or page refresh
# ──────────────────────────────────────────────────────────────────────────────
def _choose_default_config_file() -> Optional[Path]:
    """
    @function _choose_default_config_file
    @description Pick the profile JSON to auto-load from profiles_dir (searched recursively)
    @return A file whose stem is "active", "default" or "test" (in that priority, case-insensitive);
            otherwise the most recently modified JSON file; None if none exist or the scan fails
    """
    try:
        found: List[Path] = list(profiles_dir.rglob("*.json"))
        if len(found) == 0:
            return None
        # Later entries win when two files share a stem, matching dict insertion semantics.
        by_stem: Dict[str, Path] = {}
        for entry in found:
            by_stem[entry.stem.lower()] = entry
        for wanted in ("active", "default", "test"):
            hit = by_stem.get(wanted)
            if hit is not None:
                return hit
        return max(found, key=lambda entry: entry.stat().st_mtime)
    except Exception:
        return None
def _apply_configs_from_file(path: Path) -> None:
    """
    @function _apply_configs_from_file
    @description Merge the "db" and "minio" sections of a profile JSON file into RUNTIME_CONFIG
    @param path Profile JSON file to read

    DB values always overwrite the runtime values. MinIO values only fill keys that are
    currently None or "". Before merging, a MinIO endpoint/public URL that points at port
    9001 or a "/browser" / "/minio" path (presumably the MinIO console rather than the
    S3 API) is rewritten to port 9000. Best-effort: failures are logged, never raised.
    """
    try:
        data = json.loads(path.read_text("utf-8"))
        db_cfg = data.get("db", {})
        if isinstance(db_cfg, dict) and db_cfg:
            RUNTIME_CONFIG["db"].update(db_cfg)

        minio_cfg = data.get("minio", {})
        if not isinstance(minio_cfg, dict) or not minio_cfg:
            return
        sanitized = dict(minio_cfg)

        try:
            ep = str(sanitized.get("endpoint") or "").strip()
            if ep and ":9001" in ep:
                # Drop any scheme first; otherwise "http://host:9001" splits to "http:".
                netloc = ep.split("://", 1)[-1].split("/")[0]
                if ":" in netloc:
                    sanitized["endpoint"] = f"{netloc.split(':')[0]}:9000"
                else:
                    sanitized["endpoint"] = netloc
        except Exception:
            pass

        try:
            pub = str(sanitized.get("public") or "").strip()
            scheme, sep, rest = pub.partition("://")
            if not sep:
                scheme, rest = "", pub
            netloc, slash, tail = rest.partition("/")
            url_path = slash + tail
            # Match the console markers against the path only, so that a host such as
            # "//minio.example.com" is not mistaken for a "/minio" console path.
            if pub and (":9001" in rest or "/browser" in url_path or "/minio" in url_path):
                if not scheme:
                    secure_raw = sanitized.get("secure") or RUNTIME_CONFIG.get("minio", {}).get("secure") or "false"
                    secure = str(secure_raw).lower() in {"1", "true", "yes", "on"}
                    scheme = "https" if secure else "http"
                sanitized["public"] = f"{scheme}://{netloc.split(':')[0]}:9000"
        except Exception:
            pass

        # Only fill gaps: values already set at runtime take precedence over the file.
        for k, v in sanitized.items():
            try:
                cur = RUNTIME_CONFIG["minio"].get(k)
                if cur in (None, ""):
                    RUNTIME_CONFIG["minio"][k] = v
            except Exception:
                RUNTIME_CONFIG["minio"][k] = v
    except Exception:
        logging.warning("failed to apply config file %s", path, exc_info=True)
async def _watch_db_config_changes(interval_sec: float = 3.0) -> None:
    """
    @function _watch_db_config_changes
    @description Apply the default profile once, then poll forever and re-apply it whenever
                 the chosen file changes or its modification time advances
    @param interval_sec Seconds to sleep between polls
    """
    seen_path: Optional[Path] = _choose_default_config_file()
    seen_mtime: float = 0.0
    if seen_path and seen_path.exists():
        seen_mtime = seen_path.stat().st_mtime
    # Initial application at startup.
    if seen_path:
        _apply_configs_from_file(seen_path)
    while True:
        try:
            candidate = _choose_default_config_file()
            if candidate and candidate.exists():
                stamp = candidate.stat().st_mtime
                switched = candidate != seen_path
                if switched or stamp > seen_mtime:
                    _apply_configs_from_file(candidate)
                    seen_path, seen_mtime = candidate, stamp
        except Exception:
            # A failed poll must not stop the watcher.
            pass
        await asyncio.sleep(interval_sec)
@app.on_event("startup")
async def _startup_autoload_configs():
    """
    @function _startup_autoload_configs
    @description Startup hook that schedules the config watcher and the time calibration task
    """
    # Each factory is invoked inside its own try so one failure cannot block the other.
    background_jobs = (
        lambda: _watch_db_config_changes(interval_sec=3.0),
        lambda: _auto_time_calibration(),
    )
    for make_coroutine in background_jobs:
        try:
            asyncio.create_task(make_coroutine())
        except Exception:
            pass
def _md_convert_fetch_markdown(src: str) -> Tuple[str, Optional[Path]]:
    """
    @function _md_convert_fetch_markdown
    @description Load Markdown text from an HTTP(S) URL or from a path on the server
    @param src URL or local path (already stripped)
    @return (content, base_dir); base_dir is the file's parent directory for a local
            path and None for a remote URL
    @raises HTTPException 400 with a "fetch_failed" detail on any failure
    """
    try:
        if src.lower().startswith(("http://", "https://")):
            # URLs that already contain percent escapes are used verbatim to avoid double encoding.
            safe = src if "%" in src else _safe_http_url(src)
            try:
                try:
                    with urlopen(safe, timeout=10) as r:
                        raw = r.read()
                except UnicodeEncodeError:
                    # Non-ASCII characters slipped through: re-quote and retry once.
                    alt = quote(src, safe=':/?&=%#')
                    with urlopen(_safe_http_url(alt), timeout=10) as r:
                        raw = r.read()
            except HTTPError as err:
                raise HTTPException(status_code=400, detail={"error": "fetch_failed", "status": err.code, "url": getattr(err, 'url', src)})
            except URLError as err:
                raise HTTPException(status_code=400, detail={"error": "fetch_failed", "status": None, "url": src, "reason": str(getattr(err, 'reason', err))})
            logging.info("md_convert fetched markdown_url len=%s url=%s", len(raw), safe)
            try:
                return raw.decode("utf-8"), None
            except Exception:
                return raw.decode("latin-1", errors="ignore"), None
        # NOTE(review): any path readable by the server process can be requested here — confirm intended.
        with open(src, "r", encoding="utf-8", errors="ignore") as f:
            return f.read(), Path(src).resolve().parent
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=400, detail={"error": "fetch_failed", "url": src, "message": str(e)})


def _md_convert_publish_image(data: bytes, mime: str, object_key: str, embed: bool) -> str:
    """
    @function _md_convert_publish_image
    @description Upload a logo/cover image to MinIO when configured and return a usable source
    @param data Image bytes
    @param mime Image MIME type
    @param object_key Object key relative to the configured prefix
    @param embed When true, return a base64 data URI even if the upload succeeded
    @return Presigned URL (or public URL when presigning yields nothing); a data URI when
            embed is true or no MinIO URL could be produced
    """
    src: Optional[str] = None
    client, bucket, public_base, prefix = minio_current(RUNTIME_CONFIG)
    if client is not None and bucket and public_base:
        obj = join_prefix(prefix, object_key)
        try:
            client.put_object(bucket_name=bucket, object_name=obj, data=io.BytesIO(data), length=len(data), content_type=mime)  # type: ignore
            public_url = f"{public_base}/{bucket}/{quote(obj, safe='/')}"
            try:
                exp = int(timedelta(hours=12).total_seconds())
                src = presigned_read(client, bucket, obj, exp) or public_url
            except Exception:
                src = public_url
        except Exception:
            logging.warning("md_convert image upload failed object=%s", obj, exc_info=True)
    if embed or src is None:
        # Never hand the renderer a URL built from missing MinIO settings.
        import base64 as _b64
        src = f"data:{mime};base64," + _b64.b64encode(data).decode("ascii")
    return src


def _md_convert_store_output(data: bytes, out_name: str, media: str) -> Tuple[Optional[str], Optional[str]]:
    """
    @function _md_convert_store_output
    @description Upload the converted document to MinIO under "<prefix>/converted/"
    @param data Converted document bytes
    @param out_name Object file name including extension
    @param media Content type recorded on the object
    @return (public URL, presigned URL); both None when MinIO is not configured, the
            "store_final" setting is off, or the upload fails
    """
    minio_url: Optional[str] = None
    minio_presigned_url: Optional[str] = None
    try:
        client, bucket, public_base, prefix = minio_current(RUNTIME_CONFIG)
        rc_store_final = str(RUNTIME_CONFIG.get("minio", {}).get("store_final") or "true").lower() in {"1", "true", "yes", "on"}
        if client is not None and bucket and public_base and rc_store_final:
            obj = f"{(prefix or '').strip('/')}/converted/{out_name}".lstrip("/")
            ct = media or "application/octet-stream"
            client.put_object(bucket_name=bucket, object_name=obj, data=io.BytesIO(data), length=len(data), content_type=ct)  # type: ignore
            minio_url = f"{public_base}/{bucket}/{quote(obj, safe='/')}"
            try:
                exp = int(timedelta(hours=12).total_seconds())
                minio_presigned_url = presigned_read(client, bucket, obj, exp)
            except Exception:
                minio_presigned_url = None
    except Exception:
        logging.warning("md_convert failed to store %s in MinIO", out_name, exc_info=True)
        minio_url = None
    return minio_url, minio_presigned_url


@app.post("/md/convert", response_model=ConvertResponse)
async def md_convert(
    md_file: Optional[UploadFile] = File(None),
    markdown_text: Optional[str] = Form(None),
    markdown_url: Optional[str] = Form(None),
    target: str = Form("docx"),
    save: Optional[bool] = Form(False),
    filename: Optional[str] = Form(None),
    css_name: Optional[str] = Form(None),
    css_text: Optional[str] = Form(None),
    toc: Optional[bool] = Form(True),
    header_text: Optional[str] = Form(None),
    footer_text: Optional[str] = Form(None),
    logo_url: Optional[str] = Form(None),
    logo_file: Optional[UploadFile] = File(None),
    cover_url: Optional[str] = Form(None),
    cover_file: Optional[UploadFile] = File(None),
    product_name: Optional[str] = Form(None),
    document_name: Optional[str] = Form(None),
    product_version: Optional[str] = Form(None),
    document_version: Optional[str] = Form(None),
    copyright_text: Optional[str] = Form(None),
):
    """
    @function md_convert
    @description Advanced Markdown conversion endpoint supporting custom styling, logos, and metadata
    @param md_file Uploaded Markdown file (optional)
    @param markdown_text Raw Markdown text (optional)
    @param markdown_url URL or server-local path to a Markdown file (optional)
    @param target Output format (docx/pdf)
    @param save When false, logo/cover images are embedded as data URIs instead of linked
    @param filename Output filename (without extension)
    @param css_name Predefined CSS profile name (PDF only)
    @param css_text Custom CSS content (PDF only)
    @param toc Include Table of Contents
    @param header_text Custom header text
    @param footer_text Custom footer text
    @param logo_url URL or local path for logo image
    @param logo_file Uploaded logo file
    @param cover_url URL or local path for cover image
    @param cover_file Uploaded cover file (max 2MB)
    @param product_name Product name for cover
    @param document_name Document name for cover
    @param product_version Product version for cover
    @param document_version Document version for cover
    @param copyright_text Copyright text
    @return ConvertResponse with the MinIO URLs (None when nothing was stored), name and media type
    @raises HTTPException 400 on invalid input, fetch failure or an oversized cover image
    """
    logging.info(f"md_convert start target={target} save={save} filename={filename}")
    provided = sum(1 for given in (md_file is not None, bool(markdown_text), bool(markdown_url)) if given)
    if provided != 1:
        raise HTTPException(status_code=400, detail="provide exactly one of md_file, markdown_text, markdown_url")
    fmt = target.lower()
    if fmt not in {"docx", "pdf"}:
        raise HTTPException(status_code=400, detail="target must be docx or pdf")

    # --- Load the Markdown source and derive the output base name ---
    mappings: List[Dict[str, str]] = []
    base_dir = Path(".").resolve()
    if md_file is not None:
        content = (await md_file.read()).decode("utf-8", errors="ignore")
        base_dir = Path(md_file.filename).resolve().parent if md_file.filename else Path(".")
        base = sanitize_filename(filename) if filename else sanitize_filename(os.path.splitext(md_file.filename or "document")[0])
    elif markdown_url:
        src = markdown_url.strip()
        content, local_dir = _md_convert_fetch_markdown(src)
        if local_dir is not None:
            base_dir = local_dir
        base = sanitize_filename(filename) if filename else sanitize_filename(infer_basename(src, None))
    else:
        content = markdown_text or ""
        base = sanitize_filename(filename) if filename else "document"

    # --- Rewrite local assets to MinIO URLs if configured ---
    client, bucket, public_base, prefix = minio_current(RUNTIME_CONFIG)
    if client is not None and bucket and public_base and base_dir:
        try:
            content, mappings = _rewrite_md_assets_to_minio(content, base_dir, client, bucket, public_base, prefix)
        except Exception:
            logging.warning("md_convert asset rewrite failed", exc_info=True)

    embed_assets = not save

    # --- Logo: upload / remote URL / local path; any failure means "no logo" ---
    logo_src = None
    try:
        if logo_file is not None and getattr(logo_file, "filename", None):
            lb = await logo_file.read()
            logo_name = logo_file.filename or "logo"
            safe_logo = sanitize_filename(os.path.splitext(logo_name)[0])
            extl = "." + (logo_name.rsplit(".", 1)[-1].lower() if "." in logo_name else "png")
            logo_src = _md_convert_publish_image(
                lb,
                detect_image_mime(logo_name, lb),
                f"uploads/logo/{int(time.time())}-{safe_logo}{extl}",
                embed_assets,
            )
        elif logo_url:
            u = logo_url.strip()
            if u.lower().startswith(("http://", "https://")):
                logo_src = u
            else:
                p = Path(u)
                try:
                    lb = p.read_bytes()
                    logo_src = _md_convert_publish_image(
                        lb,
                        detect_image_mime(p.name, lb),
                        f"uploads/logo/{int(time.time())}-{sanitize_filename(p.stem)}{p.suffix or '.png'}",
                        embed_assets,
                    )
                except Exception:
                    # Unreadable local logo: pass a file URI through to the renderer.
                    logo_src = p.resolve().as_uri()
    except Exception:
        logo_src = None

    # --- Cover: same sources as the logo, plus a 2MB size limit ---
    cover_src = None
    try:
        limit = 2 * 1024 * 1024
        if cover_file is not None and getattr(cover_file, "filename", None):
            cb = await cover_file.read()
            if len(cb) > limit:
                raise HTTPException(status_code=400, detail="cover image exceeds 2MB limit")
            cover_name = cover_file.filename or "cover"
            safe_cov = sanitize_filename(os.path.splitext(cover_name)[0])
            extc = "." + (cover_name.rsplit(".", 1)[-1].lower() if "." in cover_name else "png")
            cover_src = _md_convert_publish_image(
                cb,
                detect_image_mime(cover_name, cb),
                f"uploads/cover/{int(time.time())}-{safe_cov}{extc}",
                embed_assets,
            )
        elif cover_url:
            cu = cover_url.strip()
            if cu.lower().startswith(("http://", "https://")):
                cover_src = cu
            else:
                p = Path(cu)
                rb = p.read_bytes()
                if len(rb) > limit:
                    raise HTTPException(status_code=400, detail="cover image exceeds 2MB limit")
                cover_src = _md_convert_publish_image(
                    rb,
                    detect_image_mime(cu, rb),
                    f"uploads/cover/{int(time.time())}-{sanitize_filename(p.stem)}{p.suffix or '.png'}",
                    embed_assets,
                )
    except HTTPException:
        raise
    except Exception:
        cover_src = None

    logging.info(f"md_convert assets prepared logo_src={bool(logo_src)} cover_src={bool(cover_src)} css_name={css_name} css_text_len={(len(css_text) if css_text else 0)}")

    # --- Convert ---
    if fmt == "docx":
        data = md_to_docx_bytes(
            content,
            toc=bool(toc),
            header_text=header_text,
            footer_text=footer_text,
            logo_url=logo_src or logo_url,
            copyright_text=copyright_text,
            filename_text=base,
            cover_src=cover_src,
            product_name=product_name,
            document_name=document_name,
            product_version=product_version,
            document_version=document_version,
        )
        media = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        ext = ".docx"
    else:
        # Fall back to the "default" stylesheet only when no custom CSS text was supplied.
        use_css_name = css_name if css_name else ("default" if not css_text else None)
        data = md_to_pdf_bytes_with_renderer(
            content,
            "weasyprint",
            css_name=use_css_name,
            css_text=css_text,
            toc=bool(toc),
            header_text=header_text,
            footer_text=footer_text,
            logo_url=logo_src or logo_url,
            copyright_text=copyright_text,
            filename_text=base,
            cover_src=cover_src,
            product_name=product_name,
            document_name=document_name,
            product_version=product_version,
            document_version=document_version,
        )
        media = "application/pdf"
        ext = ".pdf"

    # --- Store, notify, respond ---
    out_name = f"{base}{ext}"
    minio_url, minio_presigned_url = _md_convert_store_output(data, out_name, media)
    logging.info(f"md_convert done {fmt} name={out_name} size={len(data)}")
    _db_notify({
        "type": "md_convert",
        "base": base,
        "target": fmt,
        "local_url": None,
        "minio_url": minio_url,
        "minio_presigned_url": minio_presigned_url,
        "mappings": mappings,
        "time": int(time.time())
    })
    return ConvertResponse(
        minio_url=minio_url,
        minio_presigned_url=minio_presigned_url,
        name=out_name,
        media_type=media,
    )
@app.get("/config/linkmap")
def get_linkmap():
    """
    @function get_linkmap
    @description Return the stored link map as provided by load_linkmap()
    """
    current = load_linkmap()
    return current
@app.post("/config/linkmap")
async def set_linkmap(mapping: dict):
    """
    @function set_linkmap
    @description Persist the submitted link map via save_linkmap()
    @param mapping Link map taken from the request body
    @return {"ok": True}
    @raises HTTPException 400 when saving fails
    """
    try:
        save_linkmap(mapping)
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
    else:
        return {"ok": True}
def detect_image_mime(filename: Optional[str], data: bytes) -> str:
    """
    @function detect_image_mime
    @description Determine an image MIME type from the file extension, then from content
    @param filename File name or path used for the extension (may be None)
    @param data Image bytes used for signature sniffing
    @return MIME type string; "image/png" when nothing else matches

    Order: known extension (.png/.jpg/.jpeg/.svg/.webp), magic bytes (PNG, JPEG, WEBP,
    GIF), an "<svg" tag in the first 512 bytes, mimetypes.guess_type, default.
    """
    ext = (os.path.splitext(filename or "")[1] or "").lower()
    by_extension = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".svg": "image/svg+xml",
        ".webp": "image/webp",
    }
    if ext in by_extension:
        return by_extension[ext]
    if data.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if data.startswith(b"\xff\xd8\xff"):
        return "image/jpeg"
    if len(data) >= 12 and data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return "image/webp"
    # GIF payloads without a usable file name (e.g. data URIs) used to fall through to PNG.
    if data.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    try:
        head = data[:512].decode("utf-8", errors="ignore")
        if "<svg" in head:
            return "image/svg+xml"
    except Exception:
        pass
    guessed, _ = mimetypes.guess_type(filename or "")
    return guessed or "image/png"
def detect_mime(filename: Optional[str], data: bytes) -> str:
    """
    @function detect_mime
    @description Determine a MIME type for arbitrary content, delegating images to detect_image_mime
    @param filename File name or path used for the extension (may be None)
    @param data Content bytes used for image signature sniffing
    @return MIME type string; "application/octet-stream" when nothing matches
    """
    suffix = (os.path.splitext(filename or "")[1] or "").lower()
    image_suffix = suffix in {".png", ".jpg", ".jpeg", ".svg", ".webp"}
    image_signature = (
        data.startswith(b"\x89PNG\r\n\x1a\n")
        or data.startswith(b"\xff\xd8\xff")
        or (len(data) >= 12 and data[:4] == b"RIFF" and data[8:12] == b"WEBP")
    )
    if image_suffix or image_signature:
        return detect_image_mime(filename, data)
    guessed = mimetypes.guess_type(filename or "")[0]
    return guessed if guessed else "application/octet-stream"
@app.post("/proxy/download")
async def proxy_download(url: str = Form(...)):
    """
    @function proxy_download
    @description Fetch a remote URL or read a server-local file and return it as an attachment
    @param url HTTP(S) URL or local file path
    @return Response carrying the bytes, a content type and a Content-Disposition header
    @raises HTTPException 400 when url is blank or the fetch fails, 404 when a local path
            is not an existing file, or the upstream status code for an HTTP error
    """
    u = (url or "").strip()
    if not u:
        raise HTTPException(status_code=400, detail="url required")
    # NOTE(review): this endpoint fetches arbitrary URLs and reads arbitrary local paths
    # on behalf of the caller — confirm it is only reachable by trusted clients.
    try:
        data: bytes
        ct: str
        name: str
        if u.lower().startswith(("http://", "https://")):
            # URLs that already contain percent escapes are used verbatim.
            safe = u if "%" in u else _safe_http_url(u)
            with urlopen(safe, timeout=15) as r:
                data = r.read()
                ct = r.headers.get("Content-Type") or detect_mime(None, data)
            from urllib.parse import urlparse
            path = unquote(urlparse(u).path or "")
            last = (os.path.basename(path) or "download").split("?")[0]
            if "." in last:
                name = last
            else:
                # No extension in the URL: derive one from the content type, else ".md".
                ext = mimetypes.guess_extension((ct or "").split(";")[0].strip()) or ".md"
                name = last + ext
        else:
            p = Path(u)
            if not p.exists() or not p.is_file():
                raise HTTPException(status_code=404, detail="local path not found")
            data = p.read_bytes()
            ct = detect_mime(p.name, data)
            name = p.name
        # Header values must be latin-1 encodable and must not contain quotes; the plain
        # filename= part is an ASCII fallback, the real name travels in filename*.
        ascii_name = name.encode("ascii", "ignore").decode("ascii")
        ascii_name = re.sub(r'[\r\n"\\]', "_", ascii_name).strip() or "download"
        disp = f"attachment; filename=\"{ascii_name}\"; filename*=UTF-8''" + quote(name)
        headers = {"Content-Disposition": disp}
        return Response(content=data, media_type=ct, headers=headers)
    except HTTPError as err:
        raise HTTPException(status_code=err.code, detail=f"download failed: {err}")
    except URLError as err:
        raise HTTPException(status_code=400, detail=f"download failed: {err}")
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
def _minio_from_env() -> Tuple[Optional[object], Optional[str], Optional[str], str]:
    """
    @function _minio_from_env
    @description Build a MinIO client from MINIO_* environment variables
    @return (client, bucket, public_base, prefix); (None, None, None, "") when the minio
            package is unavailable or any required variable is missing
    """
    env = os.environ
    endpoint = env.get("MINIO_ENDPOINT")
    access = env.get("MINIO_ACCESS_KEY")
    secret = env.get("MINIO_SECRET_KEY")
    bucket = env.get("MINIO_BUCKET")
    secure = str(env.get("MINIO_SECURE", "false")).lower() in {"1", "true", "yes", "on"}

    # An explicit public endpoint wins; otherwise derive one from the endpoint and scheme.
    public_base = env.get("MINIO_PUBLIC_ENDPOINT")
    if not public_base:
        if secure:
            public_base = f"https://{endpoint}"
        elif endpoint:
            public_base = f"http://{endpoint}"
        else:
            public_base = None

    if Minio is None or not all((endpoint, access, secret, bucket, public_base)):
        return None, None, None, ""

    client = Minio(endpoint, access_key=access, secret_key=secret, secure=secure)
    try:
        # Bucket creation is best-effort.
        _minio_create_bucket(client, bucket)
    except Exception:
        pass
    return client, bucket, public_base, env.get("MINIO_PREFIX", "")
def _export_ext(export: str) -> str:
e = (export or "").lower()
if e == "markdown":
return ".md"
if e == "html":
return ".html"
if e in {"json", "doctags"}:
return ".json"
return ".txt"
def _media_type(export: str) -> str:
e = (export or "").lower()
if e == "markdown":
return "text/markdown; charset=utf-8"
if e == "html":
return "text/html; charset=utf-8"
if e in {"json", "doctags"}:
return "application/json"
return "text/plain; charset=utf-8"
def _rewrite_md_assets_to_minio(text: str, base_dir: Path, client: object, bucket: str, public_base: str, prefix: str, search_root: Optional[Path] = None) -> Tuple[str, List[Dict[str, str]]]:
mappings: List[Dict[str, str]] = []
def _abs_key(p: Path) -> str:
k = p.resolve().as_posix().lstrip("/")
return k.replace(":", "")
def _upload_data_uri(uri: str) -> Optional[str]:
try:
import base64, hashlib
head, _, b64 = uri.partition(",")
if not b64:
return None
b = base64.b64decode(b64, validate=False)
mime = ""
try:
low = head.lower()
pos = low.find("data:")
if pos != -1:
rest = head[pos+5:]
semi = rest.find(";")
mime = rest[:semi] if semi != -1 else rest
except Exception:
mime = ""
if not mime:
mime = detect_image_mime(None, b)
ext = ".png"
if mime.lower() in {"image/jpeg", "image/jpg"}:
ext = ".jpg"
elif mime.lower() == "image/webp":
ext = ".webp"
elif mime.lower() == "image/svg+xml":
ext = ".svg"
elif mime.lower() == "image/gif":
ext = ".gif"
h = hashlib.sha256(b).hexdigest()[:16]
obj = join_prefix(prefix, f"embed/{h}{ext}")
bio = io.BytesIO(b)
client.put_object(bucket_name=bucket, object_name=obj, data=bio, length=len(b), content_type=mime or detect_image_mime(None, b)) # type: ignore
try:
from urllib.parse import quote as _quote
return f"{public_base}/{bucket}/{_quote(obj, safe='/')}"
except Exception:
return f"{public_base}/{bucket}/{obj}"
except Exception:
return None
def _upload(path: Path) -> Optional[str]:
try:
data = path.read_bytes()
mime = detect_mime(path.name, data)
obj = join_prefix(prefix, f"abs/{_abs_key(path)}")
bio = io.BytesIO(data)
size = len(data)
client.put_object(bucket_name=bucket, object_name=obj, data=bio, length=size, content_type=mime) # type: ignore
try:
from urllib.parse import quote as _quote
return f"{public_base}/{bucket}/{_quote(obj, safe='/')}"
except Exception:
return f"{public_base}/{bucket}/{obj}"
except Exception:
return None
def _resolve_path(pure: str) -> Optional[Path]:
q = pure.replace("\\", "/")
if q.startswith("/"):
try:
rel = q.lstrip("/")
base = (search_root or base_dir)
p0 = (base / rel).resolve()
except Exception:
p0 = (search_root or base_dir) / q.lstrip("/")
if p0.exists():
return p0
try:
p = (base_dir / q).resolve()
except Exception:
p = (base_dir / q)
if p.exists():
return p
try:
name = Path(q).name
search = (search_root or base_dir)
for hit in search.rglob(name):
if hit.exists():
return hit
except Exception:
pass
return None
def _replace_md(m: re.Match) -> str:
full = m.group(0)
urlpart = m.group(1).strip()
if urlpart.startswith("data:"):
new = _upload_data_uri(urlpart)
if new:
mappings.append({"from": "data_uri", "to": new, "ok": True, "type": "md_image_data"})
return full.replace(urlpart, new)
mappings.append({"from": "data_uri", "to": None, "ok": False, "type": "md_image_data"})
return full
if urlpart.startswith("http://") or urlpart.startswith("https://"):
return full
s = urlpart
pure = s
tail = ""
if s.startswith("<"):
gt = s.find(">")
if gt != -1:
pure = s[1:gt].strip()
tail = s[gt+1:]
else:
dq = s.find('"')
sq = s.find("'")
qpos = -1
if dq != -1 and sq != -1:
qpos = dq if dq < sq else sq
elif dq != -1:
qpos = dq
elif sq != -1:
qpos = sq
if qpos != -1:
pure = s[:qpos].rstrip()
tail = s[qpos:]
p = _resolve_path(pure)
if not p or not p.exists():
mappings.append({"from": pure, "to": None, "ok": False, "type": "md_link"})
return full
new = _upload(p)
if not new:
mappings.append({"from": pure, "to": None, "ok": False, "type": "md_link"})
return full
mappings.append({"from": pure, "to": new, "ok": True, "type": "md_link"})
return full.replace(urlpart, f"{new}{tail}")
text = re.sub(r"!\[[^\]]*\]\(([^)]+)\)", _replace_md, text)
def _replace_mdlink(m: re.Match) -> str:
full = m.group(0)
urlpart = m.group(1).strip()
if urlpart.startswith("http://") or urlpart.startswith("https://") or urlpart.startswith("data:"):
return full
s = urlpart
pure = s
tail = ""
if s.startswith("<"):
gt = s.find(">")
if gt != -1:
pure = s[1:gt].strip()
tail = s[gt+1:]
else:
dq = s.find('"')
sq = s.find("'")
qpos = -1
if dq != -1 and sq != -1:
qpos = dq if dq < sq else sq
elif dq != -1:
qpos = dq
elif sq != -1:
qpos = sq
if qpos != -1:
pure = s[:qpos].rstrip()
tail = s[qpos:]
p = _resolve_path(pure)
if not p or not p.exists():
mappings.append({"from": pure, "to": None, "ok": False, "type": "md_link"})
return full
new = _upload(p)
if not new:
mappings.append({"from": pure, "to": None, "ok": False, "type": "md_link"})
return full
mappings.append({"from": pure, "to": new, "ok": True, "type": "md_link"})
return full.replace(urlpart, f"{new}{tail}")
text = re.sub(r"(?<!!)\[[^\]]*\]\(([^)]+)\)", _replace_mdlink, text)
def _replace_img(m: re.Match) -> str:
src = m.group(1).strip()
if src.startswith("data:"):
new = _upload_data_uri(src)
if new:
mappings.append({"from": "data_uri", "to": new, "ok": True, "type": "html_img_data"})
return m.group(0).replace(src, new)
mappings.append({"from": "data_uri", "to": None, "ok": False, "type": "html_img_data"})
return m.group(0)
if src.startswith("http://") or src.startswith("https://"):
return m.group(0)
pure = src
p = _resolve_path(pure)
if not p or not p.exists():
mappings.append({"from": pure, "to": None, "ok": False, "type": "html_img"})
return m.group(0)
new = _upload(p)
if not new:
mappings.append({"from": pure, "to": None, "ok": False, "type": "html_img"})
return m.group(0)
mappings.append({"from": pure, "to": new, "ok": True, "type": "html_img"})
return m.group(0).replace(src, new)
text = re.sub(r"<img[^>]+src=\"([^\"]+)\"", _replace_img, text)
text = re.sub(r"<img[^>]+src='([^']+)'", _replace_img, text)
def _replace_href(m: re.Match) -> str:
src = m.group(1).strip()
if src.startswith("http://") or src.startswith("https://") or src.startswith("data:"):
return m.group(0)
pure = src
p = _resolve_path(pure)
if not p or not p.exists():
mappings.append({"from": pure, "to": None, "ok": False, "type": "html_href"})
return m.group(0)
new = _upload(p)
if not new:
mappings.append({"from": pure, "to": None, "ok": False, "type": "html_href"})
return m.group(0)
mappings.append({"from": pure, "to": new, "ok": True, "type": "html_href"})
return m.group(0).replace(src, new)
text = re.sub(r"<a[^>]+href=\"([^\"]+)\"", _replace_href, text)
text = re.sub(r"<a[^>]+href='([^']+)'", _replace_href, text)
def _replace_video(m: re.Match) -> str:
src = m.group(1).strip()
if src.startswith("http://") or src.startswith("https://") or src.startswith("data:"):
return m.group(0)
pure = src
p = _resolve_path(pure)
if not p or not p.exists():
mappings.append({"from": pure, "to": None, "ok": False, "type": "html_video"})
return m.group(0)
new = _upload(p)
if not new:
mappings.append({"from": pure, "to": None, "ok": False, "type": "html_video"})
return m.group(0)
mappings.append({"from": pure, "to": new, "ok": True, "type": "html_video"})
return m.group(0).replace(src, new)
text = re.sub(r"<video[^>]+src=\"([^\"]+)\"", _replace_video, text)
text = re.sub(r"<video[^>]+src='([^']+)'", _replace_video, text)
def _replace_audio(m: re.Match) -> str:
src = m.group(1).strip()
if src.startswith("http://") or src.startswith("https://") or src.startswith("data:"):
return m.group(0)
pure = src
p = _resolve_path(pure)
if not p or not p.exists():
mappings.append({"from": pure, "to": None, "ok": False, "type": "html_audio"})
return m.group(0)
new = _upload(p)
if not new:
mappings.append({"from": pure, "to": None, "ok": False, "type": "html_audio"})
return m.group(0)
mappings.append({"from": pure, "to": new, "ok": True, "type": "html_audio"})
return m.group(0).replace(src, new)
text = re.sub(r"<audio[^>]+src=\"([^\"]+)\"", _replace_audio, text)
text = re.sub(r"<audio[^>]+src='([^']+)'", _replace_audio, text)
def _replace_source(m: re.Match) -> str:
src = m.group(1).strip()
if src.startswith("http://") or src.startswith("https://") or src.startswith("data:"):
return m.group(0)
pure = src
p = _resolve_path(pure)
if not p or not p.exists():
mappings.append({"from": pure, "to": None, "ok": False, "type": "html_source"})
return m.group(0)
new = _upload(p)
if not new:
mappings.append({"from": pure, "to": None, "ok": False, "type": "html_source"})
return m.group(0)
mappings.append({"from": pure, "to": new, "ok": True, "type": "html_source"})
return m.group(0).replace(src, new)
text = re.sub(r"<source[^>]+src=\"([^\"]+)\"", _replace_source, text)
text = re.sub(r"<source[^>]+src='([^']+)'", _replace_source, text)
return text, mappings
def _uplift_rel_path(rel: Path, md_dir: Path, root: Optional[Path], mappings: List[Dict[str, str]]) -> Path:
try:
parts = list(rel.parts)
if len(parts) < 2:
return rel
exts = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".webp"}
def _is_asset_dir(name: str) -> bool:
n = name.strip().lower()
return n in {"image", "images", "img", "imgs", "media", "assets", "pic", "pics", "picture", "pictures", "visio pic", "visio_pic", "visio", "图片", "图像"}
def _has_asset_sibling() -> bool:
try:
for ch in md_dir.iterdir():
if ch.is_dir() and _is_asset_dir(ch.name):
for f in ch.rglob("*"):
if f.is_file() and f.suffix.lower() in exts:
return True
for f in md_dir.iterdir():
if f.is_file() and f.suffix.lower() in exts:
return True
except Exception:
pass
return False
def _mappings_indicate_local_assets() -> bool:
try:
for m in mappings or []:
if isinstance(m.get("from"), str):
s = str(m.get("from") or "").strip()
if s and not (s.startswith("http://") or s.startswith("https://") or s.startswith("data:") or s.startswith("file://")):
return True
except Exception:
pass
return False
try:
if len(parts) >= 2:
new_parts = parts[:-2] + [parts[-1]]
return Path("/".join(new_parts))
except Exception:
pass
return rel
except Exception:
return rel
def _inject_image_urls_for_markers(text: str, urls: List[str]) -> str:
if not urls:
return text
out = []
i = 0
for line in text.splitlines():
if "<!-- image -->" in line and i < len(urls):
line = line.replace("<!-- image -->", f"![image]({urls[i]})")
i += 1
out.append(line)
return "\n".join(out)
def _extract_pdf_images(pdf_path: Path) -> List[Tuple[str, bytes]]:
    """Extract the embedded images of a PDF as ``(extension, bytes)`` pairs.

    Images are returned in page order. The function is best effort: it returns
    an empty list when ``fitz`` is unavailable or the file cannot be opened,
    and skips any single image that fails to extract.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``(ext, data)`` tuples; ``ext`` defaults to ``"png"`` when
        the extractor reports none, and entries with empty data are omitted.
    """
    imgs: List[Tuple[str, bytes]] = []
    if fitz is None:
        return imgs
    try:
        doc = fitz.open(pdf_path)
    except Exception:
        return imgs
    try:
        for page in doc:
            for xref in page.get_images(full=True):
                try:
                    info = doc.extract_image(xref[0])
                    ext = info.get("ext", "png")
                    data = info.get("image", b"")
                    if data:
                        imgs.append((ext, data))
                except Exception:
                    continue
    except Exception:
        # Keep whatever was collected before the failure.
        pass
    finally:
        # Always release the document, even when iteration failed part-way.
        try:
            doc.close()
        except Exception:
            pass
    return imgs
def _bulk_upload_assets(root: Path, client: object, bucket: str, public_base: str, prefix: str) -> List[str]:
urls: List[str] = []
exts = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".webp", ".tif", ".tiff", ".ico", ".jfif", ".heic", ".heif", ".emf", ".wmf", ".eps", ".psd"}
for f in root.rglob("*"):
try:
if not f.is_file():
continue
if f.suffix.lower() not in exts:
continue
data = f.read_bytes()
mime = detect_mime(f.name, data)
k = f.resolve().as_posix().lstrip("/").replace(":", "")
obj = join_prefix(prefix, f"abs/{k}")
bio = io.BytesIO(data)
client.put_object(bucket_name=bucket, object_name=obj, data=bio, length=len(data), content_type=mime) # type: ignore
urls.append(f"{public_base}/{bucket}/{obj}")
except Exception:
pass
return urls
@app.post("/md/convert-folder")
async def md_convert_folder(folder_path: str = Form(...), prefix: Optional[str] = Form(None)):
    """Rewrite the Markdown files of a server-side folder and upload them to MinIO.

    Image assets below the folder are bulk-uploaded first. Every ``*.md`` file
    is then rewritten so its asset links point to MinIO and stored under
    ``<prefix>/rewritten/<uplifted relative path>``.

    Args:
        folder_path: Existing directory on the server.
        prefix: Optional object-name prefix; falls back to the configured one.

    Returns:
        ``{"ok": True, "count": n, "files": [...]}`` with one entry per Markdown
        file. A file that failed keeps ``minio_url`` set to ``None``.

    Raises:
        HTTPException: 400 when the folder does not exist or MinIO is not configured.
    """
    p = Path(folder_path).expanduser().resolve()
    if not p.exists() or not p.is_dir():
        raise HTTPException(status_code=400, detail="folder_path must be an existing directory")
    client, bucket, public_base, env_prefix = minio_current(RUNTIME_CONFIG)
    if client is None or bucket is None or not public_base:
        raise HTTPException(status_code=400, detail="MinIO is not configured")
    use_prefix = (prefix or env_prefix or "").strip()
    processed: List[Dict[str, object]] = []
    try:
        _bulk_upload_assets(p, client, bucket, public_base, use_prefix)
    except Exception:
        # The per-file rewrite below still runs; just record why the bulk pass failed.
        logging.exception("bulk asset upload failed for %s", p)
    for md_file in p.rglob("*.md"):
        rel_md = md_file.relative_to(p)
        rel_uplift_path = rel_md
        minio_url: Optional[str] = None
        minio_presigned_url: Optional[str] = None
        mappings: List[Dict[str, str]] = []
        try:
            content = md_file.read_text("utf-8", errors="ignore")
            new_text, mappings = _rewrite_md_assets_to_minio(content, md_file.parent, client, bucket, public_base, use_prefix, search_root=p)
            rel_uplift_path = _uplift_rel_path(rel_md, md_file.parent, p, mappings)
            # Upload the rewritten Markdown to MinIO.
            obj = f"{use_prefix.strip('/')}/rewritten/{rel_uplift_path.as_posix()}".lstrip("/")
            raw = new_text.encode("utf-8")
            client.put_object(bucket_name=bucket, object_name=obj, data=io.BytesIO(raw), length=len(raw), content_type="text/markdown; charset=utf-8")  # type: ignore
            try:
                minio_url = f"{public_base}/{bucket}/{quote(obj, safe='/')}"
            except Exception:
                # Fall back to the raw object name when it cannot be percent-encoded.
                minio_url = f"{public_base}/{bucket}/{obj}"
            try:
                exp = int(timedelta(hours=12).total_seconds())
                minio_presigned_url = presigned_read(client, bucket, obj, exp) if client is not None else None
            except Exception:
                minio_presigned_url = None
        except Exception:
            logging.exception("failed to process %s", md_file)
        okc = sum(1 for m in mappings if m.get("ok"))
        frc = sum(1 for m in mappings if not m.get("ok"))
        asset_urls = [m.get("to") for m in mappings if m.get("ok") and m.get("to")]
        processed.append({
            "source": rel_uplift_path.as_posix(),
            "output": None,
            "minio_url": minio_url,
            "minio_presigned_url": minio_presigned_url,
            "mappings": mappings,
            "asset_ok": okc,
            "asset_fail": frc,
            "asset_urls": asset_urls,
        })
    return {"ok": True, "count": len(processed), "files": processed}
@app.post("/md/upload-folder")
async def md_upload_folder(folder_files: List[UploadFile] = File(None), folder_paths: List[str] = Form(None), prefix: Optional[str] = Form(None)):
    """Stage an uploaded folder, rewrite its Markdown files and upload them to MinIO.

    ``folder_files`` and ``folder_paths`` are parallel lists: each file is
    written to a temporary staging directory at its relative path. Image assets
    are bulk-uploaded, then every ``*.md`` file is rewritten and stored under
    ``<prefix>/rewritten/<uplifted relative path>``. The staging directory is
    always removed.

    Args:
        folder_files: Uploaded files.
        folder_paths: Relative path of each uploaded file inside the folder.
        prefix: Optional object-name prefix; falls back to the configured one.

    Returns:
        ``{"ok": True, "count": n, "files": [...]}`` with one entry per Markdown
        file that could be read and rewritten.

    Raises:
        HTTPException: 400 when the lists are missing or differ in length, when
            MinIO is not configured, or when a path escapes the staging folder.
    """
    if not folder_files or not folder_paths or len(folder_files) != len(folder_paths):
        raise HTTPException(status_code=400, detail="folder_files and folder_paths are required and must match in length")
    client, bucket, public_base, env_prefix = minio_current(RUNTIME_CONFIG)
    if client is None or bucket is None or not public_base:
        raise HTTPException(status_code=400, detail="MinIO is not configured")
    use_prefix = (prefix or env_prefix or "").strip()
    staging = Path(tempfile.mkdtemp(prefix="folder_stage_"))
    try:
        staging_real = staging.resolve()
        for f, rel in zip(folder_files, folder_paths):
            rel_norm = rel.replace("\\", "/")
            dest = staging / rel_norm
            # `rel` comes from the client: refuse absolute paths and ".." segments
            # that would place the file outside the staging directory.
            try:
                inside = dest.resolve().relative_to(staging_real)
            except ValueError:
                inside = None
            if inside is None or not inside.parts:
                raise HTTPException(status_code=400, detail=f"invalid folder path: {rel}")
            dest.parent.mkdir(parents=True, exist_ok=True)
            dest.write_bytes(await f.read())
        base = staging
        try:
            _bulk_upload_assets(base, client, bucket, public_base, use_prefix)
        except Exception:
            logging.exception("bulk asset upload failed for staged folder")
        processed: List[Dict[str, object]] = []
        for md_file in base.rglob("*.md"):
            try:
                content = md_file.read_text("utf-8", errors="ignore")
                new_text, mappings = _rewrite_md_assets_to_minio(content, md_file.parent, client, bucket, public_base, use_prefix, search_root=base)
                rel_md = md_file.relative_to(base)
                rel_uplift = _uplift_rel_path(rel_md, md_file.parent, base, mappings)
                try:
                    obj = f"{use_prefix.strip('/')}/rewritten/{rel_uplift.as_posix()}".lstrip("/")
                    raw = new_text.encode("utf-8")
                    client.put_object(bucket_name=bucket, object_name=obj, data=io.BytesIO(raw), length=len(raw), content_type="text/markdown; charset=utf-8")  # type: ignore
                    try:
                        minio_url = f"{public_base}/{bucket}/{quote(obj, safe='/')}"
                    except Exception:
                        minio_url = f"{public_base}/{bucket}/{obj}"
                    minio_presigned_url = None
                    try:
                        exp = int(timedelta(hours=12).total_seconds())
                        minio_presigned_url = presigned_read(client, bucket, obj, exp) if client is not None else None
                    except Exception:
                        minio_presigned_url = None
                except Exception:
                    # The upload failed: still report the file, without URLs.
                    minio_url = None
                    minio_presigned_url = None
                okc = sum(1 for m in mappings if m.get("ok"))
                frc = sum(1 for m in mappings if not m.get("ok"))
                asset_urls = [m.get("to") for m in mappings if m.get("ok") and m.get("to")]
                processed.append({
                    "source": rel_uplift.as_posix(),
                    "output": None,
                    "minio_url": minio_url,
                    "minio_presigned_url": minio_presigned_url,
                    "mappings": mappings,
                    "asset_ok": okc,
                    "asset_fail": frc,
                    "asset_urls": asset_urls,
                })
            except Exception:
                logging.exception("failed to process %s", md_file)
        return {"ok": True, "count": len(processed), "files": processed}
    finally:
        try:
            shutil.rmtree(staging)
        except Exception:
            pass
# Configure the root logger when this module is imported: INFO level,
# one "<timestamp> <level> <message>" line per record.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
def _is_debug(request: Request) -> bool:
try:
q = request.query_params.get("debug")
if q and str(q).lower() in ("1", "true", "yes", "on"):
return True
except Exception:
pass
h = request.headers.get("X-Debug")
if h and str(h).lower() in ("1", "true", "yes", "on"):
return True
env = os.environ.get("APP_DEBUG")
if env and str(env).lower() in ("1", "true", "yes", "on"):
return True
return False
@app.middleware("http")
async def logging_middleware(request: Request, call_next):
    """Log method, path, status code and duration in milliseconds of each request.

    When the downstream handler raises, the failure is logged together with
    its traceback and the exception is re-raised unchanged.
    """
    start = time.time()

    def _elapsed_ms() -> int:
        return int((time.time() - start) * 1000)

    try:
        response = await call_next(request)
        duration = _elapsed_ms()
        logging.info(f"{request.method} {request.url.path} -> {response.status_code} {duration}ms")
        return response
    except Exception as exc:
        duration = _elapsed_ms()
        tb = "".join(traceback.format_exception(type(exc), exc, exc.__traceback__))
        logging.error(f"{request.method} {request.url.path} FAILED {duration}ms: {exc}\n{tb}")
        raise
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException):
    """Log an ``HTTPException`` and return it as a JSON error body.

    The response keeps the exception's status code and carries
    ``{"error": "http_error", "detail": ...}``; the traceback is added under
    ``"trace"`` only when debug mode is on for the request.
    """
    tb = "".join(traceback.format_exception(type(exc), exc, exc.__traceback__))
    logging.error(f"HTTP error on {request.method} {request.url.path}: {exc}\n{tb}")
    extra = {"trace": tb} if _is_debug(request) else {}
    body = {"error": "http_error", "detail": exc.detail, **extra}
    return JSONResponse(status_code=exc.status_code, content=body)
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Log an unhandled exception and answer with a JSON 500 response.

    The body is ``{"error": "internal_error", "detail": str(exc)}``; the
    traceback is added under ``"trace"`` only when debug mode is on.
    """
    tb = "".join(traceback.format_exception(type(exc), exc, exc.__traceback__))
    logging.error(f"Unhandled error on {request.method} {request.url.path}: {exc}\n{tb}")
    extra = {"trace": tb} if _is_debug(request) else {}
    body = {"error": "internal_error", "detail": str(exc), **extra}
    return JSONResponse(status_code=500, content=body)
def _safe_http_url(u: str) -> str:
try:
parts = urlsplit(u)
path = quote(parts.path, safe="/:%")
query = quote(parts.query, safe="=&%")
frag = quote(parts.fragment, safe="")
netloc = parts.netloc
try:
userinfo = ''
hostport = netloc
if '@' in netloc:
userinfo, hostport = netloc.split('@', 1)
userinfo += '@'
if hostport.startswith('['):
netloc = userinfo + hostport
else:
port = ''
host = hostport
if ':' in hostport:
host, port = hostport.rsplit(':', 1)
if port and not port.isdigit():
host = hostport
port = ''
try:
host_idna = host.encode('idna').decode('ascii')
except Exception:
host_idna = host
netloc = f"{userinfo}{host_idna}{(':' + port) if port else ''}"
except Exception:
pass
return urlunsplit((parts.scheme, netloc, path, query, frag))
except Exception:
return u
# ──────────────────────────────────────────────────────────────────────────────
# API v2 endpoints with standard code/msg/data
# ──────────────────────────────────────────────────────────────────────────────
# Module-level converter instance; the /api handlers call its convert() method.
_converter_v2 = FormatConverter()
def _ok(data: dict, msg: str = "ok"):
    """Wrap ``data`` in the success envelope ``{"code": 0, "msg": msg, "data": data}``."""
    envelope = {"code": 0, "msg": msg, "data": data}
    return JSONResponse(envelope)
def _err(msg: str, code: int = 500, detail: object = None):
    """Build the error envelope ``{"code": code, "msg": msg, "data": None}``.

    ``detail`` is added under ``"detail"`` when it is not ``None``. The HTTP
    status is always 200; the error is carried by ``code`` in the body.
    """
    extra = {} if detail is None else {"detail": detail}
    return JSONResponse({"code": code, "msg": msg, "data": None, **extra}, status_code=200)
def _api_convert_pdf_image_urls(
    base: str,
    client: object,
    bucket: str,
    public_base: str,
    prefix: Optional[str],
    source_url: Optional[str] = None,
    local_path: Optional[str] = None,
) -> Optional[List[str]]:
    """Upload the images embedded in a PDF source and return their public URLs.

    ``local_path`` (an already-local file) takes precedence; otherwise
    ``source_url`` may be a ``file://`` URL, an existing local path, or an
    http(s) URL that is downloaded to a temporary file and removed afterwards.

    Returns:
        The uploaded image URLs, or ``None`` when the source is not an
        available ``.pdf`` file. Exceptions propagate to the caller.
    """
    pdf_path: Optional[Path] = None
    downloaded = False
    try:
        if local_path is not None:
            pdf_path = Path(local_path)
        elif source_url:
            if source_url.startswith("file://") or Path(source_url).exists():
                pdf_path = Path(source_url.replace("file://", ""))
            elif source_url.startswith(("http://", "https://")):
                suffix = Path(infer_basename(source_url, None)).suffix or ".bin"
                # Only a ".pdf" download is usable below, so nothing else is fetched.
                if suffix.lower().endswith(".pdf"):
                    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                        pdf_path = Path(tmp.name)
                        downloaded = True
                        with urlopen(source_url) as resp:
                            tmp.write(resp.read())
        if pdf_path is None or not pdf_path.exists() or not str(pdf_path).lower().endswith(".pdf"):
            return None
        urls: List[str] = []
        for idx, (img_ext, data) in enumerate(_extract_pdf_images(pdf_path)):
            obj = join_prefix(prefix, f"converted/{base}_img_{idx}.{img_ext}")
            mime = "image/png" if img_ext.lower() == "png" else "image/jpeg"
            client.put_object(bucket_name=bucket, object_name=obj, data=io.BytesIO(data), length=len(data), content_type=mime)  # type: ignore
            try:
                urls.append(f"{public_base}/{bucket}/{quote(obj, safe='/')}")
            except Exception:
                urls.append(f"{public_base}/{bucket}/{obj}")
        return urls
    finally:
        # Remove the temporary download even when extraction or upload failed.
        if downloaded and pdf_path is not None:
            try:
                os.unlink(str(pdf_path))
            except OSError:
                pass


def _api_convert_markdown_assets(
    content: str,
    artifacts_dir: Optional[str],
    base: str,
    trace: List[str],
    source_url: Optional[str] = None,
    local_path: Optional[str] = None,
) -> Tuple[str, List[Dict[str, str]]]:
    """Point Markdown asset links to MinIO and fill ``<!-- image -->`` placeholders.

    Best effort: when MinIO is not configured or any step fails, the original
    ``content`` and an empty mapping list are returned. Progress notes are
    appended to ``trace`` in place.

    Returns:
        ``(markdown_text, mappings)``.
    """
    scratch: Optional[Path] = None
    try:
        client, bucket, public_base, prefix = minio_current(RUNTIME_CONFIG)
        if client is None or not bucket or not public_base:
            return content, []
        trace.append(f"minio bucket={bucket} public={public_base} prefix={(prefix or '').strip('/')}")
        if artifacts_dir:
            base_dir = Path(artifacts_dir)
            search_root: Optional[Path] = base_dir
        else:
            # No artifacts were produced: use a throw-away base directory.
            scratch = Path(tempfile.mkdtemp(prefix="md_assets_"))
            base_dir = scratch
            search_root = None
        new_text, ms = _rewrite_md_assets_to_minio(content, base_dir, client, bucket, public_base, prefix, search_root=search_root)
        urls: List[str] = []
        if artifacts_dir:
            try:
                urls = _bulk_upload_assets(Path(artifacts_dir), client, bucket, public_base, prefix)
            except Exception:
                urls = []
        trace.append(f"asset_urls={len(urls)}")
        try:
            extra_urls = _api_convert_pdf_image_urls(base, client, bucket, public_base, prefix, source_url=source_url, local_path=local_path)
            if extra_urls is not None:
                urls.extend(extra_urls)
                trace.append(f"pdf_imgs_uploaded={len(extra_urls)}")
        except Exception as exc:
            # Embedded PDF images are optional; keep going without them.
            logging.warning("PDF image extraction skipped: %s", exc)
        before = new_text.count("<!-- image -->")
        new_text = _inject_image_urls_for_markers(new_text, urls)
        after = new_text.count("<!-- image -->")
        trace.append(f"image_placeholders_before={before} after={after}")
        return new_text, ms
    except Exception:
        return content, []
    finally:
        if scratch is not None:
            shutil.rmtree(scratch, ignore_errors=True)


def _api_convert_finish(
    enc: object,
    content: str,
    artifacts_dir: Optional[str],
    base: str,
    export: str,
    save: Optional[bool],
    trace: List[str],
    source_url: Optional[str] = None,
    local_path: Optional[str] = None,
):
    """Post-process a conversion result and build the API response.

    Markdown output gets its assets rewritten; the result is then returned
    inline or, when ``save`` is set, uploaded to MinIO under ``converted/``.
    The conversion artifacts directory is removed on every exit path.
    """
    try:
        out_ext = _export_ext(export)
        ct = _media_type(export)
        mappings: List[Dict[str, str]] = []
        if artifacts_dir:
            trace.append(f"artifacts_dir={artifacts_dir}")
        if export.lower() == "markdown":
            content, mappings = _api_convert_markdown_assets(content, artifacts_dir, base, trace, source_url=source_url, local_path=local_path)
        if not save:
            return _ok({"encoding": enc, "content": content, "name": f"{base}{out_ext}", "media_type": ct, "mappings": mappings, "trace": trace})
        client, bucket, public_base, prefix = minio_current(RUNTIME_CONFIG)
        if client is None or not bucket or not public_base:
            return _err("MinIO 未配置,无法保存")
        out_name = f"{base}{out_ext}"
        if export.lower() == "markdown" and not out_name.lower().endswith(".md"):
            out_name = f"{base}.md"
        obj = join_prefix(prefix, f"converted/{out_name}")
        raw = content.encode("utf-8")
        client.put_object(bucket_name=bucket, object_name=obj, data=io.BytesIO(raw), length=len(raw), content_type=ct)  # type: ignore
        try:
            minio_url = f"{public_base}/{bucket}/{quote(obj, safe='/')}"
        except Exception:
            minio_url = f"{public_base}/{bucket}/{obj}"
        minio_url_display = unquote(minio_url)
        trace.append(f"save out_name={out_name}")
        trace.append(f"save obj={obj}")
        trace.append(f"save minio_url={minio_url}")
        exp = int(timedelta(hours=12).total_seconds())
        minio_presigned_url = presigned_read(client, bucket, obj, exp)
        return _ok({
            "encoding": enc,
            "name": out_name,
            "media_type": ct,
            "minio_url": minio_url,
            "minio_presigned_url": minio_presigned_url,
            "minio_url_display": minio_url_display,
            "mappings": mappings,
            "trace": trace,
        })
    finally:
        if artifacts_dir:
            shutil.rmtree(artifacts_dir, ignore_errors=True)


@app.post("/api/convert")
async def api_convert(
    file: Optional[UploadFile] = File(None),
    source_url: Optional[str] = Form(None),
    export: str = Form("markdown"),
    engine: Optional[str] = Form(None),
    save: Optional[bool] = Form(False),
    filename: Optional[str] = Form(None),
):
    """Convert an uploaded file or a source URL to the requested export format.

    Exactly one of ``file`` and ``source_url`` must be given. Markdown output
    has its asset links rewritten to MinIO when MinIO is configured. With
    ``save`` the result is stored in MinIO and its URLs are returned; otherwise
    the converted content is returned inline.

    Returns:
        A ``code/msg/data`` JSON envelope; failures are reported through
        ``_err`` instead of being raised.
    """
    try:
        if (file is None and not source_url) or (file is not None and source_url):
            return _err("参数错误file 与 source_url 二选一")
        export = _normalize_export(export)
        engine = _normalize_engine(engine)
        if source_url:
            enc, content, artifacts_dir = await asyncio.to_thread(_converter_v2.convert, source_url, export=export, engine=engine)
            base = sanitize_filename(filename) if filename else sanitize_filename(infer_basename(source_url, None))
            trace: List[str] = [f"source_url={source_url}", f"export={export}"]
            return _api_convert_finish(enc, content, artifacts_dir, base, export, save, trace, source_url=source_url)
        assert file is not None
        suffix = ""
        if file.filename and "." in file.filename:
            suffix = "." + file.filename.rsplit(".", 1)[-1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(await file.read())
            tmp_path = tmp.name
        try:
            enc, content, artifacts_dir = await asyncio.to_thread(_converter_v2.convert, tmp_path, export=export, engine=engine)
            base = sanitize_filename(filename) if filename else sanitize_filename(infer_basename(None, file.filename))
            trace = [f"file={file.filename}", f"tmp_path={tmp_path}", f"export={export}"]
            return _api_convert_finish(enc, content, artifacts_dir, base, export, save, trace, local_path=tmp_path)
        finally:
            try:
                os.remove(tmp_path)
            except Exception:
                pass
    except HTTPException as e:
        return _err(str(e.detail), 400)
    except Exception as e:
        return _err(str(e))
# 2026-01-13 22:56:22 +08:00 (stray blame timestamp from the web view, not code)
@app.post("/api/pdf/convert")
async def api_pdf_convert(
    file: Optional[UploadFile] = File(None),
    file_path: Optional[str] = Form(None),
    markdown_content: Optional[str] = Form(None),
    toc: bool = Form(False),
    header_text: Optional[str] = Form(None),
    footer_text: Optional[str] = Form(None),
    logo_url: Optional[str] = Form(None),
    copyright_text: Optional[str] = Form(None),
    filename_text: Optional[str] = Form(None),
    cover_src: Optional[str] = Form(None),
    product_name: Optional[str] = Form(None),
    document_name: Optional[str] = Form(None),
    product_version: Optional[str] = Form(None),
    document_version: Optional[str] = Form(None),
    css_name: Optional[str] = Form(None),
    css_text: Optional[str] = Form(None),
    download: bool = Form(True),
):
    """
    Convert Word or Markdown to PDF.

    Supports three input methods, checked in this order:
    1. Upload file (Word .doc/.docx or Markdown .md/.markdown)
    2. Specify file_path (local file path)
    3. Provide markdown_content directly

    The layout options (toc, header/footer text, logo, cover, product and
    document names and versions) are forwarded to the converter; css_name and
    css_text are forwarded for Markdown input only.

    Returns the PDF as a download by default, or a JSON envelope holding the
    base64-encoded PDF when download is false. Failures are reported through
    the error envelope instead of being raised.
    """
    word_exts = {".doc", ".docx"}
    markdown_exts = {".md", ".markdown"}
    # Options shared by every converter call.
    layout = {
        "toc": toc,
        "header_text": header_text,
        "footer_text": footer_text,
        "logo_url": logo_url,
        "copyright_text": copyright_text,
        "cover_src": cover_src,
        "product_name": product_name,
        "document_name": document_name,
        "product_version": product_version,
        "document_version": document_version,
    }
    # Options understood by the Markdown converters only.
    styling = {"css_name": css_name, "css_text": css_text}

    async def _render_file(src: Path, suffix: str, stem: str) -> bytes:
        """Convert a supported Word or Markdown file in a worker thread."""
        title = filename_text or stem
        if suffix in word_exts:
            return await asyncio.to_thread(word_to_pdf_bytes, src, filename_text=title, **layout)
        return await asyncio.to_thread(markdown_file_to_pdf_bytes, src, filename_text=title, **layout, **styling)

    try:
        pdf_bytes: bytes = b""
        output_filename: str = "document.pdf"
        # Determine input source
        if file:
            filename = file.filename or "upload"
            suffix = Path(filename).suffix.lower()
            if suffix not in word_exts | markdown_exts:
                return _err(f"不支持的文件格式: {suffix}。支持的格式: .doc, .docx, .md")
            output_filename = f"{Path(filename).stem}.pdf"
            # mkstemp creates the file atomically, unlike the race-prone mktemp.
            fd, tmp_name = tempfile.mkstemp(suffix=suffix)
            tmp_path = Path(tmp_name)
            try:
                with os.fdopen(fd, "wb") as fh:
                    fh.write(await file.read())
                pdf_bytes = await _render_file(tmp_path, suffix, Path(filename).stem)
            finally:
                try:
                    tmp_path.unlink(missing_ok=True)
                except Exception:
                    pass
        elif file_path:
            path = Path(file_path).expanduser()
            if not path.exists():
                return _err(f"文件不存在: {file_path}")
            suffix = path.suffix.lower()
            output_filename = f"{path.stem}.pdf"
            if suffix not in word_exts | markdown_exts:
                return _err(f"不支持的文件格式: {suffix}。支持的格式: .doc, .docx, .md")
            pdf_bytes = await _render_file(path, suffix, path.stem)
        elif markdown_content:
            output_filename = f"{filename_text or 'document'}.pdf"
            pdf_bytes = await asyncio.to_thread(
                markdown_to_pdf_bytes,
                markdown_content,
                filename_text=filename_text,
                **layout,
                **styling,
            )
        else:
            return _err("必须提供 file、file_path 或 markdown_content 中的一个")
        if not pdf_bytes:
            return _err("PDF 转换失败,未生成内容")
        if download:
            from fastapi.responses import StreamingResponse
            # Header values must be ASCII, so the (possibly Chinese) file name is
            # percent-encoded. It is sent both as the legacy ``filename`` and as the
            # RFC 5987 ``filename*`` form that clients decode back to UTF-8.
            safe_filename = quote(output_filename, safe='')
            return StreamingResponse(
                io.BytesIO(pdf_bytes),
                media_type="application/pdf",
                headers={
                    "Content-Disposition": f"attachment; filename={safe_filename}; filename*=UTF-8''{safe_filename}"
                },
            )
        # Return as base64 in JSON
        import base64
        return _ok({
            "pdf_base64": base64.b64encode(pdf_bytes).decode("ascii"),
            "filename": output_filename,
            "size": len(pdf_bytes),
        })
    except Exception as e:
        logging.exception("PDF conversion error")
        return _err(f"PDF 转换失败: {str(e)}")
# 2026-01-07 17:18:26 +08:00 (stray blame timestamp from the web view, not code)
@app.post("/api/import/convert")
async def api_import_convert(json_file: UploadFile = File(None), json_text: Optional[str] = Form(None), path: Optional[str] = Form(None), versionId: Optional[int] = Form(1001), download: Optional[bool] = Form(False)):
    """Turn a processed-files JSON document into an import tree.

    The JSON is taken from ``json_file``, else ``json_text``, else the file at
    ``path`` (default ``import.json``). Its top level must be an object with a
    ``files`` list, which is handed to ``_build_import_tree``.

    Returns:
        The tree as a downloadable ``import.json`` when ``download`` is set,
        otherwise inside the standard success envelope. Failures are reported
        through the error envelope.
    """
    try:
        if json_file is not None:
            raw_text: Optional[str] = (await json_file.read()).decode("utf-8", errors="ignore")
        elif json_text:
            raw_text = json_text
        else:
            use_path = (path or "import.json").strip()
            p = Path(use_path).expanduser()
            if not p.exists():
                return _err(f"未找到文件: {use_path}")
            raw_text = p.read_text("utf-8", errors="ignore")
        data = json.loads(raw_text or "{}")
        # Valid JSON that is not an object (a list, a number, ...) has no "files".
        files = data.get("files", []) if isinstance(data, dict) else None
        if not isinstance(files, list):
            return _err("JSON结构不合法缺少 files 数组")
        imp = _build_import_tree(files, int(versionId or 1001))
        if download:
            from fastapi.responses import StreamingResponse
            b = json.dumps(imp, ensure_ascii=False, indent=2).encode("utf-8")
            return StreamingResponse(io.BytesIO(b), media_type="application/json; charset=utf-8", headers={"Content-Disposition": "attachment; filename=import.json"})
        return _ok({"import": imp})
    except Exception as e:
        return _err(str(e))
def _upload_archive_publish(
    client: object,
    bucket: str,
    public_base: str,
    use_prefix: str,
    rel_uplift: Path,
    text: str,
    mappings: List[Dict[str, str]],
    encoding: str = "utf-8",
) -> Dict[str, object]:
    """Upload one rewritten Markdown document and return its result record.

    The document is stored as ``<prefix>/rewritten/<rel_uplift>``. The record
    holds the public URL, a 12-hour presigned URL, the decoded display URL,
    the asset mappings, the object name and the encoded size.
    """
    obj = f"{use_prefix.strip('/')}/rewritten/{rel_uplift.as_posix()}".lstrip("/")
    raw = text.encode(encoding)
    client.put_object(bucket_name=bucket, object_name=obj, data=io.BytesIO(raw), length=len(raw), content_type="text/markdown; charset=utf-8")  # type: ignore
    try:
        url = f"{public_base}/{bucket}/{quote(obj, safe='/')}"
    except Exception:
        url = f"{public_base}/{bucket}/{obj}"
    exp = int(timedelta(hours=12).total_seconds())
    ps = presigned_read(client, bucket, obj, exp) if client is not None else None
    return {
        "source": rel_uplift.as_posix(),
        "minio_url": url,
        "minio_presigned_url": ps,
        "minio_url_display": unquote(url),
        "mappings": mappings,
        "object_name": obj,
        "size": len(raw),
    }


@app.post("/api/upload-archive")
async def api_upload_archive(file: UploadFile = File(...), prefix: Optional[str] = Form(None)):
    """Extract an uploaded archive and publish its documents to MinIO.

    Supported archives are ``.zip`` and tar variants. Image assets are
    bulk-uploaded, each Markdown file is rewritten and uploaded, and each HTML
    file without a same-named Markdown sibling is converted to Markdown and
    uploaded the same way. Temporary files and directories are removed.

    Returns:
        A success envelope with ``count``, the per-file records in ``files``
        and the ``import`` tree built from them, or an error envelope.
    """
    try:
        client, bucket, public_base, env_prefix = minio_current(RUNTIME_CONFIG)
        if client is None or bucket is None or not public_base:
            return _err("MinIO 未配置")
        use_prefix = (prefix or env_prefix or "").strip()
        # Keep only the base name: it becomes the temp-file suffix and must not
        # contain path separators taken from the client-supplied file name.
        suffix = (file.filename or "").replace("\\", "/").rsplit("/", 1)[-1].lower()
        archive_path: Optional[str] = None
        root: Optional[Path] = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                archive_path = tmp.name
                tmp.write(await file.read())
            root = Path(tempfile.mkdtemp(prefix="extract_"))
            if suffix.endswith(".zip"):
                import zipfile
                with zipfile.ZipFile(archive_path, "r") as zf:
                    _zip_extract_safely(zf, root)
            elif ".tar" in suffix or suffix.endswith(".tgz") or suffix.endswith(".tar.gz") or suffix.endswith(".tar.bz2") or suffix.endswith(".tar.xz"):
                import tarfile
                with tarfile.open(archive_path, "r:*") as tf:
                    _tar_extract_safely(tf, root)
            else:
                return _err("不支持的压缩格式")
            try:
                _bulk_upload_assets(root, client, bucket, public_base, use_prefix)
            except Exception:
                logging.exception("bulk asset upload failed for archive")
            files: List[Dict[str, object]] = []
            # Process Markdown files as-is
            for md in root.rglob("*.md"):
                try:
                    text = md.read_text("utf-8", errors="ignore")
                    new_text, mappings = _rewrite_md_assets_to_minio(text, md.parent, client, bucket, public_base, use_prefix, search_root=root)
                    rel_uplift = _uplift_rel_path(md.relative_to(root), md.parent, root, mappings)
                    files.append(_upload_archive_publish(client, bucket, public_base, use_prefix, rel_uplift, new_text, mappings))
                except Exception:
                    files.append({"source": (md.relative_to(root).as_posix()), "minio_url": None, "minio_presigned_url": None, "mappings": [], "object_name": None, "size": 0})
            # Convert HTML files to Markdown and process similarly
            for html in [p for p in root.rglob("*") if p.is_file() and p.suffix.lower() in {".html", ".htm"}]:
                # Skip if a sibling Markdown already exists for the same base name
                md_target_rel = html.relative_to(root).with_suffix(".md")
                if (root / md_target_rel).exists():
                    continue
                tmpd: Optional[Path] = None
                art_dir = None
                try:
                    html_src = html.read_text("utf-8", errors="ignore")
                    html_rew, mappings = _rewrite_md_assets_to_minio(html_src, html.parent, client, bucket, public_base, use_prefix, search_root=root)
                    tmpd = Path(tempfile.mkdtemp(prefix="rew_html_"))
                    tmpf = tmpd / html.name
                    tmpf.write_text(html_rew, "utf-8")
                    enc, md_text, art_dir = _converter_v2.convert(str(tmpf), export="markdown")
                    md_text2, mappings2 = _rewrite_md_assets_to_minio(md_text, html.parent, client, bucket, public_base, use_prefix, search_root=root)
                    mappings = (mappings or []) + (mappings2 or [])
                    rel_uplift = _uplift_rel_path(md_target_rel, html.parent, root, mappings)
                    files.append(_upload_archive_publish(client, bucket, public_base, use_prefix, rel_uplift, md_text2, mappings, encoding=enc or "utf-8"))
                except Exception:
                    files.append({"source": (html.relative_to(root).as_posix()), "minio_url": None, "minio_presigned_url": None, "mappings": [], "object_name": None, "size": 0})
                finally:
                    # Drop this file's scratch directory and the converter's artifacts.
                    for leftover in (tmpd, art_dir):
                        if leftover:
                            shutil.rmtree(leftover, ignore_errors=True)
            imp = _build_import_tree(files, 1001)
            return _ok({"count": len(files), "files": files, "import": imp})
        finally:
            if archive_path is not None:
                try:
                    os.unlink(archive_path)
                except Exception:
                    pass
            if root is not None:
                shutil.rmtree(root, ignore_errors=True)
    except Exception as e:
        return _err(str(e))
# In-memory registry of staged archive uploads, keyed by staging id. Each value
# holds the temp file path under "path" and the requested prefix under "prefix".
STAGED_ARCHIVES: Dict[str, Dict[str, object]] = {}
def _build_import_tree(processed: List[Dict[str, object]], version_id: int) -> Dict[str, object]:
def ensure_folder(children: list, name: str) -> Dict[str, object]:
for n in children:
if isinstance(n, dict) and n.get("name") == name and n.get("type") == "FOLDER":
return n
node = {"name": name, "type": "FOLDER", "children": [], "sortOrder": 100}
children.append(node)
return node
tree: List[Dict[str, object]] = []
for idx, f in enumerate(processed):
src = str(f.get("source") or "")
obj = str(f.get("object_name") or "")
size = int(f.get("size") or 0)
parts = [p for p in src.split("/") if p]
if not parts:
continue
cur = tree
for d in parts[:-1]:
folder = ensure_folder(cur, d)
cur = folder.setdefault("children", []) # type: ignore
fname = parts[-1]
base = fname.rsplit(".", 1)[0]
file_node = {"name": base, "type": "FILE", "sortOrder": 100 + idx, "files": [{"languageId": 1, "objectName": obj, "fileName": fname, "fileSize": size}]}
cur.append(file_node) # type: ignore
return {"versionId": version_id, "tree": tree}
@app.post("/api/archive/stage")
async def api_archive_stage(file: UploadFile = File(...), prefix: Optional[str] = Form(None)):
    """Store an uploaded archive in a temp file and register it for later processing.

    The temp file keeps the lower-cased base name of the upload as its suffix.
    The file path and the requested prefix are recorded in ``STAGED_ARCHIVES``
    under a new random id.

    Returns:
        A success envelope with the staging ``id``, the original ``name`` and
        the ``size`` in bytes, or an error envelope when staging fails.
    """
    try:
        # Keep only the base name so a client-supplied path cannot break temp-file creation.
        suffix = (file.filename or "").replace("\\", "/").rsplit("/", 1)[-1].lower()
        data = await file.read()
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        try:
            with tmp:
                tmp.write(data)
        except Exception:
            # Do not leave a half-written file behind when staging fails.
            try:
                os.unlink(tmp.name)
            except OSError:
                pass
            raise
        sid = uuid.uuid4().hex
        STAGED_ARCHIVES[sid] = {"path": tmp.name, "prefix": (prefix or "")}
        return _ok({"id": sid, "name": file.filename, "size": len(data)})
    except Exception as e:
        return _err(str(e))
@app.post("/api/archive/process")
async def api_archive_process(id: str = Form(...), prefix: Optional[str] = Form(None), versionId: Optional[int] = Form(1001)):
    """Extract a staged archive, rewrite its documents and publish them to MinIO.

    Steps:
      1. Look up the staged archive by ``id`` and unpack it (zip or tar
         family) into a scratch directory.
      2. Bulk-upload the extracted assets (best effort).
      3. Rewrite asset links in every ``*.md`` file and upload the result
         under ``<prefix>/rewritten/``.
      4. Convert every ``.html``/``.htm`` file that has no sibling ``.md`` to
         Markdown, rewrite its links and upload it the same way.

    A failure on a single document is recorded as an entry with empty URLs
    instead of aborting the whole request. Once extraction has started, the
    staged file, the scratch directory and the ``STAGED_ARCHIVES`` entry are
    removed regardless of the outcome.

    Args:
        id: Key of the staged archive in ``STAGED_ARCHIVES``.
        prefix: Optional object-name prefix; falls back to the prefix stored
            with the staged archive, then to the configured one.
        versionId: Version id forwarded to ``_build_import_tree``
            (1001 when empty).

    Returns:
        ``_ok`` with ``count``, ``files`` and ``import``; ``_err`` with an
        error message otherwise.
    """
    log = logging.getLogger(__name__)
    try:
        st = STAGED_ARCHIVES.get(id)
        if not st:
            return _err("未找到已上传的压缩包")
        tmp_path = Path(str(st.get("path")))
        use_prefix_param = (prefix or str(st.get("prefix") or "")).strip()
        client, bucket, public_base, env_prefix = minio_current(RUNTIME_CONFIG)
        if client is None or bucket is None or not public_base:
            return _err("MinIO 未配置")
        use_prefix = (use_prefix_param or env_prefix or "").strip()
        presign_ttl = int(timedelta(hours=12).total_seconds())

        def _publish(rel_path: Path, md_text: str, mappings) -> Dict[str, object]:
            """Upload one rewritten Markdown document and describe the result."""
            obj = f"{use_prefix.strip('/')}/rewritten/{rel_path.as_posix()}".lstrip("/")
            # The object is declared as UTF-8 below, so always encode as UTF-8.
            raw = md_text.encode("utf-8")
            client.put_object(bucket_name=bucket, object_name=obj, data=io.BytesIO(raw), length=len(raw), content_type="text/markdown; charset=utf-8")  # type: ignore
            try:
                from urllib.parse import quote as _quote
                url = f"{public_base}/{bucket}/{_quote(obj, safe='/')}"
            except Exception:
                # Fall back to the unescaped object name if quoting fails.
                url = f"{public_base}/{bucket}/{obj}"
            ps = presigned_read(client, bucket, obj, presign_ttl)
            return {"source": rel_path.as_posix(), "minio_url": url, "minio_presigned_url": ps, "mappings": mappings, "object_name": obj, "size": len(raw)}

        def _failed(rel_path: Path) -> Dict[str, object]:
            """Build the placeholder entry for a document that could not be processed."""
            return {"source": rel_path.as_posix(), "minio_url": None, "minio_presigned_url": None, "mappings": [], "object_name": None, "size": 0}

        root = Path(tempfile.mkdtemp(prefix="extract_"))
        try:
            # The archive type is derived from the staged file's name.
            sfx = tmp_path.name.lower()
            if sfx.endswith(".zip"):
                import zipfile
                with zipfile.ZipFile(str(tmp_path), "r") as zf:
                    _zip_extract_safely(zf, root)
            elif ".tar" in sfx or sfx.endswith(".tgz"):
                import tarfile
                with tarfile.open(str(tmp_path), "r:*") as tf:
                    _tar_extract_safely(tf, root)
            else:
                return _err("不支持的压缩格式")

            try:
                _bulk_upload_assets(root, client, bucket, public_base, use_prefix)
            except Exception:
                # Best effort: documents are still rewritten and uploaded below.
                log.warning("bulk asset upload failed for archive %s", id, exc_info=True)

            processed: List[Dict[str, object]] = []

            # Process existing Markdown files
            for md in root.rglob("*.md"):
                try:
                    text = md.read_text("utf-8", errors="ignore")
                    new_text, mappings = _rewrite_md_assets_to_minio(text, md.parent, client, bucket, public_base, use_prefix, search_root=root)
                    rel_uplift = _uplift_rel_path(md.relative_to(root), md.parent, root, mappings)
                    processed.append(_publish(rel_uplift, new_text, mappings))
                except Exception:
                    log.warning("failed to process markdown file %s", md, exc_info=True)
                    processed.append(_failed(md.relative_to(root)))

            # Convert HTML files to Markdown and process
            html_files = [p for p in root.rglob("*") if p.is_file() and p.suffix.lower() in {".html", ".htm"}]
            for html in html_files:
                # Reset per iteration so the cleanup below never touches an
                # unbound name or a directory left over from a previous file.
                tmpd: Optional[Path] = None
                try:
                    md_target_rel = html.relative_to(root).with_suffix(".md")
                    if (root / md_target_rel).exists():
                        # A sibling Markdown file was already handled above.
                        continue
                    html_src = html.read_text("utf-8", errors="ignore")
                    html_rew, mappings = _rewrite_md_assets_to_minio(html_src, html.parent, client, bucket, public_base, use_prefix, search_root=root)
                    tmpd = Path(tempfile.mkdtemp(prefix="rew_html_"))
                    tmpf = tmpd / html.name
                    tmpf.write_text(html_rew, "utf-8")
                    _enc, md_text, _art = _converter_v2.convert(str(tmpf), export="markdown")
                    # Second pass: rewrite asset references in the generated Markdown.
                    md_text2, mappings2 = _rewrite_md_assets_to_minio(md_text, html.parent, client, bucket, public_base, use_prefix, search_root=root)
                    mappings = (mappings or []) + (mappings2 or [])
                    rel_uplift = _uplift_rel_path(md_target_rel, html.parent, root, mappings)
                    processed.append(_publish(rel_uplift, md_text2, mappings))
                except Exception:
                    log.warning("failed to process html file %s", html, exc_info=True)
                    processed.append(_failed(html.relative_to(root)))
                finally:
                    if tmpd is not None:
                        shutil.rmtree(tmpd, ignore_errors=True)

            imp = _build_import_tree(processed, int(versionId or 1001))
            return _ok({"count": len(processed), "files": processed, "import": imp})
        finally:
            try:
                os.unlink(str(tmp_path))
            except Exception:
                pass
            try:
                shutil.rmtree(root)
            except Exception:
                pass
            try:
                STAGED_ARCHIVES.pop(id, None)
            except Exception:
                pass
    except Exception as e:
        return _err(str(e))
@app.post("/api/upload-list")
async def api_upload_list(list_file: UploadFile = File(...), prefix: Optional[str] = Form(None), versionId: Optional[int] = Form(1001)):
    """Rewrite and publish the local files named in an uploaded path list.

    The uploaded text file holds one path per line. Blank lines, lines
    starting with ``#`` and ``http(s)://`` entries are ignored, as are paths
    that do not point to an existing regular file. Each remaining file has its
    asset links rewritten and is uploaded under ``<prefix>/rewritten/``, keeping
    its path relative to the common parent directory of all listed files.

    NOTE(review): every readable file named in the list is read from the
    server's filesystem and published to MinIO; confirm this endpoint is only
    reachable by trusted callers.

    Args:
        list_file: Uploaded text file containing the paths.
        prefix: Optional object-name prefix; falls back to the configured one.
        versionId: Version id forwarded to ``_build_import_tree``
            (1001 when empty).

    Returns:
        ``_ok`` with ``count``, ``files`` and ``import``; ``_err`` with an
        error message otherwise.
    """
    try:
        client, bucket, public_base, env_prefix = minio_current(RUNTIME_CONFIG)
        if client is None or bucket is None or not public_base:
            return _err("MinIO 未配置")
        use_prefix = (prefix or env_prefix or "").strip()
        raw = await list_file.read()
        # utf-8-sig drops a leading BOM, which would otherwise corrupt the first path.
        text = raw.decode("utf-8-sig", errors="ignore")

        local_files: List[Path] = []
        for line in text.splitlines():
            entry = line.strip()
            if not entry or entry.startswith("#"):
                continue
            if entry.startswith(("http://", "https://")):
                # Remote entries are not handled by this endpoint.
                continue
            try:
                candidate = Path(entry).expanduser()
                if candidate.is_file():
                    local_files.append(candidate.resolve())
            except (OSError, RuntimeError, ValueError):
                # An unusable entry is skipped like a missing file.
                continue

        base_root: Optional[Path] = None
        if local_files:
            try:
                # Use the parent directories: with a single listed file the
                # common path of the files themselves would be the file, not
                # a directory, and the relative path would collapse to ".".
                base_root = Path(os.path.commonpath([str(f.parent) for f in local_files]))
            except ValueError:
                base_root = None

        presign_ttl = int(timedelta(hours=12).total_seconds())
        processed: List[Dict[str, object]] = []
        for p in local_files:
            try:
                content = p.read_text("utf-8", errors="ignore")
                new_text, mappings = _rewrite_md_assets_to_minio(content, p.parent, client, bucket, public_base, use_prefix, search_root=base_root)
                rel0 = p.relative_to(base_root) if base_root is not None else Path(p.name)
                rel_uplift = _uplift_rel_path(rel0, p.parent, base_root, mappings)
                obj = f"{use_prefix.strip('/')}/rewritten/{rel_uplift.as_posix()}".lstrip("/")
                raw_md = new_text.encode("utf-8")
                client.put_object(bucket_name=bucket, object_name=obj, data=io.BytesIO(raw_md), length=len(raw_md), content_type="text/markdown; charset=utf-8")  # type: ignore
                try:
                    from urllib.parse import quote as _quote
                    url = f"{public_base}/{bucket}/{_quote(obj, safe='/')}"
                except Exception:
                    # Fall back to the unescaped object name if quoting fails.
                    url = f"{public_base}/{bucket}/{obj}"
                ps = presigned_read(client, bucket, obj, presign_ttl)
                processed.append({"source": rel_uplift.as_posix(), "minio_url": url, "minio_presigned_url": ps, "mappings": mappings, "object_name": obj, "size": len(raw_md)})
            except Exception:
                # Keep going: a failed file is reported with empty URLs.
                processed.append({"source": p.name, "minio_url": None, "minio_presigned_url": None, "mappings": [], "object_name": None, "size": 0})
        imp = _build_import_tree(processed, int(versionId or 1001))
        return _ok({"count": len(processed), "files": processed, "import": imp})
    except Exception as e:
        return _err(str(e))
@app.get("/config/minio/policy")
async def get_minio_policy(bucket: Optional[str] = None):
    """Return the access policy of a MinIO bucket.

    The policy is first requested through ``client.get_bucket_policy``. If
    that call raises, the same information is requested with a raw
    ``GET ?policy`` through the client's private ``_url_open`` helper.

    Args:
        bucket: Bucket to inspect; defaults to the configured bucket.

    Returns:
        ``{"ok": True, "bucket": ..., "policy": ...}`` where ``policy`` is the
        parsed JSON document (or ``{"raw": <text>}`` when it cannot be
        parsed), or ``{"ok": False, "bucket": ..., "error": ...}`` when the
        fallback request fails as well.

    Raises:
        HTTPException: 400 when MinIO is not configured or no bucket name is
            available.
    """
    client, cfg_bucket, _, _ = minio_current(RUNTIME_CONFIG)
    if client is None:
        raise HTTPException(status_code=400, detail="MinIO 未配置")
    bkt = (bucket or cfg_bucket or "").strip()
    if not bkt:
        raise HTTPException(status_code=400, detail="bucket 不能为空")

    def _parse_policy(payload):
        # Anything that is not valid JSON is handed back verbatim.
        try:
            import json as _json
            return _json.loads(payload)
        except Exception:
            return {"raw": payload}

    try:
        policy_text = client.get_bucket_policy(bucket_name=bkt)  # type: ignore
    except Exception:
        # Fall through to the raw request below.
        pass
    else:
        return {"ok": True, "bucket": bkt, "policy": _parse_policy(policy_text)}

    try:
        try:
            region = client._get_region(bkt)  # type: ignore
        except Exception:
            region = "us-east-1"
        resp = client._url_open(method="GET", region=region, bucket_name=bkt, query_params={"policy": ""})  # type: ignore
        # Prefer the already-buffered ``data`` attribute when the response has one.
        try:
            body = getattr(resp, "data", None)
            if body is not None and hasattr(body, "decode"):
                body = body.decode("utf-8")
        except Exception:
            body = None
        if body is None:
            try:
                body = resp.read().decode("utf-8")  # type: ignore
            except Exception:
                body = ""
        return {"ok": True, "bucket": bkt, "policy": _parse_policy(body)}
    except Exception as e2:
        return {"ok": False, "bucket": bkt, "error": str(e2)}
@app.post("/config/minio/apply_public_read")
async def apply_public_read(bucket: Optional[str] = Form(None), enable: Optional[str] = Form("true")):
    """Enable or disable anonymous read access on a MinIO bucket.

    When ``enable`` is one of ``1``/``true``/``yes``/``on`` (case-insensitive),
    a policy granting ``s3:GetBucketLocation``, ``s3:ListBucket`` and
    ``s3:GetObject`` to every principal is applied, first through
    ``client.set_bucket_policy`` and, if that raises, through a raw
    ``PUT ?policy`` request. For any other value the bucket policy is deleted;
    errors from that deletion are ignored.

    Args:
        bucket: Target bucket; defaults to the configured bucket.
        enable: Flag text selecting between applying and removing the policy.

    Returns:
        ``{"ok": True, "bucket": ..., "applied": <bool>}`` on success, or
        ``{"ok": False, "bucket": ..., "error": ...}`` on failure.

    Raises:
        HTTPException: 400 when MinIO is not configured or no bucket name is
            available.
    """
    client, cfg_bucket, _, _ = minio_current(RUNTIME_CONFIG)
    if client is None:
        raise HTTPException(status_code=400, detail="MinIO 未配置")
    bkt = (bucket or cfg_bucket or "").strip()
    if not bkt:
        raise HTTPException(status_code=400, detail="bucket 不能为空")
    try:
        import json as _json
        wants_public = str(enable or "true").lower() in {"1","true","yes","on"}
        if not wants_public:
            # Disabling: drop whatever policy is set; a failure here is ignored.
            try:
                client.delete_bucket_policy(bkt)  # type: ignore
            except Exception:
                pass
            return {"ok": True, "bucket": bkt, "applied": False}

        bucket_statement = {"Effect": "Allow", "Principal": "*", "Action": ["s3:GetBucketLocation", "s3:ListBucket"], "Resource": [f"arn:aws:s3:::{bkt}"]}
        object_statement = {"Effect": "Allow", "Principal": "*", "Action": ["s3:GetObject"], "Resource": [f"arn:aws:s3:::{bkt}/*"]}
        policy = {"Version": "2012-10-17", "Statement": [bucket_statement, object_statement]}
        try:
            client.set_bucket_policy(bucket_name=bkt, policy=_json.dumps(policy))  # type: ignore
        except Exception:
            # Fallback: send the policy with a raw PUT through the client's private API.
            try:
                try:
                    region = client._get_region(bkt)  # type: ignore
                except Exception:
                    region = "us-east-1"
                payload = _json.dumps(policy).encode("utf-8")
                client._url_open(method="PUT", region=region, bucket_name=bkt, query_params={"policy": ""}, body=payload)  # type: ignore
            except Exception as e2:
                return {"ok": False, "bucket": bkt, "error": str(e2)}
        return {"ok": True, "bucket": bkt, "applied": True}
    except Exception as e:
        return {"ok": False, "bucket": bkt, "error": str(e)}