From 0b07e63b7605f4da3cd292680874097496df2c57 Mon Sep 17 00:00:00 2001 From: seekee Date: Wed, 7 Jan 2026 17:18:26 +0800 Subject: [PATCH] Import project files --- .gitignore | 16 + .vscode/settings.json | 3 + Dockerfile | 58 + api.md | 238 ++ docker本地构建与打包.md | 141 + docling/README.zh-CN.md | 154 + docling/app/__init__.py | 1 + docling/app/configs/dm.json | 17 + docling/app/configs/linkmap/linkmap.json | 1 + docling/app/configs/profiles/active.json | 17 + docling/app/configs/profiles/default2.json | 17 + docling/app/configs/profiles/local2.json | 17 + docling/app/configs/profiles/localhost.json | 17 + docling/app/configs/profiles/localhost3.json | 17 + docling/app/configs/profiles/test.json | 17 + docling/app/configs/styles/default.css | 88 + docling/app/configs/test02.json | 17 + docling/app/server.py | 2993 +++++++++++++++++ docling/app/services/__init__.py | 1 + docling/app/services/docling_adapter.py | 709 ++++ docling/app/services/minio_utils.py | 190 ++ docling/app/services/unified_converter.py | 492 +++ docling/app/services/word2markdown.py | 429 +++ docling/app/tests/run_batch_upload_debug.py | 80 + docling/app/tests/run_convert_folder_debug.py | 75 + docling/app/tests/run_edge_cases_debug.py | 97 + docling/app/tests/run_minio_object_debug.py | 77 + docling/app/tests/run_minio_presign_debug.py | 50 + docling/app/tests/run_slash_path_debug.py | 74 + docling/app/tests/test_api_convert.py | 29 + .../app/tests/test_batch_upload_edge_cases.py | 113 + .../app/tests/test_batch_upload_endpoints.py | 185 + docling/app/tests/test_md_to_docx.py | 53 + .../tests/test_word2markdown_inline_images.py | 51 + docling/docling | 1 + docling/requirements.txt | 28 + docling/tests/debug_api.py | 17 + docling/tests/test_api_prd.py | 131 + frontend/.gitignore | 24 + frontend/.vscode/extensions.json | 3 + frontend/README.md | 5 + frontend/index.html | 13 + frontend/package-lock.json | 1454 ++++++++ frontend/package.json | 23 + frontend/public/vite.svg | 1 + frontend/src/App.vue | 
101 + frontend/src/assets/vue.svg | 1 + frontend/src/components/BatchProcess.vue | 448 +++ frontend/src/components/ConfigModal.vue | 494 +++ frontend/src/components/DocToMd.vue | 337 ++ frontend/src/components/HelloWorld.vue | 41 + frontend/src/components/MdToDoc.vue | 384 +++ frontend/src/main.ts | 5 + frontend/src/services/api.ts | 305 ++ frontend/src/style.css | 72 + frontend/tests/check_frontend_prd.mjs | 14 + frontend/tsconfig.app.json | 16 + frontend/tsconfig.json | 7 + frontend/tsconfig.node.json | 26 + frontend/vite.config.ts | 47 + import.json | 129 + k8s/deployment.yaml | 51 + package_offline.sh | 60 + prd.md | 63 + 修改总结.md | 460 +++ 批量导入目录树.json | 202 ++ 66 files changed, 11497 insertions(+) create mode 100644 .gitignore create mode 100644 .vscode/settings.json create mode 100644 Dockerfile create mode 100644 api.md create mode 100644 docker本地构建与打包.md create mode 100644 docling/README.zh-CN.md create mode 100644 docling/app/__init__.py create mode 100644 docling/app/configs/dm.json create mode 100644 docling/app/configs/linkmap/linkmap.json create mode 100644 docling/app/configs/profiles/active.json create mode 100644 docling/app/configs/profiles/default2.json create mode 100644 docling/app/configs/profiles/local2.json create mode 100644 docling/app/configs/profiles/localhost.json create mode 100644 docling/app/configs/profiles/localhost3.json create mode 100644 docling/app/configs/profiles/test.json create mode 100644 docling/app/configs/styles/default.css create mode 100644 docling/app/configs/test02.json create mode 100644 docling/app/server.py create mode 100644 docling/app/services/__init__.py create mode 100644 docling/app/services/docling_adapter.py create mode 100644 docling/app/services/minio_utils.py create mode 100644 docling/app/services/unified_converter.py create mode 100644 docling/app/services/word2markdown.py create mode 100644 docling/app/tests/run_batch_upload_debug.py create mode 100644 docling/app/tests/run_convert_folder_debug.py 
create mode 100644 docling/app/tests/run_edge_cases_debug.py create mode 100644 docling/app/tests/run_minio_object_debug.py create mode 100644 docling/app/tests/run_minio_presign_debug.py create mode 100644 docling/app/tests/run_slash_path_debug.py create mode 100644 docling/app/tests/test_api_convert.py create mode 100644 docling/app/tests/test_batch_upload_edge_cases.py create mode 100644 docling/app/tests/test_batch_upload_endpoints.py create mode 100644 docling/app/tests/test_md_to_docx.py create mode 100644 docling/app/tests/test_word2markdown_inline_images.py create mode 160000 docling/docling create mode 100644 docling/requirements.txt create mode 100644 docling/tests/debug_api.py create mode 100644 docling/tests/test_api_prd.py create mode 100644 frontend/.gitignore create mode 100644 frontend/.vscode/extensions.json create mode 100644 frontend/README.md create mode 100644 frontend/index.html create mode 100644 frontend/package-lock.json create mode 100644 frontend/package.json create mode 100644 frontend/public/vite.svg create mode 100644 frontend/src/App.vue create mode 100644 frontend/src/assets/vue.svg create mode 100644 frontend/src/components/BatchProcess.vue create mode 100644 frontend/src/components/ConfigModal.vue create mode 100644 frontend/src/components/DocToMd.vue create mode 100644 frontend/src/components/HelloWorld.vue create mode 100644 frontend/src/components/MdToDoc.vue create mode 100644 frontend/src/main.ts create mode 100644 frontend/src/services/api.ts create mode 100644 frontend/src/style.css create mode 100644 frontend/tests/check_frontend_prd.mjs create mode 100644 frontend/tsconfig.app.json create mode 100644 frontend/tsconfig.json create mode 100644 frontend/tsconfig.node.json create mode 100644 frontend/vite.config.ts create mode 100644 import.json create mode 100644 k8s/deployment.yaml create mode 100644 package_offline.sh create mode 100644 prd.md create mode 100644 修改总结.md create mode 100644 批量导入目录树.json diff --git 
a/.gitignore b/.gitignore new file mode 100644 index 0000000..5344a96 --- /dev/null +++ b/.gitignore @@ -0,0 +1,16 @@ +# OS +.DS_Store + +# Node/Vite +node_modules/ +frontend/dist/ + +# Python +__pycache__/ +*.pyc +.env + +# Local archives (do not push huge files) +FunMD_Convert.tar +FunMD_Convert_Image.tar + diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..3b66410 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "git.ignoreLimitWarning": true +} \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..8bdfd9a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,58 @@ +########## Frontend build stage ########## +FROM node:20-alpine AS frontend +WORKDIR /frontend + +# Install dependencies and build +COPY frontend/package*.json ./ +RUN npm ci +COPY frontend/ . + +# Allow overriding API base at build time if needed +ARG VITE_API_BASE_URL= +ENV VITE_API_BASE_URL=${VITE_API_BASE_URL} + +RUN npm run build + +########## Backend runtime stage ########## +FROM python:3.10-slim + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + libgl1 \ + libglib2.0-0 \ + pandoc \ + libreoffice \ + fonts-noto \ + fonts-noto-cjk \ + && rm -rf /var/lib/apt/lists/* + +# Copy backend requirements and install +COPY docling/requirements.txt ./requirements.txt +ARG PIP_INDEX_URL +RUN if [ -n "$PIP_INDEX_URL" ]; then pip install --no-cache-dir -i "$PIP_INDEX_URL" --timeout 120 -r requirements.txt; else pip install --no-cache-dir --timeout 120 -r requirements.txt; fi + +# Copy backend code +COPY docling/ /app/docling/ + +# Copy built frontend into expected location +COPY --from=frontend /frontend/dist /app/frontend/dist + +# Prefetch models for offline use +ENV DOCLING_CACHE_DIR=/root/.cache/docling +ENV PYTHONPATH=/app:/app/docling:/app/docling/docling +RUN python - <<'PY' +from docling.utils.model_downloader import download_models 
+print('Prefetching Docling models (layout, table, picture-classifier, code-formula, rapidocr)...') +download_models(progress=False) +print('Models downloaded.') +PY + +# Expose port +ENV PORT=8000 +EXPOSE 8000 + +# Start backend (serves API and /ui) +CMD ["uvicorn", "app.server:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/api.md b/api.md new file mode 100644 index 0000000..fb9af3e --- /dev/null +++ b/api.md @@ -0,0 +1,238 @@ +# FunMD 文档处理接口与测试说明 + +## 基本信息 + +- 基地址(内网):`http://192.168.110.58:8000` +- 前端内网测试链接:`http://192.168.110.58:8000/ui/` +- 统一返回结构(API v2):成功 `{"code":0,"msg":"ok","data":{...}}`,失败 `{"code":<错误码>,"msg":<错误>,"data":null}`(HTTP 状态保持 200)。 +- 建议前端设置:`localStorage.setItem('app.api.base','http://192.168.110.58:8000')` +- 重要约定:当 MinIO 桶为私有时,优先使用返回的 `minio_presigned_url` 进行下载;直链 `minio_url` 可能 403。 + +## 接口列表 + +### 健康检查 + +- 方法:`GET /health` +- 返回:`{"status":"ok"}` +- 参考:`docling/app/server.py:99` + +### 统一转换:DOCX/PDF → Markdown/HTML/JSON + +- 方法:`POST /api/convert` +- 表单字段: + - `file` 与 `source_url` 二选一 + - `export`: `markdown|html|json|doctags`,默认 `markdown` + - `engine`(可选):`word2markdown|docling` + - `save`(可选):`true|false` + - `filename`(可选):输出基名 +- 返回: + - 未保存:`data.content` 为文本,`data.media_type` 指示类型 + - 已保存:`data.minio_url` 与 `data.minio_presigned_url` +- 示例: + ```bash + # 本地 PDF 转 Markdown(不保存) + curl -X POST http://192.168.110.58:8000/api/convert \ + -F file=@/path/to/file.pdf \ + -F export=markdown + + # 远程 URL 转 HTML(保存) + curl -X POST http://192.168.110.58:8000/api/convert \ + -F source_url=https://example.com/page.pdf \ + -F export=html -F save=true -F filename=example + ``` +- 参考:`docling/app/server.py:2296` + +### Markdown → DOCX/PDF(高级样式支持) + +- 方法:`POST /md/convert` +- 输入三选一:`md_file` | `markdown_text` | `markdown_url` +- 必填:`target=docx|pdf` +- 可选(高级设置): + - 布局:`css_name`、`css_text`、`toc=true|false`、`header_text`、`footer_text` + - 封面与 Logo:`cover_url|cover_file`、`logo_url|logo_file` + - 
封面文字:`product_name|document_name|product_version|document_version` + - 版权:`copyright_text` + - 保存:`save=true|false` +- 行为说明: + - `save=false` 时,封面/Logo 会内嵌为 `data:`,避免私有桶 403;`save=true` 时返回 MinIO 链接。 +- 例: + ```bash + # 文本转 PDF(封面、Logo、目录、页眉页脚) + curl -X POST http://192.168.110.58:8000/md/convert \ + -F markdown_text=$'# 标题\n\n内容' \ + -F target=pdf -F toc=true \ + -F header_text='Internal' -F footer_text='Confidential' \ + -F product_name='CMS' -F document_name='周报' \ + -F product_version='v1.0' -F document_version='2025-W48' \ + -F cover_file=@/path/to/cover.png -F logo_file=@/path/to/logo.png + + # 文件转 DOCX(保存到 MinIO) + curl -X POST http://192.168.110.58:8000/md/convert \ + -F md_file=@/path/to/doc.md \ + -F target=docx -F save=true -F filename='周报' + ``` +- 参考:`docling/app/server.py:1198` + +### 本地文件夹批量处理(重写 MD 资源并上传) + +- 方法:`POST /md/convert-folder` +- 表单字段: + - `folder_path`(必填):本地文件夹绝对路径(后端机器) + - `prefix`(可选):MinIO 前缀(如 `assets`) +- 返回:`{ ok, count, files: [{ source, minio_url, minio_presigned_url, asset_ok, asset_fail, mappings }] }` +- 示例: + ```bash + curl -X POST http://192.168.110.58:8000/md/convert-folder \ + -F folder_path='/Users/fanyang/Desktop/Others/CMS/达梦数据-各类示范文档/数+产品手册-MD源文件/DMDRS_Build_Manual_DM8' \ + -F prefix='assets' + ``` +- 参考:`docling/app/server.py:2075` + +### 上传压缩包批量处理 + +- 方法:`POST /api/upload-archive` +- 表单字段:`file`(zip/tar.gz/tgz),`prefix`(可选) +- 返回:`{ code, msg, data: { count, files: [{ source, minio_url, minio_presigned_url, mappings }] } }` +- 示例: + ```bash + curl -X POST http://192.168.110.58:8000/api/upload-archive \ + -F file=@/path/to/archive.zip -F prefix='assets' + ``` +- 参考:`docling/app/server.py:2571` + +### 归档分阶段处理 + +- 暂存上传:`POST /api/archive/stage`,返回 `{ id, name, size }` +- 批量处理:`POST /api/archive/process`,字段:`id`(必填)、`prefix`(可选)、`versionId`(可选) +- 说明:HTML 文件按“两阶段重写”策略处理(HTML 资源上传到 MinIO 并重写 → 转换为 Markdown → 再次重写 MD 中的资源与链接),支持 `data:image/*;base64,` 图片上传并替换为 MinIO 链接 +- 参考:`docling/app/server.py:2714,2728` + +### 
MinIO 配置与测试 + +- 设置配置:`POST /config/minio` + - 字段:`endpoint`、`public`、`access`、`secret`、`bucket`、`secure=true|false`、`prefix`、`store_final=true|false`、`public_read=true|false` + - 示例: + ```bash + curl -X POST http://192.168.110.58:8000/config/minio \ + -F endpoint='127.0.0.1:9000' -F public='127.0.0.1:9000' \ + -F access='minioadmin' -F secret='minioadmin123' \ + -F bucket='doctest' -F secure=false -F prefix='assets' \ + -F store_final=true -F public_read=true + ``` + - 注意:请使用 MinIO API 端口 `9000`(而非 `9001` 控制台端口);若填写控制台地址或 `:9001` 将被拒绝 + - 参考:`docling/app/server.py:488` + +- 连通测试并应用策略:`POST /config/minio/test` + - 同上字段,额外可携带 `create_if_missing=true` + - 返回:`{ ok, connected, bucket_exists, created, error?, hint? }` + - 参考:`docling/app/server.py:577` + +- 获取配置快照:`GET /config`(参考:`docling/app/server.py:1047`) +- 配置档案:`GET /config/profiles`、`POST /config/save_profile`、`GET /config/load_profile?name=xxx`(参考:`docling/app/server.py:1058,1068,1084`) + +### 系统时间检查(MinIO 时间偏差排查) + +- 方法:`GET /system/time/check` +- 查询参数:`endpoint`、`public`、`secure`(可选,不传则使用当前运行时配置) +- 返回:`{ ok, server_time, local_time, diff_sec, hint }` +- 参考:`docling/app/server.py:720` + +### 资源映射与代理下载(可选) + +- Linkmap:`GET /config/linkmap`、`POST /config/linkmap`(参考:`docling/app/server.py:1583,1587`) +- 代理下载:`POST /proxy/download`(参考:`docling/app/server.py:1635`) + +## 前端集成要点 + +- 基地址读取:`frontend/src/services/api.ts:56-64`(localStorage `app.api.base` 优先,其次 `VITE_API_BASE_URL`) +- 提供的方法: + - `convertDoc` → `/api/convert`(`frontend/src/services/api.ts:96`) + - `uploadArchive` → `/api/upload-archive`(`frontend/src/services/api.ts:104`) + - `stageArchive` → `/api/archive/stage`(`frontend/src/services/api.ts:185`) + - `processArchive` → `/api/archive/process`(`frontend/src/services/api.ts:193`) + - `convertMd` → `/md/convert`(`frontend/src/services/api.ts:157`) + - `convertFolder` → `/md/convert-folder`(`frontend/src/services/api.ts:164`) + - MinIO 
配置:`setMinioConfig`(`frontend/src/services/api.ts:112`)、`testMinioConfig`(`frontend/src/services/api.ts:128`)、`createBucket`(`frontend/src/services/api.ts:145`) +- 私有桶注意:直链可能 403,前端应优先使用 `minio_presigned_url`(UI 已支持)。 + +## 测试说明(覆盖所有能力) + +### 1. 健康检查 + +- 请求:`GET /health` +- 断言:返回 `{"status":"ok"}`。 + +### 2. DOCX/PDF → Markdown/HTML/JSON + +- 用例 A:本地 PDF → Markdown(不保存) + - `POST /api/convert`,`file=@/path/to/file.pdf`,`export=markdown` + - 断言:`code=0`,`data.content` 包含 Markdown 文本、`data.media_type` 为 `text/markdown; charset=utf-8`。 + +- 用例 B:远程 PDF → HTML(保存) + - `POST /api/convert`,`source_url=http(s)://...pdf`,`export=html`,`save=true`,`filename=example` + - 断言:返回 `minio_url` 与 `minio_presigned_url` 可访问;中文路径正确编码。 + +### 3. Markdown → DOCX/PDF + +- 用例 C:文本 → PDF(高级参数,`save=false`) + - 字段:`markdown_text`、`target=pdf`、`toc=true`、`header_text`、`footer_text`、封面/Logo 文件与封面文字 + - 断言:返回 PDF 二进制可打开;封面与 Logo 可见。日志中的 `word-break: break-word` 警告不影响生成。 + +- 用例 D:文件 → DOCX(`save=true`) + - 字段:`md_file`、`target=docx`、`save=true` + - 断言:`minio_presigned_url` 可下载;中文文件名编码正确。 + +- 用例 E:URL → PDF + - 字段:`markdown_url=http(s)://...md`、`target=pdf` + - 断言:生成成功;封面与 Logo 正常加载(若私有桶则走签名链接)。 + +### 4. 批量处理 + +- 用例 F:本地文件夹批量重写并上传 + - `POST /md/convert-folder`,`folder_path='/Users/fanyang/Desktop/Others/CMS/达梦数据-各类示范文档/数+产品手册-MD源文件/DMDRS_Build_Manual_DM8'`、`prefix='assets'` + - 断言:`count>0`;各文件 `asset_ok/asset_fail` 合理;`minio_presigned_url` 可下载。 + +- 用例 G:上传压缩包批量处理 + - `POST /api/upload-archive`,`file=@/path/to/archive.zip`、`prefix='assets'` + - 断言:`data.count` 正确;各文件链接可用。 + +### 5. MinIO 配置与策略 + +- 用例 H:设置配置 + - `POST /config/minio`(真实参数) + - 断言:返回 `ok:true`。 + +- 用例 I:连通测试并应用策略 + - `POST /config/minio/test`,`public_read=true|false`,`create_if_missing=true` + - 断言:返回连通状态;私有桶下使用 `minio_presigned_url` 可访问。 + +### 6. 
资源映射与代理(可选) + +- 用例 J:`GET/POST /config/linkmap` 设置静态映射;`POST /proxy/download` 验证代理下载功能。 + +## 兼容性与注意事项 + +- 路径编码:所有返回的对象路径已进行编码,适配中文、空格、括号等字符。 +- 私有桶:直链可能 403;前端测试请使用 `minio_presigned_url`。 +- 样式警告:WeasyPrint 不支持 `word-break: break-word`,建议 `overflow-wrap: break-word` 或 `word-break: break-all`。 + - 安全解压:ZIP/TAR 采用路径穿越防护,解压目标限定在工作目录内;同时自动修复常见文件名乱码编码 + - HTML 资产重写:批量处理对 HTML 的资源链接进行两阶段重写并上传至 MinIO;内嵌 Base64 图片自动上传并替换为可访问链接 + - 控制台端口限制:`/config/minio` 与 `/config/minio/test` 会拒绝 `:9001` 或带 `/browser`、`/minio` 的控制台地址;请使用 `9000` API 端口 + +## 本地运行(后端与前端) + +- 后端(FastAPI,端口 `8000`): + - `cd /Users/fanyang/Desktop/FunMD_Convert/docling` + - `PYTHONPATH=. python -m uvicorn app.server:app --host 127.0.0.1 --port 8000` +- 前端(Vite): + - `cd /Users/fanyang/Desktop/FunMD_Convert/frontend` + - `npm install` + - `VITE_API_BASE_URL=http://127.0.0.1:8000 npm run dev` +- 访问: + - 后端 UI 首页:`http://127.0.0.1:8000/ui/` + - 前端开发页面:Vite 控制台输出的本地地址(通常为 `http://127.0.0.1:5173/`) + +### 前端配置 API Base(避免代理空响应) + +- 打开前端页面右上角“数据库配置”弹窗,在“接口地址”一栏填写:`http://127.0.0.1:8000`,点击“保存配置”。 +- 保存后,前端会直连后端 `8000` 端口,不再通过 Vite 代理,避免长耗时请求在 `5173` 上出现 `ERR_EMPTY_RESPONSE`。 diff --git a/docker本地构建与打包.md b/docker本地构建与打包.md new file mode 100644 index 0000000..0645e0e --- /dev/null +++ b/docker本地构建与打包.md @@ -0,0 +1,141 @@ +toolName: search_by_regex + +status: success + +query: StaticFiles + + + + +toolName: view_files + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/docling/app/server.py + + + + +toolName: view_folder + +status: success + +/Users/fanyang/Desktop/FunMD_Convert + + + + +toolName: view_files + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/Dockerfile + + + + +toolName: view_folder + +status: success + +/Users/fanyang/Desktop/FunMD_Convert + + + + + + + +toolName: edit_file_update + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/Dockerfile + +changes: + + + + +toolName: ApplyPatch + +status: failed + + + + + +toolName: write_to_file + 
+status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/package_offline.sh + +content: undefined + + + + +toolName: todo_write + +status: success + +Todos updated: 2 items + +下面是按你给出的参考流程,结合当前项目的启动方式(后端 FastAPI 同时托管前端 `/ui`),进行“打包 → 传输 → 加载 → 启动”的完整方案。 + +**打包镜像** +- 在项目根目录执行: + - 构建镜像:`docker build -t funmd-convert:latest .` + - 保存镜像:`docker save funmd-convert:latest -o FunMD_Convert.tar` + - 查看大小:`ls -lh FunMD_Convert.tar` + - 计算校验:`shasum -a 256 FunMD_Convert.tar | awk '{print $1}'` +- 也可以直接用脚本: + - `chmod +x package_offline.sh && ./package_offline.sh` + - 该脚本会自动构建、生成 `FunMD_Convert.tar`、打印大小与 `sha256`,并给出服务器端加载和运行提示 + +**离线服务器加载** +- 复制镜像包到服务器:`scp FunMD_Convert.tar user@server:/opt/FunMD_Convert/` +- 加载镜像:`docker load -i /opt/FunMD_Convert/FunMD_Convert.tar` +- 验证镜像:`docker images | grep funmd-convert` + +**启动容器** +- 启动后端(同时托管前端 `/ui`): + `docker run -d -p 8000:8000 --name FunMD_Convert --restart unless-stopped funmd-convert:latest` +- 访问地址: + - 后端健康检查:`http://<服务器IP>:8000/health` + - 前端页面:`http://<服务器IP>:8000/ui/` + - API 基地址:`http://<服务器IP>:8000/api/…` + +**MinIO 配置(首次部署或变更时)** +- 通过后端接口配置 MinIO(确保 `public` 指向内网可达的 9000 端口): + - `curl -X POST -F endpoint=10.9.35.31:9000 -F public=http://10.9.35.31:9000 -F access= -F secret= -F bucket=file-cms -F secure=false -F public_read=true http://<服务器IP>:8000/config/minio` +- 验证连接:`curl -X POST -F endpoint=10.9.35.31:9000 -F bucket=file-cms -F access= -F secret= -F secure=false http://<服务器IP>:8000/config/minio/test` + +**说明与对齐** +- 镜像内已包含: + - 后端 FastAPI 服务(`uvicorn` 启动,端口 `8000`) + - 前端已构建的静态资源,后端自动挂载为 `/ui` 与 `/assets` + - Docling 模型离线预下载(构建阶段执行),容器内可直接使用 +- 前端请求基座: + - 当用后端托管前端时,前端自动回退到 `http://<当前主机>:8000`,无需额外配置 +- 中文路径与 URL 显示: + - 后端已修复 Zip/Tar 解压中文“乱码”,同时返回 `minio_url`(安全编码)和 `minio_url_display`(中文可读),前端展示文本用后者,资源实际加载用前者 + +**推荐的打包校验清单** +- 构建完成后: + - `ls -lh FunMD_Convert.tar` 确认大小合理(包含模型与前端) + - `shasum -a 256 FunMD_Convert.tar | awk '{print $1}'` 记录校验值 +- 服务器端: + - `docker load -i 
...` 后 `docker images | grep funmd-convert` 显示镜像 + - `docker ps` 容器运行后访问 `http://<服务器IP>:8000/health` 为 `{"status":"ok"}` + +如果你希望镜像命名和示例完全对齐项目名(如 `FunMD_Convert`),只需在构建与运行命令中替换为: +- 构建:`docker build -t funmd-convert:latest .` +- 保存:`docker save funmd-convert:latest -o FunMD_Convert.tar` +- 加载:`docker load -i /opt/FunMD_Convert/FunMD_Convert.tar` +- 启动:`docker run -d -p 8000:8000 --name FunMD_Convert --restart unless-stopped funmd-convert:latest` diff --git a/docling/README.zh-CN.md b/docling/README.zh-CN.md new file mode 100644 index 0000000..12f2c01 --- /dev/null +++ b/docling/README.zh-CN.md @@ -0,0 +1,154 @@ +# 本地安装与启动指南(Docling + FastAPI 服务) + +本文档介绍如何在本机安装与启动本仓库的转换服务,以供前端调用生成并下载 PDF。 + +## 环境要求 +- 操作系统:macOS(已验证),Linux/Windows 亦可 +- Python:3.9–3.13 +- 建议安装工具:`python -m venv` 或 [uv](https://docs.astral.sh/uv/) + +## 创建虚拟环境 +- 使用 venv: + ```bash + cd /Users/fanyang/Desktop/docling + python -m venv .venv + source .venv/bin/activate + python -m pip install -U pip + ``` +- 或使用 uv: + ```bash + cd /Users/fanyang/Desktop/docling + uv venv + source .venv/bin/activate + ``` + +## 安装依赖 +- 安装本地 Docling 库(可编辑模式): + ```bash + python -m pip install -e ./docling + ``` +- 安装后端服务依赖: + ```bash + python -m pip install fastapi uvicorn minio weasyprint pytest + ``` + - 若 WeasyPrint 在 macOS 上提示缺少系统库,可使用 Homebrew 安装: + ```bash + brew install cairo pango gdk-pixbuf libffi + ``` + +## 启动服务 +- 在项目根目录执行: + ```bash + PYTHONPATH=. 
python -m uvicorn app.server:app --host 127.0.0.1 --port 8000 + ``` +- 访问: + - 首页 UI:`http://127.0.0.1:8000/` + - 健康检查:`http://127.0.0.1:8000/health`(返回 `{"status":"ok"}`) + +### 接口总览 +- `GET /` 本地 UI(静态文件) +- `GET /health` 服务健康检查 +- `POST /md/convert` Markdown/HTML → `docx|pdf`(核心接口,返回 MinIO 下载链接) +- `POST /md/convert-folder` 批量转换本地文件夹内的 `.md` 文件并上传结果到 MinIO +- `POST /md/upload-folder` 批量上传前端打包的文件夹内容并转换其中 `.md` 文件 +- MinIO 配置相关: + - `POST /config/minio` 设置连接信息与前缀 + - `POST /config/minio/test` 验证连接 + - `GET /config/minio/buckets` 列出桶 + - `POST /config/minio/create-bucket` 创建桶 + +## MinIO 配置 +- 环境变量方式(推荐): + ```bash + export MINIO_ENDPOINT=127.0.0.1:9000 + export MINIO_ACCESS_KEY=minioadmin + export MINIO_SECRET_KEY=minioadmin + export MINIO_BUCKET=docling-target + export MINIO_SECURE=false + export MINIO_PUBLIC_ENDPOINT=http://127.0.0.1:9000 + export MINIO_PREFIX=cms-files + ``` +- 运行时接口方式: + - `POST /config/minio` 设置连接信息与前缀 + - `POST /config/minio/test` 测试连通性 + - `GET /config/minio/buckets` 列出桶 + - `POST /config/minio/create-bucket` 创建桶 + +## 前端下载 PDF(接口说明) +- 核心接口:`POST /md/convert` +- 作用:将 Markdown/HTML 转换为 PDF 并上传至 MinIO,返回可下载链接 +- 参数(FormData,以下三选一提供文档来源): + - `md_file`:上传 Markdown 文件 + - `markdown_text`:直接传入 Markdown 文本 + - `markdown_url`:文档 URL(推荐) +- 目标格式:`target=pdf` +- 可选参数:`toc`、`header_text`、`footer_text`、`logo_url|logo_file`、`cover_url|cover_file`、`product_name`、`document_name`、`product_version`、`document_version`、`css_name|css_text` +- 返回 JSON 字段:`minio_presigned_url`(时效下载链接)或 `minio_url`(公开链接)、`name`、`media_type` + +### 前端调用示例(TypeScript) +```ts +async function downloadPdf(markdownUrl: string) { + const fd = new FormData(); + fd.append('markdown_url', markdownUrl); + fd.append('target', 'pdf'); + fd.append('toc', 'true'); + // 可选品牌参数: + // fd.append('header_text', '产品名|文档标题'); + // fd.append('footer_text', '© 公司'); + + const resp = await fetch('http://127.0.0.1:8000/md/convert', { method: 'POST', body: fd }); + if (!resp.ok) throw new Error('转换失败'); 
+ const data = await resp.json(); + const url = data.minio_presigned_url || data.minio_url; + if (!url) throw new Error('未返回可下载链接,请检查 MinIO 配置'); + window.location.href = url; // 触发下载 +} +``` + +### cURL 示例(URL → PDF) +```bash +curl -s -X POST \ + -F 'markdown_url=http://127.0.0.1:9000/docs/assets/rewritten/DMDRS_Build_Manual_Oracle/DMDRS搭建手册-Oracle.md' \ + -F 'target=pdf' \ + -F 'toc=true' \ + -F 'header_text=产品名|文档标题' \ + -F 'footer_text=© 2025 公司' \ + http://127.0.0.1:8000/md/convert +``` + +返回示例: +```json +{ + "minio_url": "http://127.0.0.1:9000/docling-target/cms-files/converted/DMDRS搭建手册-Oracle.pdf", + "minio_presigned_url": "http://127.0.0.1:9000/...presigned...", + "name": "DMDRS搭建手册-Oracle.pdf", + "media_type": "application/pdf" +} +``` + +### 批量转换(文件夹) +- 将本地文件夹内的 `.md` 全量转换并上传结果: +```bash +curl -s -X POST -F 'folder_path=/Users/you/docs' http://127.0.0.1:8000/md/convert-folder +``` + +### 直接转 DOCX(按需) +```bash +curl -s -X POST \ + -F 'markdown_url=http://127.0.0.1:9000/docs/assets/rewritten/DMDRS_Build_Manual_Oracle/DMDRS搭建手册-Oracle.md' \ + -F 'target=docx' \ + http://127.0.0.1:8000/md/convert +``` + +## 常见问题 +- `ModuleNotFoundError: No module named 'app' / 'docling'` + - 请在启动命令前设置 `PYTHONPATH=.` 或在当前 shell 直接以 `PYTHONPATH=. 
python -m uvicorn ...` 方式启动。 +- 未返回下载 URL + - 请检查 MinIO 环境变量或使用 `/config/minio` 进行配置;确保桶存在且服务端启用了 `store_final=true`。 +- 图片或样式异常 + - 确认资源已被重写为公共 URL(服务会自动上传并改写),并检查 `css_name`/`css_text`(PDF 默认样式为 `default`,位于 `app/configs/styles/default.css`)。 +- WeasyPrint 依赖缺失(macOS) + - 执行 `brew install cairo pango gdk-pixbuf libffi` 后重试;如仍失败,请检查 `PATH`/`DYLD_LIBRARY_PATH`。 + +## 相关文档 +- 服务端接口中文说明:`docling/README.zh-CN.md` diff --git a/docling/app/__init__.py b/docling/app/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/docling/app/__init__.py @@ -0,0 +1 @@ + diff --git a/docling/app/configs/dm.json b/docling/app/configs/dm.json new file mode 100644 index 0000000..bf20721 --- /dev/null +++ b/docling/app/configs/dm.json @@ -0,0 +1,17 @@ +{ + "minio": { + "endpoint": "127.0.0.1:9000", + "public": "http://127.0.0.1:9000", + "access": "minioadmin", + "secret": "minioadmin123", + "bucket": "doctest", + "secure": "false", + "prefix": "assets", + "store_final": "true", + "public_read": "true" + }, + "db": { + "webhook_url": null, + "token": null + } +} \ No newline at end of file diff --git a/docling/app/configs/linkmap/linkmap.json b/docling/app/configs/linkmap/linkmap.json new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/docling/app/configs/linkmap/linkmap.json @@ -0,0 +1 @@ +{} diff --git a/docling/app/configs/profiles/active.json b/docling/app/configs/profiles/active.json new file mode 100644 index 0000000..a05fda9 --- /dev/null +++ b/docling/app/configs/profiles/active.json @@ -0,0 +1,17 @@ +{ + "minio": { + "endpoint": "127.0.0.1:9000", + "public": "http://127.0.0.1:9000", + "access": "minioadmin", + "secret": "minioadmin123", + "bucket": "doctest", + "secure": "false", + "prefix": "assets", + "store_final": "true", + "public_read": "true" + }, + "db": { + "webhook_url": null, + "token": null + } +} diff --git a/docling/app/configs/profiles/default2.json b/docling/app/configs/profiles/default2.json new file mode 100644 index 
0000000..99bffeb --- /dev/null +++ b/docling/app/configs/profiles/default2.json @@ -0,0 +1,17 @@ +{ + "minio": { + "endpoint": "127.0.0.1:9001", + "public": "127.0.0.1:9001", + "access": "minioadmin", + "secret": "minioadmin123", + "bucket": "doctest", + "secure": "true", + "prefix": "assets", + "store_final": "true", + "public_read": "true" + }, + "db": { + "webhook_url": null, + "token": null + } +} diff --git a/docling/app/configs/profiles/local2.json b/docling/app/configs/profiles/local2.json new file mode 100644 index 0000000..8e3f82a --- /dev/null +++ b/docling/app/configs/profiles/local2.json @@ -0,0 +1,17 @@ +{ + "minio": { + "endpoint": "127.0.0.1:9000", + "public": "127.0.0.1:9000", + "access": "minioadmin", + "secret": "minioadmin123", + "bucket": "doctest", + "secure": "false", + "prefix": "assets", + "store_final": "true", + "public_read": "true" + }, + "db": { + "webhook_url": null, + "token": null + } +} diff --git a/docling/app/configs/profiles/localhost.json b/docling/app/configs/profiles/localhost.json new file mode 100644 index 0000000..b94489e --- /dev/null +++ b/docling/app/configs/profiles/localhost.json @@ -0,0 +1,17 @@ +{ + "minio": { + "endpoint": "127.0.0.1:9000", + "public": "127.0.0.1:9000", + "access": "minioadmin", + "secret": "minioadmin123", + "bucket": "doctest", + "secure": "true", + "prefix": "assets", + "store_final": "true", + "public_read": "true" + }, + "db": { + "webhook_url": null, + "token": null + } +} diff --git a/docling/app/configs/profiles/localhost3.json b/docling/app/configs/profiles/localhost3.json new file mode 100644 index 0000000..a05fda9 --- /dev/null +++ b/docling/app/configs/profiles/localhost3.json @@ -0,0 +1,17 @@ +{ + "minio": { + "endpoint": "127.0.0.1:9000", + "public": "http://127.0.0.1:9000", + "access": "minioadmin", + "secret": "minioadmin123", + "bucket": "doctest", + "secure": "false", + "prefix": "assets", + "store_final": "true", + "public_read": "true" + }, + "db": { + "webhook_url": null, + 
"token": null + } +} diff --git a/docling/app/configs/profiles/test.json b/docling/app/configs/profiles/test.json new file mode 100644 index 0000000..7f99880 --- /dev/null +++ b/docling/app/configs/profiles/test.json @@ -0,0 +1,17 @@ +{ + "minio": { + "endpoint": "8.163.40.177:9000", + "public": "http://8.163.40.177:9000", + "access": "minioadmin", + "secret": "minioadmin", + "bucket": "cms-files", + "secure": "false", + "prefix": "assets", + "store_final": "true", + "public_read": "true" + }, + "db": { + "webhook_url": null, + "token": null + } +} diff --git a/docling/app/configs/styles/default.css b/docling/app/configs/styles/default.css new file mode 100644 index 0000000..1599280 --- /dev/null +++ b/docling/app/configs/styles/default.css @@ -0,0 +1,88 @@ +@page { + size: A4; + margin: 20mm 15mm 20mm 15mm; + @top-left { content: none; } + @top-center { content: element(header); } + @bottom-left { content: element(copyright); } + @bottom-center { content: element(footer); } + @bottom-right { content: counter(page); font-size: 10pt; color: #444; } +} + +html { font-family: "Noto Sans CJK SC", "Noto Sans", "Source Han Sans SC", "DejaVu Sans", sans-serif; font-size: 12pt; line-height: 1.6; } + +body { color: #111; } + +h1 { font-size: 20pt; margin: 0 0 8pt; page-break-before: always; } +h2 { font-size: 16pt; margin: 16pt 0 8pt; } +h3 { font-size: 14pt; margin: 12pt 0 6pt; } + +h1, h2, h3 { page-break-after: avoid; break-after: avoid-page; } + +p { margin: 0 0 8pt; } + +pre, code { font-family: "DejaVu Sans Mono", "Noto Sans Mono", monospace; font-size: 10pt; } + +table { width: 100%; border-collapse: collapse; margin: 8pt 0; table-layout: fixed; } +th, td { border: 1px solid #ddd; padding: 6pt 8pt; } +thead { display: table-header-group; } +tfoot { display: table-footer-group; } +table, thead, tbody, tr, th, td { page-break-inside: avoid; break-inside: avoid-page; } + +th, td { white-space: normal; overflow-wrap: anywhere; word-break: break-word; hyphens: auto; } 
+.table-block { page-break-inside: avoid; break-inside: avoid-page; } + +pre { background: #f6f8fa; border: 1px solid #e5e7eb; border-radius: 6pt; padding: 8pt 10pt; white-space: pre-wrap; overflow-wrap: anywhere; word-break: break-word; } +code { background: #f6f8fa; border-radius: 4pt; padding: 0 3pt; } + +a { color: #0366d6; text-decoration: underline; } +a:hover { text-decoration: underline; } + +.break-before { page-break-before: always; } +.break-after { page-break-after: always; } +.doc-meta { height: 0; overflow: hidden; } +.doc-header-text { position: running(header); } +.doc-footer-text { position: running(footer); } +.doc-copyright { position: running(copyright); } +img#brand-logo { display: none; } + +.toc { page-break-after: always; } +.toc h1 { font-size: 18pt; margin: 0 0 8pt; } +.toc ul { list-style: none; padding: 0; } +.toc li { margin: 4pt 0; display: grid; grid-template-columns: auto 1fr 30pt; column-gap: 8pt; align-items: baseline; } +.toc li.toc-h1 .toc-text { font-weight: 600; } +.toc li.toc-h2 .toc-text { margin-left: 8pt; } +.toc li.toc-h3 .toc-text { margin-left: 16pt; } +.toc .toc-dots { border-bottom: 1px dotted currentColor; height: 0.9em; transform: translateY(-0.1em); } +.toc .toc-page { text-align: right; } +.toc .toc-page::before { content: target-counter(attr(data-target), page); } +@page { @bottom-right { content: counter(page); font-size: 10pt; color: #444; } } +.doc-header-text { position: running(header); display: flex; justify-content: space-between; align-items: center; font-size: 11pt; color: #444; border-bottom: 1px solid #e5e7eb; padding-bottom: 6pt; min-height: 26pt; } +.doc-header-left { font-weight: 500; } +.doc-header-right { font-size: 10pt; color: #666; } +.doc-header-text img.logo-inline { height: 26pt; margin-right: 8pt; } +.doc-header-text img.logo-inline { height: 26pt; margin-right: 8pt; } +.doc-footer-text { position: running(footer); display: block; text-align: center; font-size: 10pt; color: #444; border-top: 
1px solid #e5e7eb; padding-top: 6pt; } +.toc a { color: #0366d6; text-decoration: underline; } +.toc li { grid-template-columns: auto 1fr 48pt; } +.toc li.toc-h2 .toc-text { margin-left: 12pt; } +.toc li.toc-h3 .toc-text { margin-left: 24pt; } +table { max-width: 100%; box-sizing: border-box; } +tr, th, td { page-break-inside: avoid; break-inside: avoid-page; } +img, svg, canvas { + display: block; + max-width: 100%; + height: auto; + box-sizing: border-box; + page-break-inside: avoid; + break-inside: avoid-page; +} +p > img { margin: 6pt auto; } +td img, th img { max-width: 100%; height: auto; } +@page cover { size: A4; margin: 0; } +.cover { page: cover; position: relative; width: 210mm; height: 297mm; overflow: hidden; page-break-after: always; } +.cover .cover-bg { position: absolute; left: 0; top: 0; width: 100%; height: 100%; object-fit: cover; } +.cover .cover-brand { position: absolute; top: 20mm; left: 20mm; font-size: 18pt; font-weight: 700; color: #1d4ed8; } +.cover .cover-footer { position: absolute; left: 0; right: 0; bottom: 0; background: #1d4ed8; color: #fff; padding: 12mm 20mm; } +.cover .cover-title { font-size: 24pt; font-weight: 700; margin: 0; } +.cover .cover-subtitle { font-size: 13pt; margin-top: 4pt; } +.cover .cover-meta { margin-top: 8pt; font-size: 11pt; display: flex; gap: 20mm; } diff --git a/docling/app/configs/test02.json b/docling/app/configs/test02.json new file mode 100644 index 0000000..bf20721 --- /dev/null +++ b/docling/app/configs/test02.json @@ -0,0 +1,17 @@ +{ + "minio": { + "endpoint": "127.0.0.1:9000", + "public": "http://127.0.0.1:9000", + "access": "minioadmin", + "secret": "minioadmin123", + "bucket": "doctest", + "secure": "false", + "prefix": "assets", + "store_final": "true", + "public_read": "true" + }, + "db": { + "webhook_url": null, + "token": null + } +} \ No newline at end of file diff --git a/docling/app/server.py b/docling/app/server.py new file mode 100644 index 0000000..8ab2c08 --- /dev/null +++ 
b/docling/app/server.py @@ -0,0 +1,2993 @@ +from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request, Query +from fastapi.responses import Response, HTMLResponse, JSONResponse, FileResponse +from fastapi.staticfiles import StaticFiles +from fastapi.middleware.cors import CORSMiddleware +from pathlib import Path +import tempfile +import os +import asyncio +from typing import Optional, List, Dict, Tuple +from datetime import timedelta +import mimetypes +from urllib.request import urlopen, Request +from urllib.error import HTTPError, URLError +from urllib.parse import urlsplit, urlunsplit, quote, unquote +import logging +import traceback +import time +import re +import io +import shutil +import uuid +import subprocess +import sys +import json +try: + from minio import Minio # type: ignore + import urllib3 # type: ignore +except Exception: + Minio = None + urllib3 = None # type: ignore +from pydantic import BaseModel + +class ConvertResponse(BaseModel): + minio_url: Optional[str] + minio_presigned_url: Optional[str] + name: str + media_type: str + +class MinioPresignResponse(BaseModel): + bucket: str + object: str + minio_url: Optional[str] + minio_presigned_url: Optional[str] + expires: int + +try: + import fitz # type: ignore +except Exception: + fitz = None # type: ignore +from app.services.docling_adapter import ( + convert_source, + md_to_docx_bytes, + md_to_pdf_bytes_with_renderer, + infer_basename, + sanitize_filename, + load_linkmap, + save_linkmap, +) +from app.services.unified_converter import FormatConverter +from app.services.minio_utils import minio_current, join_prefix, presigned_read + +""" +@api Server Application +@description FastAPI server providing document conversion endpoints and MinIO integration +""" + +app = FastAPI() +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["*"], + allow_headers=["*"], +) + +try: + _ui_dir = Path(__file__).resolve().parents[2] / "frontend" / "dist" + if _ui_dir.exists(): 
+ app.mount("/ui", StaticFiles(directory=str(_ui_dir), html=True), name="ui") + try: + assets_dir = _ui_dir / "assets" + if assets_dir.exists(): + app.mount("/assets", StaticFiles(directory=str(assets_dir)), name="assets") + except Exception: + pass + try: + svg_path = _ui_dir / "vite.svg" + if svg_path.exists(): + @app.get("/vite.svg") + def _vite_svg(): + return FileResponse(str(svg_path), media_type="image/svg+xml") + except Exception: + pass +except Exception: + pass + + + + +@app.get("/health") +def health(): + """ + @function health + @description Health check endpoint + @return {"status": "ok"} + """ + return {"status": "ok"} + +@app.post("/convert") +async def convert( + file: Optional[UploadFile] = File(None), + source_url: Optional[str] = Form(None), + export: str = Form("markdown"), + save: Optional[bool] = Form(False), + filename: Optional[str] = Form(None), +): + """ + @function convert + @description Convert various document formats to Markdown/HTML/JSON + @param file Uploaded file (optional) + @param source_url URL of the source document (optional) + @param export Target export format (default: markdown) + @param save Whether to save to MinIO (default: False) + @param filename Custom filename for the output + @return JSON response with conversion result or MinIO URL + """ + if (file is None and not source_url) or (file is not None and source_url): + raise HTTPException(status_code=400, detail="provide exactly one of file or source_url") + export = _normalize_export(export) + if source_url: + enc, content, artifacts_dir = await asyncio.to_thread(_converter_v2.convert, source_url, export=export) + base = sanitize_filename(filename) if filename else sanitize_filename(infer_basename(source_url, None)) + out_ext = _export_ext(export) + ct = _media_type(export) + if export.lower() == "markdown": + try: + client_rw, bucket_rw, public_rw, prefix_rw = minio_current(RUNTIME_CONFIG) + if client_rw is not None and bucket_rw and public_rw: + base_dir = 
Path(artifacts_dir) if artifacts_dir else Path(tempfile.mkdtemp(prefix="md_assets_")) + new_text, _ms = _rewrite_md_assets_to_minio( + content, + base_dir, + client_rw, + bucket_rw, + public_rw, + prefix_rw, + search_root=(Path(artifacts_dir) if artifacts_dir else None), + ) + content = new_text + try: + if artifacts_dir: + _bulk_upload_assets(Path(artifacts_dir), client_rw, bucket_rw, public_rw, prefix_rw) + except Exception: + pass + except Exception: + pass + client, bucket, public_base, prefix = minio_current(RUNTIME_CONFIG) + if client is None or not bucket or not public_base: + raise HTTPException(status_code=400, detail="MinIO is not configured for save") + rc_store_final = str(RUNTIME_CONFIG.get("minio", {}).get("store_final") or "true").lower() in {"1","true","yes","on"} + if not rc_store_final: + raise HTTPException(status_code=400, detail="Saving to MinIO is disabled by configuration") + out_name = f"{base}{out_ext}" + obj = join_prefix(prefix, f"converted/{out_name}") + raw = content.encode(enc or "utf-8") + bio = io.BytesIO(raw) + client.put_object(bucket_name=bucket, object_name=obj, data=bio, length=len(raw), content_type=ct) # type: ignore + try: + from urllib.parse import quote as _quote + minio_url = f"{public_base}/{bucket}/{_quote(obj, safe='/')}" + except Exception: + minio_url = f"{public_base}/{bucket}/{obj}" + exp = int(timedelta(hours=12).total_seconds()) + minio_presigned_url = presigned_read(client, bucket, obj, exp) + resp = JSONResponse({ + "minio_url": minio_url, + "minio_presigned_url": minio_presigned_url, + "name": out_name, + "export": export, + "media_type": ct + }) + try: + if artifacts_dir: + shutil.rmtree(artifacts_dir, ignore_errors=True) + except Exception: + pass + return resp + assert file is not None + suffix = "" + if file.filename and "." in file.filename: + suffix = "." 
+ file.filename.rsplit(".", 1)[-1] + with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp: + tmp.write(await file.read()) + tmp_path = tmp.name + try: + enc, content, artifacts_dir = await asyncio.to_thread(_converter_v2.convert, tmp_path, export=export) + base = sanitize_filename(filename) if filename else sanitize_filename(infer_basename(None, file.filename)) + out_ext = _export_ext(export) + ct = _media_type(export) + if export.lower() == "markdown": + try: + client_rw, bucket_rw, public_rw, prefix_rw = minio_current(RUNTIME_CONFIG) + if client_rw is not None and bucket_rw and public_rw: + base_dir = Path(artifacts_dir) if artifacts_dir else Path(tempfile.mkdtemp(prefix="md_assets_")) + new_text, _ms = _rewrite_md_assets_to_minio( + content, + base_dir, + client_rw, + bucket_rw, + public_rw, + prefix_rw, + search_root=(Path(artifacts_dir) if artifacts_dir else None), + ) + content = new_text + try: + if artifacts_dir: + _bulk_upload_assets(Path(artifacts_dir), client_rw, bucket_rw, public_rw, prefix_rw) + except Exception: + pass + except Exception: + pass + client, bucket, public_base, prefix = minio_current(RUNTIME_CONFIG) + if client is None or not bucket or not public_base: + raise HTTPException(status_code=400, detail="MinIO is not configured for save") + rc_store_final = str(RUNTIME_CONFIG.get("minio", {}).get("store_final") or "true").lower() in {"1","true","yes","on"} + if not rc_store_final: + raise HTTPException(status_code=400, detail="Saving to MinIO is disabled by configuration") + out_name = f"{base}{out_ext}" + obj = join_prefix(prefix, f"converted/{out_name}") + raw = content.encode(enc or "utf-8") + bio = io.BytesIO(raw) + client.put_object(bucket_name=bucket, object_name=obj, data=bio, length=len(raw), content_type=ct) # type: ignore + minio_url = f"{public_base}/{bucket}/{obj}" + exp = int(timedelta(hours=12).total_seconds()) + minio_presigned_url = presigned_read(client, bucket, obj, exp) + resp = JSONResponse({ + "minio_url": 
minio_url, + "minio_presigned_url": minio_presigned_url, + "name": out_name, + "export": export, + "media_type": ct + }) + try: + if artifacts_dir: + shutil.rmtree(artifacts_dir, ignore_errors=True) + except Exception: + pass + return resp + finally: + try: + os.remove(tmp_path) + except Exception: + pass + +profiles_dir = Path(__file__).parent / "configs" +profiles_dir.mkdir(parents=True, exist_ok=True) + +@app.get("/") +def index(): + return JSONResponse({"ok": True, "service": "docling-api", "version": "v2"}) + +@app.get("/@vite/client") +def vite_client_stub(): + return JSONResponse({"ok": True}) + +@app.get("/refresh.js") +def refresh_js_stub(): + return Response(content="window.initClient=function(){},window.addRefresh=function(){};", media_type="application/javascript") + + + +RUNTIME_CONFIG: Dict[str, Dict[str, Optional[str]]] = { + "minio": { + "endpoint": None, + "public": None, + "access": None, + "secret": None, + "bucket": None, + "secure": None, + "prefix": None, + "store_final": "true", + "public_read": "true", + }, + "db": { + "webhook_url": None, + "token": None, + }, +} + +def _normalize_export(export: str) -> str: + e = (export or "").strip().lower() + allowed = {"markdown", "html", "json", "doctags"} + if e not in allowed: + raise HTTPException(status_code=422, detail="unsupported export") + return e + +def _normalize_engine(engine: Optional[str]) -> Optional[str]: + if engine is None: + return None + e = (engine or "").strip().lower() + allowed = {"docling", "word2markdown", "pandoc", "custom"} + if e not in allowed: + raise HTTPException(status_code=422, detail="unsupported engine") + return e + +def _fix_garbled_name(name: str) -> str: + try: + s = name + t = s.strip() + # If pure ASCII, no fix needed + if all(ord(c) < 128 for c in t): + return name + # Try to reconstruct original bytes assuming CP437 (Zip default when UTF-8 flag not set) + try: + raw = s.encode("cp437", errors="strict") + except UnicodeEncodeError: + # Not CP437 mojibake, 
keep original + return name + encs = [ + "gb18030", + "gbk", + "cp936", + "utf-8", + "big5", + "cp950", + "shift_jis", + "cp932", + "cp949", + "euc-kr", + "euc-jp", + ] + for e in encs: + try: + fixed = raw.decode(e) + if fixed: + return fixed + except Exception: + continue + except Exception: + pass + return name + +def _safe_target(base: Path, name: str) -> Optional[Path]: + try: + n = name.replace("\\", "/").lstrip("/") + parts = [p for p in n.split("/") if p and p not in {".", ".."}] + tgt = base / "/".join(parts) + rp = tgt.resolve() + rb = base.resolve() + try: + rp.relative_to(rb) + except Exception: + return None + return rp + except Exception: + return None + +def _zip_extract_safely(zf: object, dest: Path) -> None: + try: + for zi in zf.infolist(): # type: ignore + try: + name = str(getattr(zi, "filename", "")) + flag = int(getattr(zi, "flag_bits", 0)) + use = name + if (flag & 0x800) == 0: + use = _fix_garbled_name(name) + target = _safe_target(dest, use) + if target is None: + continue + if hasattr(zi, "is_dir") and zi.is_dir(): # type: ignore + target.mkdir(parents=True, exist_ok=True) + continue + target.parent.mkdir(parents=True, exist_ok=True) + with zf.open(zi, "r") as src: # type: ignore + data = src.read() + with open(target, "wb") as out: + out.write(data) + except Exception: + continue + except Exception: + pass + +def _tar_extract_safely(tf: object, dest: Path) -> None: + try: + for m in tf.getmembers(): # type: ignore + try: + name = str(getattr(m, "name", "")) + use = _fix_garbled_name(name) + target = _safe_target(dest, use) + if target is None: + continue + if getattr(m, "isdir", lambda: False)(): + target.mkdir(parents=True, exist_ok=True) + continue + target.parent.mkdir(parents=True, exist_ok=True) + f = tf.extractfile(m) # type: ignore + if f is None: + continue + data = f.read() + with open(target, "wb") as out: + out.write(data) + except Exception: + continue + except Exception: + pass + +def _minio_head_bucket(client: object, 
bucket: str) -> bool: + try: + if hasattr(client, "bucket_exists"): + try: + return bool(client.bucket_exists(bucket)) # type: ignore + except Exception: + pass + try: + region = client._get_region(bucket) # type: ignore + except Exception: + region = "us-east-1" + client._url_open(method="HEAD", region=region, bucket_name=bucket) # type: ignore + return True + except Exception: + try: + names = [getattr(b, "name", None) for b in client.list_buckets()] # type: ignore + return bucket in set(n for n in names if n) + except Exception: + return False + +def _minio_create_bucket(client: object, bucket: str) -> bool: + # Prefer SDK methods, fallback to low-level call + try: + if hasattr(client, "bucket_exists"): + try: + if client.bucket_exists(bucket): # type: ignore + return True + except Exception: + pass + if hasattr(client, "make_bucket"): + try: + client.make_bucket(bucket) # type: ignore + return True + except Exception: + try: + region = client._get_region(bucket) # type: ignore + except Exception: + region = "us-east-1" + try: + client.make_bucket(bucket, location=region) # type: ignore + return True + except Exception: + pass + try: + try: + region = client._get_region(bucket) # type: ignore + except Exception: + region = "us-east-1" + client._url_open(method="PUT", region=region, bucket_name=bucket) # type: ignore + return True + except Exception as ce: + if "BucketAlreadyOwnedByYou" in str(ce) or "BucketAlreadyExists" in str(ce): + return True + raise + except Exception as e: + raise e +def _minio_client(endpoint: str, access: str, secret: str, secure: bool): + if urllib3 is not None: + try: + http = urllib3.PoolManager(timeout=urllib3.Timeout(connect=3.0, read=20.0)) + return Minio(endpoint=endpoint, access_key=access, secret_key=secret, secure=secure, http_client=http) # type: ignore + except Exception: + return Minio(endpoint=endpoint, access_key=access, secret_key=secret, secure=secure) # type: ignore + return Minio(endpoint=endpoint, access_key=access, 
secret_key=secret, secure=secure) # type: ignore +def _minio_time_hint(endpoint: str, secure: bool) -> Optional[str]: + try: + scheme = "https" if secure else "http" + r = urlopen(f"{scheme}://{endpoint}", timeout=3) + srv_date = r.headers.get("Date") + if not srv_date: + return None + from email.utils import parsedate_to_datetime + from datetime import datetime, timezone + dt = parsedate_to_datetime(srv_date) + now = datetime.now(timezone.utc) + diff = abs((now - dt).total_seconds()) + return f"服务器时间与本机相差约 {int(diff)} 秒" + except Exception: + return None + + +def _db_notify(payload: Dict[str, object]): + try: + import requests # type: ignore + except Exception: + return + url = (RUNTIME_CONFIG.get("db", {}).get("webhook_url") or "").strip() + if not url: + return + token = (RUNTIME_CONFIG.get("db", {}).get("token") or "") + headers = {"Content-Type": "application/json"} + if token: + headers["Authorization"] = f"Bearer {token}" + try: + requests.post(url, json=payload, headers=headers, timeout=5) + except Exception: + pass + +@app.post("/config/minio") +async def set_minio_config( + endpoint: str = Form(...), + public: Optional[str] = Form(None), + access: str = Form(...), + secret: str = Form(...), + bucket: str = Form(...), + secure: Optional[str] = Form("false"), + prefix: Optional[str] = Form(None), + store_final: Optional[str] = Form("true"), + public_read: Optional[str] = Form("true"), +): + ep_raw = (endpoint or "").strip() + ep_host = ep_raw + try: + from urllib.parse import urlsplit + u = urlsplit(ep_raw) + if u.scheme: + ep_host = (u.netloc or ep_raw).split("/")[0] + else: + ep_host = ep_raw.split("/")[0] + except Exception: + ep_host = ep_raw.split("/")[0] + # reject console port or console paths for endpoint + try: + if (":9001" in ep_host) or ("/browser" in ep_raw) or ("/minio" in ep_raw): + return {"ok": False, "error": "请使用 MinIO API 端口 9000(而非 9001 控制台)"} + except Exception: + pass + pub_val = public + try: + from urllib.parse import urlsplit + pu 
= urlsplit((public or "").strip()) + if (pu.netloc.endswith(":9001") or "/browser" in (public or "") or "/minio" in (public or "")): + pub_val = None + except Exception: + if public and (":9001" in public or "/browser" in public or "/minio" in public): + pub_val = None + # ensure public has scheme + try: + if pub_val: + from urllib.parse import urlsplit + pu = urlsplit(pub_val.strip()) + scheme = pu.scheme or ("https" if str(secure or "false").lower() in {"1","true","yes","on"} else "http") + host = pu.netloc or pu.path.split("/")[0] + pub_val = f"{scheme}://{host}" + except Exception: + try: + if pub_val: + host = pub_val.strip().split("/")[0] + scheme = "https" if str(secure or "false").lower() in {"1","true","yes","on"} else "http" + pub_val = f"{scheme}://{host}" + except Exception: + pass + RUNTIME_CONFIG["minio"].update({ + "endpoint": ep_host, + "public": pub_val, + "access": access, + "secret": secret, + "bucket": bucket, + "secure": secure, + "prefix": prefix, + "store_final": store_final, + "public_read": public_read, + }) + client, bkt, pub, _ = minio_current(RUNTIME_CONFIG) + if client is None or not bkt or not pub: + return {"ok": False, "error": "MinIO config invalid"} + try: + pr = str(public_read or "true").lower() in {"1","true","yes","on"} + if pr: + policy = { + "Version": "2012-10-17", + "Statement": [ + {"Effect": "Allow", "Principal": "*", "Action": ["s3:GetBucketLocation", "s3:ListBucket"], "Resource": [f"arn:aws:s3:::{bkt}"]}, + {"Effect": "Allow", "Principal": "*", "Action": ["s3:GetObject"], "Resource": [f"arn:aws:s3:::{bkt}/*"]}, + ], + } + import json as _json + client.set_bucket_policy(bucket_name=bkt, policy=_json.dumps(policy)) # type: ignore + else: + try: + client.delete_bucket_policy(bkt) # type: ignore + except Exception: + pass + except Exception: + pass + return {"ok": True} + +@app.post("/config/minio/test") +async def test_minio_config( + endpoint: str = Form(...), + public: Optional[str] = Form(None), + access: str = 
Form(...), + secret: str = Form(...), + bucket: str = Form(...), + secure: Optional[str] = Form("false"), + create_if_missing: Optional[str] = Form("true"), + public_read: Optional[str] = Form("false"), +): + if Minio is None: + return {"ok": False, "connected": False, "bucket_exists": False, "error": "minio client not available"} + try: + sec = str(secure or "false").lower() in {"1","true","yes","on"} + ep_raw = (endpoint or "").strip() + ep_host = ep_raw + try: + from urllib.parse import urlsplit + u = urlsplit(ep_raw) + if u.scheme: + ep_host = (u.netloc or ep_raw).split("/")[0] + else: + ep_host = ep_raw.split("/")[0] + except Exception: + ep_host = ep_raw.split("/")[0] + if ":9001" in ep_host or "/browser" in ep_raw or "/minio" in ep_raw: + return {"ok": False, "connected": False, "bucket_exists": False, "error": "请使用 MinIO API 端口 9000(而非 9001 控制台)"} + client = _minio_client(endpoint=ep_host, access=access, secret=secret, secure=sec) + # handshake fallback + try: + try: + client.list_buckets() # type: ignore + except Exception as e: + if sec and ("SSL" in str(e) or "HTTPSConnectionPool" in str(e) or "SSLError" in str(e)): + client = _minio_client(endpoint=ep_host, access=access, secret=secret, secure=False) + sec = False + except Exception: + pass + exists = False + created = False + exists = _minio_head_bucket(client, bucket) + if not exists and str(create_if_missing or "true").lower() in {"1","true","yes","on"}: + if _minio_create_bucket(client, bucket): + exists = True + created = True + # 始终根据 public_read 应用/移除策略(即使桶已存在) + try: + import json as _json + if str(public_read or "false").lower() in {"1","true","yes","on"}: + policy = { + "Version": "2012-10-17", + "Statement": [ + {"Effect": "Allow", "Principal": "*", "Action": ["s3:GetBucketLocation", "s3:ListBucket"], "Resource": [f"arn:aws:s3:::{bucket}"]}, + {"Effect": "Allow", "Principal": "*", "Action": ["s3:GetObject"], "Resource": [f"arn:aws:s3:::{bucket}/*"]}, + ], + } + 
client.set_bucket_policy(bucket_name=bucket, policy=_json.dumps(policy)) # type: ignore + else: + try: + client.delete_bucket_policy(bucket) # type: ignore + except Exception: + pass + except Exception: + pass + return {"ok": True, "connected": True, "bucket_exists": exists, "created": created, "hint": ("使用 HTTPS 访问 9000 端口可能失败,请确认启用 HTTPS 与证书配置匹配" if sec and (public or "").startswith("http://") else None)} + except Exception as e: + hint = None + if "RequestTimeTooSkewed" in str(e): + hint = _minio_time_hint(ep_host, sec) + return {"ok": False, "connected": False, "bucket_exists": False, "error": str(e), "hint": hint} + +@app.get("/config/profile/list") +async def list_profiles(): + names: List[str] = [] + try: + for p in profiles_dir.rglob("*.json"): + try: + names.append(p.stem) + except Exception: + continue + except Exception: + pass + return {"ok": True, "profiles": sorted(set(names))} + +@app.post("/config/profile/activate") +async def activate_profile(name: str = Form(...)): + target = None + try: + for p in profiles_dir.rglob("*.json"): + if p.stem.lower() == (name or "").strip().lower(): + target = p + break + if target is None: + raise HTTPException(status_code=404, detail="profile not found") + active_path = profiles_dir / "active.json" + data = json.loads(target.read_text("utf-8")) + # 应用并覆盖到运行时配置 + try: + minio_cfg = data.get("minio", {}) + if isinstance(minio_cfg, dict) and minio_cfg: + sanitized = dict(minio_cfg) + try: + ep = str(sanitized.get("endpoint") or "").strip() + if ep and ":9001" in ep: + h = ep.split("/")[0] + if ":" in h: + parts = h.split(":") + sanitized["endpoint"] = f"{parts[0]}:9000" + else: + sanitized["endpoint"] = h + except Exception: + pass + try: + pub = str(sanitized.get("public") or "").strip() + if pub and (":9001" in pub or "/browser" in pub or "/minio" in pub): + host = pub.split("/")[0] + sec = str(sanitized.get("secure") or RUNTIME_CONFIG.get("minio", {}).get("secure") or "false").lower() in {"1","true","yes","on"} + 
scheme = "https" if sec else "http" + if ":" in host: + base_host = host.split(":")[0] + sanitized["public"] = f"{scheme}://{base_host}:9000" + else: + sanitized["public"] = f"{scheme}://{host}:9000" + except Exception: + pass + RUNTIME_CONFIG["minio"].update(sanitized) + except Exception: + pass + try: + db_cfg = data.get("db", {}) + if isinstance(db_cfg, dict) and db_cfg: + RUNTIME_CONFIG["db"].update(db_cfg) + except Exception: + pass + # 写入 active.json 以便后续观察者检测到变更 + active_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), "utf-8") + return {"ok": True, "active": target.stem} + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=400, detail=str(e)) + +@app.get("/system/time/check") +def system_time_check( + endpoint: Optional[str] = Query(None), + public: Optional[str] = Query(None), + secure: Optional[str] = Query(None), +): + try: + rc = RUNTIME_CONFIG.get("minio", {}) + ep_raw = (endpoint or rc.get("endpoint") or "").strip() + pub_raw = (public or rc.get("public") or "").strip() + sec_flag = secure if secure is not None else (rc.get("secure") or "false") + sec = str(sec_flag or "false").lower() in {"1","true","yes","on"} + scheme = "https" if sec else "http" + # 解析 host(优先 public,其次 endpoint) + def _host(s: str) -> str: + try: + from urllib.parse import urlsplit + u = urlsplit(s) + return (u.netloc or s).split("/")[0] if u.scheme else s.split("/")[0] + except Exception: + return s.split("/")[0] + base_host = _host(pub_raw or ep_raw) + if not base_host: + from datetime import datetime, timezone + now = datetime.now(timezone.utc) + return {"ok": True, "server_time": None, "local_time": now.isoformat(), "diff_sec": None, "hint": "未配置 MinIO 端点"} + # 构造候选检测 URL(尽量使用 MinIO 健康端点以获取标准 Date 头) + base = f"{scheme}://{base_host}" + candidates = [ + base, + base + "/minio/health/live", + base + "/minio/health/ready", + base + "/minio/health/version", + ] + srv_date = None + for url in candidates: + try: + req = 
Request(url, method="HEAD") + r = urlopen(req, timeout=3) + d = r.headers.get("Date") or r.headers.get("date") + if d: + srv_date = d + break + except Exception: + try: + r = urlopen(url, timeout=3) + d = r.headers.get("Date") or r.headers.get("date") + if d: + srv_date = d + break + except Exception: + pass + # 如果按当前 scheme 获取失败,尝试切换 scheme 再试一次 + if not srv_date: + alt_scheme = "http" if scheme == "https" else "https" + alt_base = f"{alt_scheme}://{base_host}" + alt_candidates = [ + alt_base, + alt_base + "/minio/health/live", + alt_base + "/minio/health/ready", + alt_base + "/minio/health/version", + ] + for url in alt_candidates: + try: + req = Request(url, method="HEAD") + r = urlopen(req, timeout=3) + d = r.headers.get("Date") or r.headers.get("date") + if d: + srv_date = d + break + except Exception: + try: + r = urlopen(url, timeout=3) + d = r.headers.get("Date") or r.headers.get("date") + if d: + srv_date = d + break + except Exception: + pass + from datetime import datetime, timezone + now = datetime.now(timezone.utc) + diff = None + if srv_date: + from email.utils import parsedate_to_datetime + try: + dt = parsedate_to_datetime(srv_date) + diff = int(abs((now - dt).total_seconds())) + except Exception: + diff = None + hint = _minio_time_hint(base_host, sec) + return {"ok": True, "server_time": srv_date, "local_time": now.isoformat(), "diff_sec": diff, "hint": hint} + except Exception as e: + return {"ok": False, "error": str(e)} + +@app.post("/system/time/sync") +async def system_time_sync(method: Optional[str] = Form("auto"), ntp_server: Optional[str] = Form(None)): + cmds = [] + servers = [s for s in [ntp_server, "time.apple.com", "pool.ntp.org"] if s] + for srv in servers: + if (method or "auto") in {"auto", "sntp"}: + cmds.append(["sntp", "-sS", srv]) + if (method or "auto") in {"auto", "ntpdate"}: + cmds.append(["ntpdate", "-u", srv]) + outputs = [] + success = False + for cmd in cmds: + try: + p = subprocess.run(cmd, capture_output=True, text=True, 
timeout=8) + outputs.append({"cmd": " ".join(cmd), "code": p.returncode, "out": p.stdout, "err": p.stderr}) + if p.returncode == 0: + success = True + break + except Exception as e: + outputs.append({"cmd": " ".join(cmd), "code": -1, "out": "", "err": str(e)}) + if not success and sys.platform == "darwin": + elev_cmds = [] + for srv in servers: + elev_cmds.append(["osascript", "-e", f'do shell script "sntp -sS {srv}" with administrator privileges']) + elev_cmds.append(["osascript", "-e", f'do shell script "ntpdate -u {srv}" with administrator privileges']) + elev_cmds.append(["osascript", "-e", f'do shell script "/usr/sbin/systemsetup -setnetworktimeserver {srv}" with administrator privileges']) + elev_cmds.append(["osascript", "-e", 'do shell script "/usr/sbin/systemsetup -setusingnetworktime on" with administrator privileges']) + for cmd in elev_cmds: + try: + p = subprocess.run(cmd, capture_output=True, text=True, timeout=12) + outputs.append({"cmd": " ".join(cmd), "code": p.returncode, "out": p.stdout, "err": p.stderr}) + if p.returncode == 0: + success = True + break + except Exception as e: + outputs.append({"cmd": " ".join(cmd), "code": -1, "out": "", "err": str(e)}) + chk = system_time_check() + return {"ok": success, "result": outputs, "check": chk} + +@app.get("/api/system/time/check") +def system_time_check_api( + endpoint: Optional[str] = Query(None), + public: Optional[str] = Query(None), + secure: Optional[str] = Query(None), +): + return system_time_check(endpoint=endpoint, public=public, secure=secure) + +@app.post("/api/system/time/sync") +async def system_time_sync_api(method: Optional[str] = Form("auto"), ntp_server: Optional[str] = Form(None)): + return await system_time_sync(method=method, ntp_server=ntp_server) + +async def _auto_time_calibration(): + try: + await asyncio.sleep(1.0) + chk = system_time_check() + try: + diff = int((chk or {}).get("diff_sec") or 0) + except Exception: + diff = 0 + if diff and diff > 120: + try: + await 
def _flag_on(value: Optional[str]) -> bool:
    """Parse a loose boolean form/query value ("1"/"true"/"yes"/"on" → True)."""
    return str(value or "false").lower() in {"1", "true", "yes", "on"}

@app.get("/config/minio/buckets")
def list_minio_buckets(
    endpoint: str,
    access: str,
    secret: str,
    secure: Optional[str] = "false",
):
    """List bucket names reachable with the given MinIO credentials.

    Never raises: failures are reported in the payload ({"ok": False, ...})
    so the frontend can surface them inline.
    """
    if Minio is None:
        return {"ok": False, "error": "minio client not available", "buckets": []}
    try:
        client = _minio_client(endpoint=endpoint, access=access, secret=secret, secure=_flag_on(secure))
        names = [b.name for b in client.list_buckets()]
        return {"ok": True, "buckets": names}
    except Exception as e:
        return {"ok": False, "error": str(e), "buckets": []}

@app.post("/config/minio/create-bucket")
async def create_minio_bucket(
    endpoint: str = Form(...),
    access: str = Form(...),
    secret: str = Form(...),
    bucket: str = Form(...),
    secure: Optional[str] = Form("false"),
    public_read: Optional[str] = Form("false"),
):
    """Create a bucket (idempotent) and optionally attach a public-read policy.

    Rejects endpoints that point at the MinIO console (port 9001) instead of
    the S3 API (port 9000), and retries once without TLS if an SSL handshake
    fails while `secure` was requested.
    """
    if Minio is None:
        return {"ok": False, "error": "minio client not available"}
    try:
        sec = _flag_on(secure)
        ep_raw = (endpoint or "").strip()
        ep_host = ep_raw
        # Strip a scheme / path down to host[:port].
        try:
            from urllib.parse import urlsplit
            u = urlsplit(ep_raw)
            if u.scheme:
                ep_host = (u.netloc or ep_raw).split("/")[0]
            else:
                ep_host = ep_raw.split("/")[0]
        except Exception:
            ep_host = ep_raw.split("/")[0]
        if ":9001" in ep_host or "/browser" in ep_raw or "/minio" in ep_raw:
            return {"ok": False, "error": "请使用 MinIO API 端口 9000(而非 9001 控制台)"}
        client = _minio_client(endpoint=ep_host, access=access, secret=secret, secure=sec)
        try:
            # Connectivity probe; on an SSL failure fall back to plain HTTP once.
            try:
                client.list_buckets()  # type: ignore
            except Exception as e:
                if sec and ("SSL" in str(e) or "HTTPSConnectionPool" in str(e) or "SSLError" in str(e)):
                    client = _minio_client(endpoint=ep_host, access=access, secret=secret, secure=False)
                    sec = False
        except Exception:
            pass
        _minio_create_bucket(client, bucket)
        # Best-effort: grant anonymous read when requested.
        try:
            if _flag_on(public_read):
                policy = {
                    "Version": "2012-10-17",
                    "Statement": [
                        {"Effect": "Allow", "Principal": {"AWS": ["*"]}, "Action": ["s3:GetBucketLocation", "s3:ListBucket"], "Resource": [f"arn:aws:s3:::{bucket}"]},
                        {"Effect": "Allow", "Principal": {"AWS": ["*"]}, "Action": ["s3:GetObject"], "Resource": [f"arn:aws:s3:::{bucket}/*"]},
                    ],
                }
                import json as _json
                client.set_bucket_policy(bucket, _json.dumps(policy))  # type: ignore
        except Exception:
            pass
        return {"ok": True, "bucket_exists": True}
    except Exception as e:
        hint = None
        if "RequestTimeTooSkewed" in str(e):
            # Server/client clocks diverge beyond the S3 tolerance.
            hint = _minio_time_hint(ep_host, sec)
        return {"ok": False, "error": str(e), "hint": hint}

@app.post("/minio/presign", response_model=MinioPresignResponse)
async def minio_presign(
    url: Optional[str] = Form(None),
    object_name: Optional[str] = Form(None),
    bucket: Optional[str] = Form(None),
    expires: Optional[int] = Form(3600),
):
    """Produce a presigned GET URL for an object given either its name or a
    public-style URL (``.../bucket/key``) from which bucket/key are parsed."""
    client, cfg_bucket, public_base, _ = minio_current(RUNTIME_CONFIG)
    if client is None:
        raise HTTPException(status_code=400, detail="MinIO 未配置")
    obj = (object_name or "").strip()
    bkt = (bucket or cfg_bucket or "").strip()
    if (not obj) and url:
        # Derive bucket/object from the URL path: first segment is the bucket.
        try:
            from urllib.parse import urlsplit, unquote
            u = urlsplit((url or "").strip())
            parts = [p for p in (u.path or "").split("/") if p]
            if parts:
                if not bkt:
                    bkt = parts[0]
                obj = unquote("/".join(parts[1:]))
        except Exception:
            pass  # keep whatever we had (original did a no-op `obj = obj`)
    if not bkt or not obj:
        raise HTTPException(status_code=400, detail="bucket 与 object_name/URL 不能为空")
    exp = int(expires or 3600)
    ps = presigned_read(client, bkt, obj, exp) if client is not None else None
    pub_url = None
    try:
        from urllib.parse import quote as _quote
        if public_base:
            pub_url = f"{public_base}/{bkt}/{_quote(obj, safe='/')}"
    except Exception:
        pub_url = None
    return MinioPresignResponse(
        bucket=bkt,
        object=obj,
        minio_url=pub_url,
        minio_presigned_url=ps,
        expires=exp,
    )

@app.get("/minio/object")
def minio_object(bucket: Optional[str] = None, object: str = ""):
    """Stream an object through the API (inline disposition), bypassing any
    bucket policy the browser could not satisfy directly."""
    client, cfg_bucket, public_base, _ = minio_current(RUNTIME_CONFIG)
    if client is None:
        raise HTTPException(status_code=400, detail="MinIO 未配置")
    bkt = (bucket or cfg_bucket or "").strip()
    obj_in = (object or "").strip()
    try:
        from urllib.parse import unquote as _unquote
        obj = _unquote(obj_in)
    except Exception:
        obj = obj_in
    if not bkt or not obj:
        raise HTTPException(status_code=400, detail="bucket 与 object 不能为空")
    # TypeError fallbacks cover minio client versions with positional-only APIs.
    ct = None
    try:
        try:
            st = client.stat_object(bucket_name=bkt, object_name=obj)  # type: ignore
        except TypeError:
            st = client.stat_object(bkt, obj)  # type: ignore
        ct = getattr(st, "content_type", None)
    except Exception:
        ct = None
    data = b""
    try:
        try:
            resp = client.get_object(bucket_name=bkt, object_name=obj)  # type: ignore
        except TypeError:
            resp = client.get_object(bkt, obj)  # type: ignore
        try:
            data = resp.read()  # type: ignore
        finally:
            try:
                resp.close()  # type: ignore
            except Exception:
                pass
    except Exception as e:
        raise HTTPException(status_code=403, detail=str(e))
    media = ct or detect_mime(obj, data)
    headers = {"Content-Disposition": f"inline; filename*=UTF-8''" + quote(Path(obj).name)}
    return Response(content=data, media_type=media, headers=headers)

@app.post("/config/db")
async def set_db_config(webhook_url: Optional[str] = Form(None), token: Optional[str] = Form(None)):
    """Set DB webhook settings. NOTE: omitted fields arrive as None and
    overwrite existing values — original behavior, preserved."""
    RUNTIME_CONFIG["db"].update({"webhook_url": webhook_url, "token": token})
    return {"ok": True}

@app.get("/config")
def get_config_snapshot():
    """Return the runtime config with credentials masked.

    Fix: the original masked the MinIO ``secret`` but returned the DB webhook
    ``token`` in clear text; the token is now masked the same way.
    """
    safe = {
        "minio": {
            k: ("***" if k == "secret" and v else v)
            for k, v in RUNTIME_CONFIG.get("minio", {}).items()
        },
        "db": {
            k: ("***" if k == "token" and v else v)
            for k, v in RUNTIME_CONFIG.get("db", {}).items()
        },
    }
    return safe

@app.get("/config/profiles")
def list_profiles():
    """List saved profile names (stems of *.json files in profiles_dir)."""
    names = []
    try:
        names = [p.stem for p in profiles_dir.glob("*.json")]
    except Exception:
        names = []
    return {"ok": True, "profiles": sorted(names)}
@app.post("/config/save_profile")
async def save_profile(name: str = Form(...)):
    """Persist the current MinIO/DB runtime config as a named profile JSON."""
    if not name.strip():
        raise HTTPException(status_code=400, detail="name required")
    data = {
        "minio": RUNTIME_CONFIG.get("minio", {}),
        "db": RUNTIME_CONFIG.get("db", {}),
    }
    import json as _json
    path = profiles_dir / f"{sanitize_filename(name)}.json"
    try:
        path.write_text(_json.dumps(data, ensure_ascii=False, indent=2), "utf-8")
        return {"ok": True, "name": path.stem}
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))

@app.get("/config/load_profile")
def load_profile(name: str):
    """Load a named profile into the runtime config.

    Fix: the original updated RUNTIME_CONFIG *before* validating, so a broken
    profile left the runtime half-updated while still returning HTTP 400.
    Validation now runs against a trial merge and the live config is only
    committed on success.
    """
    import json as _json
    path = profiles_dir / f"{sanitize_filename(name)}.json"
    if not path.exists():
        raise HTTPException(status_code=404, detail="profile not found")
    try:
        data = _json.loads(path.read_text("utf-8"))
        m = data.get("minio", {})
        d = data.get("db", {})
        trial = {
            "minio": {**RUNTIME_CONFIG.get("minio", {}), **m},
            "db": {**RUNTIME_CONFIG.get("db", {}), **d},
        }
        client, bkt, pub, _ = minio_current(trial)
        if client is None or not bkt or not pub:
            raise HTTPException(status_code=400, detail="MinIO config invalid")
        RUNTIME_CONFIG["minio"].update(m)
        RUNTIME_CONFIG["db"].update(d)
        return {"ok": True, "config": data}
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))

# ──────────────────────────────────────────────────────────────────────────────
# Auto-load DB config from app/configs without restart or page refresh
# ──────────────────────────────────────────────────────────────────────────────

def _choose_default_config_file() -> Optional[Path]:
    """Pick the profile JSON to auto-load: 'active' > 'default' > 'test',
    otherwise the most recently modified candidate; None when there is none."""
    try:
        candidates: List[Path] = list(profiles_dir.rglob("*.json"))
        if not candidates:
            return None
        by_name = {x.stem.lower(): x for x in candidates}
        for prefer in ("active", "default", "test"):
            if prefer in by_name:
                return by_name[prefer]
        return max(candidates, key=lambda x: x.stat().st_mtime)
    except Exception:
        return None
def _apply_configs_from_file(path: Path) -> None:
    """Merge a profile JSON file into RUNTIME_CONFIG (best-effort, never raises).

    DB settings overwrite unconditionally; MinIO settings are sanitized
    (console port 9001 rewritten to API port 9000) and then applied only for
    keys that are currently unset, so explicit runtime edits win.
    """
    try:
        import json as _json
        data = _json.loads(path.read_text("utf-8"))
        db_cfg = data.get("db", {})
        if isinstance(db_cfg, dict) and db_cfg:
            RUNTIME_CONFIG["db"].update(db_cfg)
        minio_cfg = data.get("minio", {})
        if isinstance(minio_cfg, dict) and minio_cfg:
            sanitized = dict(minio_cfg)
            # Rewrite a console endpoint (host:9001) to the S3 API port 9000.
            try:
                ep = str(sanitized.get("endpoint") or "").strip()
                if ep and ":9001" in ep:
                    h = ep.split("/")[0]
                    if ":" in h:
                        parts = h.split(":")
                        sanitized["endpoint"] = f"{parts[0]}:9000"
                    else:
                        sanitized["endpoint"] = h
            except Exception:
                pass
            # Same normalization for the public base URL.
            # NOTE(review): for a scheme-qualified value ("http://host:9001"),
            # pub.split("/")[0] yields "http:" — this branch looks like it
            # expects bare host[:port] input; verify against the profiles.
            try:
                pub = str(sanitized.get("public") or "").strip()
                if pub and (":9001" in pub or "/browser" in pub or "/minio" in pub):
                    host = pub.split("/")[0]
                    sec = str(sanitized.get("secure") or RUNTIME_CONFIG.get("minio", {}).get("secure") or "false").lower() in {"1","true","yes","on"}
                    scheme = "https" if sec else "http"
                    if ":" in host:
                        base_host = host.split(":")[0]
                        sanitized["public"] = f"{scheme}://{base_host}:9000"
                    else:
                        sanitized["public"] = f"{scheme}://{host}:9000"
            except Exception:
                pass
            # Fill only empty keys so live edits are not clobbered.
            for k, v in sanitized.items():
                try:
                    cur = RUNTIME_CONFIG["minio"].get(k)
                    if cur in (None, ""):
                        RUNTIME_CONFIG["minio"][k] = v
                except Exception:
                    RUNTIME_CONFIG["minio"][k] = v
    except Exception:
        pass

async def _watch_db_config_changes(interval_sec: float = 3.0) -> None:
    """Background poller: apply the preferred profile file at startup, then
    re-apply whenever a newer file (or a different preferred file) appears."""
    last_path: Optional[Path] = _choose_default_config_file()
    last_mtime: float = (last_path.stat().st_mtime if last_path and last_path.exists() else 0.0)
    # Apply once at startup
    if last_path:
        _apply_configs_from_file(last_path)
    while True:
        try:
            cur = _choose_default_config_file()
            if cur and cur.exists():
                mt = cur.stat().st_mtime
                if cur != last_path or mt > last_mtime:
                    _apply_configs_from_file(cur)
                    last_path = cur
                    last_mtime = mt
        except Exception:
            pass  # keep the watcher alive no matter what
        await asyncio.sleep(interval_sec)
@app.on_event("startup")
async def _startup_autoload_configs():
    """Start background tasks: config hot-reload watcher and time calibration."""
    try:
        asyncio.create_task(_watch_db_config_changes(interval_sec=3.0))
    except Exception:
        pass
    try:
        asyncio.create_task(_auto_time_calibration())
    except Exception:
        pass

def _publish_image_asset(data: bytes, mime: str, obj_name: str, save: bool) -> Optional[str]:
    """Upload an image to MinIO and return a URL usable inside the document.

    Prefers a 12h presigned URL, falls back to the public URL. When the
    conversion result is not being persisted (`save` falsy) an inline
    ``data:`` URI is returned instead, so the rendered document does not
    depend on MinIO availability.
    NOTE(review): extracted from four near-identical inline branches whose
    exact guard nesting was lost in the patch paste — confirm against VCS.
    """
    client, bucket, public_base, prefix = minio_current(RUNTIME_CONFIG)
    src: Optional[str] = None
    obj = join_prefix(prefix, obj_name)
    if client is not None and bucket and public_base:
        client.put_object(bucket_name=bucket, object_name=obj, data=io.BytesIO(data), length=len(data), content_type=mime)  # type: ignore
        try:
            from urllib.parse import quote as _quote
            exp = int(timedelta(hours=12).total_seconds())
            ps = presigned_read(client, bucket, obj, exp)
            src = ps or f"{public_base}/{bucket}/{_quote(obj, safe='/')}"
        except Exception:
            src = f"{public_base}/{bucket}/{obj}"
    if not save:
        try:
            import base64 as _b64
            src = f"data:{mime};base64," + _b64.b64encode(data).decode("ascii")
        except Exception:
            pass
    return src

def _store_converted_output(data: bytes, base: str, ext: str, media: str) -> Tuple[Optional[str], Optional[str]]:
    """Best-effort upload of the final artifact to MinIO under ``converted/``.

    Returns (public_url, presigned_url); both None when MinIO is unconfigured,
    store_final is disabled, or the upload fails.
    """
    client, bucket, public_base, prefix = minio_current(RUNTIME_CONFIG)
    minio_url = None
    minio_presigned_url = None
    try:
        rc_store_final = str(RUNTIME_CONFIG.get("minio", {}).get("store_final") or "true").lower() in {"1", "true", "yes", "on"}
        if client is not None and bucket and public_base and rc_store_final:
            obj = f"{(prefix or '').strip('/')}/converted/{base}{ext}".lstrip("/")
            ct = media or "application/octet-stream"
            try:
                # Text types get an explicit charset (no-op for docx/pdf).
                if ct.startswith("text/") and "charset" not in ct.lower():
                    ct = ct + "; charset=utf-8"
            except Exception:
                pass
            client.put_object(bucket_name=bucket, object_name=obj, data=io.BytesIO(data), length=len(data), content_type=ct)  # type: ignore
            try:
                from urllib.parse import quote as _quote
                minio_url = f"{public_base}/{bucket}/{_quote(obj, safe='/')}"
            except Exception:
                minio_url = f"{public_base}/{bucket}/{obj}"
            try:
                exp = int(timedelta(hours=12).total_seconds())
                minio_presigned_url = presigned_read(client, bucket, obj, exp)
            except Exception:
                minio_presigned_url = None
    except Exception:
        minio_url = None
    return minio_url, minio_presigned_url

@app.post("/md/convert", response_model=ConvertResponse)
async def md_convert(
    md_file: Optional[UploadFile] = File(None),
    markdown_text: Optional[str] = Form(None),
    markdown_url: Optional[str] = Form(None),
    target: str = Form("docx"),
    save: Optional[bool] = Form(False),
    filename: Optional[str] = Form(None),
    css_name: Optional[str] = Form(None),
    css_text: Optional[str] = Form(None),
    toc: Optional[bool] = Form(True),
    header_text: Optional[str] = Form(None),
    footer_text: Optional[str] = Form(None),
    logo_url: Optional[str] = Form(None),
    logo_file: Optional[UploadFile] = File(None),
    cover_url: Optional[str] = Form(None),
    cover_file: Optional[UploadFile] = File(None),
    product_name: Optional[str] = Form(None),
    document_name: Optional[str] = Form(None),
    product_version: Optional[str] = Form(None),
    document_version: Optional[str] = Form(None),
    copyright_text: Optional[str] = Form(None),
):
    """Convert Markdown (file, raw text, or URL — exactly one) to DOCX or PDF.

    Optional styling: CSS profile or inline CSS (PDF only), TOC, header/footer
    text, logo and cover images (uploaded or referenced), cover metadata, and
    copyright text. Local assets referenced by the Markdown are re-hosted on
    MinIO when configured. Returns a ConvertResponse with MinIO URLs for the
    stored artifact (or None when not stored).

    Fix over original: the logo/cover upload logic (repeated four times) and
    the final store+notify logic (repeated twice) are factored into
    `_publish_image_asset` / `_store_converted_output`; behavior preserved.
    """
    logging.info(f"md_convert start target={target} save={save} filename=(unknown)")
    provided = sum(1 for present in (md_file is not None, bool(markdown_text), bool(markdown_url)) if present)
    if provided != 1:
        raise HTTPException(status_code=400, detail="provide exactly one of md_file, markdown_text, markdown_url")
    if target.lower() not in {"docx", "pdf"}:
        raise HTTPException(status_code=400, detail="target must be docx or pdf")
    mappings: List[Dict[str, str]] = []
    base_dir = Path(".").resolve()
    if md_file is not None:
        content = (await md_file.read()).decode("utf-8", errors="ignore")
        # NOTE(review): filename is client-supplied; its parent is rarely a real
        # server path, so asset resolution relative to it is best-effort only.
        base_dir = Path(md_file.filename or ".").resolve().parent if md_file and md_file.filename else Path(".")
        base = sanitize_filename(filename) if filename else sanitize_filename(os.path.splitext(md_file.filename or "document")[0])
    elif markdown_url:
        src = markdown_url.strip()
        try:
            if src.lower().startswith("http"):
                already_escaped = "%" in src
                safe = src if already_escaped else _safe_http_url(src)
                try:
                    with urlopen(safe, timeout=10) as r:
                        raw = r.read()
                    try:
                        logging.info(f"md_convert fetched markdown_url len={len(raw)} url={safe}")
                    except Exception:
                        pass
                except UnicodeEncodeError:
                    # Non-ASCII URL slipped through: percent-escape and retry once.
                    alt = quote(src, safe=':/?&=%#')
                    with urlopen(_safe_http_url(alt), timeout=10) as r:
                        raw = r.read()
                    try:
                        logging.info(f"md_convert fetched markdown_url(len={len(raw)}) with alt url")
                    except Exception:
                        pass
                except HTTPError as err:
                    raise HTTPException(status_code=400, detail={"error": "fetch_failed", "status": err.code, "url": getattr(err, 'url', src)})
                except URLError as err:
                    raise HTTPException(status_code=400, detail={"error": "fetch_failed", "status": None, "url": src, "reason": str(getattr(err, 'reason', err))})
                try:
                    content = raw.decode("utf-8")
                except Exception:
                    content = raw.decode("latin-1", errors="ignore")
            else:
                with open(src, "r", encoding="utf-8", errors="ignore") as f:
                    content = f.read()
                base_dir = Path(src).resolve().parent
        except HTTPException:
            raise
        except Exception as e:
            raise HTTPException(status_code=400, detail={"error": "fetch_failed", "url": src, "message": str(e)})
        base = sanitize_filename(filename) if filename else sanitize_filename(infer_basename(src, None))
    else:
        content = markdown_text or ""
        base = sanitize_filename(filename) if filename else "document"

    # Rewrite local assets to MinIO URLs if configured
    client, bucket, public_base, prefix = minio_current(RUNTIME_CONFIG)
    if client is not None and bucket and public_base and base_dir:
        try:
            content, mappings = _rewrite_md_assets_to_minio(content, base_dir, client, bucket, public_base, prefix)
        except Exception:
            pass

    # Prepare common assets (logo, cover) for both DOCX and PDF.
    logo_src = None
    try:
        if logo_file is not None and getattr(logo_file, "filename", None):
            lb = await logo_file.read()
            mime = detect_image_mime(logo_file.filename, lb)
            stem = sanitize_filename(os.path.splitext(logo_file.filename or "logo")[0])
            extl = "." + (logo_file.filename.rsplit(".", 1)[-1].lower() if "." in (logo_file.filename or "") else "png")
            logo_src = _publish_image_asset(lb, mime, f"uploads/logo/{int(time.time())}-{stem}{extl}", bool(save))
        elif logo_url:
            u = logo_url.strip()
            if u.lower().startswith("http://") or u.lower().startswith("https://"):
                logo_src = u
            else:
                # Absolute and relative local paths were handled by two
                # byte-identical branches in the original; merged here.
                p = Path(u)
                try:
                    lb = p.read_bytes()
                    mime = detect_image_mime(p.name, lb)
                    logo_src = _publish_image_asset(lb, mime, f"uploads/logo/{int(time.time())}-{sanitize_filename(p.stem)}{p.suffix or '.png'}", bool(save))
                except Exception:
                    logo_src = p.resolve().as_uri()
    except Exception:
        logo_src = None

    cover_src = None
    try:
        limit = 2 * 1024 * 1024  # 2 MB cap on cover images
        if cover_file is not None and getattr(cover_file, "filename", None):
            cb = await cover_file.read()
            if len(cb) > limit:
                raise HTTPException(status_code=400, detail="cover image exceeds 2MB limit")
            mime = detect_image_mime(cover_file.filename, cb)
            stem = sanitize_filename(os.path.splitext(cover_file.filename or "cover")[0])
            extc = "." + (cover_file.filename.rsplit(".", 1)[-1].lower() if "." in (cover_file.filename or "") else "png")
            cover_src = _publish_image_asset(cb, mime, f"uploads/cover/{int(time.time())}-{stem}{extc}", bool(save))
        elif cover_url:
            cu = cover_url.strip()
            if cu.lower().startswith("http://") or cu.lower().startswith("https://"):
                cover_src = cu
            else:
                p = Path(cu)
                rb = p.read_bytes()
                if len(rb) > limit:
                    raise HTTPException(status_code=400, detail="cover image exceeds 2MB limit")
                mime = detect_image_mime(cu, rb)
                cover_src = _publish_image_asset(rb, mime, f"uploads/cover/{int(time.time())}-{sanitize_filename(p.stem)}{p.suffix or '.png'}", bool(save))
    except HTTPException:
        raise
    except Exception:
        cover_src = None

    logging.info(f"md_convert assets prepared logo_src={bool(logo_src)} cover_src={bool(cover_src)} css_name={css_name} css_text_len={(len(css_text) if css_text else 0)}")
    common = dict(
        toc=bool(toc),
        header_text=header_text,
        footer_text=footer_text,
        logo_url=logo_src or logo_url,
        copyright_text=copyright_text,
        filename_text=base,
        cover_src=cover_src,
        product_name=product_name,
        document_name=document_name,
        product_version=product_version,
        document_version=document_version,
    )
    if target.lower() == "docx":
        data = md_to_docx_bytes(content, **common)
        media = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        ext = ".docx"
    else:
        # Fall back to the "default" CSS profile only when no inline CSS given.
        use_css_name = css_name if css_name else ("default" if not css_text else None)
        data = md_to_pdf_bytes_with_renderer(content, "weasyprint", css_name=use_css_name, css_text=css_text, **common)
        media = "application/pdf"
        ext = ".pdf"
    minio_url, minio_presigned_url = _store_converted_output(data, base, ext, media)
    logging.info(f"md_convert done {target.lower()} name={base}{ext} size={len(data)}")
    _db_notify({
        "type": "md_convert",
        "base": base,
        "target": target.lower(),
        "local_url": None,
        "minio_url": minio_url,
        "minio_presigned_url": minio_presigned_url,
        "mappings": mappings,
        "time": int(time.time())
    })
    return ConvertResponse(
        minio_url=minio_url,
        minio_presigned_url=minio_presigned_url,
        name=f"{base}{ext}",
        media_type=media,
    )
@app.get("/config/linkmap")
def get_linkmap():
    """Return the persisted link mapping (see load_linkmap)."""
    return load_linkmap()

@app.post("/config/linkmap")
async def set_linkmap(mapping: dict):
    """Persist a new link mapping; errors surface as HTTP 400."""
    try:
        save_linkmap(mapping)
        return {"ok": True}
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))

def detect_image_mime(filename: Optional[str], data: bytes) -> str:
    """Best-effort image MIME detection: extension first, then magic bytes."""
    ext = (os.path.splitext(filename or "")[1] or "").lower()
    if ext in {".png"}:
        return "image/png"
    if ext in {".jpg", ".jpeg"}:
        return "image/jpeg"
    if ext in {".svg"}:
        return "image/svg+xml"
    if ext in {".webp"}:
        return "image/webp"
    if data.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if data.startswith(b"\xff\xd8\xff"):
        return "image/jpeg"
    if len(data) >= 12 and data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return "image/webp"
    try:
        head = data[:512].decode("utf-8", errors="ignore")
        # NOTE(review): this branch was destroyed in the patch extraction
        # (angle brackets stripped); reconstructed as an SVG sniff on the text
        # head — confirm against version control.
        if "<svg" in head.lower():
            return "image/svg+xml"
    except Exception:
        pass
    return "application/octet-stream"

def detect_mime(filename: Optional[str], data: bytes) -> str:
    """General MIME detection: image fast paths, then mimetypes by name,
    falling back to application/octet-stream."""
    ext = (os.path.splitext(filename or "")[1] or "").lower()
    if ext in {".png", ".jpg", ".jpeg", ".svg", ".webp"}:
        return detect_image_mime(filename, data)
    sig_png = data.startswith(b"\x89PNG\r\n\x1a\n")
    sig_jpg = data.startswith(b"\xff\xd8\xff")
    sig_webp = len(data) >= 12 and data[:4] == b"RIFF" and data[8:12] == b"WEBP"
    if sig_png or sig_jpg or sig_webp:
        return detect_image_mime(filename, data)
    guessed, _ = mimetypes.guess_type(filename or "")
    if guessed:
        return guessed
    return "application/octet-stream"

@app.post("/proxy/download")
async def proxy_download(url: str = Form(...)):
    """Fetch an HTTP(S) URL or a local file and re-serve it as an attachment,
    inventing a filename (and extension from the content type) when needed."""
    u = (url or "").strip()
    if not u:
        raise HTTPException(status_code=400, detail="url required")
    try:
        data: bytes
        ct: str
        name: str
        if u.lower().startswith("http://") or u.lower().startswith("https://"):
            already_escaped = "%" in u
            safe = u if already_escaped else _safe_http_url(u)
            with urlopen(safe, timeout=15) as r:
                data = r.read()
                ct = r.headers.get("Content-Type") or detect_mime(None, data)
            from urllib.parse import urlparse, unquote
            import os as _os
            parsed = urlparse(u)
            path = unquote(parsed.path or "")
            last = (_os.path.basename(path) or "download").split("?")[0]
            if "." in last:
                name = last
            else:
                import mimetypes as _m
                ext = _m.guess_extension((ct or "").split(";")[0].strip()) or ".md"
                name = last + ext
        else:
            p = Path(u)
            if not p.exists() or not p.is_file():
                raise HTTPException(status_code=404, detail="local path not found")
            data = p.read_bytes()
            ct = detect_mime(p.name, data)
            name = p.name
        disp = f"attachment; filename=\"{name}\"; filename*=UTF-8''" + quote(name)
        headers = {"Content-Disposition": disp}
        return Response(content=data, media_type=ct, headers=headers)
    except HTTPError as err:
        raise HTTPException(status_code=err.code, detail=f"download failed: {err}")
    except URLError as err:
        raise HTTPException(status_code=400, detail=f"download failed: {err}")
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))

def _minio_from_env() -> Tuple[Optional[object], Optional[str], Optional[str], str]:
    """Build a MinIO client from environment variables.

    Returns (client, bucket, public_base, prefix); (None, None, None, "") when
    any required variable is missing or the client library is unavailable.
    """
    endpoint = os.environ.get("MINIO_ENDPOINT")
    access = os.environ.get("MINIO_ACCESS_KEY")
    secret = os.environ.get("MINIO_SECRET_KEY")
    bucket = os.environ.get("MINIO_BUCKET")
    secure = str(os.environ.get("MINIO_SECURE", "false")).lower() in {"1", "true", "yes", "on"}
    public_base = os.environ.get("MINIO_PUBLIC_ENDPOINT") or (f"https://{endpoint}" if secure else f"http://{endpoint}" if endpoint else None)
    if Minio is None or not endpoint or not access or not secret or not bucket or not public_base:
        return None, None, None, ""
    client = Minio(endpoint, access_key=access, secret_key=secret, secure=secure)
    try:
        _minio_create_bucket(client, bucket)  # ensure the bucket exists
    except Exception:
        pass
    return client, bucket, public_base, os.environ.get("MINIO_PREFIX", "")
secret_key=secret, secure=secure) + try: + _minio_create_bucket(client, bucket) + except Exception: + pass + return client, bucket, public_base, os.environ.get("MINIO_PREFIX", "") + + + +def _export_ext(export: str) -> str: + e = (export or "").lower() + if e == "markdown": + return ".md" + if e == "html": + return ".html" + if e in {"json", "doctags"}: + return ".json" + return ".txt" + +def _media_type(export: str) -> str: + e = (export or "").lower() + if e == "markdown": + return "text/markdown; charset=utf-8" + if e == "html": + return "text/html; charset=utf-8" + if e in {"json", "doctags"}: + return "application/json" + return "text/plain; charset=utf-8" + + + +def _rewrite_md_assets_to_minio(text: str, base_dir: Path, client: object, bucket: str, public_base: str, prefix: str, search_root: Optional[Path] = None) -> Tuple[str, List[Dict[str, str]]]: + mappings: List[Dict[str, str]] = [] + def _abs_key(p: Path) -> str: + k = p.resolve().as_posix().lstrip("/") + return k.replace(":", "") + def _upload_data_uri(uri: str) -> Optional[str]: + try: + import base64, hashlib + head, _, b64 = uri.partition(",") + if not b64: + return None + b = base64.b64decode(b64, validate=False) + mime = "" + try: + low = head.lower() + pos = low.find("data:") + if pos != -1: + rest = head[pos+5:] + semi = rest.find(";") + mime = rest[:semi] if semi != -1 else rest + except Exception: + mime = "" + if not mime: + mime = detect_image_mime(None, b) + ext = ".png" + if mime.lower() in {"image/jpeg", "image/jpg"}: + ext = ".jpg" + elif mime.lower() == "image/webp": + ext = ".webp" + elif mime.lower() == "image/svg+xml": + ext = ".svg" + elif mime.lower() == "image/gif": + ext = ".gif" + h = hashlib.sha256(b).hexdigest()[:16] + obj = join_prefix(prefix, f"embed/{h}{ext}") + bio = io.BytesIO(b) + client.put_object(bucket_name=bucket, object_name=obj, data=bio, length=len(b), content_type=mime or detect_image_mime(None, b)) # type: ignore + try: + from urllib.parse import quote as _quote 
+ return f"{public_base}/{bucket}/{_quote(obj, safe='/')}" + except Exception: + return f"{public_base}/{bucket}/{obj}" + except Exception: + return None + def _upload(path: Path) -> Optional[str]: + try: + data = path.read_bytes() + mime = detect_mime(path.name, data) + obj = join_prefix(prefix, f"abs/{_abs_key(path)}") + bio = io.BytesIO(data) + size = len(data) + client.put_object(bucket_name=bucket, object_name=obj, data=bio, length=size, content_type=mime) # type: ignore + try: + from urllib.parse import quote as _quote + return f"{public_base}/{bucket}/{_quote(obj, safe='/')}" + except Exception: + return f"{public_base}/{bucket}/{obj}" + except Exception: + return None + def _resolve_path(pure: str) -> Optional[Path]: + q = pure.replace("\\", "/") + if q.startswith("/"): + try: + rel = q.lstrip("/") + base = (search_root or base_dir) + p0 = (base / rel).resolve() + except Exception: + p0 = (search_root or base_dir) / q.lstrip("/") + if p0.exists(): + return p0 + try: + p = (base_dir / q).resolve() + except Exception: + p = (base_dir / q) + if p.exists(): + return p + try: + name = Path(q).name + search = (search_root or base_dir) + for hit in search.rglob(name): + if hit.exists(): + return hit + except Exception: + pass + return None + def _replace_md(m: re.Match) -> str: + full = m.group(0) + urlpart = m.group(1).strip() + if urlpart.startswith("data:"): + new = _upload_data_uri(urlpart) + if new: + mappings.append({"from": "data_uri", "to": new, "ok": True, "type": "md_image_data"}) + return full.replace(urlpart, new) + mappings.append({"from": "data_uri", "to": None, "ok": False, "type": "md_image_data"}) + return full + if urlpart.startswith("http://") or urlpart.startswith("https://"): + return full + s = urlpart + pure = s + tail = "" + if s.startswith("<"): + gt = s.find(">") + if gt != -1: + pure = s[1:gt].strip() + tail = s[gt+1:] + else: + dq = s.find('"') + sq = s.find("'") + qpos = -1 + if dq != -1 and sq != -1: + qpos = dq if dq < sq else sq + 
elif dq != -1: + qpos = dq + elif sq != -1: + qpos = sq + if qpos != -1: + pure = s[:qpos].rstrip() + tail = s[qpos:] + p = _resolve_path(pure) + if not p or not p.exists(): + mappings.append({"from": pure, "to": None, "ok": False, "type": "md_link"}) + return full + new = _upload(p) + if not new: + mappings.append({"from": pure, "to": None, "ok": False, "type": "md_link"}) + return full + mappings.append({"from": pure, "to": new, "ok": True, "type": "md_link"}) + return full.replace(urlpart, f"{new}{tail}") + text = re.sub(r"!\[[^\]]*\]\(([^)]+)\)", _replace_md, text) + def _replace_mdlink(m: re.Match) -> str: + full = m.group(0) + urlpart = m.group(1).strip() + if urlpart.startswith("http://") or urlpart.startswith("https://") or urlpart.startswith("data:"): + return full + s = urlpart + pure = s + tail = "" + if s.startswith("<"): + gt = s.find(">") + if gt != -1: + pure = s[1:gt].strip() + tail = s[gt+1:] + else: + dq = s.find('"') + sq = s.find("'") + qpos = -1 + if dq != -1 and sq != -1: + qpos = dq if dq < sq else sq + elif dq != -1: + qpos = dq + elif sq != -1: + qpos = sq + if qpos != -1: + pure = s[:qpos].rstrip() + tail = s[qpos:] + p = _resolve_path(pure) + if not p or not p.exists(): + mappings.append({"from": pure, "to": None, "ok": False, "type": "md_link"}) + return full + new = _upload(p) + if not new: + mappings.append({"from": pure, "to": None, "ok": False, "type": "md_link"}) + return full + mappings.append({"from": pure, "to": new, "ok": True, "type": "md_link"}) + return full.replace(urlpart, f"{new}{tail}") + text = re.sub(r"(? 
str: + src = m.group(1).strip() + if src.startswith("data:"): + new = _upload_data_uri(src) + if new: + mappings.append({"from": "data_uri", "to": new, "ok": True, "type": "html_img_data"}) + return m.group(0).replace(src, new) + mappings.append({"from": "data_uri", "to": None, "ok": False, "type": "html_img_data"}) + return m.group(0) + if src.startswith("http://") or src.startswith("https://"): + return m.group(0) + pure = src + p = _resolve_path(pure) + if not p or not p.exists(): + mappings.append({"from": pure, "to": None, "ok": False, "type": "html_img"}) + return m.group(0) + new = _upload(p) + if not new: + mappings.append({"from": pure, "to": None, "ok": False, "type": "html_img"}) + return m.group(0) + mappings.append({"from": pure, "to": new, "ok": True, "type": "html_img"}) + return m.group(0).replace(src, new) + text = re.sub(r"]+src=\"([^\"]+)\"", _replace_img, text) + text = re.sub(r"]+src='([^']+)'", _replace_img, text) + def _replace_href(m: re.Match) -> str: + src = m.group(1).strip() + if src.startswith("http://") or src.startswith("https://") or src.startswith("data:"): + return m.group(0) + pure = src + p = _resolve_path(pure) + if not p or not p.exists(): + mappings.append({"from": pure, "to": None, "ok": False, "type": "html_href"}) + return m.group(0) + new = _upload(p) + if not new: + mappings.append({"from": pure, "to": None, "ok": False, "type": "html_href"}) + return m.group(0) + mappings.append({"from": pure, "to": new, "ok": True, "type": "html_href"}) + return m.group(0).replace(src, new) + text = re.sub(r"]+href=\"([^\"]+)\"", _replace_href, text) + text = re.sub(r"]+href='([^']+)'", _replace_href, text) + def _replace_video(m: re.Match) -> str: + src = m.group(1).strip() + if src.startswith("http://") or src.startswith("https://") or src.startswith("data:"): + return m.group(0) + pure = src + p = _resolve_path(pure) + if not p or not p.exists(): + mappings.append({"from": pure, "to": None, "ok": False, "type": "html_video"}) + 
return m.group(0) + new = _upload(p) + if not new: + mappings.append({"from": pure, "to": None, "ok": False, "type": "html_video"}) + return m.group(0) + mappings.append({"from": pure, "to": new, "ok": True, "type": "html_video"}) + return m.group(0).replace(src, new) + text = re.sub(r"]+src=\"([^\"]+)\"", _replace_video, text) + text = re.sub(r"]+src='([^']+)'", _replace_video, text) + def _replace_audio(m: re.Match) -> str: + src = m.group(1).strip() + if src.startswith("http://") or src.startswith("https://") or src.startswith("data:"): + return m.group(0) + pure = src + p = _resolve_path(pure) + if not p or not p.exists(): + mappings.append({"from": pure, "to": None, "ok": False, "type": "html_audio"}) + return m.group(0) + new = _upload(p) + if not new: + mappings.append({"from": pure, "to": None, "ok": False, "type": "html_audio"}) + return m.group(0) + mappings.append({"from": pure, "to": new, "ok": True, "type": "html_audio"}) + return m.group(0).replace(src, new) + text = re.sub(r"]+src=\"([^\"]+)\"", _replace_audio, text) + text = re.sub(r"]+src='([^']+)'", _replace_audio, text) + def _replace_source(m: re.Match) -> str: + src = m.group(1).strip() + if src.startswith("http://") or src.startswith("https://") or src.startswith("data:"): + return m.group(0) + pure = src + p = _resolve_path(pure) + if not p or not p.exists(): + mappings.append({"from": pure, "to": None, "ok": False, "type": "html_source"}) + return m.group(0) + new = _upload(p) + if not new: + mappings.append({"from": pure, "to": None, "ok": False, "type": "html_source"}) + return m.group(0) + mappings.append({"from": pure, "to": new, "ok": True, "type": "html_source"}) + return m.group(0).replace(src, new) + text = re.sub(r"]+src=\"([^\"]+)\"", _replace_source, text) + text = re.sub(r"]+src='([^']+)'", _replace_source, text) + return text, mappings + +def _uplift_rel_path(rel: Path, md_dir: Path, root: Optional[Path], mappings: List[Dict[str, str]]) -> Path: + try: + parts = list(rel.parts) 
+ if len(parts) < 2: + return rel + exts = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".webp"} + def _is_asset_dir(name: str) -> bool: + n = name.strip().lower() + return n in {"image", "images", "img", "imgs", "media", "assets", "pic", "pics", "picture", "pictures", "visio pic", "visio_pic", "visio", "图片", "图像"} + def _has_asset_sibling() -> bool: + try: + for ch in md_dir.iterdir(): + if ch.is_dir() and _is_asset_dir(ch.name): + for f in ch.rglob("*"): + if f.is_file() and f.suffix.lower() in exts: + return True + for f in md_dir.iterdir(): + if f.is_file() and f.suffix.lower() in exts: + return True + except Exception: + pass + return False + def _mappings_indicate_local_assets() -> bool: + try: + for m in mappings or []: + if isinstance(m.get("from"), str): + s = str(m.get("from") or "").strip() + if s and not (s.startswith("http://") or s.startswith("https://") or s.startswith("data:") or s.startswith("file://")): + return True + except Exception: + pass + return False + try: + if len(parts) >= 2: + new_parts = parts[:-2] + [parts[-1]] + return Path("/".join(new_parts)) + except Exception: + pass + return rel + except Exception: + return rel + +def _inject_image_urls_for_markers(text: str, urls: List[str]) -> str: + if not urls: + return text + out = [] + i = 0 + for line in text.splitlines(): + if "" in line and i < len(urls): + line = line.replace("", f"![image]({urls[i]})") + i += 1 + out.append(line) + return "\n".join(out) + +def _extract_pdf_images(pdf_path: Path) -> List[Tuple[str, bytes]]: + imgs: List[Tuple[str, bytes]] = [] + if fitz is None: + return imgs + try: + doc = fitz.open(pdf_path) + for page in doc: + for xref in page.get_images(full=True): + try: + info = doc.extract_image(xref[0]) + ext = info.get("ext", "png") + data = info.get("image", b"") + if data: + imgs.append((ext, data)) + except Exception: + continue + doc.close() + except Exception: + pass + return imgs + +def _bulk_upload_assets(root: Path, client: object, bucket: str, 
def _bulk_upload_assets(root: Path, client: object, bucket: str,
                        public_base: str, prefix: str) -> List[str]:
    """Upload every image-like file under *root* to MinIO, best-effort.

    Objects are keyed by the file's absolute POSIX path (drive colons
    stripped) under ``<prefix>/abs/``; individual failures are skipped.
    Returns the public URLs of the uploaded objects.
    """
    urls: List[str] = []
    exts = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".webp", ".tif",
            ".tiff", ".ico", ".jfif", ".heic", ".heif", ".emf", ".wmf", ".eps", ".psd"}
    for f in root.rglob("*"):
        try:
            if not f.is_file() or f.suffix.lower() not in exts:
                continue
            data = f.read_bytes()
            mime = detect_mime(f.name, data)
            key = f.resolve().as_posix().lstrip("/").replace(":", "")
            obj = join_prefix(prefix, f"abs/{key}")
            client.put_object(bucket_name=bucket, object_name=obj,
                              data=io.BytesIO(data), length=len(data),
                              content_type=mime)  # type: ignore
            urls.append(f"{public_base}/{bucket}/{obj}")
        except Exception:
            pass  # best-effort: one bad asset must not abort the walk
    return urls


@app.post("/md/convert-folder")
async def md_convert_folder(folder_path: str = Form(...), prefix: Optional[str] = Form(None)):
    """Rewrite every ``*.md`` under a server-side *folder_path* so local asset
    references point at MinIO, then upload the rewritten Markdown under
    ``<prefix>/rewritten/``.  Returns one result record per Markdown file."""
    p = Path(folder_path).expanduser().resolve()
    if not p.exists() or not p.is_dir():
        raise HTTPException(status_code=400, detail="folder_path must be an existing directory")
    client, bucket, public_base, env_prefix = minio_current(RUNTIME_CONFIG)
    if client is None or bucket is None or not public_base:
        raise HTTPException(status_code=400, detail="MinIO is not configured")
    use_prefix = (prefix or env_prefix or "").strip()
    processed: List[Dict[str, object]] = []
    try:
        _bulk_upload_assets(p, client, bucket, public_base, use_prefix)
    except Exception:
        pass  # asset pre-upload is best-effort
    for md_file in p.rglob("*.md"):
        rel_md = md_file.relative_to(p)
        rel_uplift_path = rel_md
        minio_url: Optional[str] = None
        minio_presigned_url: Optional[str] = None
        mappings: List[Dict[str, str]] = []
        try:
            content = md_file.read_text("utf-8", errors="ignore")
            new_text, mappings = _rewrite_md_assets_to_minio(
                content, md_file.parent, client, bucket, public_base, use_prefix, search_root=p)
            rel_uplift_path = _uplift_rel_path(rel_md, md_file.parent, p, mappings)
            # Upload the rewritten Markdown.
            obj = f"{use_prefix.strip('/')}/rewritten/{rel_uplift_path.as_posix()}".lstrip("/")
            raw = new_text.encode("utf-8")
            client.put_object(bucket_name=bucket, object_name=obj, data=io.BytesIO(raw),
                              length=len(raw), content_type="text/markdown; charset=utf-8")  # type: ignore
            # quote/unquote are module-level imports; the per-call re-import
            # and the duplicated, unused minio_url_display assignments were removed.
            minio_url = f"{public_base}/{bucket}/{quote(obj, safe='/')}"
            try:
                exp = int(timedelta(hours=12).total_seconds())
                minio_presigned_url = presigned_read(client, bucket, obj, exp)
            except Exception:
                minio_presigned_url = None
        except Exception as e:
            logging.error(str(e))
        okc = sum(1 for m in mappings if m.get("ok"))
        frc = sum(1 for m in mappings if not m.get("ok"))
        asset_urls = [m.get("to") for m in mappings if m.get("ok") and m.get("to")]
        processed.append({
            "source": rel_uplift_path.as_posix(),
            "output": None,
            "minio_url": minio_url,
            "minio_presigned_url": minio_presigned_url,
            "mappings": mappings,
            "asset_ok": okc,
            "asset_fail": frc,
            "asset_urls": asset_urls,
        })
    return {"ok": True, "count": len(processed), "files": processed}


@app.post("/md/upload-folder")
async def md_upload_folder(folder_files: List[UploadFile] = File(None),
                           folder_paths: List[str] = Form(None),
                           prefix: Optional[str] = Form(None)):
    """Stage an uploaded folder (parallel file / relative-path lists), rewrite
    the Markdown assets to MinIO URLs and upload the rewritten Markdown."""
    if not folder_files or not folder_paths or len(folder_files) != len(folder_paths):
        raise HTTPException(status_code=400, detail="folder_files and folder_paths are required and must match in length")
    client, bucket, public_base, env_prefix = minio_current(RUNTIME_CONFIG)
    if client is None or bucket is None or not public_base:
        raise HTTPException(status_code=400, detail="MinIO is not configured")
    use_prefix = (prefix or env_prefix or "").strip()
    staging = Path(tempfile.mkdtemp(prefix="folder_stage_"))
    try:
        # Materialise the upload under the staging root, preserving structure.
        for f, rel in zip(folder_files, folder_paths):
            dest = staging / rel.replace("\\", "/")
            dest.parent.mkdir(parents=True, exist_ok=True)
            dest.write_bytes(await f.read())
        base = staging
        try:
            _bulk_upload_assets(base, client, bucket, public_base, use_prefix)
        except Exception:
            pass  # best-effort
        processed: List[Dict[str, object]] = []
        for md_file in base.rglob("*.md"):
            try:
                content = md_file.read_text("utf-8", errors="ignore")
                new_text, mappings = _rewrite_md_assets_to_minio(
                    content, md_file.parent, client, bucket, public_base, use_prefix, search_root=base)
                rel_md = md_file.relative_to(base)
                rel_uplift = _uplift_rel_path(rel_md, md_file.parent, base, mappings)
                minio_url = None
                minio_presigned_url = None
                try:
                    obj = f"{use_prefix.strip('/')}/rewritten/{rel_uplift.as_posix()}".lstrip("/")
                    raw = new_text.encode("utf-8")  # encode once; reused for data and length
                    client.put_object(bucket_name=bucket, object_name=obj, data=io.BytesIO(raw),
                                      length=len(raw), content_type="text/markdown; charset=utf-8")  # type: ignore
                    minio_url = f"{public_base}/{bucket}/{quote(obj, safe='/')}"
                    try:
                        exp = int(timedelta(hours=12).total_seconds())
                        minio_presigned_url = presigned_read(client, bucket, obj, exp)
                    except Exception:
                        minio_presigned_url = None
                except Exception:
                    minio_url = None
                    minio_presigned_url = None
                okc = sum(1 for m in mappings if m.get("ok"))
                frc = sum(1 for m in mappings if not m.get("ok"))
                asset_urls = [m.get("to") for m in mappings if m.get("ok") and m.get("to")]
                processed.append({
                    "source": rel_uplift.as_posix(),
                    "output": None,
                    "minio_url": minio_url,
                    "minio_presigned_url": minio_presigned_url,
                    "mappings": mappings,
                    "asset_ok": okc,
                    "asset_fail": frc,
                    "asset_urls": asset_urls,
                })
            except Exception as e:
                logging.error(str(e))
        return {"ok": True, "count": len(processed), "files": processed}
    finally:
        try:
            shutil.rmtree(staging)
        except Exception:
            pass
# Console logging for the whole service.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

_TRUTHY = ("1", "true", "yes", "on")


def _is_debug(request: Request) -> bool:
    """Whether error responses should carry tracebacks for this request
    (?debug= query flag, X-Debug header, or APP_DEBUG environment)."""
    try:
        flag = request.query_params.get("debug")
        if flag and str(flag).lower() in _TRUTHY:
            return True
    except Exception:
        pass
    header = request.headers.get("X-Debug")
    if header and str(header).lower() in _TRUTHY:
        return True
    env_flag = os.environ.get("APP_DEBUG")
    return bool(env_flag and str(env_flag).lower() in _TRUTHY)


@app.middleware("http")
async def logging_middleware(request: Request, call_next):
    """Log method, path, status and wall-clock duration for every request;
    failures are logged with their traceback and re-raised."""
    started = time.time()
    try:
        response = await call_next(request)
    except Exception as exc:
        elapsed = int((time.time() - started) * 1000)
        tb = "".join(traceback.format_exception(type(exc), exc, exc.__traceback__))
        logging.error(f"{request.method} {request.url.path} FAILED {elapsed}ms: {exc}\n{tb}")
        raise
    elapsed = int((time.time() - started) * 1000)
    logging.info(f"{request.method} {request.url.path} -> {response.status_code} {elapsed}ms")
    return response


@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException):
    """Render HTTPException as an ``http_error`` body, keeping its status code."""
    tb = "".join(traceback.format_exception(type(exc), exc, exc.__traceback__))
    logging.error(f"HTTP error on {request.method} {request.url.path}: {exc}\n{tb}")
    body = {"error": "http_error", "detail": exc.detail}
    if _is_debug(request):
        body["trace"] = tb
    return JSONResponse(status_code=exc.status_code, content=body)


@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Last-resort handler: log the traceback and answer with a 500 body."""
    tb = "".join(traceback.format_exception(type(exc), exc, exc.__traceback__))
    logging.error(f"Unhandled error on {request.method} {request.url.path}: {exc}\n{tb}")
    body = {"error": "internal_error", "detail": str(exc)}
    if _is_debug(request):
        body["trace"] = tb
    return JSONResponse(status_code=500, content=body)


def _safe_http_url(u: str) -> str:
    """Percent-encode path/query/fragment and IDNA-encode the hostname of *u*.

    Any parsing problem returns the URL unchanged; bracketed IPv6 hosts and
    userinfo are preserved as-is.
    """
    try:
        parts = urlsplit(u)
        safe_path = quote(parts.path, safe="/:%")
        safe_query = quote(parts.query, safe="=&%")
        safe_frag = quote(parts.fragment, safe="")
        netloc = parts.netloc
        try:
            userinfo, hostport = "", netloc
            if "@" in netloc:
                userinfo, hostport = netloc.split("@", 1)
                userinfo += "@"
            if not hostport.startswith("["):  # bracketed IPv6 passes through untouched
                host, port = hostport, ""
                if ":" in hostport:
                    host, port = hostport.rsplit(":", 1)
                    if port and not port.isdigit():
                        # Not a real port — treat the whole thing as the host.
                        host, port = hostport, ""
                try:
                    host = host.encode("idna").decode("ascii")
                except Exception:
                    pass  # keep the original host on IDNA failure
                hostport = f"{host}:{port}" if port else host
            netloc = userinfo + hostport
        except Exception:
            pass
        return urlunsplit((parts.scheme, netloc, safe_path, safe_query, safe_frag))
    except Exception:
        return u


# ──────────────────────────────────────────────────────────────────────────────
# API v2 endpoints with standard code/msg/data
# ──────────────────────────────────────────────────────────────────────────────

_converter_v2 = FormatConverter()


def _ok(data: dict, msg: str = "ok"):
    """Success envelope: code 0 plus payload."""
    return JSONResponse({"code": 0, "msg": msg, "data": data})


def _err(msg: str, code: int = 500, detail: object = None):
    """Failure envelope; HTTP status stays 200 by design — ``code`` carries the error."""
    payload = {"code": code, "msg": msg, "data": None}
    if detail is not None:
        payload["detail"] = detail
    return JSONResponse(payload, status_code=200)
def _collect_pdf_image_urls(src_path: Optional[Path], base: str, client, bucket: str,
                            public_base: str, prefix: Optional[str], trace: List[str]) -> List[str]:
    """Extract images embedded in a local PDF and upload them under ``converted/``.

    Returns the public URLs of the uploaded images; [] when *src_path* is not
    an existing ``.pdf`` file.
    """
    extra_urls: List[str] = []
    if not (src_path and src_path.exists() and src_path.suffix.lower() == ".pdf"):
        return extra_urls
    for idx, (img_ext, data) in enumerate(_extract_pdf_images(src_path)):
        obj = join_prefix(prefix, f"converted/{base}_img_{idx}.{img_ext}")
        mime = "image/png" if img_ext.lower() == "png" else "image/jpeg"
        client.put_object(bucket_name=bucket, object_name=obj, data=io.BytesIO(data),
                          length=len(data), content_type=mime)  # type: ignore
        extra_urls.append(f"{public_base}/{bucket}/{quote(obj, safe='/')}")
    trace.append(f"pdf_imgs_uploaded={len(extra_urls)}")
    return extra_urls


def _resolve_local_source(source_url: str) -> Tuple[Optional[Path], bool]:
    """Best-effort local path for *source_url* (used for PDF image extraction).

    Returns ``(path, needs_cleanup)``; *needs_cleanup* is True only when the
    URL was downloaded to a temporary file the caller must delete.
    """
    if source_url.startswith("file://") or Path(source_url).exists():
        return Path(source_url.replace("file://", "")), False
    if source_url.startswith("http://") or source_url.startswith("https://"):
        from urllib.request import urlopen
        suffix = Path(infer_basename(source_url, None)).suffix or ".bin"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            with urlopen(source_url) as resp:  # NOTE: server-side fetch; trusted URLs only
                tmp.write(resp.read())
        return Path(tmp.name), True
    return None, False


def _markdown_postprocess(content: str, artifacts_dir: Optional[str], pdf_src: Optional[Path],
                          cleanup_pdf_src: bool, base: str, trace: List[str]) -> Tuple[str, list]:
    """Rewrite local asset references in Markdown to MinIO URLs and fill image
    placeholders.  Best-effort: any failure returns *content* unchanged."""
    mappings: list = []
    try:
        client, bucket, public_base, prefix = minio_current(RUNTIME_CONFIG)
        if client is None or not bucket or not public_base:
            return content, mappings
        trace.append(f"minio bucket={bucket} public={public_base} prefix={(prefix or '').strip('/')}")
        base_dir = Path(artifacts_dir) if artifacts_dir else Path(tempfile.mkdtemp(prefix="md_assets_"))
        new_text, ms = _rewrite_md_assets_to_minio(
            content, base_dir, client, bucket, public_base, prefix,
            search_root=(Path(artifacts_dir) if artifacts_dir else None))
        urls: List[str] = []
        if artifacts_dir:
            try:
                urls = _bulk_upload_assets(Path(artifacts_dir), client, bucket, public_base, prefix)
            except Exception:
                urls = []
        trace.append(f"asset_urls={len(urls)}")
        try:
            urls.extend(_collect_pdf_image_urls(pdf_src, base, client, bucket, public_base, prefix, trace))
        except Exception:
            pass  # PDF image harvesting is optional
        finally:
            if cleanup_pdf_src and pdf_src is not None:
                try:
                    os.unlink(str(pdf_src))
                except Exception:
                    pass
        # NOTE(review): placeholder literal was garbled in the source dump;
        # docling emits "<!-- image -->" — confirm against converter output.
        before = new_text.count("<!-- image -->")
        new_text = _inject_image_urls_for_markers(new_text, urls)
        after = new_text.count("<!-- image -->")
        trace.append(f"image_placeholders_before={before} after={after}")
        return new_text, ms
    except Exception:
        return content, mappings


def _save_and_respond(enc, content: str, base: str, export: str, out_ext: str, ct: str,
                      mappings: list, trace: List[str], artifacts_dir: Optional[str]):
    """Persist the converted output to MinIO under ``converted/`` and build the
    success envelope (public URL, presigned URL, mappings, trace)."""
    client, bucket, public_base, prefix = minio_current(RUNTIME_CONFIG)
    if client is None or not bucket or not public_base:
        return _err("MinIO 未配置,无法保存")
    out_name = f"{base}{out_ext}"
    if export.lower() == "markdown" and not out_name.lower().endswith(".md"):
        out_name = f"{base}.md"
    obj = join_prefix(prefix, f"converted/{out_name}")
    raw = content.encode("utf-8")
    client.put_object(bucket_name=bucket, object_name=obj, data=io.BytesIO(raw),
                      length=len(raw), content_type=ct)  # type: ignore
    minio_url = f"{public_base}/{bucket}/{quote(obj, safe='/')}"
    trace.append(f"save out_name={out_name}")
    trace.append(f"save obj={obj}")
    trace.append(f"save minio_url={minio_url}")
    exp = int(timedelta(hours=12).total_seconds())
    resp = _ok({
        "encoding": enc,
        "name": out_name,
        "media_type": ct,
        "minio_url": minio_url,
        "minio_presigned_url": presigned_read(client, bucket, obj, exp),
        "minio_url_display": unquote(minio_url),
        "mappings": mappings,
        "trace": trace,
    })
    if artifacts_dir:
        shutil.rmtree(artifacts_dir, ignore_errors=True)
    return resp


@app.post("/api/convert")
async def api_convert(
    file: Optional[UploadFile] = File(None),
    source_url: Optional[str] = Form(None),
    export: str = Form("markdown"),
    engine: Optional[str] = Form(None),
    save: Optional[bool] = Form(False),
    filename: Optional[str] = Form(None),
):
    """Convert a single document to *export* format.

    Exactly one of *file* (multipart upload) or *source_url* must be given.
    With ``save`` the result is stored in MinIO and URLs are returned;
    otherwise the converted text is returned inline.  All outcomes use the
    code/msg/data envelope; errors come back as HTTP 200 with a non-zero code.
    """
    try:
        if (file is None and not source_url) or (file is not None and source_url):
            return _err("参数错误:file 与 source_url 二选一")
        export = _normalize_export(export)
        engine = _normalize_engine(engine)
        if source_url:
            enc, content, artifacts_dir = await asyncio.to_thread(
                _converter_v2.convert, source_url, export=export, engine=engine)
            base = sanitize_filename(filename) if filename else sanitize_filename(infer_basename(source_url, None))
            trace: List[str] = [f"source_url={source_url}", f"export={export}"]
            if artifacts_dir:
                trace.append(f"artifacts_dir={artifacts_dir}")
            mappings: list = []
            if export.lower() == "markdown":
                try:
                    pdf_src, cleanup = _resolve_local_source(source_url)
                except Exception:
                    pdf_src, cleanup = None, False
                content, mappings = _markdown_postprocess(content, artifacts_dir, pdf_src, cleanup, base, trace)
            out_ext = _export_ext(export)
            ct = _media_type(export)
            if not save:
                resp = _ok({"encoding": enc, "content": content, "name": f"{base}{out_ext}",
                            "media_type": ct, "mappings": mappings, "trace": trace})
                if artifacts_dir:
                    shutil.rmtree(artifacts_dir, ignore_errors=True)
                return resp
            return _save_and_respond(enc, content, base, export, out_ext, ct, mappings, trace, artifacts_dir)
        assert file is not None
        suffix = ""
        if file.filename and "." in file.filename:
            suffix = "." + file.filename.rsplit(".", 1)[-1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(await file.read())
            tmp_path = tmp.name
        try:
            enc, content, artifacts_dir = await asyncio.to_thread(
                _converter_v2.convert, tmp_path, export=export, engine=engine)
            base = sanitize_filename(filename) if filename else sanitize_filename(infer_basename(None, file.filename))
            trace = [f"file={file.filename}", f"tmp_path={tmp_path}", f"export={export}"]
            if artifacts_dir:
                trace.append(f"artifacts_dir={artifacts_dir}")
            mappings = []
            if export.lower() == "markdown":
                # BUG FIX: tmp_path is a str; the original called tmp_path.exists(),
                # raising AttributeError that a broad except swallowed — so PDF
                # images were never uploaded for uploaded files.
                content, mappings = _markdown_postprocess(content, artifacts_dir, Path(tmp_path), False, base, trace)
            out_ext = _export_ext(export)
            ct = _media_type(export)
            if not save:
                resp = _ok({"encoding": enc, "content": content, "name": f"{base}{out_ext}",
                            "media_type": ct, "mappings": mappings, "trace": trace})
                if artifacts_dir:
                    shutil.rmtree(artifacts_dir, ignore_errors=True)
                return resp
            return _save_and_respond(enc, content, base, export, out_ext, ct, mappings, trace, artifacts_dir)
        finally:
            try:
                os.remove(tmp_path)
            except Exception:
                pass
    except HTTPException as e:
        return _err(str(e.detail), 400)
    except Exception as e:
        return _err(str(e))
@app.post("/api/import/convert")
async def api_import_convert(json_file: Optional[UploadFile] = File(None),
                             json_text: Optional[str] = Form(None),
                             path: Optional[str] = Form(None),
                             versionId: Optional[int] = Form(1001),
                             download: Optional[bool] = Form(False)):
    """Build an import tree from a JSON manifest (``{"files": [...]}``).

    The manifest may come from an upload, inline text, or a server-side path
    (default ``import.json``).  With ``download`` the tree is streamed back as
    an attachment instead of being wrapped in the envelope.
    """
    try:
        raw_text: Optional[str] = None
        if json_file is not None:
            raw_text = (await json_file.read()).decode("utf-8", errors="ignore")
        elif json_text:
            raw_text = json_text
        else:
            use_path = (path or "import.json").strip()
            p = Path(use_path).expanduser()
            if not p.exists():
                return _err(f"未找到文件: {use_path}")
            raw_text = p.read_text("utf-8", errors="ignore")
        import json as _json
        data = _json.loads(raw_text or "{}")
        files = data.get("files", [])
        if not isinstance(files, list):
            return _err("JSON结构不合法:缺少 files 数组")
        imp = _build_import_tree(files, int(versionId or 1001))
        if download:
            from fastapi.responses import StreamingResponse
            payload = _json.dumps(imp, ensure_ascii=False, indent=2).encode("utf-8")
            return StreamingResponse(io.BytesIO(payload),
                                     media_type="application/json; charset=utf-8",
                                     headers={"Content-Disposition": "attachment; filename=import.json"})
        return _ok({"import": imp})
    except Exception as e:
        return _err(str(e))


@app.post("/api/upload-archive")
async def api_upload_archive(file: UploadFile = File(...), prefix: Optional[str] = Form(None)):
    """Extract an uploaded zip/tar archive, push its assets to MinIO, rewrite
    Markdown files (and convert HTML files that lack a sibling ``.md``), and
    return the processed file list plus an import tree."""
    try:
        client, bucket, public_base, env_prefix = minio_current(RUNTIME_CONFIG)
        if client is None or bucket is None or not public_base:
            return _err("MinIO 未配置")
        use_prefix = (prefix or env_prefix or "").strip()
        suffix = (file.filename or "").lower()
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        data = await file.read()
        tmp.write(data)
        tmp.flush()
        tmp.close()
        root = Path(tempfile.mkdtemp(prefix="extract_"))
        try:
            if suffix.endswith(".zip"):
                import zipfile
                with zipfile.ZipFile(tmp.name, "r") as zf:
                    _zip_extract_safely(zf, root)
            elif ".tar" in suffix or suffix.endswith((".tgz", ".tar.gz", ".tar.bz2", ".tar.xz")):
                import tarfile
                with tarfile.open(tmp.name, "r:*") as tf:
                    _tar_extract_safely(tf, root)
            else:
                return _err("不支持的压缩格式")
            try:
                _bulk_upload_assets(root, client, bucket, public_base, use_prefix)
            except Exception:
                pass  # best-effort asset pre-upload
            files = []
            exp = int(timedelta(hours=12).total_seconds())  # presign TTL, loop-invariant
            # Markdown files: rewrite asset links and re-upload as-is.
            for md in root.rglob("*.md"):
                try:
                    text = md.read_text("utf-8", errors="ignore")
                    new_text, mappings = _rewrite_md_assets_to_minio(
                        text, md.parent, client, bucket, public_base, use_prefix, search_root=root)
                    rel_uplift = _uplift_rel_path(md.relative_to(root), md.parent, root, mappings)
                    obj = f"{use_prefix.strip('/')}/rewritten/{rel_uplift.as_posix()}".lstrip("/")
                    raw = new_text.encode("utf-8")
                    client.put_object(bucket_name=bucket, object_name=obj, data=io.BytesIO(raw),
                                      length=len(raw), content_type="text/markdown; charset=utf-8")  # type: ignore
                    url = f"{public_base}/{bucket}/{quote(obj, safe='/')}"
                    files.append({
                        "source": rel_uplift.as_posix(),
                        "minio_url": url,
                        "minio_presigned_url": presigned_read(client, bucket, obj, exp),
                        "minio_url_display": unquote(url),
                        "mappings": mappings,
                        "object_name": obj,
                        "size": len(raw),
                    })
                except Exception:
                    files.append({"source": md.relative_to(root).as_posix(), "minio_url": None,
                                  "minio_presigned_url": None, "mappings": [], "object_name": None, "size": 0})
            # HTML files: convert to Markdown first, unless a sibling .md exists.
            for html in (p for p in root.rglob("*") if p.is_file() and p.suffix.lower() in {".html", ".htm"}):
                tmpd = None  # FIX: bound before try so the finally cleanup cannot hit NameError
                try:
                    md_target_rel = html.relative_to(root).with_suffix(".md")
                    if (root / md_target_rel).exists():
                        continue  # sibling Markdown with the same stem wins
                    html_src = html.read_text("utf-8", errors="ignore")
                    html_rew, mappings = _rewrite_md_assets_to_minio(
                        html_src, html.parent, client, bucket, public_base, use_prefix, search_root=root)
                    tmpd = Path(tempfile.mkdtemp(prefix="rew_html_"))
                    tmpf = tmpd / html.name
                    tmpf.write_text(html_rew, "utf-8")
                    enc, md_text, _art = _converter_v2.convert(str(tmpf), export="markdown")
                    md_text2, mappings2 = _rewrite_md_assets_to_minio(
                        md_text, html.parent, client, bucket, public_base, use_prefix, search_root=root)
                    mappings = (mappings or []) + (mappings2 or [])
                    rel_uplift = _uplift_rel_path(md_target_rel, html.parent, root, mappings)
                    obj = f"{use_prefix.strip('/')}/rewritten/{rel_uplift.as_posix()}".lstrip("/")
                    # NOTE(review): bytes use the converter's encoding while the
                    # declared charset is utf-8 — confirm enc is always utf-8.
                    raw = md_text2.encode(enc or "utf-8")
                    client.put_object(bucket_name=bucket, object_name=obj, data=io.BytesIO(raw),
                                      length=len(raw), content_type="text/markdown; charset=utf-8")  # type: ignore
                    url = f"{public_base}/{bucket}/{quote(obj, safe='/')}"
                    files.append({
                        "source": rel_uplift.as_posix(),
                        "minio_url": url,
                        "minio_presigned_url": presigned_read(client, bucket, obj, exp),
                        # BUG FIX: the original reused url_display from the *.md
                        # loop here — stale, or unbound when no .md succeeded.
                        "minio_url_display": unquote(url),
                        "mappings": mappings,
                        "object_name": obj,
                        "size": len(raw),
                    })
                except Exception:
                    files.append({"source": html.relative_to(root).as_posix(), "minio_url": None,
                                  "minio_presigned_url": None, "mappings": [], "object_name": None, "size": 0})
                finally:
                    if tmpd is not None:
                        shutil.rmtree(tmpd, ignore_errors=True)
            imp = _build_import_tree(files, int(1001))
            return _ok({"count": len(files), "files": files, "import": imp})
        finally:
            try:
                os.unlink(tmp.name)
            except Exception:
                pass
            try:
                shutil.rmtree(root)
            except Exception:
                pass
    except Exception as e:
        return _err(str(e))
# In-memory registry of staged (uploaded but not yet processed) archives,
# keyed by a random hex id: {"path": <temp file>, "prefix": <minio prefix>}.
STAGED_ARCHIVES: Dict[str, Dict[str, object]] = {}


def _build_import_tree(processed: List[Dict[str, object]], version_id: int) -> Dict[str, object]:
    """Fold the flat processed-file list into a FOLDER/FILE tree for import.

    Each entry's ``source`` path is split on ``/``; intermediate segments
    become (deduplicated) FOLDER nodes and the leaf becomes a FILE node
    carrying the MinIO object name and size.
    """
    def _folder_for(siblings: list, name: str) -> Dict[str, object]:
        # Reuse an existing FOLDER node of the same name, else create one.
        for candidate in siblings:
            if isinstance(candidate, dict) and candidate.get("name") == name and candidate.get("type") == "FOLDER":
                return candidate
        created = {"name": name, "type": "FOLDER", "children": [], "sortOrder": 100}
        siblings.append(created)
        return created

    tree: List[Dict[str, object]] = []
    for idx, entry in enumerate(processed):
        source = str(entry.get("source") or "")
        object_name = str(entry.get("object_name") or "")
        size = int(entry.get("size") or 0)
        segments = [seg for seg in source.split("/") if seg]
        if not segments:
            continue
        siblings = tree
        for dirname in segments[:-1]:
            siblings = _folder_for(siblings, dirname).setdefault("children", [])  # type: ignore
        leaf_name = segments[-1]
        siblings.append({  # type: ignore
            "name": leaf_name.rsplit(".", 1)[0],
            "type": "FILE",
            "sortOrder": 100 + idx,
            "files": [{"languageId": 1, "objectName": object_name,
                       "fileName": leaf_name, "fileSize": size}],
        })
    return {"versionId": version_id, "tree": tree}


@app.post("/api/archive/stage")
async def api_archive_stage(file: UploadFile = File(...), prefix: Optional[str] = Form(None)):
    """Persist an uploaded archive to a temp file and hand back a staging id
    that /api/archive/process consumes later."""
    try:
        staged = tempfile.NamedTemporaryFile(delete=False, suffix=(file.filename or "").lower())
        payload = await file.read()
        staged.write(payload)
        staged.flush()
        staged.close()
        sid = uuid.uuid4().hex
        STAGED_ARCHIVES[sid] = {"path": staged.name, "prefix": (prefix or "")}
        return _ok({"id": sid, "name": file.filename, "size": len(payload)})
    except Exception as e:
        return _err(str(e))
@app.post("/api/archive/process")
async def api_archive_process(id: str = Form(...), prefix: Optional[str] = Form(None),
                              versionId: Optional[int] = Form(1001)):
    """Process a previously staged archive (see /api/archive/stage): extract it,
    upload assets, rewrite Markdown / convert HTML, and return the processed
    list plus an import tree.  The staged temp file is consumed either way."""
    try:
        st = STAGED_ARCHIVES.get(id)
        if not st:
            return _err("未找到已上传的压缩包")
        tmp_path = Path(str(st.get("path")))
        use_prefix_param = (prefix or str(st.get("prefix") or "")).strip()
        client, bucket, public_base, env_prefix = minio_current(RUNTIME_CONFIG)
        if client is None or bucket is None or not public_base:
            return _err("MinIO 未配置")
        use_prefix = (use_prefix_param or env_prefix or "").strip()
        root = Path(tempfile.mkdtemp(prefix="extract_"))
        try:
            sfx = tmp_path.name.lower()
            if sfx.endswith(".zip"):
                import zipfile
                with zipfile.ZipFile(str(tmp_path), "r") as zf:
                    _zip_extract_safely(zf, root)
            elif ".tar" in sfx or sfx.endswith((".tgz", ".tar.gz", ".tar.bz2", ".tar.xz")):
                import tarfile
                with tarfile.open(str(tmp_path), "r:*") as tf:
                    _tar_extract_safely(tf, root)
            else:
                return _err("不支持的压缩格式")
            try:
                _bulk_upload_assets(root, client, bucket, public_base, use_prefix)
            except Exception:
                pass  # best-effort asset pre-upload
            processed: List[Dict[str, object]] = []
            exp = int(timedelta(hours=12).total_seconds())  # presign TTL, loop-invariant
            # Markdown files: rewrite asset links and re-upload.
            for md in root.rglob("*.md"):
                try:
                    text = md.read_text("utf-8", errors="ignore")
                    new_text, mappings = _rewrite_md_assets_to_minio(
                        text, md.parent, client, bucket, public_base, use_prefix, search_root=root)
                    rel_uplift = _uplift_rel_path(md.relative_to(root), md.parent, root, mappings)
                    obj = f"{use_prefix.strip('/')}/rewritten/{rel_uplift.as_posix()}".lstrip("/")
                    raw = new_text.encode("utf-8")
                    client.put_object(bucket_name=bucket, object_name=obj, data=io.BytesIO(raw),
                                      length=len(raw), content_type="text/markdown; charset=utf-8")  # type: ignore
                    url = f"{public_base}/{bucket}/{quote(obj, safe='/')}"
                    processed.append({"source": rel_uplift.as_posix(), "minio_url": url,
                                      "minio_presigned_url": presigned_read(client, bucket, obj, exp),
                                      "mappings": mappings, "object_name": obj, "size": len(raw)})
                except Exception:
                    processed.append({"source": md.relative_to(root).as_posix(), "minio_url": None,
                                      "minio_presigned_url": None, "mappings": [], "object_name": None, "size": 0})
            # HTML files: convert to Markdown unless a sibling .md already exists.
            for html in (p for p in root.rglob("*") if p.is_file() and p.suffix.lower() in {".html", ".htm"}):
                tmpd = None  # FIX: bound before try so the finally cleanup cannot hit NameError
                try:
                    md_target_rel = html.relative_to(root).with_suffix(".md")
                    if (root / md_target_rel).exists():
                        continue
                    html_src = html.read_text("utf-8", errors="ignore")
                    html_rew, mappings = _rewrite_md_assets_to_minio(
                        html_src, html.parent, client, bucket, public_base, use_prefix, search_root=root)
                    tmpd = Path(tempfile.mkdtemp(prefix="rew_html_"))
                    tmpf = tmpd / html.name
                    tmpf.write_text(html_rew, "utf-8")
                    enc, md_text, _art = _converter_v2.convert(str(tmpf), export="markdown")
                    md_text2, mappings2 = _rewrite_md_assets_to_minio(
                        md_text, html.parent, client, bucket, public_base, use_prefix, search_root=root)
                    mappings = (mappings or []) + (mappings2 or [])
                    rel_uplift = _uplift_rel_path(md_target_rel, html.parent, root, mappings)
                    obj = f"{use_prefix.strip('/')}/rewritten/{rel_uplift.as_posix()}".lstrip("/")
                    # NOTE(review): bytes use the converter's encoding while the
                    # declared charset is utf-8 — confirm enc is always utf-8.
                    raw = md_text2.encode(enc or "utf-8")
                    client.put_object(bucket_name=bucket, object_name=obj, data=io.BytesIO(raw),
                                      length=len(raw), content_type="text/markdown; charset=utf-8")  # type: ignore
                    url = f"{public_base}/{bucket}/{quote(obj, safe='/')}"
                    processed.append({"source": rel_uplift.as_posix(), "minio_url": url,
                                      "minio_presigned_url": presigned_read(client, bucket, obj, exp),
                                      "mappings": mappings, "object_name": obj, "size": len(raw)})
                except Exception:
                    processed.append({"source": html.relative_to(root).as_posix(), "minio_url": None,
                                      "minio_presigned_url": None, "mappings": [], "object_name": None, "size": 0})
                finally:
                    if tmpd is not None:
                        shutil.rmtree(tmpd, ignore_errors=True)
            imp = _build_import_tree(processed, int(versionId or 1001))
            return _ok({"count": len(processed), "files": processed, "import": imp})
        finally:
            try:
                os.unlink(str(tmp_path))
            except Exception:
                pass
            try:
                shutil.rmtree(root)
            except Exception:
                pass
            try:
                STAGED_ARCHIVES.pop(id, None)
            except Exception:
                pass
    except Exception as e:
        return _err(str(e))
@app.post("/api/upload-list")
async def api_upload_list(list_file: UploadFile = File(...), prefix: Optional[str] = Form(None), versionId: Optional[int] = Form(1001)):
    """Process a newline-separated list of local Markdown paths.

    Each readable local file has its asset links rewritten to MinIO URLs and
    is uploaded under ``<prefix>/rewritten/``. Blank lines and ``#`` comments
    are ignored; http(s) URLs are accepted in the list but skipped here.

    Fix: the local-file accumulator was named ``locals``, shadowing the
    builtin ``locals()``; renamed to ``local_files``.
    """
    try:
        client, bucket, public_base, env_prefix = minio_current(RUNTIME_CONFIG)
        if client is None or bucket is None or not public_base:
            return _err("MinIO 未配置")
        use_prefix = (prefix or env_prefix or "").strip()
        raw = await list_file.read()
        text = raw.decode("utf-8", errors="ignore")
        lines = [ln.strip() for ln in text.splitlines()]
        paths: List[str] = [ln for ln in lines if ln and not ln.startswith("#")]
        local_files: List[Path] = []
        for p in paths:
            if p.startswith("http://") or p.startswith("https://"):
                continue  # remote URLs are listed but not fetched here
            lp = Path(p).expanduser()
            if lp.exists() and lp.is_file():
                local_files.append(lp.resolve())
        # Common ancestor of all listed files, used to keep relative layout.
        base_root = None
        try:
            if local_files:
                base_root = Path(os.path.commonpath([str(x) for x in local_files]))
        except Exception:
            base_root = None
        processed: List[Dict[str, object]] = []
        for p in local_files:
            try:
                content = p.read_text("utf-8", errors="ignore")
                new_text, mappings = _rewrite_md_assets_to_minio(content, p.parent, client, bucket, public_base, use_prefix, search_root=base_root)
                rel0 = p.relative_to(base_root) if base_root else Path(p.name)
                rel_uplift = _uplift_rel_path(rel0, p.parent, base_root, mappings)
                obj = f"{use_prefix.strip('/')}/rewritten/{rel_uplift.as_posix()}".lstrip("/")
                raw_md = new_text.encode("utf-8")
                bio = io.BytesIO(raw_md)
                client.put_object(bucket_name=bucket, object_name=obj, data=bio, length=len(raw_md), content_type="text/markdown; charset=utf-8")  # type: ignore
                try:
                    from urllib.parse import quote as _quote
                    url = f"{public_base}/{bucket}/{_quote(obj, safe='/')}"
                except Exception:
                    url = f"{public_base}/{bucket}/{obj}"
                # Presigned link valid for 12 hours.
                exp = int(timedelta(hours=12).total_seconds())
                ps = presigned_read(client, bucket, obj, exp) if client is not None else None
                processed.append({"source": rel_uplift.as_posix(), "minio_url": url, "minio_presigned_url": ps, "mappings": mappings, "object_name": obj, "size": len(raw_md)})
            except Exception:
                # Per-file failure is recorded, not fatal to the batch.
                processed.append({"source": p.name, "minio_url": None, "minio_presigned_url": None, "mappings": [], "object_name": None, "size": 0})
        imp = _build_import_tree(processed, int(versionId or 1001))
        return _ok({"count": len(processed), "files": processed, "import": imp})
    except Exception as e:
        return _err(str(e))
processed.append({"source": p.name, "minio_url": None, "minio_presigned_url": None, "mappings": [], "object_name": None, "size": 0}) + imp = _build_import_tree(processed, int(versionId or 1001)) + return _ok({"count": len(processed), "files": processed, "import": imp}) + except Exception as e: + return _err(str(e)) +@app.get("/config/minio/policy") +async def get_minio_policy(bucket: Optional[str] = None): + client, cfg_bucket, _, _ = minio_current(RUNTIME_CONFIG) + if client is None: + raise HTTPException(status_code=400, detail="MinIO 未配置") + bkt = (bucket or cfg_bucket or "").strip() + if not bkt: + raise HTTPException(status_code=400, detail="bucket 不能为空") + try: + pol = client.get_bucket_policy(bucket_name=bkt) # type: ignore + try: + import json as _json + data = _json.loads(pol) + except Exception: + data = {"raw": pol} + return {"ok": True, "bucket": bkt, "policy": data} + except Exception as e: + try: + try: + region = client._get_region(bkt) # type: ignore + except Exception: + region = "us-east-1" + resp = client._url_open(method="GET", region=region, bucket_name=bkt, query_params={"policy": ""}) # type: ignore + raw = None + try: + raw = getattr(resp, "data", None) + if raw is not None and hasattr(raw, "decode"): + raw = raw.decode("utf-8") + except Exception: + raw = None + if raw is None: + try: + raw = resp.read().decode("utf-8") # type: ignore + except Exception: + raw = "" + try: + import json as _json + data = _json.loads(raw) + except Exception: + data = {"raw": raw} + return {"ok": True, "bucket": bkt, "policy": data} + except Exception as e2: + return {"ok": False, "bucket": bkt, "error": str(e2)} + +@app.post("/config/minio/apply_public_read") +async def apply_public_read(bucket: Optional[str] = Form(None), enable: Optional[str] = Form("true")): + client, cfg_bucket, _, _ = minio_current(RUNTIME_CONFIG) + if client is None: + raise HTTPException(status_code=400, detail="MinIO 未配置") + bkt = (bucket or cfg_bucket or "").strip() + if not bkt: + 
@app.post("/config/minio/apply_public_read")
async def apply_public_read(bucket: Optional[str] = Form(None), enable: Optional[str] = Form("true")):
    """Apply (or remove) an anonymous public-read policy on a bucket.

    ``enable`` is a string flag; any of 1/true/yes/on (case-insensitive)
    applies the policy, anything else deletes it.
    """
    client, cfg_bucket, _, _ = minio_current(RUNTIME_CONFIG)
    if client is None:
        raise HTTPException(status_code=400, detail="MinIO 未配置")
    bkt = (bucket or cfg_bucket or "").strip()
    if not bkt:
        raise HTTPException(status_code=400, detail="bucket 不能为空")
    try:
        import json as _json
        if str(enable or "true").lower() in {"1","true","yes","on"}:
            # Standard AWS public-read policy: list the bucket + get any object.
            policy = {
                "Version": "2012-10-17",
                "Statement": [
                    {"Effect": "Allow", "Principal": "*", "Action": ["s3:GetBucketLocation", "s3:ListBucket"], "Resource": [f"arn:aws:s3:::{bkt}"]},
                    {"Effect": "Allow", "Principal": "*", "Action": ["s3:GetObject"], "Resource": [f"arn:aws:s3:::{bkt}/*"]},
                ],
            }
            try:
                client.set_bucket_policy(bucket_name=bkt, policy=_json.dumps(policy))  # type: ignore
                return {"ok": True, "bucket": bkt, "applied": True}
            except Exception:
                # Fallback: raw PUT ?policy through the SDK's private request API.
                try:
                    try:
                        region = client._get_region(bkt)  # type: ignore
                    except Exception:
                        region = "us-east-1"
                    raw = _json.dumps(policy).encode("utf-8")
                    client._url_open(method="PUT", region=region, bucket_name=bkt, query_params={"policy": ""}, body=raw)  # type: ignore
                    return {"ok": True, "bucket": bkt, "applied": True}
                except Exception as e2:
                    return {"ok": False, "bucket": bkt, "error": str(e2)}
        # Disable: best-effort delete of any existing policy.
        try:
            client.delete_bucket_policy(bkt)  # type: ignore
        except Exception:
            pass
        return {"ok": True, "bucket": bkt, "applied": False}
    except Exception as e:
        return {"ok": False, "bucket": bkt, "error": str(e)}
def load_linkmap() -> Dict[str, str]:
    """Refresh the in-memory link map from disk and return it.

    A missing file leaves the cached mapping untouched; a read/parse error
    resets it to empty.
    """
    global _LINKMAP
    try:
        if LINKMAP_PATH.exists():
            loaded = json.loads(LINKMAP_PATH.read_text("utf-8"))
            _LINKMAP = loaded or {}
    except Exception:
        _LINKMAP = {}
    return _LINKMAP

def save_linkmap(mapping: Dict[str, str]) -> None:
    """Persist *mapping* as pretty-printed JSON and reload the cache."""
    LINKMAP_PATH.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(mapping, ensure_ascii=False, indent=2)
    LINKMAP_PATH.write_text(serialized, "utf-8")
    load_linkmap()

def resolve_link(href: Optional[str], data_doc: Optional[str]) -> Optional[str]:
    """Return *href* when present; otherwise look *data_doc* up in the link map."""
    if href:
        return href
    if not _LINKMAP:
        load_linkmap()
    if data_doc:
        return _LINKMAP.get(data_doc)
    return None
def export_payload(doc, fmt: str) -> Tuple[str, str]:
    """Serialize *doc* in the requested format.

    Returns (payload, mime-type); raises ValueError for unknown formats.
    """
    table = {
        "markdown": ("export_to_markdown", "text/markdown"),
        "html": ("export_to_html", "text/html"),
        "json": ("export_to_json", "application/json"),
        "doctags": ("export_to_doctags", "application/json"),
    }
    key = fmt.lower()
    if key not in table:
        raise ValueError("unsupported export")
    attr, mime = table[key]
    return getattr(doc, attr)(), mime

def infer_basename(source_url: Optional[str], upload_name: Optional[str]) -> str:
    """Derive a base filename (no extension) from a URL or an upload name."""
    if source_url:
        candidate = os.path.basename(urlparse(source_url).path) or "document"
        candidate = unquote(candidate)
        return os.path.splitext(candidate)[0] or "document"
    if upload_name:
        stem = os.path.splitext(os.path.basename(upload_name))[0]
        return stem or "document"
    return "document"

def sanitize_filename(name: Optional[str]) -> str:
    """Trim to 128 chars and replace characters illegal in filenames with '_'."""
    if not name:
        return "document"
    trimmed = name.strip()[:128]
    cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1F]', "_", trimmed)
    return cleaned or "document"

def convert_source(source: str, export: str) -> Tuple[str, str]:
    """Convert *source* via docling and serialize it in the *export* format."""
    return export_payload(converter.convert(source).document, export)
def md_to_docx_bytes(md: str, toc: bool = False, header_text: Optional[str] = None, footer_text: Optional[str] = None, logo_url: Optional[str] = None, copyright_text: Optional[str] = None, filename_text: Optional[str] = None, cover_src: Optional[str] = None, product_name: Optional[str] = None, document_name: Optional[str] = None, product_version: Optional[str] = None, document_version: Optional[str] = None) -> bytes:
    """Render markdown to a .docx file (returned as bytes) via python-docx.

    The markdown is first normalized to HTML (normalize_html), then the HTML
    DOM is walked and mapped onto docx paragraphs/headings/tables. Optional
    decorations: cover page, header/footer text (``left|right`` split),
    copyright line, page numbers, and a Word TOC field.
    """
    try:
        import logging as _log
        _log.info(f"md_to_docx_bytes start toc={toc} header={bool(header_text)} footer={bool(footer_text)} logo={bool(logo_url)} cover={bool(cover_src)}")
    except Exception:
        pass
    def _add_field(paragraph, instr: str):
        # Insert a raw Word field (PAGE, TOC, ...) as begin/instrText/end runs.
        r1 = paragraph.add_run()
        b = OxmlElement('w:fldChar')
        b.set(qn('w:fldCharType'), 'begin')
        r1._r.append(b)
        r2 = paragraph.add_run()
        t = OxmlElement('w:instrText')
        t.set(qn('xml:space'), 'preserve')
        t.text = instr
        r2._r.append(t)
        r3 = paragraph.add_run()
        e = OxmlElement('w:fldChar')
        e.set(qn('w:fldCharType'), 'end')
        r3._r.append(e)
    def _available_width(section) -> int:
        # Usable width between the margins, in EMU.
        return section.page_width - section.left_margin - section.right_margin
    def _fetch_bytes(u: str) -> Optional[bytes]:
        # Load image bytes from an http(s) URL or a local path; None on failure.
        try:
            if u.lower().startswith('http://') or u.lower().startswith('https://'):
                with urlopen(u, timeout=10) as r:
                    return r.read()
            p = Path(u)
            if p.exists() and p.is_file():
                return p.read_bytes()
        except Exception:
            return None
        return None
    html = normalize_html(md, options={
        "toc": "1" if toc else "",
        "header_text": header_text,
        "footer_text": footer_text,
        "logo_url": logo_url,
        "copyright_text": copyright_text,
        "filename_text": filename_text,
        "cover_src": cover_src,
        "product_name": product_name,
        "document_name": document_name,
        "product_version": product_version,
        "document_version": document_version,
    })
    try:
        import logging as _log
        _log.info(f"md_to_docx_bytes normalize_html length={len(html)}")
    except Exception:
        pass
    soup = BeautifulSoup(html, "html.parser")
    doc = Document()
    # A4 page with 15/20 mm margins.
    sec0 = doc.sections[0]
    sec0.page_width = Mm(210)
    sec0.page_height = Mm(297)
    sec0.left_margin = Mm(15)
    sec0.right_margin = Mm(15)
    sec0.top_margin = Mm(20)
    sec0.bottom_margin = Mm(20)
    has_cover = bool(cover_src or (soup.find('section', class_='cover') is not None))
    if has_cover:
        # Full-bleed cover section (zero margins), then a fresh content section.
        sec0.left_margin = Mm(0)
        sec0.right_margin = Mm(0)
        sec0.top_margin = Mm(0)
        sec0.bottom_margin = Mm(0)
        if cover_src:
            b = _fetch_bytes(cover_src)
            if b:
                bio = io.BytesIO(b)
                doc.add_picture(bio, width=_available_width(sec0))
        if product_name:
            p = doc.add_paragraph()
            r = p.add_run(product_name)
            r.font.size = Pt(18)
            r.bold = True
        # Cover title: explicit document name, else first <h1>, else a default.
        t = document_name or None
        if not t:
            h1 = soup.body.find('h1') if soup.body else soup.find('h1')
            t = h1.get_text(strip=True) if h1 else '文档'
        p2 = doc.add_paragraph()
        r2 = p2.add_run(t or '文档')
        r2.font.size = Pt(24)
        r2.bold = True
        if filename_text:
            p3 = doc.add_paragraph()
            r3 = p3.add_run(filename_text)
            r3.font.size = Pt(13)
        meta_parts = []
        if product_version:
            meta_parts.append("产品版本:" + product_version)
        if document_version:
            meta_parts.append("文档版本:" + document_version)
        if meta_parts:
            pm = doc.add_paragraph(" ".join(meta_parts))
            # NOTE(review): Paragraph has no `font` attribute in python-docx;
            # this assignment just sets an unused instance attr — confirm intent.
            pm.font = None
        doc.add_section(WD_SECTION.NEW_PAGE)
        sec = doc.sections[-1]
        sec.page_width = Mm(210)
        sec.page_height = Mm(297)
        sec.left_margin = Mm(15)
        sec.right_margin = Mm(15)
        sec.top_margin = Mm(20)
        sec.bottom_margin = Mm(20)
    else:
        sec = sec0
    if header_text or logo_url or filename_text:
        hp = sec.header.add_paragraph()
        # header_text may carry "left||right" (or "left|right") halves.
        left = header_text or ''
        right = ''
        if '||' in left:
            parts = left.split('||', 1)
            left, right = parts[0], parts[1]
        elif '|' in left:
            parts = left.split('|', 1)
            left, right = parts[0], parts[1]
        if left.strip():
            hp.add_run(left.strip())
        if right.strip():
            rp = sec.header.add_paragraph()
            rp.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
            rp.add_run(right.strip())
        elif filename_text:
            rp = sec.header.add_paragraph()
            rp.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
            rp.add_run(filename_text)
    if footer_text or copyright_text:
        fp = sec.footer.add_paragraph()
        if footer_text:
            fp.add_run(footer_text)
        if copyright_text:
            cp = sec.footer.add_paragraph()
            cp.add_run(copyright_text)
    # Right-aligned PAGE field in the footer.
    pn = sec.footer.add_paragraph()
    pn.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
    _add_field(pn, 'PAGE')
    if toc:
        # Word TOC field for heading levels 1-3; refreshed by Word on open.
        doc.add_paragraph('目录')
        _add_field(doc.add_paragraph(), 'TOC \\o "1-3" \\h \\z \\u')
        doc.add_page_break()
    def add_inline(p, node):
        # Map inline HTML nodes onto runs of paragraph `p`.
        if isinstance(node, str):
            p.add_run(node)
            return
        if node.name in ['strong', 'b']:
            r = p.add_run(node.get_text())
            r.bold = True
            return
        if node.name in ['em', 'i']:
            r = p.add_run(node.get_text())
            r.italic = True
            return
        if node.name == 'code':
            r = p.add_run(node.get_text())
            r.font.name = 'Courier New'
            return
        if node.name == 'a':
            # Links are flattened to "text [url]" since runs carry no hyperlink.
            text = node.get_text()
            href = node.get('href')
            extra = node.get('data-doc')
            resolved = resolve_link(href, extra)
            if resolved:
                p.add_run(text + ' [' + resolved + ']')
            else:
                p.add_run(text)
            return
        if node.name == 'img':
            src = node.get('src') or ''
            b = _fetch_bytes(src)
            if b:
                bio = io.BytesIO(b)
                try:
                    doc.add_picture(bio, width=_available_width(sec))
                except Exception:
                    pass
            return
        for c in getattr(node, 'children', []):
            add_inline(p, c)
    def process_block(el):
        # Map a block-level HTML element onto docx content.
        name = getattr(el, 'name', None)
        if name is None:
            return
        cls = el.get('class') or []
        # Skip normalize_html's print-only decorations.
        if name == 'div' and 'doc-meta' in cls:
            return
        if name == 'section' and 'cover' in cls:
            return
        if name == 'nav' and 'toc' in cls:
            return
        if name == 'div':
            for child in el.children:
                process_block(child)
            return
        if name == 'h1':
            doc.add_heading(el.get_text(), level=1)
            return
        if name == 'h2' or (name == 'strong' and 'subtitle' in cls):
            doc.add_heading(el.get_text(), level=2)
            return
        if name == 'h3':
            doc.add_heading(el.get_text(), level=3)
            return
        if name == 'p':
            p = doc.add_paragraph()
            for c in el.children:
                add_inline(p, c)
            return
        if name in ['ul', 'ol']:
            for li in el.find_all('li', recursive=False):
                p = doc.add_paragraph(style='List Bullet')
                for c in li.children:
                    add_inline(p, c)
            return
        if name == 'pre':
            code = el.get_text() or ''
            p = doc.add_paragraph()
            run = p.add_run(code)
            run.font.name = 'Courier New'
            return
        if name == 'blockquote':
            p = doc.add_paragraph(el.get_text())
            p.paragraph_format.left_indent = Mm(10)
            return
        if name == 'table':
            rows = []
            thead = el.find('thead')
            tbody = el.find('tbody')
            if thead:
                hdrs = [th.get_text(strip=True) for th in thead.find_all('th')]
            else:
                # No <thead>: treat the first row as the header row.
                hdrs = [cell.get_text(strip=True) for cell in el.find_all('tr')[0].find_all(['th','td'])] if el.find_all('tr') else []
            trs = tbody.find_all('tr') if tbody else el.find_all('tr')[1:]
            for tr in trs:
                tds = [td.get_text(strip=True) for td in tr.find_all('td')]
                rows.append(tds)
            tbl = doc.add_table(rows=1 + len(rows), cols=len(hdrs) or 1)
            hdr = tbl.rows[0].cells
            for k, h in enumerate(hdrs or ['']):
                hdr[k].text = h
            for r_idx, row in enumerate(rows):
                cells = tbl.rows[1 + r_idx].cells
                for c_idx in range(len(hdrs) or 1):
                    cells[c_idx].text = (row[c_idx] if c_idx < len(row) else '')
            return
        if name == 'img':
            src = el.get('src') or ''
            b = _fetch_bytes(src)
            if b:
                bio = io.BytesIO(b)
                try:
                    doc.add_picture(bio, width=_available_width(sec))
                except Exception:
                    pass
            return
    body = soup.body or soup
    for el in body.children:
        process_block(el)
    bio = io.BytesIO()
    try:
        import logging as _log
        _log.info("md_to_docx_bytes saving doc")
    except Exception:
        pass
    doc.save(bio)
    try:
        import logging as _log
        _log.info(f"md_to_docx_bytes done size={bio.tell()}")
    except Exception:
        pass
    return bio.getvalue()
def md_to_pdf_bytes(md: str) -> bytes:
    """Render markdown to PDF using the default (WeasyPrint) renderer."""
    return md_to_pdf_bytes_with_renderer(md, renderer="weasyprint")

def _md_with_tables_to_html(md_text: str) -> str:
    """Fallback markdown renderer used when markdown-it is unavailable.

    marko's core does not support GFM pipe tables, so table regions
    (header row + ``|---|`` separator + data rows) are pre-rendered into
    raw HTML ``<table>`` markup before handing the text to marko.

    Fix: the HTML tag string literals were garbled/empty in the source
    (angle-bracket markup lost), so no table markup was emitted; the
    ``<table>/<tr>/<th>/<td>`` literals are reconstructed here.
    """
    lines = md_text.splitlines()
    out = []
    i = 0
    while i < len(lines):
        line = lines[i]
        def is_sep(s: str) -> bool:
            # A separator row like |---|:---:| contains only dashes,
            # colons, and spaces between pipes.
            s = s.strip()
            if "|" not in s:
                return False
            s = s.strip("|")
            return all(set(seg.strip()) <= set("-: ") and len(seg.strip()) >= 1 for seg in s.split("|"))
        if "|" in line and i + 1 < len(lines) and is_sep(lines[i + 1]):
            headers = [c.strip() for c in line.strip().strip("|").split("|")]
            j = i + 2
            rows = []
            while j < len(lines) and "|" in lines[j]:
                rows.append([c.strip() for c in lines[j].strip().strip("|").split("|")])
                j += 1
            tbl = ["<table>", "<tr>"]
            for h in headers:
                tbl.append(f"<th>{h}</th>")
            tbl.append("</tr>")
            for row in rows:
                tbl.append("<tr>")
                # Pad/truncate each data row to the header width.
                for idx in range(len(headers)):
                    cell = row[idx] if idx < len(row) else ""
                    tbl.append(f"<td>{cell}</td>")
                tbl.append("</tr>")
            tbl.append("</table>")
            out.append("".join(tbl))
            i = j
            continue
        out.append(line)
        i += 1
    return marko.convert("\n".join(out))
") + out.append("".join(tbl)) + i = j + continue + out.append(line) + i += 1 + return marko.convert("\n".join(out)) + +def _render_markdown_html(md_text: str) -> str: + if _HAS_MD_IT and _mdit is not None: + try: + md = _mdit.MarkdownIt("commonmark").enable(["table", "strikethrough"]) + if _tasklists_plugin: + md.use(_tasklists_plugin) + if _deflist_plugin: + md.use(_deflist_plugin) + if _footnote_plugin: + md.use(_footnote_plugin) + if _attrs_plugin: + md.use(_attrs_plugin) + return md.render(md_text) + except Exception: + pass + return _md_with_tables_to_html(md_text) + +def normalize_html(md_or_html: str, options: Optional[Dict[str, Optional[str]]] = None) -> str: + html = _render_markdown_html(md_or_html) + soup = BeautifulSoup(html, "html.parser") + for s in soup.find_all("strong", class_="subtitle"): + s.name = "h2" + s.attrs = {"data-origin": "subtitle"} + for a in soup.find_all("a"): + href_val = a.get("href") + extra_val = a.get("data-doc") + href = href_val if isinstance(href_val, str) else None + extra = extra_val if isinstance(extra_val, str) else None + resolved = resolve_link(href, extra) + if resolved: + a["href"] = resolved + elif not href and extra: + a.replace_with(a.get_text() + " [" + extra + "]") + opts = options or {} + header_text = opts.get("header_text") or None + footer_text = opts.get("footer_text") or None + logo_url = opts.get("logo_url") or None + copyright_text = opts.get("copyright_text") or None + cover_src = opts.get("cover_src") or None + product_name_opt = opts.get("product_name") or None + document_name_opt = opts.get("document_name") or None + product_version_opt = opts.get("product_version") or None + document_version_opt = opts.get("document_version") or None + toc_flag = bool(opts.get("toc")) + meta = soup.new_tag("div", attrs={"class": "doc-meta"}) + if header_text: + ht = soup.new_tag("div", attrs={"class": "doc-header-text"}) + text = header_text + left = text + right = "" + if "||" in text: + parts = text.split("||", 1) 
def normalize_html(md_or_html: str, options: Optional[Dict[str, Optional[str]]] = None) -> str:
    """Render markdown to HTML and decorate it for print/export.

    Adds (in DOM order): a ``doc-meta`` block with header/footer/running
    page elements, an optional cover ``<section>``, an optional ``<nav>``
    TOC, and wraps heading+intro+table groups in ``div.table-block`` so a
    table stays with its explanation across page breaks. Also resolves
    ``data-doc`` links through the link map.
    """
    html = _render_markdown_html(md_or_html)
    soup = BeautifulSoup(html, "html.parser")
    # Promote styled "subtitle" strongs to real h2 headings (marked so the
    # TOC builder below can skip them).
    for s in soup.find_all("strong", class_="subtitle"):
        s.name = "h2"
        s.attrs = {"data-origin": "subtitle"}
    # Resolve links: prefer explicit href, else map data-doc via the link map;
    # unresolvable data-doc anchors degrade to plain "text [data-doc]".
    for a in soup.find_all("a"):
        href_val = a.get("href")
        extra_val = a.get("data-doc")
        href = href_val if isinstance(href_val, str) else None
        extra = extra_val if isinstance(extra_val, str) else None
        resolved = resolve_link(href, extra)
        if resolved:
            a["href"] = resolved
        elif not href and extra:
            a.replace_with(a.get_text() + " [" + extra + "]")
    opts = options or {}
    header_text = opts.get("header_text") or None
    footer_text = opts.get("footer_text") or None
    logo_url = opts.get("logo_url") or None
    copyright_text = opts.get("copyright_text") or None
    cover_src = opts.get("cover_src") or None
    product_name_opt = opts.get("product_name") or None
    document_name_opt = opts.get("document_name") or None
    product_version_opt = opts.get("product_version") or None
    document_version_opt = opts.get("document_version") or None
    toc_flag = bool(opts.get("toc"))
    meta = soup.new_tag("div", attrs={"class": "doc-meta"})
    if header_text:
        ht = soup.new_tag("div", attrs={"class": "doc-header-text"})
        # header_text may carry "left||right" (or "left|right") halves.
        text = header_text
        left = text
        right = ""
        if "||" in text:
            parts = text.split("||", 1)
            left, right = parts[0], parts[1]
        elif "|" in text:
            parts = text.split("|", 1)
            left, right = parts[0], parts[1]
        if logo_url:
            img = soup.new_tag("img", attrs={"class": "logo-inline", "src": logo_url})
            ht.append(img)
        hl = soup.new_tag("span", attrs={"class": "doc-header-left"})
        hl.string = left
        ht.append(hl)
        if right.strip():
            hr = soup.new_tag("span", attrs={"class": "doc-header-right"})
            hr.string = right
            ht.append(hr)
        meta.append(ht)
    else:
        # No header text: default to first <h1> on the left, filename right.
        first_h1 = None
        if soup.body:
            first_h1 = soup.body.find("h1")
        else:
            first_h1 = soup.find("h1")
        left = (first_h1.get_text(strip=True) if first_h1 else "文档")
        right = opts.get("filename_text") or ""
        ht = soup.new_tag("div", attrs={"class": "doc-header-text"})
        if logo_url:
            img = soup.new_tag("img", attrs={"class": "logo-inline", "src": logo_url})
            ht.append(img)
        hl = soup.new_tag("span", attrs={"class": "doc-header-left"})
        hl.string = left
        ht.append(hl)
        if right:
            hr = soup.new_tag("span", attrs={"class": "doc-header-right"})
            hr.string = right
            ht.append(hr)
        meta.append(ht)
    if footer_text:
        ft = soup.new_tag("div", attrs={"class": "doc-footer-text"})
        ft.string = footer_text
        meta.append(ft)
    # Running page header/footer elements, referenced from @page CSS below.
    page_header_val = (header_text or (document_name_opt or None))
    if not page_header_val:
        first_h1_for_header = None
        if soup.body:
            first_h1_for_header = soup.body.find("h1")
        else:
            first_h1_for_header = soup.find("h1")
        page_header_val = (first_h1_for_header.get_text(strip=True) if first_h1_for_header else "文档")
    page_footer_val = (footer_text or "FunMD")
    ph = soup.new_tag("div", attrs={"class": "doc-page-header"})
    if logo_url:
        logo_inline = soup.new_tag("img", attrs={"src": logo_url, "class": "doc-page-header-logo"})
        ph.append(logo_inline)
    ht_inline = soup.new_tag("span", attrs={"class": "doc-page-header-text"})
    ht_inline.string = page_header_val
    ph.append(ht_inline)
    meta.append(ph)
    pf = soup.new_tag("div", attrs={"class": "doc-page-footer"})
    pf.string = page_footer_val
    meta.append(pf)
    if copyright_text:
        cp = soup.new_tag("div", attrs={"class": "doc-copyright"})
        cp.string = copyright_text
        meta.append(cp)
    # brand logo is rendered inline within header; no separate top-left element
    if soup.body:
        soup.body.insert(0, meta)
    else:
        soup.insert(0, meta)
    if not soup.head:
        head = soup.new_tag("head")
        soup.insert(0, head)
    else:
        head = soup.head
    # Paged-media CSS wiring the running header/footer elements into @page.
    style_run = soup.new_tag("style")
    style_run.string = "@page{margin:20mm}@page{\n @top-center{content: element(page-header)}\n @bottom-center{content: element(page-footer)}\n}\n.doc-page-header{position: running(page-header); font-size:10pt; color:#666; display:block; text-align:center; width:100%}\n.doc-page-header::after{content:''; display:block; width:80%; border-bottom:1px solid #d9d9d9; margin:4px auto 0}\n.doc-page-header-logo{height:20px; vertical-align:middle; margin-right:4px}\n.doc-page-header-text{vertical-align:middle}\n.doc-page-footer{position: running(page-footer); font-size:10pt; color:#666}\n.doc-page-footer::before{content:''; display:block; width:80%; border-top:1px solid #d9d9d9; margin:0 auto 4px}"
    head.append(style_run)
    # Fallback inline styles for cover to ensure visibility even if external CSS isn't loaded
    if (cover_src or product_name_opt or document_name_opt or product_version_opt or document_version_opt):
        if not soup.head:
            head = soup.new_tag("head")
            soup.insert(0, head)
        else:
            head = soup.head
        style = soup.new_tag("style")
        style.string = "@page:first{margin:0} html,body{margin:0;padding:0}.cover{position:relative;width:210mm;height:297mm;overflow:hidden;page-break-after:always}.cover .cover-bg{position:absolute;left:0;top:0;right:0;bottom:0;width:100%;height:100%;object-fit:cover;display:block}.cover .cover-brand{position:absolute;top:20mm;left:20mm;font-size:18pt;font-weight:700;color:#1d4ed8}.cover .cover-footer{position:absolute;left:0;right:0;bottom:0;background:#1d4ed8;color:#fff;padding:12mm 20mm}.cover .cover-title{font-size:24pt;font-weight:700;margin:0}.cover .cover-subtitle{font-size:13pt;margin-top:4pt}.cover .cover-meta{margin-top:8pt;font-size:11pt;display:flex;gap:20mm}"
        head.append(style)
    if cover_src or product_name_opt or document_name_opt or product_version_opt or document_version_opt:
        # Build the cover section: background image, brand, and a footer band
        # with title / subtitle / version metadata.
        cov = soup.new_tag("section", attrs={"class": "cover"})
        if cover_src:
            bg = soup.new_tag("img", attrs={"class": "cover-bg", "src": cover_src})
            cov.append(bg)
        if product_name_opt:
            brand_el = soup.new_tag("div", attrs={"class": "cover-brand"})
            brand_el.string = product_name_opt
            cov.append(brand_el)
        footer = soup.new_tag("div", attrs={"class": "cover-footer"})
        title_text = document_name_opt or None
        if not title_text:
            first_h1 = soup.body.find("h1") if soup.body else soup.find("h1")
            if first_h1:
                title_text = first_h1.get_text(strip=True)
        title_el = soup.new_tag("div", attrs={"class": "cover-title"})
        title_el.string = title_text or "文档"
        footer.append(title_el)
        subtitle_val = opts.get("filename_text") or ""
        if subtitle_val:
            subtitle_el = soup.new_tag("div", attrs={"class": "cover-subtitle"})
            subtitle_el.string = subtitle_val
            footer.append(subtitle_el)
        meta_el = soup.new_tag("div", attrs={"class": "cover-meta"})
        if product_version_opt:
            pv = soup.new_tag("span")
            pv.string = f"产品版本:{product_version_opt}"
            meta_el.append(pv)
        if document_version_opt:
            dv = soup.new_tag("span")
            dv.string = f"文档版本:{document_version_opt}"
            meta_el.append(dv)
        footer.append(meta_el)
        cov.append(footer)
        # Insert after doc-meta (index 0) so the cover is the first visible page.
        if soup.body:
            soup.body.insert(1, cov)
        else:
            soup.insert(1, cov)
    if toc_flag:
        # Build a TOC from h1-h3, skipping promoted "subtitle" headings, and
        # assign ids (sec-N) where missing so entries can anchor-link.
        headings = [
            el for el in (soup.find_all(["h1", "h2", "h3"]) or [])
            if el.get("data-origin") != "subtitle"
        ]
        if headings:
            ul = soup.new_tag("ul")
            idx = 1
            for el in headings:
                text = el.get_text(strip=True)
                if not text:
                    continue
                hid = el.get("id")
                if not hid:
                    hid = f"sec-{idx}"
                    el["id"] = hid
                    idx += 1
                li = soup.new_tag("li", attrs={"class": f"toc-{el.name}"})
                a = soup.new_tag("a", attrs={"href": f"#{hid}", "class": "toc-text"})
                a.string = text
                dots = soup.new_tag("span", attrs={"class": "toc-dots"})
                page = soup.new_tag("span", attrs={"class": "toc-page", "data-target": f"#{hid}"})
                li.append(a)
                li.append(dots)
                li.append(page)
                ul.append(li)
            nav = soup.new_tag("nav", attrs={"class": "toc"})
            h = soup.new_tag("h1")
            h.string = "目录"
            nav.append(h)
            nav.append(ul)
            if soup.body:
                soup.body.insert(2, nav)
            else:
                soup.insert(2, nav)
    if soup.body:
        # Group each heading with its intro blocks and first table inside a
        # div.table-block so print CSS can keep them together.
        for h in soup.body.find_all(["h1", "h2", "h3"]):
            sib: Optional[PageElement] = h.find_next_sibling()
            blocks: List[Any] = []
            first_table: Optional[Any] = None
            while sib is not None:
                # Skip pure whitespace nodes
                if getattr(sib, "name", None) is None:
                    try:
                        if str(sib).strip() == "":
                            sib = sib.next_sibling
                            continue
                    except Exception:
                        break
                # Stop if next heading encountered
                name = getattr(sib, "name", None)
                if name in ["h1", "h2", "h3"]:
                    break
                # Collect explanatory blocks until first table
                if name == "table":
                    first_table = sib
                    break
                if name in ["p", "blockquote", "ul", "ol"]:
                    blocks.append(sib)
                    sib = sib.next_sibling
                    continue
                # Unknown block: stop grouping to avoid wrapping unrelated content
                break
            if first_table is not None:
                wrap = soup.new_tag("div", attrs={"class": "table-block"})
                h.insert_before(wrap)
                wrap.append(h.extract())
                for el in blocks:
                    wrap.append(el.extract())
                wrap.append(first_table.extract())
    return str(soup)
def _stylesheets_for(css_name: Optional[str], css_text: Optional[str]):
    """Collect WeasyPrint CSS objects: inline *css_text* first, then the
    named profile from configs/styles/<css_name>.css if it exists.

    Returns an empty list when WeasyPrint (CSS) is not importable.
    """
    sheets: List[Any] = []
    if CSS is None:
        return sheets
    if css_text:
        sheets.append(CSS(string=css_text))
    if css_name:
        css_path = Path(__file__).resolve().parent.parent / "configs" / "styles" / f"{css_name}.css"
        if css_path.exists():
            sheets.append(CSS(filename=str(css_path)))
    return sheets

def md_to_pdf_bytes_with_renderer(md: str, renderer: str = "weasyprint", css_name: Optional[str] = None, css_text: Optional[str] = None, toc: bool = False, header_text: Optional[str] = None, footer_text: Optional[str] = None, logo_url: Optional[str] = None, copyright_text: Optional[str] = None, filename_text: Optional[str] = None, cover_src: Optional[str] = None, product_name: Optional[str] = None, document_name: Optional[str] = None, product_version: Optional[str] = None, document_version: Optional[str] = None) -> bytes:
    """Render markdown to PDF bytes via normalize_html + WeasyPrint.

    NOTE(review): the `renderer` argument is currently ignored — WeasyPrint is
    the only implemented backend; a RuntimeError is raised if it is missing.
    """
    html = normalize_html(md, options={
        "toc": "1" if toc else "",
        "header_text": header_text,
        "footer_text": footer_text,
        "logo_url": logo_url,
        "copyright_text": copyright_text,
        "filename_text": filename_text,
        "cover_src": cover_src,
        "product_name": product_name,
        "document_name": document_name,
        "product_version": product_version,
        "document_version": document_version,
    })
    if HTML is not None:
        stylesheets = _stylesheets_for(css_name, css_text)
        pdf_bytes = HTML(string=html).write_pdf(stylesheets=stylesheets or None)
        return pdf_bytes
    raise RuntimeError("WeasyPrint is not available")
client._url_open(method="HEAD", region=region, bucket_name=bucket) # type: ignore + return True + except Exception: + try: + names = [getattr(b, "name", None) for b in client.list_buckets()] # type: ignore + return bucket in set(n for n in names if n) + except Exception: + return False + +def minio_create_bucket(client: object, bucket: str) -> bool: + try: + if hasattr(client, "bucket_exists"): + try: + if client.bucket_exists(bucket): # type: ignore + return True + except Exception: + pass + if hasattr(client, "make_bucket"): + try: + client.make_bucket(bucket) # type: ignore + return True + except Exception: + try: + region = client._get_region(bucket) # type: ignore + except Exception: + region = "us-east-1" + try: + client.make_bucket(bucket, location=region) # type: ignore + return True + except Exception: + pass + try: + try: + region = client._get_region(bucket) # type: ignore + except Exception: + region = "us-east-1" + client._url_open(method="PUT", region=region, bucket_name=bucket) # type: ignore + return True + except Exception as ce: + if "BucketAlreadyOwnedByYou" in str(ce) or "BucketAlreadyExists" in str(ce): + return True + raise + except Exception as e: + raise e + +def minio_client(endpoint: str, access: str, secret: str, secure: bool): + if urllib3 is not None: + try: + http = urllib3.PoolManager(timeout=urllib3.Timeout(connect=3.0, read=20.0)) + return Minio(endpoint=endpoint, access_key=access, secret_key=secret, secure=secure, http_client=http) # type: ignore + except Exception: + return Minio(endpoint=endpoint, access_key=access, secret_key=secret, secure=secure) # type: ignore + return Minio(endpoint=endpoint, access_key=access, secret_key=secret, secure=secure) # type: ignore + +def minio_time_hint(endpoint: str, secure: bool) -> Optional[str]: + try: + scheme = "https" if secure else "http" + r = urlopen(f"{scheme}://{endpoint}", timeout=3) + srv_date = r.headers.get("Date") + if not srv_date: + return None + from email.utils import 
# NOTE(review): this chunk of the pasted patch begins mid-statement — the
# import `from email.utils import parsedate_to_datetime` inside
# minio_time_hint is split across the chunk boundary.  The function is
# reassembled in full below; the partial header in the previous chunk
# remains as residue of the mangled blob.

def minio_time_hint(endpoint: str, secure: bool) -> Optional[str]:
    """Return a human-readable clock-skew hint for the MinIO endpoint.

    Probes the endpoint's HTTP ``Date`` header (3 s timeout) and compares
    it with local UTC time.  Best-effort: any failure returns None.
    """
    try:
        scheme = "https" if secure else "http"
        r = urlopen(f"{scheme}://{endpoint}", timeout=3)
        srv_date = r.headers.get("Date")
        if not srv_date:
            return None
        from email.utils import parsedate_to_datetime
        from datetime import datetime, timezone
        dt = parsedate_to_datetime(srv_date)
        now = datetime.now(timezone.utc)
        diff = abs((now - dt).total_seconds())
        return f"服务器时间与本机相差约 {int(diff)} 秒"
    except Exception:
        return None


def join_prefix(prefix: str, rel: str) -> str:
    """Join an object-store prefix with a relative key.

    Slashes are normalized, and when *rel* already starts with
    ``prefix + "/"`` it is returned unchanged so keys are never doubled.
    """
    pre = (prefix or "").strip("/")
    r = rel.lstrip("/")
    if pre and r.startswith(pre + "/"):
        return r
    return f"{pre}/{r}" if pre else r


def presigned_read(client: object, bucket: str, obj: str, expires_seconds: int) -> Optional[str]:
    """Return a presigned GET URL for ``bucket/obj``, or None on failure.

    Supports both the modern ``get_presigned_url`` and the legacy
    ``presigned_get_object`` MinIO client APIs.
    """
    try:
        from datetime import timedelta
        exp = expires_seconds
        try:
            exp = int(exp)
        except Exception:
            pass
        td = timedelta(seconds=exp)
        try:
            return client.get_presigned_url("GET", bucket, obj, expires=td)  # type: ignore
        except Exception:
            return client.presigned_get_object(bucket, obj, expires=td)  # type: ignore
    except Exception:
        return None


def minio_current(runtime_cfg: Dict[str, Dict[str, Optional[str]]]) -> Tuple[Optional[object], Optional[str], Optional[str], str]:
    """Resolve the active MinIO client from runtime config / environment.

    Returns ``(client, bucket, public_base, prefix)``; on missing config
    (or missing MinIO SDK) returns ``(None, None, None, "")`` after logging
    which pieces were absent.  Console ports (``:9001``) are rewritten to
    the S3 API port ``:9000``; bucket existence/creation is best-effort.
    """
    rc = runtime_cfg.get("minio", {})
    endpoint_raw = rc.get("endpoint") or os.environ.get("MINIO_ENDPOINT")
    access_raw = rc.get("access") or os.environ.get("MINIO_ACCESS_KEY")
    secret_raw = rc.get("secret") or os.environ.get("MINIO_SECRET_KEY")
    bucket_raw = rc.get("bucket") or os.environ.get("MINIO_BUCKET")
    secure_flag = rc.get("secure") or os.environ.get("MINIO_SECURE", "false")
    secure = str(secure_flag or "false").lower() in {"1", "true", "yes", "on"}
    public_raw = rc.get("public") or os.environ.get("MINIO_PUBLIC_ENDPOINT")
    endpoint = (str(endpoint_raw).strip() if endpoint_raw else None)
    try:
        # Users often paste the MinIO *console* address (port 9001);
        # rewrite it to the S3 API port.
        if isinstance(endpoint, str) and ":9001" in endpoint:
            h = endpoint.split("/")[0]
            if ":" in h:
                parts = h.split(":")
                endpoint = f"{parts[0]}:9000"
            else:
                endpoint = h
    except Exception:
        endpoint = endpoint
    access = (str(access_raw).strip() if access_raw else None)
    secret = (str(secret_raw).strip() if secret_raw else None)
    bucket = (str(bucket_raw).strip() if bucket_raw else None)
    public_base = (str(public_raw).strip() if public_raw else None)
    try:
        if isinstance(public_base, str) and (":9001" in public_base or "/browser" in public_base or "/minio" in public_base):
            # BUG FIX: the original took split("/")[0] of the *full* URL, so a
            # scheme-qualified value like "http://host:9001" produced the host
            # "http:" and the broken base "http://http:9000".  Strip an
            # explicit scheme before extracting the host.
            stripped = public_base.strip()
            for pref in ("http://", "https://"):
                if stripped.lower().startswith(pref):
                    stripped = stripped[len(pref):]
                    break
            host = stripped.split("/")[0]
            scheme = "https" if secure else "http"
            if ":" in host:
                base_host = host.split(":")[0]
                public_base = f"{scheme}://{base_host}:9000"
            else:
                public_base = f"{scheme}://{host}:9000"
    except Exception:
        public_base = public_base
    if not public_base and endpoint:
        public_base = f"https://{endpoint}" if secure else f"http://{endpoint}"
    missing = []
    if Minio is None:
        missing.append("client")
    if not endpoint:
        missing.append("endpoint")
    if not access:
        missing.append("access")
    if not secret:
        missing.append("secret")
    if not bucket:
        missing.append("bucket")
    if not public_base:
        missing.append("public")
    if missing:
        try:
            logging.error(f"minio config invalid: missing={missing}")
        except Exception:
            pass
        return None, None, None, ""
    client = minio_client(endpoint=endpoint, access=access, secret=secret, secure=secure)
    try:
        try:
            client.list_buckets()  # type: ignore
        except Exception as e:
            # TLS misconfiguration fallback: retry once without TLS.
            if secure and ("SSL" in str(e) or "HTTPSConnectionPool" in str(e) or "SSLError" in str(e)):
                client = minio_client(endpoint=endpoint, access=access, secret=secret, secure=False)
    except Exception:
        pass
    try:
        exists = minio_head_bucket(client, bucket)
        if not exists:
            minio_create_bucket(client, bucket)
    except Exception:
        pass
    prefix = rc.get("prefix") or os.environ.get("MINIO_PREFIX", "")
    return client, bucket, public_base, prefix

# ---- begin docling/app/services/unified_converter.py (next diff file) ----
from pathlib import Path
# NOTE(review): the original `from typing import Optional, Tuple` import is
# cut at this chunk boundary; "Tuple" resumes in the next chunk.
from typing import Optional
Tuple +import re + +import tempfile +import sys +from urllib.parse import urlsplit +from urllib.request import urlopen +from urllib.error import HTTPError, URLError +import io +_DOC_AVAILABLE = True +try: + _DOC_BASE = Path(__file__).resolve().parents[2] / "docling" + p = str(_DOC_BASE) + if p not in sys.path: + sys.path.insert(0, p) +except Exception: + pass +try: + from docling.document_converter import DocumentConverter + from docling.datamodel.base_models import InputFormat + from docling.document_converter import PdfFormatOption + from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline + from docling.datamodel.pipeline_options import PdfPipelineOptions + from docling_core.types.doc import ImageRefMode +except Exception: + _DOC_AVAILABLE = False + class DocumentConverter: # type: ignore + def __init__(self, *args, **kwargs): + pass + def convert(self, source): + raise RuntimeError("docling unavailable") + class InputFormat: # type: ignore + PDF = "pdf" + class PdfFormatOption: # type: ignore + def __init__(self, *args, **kwargs): + pass + class StandardPdfPipeline: # type: ignore + pass + class PdfPipelineOptions: # type: ignore + def __init__(self): + pass + class ImageRefMode: # type: ignore + EMBEDDED = None + +""" +@api Unified Converter Service +@description Provides core document conversion logic unifying Docling and word2markdown engines +""" + +_W2M_AVAILABLE = False +try: + from app.services.word2markdown import convert_any as _w2m_convert_any # type: ignore + _W2M_AVAILABLE = True +except Exception: + _W2M_AVAILABLE = False + +try: + from bs4 import BeautifulSoup # type: ignore +except Exception: + BeautifulSoup = None # type: ignore +try: + from app.services.docling_adapter import normalize_html as _normalize_html # type: ignore + from app.services.docling_adapter import resolve_link as _resolve_link # type: ignore + from app.services.docling_adapter import _render_markdown_html as _render_md_html # type: ignore +except Exception: + 
# NOTE(review): these assignments are the body of the `except` fallback for
# the docling_adapter imports begun just above; guarded here so the fragment
# stays valid in isolation without clobbering successful imports.
if "_normalize_html" not in globals():
    _normalize_html = None  # type: ignore
if "_resolve_link" not in globals():
    _resolve_link = None  # type: ignore
if "_render_md_html" not in globals():
    _render_md_html = None  # type: ignore
if "BeautifulSoup" not in globals():
    BeautifulSoup = None  # type: ignore


def _is_http(s: str) -> bool:
    """True when *s* looks like an http(s) URL (case-insensitive)."""
    t = (s or "").lower()
    return t.startswith(("http://", "https://"))


def _read_bytes(source: str) -> Tuple[bytes, str]:
    """Read *source* (URL or local file path) as raw bytes.

    Returns ``(data, content_type)``; content_type is only populated for
    HTTP sources.  Best-effort: any failure yields ``(b"", ct)``.
    """
    ct = ""
    try:
        if _is_http(source):
            from urllib.request import urlopen
            with urlopen(source, timeout=10) as r:
                ct = r.headers.get("Content-Type") or ""
                return r.read() or b"", ct
        p = Path(source)
        if p.exists() and p.is_file():
            return p.read_bytes(), ct
    except Exception:
        return b"", ct
    return b"", ct


def _decode_to_utf8(raw: bytes, ct: str = "") -> str:
    """Decode *raw* to text: BOM sniffing, then the Content-Type charset,
    then a list of common CJK/Western encodings, finally utf-8 with
    replacement characters."""
    if not raw:
        return ""
    if raw.startswith(b"\xef\xbb\xbf"):
        try:
            return raw[3:].decode("utf-8")
        except Exception:
            pass
    if raw.startswith(b"\xff\xfe"):
        try:
            return raw[2:].decode("utf-16le")
        except Exception:
            pass
    if raw.startswith(b"\xfe\xff"):
        try:
            return raw[2:].decode("utf-16be")
        except Exception:
            pass
    try:
        m = re.search(r"charset=([\w-]+)", ct or "", re.IGNORECASE)
        if m:
            enc = m.group(1).strip().lower()
            try:
                return raw.decode(enc)
            except Exception:
                pass
    except Exception:
        pass
    candidates = [
        "utf-8", "gb18030", "gbk", "big5", "shift_jis", "iso-8859-1", "windows-1252",
    ]
    for enc in candidates:
        try:
            return raw.decode(enc)
        except Exception:
            continue
    return raw.decode("utf-8", errors="replace")


def _normalize_newlines(s: str) -> str:
    """Normalize CRLF/CR line endings to LF."""
    return (s or "").replace("\r\n", "\n").replace("\r", "\n")


def _html_to_markdown(html: str) -> str:
    """Fallback HTML→Markdown converter used when docling cannot parse the
    page.  Returns *html* unchanged when BeautifulSoup is unavailable."""
    if not html:
        return ""
    if BeautifulSoup is None:
        return html
    soup = BeautifulSoup(html, "html.parser")
    out: list[str] = []

    def txt(node) -> str:
        return (getattr(node, "get_text", lambda **kwargs: str(node))(strip=True) if node else "")

    def inline(node) -> str:
        # Render one inline node (text, emphasis, code, link, image).
        if isinstance(node, str):
            return node
        name = getattr(node, "name", None)
        if name is None:
            return str(node)
        if name in {"strong", "b"}:
            return "**" + txt(node) + "**"
        if name in {"em", "i"}:
            return "*" + txt(node) + "*"
        if name == "code":
            return "`" + txt(node) + "`"
        if name == "a":
            href_val = node.get("href")
            extra_val = node.get("data-doc")
            href = href_val if isinstance(href_val, str) else None
            extra = extra_val if isinstance(extra_val, str) else None
            resolved = _resolve_link(href, extra) if _resolve_link else (href or extra)
            url = resolved or ""
            text = txt(node)
            if url:
                return f"[{text}]({url})"
            return text
        if name == "img":
            alt = node.get("alt") or "image"
            src = node.get("src") or ""
            return f"![{alt}]({src})"
        res = []
        for c in getattr(node, "children", []):
            res.append(inline(c))
        return "".join(res)

    def block(node):
        # Render one block-level node into `out`.
        name = getattr(node, "name", None)
        if name is None:
            s = str(node).strip()
            if s:
                out.append(s)
            return
        if name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
            lvl = int(name[1])
            out.append("#" * lvl + " " + txt(node))
            out.append("")
            return
        if name == "p":
            segs = [inline(c) for c in node.children]
            out.append("".join(segs))
            out.append("")
            return
        if name == "br":
            out.append("")
            return
        if name in {"ul", "ol"}:
            is_ol = name == "ol"
            idx = 1
            for li in node.find_all("li", recursive=False):
                text = "".join(inline(c) for c in li.children)
                if is_ol:
                    out.append(f"{idx}. {text}")
                    idx += 1
                else:
                    out.append(f"- {text}")
            out.append("")
            return
        if name == "pre":
            code_node = node.find("code")
            code_text = code_node.get_text() if code_node else node.get_text()
            lang = ""
            cls = (code_node.get("class") if code_node else node.get("class")) or []
            for c in cls:
                s = str(c)
                if s.startswith("language-"):
                    lang = s.split("-", 1)[-1]
                    break
            out.append(f"```{lang}\n{code_text}\n```\n")
            return
        if name == "blockquote":
            lines = [l for l in txt(node).splitlines() if l.strip()]
            for l in lines:
                out.append("> " + l)
            out.append("")
            return
        if name == "table":
            rows = node.find_all("tr")
            if not rows:
                return
            headers = [h.get_text(strip=True) for h in (rows[0].find_all(["th", "td"]) or [])]
            if headers:
                out.append("|" + "|".join(headers) + "|")
                sep = "|" + "|".join(["---" for _ in headers]) + "|"
                out.append(sep)
            for tr in rows[1:]:
                cells = [td.get_text(strip=True) for td in tr.find_all("td")]
                if cells:
                    out.append("|" + "|".join(cells) + "|")
            out.append("")
            return
        if name == "div":
            for c in node.children:
                block(c)
            return
        segs = [inline(c) for c in node.children]
        if segs:
            out.append("".join(segs))
            # NOTE(review): the grouping of this blank-line append was
            # ambiguous in the whitespace-mangled paste; kept inside the
            # guard to mirror the "only emit when content exists" pattern.
            out.append("")

    root = soup.body or soup
    for ch in getattr(root, "children", []):
        block(ch)
    return _normalize_newlines("\n".join(out)).strip()


def _lower_html_table_tags(html: str) -> str:
    """
    @function _lower_html_table_tags
    @description Normalizes HTML table tags to lowercase
    @param html Input HTML string
    @return Normalized HTML string
    """
    if not html:
        return html
    tags = ["TABLE", "THEAD", "TBODY", "TFOOT", "TR", "TH", "TD"]
    out = html
    for t in tags:
        # BUG FIX: the original substitution was re.sub(r"\s*\n+\s*", ">\n", out)
        # (its angle-bracket pattern was stripped when the patch was pasted),
        # which mangled whitespace instead of lowercasing the tags.  Lowercase
        # both opening and closing forms; \b keeps TH from matching THEAD.
        out = re.sub(rf"<(/?){t}\b", lambda m, low=t.lower(): f"<{m.group(1)}{low}", out)
    return out


def _replace_admonitions(md: str) -> str:
    """
    @function _replace_admonitions
    @description Replaces ::: style admonitions with !!! style
    @param md Input markdown string
    @return Processed markdown string
    """
    if not md:
        return md
    lines = md.split("\n")
    out = []
    in_block = False
    for raw in lines:
        t = raw.strip()
        if t.startswith(":::"):
            if not in_block:
                name = t[3:].strip()
                if not name:
                    out.append("!!!")
                else:
                    out.append("!!! " + name)
                in_block = True
            else:
                out.append("!!!")
                in_block = False
            continue
        out.append(raw)
    return "\n".join(out)


def _enhance_codeblocks(md: str) -> str:
    """Wrap obvious JSON blobs and code-looking paragraphs in fenced code
    blocks, leaving existing fences untouched."""
    if not md:
        return md
    lines = md.split("\n")
    res = []
    in_fence = False
    i = 0
    while i < len(lines):
        line = lines[i]
        t = line.strip()
        if t.startswith("```"):
            # Toggle fence state; never rewrap fenced content.
            # (A never-read `fence_lang` local was removed here.)
            in_fence = not in_fence
            res.append(line)
            i += 1
            continue
        if in_fence:
            res.append(line)
            i += 1
            continue
        if t.startswith("{") or t.startswith("["):
            # Heuristic JSON detection: balance braces until closed.
            buf = [line]
            j = i + 1
            closed = False
            depth = t.count("{") - t.count("}")
            while j < len(lines):
                buf.append(lines[j])
                s = lines[j].strip()
                depth += s.count("{") - s.count("}")
                if depth <= 0 and s.endswith("}"):
                    closed = True
                    break
                j += 1
            if closed and len(buf) >= 3:
                lang = "json"
                res.append("```" + lang)
                res.extend(buf)
                res.append("```")
                i = j + 1
                continue
        code_sig = (
            ("public static" in t) or ("private static" in t) or ("class " in t) or ("return " in t) or ("package " in t) or ("import " in t)
        )
        if code_sig:
            # Collect the code-looking run until a blank line or heading.
            buf = [line]
            j = i + 1
            while j < len(lines):
                s = lines[j].strip()
                if not s:
                    break
                if s.startswith("# ") or s.startswith("## ") or s.startswith("### "):
                    break
                buf.append(lines[j])
                j += 1
            if len(buf) >= 3:
                res.append("```")
                res.extend(buf)
                res.append("```")
                i = j + 1
                continue
        res.append(line)
        i += 1
    return "\n".join(res)


class FormatConverter:
    """
    @class FormatConverter
    @description Unified converter class wrapping Docling and word2markdown
    """
    # NOTE(review): the class body (__init__/convert) continues in the next
    # chunk of this blob; only the chunk-cut docstring is completed here.
and word2markdown + """ + def __init__(self) -> None: + self._docling = DocumentConverter() + + def convert(self, source: str, export: str = "markdown", engine: Optional[str] = None, mdx_safe_mode_enabled: bool = True) -> Tuple[str, str, Optional[str]]: + """ + @function convert + @description Convert a document source to specified format + @param source Path or URL to source document + @param export Output format (markdown, html, json, doctags) + @param engine Optional engine override (word2markdown/docling) + @param mdx_safe_mode_enabled Toggle safe mode for MDX + @return Tuple of (encoding, content) + """ + + + # Prefer custom word2markdown engine for DOC/DOCX when available + auto_engine = None + try: + from pathlib import Path as _P + suf = _P(source).suffix.lower() + if not engine and suf in {".doc", ".docx"} and _W2M_AVAILABLE: + auto_engine = "word2markdown" + except Exception: + auto_engine = None + use_engine = (engine or auto_engine or "").lower() + try: + from urllib.parse import urlsplit + path = source + if _is_http(source): + path = urlsplit(source).path or "" + ext = Path(path).suffix.lower() + except Exception: + ext = Path(source).suffix.lower() + if ext in {".txt"}: + raw, ct = _read_bytes(source) + text = _normalize_newlines(_decode_to_utf8(raw, ct)) + if export.lower() == "html": + if _render_md_html is not None: + html = _render_md_html(text) + else: + try: + import marko + html = marko.convert(text) + except Exception: + html = f"
{text}
" + return "utf-8", _lower_html_table_tags(html), None + md = _enhance_codeblocks(text) + return "utf-8", md, None + if ext in {".md"}: + raw, ct = _read_bytes(source) + text = _normalize_newlines(_decode_to_utf8(raw, ct)) + if export.lower() == "html": + if _render_md_html is not None: + html = _render_md_html(text) + else: + try: + import marko + html = marko.convert(text) + except Exception: + html = text + return "utf-8", _lower_html_table_tags(html), None + return "utf-8", text, None + if ext in {".html", ".htm"}: + try: + conv = DocumentConverter(allowed_formats=[InputFormat.HTML]) + result = conv.convert(source) + if export.lower() == "html": + html = result.document.export_to_html() + html = _lower_html_table_tags(html) + return "utf-8", html, None + md = result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED) + md = _replace_admonitions(md) + md = _enhance_codeblocks(md) + return "utf-8", md, None + except Exception: + raw, ct = _read_bytes(source) + html_in = _normalize_newlines(_decode_to_utf8(raw, ct)) + if export.lower() == "html": + html = _normalize_html(html_in) if _normalize_html is not None else html_in + return "utf-8", _lower_html_table_tags(html), None + md = _html_to_markdown(html_in) + md = _replace_admonitions(md) + md = _enhance_codeblocks(md) + return "utf-8", md, None + if use_engine in {"pandoc", "custom", "word2markdown"} and _W2M_AVAILABLE: + enc, md = _w2m_convert_any(Path(source), mdx_safe_mode_enabled=mdx_safe_mode_enabled) + md = _replace_admonitions(md) + md = _enhance_codeblocks(md) + return enc or "utf-8", md, None + # Configure PDF pipeline to generate picture images into a per-call artifacts directory + artifacts_dir = tempfile.mkdtemp(prefix="docling_artifacts_") + pdf_opts = PdfPipelineOptions() + pdf_opts.generate_picture_images = True + pdf_opts.generate_page_images = True + pdf_opts.images_scale = 2.0 + pdf_opts.do_code_enrichment = True + pdf_opts.do_formula_enrichment = True + self._docling = 
DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_cls=StandardPdfPipeline, + pipeline_options=pdf_opts, + ) + } + ) + result = self._docling.convert(source) + if export.lower() == "markdown": + md = result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED) + md = _replace_admonitions(md) + md = _enhance_codeblocks(md) + return "utf-8", md, artifacts_dir + if export.lower() == "html": + html = result.document.export_to_html() + html = _lower_html_table_tags(html) + return "utf-8", html, artifacts_dir + if export.lower() == "json": + js = result.document.export_to_json() + return "utf-8", js, artifacts_dir + if export.lower() == "doctags": + dt = result.document.export_to_doctags() + return "utf-8", dt, artifacts_dir + raise RuntimeError("unsupported export") diff --git a/docling/app/services/word2markdown.py b/docling/app/services/word2markdown.py new file mode 100644 index 0000000..82ae19a --- /dev/null +++ b/docling/app/services/word2markdown.py @@ -0,0 +1,429 @@ +from pathlib import Path +from typing import Tuple, List + +from docx import Document +from docx.table import Table +from docx.text.paragraph import Paragraph +import re +import base64 +import hashlib +import tempfile +import subprocess +from lxml import etree + + +def _iter_blocks(doc: Document): + parent = doc + parent_elm = parent.element.body + for child in parent_elm.iterchildren(): + tag = child.tag.split('}')[-1] + if tag == 'p': + yield Paragraph(child, parent) + elif tag == 'tbl': + yield Table(child, parent) + + +def _cell_text(cell) -> str: + parts = [] + for p in cell.paragraphs: + t = p.text or "" + parts.append(t) + return "\n".join([s for s in parts if s is not None]) + + +def _guess_lang(text: str) -> str: + t = (text or "").strip() + head = t[:512] + if re.search(r"\b(package|import\s+java\.|public\s+class|public\s+static|private\s+static|@Override)\b", head): + return "java" + if 
re.search(r"\b(def\s+\w+\(|import\s+\w+|print\(|from\s+\w+\s+import)\b", head): + return "python" + if re.search(r"\b(function\s+\w+\(|console\.log|let\s+\w+|const\s+\w+|=>)\b", head): + return "javascript" + if re.search(r"^#include|\bint\s+main\s*\(\)", head): + return "c" + if re.search(r"\busing\s+namespace\b|\bstd::\b|\btemplate\b", head): + return "cpp" + if re.search(r"\b(SELECT|INSERT|UPDATE|DELETE|CREATE\s+TABLE|DROP\s+TABLE|ALTER\s+TABLE)\b", head, re.IGNORECASE): + return "sql" + if head.startswith("{") or head.startswith("["): + return "json" + if re.search(r"", head): + return "xml" + return "" + + +def _table_to_md(tbl: Table) -> str: + rows = tbl.rows + cols = tbl.columns + if len(rows) == 1 and len(cols) == 1: + txt = _cell_text(rows[0].cells[0]).strip() + lang = _guess_lang(txt) + return f"```{lang}\n{txt}\n```\n" + + def _cell_inline_md(doc: Document, paragraph: Paragraph) -> str: + el = paragraph._element + parts: List[str] = [] + try: + for ch in el.iterchildren(): + tag = ch.tag.split('}')[-1] + if tag == 'r': + for rc in ch.iterchildren(): + rtag = rc.tag.split('}')[-1] + if rtag == 't': + s = rc.text or '' + if s: + parts.append(s) + elif rtag == 'br': + parts.append('\n') + elif rtag == 'drawing': + try: + for node in rc.iter(): + local = node.tag.split('}')[-1] + rid = None + if local == 'blip': + rid = node.get(f"{{{NS['r']}}}embed") or node.get(f"{{{NS['r']}}}link") + elif local == 'imagedata': + rid = node.get(f"{{{NS['r']}}}id") + if not rid: + continue + try: + part = None + rp = getattr(doc.part, 'related_parts', None) + if isinstance(rp, dict) and rid in rp: + part = rp.get(rid) + if part is None: + rels = getattr(doc.part, 'rels', None) + if rels is not None and hasattr(rels, 'get'): + rel = rels.get(rid) + part = getattr(rel, 'target_part', None) + if part is None: + rel = getattr(doc.part, '_rels', {}).get(rid) + part = getattr(rel, 'target_part', None) + ct = getattr(part, 'content_type', '') if part is not None else '' + data = 
part.blob if part is not None and hasattr(part, 'blob') else None + if data: + b64 = base64.b64encode(data).decode('ascii') + parts.append(f"![Image](data:{ct};base64,{b64})") + except Exception: + pass + except Exception: + pass + except Exception: + pass + return ''.join(parts) + + out = [] + # python-docx table parent is the Document + doc = getattr(tbl, '_parent', None) or getattr(tbl, 'part', None) + for r_i, r in enumerate(rows): + vals = [] + for c in r.cells: + segs: List[str] = [] + for p in c.paragraphs: + s = _cell_inline_md(doc, p) + if s: + segs.append(s) + cell_text = '
'.join([x for x in segs if x is not None]) + vals.append((cell_text or '').replace('|', '\\|').strip()) + line = "| " + " | ".join(vals) + " |" + out.append(line) + if r_i == 0: + sep = "| " + " | ".join(["---" for _ in vals]) + " |" + out.append(sep) + return "\n".join(out) + "\n" + + +def _paragraph_to_md(p: Paragraph) -> str: + return (p.text or "").strip() + "\n\n" + + +def convert_any(path: Path, mdx_safe_mode_enabled: bool = True) -> Tuple[str, str]: + ext = path.suffix.lower() + use_path = path + if ext == ".doc": + use_path = _convert_doc_to_docx_cross_platform(path) + if use_path.suffix.lower() not in {".docx"}: + raise RuntimeError("unsupported input for word2markdown") + doc = Document(str(use_path)) + out: List[str] = [] + in_code = False + code_lines: List[str] = [] + lang_hint: str = '' + for blk in _iter_blocks(doc): + if isinstance(blk, Table): + out.append(_table_to_md(blk)) + elif isinstance(blk, Paragraph): + tboxes = _paragraph_textboxes(blk) + for tb in tboxes: + if tb.strip(): + out.append(_md_code_block(tb.strip())) + sdts = _paragraph_sdts(blk) + for s in sdts: + if s.strip(): + out.append(_md_code_block(s.strip())) + btx = _paragraph_bordered_text(blk) + for s in btx: + if s.strip(): + out.append(_md_code_block(s.strip())) + ftx = _paragraph_framed(blk) + for s in ftx: + if s.strip(): + out.append(_md_code_block(s.strip())) + raw = (blk.text or "") + sraw = raw.strip() + if _looks_like_code_paragraph(sraw) or (in_code and sraw == ""): + if not in_code: + in_code = True + lang_hint = _guess_lang(sraw) + code_lines = [] + code_lines.append(raw) + continue + if in_code and code_lines: + text = "\n".join(code_lines) + use_lang = lang_hint or _guess_lang(text) + out.append(f"```{use_lang}\n{text}\n```\n") + in_code = False + code_lines = [] + lang_hint = '' + def _paragraph_with_images(doc: Document, p: Paragraph) -> str: + el = p._element + parts: List[str] = [] + try: + for ch in el.iterchildren(): + tag = ch.tag.split('}')[-1] + if tag == 
'r': + for rc in ch.iterchildren(): + rtag = rc.tag.split('}')[-1] + if rtag == 't': + s = rc.text or '' + if s: + parts.append(s) + elif rtag == 'br': + parts.append('\n') + elif rtag == 'drawing': + for node in rc.iter(): + local = node.tag.split('}')[-1] + rid = None + if local == 'blip': + rid = node.get(f"{{{NS['r']}}}embed") or node.get(f"{{{NS['r']}}}link") + elif local == 'imagedata': + rid = node.get(f"{{{NS['r']}}}id") + if not rid: + continue + try: + part = None + rp = getattr(doc.part, 'related_parts', None) + if isinstance(rp, dict) and rid in rp: + part = rp.get(rid) + if part is None: + rels = getattr(doc.part, 'rels', None) + if rels is not None and hasattr(rels, 'get'): + rel = rels.get(rid) + part = getattr(rel, 'target_part', None) + if part is None: + rel = getattr(doc.part, '_rels', {}).get(rid) + part = getattr(rel, 'target_part', None) + ct = getattr(part, 'content_type', '') if part is not None else '' + data = part.blob if part is not None and hasattr(part, 'blob') else None + if data: + b64 = base64.b64encode(data).decode('ascii') + parts.append(f"![Image](data:{ct};base64,{b64})") + except Exception: + pass + except Exception: + pass + s = ''.join(parts).strip() + return (s + '\n\n') if s else '' + txt = _paragraph_with_images(doc, blk) + if txt.strip(): + out.append(txt) + if in_code and code_lines: + text = "\n".join(code_lines) + use_lang = lang_hint or _guess_lang(text) + out.append(f"```{use_lang}\n{text}\n```\n") + try: + boxes = _doclevel_textboxes(doc) + existing_texts = set() + try: + for seg in out: + if isinstance(seg, str): + ss = seg.strip() + if ss.startswith("```"): + m = re.search(r"^```[\w-]*\n([\s\S]*?)\n```\s*$", ss) + if m: + existing_texts.add(m.group(1).strip()) + continue + existing_texts.add(ss) + except Exception: + pass + for tb in boxes: + s = (tb or '').strip() + if not s: + continue + if s in existing_texts: + continue + out.append(_md_code_block(s)) + existing_texts.add(s) + except Exception: + pass + md = 
# NOTE(review): the chunking cuts `convert_any` here, mid-assignment; its
# final statements were:  md = "".join(out)  /  return "utf-8", md

# WordprocessingML namespace map used by all the XPath helpers below.
NS = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
    "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
    "v": "urn:schemas-microsoft-com:vml",
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture",
}


def _collect_txbx_texts(el) -> List[str]:
    """Collect text-box paragraph texts under *el*, first from DrawingML
    (wps:txbx) shapes and then from legacy VML (v:textbox) shapes.

    Extracted helper: _paragraph_textboxes and _doclevel_textboxes
    previously duplicated this walk verbatim.
    """
    texts: List[str] = []
    for xp in ('.//wps:txbx/w:txbxContent', './/v:textbox/w:txbxContent'):
        for tbox in el.xpath(xp, namespaces=NS):
            buf: List[str] = []
            for w_p in tbox.xpath('.//w:p', namespaces=NS):
                ts = w_p.xpath('.//w:t', namespaces=NS)
                s = ''.join([(t.text or '') for t in ts]).strip()
                if s:
                    buf.append(s)
            if buf:
                texts.append('\n'.join(buf))
    return texts


def _paragraph_textboxes(p: "Paragraph") -> List[str]:
    """Text-box contents anchored in paragraph *p*; [] on any failure."""
    try:
        return _collect_txbx_texts(p._element)
    except Exception:
        return []


def _paragraph_sdts(p: "Paragraph") -> List[str]:
    """Structured-document-tag (content control) texts inside *p*."""
    try:
        el = p._element
        texts: List[str] = []
        for sdt in el.xpath('.//w:sdt/w:sdtContent', namespaces=NS):
            buf: List[str] = []
            for w_p in sdt.xpath('.//w:p', namespaces=NS):
                ts = w_p.xpath('.//w:t', namespaces=NS)
                s = ''.join([t.text or '' for t in ts]).strip()
                if s:
                    buf.append(s)
            if buf:
                texts.append('\n'.join(buf))
        return texts
    except Exception:
        return []


def _paragraph_bordered_text(p: "Paragraph") -> List[str]:
    """Text of *p* when the paragraph carries a border (w:pBdr) — such
    paragraphs are rendered as code boxes by the caller."""
    try:
        el = p._element
        has_border = bool(el.xpath('./w:pPr/w:pBdr', namespaces=NS))
        t = (p.text or '').strip()
        if has_border and t:
            return [t]
    except Exception:
        pass
    return []


def _paragraph_framed(p: "Paragraph") -> List[str]:
    """Text of *p* when the paragraph is framed (w:framePr)."""
    try:
        el = p._element
        has_frame = bool(el.xpath('./w:pPr/w:framePr', namespaces=NS))
        t = (p.text or '').strip()
        if has_frame and t:
            return [t]
    except Exception:
        pass
    return []


def _md_code_block(text: str) -> str:
    """Wrap *text* in a fenced code block with a guessed language tag."""
    lang = _guess_lang(text)
    return f"```{lang}\n{text}\n```\n"


def _looks_like_code_paragraph(t: str) -> bool:
    """Heuristic: does paragraph text *t* look like source code?"""
    s = (t or '').strip()
    if not s:
        return False
    if s.startswith('{') or s.startswith('[') or s.endswith('}'):
        return True
    # BUG FIX: the original tested s.startswith(' ') / s.startswith('\t')
    # AFTER strip(), which can never match; test the raw text so indented
    # paragraphs are detected as intended.
    if (t or '').startswith((' ', '\t')):
        return True
    if ';' in s or '{' in s or '}' in s:
        return True
    keywords = ['public static', 'private static', 'class ', 'return ', 'import ', 'package ', 'byte[]', 'String ', 'Cipher', 'KeyFactory']
    return any(k in s for k in keywords)


def _doclevel_textboxes(doc: "Document") -> List[str]:
    """All text-box contents anchored at document-body level (best-effort)."""
    texts: List[str] = []
    try:
        texts = _collect_txbx_texts(doc.element.body)
    except Exception:
        pass
    return texts


def _convert_doc_to_docx_cross_platform(path: Path) -> Path:
    """Convert a legacy .doc to .docx with whichever tool is installed:
    macOS ``textutil``, then LibreOffice ``soffice``, then ``unoconv``.

    Raises RuntimeError when no converter succeeds.
    NOTE(review): the soffice/unoconv half of this function was chunk-cut
    into the next blob chunk; it is reassembled in full here and its
    mangled copy remains as residue there.
    """
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
            tmp.close()
            subprocess.run(["textutil", "-convert", "docx", str(path), "-output", tmp.name], check=True)
            return Path(tmp.name)
    except Exception:
        pass
    try:
        outdir = Path(tempfile.mkdtemp(prefix="doc2docx_"))
        subprocess.run(["soffice", "--headless", "--convert-to", "docx", "--outdir", str(outdir), str(path)], check=True)
        candidate = outdir / (path.stem + ".docx")
        if candidate.exists():
            return candidate
    except Exception:
        pass
    try:
        out = Path(tempfile.NamedTemporaryFile(delete=False, suffix=".docx").name)
        subprocess.run(["unoconv", "-f", "docx", "-o", str(out), str(path)], check=True)
        if out.exists():
            return out
    except Exception:
        pass
    raise RuntimeError("doc to docx conversion failed; please install 'soffice' or 'unoconv' or convert manually")
Path(tempfile.mkdtemp(prefix="doc2docx_")) + subprocess.run(["soffice", "--headless", "--convert-to", "docx", "--outdir", str(outdir), str(path)], check=True) + candidate = outdir / (path.stem + ".docx") + if candidate.exists(): + return candidate + except Exception: + pass + try: + out = Path(tempfile.NamedTemporaryFile(delete=False, suffix=".docx").name) + subprocess.run(["unoconv", "-f", "docx", "-o", str(out), str(path)], check=True) + if out.exists(): + return out + except Exception: + pass + raise RuntimeError("doc to docx conversion failed; please install 'soffice' or 'unoconv' or convert manually") diff --git a/docling/app/tests/run_batch_upload_debug.py b/docling/app/tests/run_batch_upload_debug.py new file mode 100644 index 0000000..b00735f --- /dev/null +++ b/docling/app/tests/run_batch_upload_debug.py @@ -0,0 +1,80 @@ +import io +import os +import zipfile +from pathlib import Path +from fastapi.testclient import TestClient +import sys +from pathlib import Path as _Path +base = _Path(__file__).resolve().parents[2] +sys.path.insert(0, str(base)) +sys.path.insert(0, str(base / "docling")) +import app.server as server + + +class FakeMinio: + def __init__(self): + self.objs = {} + + def put_object(self, bucket_name: str, object_name: str, data: io.BytesIO, length: int, content_type: str): + self.objs[(bucket_name, object_name)] = data.read(length) + + def get_presigned_url(self, method: str, bucket: str, obj: str, expires: int): + return f"http://minio.test/presigned/{bucket}/{obj}" + + def presigned_get_object(self, bucket: str, obj: str, expires: int): + return f"http://minio.test/presigned/{bucket}/{obj}" + + +def setup(): + server.RUNTIME_CONFIG["minio"].update({ + "endpoint": "127.0.0.1:9000", + "public": "http://127.0.0.1:9000", + "access": "ak", + "secret": "sk", + "bucket": "test", + "secure": "false", + "prefix": "assets", + "store_final": "true", + "public_read": "true", + }) + fake = FakeMinio() + def _cur(): + return fake, "test", 
"http://127.0.0.1:9000", "assets" + server._minio_current = _cur # type: ignore + + +def main(): + setup() + app = server.app + c = TestClient(app) + tmp = Path("/tmp/run_batch_upload_debug") + tmp.mkdir(parents=True, exist_ok=True) + + zpath = tmp / "pkg.zip" + md_dir = tmp / "docs" + img_dir = md_dir / "images" + img_dir.mkdir(parents=True, exist_ok=True) + (img_dir / "p.png").write_bytes(b"PNG") + (md_dir / "a.md").write_text("![](images/p.png)", "utf-8") + + with zipfile.ZipFile(str(zpath), "w") as zf: + zf.write(str(md_dir / "a.md"), arcname="a.md") + zf.write(str(img_dir / "p.png"), arcname="images/p.png") + + with open(zpath, "rb") as fp: + files = {"file": ("pkg.zip", fp.read())} + r1 = c.post("/api/archive/stage", files=files) + print("stage status:", r1.status_code, r1.json()) + sid = r1.json()["data"]["id"] + + r2 = c.post("/api/archive/process", data={"id": sid, "prefix": "assets", "versionId": "1001"}) + print("process status:", r2.status_code, r2.json()) + + list_text = str(md_dir / "a.md") + lf = io.BytesIO(list_text.encode("utf-8")) + r3 = c.post("/api/upload-list", files={"list_file": ("list.txt", lf.getvalue())}, data={"prefix": "assets", "versionId": "1002"}) + print("upload-list status:", r3.status_code, r3.json()) + + +if __name__ == "__main__": + main() diff --git a/docling/app/tests/run_convert_folder_debug.py b/docling/app/tests/run_convert_folder_debug.py new file mode 100644 index 0000000..619d8b3 --- /dev/null +++ b/docling/app/tests/run_convert_folder_debug.py @@ -0,0 +1,75 @@ +import io +import os +from pathlib import Path +from fastapi.testclient import TestClient +import sys +from pathlib import Path as _Path +base = _Path(__file__).resolve().parents[2] +sys.path.insert(0, str(base)) +sys.path.insert(0, str(base / "docling")) +import app.server as server + + +class FakeMinio: + def __init__(self): + self.objs = {} + + def put_object(self, bucket_name: str, object_name: str, data: io.BytesIO, length: int, content_type: str): + 
self.objs[(bucket_name, object_name)] = data.read(length) + + def get_presigned_url(self, method: str, bucket: str, obj: str, expires: int): + return f"http://minio.test/presigned/{bucket}/{obj}" + + def presigned_get_object(self, bucket: str, obj: str, expires: int): + return f"http://minio.test/presigned/{bucket}/{obj}" + + +def setup(): + server.RUNTIME_CONFIG["minio"].update({ + "endpoint": "127.0.0.1:9000", + "public": "http://127.0.0.1:9000", + "access": "ak", + "secret": "sk", + "bucket": "test", + "secure": "false", + "prefix": "assets", + "store_final": "true", + "public_read": "true", + }) + fake = FakeMinio() + def _cur(): + return fake, "test", "http://127.0.0.1:9000", "assets" + server._minio_current = _cur # type: ignore + + +def main(): + setup() + app = server.app + c = TestClient(app) + tmp = Path("/tmp/run_convert_folder_debug") + if tmp.exists(): + for p in tmp.rglob("*"): + try: + p.unlink() + except Exception: + pass + try: + tmp.rmdir() + except Exception: + pass + tmp.mkdir(parents=True, exist_ok=True) + + root = tmp / "数+产品手册-MD源文件" + sub = root / "DMDRS_DRS_Language_User_Manual" + img = sub / "images" + img.mkdir(parents=True, exist_ok=True) + (img / "p.png").write_bytes(b"PNG") + (sub / "a.md").write_text("# Title\n\n![](images/p.png)", "utf-8") + + r = c.post("/md/convert-folder", data={"folder_path": str(root), "prefix": "assets"}) + print("convert-folder:", r.status_code) + print(r.json()) + + +if __name__ == "__main__": + main() diff --git a/docling/app/tests/run_edge_cases_debug.py b/docling/app/tests/run_edge_cases_debug.py new file mode 100644 index 0000000..a111f80 --- /dev/null +++ b/docling/app/tests/run_edge_cases_debug.py @@ -0,0 +1,97 @@ +import io +import zipfile +from pathlib import Path +from fastapi.testclient import TestClient +import sys +from pathlib import Path as _Path +base = _Path(__file__).resolve().parents[2] +sys.path.insert(0, str(base)) +sys.path.insert(0, str(base / "docling")) +import app.server as server + + 
+class FakeMinio: + def __init__(self): + self.objs = {} + + def put_object(self, bucket_name: str, object_name: str, data: io.BytesIO, length: int, content_type: str): + self.objs[(bucket_name, object_name)] = data.read(length) + + def get_presigned_url(self, method: str, bucket: str, obj: str, expires: int): + return f"http://minio.test/presigned/{bucket}/{obj}" + + def presigned_get_object(self, bucket: str, obj: str, expires: int): + return f"http://minio.test/presigned/{bucket}/{obj}" + + +def setup(): + server.RUNTIME_CONFIG["minio"].update({ + "endpoint": "127.0.0.1:9000", + "public": "http://127.0.0.1:9000", + "access": "ak", + "secret": "sk", + "bucket": "test", + "secure": "false", + "prefix": "assets", + "store_final": "true", + "public_read": "true", + }) + fake = FakeMinio() + def _cur(): + return fake, "test", "http://127.0.0.1:9000", "assets" + server._minio_current = _cur # type: ignore + + +def run(): + setup() + app = server.app + c = TestClient(app) + + r = c.post("/api/archive/process", data={"id": "missing"}) + print("invalid-id:", r.status_code, r.json()) + + tmp = Path("/tmp/run_edge_cases_debug") + tmp.mkdir(parents=True, exist_ok=True) + rar_path = tmp / "pkg.rar" + rar_path.write_bytes(b"RAR") + with open(rar_path, "rb") as fp: + files = {"file": ("pkg.rar", fp.read())} + r1 = c.post("/api/archive/stage", files=files) + sid = r1.json()["data"]["id"] + r2 = c.post("/api/archive/process", data={"id": sid}) + print("rar-process:", r2.status_code, r2.json()) + r3 = c.post("/api/archive/process", data={"id": sid}) + print("rar-reprocess:", r3.status_code, r3.json()) + + root = tmp / "listcase2" + root.mkdir(parents=True, exist_ok=True) + (root / "img.png").write_bytes(b"PNG") + (root / "a.md").write_text("![](img.png)", "utf-8") + (root / "b.txt").write_text("![](img.png)", "utf-8") + lines = ["", "# comment", "http://example.com/x.md", str(root / "a.md"), str(root / "b.txt")] + data_bytes = "\n".join(lines).encode("utf-8") + files = 
{"list_file": ("list.txt", data_bytes)} + r4 = c.post("/api/upload-list", files=files, data={"prefix": "assets", "versionId": "1005"}) + print("upload-list:", r4.status_code, r4.json()) + + zpath = tmp / "dup.zip" + base = tmp / "src" + sub = base / "sub" + sub.mkdir(parents=True, exist_ok=True) + (base / "a.md").write_text("![](img.png)", "utf-8") + (base / "img.png").write_bytes(b"PNG") + (sub / "a.md").write_text("![](../img.png)", "utf-8") + with zipfile.ZipFile(str(zpath), "w") as zf: + zf.write(str(base / "a.md"), arcname="a.md") + zf.write(str(base / "img.png"), arcname="img.png") + zf.write(str(sub / "a.md"), arcname="sub/a.md") + with open(zpath, "rb") as fp: + files = {"file": ("dup.zip", fp.read())} + r5 = c.post("/api/archive/stage", files=files) + sid2 = r5.json()["data"]["id"] + r6 = c.post("/api/archive/process", data={"id": sid2, "prefix": "assets", "versionId": "1006"}) + print("archive-dup:", r6.status_code, r6.json()) + + +if __name__ == "__main__": + run() diff --git a/docling/app/tests/run_minio_object_debug.py b/docling/app/tests/run_minio_object_debug.py new file mode 100644 index 0000000..5e289c4 --- /dev/null +++ b/docling/app/tests/run_minio_object_debug.py @@ -0,0 +1,77 @@ +from fastapi.testclient import TestClient +import sys +from pathlib import Path as _Path +base = _Path(__file__).resolve().parents[2] +sys.path.insert(0, str(base)) +sys.path.insert(0, str(base / "docling")) +import app.server as server + + +class _Resp: + def __init__(self, data: bytes): + self._data = data + def read(self) -> bytes: + return self._data + def close(self): + pass + + +class FakeMinio: + def __init__(self): + self.store = { + ("doctest", "assets/rewritten/x.md"): (b"# Title\n\nhello", "text/markdown; charset=utf-8") + } + def stat_object(self, bucket: str, object_name: str): + class S: + def __init__(self, ct: str): + self.content_type = ct + k = (bucket, object_name) + if k in self.store: + return S(self.store[k][1]) + return 
S("application/octet-stream") + def get_object(self, bucket: str, object_name: str): + k = (bucket, object_name) + if k in self.store: + return _Resp(self.store[k][0]) + return _Resp(b"") + + +def setup(): + server.RUNTIME_CONFIG["minio"].update({ + "endpoint": "127.0.0.1:9000", + "public": "http://127.0.0.1:9000", + "access": "ak", + "secret": "sk", + "bucket": "doctest", + "secure": "false", + "prefix": "assets", + "store_final": "true", + "public_read": "true", + }) + fake = FakeMinio() + def _cur(): + return fake, "doctest", "http://127.0.0.1:9000", "assets" + server._minio_current = _cur # type: ignore + + +def run(): + setup() + app = server.app + c = TestClient(app) + r = c.get("/minio/object", params={"bucket": "doctest", "object": "assets/rewritten/x.md"}) + print("status:", r.status_code) + print("ct:", r.headers.get("Content-Type")) + print(r.text) + + import urllib.parse as _u + enc = _u.quote("assets/rewritten/数字+产品手册-MD源文件/x.md") + cur_client, _, _, _ = server._minio_current() # type: ignore + cur_client.store[("doctest", "assets/rewritten/数字+产品手册-MD源文件/x.md")] = ("hello 中文+plus".encode("utf-8"), "text/markdown; charset=utf-8") + r2 = c.get("/minio/object", params={"bucket": "doctest", "object": enc}) + print("status2:", r2.status_code) + print("ct2:", r2.headers.get("Content-Type")) + print(r2.text) + + +if __name__ == "__main__": + run() diff --git a/docling/app/tests/run_minio_presign_debug.py b/docling/app/tests/run_minio_presign_debug.py new file mode 100644 index 0000000..7b9118b --- /dev/null +++ b/docling/app/tests/run_minio_presign_debug.py @@ -0,0 +1,50 @@ +import io +from fastapi.testclient import TestClient +import sys +from pathlib import Path as _Path +base = _Path(__file__).resolve().parents[2] +sys.path.insert(0, str(base)) +sys.path.insert(0, str(base / "docling")) +import app.server as server + + +class FakeMinio: + def __init__(self): + pass + def get_presigned_url(self, method: str, bucket: str, obj: str, expires: int): + return 
f"http://minio.test/presigned/{bucket}/{obj}?e={expires}" + def presigned_get_object(self, bucket: str, obj: str, expires: int): + return f"http://minio.test/presigned/{bucket}/{obj}?e={expires}" + + +def setup(): + server.RUNTIME_CONFIG["minio"].update({ + "endpoint": "127.0.0.1:9000", + "public": "http://127.0.0.1:9000", + "access": "ak", + "secret": "sk", + "bucket": "doctest", + "secure": "false", + "prefix": "assets", + "store_final": "true", + "public_read": "true", + }) + fake = FakeMinio() + def _cur(): + return fake, "doctest", "http://127.0.0.1:9000", "assets" + server._minio_current = _cur # type: ignore + + +def run(): + setup() + app = server.app + c = TestClient(app) + url = "http://127.0.0.1:9000/doctest/assets/rewritten/%E6%B5%8B%E8%AF%95/a.md" + r = c.post("/minio/presign", data={"url": url, "expires": 7200}) + print("status:", r.status_code) + print(r.json()) + + +if __name__ == "__main__": + run() + diff --git a/docling/app/tests/run_slash_path_debug.py b/docling/app/tests/run_slash_path_debug.py new file mode 100644 index 0000000..f23f7b1 --- /dev/null +++ b/docling/app/tests/run_slash_path_debug.py @@ -0,0 +1,74 @@ +import io +import zipfile +from pathlib import Path +from fastapi.testclient import TestClient +import sys +from pathlib import Path as _Path +base = _Path(__file__).resolve().parents[2] +sys.path.insert(0, str(base)) +sys.path.insert(0, str(base / "docling")) +import app.server as server + + +class FakeMinio: + def __init__(self): + self.objs = {} + + def put_object(self, bucket_name: str, object_name: str, data: io.BytesIO, length: int, content_type: str): + self.objs[(bucket_name, object_name)] = data.read(length) + + def get_presigned_url(self, method: str, bucket: str, obj: str, expires: int): + return f"http://minio.test/presigned/{bucket}/{obj}" + + def presigned_get_object(self, bucket: str, obj: str, expires: int): + return f"http://minio.test/presigned/{bucket}/{obj}" + + +def setup(): + 
server.RUNTIME_CONFIG["minio"].update({ + "endpoint": "127.0.0.1:9000", + "public": "http://127.0.0.1:9000", + "access": "ak", + "secret": "sk", + "bucket": "test", + "secure": "false", + "prefix": "assets", + "store_final": "true", + "public_read": "true", + }) + fake = FakeMinio() + def _cur(): + return fake, "test", "http://127.0.0.1:9000", "assets" + server._minio_current = _cur # type: ignore + + +def main(): + setup() + app = server.app + c = TestClient(app) + tmp = Path("/tmp/run_slash_path_debug") + tmp.mkdir(parents=True, exist_ok=True) + + zpath = tmp / "pkg.zip" + md_dir = tmp / "docs" + img_dir = md_dir / "images" + img_dir.mkdir(parents=True, exist_ok=True) + (img_dir / "p.png").write_bytes(b"PNG") + (md_dir / "a.md").write_text("![](/images/p.png)", "utf-8") + + with zipfile.ZipFile(str(zpath), "w") as zf: + zf.write(str(md_dir / "a.md"), arcname="a.md") + zf.write(str(img_dir / "p.png"), arcname="images/p.png") + + with open(zpath, "rb") as fp: + files = {"file": ("pkg.zip", fp.read())} + r1 = c.post("/api/archive/stage", files=files) + sid = r1.json()["data"]["id"] + + r2 = c.post("/api/archive/process", data={"id": sid, "prefix": "assets", "versionId": "1007"}) + print("process:", r2.status_code) + print(r2.json()) + + +if __name__ == "__main__": + main() diff --git a/docling/app/tests/test_api_convert.py b/docling/app/tests/test_api_convert.py new file mode 100644 index 0000000..5b17305 --- /dev/null +++ b/docling/app/tests/test_api_convert.py @@ -0,0 +1,29 @@ +import unittest +from fastapi.testclient import TestClient +from pathlib import Path +import io + +from app.server import app + + +class ApiConvertTest(unittest.TestCase): + def setUp(self): + self.client = TestClient(app) + + def test_api_convert_markdown_file(self): + tmpdir = Path("./scratch_unittest") + tmpdir.mkdir(exist_ok=True) + p = tmpdir / "sample.md" + p.write_text("# Title\n\n::: note\nBody\n:::\n", "utf-8") + with open(p, "rb") as f: + files = {"file": (p.name, 
io.BytesIO(f.read()), "text/markdown")} + r = self.client.post("/api/convert", files=files, data={"export": "markdown"}) + self.assertEqual(r.status_code, 200) + j = r.json() + self.assertEqual(j.get("code"), 0) + self.assertIsInstance(j.get("data", {}).get("content"), str) + self.assertIn("!!! note", j["data"]["content"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/docling/app/tests/test_batch_upload_edge_cases.py b/docling/app/tests/test_batch_upload_edge_cases.py new file mode 100644 index 0000000..8b97a68 --- /dev/null +++ b/docling/app/tests/test_batch_upload_edge_cases.py @@ -0,0 +1,113 @@ +import io +import zipfile +from pathlib import Path +from fastapi.testclient import TestClient + +import app.server as server + + +class FakeMinio: + def __init__(self): + self.objs = {} + + def put_object(self, bucket_name: str, object_name: str, data: io.BytesIO, length: int, content_type: str): + self.objs[(bucket_name, object_name)] = data.read(length) + + def get_presigned_url(self, method: str, bucket: str, obj: str, expires: int): + return f"http://minio.test/presigned/{bucket}/{obj}" + + def presigned_get_object(self, bucket: str, obj: str, expires: int): + return f"http://minio.test/presigned/{bucket}/{obj}" + + +def setup_module(module=None): + server.RUNTIME_CONFIG["minio"].update({ + "endpoint": "127.0.0.1:9000", + "public": "http://127.0.0.1:9000", + "access": "ak", + "secret": "sk", + "bucket": "test", + "secure": "false", + "prefix": "assets", + "store_final": "true", + "public_read": "true", + }) + fake = FakeMinio() + def _cur(): + return fake, "test", "http://127.0.0.1:9000", "assets" + server._minio_current = _cur # type: ignore + + +def test_process_invalid_id(): + app = server.app + c = TestClient(app) + r = c.post("/api/archive/process", data={"id": "missing"}) + assert r.status_code == 200 + j = r.json() + assert j["code"] != 0 + + +def test_stage_unsupported_format_and_cleanup(tmp_path: Path): + app = server.app + c = 
TestClient(app) + rar_path = tmp_path / "pkg.rar" + rar_path.write_bytes(b"RAR") + with open(rar_path, "rb") as fp: + files = {"file": ("pkg.rar", fp.read())} + r1 = c.post("/api/archive/stage", files=files) + assert r1.status_code == 200 + sid = r1.json()["data"]["id"] + r2 = c.post("/api/archive/process", data={"id": sid}) + assert r2.status_code == 200 + j2 = r2.json() + assert j2["code"] != 0 + r3 = c.post("/api/archive/process", data={"id": sid}) + assert r3.status_code == 200 + j3 = r3.json() + assert j3["code"] != 0 + + +def test_upload_list_empty_lines_comments_and_urls(tmp_path: Path): + app = server.app + c = TestClient(app) + root = tmp_path / "listcase2" + root.mkdir(parents=True, exist_ok=True) + (root / "img.png").write_bytes(b"PNG") + (root / "a.md").write_text("![](img.png)", "utf-8") + (root / "b.txt").write_text("![](img.png)", "utf-8") + lines = ["", "# comment", "http://example.com/x.md", str(root / "a.md"), str(root / "b.txt")] + data_bytes = "\n".join(lines).encode("utf-8") + files = {"list_file": ("list.txt", data_bytes)} + r = c.post("/api/upload-list", files=files, data={"prefix": "assets", "versionId": "1005"}) + assert r.status_code == 200 + j = r.json() + assert j["code"] == 0 + assert j["data"]["count"] >= 2 + + +def test_archive_duplicate_filenames_tree(tmp_path: Path): + app = server.app + c = TestClient(app) + zpath = tmp_path / "dup.zip" + base = tmp_path / "src" + sub = base / "sub" + sub.mkdir(parents=True, exist_ok=True) + (base / "a.md").write_text("![](img.png)", "utf-8") + (base / "img.png").write_bytes(b"PNG") + (sub / "a.md").write_text("![](../img.png)", "utf-8") + with zipfile.ZipFile(str(zpath), "w") as zf: + zf.write(str(base / "a.md"), arcname="a.md") + zf.write(str(base / "img.png"), arcname="img.png") + zf.write(str(sub / "a.md"), arcname="sub/a.md") + with open(zpath, "rb") as fp: + files = {"file": ("dup.zip", fp.read())} + r1 = c.post("/api/archive/stage", files=files) + assert r1.status_code == 200 + sid = 
r1.json()["data"]["id"] + r2 = c.post("/api/archive/process", data={"id": sid, "prefix": "assets", "versionId": "1006"}) + assert r2.status_code == 200 + j = r2.json() + assert j["code"] == 0 + tree = j["data"]["import"]["tree"] + names = [n["name"] for n in tree] + assert "sub" in names or any((isinstance(n, dict) and n.get("type") == "FOLDER" and n.get("name") == "sub") for n in tree) diff --git a/docling/app/tests/test_batch_upload_endpoints.py b/docling/app/tests/test_batch_upload_endpoints.py new file mode 100644 index 0000000..44b56e4 --- /dev/null +++ b/docling/app/tests/test_batch_upload_endpoints.py @@ -0,0 +1,185 @@ +import io +import os +import zipfile +from pathlib import Path +from fastapi.testclient import TestClient + +import app.server as server + + +class FakeMinio: + def __init__(self): + self.objs = {} + + def put_object(self, bucket_name: str, object_name: str, data: io.BytesIO, length: int, content_type: str): + self.objs[(bucket_name, object_name)] = data.read(length) + + def get_presigned_url(self, method: str, bucket: str, obj: str, expires: int): + return f"http://minio.test/presigned/{bucket}/{obj}" + + def presigned_get_object(self, bucket: str, obj: str, expires: int): + return f"http://minio.test/presigned/{bucket}/{obj}" + + +def setup_module(module=None): + server.RUNTIME_CONFIG["minio"].update({ + "endpoint": "127.0.0.1:9000", + "public": "http://127.0.0.1:9000", + "access": "ak", + "secret": "sk", + "bucket": "test", + "secure": "false", + "prefix": "assets", + "store_final": "true", + "public_read": "true", + }) + + fake = FakeMinio() + + def _cur_cfg(_cfg): + return fake, "test", "http://127.0.0.1:9000", "assets" + server.minio_current = _cur_cfg # type: ignore + try: + server._minio_current = lambda: _cur_cfg(None) # type: ignore + except Exception: + pass + + +def test_archive_stage_and_process(tmp_path: Path): + app = server.app + c = TestClient(app) + + zpath = tmp_path / "pkg.zip" + md_dir = tmp_path / "docs" + img_dir = 
md_dir / "images" + img_dir.mkdir(parents=True, exist_ok=True) + (img_dir / "p.png").write_bytes(b"PNG") + (md_dir / "a.md").write_text("![](images/p.png)", "utf-8") + + with zipfile.ZipFile(str(zpath), "w") as zf: + zf.write(str(md_dir / "a.md"), arcname="a.md") + zf.write(str(img_dir / "p.png"), arcname="images/p.png") + + with open(zpath, "rb") as fp: + files = {"file": ("pkg.zip", fp.read())} + r1 = c.post("/api/archive/stage", files=files) + assert r1.status_code == 200 + j1 = r1.json() + assert j1["code"] == 0 and j1["data"]["id"] + sid = j1["data"]["id"] + + r2 = c.post("/api/archive/process", data={"id": sid, "prefix": "assets", "versionId": "1001"}) + assert r2.status_code == 200 + j2 = r2.json() + assert j2["code"] == 0 + assert j2["data"]["count"] >= 1 + assert "import" in j2["data"] + + +def test_upload_list(tmp_path: Path): + app = server.app + c = TestClient(app) + + root = tmp_path / "listcase" + root.mkdir(parents=True, exist_ok=True) + (root / "img.png").write_bytes(b"PNG") + (root / "b.md").write_text("![](img.png)", "utf-8") + + list_text = str(root / "b.md") + lf = io.BytesIO(list_text.encode("utf-8")) + + files = {"list_file": ("list.txt", lf.getvalue())} + r = c.post("/api/upload-list", files=files, data={"prefix": "assets", "versionId": "1002"}) + assert r.status_code == 200 + j = r.json() + assert j["code"] == 0 + assert j["data"]["count"] >= 1 + assert "import" in j["data"] + + +def test_archive_process_html_conversion(tmp_path: Path): + app = server.app + c = TestClient(app) + + zpath = tmp_path / "web.zip" + root = tmp_path / "web" + static = root / "static" + static.mkdir(parents=True, exist_ok=True) + (static / "pic.png").write_bytes(b"PNG") + + (root / "index.html").write_text("

T

", "utf-8") + pages = root / "pages" + pages.mkdir(parents=True, exist_ok=True) + (pages / "a.html").write_text("", "utf-8") + + with zipfile.ZipFile(str(zpath), "w") as zf: + for p in root.rglob("*"): + if p.is_file(): + zf.write(str(p), arcname=p.relative_to(root).as_posix()) + + with open(zpath, "rb") as fp: + files = {"file": ("web.zip", fp.read())} + r1 = c.post("/api/archive/stage", files=files) + assert r1.status_code == 200 + sid = r1.json()["data"]["id"] + + r2 = c.post("/api/archive/process", data={"id": sid, "prefix": "assets", "versionId": "1003"}) + assert r2.status_code == 200 + j = r2.json() + assert j["code"] == 0 + + files_list = j["data"]["files"] + names = {Path(str(f.get("source") or "")).name for f in files_list} + assert "index.md" in names + assert "a.md" in names + for f in files_list: + n = Path(str(f.get("source") or "")).name + if n in {"index.md", "a.md"}: + assert f.get("minio_url") + assert str(f.get("object_name") or "").startswith("assets/rewritten/") + + imp = j["data"]["import"] + nodes = [] + def walk(children): + for n in children: + if n.get("type") == "FILE": + nodes.append(n.get("name")) + elif n.get("type") == "FOLDER": + walk(n.get("children", [])) + walk(imp["tree"]) + assert "index" in nodes + assert "a" in nodes + + +def test_archive_process_html_abs_uppercase(tmp_path: Path): + app = server.app + c = TestClient(app) + + zpath = tmp_path / "web2.zip" + root = tmp_path / "web2" + (root / "static").mkdir(parents=True, exist_ok=True) + (root / "static" / "p.png").write_bytes(b"PNG") + + (root / "INDEX.HTML").write_text("", "utf-8") + (root / "pages").mkdir(parents=True, exist_ok=True) + (root / "pages" / "A.HTM").write_text("", "utf-8") + + with zipfile.ZipFile(str(zpath), "w") as zf: + for p in root.rglob("*"): + if p.is_file(): + zf.write(str(p), arcname=p.relative_to(root).as_posix()) + + with open(zpath, "rb") as fp: + files = {"file": ("web2.zip", fp.read())} + r1 = c.post("/api/archive/stage", files=files) + assert 
r1.status_code == 200 + sid = r1.json()["data"]["id"] + + r2 = c.post("/api/archive/process", data={"id": sid, "prefix": "assets", "versionId": "1004"}) + assert r2.status_code == 200 + j = r2.json() + assert j["code"] == 0 + files_list = j["data"]["files"] + names = {Path(str(f.get("source") or "")).name for f in files_list} + assert "INDEX.md" in names + assert "A.md" in names diff --git a/docling/app/tests/test_md_to_docx.py b/docling/app/tests/test_md_to_docx.py new file mode 100644 index 0000000..da65196 --- /dev/null +++ b/docling/app/tests/test_md_to_docx.py @@ -0,0 +1,53 @@ +import io +import os +import base64 +from pathlib import Path +from zipfile import ZipFile + +from app.services.docling_adapter import md_to_docx_bytes + + +def _make_png(tmpdir: Path) -> Path: + # Minimal 1x1 PNG + data = base64.b64decode( + b"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGNgYAAAAAMAASsJTYQAAAAASUVORK5CYII=" + ) + p = tmpdir / "tiny.png" + p.write_bytes(data) + return p + + +def test_md_to_docx_renders_blocks_and_media(tmp_path: Path): + png = _make_png(tmp_path) + html = ( + f"

标题

" + f"

内容

" + f"
print(\"hello\")\n
" + f"" + f"" + f"
AB
12
" + ) + + docx = md_to_docx_bytes( + html, + toc=True, + header_text="Left|Right", + footer_text="Footer", + filename_text="FileName", + product_name="Product", + document_name="DocName", + product_version="1.0", + document_version="2.0", + ) + + assert isinstance(docx, (bytes, bytearray)) and len(docx) > 0 + zf = ZipFile(io.BytesIO(docx)) + names = set(zf.namelist()) + assert any(n.startswith("word/") for n in names) + # Document XML should contain core texts + doc_xml = zf.read("word/document.xml").decode("utf-8") + for tok in ["标题", "内容", "print(\"hello\")", "A", "B", "1", "2"]: + assert tok in doc_xml + # Media should be present for the image + assert any(n.startswith("word/media/") for n in names) + diff --git a/docling/app/tests/test_word2markdown_inline_images.py b/docling/app/tests/test_word2markdown_inline_images.py new file mode 100644 index 0000000..ba95459 --- /dev/null +++ b/docling/app/tests/test_word2markdown_inline_images.py @@ -0,0 +1,51 @@ +import unittest +from pathlib import Path +import base64 +import tempfile +import sys + +# ensure 'app' package is importable +try: + root = Path(__file__).resolve().parents[2] + p = str(root) + if p not in sys.path: + sys.path.insert(0, p) +except Exception: + pass + +from docx import Document + +from app.services.word2markdown import convert_any + + +def _tiny_png_bytes() -> bytes: + return base64.b64decode( + b"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGNgYAAAAAMAASsJTYQAAAAASUVORK5CYII=" + ) + + +class InlineImagesTest(unittest.TestCase): + def test_paragraph_image_order(self): + tmp = Path(tempfile.mkdtemp(prefix="w2m_inline_test_")) + img = tmp / "tiny.png" + img.write_bytes(_tiny_png_bytes()) + + docx = tmp / "sample.docx" + doc = Document() + doc.add_paragraph("前文A") + doc.add_picture(str(img)) # 图片单独段落 + doc.add_paragraph("后文B") + doc.save(str(docx)) + + enc, md = convert_any(docx) + self.assertEqual(enc, "utf-8") + a_pos = md.find("前文A") + img_pos = md.find("![Image](data:") + b_pos = 
md.find("后文B") + # 顺序应为 A -> 图片 -> B + self.assertTrue(a_pos != -1 and img_pos != -1 and b_pos != -1) + self.assertTrue(a_pos < img_pos < b_pos) + + +if __name__ == "__main__": + unittest.main() diff --git a/docling/docling b/docling/docling new file mode 160000 index 0000000..ad97e52 --- /dev/null +++ b/docling/docling @@ -0,0 +1 @@ +Subproject commit ad97e5285126388847ba9a219ac73f006c759f09 diff --git a/docling/requirements.txt b/docling/requirements.txt new file mode 100644 index 0000000..63a7484 --- /dev/null +++ b/docling/requirements.txt @@ -0,0 +1,28 @@ +fastapi +uvicorn +python-multipart +minio +beautifulsoup4 +marko +markdown-it-py +mdit-py-plugins +pydantic-settings +filetype +python-docx +openpyxl +mammoth +weasyprint +reportlab +pypdfium2 +python-pptx +pluggy +requests +docling-core +docling-parse +docling-ibm-models +transformers +sentencepiece +safetensors +scipy +opencv-python +pymupdf diff --git a/docling/tests/debug_api.py b/docling/tests/debug_api.py new file mode 100644 index 0000000..73bfaa8 --- /dev/null +++ b/docling/tests/debug_api.py @@ -0,0 +1,17 @@ +import sys +from pathlib import Path +from fastapi.testclient import TestClient + +root = Path(__file__).resolve().parents[2] / "docling" +sys.path.insert(0, str(root)) +import app.server as server + +from docling.tests.test_api_prd import setup_module, PNG +setup_module() + +app = server.app +c = TestClient(app) +files = {"file": ("管理端使用说明 (1).pdf", b"%PDF-1.4\n")} +data = {"export": "markdown", "save": "true", "filename": "管理端使用说明 (1)"} +r = c.post("/api/convert", files=files, data=data) +print(r.json()) diff --git a/docling/tests/test_api_prd.py b/docling/tests/test_api_prd.py new file mode 100644 index 0000000..0631f5d --- /dev/null +++ b/docling/tests/test_api_prd.py @@ -0,0 +1,131 @@ +import os +import sys +import tempfile +from pathlib import Path +from fastapi.testclient import TestClient +import types + +root = Path(__file__).resolve().parents[2] / "docling" +sys.path.insert(0, 
str(root)) +dc = types.ModuleType('docling.document_converter') +class _DC: + def __init__(self, *a, **k): + pass + def convert(self, src): + class R: + class D: + def export_to_markdown(self, image_mode=None): + return "" + def export_to_html(self): + return "" + def export_to_json(self): + return "{}" + def export_to_doctags(self): + return "{}" + document = D() + return R() +class _PF: + def __init__(self, *a, **k): + pass +dc.DocumentConverter = _DC +dc.PdfFormatOption = _PF +sys.modules['docling.document_converter'] = dc +bm = types.ModuleType('docling.datamodel.base_models') +class _IF: + PDF = 'pdf' +bm.InputFormat = _IF +sys.modules['docling.datamodel.base_models'] = bm +pl = types.ModuleType('docling.pipeline.standard_pdf_pipeline') +class _SP: + def __init__(self, *a, **k): + pass +pl.StandardPdfPipeline = _SP +sys.modules['docling.pipeline.standard_pdf_pipeline'] = pl +po = types.ModuleType('docling.datamodel.pipeline_options') +class _PPO: + def __init__(self, *a, **k): + pass +po.PdfPipelineOptions = _PPO +sys.modules['docling.datamodel.pipeline_options'] = po +ct = types.ModuleType('docling_core.types.doc') +class _IRM: + PLACEHOLDER = 'placeholder' +ct.ImageRefMode = _IRM +sys.modules['docling_core.types.doc'] = ct +da = types.ModuleType('app.services.docling_adapter') +def _convert_source(src, export): + return ("", "text/markdown") +def _md2docx(md, **k): + return b"" +def _md2pdf(md, *a, **k): + return b"" +def _infer(source_url, upload_name): + return "document" +def _san(name): + return name or "document" +def _load(): + return {} +def _save(m): + return None +da.convert_source = _convert_source +da.md_to_docx_bytes = _md2docx +da.md_to_pdf_bytes_with_renderer = _md2pdf +da.infer_basename = _infer +da.sanitize_filename = _san +da.load_linkmap = _load +da.save_linkmap = _save +sys.modules['app.services.docling_adapter'] = da +import app.server as server + +class DummyMinio: + def __init__(self): + self.objs = [] + def put_object(self, 
bucket_name, object_name, data, length, content_type): + self.objs.append((bucket_name, object_name, length, content_type)) + def get_presigned_url(self, method, bucket, obj, expires=None): + return f"http://127.0.0.1:9000/{bucket}/{obj}" + def presigned_get_object(self, bucket, obj, expires=None): + return f"http://127.0.0.1:9000/{bucket}/{obj}" + +PNG = (b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x02\x00\x00\x00\x90wS\xde\x00\x00\x00\nIDATx\x9cc\xf8\x0f\x00\x01\x01\x01\x00\x18\xdd\xdc\xa4\x00\x00\x00\x00IEND\xaeB`\x82") + +def setup_module(module=None): + server._minio_current = lambda: (DummyMinio(), "doctest", "http://127.0.0.1:9000", "assets") + def fake_convert(src, export="markdown", engine=None): + d = Path(tempfile.mkdtemp(prefix="artifacts_")) + (d / "img.png").write_bytes(PNG) + return ("utf-8", "A\n\nB", str(d)) + server._converter_v2.convert = fake_convert + server._extract_pdf_images = lambda pdf_path: [("png", PNG), ("png", PNG)] + +import unittest + +class TestApiConvert(unittest.TestCase): + @classmethod + def setUpClass(cls): + setup_module() + def test_api_convert_save_true_returns_md_url(self): + app = server.app + mc = server._minio_current() + assert mc[1] == 'doctest' + c = TestClient(app) + files = {"file": ("管理端使用说明 (1).pdf", b"%PDF-1.4\n")} + data = {"export": "markdown", "save": "true", "filename": "管理端使用说明 (1)"} + r = c.post("/api/convert", files=files, data=data) + j = r.json() + self.assertEqual(j["code"], 0, str(j)) + self.assertTrue(j["data"]["name"].lower().endswith(".md")) + self.assertTrue(j["data"]["minio_url"].lower().endswith(".md")) + + def test_api_convert_save_false_returns_content_and_md_name(self): + app = server.app + mc = server._minio_current() + assert mc[1] == 'doctest' + c = TestClient(app) + files = {"file": ("文档.pdf", b"%PDF-1.4\n")} + data = {"export": "markdown", "save": "false", "filename": "文档"} + r = c.post("/api/convert", files=files, data=data) + j = r.json() + 
self.assertEqual(j["code"], 0, str(j)) + self.assertTrue(j["data"]["name"].lower().endswith(".md")) + self.assertIn("![image](", j["data"]["content"]) diff --git a/frontend/.gitignore b/frontend/.gitignore new file mode 100644 index 0000000..a547bf3 --- /dev/null +++ b/frontend/.gitignore @@ -0,0 +1,24 @@ +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* +lerna-debug.log* + +node_modules +dist +dist-ssr +*.local + +# Editor directories and files +.vscode/* +!.vscode/extensions.json +.idea +.DS_Store +*.suo +*.ntvs* +*.njsproj +*.sln +*.sw? diff --git a/frontend/.vscode/extensions.json b/frontend/.vscode/extensions.json new file mode 100644 index 0000000..a7cea0b --- /dev/null +++ b/frontend/.vscode/extensions.json @@ -0,0 +1,3 @@ +{ + "recommendations": ["Vue.volar"] +} diff --git a/frontend/README.md b/frontend/README.md new file mode 100644 index 0000000..33895ab --- /dev/null +++ b/frontend/README.md @@ -0,0 +1,5 @@ +# Vue 3 + TypeScript + Vite + +This template should help get you started developing with Vue 3 and TypeScript in Vite. 
The template uses Vue 3 ` + + diff --git a/frontend/package-lock.json b/frontend/package-lock.json new file mode 100644 index 0000000..07565f4 --- /dev/null +++ b/frontend/package-lock.json @@ -0,0 +1,1454 @@ +{ + "name": "frontend", + "version": "0.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "frontend", + "version": "0.0.0", + "dependencies": { + "marked": "^17.0.1", + "vue": "^3.5.24" + }, + "devDependencies": { + "@types/node": "^24.10.1", + "@vitejs/plugin-vue": "^6.0.1", + "@vue/tsconfig": "^0.8.1", + "typescript": "~5.9.3", + "vite": "^7.2.4", + "vue-tsc": "^3.1.4" + } + }, + "node_modules/@babel/helper-string-parser": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.27.1.tgz", + "integrity": "sha512-qMlSxKbpRlAridDExk92nSobyDdpPijUq2DW6oDnUqd0iOGxmQjyqhMIihI9+zv4LPyZdRje2cavWPbCbWm3eA==", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-validator-identifier": { + "version": "7.28.5", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.28.5.tgz", + "integrity": "sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q==", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/parser": { + "version": "7.28.5", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.28.5.tgz", + "integrity": "sha512-KKBU1VGYR7ORr3At5HAtUQ+TV3SzRCXmA/8OdDZiLDBIZxVyzXuztPjfLd3BV1PRAQGCMWWSHYhL0F8d5uHBDQ==", + "license": "MIT", + "dependencies": { + "@babel/types": "^7.28.5" + }, + "bin": { + "parser": "bin/babel-parser.js" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@babel/types": { + "version": "7.28.5", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.28.5.tgz", + "integrity": 
"sha512-qQ5m48eI/MFLQ5PxQj4PFaprjyCTLI37ElWMmNs0K8Lk3dVeOdNpB3ks8jc7yM5CDmVC73eMVk/trk3fgmrUpA==", + "license": "MIT", + "dependencies": { + "@babel/helper-string-parser": "^7.27.1", + "@babel/helper-validator-identifier": "^7.28.5" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.25.12.tgz", + "integrity": "sha512-Hhmwd6CInZ3dwpuGTF8fJG6yoWmsToE+vYgD4nytZVxcu1ulHpUQRAB1UJ8+N1Am3Mz4+xOByoQoSZf4D+CpkA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.25.12.tgz", + "integrity": "sha512-VJ+sKvNA/GE7Ccacc9Cha7bpS8nyzVv0jdVgwNDaR4gDMC/2TTRc33Ip8qrNYUcpkOHUT5OZ0bUcNNVZQ9RLlg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.25.12.tgz", + "integrity": "sha512-6AAmLG7zwD1Z159jCKPvAxZd4y/VTO0VkprYy+3N2FtJ8+BQWFXU+OxARIwA46c5tdD9SsKGZ/1ocqBS/gAKHg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.25.12.tgz", + "integrity": "sha512-5jbb+2hhDHx5phYR2By8GTWEzn6I9UqR11Kwf22iKbNpYrsmRB18aX/9ivc5cabcUiAT/wM+YIZ6SG9QO6a8kg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-arm64": { + 
"version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.25.12.tgz", + "integrity": "sha512-N3zl+lxHCifgIlcMUP5016ESkeQjLj/959RxxNYIthIg+CQHInujFuXeWbWMgnTo4cp5XVHqFPmpyu9J65C1Yg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.25.12.tgz", + "integrity": "sha512-HQ9ka4Kx21qHXwtlTUVbKJOAnmG1ipXhdWTmNXiPzPfWKpXqASVcWdnf2bnL73wgjNrFXAa3yYvBSd9pzfEIpA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.25.12.tgz", + "integrity": "sha512-gA0Bx759+7Jve03K1S0vkOu5Lg/85dou3EseOGUes8flVOGxbhDDh/iZaoek11Y8mtyKPGF3vP8XhnkDEAmzeg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.25.12.tgz", + "integrity": "sha512-TGbO26Yw2xsHzxtbVFGEXBFH0FRAP7gtcPE7P5yP7wGy7cXK2oO7RyOhL5NLiqTlBh47XhmIUXuGciXEqYFfBQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.25.12.tgz", + "integrity": "sha512-lPDGyC1JPDou8kGcywY0YILzWlhhnRjdof3UlcoqYmS9El818LLfJJc3PXXgZHrHCAKs/Z2SeZtDJr5MrkxtOw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], 
+ "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.25.12.tgz", + "integrity": "sha512-8bwX7a8FghIgrupcxb4aUmYDLp8pX06rGh5HqDT7bB+8Rdells6mHvrFHHW2JAOPZUbnjUpKTLg6ECyzvas2AQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.25.12.tgz", + "integrity": "sha512-0y9KrdVnbMM2/vG8KfU0byhUN+EFCny9+8g202gYqSSVMonbsCfLjUO+rCci7pM0WBEtz+oK/PIwHkzxkyharA==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.25.12.tgz", + "integrity": "sha512-h///Lr5a9rib/v1GGqXVGzjL4TMvVTv+s1DPoxQdz7l/AYv6LDSxdIwzxkrPW438oUXiDtwM10o9PmwS/6Z0Ng==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.25.12.tgz", + "integrity": "sha512-iyRrM1Pzy9GFMDLsXn1iHUm18nhKnNMWscjmp4+hpafcZjrr2WbT//d20xaGljXDBYHqRcl8HnxbX6uaA/eGVw==", + "cpu": [ + "mips64el" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.25.12.tgz", + "integrity": "sha512-9meM/lRXxMi5PSUqEXRCtVjEZBGwB7P/D4yT8UG/mwIdze2aV4Vo6U5gD3+RsoHXKkHCfSxZKzmDssVlRj1QQA==", + 
"cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.25.12.tgz", + "integrity": "sha512-Zr7KR4hgKUpWAwb1f3o5ygT04MzqVrGEGXGLnj15YQDJErYu/BGg+wmFlIDOdJp0PmB0lLvxFIOXZgFRrdjR0w==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.25.12.tgz", + "integrity": "sha512-MsKncOcgTNvdtiISc/jZs/Zf8d0cl/t3gYWX8J9ubBnVOwlk65UIEEvgBORTiljloIWnBzLs4qhzPkJcitIzIg==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.25.12.tgz", + "integrity": "sha512-uqZMTLr/zR/ed4jIGnwSLkaHmPjOjJvnm6TVVitAa08SLS9Z0VM8wIRx7gWbJB5/J54YuIMInDquWyYvQLZkgw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.25.12.tgz", + "integrity": "sha512-xXwcTq4GhRM7J9A8Gv5boanHhRa/Q9KLVmcyXHCTaM4wKfIpWkdXiMog/KsnxzJ0A1+nD+zoecuzqPmCRyBGjg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.25.12.tgz", + "integrity": 
"sha512-Ld5pTlzPy3YwGec4OuHh1aCVCRvOXdH8DgRjfDy/oumVovmuSzWfnSJg+VtakB9Cm0gxNO9BzWkj6mtO1FMXkQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.25.12.tgz", + "integrity": "sha512-fF96T6KsBo/pkQI950FARU9apGNTSlZGsv1jZBAlcLL1MLjLNIWPBkj5NlSz8aAzYKg+eNqknrUJ24QBybeR5A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.25.12.tgz", + "integrity": "sha512-MZyXUkZHjQxUvzK7rN8DJ3SRmrVrke8ZyRusHlP+kuwqTcfWLyqMOE3sScPPyeIXN/mDJIfGXvcMqCgYKekoQw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openharmony-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/openharmony-arm64/-/openharmony-arm64-0.25.12.tgz", + "integrity": "sha512-rm0YWsqUSRrjncSXGA7Zv78Nbnw4XL6/dzr20cyrQf7ZmRcsovpcRBdhD43Nuk3y7XIoW2OxMVvwuRvk9XdASg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.25.12.tgz", + "integrity": "sha512-3wGSCDyuTHQUzt0nV7bocDy72r2lI33QL3gkDNGkod22EsYl04sMf0qLb8luNKTOmgF/eDEDP5BFNwoBKH441w==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": 
"0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.25.12.tgz", + "integrity": "sha512-rMmLrur64A7+DKlnSuwqUdRKyd3UE7oPJZmnljqEptesKM8wx9J8gx5u0+9Pq0fQQW8vqeKebwNXdfOyP+8Bsg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.25.12.tgz", + "integrity": "sha512-HkqnmmBoCbCwxUKKNPBixiWDGCpQGVsrQfJoVGYLPT41XWF8lHuE5N6WhVia2n4o5QK5M4tYr21827fNhi4byQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.25.12.tgz", + "integrity": "sha512-alJC0uCZpTFrSL0CCDjcgleBXPnCrEAhTBILpeAp7M/OFgoqtAetfBzX0xM00MUsVVPpVjlPuMbREqnZCXaTnA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@jridgewell/sourcemap-codec": { + "version": "1.5.5", + "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz", + "integrity": "sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==", + "license": "MIT" + }, + "node_modules/@rolldown/pluginutils": { + "version": "1.0.0-beta.50", + "resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.50.tgz", + "integrity": "sha512-5e76wQiQVeL1ICOZVUg4LSOVYg9jyhGCin+icYozhsUzM+fHE7kddi1bdiE0jwVqTfkjba3jUFbEkoC9WkdvyA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@rollup/rollup-android-arm-eabi": { + "version": "4.53.3", + "resolved": 
"https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.53.3.tgz", + "integrity": "sha512-mRSi+4cBjrRLoaal2PnqH82Wqyb+d3HsPUN/W+WslCXsZsyHa9ZeQQX/pQsZaVIWDkPcpV6jJ+3KLbTbgnwv8w==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-android-arm64": { + "version": "4.53.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.53.3.tgz", + "integrity": "sha512-CbDGaMpdE9sh7sCmTrTUyllhrg65t6SwhjlMJsLr+J8YjFuPmCEjbBSx4Z/e4SmDyH3aB5hGaJUP2ltV/vcs4w==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-darwin-arm64": { + "version": "4.53.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.53.3.tgz", + "integrity": "sha512-Nr7SlQeqIBpOV6BHHGZgYBuSdanCXuw09hon14MGOLGmXAFYjx1wNvquVPmpZnl0tLjg25dEdr4IQ6GgyToCUA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-darwin-x64": { + "version": "4.53.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.53.3.tgz", + "integrity": "sha512-DZ8N4CSNfl965CmPktJ8oBnfYr3F8dTTNBQkRlffnUarJ2ohudQD17sZBa097J8xhQ26AwhHJ5mvUyQW8ddTsQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-freebsd-arm64": { + "version": "4.53.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.53.3.tgz", + "integrity": "sha512-yMTrCrK92aGyi7GuDNtGn2sNW+Gdb4vErx4t3Gv/Tr+1zRb8ax4z8GWVRfr3Jw8zJWvpGHNpss3vVlbF58DZ4w==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-freebsd-x64": { + "version": "4.53.3", + 
"resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.53.3.tgz", + "integrity": "sha512-lMfF8X7QhdQzseM6XaX0vbno2m3hlyZFhwcndRMw8fbAGUGL3WFMBdK0hbUBIUYcEcMhVLr1SIamDeuLBnXS+Q==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-linux-arm-gnueabihf": { + "version": "4.53.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.53.3.tgz", + "integrity": "sha512-k9oD15soC/Ln6d2Wv/JOFPzZXIAIFLp6B+i14KhxAfnq76ajt0EhYc5YPeX6W1xJkAdItcVT+JhKl1QZh44/qw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm-musleabihf": { + "version": "4.53.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.53.3.tgz", + "integrity": "sha512-vTNlKq+N6CK/8UktsrFuc+/7NlEYVxgaEgRXVUVK258Z5ymho29skzW1sutgYjqNnquGwVUObAaxae8rZ6YMhg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-gnu": { + "version": "4.53.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.53.3.tgz", + "integrity": "sha512-RGrFLWgMhSxRs/EWJMIFM1O5Mzuz3Xy3/mnxJp/5cVhZ2XoCAxJnmNsEyeMJtpK+wu0FJFWz+QF4mjCA7AUQ3w==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-musl": { + "version": "4.53.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.53.3.tgz", + "integrity": "sha512-kASyvfBEWYPEwe0Qv4nfu6pNkITLTb32p4yTgzFCocHnJLAHs+9LjUu9ONIhvfT/5lv4YS5muBHyuV84epBo/A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + 
"node_modules/@rollup/rollup-linux-loong64-gnu": { + "version": "4.53.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-gnu/-/rollup-linux-loong64-gnu-4.53.3.tgz", + "integrity": "sha512-JiuKcp2teLJwQ7vkJ95EwESWkNRFJD7TQgYmCnrPtlu50b4XvT5MOmurWNrCj3IFdyjBQ5p9vnrX4JM6I8OE7g==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-ppc64-gnu": { + "version": "4.53.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-gnu/-/rollup-linux-ppc64-gnu-4.53.3.tgz", + "integrity": "sha512-EoGSa8nd6d3T7zLuqdojxC20oBfNT8nexBbB/rkxgKj5T5vhpAQKKnD+h3UkoMuTyXkP5jTjK/ccNRmQrPNDuw==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-gnu": { + "version": "4.53.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.53.3.tgz", + "integrity": "sha512-4s+Wped2IHXHPnAEbIB0YWBv7SDohqxobiiPA1FIWZpX+w9o2i4LezzH/NkFUl8LRci/8udci6cLq+jJQlh+0g==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-musl": { + "version": "4.53.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.53.3.tgz", + "integrity": "sha512-68k2g7+0vs2u9CxDt5ktXTngsxOQkSEV/xBbwlqYcUrAVh6P9EgMZvFsnHy4SEiUl46Xf0IObWVbMvPrr2gw8A==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-s390x-gnu": { + "version": "4.53.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.53.3.tgz", + "integrity": "sha512-VYsFMpULAz87ZW6BVYw3I6sWesGpsP9OPcyKe8ofdg9LHxSbRMd7zrVrr5xi/3kMZtpWL/wC+UIJWJYVX5uTKg==", + "cpu": [ + "s390x" + ], + "dev": 
true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-gnu": { + "version": "4.53.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.53.3.tgz", + "integrity": "sha512-3EhFi1FU6YL8HTUJZ51imGJWEX//ajQPfqWLI3BQq4TlvHy4X0MOr5q3D2Zof/ka0d5FNdPwZXm3Yyib/UEd+w==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-musl": { + "version": "4.53.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.53.3.tgz", + "integrity": "sha512-eoROhjcc6HbZCJr+tvVT8X4fW3/5g/WkGvvmwz/88sDtSJzO7r/blvoBDgISDiCjDRZmHpwud7h+6Q9JxFwq1Q==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-openharmony-arm64": { + "version": "4.53.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-openharmony-arm64/-/rollup-openharmony-arm64-4.53.3.tgz", + "integrity": "sha512-OueLAWgrNSPGAdUdIjSWXw+u/02BRTcnfw9PN41D2vq/JSEPnJnVuBgw18VkN8wcd4fjUs+jFHVM4t9+kBSNLw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ] + }, + "node_modules/@rollup/rollup-win32-arm64-msvc": { + "version": "4.53.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.53.3.tgz", + "integrity": "sha512-GOFuKpsxR/whszbF/bzydebLiXIHSgsEUp6M0JI8dWvi+fFa1TD6YQa4aSZHtpmh2/uAlj/Dy+nmby3TJ3pkTw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-ia32-msvc": { + "version": "4.53.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.53.3.tgz", + "integrity": 
"sha512-iah+THLcBJdpfZ1TstDFbKNznlzoxa8fmnFYK4V67HvmuNYkVdAywJSoteUszvBQ9/HqN2+9AZghbajMsFT+oA==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-gnu": { + "version": "4.53.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-gnu/-/rollup-win32-x64-gnu-4.53.3.tgz", + "integrity": "sha512-J9QDiOIZlZLdcot5NXEepDkstocktoVjkaKUtqzgzpt2yWjGlbYiKyp05rWwk4nypbYUNoFAztEgixoLaSETkg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-msvc": { + "version": "4.53.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.53.3.tgz", + "integrity": "sha512-UhTd8u31dXadv0MopwGgNOBpUVROFKWVQgAg5N1ESyCz8AuBcMqm4AuTjrwgQKGDfoFuz02EuMRHQIw/frmYKQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@types/estree": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", + "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/node": { + "version": "24.10.1", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.10.1.tgz", + "integrity": "sha512-GNWcUTRBgIRJD5zj+Tq0fKOJ5XZajIiBroOF0yvj2bSU1WvNdYS/dn9UxwsujGW4JX06dnHyjV2y9rRaybH0iQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, + "node_modules/@vitejs/plugin-vue": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/@vitejs/plugin-vue/-/plugin-vue-6.0.2.tgz", + "integrity": "sha512-iHmwV3QcVGGvSC1BG5bZ4z6iwa1SOpAPWmnjOErd4Ske+lZua5K9TtAVdx0gMBClJ28DViCbSmZitjWZsWO3LA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@rolldown/pluginutils": "1.0.0-beta.50" + }, 
+ "engines": { + "node": "^20.19.0 || >=22.12.0" + }, + "peerDependencies": { + "vite": "^5.0.0 || ^6.0.0 || ^7.0.0", + "vue": "^3.2.25" + } + }, + "node_modules/@volar/language-core": { + "version": "2.4.23", + "resolved": "https://registry.npmjs.org/@volar/language-core/-/language-core-2.4.23.tgz", + "integrity": "sha512-hEEd5ET/oSmBC6pi1j6NaNYRWoAiDhINbT8rmwtINugR39loROSlufGdYMF9TaKGfz+ViGs1Idi3mAhnuPcoGQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@volar/source-map": "2.4.23" + } + }, + "node_modules/@volar/source-map": { + "version": "2.4.23", + "resolved": "https://registry.npmjs.org/@volar/source-map/-/source-map-2.4.23.tgz", + "integrity": "sha512-Z1Uc8IB57Lm6k7q6KIDu/p+JWtf3xsXJqAX/5r18hYOTpJyBn0KXUR8oTJ4WFYOcDzWC9n3IflGgHowx6U6z9Q==", + "dev": true, + "license": "MIT" + }, + "node_modules/@volar/typescript": { + "version": "2.4.23", + "resolved": "https://registry.npmjs.org/@volar/typescript/-/typescript-2.4.23.tgz", + "integrity": "sha512-lAB5zJghWxVPqfcStmAP1ZqQacMpe90UrP5RJ3arDyrhy4aCUQqmxPPLB2PWDKugvylmO41ljK7vZ+t6INMTag==", + "dev": true, + "license": "MIT", + "dependencies": { + "@volar/language-core": "2.4.23", + "path-browserify": "^1.0.1", + "vscode-uri": "^3.0.8" + } + }, + "node_modules/@vue/compiler-core": { + "version": "3.5.25", + "resolved": "https://registry.npmjs.org/@vue/compiler-core/-/compiler-core-3.5.25.tgz", + "integrity": "sha512-vay5/oQJdsNHmliWoZfHPoVZZRmnSWhug0BYT34njkYTPqClh3DNWLkZNJBVSjsNMrg0CCrBfoKkjZQPM/QVUw==", + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.28.5", + "@vue/shared": "3.5.25", + "entities": "^4.5.0", + "estree-walker": "^2.0.2", + "source-map-js": "^1.2.1" + } + }, + "node_modules/@vue/compiler-dom": { + "version": "3.5.25", + "resolved": "https://registry.npmjs.org/@vue/compiler-dom/-/compiler-dom-3.5.25.tgz", + "integrity": "sha512-4We0OAcMZsKgYoGlMjzYvaoErltdFI2/25wqanuTu+S4gismOTRTBPi4IASOjxWdzIwrYSjnqONfKvuqkXzE2Q==", + "license": "MIT", + "dependencies": { + 
"@vue/compiler-core": "3.5.25", + "@vue/shared": "3.5.25" + } + }, + "node_modules/@vue/compiler-sfc": { + "version": "3.5.25", + "resolved": "https://registry.npmjs.org/@vue/compiler-sfc/-/compiler-sfc-3.5.25.tgz", + "integrity": "sha512-PUgKp2rn8fFsI++lF2sO7gwO2d9Yj57Utr5yEsDf3GNaQcowCLKL7sf+LvVFvtJDXUp/03+dC6f2+LCv5aK1ag==", + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.28.5", + "@vue/compiler-core": "3.5.25", + "@vue/compiler-dom": "3.5.25", + "@vue/compiler-ssr": "3.5.25", + "@vue/shared": "3.5.25", + "estree-walker": "^2.0.2", + "magic-string": "^0.30.21", + "postcss": "^8.5.6", + "source-map-js": "^1.2.1" + } + }, + "node_modules/@vue/compiler-ssr": { + "version": "3.5.25", + "resolved": "https://registry.npmjs.org/@vue/compiler-ssr/-/compiler-ssr-3.5.25.tgz", + "integrity": "sha512-ritPSKLBcParnsKYi+GNtbdbrIE1mtuFEJ4U1sWeuOMlIziK5GtOL85t5RhsNy4uWIXPgk+OUdpnXiTdzn8o3A==", + "license": "MIT", + "dependencies": { + "@vue/compiler-dom": "3.5.25", + "@vue/shared": "3.5.25" + } + }, + "node_modules/@vue/language-core": { + "version": "3.1.5", + "resolved": "https://registry.npmjs.org/@vue/language-core/-/language-core-3.1.5.tgz", + "integrity": "sha512-FMcqyzWN+sYBeqRMWPGT2QY0mUasZMVIuHvmb5NT3eeqPrbHBYtCP8JWEUCDCgM+Zr62uuWY/qoeBrPrzfa78w==", + "dev": true, + "license": "MIT", + "dependencies": { + "@volar/language-core": "2.4.23", + "@vue/compiler-dom": "^3.5.0", + "@vue/shared": "^3.5.0", + "alien-signals": "^3.0.0", + "muggle-string": "^0.4.1", + "path-browserify": "^1.0.1", + "picomatch": "^4.0.2" + }, + "peerDependencies": { + "typescript": "*" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + } + } + }, + "node_modules/@vue/reactivity": { + "version": "3.5.25", + "resolved": "https://registry.npmjs.org/@vue/reactivity/-/reactivity-3.5.25.tgz", + "integrity": "sha512-5xfAypCQepv4Jog1U4zn8cZIcbKKFka3AgWHEFQeK65OW+Ys4XybP6z2kKgws4YB43KGpqp5D/K3go2UPPunLA==", + "license": "MIT", + "dependencies": { + "@vue/shared": "3.5.25" 
+ } + }, + "node_modules/@vue/runtime-core": { + "version": "3.5.25", + "resolved": "https://registry.npmjs.org/@vue/runtime-core/-/runtime-core-3.5.25.tgz", + "integrity": "sha512-Z751v203YWwYzy460bzsYQISDfPjHTl+6Zzwo/a3CsAf+0ccEjQ8c+0CdX1WsumRTHeywvyUFtW6KvNukT/smA==", + "license": "MIT", + "dependencies": { + "@vue/reactivity": "3.5.25", + "@vue/shared": "3.5.25" + } + }, + "node_modules/@vue/runtime-dom": { + "version": "3.5.25", + "resolved": "https://registry.npmjs.org/@vue/runtime-dom/-/runtime-dom-3.5.25.tgz", + "integrity": "sha512-a4WrkYFbb19i9pjkz38zJBg8wa/rboNERq3+hRRb0dHiJh13c+6kAbgqCPfMaJ2gg4weWD3APZswASOfmKwamA==", + "license": "MIT", + "dependencies": { + "@vue/reactivity": "3.5.25", + "@vue/runtime-core": "3.5.25", + "@vue/shared": "3.5.25", + "csstype": "^3.1.3" + } + }, + "node_modules/@vue/server-renderer": { + "version": "3.5.25", + "resolved": "https://registry.npmjs.org/@vue/server-renderer/-/server-renderer-3.5.25.tgz", + "integrity": "sha512-UJaXR54vMG61i8XNIzTSf2Q7MOqZHpp8+x3XLGtE3+fL+nQd+k7O5+X3D/uWrnQXOdMw5VPih+Uremcw+u1woQ==", + "license": "MIT", + "dependencies": { + "@vue/compiler-ssr": "3.5.25", + "@vue/shared": "3.5.25" + }, + "peerDependencies": { + "vue": "3.5.25" + } + }, + "node_modules/@vue/shared": { + "version": "3.5.25", + "resolved": "https://registry.npmjs.org/@vue/shared/-/shared-3.5.25.tgz", + "integrity": "sha512-AbOPdQQnAnzs58H2FrrDxYj/TJfmeS2jdfEEhgiKINy+bnOANmVizIEgq1r+C5zsbs6l1CCQxtcj71rwNQ4jWg==", + "license": "MIT" + }, + "node_modules/@vue/tsconfig": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/@vue/tsconfig/-/tsconfig-0.8.1.tgz", + "integrity": "sha512-aK7feIWPXFSUhsCP9PFqPyFOcz4ENkb8hZ2pneL6m2UjCkccvaOhC/5KCKluuBufvp2KzkbdA2W2pk20vLzu3g==", + "dev": true, + "license": "MIT", + "peerDependencies": { + "typescript": "5.x", + "vue": "^3.4.0" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + }, + "vue": { + "optional": true + } + } + }, + "node_modules/alien-signals": { + 
"version": "3.1.1", + "resolved": "https://registry.npmjs.org/alien-signals/-/alien-signals-3.1.1.tgz", + "integrity": "sha512-ogkIWbVrLwKtHY6oOAXaYkAxP+cTH7V5FZ5+Tm4NZFd8VDZ6uNMDrfzqctTZ42eTMCSR3ne3otpcxmqSnFfPYA==", + "dev": true, + "license": "MIT" + }, + "node_modules/csstype": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.2.3.tgz", + "integrity": "sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==", + "license": "MIT" + }, + "node_modules/entities": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", + "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, + "node_modules/esbuild": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.25.12.tgz", + "integrity": "sha512-bbPBYYrtZbkt6Os6FiTLCTFxvq4tt3JKall1vRwshA3fdVztsLAatFaZobhkBC8/BrPetoa0oksYoKXoG4ryJg==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.25.12", + "@esbuild/android-arm": "0.25.12", + "@esbuild/android-arm64": "0.25.12", + "@esbuild/android-x64": "0.25.12", + "@esbuild/darwin-arm64": "0.25.12", + "@esbuild/darwin-x64": "0.25.12", + "@esbuild/freebsd-arm64": "0.25.12", + "@esbuild/freebsd-x64": "0.25.12", + "@esbuild/linux-arm": "0.25.12", + "@esbuild/linux-arm64": "0.25.12", + "@esbuild/linux-ia32": "0.25.12", + "@esbuild/linux-loong64": "0.25.12", + "@esbuild/linux-mips64el": "0.25.12", + "@esbuild/linux-ppc64": "0.25.12", + "@esbuild/linux-riscv64": "0.25.12", + "@esbuild/linux-s390x": "0.25.12", + "@esbuild/linux-x64": "0.25.12", + "@esbuild/netbsd-arm64": "0.25.12", + 
"@esbuild/netbsd-x64": "0.25.12", + "@esbuild/openbsd-arm64": "0.25.12", + "@esbuild/openbsd-x64": "0.25.12", + "@esbuild/openharmony-arm64": "0.25.12", + "@esbuild/sunos-x64": "0.25.12", + "@esbuild/win32-arm64": "0.25.12", + "@esbuild/win32-ia32": "0.25.12", + "@esbuild/win32-x64": "0.25.12" + } + }, + "node_modules/estree-walker": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-2.0.2.tgz", + "integrity": "sha512-Rfkk/Mp/DL7JVje3u18FxFujQlTNR2q6QfMSMB7AvCBx91NGj/ba3kCfza0f6dVDbw7YlRf/nDrn7pQrCCyQ/w==", + "license": "MIT" + }, + "node_modules/fdir": { + "version": "6.5.0", + "resolved": "https://registry.npmjs.org/fdir/-/fdir-6.5.0.tgz", + "integrity": "sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12.0.0" + }, + "peerDependencies": { + "picomatch": "^3 || ^4" + }, + "peerDependenciesMeta": { + "picomatch": { + "optional": true + } + } + }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/magic-string": { + "version": "0.30.21", + "resolved": "https://registry.npmjs.org/magic-string/-/magic-string-0.30.21.tgz", + "integrity": "sha512-vd2F4YUyEXKGcLHoq+TEyCjxueSeHnFxyyjNp80yg0XV4vUhnDer/lvvlqM/arB5bXQN5K2/3oinyCRyx8T2CQ==", + "license": "MIT", + "dependencies": { + "@jridgewell/sourcemap-codec": "^1.5.5" + } + }, + "node_modules/marked": { + "version": "17.0.1", + "resolved": "https://registry.npmjs.org/marked/-/marked-17.0.1.tgz", + "integrity": 
"sha512-boeBdiS0ghpWcSwoNm/jJBwdpFaMnZWRzjA6SkUMYb40SVaN1x7mmfGKp0jvexGcx+7y2La5zRZsYFZI6Qpypg==", + "license": "MIT", + "bin": { + "marked": "bin/marked.js" + }, + "engines": { + "node": ">= 20" + } + }, + "node_modules/muggle-string": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/muggle-string/-/muggle-string-0.4.1.tgz", + "integrity": "sha512-VNTrAak/KhO2i8dqqnqnAHOa3cYBwXEZe9h+D5h/1ZqFSTEFHdM65lR7RoIqq3tBBYavsOXV84NoHXZ0AkPyqQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/nanoid": { + "version": "3.3.11", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz", + "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" + } + }, + "node_modules/path-browserify": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/path-browserify/-/path-browserify-1.0.1.tgz", + "integrity": "sha512-b7uo2UCUOYZcnF/3ID0lulOJi/bafxa1xPe7ZPsammBSpjSWQkjNxlt635YGS2MiR9GjvuXCtz2emr3jbsz98g==", + "dev": true, + "license": "MIT" + }, + "node_modules/picocolors": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", + "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", + "license": "ISC" + }, + "node_modules/picomatch": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz", + "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, + "node_modules/postcss": { + "version": "8.5.6", + "resolved": 
"https://registry.npmjs.org/postcss/-/postcss-8.5.6.tgz", + "integrity": "sha512-3Ybi1tAuwAP9s0r1UQ2J4n5Y0G05bJkpUIO0/bI9MhwmD70S5aTWbXGBwxHrelT+XM1k6dM0pk+SwNkpTRN7Pg==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "nanoid": "^3.3.11", + "picocolors": "^1.1.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, + "node_modules/rollup": { + "version": "4.53.3", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.53.3.tgz", + "integrity": "sha512-w8GmOxZfBmKknvdXU1sdM9NHcoQejwF/4mNgj2JuEEdRaHwwF12K7e9eXn1nLZ07ad+du76mkVsyeb2rKGllsA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/estree": "1.0.8" + }, + "bin": { + "rollup": "dist/bin/rollup" + }, + "engines": { + "node": ">=18.0.0", + "npm": ">=8.0.0" + }, + "optionalDependencies": { + "@rollup/rollup-android-arm-eabi": "4.53.3", + "@rollup/rollup-android-arm64": "4.53.3", + "@rollup/rollup-darwin-arm64": "4.53.3", + "@rollup/rollup-darwin-x64": "4.53.3", + "@rollup/rollup-freebsd-arm64": "4.53.3", + "@rollup/rollup-freebsd-x64": "4.53.3", + "@rollup/rollup-linux-arm-gnueabihf": "4.53.3", + "@rollup/rollup-linux-arm-musleabihf": "4.53.3", + "@rollup/rollup-linux-arm64-gnu": "4.53.3", + "@rollup/rollup-linux-arm64-musl": "4.53.3", + "@rollup/rollup-linux-loong64-gnu": "4.53.3", + "@rollup/rollup-linux-ppc64-gnu": "4.53.3", + "@rollup/rollup-linux-riscv64-gnu": "4.53.3", + "@rollup/rollup-linux-riscv64-musl": "4.53.3", + "@rollup/rollup-linux-s390x-gnu": "4.53.3", + "@rollup/rollup-linux-x64-gnu": "4.53.3", + "@rollup/rollup-linux-x64-musl": "4.53.3", + "@rollup/rollup-openharmony-arm64": "4.53.3", + "@rollup/rollup-win32-arm64-msvc": "4.53.3", + "@rollup/rollup-win32-ia32-msvc": "4.53.3", + 
"@rollup/rollup-win32-x64-gnu": "4.53.3", + "@rollup/rollup-win32-x64-msvc": "4.53.3", + "fsevents": "~2.3.2" + } + }, + "node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/tinyglobby": { + "version": "0.2.15", + "resolved": "https://registry.npmjs.org/tinyglobby/-/tinyglobby-0.2.15.tgz", + "integrity": "sha512-j2Zq4NyQYG5XMST4cbs02Ak8iJUdxRM0XI5QyxXuZOzKOINmWurp3smXu3y5wDcJrptwpSjgXHzIQxR0omXljQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "fdir": "^6.5.0", + "picomatch": "^4.0.3" + }, + "engines": { + "node": ">=12.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/SuperchupuDev" + } + }, + "node_modules/typescript": { + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "devOptional": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "7.16.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz", + "integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==", + "dev": true, + "license": "MIT" + }, + "node_modules/vite": { + "version": "7.2.4", + "resolved": "https://registry.npmjs.org/vite/-/vite-7.2.4.tgz", + "integrity": "sha512-NL8jTlbo0Tn4dUEXEsUg8KeyG/Lkmc4Fnzb8JXN/Ykm9G4HNImjtABMJgkQoVjOBN/j2WAwDTRytdqJbZsah7w==", + "dev": true, + "license": "MIT", + "dependencies": { + "esbuild": "^0.25.0", + "fdir": "^6.5.0", + "picomatch": "^4.0.3", + "postcss": "^8.5.6", + "rollup": 
"^4.43.0", + "tinyglobby": "^0.2.15" + }, + "bin": { + "vite": "bin/vite.js" + }, + "engines": { + "node": "^20.19.0 || >=22.12.0" + }, + "funding": { + "url": "https://github.com/vitejs/vite?sponsor=1" + }, + "optionalDependencies": { + "fsevents": "~2.3.3" + }, + "peerDependencies": { + "@types/node": "^20.19.0 || >=22.12.0", + "jiti": ">=1.21.0", + "less": "^4.0.0", + "lightningcss": "^1.21.0", + "sass": "^1.70.0", + "sass-embedded": "^1.70.0", + "stylus": ">=0.54.8", + "sugarss": "^5.0.0", + "terser": "^5.16.0", + "tsx": "^4.8.1", + "yaml": "^2.4.2" + }, + "peerDependenciesMeta": { + "@types/node": { + "optional": true + }, + "jiti": { + "optional": true + }, + "less": { + "optional": true + }, + "lightningcss": { + "optional": true + }, + "sass": { + "optional": true + }, + "sass-embedded": { + "optional": true + }, + "stylus": { + "optional": true + }, + "sugarss": { + "optional": true + }, + "terser": { + "optional": true + }, + "tsx": { + "optional": true + }, + "yaml": { + "optional": true + } + } + }, + "node_modules/vscode-uri": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/vscode-uri/-/vscode-uri-3.1.0.tgz", + "integrity": "sha512-/BpdSx+yCQGnCvecbyXdxHDkuk55/G3xwnC0GqY4gmQ3j+A+g8kzzgB4Nk/SINjqn6+waqw3EgbVF2QKExkRxQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/vue": { + "version": "3.5.25", + "resolved": "https://registry.npmjs.org/vue/-/vue-3.5.25.tgz", + "integrity": "sha512-YLVdgv2K13WJ6n+kD5owehKtEXwdwXuj2TTyJMsO7pSeKw2bfRNZGjhB7YzrpbMYj5b5QsUebHpOqR3R3ziy/g==", + "license": "MIT", + "dependencies": { + "@vue/compiler-dom": "3.5.25", + "@vue/compiler-sfc": "3.5.25", + "@vue/runtime-dom": "3.5.25", + "@vue/server-renderer": "3.5.25", + "@vue/shared": "3.5.25" + }, + "peerDependencies": { + "typescript": "*" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + } + } + }, + "node_modules/vue-tsc": { + "version": "3.1.5", + "resolved": "https://registry.npmjs.org/vue-tsc/-/vue-tsc-3.1.5.tgz", + 
"integrity": "sha512-L/G9IUjOWhBU0yun89rv8fKqmKC+T0HfhrFjlIml71WpfBv9eb4E9Bev8FMbyueBIU9vxQqbd+oOsVcDa5amGw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@volar/typescript": "2.4.23", + "@vue/language-core": "3.1.5" + }, + "bin": { + "vue-tsc": "bin/vue-tsc.js" + }, + "peerDependencies": { + "typescript": ">=5.0.0" + } + } + } +} diff --git a/frontend/package.json b/frontend/package.json new file mode 100644 index 0000000..9db4876 --- /dev/null +++ b/frontend/package.json @@ -0,0 +1,23 @@ +{ + "name": "frontend", + "private": true, + "version": "0.0.0", + "type": "module", + "scripts": { + "dev": "vite", + "build": "vue-tsc -b && vite build", + "preview": "vite preview" + }, + "dependencies": { + "marked": "^17.0.1", + "vue": "^3.5.24" + }, + "devDependencies": { + "@types/node": "^24.10.1", + "@vitejs/plugin-vue": "^6.0.1", + "@vue/tsconfig": "^0.8.1", + "typescript": "~5.9.3", + "vite": "^7.2.4", + "vue-tsc": "^3.1.4" + } +} diff --git a/frontend/public/vite.svg b/frontend/public/vite.svg new file mode 100644 index 0000000..e7b8dfb --- /dev/null +++ b/frontend/public/vite.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frontend/src/App.vue b/frontend/src/App.vue new file mode 100644 index 0000000..2d076c3 --- /dev/null +++ b/frontend/src/App.vue @@ -0,0 +1,101 @@ + + + + + + + diff --git a/frontend/src/assets/vue.svg b/frontend/src/assets/vue.svg new file mode 100644 index 0000000..770e9d3 --- /dev/null +++ b/frontend/src/assets/vue.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frontend/src/components/BatchProcess.vue b/frontend/src/components/BatchProcess.vue new file mode 100644 index 0000000..5a1586e --- /dev/null +++ b/frontend/src/components/BatchProcess.vue @@ -0,0 +1,448 @@ + + + + + diff --git a/frontend/src/components/ConfigModal.vue b/frontend/src/components/ConfigModal.vue new file mode 100644 index 0000000..81bfb56 --- /dev/null +++ b/frontend/src/components/ConfigModal.vue @@ -0,0 +1,494 @@ + + + + + diff 
--git a/frontend/src/components/DocToMd.vue b/frontend/src/components/DocToMd.vue new file mode 100644 index 0000000..12b4743 --- /dev/null +++ b/frontend/src/components/DocToMd.vue @@ -0,0 +1,337 @@ + + + + + diff --git a/frontend/src/components/HelloWorld.vue b/frontend/src/components/HelloWorld.vue new file mode 100644 index 0000000..b58e52b --- /dev/null +++ b/frontend/src/components/HelloWorld.vue @@ -0,0 +1,41 @@ + + + + + diff --git a/frontend/src/components/MdToDoc.vue b/frontend/src/components/MdToDoc.vue new file mode 100644 index 0000000..7b814b9 --- /dev/null +++ b/frontend/src/components/MdToDoc.vue @@ -0,0 +1,384 @@ + + + + + diff --git a/frontend/src/main.ts b/frontend/src/main.ts new file mode 100644 index 0000000..2425c0f --- /dev/null +++ b/frontend/src/main.ts @@ -0,0 +1,5 @@ +import { createApp } from 'vue' +import './style.css' +import App from './App.vue' + +createApp(App).mount('#app') diff --git a/frontend/src/services/api.ts b/frontend/src/services/api.ts new file mode 100644 index 0000000..65ad46f --- /dev/null +++ b/frontend/src/services/api.ts @@ -0,0 +1,305 @@ +export interface ConvertResponse { + code: number + msg: string + data: { + encoding?: string + content?: string + name?: string + minio_url?: string + minio_presigned_url?: string + export?: string + media_type?: string + } +} + +export interface ArchiveResponse { + code: number + msg: string + data: { + count: number + files: Array<{ source: string, minio_url?: string, minio_presigned_url?: string, object_name?: string, size?: number }> + import?: { versionId: number, tree: any[] } + } +} + +export interface MinioConfig { + endpoint: string + public?: string + access: string + secret: string + bucket: string + secure?: boolean + prefix?: string + store_final?: boolean + public_read?: boolean +} + +const API_BASE = '/api' +const CONFIG_BASE = '/config' +const API_BASE_KEY = 'app.api.base' +const CMS_BASE_KEY = 'cms.api.base' +const CMS_TOKEN_KEY = 'cms.api.token' + +function 
normalizeApiBase(v: string): string { + let s = String(v || '').trim() + if (!s) return '' + if (s.startsWith('//')) s = s.slice(2) + if (s.startsWith('/')) s = s.slice(1) + if (!/^https?:\/\//i.test(s)) s = `http://${s}` + return s.replace(/\/+$/, '') +} + +export function setApiBase(v: string) { + try { localStorage.setItem(API_BASE_KEY, normalizeApiBase(v)) } catch {} +} + +function baseUrl(): string { + try { + const ls = normalizeApiBase(localStorage.getItem(API_BASE_KEY) || '') + const env = normalizeApiBase((import.meta as any)?.env?.VITE_API_BASE_URL || '') + if (ls) { + console.debug('[API] using localStorage base:', ls) + return ls + } + if (env) { + console.debug('[API] using env base:', env) + return env + } + // No auto-fallback: use same-origin relative paths when not configured + return '' + } catch { + return '' + } +} + +function joinUrl(base: string, path: string): string { + const b = (base || '').replace(/\/+$/, '') + const p = path.startsWith('/') ? path : `/${path}` + return `${b}${p}` +} + +function apiFetch(path: string, init?: RequestInit) { + const b = baseUrl() + const url = b ? 
joinUrl(b, path) : path + console.debug('[API] fetch:', url) + return fetch(url, init) +} + +function normalizeEndpoint(ep: string): string { + let s = String(ep || '').trim() + if (!s) return '' + try { + const hasScheme = /^https?:\/\//i.test(s) + if (hasScheme) { + const u = new URL(s) + s = u.host + } + const first = s.split('/')[0] || '' + s = first + } catch { + const first = s.split('/')[0] || '' + s = first + } + return s +} + +export async function convertDoc(formData: FormData): Promise { + const res = await apiFetch(`${API_BASE}/convert`, { + method: 'POST', + body: formData + }) + return res.json() +} + +export async function uploadArchive(formData: FormData): Promise { + const res = await apiFetch(`${API_BASE}/upload-archive`, { + method: 'POST', + body: formData + }) + return res.json() +} + +export async function setMinioConfig(config: MinioConfig): Promise<{ ok: boolean }> { + const formData = new FormData() + Object.entries(config).forEach(([key, value]) => { + if (value !== undefined) { + const v = key === 'endpoint' ? normalizeEndpoint(String(value)) : String(value) + formData.append(key, v) + } + }) + + const res = await apiFetch(`${CONFIG_BASE}/minio`, { + method: 'POST', + body: formData + }) + return res.json() +} + +export async function testMinioConfig(config: MinioConfig): Promise<{ ok: boolean, connected: boolean, bucket_exists: boolean, error?: string, created?: boolean, hint?: string }> { + const formData = new FormData() + Object.entries(config).forEach(([key, value]) => { + if (value !== undefined) { + const v = key === 'endpoint' ? 
normalizeEndpoint(String(value)) : String(value) + formData.append(key, v) + } + }) + formData.append('create_if_missing', 'true') + + const res = await apiFetch(`${CONFIG_BASE}/minio/test`, { + method: 'POST', + body: formData + }) + return res.json() +} + +export async function createBucket(config: MinioConfig): Promise<{ ok: boolean, bucket_exists?: boolean, error?: string, hint?: string }> { + const formData = new FormData() + formData.append('endpoint', normalizeEndpoint(String(config.endpoint))) + formData.append('access', String(config.access)) + formData.append('secret', String(config.secret)) + formData.append('bucket', String(config.bucket)) + if (config.secure !== undefined) formData.append('secure', String(config.secure)) + if (config.public_read !== undefined) formData.append('public_read', String(config.public_read)) + const res = await apiFetch(`/config/minio/create-bucket`, { method: 'POST', body: formData }) + return res.json() +} + +export async function convertMd(formData: FormData): Promise { + return apiFetch(`/md/convert`, { + method: 'POST', + body: formData + }) +} + +export async function convertFolder(folderPath: string, prefix?: string): Promise<{ ok: boolean, count: number, files: any[] }> { + const form = new FormData() + form.append('folder_path', folderPath) + if (prefix) form.append('prefix', prefix) + const res = await apiFetch(`/md/convert-folder`, { method: 'POST', body: form }) + return res.json() +} + +export async function listProfiles(): Promise<{ ok: boolean, profiles: string[] }> { + try { + const res = await apiFetch(`/config/profiles`) + try { + return await res.json() + } catch { + return { ok: false, profiles: [] } + } + } catch { + return { ok: false, profiles: [] } + } +} + +export async function stageArchive(file: File, prefix?: string): Promise<{ code: number, msg: string, data: { id: string, name: string, size: number } }> { + const fd = new FormData() + fd.append('file', file) + if (prefix) fd.append('prefix', 
prefix) + const res = await apiFetch(`/api/archive/stage`, { method: 'POST', body: fd }) + return res.json() +} + +export async function processArchive(id: string, prefix?: string, versionId?: number): Promise { + const fd = new FormData() + fd.append('id', id) + if (prefix) fd.append('prefix', prefix) + if (versionId !== undefined) fd.append('versionId', String(versionId)) + const res = await apiFetch(`/api/archive/process`, { method: 'POST', body: fd }) + return res.json() +} + +export async function uploadList(file: File, prefix?: string, versionId?: number): Promise { + const fd = new FormData() + fd.append('list_file', file) + if (prefix) fd.append('prefix', prefix) + if (versionId !== undefined) fd.append('versionId', String(versionId)) + const res = await apiFetch(`/api/upload-list`, { method: 'POST', body: fd }) + return res.json() +} + +function cmsBaseUrl(): string { + try { + const val = localStorage.getItem(CMS_BASE_KEY) || '' + return normalizeApiBase(val) + } catch { return '' } +} + +export function setCmsConfig(base?: string, token?: string) { + try { + if (base !== undefined) localStorage.setItem(CMS_BASE_KEY, normalizeApiBase(base)) + if (token !== undefined) localStorage.setItem(CMS_TOKEN_KEY, String(token)) + } catch {} +} + +export async function sendImportToCms(payload: any): Promise<{ ok: boolean, status?: number, error?: string }> { + const base = cmsBaseUrl() + if (!base) return { ok: false, error: '未配置 CMS 接口地址' } + const url = joinUrl(base, '/cms/api/v1/document/directory/import') + const token = (localStorage.getItem(CMS_TOKEN_KEY) || '').trim() + const headers: Record = { 'Content-Type': 'application/json' } + if (token) headers['Authorization'] = `Bearer ${token}` + const res = await fetch(url, { method: 'POST', headers, body: JSON.stringify(payload) }) + if (!res.ok) return { ok: false, status: res.status, error: `HTTP ${res.status}` } + return { ok: true } +} + +export async function saveProfile(name: string): Promise<{ ok: boolean, 
name?: string }> { + const form = new FormData() + form.append('name', name) + const res = await apiFetch(`/config/save_profile`, { method: 'POST', body: form }) + return res.json() +} + +export async function loadProfile(name: string): Promise<{ ok: boolean, config?: any }> { + const res = await apiFetch(`/config/load_profile?name=${encodeURIComponent(name)}`) + return res.json() +} + +export async function getConfigSnapshot(): Promise<{ minio: MinioConfig, db: Record }> { + const res = await apiFetch(`/config`) + return res.json() +} + +export async function checkServerTime(config?: Partial): Promise<{ ok: boolean, diff_sec?: number, server_time?: string, local_time?: string, hint?: string, error?: string }>{ + try { + const ep = config?.endpoint ? normalizeEndpoint(String(config?.endpoint)) : '' + const pub = String(config?.public || '').trim() + const sec = config?.secure !== undefined ? String(!!config?.secure) : '' + const qs: string[] = [] + if (ep) qs.push(`endpoint=${encodeURIComponent(ep)}`) + if (pub) qs.push(`public=${encodeURIComponent(pub)}`) + if (sec) qs.push(`secure=${encodeURIComponent(sec)}`) + const q = qs.length ? 
`?${qs.join('&')}` : '' + let res = await apiFetch(`/system/time/check${q}`) + if (res.ok) { + try { return await res.json() } catch {} + } + res = await apiFetch(`/api/system/time/check${q}`) + if (res.ok) { + try { return await res.json() } catch {} + } + return { ok: false, error: `HTTP ${res.status}` } + } catch (e: any) { + return { ok: false, error: 'NETWORK' } + } +} + +export async function syncServerTime(method?: string, ntpServer?: string): Promise<{ ok: boolean, result?: any, check?: any }>{ + const fd = new FormData() + if (method) fd.append('method', method) + if (ntpServer) fd.append('ntp_server', ntpServer) + try { + let res = await apiFetch(`/system/time/sync`, { method: 'POST', body: fd }) + if (res.ok) { + try { return await res.json() } catch {} + } + res = await apiFetch(`/api/system/time/sync`, { method: 'POST', body: fd }) + if (res.ok) { + try { return await res.json() } catch {} + } + return { ok: false } + } catch { + return { ok: false } + } +} diff --git a/frontend/src/style.css b/frontend/src/style.css new file mode 100644 index 0000000..a636728 --- /dev/null +++ b/frontend/src/style.css @@ -0,0 +1,72 @@ +:root { + font-family: -apple-system, system-ui, Segoe UI, Roboto, Helvetica, Arial, sans-serif; + line-height: 1.5; + font-weight: 400; + color-scheme: light; + color: #111827; + background-color: #ffffff; + font-synthesis: none; + text-rendering: optimizeLegibility; + -webkit-font-smoothing: antialiased; + -moz-osx-font-smoothing: grayscale; +} + +a { + font-weight: 500; + color: #646cff; + text-decoration: inherit; +} +a:hover { + color: #535bf2; +} + +body { + margin: 0; + min-width: 320px; + min-height: 100vh; +} + +h1 { + font-size: 3.2em; + line-height: 1.1; +} + +button { + border-radius: 8px; + border: 1px solid transparent; + padding: 0.6em 1.2em; + font-size: 1em; + font-weight: 500; + font-family: inherit; + background-color: #2563eb; + color: #fff; + cursor: pointer; + transition: border-color 0.25s; +} +button:hover { + 
border-color: #646cff; +} +button:focus, +button:focus-visible { + outline: 4px auto -webkit-focus-ring-color; +} + +.card { + padding: 2em; +} + +#app { + max-width: 1280px; + margin: 0 auto; + padding: 2rem; + text-align: center; +} + +input, select, textarea { + background: #ffffff; + color: #111827; +} + +input::placeholder, textarea::placeholder { + color: #9ca3af; +} diff --git a/frontend/tests/check_frontend_prd.mjs b/frontend/tests/check_frontend_prd.mjs new file mode 100644 index 0000000..8276930 --- /dev/null +++ b/frontend/tests/check_frontend_prd.mjs @@ -0,0 +1,14 @@ +import fs from 'node:fs' +import path from 'node:path' + +const p = path.resolve(process.cwd(), 'frontend/src/components/DocToMd.vue') +const s = fs.readFileSync(p, 'utf-8') +if (!s.includes('const saveToServer = ref(true)')) { + console.error('saveToServer 默认未设置为 true') + process.exit(1) +} +if (!s.includes("mt.startsWith('text/markdown')")) { + console.error('renderedContent 未按 media_type 判断 Markdown') + process.exit(1) +} +console.log('前端源码检查通过') diff --git a/frontend/tsconfig.app.json b/frontend/tsconfig.app.json new file mode 100644 index 0000000..8d16e42 --- /dev/null +++ b/frontend/tsconfig.app.json @@ -0,0 +1,16 @@ +{ + "extends": "@vue/tsconfig/tsconfig.dom.json", + "compilerOptions": { + "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo", + "types": ["vite/client"], + + /* Linting */ + "strict": true, + "noUnusedLocals": true, + "noUnusedParameters": true, + "erasableSyntaxOnly": true, + "noFallthroughCasesInSwitch": true, + "noUncheckedSideEffectImports": true + }, + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.vue"] +} diff --git a/frontend/tsconfig.json b/frontend/tsconfig.json new file mode 100644 index 0000000..1ffef60 --- /dev/null +++ b/frontend/tsconfig.json @@ -0,0 +1,7 @@ +{ + "files": [], + "references": [ + { "path": "./tsconfig.app.json" }, + { "path": "./tsconfig.node.json" } + ] +} diff --git a/frontend/tsconfig.node.json 
b/frontend/tsconfig.node.json new file mode 100644 index 0000000..8a67f62 --- /dev/null +++ b/frontend/tsconfig.node.json @@ -0,0 +1,26 @@ +{ + "compilerOptions": { + "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.node.tsbuildinfo", + "target": "ES2023", + "lib": ["ES2023"], + "module": "ESNext", + "types": ["node"], + "skipLibCheck": true, + + /* Bundler mode */ + "moduleResolution": "bundler", + "allowImportingTsExtensions": true, + "verbatimModuleSyntax": true, + "moduleDetection": "force", + "noEmit": true, + + /* Linting */ + "strict": true, + "noUnusedLocals": true, + "noUnusedParameters": true, + "erasableSyntaxOnly": true, + "noFallthroughCasesInSwitch": true, + "noUncheckedSideEffectImports": true + }, + "include": ["vite.config.ts"] +} diff --git a/frontend/vite.config.ts b/frontend/vite.config.ts new file mode 100644 index 0000000..8349f89 --- /dev/null +++ b/frontend/vite.config.ts @@ -0,0 +1,47 @@ +import { defineConfig } from 'vite' +import vue from '@vitejs/plugin-vue' + +// https://vite.dev/config/ +export default defineConfig({ + plugins: [vue()], + server: { + proxy: { + '/api': { + target: 'http://localhost:8000', + changeOrigin: true, + configure: (proxy) => { + const p = proxy as any + p.timeout = 120000 + p.proxyTimeout = 120000 + }, + }, + '/config': { + target: 'http://localhost:8000', + changeOrigin: true, + configure: (proxy) => { + const p = proxy as any + p.timeout = 120000 + p.proxyTimeout = 120000 + }, + }, + '/md': { + target: 'http://localhost:8000', + changeOrigin: true, + configure: (proxy) => { + const p = proxy as any + p.timeout = 120000 + p.proxyTimeout = 120000 + }, + }, + '/refresh.js': { + target: 'http://localhost:8000', + changeOrigin: true, + configure: (proxy) => { + const p = proxy as any + p.timeout = 120000 + p.proxyTimeout = 120000 + }, + } + } + } +}) diff --git a/import.json b/import.json new file mode 100644 index 0000000..b4e61b7 --- /dev/null +++ b/import.json @@ -0,0 +1,129 @@ +{ + "versionId": 1001, + 
"tree": [ + { + "name": "数+产品手册-MD源文件", + "type": "FOLDER", + "children": [ + { + "name": "DMDRS诊断工具使用手册", + "type": "FILE", + "sortOrder": 100, + "files": [ + { + "languageId": 1, + "objectName": "assets/rewritten/数+产品手册-MD源文件/DMDRS诊断工具使用手册.md", + "fileName": "DMDRS诊断工具使用手册.md", + "fileSize": 16402 + } + ] + }, + { + "name": "DMDRS控制台命令手册", + "type": "FILE", + "sortOrder": 101, + "files": [ + { + "languageId": 1, + "objectName": "assets/rewritten/数+产品手册-MD源文件/DMDRS控制台命令手册.md", + "fileName": "DMDRS控制台命令手册.md", + "fileSize": 314014 + } + ] + }, + { + "name": "DMDRS搭建手册-Oracle", + "type": "FILE", + "sortOrder": 102, + "files": [ + { + "languageId": 1, + "objectName": "assets/rewritten/数+产品手册-MD源文件/DMDRS搭建手册-Oracle.md", + "fileName": "DMDRS搭建手册-Oracle.md", + "fileSize": 159147 + } + ] + }, + { + "name": "DMDRS DRS API使用手册", + "type": "FILE", + "sortOrder": 103, + "files": [ + { + "languageId": 1, + "objectName": "assets/rewritten/数+产品手册-MD源文件/DMDRS DRS API使用手册.md", + "fileName": "DMDRS DRS API使用手册.md", + "fileSize": 51475 + } + ] + }, + { + "name": "DMDRS参考手册", + "type": "FILE", + "sortOrder": 104, + "files": [ + { + "languageId": 1, + "objectName": "assets/rewritten/数+产品手册-MD源文件/DMDRS参考手册.md", + "fileName": "DMDRS参考手册.md", + "fileSize": 265225 + } + ] + }, + { + "name": "定时调度工具使用手册", + "type": "FILE", + "sortOrder": 105, + "files": [ + { + "languageId": 1, + "objectName": "assets/rewritten/数+产品手册-MD源文件/定时调度工具使用手册.md", + "fileName": "定时调度工具使用手册.md", + "fileSize": 104637 + } + ] + }, + { + "name": "DMDRS搭建手册-DM8", + "type": "FILE", + "sortOrder": 106, + "files": [ + { + "languageId": 1, + "objectName": "assets/rewritten/数+产品手册-MD源文件/DMDRS搭建手册-DM8.md", + "fileName": "DMDRS搭建手册-DM8.md", + "fileSize": 217027 + } + ] + }, + { + "name": "DMDRS产品介绍", + "type": "FILE", + "sortOrder": 107, + "files": [ + { + "languageId": 1, + "objectName": "assets/rewritten/数+产品手册-MD源文件/DMDRS产品介绍.md", + "fileName": "DMDRS产品介绍.md", + "fileSize": 94882 + } + ] + }, + { + "name": "DMDRS 
DRS语言使用手册", + "type": "FILE", + "sortOrder": 108, + "files": [ + { + "languageId": 1, + "objectName": "assets/rewritten/数+产品手册-MD源文件/DMDRS DRS语言使用手册.md", + "fileName": "DMDRS DRS语言使用手册.md", + "fileSize": 177757 + } + ] + } + ], + "sortOrder": 100 + } + ] +} \ No newline at end of file diff --git a/k8s/deployment.yaml b/k8s/deployment.yaml new file mode 100644 index 0000000..827cbfe --- /dev/null +++ b/k8s/deployment.yaml @@ -0,0 +1,51 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: funmd-convert + namespace: default + labels: + app: funmd-convert +spec: + replicas: 1 + selector: + matchLabels: + app: funmd-convert + template: + metadata: + labels: + app: funmd-convert + spec: + containers: + - name: funmd-convert + image: funmd-convert:latest + imagePullPolicy: IfNotPresent + ports: + - containerPort: 8000 + env: + - name: MINIO_ENDPOINT + value: "minio-service:9000" + - name: MINIO_ACCESS_KEY + value: "minioadmin" + - name: MINIO_SECRET_KEY + value: "minioadmin" + - name: MINIO_BUCKET + value: "funmd" + resources: + limits: + cpu: "1000m" + memory: "1Gi" + requests: + cpu: "200m" + memory: "256Mi" + livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 5 + periodSeconds: 5 diff --git a/package_offline.sh b/package_offline.sh new file mode 100644 index 0000000..776f3eb --- /dev/null +++ b/package_offline.sh @@ -0,0 +1,60 @@ +#!/bin/bash +set -euo pipefail + +# Image naming +# 注意:Docker 镜像名必须为小写且不含下划线,这里使用 funmd-convert +IMAGE_NAME="funmd-convert" +IMAGE_TAG="latest" +OUTPUT_DIR="$(pwd)" +# 导出的 tar 文件命名为项目名 FunMD_Convert 以便识别 +OUTPUT_TAR="${OUTPUT_DIR}/FunMD_Convert.tar" + +echo "[1/4] Building Docker image ${IMAGE_NAME}:${IMAGE_TAG}" +docker build -t ${IMAGE_NAME}:${IMAGE_TAG} . 
+ +echo "[2/4] Saving image to ${OUTPUT_TAR}" +docker save ${IMAGE_NAME}:${IMAGE_TAG} -o "${OUTPUT_TAR}" + +echo "[3/4] Image size and sha256" +ls -lh "${OUTPUT_TAR}" +SHA256=$(shasum -a 256 "${OUTPUT_TAR}" | awk '{print $1}') +echo "sha256=${SHA256}" + +cat <<'EON' + +[4/4] Transfer and run on offline server: + +1) 复制镜像包到服务器,例如 /opt/FunMD_Convert/FunMD_Convert.tar + scp FunMD_Convert.tar user@server:/opt/FunMD_Convert/ + +2) 加载镜像: + docker load -i /opt/FunMD_Convert/FunMD_Convert.tar + +3) 验证镜像: + docker images | grep funmd-convert + +4) 启动容器(后端端口 8000,同时托管前端 /ui): + docker run -d \ + -p 8000:8000 \ + --name FunMD_Convert \ + --restart unless-stopped \ + funmd-convert:${IMAGE_TAG} + +5) 访问: + 后端健康检查: http://<服务器IP>:8000/health + 前端页面: http://<服务器IP>:8000/ui/ + +6) (可选)配置 MinIO: + curl -X POST \ + -F endpoint=10.9.35.31:9000 \ + -F public=http://10.9.35.31:9000 \ + -F access=你的AK \ + -F secret=你的SK \ + -F bucket=file-cms \ + -F secure=false \ + -F public_read=true \ + http://<服务器IP>:8000/config/minio + +EON + +echo "[Done] Offline package ready: ${OUTPUT_TAR}" diff --git a/prd.md b/prd.md new file mode 100644 index 0000000..9d7a1d5 --- /dev/null +++ b/prd.md @@ -0,0 +1,63 @@ +目前这里有两个项目文件夹,docling和word2markdown,它们全部都是用来处理文档格式的转换。 + +这里的转换主要包括两个转换方向: + +一、其他格式转换成markdown格式 +1、通用格式转换器:docling/docling里面有对应的转换函数,主要是docx和pdf等格式转换成markdown格式; +2、定制格式转换器:word2markdown里面有对应的的转换函数,主要是如下几种特殊情况的处理 +- 所有非UTF-8编码的文档,都需要先转换成UTF-8编码,再进行后续的处理; +- doc/docx格式中有单行单列的表格需处理为markdown格式中的代码高亮格式; +- HTML格式中table标签名全部需要小写; +- 删除HTML格式中标签后面多余的换行; +- 所有markdown中的::: 的提示块 改为 !!! 
的提示块; +- 所有markdown文件的渲染可以设置半角和全角,默认是全角; +- 所有转换文件中有图片(或其他静态资源)的存入MinIO中并返回对应的图片URL(或其他静态资源的URL); +- 所有转换文件中的URL相对路径都需转换成MinIO中的URL; +3、文件上传有若干情况 +- 单个文件上传; +- 多个文件上传,通过资源路径或URL上传; +- 非加密压缩文件的上传; +4、批量上传功能: +- 可以通过上传一个包含多个文件路径或URL的文本文件,来批量上传多个文件; +- 可以上传压缩包(zip、tar.gz等),分步骤完成,先上传压缩包,前端显示上传文件成功,点击开始转换按钮,再解压缩,将文件中的markdown文件中的相对路径图片等静态资源地址全部转化为minio的地址,同时按文件结构将转化好的md文件URL(也是存在minio中)按json示例文件的格式返回,前端给出相应的处理信息,方便调试; +- 示例的json文件路径为:/Users/fanyang/Desktop/FunMD_Convert/批量导入目录树.json; +- 处理完之后将压缩文件和服务端已解压本地文件都删除(但是就要确保已转换好的md文件和对应的资源都存放在minio中且都返回了URL到正确的json中); +- 确保图像等相对路径资源上传到minio中并正确返回URL; +- 当markdown和对应的image资源在同一个文件目录下的时候,转化后路径提升一级,就是正确的markdown文件放在上一级目录下,原文件夹就不需要了,且根据这个规则返回import.json + +二、接口规范 +以上所有的能力全部通过python的FastAPI实现,接口规范如下: +1、所有的接口都需要通过POST方法调用; +2、所有的接口都需要返回JSON格式的响应体; +3、所有的接口都需要在响应体中包含一个code字段,用于表示接口调用是否成功; +4、所有的接口都需要在响应体中包含一个msg字段,用于表示接口调用的结果信息; +5、所有的接口都需要在响应体中包含一个data字段,用于表示接口调用的结果数据; + +三、接口实现 +1、所有的接口都需要在FastAPI中实现; +2、所有的接口都需要在实现中包含一个try...except...语句,用于捕获异常并返回对应的错误信息; +3、所有的接口都需要在实现中包含一个return语句,用于返回对应的响应体; + +四、接口重构 +1、将docling和word2markdown中的转换函数,封装成一个类,类中包含一个convert方法,用于实现格式转换; +2、将所有的接口,封装成一个类,类中包含一个convert方法,用于实现接口调用; +3、将所有的类,封装成一个模块,模块中包含一个app对象,用于启动FastAPI应用; + +五、接口测试 +1、使用python的unittest模块,编写测试用例,测试所有的接口是否符合规范; +2、使用python的requests模块,模拟调用所有的接口,测试接口是否正常工作; + +六、提供前端页面(参考docling/app/static/index.html来实现和优化) +1. 通过vue3来重构这个页面; +2. 
可以配置以上的能力,实现针对不同用户的不同配置文件,来提供不同的能力; + +七、部署 +1、将所有的代码,打包成一个docker镜像; +2、将这个docker镜像,可以本地ssh部署到服务器中(服务器无法拉取镜像); +3、配置一个ingress,用于暴露这个应用; + +八、其他 +1、所有的代码,都需要符合PEP8规范; +2、所有的接口,都需要在注释中包含一个@api标签,用于标识这是一个接口; +3、所有的类,都需要在注释中包含一个@class标签,用于标识这是一个类; +4、所有的函数,都需要在注释中包含一个@function标签,用于标识这是一个函数; diff --git a/修改总结.md b/修改总结.md new file mode 100644 index 0000000..54db53f --- /dev/null +++ b/修改总结.md @@ -0,0 +1,460 @@ +toolName: search_codebase + +status: success + +query: Locate existing Python code for docling and word2markdown converters, FastAPI app, and any existing front-end reference at docling/app/static/index.html. Identify project structure and existing functions for converting docx/pdf to markdown and custom conversions. + + + + +toolName: view_folder + +status: success + +/Users/fanyang/Desktop/FunMD_Convert + + + + +toolName: view_files + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/docling/app/server.py + + + + +toolName: view_files + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/docling/app/services/docling_adapter.py + + + + +toolName: view_files + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/docling/app/services/docling_adapter.py + + + + +toolName: search_by_regex + +status: success + +query: def .*\( + + + + +toolName: view_files + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/word2markdown/convert_to_md.py + + + + +toolName: view_files + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/word2markdown/convert_to_md.py + + + + +toolName: view_files + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/docling/app/static/index.html + + + + +toolName: view_files + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/docling/app/static/index.html + + + + +toolName: view_files + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/docling/app/static/index.html + + + + +toolName: search_by_regex + +status: 
success + +query: convert-folder + + + + +toolName: view_files + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/docling/app/server.py + + + + +toolName: view_files + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/docling/app/server.py + + + + +toolName: view_files + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/docling/app/server.py + + + + +toolName: search_by_regex + +status: success + +query: Dockerfile + + + + +toolName: view_files + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/docling/app/tests/test_md_to_docx.py + + + + +toolName: view_folder + +status: success + +/Users/fanyang/Desktop/FunMD_Convert/word2markdown + + + + +toolName: view_files + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/word2markdown/app.py + + + + +toolName: todo_write + +status: success + +Todos updated: 5 items + + + + +toolName: ApplyPatch + +status: running + + + + + +toolName: write_to_file + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/docling/app/services/unified_converter.py + +content: undefined + + + + +toolName: ApplyPatch + +status: running + + + + + +toolName: edit_file_update + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/docling/app/server.py + +changes: + + + + +toolName: ApplyPatch + +status: running + + + + + +toolName: edit_file_update + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/docling/app/server.py + +changes: + + + + +toolName: view_files + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/docling/app/server.py + + + + +toolName: view_files + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/docling/app/server.py + + + + +toolName: ApplyPatch + +status: running + + + + + +toolName: edit_file_update + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/docling/app/server.py + +changes: + + + + +toolName: ApplyPatch + +status: 
running + + + + + +toolName: write_to_file + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/docling/app/tests/test_api_convert.py + +content: undefined + + + + +toolName: run_command + +status: success + +command: python -m pytest -q undefined + + + + +toolName: ApplyPatch + +status: running + + + + + +toolName: edit_file_update + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/docling/app/tests/test_api_convert.py + +changes: + + + + +toolName: run_command + +status: success + +command: python -m unittest -q undefined + + + + +toolName: run_command + +status: success + +command: python -m unittest discover -s tests -p "test_*.py" -v undefined + + + + +toolName: todo_write + +status: success + +Todos updated: 5 items + + + + +toolName: view_files + +status: success + + +filePath: /Users/fanyang/Desktop/FunMD_Convert/docling/app/services/unified_converter.py + +**重构概要** +- 统一了两套转换能力(Docling 通用转换 + Word2Markdown 定制转换),新增统一入口类并在后端暴露标准化接口。 +- 按文档规范统一了接口输出结构(POST、JSON、包含 code/msg/data),并为错误处理加上 try/except 包装。 +- 增强 Markdown/HTML 的后处理:将 `:::` 提示块转换为 `!!!`,HTML 表格标签强制小写,去除标签后无意义换行。 +- 扩展批量上传:支持 ZIP/TAR 压缩包上传,自动解压、资源上传至 MinIO、Markdown 资产重写并回传可用 URL。 +- 预置了单元测试样例,覆盖统一转换和接口返回结构,后续安装依赖后可运行。 + +**核心改动** +- 统一转换类 + - 新增 `FormatConverter`,可一键选择使用 Docling 或 Word2Markdown 引擎,统一返回目标格式。 + - 文件位置:`docling/app/services/unified_converter.py:57`,核心方法 `convert` 在 `docling/app/services/unified_converter.py:61`。 + - 行为要点: + - 引擎选择:传 `engine=word2markdown|pandoc|custom` 时走 `word2markdown.convert_any`;默认走 `Docling.DocumentConverter`。 + - Markdown后处理:将 `:::` 提示块转换为 `!!!`(`docling/app/services/unified_converter.py:33`);HTML表格标签小写化(`docling/app/services/unified_converter.py:22`)。 +- 标准化接口 + - 新增统一转换接口 `POST /api/convert`,提供 file 或 source_url 二选一;可选 `export=markdown|html|json|doctags` 和 `engine`;支持 `save=true` 上传到 MinIO。 + - 位置:`docling/app/server.py:1413`,统一返回结构在 `docling/app/server.py:1380` 和 `docling/app/server.py:1396`。 + - 返回字段: + - 不保存:`data = 
{encoding, content, name, media_type}` + - 保存到 MinIO:`data = {encoding, name, media_type, minio_url, minio_presigned_url}` + - 所有响应遵循 `{"code": 0|500, "msg": "...", "data": ...}`,实现于 `docling/app/server.py:1363` 和 `docling/app/server.py:1368`。 +- 压缩包上传 + - 新增 `POST /api/upload-archive`:上传 ZIP/TAR 包,后端解压→批量上传资源至 MinIO→重写 `*.md` 内部相对资源链接为 MinIO URL→返回列表。 + - 位置:`docling/app/server.py:1413`(函数起始见 `docling/app/server.py:1414`)。 + - 返回数据:`data={count, files:[{source, minio_url, minio_presigned_url, mappings}]}`。 + - 用到了现有资产上传与链接重写能力:`_bulk_upload_assets` 在 `docling/app/server.py:1079`,`_rewrite_md_assets_to_minio` 在 `docling/app/server.py:885`。 +- MinIO 配置与管理 + - 运行时配置体 `RUNTIME_CONFIG` 保持不变,动态载入/保存配置、桶管理接口均已可用。相关路径: + - 载入/保存配置:`docling/app/server.py:349`、`docling/app/server.py:369`、`docling/app/server.py:386` + - 桶列表/创建:`docling/app/server.py:287`、`docling/app/server.py:304` +- Markdown → DOCX/PDF + - 保留原有 `POST /md/convert` 接口能力(页眉页脚、logo、封面、CSS 模板、目录 TOC、MinIO 存储等),详见 `docling/app/server.py:473`。 + - 实际转换使用 `md_to_docx_bytes` 与 `md_to_pdf_bytes_with_renderer`(`docling/app/services/docling_adapter.py:105`、`docling/app/services/docling_adapter.py:676`),并已支持页眉页脚与封面元信息渲染(`normalize_html` 流水线在 `docling/app/services/docling_adapter.py:433` 到 `docling/app/services/docling_adapter.py:662`)。 + +**满足 PRD 的关键点** +- 统一 POST + JSON + code/msg/data:新增 `api_convert` 与 `api_upload_archive`,所有返回体包含 `code/msg/data`(`docling/app/server.py:1363`、`docling/app/server.py:1368`、`docling/app/server.py:1413`)。 +- 异常捕获:统一接口有 `try...except...` 并转换为标准结构(`docling/app/server.py:1408`、`docling/app/server.py:1476`),全局异常中间件已存在(`docling/app/server.py:1231`、`docling/app/server.py:1245`、`docling/app/server.py:1255`)。 +- 转换特性: + - 非 UTF-8 文档编码识别与归一化由 `word2markdown.convert_to_md` 内置实现(例如 `word2markdown/convert_to_md.py:309`、`word2markdown/convert_to_md.py:326`)。 + - DOC/DOCX 单行单列表格识别为代码块由 `word2markdown/convert_to_md.py:196`、`word2markdown/convert_to_md.py:200` 完成。 + - HTML `<table>` 等表格标签统一小写由统一转换类的 HTML
后处理完成(`docling/app/services/unified_converter.py:22`)。 + - 删除 HTML 标签后多余换行由小写化时一并规整(`docling/app/services/unified_converter.py:29`)。 + - Markdown 中 `:::` 提示块 → `!!!` 由统一转换类完成(`docling/app/services/unified_converter.py:33` 到 `docling/app/services/unified_converter.py:55`)。 + - 图片/静态资源入库 MinIO 与 URL 重写:单文件在 `md_convert` 内部,批量在 `md_convert_folder` 与 `md_upload_folder`(`docling/app/server.py:1099`、`docling/app/server.py:1151`),新 `api_upload-archive` 同样实现(`docling/app/server.py:1413`)。 +- 文件上传场景: + - 单文件上传:`POST /api/convert` 提供 `file`(`docling/app/server.py:1369`)。 + - URL 上传:`POST /api/convert` 提供 `source_url`(`docling/app/server.py:1369`)。 + - 压缩包:`POST /api/upload-archive`(`docling/app/server.py:1413`)。 +- 前端页面 + - 现有测试页仍可用(`docling/app/static/index.html`)。Vue3 重构计划后续推进。 +- 部署 + - 现有 FastAPI 服务与静态页集成,后续可将 `docling/app` 打包为镜像;MinIO 配置已支持运行时动态调整。 + +**接口用法示例** +- 单文件转 Markdown(不保存) + - `curl -s -X POST http://localhost:8000/api/convert -F file=@/path/sample.docx -F export=markdown -F save=false` +- URL 转换为 Markdown(保存到 MinIO) + - `curl -s -X POST http://localhost:8000/api/convert -F source_url="https://example.com/sample.pdf" -F export=markdown -F save=true` +- 压缩包批量上传并重写链接 + - `curl -s -X POST http://localhost:8000/api/upload-archive -F file=@/path/docs.zip -F prefix=assets` + +**教育性说明** +- 引擎路由与契约统一 + - 统一转换类屏蔽底层差异,默认走 Docling,遇到需要 Pandoc/定制规则时切 Word2Markdown,实现“契约优先”的稳定输出。 +- Markdown/HTML 后处理 + - 将 `:::` → `!!!` 保证提示块在统一渲染器(包括 PDF/DOCX 流程)下保持一致;HTML 表格小写化避免某些渲染器的大小写敏感差异。 +- 资产重写与 MinIO + - 多层回退:先批量上传图片资源,再按路径和搜索根解析相对链接;任何失败的映射保留在 `mappings` 中,便于后续修复。 +- 防御式异常处理 + - 接口返回始终有 `code/msg/data`,即使遇到异常也不会破坏前端调用流程;需要详细排错时可开启 `debug` 查询参数或请求头(`docling/app/server.py:1216`)。 + +**已添加的测试** +- 位置:`docling/app/tests/test_api_convert.py:1` +- 用例:上传一个包含 `:::` 的 Markdown 文件,验证返回 `code=0`、`data.content` 存在并包含 `!!! 
note`。 +- 运行方法(需要安装依赖): + - `python -m unittest discover -s tests -p "test_*.py" -v` + +**后续建议** +- 前端 Vue3 重构:将 `docling/app/static/index.html` 的交互改造为 Vue3,抽象配置表单与批量任务的状态管理。 +- 镜像化与部署:整理 `requirements`,增加 Dockerfile 与构建脚本,结合 Ingress 暴露。 +- 配置管理:完成配置文件的多环境切换与权限控制,支持 Profile 导入导出(已有后端支持,`docling/app/server.py:349`、`docling/app/server.py:369`、`docling/app/server.py:386`)。 +- 测试矩阵:在 `tests` 中补充更多文件类型与异常场景(损坏 DOCX、RTF、伪 DOCX、MDX 复杂 JSX 等),并加上 MinIO 交互的集成测试。 + +如果你现在就要联调,可以直接用上述 curl 命令触发新接口;如需我继续把前端测试页改造以调用新接口或推进 Vue3 页面,我可以立刻开始。 \ No newline at end of file diff --git a/批量导入目录树.json b/批量导入目录树.json new file mode 100644 index 0000000..21d9440 --- /dev/null +++ b/批量导入目录树.json @@ -0,0 +1,202 @@ +{ + "openapi": "3.0.1", + "info": { + "title": "默认模块", + "description": "认证授权微服务API文档,支持用户管理、角色管理、登录认证等功能", + "version": "1.0.0", + "contact": { + "name": "开发团队", + "email": "dev@example.com" + }, + "license": { + "name": "Apache 2.0", + "url": "https://www.apache.org/licenses/LICENSE-2.0.html" + } + }, + "tags": [ + { + "name": "文档目录管理" + } + ], + "paths": { + "/cms/api/v1/document/directory/import": { + "post": { + "summary": "批量导入目录树", + "deprecated": false, + "description": "根据提供的目录树JSON批量生成目录及文件,默认覆盖该版本下的草稿内容", + "operationId": "importDirectoryTree", + "tags": [ + "文档目录管理" + ], + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DirectoryImportRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "OK", + "content": { + "*/*": { + "schema": { + "$ref": "#/components/schemas/ResultVoid" + } + } + }, + "headers": {} + } + }, + "security": [ + { + "Bearer Authentication": [] + } + ] + } + } + }, + "components": { + "schemas": { + "ResultVoid": { + "type": "object", + "properties": { + "code": { + "type": "integer", + "format": "int32" + }, + "message": { + "type": "string" + }, + "data": { + "type": "object", + "properties": {} + } + } + }, + "DirectoryImportFile": { + 
"required": [ + "languageId", + "objectName" + ], + "type": "object", + "properties": { + "languageId": { + "type": "integer", + "description": "语言ID", + "format": "int64", + "example": 1 + }, + "objectName": { + "type": "string", + "description": "MinIO对象名", + "example": "version_1001/dir_10/xxx.md" + }, + "fileName": { + "type": "string", + "description": "文件名(用于展示)", + "example": "install.md" + }, + "fileSize": { + "type": "integer", + "description": "文件大小(字节)", + "format": "int64", + "example": 1024 + } + }, + "description": "目录文件信息" + }, + "DirectoryImportNode": { + "required": [ + "name", + "type" + ], + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "目录名称", + "example": "安装指南" + }, + "type": { + "type": "string", + "description": "节点类型:FOLDER/FILE", + "enum": [ + "FOLDER", + "FILE" + ], + "example": "FOLDER" + }, + "sortOrder": { + "type": "integer", + "description": "排序顺序,值越小越靠前", + "format": "int32", + "example": 100 + }, + "description": { + "type": "string", + "description": "目录描述,仅在FOLDER节点生效", + "example": "该章节包含快速开始说明" + }, + "children": { + "type": "array", + "description": "子目录列表(仅在 FOLDER 类型下使用)", + "items": { + "$ref": "#/components/schemas/DirectoryImportNode" + } + }, + "files": { + "type": "array", + "description": "文件列表(仅在 FILE 类型下使用)", + "items": { + "$ref": "#/components/schemas/DirectoryImportFile" + } + } + }, + "description": "目录导入节点" + }, + "DirectoryImportRequest": { + "required": [ + "tree", + "versionId" + ], + "type": "object", + "properties": { + "versionId": { + "type": "integer", + "description": "文档版本ID", + "format": "int64", + "example": 1001 + }, + "tree": { + "type": "array", + "description": "目录树", + "items": { + "$ref": "#/components/schemas/DirectoryImportNode" + } + } + }, + "description": "目录批量导入请求" + } + }, + "responses": {}, + "securitySchemes": { + "Bearer Authentication": { + "type": "http", + "description": "输入Token,格式:Bearer {token}", + "scheme": "bearer", + "bearerFormat": 
"JWT" + } + } + }, + "servers": [], + "security": [ + { + "Bearer Authentication": [] + } + ] +} \ No newline at end of file