Import project files

This commit is contained in:
2026-01-07 17:18:26 +08:00
parent 7d9fff2c34
commit 0b07e63b76
66 changed files with 11497 additions and 0 deletions

View File

@@ -0,0 +1,80 @@
import io
import os
import zipfile
from pathlib import Path
from fastapi.testclient import TestClient
import sys
from pathlib import Path as _Path
base = _Path(__file__).resolve().parents[2]
sys.path.insert(0, str(base))
sys.path.insert(0, str(base / "docling"))
import app.server as server
class FakeMinio:
    """In-memory stand-in for the MinIO client used by this debug script."""

    def __init__(self):
        # Uploaded payloads, keyed by (bucket_name, object_name).
        self.objs = {}

    @staticmethod
    def _fake_url(bucket: str, obj: str) -> str:
        """Build the fixed fake presigned URL for a bucket/object pair."""
        return f"http://minio.test/presigned/{bucket}/{obj}"

    def put_object(self, bucket_name: str, object_name: str, data: io.BytesIO, length: int, content_type: str):
        """Keep up to ``length`` bytes read from ``data``; ``content_type`` is ignored."""
        key = (bucket_name, object_name)
        payload = data.read(length)
        self.objs[key] = payload

    def get_presigned_url(self, method: str, bucket: str, obj: str, expires: int):
        """Return a fake URL; ``method`` and ``expires`` do not affect it."""
        return self._fake_url(bucket, obj)

    def presigned_get_object(self, bucket: str, obj: str, expires: int):
        """Return the same fake URL as ``get_presigned_url``; ``expires`` is ignored."""
        return self._fake_url(bucket, obj)
def setup():
    """Overwrite the server's MinIO settings and install an in-memory client.

    Updates ``server.RUNTIME_CONFIG["minio"]`` in place and replaces
    ``server._minio_current`` with a function that always returns the same
    ``FakeMinio`` instance together with fixed bucket/URL/prefix strings.
    """
    minio_settings = dict(
        endpoint="127.0.0.1:9000",
        public="http://127.0.0.1:9000",
        access="ak",
        secret="sk",
        bucket="test",
        secure="false",
        prefix="assets",
        store_final="true",
        public_read="true",
    )
    server.RUNTIME_CONFIG["minio"].update(minio_settings)

    stub = FakeMinio()

    def _current():
        return stub, "test", "http://127.0.0.1:9000", "assets"

    server._minio_current = _current  # type: ignore
def main():
    """Drive the archive stage/process and upload-list endpoints once.

    Builds a tiny markdown + image package under
    ``/tmp/run_batch_upload_debug``, posts it through the in-process test
    client, and prints every response.
    """
    setup()
    client = TestClient(server.app)

    workdir = Path("/tmp/run_batch_upload_debug")
    workdir.mkdir(parents=True, exist_ok=True)
    archive_path = workdir / "pkg.zip"
    docs = workdir / "docs"
    images = docs / "images"
    images.mkdir(parents=True, exist_ok=True)
    picture = images / "p.png"
    markdown = docs / "a.md"
    picture.write_bytes(b"PNG")
    markdown.write_text("![](images/p.png)", encoding="utf-8")

    with zipfile.ZipFile(str(archive_path), "w") as bundle:
        bundle.write(str(markdown), arcname="a.md")
        bundle.write(str(picture), arcname="images/p.png")

    # Stage the archive, then process it using the id returned by staging.
    upload = {"file": ("pkg.zip", archive_path.read_bytes())}
    staged = client.post("/api/archive/stage", files=upload)
    print("stage status:", staged.status_code, staged.json())
    stage_id = staged.json()["data"]["id"]
    processed = client.post(
        "/api/archive/process",
        data={"id": stage_id, "prefix": "assets", "versionId": "1001"},
    )
    print("process status:", processed.status_code, processed.json())

    # The list file holds a single line: the path of the markdown file.
    list_payload = str(markdown).encode("utf-8")
    listed = client.post(
        "/api/upload-list",
        files={"list_file": ("list.txt", list_payload)},
        data={"prefix": "assets", "versionId": "1002"},
    )
    print("upload-list status:", listed.status_code, listed.json())


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,75 @@
import io
import os
from pathlib import Path
from fastapi.testclient import TestClient
import sys
from pathlib import Path as _Path
base = _Path(__file__).resolve().parents[2]
sys.path.insert(0, str(base))
sys.path.insert(0, str(base / "docling"))
import app.server as server
class FakeMinio:
    """Minimal fake of the MinIO client: remembers uploads, fabricates URLs."""

    def __init__(self):
        # (bucket_name, object_name) -> bytes that were "uploaded".
        self.objs = {}

    def put_object(self, bucket_name: str, object_name: str, data: io.BytesIO, length: int, content_type: str):
        """Read at most ``length`` bytes from ``data`` and remember them."""
        stored = data.read(length)
        self.objs[(bucket_name, object_name)] = stored

    def _url_for(self, bucket: str, obj: str) -> str:
        """Shared URL builder for both presign entry points."""
        return f"http://minio.test/presigned/{bucket}/{obj}"

    def get_presigned_url(self, method: str, bucket: str, obj: str, expires: int):
        """Fake presign; neither ``method`` nor ``expires`` changes the result."""
        return self._url_for(bucket, obj)

    def presigned_get_object(self, bucket: str, obj: str, expires: int):
        """Fake GET presign; ``expires`` is ignored."""
        return self._url_for(bucket, obj)
def setup():
    """Install fake MinIO configuration and a fake client on the server module.

    Mutates ``server.RUNTIME_CONFIG["minio"]`` and rebinds
    ``server._minio_current`` to a closure over one ``FakeMinio`` instance.
    """
    overrides = {
        "endpoint": "127.0.0.1:9000",
        "public": "http://127.0.0.1:9000",
        "access": "ak",
        "secret": "sk",
        "bucket": "test",
        "secure": "false",
        "prefix": "assets",
        "store_final": "true",
        "public_read": "true",
    }
    minio_cfg = server.RUNTIME_CONFIG["minio"]
    minio_cfg.update(overrides)

    client_stub = FakeMinio()

    def _fake_current():
        # Same 4-tuple on every call so all requests share one fake client.
        return client_stub, "test", "http://127.0.0.1:9000", "assets"

    server._minio_current = _fake_current  # type: ignore
def main():
    """Post a freshly built markdown folder to ``/md/convert-folder`` and print the reply.

    The scratch tree lives under ``/tmp/run_convert_folder_debug`` and is
    rebuilt from nothing on every run.
    """
    # Local import: only this cleanup step needs shutil.
    import shutil

    setup()
    app = server.app
    c = TestClient(app)

    tmp = Path("/tmp/run_convert_folder_debug")
    # The earlier hand-rolled cleanup only unlinked files and swallowed every
    # error, so stale sub-directories survived and the final rmdir always
    # failed silently. rmtree removes the whole tree; ignore_errors keeps the
    # best-effort behaviour (including when the directory does not exist).
    shutil.rmtree(tmp, ignore_errors=True)
    tmp.mkdir(parents=True, exist_ok=True)

    # Folder name deliberately contains non-ASCII characters and a '+'.
    root = tmp / "数+产品手册-MD源文件"
    sub = root / "DMDRS_DRS_Language_User_Manual"
    img = sub / "images"
    img.mkdir(parents=True, exist_ok=True)
    (img / "p.png").write_bytes(b"PNG")
    (sub / "a.md").write_text("# Title\n\n![](images/p.png)", "utf-8")

    r = c.post("/md/convert-folder", data={"folder_path": str(root), "prefix": "assets"})
    print("convert-folder:", r.status_code)
    print(r.json())


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,97 @@
import io
import zipfile
from pathlib import Path
from fastapi.testclient import TestClient
import sys
from pathlib import Path as _Path
base = _Path(__file__).resolve().parents[2]
sys.path.insert(0, str(base))
sys.path.insert(0, str(base / "docling"))
import app.server as server
class FakeMinio:
    """Fake object-store client for the edge-case debug run."""

    # Prefix shared by every fabricated presigned URL.
    def __init__(self):
        # Maps (bucket_name, object_name) to the stored bytes.
        self.objs = {}

    def put_object(self, bucket_name: str, object_name: str, data: io.BytesIO, length: int, content_type: str):
        """Record ``data.read(length)`` under the bucket/object key."""
        self.objs.__setitem__((bucket_name, object_name), data.read(length))

    def get_presigned_url(self, method: str, bucket: str, obj: str, expires: int):
        """Return a deterministic fake URL built from ``bucket`` and ``obj`` only."""
        fake_url = f"http://minio.test/presigned/{bucket}/{obj}"
        return fake_url

    def presigned_get_object(self, bucket: str, obj: str, expires: int):
        """Return a deterministic fake URL built from ``bucket`` and ``obj`` only."""
        fake_url = f"http://minio.test/presigned/{bucket}/{obj}"
        return fake_url
def setup():
    """Replace the server's MinIO wiring with in-memory fakes.

    After this call ``server.RUNTIME_CONFIG["minio"]`` carries the test values
    below and ``server._minio_current()`` returns a shared ``FakeMinio``.
    """
    server.RUNTIME_CONFIG["minio"].update(
        endpoint="127.0.0.1:9000",
        public="http://127.0.0.1:9000",
        access="ak",
        secret="sk",
        bucket="test",
        secure="false",
        prefix="assets",
        store_final="true",
        public_read="true",
    )
    shared_fake = FakeMinio()

    def _provide():
        return shared_fake, "test", "http://127.0.0.1:9000", "assets"

    server._minio_current = _provide  # type: ignore
def run():
    """Print the server's replies for several archive/upload-list edge cases.

    Covered, in order: processing an unknown id, staging and processing a
    ``.rar`` payload twice, an upload list with blank/comment/URL lines, and a
    zip containing two files named ``a.md`` in different folders.
    """
    setup()
    client = TestClient(server.app)

    # 1) Process with an id that was never staged.
    missing = client.post("/api/archive/process", data={"id": "missing"})
    print("invalid-id:", missing.status_code, missing.json())

    workdir = Path("/tmp/run_edge_cases_debug")
    workdir.mkdir(parents=True, exist_ok=True)

    # 2) Stage a file that is only named like a RAR, then process it twice.
    rar_file = workdir / "pkg.rar"
    rar_file.write_bytes(b"RAR")
    staged_rar = client.post("/api/archive/stage", files={"file": ("pkg.rar", rar_file.read_bytes())})
    rar_id = staged_rar.json()["data"]["id"]
    first_try = client.post("/api/archive/process", data={"id": rar_id})
    print("rar-process:", first_try.status_code, first_try.json())
    second_try = client.post("/api/archive/process", data={"id": rar_id})
    print("rar-reprocess:", second_try.status_code, second_try.json())

    # 3) Upload list mixing an empty line, a comment, a URL and two local paths.
    list_root = workdir / "listcase2"
    list_root.mkdir(parents=True, exist_ok=True)
    (list_root / "img.png").write_bytes(b"PNG")
    (list_root / "a.md").write_text("![](img.png)", encoding="utf-8")
    (list_root / "b.txt").write_text("![](img.png)", encoding="utf-8")
    entries = [
        "",
        "# comment",
        "http://example.com/x.md",
        str(list_root / "a.md"),
        str(list_root / "b.txt"),
    ]
    list_payload = "\n".join(entries).encode("utf-8")
    listed = client.post(
        "/api/upload-list",
        files={"list_file": ("list.txt", list_payload)},
        data={"prefix": "assets", "versionId": "1005"},
    )
    print("upload-list:", listed.status_code, listed.json())

    # 4) Zip with a.md at the top level and again under sub/.
    dup_zip = workdir / "dup.zip"
    src = workdir / "src"
    nested = src / "sub"
    nested.mkdir(parents=True, exist_ok=True)
    (src / "a.md").write_text("![](img.png)", encoding="utf-8")
    (src / "img.png").write_bytes(b"PNG")
    (nested / "a.md").write_text("![](../img.png)", encoding="utf-8")
    with zipfile.ZipFile(str(dup_zip), "w") as bundle:
        bundle.write(str(src / "a.md"), arcname="a.md")
        bundle.write(str(src / "img.png"), arcname="img.png")
        bundle.write(str(nested / "a.md"), arcname="sub/a.md")
    staged_dup = client.post("/api/archive/stage", files={"file": ("dup.zip", dup_zip.read_bytes())})
    dup_id = staged_dup.json()["data"]["id"]
    processed_dup = client.post(
        "/api/archive/process",
        data={"id": dup_id, "prefix": "assets", "versionId": "1006"},
    )
    print("archive-dup:", processed_dup.status_code, processed_dup.json())


if __name__ == "__main__":
    run()

View File

@@ -0,0 +1,77 @@
from fastapi.testclient import TestClient
import sys
from pathlib import Path as _Path
base = _Path(__file__).resolve().parents[2]
sys.path.insert(0, str(base))
sys.path.insert(0, str(base / "docling"))
import app.server as server
class _Resp:
    """Tiny response object exposing ``read()`` and a no-op ``close()``."""

    def __init__(self, data: bytes):
        # Entire body, handed back unchanged by read().
        self._data = data

    def read(self) -> bytes:
        """Return the bytes given at construction time."""
        body = self._data
        return body

    def close(self):
        """Do nothing; present so callers can close the response."""
        return None
class FakeMinio:
    """Read-only fake of the MinIO client, seeded with one markdown object."""

    def __init__(self):
        # (bucket, object_name) -> (body bytes, content type)
        self.store = {
            ("doctest", "assets/rewritten/x.md"): (b"# Title\n\nhello", "text/markdown; charset=utf-8")
        }

    def stat_object(self, bucket: str, object_name: str):
        """Return an object whose ``content_type`` describes the stored entry.

        Unknown keys report ``application/octet-stream``.
        """

        class S:
            def __init__(self, ct: str):
                self.content_type = ct

        entry = self.store.get((bucket, object_name))
        content_type = "application/octet-stream" if entry is None else entry[1]
        return S(content_type)

    def get_object(self, bucket: str, object_name: str):
        """Return a ``_Resp`` wrapping the stored bytes, or empty bytes if unknown."""
        entry = self.store.get((bucket, object_name))
        body = b"" if entry is None else entry[0]
        return _Resp(body)
def setup():
    """Configure the server for the ``doctest`` bucket and plug in the fake client.

    Updates ``server.RUNTIME_CONFIG["minio"]`` and rebinds
    ``server._minio_current`` to return one shared ``FakeMinio``.
    """
    test_settings = {
        "endpoint": "127.0.0.1:9000",
        "public": "http://127.0.0.1:9000",
        "access": "ak",
        "secret": "sk",
        "bucket": "doctest",
        "secure": "false",
        "prefix": "assets",
        "store_final": "true",
        "public_read": "true",
    }
    server.RUNTIME_CONFIG["minio"].update(test_settings)

    fake_client = FakeMinio()

    def _current():
        return fake_client, "doctest", "http://127.0.0.1:9000", "assets"

    server._minio_current = _current  # type: ignore
def run():
    """Fetch two objects through ``/minio/object`` and print status, type and body.

    The second object key contains non-ASCII characters and a ``+``; it is
    percent-quoted before being passed as the ``object`` query parameter.
    """
    setup()
    client = TestClient(server.app)

    plain = client.get(
        "/minio/object",
        params={"bucket": "doctest", "object": "assets/rewritten/x.md"},
    )
    print("status:", plain.status_code)
    print("ct:", plain.headers.get("Content-Type"))
    print(plain.text)

    import urllib.parse as _u

    tricky_key = "assets/rewritten/数字+产品手册-MD源文件/x.md"
    enc = _u.quote(tricky_key)
    # Seed the fake store under the unquoted key before requesting it.
    cur_client, _, _, _ = server._minio_current()  # type: ignore
    cur_client.store[("doctest", tricky_key)] = (
        "hello 中文+plus".encode("utf-8"),
        "text/markdown; charset=utf-8",
    )
    quoted = client.get("/minio/object", params={"bucket": "doctest", "object": enc})
    print("status2:", quoted.status_code)
    print("ct2:", quoted.headers.get("Content-Type"))
    print(quoted.text)


if __name__ == "__main__":
    run()

View File

@@ -0,0 +1,50 @@
import io
from fastapi.testclient import TestClient
import sys
from pathlib import Path as _Path
base = _Path(__file__).resolve().parents[2]
sys.path.insert(0, str(base))
sys.path.insert(0, str(base / "docling"))
import app.server as server
class FakeMinio:
    """Stub MinIO client that only fabricates presigned URLs."""

    def __init__(self):
        # No state is needed for URL fabrication.
        pass

    @staticmethod
    def _fake_url(bucket: str, obj: str, expires: int) -> str:
        """Build the fake URL; the expiry is echoed back as the ``e`` query value."""
        return f"http://minio.test/presigned/{bucket}/{obj}?e={expires}"

    def get_presigned_url(self, method: str, bucket: str, obj: str, expires: int):
        """Return a fake presigned URL; ``method`` is ignored."""
        return self._fake_url(bucket, obj, expires)

    def presigned_get_object(self, bucket: str, obj: str, expires: int):
        """Return the same fake presigned URL shape as ``get_presigned_url``."""
        return self._fake_url(bucket, obj, expires)
def setup():
    """Apply ``doctest`` MinIO settings and route the server to the presign stub.

    Mutates ``server.RUNTIME_CONFIG["minio"]`` and replaces
    ``server._minio_current`` with a closure over a single ``FakeMinio``.
    """
    server.RUNTIME_CONFIG["minio"].update(
        endpoint="127.0.0.1:9000",
        public="http://127.0.0.1:9000",
        access="ak",
        secret="sk",
        bucket="doctest",
        secure="false",
        prefix="assets",
        store_final="true",
        public_read="true",
    )
    presign_stub = FakeMinio()

    def _current():
        return presign_stub, "doctest", "http://127.0.0.1:9000", "assets"

    server._minio_current = _current  # type: ignore
def run():
    """Ask ``/minio/presign`` to presign a percent-encoded object URL and print the reply."""
    setup()
    client = TestClient(server.app)
    # The object path segment is already percent-encoded in the URL.
    target = "http://127.0.0.1:9000/doctest/assets/rewritten/%E6%B5%8B%E8%AF%95/a.md"
    form = {"url": target, "expires": 7200}
    reply = client.post("/minio/presign", data=form)
    print("status:", reply.status_code)
    print(reply.json())


if __name__ == "__main__":
    run()

View File

@@ -0,0 +1,74 @@
import io
import zipfile
from pathlib import Path
from fastapi.testclient import TestClient
import sys
from pathlib import Path as _Path
base = _Path(__file__).resolve().parents[2]
sys.path.insert(0, str(base))
sys.path.insert(0, str(base / "docling"))
import app.server as server
class FakeMinio:
    """In-memory MinIO replacement used by the slash-path debug script."""

    def __init__(self):
        # Captured uploads: (bucket_name, object_name) -> bytes.
        self.objs = {}

    def put_object(self, bucket_name: str, object_name: str, data: io.BytesIO, length: int, content_type: str):
        """Capture ``length`` bytes from ``data``; the content type is not stored."""
        captured = data.read(length)
        self.objs.update({(bucket_name, object_name): captured})

    def get_presigned_url(self, method: str, bucket: str, obj: str, expires: int):
        """Fabricate a URL from ``bucket`` and ``obj``; other arguments are unused."""
        return self._presigned(bucket, obj)

    def presigned_get_object(self, bucket: str, obj: str, expires: int):
        """Fabricate a URL from ``bucket`` and ``obj``; ``expires`` is unused."""
        return self._presigned(bucket, obj)

    @staticmethod
    def _presigned(bucket: str, obj: str) -> str:
        """Single place where the fake URL format is defined."""
        return f"http://minio.test/presigned/{bucket}/{obj}"
def setup():
    """Swap the server's MinIO access for an in-memory fake.

    Writes test values into ``server.RUNTIME_CONFIG["minio"]`` and rebinds
    ``server._minio_current`` so it always yields the same ``FakeMinio``.
    """
    settings = [
        ("endpoint", "127.0.0.1:9000"),
        ("public", "http://127.0.0.1:9000"),
        ("access", "ak"),
        ("secret", "sk"),
        ("bucket", "test"),
        ("secure", "false"),
        ("prefix", "assets"),
        ("store_final", "true"),
        ("public_read", "true"),
    ]
    server.RUNTIME_CONFIG["minio"].update(dict(settings))

    fake_store = FakeMinio()

    def _current():
        return fake_store, "test", "http://127.0.0.1:9000", "assets"

    server._minio_current = _current  # type: ignore
def main():
    """Stage and process a zip whose markdown links its image with a leading slash.

    The markdown body is ``![](/images/p.png)`` while the archive stores the
    picture as ``images/p.png``; the process response is printed.
    """
    setup()
    client = TestClient(server.app)

    workdir = Path("/tmp/run_slash_path_debug")
    workdir.mkdir(parents=True, exist_ok=True)
    archive_path = workdir / "pkg.zip"
    docs = workdir / "docs"
    images = docs / "images"
    images.mkdir(parents=True, exist_ok=True)
    picture = images / "p.png"
    markdown = docs / "a.md"
    picture.write_bytes(b"PNG")
    markdown.write_text("![](/images/p.png)", encoding="utf-8")

    with zipfile.ZipFile(str(archive_path), "w") as bundle:
        bundle.write(str(markdown), arcname="a.md")
        bundle.write(str(picture), arcname="images/p.png")

    staged = client.post(
        "/api/archive/stage",
        files={"file": ("pkg.zip", archive_path.read_bytes())},
    )
    stage_id = staged.json()["data"]["id"]
    processed = client.post(
        "/api/archive/process",
        data={"id": stage_id, "prefix": "assets", "versionId": "1007"},
    )
    print("process:", processed.status_code)
    print(processed.json())


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,29 @@
import unittest
from fastapi.testclient import TestClient
from pathlib import Path
import io
from app.server import app
class ApiConvertTest(unittest.TestCase):
    """Checks ``/api/convert`` with a markdown upload exported back as markdown."""

    def setUp(self):
        # A fresh in-process client for every test method.
        self.client = TestClient(app)

    def test_api_convert_markdown_file(self):
        """A ``::: note`` block in the upload must come back as ``!!! note``."""
        # Local import keeps this block self-contained.
        import tempfile

        # The sample used to be written to ./scratch_unittest in the current
        # directory and was never removed; a TemporaryDirectory cleans up and
        # does not depend on where the tests are launched from.
        with tempfile.TemporaryDirectory(prefix="scratch_unittest_") as scratch:
            sample = Path(scratch) / "sample.md"
            sample.write_text("# Title\n\n::: note\nBody\n:::\n", "utf-8")
            files = {"file": (sample.name, io.BytesIO(sample.read_bytes()), "text/markdown")}
            r = self.client.post("/api/convert", files=files, data={"export": "markdown"})

        self.assertEqual(r.status_code, 200)
        j = r.json()
        self.assertEqual(j.get("code"), 0)
        self.assertIsInstance(j.get("data", {}).get("content"), str)
        self.assertIn("!!! note", j["data"]["content"])


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,113 @@
import io
import zipfile
from pathlib import Path
from fastapi.testclient import TestClient
import app.server as server
class FakeMinio:
    """Fake MinIO client for the edge-case tests; nothing leaves the process."""

    def __init__(self):
        # Every put_object call lands here, keyed by (bucket_name, object_name).
        self.objs = {}

    def put_object(self, bucket_name: str, object_name: str, data: io.BytesIO, length: int, content_type: str):
        """Store the first ``length`` bytes available from ``data``."""
        body = data.read(length)
        slot = (bucket_name, object_name)
        self.objs[slot] = body

    def _link(self, bucket: str, obj: str) -> str:
        """Return the canned presigned URL for ``bucket``/``obj``."""
        return f"http://minio.test/presigned/{bucket}/{obj}"

    def get_presigned_url(self, method: str, bucket: str, obj: str, expires: int):
        """Canned presigned URL; ``method`` and ``expires`` are not used."""
        return self._link(bucket, obj)

    def presigned_get_object(self, bucket: str, obj: str, expires: int):
        """Canned presigned URL; ``expires`` is not used."""
        return self._link(bucket, obj)
def setup_module(module=None):
    """Module-level fixture: install fake MinIO settings and client on the server.

    ``module`` is accepted for the pytest hook signature and is not used.
    """
    fake_settings = {
        "endpoint": "127.0.0.1:9000",
        "public": "http://127.0.0.1:9000",
        "access": "ak",
        "secret": "sk",
        "bucket": "test",
        "secure": "false",
        "prefix": "assets",
        "store_final": "true",
        "public_read": "true",
    }
    server.RUNTIME_CONFIG["minio"].update(fake_settings)

    shared_client = FakeMinio()

    def _current():
        return shared_client, "test", "http://127.0.0.1:9000", "assets"

    server._minio_current = _current  # type: ignore
def test_process_invalid_id():
    """Processing an id that was never staged answers HTTP 200 with a non-zero code."""
    client = TestClient(server.app)
    reply = client.post("/api/archive/process", data={"id": "missing"})
    assert reply.status_code == 200
    payload = reply.json()
    assert payload["code"] != 0
def test_stage_unsupported_format_and_cleanup(tmp_path: Path):
    """A staged ``.rar`` upload stages fine but fails to process, on both attempts."""
    client = TestClient(server.app)

    rar_file = tmp_path / "pkg.rar"
    rar_file.write_bytes(b"RAR")
    staged = client.post(
        "/api/archive/stage",
        files={"file": ("pkg.rar", rar_file.read_bytes())},
    )
    assert staged.status_code == 200
    stage_id = staged.json()["data"]["id"]

    # First attempt and the repeat are expected to behave the same way:
    # HTTP 200 with an application-level error code.
    for _attempt in range(2):
        processed = client.post("/api/archive/process", data={"id": stage_id})
        assert processed.status_code == 200
        body = processed.json()
        assert body["code"] != 0
def test_upload_list_empty_lines_comments_and_urls(tmp_path: Path):
    """An upload list with a blank line, a comment and a URL still yields >= 2 items."""
    client = TestClient(server.app)

    case_dir = tmp_path / "listcase2"
    case_dir.mkdir(parents=True, exist_ok=True)
    (case_dir / "img.png").write_bytes(b"PNG")
    md_file = case_dir / "a.md"
    txt_file = case_dir / "b.txt"
    md_file.write_text("![](img.png)", encoding="utf-8")
    txt_file.write_text("![](img.png)", encoding="utf-8")

    entries = ["", "# comment", "http://example.com/x.md", str(md_file), str(txt_file)]
    list_payload = "\n".join(entries).encode("utf-8")

    reply = client.post(
        "/api/upload-list",
        files={"list_file": ("list.txt", list_payload)},
        data={"prefix": "assets", "versionId": "1005"},
    )
    assert reply.status_code == 200
    body = reply.json()
    assert body["code"] == 0
    assert body["data"]["count"] >= 2
def test_archive_duplicate_filenames_tree(tmp_path: Path):
    """Two ``a.md`` files in different folders produce an import tree with a ``sub`` node."""
    client = TestClient(server.app)

    src = tmp_path / "src"
    nested = src / "sub"
    nested.mkdir(parents=True, exist_ok=True)
    (src / "a.md").write_text("![](img.png)", encoding="utf-8")
    (src / "img.png").write_bytes(b"PNG")
    (nested / "a.md").write_text("![](../img.png)", encoding="utf-8")

    archive_path = tmp_path / "dup.zip"
    with zipfile.ZipFile(str(archive_path), "w") as bundle:
        bundle.write(str(src / "a.md"), arcname="a.md")
        bundle.write(str(src / "img.png"), arcname="img.png")
        bundle.write(str(nested / "a.md"), arcname="sub/a.md")

    staged = client.post(
        "/api/archive/stage",
        files={"file": ("dup.zip", archive_path.read_bytes())},
    )
    assert staged.status_code == 200
    stage_id = staged.json()["data"]["id"]

    processed = client.post(
        "/api/archive/process",
        data={"id": stage_id, "prefix": "assets", "versionId": "1006"},
    )
    assert processed.status_code == 200
    body = processed.json()
    assert body["code"] == 0

    tree = body["data"]["import"]["tree"]
    top_level_names = [node["name"] for node in tree]
    has_sub_folder = any(
        isinstance(node, dict) and node.get("type") == "FOLDER" and node.get("name") == "sub"
        for node in tree
    )
    assert "sub" in top_level_names or has_sub_folder

View File

@@ -0,0 +1,185 @@
import io
import os
import zipfile
from pathlib import Path
from fastapi.testclient import TestClient
import app.server as server
class FakeMinio:
    """Fake MinIO client for the batch-import tests; uploads stay in a dict."""

    def __init__(self):
        # Stored payloads by (bucket_name, object_name).
        self.objs = {}

    def put_object(self, bucket_name: str, object_name: str, data: io.BytesIO, length: int, content_type: str):
        """Save ``data.read(length)``; ``content_type`` is accepted but unused."""
        chunk = data.read(length)
        self.objs[bucket_name, object_name] = chunk

    def get_presigned_url(self, method: str, bucket: str, obj: str, expires: int):
        """Return a fixed-format fake URL for ``bucket``/``obj``."""
        return self.presigned_get_object(bucket, obj, expires)

    def presigned_get_object(self, bucket: str, obj: str, expires: int):
        """Return a fixed-format fake URL for ``bucket``/``obj``; ``expires`` is unused."""
        url = f"http://minio.test/presigned/{bucket}/{obj}"
        return url
def setup_module(module=None):
    """Module-level fixture: patch both MinIO accessors on the server module.

    ``server.minio_current`` is replaced by a one-argument function and
    ``server._minio_current`` by a zero-argument wrapper around it; both
    return the same ``FakeMinio`` instance. ``module`` is unused.
    """
    fake_settings = {
        "endpoint": "127.0.0.1:9000",
        "public": "http://127.0.0.1:9000",
        "access": "ak",
        "secret": "sk",
        "bucket": "test",
        "secure": "false",
        "prefix": "assets",
        "store_final": "true",
        "public_read": "true",
    }
    server.RUNTIME_CONFIG["minio"].update(fake_settings)

    shared_client = FakeMinio()

    def _cur_cfg(_cfg):
        # The config argument is ignored; the fake is the same for any config.
        return shared_client, "test", "http://127.0.0.1:9000", "assets"

    def _cur_no_args():
        return _cur_cfg(None)

    server.minio_current = _cur_cfg  # type: ignore
    # Best effort, as in the original: a failure to patch the private name is ignored.
    try:
        server._minio_current = _cur_no_args  # type: ignore
    except Exception:
        pass
def test_archive_stage_and_process(tmp_path: Path):
    """Staging a zip returns an id; processing it reports at least one item and an import section."""
    client = TestClient(server.app)

    docs = tmp_path / "docs"
    images = docs / "images"
    images.mkdir(parents=True, exist_ok=True)
    picture = images / "p.png"
    markdown = docs / "a.md"
    picture.write_bytes(b"PNG")
    markdown.write_text("![](images/p.png)", encoding="utf-8")

    archive_path = tmp_path / "pkg.zip"
    with zipfile.ZipFile(str(archive_path), "w") as bundle:
        bundle.write(str(markdown), arcname="a.md")
        bundle.write(str(picture), arcname="images/p.png")

    staged = client.post(
        "/api/archive/stage",
        files={"file": ("pkg.zip", archive_path.read_bytes())},
    )
    assert staged.status_code == 200
    staged_body = staged.json()
    assert staged_body["code"] == 0 and staged_body["data"]["id"]
    stage_id = staged_body["data"]["id"]

    processed = client.post(
        "/api/archive/process",
        data={"id": stage_id, "prefix": "assets", "versionId": "1001"},
    )
    assert processed.status_code == 200
    processed_body = processed.json()
    assert processed_body["code"] == 0
    assert processed_body["data"]["count"] >= 1
    assert "import" in processed_body["data"]
def test_upload_list(tmp_path: Path):
    """A one-line upload list pointing at a markdown file is accepted and imported."""
    client = TestClient(server.app)

    case_dir = tmp_path / "listcase"
    case_dir.mkdir(parents=True, exist_ok=True)
    (case_dir / "img.png").write_bytes(b"PNG")
    md_file = case_dir / "b.md"
    md_file.write_text("![](img.png)", encoding="utf-8")

    # The list file body is just the markdown path, UTF-8 encoded.
    list_payload = str(md_file).encode("utf-8")
    reply = client.post(
        "/api/upload-list",
        files={"list_file": ("list.txt", list_payload)},
        data={"prefix": "assets", "versionId": "1002"},
    )
    assert reply.status_code == 200
    body = reply.json()
    assert body["code"] == 0
    assert body["data"]["count"] >= 1
    assert "import" in body["data"]
def test_archive_process_html_conversion(tmp_path: Path):
    """HTML pages in an archive are reported as ``.md`` sources and appear in the import tree."""
    client = TestClient(server.app)

    site = tmp_path / "web"
    static_dir = site / "static"
    static_dir.mkdir(parents=True, exist_ok=True)
    (static_dir / "pic.png").write_bytes(b"PNG")
    (site / "index.html").write_text(
        "<html><body><h1>T</h1><img src='static/pic.png'/></body></html>",
        encoding="utf-8",
    )
    pages_dir = site / "pages"
    pages_dir.mkdir(parents=True, exist_ok=True)
    (pages_dir / "a.html").write_text("<img src='../static/pic.png'>", encoding="utf-8")

    archive_path = tmp_path / "web.zip"
    with zipfile.ZipFile(str(archive_path), "w") as bundle:
        for entry in site.rglob("*"):
            if not entry.is_file():
                continue
            bundle.write(str(entry), arcname=entry.relative_to(site).as_posix())

    staged = client.post(
        "/api/archive/stage",
        files={"file": ("web.zip", archive_path.read_bytes())},
    )
    assert staged.status_code == 200
    stage_id = staged.json()["data"]["id"]

    processed = client.post(
        "/api/archive/process",
        data={"id": stage_id, "prefix": "assets", "versionId": "1003"},
    )
    assert processed.status_code == 200
    body = processed.json()
    assert body["code"] == 0

    def _source_name(item):
        # Base name of the reported source; empty/missing sources become "".
        return Path(str(item.get("source") or "")).name

    reported = body["data"]["files"]
    source_names = {_source_name(item) for item in reported}
    assert "index.md" in source_names
    assert "a.md" in source_names

    converted = {"index.md", "a.md"}
    for item in reported:
        if _source_name(item) not in converted:
            continue
        assert item.get("minio_url")
        assert str(item.get("object_name") or "").startswith("assets/rewritten/")

    def _file_names(children):
        # Depth-first walk yielding the names of FILE nodes only.
        for node in children:
            kind = node.get("type")
            if kind == "FILE":
                yield node.get("name")
            elif kind == "FOLDER":
                yield from _file_names(node.get("children", []))

    tree_files = list(_file_names(body["data"]["import"]["tree"]))
    assert "index" in tree_files
    assert "a" in tree_files
def test_archive_process_html_abs_uppercase(tmp_path: Path):
    """Upper-case ``.HTML``/``.HTM`` pages with root-relative image links are reported as ``.md``."""
    client = TestClient(server.app)

    site = tmp_path / "web2"
    static_dir = site / "static"
    pages_dir = site / "pages"
    static_dir.mkdir(parents=True, exist_ok=True)
    (static_dir / "p.png").write_bytes(b"PNG")
    (site / "INDEX.HTML").write_text("<img src='/static/p.png'>", encoding="utf-8")
    pages_dir.mkdir(parents=True, exist_ok=True)
    (pages_dir / "A.HTM").write_text("<img src='/static/p.png'>", encoding="utf-8")

    archive_path = tmp_path / "web2.zip"
    with zipfile.ZipFile(str(archive_path), "w") as bundle:
        for entry in site.rglob("*"):
            if not entry.is_file():
                continue
            bundle.write(str(entry), arcname=entry.relative_to(site).as_posix())

    staged = client.post(
        "/api/archive/stage",
        files={"file": ("web2.zip", archive_path.read_bytes())},
    )
    assert staged.status_code == 200
    stage_id = staged.json()["data"]["id"]

    processed = client.post(
        "/api/archive/process",
        data={"id": stage_id, "prefix": "assets", "versionId": "1004"},
    )
    assert processed.status_code == 200
    body = processed.json()
    assert body["code"] == 0

    source_names = {
        Path(str(item.get("source") or "")).name for item in body["data"]["files"]
    }
    assert "INDEX.md" in source_names
    assert "A.md" in source_names

View File

@@ -0,0 +1,53 @@
import io
import os
import base64
from pathlib import Path
from zipfile import ZipFile
from app.services.docling_adapter import md_to_docx_bytes
def _make_png(tmpdir: Path) -> Path:
    """Write a minimal 1x1 PNG as ``tiny.png`` inside ``tmpdir`` and return its path."""
    encoded = b"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGNgYAAAAAMAASsJTYQAAAAASUVORK5CYII="
    target = tmpdir / "tiny.png"
    target.write_bytes(base64.b64decode(encoded))
    return target
def test_md_to_docx_renders_blocks_and_media(tmp_path: Path):
    """``md_to_docx_bytes`` output is a DOCX zip holding the input's text and an image.

    The input mixes a heading, a paragraph, a code block, an image pointing
    at a local PNG, and a table.
    """
    png = _make_png(tmp_path)
    # Only the <img> fragment interpolates a value, so only it is an f-string.
    html = (
        "<h1>标题</h1>"
        "<p>内容</p>"
        "<pre><code>print(\"hello\")\n</code></pre>"
        f"<img src='{png.as_posix()}'>"
        "<table><thead><tr><th>A</th><th>B</th></tr></thead>"
        "<tbody><tr><td>1</td><td>2</td></tr></tbody></table>"
    )
    docx = md_to_docx_bytes(
        html,
        toc=True,
        header_text="Left|Right",
        footer_text="Footer",
        filename_text="FileName",
        product_name="Product",
        document_name="DocName",
        product_version="1.0",
        document_version="2.0",
    )
    assert isinstance(docx, (bytes, bytearray)) and len(docx) > 0

    # Open the archive in a with-block so it is closed even when an
    # assertion fails; previously the ZipFile was never closed.
    with ZipFile(io.BytesIO(docx)) as zf:
        names = set(zf.namelist())
        assert any(n.startswith("word/") for n in names)
        doc_xml = zf.read("word/document.xml").decode("utf-8")

    # Document XML should contain the core texts.
    for tok in ["标题", "内容", "print(\"hello\")", "A", "B", "1", "2"]:
        assert tok in doc_xml, f"missing text in document.xml: {tok!r}"
    # A media part should be present for the image.
    assert any(n.startswith("word/media/") for n in names)

View File

@@ -0,0 +1,51 @@
import unittest
from pathlib import Path
import base64
import tempfile
import sys
# ensure 'app' package is importable
try:
root = Path(__file__).resolve().parents[2]
p = str(root)
if p not in sys.path:
sys.path.insert(0, p)
except Exception:
pass
from docx import Document
from app.services.word2markdown import convert_any
def _tiny_png_bytes() -> bytes:
    """Return the decoded bytes of a small base64-embedded PNG image."""
    encoded = b"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGNgYAAAAAMAASsJTYQAAAAASUVORK5CYII="
    raw = base64.b64decode(encoded)
    return raw
class InlineImagesTest(unittest.TestCase):
    """Checks where ``convert_any`` places an embedded picture relative to surrounding text."""

    def test_paragraph_image_order(self):
        """Text A, then the picture, then text B must appear in that order in the markdown."""
        # TemporaryDirectory removes the scratch files afterwards; the earlier
        # mkdtemp call left one directory behind on every run.
        with tempfile.TemporaryDirectory(prefix="w2m_inline_test_") as scratch:
            tmp = Path(scratch)
            img = tmp / "tiny.png"
            img.write_bytes(_tiny_png_bytes())
            docx = tmp / "sample.docx"
            doc = Document()
            doc.add_paragraph("前文A")
            doc.add_picture(str(img))  # the picture goes into a paragraph of its own
            doc.add_paragraph("后文B")
            doc.save(str(docx))
            # Convert while the file still exists.
            enc, md = convert_any(docx)

        self.assertEqual(enc, "utf-8")
        a_pos = md.find("前文A")
        img_pos = md.find("![Image](data:")
        b_pos = md.find("后文B")
        # All three markers must be present (find() returns -1 when missing).
        self.assertNotEqual(a_pos, -1, "leading paragraph missing from markdown")
        self.assertNotEqual(img_pos, -1, "inline data-URI image missing from markdown")
        self.assertNotEqual(b_pos, -1, "trailing paragraph missing from markdown")
        # Expected order: A -> image -> B.
        self.assertLess(a_pos, img_pos)
        self.assertLess(img_pos, b_pos)


if __name__ == "__main__":
    unittest.main()