Import project files
This commit is contained in:
80
docling/app/tests/run_batch_upload_debug.py
Normal file
80
docling/app/tests/run_batch_upload_debug.py
Normal file
@@ -0,0 +1,80 @@
|
||||
import io
|
||||
import os
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from fastapi.testclient import TestClient
|
||||
import sys
|
||||
from pathlib import Path as _Path
|
||||
base = _Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(base))
|
||||
sys.path.insert(0, str(base / "docling"))
|
||||
import app.server as server
|
||||
|
||||
|
||||
class FakeMinio:
    """Minimal in-memory fake of a MinIO client.

    Uploaded payloads are kept in ``objs`` keyed by
    ``(bucket_name, object_name)``; both presign methods return a fixed fake
    URL instead of contacting a server.
    """

    def __init__(self):
        # (bucket_name, object_name) -> bytes read from the uploaded stream
        self.objs = {}

    @staticmethod
    def _fake_url(bucket: str, obj: str) -> str:
        """Build the canned URL shared by both presign methods."""
        return f"http://minio.test/presigned/{bucket}/{obj}"

    def put_object(self, bucket_name: str, object_name: str, data: io.BytesIO, length: int, content_type: str):
        """Store up to ``length`` bytes read from ``data``; ``content_type`` is ignored."""
        payload = data.read(length)
        self.objs[(bucket_name, object_name)] = payload

    def get_presigned_url(self, method: str, bucket: str, obj: str, expires: int):
        """Return the fake URL; ``method`` and ``expires`` are not used."""
        return self._fake_url(bucket, obj)

    def presigned_get_object(self, bucket: str, obj: str, expires: int):
        """Return the fake URL; ``expires`` is not used."""
        return self._fake_url(bucket, obj)
|
||||
|
||||
|
||||
def setup():
    """Overwrite the server's MinIO settings and install a fake client.

    Updates ``server.RUNTIME_CONFIG["minio"]`` in place and replaces
    ``server._minio_current`` with a callable that always returns the same
    4-tuple (presumably client, bucket, public base URL, prefix — confirm
    against the server module).
    """
    minio_settings = {
        "endpoint": "127.0.0.1:9000",
        "public": "http://127.0.0.1:9000",
        "access": "ak",
        "secret": "sk",
        "bucket": "test",
        "secure": "false",
        "prefix": "assets",
        "store_final": "true",
        "public_read": "true",
    }
    server.RUNTIME_CONFIG["minio"].update(minio_settings)

    # One shared fake client, so every call sees the same in-memory store.
    current = (FakeMinio(), "test", "http://127.0.0.1:9000", "assets")

    def _fake_current():
        return current

    server._minio_current = _fake_current  # type: ignore
|
||||
|
||||
|
||||
def main():
    """Stage and process a small zip, then post an upload list, printing each response."""
    setup()
    client = TestClient(server.app)

    work_dir = Path("/tmp/run_batch_upload_debug")
    work_dir.mkdir(parents=True, exist_ok=True)

    archive_path = work_dir / "pkg.zip"
    docs_dir = work_dir / "docs"
    images_dir = docs_dir / "images"
    images_dir.mkdir(parents=True, exist_ok=True)

    png_file = images_dir / "p.png"
    md_file = docs_dir / "a.md"
    png_file.write_bytes(b"PNG")
    # NOTE(review): empty Markdown body — possibly an image link lost in extraction; confirm.
    md_file.write_text("", encoding="utf-8")

    with zipfile.ZipFile(str(archive_path), "w") as archive:
        archive.write(str(md_file), arcname="a.md")
        archive.write(str(png_file), arcname="images/p.png")

    # Step 1: stage the archive and keep the id the server hands back.
    staged = client.post("/api/archive/stage", files={"file": ("pkg.zip", archive_path.read_bytes())})
    staged_body = staged.json()
    print("stage status:", staged.status_code, staged_body)
    stage_id = staged_body["data"]["id"]

    # Step 2: process the staged archive.
    processed = client.post(
        "/api/archive/process",
        data={"id": stage_id, "prefix": "assets", "versionId": "1001"},
    )
    print("process status:", processed.status_code, processed.json())

    # Step 3: upload a list file whose single line is the local Markdown path.
    list_payload = str(md_file).encode("utf-8")
    listed = client.post(
        "/api/upload-list",
        files={"list_file": ("list.txt", list_payload)},
        data={"prefix": "assets", "versionId": "1002"},
    )
    print("upload-list status:", listed.status_code, listed.json())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
75
docling/app/tests/run_convert_folder_debug.py
Normal file
75
docling/app/tests/run_convert_folder_debug.py
Normal file
@@ -0,0 +1,75 @@
|
||||
import io
|
||||
import os
|
||||
from pathlib import Path
|
||||
from fastapi.testclient import TestClient
|
||||
import sys
|
||||
from pathlib import Path as _Path
|
||||
base = _Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(base))
|
||||
sys.path.insert(0, str(base / "docling"))
|
||||
import app.server as server
|
||||
|
||||
|
||||
class FakeMinio:
    """Minimal in-memory fake of a MinIO client.

    Uploaded payloads are kept in ``objs`` keyed by
    ``(bucket_name, object_name)``; both presign methods return a fixed fake
    URL instead of contacting a server.
    """

    def __init__(self):
        # (bucket_name, object_name) -> bytes read from the uploaded stream
        self.objs = {}

    @staticmethod
    def _fake_url(bucket: str, obj: str) -> str:
        """Build the canned URL shared by both presign methods."""
        return f"http://minio.test/presigned/{bucket}/{obj}"

    def put_object(self, bucket_name: str, object_name: str, data: io.BytesIO, length: int, content_type: str):
        """Store up to ``length`` bytes read from ``data``; ``content_type`` is ignored."""
        payload = data.read(length)
        self.objs[(bucket_name, object_name)] = payload

    def get_presigned_url(self, method: str, bucket: str, obj: str, expires: int):
        """Return the fake URL; ``method`` and ``expires`` are not used."""
        return self._fake_url(bucket, obj)

    def presigned_get_object(self, bucket: str, obj: str, expires: int):
        """Return the fake URL; ``expires`` is not used."""
        return self._fake_url(bucket, obj)
|
||||
|
||||
|
||||
def setup():
    """Overwrite the server's MinIO settings and install a fake client.

    Updates ``server.RUNTIME_CONFIG["minio"]`` in place and replaces
    ``server._minio_current`` with a callable that always returns the same
    4-tuple (presumably client, bucket, public base URL, prefix — confirm
    against the server module).
    """
    minio_settings = {
        "endpoint": "127.0.0.1:9000",
        "public": "http://127.0.0.1:9000",
        "access": "ak",
        "secret": "sk",
        "bucket": "test",
        "secure": "false",
        "prefix": "assets",
        "store_final": "true",
        "public_read": "true",
    }
    server.RUNTIME_CONFIG["minio"].update(minio_settings)

    # One shared fake client, so every call sees the same in-memory store.
    current = (FakeMinio(), "test", "http://127.0.0.1:9000", "assets")

    def _fake_current():
        return current

    server._minio_current = _fake_current  # type: ignore
|
||||
|
||||
|
||||
def main():
    """Build a sample folder tree and post it to ``/md/convert-folder``.

    The scratch directory under ``/tmp`` is wiped first so each run starts
    from an empty tree; the status code and JSON body are printed.
    """
    import shutil  # local import: only this debug entry point needs it

    setup()
    app = server.app
    c = TestClient(app)

    tmp = Path("/tmp/run_convert_folder_debug")
    # rmtree also removes nested directories, which a per-entry unlink cannot;
    # ignore_errors keeps the cleanup best-effort (a missing directory is fine).
    shutil.rmtree(tmp, ignore_errors=True)
    tmp.mkdir(parents=True, exist_ok=True)

    # Folder names deliberately contain CJK characters and a literal '+'.
    root = tmp / "数+产品手册-MD源文件"
    sub = root / "DMDRS_DRS_Language_User_Manual"
    img = sub / "images"
    img.mkdir(parents=True, exist_ok=True)
    (img / "p.png").write_bytes(b"PNG")
    # NOTE(review): the body ends right after the heading — possibly an image
    # link lost in extraction; confirm.
    (sub / "a.md").write_text("# Title\n\n", "utf-8")

    r = c.post("/md/convert-folder", data={"folder_path": str(root), "prefix": "assets"})
    print("convert-folder:", r.status_code)
    print(r.json())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
97
docling/app/tests/run_edge_cases_debug.py
Normal file
97
docling/app/tests/run_edge_cases_debug.py
Normal file
@@ -0,0 +1,97 @@
|
||||
import io
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from fastapi.testclient import TestClient
|
||||
import sys
|
||||
from pathlib import Path as _Path
|
||||
base = _Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(base))
|
||||
sys.path.insert(0, str(base / "docling"))
|
||||
import app.server as server
|
||||
|
||||
|
||||
class FakeMinio:
    """Minimal in-memory fake of a MinIO client.

    Uploaded payloads are kept in ``objs`` keyed by
    ``(bucket_name, object_name)``; both presign methods return a fixed fake
    URL instead of contacting a server.
    """

    def __init__(self):
        # (bucket_name, object_name) -> bytes read from the uploaded stream
        self.objs = {}

    @staticmethod
    def _fake_url(bucket: str, obj: str) -> str:
        """Build the canned URL shared by both presign methods."""
        return f"http://minio.test/presigned/{bucket}/{obj}"

    def put_object(self, bucket_name: str, object_name: str, data: io.BytesIO, length: int, content_type: str):
        """Store up to ``length`` bytes read from ``data``; ``content_type`` is ignored."""
        payload = data.read(length)
        self.objs[(bucket_name, object_name)] = payload

    def get_presigned_url(self, method: str, bucket: str, obj: str, expires: int):
        """Return the fake URL; ``method`` and ``expires`` are not used."""
        return self._fake_url(bucket, obj)

    def presigned_get_object(self, bucket: str, obj: str, expires: int):
        """Return the fake URL; ``expires`` is not used."""
        return self._fake_url(bucket, obj)
|
||||
|
||||
|
||||
def setup():
    """Overwrite the server's MinIO settings and install a fake client.

    Updates ``server.RUNTIME_CONFIG["minio"]`` in place and replaces
    ``server._minio_current`` with a callable that always returns the same
    4-tuple (presumably client, bucket, public base URL, prefix — confirm
    against the server module).
    """
    minio_settings = {
        "endpoint": "127.0.0.1:9000",
        "public": "http://127.0.0.1:9000",
        "access": "ak",
        "secret": "sk",
        "bucket": "test",
        "secure": "false",
        "prefix": "assets",
        "store_final": "true",
        "public_read": "true",
    }
    server.RUNTIME_CONFIG["minio"].update(minio_settings)

    # One shared fake client, so every call sees the same in-memory store.
    current = (FakeMinio(), "test", "http://127.0.0.1:9000", "assets")

    def _fake_current():
        return current

    server._minio_current = _fake_current  # type: ignore
|
||||
|
||||
|
||||
def run():
    """Drive four archive / upload-list edge cases and print each response."""
    setup()
    client = TestClient(server.app)

    # Case 1: process an id that was never staged.
    missing = client.post("/api/archive/process", data={"id": "missing"})
    print("invalid-id:", missing.status_code, missing.json())

    work_dir = Path("/tmp/run_edge_cases_debug")
    work_dir.mkdir(parents=True, exist_ok=True)

    # Case 2: stage a bogus .rar, then process the same id twice.
    rar_file = work_dir / "pkg.rar"
    rar_file.write_bytes(b"RAR")
    staged_rar = client.post("/api/archive/stage", files={"file": ("pkg.rar", rar_file.read_bytes())})
    rar_id = staged_rar.json()["data"]["id"]
    for label in ("rar-process:", "rar-reprocess:"):
        attempt = client.post("/api/archive/process", data={"id": rar_id})
        print(label, attempt.status_code, attempt.json())

    # Case 3: list file with a blank line, a comment, a URL and two local paths.
    list_dir = work_dir / "listcase2"
    list_dir.mkdir(parents=True, exist_ok=True)
    (list_dir / "img.png").write_bytes(b"PNG")
    for text_name in ("a.md", "b.txt"):
        (list_dir / text_name).write_text("", encoding="utf-8")
    entries = [
        "",
        "# comment",
        "http://example.com/x.md",
        str(list_dir / "a.md"),
        str(list_dir / "b.txt"),
    ]
    listed = client.post(
        "/api/upload-list",
        files={"list_file": ("list.txt", "\n".join(entries).encode("utf-8"))},
        data={"prefix": "assets", "versionId": "1005"},
    )
    print("upload-list:", listed.status_code, listed.json())

    # Case 4: zip holding the same file name at two directory levels.
    dup_zip = work_dir / "dup.zip"
    src_root = work_dir / "src"
    nested = src_root / "sub"
    nested.mkdir(parents=True, exist_ok=True)
    (src_root / "a.md").write_text("", encoding="utf-8")
    (src_root / "img.png").write_bytes(b"PNG")
    (nested / "a.md").write_text("", encoding="utf-8")
    with zipfile.ZipFile(str(dup_zip), "w") as archive:
        archive.write(str(src_root / "a.md"), arcname="a.md")
        archive.write(str(src_root / "img.png"), arcname="img.png")
        archive.write(str(nested / "a.md"), arcname="sub/a.md")

    staged_dup = client.post("/api/archive/stage", files={"file": ("dup.zip", dup_zip.read_bytes())})
    dup_id = staged_dup.json()["data"]["id"]
    processed = client.post(
        "/api/archive/process",
        data={"id": dup_id, "prefix": "assets", "versionId": "1006"},
    )
    print("archive-dup:", processed.status_code, processed.json())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run()
|
||||
77
docling/app/tests/run_minio_object_debug.py
Normal file
77
docling/app/tests/run_minio_object_debug.py
Normal file
@@ -0,0 +1,77 @@
|
||||
from fastapi.testclient import TestClient
|
||||
import sys
|
||||
from pathlib import Path as _Path
|
||||
base = _Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(base))
|
||||
sys.path.insert(0, str(base / "docling"))
|
||||
import app.server as server
|
||||
|
||||
|
||||
class _Resp:
    """Tiny response object exposing ``read()`` and ``close()`` over fixed bytes."""

    def __init__(self, data: bytes):
        self._data = data

    def read(self) -> bytes:
        """Return the whole payload; repeated calls return the same bytes."""
        return self._data

    def close(self):
        """Do nothing; there is no underlying resource to release."""
        return None
|
||||
|
||||
|
||||
class FakeMinio:
    """Fake MinIO client backed by ``store``.

    ``store`` maps ``(bucket, object_name)`` to ``(payload_bytes, content_type)``
    and starts with a single Markdown object.
    """

    def __init__(self):
        seeded_key = ("doctest", "assets/rewritten/x.md")
        self.store = {
            seeded_key: (b"# Title\n\nhello", "text/markdown; charset=utf-8"),
        }

    def stat_object(self, bucket: str, object_name: str):
        """Return an object whose ``content_type`` mirrors the stored entry.

        Unknown keys report ``application/octet-stream``.
        """

        class S:
            def __init__(self, ct: str):
                self.content_type = ct

        entry = self.store.get((bucket, object_name))
        return S("application/octet-stream" if entry is None else entry[1])

    def get_object(self, bucket: str, object_name: str):
        """Return a ``_Resp`` over the stored bytes, or over ``b""`` for unknown keys."""
        entry = self.store.get((bucket, object_name))
        body = b"" if entry is None else entry[0]
        return _Resp(body)
|
||||
|
||||
|
||||
def setup():
    """Overwrite the server's MinIO settings and install a fake client.

    Updates ``server.RUNTIME_CONFIG["minio"]`` in place and replaces
    ``server._minio_current`` with a callable that always returns the same
    4-tuple (presumably client, bucket, public base URL, prefix — confirm
    against the server module).
    """
    minio_settings = {
        "endpoint": "127.0.0.1:9000",
        "public": "http://127.0.0.1:9000",
        "access": "ak",
        "secret": "sk",
        "bucket": "doctest",
        "secure": "false",
        "prefix": "assets",
        "store_final": "true",
        "public_read": "true",
    }
    server.RUNTIME_CONFIG["minio"].update(minio_settings)

    # One shared fake client, so objects added later stay visible to the server.
    current = (FakeMinio(), "doctest", "http://127.0.0.1:9000", "assets")

    def _fake_current():
        return current

    server._minio_current = _fake_current  # type: ignore
|
||||
|
||||
|
||||
def run():
    """Fetch an ASCII-named and a non-ASCII-named object via ``/minio/object`` and print both replies."""
    from urllib.parse import quote

    setup()
    client = TestClient(server.app)

    first = client.get(
        "/minio/object",
        params={"bucket": "doctest", "object": "assets/rewritten/x.md"},
    )
    print("status:", first.status_code)
    print("ct:", first.headers.get("Content-Type"))
    print(first.text)

    # Second request: the object name contains CJK characters and a literal
    # '+', and is percent-encoded by hand before being sent as a query value.
    unicode_key = "assets/rewritten/数字+产品手册-MD源文件/x.md"
    encoded_key = quote(unicode_key)
    fake_client, _bucket, _public, _prefix = server._minio_current()  # type: ignore
    fake_client.store[("doctest", unicode_key)] = (
        "hello 中文+plus".encode("utf-8"),
        "text/markdown; charset=utf-8",
    )
    second = client.get("/minio/object", params={"bucket": "doctest", "object": encoded_key})
    print("status2:", second.status_code)
    print("ct2:", second.headers.get("Content-Type"))
    print(second.text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run()
|
||||
50
docling/app/tests/run_minio_presign_debug.py
Normal file
50
docling/app/tests/run_minio_presign_debug.py
Normal file
@@ -0,0 +1,50 @@
|
||||
import io
|
||||
from fastapi.testclient import TestClient
|
||||
import sys
|
||||
from pathlib import Path as _Path
|
||||
base = _Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(base))
|
||||
sys.path.insert(0, str(base / "docling"))
|
||||
import app.server as server
|
||||
|
||||
|
||||
class FakeMinio:
    """Fake MinIO client that only knows how to hand out presigned URLs.

    Both presign methods return the same fake URL shape, with the requested
    expiry echoed back in the ``e`` query parameter.
    """

    def __init__(self):
        """Hold no state."""

    @staticmethod
    def _fake_url(bucket: str, obj: str, expires: int) -> str:
        """Build the canned URL shared by both presign methods."""
        return f"http://minio.test/presigned/{bucket}/{obj}?e={expires}"

    def get_presigned_url(self, method: str, bucket: str, obj: str, expires: int):
        """Return the fake URL; ``method`` is not used."""
        return self._fake_url(bucket, obj, expires)

    def presigned_get_object(self, bucket: str, obj: str, expires: int):
        """Return the fake URL for a GET of ``obj``."""
        return self._fake_url(bucket, obj, expires)
|
||||
|
||||
|
||||
def setup():
    """Overwrite the server's MinIO settings and install a fake client.

    Updates ``server.RUNTIME_CONFIG["minio"]`` in place and replaces
    ``server._minio_current`` with a callable that always returns the same
    4-tuple (presumably client, bucket, public base URL, prefix — confirm
    against the server module).
    """
    minio_settings = {
        "endpoint": "127.0.0.1:9000",
        "public": "http://127.0.0.1:9000",
        "access": "ak",
        "secret": "sk",
        "bucket": "doctest",
        "secure": "false",
        "prefix": "assets",
        "store_final": "true",
        "public_read": "true",
    }
    server.RUNTIME_CONFIG["minio"].update(minio_settings)

    # One shared fake client reused for every call.
    current = (FakeMinio(), "doctest", "http://127.0.0.1:9000", "assets")

    def _fake_current():
        return current

    server._minio_current = _fake_current  # type: ignore
|
||||
|
||||
|
||||
def run():
    """Post a percent-encoded object URL to ``/minio/presign`` and print the reply."""
    setup()
    client = TestClient(server.app)

    # The path segment %E6%B5%8B%E8%AF%95 is already percent-encoded UTF-8.
    target = "http://127.0.0.1:9000/doctest/assets/rewritten/%E6%B5%8B%E8%AF%95/a.md"
    form = {"url": target, "expires": 7200}
    resp = client.post("/minio/presign", data=form)

    print("status:", resp.status_code)
    print(resp.json())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run()
|
||||
|
||||
74
docling/app/tests/run_slash_path_debug.py
Normal file
74
docling/app/tests/run_slash_path_debug.py
Normal file
@@ -0,0 +1,74 @@
|
||||
import io
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from fastapi.testclient import TestClient
|
||||
import sys
|
||||
from pathlib import Path as _Path
|
||||
base = _Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(base))
|
||||
sys.path.insert(0, str(base / "docling"))
|
||||
import app.server as server
|
||||
|
||||
|
||||
class FakeMinio:
    """Minimal in-memory fake of a MinIO client.

    Uploaded payloads are kept in ``objs`` keyed by
    ``(bucket_name, object_name)``; both presign methods return a fixed fake
    URL instead of contacting a server.
    """

    def __init__(self):
        # (bucket_name, object_name) -> bytes read from the uploaded stream
        self.objs = {}

    @staticmethod
    def _fake_url(bucket: str, obj: str) -> str:
        """Build the canned URL shared by both presign methods."""
        return f"http://minio.test/presigned/{bucket}/{obj}"

    def put_object(self, bucket_name: str, object_name: str, data: io.BytesIO, length: int, content_type: str):
        """Store up to ``length`` bytes read from ``data``; ``content_type`` is ignored."""
        payload = data.read(length)
        self.objs[(bucket_name, object_name)] = payload

    def get_presigned_url(self, method: str, bucket: str, obj: str, expires: int):
        """Return the fake URL; ``method`` and ``expires`` are not used."""
        return self._fake_url(bucket, obj)

    def presigned_get_object(self, bucket: str, obj: str, expires: int):
        """Return the fake URL; ``expires`` is not used."""
        return self._fake_url(bucket, obj)
|
||||
|
||||
|
||||
def setup():
    """Overwrite the server's MinIO settings and install a fake client.

    Updates ``server.RUNTIME_CONFIG["minio"]`` in place and replaces
    ``server._minio_current`` with a callable that always returns the same
    4-tuple (presumably client, bucket, public base URL, prefix — confirm
    against the server module).
    """
    minio_settings = {
        "endpoint": "127.0.0.1:9000",
        "public": "http://127.0.0.1:9000",
        "access": "ak",
        "secret": "sk",
        "bucket": "test",
        "secure": "false",
        "prefix": "assets",
        "store_final": "true",
        "public_read": "true",
    }
    server.RUNTIME_CONFIG["minio"].update(minio_settings)

    # One shared fake client, so every call sees the same in-memory store.
    current = (FakeMinio(), "test", "http://127.0.0.1:9000", "assets")

    def _fake_current():
        return current

    server._minio_current = _fake_current  # type: ignore
|
||||
|
||||
|
||||
def main():
    """Stage and process a small zip archive, printing the process response."""
    setup()
    client = TestClient(server.app)

    work_dir = Path("/tmp/run_slash_path_debug")
    work_dir.mkdir(parents=True, exist_ok=True)

    archive_path = work_dir / "pkg.zip"
    docs_dir = work_dir / "docs"
    images_dir = docs_dir / "images"
    images_dir.mkdir(parents=True, exist_ok=True)

    png_file = images_dir / "p.png"
    md_file = docs_dir / "a.md"
    png_file.write_bytes(b"PNG")
    # NOTE(review): empty Markdown body — possibly an image link lost in extraction; confirm.
    md_file.write_text("", encoding="utf-8")

    with zipfile.ZipFile(str(archive_path), "w") as archive:
        archive.write(str(md_file), arcname="a.md")
        archive.write(str(png_file), arcname="images/p.png")

    staged = client.post("/api/archive/stage", files={"file": ("pkg.zip", archive_path.read_bytes())})
    stage_id = staged.json()["data"]["id"]

    processed = client.post(
        "/api/archive/process",
        data={"id": stage_id, "prefix": "assets", "versionId": "1007"},
    )
    print("process:", processed.status_code)
    print(processed.json())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
29
docling/app/tests/test_api_convert.py
Normal file
29
docling/app/tests/test_api_convert.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import unittest
|
||||
from fastapi.testclient import TestClient
|
||||
from pathlib import Path
|
||||
import io
|
||||
|
||||
from app.server import app
|
||||
|
||||
|
||||
class ApiConvertTest(unittest.TestCase):
    """End-to-end check of ``/api/convert`` with a Markdown upload."""

    def setUp(self):
        """Create a fresh test client for every test."""
        self.client = TestClient(app)

    def test_api_convert_markdown_file(self):
        """A ``::: note`` block is expected back as a ``!!! note`` admonition."""
        import tempfile  # local import: the rest of the module does not need it

        # A throwaway directory avoids leaving ./scratch_unittest behind in the
        # working directory after each run.
        with tempfile.TemporaryDirectory() as scratch:
            p = Path(scratch) / "sample.md"
            p.write_text("# Title\n\n::: note\nBody\n:::\n", "utf-8")
            payload = p.read_bytes()

        files = {"file": (p.name, io.BytesIO(payload), "text/markdown")}
        r = self.client.post("/api/convert", files=files, data={"export": "markdown"})
        self.assertEqual(r.status_code, 200)

        j = r.json()
        self.assertEqual(j.get("code"), 0)
        self.assertIsInstance(j.get("data", {}).get("content"), str)
        self.assertIn("!!! note", j["data"]["content"])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
113
docling/app/tests/test_batch_upload_edge_cases.py
Normal file
113
docling/app/tests/test_batch_upload_edge_cases.py
Normal file
@@ -0,0 +1,113 @@
|
||||
import io
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
import app.server as server
|
||||
|
||||
|
||||
class FakeMinio:
    """Minimal in-memory fake of a MinIO client.

    Uploaded payloads are kept in ``objs`` keyed by
    ``(bucket_name, object_name)``; both presign methods return a fixed fake
    URL instead of contacting a server.
    """

    def __init__(self):
        # (bucket_name, object_name) -> bytes read from the uploaded stream
        self.objs = {}

    @staticmethod
    def _fake_url(bucket: str, obj: str) -> str:
        """Build the canned URL shared by both presign methods."""
        return f"http://minio.test/presigned/{bucket}/{obj}"

    def put_object(self, bucket_name: str, object_name: str, data: io.BytesIO, length: int, content_type: str):
        """Store up to ``length`` bytes read from ``data``; ``content_type`` is ignored."""
        payload = data.read(length)
        self.objs[(bucket_name, object_name)] = payload

    def get_presigned_url(self, method: str, bucket: str, obj: str, expires: int):
        """Return the fake URL; ``method`` and ``expires`` are not used."""
        return self._fake_url(bucket, obj)

    def presigned_get_object(self, bucket: str, obj: str, expires: int):
        """Return the fake URL; ``expires`` is not used."""
        return self._fake_url(bucket, obj)
|
||||
|
||||
|
||||
def setup_module(module=None):
    """Module-level fixture: point the server at fake MinIO settings.

    Updates ``server.RUNTIME_CONFIG["minio"]`` in place and replaces
    ``server._minio_current`` with a callable that always returns the same
    4-tuple (presumably client, bucket, public base URL, prefix — confirm
    against the server module). ``module`` is accepted but not used.
    """
    minio_settings = {
        "endpoint": "127.0.0.1:9000",
        "public": "http://127.0.0.1:9000",
        "access": "ak",
        "secret": "sk",
        "bucket": "test",
        "secure": "false",
        "prefix": "assets",
        "store_final": "true",
        "public_read": "true",
    }
    server.RUNTIME_CONFIG["minio"].update(minio_settings)

    # One shared fake client for every test in this module.
    current = (FakeMinio(), "test", "http://127.0.0.1:9000", "assets")

    def _fake_current():
        return current

    server._minio_current = _fake_current  # type: ignore
|
||||
|
||||
|
||||
def test_process_invalid_id():
    """Processing an unknown stage id answers HTTP 200 with a non-zero ``code``."""
    client = TestClient(server.app)
    resp = client.post("/api/archive/process", data={"id": "missing"})
    assert resp.status_code == 200
    body = resp.json()
    assert body["code"] != 0
|
||||
|
||||
|
||||
def test_stage_unsupported_format_and_cleanup(tmp_path: Path):
    """A staged ``.rar`` is rejected on processing, and again when retried with the same id."""
    client = TestClient(server.app)

    rar_file = tmp_path / "pkg.rar"
    rar_file.write_bytes(b"RAR")
    staged = client.post("/api/archive/stage", files={"file": ("pkg.rar", rar_file.read_bytes())})
    assert staged.status_code == 200
    stage_id = staged.json()["data"]["id"]

    # First attempt, then a retry: both must report a non-zero code.
    for _attempt in range(2):
        resp = client.post("/api/archive/process", data={"id": stage_id})
        assert resp.status_code == 200
        assert resp.json()["code"] != 0
|
||||
|
||||
|
||||
def test_upload_list_empty_lines_comments_and_urls(tmp_path: Path):
    """A list mixing a blank line, a comment, a URL and local paths yields at least two entries."""
    client = TestClient(server.app)

    list_dir = tmp_path / "listcase2"
    list_dir.mkdir(parents=True, exist_ok=True)
    (list_dir / "img.png").write_bytes(b"PNG")
    for text_name in ("a.md", "b.txt"):
        (list_dir / text_name).write_text("", encoding="utf-8")

    entries = [
        "",
        "# comment",
        "http://example.com/x.md",
        str(list_dir / "a.md"),
        str(list_dir / "b.txt"),
    ]
    payload = "\n".join(entries).encode("utf-8")

    resp = client.post(
        "/api/upload-list",
        files={"list_file": ("list.txt", payload)},
        data={"prefix": "assets", "versionId": "1005"},
    )
    assert resp.status_code == 200
    body = resp.json()
    assert body["code"] == 0
    assert body["data"]["count"] >= 2
|
||||
|
||||
|
||||
def test_archive_duplicate_filenames_tree(tmp_path: Path):
    """Two ``a.md`` files at different depths keep the ``sub`` folder in the import tree."""
    client = TestClient(server.app)

    dup_zip = tmp_path / "dup.zip"
    src_root = tmp_path / "src"
    nested = src_root / "sub"
    nested.mkdir(parents=True, exist_ok=True)
    (src_root / "a.md").write_text("", encoding="utf-8")
    (src_root / "img.png").write_bytes(b"PNG")
    (nested / "a.md").write_text("", encoding="utf-8")

    with zipfile.ZipFile(str(dup_zip), "w") as archive:
        archive.write(str(src_root / "a.md"), arcname="a.md")
        archive.write(str(src_root / "img.png"), arcname="img.png")
        archive.write(str(nested / "a.md"), arcname="sub/a.md")

    staged = client.post("/api/archive/stage", files={"file": ("dup.zip", dup_zip.read_bytes())})
    assert staged.status_code == 200
    stage_id = staged.json()["data"]["id"]

    processed = client.post(
        "/api/archive/process",
        data={"id": stage_id, "prefix": "assets", "versionId": "1006"},
    )
    assert processed.status_code == 200
    body = processed.json()
    assert body["code"] == 0

    tree = body["data"]["import"]["tree"]
    top_level_names = [node["name"] for node in tree]
    has_sub_folder = any(
        isinstance(node, dict) and node.get("type") == "FOLDER" and node.get("name") == "sub"
        for node in tree
    )
    assert "sub" in top_level_names or has_sub_folder
|
||||
185
docling/app/tests/test_batch_upload_endpoints.py
Normal file
185
docling/app/tests/test_batch_upload_endpoints.py
Normal file
@@ -0,0 +1,185 @@
|
||||
import io
|
||||
import os
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
import app.server as server
|
||||
|
||||
|
||||
class FakeMinio:
    """Minimal in-memory fake of a MinIO client.

    Uploaded payloads are kept in ``objs`` keyed by
    ``(bucket_name, object_name)``; both presign methods return a fixed fake
    URL instead of contacting a server.
    """

    def __init__(self):
        # (bucket_name, object_name) -> bytes read from the uploaded stream
        self.objs = {}

    @staticmethod
    def _fake_url(bucket: str, obj: str) -> str:
        """Build the canned URL shared by both presign methods."""
        return f"http://minio.test/presigned/{bucket}/{obj}"

    def put_object(self, bucket_name: str, object_name: str, data: io.BytesIO, length: int, content_type: str):
        """Store up to ``length`` bytes read from ``data``; ``content_type`` is ignored."""
        payload = data.read(length)
        self.objs[(bucket_name, object_name)] = payload

    def get_presigned_url(self, method: str, bucket: str, obj: str, expires: int):
        """Return the fake URL; ``method`` and ``expires`` are not used."""
        return self._fake_url(bucket, obj)

    def presigned_get_object(self, bucket: str, obj: str, expires: int):
        """Return the fake URL; ``expires`` is not used."""
        return self._fake_url(bucket, obj)
|
||||
|
||||
|
||||
def setup_module(module=None):
    """Module-level fixture: point the server at fake MinIO settings.

    Updates ``server.RUNTIME_CONFIG["minio"]`` in place and installs the fake
    under two hook names: ``server.minio_current`` (called with one argument,
    which is ignored) and ``server._minio_current`` (called with none). Both
    return the same 4-tuple built around one shared ``FakeMinio``.
    ``module`` is accepted but not used.
    """
    server.RUNTIME_CONFIG["minio"].update({
        "endpoint": "127.0.0.1:9000",
        "public": "http://127.0.0.1:9000",
        "access": "ak",
        "secret": "sk",
        "bucket": "test",
        "secure": "false",
        "prefix": "assets",
        "store_final": "true",
        "public_read": "true",
    })

    fake = FakeMinio()

    def _cur_cfg(_cfg):
        return fake, "test", "http://127.0.0.1:9000", "assets"

    server.minio_current = _cur_cfg  # type: ignore
    # Assigning a module attribute does not raise, so no exception guard is
    # needed here; a guard would only hide a genuinely broken setup.
    server._minio_current = lambda: _cur_cfg(None)  # type: ignore
|
||||
|
||||
|
||||
def test_archive_stage_and_process(tmp_path: Path):
    """Staging then processing a zip succeeds and reports at least one file plus import data."""
    client = TestClient(server.app)

    archive_path = tmp_path / "pkg.zip"
    docs_dir = tmp_path / "docs"
    images_dir = docs_dir / "images"
    images_dir.mkdir(parents=True, exist_ok=True)
    png_file = images_dir / "p.png"
    md_file = docs_dir / "a.md"
    png_file.write_bytes(b"PNG")
    # NOTE(review): empty Markdown body — possibly an image link lost in extraction; confirm.
    md_file.write_text("", encoding="utf-8")

    with zipfile.ZipFile(str(archive_path), "w") as archive:
        archive.write(str(md_file), arcname="a.md")
        archive.write(str(png_file), arcname="images/p.png")

    staged = client.post("/api/archive/stage", files={"file": ("pkg.zip", archive_path.read_bytes())})
    assert staged.status_code == 200
    staged_body = staged.json()
    assert staged_body["code"] == 0 and staged_body["data"]["id"]
    stage_id = staged_body["data"]["id"]

    processed = client.post(
        "/api/archive/process",
        data={"id": stage_id, "prefix": "assets", "versionId": "1001"},
    )
    assert processed.status_code == 200
    result = processed.json()
    assert result["code"] == 0
    assert result["data"]["count"] >= 1
    assert "import" in result["data"]
|
||||
|
||||
|
||||
def test_upload_list(tmp_path: Path):
    """A list file naming one local Markdown file yields at least one entry plus import data."""
    client = TestClient(server.app)

    list_dir = tmp_path / "listcase"
    list_dir.mkdir(parents=True, exist_ok=True)
    (list_dir / "img.png").write_bytes(b"PNG")
    md_file = list_dir / "b.md"
    # NOTE(review): empty Markdown body — possibly an image link lost in extraction; confirm.
    md_file.write_text("", encoding="utf-8")

    # The list file's only line is the absolute path of the Markdown file.
    payload = str(md_file).encode("utf-8")
    resp = client.post(
        "/api/upload-list",
        files={"list_file": ("list.txt", payload)},
        data={"prefix": "assets", "versionId": "1002"},
    )
    assert resp.status_code == 200
    body = resp.json()
    assert body["code"] == 0
    assert body["data"]["count"] >= 1
    assert "import" in body["data"]
|
||||
|
||||
|
||||
def test_archive_process_html_conversion(tmp_path: Path):
    """HTML pages inside an archive come back as ``.md`` sources, uploaded under ``assets/rewritten/``."""
    client = TestClient(server.app)

    archive_path = tmp_path / "web.zip"
    site = tmp_path / "web"
    static_dir = site / "static"
    pages_dir = site / "pages"
    static_dir.mkdir(parents=True, exist_ok=True)
    (static_dir / "pic.png").write_bytes(b"PNG")
    (site / "index.html").write_text(
        "<html><body><h1>T</h1><img src='static/pic.png'/></body></html>", "utf-8"
    )
    pages_dir.mkdir(parents=True, exist_ok=True)
    (pages_dir / "a.html").write_text("<img src='../static/pic.png'>", "utf-8")

    with zipfile.ZipFile(str(archive_path), "w") as archive:
        for entry in site.rglob("*"):
            if not entry.is_file():
                continue
            archive.write(str(entry), arcname=entry.relative_to(site).as_posix())

    staged = client.post("/api/archive/stage", files={"file": ("web.zip", archive_path.read_bytes())})
    assert staged.status_code == 200
    stage_id = staged.json()["data"]["id"]

    processed = client.post(
        "/api/archive/process",
        data={"id": stage_id, "prefix": "assets", "versionId": "1003"},
    )
    assert processed.status_code == 200
    body = processed.json()
    assert body["code"] == 0

    def _source_name(item):
        # Base name of the reported source path; missing/None sources become "".
        return Path(str(item.get("source") or "")).name

    files_list = body["data"]["files"]
    converted = {"index.md", "a.md"}
    reported = {_source_name(item) for item in files_list}
    assert "index.md" in reported
    assert "a.md" in reported
    for item in files_list:
        if _source_name(item) not in converted:
            continue
        assert item.get("minio_url")
        assert str(item.get("object_name") or "").startswith("assets/rewritten/")

    def _file_names(children):
        # Depth-first walk over the import tree, yielding the names of FILE nodes.
        for node in children:
            kind = node.get("type")
            if kind == "FILE":
                yield node.get("name")
            elif kind == "FOLDER":
                yield from _file_names(node.get("children", []))

    nodes = list(_file_names(body["data"]["import"]["tree"]))
    assert "index" in nodes
    assert "a" in nodes
|
||||
|
||||
|
||||
def test_archive_process_html_abs_uppercase(tmp_path: Path):
    """Upper-case ``.HTML`` / ``.HTM`` pages with absolute image paths are reported as ``.md`` sources."""
    client = TestClient(server.app)

    archive_path = tmp_path / "web2.zip"
    site = tmp_path / "web2"
    static_dir = site / "static"
    pages_dir = site / "pages"

    static_dir.mkdir(parents=True, exist_ok=True)
    (static_dir / "p.png").write_bytes(b"PNG")
    (site / "INDEX.HTML").write_text("<img src='/static/p.png'>", "utf-8")
    pages_dir.mkdir(parents=True, exist_ok=True)
    (pages_dir / "A.HTM").write_text("<img src='/static/p.png'>", "utf-8")

    with zipfile.ZipFile(str(archive_path), "w") as archive:
        for entry in site.rglob("*"):
            if not entry.is_file():
                continue
            archive.write(str(entry), arcname=entry.relative_to(site).as_posix())

    staged = client.post("/api/archive/stage", files={"file": ("web2.zip", archive_path.read_bytes())})
    assert staged.status_code == 200
    stage_id = staged.json()["data"]["id"]

    processed = client.post(
        "/api/archive/process",
        data={"id": stage_id, "prefix": "assets", "versionId": "1004"},
    )
    assert processed.status_code == 200
    body = processed.json()
    assert body["code"] == 0

    # Base names of the reported sources; missing/None sources become "".
    reported = {Path(str(item.get("source") or "")).name for item in body["data"]["files"]}
    assert "INDEX.md" in reported
    assert "A.md" in reported
|
||||
53
docling/app/tests/test_md_to_docx.py
Normal file
53
docling/app/tests/test_md_to_docx.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import io
|
||||
import os
|
||||
import base64
|
||||
from pathlib import Path
|
||||
from zipfile import ZipFile
|
||||
|
||||
from app.services.docling_adapter import md_to_docx_bytes
|
||||
|
||||
|
||||
def _make_png(tmpdir: Path) -> Path:
    """Write a minimal 1x1 PNG to ``tmpdir / "tiny.png"`` and return that path."""
    encoded = b"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGNgYAAAAAMAASsJTYQAAAAASUVORK5CYII="
    target = tmpdir / "tiny.png"
    target.write_bytes(base64.b64decode(encoded))
    return target
|
||||
|
||||
|
||||
def test_md_to_docx_renders_blocks_and_media(tmp_path: Path):
    """The generated DOCX contains the heading, paragraph, code, table cells and an embedded image.

    The input is an HTML fragment that references a PNG on disk; the result
    is checked by opening the returned bytes as a zip archive.
    """
    png = _make_png(tmp_path)
    # Only the <img> line interpolates a value, so only it is an f-string.
    html = (
        "<h1>标题</h1>"
        "<p>内容</p>"
        "<pre><code>print(\"hello\")\n</code></pre>"
        f"<img src='{png.as_posix()}'>"
        "<table><thead><tr><th>A</th><th>B</th></tr></thead>"
        "<tbody><tr><td>1</td><td>2</td></tr></tbody></table>"
    )

    docx = md_to_docx_bytes(
        html,
        toc=True,
        header_text="Left|Right",
        footer_text="Footer",
        filename_text="FileName",
        product_name="Product",
        document_name="DocName",
        product_version="1.0",
        document_version="2.0",
    )

    assert isinstance(docx, (bytes, bytearray)) and len(docx) > 0
    # The context manager closes the archive even when an assertion fails.
    with ZipFile(io.BytesIO(docx)) as zf:
        names = set(zf.namelist())
        doc_xml = zf.read("word/document.xml").decode("utf-8")

    assert any(n.startswith("word/") for n in names)
    # Document XML should contain core texts
    for tok in ["标题", "内容", "print(\"hello\")", "A", "B", "1", "2"]:
        assert tok in doc_xml
    # Media should be present for the image
    assert any(n.startswith("word/media/") for n in names)
|
||||
|
||||
51
docling/app/tests/test_word2markdown_inline_images.py
Normal file
51
docling/app/tests/test_word2markdown_inline_images.py
Normal file
@@ -0,0 +1,51 @@
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
import base64
|
||||
import tempfile
|
||||
import sys
|
||||
|
||||
# ensure 'app' package is importable
|
||||
try:
|
||||
root = Path(__file__).resolve().parents[2]
|
||||
p = str(root)
|
||||
if p not in sys.path:
|
||||
sys.path.insert(0, p)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
from docx import Document
|
||||
|
||||
from app.services.word2markdown import convert_any
|
||||
|
||||
|
||||
def _tiny_png_bytes() -> bytes:
    """Return the bytes of a tiny PNG, decoded from an embedded base64 literal."""
    encoded = b"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGNgYAAAAAMAASsJTYQAAAAASUVORK5CYII="
    return base64.b64decode(encoded)
|
||||
|
||||
|
||||
class InlineImagesTest(unittest.TestCase):
    """Checks that images keep their position relative to the surrounding paragraphs."""

    def test_paragraph_image_order(self):
        """A picture placed between two paragraphs appears between them in the Markdown."""
        # The temporary directory is removed once the conversion has finished.
        with tempfile.TemporaryDirectory(prefix="w2m_inline_test_") as scratch:
            tmp = Path(scratch)
            img = tmp / "tiny.png"
            img.write_bytes(_tiny_png_bytes())

            docx = tmp / "sample.docx"
            doc = Document()
            doc.add_paragraph("前文A")
            doc.add_picture(str(img))  # the picture sits in a paragraph of its own
            doc.add_paragraph("后文B")
            doc.save(str(docx))

            enc, md = convert_any(docx)

        self.assertEqual(enc, "utf-8")
        a_pos = md.find("前文A")
        # NOTE(review): this search pattern was garbled in the source and has
        # been reconstructed as the Markdown image prefix "![](" — confirm.
        img_pos = md.find("![](")
        b_pos = md.find("后文B")
        # Expected order: A -> image -> B
        self.assertTrue(a_pos != -1 and img_pos != -1 and b_pos != -1)
        self.assertTrue(a_pos < img_pos < b_pos)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user