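"""Tests for the ``/api/convert`` endpoint of ``app.server``.

Before ``app.server`` is imported, stub modules are registered in
``sys.modules`` for ``docling``, ``docling_core`` and
``app.services.docling_adapter``. The MinIO client and the converter are
replaced on the imported ``server`` module in ``setup_module``.
"""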
import atexit
import os
import shutil
import sys
import tempfile
import types
from pathlib import Path

from fastapi.testclient import TestClient
# Put the "docling" directory found three levels up from this file (the file's
# directory is level one) at the front of sys.path, so it is searched first.
root = Path(__file__).resolve().parents[2] / "docling"
sys.path.insert(0, str(root))
# Fake ``docling.document_converter`` module. It is registered in sys.modules
# under the real module name, so imports of that name resolve to this stub.
dc = types.ModuleType('docling.document_converter')


class _DC:
    """Stub exposed as ``DocumentConverter``; produces empty exports."""

    def __init__(self, *a, **k):
        # Any constructor arguments are accepted and ignored.
        pass

    def convert(self, src):
        """Return a result object whose ``document`` exports fixed empty content.

        ``src`` is ignored. The returned object has a ``document`` attribute
        offering ``export_to_markdown``, ``export_to_html``,
        ``export_to_json`` and ``export_to_doctags``.
        """

        class R:
            class D:
                def export_to_markdown(self, image_mode=None):
                    return ""

                def export_to_html(self):
                    return ""

                def export_to_json(self):
                    return "{}"

                def export_to_doctags(self):
                    return "{}"

            # Class attribute: every result shares this one document instance.
            document = D()

        return R()


class _PF:
    """Stub exposed as ``PdfFormatOption``; accepts and ignores all arguments."""

    def __init__(self, *a, **k):
        pass


dc.DocumentConverter = _DC
dc.PdfFormatOption = _PF
sys.modules['docling.document_converter'] = dc
# Fake ``docling.datamodel.base_models`` module, registered in sys.modules
# under the real module name.
bm = types.ModuleType('docling.datamodel.base_models')


class _IF:
    """Stub exposed as ``InputFormat``; only the ``PDF`` member is defined."""

    PDF = 'pdf'


bm.InputFormat = _IF
sys.modules['docling.datamodel.base_models'] = bm
# Fake ``docling.pipeline.standard_pdf_pipeline`` module, registered in
# sys.modules under the real module name.
pl = types.ModuleType('docling.pipeline.standard_pdf_pipeline')


class _SP:
    """Stub exposed as ``StandardPdfPipeline``; accepts and ignores all arguments."""

    def __init__(self, *a, **k):
        pass


pl.StandardPdfPipeline = _SP
sys.modules['docling.pipeline.standard_pdf_pipeline'] = pl
# Fake ``docling.datamodel.pipeline_options`` module, registered in
# sys.modules under the real module name.
po = types.ModuleType('docling.datamodel.pipeline_options')


class _PPO:
    """Stub exposed as ``PdfPipelineOptions``; accepts and ignores all arguments."""

    def __init__(self, *a, **k):
        pass


po.PdfPipelineOptions = _PPO
sys.modules['docling.datamodel.pipeline_options'] = po
# Fake ``docling_core.types.doc`` module, registered in sys.modules under the
# real module name.
ct = types.ModuleType('docling_core.types.doc')


class _IRM:
    """Stub exposed as ``ImageRefMode``; only ``PLACEHOLDER`` is defined."""

    PLACEHOLDER = 'placeholder'


ct.ImageRefMode = _IRM
sys.modules['docling_core.types.doc'] = ct
# Fake ``app.services.docling_adapter`` module. Every function returns a fixed
# value and ignores its arguments (except ``_san``, which echoes a truthy name).
da = types.ModuleType('app.services.docling_adapter')


def _convert_source(src, export):
    """Stub for ``convert_source``: empty content with a markdown MIME type."""
    return ("", "text/markdown")


def _md2docx(md, **k):
    """Stub for ``md_to_docx_bytes``: always returns empty bytes."""
    return b""


def _md2pdf(md, *a, **k):
    """Stub for ``md_to_pdf_bytes_with_renderer``: always returns empty bytes."""
    return b""


def _infer(source_url, upload_name):
    """Stub for ``infer_basename``: always returns ``"document"``."""
    return "document"


def _san(name):
    """Stub for ``sanitize_filename``: ``name`` unchanged, or ``"document"`` if falsy."""
    return name or "document"


def _load():
    """Stub for ``load_linkmap``: always returns a new empty dict."""
    return {}


def _save(m):
    """Stub for ``save_linkmap``: discards ``m`` and returns ``None``."""
    return None


da.convert_source = _convert_source
da.md_to_docx_bytes = _md2docx
da.md_to_pdf_bytes_with_renderer = _md2pdf
da.infer_basename = _infer
da.sanitize_filename = _san
da.load_linkmap = _load
da.save_linkmap = _save
sys.modules['app.services.docling_adapter'] = da
import app.server as server
|
|
|
|
class DummyMinio:
    """In-memory stand-in for a MinIO client.

    Uploads are recorded in ``objs`` as
    ``(bucket_name, object_name, length, content_type)`` tuples; the payload
    itself is not kept. Presigned URLs are fixed-format strings on
    ``http://127.0.0.1:9000``.
    """

    def __init__(self):
        self.objs = []

    def put_object(self, bucket_name, object_name, data, length,
                   content_type="application/octet-stream", **kwargs):
        """Record an upload; ``data`` and any extra keyword arguments are ignored.

        ``content_type`` is optional and defaults to
        ``"application/octet-stream"`` (the MinIO Python SDK default), so
        callers that omit it or pass further SDK options do not raise
        ``TypeError``.
        """
        self.objs.append((bucket_name, object_name, length, content_type))

    def get_presigned_url(self, method, bucket, obj, expires=None, **kwargs):
        """Return a fake URL; ``method``, ``expires`` and extra keywords are ignored."""
        return f"http://127.0.0.1:9000/{bucket}/{obj}"

    def presigned_get_object(self, bucket, obj, expires=None, **kwargs):
        """Return the same fake URL as ``get_presigned_url`` for a GET request."""
        return self.get_presigned_url("GET", bucket, obj, expires=expires, **kwargs)
# Bytes of a minimal PNG file, split by chunk. The IHDR fields read
# width=1, height=1, bit depth=8, colour type=2.
PNG = (
    b"\x89PNG\r\n\x1a\n"  # file signature
    b"\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x02\x00\x00\x00\x90wS\xde"
    b"\x00\x00\x00\nIDATx\x9cc\xf8\x0f\x00\x01\x01\x01\x00\x18\xdd\xdc\xa4"
    b"\x00\x00\x00\x00IEND\xaeB`\x82"
)
def setup_module(module=None):
    """Replace the server's MinIO, converter and PDF image hooks with fakes.

    ``module`` is unused; it defaults to ``None`` so the function can also be
    called with no arguments. The replacements are assigned directly on the
    imported ``server`` module and are not restored afterwards.
    """
    # Each call hands back a fresh DummyMinio along with fixed settings.
    server._minio_current = lambda: (
        DummyMinio(), "doctest", "http://127.0.0.1:9000", "assets",
    )

    def fake_convert(src, export="markdown", engine=None):
        """Return fixed markdown plus a new temp directory holding ``img.png``."""
        artifacts = Path(tempfile.mkdtemp(prefix="artifacts_"))
        # mkdtemp never removes what it creates; clean up at interpreter exit
        # so repeated runs do not pile up directories. ignore_errors covers
        # the case where the directory has already been removed.
        atexit.register(shutil.rmtree, artifacts, ignore_errors=True)
        (artifacts / "img.png").write_bytes(PNG)
        return ("utf-8", "A\n<!-- image -->\nB", str(artifacts))

    server._converter_v2.convert = fake_convert
    server._extract_pdf_images = lambda pdf_path: [("png", PNG), ("png", PNG)]
import unittest
|
|
|
|
class TestApiConvert(unittest.TestCase):
|
|
@classmethod
|
|
def setUpClass(cls):
|
|
setup_module()
|
|
def test_api_convert_save_true_returns_md_url(self):
|
|
app = server.app
|
|
mc = server._minio_current()
|
|
assert mc[1] == 'doctest'
|
|
c = TestClient(app)
|
|
files = {"file": ("管理端使用说明 (1).pdf", b"%PDF-1.4\n")}
|
|
data = {"export": "markdown", "save": "true", "filename": "管理端使用说明 (1)"}
|
|
r = c.post("/api/convert", files=files, data=data)
|
|
j = r.json()
|
|
self.assertEqual(j["code"], 0, str(j))
|
|
self.assertTrue(j["data"]["name"].lower().endswith(".md"))
|
|
self.assertTrue(j["data"]["minio_url"].lower().endswith(".md"))
|
|
|
|
def test_api_convert_save_false_returns_content_and_md_name(self):
|
|
app = server.app
|
|
mc = server._minio_current()
|
|
assert mc[1] == 'doctest'
|
|
c = TestClient(app)
|
|
files = {"file": ("文档.pdf", b"%PDF-1.4\n")}
|
|
data = {"export": "markdown", "save": "false", "filename": "文档"}
|
|
r = c.post("/api/convert", files=files, data=data)
|
|
j = r.json()
|
|
self.assertEqual(j["code"], 0, str(j))
|
|
self.assertTrue(j["data"]["name"].lower().endswith(".md"))
|
|
self.assertIn("
|