"""Tests for ``app.services.text_service.extract_triples``.

Storage and the LLM client are replaced by mocks. The ``mock_llm`` and
``mock_storage`` fixtures are not defined in this module; presumably they
come from a shared ``conftest.py`` (confirm).

The service module is imported inside each test rather than at module level,
as in the original layout of this file.
"""

import contextlib
import importlib
from unittest.mock import AsyncMock, MagicMock

import pytest

from app.core.exceptions import LLMParseError, StorageError, UnsupportedFileTypeError
from app.models.text_models import TextExtractRequest

# Canned LLM reply: a JSON array holding exactly one triple.
SAMPLE_TRIPLES_JSON = '''[
  {
    "subject": "变压器",
    "predicate": "额定电压",
    "object": "110kV",
    "source_snippet": "该变压器额定电压为110kV",
    "source_offset": {"start": 0, "end": 12}
  }
]'''


@contextlib.contextmanager
def _reloaded_text_service(target, replacement):
    """Yield ``text_service`` reloaded while *target* is patched.

    Args:
        target: Dotted import path of the attribute to replace, e.g.
            ``"pdfplumber.open"``.
        replacement: Object installed at *target* for the duration of the
            ``with`` block.

    Yields:
        The freshly reloaded ``app.services.text_service`` module.

    The module is reloaded a second time after the patch has been undone.
    NOTE(review): the reload is presumably needed because the service binds
    third-party names at import time; if so, skipping the second reload would
    leave the mock bound inside the module for every later test.
    """
    from app.services import text_service

    try:
        with pytest.MonkeyPatch.context() as patcher:
            patcher.setattr(target, replacement)
            yield importlib.reload(text_service)
    finally:
        # Runs after the patch is reverted, so the module rebinds the real
        # third-party objects.
        importlib.reload(text_service)


@pytest.fixture
def req_txt():
    """Extraction request pointing at a ``.txt`` file."""
    return TextExtractRequest(file_path="text/test.txt", file_name="test.txt")


@pytest.fixture
def req_pdf():
    """Extraction request pointing at a ``.pdf`` file."""
    return TextExtractRequest(file_path="text/test.pdf", file_name="report.pdf")


@pytest.fixture
def req_docx():
    """Extraction request pointing at a ``.docx`` file."""
    return TextExtractRequest(file_path="text/test.docx", file_name="doc.docx")


@pytest.fixture
def llm(mock_llm):
    """Return ``mock_llm`` with ``chat`` stubbed to reply with the sample triple."""
    mock_llm.chat = AsyncMock(return_value=SAMPLE_TRIPLES_JSON)
    return mock_llm


@pytest.mark.asyncio
async def test_txt_extraction_returns_triples(llm, mock_storage, req_txt):
    """A ``.txt`` request yields the single triple from the stubbed LLM reply."""
    mock_storage.download_bytes = AsyncMock(return_value=b"test content")
    from app.services.text_service import extract_triples

    result = await extract_triples(req_txt, llm, mock_storage)

    assert len(result.items) == 1
    triple = result.items[0]
    assert triple.subject == "变压器"
    assert triple.predicate == "额定电压"
    assert triple.object == "110kV"
    assert triple.source_offset.start == 0
    assert triple.source_offset.end == 12


@pytest.mark.asyncio
async def test_pdf_extraction(llm, mock_storage, req_pdf):
    """A ``.pdf`` request yields one triple when ``pdfplumber.open`` is stubbed."""
    # The downloaded bytes are not a real PDF; pdfplumber.open is replaced so
    # that it hands back a document with a single text page.
    mock_storage.download_bytes = AsyncMock(return_value=b"%PDF fake")

    page = MagicMock()
    page.extract_text.return_value = "PDF content here"
    pdf = MagicMock()
    pdf.__enter__.return_value = pdf  # support `with pdfplumber.open(...) as pdf`
    pdf.__exit__.return_value = False
    pdf.pages = [page]

    with _reloaded_text_service(
        "pdfplumber.open", MagicMock(return_value=pdf)
    ) as text_service:
        result = await text_service.extract_triples(req_pdf, llm, mock_storage)

    assert len(result.items) == 1


@pytest.mark.asyncio
async def test_docx_extraction(llm, mock_storage, req_docx):
    """A ``.docx`` request yields one triple when ``docx.Document`` is stubbed."""
    mock_storage.download_bytes = AsyncMock(return_value=b"PK fake docx bytes")

    paragraph = MagicMock()
    paragraph.text = "Word paragraph content"
    document = MagicMock()
    document.paragraphs = [paragraph]

    with _reloaded_text_service(
        "docx.Document", MagicMock(return_value=document)
    ) as text_service:
        result = await text_service.extract_triples(req_docx, llm, mock_storage)

    assert len(result.items) == 1


@pytest.mark.asyncio
async def test_unsupported_format_raises_error(llm, mock_storage):
    """An ``.xlsx`` request raises ``UnsupportedFileTypeError``."""
    mock_storage.download_bytes = AsyncMock(return_value=b"data")
    from app.services.text_service import extract_triples

    req = TextExtractRequest(file_path="text/test.xlsx", file_name="data.xlsx")
    with pytest.raises(UnsupportedFileTypeError):
        await extract_triples(req, llm, mock_storage)


@pytest.mark.asyncio
async def test_storage_error_propagates(llm, mock_storage, req_txt):
    """A ``StorageError`` raised by the download reaches the caller."""
    mock_storage.download_bytes = AsyncMock(side_effect=StorageError("not found"))
    from app.services.text_service import extract_triples

    with pytest.raises(StorageError):
        await extract_triples(req_txt, llm, mock_storage)


@pytest.mark.asyncio
async def test_llm_parse_error_propagates(mock_llm, mock_storage, req_txt):
    """A non-JSON LLM reply surfaces as ``LLMParseError``."""
    mock_storage.download_bytes = AsyncMock(return_value=b"content")
    mock_llm.chat = AsyncMock(return_value="not json {{")
    from app.services.text_service import extract_triples

    with pytest.raises(LLMParseError):
        await extract_triples(req_txt, mock_llm, mock_storage)