# Source file: label_ai_service/tests/test_text_service.py (Python, 123 lines, 4.4 KiB)

import pytest
from unittest.mock import AsyncMock, MagicMock
from app.core.exceptions import LLMParseError, StorageError, UnsupportedFileTypeError
from app.models.text_models import TextExtractRequest
# Canned LLM reply shared by the tests below (installed as the return value of
# ``chat`` by the ``llm`` fixture): a JSON array containing exactly one triple
# with ``subject`` / ``predicate`` / ``object`` fields, the ``source_snippet``
# it was taken from, and a ``source_offset`` with ``start`` and ``end`` keys.
SAMPLE_TRIPLES_JSON = '''[
{
"subject": "变压器",
"predicate": "额定电压",
"object": "110kV",
"source_snippet": "该变压器额定电压为110kV",
"source_offset": {"start": 0, "end": 12}
}
]'''
@pytest.fixture
def req_txt():
    """Provide a ``TextExtractRequest`` that points at a ``.txt`` file."""
    request = TextExtractRequest(
        file_name="test.txt",
        file_path="text/test.txt",
    )
    return request
@pytest.fixture
def req_pdf():
    """Provide a ``TextExtractRequest`` that points at a ``.pdf`` file."""
    request = TextExtractRequest(
        file_name="report.pdf",
        file_path="text/test.pdf",
    )
    return request
@pytest.fixture
def req_docx():
    """Provide a ``TextExtractRequest`` that points at a ``.docx`` file."""
    request = TextExtractRequest(
        file_name="doc.docx",
        file_path="text/test.docx",
    )
    return request
@pytest.fixture
def llm(mock_llm):
    """Return ``mock_llm`` with ``chat`` stubbed to answer ``SAMPLE_TRIPLES_JSON``.

    The ``mock_llm`` fixture is mutated in place and handed back, so tests that
    request ``llm`` receive the same object with an awaitable ``chat``.
    """
    canned_chat = AsyncMock()
    canned_chat.return_value = SAMPLE_TRIPLES_JSON
    mock_llm.chat = canned_chat
    return mock_llm
@pytest.mark.asyncio
async def test_txt_extraction_returns_triples(llm, mock_storage):
    """Check that a ``.txt`` request yields the single triple from the stubbed LLM.

    Storage is stubbed to return a short byte string; the assertions compare
    every field of the returned item against the values in
    ``SAMPLE_TRIPLES_JSON``.
    """
    from app.services.text_service import extract_triples

    mock_storage.download_bytes = AsyncMock(return_value=b"test content")
    request = TextExtractRequest(file_path="text/test.txt", file_name="test.txt")

    outcome = await extract_triples(request, llm, mock_storage)

    assert len(outcome.items) == 1
    triple = outcome.items[0]
    assert triple.subject == "变压器"
    assert triple.predicate == "额定电压"
    assert triple.object == "110kV"
    offset = triple.source_offset
    assert offset.start == 0
    assert offset.end == 12
@pytest.mark.asyncio
async def test_pdf_extraction(llm, mock_storage, tmp_path):
    """Check that a ``.pdf`` request yields one triple when pdfplumber is stubbed.

    Storage returns placeholder bytes and ``pdfplumber.open`` is patched to
    hand back a fake document with a single page of text, so no real PDF
    parsing takes place. ``llm`` is the fixture whose ``chat`` coroutine
    returns ``SAMPLE_TRIPLES_JSON``.

    ``tmp_path`` is not used by the body; it stays in the signature so the
    set of fixtures the test requests is unchanged.
    """
    import importlib

    from app.services import text_service

    mock_storage.download_bytes = AsyncMock(return_value=b"%PDF fake")

    fake_page = MagicMock()
    fake_page.extract_text.return_value = "PDF content here"

    # MagicMock already implements the context-manager protocol; it only has
    # to be told that entering yields the fake document itself and that
    # exiting does not suppress exceptions.
    fake_pdf = MagicMock()
    fake_pdf.__enter__.return_value = fake_pdf
    fake_pdf.__exit__.return_value = False
    fake_pdf.pages = [fake_page]

    # ``MonkeyPatch.context`` is a classmethod, so no throwaway instance is
    # required; the patch is undone when the ``with`` block exits.
    with pytest.MonkeyPatch.context() as mp:
        # Accept any call signature so the stub keeps working if the service
        # passes extra positional or keyword arguments to ``pdfplumber.open``.
        mp.setattr("pdfplumber.open", lambda *args, **kwargs: fake_pdf)
        # Reload while the patch is active so that a module-level reference to
        # ``pdfplumber.open`` (if the service holds one) picks up the stub.
        importlib.reload(text_service)

        req = TextExtractRequest(file_path="text/test.pdf", file_name="doc.pdf")
        result = await text_service.extract_triples(req, llm, mock_storage)

    assert len(result.items) == 1
@pytest.mark.asyncio
async def test_docx_extraction(llm, mock_storage):
    """Check that a ``.docx`` request yields one triple when python-docx is stubbed.

    Storage returns placeholder bytes and ``docx.Document`` is patched to hand
    back a fake document with a single paragraph, so no real Word parsing
    takes place. ``llm`` is the fixture whose ``chat`` coroutine returns
    ``SAMPLE_TRIPLES_JSON``.
    """
    import importlib

    from app.services import text_service

    mock_storage.download_bytes = AsyncMock(return_value=b"PK fake docx bytes")

    fake_paragraph = MagicMock()
    fake_paragraph.text = "Word paragraph content"
    fake_doc = MagicMock()
    fake_doc.paragraphs = [fake_paragraph]

    # ``MonkeyPatch.context`` is a classmethod, so no throwaway instance is
    # required; the patch is undone when the ``with`` block exits.
    with pytest.MonkeyPatch.context() as mp:
        # Accept any call signature so the stub keeps working if the service
        # passes extra positional or keyword arguments to ``docx.Document``.
        mp.setattr("docx.Document", lambda *args, **kwargs: fake_doc)
        # Reload while the patch is active so that a module-level reference to
        # ``docx.Document`` (if the service holds one) picks up the stub.
        importlib.reload(text_service)

        req = TextExtractRequest(file_path="text/test.docx", file_name="doc.docx")
        result = await text_service.extract_triples(req, llm, mock_storage)

    assert len(result.items) == 1
@pytest.mark.asyncio
async def test_unsupported_format_raises_error(llm, mock_storage):
    """Check that an ``.xlsx`` request raises ``UnsupportedFileTypeError``."""
    from app.services.text_service import extract_triples

    spreadsheet_request = TextExtractRequest(
        file_path="text/test.xlsx",
        file_name="data.xlsx",
    )
    mock_storage.download_bytes = AsyncMock(return_value=b"data")

    with pytest.raises(UnsupportedFileTypeError):
        await extract_triples(spreadsheet_request, llm, mock_storage)
@pytest.mark.asyncio
async def test_storage_error_propagates(llm, mock_storage):
    """Check that a ``StorageError`` raised by the download reaches the caller."""
    from app.services.text_service import extract_triples

    # The stubbed download raises instead of returning bytes.
    download_failure = StorageError("not found")
    mock_storage.download_bytes = AsyncMock(side_effect=download_failure)
    request = TextExtractRequest(file_path="text/test.txt", file_name="test.txt")

    with pytest.raises(StorageError):
        await extract_triples(request, llm, mock_storage)
@pytest.mark.asyncio
async def test_llm_parse_error_propagates(mock_llm, mock_storage):
    """Check that a non-JSON LLM reply surfaces as ``LLMParseError``.

    Uses ``mock_llm`` directly (not the ``llm`` fixture) so that ``chat`` can
    be stubbed with a malformed reply.
    """
    from app.services.text_service import extract_triples

    malformed_reply = "not json {{"
    mock_llm.chat = AsyncMock(return_value=malformed_reply)
    mock_storage.download_bytes = AsyncMock(return_value=b"content")
    request = TextExtractRequest(file_path="text/test.txt", file_name="test.txt")

    with pytest.raises(LLMParseError):
        await extract_triples(request, mock_llm, mock_storage)