from pydantic import BaseModel


class SourceOffset(BaseModel):
    """Character offsets locating an evidence snippet in the source text."""

    start: int
    end: int


class TripleItem(BaseModel):
    """One extracted knowledge triple plus the evidence supporting it."""

    subject: str
    predicate: str
    object: str
    source_snippet: str
    source_offset: SourceOffset


class TextExtractRequest(BaseModel):
    """Request payload for the /text/extract endpoint.

    `model` and `prompt_template` are optional overrides; when omitted the
    service falls back to its configured defaults.
    """

    file_path: str
    file_name: str
    model: str | None = None
    prompt_template: str | None = None


class TextExtractResponse(BaseModel):
    """Response payload: the triples extracted from one document."""

    items: list[TripleItem]
from fastapi import APIRouter, Depends

from app.clients.llm.base import LLMClient
from app.clients.storage.base import StorageClient
from app.core.dependencies import get_llm_client, get_storage_client
from app.models.text_models import TextExtractRequest, TextExtractResponse
from app.services import text_service

# Router for text-document processing endpoints.
router = APIRouter(tags=["Text"])


@router.post("/text/extract", response_model=TextExtractResponse)
async def extract_text(
    req: TextExtractRequest,
    llm: LLMClient = Depends(get_llm_client),
    storage: StorageClient = Depends(get_storage_client),
) -> TextExtractResponse:
    """Extract knowledge triples from a stored text document.

    Thin HTTP wrapper: the dependency-injected LLM and storage clients are
    handed straight to the service layer, which does all the work.
    """
    return await text_service.extract_triples(req, llm, storage)
import io

import pdfplumber
import docx

from app.clients.llm.base import LLMClient
from app.clients.storage.base import StorageClient
from app.core.config import get_config
from app.core.exceptions import UnsupportedFileTypeError
from app.core.json_utils import extract_json
from app.core.logging import get_logger
from app.models.text_models import (
    SourceOffset,
    TextExtractRequest,
    TextExtractResponse,
    TripleItem,
)

logger = get_logger(__name__)

# File extensions this service can parse.
_SUPPORTED_EXTENSIONS = {".txt", ".pdf", ".docx"}

# The "{text}" placeholder is filled in via str.replace (see _render_prompt),
# NOT str.format, so literal braces here need no "{{ }}" escaping.
_DEFAULT_PROMPT = (
    "请从以下文本中提取知识三元组,以 JSON 数组格式返回,每条包含字段:"
    "subject(主语)、predicate(谓语)、object(宾语)、"
    "source_snippet(原文证据片段)、source_offset({start, end} 字符偏移)。\n\n"
    "文本内容:\n{text}"
)


def _file_extension(file_name: str) -> str:
    """Return the lowercased extension of *file_name* (with the dot), or ""."""
    idx = file_name.rfind(".")
    return file_name[idx:].lower() if idx != -1 else ""


def _parse_txt(data: bytes) -> str:
    """Decode raw bytes as UTF-8, replacing undecodable sequences."""
    return data.decode("utf-8", errors="replace")


def _parse_pdf(data: bytes) -> str:
    """Extract text from every page of a PDF, joined with newlines."""
    with pdfplumber.open(io.BytesIO(data)) as pdf:
        # extract_text() may return None (e.g. image-only pages).
        pages = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(pages)


def _parse_docx(data: bytes) -> str:
    """Extract paragraph text from a .docx document, joined with newlines."""
    doc = docx.Document(io.BytesIO(data))
    return "\n".join(p.text for p in doc.paragraphs)


# Extension -> parser dispatch; keys mirror _SUPPORTED_EXTENSIONS so the
# support check and the parser choice cannot drift apart.
_PARSERS = {
    ".txt": _parse_txt,
    ".pdf": _parse_pdf,
    ".docx": _parse_docx,
}


def _render_prompt(template: str, text: str) -> str:
    """Insert the document *text* into *template*.

    Uses str.replace instead of str.format so that a user-supplied template
    containing literal braces (e.g. an inline JSON example) cannot raise
    KeyError/IndexError/ValueError. When the template has no "{text}"
    placeholder, the document text is appended after a blank line, as before.
    """
    if "{text}" in template:
        return template.replace("{text}", text)
    return template + "\n\n" + text


async def extract_triples(
    req: TextExtractRequest,
    llm: LLMClient,
    storage: StorageClient,
) -> TextExtractResponse:
    """Download a document, parse it to plain text and extract triples via LLM.

    Raises:
        UnsupportedFileTypeError: the file extension is not .txt/.pdf/.docx.
        StorageError: propagated from the storage client on download failure.
        LLMParseError: propagated from extract_json when the LLM reply is
            not valid JSON.
    """
    ext = _file_extension(req.file_name)
    parser = _PARSERS.get(ext)
    if parser is None:
        raise UnsupportedFileTypeError(f"不支持的文件格式: {ext}")

    cfg = get_config()
    bucket = cfg["storage"]["buckets"]["source_data"]
    model = req.model or cfg["models"]["default_text"]

    data = await storage.download_bytes(bucket, req.file_path)
    text = parser(data)

    prompt = _render_prompt(req.prompt_template or _DEFAULT_PROMPT, text)
    messages = [{"role": "user", "content": prompt}]
    raw = await llm.chat(model, messages)

    logger.info("text_extract", extra={"file": req.file_name, "model": model})

    items = [
        TripleItem(
            subject=item["subject"],
            predicate=item["predicate"],
            object=item["object"],
            source_snippet=item["source_snippet"],
            source_offset=SourceOffset(
                start=item["source_offset"]["start"],
                end=item["source_offset"]["end"],
            ),
        )
        for item in extract_json(raw)
    ]
    return TextExtractResponse(items=items)
import pytest
from unittest.mock import AsyncMock


# Canned LLM reply: a single well-formed triple the endpoint passes through.
SAMPLE_TRIPLES_JSON = '''[
  {
    "subject": "变压器",
    "predicate": "额定电压",
    "object": "110kV",
    "source_snippet": "该变压器额定电压为110kV",
    "source_offset": {"start": 0, "end": 12}
  }
]'''


def _post_extract(client, file_path, file_name):
    """POST a minimal extract request to the text router."""
    payload = {"file_path": file_path, "file_name": file_name}
    return client.post("/api/v1/text/extract", json=payload)


def test_text_extract_returns_200(client, mock_llm, mock_storage):
    """Happy path: parsed triples come back with offsets intact."""
    mock_storage.download_bytes = AsyncMock(return_value=b"some text content")
    mock_llm.chat = AsyncMock(return_value=SAMPLE_TRIPLES_JSON)

    resp = _post_extract(client, "text/test.txt", "test.txt")

    assert resp.status_code == 200
    data = resp.json()
    assert "items" in data
    first = data["items"][0]
    assert first["subject"] == "变压器"
    assert first["source_offset"]["start"] == 0


def test_text_extract_unsupported_format_returns_400(client, mock_storage):
    """.xlsx is rejected before any LLM call is attempted."""
    mock_storage.download_bytes = AsyncMock(return_value=b"data")

    resp = _post_extract(client, "text/test.xlsx", "data.xlsx")

    assert resp.status_code == 400
    assert resp.json()["code"] == "UNSUPPORTED_FILE_TYPE"


def test_text_extract_storage_error_returns_502(client, mock_llm, mock_storage):
    """Storage failures surface as 502 with a stable error code."""
    from app.core.exceptions import StorageError

    mock_storage.download_bytes = AsyncMock(side_effect=StorageError("RustFS unreachable"))

    resp = _post_extract(client, "text/test.txt", "test.txt")

    assert resp.status_code == 502
    assert resp.json()["code"] == "STORAGE_ERROR"


def test_text_extract_llm_parse_error_returns_502(client, mock_llm, mock_storage):
    """Non-JSON LLM output surfaces as a 502 parse error."""
    mock_storage.download_bytes = AsyncMock(return_value=b"content")
    mock_llm.chat = AsyncMock(return_value="not json {{{{")

    resp = _post_extract(client, "text/test.txt", "test.txt")

    assert resp.status_code == 502
    assert resp.json()["code"] == "LLM_PARSE_ERROR"
import pytest
from unittest.mock import AsyncMock, MagicMock

from app.core.exceptions import LLMParseError, StorageError, UnsupportedFileTypeError
from app.models.text_models import TextExtractRequest
from app.services.text_service import extract_triples


# Canned LLM reply: a single well-formed triple.
SAMPLE_TRIPLES_JSON = '''[
  {
    "subject": "变压器",
    "predicate": "额定电压",
    "object": "110kV",
    "source_snippet": "该变压器额定电压为110kV",
    "source_offset": {"start": 0, "end": 12}
  }
]'''


@pytest.fixture
def llm(mock_llm):
    """LLM double that always answers with SAMPLE_TRIPLES_JSON."""
    mock_llm.chat = AsyncMock(return_value=SAMPLE_TRIPLES_JSON)
    return mock_llm


def _fake_pdf_handle(page_text):
    """Context-manager double mimicking what pdfplumber.open() returns."""
    page = MagicMock()
    page.extract_text.return_value = page_text
    pdf = MagicMock()
    pdf.__enter__ = lambda s: s
    pdf.__exit__ = MagicMock(return_value=False)
    pdf.pages = [page]
    return pdf


@pytest.mark.asyncio
async def test_txt_extraction_returns_triples(llm, mock_storage):
    mock_storage.download_bytes = AsyncMock(return_value=b"test content")
    req = TextExtractRequest(file_path="text/test.txt", file_name="test.txt")

    result = await extract_triples(req, llm, mock_storage)

    assert len(result.items) == 1
    item = result.items[0]
    assert item.subject == "变压器"
    assert item.predicate == "额定电压"
    assert item.object == "110kV"
    assert item.source_offset.start == 0
    assert item.source_offset.end == 12


@pytest.mark.asyncio
async def test_pdf_extraction(llm, mock_storage):
    mock_storage.download_bytes = AsyncMock(return_value=b"%PDF fake")

    with pytest.MonkeyPatch().context() as mp:
        # The service resolves pdfplumber.open at call time, so patching the
        # module attribute is sufficient — no importlib.reload needed.
        mp.setattr("pdfplumber.open", lambda f: _fake_pdf_handle("PDF content here"))
        req = TextExtractRequest(file_path="text/test.pdf", file_name="doc.pdf")
        result = await extract_triples(req, llm, mock_storage)

    assert len(result.items) == 1


@pytest.mark.asyncio
async def test_docx_extraction(llm, mock_storage):
    mock_storage.download_bytes = AsyncMock(return_value=b"PK fake docx bytes")

    with pytest.MonkeyPatch().context() as mp:
        para = MagicMock()
        para.text = "Word paragraph content"
        doc = MagicMock()
        doc.paragraphs = [para]
        # Same reasoning as the PDF test: docx.Document is looked up at call
        # time on the docx module, so an attribute patch is all that's needed.
        mp.setattr("docx.Document", lambda f: doc)
        req = TextExtractRequest(file_path="text/test.docx", file_name="doc.docx")
        result = await extract_triples(req, llm, mock_storage)

    assert len(result.items) == 1


@pytest.mark.asyncio
async def test_unsupported_format_raises_error(llm, mock_storage):
    mock_storage.download_bytes = AsyncMock(return_value=b"data")
    req = TextExtractRequest(file_path="text/test.xlsx", file_name="data.xlsx")

    with pytest.raises(UnsupportedFileTypeError):
        await extract_triples(req, llm, mock_storage)


@pytest.mark.asyncio
async def test_storage_error_propagates(llm, mock_storage):
    mock_storage.download_bytes = AsyncMock(side_effect=StorageError("not found"))
    req = TextExtractRequest(file_path="text/test.txt", file_name="test.txt")

    with pytest.raises(StorageError):
        await extract_triples(req, llm, mock_storage)


@pytest.mark.asyncio
async def test_llm_parse_error_propagates(mock_llm, mock_storage):
    mock_storage.download_bytes = AsyncMock(return_value=b"content")
    mock_llm.chat = AsyncMock(return_value="not json {{")
    req = TextExtractRequest(file_path="text/test.txt", file_name="test.txt")

    with pytest.raises(LLMParseError):
        await extract_triples(req, mock_llm, mock_storage)