feat(US1): text triple extraction — POST /api/v1/text/extract

- app/models/text_models.py: TripleItem, SourceOffset, TextExtract{Request,Response} - app/services/text_service.py: TXT/PDF/DOCX parsing + LLM call + JSON parse - app/routers/text.py: POST /text/extract handler with Depends injection - tests/test_text_service.py: 6 unit tests (formats, errors) - tests/test_text_router.py: 4 router tests (200, 400, 502×2) - 10/10 tests passing
2026-04-10 15:27:27 +08:00
parent e1eb5e47b1
commit dd8da386f4
18 changed files with 321 additions and 1 deletions
--- a/app/pycache/main.cpython-312.pyc
+++ b/app/pycache/main.cpython-312.pyc
--- a/app/models/pycache/init.cpython-312.pyc
+++ b/app/models/pycache/init.cpython-312.pyc
--- a/app/models/pycache/text_models.cpython-312.pyc
+++ b/app/models/pycache/text_models.cpython-312.pyc
--- a/app/models/text_models.py
+++ b/app/models/text_models.py
@@ -0,0 +1,25 @@
+from pydantic import BaseModel
+
+
+class SourceOffset(BaseModel):
+    start: int
+    end: int
+
+
+class TripleItem(BaseModel):
+    subject: str
+    predicate: str
+    object: str
+    source_snippet: str
+    source_offset: SourceOffset
+
+
+class TextExtractRequest(BaseModel):
+    file_path: str
+    file_name: str
+    model: str | None = None
+    prompt_template: str | None = None
+
+
+class TextExtractResponse(BaseModel):
+    items: list[TripleItem]
--- a/app/routers/pycache/init.cpython-312.pyc
+++ b/app/routers/pycache/init.cpython-312.pyc
--- a/app/routers/pycache/finetune.cpython-312.pyc
+++ b/app/routers/pycache/finetune.cpython-312.pyc
--- a/app/routers/pycache/image.cpython-312.pyc
+++ b/app/routers/pycache/image.cpython-312.pyc
--- a/app/routers/pycache/qa.cpython-312.pyc
+++ b/app/routers/pycache/qa.cpython-312.pyc
--- a/app/routers/pycache/text.cpython-312.pyc
+++ b/app/routers/pycache/text.cpython-312.pyc
--- a/app/routers/pycache/video.cpython-312.pyc
+++ b/app/routers/pycache/video.cpython-312.pyc
--- a/app/routers/text.py
+++ b/app/routers/text.py
@@ -1,3 +1,18 @@
-from fastapi import APIRouter
+from fastapi import APIRouter, Depends
+
+from app.clients.llm.base import LLMClient
+from app.clients.storage.base import StorageClient
+from app.core.dependencies import get_llm_client, get_storage_client
+from app.models.text_models import TextExtractRequest, TextExtractResponse
+from app.services import text_service

 router = APIRouter(tags=["Text"])
+
+
+@router.post("/text/extract", response_model=TextExtractResponse)
+async def extract_text(
+    req: TextExtractRequest,
+    llm: LLMClient = Depends(get_llm_client),
+    storage: StorageClient = Depends(get_storage_client),
+) -> TextExtractResponse:
+    return await text_service.extract_triples(req, llm, storage)
--- a/app/services/pycache/init.cpython-312.pyc
+++ b/app/services/pycache/init.cpython-312.pyc
--- a/app/services/pycache/text_service.cpython-312.pyc
+++ b/app/services/pycache/text_service.cpython-312.pyc
--- a/app/services/text_service.py
+++ b/app/services/text_service.py
@@ -0,0 +1,95 @@
+import io
+
+import pdfplumber
+import docx
+
+from app.clients.llm.base import LLMClient
+from app.clients.storage.base import StorageClient
+from app.core.config import get_config
+from app.core.exceptions import UnsupportedFileTypeError
+from app.core.json_utils import extract_json
+from app.core.logging import get_logger
+from app.models.text_models import (
+    SourceOffset,
+    TextExtractRequest,
+    TextExtractResponse,
+    TripleItem,
+)
+
+logger = get_logger(__name__)
+
+_SUPPORTED_EXTENSIONS = {".txt", ".pdf", ".docx"}
+
+_DEFAULT_PROMPT = (
+    "请从以下文本中提取知识三元组，以 JSON 数组格式返回，每条包含字段："
+    "subject（主语）、predicate（谓语）、object（宾语）、"
+    "source_snippet（原文证据片段）、source_offset（{{start, end}} 字符偏移）。\n\n"
+    "文本内容：\n{text}"
+)
+
+
+def _file_extension(file_name: str) -> str:
+    idx = file_name.rfind(".")
+    return file_name[idx:].lower() if idx != -1 else ""
+
+
+def _parse_txt(data: bytes) -> str:
+    return data.decode("utf-8", errors="replace")
+
+
+def _parse_pdf(data: bytes) -> str:
+    with pdfplumber.open(io.BytesIO(data)) as pdf:
+        pages = [page.extract_text() or "" for page in pdf.pages]
+    return "\n".join(pages)
+
+
+def _parse_docx(data: bytes) -> str:
+    doc = docx.Document(io.BytesIO(data))
+    return "\n".join(p.text for p in doc.paragraphs)
+
+
+async def extract_triples(
+    req: TextExtractRequest,
+    llm: LLMClient,
+    storage: StorageClient,
+) -> TextExtractResponse:
+    ext = _file_extension(req.file_name)
+    if ext not in _SUPPORTED_EXTENSIONS:
+        raise UnsupportedFileTypeError(f"不支持的文件格式: {ext}")
+
+    cfg = get_config()
+    bucket = cfg["storage"]["buckets"]["source_data"]
+    model = req.model or cfg["models"]["default_text"]
+
+    data = await storage.download_bytes(bucket, req.file_path)
+
+    if ext == ".txt":
+        text = _parse_txt(data)
+    elif ext == ".pdf":
+        text = _parse_pdf(data)
+    else:
+        text = _parse_docx(data)
+
+    prompt_template = req.prompt_template or _DEFAULT_PROMPT
+    prompt = prompt_template.format(text=text) if "{text}" in prompt_template else prompt_template + "\n\n" + text
+
+    messages = [{"role": "user", "content": prompt}]
+    raw = await llm.chat(model, messages)
+
+    logger.info("text_extract", extra={"file": req.file_name, "model": model})
+
+    items_raw = extract_json(raw)
+    items = [
+        TripleItem(
+            subject=item["subject"],
+            predicate=item["predicate"],
+            object=item["object"],
+            source_snippet=item["source_snippet"],
+            source_offset=SourceOffset(
+                start=item["source_offset"]["start"],
+                end=item["source_offset"]["end"],
+            ),
+        )
+        for item in items_raw
+    ]
+    return TextExtractResponse(items=items)
--- a/tests/pycache/test_text_router.cpython-312-pytest-9.0.3.pyc
+++ b/tests/pycache/test_text_router.cpython-312-pytest-9.0.3.pyc
--- a/tests/pycache/test_text_service.cpython-312-pytest-9.0.3.pyc
+++ b/tests/pycache/test_text_service.cpython-312-pytest-9.0.3.pyc
--- a/tests/test_text_router.py
+++ b/tests/test_text_router.py
@@ -0,0 +1,63 @@
+import pytest
+from unittest.mock import AsyncMock
+
+
+SAMPLE_TRIPLES_JSON = '''[
+  {
+    "subject": "变压器",
+    "predicate": "额定电压",
+    "object": "110kV",
+    "source_snippet": "该变压器额定电压为110kV",
+    "source_offset": {"start": 0, "end": 12}
+  }
+]'''
+
+
+def test_text_extract_returns_200(client, mock_llm, mock_storage):
+    mock_storage.download_bytes = AsyncMock(return_value=b"some text content")
+    mock_llm.chat = AsyncMock(return_value=SAMPLE_TRIPLES_JSON)
+
+    resp = client.post(
+        "/api/v1/text/extract",
+        json={"file_path": "text/test.txt", "file_name": "test.txt"},
+    )
+    assert resp.status_code == 200
+    data = resp.json()
+    assert "items" in data
+    assert data["items"][0]["subject"] == "变压器"
+    assert data["items"][0]["source_offset"]["start"] == 0
+
+
+def test_text_extract_unsupported_format_returns_400(client, mock_storage):
+    mock_storage.download_bytes = AsyncMock(return_value=b"data")
+
+    resp = client.post(
+        "/api/v1/text/extract",
+        json={"file_path": "text/test.xlsx", "file_name": "data.xlsx"},
+    )
+    assert resp.status_code == 400
+    assert resp.json()["code"] == "UNSUPPORTED_FILE_TYPE"
+
+
+def test_text_extract_storage_error_returns_502(client, mock_llm, mock_storage):
+    from app.core.exceptions import StorageError
+    mock_storage.download_bytes = AsyncMock(side_effect=StorageError("RustFS unreachable"))
+
+    resp = client.post(
+        "/api/v1/text/extract",
+        json={"file_path": "text/test.txt", "file_name": "test.txt"},
+    )
+    assert resp.status_code == 502
+    assert resp.json()["code"] == "STORAGE_ERROR"
+
+
+def test_text_extract_llm_parse_error_returns_502(client, mock_llm, mock_storage):
+    mock_storage.download_bytes = AsyncMock(return_value=b"content")
+    mock_llm.chat = AsyncMock(return_value="not json {{{{")
+
+    resp = client.post(
+        "/api/v1/text/extract",
+        json={"file_path": "text/test.txt", "file_name": "test.txt"},
+    )
+    assert resp.status_code == 502
+    assert resp.json()["code"] == "LLM_PARSE_ERROR"
--- a/tests/test_text_service.py
+++ b/tests/test_text_service.py
@@ -0,0 +1,122 @@
+import pytest
+from unittest.mock import AsyncMock, MagicMock
+
+from app.core.exceptions import LLMParseError, StorageError, UnsupportedFileTypeError
+from app.models.text_models import TextExtractRequest
+
+
+SAMPLE_TRIPLES_JSON = '''[
+  {
+    "subject": "变压器",
+    "predicate": "额定电压",
+    "object": "110kV",
+    "source_snippet": "该变压器额定电压为110kV",
+    "source_offset": {"start": 0, "end": 12}
+  }
+]'''
+
+
+@pytest.fixture
+def req_txt():
+    return TextExtractRequest(file_path="text/test.txt", file_name="test.txt")
+
+
+@pytest.fixture
+def req_pdf():
+    return TextExtractRequest(file_path="text/test.pdf", file_name="report.pdf")
+
+
+@pytest.fixture
+def req_docx():
+    return TextExtractRequest(file_path="text/test.docx", file_name="doc.docx")
+
+
+@pytest.fixture
+def llm(mock_llm):
+    mock_llm.chat = AsyncMock(return_value=SAMPLE_TRIPLES_JSON)
+    return mock_llm
+
+
+@pytest.mark.asyncio
+async def test_txt_extraction_returns_triples(llm, mock_storage):
+    mock_storage.download_bytes = AsyncMock(return_value=b"test content")
+    from app.services.text_service import extract_triples
+    req = TextExtractRequest(file_path="text/test.txt", file_name="test.txt")
+    result = await extract_triples(req, llm, mock_storage)
+    assert len(result.items) == 1
+    assert result.items[0].subject == "变压器"
+    assert result.items[0].predicate == "额定电压"
+    assert result.items[0].object == "110kV"
+    assert result.items[0].source_offset.start == 0
+    assert result.items[0].source_offset.end == 12
+
+
+@pytest.mark.asyncio
+async def test_pdf_extraction(llm, mock_storage, tmp_path):
+    import pdfplumber, io
+    # We mock download_bytes to return a minimal PDF-like response
+    # and mock pdfplumber.open to return pages with text
+    mock_storage.download_bytes = AsyncMock(return_value=b"%PDF fake")
+
+    with pytest.MonkeyPatch().context() as mp:
+        mock_page = MagicMock()
+        mock_page.extract_text.return_value = "PDF content here"
+        mock_pdf = MagicMock()
+        mock_pdf.__enter__ = lambda s: s
+        mock_pdf.__exit__ = MagicMock(return_value=False)
+        mock_pdf.pages = [mock_page]
+        mp.setattr("pdfplumber.open", lambda f: mock_pdf)
+
+        from app.services import text_service
+        import importlib
+        importlib.reload(text_service)
+        req = TextExtractRequest(file_path="text/test.pdf", file_name="doc.pdf")
+        result = await text_service.extract_triples(req, llm, mock_storage)
+    assert len(result.items) == 1
+
+
+@pytest.mark.asyncio
+async def test_docx_extraction(llm, mock_storage):
+    mock_storage.download_bytes = AsyncMock(return_value=b"PK fake docx bytes")
+
+    with pytest.MonkeyPatch().context() as mp:
+        mock_para = MagicMock()
+        mock_para.text = "Word paragraph content"
+        mock_doc = MagicMock()
+        mock_doc.paragraphs = [mock_para]
+        mp.setattr("docx.Document", lambda f: mock_doc)
+
+        from app.services import text_service
+        import importlib
+        importlib.reload(text_service)
+        req = TextExtractRequest(file_path="text/test.docx", file_name="doc.docx")
+        result = await text_service.extract_triples(req, llm, mock_storage)
+    assert len(result.items) == 1
+
+
+@pytest.mark.asyncio
+async def test_unsupported_format_raises_error(llm, mock_storage):
+    mock_storage.download_bytes = AsyncMock(return_value=b"data")
+    from app.services.text_service import extract_triples
+    req = TextExtractRequest(file_path="text/test.xlsx", file_name="data.xlsx")
+    with pytest.raises(UnsupportedFileTypeError):
+        await extract_triples(req, llm, mock_storage)
+
+
+@pytest.mark.asyncio
+async def test_storage_error_propagates(llm, mock_storage):
+    mock_storage.download_bytes = AsyncMock(side_effect=StorageError("not found"))
+    from app.services.text_service import extract_triples
+    req = TextExtractRequest(file_path="text/test.txt", file_name="test.txt")
+    with pytest.raises(StorageError):
+        await extract_triples(req, llm, mock_storage)
+
+
+@pytest.mark.asyncio
+async def test_llm_parse_error_propagates(mock_llm, mock_storage):
+    mock_storage.download_bytes = AsyncMock(return_value=b"content")
+    mock_llm.chat = AsyncMock(return_value="not json {{")
+    from app.services.text_service import extract_triples
+    req = TextExtractRequest(file_path="text/test.txt", file_name="test.txt")
+    with pytest.raises(LLMParseError):
+        await extract_triples(req, mock_llm, mock_storage)