"""Extract knowledge quadruples (with cropped evidence regions) from an image."""

import base64
import io

import cv2
import numpy as np

from app.clients.llm.base import LLMClient
from app.clients.storage.base import StorageClient
from app.core.config import get_config
from app.core.json_utils import extract_json
from app.core.logging import get_logger
from app.models.image_models import (
    BBox,
    ImageExtractRequest,
    ImageExtractResponse,
    QuadrupleItem,
)

logger = get_logger(__name__)

# Default instruction sent to the vision model when the request carries no
# custom prompt template.
# NOTE(review): the braces around "x, y, w, h" are doubled, but this module
# sends the string as-is (no ``.format()`` call here), so the model sees them
# literally. Confirm whether that is intended before changing the text.
_DEFAULT_PROMPT = (
    "请分析这张图片,提取其中的知识四元组,以 JSON 数组格式返回,每条包含字段:"
    "subject(主体实体)、predicate(关系/属性)、object(客体实体)、"
    "qualifier(修饰信息,可为 null)、bbox({{x, y, w, h}} 像素坐标)。"
)


def _sniff_image_mime(data: bytes) -> str:
    """Return the MIME type implied by the leading magic bytes of *data*.

    Falls back to ``image/jpeg`` when the signature is not recognised.
    """
    if data.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if data.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return "image/webp"
    if data.startswith(b"BM"):
        return "image/bmp"
    return "image/jpeg"


def _clamp_bbox(bbox: dict, img_w: int, img_h: int) -> tuple[int, int, int, int]:
    """Clamp a model-supplied bbox to the image and return ``(x, y, w, h)``.

    The origin is forced inside the image and each side is kept between one
    pixel and the space remaining to the right of / below the origin, so the
    resulting crop is never empty.
    """
    x = max(0, min(int(bbox["x"]), img_w - 1))
    y = max(0, min(int(bbox["y"]), img_h - 1))
    # x <= img_w - 1 and y <= img_h - 1, so the upper bounds below are >= 1.
    w = max(1, min(int(bbox["w"]), img_w - x))
    h = max(1, min(int(bbox["h"]), img_h - y))
    return x, y, w, h


async def extract_quads(
    req: ImageExtractRequest,
    llm: LLMClient,
    storage: StorageClient,
) -> ImageExtractResponse:
    """Extract quadruples from the image named by *req* and store their crops.

    The image is downloaded from the configured ``source_data`` bucket, sent
    to the vision model as a base64 data URL together with the prompt, and the
    model's reply is parsed with ``extract_json``. For every returned item the
    bbox is clamped to the image, the region is cropped and JPEG-encoded, and
    the crop is uploaded to ``crops/{task_id}/{idx}.jpg`` in the same bucket.

    Args:
        req: Request carrying ``file_path``, ``task_id`` and the optional
            ``model`` / ``prompt_template`` overrides.
        llm: Client used for the vision chat call.
        storage: Client used to download the image and upload the crops.

    Returns:
        An ``ImageExtractResponse`` with one ``QuadrupleItem`` per parsed item.

    Raises:
        ValueError: If the downloaded bytes cannot be decoded as an image.
        RuntimeError: If a crop cannot be JPEG-encoded.
        KeyError: If a parsed item lacks a required field.
    """
    cfg = get_config()
    bucket = cfg["storage"]["buckets"]["source_data"]
    model = req.model or cfg["models"]["default_vision"]

    image_bytes = await storage.download_bytes(bucket, req.file_path)

    # Decode with OpenCV for cropping; the original bytes go to the LLM.
    pixels = np.frombuffer(image_bytes, np.uint8)
    img = cv2.imdecode(pixels, cv2.IMREAD_COLOR)
    if img is None:
        # imdecode signals failure by returning None rather than raising.
        raise ValueError(f"cannot decode image: {req.file_path}")
    img_h, img_w = img.shape[:2]

    # Label the data URL with the real format instead of always claiming JPEG.
    mime = _sniff_image_mime(image_bytes)
    b64 = base64.b64encode(image_bytes).decode()
    image_data_url = f"data:{mime};base64,{b64}"

    prompt = req.prompt_template or _DEFAULT_PROMPT
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": image_data_url}},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    raw = await llm.chat_vision(model, messages)
    logger.info("image_extract", extra={"file": req.file_path, "model": model})

    items_raw = extract_json(raw)
    items: list[QuadrupleItem] = []

    for idx, item in enumerate(items_raw):
        x, y, w, h = _clamp_bbox(item["bbox"], img_w, img_h)

        crop = img[y : y + h, x : x + w]
        ok, crop_buf = cv2.imencode(".jpg", crop)
        if not ok:
            raise RuntimeError(
                f"failed to encode crop {idx} of {req.file_path} as JPEG"
            )

        crop_path = f"crops/{req.task_id}/{idx}.jpg"
        await storage.upload_bytes(
            bucket, crop_path, crop_buf.tobytes(), "image/jpeg"
        )

        items.append(
            QuadrupleItem(
                subject=item["subject"],
                predicate=item["predicate"],
                object=item["object"],
                qualifier=item.get("qualifier"),
                bbox=BBox(x=x, y=y, w=w, h=h),
                cropped_image_path=crop_path,
            )
        )

    return ImageExtractResponse(items=items)