# Response models for module-by-module summarization.
class ModuleSummaryResponse(BaseModel):
    """Response model for a single module summary.

    Holds the structured result of summarizing one participation module:
    a short prose overview plus a list of key points.
    """

    # Short prose overview of the module (the field description asks for
    # 2-3 sentences); required, no default.
    summary: str = Field(
        description="A 2-3 sentence overview of what happened in this module"
    )
    # Key points for the module; defaults to a fresh empty list per instance.
    bullets: List[str] = Field(
        default_factory=list,
        description="Key points about contributions, ideas, or outcomes",
    )
goals or themes that emerged from the participation", + ) + + class DocumentSummaryItem(BaseModel): """Response model for a single document summary with handle.""" diff --git a/apps/summarization/requests/__init__.py b/apps/summarization/requests/__init__.py new file mode 100644 index 00000000..f0b4d355 --- /dev/null +++ b/apps/summarization/requests/__init__.py @@ -0,0 +1,13 @@ +"""AI requests package.""" + +from .base import AIRequest +from .document import DocumentRequest +from .document import MultimodalSummaryRequest +from .project import SummaryRequest + +__all__ = [ + "AIRequest", + "SummaryRequest", + "MultimodalSummaryRequest", + "DocumentRequest", +] diff --git a/apps/summarization/requests/base.py b/apps/summarization/requests/base.py new file mode 100644 index 00000000..0c226ea7 --- /dev/null +++ b/apps/summarization/requests/base.py @@ -0,0 +1,15 @@ +"""Base classes for AI requests.""" + +from abc import ABC +from abc import abstractmethod + + +class AIRequest(ABC): + """Base class for all AI requests.""" + + vision_support = False + + @abstractmethod + def prompt(self) -> str: + """Return the prompt text for this request.""" + pass diff --git a/apps/summarization/requests/document.py b/apps/summarization/requests/document.py new file mode 100644 index 00000000..e8f387a5 --- /dev/null +++ b/apps/summarization/requests/document.py @@ -0,0 +1,37 @@ +"""Document and image processing requests.""" + +from .base import AIRequest + + +class MultimodalSummaryRequest(AIRequest): + """Request model for multimodal document summarization.""" + + vision_support = True + PROMPT = "Summarize this image/document. Return JSON with summary field." 
+ + def __init__( + self, image_urls: list[str], text: str | None = None, prompt: str | None = None + ): + super().__init__() + self.image_urls = image_urls + self.prompt_text = prompt or self.PROMPT + self.text = text + + def prompt(self) -> str: + base = self.prompt_text + return f"{base}\n\nText:\n{self.text}" if self.text else base + + +class DocumentRequest(AIRequest): + """Request model for document summarization.""" + + vision_support = True + PROMPT = "Summarize this document. Return JSON with summary field." + + def __init__(self, url: str, prompt: str | None = None): + super().__init__() + self.image_urls = [url] + self.prompt_text = prompt or self.PROMPT + + def prompt(self) -> str: + return self.prompt_text diff --git a/apps/summarization/requests/project.py b/apps/summarization/requests/project.py new file mode 100644 index 00000000..6fda2987 --- /dev/null +++ b/apps/summarization/requests/project.py @@ -0,0 +1,121 @@ +"""Project-specific AI requests.""" + +import json +from typing import Any +from typing import Dict +from typing import List + +from .base import AIRequest + + +class ModuleSummaryRequest(AIRequest): + """Request for summarizing a single module.""" + + PROMPT_TEMPLATE = """ + Summarize this participation module: + + Module: {module_name} + Phase: {phase} + Description: {description} + Data: {content} + + Return ONLY valid JSON with EXACTLY this format: + {{ + "summary": "A 2-3 sentence overview of what happened in this module", + "bullets": [ + "Key point 1 about specific contributions", + "Key point 2 about ideas or proposals", + "Key point 3 about engagement or outcomes" + ] + }} + + The response MUST include BOTH "summary" and "bullets" fields. + "bullets" MUST be an array of strings, never empty. 
+ """ + + def __init__(self, module_data: Dict[str, Any], phase: str): + super().__init__() + self.module_data = module_data + self.phase = phase + + def prompt(self) -> str: + """Generate the prompt for this module.""" + return self.PROMPT_TEMPLATE.format( + module_name=self.module_data.get("module_name", "Unknown"), + phase=self.phase, + description=self.module_data.get("description", "No description"), + content=json.dumps(self.module_data.get("content", {})), + ) + + +class GeneralInfoRequest(AIRequest): + """Request for project-level summary.""" + + PROMPT_TEMPLATE = """ + Summarize this entire project: + + Project: {project_name} + Description: {description} + Module Summaries: {module_summaries} + + Return ONLY valid JSON with EXACTLY this format: + {{ + "summary": "A 3-4 sentence overview of the entire project's participation", + "goals": [ + "First main goal or theme", + "Second main goal or theme", + "Third main goal or theme" + ] + }} + + The response MUST include BOTH "summary" and "goals" fields. + "goals" MUST be an array of strings, at least 2-3 items. + """ + + def __init__( + self, project_data: Dict[str, Any], module_summaries: List[Dict[str, Any]] + ): + super().__init__() + self.project_data = project_data + self.module_summaries = module_summaries + + def prompt(self) -> str: + """Generate the prompt for project summary.""" + project = self.project_data.get("project", {}) + return self.PROMPT_TEMPLATE.format( + project_name=project.get("name", "Unknown"), + description=project.get("information", "No description"), + module_summaries=json.dumps(self.module_summaries), + ) + + +class SummaryRequest(AIRequest): + """Legacy request model for text summarization.""" + + DEFAULT_PROMPT = """ + You are a JSON generator. Return ONLY valid JSON. 
+ + Schema: + { + "title": "Summary of participation", + "general_info": {"summary": "string", "goals": ["string"]}, + "phases": { + "past": {"modules": [{"module_id": "number", "module_name": "string", "status": "past", "final": {"summary": "string", "bullets": ["string"]}}]}, + "current": {"modules": [{"module_id": "number", "module_name": "string", "status": "current", "final": {"summary": "string", "bullets": ["string"]}}]}, + "upcoming": {"modules": [{"module_id": "number", "module_name": "string", "status": "upcoming", "final": {"summary": "string", "bullets": ["string"]}}]} + } + } + + NOTE: module_id in the output should match the given module_id of the input for each module + + Extract real data from the project export. + Respond with ONLY the JSON object. + """ + + def __init__(self, text: str, prompt: str | None = None): + super().__init__() + self.text = text + self.prompt_text = prompt or self.DEFAULT_PROMPT + + def prompt(self) -> str: + return f"{self.prompt_text}\n\n{self.text}" diff --git a/apps/summarization/services.py b/apps/summarization/services.py deleted file mode 100644 index a8a5c8ab..00000000 --- a/apps/summarization/services.py +++ /dev/null @@ -1,335 +0,0 @@ -"""Service for text summarization using AI providers.""" - -import json -import logging -from datetime import timedelta - -from django.conf import settings -from django.utils import timezone -from pydantic import BaseModel -from sentry_sdk import capture_exception - -from .models import ProjectSummary -from .providers import AIProvider -from .providers import AIRequest -from .providers import ProviderConfig -from .pydantic_models import DocumentInputItem -from .pydantic_models import DocumentSummaryItem -from .pydantic_models import DocumentSummaryResponse -from .pydantic_models import ProjectSummaryResponse -from .pydantic_models import SummaryItem -from .utils import extract_text_from_document - -logger = logging.getLogger(__name__) - - -# Rate limits (can be moved to settings) 
-PROJECT_SUMMARY_RATE_LIMIT_MINUTES = getattr( - settings, "PROJECT_SUMMARY_RATE_LIMIT_MINUTES", 5 -) -SUMMARY_GLOBAL_LIMIT_PER_HOUR = getattr(settings, "SUMMARY_GLOBAL_LIMIT_PER_HOUR", 100) - - -class AIService: - """Service for summarizing text using configured AI provider.""" - - def __init__( - self, provider_handle: str = None, document_provider_handle: str = None - ): - """Initialize AI service with providers.""" - self.provider = self._init_provider(provider_handle, "AI_PROVIDER") - self.document_provider = self._init_provider( - document_provider_handle, "AI_DOCUMENT_PROVIDER" - ) - - def _init_provider(self, handle: str | None, settings_key: str) -> AIProvider: - """Initialize a provider from handle or settings.""" - if not handle: - handle = getattr(settings, settings_key, None) - if not handle: - raise ValueError( - f"No provider configured. Pass {settings_key.lower()} or set {settings_key} in settings." - ) - return AIProvider(ProviderConfig.from_handle(handle)) - - def summarize( - self, - text: str, - prompt: str | None = None, - result_type: type[BaseModel] = SummaryItem, - ) -> BaseModel: - """Summarize text.""" - request = SummaryRequest(text=text, prompt=prompt) - return self.provider.request(request, result_type=result_type) - - def project_summarize( - self, - project, - text: str, - prompt: str | None = None, - result_type: type[BaseModel] = ProjectSummaryResponse, - is_rate_limit: bool = True, - allow_regeneration: bool = True, - ) -> BaseModel: - """Summarize project data with caching. - - - Exact hash match: reuse cached summary and update last_checked_at. - - Rate limits (if enabled): optionally reuse latest summary without touching last_checked_at. - - If allow_regeneration is False and a summary exists, always return the latest summary - without generating a new one, even when the hash changed. 
- """ - request = SummaryRequest(text=text, prompt=prompt) - latest = self._get_latest_summary(project) - text_hash = ProjectSummary.compute_hash(text) - - cached = self._get_cached_response( - project=project, - text_hash=text_hash, - latest=latest, - is_rate_limit=is_rate_limit, - ) - if cached: - return cached - - # No cache hit: - # - If regeneration is not allowed and we already have a summary, - # return the latest one unchanged (used by the button endpoint). - if not allow_regeneration and latest: - logger.debug( - "Regeneration disabled and no cache hit; returning latest summary " - f"for project {project.id}" - ) - return ProjectSummaryResponse(**latest.response_data) - - # If there is no existing summary, we must generate one at least once. - logger.info(f"Generating summary for project {project.id} ({project.slug})") - - try: - response = self.provider.request(request, result_type=result_type) - self._save_to_cache(project, request.prompt_text, text_hash, response) - return response - except Exception as e: - logger.error(f"Summary generation failed: {e}", exc_info=True) - capture_exception(e) - - raise - - def _get_latest_summary(self, project): - """Get most recent summary for project.""" - return ( - ProjectSummary.objects.filter(project=project) - .order_by("-created_at") - .first() - ) - - def _get_cached_response( - self, - project, - text_hash: str, - latest, - is_rate_limit: bool, - ) -> ProjectSummaryResponse | None: - """Return cached response if valid. - - - Exact hash match: always used, updates last_checked_at. - - Rate limits: optionally reuse latest summary without touching last_checked_at. - """ - if not latest: - return None - - # Exact match of input hash: content is up to date. 
- if latest.input_text_hash == text_hash: - logger.debug(f"Cache hit (exact match) for project {project.id}") - latest.last_checked_at = timezone.now() - latest.save(update_fields=["last_checked_at"]) - return ProjectSummaryResponse(**latest.response_data) - - if not is_rate_limit: - return None - - # Rate limit checks (reuse latest summary without updating last_checked_at) - age = timezone.now() - latest.created_at - - if age < timedelta(minutes=PROJECT_SUMMARY_RATE_LIMIT_MINUTES): - logger.debug(f"Using rate-limited cache for project {project.id}") - return ProjectSummaryResponse(**latest.response_data) - - if age < timedelta(hours=1): - recent = ProjectSummary.objects.filter( - created_at__gte=timezone.now() - timedelta(hours=1) - ).count() - if recent >= SUMMARY_GLOBAL_LIMIT_PER_HOUR: - logger.debug( - f"Global rate limit reached, using cache for project {project.id}" - ) - return ProjectSummaryResponse(**latest.response_data) - - return None - - def _save_to_cache( - self, - project, - prompt: str, - text_hash: str, - response: BaseModel, - ): - """Save successful response to cache.""" - if isinstance(response, ProjectSummaryResponse): - ProjectSummary.objects.create( - project=project, - prompt=prompt, - input_text_hash=text_hash, - response_data=json.loads(response.model_dump_json()), - last_checked_at=timezone.now(), - ) - logger.info(f"Cached summary for project {project.id}") - - def request_vision_dict( - self, documents_dict: dict[str, str], prompt: str | None = None - ) -> DocumentSummaryResponse: - """Process documents from dictionary format.""" - items = [DocumentInputItem(handle=h, url=u) for h, u in documents_dict.items()] - return self.request_vision(items, prompt) - - def request_vision( - self, documents: list[DocumentInputItem], prompt: str | None = None - ) -> DocumentSummaryResponse: - """Process documents and images, return combined summaries.""" - docs, images = self._split_documents(documents) - - results = [] - if docs[0]: - 
results.extend(self._process_documents(docs)) - if images[0]: - results.extend(self._process_images(images, prompt)) - - return DocumentSummaryResponse(documents=results) - - def _split_documents(self, documents): - """Split into regular docs and images.""" - docs_urls, docs_handles = [], [] - img_urls, img_handles = [], [] - - for doc in documents: - if ( - not self.document_provider.config.supports_documents - and doc.is_document() - ): - docs_urls.append(doc.url) - docs_handles.append(doc.handle) - else: - img_urls.append(doc.url) - img_handles.append(doc.handle) - - return (docs_urls, docs_handles), (img_urls, img_handles) - - def _process_documents(self, docs_data): - """Extract text from PDFs/DOCX files.""" - urls, handles = docs_data - results = [] - - for url, handle in zip(urls, handles): - try: - text = extract_text_from_document(url) - results.append(DocumentSummaryItem(handle=handle, summary=text)) - except Exception as e: - logger.error( - f"Failed to extract text from {handle}: {e}", exc_info=True - ) - capture_exception(e) - - return results - - def _process_images(self, images_data, prompt): - """Process images with vision API.""" - urls, handles = images_data - - if not prompt: - prompt = ( - f"Summarize each image separately. Handles in order: {handles}. " - f"Return list of summaries with handles." - ) - - request = MultimodalSummaryRequest(image_urls=urls, prompt=prompt) - response = self.document_provider.request(request, DocumentSummaryResponse) - return response.documents - - -class SummaryRequest(AIRequest): - """Request model for text summarization.""" - - DEFAULT_PROMPT = """ -You are a JSON generator. Return ONLY valid JSON. 
- -Schema: -{ - "title": "Summary of participation", - "general_info": {"summary": "string", "goals": ["string"]}, - "phases": { - "past": {"modules": [{"module_id": "number", "module_name": "string", "status": "past", "final": {"summary": "string", "bullets": ["string"]}, "debug": {...}}]}, - "current": {"modules": [{"module_id": "number", "module_name": "string", "status": "current", "final": {"summary": "string", "bullets": ["string"]}, "debug": {...}}]}, - "upcoming": {"modules": [{"module_id": "number", "module_name": "string", "status": "upcoming", "final": {"summary": "string", "bullets": ["string"]}, "debug": {...}}]} - } -} - -NOTE: module_id in the output should match the given module_id of the input for each module - -Each module MUST include a 'debug' object with: -- module_type: string -- signals_snapshot: list of strings -- draft_before_qa: string -- claims: list of {claim_text, evidence_type(from_votes|from_ratings|from_open_answers|from_comments|from_base_text|uncertain), action(keep|soften|remove), fix_hint} -- quantifier_fixes: list of {original_phrase, replacement, reason} -- anchors: list of strings -- coverage_gaps: list of strings -- coverage_patch: optional string -- patches: list of {patch_type(REPLACE|REMOVE|ADD_SENTENCE), target, replacement} -- after_qa: string -- diff_summary: optional string -- qa_status: PASS|FAIL - -Extract real data from the project export. -Respond with ONLY the JSON object. -""" - - def __init__(self, text: str, prompt: str | None = None): - super().__init__() - self.text = text - self.prompt_text = prompt or self.DEFAULT_PROMPT - - def prompt(self) -> str: - return f"{self.prompt_text}\n\n{self.text}" - - -class MultimodalSummaryRequest(AIRequest): - """Request model for multimodal document summarization.""" - - vision_support = True - PROMPT = "Summarize this image/document. Return JSON with summary field." 
logger = logging.getLogger(__name__)


class AIServiceBase:
    """Base class for AI services with provider initialization."""

    def __init__(
        self, provider_handle: str | None = None, settings_key: str | None = None
    ):
        """Initialize AI service with provider.

        Args:
            provider_handle: Explicit provider handle; takes precedence.
            settings_key: Name of the Django setting consulted when no
                handle is passed.

        Raises:
            ValueError: If no handle can be determined.
        """
        self.provider = self._init_provider(provider_handle, settings_key)

    def _init_provider(
        self, handle: str | None, settings_key: str | None
    ) -> AIProvider:
        """Initialize a provider from handle or settings.

        Raises:
            ValueError: If neither ``handle`` nor the named setting yields a
                provider handle.
        """
        # settings_key defaults to None in __init__; getattr() with a None
        # attribute name raises TypeError, so only consult settings when a
        # key was actually given.
        if not handle and settings_key:
            handle = getattr(settings, settings_key, None)
        if not handle:
            if settings_key:
                raise ValueError(
                    f"No provider configured. Pass {settings_key.lower()} or set {settings_key} in settings."
                )
            raise ValueError(
                "No provider configured. Pass provider_handle or a settings_key."
            )
        return AIProvider(ProviderConfig.from_handle(handle))

    def summarize(
        self,
        text: str,
        prompt: str | None = None,
        result_type: type[BaseModel] | None = None,
    ) -> BaseModel:
        """Basic text summarization.

        Args:
            text: Text to summarize.
            prompt: Optional prompt override for the request.
            result_type: Model class the provider response is requested as.
                Defaults to ``SummaryItem``.

        Returns:
            Whatever the provider returns for ``result_type``.
        """
        if result_type is None:
            # Imported locally so this fix needs no new module-level import.
            # SummaryItem restores the default of the pre-split service; a
            # bare BaseModel declares no fields to hold the result.
            from ..pydantic_models import SummaryItem

            result_type = SummaryItem
        request = SummaryRequest(text=text, prompt=prompt)
        return self.provider.request(request, result_type=result_type)


class SummaryCache:
    """Handles caching of project summaries.

    Summaries are stored as ``ProjectSummary`` rows; the most recent row of
    a project acts as its cache entry.
    """

    def __init__(self, rate_limit_minutes: int, global_limit_per_hour: int):
        """Store the per-project and global rate limits.

        Args:
            rate_limit_minutes: Age in minutes below which the latest
                summary of a project is reused even if the input changed.
            global_limit_per_hour: Number of summaries created within the
                last hour at which the global limit applies.
        """
        self.rate_limit_minutes = rate_limit_minutes
        self.global_limit_per_hour = global_limit_per_hour

    def get_latest(self, project):
        """Get most recent summary for project (``None`` if there is none)."""
        return (
            ProjectSummary.objects.filter(project=project)
            .order_by("-created_at")
            .first()
        )

    def get_cached_response(
        self,
        project,
        text_hash: str,
        latest,
        is_rate_limit: bool,
    ) -> ProjectSummaryResponse | None:
        """Return cached response if valid.

        - Exact hash match: always used, updates last_checked_at.
        - Rate limits: optionally reuse latest summary without touching last_checked_at.

        Args:
            project: Project the summary belongs to (used for logging).
            text_hash: Hash of the current input text.
            latest: Most recent ``ProjectSummary`` of the project, or None.
            is_rate_limit: Whether the rate-limit reuse rules apply.

        Returns:
            The reusable response, or ``None`` when a new summary is needed.
        """
        if not latest:
            return None

        # Exact match of input hash: content is up to date.
        if latest.input_text_hash == text_hash:
            logger.debug(f"Cache hit (exact match) for project {project.id}")
            latest.last_checked_at = timezone.now()
            latest.save(update_fields=["last_checked_at"])
            return ProjectSummaryResponse(**latest.response_data)

        if not is_rate_limit:
            return None

        # Rate limit checks (reuse latest summary without updating
        # last_checked_at). One timestamp serves both the age and the
        # window start.
        now = timezone.now()
        age = now - latest.created_at

        if age < timedelta(minutes=self.rate_limit_minutes):
            logger.debug(f"Using rate-limited cache for project {project.id}")
            return ProjectSummaryResponse(**latest.response_data)

        # The global limit is only evaluated while the latest summary is
        # younger than one hour; older summaries are always regenerated.
        if age < timedelta(hours=1):
            recent = ProjectSummary.objects.filter(
                created_at__gte=now - timedelta(hours=1)
            ).count()
            if recent >= self.global_limit_per_hour:
                logger.debug(
                    f"Global rate limit reached, using cache for project {project.id}"
                )
                return ProjectSummaryResponse(**latest.response_data)

        return None

    def save(
        self,
        project,
        prompt: str,
        text_hash: str,
        response: BaseModel,
    ):
        """Save successful response to cache.

        Only ``ProjectSummaryResponse`` instances are stored; any other
        response type is skipped.
        """
        if not isinstance(response, ProjectSummaryResponse):
            # Previously a silent no-op; leave a trace for debugging.
            logger.debug(
                f"Not caching response of type {type(response).__name__} "
                f"for project {project.id}"
            )
            return

        ProjectSummary.objects.create(
            project=project,
            prompt=prompt,
            input_text_hash=text_hash,
            response_data=json.loads(response.model_dump_json()),
            last_checked_at=timezone.now(),
        )
        logger.info(f"Cached summary for project {project.id}")
AIServiceBase + +logger = logging.getLogger(__name__) + + +class DocumentProcessor(AIServiceBase): + """Service for processing documents and images.""" + + def __init__(self, provider_handle: str = None): + """Initialize document processor with provider.""" + super().__init__(provider_handle, "AI_DOCUMENT_PROVIDER") + + def request_vision_dict( + self, documents_dict: dict[str, str], prompt: str | None = None + ) -> DocumentSummaryResponse: + """Process documents from dictionary format.""" + items = [DocumentInputItem(handle=h, url=u) for h, u in documents_dict.items()] + return self.request_vision(items, prompt) + + def request_vision( + self, documents: list[DocumentInputItem], prompt: str | None = None + ) -> DocumentSummaryResponse: + """Process documents and images, return combined summaries.""" + docs, images = self._split_documents(documents) + + results = [] + if docs[0]: + results.extend(self._process_documents(docs)) + if images[0]: + results.extend(self._process_images(images, prompt)) + + return DocumentSummaryResponse(documents=results) + + def _split_documents(self, documents): + """Split into regular docs and images.""" + docs_urls, docs_handles = [], [] + img_urls, img_handles = [], [] + + for doc in documents: + if not self.provider.config.supports_documents and doc.is_document(): + docs_urls.append(doc.url) + docs_handles.append(doc.handle) + else: + img_urls.append(doc.url) + img_handles.append(doc.handle) + + return (docs_urls, docs_handles), (img_urls, img_handles) + + def _process_documents(self, docs_data): + """Extract text from PDFs/DOCX files.""" + urls, handles = docs_data + results = [] + + for url, handle in zip(urls, handles): + try: + text = extract_text_from_document(url) + results.append(DocumentSummaryItem(handle=handle, summary=text)) + except Exception as e: + logger.error( + f"Failed to extract text from {handle}: {e}", exc_info=True + ) + capture_exception(e) + + return results + + def _process_images(self, images_data, 
prompt): + """Process images with vision API.""" + urls, handles = images_data + + if not prompt: + prompt = ( + f"Summarize each image separately. Handles in order: {handles}. " + f"Return list of summaries with handles." + ) + + request = MultimodalSummaryRequest(image_urls=urls, prompt=prompt) + response = self.provider.request(request, DocumentSummaryResponse) + return response.documents diff --git a/apps/summarization/services/project.py b/apps/summarization/services/project.py new file mode 100644 index 00000000..700e0548 --- /dev/null +++ b/apps/summarization/services/project.py @@ -0,0 +1,228 @@ +"""Project summarization service.""" + +import json +import logging +from typing import Any +from typing import Dict +from typing import List + +from django.conf import settings +from sentry_sdk import capture_exception + +from ..models import ProjectSummary +from ..pydantic_models import GeneralInfo +from ..pydantic_models import GeneralInfoResponse +from ..pydantic_models import ModuleDebug +from ..pydantic_models import ModuleFinal +from ..pydantic_models import ModuleSummaryResponse +from ..pydantic_models import PhaseModule +from ..pydantic_models import Phases +from ..pydantic_models import PhaseSection +from ..pydantic_models import ProjectSummaryResponse +from ..requests.project import GeneralInfoRequest +from ..requests.project import ModuleSummaryRequest +from ..requests.project import SummaryRequest +from .base import AIServiceBase +from .cache import SummaryCache + +logger = logging.getLogger(__name__) + + +# Rate limits (can be moved to settings) +PROJECT_SUMMARY_RATE_LIMIT_MINUTES = getattr( + settings, "PROJECT_SUMMARY_RATE_LIMIT_MINUTES", 5 +) +SUMMARY_GLOBAL_LIMIT_PER_HOUR = getattr(settings, "SUMMARY_GLOBAL_LIMIT_PER_HOUR", 100) + + +class ProjectSummarizer(AIServiceBase): + """Service for summarizing projects module by module.""" + + def __init__(self, provider_handle: str = None): + """Initialize project summarizer with provider.""" + 
super().__init__(provider_handle, "AI_PROVIDER") + self.cache = SummaryCache( + rate_limit_minutes=PROJECT_SUMMARY_RATE_LIMIT_MINUTES, + global_limit_per_hour=SUMMARY_GLOBAL_LIMIT_PER_HOUR, + ) + + def _process_module(self, module: Dict[str, Any], phase: str) -> PhaseModule: + """Process a single module and return its summary.""" + try: + request = ModuleSummaryRequest(module, phase) + response = self.provider.request(request, result_type=ModuleSummaryResponse) + + # Create debug info if present in module data + debug = None + if "debug" in module: + debug = ModuleDebug(**module["debug"]) + + return PhaseModule( + module_id=module.get("module_id"), + module_name=module.get("module_name", "Unknown"), + status=phase, # 'past', 'current', 'upcoming' + debug=debug, + final=ModuleFinal(summary=response.summary, bullets=response.bullets), + ) + + except Exception as e: + logger.error(f"Failed to summarize module {module.get('module_id')}: {e}") + capture_exception(e) + + # Return fallback module + return PhaseModule( + module_id=module.get("module_id", 0), + module_name=module.get("module_name", "Unknown"), + status=phase, + debug=None, + final=ModuleFinal( + summary="Summary temporarily unavailable", + bullets=["Could not generate summary for this module"], + ), + ) + + def _process_phase( + self, phase_data: Dict[str, Any], phase_name: str + ) -> List[PhaseModule]: + """Process all modules in a phase.""" + modules = phase_data.get("modules", []) + return [self._process_module(module, phase_name) for module in modules] + + def project_summarize( + self, + project, # Django project model instance + text: str, # JSON string containing project data + prompt: str | None = None, + result_type: type[ProjectSummaryResponse] = ProjectSummaryResponse, + is_rate_limit: bool = True, + allow_regeneration: bool = True, + ) -> ProjectSummaryResponse: + """Summarize project data module by module. 
+ + Args: + project: Django project model instance (for caching) + text: JSON string containing the full project export data + prompt: Optional custom prompt + result_type: Expected response type + is_rate_limit: Whether to apply rate limiting + allow_regeneration: Whether to generate new summary if cache miss + """ + # Parse the JSON string into a dictionary + try: + project_data = json.loads(text) + except json.JSONDecodeError as e: + logger.error(f"Failed to parse project data JSON: {e}") + # Fall back to legacy method if JSON parsing fails + return self._legacy_summarize( + project, text, prompt, result_type, is_rate_limit, allow_regeneration + ) + + # Check cache using the text parameter (string version for hashing) + text_hash = ProjectSummary.compute_hash(text) + latest = self.cache.get_latest(project) + + cached = self.cache.get_cached_response( + project=project, + text_hash=text_hash, + latest=latest, + is_rate_limit=is_rate_limit, + ) + if cached: + return cached + + if not allow_regeneration and latest: + logger.debug( + f"Regeneration disabled, returning cached summary for project {project.id}" + ) + return ProjectSummaryResponse(**latest.response_data) + + logger.info( + f"Generating module-by-module summary for project {project.id} ({project.slug})" + ) + + try: + # Process each phase + phases_data = project_data.get("phases", {}) + + past_modules = self._process_phase(phases_data.get("past", {}), "past") + current_modules = self._process_phase( + phases_data.get("current", {}), "current" + ) + upcoming_modules = self._process_phase( + phases_data.get("upcoming", {}), "upcoming" + ) + + all_modules = past_modules + current_modules + upcoming_modules + + # Get general project summary + general_request = GeneralInfoRequest( + project_data, [m.dict() for m in all_modules] + ) # Convert to dict for the request + general_response = self.provider.request( + general_request, result_type=GeneralInfoResponse + ) + + # Build complete response + response = 
ProjectSummaryResponse( + title=f"Summary of {project_data.get('project', {}).get('name', 'participation')}", + general_info=GeneralInfo( + summary=general_response.summary, goals=general_response.goals + ), + phases=Phases( + past=PhaseSection(modules=past_modules), + current=PhaseSection(modules=current_modules), + upcoming=PhaseSection(modules=upcoming_modules), + ), + ) + + # Cache the result + self.cache.save(project, prompt or "module-by-module", text_hash, response) + + return response + + except Exception as e: + logger.error(f"Summary generation failed: {e}", exc_info=True) + capture_exception(e) + raise + + def _legacy_summarize( + self, + project, + text: str, + prompt: str | None = None, + result_type: type[ProjectSummaryResponse] = ProjectSummaryResponse, + is_rate_limit: bool = True, + allow_regeneration: bool = True, + ) -> ProjectSummaryResponse: + """Fallback to original summarization method.""" + request = SummaryRequest(text=text, prompt=prompt) + latest = self.cache.get_latest(project) + text_hash = ProjectSummary.compute_hash(text) + + cached = self.cache.get_cached_response( + project=project, + text_hash=text_hash, + latest=latest, + is_rate_limit=is_rate_limit, + ) + if cached: + return cached + + if not allow_regeneration and latest: + logger.debug( + f"Regeneration disabled, returning cached summary for project {project.id}" + ) + return ProjectSummaryResponse(**latest.response_data) + + logger.info( + f"Generating legacy summary for project {project.id} ({project.slug})" + ) + + try: + response = self.provider.request(request, result_type=result_type) + self.cache.save(project, request.prompt_text, text_hash, response) + return response + except Exception as e: + logger.error(f"Summary generation failed: {e}", exc_info=True) + capture_exception(e) + raise diff --git a/apps/summarization/test_summarization.py b/apps/summarization/test_summarization.py index 294d50d4..46d4e704 100644 --- a/apps/summarization/test_summarization.py +++ 
b/apps/summarization/test_summarization.py @@ -28,7 +28,7 @@ from adhocracy4.projects.models import Project from apps.summarization.pydantic_models import SummaryResponse -from apps.summarization.services import AIService +from apps.summarization.services import ProjectSummarizer # Long example text for testing LONG_TEXT = ( @@ -255,7 +255,7 @@ def test_summarization(provider_handle: str = None): print_separator() try: - service = AIService(provider_handle=provider_handle) + service = ProjectSummarizer(provider_handle=provider_handle) _print_service_info(service) print("ORIGINAL TEXT:") diff --git a/apps/summarization/views.py b/apps/summarization/views.py index 510a45d3..0ddc3d0a 100644 --- a/apps/summarization/views.py +++ b/apps/summarization/views.py @@ -12,8 +12,9 @@ from .export_utils.core import generate_full_export from .pydantic_models import DocumentInputItem from .pydantic_models import ProjectSummaryResponse -from .services import AIService -from .services import SummaryRequest +from .requests import SummaryRequest +from .services import DocumentProcessor +from .services import ProjectSummarizer def _get_projects_queryset(): @@ -54,7 +55,7 @@ def _handle_text_request( ) -> tuple[ProjectSummaryResponse | None, int, str | None]: """Handle text-only summarization.""" try: - service = AIService(provider_handle=provider_handle) + service = ProjectSummarizer(provider_handle=provider_handle) response = service.summarize( text=text, prompt=prompt if prompt else None, @@ -204,7 +205,7 @@ def post(self, request): raise ValueError("Documents must be a dict or list") # Process documents - service = AIService(document_provider_handle=provider_handle) + service = DocumentProcessor(document_provider_handle=provider_handle) response = service.request_vision( documents=documents, prompt=prompt if prompt else None,