avidml · shubhobm · Feb 19, 2026 · Dec 25, 2025 · Dec 26, 2025 · Jan 4, 2026
diff --git a/avidtools/connectors/url.py b/avidtools/connectors/url.py
@@ -8,7 +8,7 @@
 from typing import Optional
 import requests
 from bs4 import BeautifulSoup
-from openai import OpenAI
+from openai import OpenAI, AsyncOpenAI
 
 from ..datamodels.report import Report, ReportMetadata
 from ..datamodels.components import (
@@ -50,6 +50,7 @@ def __init__(self, api_key: Optional[str] = None, model: str = "gpt-4o-mini"):
             )
         self.model = model
         self.client = OpenAI(api_key=self.api_key)
+        self.async_client = AsyncOpenAI(api_key=self.api_key)
 
     def scrape_url(self, url: str) -> dict:
         """
@@ -114,19 +115,79 @@ def _create_ai_prompt(self, scraped_data: dict) -> str:
         """
         prompt = f"""You are an AI security expert tasked with analyzing web content about AI/ML vulnerabilities, incidents, or security issues and extracting structured information to create an AVID (AI Vulnerability Database) report.
 
-The AVID report structure includes:
-- **report_id**: A unique identifier (generate one like "AVID-YYYY-R-XXXX")
-- **affects**: Information about affected artifacts including:
-  - developer: List of developers/organizations
-  - deployer: List of deployers/organizations
-  - artifacts: List of artifacts with type (MUST be one of: "Model", "Dataset", "System") and name
-- **problemtype**: Problem description with:
-  - classof: Class (MUST be one of: "AIID Incident", "ATLAS Case Study", "CVE Entry", "LLM Evaluation", "Third-party Report", "Undefined"). Default to "Third-party Report" if unsure.
-  - type: Type (MUST be one of: "Issue", "Advisory", "Measurement", "Detection")
-  - description: language ("eng") and value (description text)
-- **description**: High-level description with lang and value
-- **references**: List of references with label and url (MUST include the source URL)
-- **reported_date**: Date in YYYY-MM-DD format (use today's date if not specified)
+The AVID Report follows this exact schema:
+
+```json
+{{
+  "data_type": "AVID",
+  "data_version": "string (optional)",
+  "metadata": {{
+    "report_id": "string (required, format: AVID-YYYY-R-XXXX)"
+  }},
+  "affects": {{
+    "developer": ["list of developer organizations of the model/system involved"],
+    "deployer": ["list of deployer organizations"],
+    "artifacts": [
+      {{
+        "type": "Model|Dataset|System (required)",
+        "name": "artifact name (required)"
+      }}
+    ]
+  }},
+  "problemtype": {{
+    "classof": "AIID Incident|ATLAS Case Study|CVE Entry|LLM Evaluation|Third-party Report|Undefined (required, default: Third-party Report)",
+    "type": "Issue|Advisory|Measurement|Detection (optional)",
+    "description": {{
+      "lang": "eng",
+      "value": "description text"
+    }}
+  }},
+  "metrics": [
+    {{
+      "name": "metric name",
+      "detection_method": {{
+        "type": "Significance Test|Static Threshold",
+        "name": "method name"
+      }},
+      "results": {{}} or []
+    }}
+  ],
+  "references": [
+    {{
+      "label": "reference label",
+      "url": "reference url (MUST include source URL)"
+    }}
+  ],
+  "description": {{
+    "lang": "eng",
+    "value": "high-level description"
+  }},
+  "impact": {{
+    "avid": {{
+      "risk_domain": ["list of risk domains"],
+      "sep_view": ["list of SEP taxonomy IDs"],
+      "lifecycle_view": ["list of lifecycle stage IDs"],
+      "taxonomy_version": "version string"
+    }},
+    "atlas": [
+      {{
+        "tactic": "tactic name",
+        "technique": "technique name",
+        "subtechnique": "subtechnique name"
+      }}
+    ]
+  }},
+  "credit": [
+    {{
+      "lang": "eng",
+      "value": "credited person or organization"
+    }}
+  ],
+  "reported_date": "YYYY-MM-DD"
+}}
+```
+
+All fields except those marked as required are optional. Omit fields if information is not available.
 
 Here is the web content to analyze:
 
@@ -239,10 +300,50 @@ def _build_report_from_json(self, data: dict) -> Report:
             artifacts = []
             if "artifacts" in affects_data:
                 for artifact_data in affects_data["artifacts"]:
+                    artifact_type = ArtifactTypeEnum(artifact_data["type"])
+                    artifact_name = artifact_data["name"]
+
+                    # Reclassify models as systems based on provider-specific rules
+                    if artifact_type == ArtifactTypeEnum.model:
+                        artifact_name_lower = artifact_name.lower()
+
+                        # OpenAI: All LLMs are systems
+                        if "openai" in artifact_name_lower:
+                            artifact_type = ArtifactTypeEnum.system
+
+                        # Anthropic: All LLMs are systems
+                        elif "anthropic" in artifact_name_lower:
+                            artifact_type = ArtifactTypeEnum.system
+
+                        # Google: Gemini series are systems, Gemma are models
+                        elif "google" in artifact_name_lower or "gemini" in artifact_name_lower:
+                            if "gemini" in artifact_name_lower:
+                                artifact_type = ArtifactTypeEnum.system
+                            # gemma remains as model
+
+                        # Cohere: All except Command R and Aya are systems
+                        elif "cohere" in artifact_name_lower:
+                            if "command r" not in artifact_name_lower and "aya" not in artifact_name_lower:
+                                artifact_type = ArtifactTypeEnum.system
+
+                        # Mistral: Large, Medium, Moderation, Embed are systems
+                        elif "mistral" in artifact_name_lower:
+                            if any(variant in artifact_name_lower for variant in ["large", "medium", "moderation", "embed"]):
+                                artifact_type = ArtifactTypeEnum.system
+
+                        # Alibaba: Qwen Max and Turbo are systems
+                        elif "alibaba" in artifact_name_lower or "qwen" in artifact_name_lower:
+                            if "qwen max" in artifact_name_lower or "qwen turbo" in artifact_name_lower:
+                                artifact_type = ArtifactTypeEnum.system
+
+                        # Twitter/X (Grok) are systems (closed APIs); Meta and Mozilla are not matched here
+                        elif any(provider in artifact_name_lower for provider in ["twitter", "grok"]):
+                            artifact_type = ArtifactTypeEnum.system
+
                     artifacts.append(
                         Artifact(
-                            type=ArtifactTypeEnum(artifact_data["type"]),
-                            name=artifact_data["name"],
+                            type=artifact_type,
+                            name=artifact_name,
                         )
                     )
             affects = Affects(

diff --git a/avidtools/datamodels/components.py b/avidtools/datamodels/components.py
@@ -2,7 +2,7 @@
 Component data classes used in AVID report and vulnerability datamodels.
 """
 
-from typing import Dict, List, Optional
+from typing import Any, Dict, List, Optional
 from pydantic import BaseModel
 
 from .enums import (
@@ -126,20 +126,51 @@ class CWETaxonomy(BaseModel):
     lang: Optional[str] = None
 
 
+class JailbreakTaxonomyItem(BaseModel):
+    """0DIN Jailbreak Taxonomy item with Category, Strategy, and Technique."""
+
+    Category: Optional[str] = None
+    Strategy: Optional[str] = None
+    Technique: Optional[str] = None
+
+    class Config:  # NOTE(review): intended to drop None fields on export; confirm pydantic applies "exclude" conditionally
+        fields = {
+            "Category": {"exclude": True},
+            "Strategy": {"exclude": True},
+            "Technique": {"exclude": True}
+        }
+
+
+class OdinTaxonomy(BaseModel):
+    """0DIN taxonomy mapping for AI security disclosures."""
+
+    SocialImpactScore: Optional[str] = None
+    JailbreakTaxonomy: Optional[List[JailbreakTaxonomyItem]] = None
+
+    class Config:  # NOTE(review): intended to drop None fields on export; confirm pydantic applies "exclude" conditionally
+        fields = {
+            "SocialImpactScore": {"exclude": True},
+            "JailbreakTaxonomy": {"exclude": True}
+        }
+
+
 class Impact(BaseModel):
     """Impact information of a report/vulnerability.
 
     E.g. different taxonomy mappings, harm and severity scores.
     """
 
-    avid: AvidTaxonomy
+    avid: Optional[AvidTaxonomy] = None
     atlas: Optional[List[AtlasTaxonomy]] = None
     cvss: Optional[CVSSScores] = None
     cwe: Optional[List[CWETaxonomy]] = None
+    odin: Optional[OdinTaxonomy] = None
 
     class Config:  # Fields are excluded if None
         fields = {
+            "avid": {"exclude": True},
             "atlas": {"exclude": True},
             "cvss": {"exclude": True},
-            "cwe": {"exclude": True}
+            "cwe": {"exclude": True},
+            "odin": {"exclude": True}
         }
diff --git a/scripts/download_pages.py b/scripts/download_pages.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+"""Download all disclosure pages from 0din.ai page 1 for offline testing."""
+
+import httpx
+from pathlib import Path
+from bs4 import BeautifulSoup
+
+def main():
+    """Save every disclosure linked from 0din.ai listing page 1 into scraped_html/."""
+    output_dir = Path(__file__).parent / "scraped_html"
+    output_dir.mkdir(exist_ok=True)
+
+    # Get page 1 to extract UUIDs
+    print("Fetching page 1 to get UUIDs...")
+    page_url = "https://0din.ai/disclosures?page=1"
+    response = httpx.get(page_url, timeout=30.0)
+    response.raise_for_status()  # fail loudly rather than report "Found 0 UUIDs"
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    # Extract UUIDs from hrefs of the form /disclosures/<uuid> (exactly two slashes)
+    uuids = set()
+    for link in soup.find_all('a', {'data-turbo-frame': '_top'}):
+        href = link.get('href', '')
+        if '/disclosures/' in href and href.count('/') == 2:
+            uuids.add(href.split('/')[-1])
+
+    print(f"Found {len(uuids)} UUIDs")
+
+    # Download each disclosure page; files already on disk are not re-fetched
+    for i, uuid in enumerate(sorted(uuids), 1):
+        url = f"https://0din.ai/disclosures/{uuid}"
+        output_file = output_dir / f"{uuid}.html"
+
+        if output_file.exists():
+            print(f"[{i}/{len(uuids)}] Skipping {uuid} (already exists)")
+            continue
+
+        print(f"[{i}/{len(uuids)}] Downloading {uuid}...")
+        try:
+            response = httpx.get(url, timeout=30.0)
+            response.raise_for_status()  # else a 4xx/5xx body is cached and skipped on reruns
+            output_file.write_text(response.text, encoding='utf-8')
+            print(f"  ✓ Saved to {output_file}")
+        except Exception as e:
+            print(f"  ✗ Error: {e}")
+
+    print(f"\nComplete! Downloaded pages to {output_dir}")
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/mileva.py b/scripts/mileva.py
@@ -177,6 +177,7 @@ async def scrape_nvd_cve_details(
     details = {
         'cve_id': cve_id,
         'url': f"https://www.cve.org/CVERecord?id={cve_id}",
+        'title': None,
         'description': None,
         'published_date': None,
         'last_modified_date': None,
@@ -193,6 +194,11 @@ async def scrape_nvd_cve_details(
     try:
         containers = cve_data.get('containers', {})
         cna = containers.get('cna', {})
+
+        # Get CNA title
+        title = cna.get('title')
+        if isinstance(title, str):
+            details['title'] = title.strip()
 
         # Get description
         descriptions = cna.get('descriptions', [])
@@ -268,7 +274,7 @@ async def scrape_nvd_cve_details(
 def create_description(cve_id: str, cve_details: dict) -> Optional[LangValue]:
     """Create description LangValue object."""
     if cve_details['description']:
-        return LangValue(lang="eng", value=cve_id + " Detail")
+        return LangValue(lang="eng", value=cve_details['description'])
     return None
 
 
@@ -296,10 +302,7 @@ def create_references(cve_details: dict) -> List[Reference]:
 
 def create_problemtype(cve_id: str, cve_details: dict) -> Problemtype:
     """Create problemtype from CVE details."""
-    problemtype_desc = cve_details['description'] or f"Vulnerability {cve_id}"
-    if cve_details['cwe_ids']:
-        cwe_list = ', '.join(cve_details['cwe_ids'])
-        problemtype_desc = f"{cwe_list}: {problemtype_desc}"
+    problemtype_desc = cve_details.get('title') or f"Vulnerability {cve_id}"
 
     return Problemtype(
         classof=ClassEnum.cve,