Skip to content
Merged

0din #15

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 117 additions & 16 deletions avidtools/connectors/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from typing import Optional
import requests
from bs4 import BeautifulSoup
from openai import OpenAI
from openai import OpenAI, AsyncOpenAI

from ..datamodels.report import Report, ReportMetadata
from ..datamodels.components import (
Expand Down Expand Up @@ -50,6 +50,7 @@ def __init__(self, api_key: Optional[str] = None, model: str = "gpt-4o-mini"):
)
self.model = model
self.client = OpenAI(api_key=self.api_key)
self.async_client = AsyncOpenAI(api_key=self.api_key)

def scrape_url(self, url: str) -> dict:
"""
Expand Down Expand Up @@ -114,19 +115,79 @@ def _create_ai_prompt(self, scraped_data: dict) -> str:
"""
prompt = f"""You are an AI security expert tasked with analyzing web content about AI/ML vulnerabilities, incidents, or security issues and extracting structured information to create an AVID (AI Vulnerability Database) report.

The AVID report structure includes:
- **report_id**: A unique identifier (generate one like "AVID-YYYY-R-XXXX")
- **affects**: Information about affected artifacts including:
- developer: List of developers/organizations
- deployer: List of deployers/organizations
- artifacts: List of artifacts with type (MUST be one of: "Model", "Dataset", "System") and name
- **problemtype**: Problem description with:
- classof: Class (MUST be one of: "AIID Incident", "ATLAS Case Study", "CVE Entry", "LLM Evaluation", "Third-party Report", "Undefined"). Default to "Third-party Report" if unsure.
- type: Type (MUST be one of: "Issue", "Advisory", "Measurement", "Detection")
- description: language ("eng") and value (description text)
- **description**: High-level description with lang and value
- **references**: List of references with label and url (MUST include the source URL)
- **reported_date**: Date in YYYY-MM-DD format (use today's date if not specified)
The AVID Report follows this exact schema:

```json
{{
"data_type": "AVID",
"data_version": "string (optional)",
"metadata": {{
"report_id": "string (required, format: AVID-YYYY-R-XXXX)"
}},
"affects": {{
"developer": ["list of developer organizations of the model/system involved"],
"deployer": ["list of deployer organizations"],
"artifacts": [
{{
"type": "Model|Dataset|System (required)",
"name": "artifact name (required)"
}}
]
}},
"problemtype": {{
"classof": "AIID Incident|ATLAS Case Study|CVE Entry|LLM Evaluation|Third-party Report|Undefined (required, default: Third-party Report)",
"type": "Issue|Advisory|Measurement|Detection (optional)",
"description": {{
"lang": "eng",
"value": "description text"
}}
}},
"metrics": [
{{
"name": "metric name",
"detection_method": {{
"type": "Significance Test|Static Threshold",
"name": "method name"
}},
"results": {{}} or []
}}
],
"references": [
{{
"label": "reference label",
"url": "reference url (MUST include source URL)"
}}
],
"description": {{
"lang": "eng",
"value": "high-level description"
}},
"impact": {{
"avid": {{
"risk_domain": ["list of risk domains"],
"sep_view": ["list of SEP taxonomy IDs"],
"lifecycle_view": ["list of lifecycle stage IDs"],
"taxonomy_version": "version string"
}},
"atlas": [
{{
"tactic": "tactic name",
"technique": "technique name",
"subtechnique": "subtechnique name"
}}
]
}},
"credit": [
{{
"lang": "eng",
"value": "credited person or organization"
}}
],
"reported_date": "YYYY-MM-DD"
}}
```

All fields except those marked as required are optional. Omit fields if information is not available.

Here is the web content to analyze:

Expand Down Expand Up @@ -239,10 +300,50 @@ def _build_report_from_json(self, data: dict) -> Report:
artifacts = []
if "artifacts" in affects_data:
for artifact_data in affects_data["artifacts"]:
artifact_type = ArtifactTypeEnum(artifact_data["type"])
artifact_name = artifact_data["name"]

# Reclassify models as systems based on provider-specific rules
if artifact_type == ArtifactTypeEnum.model:
artifact_name_lower = artifact_name.lower()

# OpenAI: All LLMs are systems
if "openai" in artifact_name_lower:
artifact_type = ArtifactTypeEnum.system

# Anthropic: All LLMs are systems
elif "anthropic" in artifact_name_lower:
artifact_type = ArtifactTypeEnum.system

# Google: Gemini series are systems, Gemma are models
elif "google" in artifact_name_lower or "gemini" in artifact_name_lower:
if "gemini" in artifact_name_lower:
artifact_type = ArtifactTypeEnum.system
# gemma remains as model

# Cohere: All except Command R and Aya are systems
elif "cohere" in artifact_name_lower:
if "command r" not in artifact_name_lower and "aya" not in artifact_name_lower:
artifact_type = ArtifactTypeEnum.system

# Mistral: Large, Medium, Moderation, Embed are systems
elif "mistral" in artifact_name_lower:
if any(variant in artifact_name_lower for variant in ["large", "medium", "moderation", "embed"]):
artifact_type = ArtifactTypeEnum.system

# Alibaba: Qwen Max and Turbo are systems
elif "alibaba" in artifact_name_lower or "qwen" in artifact_name_lower:
if "qwen max" in artifact_name_lower or "qwen turbo" in artifact_name_lower:
artifact_type = ArtifactTypeEnum.system

# Meta, Twitter/X, Mozilla remain as systems (closed APIs)
elif any(provider in artifact_name_lower for provider in ["twitter", "grok"]):
artifact_type = ArtifactTypeEnum.system

artifacts.append(
Artifact(
type=ArtifactTypeEnum(artifact_data["type"]),
name=artifact_data["name"],
type=artifact_type,
name=artifact_name,
)
)
affects = Affects(
Expand Down
37 changes: 34 additions & 3 deletions avidtools/datamodels/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Component data classes used in AVID report and vulnerability datamodels.
"""

from typing import Dict, List, Optional
from typing import Any, Dict, List, Optional
from pydantic import BaseModel

from .enums import (
Expand Down Expand Up @@ -126,20 +126,51 @@ class CWETaxonomy(BaseModel):
lang: Optional[str] = None


class JailbreakTaxonomyItem(BaseModel):
    """One entry of the 0DIN Jailbreak Taxonomy.

    Holds three optional string labels: Category, Strategy and Technique.
    Each defaults to None when not supplied.
    """

    Category: Optional[str] = None
    Strategy: Optional[str] = None
    Technique: Optional[str] = None

    class Config:
        # Original note: fields are excluded if None.
        # NOTE(review): how the `exclude` flag is interpreted depends on the
        # pydantic version in use -- confirm it is conditional on None.
        fields = {
            field_name: {"exclude": True}
            for field_name in ("Category", "Strategy", "Technique")
        }


class OdinTaxonomy(BaseModel):
    """0DIN taxonomy mapping attached to an AI security disclosure.

    Both attributes are optional and default to None: a social impact
    score string and a list of jailbreak taxonomy items.
    """

    SocialImpactScore: Optional[str] = None
    JailbreakTaxonomy: Optional[List[JailbreakTaxonomyItem]] = None

    class Config:
        # Original note: fields are excluded if None.
        # NOTE(review): how the `exclude` flag is interpreted depends on the
        # pydantic version in use -- confirm it is conditional on None.
        fields = {
            field_name: {"exclude": True}
            for field_name in ("SocialImpactScore", "JailbreakTaxonomy")
        }


class Impact(BaseModel):
    """Impact information of a report/vulnerability.

    E.g. different taxonomy mappings, harm and severity scores.

    Every taxonomy mapping is optional and defaults to None, so a report
    may carry any subset of them (AVID, ATLAS, CVSS, CWE, 0DIN).
    """

    # Each attribute is declared exactly once; the earlier required
    # `avid: AvidTaxonomy` declaration is superseded by the optional one.
    avid: Optional[AvidTaxonomy] = None
    atlas: Optional[List[AtlasTaxonomy]] = None
    cvss: Optional[CVSSScores] = None
    cwe: Optional[List[CWETaxonomy]] = None
    odin: Optional[OdinTaxonomy] = None

    class Config:  # Fields are excluded if None
        # One entry per attribute above; the trailing comma on every entry
        # keeps the literal valid when further taxonomies are appended.
        fields = {
            "avid": {"exclude": True},
            "atlas": {"exclude": True},
            "cvss": {"exclude": True},
            "cwe": {"exclude": True},
            "odin": {"exclude": True},
        }
50 changes: 50 additions & 0 deletions scripts/download_pages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/usr/bin/env python3
"""Download all disclosure pages from 0din.ai page 1 for offline testing."""

import httpx
from pathlib import Path
from bs4 import BeautifulSoup

def main():
    """Download the disclosure pages linked from 0din.ai listing page 1.

    Fetches ``https://0din.ai/disclosures?page=1``, collects the last path
    segment of every ``/disclosures/<id>`` link, and saves each disclosure
    page as ``<id>.html`` in a ``scraped_html`` directory next to this
    script. Pages that already exist on disk are skipped.

    Raises:
        httpx.HTTPError: if the listing page cannot be fetched or returns
            an error status. Failures on individual disclosure pages are
            reported and skipped instead.
    """
    # Create output directory
    output_dir = Path(__file__).parent / "scraped_html"
    output_dir.mkdir(exist_ok=True)

    # Get page 1 to extract UUIDs
    print("Fetching page 1 to get UUIDs...")
    page_url = "https://0din.ai/disclosures?page=1"
    response = httpx.get(page_url, timeout=30.0)
    # Without the listing there is nothing to download, so fail loudly
    # rather than parsing an error page and reporting "Found 0 UUIDs".
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract UUIDs: keep only hrefs of the exact form /disclosures/<id>
    links = soup.find_all('a', {'data-turbo-frame': '_top'})
    uuids = {
        href.split('/')[-1]
        for href in (link.get('href', '') for link in links)
        if '/disclosures/' in href and href.count('/') == 2
    }

    print(f"Found {len(uuids)} UUIDs")

    # Download each disclosure page
    for i, uuid in enumerate(sorted(uuids), 1):
        url = f"https://0din.ai/disclosures/{uuid}"
        output_file = output_dir / f"{uuid}.html"

        if output_file.exists():
            print(f"[{i}/{len(uuids)}] Skipping {uuid} (already exists)")
            continue

        print(f"[{i}/{len(uuids)}] Downloading {uuid}...")
        try:
            response = httpx.get(url, timeout=30.0)
            # Check the status before writing: a saved error page would be
            # treated as "already exists" on every later run.
            response.raise_for_status()
            output_file.write_text(response.text, encoding='utf-8')
            print(f" ✓ Saved to {output_file}")
        except (httpx.HTTPError, OSError) as e:
            # Best effort: report the failure and carry on with the rest.
            print(f" ✗ Error: {e}")

    print(f"\nComplete! Downloaded pages to {output_dir}")


if __name__ == "__main__":
    main()
13 changes: 8 additions & 5 deletions scripts/mileva.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ async def scrape_nvd_cve_details(
details = {
'cve_id': cve_id,
'url': f"https://www.cve.org/CVERecord?id={cve_id}",
'title': None,
'description': None,
'published_date': None,
'last_modified_date': None,
Expand All @@ -193,6 +194,11 @@ async def scrape_nvd_cve_details(
try:
containers = cve_data.get('containers', {})
cna = containers.get('cna', {})

# Get CNA title
title = cna.get('title')
if isinstance(title, str):
details['title'] = title.strip()

# Get description
descriptions = cna.get('descriptions', [])
Expand Down Expand Up @@ -268,7 +274,7 @@ async def scrape_nvd_cve_details(
def create_description(cve_id: str, cve_details: dict) -> Optional[LangValue]:
    """Create description LangValue object.

    Args:
        cve_id: CVE identifier. No longer used to build the text; kept so
            existing callers do not need to change.
        cve_details: Scraped CVE details; its ``'description'`` entry is
            used as the description text.

    Returns:
        A LangValue with ``lang="eng"`` holding the CVE description, or
        None when the description is missing or empty.
    """
    description = cve_details['description']
    if description:
        # Return the real description text; the old "<cve_id> Detail"
        # placeholder return used to come first and made this unreachable.
        return LangValue(lang="eng", value=description)
    return None


Expand Down Expand Up @@ -296,10 +302,7 @@ def create_references(cve_details: dict) -> List[Reference]:

def create_problemtype(cve_id: str, cve_details: dict) -> Problemtype:
"""Create problemtype from CVE details."""
problemtype_desc = cve_details['description'] or f"Vulnerability {cve_id}"
if cve_details['cwe_ids']:
cwe_list = ', '.join(cve_details['cwe_ids'])
problemtype_desc = f"{cwe_list}: {problemtype_desc}"
problemtype_desc = cve_details.get('title') or f"Vulnerability {cve_id}"

return Problemtype(
classof=ClassEnum.cve,
Expand Down
Loading
Loading