Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ echo "SELECT * FROM v_master LIMIT 5;" | docker exec -i supabase_db_tryvit psql
```
┌─────────────────┐ ┌──────────────────┐ ┌─────────────────────────┐
│ Open Food Facts │────▶│ Python Pipeline │────▶│ PostgreSQL (Supabase) │
│ API v2 │ │ sql_generator │ │ 230 migrations │
│ API v2 │ │ sql_generator │ │ 231 migrations │
│ (category tags, │ │ validator │ │ 58 pipeline folders │
│ countries=PL,DE│ │ off_client │ │ products + nutrition │
└─────────────────┘ └──────────────────┘ │ + ingredients + scores │
Expand Down Expand Up @@ -319,7 +319,7 @@ tryvit/
│ └── views/ # Reference view definitions
├── supabase/
│ ├── migrations/ # 230 append-only schema migrations
│ ├── migrations/ # 231 append-only schema migrations
│ ├── seed/ # Reference data seeds
│ ├── tests/ # pgTAP integration tests
│ └── functions/ # Edge Functions (API gateway, push notifications, CAPTCHA)
Expand Down
42 changes: 42 additions & 0 deletions RUN_LOCAL.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,48 @@ if ($Enrich) {
# Step 2: Apply the generated migration
$enrichMigrations = Get-ChildItem -Path (Join-Path $PSScriptRoot "supabase" "migrations") -Filter "*populate_ingredients_allergens*" | Sort-Object Name | Select-Object -Last 1
if ($enrichMigrations) {
Write-Host "Step 2a: Validating percent ranges in generated migration..." -ForegroundColor Yellow
$validatePercentScript = @"
import re
import sys
from pathlib import Path

# Pre-apply gate for the generated enrichment migration: scan every VALUES
# tuple and fail when a percent or percent_estimate literal lies outside
# the inclusive range 0..100.
#
# Only hash comments are used here (no docstrings) so that the surrounding
# here-string carries as few extra quote characters as possible.

# Matches the leading columns of one VALUES tuple:
#   ('<country>', '<ean>', '<name>', <position>, <pct>::numeric, <pct_est>::numeric
# Doubled single quotes inside <name> are accepted as SQL-escaped quotes.
LINE_RE = re.compile(
    r"\(\s*'(?P<country>[^']+)'\s*,\s*'(?P<ean>[^']+)'\s*,\s*'(?P<name>(?:[^']|'')*)'\s*,\s*(?P<position>\d+)\s*,\s*(?P<pct>NULL|-?\d+(?:\.\d+)?)::numeric\s*,\s*(?P<pct_est>NULL|-?\d+(?:\.\d+)?)::numeric"
)

# (reported field name, regex group) pairs that must stay within 0..100.
CHECKED_FIELDS = (("percent", "pct"), ("percent_estimate", "pct_est"))


def find_out_of_range(text):
    # Return one (lineno, country, ean, name, field, value) tuple for every
    # percent-like literal in text that is outside 0..100. Lines that do not
    # match LINE_RE are ignored; NULL literals are always accepted.
    bad = []
    for lineno, line in enumerate(text.splitlines(), start=1):
        m = LINE_RE.search(line)
        if not m:
            continue
        for field, group in CHECKED_FIELDS:
            literal = m.group(group)
            if literal == "NULL":
                continue
            val = float(literal)
            if not 0 <= val <= 100:
                bad.append(
                    (lineno, m.group("country"), m.group("ean"), m.group("name"), field, val)
                )
    return bad


def main(argv):
    # argv[1] is the migration file to validate. Returns the process exit
    # code: 1 when any out-of-range value is found, otherwise 0.
    path = Path(argv[1])
    bad = find_out_of_range(path.read_text(encoding="utf-8"))
    if bad:
        print(
            f"Found {len(bad)} out-of-range product_ingredient percent/percent_estimate values in {path.name}",
            file=sys.stderr,
        )
        # Show only the first few offenders to keep the console readable.
        for lineno, country, ean, name, field, val in bad[:5]:
            print(
                f" line {lineno} country={country} ean={ean} ingredient='{name}' {field}={val}",
                file=sys.stderr,
            )
        return 1

    print("Percent range validation passed.")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
"@
& $pythonExe -c $validatePercentScript $enrichMigrations.FullName
if ($LASTEXITCODE -eq 0) {
Write-Host " ✓ Percent range validation passed" -ForegroundColor Green
}
else {
Write-Host " ✗ Percent range validation FAILED" -ForegroundColor Red
exit 1
}

Write-Host "Step 2: Applying enrichment migration: $($enrichMigrations.Name)..." -ForegroundColor Yellow
$output = Get-Content $enrichMigrations.FullName -Raw | docker exec -i $CONTAINER psql -U $DB_USER -d $DB_NAME -v ON_ERROR_STOP=1 2>&1
if ($LASTEXITCODE -eq 0) {
Expand Down
4 changes: 2 additions & 2 deletions copilot-instructions.md
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ tryvit/
│ │ ├── api-gateway/ # Write-path gateway (rate limiting, validation) (#478)
│ │ └── send-push-notification/ # Push notification handler
│ ├── dr-drill/ # Disaster recovery drill artifacts
│ └── migrations/ # 230 append-only schema migrations
│ └── migrations/ # 231 append-only schema migrations
│ ├── 20260207000100_create_schema.sql
│ ├── 20260207000200_baseline.sql
│ ├── 20260207000300_add_chip_metadata.sql
Expand Down Expand Up @@ -698,7 +698,7 @@ a mix of `'baked'`, `'fried'`, and `'none'`.

## 7. Migrations

**Location:** `supabase/migrations/` — managed by Supabase CLI. Currently **230 migrations**.
**Location:** `supabase/migrations/` — managed by Supabase CLI. Currently **231 migrations**.

**Rules:**

Expand Down
6 changes: 3 additions & 3 deletions docs/PRODUCTION_DATA.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@

### 1.2 Migration Inventory

**Location:** `supabase/migrations/` — **230 migration files**, append-only.
**Location:** `supabase/migrations/` — **231 migration files**, append-only.

**Naming convention:** `YYYYMMDDHHMMSS_description.sql` (Supabase CLI timestamps). Files are applied in lexicographic sort order.

Expand Down Expand Up @@ -329,7 +329,7 @@ There is no standalone `init_db_structure.py` script. Database initialization fo

```
supabase db reset
→ Applies all 230 migrations in order (supabase/migrations/*.sql)
→ Applies all 231 migrations in order (supabase/migrations/*.sql)
→ Runs seed.sql (empty — no-op)
→ Schema is ready

Expand Down Expand Up @@ -511,7 +511,7 @@ Backup = supabase/migrations/*.sql + db/pipelines/*.sql
```

Since the database can be fully reconstructed from:
1. 230 migration files (schema + functions + views)
1. 231 migration files (schema + functions + views)
2. 25 × 4 pipeline SQL files (all product data)
3. `ci_post_pipeline.sql` (data fixups)

Expand Down
110 changes: 108 additions & 2 deletions enrich_ingredients.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,12 +285,81 @@ def _clamp_percent_estimate(pct_est: float | None) -> float | None:
return round(max(pct_est, 0), 2)


def _normalize_percent_value(
value: object,
*,
field_name: str,
ean: str,
ingredient_name: str,
anomalies: list[dict],
) -> float | None:
"""Normalize percent-like fields.

- Preserve NULL/empty as NULL
- Preserve valid numeric range [0, 100]
- If outside range or unparsable, set NULL and record anomaly
"""
if value is None:
return None

if isinstance(value, str):
raw = value.strip()
if raw == "":
return None
# Some OFF entries may use comma decimal separators.
raw = raw.replace(",", ".")
else:
raw = str(value)

try:
parsed = float(raw)
except (TypeError, ValueError):
anomalies.append(
{
"ean": ean,
"ingredient_name": ingredient_name,
"field": field_name,
"value": value,
"reason": "unparsable",
}
)
return None

if parsed < 0 or parsed > 100:
anomalies.append(
{
"ean": ean,
"ingredient_name": ingredient_name,
"field": field_name,
"value": value,
"reason": "out_of_range",
}
)
return None

return round(parsed, 2)
Comment on lines +328 to +340


def _find_invalid_percent_rows(rows: list[dict]) -> list[dict]:
"""Return rows that still contain out-of-range percent values."""
bad: list[dict] = []
for r in rows:
pct = r.get("percent")
if pct is not None and (pct < 0 or pct > 100):
bad.append(r)
pct_est = r.get("percent_estimate")
if pct_est is not None and (pct_est < 0 or pct_est > 100):
bad.append(r)
Comment on lines +346 to +352
return bad
Comment on lines +343 to +353


def process_ingredients(
off_product: dict,
country: str,
ean: str,
ingredient_lookup: dict[str, int],
new_ingredients: dict[str, dict],
percent_anomalies: list[dict] | None = None,
) -> list[dict]:
"""Extract ingredient rows for a product.

Expand All @@ -301,6 +370,9 @@ def process_ingredients(
if not ingredients:
return []

if percent_anomalies is None:
percent_anomalies = []

rows: list[dict] = []

def process_item(item: dict, pos: int, is_sub: bool, parent_name: str | None) -> int:
Expand Down Expand Up @@ -330,8 +402,20 @@ def process_item(item: dict, pos: int, is_sub: bool, parent_name: str | None) ->
"ean": ean,
"ingredient_name": display_name,
"position": pos,
"percent": item.get("percent"),
"percent_estimate": _clamp_percent_estimate(item.get("percent_estimate")),
"percent": _normalize_percent_value(
item.get("percent"),
field_name="percent",
ean=ean,
ingredient_name=display_name,
anomalies=percent_anomalies,
),
"percent_estimate": _normalize_percent_value(
item.get("percent_estimate"),
field_name="percent_estimate",
ean=ean,
ingredient_name=display_name,
anomalies=percent_anomalies,
),
"is_sub_ingredient": is_sub,
"parent_ingredient_name": parent_name if is_sub else None,
}
Expand Down Expand Up @@ -765,6 +849,7 @@ def main():
all_ingredient_rows = []
all_allergen_rows = []
new_ingredients: dict[str, dict] = {}
percent_anomalies: list[dict] = []

stats = {
"processed": 0,
Expand Down Expand Up @@ -800,6 +885,7 @@ def main():
product["ean"],
ingredient_lookup,
new_ingredients,
percent_anomalies,
)
if ing_rows:
stats["with_ingredients"] += 1
Expand All @@ -825,6 +911,26 @@ def main():
print(f" Total ingredient rows: {len(all_ingredient_rows)}")
print(f" Total allergen rows: {len(all_allergen_rows)}")

if percent_anomalies:
print(f" Percent anomalies sanitized to NULL: {len(percent_anomalies)}")
sample = percent_anomalies[:5]
for a in sample:
print(
f" - EAN {a['ean']}, ingredient '{a['ingredient_name']}', {a['field']}={a['value']} ({a['reason']})"
)

invalid_rows = _find_invalid_percent_rows(all_ingredient_rows)
if invalid_rows:
print("\nERROR: Invalid percent values remain after normalization.", file=sys.stderr)
for row in invalid_rows[:5]:
print(
" "
f"EAN {row.get('ean')} ingredient '{row.get('ingredient_name')}' "
f"percent={row.get('percent')} percent_estimate={row.get('percent_estimate')}",
file=sys.stderr,
)
sys.exit(1)

sql = generate_migration(all_ingredient_rows, all_allergen_rows, new_ingredients, stats)

MIGRATION_FILE.write_text(sql, encoding="utf-8")
Expand Down
Loading
Loading