diff --git a/README.md b/README.md index f7016af5..bae3ae03 100644 --- a/README.md +++ b/README.md @@ -187,7 +187,7 @@ echo "SELECT * FROM v_master LIMIT 5;" | docker exec -i supabase_db_tryvit psql ``` ┌─────────────────┐ ┌──────────────────┐ ┌─────────────────────────┐ │ Open Food Facts │────▶│ Python Pipeline │────▶│ PostgreSQL (Supabase) │ -│ API v2 │ │ sql_generator │ │ 231 migrations │ +│ API v2 │ │ sql_generator │ │ 232 migrations │ │ (category tags, │ │ validator │ │ 58 pipeline folders │ │ countries=PL,DE│ │ off_client │ │ products + nutrition │ └─────────────────┘ └──────────────────┘ │ + ingredients + scores │ @@ -319,7 +319,7 @@ tryvit/ │ └── views/ # Reference view definitions │ ├── supabase/ -│ ├── migrations/ # 231 append-only schema migrations +│ ├── migrations/ # 232 append-only schema migrations │ ├── seed/ # Reference data seeds │ ├── tests/ # pgTAP integration tests │ └── functions/ # Edge Functions (API gateway, push notifications, CAPTCHA) @@ -531,3 +531,4 @@ This project is licensed under the terms in the [LICENSE](LICENSE) file.
Built with science and care.

+ diff --git a/copilot-instructions.md b/copilot-instructions.md index e68f689a..0a87ee04 100644 --- a/copilot-instructions.md +++ b/copilot-instructions.md @@ -189,7 +189,7 @@ tryvit/ │ │ ├── api-gateway/ # Write-path gateway (rate limiting, validation) (#478) │ │ └── send-push-notification/ # Push notification handler │ ├── dr-drill/ # Disaster recovery drill artifacts -│ └── migrations/ # 231 append-only schema migrations +│ └── migrations/ # 232 append-only schema migrations │ ├── 20260207000100_create_schema.sql │ ├── 20260207000200_baseline.sql │ ├── 20260207000300_add_chip_metadata.sql @@ -698,7 +698,7 @@ a mix of `'baked'`, `'fried'`, and `'none'`. ## 7. Migrations -**Location:** `supabase/migrations/` — managed by Supabase CLI. Currently **231 migrations**. +**Location:** `supabase/migrations/` — managed by Supabase CLI. Currently **232 migrations**. **Rules:** @@ -2705,3 +2705,4 @@ gh issue comment XXX --body "Session complete — " - Sections are numbered sequentially so any agent can say "per §X.Y" unambiguously. - When this file is updated, the commit message should include the sections changed (e.g., `chore(docs): copilot-instructions §4 — add ingredient_translations table`). - When the instructions conflict with a specific issue's acceptance criteria, the issue wins for that specific change — but document the divergence in the PR description. + diff --git a/db/ci_post_enrichment.sql b/db/ci_post_enrichment.sql index 12252256..cd78d3e7 100644 --- a/db/ci_post_enrichment.sql +++ b/db/ci_post_enrichment.sql @@ -197,32 +197,150 @@ AND NOT EXISTS ( ) AND EXISTS (SELECT 1 FROM product_ingredient LIMIT 1); +-- ═══════════════════════════════════════════════════════════════ +-- Step 0e: Classify additive/E-code concern tiers +-- ═══════════════════════════════════════════════════════════════ +-- Uses strict start-of-string E-code extraction to avoid false positives +-- such as "200 G", "Type 500", or nutrition text. 
+ +WITH normalized AS ( + SELECT + ingredient_id, + name_en, + CASE + WHEN lower(name_en) ~ '^\s*e[ -]?[0-9]{3,4}[a-z]?($|[^a-z0-9])' + THEN regexp_replace( + substring(lower(name_en) from '^\s*e[ -]?[0-9]{3,4}[a-z]?'), + '[^a-z0-9]', + '', + 'g' + ) + ELSE '' + END AS e_code, + lower(name_en) AS n + FROM ingredient_ref + WHERE name_en IS NOT NULL +), +classified AS ( + SELECT + ingredient_id, + CASE + WHEN e_code IN ('e250','e252') + OR n LIKE '%sodium nitrite%' + OR n LIKE '%potassium nitrate%' + THEN 3 + + WHEN e_code IN ('e133','e150d','e211','e220','e223','e319','e385','e407','e407a','e621','e950','e951','e955') + OR n LIKE '%sodium benzoate%' + OR n LIKE '%benzoate%' + OR n LIKE '%sucralose%' + OR n LIKE '%aspartame%' + OR n LIKE '%carrageenan%' + OR n LIKE '%sodium metabisulphite%' + OR n LIKE '%contains sulphites%' + OR n IN ('sulfite','sulfiten','sulphite','sulphites') + THEN 2 + + WHEN e_code IN ( + 'e150','e150a','e150b','e150c','e172','e200','e202', + 'e281','e282','e338','e339','e340','e341', + 'e420','e421','e422','e440','e450','e451','e452', + 'e460','e466','e471','e472','e500','e501','e503', + 'e960','e960a','e960c' + ) + OR n LIKE '%potassium sorbate%' + OR n LIKE '%sorbate%' + THEN 1 + + ELSE 0 + END AS target_concern_tier, + CASE + WHEN e_code IN ('e250','e252') + OR n LIKE '%sodium nitrite%' + OR n LIKE '%potassium nitrate%' + THEN 'Strict additive/E-code classifier: nitrite/nitrate preservative concern' + + WHEN e_code IN ('e133','e150d','e211','e220','e223','e319','e385','e407','e407a','e621','e950','e951','e955') + OR n LIKE '%sodium benzoate%' + OR n LIKE '%benzoate%' + OR n LIKE '%sucralose%' + OR n LIKE '%aspartame%' + OR n LIKE '%carrageenan%' + OR n LIKE '%sodium metabisulphite%' + OR n LIKE '%contains sulphites%' + OR n IN ('sulfite','sulfiten','sulphite','sulphites') + THEN 'Strict additive/E-code classifier: preservative, sweetener, sulphite, or carrageenan concern' + + WHEN e_code IN ( + 
'e150','e150a','e150b','e150c','e172','e200','e202', + 'e281','e282','e338','e339','e340','e341', + 'e420','e421','e422','e440','e450','e451','e452', + 'e460','e466','e471','e472','e500','e501','e503', + 'e960','e960a','e960c' + ) + OR n LIKE '%potassium sorbate%' + OR n LIKE '%sorbate%' + THEN 'Strict additive/E-code classifier: lower concern additive or sorbate' + + ELSE NULL + END AS target_concern_reason + FROM normalized +) +UPDATE ingredient_ref ir +SET + concern_tier = GREATEST(COALESCE(ir.concern_tier, 0), c.target_concern_tier), + is_additive = true, + concern_reason = CASE + WHEN ir.concern_reason IS NULL OR trim(ir.concern_reason) = '' + THEN c.target_concern_reason + ELSE ir.concern_reason + END +FROM classified c +WHERE ir.ingredient_id = c.ingredient_id + AND c.target_concern_tier > 0 + AND ( + COALESCE(ir.concern_tier, 0) < c.target_concern_tier + OR ir.is_additive IS DISTINCT FROM true + OR ir.concern_reason IS NULL + OR trim(ir.concern_reason) = '' + ); + -- ═══════════════════════════════════════════════════════════════ -- Step 1: Populate ingredient_concern_score from actual ingredient data -- ═══════════════════════════════════════════════════════════════ -- Based on EFSA concern tiers: tier 1 = 15pts, tier 2 = 40pts, tier 3 = 100pts -- Capped at LEAST(100, SUM(...)) per SCORING_METHODOLOGY.md v3.2 +WITH product_scores AS ( + SELECT + pi.product_id, + LEAST(100, SUM( + CASE ir.concern_tier + WHEN 1 THEN 15 + WHEN 2 THEN 40 + WHEN 3 THEN 100 + ELSE 0 + END + ))::int AS score + FROM product_ingredient pi + JOIN ingredient_ref ir ON ir.ingredient_id = pi.ingredient_id + WHERE COALESCE(ir.concern_tier, 0) > 0 + GROUP BY pi.product_id +), +computed_scores AS ( + SELECT + p.product_id, + COALESCE(ps.score, 0)::int AS score + FROM products p + LEFT JOIN product_scores ps ON ps.product_id = p.product_id + WHERE p.is_deprecated IS NOT TRUE +) UPDATE products p -SET ingredient_concern_score = COALESCE(concern.score, 0) -FROM ( - SELECT pi.product_id, - 
LEAST(100, SUM( - CASE ir.concern_tier - WHEN 1 THEN 15 - WHEN 2 THEN 40 - WHEN 3 THEN 100 - ELSE 0 - END - ))::int AS score - FROM product_ingredient pi - JOIN ingredient_ref ir ON ir.ingredient_id = pi.ingredient_id - WHERE ir.concern_tier > 0 - GROUP BY pi.product_id -) concern -WHERE concern.product_id = p.product_id +SET ingredient_concern_score = cs.score +FROM computed_scores cs +WHERE p.product_id = cs.product_id AND p.is_deprecated IS NOT TRUE - AND p.ingredient_concern_score IS DISTINCT FROM COALESCE(concern.score, 0); + AND p.ingredient_concern_score IS DISTINCT FROM cs.score; -- ═══════════════════════════════════════════════════════════════ -- Step 2: Flag palm oil controversy from actual ingredient data diff --git a/db/qa/QA__ingredient_quality.sql b/db/qa/QA__ingredient_quality.sql index ba4a5b35..aab243a2 100644 --- a/db/qa/QA__ingredient_quality.sql +++ b/db/qa/QA__ingredient_quality.sql @@ -198,3 +198,43 @@ SELECT '17. ingredient_translations source valid' AS check_name, FROM ingredient_translations WHERE source NOT IN ('curated', 'off_api', 'auto_translated', 'user_submitted'); + +-- ═══════════════════════════════════════════════════════════════════════════ +-- 18. concern_tier should not remain all-zero once product_ingredient exists +-- ═══════════════════════════════════════════════════════════════════════════ +SELECT '18. concern_tier populated when product ingredients exist' AS check_name, + CASE + WHEN NOT EXISTS (SELECT 1 FROM product_ingredient) THEN 0 + WHEN EXISTS ( + SELECT 1 + FROM product_ingredient pi + JOIN ingredient_ref ir ON ir.ingredient_id = pi.ingredient_id + WHERE COALESCE(ir.concern_tier, 0) > 0 + ) THEN 0 + ELSE 1 + END AS violations; + +-- ═══════════════════════════════════════════════════════════════════════════ +-- 19. products linked to concern-tier ingredients should have concern score +-- ═══════════════════════════════════════════════════════════════════════════ +SELECT '19. 
concern-tier linked products have concern score' AS check_name, + CASE + WHEN NOT EXISTS ( + SELECT 1 + FROM product_ingredient pi + JOIN ingredient_ref ir ON ir.ingredient_id = pi.ingredient_id + WHERE COALESCE(ir.concern_tier, 0) > 0 + ) THEN 0 + ELSE ( + SELECT COUNT(*) + FROM ( + SELECT DISTINCT p.product_id + FROM products p + JOIN product_ingredient pi ON pi.product_id = p.product_id + JOIN ingredient_ref ir ON ir.ingredient_id = pi.ingredient_id + WHERE p.is_deprecated IS NOT TRUE + AND COALESCE(ir.concern_tier, 0) > 0 + AND COALESCE(p.ingredient_concern_score, 0) = 0 + ) x + ) + END AS violations; diff --git a/db/qa/QA__scoring_formula_tests.sql b/db/qa/QA__scoring_formula_tests.sql index a815341a..22f5f37e 100644 --- a/db/qa/QA__scoring_formula_tests.sql +++ b/db/qa/QA__scoring_formula_tests.sql @@ -643,3 +643,4 @@ FROM ( WHERE NOT ((api_score_explanation(p.product_id))->'summary') ? 'conflicts' OR NOT ((api_score_explanation(p.product_id))->'summary') ? 'qualified_headline'; + diff --git a/docs/PRODUCTION_DATA.md b/docs/PRODUCTION_DATA.md index 8c6281a3..6610173f 100644 --- a/docs/PRODUCTION_DATA.md +++ b/docs/PRODUCTION_DATA.md @@ -45,7 +45,7 @@ ### 1.2 Migration Inventory -**Location:** `supabase/migrations/` — **231 migration files**, append-only. +**Location:** `supabase/migrations/` — **232 migration files**, append-only. **Naming convention:** `YYYYMMDDHHMMSS_description.sql` (Supabase CLI timestamps). Files are applied in lexicographic sort order. @@ -329,7 +329,7 @@ There is no standalone `init_db_structure.py` script. Database initialization fo ``` supabase db reset - → Applies all 231 migrations in order (supabase/migrations/*.sql) + → Applies all 232 migrations in order (supabase/migrations/*.sql) → Runs seed.sql (empty — no-op) → Schema is ready @@ -511,7 +511,7 @@ Backup = supabase/migrations/*.sql + db/pipelines/*.sql ``` Since the database can be fully reconstructed from: -1. 231 migration files (schema + functions + views) +1. 
232 migration files (schema + functions + views) 2. 25 × 4 pipeline SQL files (all product data) 3. `ci_post_pipeline.sql` (data fixups) @@ -657,3 +657,4 @@ Supabase Auth URL configuration required: 7. **Consider EAN-based canonical keys** to eliminate `product_id` instability across environments 8. **Add rollback documentation** — steps to revert a bad deployment 9. **Create staging environment** — intermediate Supabase project for pre-production validation + diff --git a/supabase/migrations/20260605223000_backfill_ingredient_concern_tiers.sql b/supabase/migrations/20260605223000_backfill_ingredient_concern_tiers.sql new file mode 100644 index 00000000..06a5e0ce --- /dev/null +++ b/supabase/migrations/20260605223000_backfill_ingredient_concern_tiers.sql @@ -0,0 +1,142 @@ +-- Migration: Backfill ingredient concern tiers from strict additive/E-code classification. +-- Rollback: Manual rollback only. Restore affected ingredient_ref.concern_tier/is_additive/concern_reason and products.ingredient_concern_score from backup if needed. +-- Backfill ingredient_ref.concern_tier using strict additive/E-code classification. +-- Safe to re-run. Does not downgrade existing higher concern tiers. 
+
+BEGIN;
+
+-- Pass 1: classify ingredient_ref rows by additive/E-code concern.
+-- A tier is only ever raised (GREATEST) and an existing non-blank
+-- concern_reason is kept, so re-running this statement changes nothing.
+WITH normalized AS (
+    -- n      = lower-cased English name
+    -- e_code = E-number anchored at the start of the name, with spaces and
+    --          hyphens stripped (e.g. 'E 150d' -> 'e150d'); '' when the name
+    --          does not begin with an E-number
+    SELECT
+        ingredient_id,
+        lower(name_en) AS n,
+        CASE
+            WHEN lower(name_en) ~ '^\s*e[ -]?[0-9]{3,4}[a-z]?($|[^a-z0-9])'
+            THEN regexp_replace(
+                substring(lower(name_en) from '^\s*e[ -]?[0-9]{3,4}[a-z]?'),
+                '[^a-z0-9]',
+                '',
+                'g'
+            )
+            ELSE ''
+        END AS e_code
+    FROM ingredient_ref
+    WHERE name_en IS NOT NULL
+),
+tiered AS (
+    -- CASE stops at the first matching branch, so the highest tier wins.
+    SELECT
+        ingredient_id,
+        CASE
+            -- tier 3: nitrite / nitrate preservatives
+            WHEN e_code IN ('e250','e252')
+              OR n LIKE ANY (ARRAY['%sodium nitrite%', '%potassium nitrate%'])
+            THEN 3
+
+            -- tier 2: preservatives, sweeteners, sulphites, carrageenan
+            WHEN e_code IN (
+                    'e133','e150d','e211','e220','e223','e319','e385',
+                    'e407','e407a','e621','e950','e951','e955'
+                 )
+              OR n LIKE ANY (ARRAY[
+                    '%sodium benzoate%', '%benzoate%', '%sucralose%',
+                    '%aspartame%', '%carrageenan%',
+                    '%sodium metabisulphite%', '%contains sulphites%'
+                 ])
+              OR n IN ('sulfite','sulfiten','sulphite','sulphites')
+            THEN 2
+
+            -- tier 1: lower concern additives and sorbates
+            WHEN e_code IN (
+                    'e150','e150a','e150b','e150c','e172','e200','e202',
+                    'e281','e282','e338','e339','e340','e341',
+                    'e420','e421','e422','e440','e450','e451','e452',
+                    'e460','e466','e471','e472','e500','e501','e503',
+                    'e960','e960a','e960c'
+                 )
+              OR n LIKE ANY (ARRAY['%potassium sorbate%', '%sorbate%'])
+            THEN 1
+
+            ELSE 0
+        END AS target_concern_tier
+    FROM normalized
+),
+classified AS (
+    -- The reason text is derived from the tier, so the matching rules in
+    -- "tiered" are the single place that decides both columns.
+    SELECT
+        ingredient_id,
+        target_concern_tier,
+        CASE target_concern_tier
+            WHEN 3 THEN 'Strict additive/E-code classifier: nitrite/nitrate preservative concern'
+            WHEN 2 THEN 'Strict additive/E-code classifier: preservative, sweetener, sulphite, or carrageenan concern'
+            WHEN 1 THEN 'Strict additive/E-code classifier: lower concern additive or sorbate'
+        END AS target_concern_reason
+    FROM tiered
+    WHERE target_concern_tier > 0
+)
+UPDATE ingredient_ref ir
+SET
+    concern_tier = GREATEST(COALESCE(ir.concern_tier, 0), c.target_concern_tier),
+    -- every row matched by the classifier is flagged as an additive
+    is_additive = true,
+    concern_reason = CASE
+        WHEN COALESCE(trim(ir.concern_reason), '') = ''
+        THEN c.target_concern_reason
+        ELSE ir.concern_reason
+    END
+FROM classified c
+WHERE ir.ingredient_id = c.ingredient_id
+  -- touch a row only when at least one of the three columns would change
+  AND (
+        COALESCE(ir.concern_tier, 0) < c.target_concern_tier
+        OR ir.is_additive IS DISTINCT FROM true
+        OR COALESCE(trim(ir.concern_reason), '') = ''
+  );
+
+-- Pass 2: recompute products.ingredient_concern_score for non-deprecated
+-- products: 15 / 40 / 100 points per linked tier 1 / 2 / 3 ingredient, summed
+-- and capped at 100; products with no tiered ingredient get 0.
+WITH tier_points AS (
+    -- capped point total per product over linked ingredients with tier > 0
+    SELECT
+        pi.product_id,
+        LEAST(100, SUM(
+            CASE
+                WHEN ir.concern_tier = 1 THEN 15
+                WHEN ir.concern_tier = 2 THEN 40
+                WHEN ir.concern_tier = 3 THEN 100
+                ELSE 0
+            END
+        ))::int AS score
+    FROM product_ingredient pi
+    JOIN ingredient_ref ir ON ir.ingredient_id = pi.ingredient_id
+    WHERE COALESCE(ir.concern_tier, 0) > 0
+    GROUP BY pi.product_id
+),
+target_scores AS (
+    -- LEFT JOIN keeps products without any tiered ingredient so that a stale
+    -- non-zero score is reset to 0.
+    SELECT
+        live.product_id,
+        COALESCE(tp.score, 0)::int AS score
+    FROM products live
+    LEFT JOIN tier_points tp ON tp.product_id = live.product_id
+    WHERE live.is_deprecated IS NOT TRUE
+)
+UPDATE products p
+SET ingredient_concern_score = ts.score
+FROM target_scores ts
+WHERE p.product_id = ts.product_id
+  AND p.is_deprecated IS NOT TRUE
+  -- write only rows whose stored score differs from the recomputed one
+  AND p.ingredient_concern_score IS DISTINCT FROM ts.score;
+
+COMMIT;
+