From c7d8cf6b5655e8f89346f47f7759dcafd7dc0cd3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= <sabahatsngzr@gmail.com>
Date: Mon, 13 Apr 2026 11:28:29 +0300
Subject: [PATCH 01/15] feat: improve a101 scraper parsing and filtering

---
 scraper/a101/scraper.py | 107 +++++++++++++++++++++++++++-------------
 1 file changed, 72 insertions(+), 35 deletions(-)

diff --git a/scraper/a101/scraper.py b/scraper/a101/scraper.py
index 3030c1b7..79aa2fe7 100644
--- a/scraper/a101/scraper.py
+++ b/scraper/a101/scraper.py
@@ -7,10 +7,12 @@ def extract_unit_info(product_name: str):
         return None, None
 
     patterns = [
-        (r"(\d+(?:[.,]\d+)?)\s*(kg|KG|Kg)", "KG"),
-        (r"(\d+(?:[.,]\d+)?)\s*(g|G|gr|GR)", "GRAM"),
-        (r"(\d+(?:[.,]\d+)?)\s*(l|L|lt|LT)", "LITER"),
-        (r"(\d+(?:[.,]\d+)?)\s*(ml|ML)", "ML"),
+        (r"(\d+(?:[.,]\d+)?)\s*(kg|KG|Kg)\b", "KG"),
+        (r"(\d+(?:[.,]\d+)?)\s*(g|G|gr|GR)\b", "GRAM"),
+        (r"(\d+(?:[.,]\d+)?)\s*(l|L|lt|LT)\b", "LITER"),
+        (r"(\d+(?:[.,]\d+)?)\s*(ml|ML)\b", "ML"),
+        (r"(\d+(?:[.,]\d+)?)\s*(adet|Adet|ADET)\b", "PIECE"),
+        (r"(\d+(?:[.,]\d+)?)\s*(li|LI)\b", "PIECE"),
     ]
 
     for pattern, unit in patterns:
@@ -38,29 +40,65 @@ def is_valid_product_name(name: str) -> bool:
         "Aramak istediğin ürünü yaz",
         "Böyle bir sayfa bulamadık",
         "A101 Hep Ucuz",
+        "Kapıda",
+        "Teslimat",
+        "Seçtiğin mağaza",
     ]
 
+    lowered = name.lower().strip()
+
+    if len(lowered) < 3:
+        return False
+
     for fragment in invalid_fragments:
-        if fragment.lower() in name.lower():
+        if fragment.lower() in lowered:
             return False
 
+    if "₺" in lowered:
+        return False
+
     return True
 
 
 def parse_price_from_lines(lines):
+    price_pattern = re.compile(r"₺\s*([\d\.]+(?:,\d{1,2})?)")
+
+    prices = []
     for line in lines:
-        if "₺" in line:
-            raw_price = (
-                line.replace("₺", "")
-                .replace(".", "")
-                .replace(",", ".")
-                .strip()
-            )
+        matches = price_pattern.findall(line)
+        for match in matches:
+            raw_price = match.replace(".", "").replace(",", ".").strip()
             try:
-                return float(raw_price)
+                prices.append(float(raw_price))
             except ValueError:
                 continue
-    return None
+
+    if not prices:
+        return None
+
+    return min(prices)
+
+
+def clean_product_name(lines):
+    candidates = []
+
+    for line in lines:
+        line = line.strip()
+
+        if not is_valid_product_name(line):
+            continue
+
+        if re.search(r"\d+\s*(?:kg|g|gr|l|lt|ml|adet|li)\b", line, flags=re.IGNORECASE):
+            candidates.append(line)
+            continue
+
+        if not re.search(r"^\d+[.,]?\d*$", line) and "₺" not in line:
+            candidates.append(line)
+
+    if not candidates:
+        return None
+
+    return max(candidates, key=len).strip()
 
 
 def get_a101_products(category_slug: str):
@@ -68,11 +106,10 @@ def get_a101_products(category_slug: str):
     products = []
 
     with sync_playwright() as p:
-        browser = p.chromium.launch(headless=False)
+        browser = p.chromium.launch(headless=True)
         page = browser.new_page()
-        page.goto(url, timeout=60000)
+        page.goto(url, timeout=60000, wait_until="domcontentloaded")
 
-        # Konum popup
         for text in [
             "Bu defalık izin ver",
             "Siteyi ziyaret ederken izin ver",
@@ -84,7 +121,6 @@ def get_a101_products(category_slug: str):
             except Exception:
                 pass
 
-        # Cookie popup
         for text in ["KABUL ET", "Kabul Et", "Tümünü Kabul Et"]:
             try:
                 page.get_by_text(text, exact=True).click(timeout=3000)
@@ -92,14 +128,15 @@ def get_a101_products(category_slug: str):
             except Exception:
                 pass
 
-        page.wait_for_timeout(4000)
+        page.wait_for_timeout(3000)
 
-        # Lazy loading için scroll
-        for _ in range(8):
-            page.mouse.wheel(0, 2500)
-            page.wait_for_timeout(1200)
+        for _ in range(10):
+            page.mouse.wheel(0, 3000)
+            page.wait_for_timeout(1000)
 
-        cards = page.locator("div[class*='product']").all()
+        cards = page.locator("div[class*='product'], article, a[href*='/kapida/']").all()
+
+        seen_names = set()
 
         for i, card in enumerate(cards):
             try:
@@ -115,29 +152,29 @@ def get_a101_products(category_slug: str):
                 if price is None:
                     continue
 
-                name = None
-                for line in lines:
-                    if "₺" not in line and len(line) > 2:
-                        if is_valid_product_name(line):
-                            name = line
-                            break
-
+                name = clean_product_name(lines)
                 if not name:
                     continue
 
+                normalized_name = name.lower().strip()
+                if normalized_name in seen_names:
+                    continue
+
+                seen_names.add(normalized_name)
+
                 extracted_unit, extracted_amount = extract_unit_info(name)
 
                 products.append(
                     {
-                        "product_id": f"a101_{i}",
+                        "product_id": f"a101_{category_slug}_{i}",
                         "product_name": name,
-                        "sku": f"a101_{i}",
+                        "sku": f"a101_{category_slug}_{i}",
                         "shown_price_tl": price,
                         "regular_price_tl": price,
                         "discount_rate": None,
                         "product_url": url,
                         "brand_name": None,
-                        "category_name": "Meyve ve Sebze",
+                        "category_name": category_slug.replace("-", " ").title(),
                         "unit": extracted_unit,
                         "unit_amount": extracted_amount,
                     }
@@ -148,4 +185,4 @@ def get_a101_products(category_slug: str):
 
         browser.close()
 
-    return products
\ No newline at end of file
+    return products

From 3e2a2e2b4f1f4839cd7df177e5b442e0d384d348 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= <sabahatsngzr@gmail.com>
Date: Mon, 13 Apr 2026 13:12:18 +0300
Subject: [PATCH 02/15] debug: inspect a101 scraper output

---
 scraper/a101/scraper.py | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/scraper/a101/scraper.py b/scraper/a101/scraper.py
index 79aa2fe7..2391b010 100644
--- a/scraper/a101/scraper.py
+++ b/scraper/a101/scraper.py
@@ -106,7 +106,7 @@ def get_a101_products(category_slug: str):
     products = []
 
     with sync_playwright() as p:
-        browser = p.chromium.launch(headless=True)
+        browser = p.chromium.launch(headless=False)
         page = browser.new_page()
         page.goto(url, timeout=60000, wait_until="domcontentloaded")
 
@@ -136,11 +136,18 @@ def get_a101_products(category_slug: str):
 
         cards = page.locator("div[class*='product'], article, a[href*='/kapida/']").all()
 
+        print(f"DEBUG - CATEGORY URL: {url}")
+        print(f"DEBUG - TOTAL CARDS FOUND: {len(cards)}")
+
         seen_names = set()
 
         for i, card in enumerate(cards):
             try:
                 text_blob = card.inner_text().strip()
+
+                if i < 15:
+                    print(f"DEBUG - RAW CARD {i}: {text_blob[:300]}")
+
                 if not text_blob:
                     continue
 
@@ -149,12 +156,17 @@ def get_a101_products(category_slug: str):
                     continue
 
                 price = parse_price_from_lines(lines)
-                if price is None:
+                name = clean_product_name(lines)
+
+                if i < 15:
+                    print(f"DEBUG - PARSED CARD {i}: name={name}, price={price}, lines={lines[:8]}")
+
+                # Bu aşamada filtreleri gevşek tutuyoruz
+                if price is None and name is None:
                     continue
 
-                name = clean_product_name(lines)
                 if not name:
-                    continue
+                    name = f"unknown_a101_product_{i}"
 
                 normalized_name = name.lower().strip()
                 if normalized_name in seen_names:
@@ -169,8 +181,8 @@ def get_a101_products(category_slug: str):
                         "product_id": f"a101_{category_slug}_{i}",
                         "product_name": name,
                         "sku": f"a101_{category_slug}_{i}",
-                        "shown_price_tl": price,
-                        "regular_price_tl": price,
+                        "shown_price_tl": price if price is not None else 0,
+                        "regular_price_tl": price if price is not None else 0,
                         "discount_rate": None,
                         "product_url": url,
                         "brand_name": None,
@@ -180,9 +192,12 @@ def get_a101_products(category_slug: str):
                     }
                 )
 
-            except Exception:
+            except Exception as e:
+                print(f"DEBUG - CARD ERROR {i}: {e}")
                 continue
 
+        print(f"DEBUG - TOTAL PRODUCTS RETURNED: {len(products)}")
+
         browser.close()
 
     return products

From 0d52582f4ff9d6e830aab31fcec467ffade59d85 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= <sabahatsngzr@gmail.com>
Date: Mon, 13 Apr 2026 13:18:14 +0300
Subject: [PATCH 03/15] fix: fail a101 pipeline run when scraper returns 0 products

---
 pipeline/run_a101_pipeline.py | 38 ++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 10 deletions(-)

diff --git a/pipeline/run_a101_pipeline.py b/pipeline/run_a101_pipeline.py
index 97b0ad9d..7692a094 100644
--- a/pipeline/run_a101_pipeline.py
+++ b/pipeline/run_a101_pipeline.py
@@ -35,6 +35,17 @@ def run_pipeline(category_key: str):
         products = get_a101_category_products(category_slug)
         logger.info("A101 scraped %d products", len(products))
 
+        if products:
+            logger.info("A101 first 5 products preview:")
+            for product in products[:5]:
+                logger.info(
+                    "product_name=%r shown_price_tl=%r unit=%r unit_amount=%r",
+                    product.get("product_name"),
+                    product.get("shown_price_tl"),
+                    product.get("unit"),
+                    product.get("unit_amount"),
+                )
+
         # -------------------------
         # 2) DB CONNECT
         # -------------------------
@@ -51,6 +62,20 @@ def run_pipeline(category_key: str):
             )
             conn.commit()
 
+        # Eğer 0 ürün geldiyse başarılı sayma
+        if not products:
+            with conn.cursor() as cur:
+                fail_run(
+                    cur,
+                    run_id,
+                    f"A101 scraper returned 0 products for category_key={category_key} category_slug={category_slug}",
+                )
+                conn.commit()
+
+            raise RuntimeError(
+                f"A101 scraper returned 0 products for category_key={category_key} category_slug={category_slug}"
+            )
+
         raw_count = 0
         stg_count = 0
         fact_count = 0
@@ -62,7 +87,6 @@ def run_pipeline(category_key: str):
         for product in products:
             try:
                 with conn.cursor() as cur:
-                    # RAW
                     event_id = insert_raw_event(
                         cur,
                         run_id=run_id,
@@ -72,7 +96,6 @@ def run_pipeline(category_key: str):
                         currency=currency,
                     )
 
-                    # STG SOURCE
                     insert_stg_source_product(
                         cur,
                         event_id=event_id,
@@ -81,17 +104,14 @@ def run_pipeline(category_key: str):
                         source_name=source_name,
                     )
 
-                    # TRANSFORM
                     transformed = transform_product(product)
 
-                    # DIM
                     product_id = get_or_create_product_id(
                         cur,
                         transformed["standardized_product_name"],
                         transformed.get("category_name"),
                     )
 
-                    # STG NORMALIZED
                     insert_stg_normalized_observation(
                         cur,
                         event_id,
@@ -101,7 +121,6 @@ def run_pipeline(category_key: str):
                         source_name=source_name,
                     )
 
-                    # STG OBS
                     observation_id = insert_stg_observation(
                         cur,
                         event_id,
@@ -112,7 +131,6 @@ def run_pipeline(category_key: str):
                         currency=currency,
                     )
 
-                    # FACT
                     inserted = insert_fact_observation(
                         cur,
                         observation_id,
@@ -167,7 +185,7 @@ def run_pipeline(category_key: str):
         if conn:
             try:
                 conn.rollback()
-            except:
+            except Exception:
                 pass
 
         if conn and run_id:
@@ -175,7 +193,7 @@ def run_pipeline(category_key: str):
                 with conn.cursor() as cur:
                     fail_run(cur, run_id, str(e))
                     conn.commit()
-            except:
+            except Exception:
                 pass
 
         logger.exception("A101 pipeline failed: %s", e)
@@ -183,4 +201,4 @@ def run_pipeline(category_key: str):
 
     finally:
         if conn:
-            conn.close()
\ No newline at end of file
+            conn.close()

From 1ba523a3a71386d0d41570c3558e093f1f0e085a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= <sabahatsngzr@gmail.com>
Date: Mon, 13 Apr 2026 13:18:53 +0300
Subject: [PATCH 04/15] debug: add detailed a101 scraper diagnostics

---
 scraper/a101/scraper.py | 67 ++++++++++++++++++++++++++++++-----------
 1 file changed, 50 insertions(+), 17 deletions(-)

diff --git a/scraper/a101/scraper.py b/scraper/a101/scraper.py
index 2391b010..9b446bee 100644
--- a/scraper/a101/scraper.py
+++ b/scraper/a101/scraper.py
@@ -106,10 +106,15 @@ def get_a101_products(category_slug: str):
     products = []
 
     with sync_playwright() as p:
-        browser = p.chromium.launch(headless=False)
-        page = browser.new_page()
+        browser = p.chromium.launch(headless=True)
+        page = browser.new_page(viewport={"width": 1440, "height": 2200})
+
+        print(f"DEBUG - CATEGORY URL: {url}")
+
         page.goto(url, timeout=60000, wait_until="domcontentloaded")
+        page.wait_for_timeout(5000)
 
+        # Location popup
         for text in [
             "Bu defalık izin ver",
             "Siteyi ziyaret ederken izin ver",
@@ -117,36 +122,62 @@ def get_a101_products(category_slug: str):
         ]:
             try:
                 page.get_by_text(text, exact=True).click(timeout=3000)
+                print(f"DEBUG - clicked location popup button: {text}")
                 break
             except Exception:
                 pass
 
+        # Cookie popup
         for text in ["KABUL ET", "Kabul Et", "Tümünü Kabul Et"]:
             try:
                 page.get_by_text(text, exact=True).click(timeout=3000)
+                print(f"DEBUG - clicked cookie popup button: {text}")
                 break
             except Exception:
                 pass
 
-        page.wait_for_timeout(3000)
+        page.wait_for_timeout(4000)
 
-        for _ in range(10):
+        for i in range(10):
             page.mouse.wheel(0, 3000)
-            page.wait_for_timeout(1000)
+            page.wait_for_timeout(1200)
+            print(f"DEBUG - scroll step {i + 1}")
 
-        cards = page.locator("div[class*='product'], article, a[href*='/kapida/']").all()
+        body_text = page.locator("body").inner_text()
+        print(f"DEBUG - BODY TEXT SAMPLE: {body_text[:2000]}")
 
-        print(f"DEBUG - CATEGORY URL: {url}")
-        print(f"DEBUG - TOTAL CARDS FOUND: {len(cards)}")
+        selectors = [
+            "div[class*='product']",
+            "article",
+            "a[href*='/kapida/']",
+            "div[class*='grid'] > div",
+        ]
+
+        best_cards = []
+        best_selector = None
+
+        for selector in selectors:
+            try:
+                current_cards = page.locator(selector).all()
+                print(f"DEBUG - SELECTOR {selector} -> {len(current_cards)} cards")
+
+                if len(current_cards) > len(best_cards):
+                    best_cards = current_cards
+                    best_selector = selector
+            except Exception as e:
+                print(f"DEBUG - SELECTOR ERROR {selector}: {e}")
+
+        print(f"DEBUG - BEST SELECTOR: {best_selector}")
+        print(f"DEBUG - TOTAL CARDS FOUND: {len(best_cards)}")
 
         seen_names = set()
 
-        for i, card in enumerate(cards):
+        for i, card in enumerate(best_cards):
             try:
                 text_blob = card.inner_text().strip()
 
-                if i < 15:
-                    print(f"DEBUG - RAW CARD {i}: {text_blob[:300]}")
+                if i < 20:
+                    print(f"DEBUG - RAW CARD {i}: {text_blob[:400]}")
 
                 if not text_blob:
                     continue
@@ -158,15 +189,17 @@ def get_a101_products(category_slug: str):
                 price = parse_price_from_lines(lines)
                 name = clean_product_name(lines)
 
-                if i < 15:
-                    print(f"DEBUG - PARSED CARD {i}: name={name}, price={price}, lines={lines[:8]}")
+                if i < 20:
+                    print(f"DEBUG - PARSED CARD {i}: name={name}, price={price}, lines={lines[:10]}")
 
-                # Bu aşamada filtreleri gevşek tutuyoruz
                 if price is None and name is None:
                     continue
 
                 if not name:
-                    name = f"unknown_a101_product_{i}"
+                    continue
+
+                if price is None:
+                    continue
 
                 normalized_name = name.lower().strip()
                 if normalized_name in seen_names:
@@ -181,8 +214,8 @@ def get_a101_products(category_slug: str):
                         "product_id": f"a101_{category_slug}_{i}",
                         "product_name": name,
                         "sku": f"a101_{category_slug}_{i}",
-                        "shown_price_tl": price if price is not None else 0,
-                        "regular_price_tl": price if price is not None else 0,
+                        "shown_price_tl": price,
+                        "regular_price_tl": price,
                         "discount_rate": None,
                         "product_url": url,
                         "brand_name": None,

From b6697f7af70498f58ae3cb77f8cbdbf05a04f883 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= <sabahatsngzr@gmail.com>
Date: Mon, 13 Apr 2026 13:28:44 +0300
Subject: [PATCH 05/15] feat: configure a101 browser context and override navigator.webdriver

---
 scraper/a101/scraper.py | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/scraper/a101/scraper.py b/scraper/a101/scraper.py
index 9b446bee..3950da29 100644
--- a/scraper/a101/scraper.py
+++ b/scraper/a101/scraper.py
@@ -106,15 +106,38 @@ def get_a101_products(category_slug: str):
     products = []
 
     with sync_playwright() as p:
-        browser = p.chromium.launch(headless=True)
-        page = browser.new_page(viewport={"width": 1440, "height": 2200})
+        browser = p.chromium.launch(
+            headless=True,
+            args=[
+                "--disable-blink-features=AutomationControlled",
+                "--no-sandbox",
+                "--disable-dev-shm-usage",
+            ],
+        )
+
+        context = browser.new_context(
+            viewport={"width": 1440, "height": 2200},
+            user_agent=(
+                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                "AppleWebKit/537.36 (KHTML, like Gecko) "
+                "Chrome/123.0.0.0 Safari/537.36"
+            ),
+            locale="tr-TR",
+        )
+
+        page = context.new_page()
+
+        page.add_init_script("""
+            Object.defineProperty(navigator, 'webdriver', {
+                get: () => undefined
+            });
+        """)
 
         print(f"DEBUG - CATEGORY URL: {url}")
 
         page.goto(url, timeout=60000, wait_until="domcontentloaded")
         page.wait_for_timeout(5000)
 
-        # Location popup
         for text in [
             "Bu defalık izin ver",
             "Siteyi ziyaret ederken izin ver",
@@ -127,7 +150,6 @@ def get_a101_products(category_slug: str):
             except Exception:
                 pass
 
-        # Cookie popup
         for text in ["KABUL ET", "Kabul Et", "Tümünü Kabul Et"]:
             try:
                 page.get_by_text(text, exact=True).click(timeout=3000)
@@ -231,6 +253,7 @@ def get_a101_products(category_slug: str):
 
         print(f"DEBUG - TOTAL PRODUCTS RETURNED: {len(products)}")
 
+        context.close()
         browser.close()
 
     return products

From f19a8e760ac904c97298bf034452987e57ddee99 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= <sabahatsngzr@gmail.com>
Date: Mon, 13 Apr 2026 13:55:36 +0300
Subject: [PATCH 06/15] fix: use a101 working fruit category path

---
 config/retailers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/retailers.py b/config/retailers.py
index fe828187..0a06f3f4 100644
--- a/config/retailers.py
+++ b/config/retailers.py
@@ -13,7 +13,7 @@
         "source_name": "a101",
         "currency": "TRY",
           "categories": {
-            "fruit_veg": "meyve-ve-sebze",
+            "fruit_veg": "meyve-sebze",
         },
     }
 }

From cda308edd16db8a8cdad19c4042a03611124e7df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= <sabahatsngzr@gmail.com>
Date: Mon, 13 Apr 2026 14:10:40 +0300
Subject: [PATCH 07/15] fix: use fixed a101 product selector and extend invalid name fragments

---
 scraper/a101/scraper.py | 35 +++++++++--------------------------
 1 file changed, 9 insertions(+), 26 deletions(-)

diff --git a/scraper/a101/scraper.py b/scraper/a101/scraper.py
index 3950da29..1f039410 100644
--- a/scraper/a101/scraper.py
+++ b/scraper/a101/scraper.py
@@ -43,6 +43,11 @@ def is_valid_product_name(name: str) -> bool:
         "Kapıda",
         "Teslimat",
         "Seçtiğin mağaza",
+        "Kategoriler",
+        "Ana Sayfa",
+        "Site Haritası",
+        "İletişim",
+        "Yardım",
     ]
 
     lowered = name.lower().strip()
@@ -168,33 +173,14 @@ def get_a101_products(category_slug: str):
         body_text = page.locator("body").inner_text()
         print(f"DEBUG - BODY TEXT SAMPLE: {body_text[:2000]}")
 
-        selectors = [
-            "div[class*='product']",
-            "article",
-            "a[href*='/kapida/']",
-            "div[class*='grid'] > div",
-        ]
+        cards = page.locator("div[class*='product']").all()
 
-        best_cards = []
-        best_selector = None
-
-        for selector in selectors:
-            try:
-                current_cards = page.locator(selector).all()
-                print(f"DEBUG - SELECTOR {selector} -> {len(current_cards)} cards")
-
-                if len(current_cards) > len(best_cards):
-                    best_cards = current_cards
-                    best_selector = selector
-            except Exception as e:
-                print(f"DEBUG - SELECTOR ERROR {selector}: {e}")
-
-        print(f"DEBUG - BEST SELECTOR: {best_selector}")
-        print(f"DEBUG - TOTAL CARDS FOUND: {len(best_cards)}")
+        print("DEBUG - FIXED SELECTOR: div[class*='product']")
+        print(f"DEBUG - TOTAL CARDS FOUND: {len(cards)}")
 
         seen_names = set()
 
-        for i, card in enumerate(best_cards):
+        for i, card in enumerate(cards):
             try:
                 text_blob = card.inner_text().strip()
 
@@ -214,9 +200,6 @@ def get_a101_products(category_slug: str):
                 if i < 20:
                     print(f"DEBUG - PARSED CARD {i}: name={name}, price={price}, lines={lines[:10]}")
 
-                if price is None and name is None:
-                    continue
-
                 if not name:
                     continue
 

From 778ea3a99a3d1cab47d1e448b8df39a54df25788 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= <sabahatsngzr@gmail.com>
Date: Mon, 13 Apr 2026 14:22:34 +0300
Subject: [PATCH 08/15] feat: aggregate a101 fruit_veg subcategories

---
 pipeline/run_a101_pipeline.py | 44 ++++++++++++++++++++++++++++++-----
 1 file changed, 38 insertions(+), 6 deletions(-)

diff --git a/pipeline/run_a101_pipeline.py b/pipeline/run_a101_pipeline.py
index 7692a094..d24c947c 100644
--- a/pipeline/run_a101_pipeline.py
+++ b/pipeline/run_a101_pipeline.py
@@ -23,7 +23,16 @@
 
 
 def run_pipeline(category_key: str):
-    category_slug = RETAILER_CONFIG["a101"]["categories"][category_key]
+    base_category_slug = RETAILER_CONFIG["a101"]["categories"][category_key]
+
+    if category_key == "fruit_veg":
+        category_slugs = [
+            f"{base_category_slug}/meyve",
+            f"{base_category_slug}/sebze",
+            f"{base_category_slug}/yesillik",
+        ]
+    else:
+        category_slugs = [base_category_slug]
 
     conn = None
     run_id = None
@@ -32,7 +41,30 @@ def run_pipeline(category_key: str):
         # -------------------------
         # 1) SCRAPE
         # -------------------------
-        products = get_a101_category_products(category_slug)
+        products = []
+        seen_product_names = set()
+
+        for slug in category_slugs:
+            logger.info("Scraping A101 subcategory: %s", slug)
+            subcategory_products = get_a101_category_products(slug)
+
+            logger.info(
+                "A101 subcategory %s returned %d products",
+                slug,
+                len(subcategory_products),
+            )
+
+            for product in subcategory_products:
+                product_name = (product.get("product_name") or "").strip().lower()
+                if not product_name:
+                    continue
+
+                if product_name in seen_product_names:
+                    continue
+
+                seen_product_names.add(product_name)
+                products.append(product)
+
         logger.info("A101 scraped %d products", len(products))
 
         if products:
@@ -56,7 +88,7 @@ def run_pipeline(category_key: str):
                 cur,
                 source_name=source_name,
                 category_key=category_key,
-                category_slug=category_slug,
+                category_slug=base_category_slug,
                 triggered_by="local_test",
                 pipeline_version="v2-a101",
             )
@@ -68,12 +100,12 @@ def run_pipeline(category_key: str):
                 fail_run(
                     cur,
                     run_id,
-                    f"A101 scraper returned 0 products for category_key={category_key} category_slug={category_slug}",
+                    f"A101 scraper returned 0 products for category_key={category_key} category_slug={base_category_slug}",
                 )
                 conn.commit()
 
             raise RuntimeError(
-                f"A101 scraper returned 0 products for category_key={category_key} category_slug={category_slug}"
+                f"A101 scraper returned 0 products for category_key={category_key} category_slug={base_category_slug}"
             )
 
         raw_count = 0
@@ -91,7 +123,7 @@ def run_pipeline(category_key: str):
                         cur,
                         run_id=run_id,
                         product=product,
-                        category_slug=category_slug,
+                        category_slug=base_category_slug,
                         source_name=source_name,
                         currency=currency,
                     )

From ee9b8f97f83bc9b48bcbd4127f77477fb10bf67f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= <sabahatsngzr@gmail.com>
Date: Mon, 13 Apr 2026 14:37:18 +0300
Subject: [PATCH 09/15] debug: log a101 subcategory aggregation

---
 pipeline/run_a101_pipeline.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/pipeline/run_a101_pipeline.py b/pipeline/run_a101_pipeline.py
index d24c947c..201de234 100644
--- a/pipeline/run_a101_pipeline.py
+++ b/pipeline/run_a101_pipeline.py
@@ -34,6 +34,9 @@ def run_pipeline(category_key: str):
     else:
         category_slugs = [base_category_slug]
 
+    logger.info("DEBUG - base_category_slug=%s", base_category_slug)
+    logger.info("DEBUG - category_slugs=%s", category_slugs)
+
     conn = None
     run_id = None
 
@@ -45,15 +48,22 @@ def run_pipeline(category_key: str):
         seen_product_names = set()
 
         for slug in category_slugs:
-            logger.info("Scraping A101 subcategory: %s", slug)
+            logger.info("DEBUG - Scraping A101 subcategory: %s", slug)
             subcategory_products = get_a101_category_products(slug)
 
             logger.info(
-                "A101 subcategory %s returned %d products",
+                "DEBUG - A101 subcategory %s returned %d products",
                 slug,
                 len(subcategory_products),
             )
 
+            if subcategory_products:
+                logger.info(
+                    "DEBUG - First 3 products from %s: %s",
+                    slug,
+                    [p.get("product_name") for p in subcategory_products[:3]],
+                )
+
             for product in subcategory_products:
                 product_name = (product.get("product_name") or "").strip().lower()
                 if not product_name:

From 0fd7060df6f58070f1d3cb60acfd5c77dcd7e85d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= <sabahatsngzr@gmail.com>
Date: Mon, 13 Apr 2026 15:24:01 +0300
Subject: [PATCH 10/15] fix: clean a101 product card parsing

---
 scraper/a101/scraper.py | 140 ++++++++++++----------------------------
 1 file changed, 40 insertions(+), 100 deletions(-)

diff --git a/scraper/a101/scraper.py b/scraper/a101/scraper.py
index 1f039410..27e77147 100644
--- a/scraper/a101/scraper.py
+++ b/scraper/a101/scraper.py
@@ -27,85 +27,6 @@ def extract_unit_info(product_name: str):
     return None, None
 
 
-def is_valid_product_name(name: str) -> bool:
-    if not name:
-        return False
-
-    invalid_fragments = [
-        "Popüler Ürünler",
-        "Çerez Kullanımı",
-        "Kampanyalar",
-        "Giriş Yap",
-        "Sepetim",
-        "Aramak istediğin ürünü yaz",
-        "Böyle bir sayfa bulamadık",
-        "A101 Hep Ucuz",
-        "Kapıda",
-        "Teslimat",
-        "Seçtiğin mağaza",
-        "Kategoriler",
-        "Ana Sayfa",
-        "Site Haritası",
-        "İletişim",
-        "Yardım",
-    ]
-
-    lowered = name.lower().strip()
-
-    if len(lowered) < 3:
-        return False
-
-    for fragment in invalid_fragments:
-        if fragment.lower() in lowered:
-            return False
-
-    if "₺" in lowered:
-        return False
-
-    return True
-
-
-def parse_price_from_lines(lines):
-    price_pattern = re.compile(r"₺\s*([\d\.]+(?:,\d{1,2})?)")
-
-    prices = []
-    for line in lines:
-        matches = price_pattern.findall(line)
-        for match in matches:
-            raw_price = match.replace(".", "").replace(",", ".").strip()
-            try:
-                prices.append(float(raw_price))
-            except ValueError:
-                continue
-
-    if not prices:
-        return None
-
-    return min(prices)
-
-
-def clean_product_name(lines):
-    candidates = []
-
-    for line in lines:
-        line = line.strip()
-
-        if not is_valid_product_name(line):
-            continue
-
-        if re.search(r"\d+\s*(?:kg|g|gr|l|lt|ml|adet|li)\b", line, flags=re.IGNORECASE):
-            candidates.append(line)
-            continue
-
-        if not re.search(r"^\d+[.,]?\d*$", line) and "₺" not in line:
-            candidates.append(line)
-
-    if not candidates:
-        return None
-
-    return max(candidates, key=len).strip()
-
-
 def get_a101_products(category_slug: str):
     url = f"https://www.a101.com.tr/kapida/{category_slug}/"
     products = []
@@ -173,35 +94,55 @@ def get_a101_products(category_slug: str):
         body_text = page.locator("body").inner_text()
         print(f"DEBUG - BODY TEXT SAMPLE: {body_text[:2000]}")
 
-        cards = page.locator("div[class*='product']").all()
+        cards = page.locator("div[data-testid='product-card']").all()
 
-        print("DEBUG - FIXED SELECTOR: div[class*='product']")
+        print("DEBUG - USING SELECTOR: div[data-testid='product-card']")
         print(f"DEBUG - TOTAL CARDS FOUND: {len(cards)}")
 
         seen_names = set()
 
         for i, card in enumerate(cards):
             try:
-                text_blob = card.inner_text().strip()
-
-                if i < 20:
-                    print(f"DEBUG - RAW CARD {i}: {text_blob[:400]}")
-
-                if not text_blob:
-                    continue
-
-                lines = [x.strip() for x in text_blob.splitlines() if x.strip()]
-                if not lines:
+                name = card.locator("h3").inner_text().strip()
+
+                # Meyve-sebze dışı ürünleri ele
+                if any(
+                    x in name.lower()
+                    for x in [
+                        "süt",
+                        "peynir",
+                        "çikolata",
+                        "deterjan",
+                        "pirinç",
+                        "yumurta",
+                    ]
+                ):
                     continue
 
-                price = parse_price_from_lines(lines)
-                name = clean_product_name(lines)
-
-                if i < 20:
-                    print(f"DEBUG - PARSED CARD {i}: name={name}, price={price}, lines={lines[:10]}")
-
-                if not name:
-                    continue
+                price = None
+                candidate_texts = []
+
+                try:
+                    spans = card.locator("span").all_inner_texts()
+                    candidate_texts.extend(spans)
+                except Exception:
+                    pass
+
+                try:
+                    card_text = card.inner_text()
+                    candidate_texts.append(card_text)
+                except Exception:
+                    pass
+
+                for text in candidate_texts:
+                    matches = re.findall(r"₺\s*([\d\.]+(?:,\d{1,2})?)", text)
+                    if matches:
+                        raw_price = matches[0].replace(".", "").replace(",", ".").strip()
+                        try:
+                            price = float(raw_price)
+                            break
+                        except ValueError:
+                            continue
 
                 if price is None:
                     continue
@@ -209,7 +150,6 @@ def get_a101_products(category_slug: str):
                 normalized_name = name.lower().strip()
                 if normalized_name in seen_names:
                     continue
-
                 seen_names.add(normalized_name)
 
                 extracted_unit, extracted_amount = extract_unit_info(name)

From 27ea39d1e4766b1ada3b06c6f3c1b98c900e7457 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= <sabahatsngzr@gmail.com>
Date: Mon, 13 Apr 2026 15:31:36 +0300
Subject: [PATCH 11/15] Update scraper.py: parse a101 products from body text

---
 scraper/a101/scraper.py | 202 ++++++++++++++++++++++++----------------
 1 file changed, 122 insertions(+), 80 deletions(-)

diff --git a/scraper/a101/scraper.py b/scraper/a101/scraper.py
index 27e77147..194be55c 100644
--- a/scraper/a101/scraper.py
+++ b/scraper/a101/scraper.py
@@ -12,7 +12,7 @@ def extract_unit_info(product_name: str):
         (r"(\d+(?:[.,]\d+)?)\s*(l|L|lt|LT)\b", "LITER"),
         (r"(\d+(?:[.,]\d+)?)\s*(ml|ML)\b", "ML"),
         (r"(\d+(?:[.,]\d+)?)\s*(adet|Adet|ADET)\b", "PIECE"),
-        (r"(\d+(?:[.,]\d+)?)\s*(li|LI)\b", "PIECE"),
+        (r"(\d+(?:[.,]\d+)?)\s*(li|LI|'lu|'lü)\b", "PIECE"),
     ]
 
     for pattern, unit in patterns:
@@ -27,6 +27,124 @@ def extract_unit_info(product_name: str):
     return None, None
 
 
+def normalize_category_name(category_slug: str) -> str:
+    if category_slug.endswith("/meyve"):
+        return "Meyve"
+    if category_slug.endswith("/sebze"):
+        return "Sebze"
+    if category_slug.endswith("/yesillik"):
+        return "Yesillik"
+    return category_slug.replace("-", " ").title()
+
+
+def get_section_header(category_slug: str) -> str:
+    if category_slug.endswith("/meyve"):
+        return "Meyve"
+    if category_slug.endswith("/sebze"):
+        return "Sebze"
+    if category_slug.endswith("/yesillik"):
+        return "Yeşillik"
+    return None
+
+
+def parse_products_from_body_text(body_text: str, category_slug: str):
+    lines = [line.strip() for line in body_text.splitlines() if line.strip()]
+
+    section_header = get_section_header(category_slug)
+    if not section_header:
+        return []
+
+    # İlgili bölümün başlangıcını bul
+    start_idx = None
+    for i, line in enumerate(lines):
+        if line == section_header:
+            start_idx = i
+            break
+
+    if start_idx is None:
+        print(f"DEBUG - section header not found: {section_header}")
+        return []
+
+    # Bir sonraki bölüm başlığına kadar git
+    stop_headers = {"Meyve", "Sebze", "Yeşillik", "Sepetim", "Giriş Yap", "Site Haritası"}
+    section_lines = []
+
+    for line in lines[start_idx + 1:]:
+        if line in stop_headers and line != section_header:
+            break
+        section_lines.append(line)
+
+    print(f"DEBUG - section_header={section_header}")
+    print(f"DEBUG - section_lines_sample={section_lines[:40]}")
+
+    products = []
+    seen_names = set()
+
+    i = 0
+    while i < len(section_lines) - 1:
+        name = section_lines[i]
+        next_line = section_lines[i + 1]
+
+        # fiyat satırı mı?
+        if re.match(r"^₺\s*[\d\.]+(?:,\d{1,2})?$", next_line):
+            # saçma satırları ele
+            lowered = name.lower()
+            if any(
+                x in lowered
+                for x in [
+                    "kampanyalar",
+                    "giriş yap",
+                    "sepetim",
+                    "anasayfa",
+                    "site haritası",
+                    "yardım",
+                    "iletişim",
+                ]
+            ):
+                i += 1
+                continue
+
+            raw_price = next_line.replace("₺", "").replace(".", "").replace(",", ".").strip()
+
+            try:
+                price = float(raw_price)
+            except ValueError:
+                i += 1
+                continue
+
+            normalized_name = name.lower().strip()
+            if normalized_name in seen_names:
+                i += 2
+                continue
+
+            seen_names.add(normalized_name)
+
+            extracted_unit, extracted_amount = extract_unit_info(name)
+
+            products.append(
+                {
+                    "product_id": f"a101_{category_slug}_{len(products)}",
+                    "product_name": name,
+                    "sku": f"a101_{category_slug}_{len(products)}",
+                    "shown_price_tl": price,
+                    "regular_price_tl": price,
+                    "discount_rate": None,
+                    "product_url": f"https://www.a101.com.tr/kapida/{category_slug}/",
+                    "brand_name": None,
+                    "category_name": normalize_category_name(category_slug),
+                    "unit": extracted_unit,
+                    "unit_amount": extracted_amount,
+                }
+            )
+
+            i += 2
+            continue
+
+        i += 1
+
+    return products
+
+
 def get_a101_products(category_slug: str):
     url = f"https://www.a101.com.tr/kapida/{category_slug}/"
     products = []
@@ -94,87 +212,11 @@ def get_a101_products(category_slug: str):
         body_text = page.locator("body").inner_text()
         print(f"DEBUG - BODY TEXT SAMPLE: {body_text[:2000]}")
 
-        cards = page.locator("div[data-testid='product-card']").all()
-
-        print("DEBUG - USING SELECTOR: div[data-testid='product-card']")
-        print(f"DEBUG - TOTAL CARDS FOUND: {len(cards)}")
-
-        seen_names = set()
-
-        for i, card in enumerate(cards):
-            try:
-                name = card.locator("h3").inner_text().strip()
-
-                # Meyve-sebze dışı ürünleri ele
-                if any(
-                    x in name.lower()
-                    for x in [
-                        "süt",
-                        "peynir",
-                        "çikolata",
-                        "deterjan",
-                        "pirinç",
-                        "yumurta",
-                    ]
-                ):
-                    continue
-
-                price = None
-                candidate_texts = []
-
-                try:
-                    spans = card.locator("span").all_inner_texts()
-                    candidate_texts.extend(spans)
-                except Exception:
-                    pass
-
-                try:
-                    card_text = card.inner_text()
-                    candidate_texts.append(card_text)
-                except Exception:
-                    pass
-
-                for text in candidate_texts:
-                    matches = re.findall(r"₺\s*([\d\.]+(?:,\d{1,2})?)", text)
-                    if matches:
-                        raw_price = matches[0].replace(".", "").replace(",", ".").strip()
-                        try:
-                            price = float(raw_price)
-                            break
-                        except ValueError:
-                            continue
-
-                if price is None:
-                    continue
-
-                normalized_name = name.lower().strip()
-                if normalized_name in seen_names:
-                    continue
-                seen_names.add(normalized_name)
-
-                extracted_unit, extracted_amount = extract_unit_info(name)
-
-                products.append(
-                    {
-                        "product_id": f"a101_{category_slug}_{i}",
-                        "product_name": name,
-                        "sku": f"a101_{category_slug}_{i}",
-                        "shown_price_tl": price,
-                        "regular_price_tl": price,
-                        "discount_rate": None,
-                        "product_url": url,
-                        "brand_name": None,
-                        "category_name": category_slug.replace("-", " ").title(),
-                        "unit": extracted_unit,
-                        "unit_amount": extracted_amount,
-                    }
-                )
-
-            except Exception as e:
-                print(f"DEBUG - CARD ERROR {i}: {e}")
-                continue
+        products = parse_products_from_body_text(body_text, category_slug)
 
         print(f"DEBUG - TOTAL PRODUCTS RETURNED: {len(products)}")
+        if products:
+            print(f"DEBUG - FIRST 10 PRODUCTS: {[p['product_name'] for p in products[:10]]}")
 
         context.close()
         browser.close()

From 8a295c24737ac262808e5e7aa54f8b7f64ad2f85 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= <sabahatsngzr@gmail.com>
Date: Mon, 13 Apr 2026 15:32:06 +0300
Subject: [PATCH 12/15] Update retailers.py: add trailing comma

---
 config/retailers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/retailers.py b/config/retailers.py
index 0a06f3f4..c8d1d339 100644
--- a/config/retailers.py
+++ b/config/retailers.py
@@ -15,5 +15,5 @@
           "categories": {
             "fruit_veg": "meyve-sebze",
         },
-    }
+    },
 }

From 10031c4554cacb9591a9a264eba5db628bbe975a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= <sabahatsngzr@gmail.com>
Date: Mon, 13 Apr 2026 15:43:13 +0300
Subject: [PATCH 13/15] Update scraper.py: pick section header near price lines

---
 scraper/a101/scraper.py | 58 ++++++++++++++++++++++++++++++-----------
 1 file changed, 43 insertions(+), 15 deletions(-)

diff --git a/scraper/a101/scraper.py b/scraper/a101/scraper.py
index 194be55c..07d8dccd 100644
--- a/scraper/a101/scraper.py
+++ b/scraper/a101/scraper.py
@@ -37,7 +37,7 @@ def normalize_category_name(category_slug: str) -> str:
     return category_slug.replace("-", " ").title()
 
 
-def get_section_header(category_slug: str) -> str:
+def get_section_header(category_slug: str) -> str | None:
     if category_slug.endswith("/meyve"):
         return "Meyve"
     if category_slug.endswith("/sebze"):
@@ -47,34 +47,61 @@ def get_section_header(category_slug: str) -> str:
     return None
 
 
+def is_price_line(text: str) -> bool:
+    return bool(re.match(r"^₺\s*[\d\.]+(?:,\d{1,2})?$", text))
+
+
+def find_best_section_start(lines: list[str], section_header: str) -> int | None:
+    """
+    Same header appears multiple times in the page.
+    Choose the occurrence that is followed by real product rows (price lines).
+    """
+    candidate_indices = [i for i, line in enumerate(lines) if line == section_header]
+
+    if not candidate_indices:
+        return None
+
+    best_idx = None
+    best_score = -1
+
+    for idx in candidate_indices:
+        window = lines[idx : idx + 80]
+        score = sum(1 for line in window if is_price_line(line))
+
+        # Prefer the first strong candidate with nearby prices
+        if score > best_score:
+            best_score = score
+            best_idx = idx
+
+    if best_score <= 0:
+        return None
+
+    return best_idx
+
+
 def parse_products_from_body_text(body_text: str, category_slug: str):
     lines = [line.strip() for line in body_text.splitlines() if line.strip()]
 
     section_header = get_section_header(category_slug)
     if not section_header:
+        print(f"DEBUG - section header missing for category_slug={category_slug}")
         return []
 
-    # İlgili bölümün başlangıcını bul
-    start_idx = None
-    for i, line in enumerate(lines):
-        if line == section_header:
-            start_idx = i
-            break
-
+    start_idx = find_best_section_start(lines, section_header)
     if start_idx is None:
-        print(f"DEBUG - section header not found: {section_header}")
+        print(f"DEBUG - section header not found or no priced rows nearby: {section_header}")
         return []
 
-    # Bir sonraki bölüm başlığına kadar git
     stop_headers = {"Meyve", "Sebze", "Yeşillik", "Sepetim", "Giriş Yap", "Site Haritası"}
-    section_lines = []
 
-    for line in lines[start_idx + 1:]:
+    section_lines = []
+    for line in lines[start_idx + 1 :]:
         if line in stop_headers and line != section_header:
             break
         section_lines.append(line)
 
     print(f"DEBUG - section_header={section_header}")
+    print(f"DEBUG - section_start_idx={start_idx}")
     print(f"DEBUG - section_lines_sample={section_lines[:40]}")
 
     products = []
@@ -85,10 +112,10 @@ def parse_products_from_body_text(body_text: str, category_slug: str):
         name = section_lines[i]
         next_line = section_lines[i + 1]
 
-        # fiyat satırı mı?
-        if re.match(r"^₺\s*[\d\.]+(?:,\d{1,2})?$", next_line):
-            # saçma satırları ele
+        if is_price_line(next_line):
             lowered = name.lower()
+
+            # obvious non-product lines
             if any(
                 x in lowered
                 for x in [
@@ -99,6 +126,7 @@ def parse_products_from_body_text(body_text: str, category_slug: str):
                     "site haritası",
                     "yardım",
                     "iletişim",
+                    "kategoriler",
                 ]
             ):
                 i += 1

From 2a73d1f25f5364203119d7b05d130eb7021f0891 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= <sabahatsngzr@gmail.com>
Date: Mon, 13 Apr 2026 16:07:52 +0300
Subject: [PATCH 14/15] debug: log why a101 records are skipped from fact

---
 pipeline/loaders_fact.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/pipeline/loaders_fact.py b/pipeline/loaders_fact.py
index 03d16861..59815bfb 100644
--- a/pipeline/loaders_fact.py
+++ b/pipeline/loaders_fact.py
@@ -43,9 +43,19 @@ def insert_fact_observation(
 
     if not can_insert:
         logger.info(
-            "Skipping fact insert — product=%r reason=%s",
+            (
+                "DEBUG - FACT SKIP | product=%r | reason=%s | "
+                "price=%r | normalized_unit=%r | normalized_quantity=%r | "
+                "price_per_unit=%r | standardized_product_name=%r | category_name=%r"
+            ),
             product.get("product_name"),
             reason,
+            transformed.get("price"),
+            transformed.get("normalized_unit"),
+            transformed.get("normalized_quantity"),
+            transformed.get("price_per_unit"),
+            transformed.get("standardized_product_name"),
+            transformed.get("category_name"),
         )
         return False
 
@@ -127,4 +137,11 @@ def insert_fact_observation(
         ),
     )
 
-    return cursor.rowcount == 1
\ No newline at end of file
+    if cursor.rowcount != 1:
+        logger.info(
+            "DEBUG - FACT NOT INSERTED | product=%r | reason=conflict_or_no_insert | event_id=%r",
+            product.get("product_name"),
+            event_id,
+        )
+
+    return cursor.rowcount == 1

From 6d5a2a25d9f158e809b62530a534b35d677aac22 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= <sabahatsngzr@gmail.com>
Date: Mon, 13 Apr 2026 17:27:45 +0300
Subject: [PATCH 15/15] fix: infer normalized unit from a101 product names

---
 pipeline/transforms.py | 75 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 61 insertions(+), 14 deletions(-)

diff --git a/pipeline/transforms.py b/pipeline/transforms.py
index e4a93d3c..b2374253 100644
--- a/pipeline/transforms.py
+++ b/pipeline/transforms.py
@@ -3,13 +3,8 @@
 
 
 def normalize_unit(
-    unit: Optional[str], quantity: Any
+    unit: Optional[str], quantity: Any, product_name: Optional[str] = None
 ) -> Tuple[Optional[str], Optional[float]]:
-    if unit is None:
-        return None, None
-
-    unit_upper = str(unit).strip().upper()
-
     qty: Optional[float] = None
     if quantity is not None:
         try:
@@ -17,15 +12,63 @@ def normalize_unit(
         except (TypeError, ValueError):
             qty = None
 
-    if unit_upper == "GRAM":
-        if qty is None:
-            return "kg", None
-        return "kg", round(qty / 1000, 4)
+    # 1) Unit varsa önce onu kullan
+    if unit is not None:
+        unit_upper = str(unit).strip().upper()
+
+        if unit_upper == "GRAM":
+            if qty is None:
+                return "kg", None
+            return "kg", round(qty / 1000, 4)
+
+        if unit_upper == "KG":
+            return "kg", qty if qty is not None else 1.0
+
+        if unit_upper == "PIECE":
+            return "piece", qty if qty is not None else 1.0
+
+        if unit_upper == "LITER":
+            return "liter", qty if qty is not None else 1.0
+
+        if unit_upper == "ML":
+            if qty is None:
+                return "liter", None
+            return "liter", round(qty / 1000, 4)
+
+        return unit.lower(), qty
+
+    # 2) Unit yoksa product_name'den çözmeye çalış
+    name = (product_name or "").lower().strip()
+
+    if not name:
+        return None, None
+
+    tr_map = {"ı": "i", "ğ": "g", "ü": "u", "ş": "s", "ö": "o", "ç": "c"}
+    for old, new in tr_map.items():
+        name = name.replace(old, new)
+
+    if re.search(r"\bkg\b", name):
+        return "kg", 1.0
 
-    if unit_upper == "PIECE":
-        return "piece", qty if qty is not None else 1.0
+    gram_match = re.search(r"(\d+(?:[.,]\d+)?)\s*g\b", name)
+    if gram_match:
+        grams = float(gram_match.group(1).replace(",", "."))
+        return "kg", round(grams / 1000, 4)
 
-    return unit.lower(), qty
+    liter_match = re.search(r"(\d+(?:[.,]\d+)?)\s*l\b", name)
+    if liter_match:
+        liters = float(liter_match.group(1).replace(",", "."))
+        return "liter", liters
+
+    ml_match = re.search(r"(\d+(?:[.,]\d+)?)\s*ml\b", name)
+    if ml_match:
+        ml = float(ml_match.group(1).replace(",", "."))
+        return "liter", round(ml / 1000, 4)
+
+    if re.search(r"\badet\b", name):
+        return "piece", 1.0
+
+    return None, None
 
 
 def standardize_product_name(product_name: Optional[str]) -> Optional[str]:
@@ -106,7 +149,11 @@ def transform_product(product: dict[str, Any]) -> dict[str, Any]:
     unit = product.get("unit")
     unit_amount = product.get("unit_amount")
 
-    normalized_unit, normalized_quantity = normalize_unit(unit, unit_amount)
+    normalized_unit, normalized_quantity = normalize_unit(
+        unit,
+        unit_amount,
+        product.get("product_name"),
+    )
     price_per_unit = calculate_price_per_unit(price, normalized_quantity)
     unit_price_label = build_unit_price_label(normalized_unit)
     standardized_product_name = standardize_product_name(product.get("product_name"))