From c7d8cf6b5655e8f89346f47f7759dcafd7dc0cd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= Date: Mon, 13 Apr 2026 11:28:29 +0300 Subject: [PATCH 01/15] feat: improve a101 scraper parsing and filtering --- scraper/a101/scraper.py | 107 +++++++++++++++++++++++++++------------- 1 file changed, 72 insertions(+), 35 deletions(-) diff --git a/scraper/a101/scraper.py b/scraper/a101/scraper.py index 3030c1b7..79aa2fe7 100644 --- a/scraper/a101/scraper.py +++ b/scraper/a101/scraper.py @@ -7,10 +7,12 @@ def extract_unit_info(product_name: str): return None, None patterns = [ - (r"(\d+(?:[.,]\d+)?)\s*(kg|KG|Kg)", "KG"), - (r"(\d+(?:[.,]\d+)?)\s*(g|G|gr|GR)", "GRAM"), - (r"(\d+(?:[.,]\d+)?)\s*(l|L|lt|LT)", "LITER"), - (r"(\d+(?:[.,]\d+)?)\s*(ml|ML)", "ML"), + (r"(\d+(?:[.,]\d+)?)\s*(kg|KG|Kg)\b", "KG"), + (r"(\d+(?:[.,]\d+)?)\s*(g|G|gr|GR)\b", "GRAM"), + (r"(\d+(?:[.,]\d+)?)\s*(l|L|lt|LT)\b", "LITER"), + (r"(\d+(?:[.,]\d+)?)\s*(ml|ML)\b", "ML"), + (r"(\d+(?:[.,]\d+)?)\s*(adet|Adet|ADET)\b", "PIECE"), + (r"(\d+(?:[.,]\d+)?)\s*(li|LI)\b", "PIECE"), ] for pattern, unit in patterns: @@ -38,29 +40,65 @@ def is_valid_product_name(name: str) -> bool: "Aramak istediğin ürünü yaz", "Böyle bir sayfa bulamadık", "A101 Hep Ucuz", + "Kapıda", + "Teslimat", + "Seçtiğin mağaza", ] + lowered = name.lower().strip() + + if len(lowered) < 3: + return False + for fragment in invalid_fragments: - if fragment.lower() in name.lower(): + if fragment.lower() in lowered: return False + if "₺" in lowered: + return False + return True def parse_price_from_lines(lines): + price_pattern = re.compile(r"₺\s*([\d\.]+(?:,\d{1,2})?)") + + prices = [] for line in lines: - if "₺" in line: - raw_price = ( - line.replace("₺", "") - .replace(".", "") - .replace(",", ".") - .strip() - ) + matches = price_pattern.findall(line) + for match in matches: + raw_price = match.replace(".", "").replace(",", ".").strip() try: - return float(raw_price) + prices.append(float(raw_price)) except ValueError: continue - return None + + if not prices: + return None + + return min(prices) + + +def clean_product_name(lines): + candidates = [] + + for line in lines: + line = line.strip() + + if not is_valid_product_name(line): + continue + + if re.search(r"\d+\s*(?:kg|g|gr|l|lt|ml|adet|li)\b", line, flags=re.IGNORECASE): + candidates.append(line) + continue + + if not re.search(r"^\d+[.,]?\d*$", line) and "₺" not in line: + candidates.append(line) + + if not candidates: + return None + + return max(candidates, key=len).strip() def get_a101_products(category_slug: str): @@ -68,11 +106,10 @@ def get_a101_products(category_slug: str): products = [] with sync_playwright() as p: - browser = p.chromium.launch(headless=False) + browser = p.chromium.launch(headless=True) page = browser.new_page() - page.goto(url, timeout=60000) + page.goto(url, timeout=60000, wait_until="domcontentloaded") - # Konum popup for text in [ "Bu defalık izin ver", "Siteyi ziyaret ederken izin ver", @@ -84,7 +121,6 @@ def get_a101_products(category_slug: str): except Exception: pass - # Cookie popup for text in ["KABUL ET", "Kabul Et", "Tümünü Kabul Et"]: try: page.get_by_text(text, exact=True).click(timeout=3000) @@ -92,14 +128,15 @@ def get_a101_products(category_slug: str): except Exception: pass - page.wait_for_timeout(4000) + page.wait_for_timeout(3000) - # Lazy loading için scroll - for _ in range(8): - page.mouse.wheel(0, 2500) - page.wait_for_timeout(1200) + for _ in range(10): + page.mouse.wheel(0, 3000) + page.wait_for_timeout(1000) - cards = page.locator("div[class*='product']").all() + cards = page.locator("div[class*='product'], article, a[href*='/kapida/']").all() + + seen_names = set() for i, card in enumerate(cards): try: @@ -115,29 +152,29 @@ def get_a101_products(category_slug: str): if price is None: continue - name = None - for line in lines: - if "₺" not in line and len(line) > 2: - if is_valid_product_name(line): - name = line - break - + name = clean_product_name(lines) if not name: continue + normalized_name = name.lower().strip() + if normalized_name in seen_names: + continue + + seen_names.add(normalized_name) + extracted_unit, extracted_amount = extract_unit_info(name) products.append( { - "product_id": f"a101_{i}", + "product_id": f"a101_{category_slug}_{i}", "product_name": name, - "sku": f"a101_{i}", + "sku": f"a101_{category_slug}_{i}", "shown_price_tl": price, "regular_price_tl": price, "discount_rate": None, "product_url": url, "brand_name": None, - "category_name": "Meyve ve Sebze", + "category_name": category_slug.replace("-", " ").title(), "unit": extracted_unit, "unit_amount": extracted_amount, } @@ -148,4 +185,4 @@ def get_a101_products(category_slug: str): browser.close() - return products \ No newline at end of file + return products From 3e2a2e2b4f1f4839cd7df177e5b442e0d384d348 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= Date: Mon, 13 Apr 2026 13:12:18 +0300 Subject: [PATCH 02/15] debug: inspect a101 scraper output --- scraper/a101/scraper.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/scraper/a101/scraper.py b/scraper/a101/scraper.py index 79aa2fe7..2391b010 100644 --- a/scraper/a101/scraper.py +++ b/scraper/a101/scraper.py @@ -106,7 +106,7 @@ def get_a101_products(category_slug: str): products = [] with sync_playwright() as p: - browser = p.chromium.launch(headless=True) + browser = p.chromium.launch(headless=False) page = browser.new_page() page.goto(url, timeout=60000, wait_until="domcontentloaded") @@ -136,11 +136,18 @@ def get_a101_products(category_slug: str): cards = page.locator("div[class*='product'], article, a[href*='/kapida/']").all() + print(f"DEBUG - CATEGORY URL: {url}") + print(f"DEBUG - TOTAL CARDS FOUND: {len(cards)}") + seen_names = set() for i, card in enumerate(cards): try: text_blob = card.inner_text().strip() + + if i < 15: + print(f"DEBUG - RAW CARD {i}: {text_blob[:300]}") + if not text_blob: continue @@ -149,12 +156,17 @@ def get_a101_products(category_slug: str): continue price = parse_price_from_lines(lines) - if price is None: + name = clean_product_name(lines) + + if i < 15: + print(f"DEBUG - PARSED CARD {i}: name={name}, price={price}, lines={lines[:8]}") + + # Bu aşamada filtreleri gevşek tutuyoruz + if price is None and name is None: continue - name = clean_product_name(lines) if not name: - continue + name = f"unknown_a101_product_{i}" normalized_name = name.lower().strip() if normalized_name in seen_names: @@ -169,8 +181,8 @@ def get_a101_products(category_slug: str): "product_id": f"a101_{category_slug}_{i}", "product_name": name, "sku": f"a101_{category_slug}_{i}", - "shown_price_tl": price, - "regular_price_tl": price, + "shown_price_tl": price if price is not None else 0, + "regular_price_tl": price if price is not None else 0, "discount_rate": None, "product_url": url, "brand_name": None, @@ -180,9 +192,12 @@ def get_a101_products(category_slug: str): } ) - except Exception: + except Exception as e: + print(f"DEBUG - CARD ERROR {i}: {e}") continue + print(f"DEBUG - TOTAL PRODUCTS RETURNED: {len(products)}") + browser.close() return products From 0d52582f4ff9d6e830aab31fcec467ffade59d85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= Date: Mon, 13 Apr 2026 13:18:14 +0300 Subject: [PATCH 03/15] Update run_a101_pipeline.py --- pipeline/run_a101_pipeline.py | 38 ++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/pipeline/run_a101_pipeline.py b/pipeline/run_a101_pipeline.py index 97b0ad9d..7692a094 100644 --- a/pipeline/run_a101_pipeline.py +++ b/pipeline/run_a101_pipeline.py @@ -35,6 +35,17 @@ def run_pipeline(category_key: str): products = get_a101_category_products(category_slug) logger.info("A101 scraped %d products", len(products)) + if products: + logger.info("A101 first 5 products preview:") + for product in products[:5]: + logger.info( + "product_name=%r shown_price_tl=%r unit=%r unit_amount=%r", + product.get("product_name"), + product.get("shown_price_tl"), + product.get("unit"), + product.get("unit_amount"), + ) + # ------------------------- # 2) DB CONNECT # ------------------------- @@ -51,6 +62,20 @@ def run_pipeline(category_key: str): ) conn.commit() + # Eğer 0 ürün geldiyse başarılı sayma + if not products: + with conn.cursor() as cur: + fail_run( + cur, + run_id, + f"A101 scraper returned 0 products for category_key={category_key} category_slug={category_slug}", + ) + conn.commit() + + raise RuntimeError( + f"A101 scraper returned 0 products for category_key={category_key} category_slug={category_slug}" + ) + raw_count = 0 stg_count = 0 fact_count = 0 @@ -62,7 +87,6 @@ def run_pipeline(category_key: str): for product in products: try: with conn.cursor() as cur: - # RAW event_id = insert_raw_event( cur, run_id=run_id, @@ -72,7 +96,6 @@ def run_pipeline(category_key: str): currency=currency, ) - # STG SOURCE insert_stg_source_product( cur, event_id=event_id, @@ -81,17 +104,14 @@ def run_pipeline(category_key: str): source_name=source_name, ) - # TRANSFORM transformed = transform_product(product) - # DIM product_id = get_or_create_product_id( cur, transformed["standardized_product_name"], transformed.get("category_name"), ) - # STG NORMALIZED insert_stg_normalized_observation( cur, event_id, @@ -101,7 +121,6 @@ def run_pipeline(category_key: str): source_name=source_name, ) - # STG OBS observation_id = insert_stg_observation( cur, event_id, @@ -112,7 +131,6 @@ def run_pipeline(category_key: str): currency=currency, ) - # FACT inserted = insert_fact_observation( cur, observation_id, @@ -167,7 +185,7 @@ def run_pipeline(category_key: str): if conn: try: conn.rollback() - except: + except Exception: pass if conn and run_id: @@ -175,7 +193,7 @@ def run_pipeline(category_key: str): with conn.cursor() as cur: fail_run(cur, run_id, str(e)) conn.commit() - except: + except Exception: pass logger.exception("A101 pipeline failed: %s", e) @@ -183,4 +201,4 @@ def run_pipeline(category_key: str): finally: if conn: - conn.close() \ No newline at end of file + conn.close() From 1ba523a3a71386d0d41570c3558e093f1f0e085a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= Date: Mon, 13 Apr 2026 13:18:53 +0300 Subject: [PATCH 04/15] debug: add detailed a101 scraper diagnostics --- scraper/a101/scraper.py | 67 ++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 17 deletions(-) diff --git a/scraper/a101/scraper.py b/scraper/a101/scraper.py index 2391b010..9b446bee 100644 --- a/scraper/a101/scraper.py +++ b/scraper/a101/scraper.py @@ -106,10 +106,15 @@ def get_a101_products(category_slug: str): products = [] with sync_playwright() as p: - browser = p.chromium.launch(headless=False) - page = browser.new_page() + browser = p.chromium.launch(headless=True) + page = browser.new_page(viewport={"width": 1440, "height": 2200}) + + print(f"DEBUG - CATEGORY URL: {url}") + page.goto(url, timeout=60000, wait_until="domcontentloaded") + page.wait_for_timeout(5000) + # Location popup for text in [ "Bu defalık izin ver", "Siteyi ziyaret ederken izin ver", @@ -117,36 +122,62 @@ def get_a101_products(category_slug: str): ]: try: page.get_by_text(text, exact=True).click(timeout=3000) + print(f"DEBUG - clicked location popup button: {text}") break except Exception: pass + # Cookie popup for text in ["KABUL ET", "Kabul Et", "Tümünü Kabul Et"]: try: page.get_by_text(text, exact=True).click(timeout=3000) + print(f"DEBUG - clicked cookie popup button: {text}") break except Exception: pass - page.wait_for_timeout(3000) + page.wait_for_timeout(4000) - for _ in range(10): + for i in range(10): page.mouse.wheel(0, 3000) - page.wait_for_timeout(1000) + page.wait_for_timeout(1200) + print(f"DEBUG - scroll step {i + 1}") - cards = page.locator("div[class*='product'], article, a[href*='/kapida/']").all() + body_text = page.locator("body").inner_text() + print(f"DEBUG - BODY TEXT SAMPLE: {body_text[:2000]}") - print(f"DEBUG - CATEGORY URL: {url}") - print(f"DEBUG - TOTAL CARDS FOUND: {len(cards)}") + selectors = [ + "div[class*='product']", + "article", + "a[href*='/kapida/']", + "div[class*='grid'] > div", + ] + + best_cards = [] + best_selector = None + + for selector in selectors: + try: + current_cards = page.locator(selector).all() + print(f"DEBUG - SELECTOR {selector} -> {len(current_cards)} cards") + + if len(current_cards) > len(best_cards): + best_cards = current_cards + best_selector = selector + except Exception as e: + print(f"DEBUG - SELECTOR ERROR {selector}: {e}") + + print(f"DEBUG - BEST SELECTOR: {best_selector}") + print(f"DEBUG - TOTAL CARDS FOUND: {len(best_cards)}") seen_names = set() - for i, card in enumerate(cards): + for i, card in enumerate(best_cards): try: text_blob = card.inner_text().strip() - if i < 15: - print(f"DEBUG - RAW CARD {i}: {text_blob[:300]}") + if i < 20: + print(f"DEBUG - RAW CARD {i}: {text_blob[:400]}") if not text_blob: continue @@ -158,15 +189,17 @@ def get_a101_products(category_slug: str): price = parse_price_from_lines(lines) name = clean_product_name(lines) - if i < 15: - print(f"DEBUG - PARSED CARD {i}: name={name}, price={price}, lines={lines[:8]}") + if i < 20: + print(f"DEBUG - PARSED CARD {i}: name={name}, price={price}, lines={lines[:10]}") - # Bu aşamada filtreleri gevşek tutuyoruz if price is None and name is None: continue if not name: - name = f"unknown_a101_product_{i}" + continue + + if price is None: + continue normalized_name = name.lower().strip() if normalized_name in seen_names: @@ -181,8 +214,8 @@ def get_a101_products(category_slug: str): "product_id": f"a101_{category_slug}_{i}", "product_name": name, "sku": f"a101_{category_slug}_{i}", - "shown_price_tl": price if price is not None else 0, - "regular_price_tl": price if price is not None else 0, + "shown_price_tl": price, + "regular_price_tl": price, "discount_rate": None, "product_url": url, "brand_name": None, From b6697f7af70498f58ae3cb77f8cbdbf05a04f883 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= Date: Mon, 13 Apr 2026 13:28:44 +0300 Subject: [PATCH 05/15] Update scraper.py --- scraper/a101/scraper.py | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/scraper/a101/scraper.py b/scraper/a101/scraper.py index 9b446bee..3950da29 100644 --- a/scraper/a101/scraper.py +++ b/scraper/a101/scraper.py @@ -106,15 +106,38 @@ def get_a101_products(category_slug: str): products = [] with sync_playwright() as p: - browser = p.chromium.launch(headless=True) - page = browser.new_page(viewport={"width": 1440, "height": 2200}) + browser = p.chromium.launch( + headless=True, + args=[ + "--disable-blink-features=AutomationControlled", + "--no-sandbox", + "--disable-dev-shm-usage", + ], + ) + + context = browser.new_context( + viewport={"width": 1440, "height": 2200}, + user_agent=( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/123.0.0.0 Safari/537.36" + ), + locale="tr-TR", + ) + + page = context.new_page() + + page.add_init_script(""" + Object.defineProperty(navigator, 'webdriver', { + get: () => undefined + }); + """) print(f"DEBUG - CATEGORY URL: {url}") page.goto(url, timeout=60000, wait_until="domcontentloaded") page.wait_for_timeout(5000) - # Location popup for text in [ "Bu defalık izin ver", "Siteyi ziyaret ederken izin ver", @@ -127,7 +150,6 @@ def get_a101_products(category_slug: str): except Exception: pass - # Cookie popup for text in ["KABUL ET", "Kabul Et", "Tümünü Kabul Et"]: try: page.get_by_text(text, exact=True).click(timeout=3000) @@ -231,6 +253,7 @@ def get_a101_products(category_slug: str): print(f"DEBUG - TOTAL PRODUCTS RETURNED: {len(products)}") + context.close() browser.close() return products From f19a8e760ac904c97298bf034452987e57ddee99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= Date: Mon, 13 Apr 2026 13:55:36 +0300 Subject: [PATCH 06/15] fix: use a101 working fruit category path --- config/retailers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/retailers.py b/config/retailers.py index fe828187..0a06f3f4 100644 --- a/config/retailers.py +++ b/config/retailers.py @@ -13,7 +13,7 @@ "source_name": "a101", "currency": "TRY", "categories": { - "fruit_veg": "meyve-ve-sebze", + "fruit_veg": "meyve-sebze", }, } } From cda308edd16db8a8cdad19c4042a03611124e7df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= Date: Mon, 13 Apr 2026 14:10:40 +0300 Subject: [PATCH 07/15] Update scraper.py --- scraper/a101/scraper.py | 35 +++++++++-------------------------- 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/scraper/a101/scraper.py b/scraper/a101/scraper.py index 3950da29..1f039410 100644 --- a/scraper/a101/scraper.py +++ b/scraper/a101/scraper.py @@ -43,6 +43,11 @@ def is_valid_product_name(name: str) -> bool: "Kapıda", "Teslimat", "Seçtiğin mağaza", + "Kategoriler", + "Ana Sayfa", + "Site Haritası", + "İletişim", + "Yardım", ] lowered = name.lower().strip() @@ -168,33 +173,14 @@ def get_a101_products(category_slug: str): body_text = page.locator("body").inner_text() print(f"DEBUG - BODY TEXT SAMPLE: {body_text[:2000]}") - selectors = [ - "div[class*='product']", - "article", - "a[href*='/kapida/']", - "div[class*='grid'] > div", - ] + cards = page.locator("div[class*='product']").all() - best_cards = [] - best_selector = None - - for selector in selectors: - try: - current_cards = page.locator(selector).all() - print(f"DEBUG - SELECTOR {selector} -> {len(current_cards)} cards") - - if len(current_cards) > len(best_cards): - best_cards = current_cards - best_selector = selector - except Exception as e: - print(f"DEBUG - SELECTOR ERROR {selector}: {e}") - - print(f"DEBUG - BEST SELECTOR: {best_selector}") - print(f"DEBUG - TOTAL CARDS FOUND: {len(best_cards)}") + print("DEBUG - FIXED SELECTOR: div[class*='product']") + print(f"DEBUG - TOTAL CARDS FOUND: {len(cards)}") seen_names = set() - for i, card in enumerate(best_cards): + for i, card in enumerate(cards): try: text_blob = card.inner_text().strip() @@ -214,9 +200,6 @@ def get_a101_products(category_slug: str): if i < 20: print(f"DEBUG - PARSED CARD {i}: name={name}, price={price}, lines={lines[:10]}") - if price is None and name is None: - continue - if not name: continue From 778ea3a99a3d1cab47d1e448b8df39a54df25788 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= Date: Mon, 13 Apr 2026 14:22:34 +0300 Subject: [PATCH 08/15] feat: aggregate a101 fruit_veg subcategories --- pipeline/run_a101_pipeline.py | 44 ++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/pipeline/run_a101_pipeline.py b/pipeline/run_a101_pipeline.py index 7692a094..d24c947c 100644 --- a/pipeline/run_a101_pipeline.py +++ b/pipeline/run_a101_pipeline.py @@ -23,7 +23,16 @@ def run_pipeline(category_key: str): - category_slug = RETAILER_CONFIG["a101"]["categories"][category_key] + base_category_slug = RETAILER_CONFIG["a101"]["categories"][category_key] + + if category_key == "fruit_veg": + category_slugs = [ + f"{base_category_slug}/meyve", + f"{base_category_slug}/sebze", + f"{base_category_slug}/yesillik", + ] + else: + category_slugs = [base_category_slug] conn = None run_id = None @@ -32,7 +41,30 @@ def run_pipeline(category_key: str): # ------------------------- # 1) SCRAPE # ------------------------- - products = get_a101_category_products(category_slug) + products = [] + seen_product_names = set() + + for slug in category_slugs: + logger.info("Scraping A101 subcategory: %s", slug) + subcategory_products = get_a101_category_products(slug) + + logger.info( + "A101 subcategory %s returned %d products", + slug, + len(subcategory_products), + ) + + for product in subcategory_products: + product_name = (product.get("product_name") or "").strip().lower() + if not product_name: + continue + + if product_name in seen_product_names: + continue + + seen_product_names.add(product_name) + products.append(product) + logger.info("A101 scraped %d products", len(products)) if products: @@ -56,7 +88,7 @@ def run_pipeline(category_key: str): cur, source_name=source_name, category_key=category_key, - category_slug=category_slug, + category_slug=base_category_slug, triggered_by="local_test", pipeline_version="v2-a101", ) @@ -68,12 +100,12 @@ def run_pipeline(category_key: str): fail_run( cur, run_id, - f"A101 scraper returned 0 products for category_key={category_key} category_slug={category_slug}", + f"A101 scraper returned 0 products for category_key={category_key} category_slug={base_category_slug}", ) conn.commit() raise RuntimeError( - f"A101 scraper returned 0 products for category_key={category_key} category_slug={category_slug}" + f"A101 scraper returned 0 products for category_key={category_key} category_slug={base_category_slug}" ) raw_count = 0 @@ -91,7 +123,7 @@ def run_pipeline(category_key: str): cur, run_id=run_id, product=product, - category_slug=category_slug, + category_slug=base_category_slug, source_name=source_name, currency=currency, ) From ee9b8f97f83bc9b48bcbd4127f77477fb10bf67f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= Date: Mon, 13 Apr 2026 14:37:18 +0300 Subject: [PATCH 09/15] debug: log a101 subcategory aggregation --- pipeline/run_a101_pipeline.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pipeline/run_a101_pipeline.py b/pipeline/run_a101_pipeline.py index d24c947c..201de234 100644 --- a/pipeline/run_a101_pipeline.py +++ b/pipeline/run_a101_pipeline.py @@ -34,6 +34,9 @@ def run_pipeline(category_key: str): else: category_slugs = [base_category_slug] + logger.info("DEBUG - base_category_slug=%s", base_category_slug) + logger.info("DEBUG - category_slugs=%s", category_slugs) + conn = None run_id = None @@ -45,15 +48,22 @@ def run_pipeline(category_key: str): seen_product_names = set() for slug in category_slugs: - logger.info("Scraping A101 subcategory: %s", slug) + logger.info("DEBUG - Scraping A101 subcategory: %s", slug) subcategory_products = get_a101_category_products(slug) logger.info( - "A101 subcategory %s returned %d products", + "DEBUG - A101 subcategory %s returned %d products", slug, len(subcategory_products), ) + if subcategory_products: + logger.info( + "DEBUG - First 3 products from %s: %s", + slug, + [p.get("product_name") for p in subcategory_products[:3]], + ) + for product in subcategory_products: product_name = (product.get("product_name") or "").strip().lower() if not product_name: From 0fd7060df6f58070f1d3cb60acfd5c77dcd7e85d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= Date: Mon, 13 Apr 2026 15:24:01 +0300 Subject: [PATCH 10/15] fix: clean a101 product card parsing --- scraper/a101/scraper.py | 140 ++++++++++++---------------------------- 1 file changed, 40 insertions(+), 100 deletions(-) diff --git a/scraper/a101/scraper.py b/scraper/a101/scraper.py index 1f039410..27e77147 100644 --- a/scraper/a101/scraper.py +++ b/scraper/a101/scraper.py @@ -27,85 +27,6 @@ def extract_unit_info(product_name: str): return None, None -def is_valid_product_name(name: str) -> bool: - if not name: - return False - - invalid_fragments = [ - "Popüler Ürünler", - "Çerez Kullanımı", - "Kampanyalar", - "Giriş Yap", - "Sepetim", - "Aramak istediğin ürünü yaz", - "Böyle bir sayfa bulamadık", - "A101 Hep Ucuz", - "Kapıda", - "Teslimat", - "Seçtiğin mağaza", - "Kategoriler", - "Ana Sayfa", - "Site Haritası", - "İletişim", - "Yardım", - ] - - lowered = name.lower().strip() - - if len(lowered) < 3: - return False - - for fragment in invalid_fragments: - if fragment.lower() in lowered: - return False - - if "₺" in lowered: - return False - - return True - - -def parse_price_from_lines(lines): - price_pattern = re.compile(r"₺\s*([\d\.]+(?:,\d{1,2})?)") - - prices = [] - for line in lines: - matches = price_pattern.findall(line) - for match in matches: - raw_price = match.replace(".", "").replace(",", ".").strip() - try: - prices.append(float(raw_price)) - except ValueError: - continue - - if not prices: - return None - - return min(prices) - - -def clean_product_name(lines): - candidates = [] - - for line in lines: - line = line.strip() - - if not is_valid_product_name(line): - continue - - if re.search(r"\d+\s*(?:kg|g|gr|l|lt|ml|adet|li)\b", line, flags=re.IGNORECASE): - candidates.append(line) - continue - - if not re.search(r"^\d+[.,]?\d*$", line) and "₺" not in line: - candidates.append(line) - - if not candidates: - return None - - return max(candidates, key=len).strip() - - def get_a101_products(category_slug: str): url = f"https://www.a101.com.tr/kapida/{category_slug}/" products = [] @@ -173,35 +94,55 @@ def get_a101_products(category_slug: str): body_text = page.locator("body").inner_text() print(f"DEBUG - BODY TEXT SAMPLE: {body_text[:2000]}") - cards = page.locator("div[class*='product']").all() + cards = page.locator("div[data-testid='product-card']").all() - print("DEBUG - FIXED SELECTOR: div[class*='product']") + print("DEBUG - USING SELECTOR: div[data-testid='product-card']") print(f"DEBUG - TOTAL CARDS FOUND: {len(cards)}") seen_names = set() for i, card in enumerate(cards): try: - text_blob = card.inner_text().strip() - - if i < 20: - print(f"DEBUG - RAW CARD {i}: {text_blob[:400]}") - - if not text_blob: - continue - - lines = [x.strip() for x in text_blob.splitlines() if x.strip()] - if not lines: + name = card.locator("h3").inner_text().strip() + + # Meyve-sebze dışı ürünleri ele + if any( + x in name.lower() + for x in [ + "süt", + "peynir", + "çikolata", + "deterjan", + "pirinç", + "yumurta", + ] + ): continue - price = parse_price_from_lines(lines) - name = clean_product_name(lines) - - if i < 20: - print(f"DEBUG - PARSED CARD {i}: name={name}, price={price}, lines={lines[:10]}") - - if not name: - continue + price = None + candidate_texts = [] + + try: + spans = card.locator("span").all_inner_texts() + candidate_texts.extend(spans) + except Exception: + pass + + try: + card_text = card.inner_text() + candidate_texts.append(card_text) + except Exception: + pass + + for text in candidate_texts: + matches = re.findall(r"₺\s*([\d\.]+(?:,\d{1,2})?)", text) + if matches: + raw_price = matches[0].replace(".", "").replace(",", ".").strip() + try: + price = float(raw_price) + break + except ValueError: + continue if price is None: continue @@ -209,7 +150,6 @@ def get_a101_products(category_slug: str): normalized_name = name.lower().strip() if normalized_name in seen_names: continue - seen_names.add(normalized_name) extracted_unit, extracted_amount = extract_unit_info(name) From 27ea39d1e4766b1ada3b06c6f3c1b98c900e7457 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= Date: Mon, 13 Apr 2026 15:31:36 +0300 Subject: [PATCH 11/15] Update scraper.py --- scraper/a101/scraper.py | 202 ++++++++++++++++++++++++---------------- 1 file changed, 122 insertions(+), 80 deletions(-) diff --git a/scraper/a101/scraper.py b/scraper/a101/scraper.py index 27e77147..194be55c 100644 --- a/scraper/a101/scraper.py +++ b/scraper/a101/scraper.py @@ -12,7 +12,7 @@ def extract_unit_info(product_name: str): (r"(\d+(?:[.,]\d+)?)\s*(l|L|lt|LT)\b", "LITER"), (r"(\d+(?:[.,]\d+)?)\s*(ml|ML)\b", "ML"), (r"(\d+(?:[.,]\d+)?)\s*(adet|Adet|ADET)\b", "PIECE"), - (r"(\d+(?:[.,]\d+)?)\s*(li|LI)\b", "PIECE"), + (r"(\d+(?:[.,]\d+)?)\s*(li|LI|'lu|'lü)\b", "PIECE"), ] for pattern, unit in patterns: @@ -27,6 +27,124 @@ def extract_unit_info(product_name: str): return None, None +def normalize_category_name(category_slug: str) -> str: + if category_slug.endswith("/meyve"): + return "Meyve" + if category_slug.endswith("/sebze"): + return "Sebze" + if category_slug.endswith("/yesillik"): + return "Yesillik" + return category_slug.replace("-", " ").title() + + +def get_section_header(category_slug: str) -> str: + if category_slug.endswith("/meyve"): + return "Meyve" + if category_slug.endswith("/sebze"): + return "Sebze" + if category_slug.endswith("/yesillik"): + return "Yeşillik" + return None + + +def parse_products_from_body_text(body_text: str, category_slug: str): + lines = [line.strip() for line in body_text.splitlines() if line.strip()] + + section_header = get_section_header(category_slug) + if not section_header: + return [] + + # İlgili bölümün başlangıcını bul + start_idx = None + for i, line in enumerate(lines): + if line == section_header: + start_idx = i + break + + if start_idx is None: + print(f"DEBUG - section header not found: {section_header}") + return [] + + # Bir sonraki bölüm başlığına kadar git + stop_headers = {"Meyve", "Sebze", "Yeşillik", "Sepetim", "Giriş Yap", "Site Haritası"} + section_lines = [] + + for line in lines[start_idx + 1:]: + if line in stop_headers and line != section_header: + break + section_lines.append(line) + + print(f"DEBUG - section_header={section_header}") + print(f"DEBUG - section_lines_sample={section_lines[:40]}") + + products = [] + seen_names = set() + + i = 0 + while i < len(section_lines) - 1: + name = section_lines[i] + next_line = section_lines[i + 1] + + # fiyat satırı mı? + if re.match(r"^₺\s*[\d\.]+(?:,\d{1,2})?$", next_line): + # saçma satırları ele + lowered = name.lower() + if any( + x in lowered + for x in [ + "kampanyalar", + "giriş yap", + "sepetim", + "anasayfa", + "site haritası", + "yardım", + "iletişim", + ] + ): + i += 1 + continue + + raw_price = next_line.replace("₺", "").replace(".", "").replace(",", ".").strip() + + try: + price = float(raw_price) + except ValueError: + i += 1 + continue + + normalized_name = name.lower().strip() + if normalized_name in seen_names: + i += 2 + continue + + seen_names.add(normalized_name) + + extracted_unit, extracted_amount = extract_unit_info(name) + + products.append( + { + "product_id": f"a101_{category_slug}_{len(products)}", + "product_name": name, + "sku": f"a101_{category_slug}_{len(products)}", + "shown_price_tl": price, + "regular_price_tl": price, + "discount_rate": None, + "product_url": f"https://www.a101.com.tr/kapida/{category_slug}/", + "brand_name": None, + "category_name": normalize_category_name(category_slug), + "unit": extracted_unit, + "unit_amount": extracted_amount, + } + ) + + i += 2 + continue + + i += 1 + + return products + + def get_a101_products(category_slug: str): url = f"https://www.a101.com.tr/kapida/{category_slug}/" products = [] @@ -94,87 +212,11 @@ def get_a101_products(category_slug: str): body_text = page.locator("body").inner_text() print(f"DEBUG - BODY TEXT SAMPLE: {body_text[:2000]}") - cards = page.locator("div[data-testid='product-card']").all() - - print("DEBUG - USING SELECTOR: div[data-testid='product-card']") - print(f"DEBUG - TOTAL CARDS FOUND: {len(cards)}") - - seen_names = set() - - for i, card in enumerate(cards): - try: - name = card.locator("h3").inner_text().strip() - - # Meyve-sebze dışı ürünleri ele - if any( - x in name.lower() - for x in [ - "süt", - "peynir", - "çikolata", - "deterjan", - "pirinç", - "yumurta", - ] - ): - continue - - price = None - candidate_texts = [] - - try: - spans = card.locator("span").all_inner_texts() - candidate_texts.extend(spans) - except Exception: - pass - - try: - card_text = card.inner_text() - candidate_texts.append(card_text) - except Exception: - pass - - for text in candidate_texts: - matches = re.findall(r"₺\s*([\d\.]+(?:,\d{1,2})?)", text) - if matches: - raw_price = matches[0].replace(".", "").replace(",", ".").strip() - try: - price = float(raw_price) - break - except ValueError: - continue - - if price is None: - continue - - normalized_name = name.lower().strip() - if normalized_name in seen_names: - continue - seen_names.add(normalized_name) - - extracted_unit, extracted_amount = extract_unit_info(name) - - products.append( - { - "product_id": f"a101_{category_slug}_{i}", - "product_name": name, - "sku": f"a101_{category_slug}_{i}", - "shown_price_tl": price, - "regular_price_tl": price, - "discount_rate": None, - "product_url": url, - "brand_name": None, - "category_name": category_slug.replace("-", " ").title(), - "unit": extracted_unit, - "unit_amount": extracted_amount, - } - ) - - except Exception as e: - print(f"DEBUG - CARD ERROR {i}: {e}") - continue + products = parse_products_from_body_text(body_text, category_slug) print(f"DEBUG - TOTAL PRODUCTS RETURNED: {len(products)}") + if products: + print(f"DEBUG - FIRST 10 PRODUCTS: {[p['product_name'] for p in products[:10]]}") context.close() browser.close() From 8a295c24737ac262808e5e7aa54f8b7f64ad2f85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= Date: Mon, 13 Apr 2026 15:32:06 +0300 Subject: [PATCH 12/15] Update retailers.py --- config/retailers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/retailers.py b/config/retailers.py index 0a06f3f4..c8d1d339 100644 --- a/config/retailers.py +++ b/config/retailers.py @@ -15,5 +15,5 @@ "categories": { "fruit_veg": "meyve-sebze", }, - } + }, } From 10031c4554cacb9591a9a264eba5db628bbe975a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= Date: Mon, 13 Apr 2026 15:43:13 +0300 Subject: [PATCH 13/15] Update scraper.py --- scraper/a101/scraper.py | 58 ++++++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/scraper/a101/scraper.py b/scraper/a101/scraper.py index 194be55c..07d8dccd 100644 --- a/scraper/a101/scraper.py +++ b/scraper/a101/scraper.py @@ -37,7 +37,7 @@ def normalize_category_name(category_slug: str) -> str: return category_slug.replace("-", " ").title() -def get_section_header(category_slug: str) -> str: +def get_section_header(category_slug: str) -> str | None: if category_slug.endswith("/meyve"): return "Meyve" if category_slug.endswith("/sebze"): @@ -47,34 +47,61 @@ def get_section_header(category_slug: str) -> str: return None +def is_price_line(text: str) -> bool: + return bool(re.match(r"^₺\s*[\d\.]+(?:,\d{1,2})?$", text)) + + +def find_best_section_start(lines: list[str], section_header: str) -> int | None: + """ + Same header appears multiple times in the page. + Choose the occurrence that is followed by real product rows (price lines). + """ + candidate_indices = [i for i, line in enumerate(lines) if line == section_header] + + if not candidate_indices: + return None + + best_idx = None + best_score = -1 + + for idx in candidate_indices: + window = lines[idx : idx + 80] + score = sum(1 for line in window if is_price_line(line)) + + # Prefer the first strong candidate with nearby prices + if score > best_score: + best_score = score + best_idx = idx + + if best_score <= 0: + return None + + return best_idx + + def parse_products_from_body_text(body_text: str, category_slug: str): lines = [line.strip() for line in body_text.splitlines() if line.strip()] section_header = get_section_header(category_slug) if not section_header: + print(f"DEBUG - section header missing for category_slug={category_slug}") return [] - # İlgili bölümün başlangıcını bul - start_idx = None - for i, line in enumerate(lines): - if line == section_header: - start_idx = i - break - + start_idx = find_best_section_start(lines, section_header) if start_idx is None: - print(f"DEBUG - section header not found: {section_header}") + print(f"DEBUG - section header not found or no priced rows nearby: {section_header}") return [] - # Bir sonraki bölüm başlığına kadar git stop_headers = {"Meyve", "Sebze", "Yeşillik", "Sepetim", "Giriş Yap", "Site Haritası"} - section_lines = [] - for line in lines[start_idx + 1:]: + section_lines = [] + for line in lines[start_idx + 1 :]: if line in stop_headers and line != section_header: break section_lines.append(line) print(f"DEBUG - section_header={section_header}") + print(f"DEBUG - section_start_idx={start_idx}") print(f"DEBUG - section_lines_sample={section_lines[:40]}") products = [] @@ -85,10 +112,10 @@ def parse_products_from_body_text(body_text: str, category_slug: str): name = section_lines[i] next_line = section_lines[i + 1] - # fiyat satırı mı? - if re.match(r"^₺\s*[\d\.]+(?:,\d{1,2})?$", next_line): - # saçma satırları ele + if is_price_line(next_line): lowered = name.lower() + + # obvious non-product lines if any( x in lowered for x in [ @@ -99,6 +126,7 @@ def parse_products_from_body_text(body_text: str, category_slug: str): "site haritası", "yardım", "iletişim", + "kategoriler", ] ): i += 1 From 2a73d1f25f5364203119d7b05d130eb7021f0891 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= Date: Mon, 13 Apr 2026 16:07:52 +0300 Subject: [PATCH 14/15] debug: log why a101 records are skipped from fact --- pipeline/loaders_fact.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/pipeline/loaders_fact.py b/pipeline/loaders_fact.py index 03d16861..59815bfb 100644 --- a/pipeline/loaders_fact.py +++ b/pipeline/loaders_fact.py @@ -43,9 +43,19 @@ def insert_fact_observation( if not can_insert: logger.info( - "Skipping fact insert — product=%r reason=%s", + ( + "DEBUG - FACT SKIP | product=%r | reason=%s | " + "price=%r | normalized_unit=%r | normalized_quantity=%r | " + "price_per_unit=%r | standardized_product_name=%r | category_name=%r" + ), product.get("product_name"), reason, + transformed.get("price"), + transformed.get("normalized_unit"), + transformed.get("normalized_quantity"), + transformed.get("price_per_unit"), + transformed.get("standardized_product_name"), + transformed.get("category_name"), ) return False @@ -127,4 +137,11 @@ def insert_fact_observation( ), ) - return cursor.rowcount == 1 \ No newline at end of file + if cursor.rowcount != 1: + logger.info( + "DEBUG - FACT NOT INSERTED | product=%r | reason=conflict_or_no_insert | event_id=%r", + product.get("product_name"), + event_id, + ) + + return cursor.rowcount == 1 From 6d5a2a25d9f158e809b62530a534b35d677aac22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sabahat=20=C5=9Eengezer?= Date: Mon, 13 Apr 2026 17:27:45 +0300 Subject: [PATCH 15/15] fix: infer normalized unit from a101 product names --- pipeline/transforms.py | 75 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 61 insertions(+), 14 deletions(-) diff --git a/pipeline/transforms.py b/pipeline/transforms.py index e4a93d3c..b2374253 100644 --- a/pipeline/transforms.py +++ b/pipeline/transforms.py @@ -3,13 +3,8 @@ def normalize_unit( - unit: Optional[str], quantity: Any + unit: Optional[str], quantity: Any, product_name: Optional[str] = None ) -> Tuple[Optional[str], Optional[float]]: - if unit is None: - return None, None - - unit_upper = str(unit).strip().upper() - qty: Optional[float] = None if quantity is not None: try: @@ -17,15 +12,63 @@ def normalize_unit( except (TypeError, ValueError): qty = None - if unit_upper == "GRAM": - if qty is None: - return "kg", None - return "kg", round(qty / 1000, 4) + # 1) Unit varsa önce onu kullan + if unit is not None: + unit_upper = str(unit).strip().upper() + + if unit_upper == "GRAM": + if qty is None: + return "kg", None + return "kg", round(qty / 1000, 4) + + if unit_upper == "KG": + return "kg", qty if qty is not None else 1.0 + + if unit_upper == "PIECE": + return "piece", qty if qty is not None else 1.0 + + if unit_upper == "LITER": + return "liter", qty if qty is not None else 1.0 + + if unit_upper == "ML": + if qty is None: + return "liter", None + return "liter", round(qty / 1000, 4) + + return unit.lower(), qty + + # 2) Unit yoksa product_name'den çözmeye çalış + name = (product_name or "").lower().strip() + + if not name: + return None, None + + tr_map = {"ı": "i", "ğ": "g", "ü": "u", "ş": "s", "ö": "o", "ç": "c"} + for old, new in tr_map.items(): + name = name.replace(old, new) + + if re.search(r"\bkg\b", name): + return "kg", 1.0 - if unit_upper == "PIECE": - return "piece", qty if qty is not None else 1.0 + gram_match = re.search(r"(\d+(?:[.,]\d+)?)\s*g\b", name) + if gram_match: + grams = float(gram_match.group(1).replace(",", ".")) + return "kg", round(grams / 1000, 4) - return unit.lower(), qty + liter_match = re.search(r"(\d+(?:[.,]\d+)?)\s*l\b", name) + if liter_match: + liters = float(liter_match.group(1).replace(",", ".")) + return "liter", liters + + ml_match = re.search(r"(\d+(?:[.,]\d+)?)\s*ml\b", name) + if ml_match: + ml = float(ml_match.group(1).replace(",", ".")) + return "liter", round(ml / 1000, 4) + + if re.search(r"\badet\b", name): + return "piece", 1.0 + + return None, None def standardize_product_name(product_name: Optional[str]) -> Optional[str]: @@ -106,7 +149,11 @@ def transform_product(product: dict[str, Any]) -> dict[str, Any]: unit = product.get("unit") unit_amount = product.get("unit_amount") - normalized_unit, normalized_quantity = normalize_unit(unit, unit_amount) + normalized_unit, normalized_quantity = normalize_unit( + unit, + unit_amount, + product.get("product_name"), + ) price_per_unit = calculate_price_per_unit(price, normalized_quantity) unit_price_label = build_unit_price_label(normalized_unit) standardized_product_name = standardize_product_name(product.get("product_name"))