import time
import re
import json
import random
import logging
import asyncio
import os       
import shutil
from datetime import datetime, timedelta
from typing import Optional, List, Dict, Tuple
from urllib.parse import urlparse, urlencode
from uuid import UUID, uuid4
from concurrent.futures import ThreadPoolExecutor
import requests

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
from fastapi import HTTPException, status
import platform
import subprocess
from app.models.product import Product
from app.models.vendor import Vendor
from app.models.violation import Violation
from app.models.scraping_result import ScrapingResult
from urllib.parse import quote_plus


# Optional dependency: Playwright may not be installed in every deployment.
# PLAYWRIGHT_AVAILABLE gates any Playwright-specific code paths elsewhere.
try:
    from playwright.sync_api import sync_playwright
    PLAYWRIGHT_AVAILABLE = True
except ImportError:
    PLAYWRIGHT_AVAILABLE = False

# Module-level logger, named after this module per standard convention.
logger = logging.getLogger(__name__)

# Enhanced user agents for better evasion
# Picked at random per request (see scrape_price_from_website) so outgoing
# traffic does not present a single, easily-fingerprinted UA string.
GOOGLE_USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0",
]

# List of free/public proxies (use with caution, consider paid proxies for production)
# NOTE(review): public proxy IPs rot quickly — these entries are likely stale
# and should be refreshed or replaced by a paid provider before production use.
FREE_PROXIES = [
    "socks4://103.146.170.233:5678",
    "socks4://103.165.64.86:4153",
    "http://108.165.152.59:80",
    "socks4://103.204.54.50:1080",
]

# HTTP Headers that mimic real browser requests
# (Sec-Ch-Ua / Sec-Fetch-* client hints match a desktop Chrome navigation;
# Accept-Language prefers Spanish, consistent with the gl="es" SERP queries.)
HTTP_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
    "Cache-Control": "max-age=0",
    "Sec-Ch-Ua": '"Not_A Brand";v="8", "Chromium";v="122", "Google Chrome";v="122"',
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": '"Windows"',
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
}


def human_delay(min_seconds: float, max_seconds: float):
    """Sleep for a random duration in [min_seconds, max_seconds] to mimic a human."""
    time.sleep(random.uniform(min_seconds, max_seconds))


def exponential_backoff_delay(attempt: int, base_delay: float = 2.0, max_delay: float = 60.0):
    """
    Sleep for an exponentially growing delay, capped at max_delay.

    Intended for retry loops that hit rate limits; a small random jitter
    (up to 10% of the backoff) spreads out simultaneous retries so clients
    don't all retry at the same instant (thundering herd).
    """
    backoff = base_delay * (2 ** attempt)
    jittered = backoff + random.uniform(0, backoff * 0.1)
    time.sleep(min(jittered, max_delay))


def get_free_proxy() -> Optional[str]:
    """
    Return a proxy URL, preferring environment configuration over the
    built-in FREE_PROXIES list.

    For production, consider using a paid proxy service instead of the
    bundled public proxies.
    """
    # Environment variables win; check both common spellings.
    for var in ('HTTP_PROXY', 'http_proxy'):
        configured = os.environ.get(var)
        if configured:
            return configured

    # Otherwise pick one of the predefined public proxies at random.
    return random.choice(FREE_PROXIES) if FREE_PROXIES else None


def get_selenium_proxy_url(proxy: Optional[str]) -> Optional[str]:
    """Return a proxy URL usable by Selenium, defaulting to an http:// scheme."""
    if not proxy:
        return None
    # Already carries a scheme (http:// or any socks variant) — use verbatim.
    return proxy if proxy.startswith(("http://", "socks")) else f"http://{proxy}"


def search_tavily_api(query: str) -> List[str]:
    """
    Search using Tavily API (best for AI/automation).
    Tavily is designed for AI agents and is much more reliable than browser automation.
    """
    urls: List[str] = []

    try:
        logger.info("[Discovery] Using Tavily API for search")
        api_key = os.environ.get('TAVILY_API_KEY')
        if not api_key:
            logger.warning("[Discovery] TAVILY_API_KEY not configured - skipping Tavily search")
            return urls

        # Tavily API request
        payload = {
            "api_key": api_key,
            "query": query,
            "include_answer": False,
            "max_results": 5,
            "include_raw_content": False,
        }

        response = requests.post("https://api.tavily.com/search", json=payload, timeout=15)
        response.raise_for_status()

        # Keep at most the first 5 result URLs from the response.
        for item in response.json().get("results", [])[:5]:
            if "url" in item:
                urls.append(item["url"])

        logger.info(f"[Discovery] ✓ Tavily API search returned {len(urls)} results")
    except Exception as e:
        logger.warning(f"[Discovery] Tavily API search failed: {str(e)}")

    return urls


def search_via_api(query: str) -> List[str]:
    """
    Search using Tavily API instead of Selenium (avoids detection completely).
    Falls back to API-based search when Google blocks you.
    """
    # Thin alias kept for callers that expect a generic "API search" entry point.
    api_results = search_tavily_api(query)
    return api_results


def is_soria_natural_product(result: Dict) -> bool:
    """
    Check if a SERP API result contains the 'Soria Natural' brand.

    Searches title, vendor name (source), brand, description and snippet
    fields, case-insensitively. Also matches the concatenated spelling
    (e.g. "SoriaNatural") by comparing with all spaces stripped.

    Args:
        result: SERP API result dictionary

    Returns:
        True if 'Soria Natural' is found, False otherwise
    """
    # Fields to check for Soria Natural brand
    fields_to_check = [
        result.get("title", ""),
        result.get("source", ""),
        result.get("vendor_name", ""),
        result.get("brand", ""),
        result.get("description", ""),
        result.get("snippet", ""),
    ]

    # Join all fields and convert to lowercase for case-insensitive search
    combined_text = " ".join(str(field) for field in fields_to_check).lower()

    # BUG FIX: the no-space variant previously checked for "soriañatural"
    # (with ñ), which can never match the real concatenated brand spelling
    # "sorianatural" (e.g. "SoriaNatural" in a title or domain name).
    return (
        "soria natural" in combined_text
        or "sorianatural" in combined_text.replace(" ", "")
    )


def search_serp_api(product_name: str, barcode: str, msp: Optional[float] = None) -> List[Dict]:
    """
    Search using SERP API Google Shopping Light Engine.
    Filters results to only include 'Soria Natural' branded products.
    Uses direct HTTP requests to SERP API (more reliable than library).
    
    Args:
        product_name: Product name for search
        barcode: Product barcode
        msp: Minimum Selling Price (optional, used for violation detection)
    
    Returns:
        List of dictionaries with product and vendor information (filtered for Soria Natural only)
    """
    results = []
    
    try:
        logger.info("[Discovery SERP] Using SERP API Google Shopping Light Engine for search")
        api_key = os.environ.get('SERP_API_KEY')
        
        # Check if API key is configured
        if not api_key:
            logger.error("[Discovery SERP] ❌ SERP_API_KEY not found in environment variables")
            return results
            
        # Guard against the placeholder value shipped in example .env files.
        if api_key == 'your_serp_api_key_here':
            logger.error("[Discovery SERP] ❌ SERP_API_KEY is set to placeholder value")
            logger.error("[Discovery SERP] Please replace 'your_serp_api_key_here' with your actual SERP API key")
            return results
        
        # Build search query with product name only (better SERP results)
        # Using only product name for broader search results
        search_query = f"{product_name}"
        #search_query = f"{product_name} {barcode}"
        logger.info(f"[Discovery SERP] Search query: '{search_query}'")
        
        # Direct HTTP request to SERP API (more reliable than GoogleSearch library)
        serp_url = "https://serpapi.com/search.json"
        
        # NOTE(review): the trailing space in "Spain " looks unintentional —
        # confirm against SerpApi's accepted location values.
        params = {
            "engine": "google_shopping_light",  # Correct engine for shopping light
            "q": search_query,  # Query with only product name for broader search results
            "gl": "es",  # Country code: Spain (es), can be changed (us, uk, fr, etc.)
            "api_key": api_key,
            "location": "Spain "  # Locale for Spanish results
        }
        
        try:
            response = requests.get(serp_url, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
        except requests.exceptions.RequestException as req_error:
            logger.error(f"[Discovery SERP] HTTP request failed: {str(req_error)}")
            return results
        except ValueError as json_error:
            logger.error(f"[Discovery SERP] Failed to parse JSON response: {str(json_error)}")
            return results
        
        # Check for empty response
        if not data:
            return results
        
        # Check for API errors
        if "error" in data:
            logger.error(f"[Discovery SERP] API Error: {data.get('error')}")
            return results
        
        # Extract shopping results from SERP API response
        shopping_results = data.get("shopping_results", [])
        
        # If no shopping results, log response structure and try fallback
        if not shopping_results:
            logger.warning(f"[Discovery SERP] ⚠️ No 'shopping_results' key in response")
            
            # Check for alternative result keys
            for alt_key in ["results", "products", "items", "search_results", "organic_results"]:
                if alt_key in data and data[alt_key]:
                    pass  # Found alternative key but skipping verbose logging
            
            # Suggest alternative search strategies
            logger.warning(f"[Discovery SERP] No results found. Trying fallback: searching with product name only...")
            
            # Fallback 1: Try with just product name (no barcode)
            # NOTE(review): search_query is already product_name only (the
            # barcode variant is commented out above), so this condition only
            # triggers when the product name itself contains a space — the
            # fallback may be partially dead code; confirm intent.
            if barcode and " " in search_query:
                name_only_query = product_name.strip()
                
                try:
                    fallback_params = {
                        "engine": "google_shopping_light",
                        "q": name_only_query,
                        "api_key": api_key
                    }
                    fallback_response = requests.get(serp_url, params=fallback_params, timeout=30)
                    fallback_response.raise_for_status()
                    fallback_data = fallback_response.json()
                    fallback_results = fallback_data.get("shopping_results", [])
                    
                    if fallback_results:
                        logger.info(f"[Discovery SERP] ✓ Fallback search successful! Found {len(fallback_results)} results with product name only")
                        shopping_results = fallback_results
                    else:
                        logger.warning(f"[Discovery SERP] ⚠️ Fallback search also returned no shopping_results")
                except Exception as fallback_error:
                    logger.warning(f"[Discovery SERP] Fallback search failed: {str(fallback_error)}")
        
        # Process shopping results; counters feed the filter summary log below.
        total_checked = 0
        total_soria_natural = 0
        total_filtered_out = 0
        
        for idx, result in enumerate(shopping_results[:15]):  # Process up to 15 results
            try:
                total_checked += 1
                
                # Parse SERP API Google Shopping Light result format
                # Shopping light results have: source, product_link, extracted_price, title, etc.
                vendor_name = result.get("source", "Unknown")  # Source/vendor name
                product_link = result.get("product_link", "") or result.get("link", "")  # Product URL (try product_link first, then link)
                title = result.get("title", "")  # Product title
                
                # FILTER: Check if this product is from Soria Natural brand
                if not is_soria_natural_product(result):
                    total_filtered_out += 1
                    logger.info(f"[Discovery SERP] ⊘ Result {idx + 1}: {vendor_name} - FILTERED OUT (Not Soria Natural brand)")
                    logger.debug(f"[Discovery SERP]   Title: {title[:80]}")
                    continue
                
                total_soria_natural += 1
                
                # Price: prefer extracted_price (already parsed), fallback to price string
                price = result.get("extracted_price")  # Already a float if available
                if price is None:
                    price_str = result.get("price", "")  # Fallback to string price
                    if price_str:
                        try:
                            # Remove currency symbols and whitespace, convert comma to dot
                            price_clean = str(price_str).replace("$", "").replace("€", "").replace("₹", "").replace(",", ".").strip()
                            price = float(price_clean)
                        except (ValueError, AttributeError, TypeError):
                            price = None
                
                # A result without any usable URL cannot be acted on — skip it.
                if not product_link:
                    continue
                
                # NOTE: SERP API google_shopping_light returns Google Shopping links in product_link
                # This is expected behavior - we use source (vendor name) + price for violation detection
                # Google Shopping redirects are acceptable as they still link to the product with vendor info
                
                result_dict = {
                    "vendor_name": vendor_name,
                    "vendor_url": product_link,
                    "scraped_price": price,
                    "product_title": title,
                    "source": "serp_api_shopping_light"
                }
                
                # Determine compliance status (only possible when both prices known)
                if price and msp:
                    if price < msp:
                        result_dict["status"] = "violation"
                        result_dict["compliance_status"] = "violation"
                    else:
                        result_dict["status"] = "compliant"
                        result_dict["compliance_status"] = "compliant"
                else:
                    result_dict["status"] = "unknown"
                    result_dict["compliance_status"] = "unknown"
                
                results.append(result_dict)
                # NOTE(review): the ₹ symbol below is cosmetic-only and inconsistent
                # with the gl="es" (euro) market — consider switching to €.
                logger.info(f"[Discovery SERP] ✓ Result {idx + 1}: {vendor_name} - {product_link[:60]} - Price: {f'₹{price:.2f}' if price else 'N/A'} - Status: {result_dict['status'].upper()} ✅ SORIA NATURAL")
                
            except Exception as e:
                logger.warning(f"[Discovery SERP] Error parsing result {idx}: {str(e)}")
                continue
        
        # Log filtering summary
        logger.info(f"\n[Discovery SERP] {'='*80}")
        logger.info(f"[Discovery SERP] SORIA NATURAL BRAND FILTER SUMMARY:")
        logger.info(f"[Discovery SERP]   Total results received from SERP API: {total_checked}")
        logger.info(f"[Discovery SERP]   ✅ Soria Natural products found: {total_soria_natural}")
        logger.info(f"[Discovery SERP]   ⊘ Non-Soria Natural products filtered out: {total_filtered_out}")
        logger.info(f"[Discovery SERP] {'='*80}\n")
        
        logger.info(f"[Discovery SERP] ✓ SERP API search completed. Processed {min(15, len(shopping_results))} out of {len(shopping_results)} available results, returned {len(results)} Soria Natural products for analysis")
        
    except Exception as e:
        logger.error(f"[Discovery SERP] SERP API search failed: {str(e)}", exc_info=True)
    
    return results


def ensure_chrome_dependencies():
    """Ensure required system dependencies are installed for Chrome on Linux.

    Returns True when Chrome and the shell utilities its launcher script needs
    are present (or when not running on Linux at all), False otherwise.
    """
    if platform.system() != "Linux":
        return True

    logger.info("[Chrome Dependencies] Checking Chrome and system utilities on Linux...")

    # Verify the basic shell utilities Chrome's wrapper script relies on.
    missing_commands = []
    for tool in ('readlink', 'dirname', 'cat', 'basename', 'grep', 'sed', 'awk'):
        try:
            probe = subprocess.run(['which', tool], capture_output=True, timeout=2)
            if probe.returncode != 0:
                missing_commands.append(tool)
        except Exception:
            missing_commands.append(tool)

    if missing_commands:
        logger.error(f"[Chrome Dependencies] ✗ Missing core shell commands: {', '.join(missing_commands)}")
        logger.error("[Chrome Dependencies] DevOps must install: sudo apt-get install -y coreutils grep sed gawk")
        return False

    # Locate a Chrome binary: first via PATH, then at well-known locations.
    chrome_found = False
    try:
        probe = subprocess.run(["which", "google-chrome"], capture_output=True, timeout=5)
        if probe.returncode == 0:
            chrome_path = probe.stdout.decode().strip()
            logger.info(f"[Chrome Dependencies] ✓ Chrome found in PATH: {chrome_path}")
            chrome_found = True
    except Exception as e:
        logger.debug(f"[Chrome Dependencies] Could not check PATH: {str(e)}")

    if not chrome_found:
        for candidate in ("/usr/bin/google-chrome", "/usr/bin/google-chrome-stable", "/usr/bin/chromium"):
            if os.path.exists(candidate):
                logger.info(f"[Chrome Dependencies] ✓ Chrome found at: {candidate}")
                chrome_found = True
                break

    if not chrome_found:
        logger.error("[Chrome Dependencies] ✗ Chrome NOT found - DevOps must install Chrome")
        return False

    # Running --version exercises the launcher script, so it surfaces the
    # missing-shell-command failures that a plain existence check would miss.
    try:
        version = subprocess.run(["google-chrome", "--version"], capture_output=True, text=True, timeout=10)
        if version.returncode == 0 and version.stdout.strip():
            logger.info(f"[Chrome Dependencies] ✓ Chrome executable: {version.stdout.strip()}")
            return True
        logger.error(f"[Chrome Dependencies] ✗ Chrome --version failed: {version.stderr}")
        logger.error("[Chrome Dependencies] This indicates missing shell commands (readlink, dirname, cat)")
        return False
    except subprocess.TimeoutExpired:
        logger.error("[Chrome Dependencies] ✗ Chrome --version timed out (likely missing shell commands)")
        return False
    except Exception as e:
        logger.error(f"[Chrome Dependencies] ✗ Could not verify Chrome: {str(e)}")
        return False

# Thread pool executor for running synchronous scraping operations
# off the asyncio event loop (e.g. via loop.run_in_executor).
_executor = ThreadPoolExecutor(max_workers=1)  # Single worker to avoid too many browser instances



def cleanup_chrome_session(driver):
    """Remove the temporary user-data directory attached to a Chrome driver, if any.

    Best-effort: any failure is logged and swallowed so cleanup never breaks
    the caller's shutdown path.
    """
    try:
        session_dir = getattr(driver, '_user_data_dir', None)
        if session_dir and os.path.exists(session_dir):
            shutil.rmtree(session_dir, ignore_errors=True)
            logger.debug(f"[Chrome Cleanup] Removed session directory: {session_dir}")
    except Exception as e:
        logger.warning(f"[Chrome Cleanup] Failed to cleanup: {str(e)}")


def get_domain_from_url(url: str) -> str:
    """Extract the host (minus any 'www.' prefix) from a URL."""
    return urlparse(url).netloc.replace("www.", "")


def normalize_url_for_comparison(url: str) -> str:
    """
    Normalize a URL for comparison purposes.

    Lowercases the URL, strips 'www.' and any port from the host, drops the
    trailing slash, and ignores query parameters and fragments so different
    spellings of the same page compare equal.

    Args:
        url: URL to normalize.

    Returns:
        "<domain><path>" string, or the lowercased/stripped input if parsing fails.
    """
    try:
        parsed = urlparse(url.lower().strip())
        # Remove www. prefix and any :port suffix from the host.
        domain = parsed.netloc.replace("www.", "").split(":")[0]
        # Drop trailing slash so "/a/" and "/a" compare equal.
        path = parsed.path.rstrip("/")
        # Query params and fragments are deliberately ignored.
        return f"{domain}{path}"
    except (ValueError, AttributeError):
        # BUG FIX: was a bare `except:`, which also swallowed
        # SystemExit/KeyboardInterrupt; narrowed to the errors urlparse
        # and string handling can actually raise.
        return url.lower().strip()


def normalize_price(price_str: str) -> Optional[float]:
    """
    Normalize a price string to a float.

    Strips currency symbols/whitespace and handles both decimal conventions:
    European "1.234,56" and US "1,234.56" both parse to 1234.56; simple forms
    like "17,99" or "17.99" parse to 17.99.

    Args:
        price_str: Raw price text (e.g. "€17,99", "1,234.56 EUR").

    Returns:
        Parsed price as float, or None if no number could be extracted.
    """
    try:
        clean = re.sub(r"[^\d.,]", "", str(price_str).strip())
        if "," in clean and "." in clean:
            # BUG FIX: mixed thousand/decimal separators previously produced
            # strings like "1.234.56" and returned None. The right-most
            # separator is treated as the decimal point.
            if clean.rfind(",") > clean.rfind("."):
                clean = clean.replace(".", "").replace(",", ".")  # 1.234,56 -> 1234.56
            else:
                clean = clean.replace(",", "")                    # 1,234.56 -> 1234.56
        else:
            # Single separator style: treat a comma as the decimal point.
            clean = clean.replace(",", ".")
        return float(clean)
    except (ValueError, AttributeError):
        return None


def calculate_price_difference(msp, scraped_price: float) -> Tuple[float, float]:
    """Return the absolute and percentage difference between MSP and a scraped price.

    Args:
        msp: Minimum Selling Price (can be Decimal from database or float)
        scraped_price: Price found via scraping (always float)

    Returns:
        Tuple of (price_difference, percentage_difference), both rounded to
        2 decimals; (0.0, 0.0) if the inputs cannot be compared.
    """
    try:
        # MSP may arrive as a Decimal from the database; coerce to float.
        base = float(msp) if msp else 0
        diff = round(base - scraped_price, 2)
        pct = round((diff / base * 100) if base > 0 else 0, 2)
        return diff, pct
    except (TypeError, ValueError) as e:
        logger.warning(f"[Price Calculation] Error calculating price difference: {str(e)}")
        return 0.0, 0.0


def determine_compliance_status(msp, scraped_price: Optional[float]) -> str:
    """Classify a scraped price against the Minimum Selling Price.

    Args:
        msp: Minimum Selling Price (can be Decimal from database or float)
        scraped_price: Price found via scraping (optional float)

    Returns:
        'violation' (below MSP), 'complain' (above MSP), 'compliant' (equal),
        or 'unknown' when either price is missing or comparison fails.
    """
    try:
        if scraped_price is None:
            return "unknown"

        # MSP may arrive as a Decimal from the database; coerce to float.
        threshold = float(msp) if msp else None
        if threshold is None:
            return "unknown"

        if scraped_price < threshold:
            return "violation"
        return "complain" if scraped_price > threshold else "compliant"
    except (TypeError, ValueError) as e:
        logger.warning(f"[Compliance Status] Error determining status: {str(e)}")
        return "unknown"


async def scrape_price_from_website(url: str, product_name: str = "") -> Optional[float]:
    """
    Scrape price from vendor website.
    Tries to extract price from HTML content using multiple patterns.
    Falls back to Selenium if requests library is blocked (403 Forbidden).
    
    Args:
        url: Website URL to scrape
        product_name: Product name (for context, optional)
    
    Returns:
        Price as float if found, None otherwise
    """
    try:
        logger.debug(f"[Website Scraper] Attempting to scrape price from: {url[:80]}")
        
        # Enhanced headers to bypass basic anti-bot detection
        # (random UA rotation plus a Google referer to look like organic traffic)
        headers = {
            "User-Agent": random.choice(GOOGLE_USER_AGENTS),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Referer": "https://www.google.com/",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Cache-Control": "max-age=0"
        }
        
        # Try with requests library first (faster)
        try:
            response = requests.get(url, headers=headers, timeout=10)
            
            # Check for 403 Forbidden - indicates bot detection
            if response.status_code == 403:
                logger.warning(f"[Website Scraper] 🚫 403 Forbidden from {url[:60]} - requests blocked, falling back to Selenium...")
                return await _scrape_price_with_selenium(url, product_name)
            
            response.raise_for_status()
            html_content = response.text
            
        except requests.exceptions.HTTPError as http_err:
            if http_err.response.status_code == 403:
                logger.warning(f"[Website Scraper] 🚫 403 Forbidden from {url[:60]} - falling back to Selenium")
                return await _scrape_price_with_selenium(url, product_name)
            else:
                # Other HTTP errors (404, 500, ...) are terminal — no Selenium retry.
                logger.warning(f"[Website Scraper] HTTP {http_err.response.status_code} error for {url[:60]}")
                return None
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError, requests.exceptions.RequestException) as e:
            # Network-level failures also trigger the Selenium fallback.
            logger.debug(f"[Website Scraper] Request failed ({type(e).__name__}), falling back to Selenium...")
            return await _scrape_price_with_selenium(url, product_name)
        
        # Try multiple regex patterns to find price in HTML.
        # Patterns are ordered most-specific first; the FIRST pattern that
        # yields a parseable float wins, so ordering matters.
        price_patterns = [
            # Euro formats - most common
            r'€\s*([\d,]+\.?\d*)',                        # €17.99 or € 17.99
            r'€\s*([\d]+,[\d]{2})',                      # €17,99 (European format)
            r'([\d,]+\.?\d*)\s*€',                        # 17.99€ or 17.99 €
            r'([\d]+,[\d]{2})\s*€',                      # 17,99€ (European format)
            
            # JSON/HTML attributes
            r'["\']?price["\']?\s*:\s*["\']?([\d,]+\.?\d*)',  # JSON: price: "17.99"
            r'data-price["\']?\s*=\s*["\']?([0-9,]+\.?\d*)',  # data-price="17.99"
            r'price\s*=\s*["\']?([\d,]+\.?\d*)',         # price="17.99"
            r'precioNumerico\s*[:\s=]+["\']?([\d,]+\.?\d*)',  # Spanish: precioNumerico (Soria Natural)
            
            # HTML span/div with price class
            r'<span[^>]*class=["\'].*price.*["\'][^>]*>.*?€\s*([\d,]+\.?\d*)',
            r'<div[^>]*class=["\'].*price.*["\'][^>]*>.*?€\s*([\d,]+\.?\d*)',
            r'<span[^>]*>.*?€\s*([\d,]+\.?\d*)',
            
            # Fallback: any number with € or price nearby
            r'(?:price|precio|coste|costo)\s*[:\s=]+["\']?([\d,]+\.?\d*)',  # price: 17.99
            r'(\d+[.,]\d{2})(?:\s*€|\s+euros?)',        # 17.99 euros or 17,99 euros
        ]
        
        # Scan raw HTML (not a parsed DOM) — fast, but can match prices of
        # unrelated products on listing pages.
        for pattern in price_patterns:
            match = re.search(pattern, html_content, re.IGNORECASE)
            if match:
                price_str = match.group(1).replace(",", ".")
                try:
                    price = float(price_str)
                    logger.info(f"[Website Scraper] ✓ Found price €{price} from {url[:60]} (via requests)")
                    return price
                except (ValueError, TypeError):
                    continue
        
        logger.warning(f"[Website Scraper] ⚠️ No price found in HTML from {url[:60]}")
        return None
        
    except Exception as e:
        logger.warning(f"[Website Scraper] Unexpected error scraping {url[:60]}: {str(e)}")
        return None


async def _scrape_price_with_selenium(url: str, product_name: str = "") -> Optional[float]:
    """
    Fallback to Selenium for scraping when requests library is blocked.
    Used for sites with bot detection (403 Forbidden) including Cloudflare protection.
    
    Note: Cloudflare protection is very difficult to bypass. This function attempts
    to scrape the page but may fail on heavily protected sites.
    
    Args:
        url: Website URL to load in the browser.
        product_name: Product name (for context, currently unused in this function).
    
    Returns:
        Price as float if found, None otherwise.
    """
    driver = None
    try:
        logger.info(f"[Selenium Fallback] Loading {url[:70]} with Selenium...")
        logger.debug("[Selenium Fallback] Note: Site may have Cloudflare protection which blocks automated access")
        
        # initialize_selenium_driver is defined elsewhere in this module.
        driver = initialize_selenium_driver(headless=True)
        driver.get(url)
        
        # Extended wait for Cloudflare challenge and page content
        logger.info("[Selenium Fallback] Waiting for page to load (including potential Cloudflare bypass)...")
        
        for attempt in range(20):  # Wait up to 20 seconds for content
            try:
                # Check if page is ready
                ready = driver.execute_script("return document.readyState === 'complete'")
                # Check if significant content has loaded (not just Cloudflare challenge)
                text_length = len(driver.execute_script("return document.body.innerText"))
                
                if attempt % 5 == 0:
                    logger.debug(f"[Selenium Fallback] Attempt {attempt+1}: readyState={ready}, text_length={text_length}")
                
                if ready and text_length > 500:  # Cloudflare challenge pages are very small (~200 chars)
                    logger.info(f"[Selenium Fallback] Page content loaded ({text_length} chars)")
                    break
                
                human_delay(1, 1.5)
            except:
                # Deliberate best-effort: script execution can fail mid-navigation;
                # just wait and retry on the next loop iteration.
                human_delay(1, 1.5)
        
        # Get page source and extract price
        page_source = driver.page_source
        html_size = len(page_source)
        logger.debug(f"[Selenium Fallback] Page HTML size: {html_size} bytes")
        
        # Check if we got actual content or just Cloudflare challenge
        # (__cf_chl_tk is a token Cloudflare embeds in its challenge pages)
        if html_size < 5000 and "__cf_chl_tk" in page_source:
            logger.warning(f"[Selenium Fallback] ⚠️ Detected Cloudflare challenge - page blocker not bypassed")
            logger.warning(f"[Selenium Fallback] URL: {url[:70]} is protected by Cloudflare anti-bot")
            return None
        
        # Try JSON-LD structured data first (most reliable)
        # extract_price_from_html is defined elsewhere in this module.
        price = extract_price_from_html(page_source)
        if price:
            logger.info(f"[Selenium Fallback] ✓ Found price €{price} from {url[:60]} (via Selenium/JSON-LD)")
            return price
        
        # Try to search page text for prices
        page_text = driver.execute_script("return document.body.innerText")
        prices = re.findall(r'\d{1,3}[.,]\d{2}', page_text)
        if prices:
            logger.debug(f"[Selenium Fallback] Found price-like patterns: {set(prices)}")
            # Filter for reasonable prices and prefer those near the max value (usually the product price)
            # NOTE(review): taking the largest plausible number is a heuristic —
            # it can pick up shipping totals or bundle prices; verify per site.
            prices_sorted = sorted(set(float(p.replace(",", ".")) for p in prices), reverse=True)
            for price in prices_sorted:
                if 0.5 < price < 5000.0:  # Sanity check
                    logger.info(f"[Selenium Fallback] ✓ Found price €{price} from {url[:60]}")
                    return price
        
        logger.warning(f"[Selenium Fallback] ⚠️ No price found on {url[:60]}")
        logger.debug(f"[Selenium Fallback] Suggestions: URL may require search, or be protected by Cloudflare")
        return None
        
    except Exception as e:
        logger.warning(f"[Selenium Fallback] Error with Selenium: {str(e)}")
        return None
    finally:
        # Always tear down the browser and its temp profile, even on failure.
        if driver:
            try:
                cleanup_chrome_session(driver)
                driver.quit()
            except:
                pass

async def check_duplicate_violation(
    db: AsyncSession,
    url: str,
    scraped_price: Optional[float],
    barcode: str,
    product_name: str,
    msp: float,
    vendor_name: Optional[str] = None,
    packs: int = 1,
) -> bool:
    """
    Return True when a matching Violation row already exists, False otherwise.

    Two checks run in order (the second runs even if the first found nothing):

    1. Vendor-based (when vendor_name is given): duplicate when product_name,
       barcode_number, msp, scraped_price, vendor_name AND packs all match.
       URL is deliberately NOT part of this check — the same vendor can be
       discovered through different URLs (registered vendor scraping vs
       SERP API discovery).
    2. URL-based fallback (when url is given): exact match on url plus
       barcode_number, product_name, msp and scraped_price.
       NOTE(review): unlike the vendor check, this query does not filter on
       packs — confirm that asymmetry is intentional.

    On any database error the function returns False so violation creation
    is never blocked by the duplicate check itself.
    """
    from sqlalchemy import and_

    try:
        # Without a price, barcode and name there is nothing meaningful to match.
        if scraped_price is None or not barcode or not product_name:
            return False

        if vendor_name:
            vendor_query = select(Violation).where(
                and_(
                    Violation.barcode_number == barcode,
                    Violation.product_name == product_name,
                    Violation.msp == msp,
                    Violation.scraped_price == scraped_price,
                    Violation.vendor_name == vendor_name,
                    Violation.packs == packs,
                )
            )
            vendor_match = (await db.execute(vendor_query)).scalars().first()
            if vendor_match:
                logger.info(f"[Duplicate] Vendor match found: {vendor_name} selling {product_name} (barcode: {barcode}) @ {scraped_price} (MSP: {msp})")
                return True

        if url:
            url_query = select(Violation).where(
                and_(
                    Violation.barcode_number == barcode,
                    Violation.url == url,
                    Violation.scraped_price == scraped_price,
                    Violation.product_name == product_name,
                    Violation.msp == msp
                )
            )
            url_match = (await db.execute(url_query)).scalars().first()
            if url_match:
                logger.info(f"[Duplicate] Exact URL match found: {product_name} (barcode: {barcode}), URL: {url[:60]}, Price: {scraped_price}")
                return True

        return False
    except Exception as e:
        logger.warning(f"[Duplicate Check] Error: {str(e)}")
        # On error, allow creation to avoid blocking
        return False


def check_disk_space(path: str = "/tmp", min_gb: int = 1) -> bool:
    """
    Return True when *path* has at least *min_gb* GB of free disk space.

    Chrome needs scratch space under /tmp; starting it on a full disk makes it
    crash in confusing ways, so this is checked before launching a driver.

    Args:
        path: Filesystem path whose containing volume is checked.
        min_gb: Minimum free space required, in gigabytes.

    Returns:
        True if enough space is free, or if the check itself fails (we prefer
        to proceed rather than block scraping on an unreadable path).
    """
    try:
        # shutil is imported at module level; the old function-local
        # `import shutil` was redundant and has been removed.
        stat = shutil.disk_usage(path)
        available_gb = stat.free / (1024 ** 3)

        if available_gb < min_gb:
            logger.error(f"[Disk Space] ✗ Insufficient space: {available_gb:.2f}GB available, need {min_gb}GB")
            return False

        logger.debug(f"[Disk Space] ✓ Available: {available_gb:.2f}GB (threshold: {min_gb}GB)")
        return True
    except Exception as e:
        logger.warning(f"[Disk Space] Could not check disk space: {str(e)}")
        return True  # Allow to proceed if check fails


def cleanup_orphaned_chrome_processes():
    """
    Best-effort kill of leftover Chrome/ChromeDriver processes.

    Only acts on Linux (the deployment platform); on any other OS this is a
    no-op. Failures are logged at debug level and never raised.
    """
    if platform.system() != "Linux":
        return

    try:
        # pkill -f matches the full command line, so this also catches
        # chromedriver. SIGKILL (-9) because orphans may be wedged.
        subprocess.run(["pkill", "-9", "-f", "chrome"], capture_output=True, timeout=5)
        logger.info("[Process Cleanup] Killed orphaned Chrome processes")
    except Exception as e:
        logger.debug(f"[Process Cleanup] Could not kill processes: {str(e)}")


def get_page_html_via_playwright(url: str, search_query: Optional[str] = None, timeout: int = 30) -> Optional[str]:
    """
    Use Playwright to bypass Cloudflare and return the rendered HTML for extraction.
    This avoids Cloudflare detection while keeping extraction logic in Selenium/BeautifulSoup.

    Args:
        url: Website URL to load
        search_query: Optional search term to execute on the page
        timeout: Navigation timeout in seconds

    Returns:
        HTML content of the page, or None on failure / when Playwright is
        not installed.
    """
    if not PLAYWRIGHT_AVAILABLE:
        logger.warning("[Playwright] Playwright not available, cannot bypass Cloudflare")
        return None

    try:
        logger.info("[Playwright] Using Playwright to bypass Cloudflare...")

        with sync_playwright() as p:
            logger.debug("[Playwright] Launching Chromium browser...")
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page(
                    user_agent=random.choice(GOOGLE_USER_AGENTS),
                    locale="es-ES",
                    timezone_id="Europe/Madrid"
                )

                # FIX: register the stealth script BEFORE navigation. The old
                # code evaluated it after page load, which is too late for the
                # initial navigator.webdriver check performed by anti-bot JS.
                page.add_init_script("""
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                });
                """)

                logger.debug(f"[Playwright] Navigating to {url}...")
                page.goto(url, wait_until="load", timeout=timeout * 1000)

                # Human-like scroll (best effort; never fatal)
                logger.debug("[Playwright] Applying human behavior simulation...")
                try:
                    page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                    time.sleep(random.uniform(0.5, 1.0))
                except Exception:
                    pass

                # Wait for page to be interactive
                page.wait_for_load_state("domcontentloaded")
                human_delay(0.5, 1.0)

                # If search query provided, search for it
                if search_query:
                    logger.debug(f"[Playwright] Searching for '{search_query}'...")
                    try:
                        search_box = page.query_selector(
                            'input[type="search"], input[placeholder*="buscar"], input[placeholder*="search"]'
                        )
                        if search_box:
                            search_box.click()
                            search_box.fill(search_query)
                            page.keyboard.press("Enter")

                            # Wait for results
                            page.wait_for_load_state("load")
                            human_delay(1.0, 2.0)
                            logger.debug(f"[Playwright] Search completed for '{search_query}'")
                        else:
                            logger.warning("[Playwright] Search box not found")
                    except Exception as e:
                        logger.warning(f"[Playwright] Search failed: {str(e)}")

                # Get the rendered HTML
                logger.debug("[Playwright] Extracting page HTML...")
                html_content = page.content()
            finally:
                # FIX: always close the browser, even when navigation or the
                # search raised, so a failure cannot leave a headless
                # Chromium process running for the rest of the job.
                browser.close()

        if html_content and len(html_content) > 500:
            logger.info(f"[Playwright] ✓ Successfully bypassed Cloudflare ({len(html_content)} bytes)")
            return html_content

        logger.warning("[Playwright] ⚠ Page content is too small, may still be blocked")
        return html_content if html_content else None

    except Exception as e:
        logger.error(f"[Playwright] Failed to load page: {str(e)}")
        return None

def normalize_vendor_website_for_lookup(vendor_website: str) -> str:
    """
    Normalize a vendor website URL to the canonical form used as a
    VENDOR_SEARCH_URL_TEMPLATES key: "https://www.domain.com" (no trailing
    slash, no path, no query).

    Examples:
        "https://www.dosfarma.com/" → "https://www.dosfarma.com"
        "dosfarma.com"              → "https://www.dosfarma.com"
        "www.dosfarma.com"          → "https://www.dosfarma.com"
        "http://dosfarma.com/path"  → "https://www.dosfarma.com"

    Args:
        vendor_website: Raw vendor website (may lack a scheme, carry a path,
            or have trailing slashes). Empty/None input yields "".

    Returns:
        The normalized URL, or the lowercased input on a parse failure.
    """
    if not vendor_website:
        return ""

    candidate = vendor_website.strip().lower()

    # urlparse only fills netloc when a scheme is present.
    if not candidate.startswith(('http://', 'https://')):
        candidate = 'https://' + candidate

    try:
        host = urlparse(candidate).netloc.lower()

        # Force the www. prefix so lookups match the template keys
        # (most e-commerce sites serve from www.).
        if host and not host.startswith('www.'):
            host = 'www.' + host

        # Rebuild with https and host only — path/query/fragment dropped.
        return f"https://{host}"
    except Exception as e:
        logger.warning(f"[Normalization] Could not normalize {vendor_website}: {str(e)}")
        return candidate


def extract_price_near_product(html: str, product_name: str) -> Optional[float]:
    """
    Extract the MAIN product price from a product page HTML.

    Root problem on PrestaShop bundle pages:
      - The main product price is in a SPAN with a content attribute:
          <span id="our_price_display" itemprop="price" content="12.001">12,00 €</span>
      - The "Contenido del paquete" section below uses a META tag:
          <meta itemprop="price" content="5.4">
      - Searching only for <meta itemprop="price"> finds 5.4 and never sees
        the span's content="12.001".

    Strategy:
      1. Strip all known bundle-component / related-product sections from the
         soup FIRST (section#blockpack, .related-products, etc.) so their
         itemprop elements are gone before any search.
      2. High-confidence platform-specific selectors.
      3. ALL elements (any tag) with itemprop="price" — content attribute
         first, text fallback — take the MAXIMUM valid price (a bundle price
         is always >= any individual component).
      4. DOM-walking regex near the product-name text as final fallback.

    Args:
        html: Raw page HTML.
        product_name: Product name used by the final DOM-walking fallback.

    Returns:
        The price as a float in the sanity range [1.0, 500.0], or None.
    """
    soup = BeautifulSoup(html, "html.parser")

    # ── Step 0: Strip noise sections entirely ─────────────────────────────────
    # These contain embedded single-unit product cards that also emit
    # itemprop="price". Removing them first makes all subsequent searches safe.
    NOISE_SELECTORS = [
        "section#blockpack",          # PrestaShop "Contenido del paquete"
        "#blockpack",
        ".blockpack",
        "#related-products",
        ".related-products",
        "#crossselling",
        ".crossselling",
        "#upsell",
        ".upsell",
        ".cross-sells",
        ".up-sells",
        "#accessories",
        ".accessories",
        "#similar_products",
        ".similar_products",
    ]
    for sel in NOISE_SELECTORS:
        for tag in soup.select(sel):
            tag.decompose()

    # ── Strategy 1: High-confidence platform-specific selectors ───────────────
    # Try the most specific containers first. Read content attribute OR text.
    # Covers both <span itemprop="price" content="12.00"> (PrestaShop)
    # and <meta itemprop="price" content="12.00"> (generic schema.org).
    HIGH_CONFIDENCE_SELECTORS = [
        "#our_price_display",                              # PrestaShop main price span
        "#buy_block [itemprop='price']",                   # PrestaShop buy block
        ".content_prices [itemprop='price']",              # PrestaShop price container
        ".box-info-product [itemprop='price']",            # PrestaShop product info
        ".product-prices .current-price [itemprop='price']",  # PrestaShop 1.7/8.x
        ".product-prices [itemprop='price']",              # PrestaShop 1.7/8.x
        ".summary .price > .amount",                       # WooCommerce
        ".summary .price .woocommerce-Price-amount",       # WooCommerce
        ".product-summary .price .amount",                 # WooCommerce variant
        "[itemprop='offers'] [itemprop='price']",          # generic schema.org Offer
        "#product-price",                                  # Magento
        ".product-info-main [itemprop='price']",           # Magento/generic
        ".product-detail [itemprop='price']",              # generic
    ]
    for sel in HIGH_CONFIDENCE_SELECTORS:
        el = soup.select_one(sel)
        if not el:
            continue
        # content attribute is machine-readable and always the clean numeric value
        raw = el.get("content", "") or el.get_text(strip=True)
        if not raw:
            continue
        try:
            price = float(str(raw).replace(",", ".").strip())
            if 1.0 <= price <= 500.0:
                logger.debug(f"[PriceExtract] High-confidence '{sel}': {price}")
                return price
        except (ValueError, TypeError):
            pass

    # ── Strategy 2: All itemprop="price" elements — any tag — take MAX ────────
    # After noise removal, collect content/text from every element with
    # itemprop="price" (spans, metas, divs) and return the maximum.
    # Bundle price is always >= any individual component, so max is correct.
    all_prices = []
    for el in soup.find_all(attrs={"itemprop": "price"}):
        raw = el.get("content", "") or el.get_text(strip=True)
        if not raw:
            continue
        try:
            price = float(str(raw).replace(",", ".").strip())
            if 1.0 <= price <= 500.0:
                all_prices.append(price)
        except (ValueError, TypeError):
            pass
    if all_prices:
        price = max(all_prices)
        logger.debug(f"[PriceExtract] Max itemprop price={price} ({len(all_prices)} elements after noise removal)")
        return price

    # ── Strategy 3: DOM-walking regex near product name text ──────────────────
    # NOTE: find_all(string=True) replaces the deprecated text=True argument.
    full_name = product_name.lower()
    for element in soup.find_all(string=True):
        if full_name in element.lower():
            parent = element.parent
            for _ in range(5):
                if parent is None:
                    break
                text = parent.get_text()
                matches = re.findall(
                    r'(\d{1,3}(?:[.,]\d{3})*[.,]\d{2})\s*€|€\s*(\d{1,3}(?:[.,]\d{3})*[.,]\d{2})',
                    text
                )
                if matches:
                    price_str = next(p for p in matches[0] if p)
                    # FIX: the regex guarantees the LAST separator is the
                    # decimal one (exactly two trailing digits); any earlier
                    # '.' or ',' are thousands separators. The old
                    # replace('.', '') call turned dot-decimal "12.50"
                    # into 1250.0, which was then rejected by the range check.
                    sep_idx = max(price_str.rfind(','), price_str.rfind('.'))
                    int_digits = re.sub(r'[.,]', '', price_str[:sep_idx])
                    try:
                        price = float(int_digits + '.' + price_str[sep_idx + 1:])
                        if 1.0 <= price <= 500.0:
                            return price
                    except ValueError:
                        pass
                parent = parent.parent
    return None


# Vendor-specific search URL templates.
# For vendors whose search is JS-driven (e.g. Alpine.js) and cannot be
# triggered via form submission, we navigate directly to their search
# results page using window.location.href inside the JS scenario.
# Key   = NORMALIZED vendor website in the exact form produced by
#         normalize_vendor_website_for_lookup ("https://www.domain.com",
#         no trailing slash) — NOT vendor.name from the database.
# Value = URL template with {query} placeholder; the query is inserted
#         already URL-encoded (quote_plus), so templates must not re-encode.
VENDOR_SEARCH_URL_TEMPLATES = {
    "https://www.dosfarma.com": "https://www.dosfarma.com/catalogsearch/result/?q={query}",
    "https://www.farmaes.es": "https://farmaes.es/?mot_q={query}",
    "https://www.linverd.com": "https://linverd.com/en?km_q={query}",
    "https://www.farmaciasdirect.eu": "https://www.farmaciasdirect.eu/?query={query}",
    "https://www.herbolariorosana.com": "https://herbolariorosana.com/resultado-busqueda?controller=search&orderby=position&orderway=desc&id_manufacturer_search=0&search_query={query}&submit_search=",
    "https://www.parafarma24.com": "https://www.parafarma24.com/es/buscar?search={query}",
    "https://www.herbolarioemperatriz.es": "https://herbolarioemperatriz.es/module/iqitsearch/searchiqit?s={query}",
    "https://www.herboahorro.com": "https://herboahorro.com/search?type=product&options%5Bunavailable_products%5D=show&options%5Bprefix%5D=last&q={query}",
    # Add more vendors here as needed, e.g.:
    # "https://www.somevendor.com": "https://www.somevendor.com/search?term={query}",
}

# The 7 pack counts we scrape for every product.
# Pack 1 = base product name (no suffix). Pack N > 1 appends " Pack N".
# Keep in sync with the price_N_pack columns read by get_pack_price.
PACK_COUNTS = [1, 2, 3, 4, 5, 6, 12]

# Words too generic to be useful for smart link-matching.
# Compared case-insensitively against whitespace-split query tokens
# in decompose_search_query.
GENERIC_WORDS = {
    "pack", "packs", "de", "con", "sin", "los", "las", "del", "para",
    "the", "and", "for", "of", "plus", "forte", "complex", "siglo",
    "soria", "natural", "extracto", "sxxi", "xxi", "xxl",
    "comprimidos", "tabletas", "capsulas", "capsules", "mg", "ml",
}


def generate_search_query(product_name: str, pack_count: int) -> str:
    """
    Build the vendor-search query for a product at a given pack count.

    Pack 1  -> base name as-is   (e.g. "RESVERASOR PLUS")
    Pack N  -> base + " Pack N"  (e.g. "RESVERASOR PLUS Pack 2")
    """
    name = product_name.strip()
    return name if pack_count == 1 else f"{name} Pack {pack_count}"


def get_pack_price(product, pack_count: int) -> Optional[float]:
    """
    Return the pre-calculated price threshold for a given pack count.

    Reads the matching price_N_pack attribute from the product row
    (N in 1, 2, 3, 4, 5, 6, 12). Falls back to product.msp when the column
    is unknown, missing, None, or not convertible to float.
    """
    # Only these pack counts have dedicated columns on the product row.
    column = f"price_{pack_count}_pack" if pack_count in (1, 2, 3, 4, 5, 6, 12) else None

    if column is not None:
        raw_value = getattr(product, column, None)
        if raw_value is not None:
            try:
                return float(raw_value)
            except (TypeError, ValueError):
                pass

    # Fallback: the product's base minimum selling price.
    try:
        return float(product.msp)
    except (TypeError, ValueError):
        return None


def decompose_search_query(search_query: str) -> Tuple[List[str], Optional[str]]:
    """
    Break a search query such as "RESVERASOR PLUS Pack 2" into:
      - core_words : distinctive, non-generic lowercase words  -> ["resverasor"]
      - pack_number: the numeric pack count as a string        -> "2"  (or None)

    Used for smart link-scoring on vendor search result pages.
    """
    tokens = search_query.split()
    pack_number: Optional[str] = None
    core_words: List[str] = []

    idx = 0
    total = len(tokens)
    while idx < total:
        token = tokens[idx]
        lowered = token.lower()

        # Consume a "Pack N" pair and remember N.
        if lowered == "pack" and idx + 1 < total and tokens[idx + 1].isdigit():
            pack_number = tokens[idx + 1]
            idx += 2
            continue

        # Keep only distinctive tokens: not generic, not a bare number,
        # and longer than 2 characters.
        if lowered not in GENERIC_WORDS and not token.isdigit() and len(token) > 2:
            core_words.append(lowered)
        idx += 1

    # Never return an empty core list — fall back to the first raw token.
    if not core_words:
        core_words = [tokens[0].lower()]

    return core_words, pack_number


def smart_link_score(link_text: str, link_href: str, core_words: List[str], pack_number: Optional[str]) -> int:
    """
    Score a candidate <a> link for how well it matches the search query.

    Scoring:
      +2  per core word found in link text or href
      +3  if pack_number appears in a pack-related context in the visible TEXT
          (adjacent to "pack", "ud", "x", "oferta", "unidad" etc.)
      +1  if pack_number appears bare in the text (weak — could be anything)
       0  pack_number match in href only — IGNORED entirely, because product
          IDs in URLs (e.g. /2327534-normacid...) contain digits that would
          otherwise cause false positives for every pack number.

    FIX: the "x N" alternative now carries a (?<!\\d) lookbehind so that
    multiplier offers like "3x2" no longer count as a STRONG match for
    pack 2 (the old pattern matched "x2" inside "3x2").

    Example for "NORMACID Pack 2" (core_words=["normacid"], pack_number="2"):
      "NORMACID OFERTA 2 UD"             → +2 (core) +3 (pack ctx) = 5  ✅ winner
      "NORMACID CITRUS 32 COMPRIMIDOS"   → +2 (core) +0 (32 ≠ bare 2) = 2
      "Pack 3x2 Normacid"                → +2 (core) +1 (bare "2" only) = 3
      "Pack 6x NORMACID"                 → +2 (core) +0 = 2
    """
    import re as _re

    text_lower = link_text.lower()
    href_lower = link_href.lower()
    combined   = text_lower + " " + href_lower

    score = 0

    # Core word matching — text or href both count.
    for word in core_words:
        if word in combined:
            score += 2

    # Pack number matching — context-aware, TEXT only
    if pack_number:
        n = _re.escape(pack_number)
        pat_str = (
            r"pack\s*" + n
            + r"|pack\s*x\s*" + n
            + r"|" + n + r"\s*x\s*pack"
            + r"|" + n + r"\s*ud"
            + r"|" + n + r"\s*unidad"
            + r"|(?<!\d)x\s*" + n + r"(?!\d)"   # "x2" yes, "3x2" no
            + r"|" + n + r"\s*x\s*(?!\d)"
            + r"|oferta\s*" + n
        )
        if _re.search(pat_str, text_lower, _re.IGNORECASE):
            score += 3   # Strong: pack number in pack context in visible text
        elif _re.search(r"(?<!\d)" + n + r"(?!\d)", text_lower):
            score += 1   # Weak: bare isolated number in visible text
        # href-only digit matches → 0 (unreliable)

    return score


def scrape_with_scrapingbee_js_scenario(url: str, search_query: str, vendor_website: str = None) -> Optional[str]:
    """
    Use ScrapingBee to:
    1. Load vendor homepage (bypasses Cloudflare)
    2. Navigate to the search results page (for VENDOR_SEARCH_URL_TEMPLATES vendors)
       OR interact with the homepage search box (for other vendors)
    3. Click the best-matching product link using smart component scoring
       (core words + pack number), NOT a single first_keyword token
    4. Return the product page HTML for price extraction

    search_query is the FULL search string, e.g. "RESVERASOR PLUS Pack 2"

    Returns the final page HTML, or None when the API key is missing or
    both the first attempt and the stealth-proxy retry fail.
    """
    api_key = os.environ.get('SCRAPINGBEE_API_KEY')
    if not api_key:
        logger.warning("[ScrapingBee] SCRAPINGBEE_API_KEY not set, skipping.")
        return None

    # Decompose query into core words + optional pack number for smart JS matching
    core_words, pack_number = decompose_search_query(search_query)
    core_words_js = json.dumps(core_words)
    pack_number_js = json.dumps(pack_number or "")

    # Shared JS: score every <a> by core words + pack number, click the best match
    smart_click_js = f"""
        (function() {{
            var coreWords = {core_words_js};
            var packNum   = {pack_number_js};
            var minScore  = coreWords.length * 2;

            function scoreLink(el) {{
                var combined = ((el.innerText || el.textContent || '') + ' ' + (el.href || '')).toLowerCase();
                var score = 0;
                for (var k = 0; k < coreWords.length; k++) {{
                    if (combined.indexOf(coreWords[k]) !== -1) score += 2;
                }}
                if (packNum && combined.indexOf(packNum) !== -1) score += 2;
                return score;
            }}

            var links = document.querySelectorAll('a');
            var bestEl = null, bestScore = 0;
            for (var i = 0; i < links.length; i++) {{
                var href = links[i].href || '';
                if (!href || href.indexOf('#') !== -1 || href.indexOf('javascript') !== -1) continue;
                var s = scoreLink(links[i]);
                if (s > bestScore) {{ bestScore = s; bestEl = links[i]; }}
            }}
            if (bestEl && bestScore >= minScore) {{
                bestEl.click();
            }}
        }})();
    """

    # FIX: define search_selector unconditionally. It was previously bound only
    # in the else-branch, so a 500-retry for a VENDOR_SEARCH_URL_TEMPLATES
    # vendor raised NameError (swallowed by the outer except — the retry
    # silently never ran).
    search_selector = (
        "input[type='search'], input[name='q'], input[name='search'], "
        "input[name='s'], input[name='search_query'], input[name='busqueda'], "
        "input[placeholder*='buscar'], input[placeholder*='Buscar'], "
        "input[placeholder*='search'], input[placeholder*='Search'], "
        "input[placeholder*='Busca']"
    )

    vendor_key = normalize_vendor_website_for_lookup(vendor_website) if vendor_website else ""
    if vendor_key in VENDOR_SEARCH_URL_TEMPLATES:
        direct_search_url = VENDOR_SEARCH_URL_TEMPLATES[vendor_key].format(query=quote_plus(search_query))
        logger.info(f"[ScrapingBee] Direct search URL: {direct_search_url}")
        js_scenario = {
            "instructions": [
                {"evaluate": f"window.location.href = '{direct_search_url}';"},
                {"wait": 8000},
                {"evaluate": "window.scrollBy(0, 300);"},
                {"wait": 1000},
                {"evaluate": smart_click_js},
                {"wait": 6000},
            ]
        }
    else:
        js_scenario = {
            "instructions": [
                {"wait": 2000},
                {"click": search_selector},
                {"wait": 500},
                {"fill": [search_selector, search_query]},
                {"wait": 500},
                {"evaluate": "var inp = document.querySelector(\"input[type='search'], input[name='q'], input[name='search'], input[name='s'], input[name='search_query']\"); if(inp && inp.form) { inp.form.submit(); } else if(inp) { inp.dispatchEvent(new Event('input', {bubbles:true})); inp.dispatchEvent(new KeyboardEvent('keydown', {key:'Enter', keyCode:13, bubbles:true})); }"},
                {"wait": 3000},
                {"evaluate": smart_click_js},
                {"wait": 3000},
            ]
        }

    try:
        logger.info(f"[ScrapingBee] Searching for '{search_query}' on {url[:70]}")
        params = {
            'api_key': api_key,
            'url': url,
            'render_js': 'true',
            'premium_proxy': 'true',
            'block_resources': 'false',
            'js_scenario': json.dumps(js_scenario),
        }
        full_url = f"https://app.scrapingbee.com/api/v1/?{urlencode(params)}"
        response = requests.get(full_url, timeout=90)

        if response.status_code == 200 and len(response.text) > 500:
            logger.info(f"[ScrapingBee] ✓ Success ({len(response.text)} chars)")
            return response.text

        if response.status_code == 500:
            logger.warning(f"[ScrapingBee] 500 on first attempt, retrying with stealth_proxy...")
            if vendor_key in VENDOR_SEARCH_URL_TEMPLATES:
                # Direct-URL vendors have JS-driven search: the search-box
                # scenario below cannot work on them — reuse the direct-URL
                # scenario under the stealth proxy instead.
                js_scenario_retry = js_scenario
            else:
                js_scenario_retry = {
                    "instructions": [
                        {"wait": 4000},
                        {"click": search_selector},
                        {"wait": 1000},
                        {"fill": [search_selector, search_query]},
                        {"wait": 1000},
                        {"evaluate": "var inp = document.querySelector(\"input[type='search'], input[name='q'], input[name='search'], input[name='s'], input[name='search_query']\"); if(inp && inp.form) { inp.form.submit(); } else if(inp) { inp.dispatchEvent(new KeyboardEvent('keydown', {key:'Enter', keyCode:13, bubbles:true})); }"},
                        {"wait": 4000},
                        {"evaluate": smart_click_js},
                        {"wait": 4000},
                    ]
                }
            params_retry = {
                'api_key': api_key,
                'url': url,
                'render_js': 'true',
                'stealth_proxy': 'true',
                'block_resources': 'false',
                'js_scenario': json.dumps(js_scenario_retry),
            }
            full_url_retry = f"https://app.scrapingbee.com/api/v1/?{urlencode(params_retry)}"
            retry_response = requests.get(full_url_retry, timeout=120)
            if retry_response.status_code == 200 and len(retry_response.text) > 500:
                logger.info(f"[ScrapingBee] ✓ Retry success ({len(retry_response.text)} chars)")
                return retry_response.text
            logger.warning(f"[ScrapingBee] Retry also failed: {retry_response.status_code}: {retry_response.text[:200]}")
            return None

        logger.warning(f"[ScrapingBee] Got status {response.status_code}: {response.text[:200]}")
        return None

    except Exception as e:
        logger.warning(f"[ScrapingBee] js_scenario failed: {str(e)}")
        return None



def human_delay(min_delay: float = 0.5, max_delay: float = 1.0):
    """Sleep for a random interval in [min_delay, max_delay] to mimic a human pause."""
    time.sleep(random.uniform(min_delay, max_delay))


def human_scroll(driver, min_pause: float = 0.5, max_pause: float = 1.0):
    """Scroll the page in a few random increments to mimic a human reader (best effort)."""
    try:
        # Page height is queried for parity with the original flow even though
        # only the viewport height feeds the scroll-step calculation.
        _ = driver.execute_script("return document.body.scrollHeight")
        view_height = driver.execute_script("return window.innerHeight")

        for _ in range(random.randint(1, 3)):
            step = random.randint(view_height // 2, view_height)
            driver.execute_script(f"window.scrollBy(0, {step});")
            time.sleep(random.uniform(min_pause, max_pause))
    except:
        pass


def human_mouse_movement(driver):
    """Move the cursor to a few random viewport positions to mimic a human (best effort)."""
    try:
        chain = ActionChains(driver)
        for _ in range(random.randint(2, 5)):
            offset_x = random.randint(100, 1000)
            offset_y = random.randint(100, 800)
            chain.move_by_offset(offset_x, offset_y).pause(random.uniform(0.1, 0.5))
        chain.perform()
    except:
        pass


def inject_anti_detection_js(driver):
    """Overwrite navigator.webdriver so basic bot checks read undefined (best effort)."""
    stealth_script = """
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            });
        """
    try:
        driver.execute_script(stealth_script)
    except:
        pass


def initialize_selenium_driver(headless: bool = True, use_proxy: bool = False) -> webdriver.Chrome:
    """Initialize and configure Chrome WebDriver with anti-detection measures.

    Builds a Chrome session hardened for scraping: stability/crash flags,
    automation-fingerprint removal, a random user agent, optional proxy,
    platform-aware temp directories, and multi-step Chrome binary
    detection (env var -> PATH -> well-known install locations).

    Fixes applied: the storage flags (--crash-dumps-dir, --data-path,
    --disk-cache-dir) were previously added twice — once hard-coded to
    /tmp (invalid on Windows) and once platform-aware; only the
    platform-aware set is kept. The crash-dumps directory is now actually
    created. The redundant local `import uuid` was dropped in favor of
    the module-level `uuid4` import.

    Args:
        headless: Run Chrome in the new headless mode.
        use_proxy: Route traffic through a free rotating proxy when one
            is available.

    Returns:
        An initialized Selenium Chrome WebDriver. The per-session temp
        directory is attached as `driver._user_data_dir` so that
        cleanup_chrome_session() can remove it later.

    Raises:
        RuntimeError: When Chrome dependencies are missing or /tmp lacks
            at least 1 GB of free space.
        Exception: Any WebDriver initialization error is re-raised after
            best-effort removal of the session temp directory.
    """

    # Check critical preconditions
    if not ensure_chrome_dependencies():
        raise RuntimeError("Chrome dependencies check failed. DevOps must install Chrome via setup scripts.")

    if not check_disk_space("/tmp", min_gb=1):
        raise RuntimeError("Insufficient disk space in /tmp. DevOps must free up space.")

    # Clean up any orphaned processes
    cleanup_orphaned_chrome_processes()

    options = Options()
    if headless:
        options.add_argument("--headless=new")
        options.add_argument("--window-size=1920,1080")
        options.add_argument("--start-maximized")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-setuid-sandbox")
        options.add_argument("--disable-dev-shm-usage")

    # Proxy support for rotating IPs
    if use_proxy:
        proxy_url = get_free_proxy()
        if proxy_url:
            proxy_url = get_selenium_proxy_url(proxy_url)
            options.add_argument(f"--proxy-server={proxy_url}")
            logger.info(f"[Chrome Init] Using proxy: {proxy_url[:30]}")

    # Core stability and crash prevention flags
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--disable-software-rasterizer")
    options.add_argument("--incognito")
    options.add_argument("--disable-gpu-sandbox")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-plugins")
    options.add_argument("--disable-plugins-discovery")
    options.add_argument("--disable-print-preview")
    options.add_argument("--disable-component-extensions-with-background-pages")
    options.add_argument("--no-default-browser-check")
    options.add_argument("--disable-background-networking")
    options.add_argument("--disable-sync")
    options.add_argument("--disable-translate")
    options.add_argument("--hide-scrollbars")
    options.add_argument("--metrics-recording-only")
    options.add_argument("--mute-audio")
    options.add_argument("--no-first-run")
    options.add_argument("--safebrowsing-disable-auto-update")
    options.add_argument("--disable-accelerated-2d-canvas")
    options.add_argument("--no-zygote")
    options.add_argument("--disable-background-timer-throttling")
    options.add_argument("--disable-backgrounding-occluded-windows")
    options.add_argument("--disable-renderer-backgrounding")
    options.add_argument("--disable-features=TranslateUI,IsolateOrigins,site-per-process")
    options.add_argument("--disable-ipc-flooding-protection")
    options.add_argument("--disable-default-apps")
    options.add_argument("--password-store=basic")
    options.add_argument("--use-mock-keychain")
    options.add_argument("--disable-web-security")
    options.add_argument("--allow-running-insecure-content")
    options.add_argument("--disable-webgl")
    options.add_argument("--disable-threaded-animation")
    options.add_argument("--disable-threaded-scrolling")
    options.add_argument("--disable-in-process-stack-traces")
    options.add_argument("--disable-histogram-customizer")
    options.add_argument("--disable-gl-extensions")
    options.add_argument("--disable-composited-antialiasing")
    options.add_argument("--disable-canvas-aa")
    options.add_argument("--disable-3d-apis")
    options.add_argument("--disable-accelerated-video-decode")
    options.add_argument("--disable-background-media-download")
    options.add_argument("--disable-domain-reliability")
    options.add_argument("--disable-client-side-phishing-detection")
    options.add_argument("--disable-component-update")
    options.add_argument("--disable-hang-monitor")
    options.add_argument("--disable-prompt-on-repost")
    options.add_argument("--force-fieldtrials=SiteIsolationExtensions/Control")
    options.add_argument("--disable-back-forward-cache")
    options.add_argument("--disable-popup-blocking")
    options.add_argument("--disable-session-crashed-bubble")
    options.add_argument("--disable-infobars")

    # Crash prevention and resource management flags
    options.add_argument("--disable-breakpad")
    options.add_argument("--disable-crash-reporter")
    options.add_argument("--disable-default-tracing")
    options.add_argument("--disable-media-session")
    options.add_argument("--no-service-autorun")
    options.add_argument("--disable-audio")
    options.add_argument("--disable-features=VizDisplayCompositor,AudioServiceOutOfProcess")
    options.add_argument("--disable-preconnect")
    options.add_argument("--disable-client-hints")

    # Quiet Chrome's own logging. The storage-path flags
    # (--crash-dumps-dir / --data-path / --disk-cache-dir) are added
    # further down, once the platform-aware temp base is known.
    options.add_argument("--disable-logging")
    options.add_argument("--disable-logging-redirect")
    options.add_argument("--log-level=3")

    # Single process mode to reduce memory overhead
    # NOTE(review): applied unconditionally although an earlier comment
    # marked it Linux-only — confirm it is safe on Windows.
    options.add_argument("--single-process") 

    # Set random user agent from the list to avoid detection
    user_agent = random.choice(GOOGLE_USER_AGENTS)
    logger.debug(f"[Chrome Init] Using user agent: {user_agent[:50]}...")

    options.add_argument(f"--user-agent={user_agent}")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)

    # Create unique session ID unconditionally (uuid4 comes from the
    # module-level `from uuid import ... uuid4` import).
    import tempfile
    unique_session_id = str(uuid4())[:8]

    # Platform-aware temp paths
    system = platform.system()
    if system == "Windows":
        tmp_base = os.path.join(tempfile.gettempdir(), "chrome")
    else:
        # Headless Linux: point Chrome at the Xvfb virtual display and
        # silence D-Bus lookups.
        os.environ['DISPLAY'] = ':99'
        os.environ['DBUS_SESSION_BUS_ADDRESS'] = '/dev/null'
        tmp_base = "/tmp"

    user_data_dir = os.path.join(tmp_base, f"chrome-session-{unique_session_id}")

    # Try to create necessary directories
    try:
        os.makedirs(user_data_dir, exist_ok=True)
        os.makedirs(os.path.join(tmp_base, 'chrome-cache'), exist_ok=True)
        os.makedirs(os.path.join(tmp_base, 'chrome-data'), exist_ok=True)
        # Chrome writes minidumps here; create it so --crash-dumps-dir
        # below points at an existing directory.
        os.makedirs(os.path.join(tmp_base, 'chrome-dumps'), exist_ok=True)
    except Exception as e:
        logger.warning(f"Could not create Chrome temp directories: {str(e)}")

    options.add_argument(f"--user-data-dir={user_data_dir}")
    options.add_argument(f"--crash-dumps-dir={os.path.join(tmp_base, 'chrome-dumps')}")
    options.add_argument(f"--data-path={os.path.join(tmp_base, 'chrome-data')}")
    options.add_argument(f"--disk-cache-dir={os.path.join(tmp_base, 'chrome-cache')}")

    # Remote debugging only on non-Windows
    if system != "Windows":
        options.add_argument("--remote-debugging-port=9222")
        options.add_argument("--remote-debugging-address=0.0.0.0")

    # Method 1: Check environment variable
    chrome_path = os.environ.get('CHROME_BIN') or os.environ.get('GOOGLE_CHROME_BIN')
    if chrome_path:
        logger.info(f"[Chrome Detection] Found via environment variable: {chrome_path}")

    # Method 2: Check if chrome is in PATH
    if not chrome_path:
        for cmd in ['google-chrome', 'chromium-browser', 'chromium', 'chrome']:
            found = shutil.which(cmd)
            if found:
                chrome_path = found
                logger.info(f"[Chrome Detection] Found in PATH using 'which {cmd}': {chrome_path}")
                break

    # Method 3: Check common installation paths
    if not chrome_path:
        system = platform.system()
        logger.debug(f"[Chrome Detection] Checking common paths for {system}...")

        if system == "Windows":
            possible_paths = [
                r"C:\Program Files\Google\Chrome\Application\chrome.exe",
                r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"
            ]
        elif system == "Linux":
            possible_paths = [
                "/usr/bin/google-chrome",        # Chrome first (main)
                "/usr/bin/google-chrome-stable", 
                "/usr/bin/chromium-browser",     # Chromium fallback
                "/usr/bin/chromium",
                "/opt/google/chrome/chrome",
                "/snap/bin/chromium"
            ]
        elif system == "Darwin":  # macOS
            possible_paths = [
                "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
                "/Applications/Chromium.app/Contents/MacOS/Chromium"
            ]
        else:
            possible_paths = []

        for path in possible_paths:
            if os.path.exists(path):
                chrome_path = path
                logger.debug(f"[Chrome Detection] Found at path: {chrome_path}")
                break

        if not chrome_path:
            logger.warning("[Chrome Detection] Chrome not found in common paths")

    # Verify Chrome is executable and accessible
    if chrome_path:
        logger.debug(f"[Chrome Verification] Verifying Chrome at: {chrome_path}")
        try:
            # Check if file exists
            if os.path.exists(chrome_path):
                logger.debug(f"[Chrome Verification] ✓ File exists at {chrome_path}")
                # Try to get version
                result = subprocess.run([chrome_path, "--version"], capture_output=True, text=True, timeout=5)
                if result.returncode == 0:
                    logger.debug(f"[Chrome Verification] ✓ Chrome is executable: {result.stdout.strip()}")
                else:
                    logger.warning(f"[Chrome Verification] ⚠ Chrome returned error: {result.stderr}")
            else:
                logger.error(f"[Chrome Verification] ✗ File not found: {chrome_path}")
                chrome_path = None
        except subprocess.TimeoutExpired:
            logger.warning("[Chrome Verification] ⚠ Chrome --version timed out")
        except Exception as e:
            logger.warning(f"[Chrome Verification] ⚠ Could not verify Chrome: {str(e)}")

    # Set the binary location if found
    if chrome_path:
        options.binary_location = chrome_path
        logger.debug(f"[Chrome Init] Using Chrome binary: {chrome_path}")
    else:
        logger.warning("[Chrome Init] Chrome binary not found. Selenium will attempt auto-detection.")

    try:
        logger.debug("[Chrome Init] Installing/verifying ChromeDriver with ChromeDriverManager...")
        chromedriver_path = ChromeDriverManager().install()
        logger.debug(f"[Chrome Init] ChromeDriver ready at: {chromedriver_path}")

        service = Service(chromedriver_path)
        logger.debug("[Chrome Init] Creating Selenium WebDriver...")
        driver = webdriver.Chrome(service=service, options=options)
        logger.info("[Chrome Init] ✓ WebDriver initialized successfully")
        # Store the user data directory for cleanup
        driver._user_data_dir = user_data_dir
        return driver

    except Exception as e:
        logger.error(f"[Chrome Init] ✗ Failed to initialize WebDriver: {str(e)}", exc_info=True)
        # Cleanup temp directory if initialization failed
        try:
            if 'user_data_dir' in locals() and os.path.exists(user_data_dir):
                shutil.rmtree(user_data_dir, ignore_errors=True)
                logger.info(f"[Chrome Cleanup] Removed temp directory: {user_data_dir}")
        except Exception:
            pass
        raise



def cleanup_chrome_session(driver):
    """Remove the per-session temp directory left behind by a Chrome driver.

    The directory is the one attached as `driver._user_data_dir` during
    initialization; drivers without that attribute are left untouched.
    """
    try:
        session_dir = getattr(driver, '_user_data_dir', None)
        if session_dir and os.path.exists(session_dir):
            shutil.rmtree(session_dir, ignore_errors=True)
            logger.info(f"[Chrome Cleanup] Removed session directory: {session_dir}")
    except Exception as e:
        logger.warning(f"[Chrome Cleanup] Failed to cleanup: {str(e)}")



def dismiss_overlays(driver):
    """Best-effort dismissal of cookie banners, modals and pop-ups.

    Clicks every visible element matching a list of common consent/close
    selectors, then sends ESC to the focused element as a final nudge.
    All failures are silently ignored.
    """
    overlay_selectors = (
        "button.cookie-accept",
        "button#cookie-accept",
        "button.accept",
        "button.btn-accept",
        "button[aria-label='Close']",
        "button.close",
        ".modal button.close",
        ".popup button.close",
        ".cookie-consent button",
        "div#cookie-consent button",
        "button[title='Close']",
    )
    for selector in overlay_selectors:
        try:
            for candidate in driver.find_elements(By.CSS_SELECTOR, selector):
                if candidate.is_displayed():
                    candidate.click()
                    time.sleep(0.3)
        except Exception:
            pass

    # Last resort: ESC closes many focus-trapping modals.
    try:
        driver.switch_to.active_element.send_keys(Keys.ESCAPE)
    except Exception:
        pass


def find_and_search(driver, search_query: str, vendor_name: str = "") -> bool:
    """Find and interact with search input on vendor websites.

    First clicks any visible "search toggle" icon (many sites hide the
    input behind a magnifying-glass button), then walks a cascade of
    XPath/CSS selectors to find a visible, enabled search input, types
    the query, and tries several submission methods (Enter, form submit,
    clicking a search button).

    Args:
        driver: Selenium WebDriver already on the vendor's site.
        search_query: Text to type into the search box.
        vendor_name: Vendor label, currently unused (kept for interface
            compatibility with callers).

    Returns:
        True when a search was typed and submitted, False otherwise.
    """

    # ── Step 0: Try to reveal hidden search inputs ───────────────────────────
    # Many sites (e.g. Parafarmacia El Cid) hide the search input behind a
    # magnifying-glass icon. Click the toggle first so the input appears in DOM.
    search_toggle_selectors = [
        "button[aria-label*='search' i]",
        "button[aria-label*='buscar' i]",
        "a[aria-label*='search' i]",
        "a[aria-label*='buscar' i]",
        ".search-toggle",
        ".search-icon",
        ".icon-search",
        "[class*='search-toggle']",
        "[class*='search-icon']",
        "[class*='searchToggle']",
        "button.search",
        "span.search",
        # SVG search icons wrapped in buttons/links
        "button svg[class*='search' i]",
        "a svg[class*='search' i]",
    ]
    for toggle_sel in search_toggle_selectors:
        try:
            toggles = driver.find_elements(By.CSS_SELECTOR, toggle_sel)
            for toggle in toggles:
                if toggle.is_displayed():
                    toggle.click()
                    human_delay(0.5, 1.0)
                    logger.debug(f"[Search] Clicked search toggle: {toggle_sel}")
                    break
        except Exception:
            continue

    search_selectors = [
        ("xpath", "//input[@type='search']", "search input type"),
        ("xpath", "//input[@placeholder[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'search')]]", "search placeholder XPath"),
        ("xpath", "//input[@name='search' or @name='q' or @name='keyword' or @name='searchInput']", "named search input"),
        ("css", "input[type='search']", "CSS search input"),
        ("css", "[placeholder*='search'], [placeholder*='Search']", "CSS search placeholder"),
        ("css", "input[placeholder*='buscar'], input[placeholder*='Buscar']", "Spanish buscar"),
        ("xpath", "//form//input[1]", "first form input"),
        ("xpath", "//input[@class[contains(., 'search')]]", "search class input"),
        ("css", "input.search, input.search-box, input.searchbox", "common search classes"),
        ("css", "input[name='search_query']", "search_query name"),
        ("css", "#search_query_top", "search_query_top id"),
        ("css", "input.search_query", "search_query class"),
        ("css", "form input[type='text']", "generic form text input"),
    ]

    logger.info("[Search] Attempting to locate search bar...")
    for selector_type, selector, description in search_selectors:
        try:
            if selector_type == "xpath":
                elements = WebDriverWait(driver, 3).until(
                    EC.presence_of_all_elements_located((By.XPATH, selector))
                )
            else:
                elements = WebDriverWait(driver, 3).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
                )

            # Find a visible + enabled element
            element = None
            for el in elements:
                if el.is_displayed() and el.is_enabled():
                    element = el
                    break

            if not element:
                continue

            logger.info(f"[Search] ✓ Search bar FOUND using: {description}")

            # Click and focus
            try:
                element.click()
                human_delay(0.3, 0.6)
            except Exception:
                pass

            # Clear existing text
            try:
                element.clear()
            except Exception:
                pass

            # Type search query
            element.send_keys(search_query)
            human_delay(0.8, 1.2)

            logger.info(f"[Search] Typed query: '{search_query}'")

            # Try multiple submit methods
            submit_success = False

            # Method 1: Press Enter
            try:
                element.send_keys(Keys.RETURN)
                submit_success = True
            except Exception:
                pass

            # Method 2: Form submission
            if not submit_success:
                try:
                    element.submit()
                    submit_success = True
                except Exception:
                    pass

            # Method 3: Find and click search button
            if not submit_success:
                button_selectors = [
                    ("//button[contains(., 'Search')]", "Search button text"),
                    ("//button[contains(., 'Buscar')]", "Buscar button text"),
                    ("//button[@type='submit']", "submit button"),
                    ("//input[@type='submit']", "submit input"),
                    ("//button[contains(@class, 'search')]", "search class button"),
                ]
                for btn_selector, btn_desc in button_selectors:
                    try:
                        buttons = driver.find_elements(By.XPATH, btn_selector)
                        if buttons and buttons[0].is_displayed():
                            buttons[0].click()
                            logger.info(f"[Search] Submitted with {btn_desc}")
                            submit_success = True
                            break
                    except Exception:
                        continue

            if submit_success:
                human_delay(2, 3)
                logger.info("[Search] ✓ Search SUBMITTED successfully")
                return True
            else:
                logger.warning("[Search] ✗ FAILED to submit search")
                continue

        except Exception:
            continue

    # Fix: the original logged the loop-leaked `description` variable here,
    # misleadingly reporting only the *last* selector as "failed".
    logger.warning("[Search] ✗ No usable search input found with any selector")
    return False

def extract_price_from_html(page_source: str) -> Optional[float]:
    """Extract a product price from raw HTML using layered strategies.

    Strategy order:
      1. JSON-LD structured data (schema.org Offer objects).
      2. Microdata `itemprop="price"` attribute.
      3. Fallback: median of all plausible decimal numbers in the text.

    Only prices inside the sanity range [1.0, 5000.0] are accepted.
    Fix: an out-of-range JSON-LD price no longer short-circuits the
    remaining strategies nor gets returned (the original stored it in
    `scraped_price` and returned it despite failing the range check).

    Args:
        page_source: Full HTML of the page.

    Returns:
        The extracted price, or None when nothing plausible is found.
    """
    soup = BeautifulSoup(page_source, "html.parser")

    # Defined once here instead of being re-created for every <script> tag.
    def _find_offer_price(obj):
        """Recursively search a JSON-LD structure for an Offer price."""
        if isinstance(obj, dict):
            if obj.get("@type") == "Offer" and "price" in obj:
                try:
                    return float(obj["price"])
                except (ValueError, TypeError):
                    pass
            if "offers" in obj:
                offers = obj["offers"]
                if isinstance(offers, dict) and "price" in offers:
                    try:
                        return float(offers["price"])
                    except (ValueError, TypeError):
                        pass
            for value in obj.values():
                result = _find_offer_price(value)
                if result:
                    return result
        elif isinstance(obj, list):
            for item in obj:
                result = _find_offer_price(item)
                if result:
                    return result
        return None

    # Strategy 1: JSON-LD Structured Data
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            if not script.string:
                continue
            data = json.loads(script.string)
            candidate = _find_offer_price(data)
            if candidate and 1.0 <= candidate <= 5000.0:
                return candidate
        except Exception:
            # Malformed JSON-LD is common in the wild; try the next script.
            continue

    # Strategy 2: itemprop="price"
    price_tag = soup.find(attrs={"itemprop": "price"})
    if price_tag:
        content = price_tag.get("content") or price_tag.text
        normalized = normalize_price(content)
        if normalized and 1.0 <= normalized <= 5000.0:
            return normalized

    # Strategy 3: Intelligent fallback (median of all numeric values)
    text = soup.get_text(" ")
    matches = re.findall(r"\d+[.,]\d{2}", text)

    prices = []
    for m in matches:
        normalized = normalize_price(m)
        if normalized and 1.0 <= normalized <= 5000.0:
            prices.append(normalized)

    if prices:
        prices = sorted(set(prices))
        return prices[len(prices) // 2]

    return None




def search_google(driver, query: str, max_retries: int = 3) -> bool:
    """Search on Google with retry logic and exponential backoff.

    Locates the Google search box via a cascade of selectors, types the
    query, and tries five submission methods in order: Enter, Ctrl+Enter,
    clicking a search button, form.submit(), and a JavaScript form
    submit. The whole sequence is retried with exponential backoff.

    Fix: all bare `except:` clauses narrowed to `except Exception:` so
    KeyboardInterrupt/SystemExit are no longer swallowed.

    Args:
        driver: Selenium WebDriver already on a Google page.
        query: Search query text.
        max_retries: Number of full attempts before giving up.

    Returns:
        True when the search was submitted, False otherwise.
    """
    for attempt in range(max_retries):
        try:
            # Extended Google search box selectors with more variations
            search_selectors = [
                # Standard Google selectors
                (By.NAME, "q"),
                (By.CSS_SELECTOR, "textarea[name='q']"),
                (By.CSS_SELECTOR, "input[name='q']"),
                (By.XPATH, "//textarea[@name='q']"),
                (By.XPATH, "//input[@name='q']"),
                # Alternative selectors
                (By.CSS_SELECTOR, "[aria-label='Search']"),
                (By.CSS_SELECTOR, "input[aria-label='Search']"),
                (By.XPATH, "//*[@aria-label='Search']"),
                (By.CSS_SELECTOR, "[role='searchbox']"),
                (By.XPATH, "//*[@role='combobox' and @aria-owns]"),
                # Fallback to any visible input in the body
                (By.CSS_SELECTOR, "body input[type='text']"),
            ]

            search_box = None
            for selector_type, selector_value in search_selectors:
                try:
                    elements = driver.find_elements(selector_type, selector_value)
                    for element in elements:
                        try:
                            if element.is_displayed():
                                search_box = element
                                logger.info(f"[Discovery] Found search box using {selector_type}: {selector_value}")
                                break
                        except Exception:
                            continue
                    if search_box:
                        break
                except Exception:
                    continue

            if not search_box:
                logger.warning("[Discovery] Could not find Google search box with any selector")
                logger.debug(f"[Discovery] Attempted {len(search_selectors)} selector combinations")
                return False

            # Click on the search box first to focus it
            try:
                search_box.click()
                human_delay(0.3, 0.7)
            except Exception:
                pass

            # Clear and type the query
            try:
                search_box.clear()
            except Exception:
                # If clear fails, just proceed
                pass

            search_box.send_keys(query)
            logger.info(f"[Discovery] Typed query: {query}")
            human_delay(0.8, 1.5)

            # Try multiple submit methods
            submit_success = False

            # Method 1: Press Enter
            try:
                search_box.send_keys(Keys.RETURN)
                logger.info("[Discovery] Pressed Enter")
                submit_success = True
            except Exception:
                logger.debug("[Discovery] Enter key failed, trying alternate methods")

            # Method 2: Try Ctrl+Enter (alternative for some sites)
            if not submit_success:
                try:
                    search_box.send_keys(Keys.CONTROL, Keys.RETURN)
                    logger.info("[Discovery] Pressed Ctrl+Enter")
                    submit_success = True
                except Exception:
                    pass

            # Method 3: Try to find and click search button (multiple selectors)
            if not submit_success:
                button_selectors = [
                    "//button[@aria-label='Google Search']",
                    "//button[@aria-label='google search']",
                    "//button[contains(@aria-label, 'Search')]",
                    "//button[contains(text(), 'Search')]",
                    "//input[@type='submit' and @value='Google Search']",
                    "//input[@type='submit']",
                    "//button[@type='submit']",
                    "//button[contains(@class, 'search')]",
                    "//button[contains(@class, 'btn') and contains(@class, 'search')]",
                    "//button[@jsaction]",
                    "//button[contains(@data-ved, '0ahUKE')]",  # Common Google button attribute
                    "//div[@role='button' and contains(., 'Search')]",
                    "//span[contains(text(), 'Search')]/parent::button",
                    "//span[contains(text(), 'Buscar')]/parent::button",  # Spanish for Search
                ]
                for btn_selector in button_selectors:
                    try:
                        buttons = driver.find_elements(By.XPATH, btn_selector)
                        if buttons and buttons[0].is_displayed():
                            buttons[0].click()
                            logger.info(f"[Discovery] Clicked search button using selector: {btn_selector}")
                            submit_success = True
                            break
                    except Exception:
                        continue

            # Method 4: Try form submission
            if not submit_success:
                try:
                    search_box.submit()
                    logger.info("[Discovery] Submitted form")
                    submit_success = True
                except Exception:
                    pass

            # Method 5: JavaScript form submission as last resort
            if not submit_success:
                try:
                    # Find the form containing the search box
                    form = search_box.find_element(By.XPATH, "ancestor::form")
                    if form:
                        driver.execute_script("arguments[0].submit();", form)
                        logger.info("[Discovery] Submitted form via JavaScript")
                        submit_success = True
                except Exception:
                    pass

            if not submit_success:
                logger.warning("[Discovery] Could not submit search using any method")
                if attempt < max_retries - 1:
                    exponential_backoff_delay(attempt)
                    continue
                return False

            human_delay(3, 5)
            return True

        except Exception as e:
            logger.error(f"[Discovery] Error during Google search (attempt {attempt + 1}/{max_retries}): {str(e)}")
            if attempt < max_retries - 1:
                exponential_backoff_delay(attempt)
            else:
                return False

    return False


def _shutdown_discovery_driver(driver) -> None:
    """Best-effort teardown of a discovery Selenium session; never raises."""
    if driver:
        try:
            cleanup_chrome_session(driver)
            driver.quit()
        except Exception:
            pass


def _scrape_discovered_links(driver, links: List[str], msp: float, tag: str, via: str, err_verb: str) -> List[Dict]:
    """
    Load each candidate URL, try to extract a price, and build vendor dicts.

    Shared by the Google and Tavily discovery strategies (their loops were
    previously duplicated). Log-message fragments are passed in so the output
    stays identical to the original per-strategy wording.

    Args:
        driver  : Live Selenium WebDriver used to load each page.
        links   : Candidate product-page URLs.
        msp     : Minimum Selling Price used for the compliance comparison.
        tag     : Log tag for the per-link progress line ("Discovery" / "Discovery-Tavily").
        via     : Suffix inserted in the success log line ("" or " via Tavily").
        err_verb: Verb used in the per-link error log line ("Error" / "Tavily error").

    Returns:
        One vendor dict per link where a price could be extracted.
    """
    found: List[Dict] = []
    total = len(links)
    for idx, link in enumerate(links, 1):
        try:
            logger.info(f"[{tag} {idx}/{total}] Scraping: {link[:100]}")
            driver.get(link)

            # Wait for the DOM to finish loading; a timeout here is non-fatal.
            try:
                WebDriverWait(driver, 15).until(
                    lambda d: d.execute_script("return document.readyState") == "complete"
                )
            except Exception:
                pass

            human_delay(1.5, 2.5)

            disc_price = extract_price_from_html(driver.page_source)
            if disc_price:
                domain = get_domain_from_url(link)
                diff, perc_diff = calculate_price_difference(msp, disc_price)
                compliance = determine_compliance_status(msp, disc_price)
                found.append({
                    "vendor_id": None,
                    "vendor_name": domain,
                    "vendor_url": link,
                    "scraped_price": disc_price,
                    "msp": msp,
                    "status": compliance,
                    "price_difference": diff,
                    "percentage_difference": perc_diff,
                })
                logger.info(f"[Discovery] ✓ Price {disc_price} found{via} at {domain}")
            else:
                logger.debug(f"[Discovery] No price found at {link}")

        except Exception as e:
            logger.warning(f"[Discovery] {err_verb} scraping {link}: {str(e)}")

        human_delay(2, 4)  # Longer delays between vendor scrapes
    return found


def discover_alternative_vendors(barcode: str, product_name: str, msp: float, registered_domains: set) -> List[Dict]:
    """
    Discover alternative vendors via search (Google/Bing) and scrape their prices.
    Uses multiple fallback strategies when Google blocks the request.

    Args:
        barcode           : Product barcode, used as the leading search term.
        product_name      : Product name (truncated to 60 chars in the query).
        msp               : Minimum Selling Price threshold for compliance checks.
        registered_domains: Domains of already-registered vendors to exclude.

    Returns:
        List of vendor dicts (vendor_name, vendor_url, scraped_price, status,
        price_difference, percentage_difference, ...).
    """
    discovered_vendors: List[Dict] = []
    search_query = f"{barcode} {product_name[:60]} precio comprar online"
    logger.info(f"[Discovery] Starting search with query: {search_query}")
    logger.info(f"[Discovery] Registered domains to exclude: {registered_domains}")

    # BUGFIX: initialize before the try block. Previously this flag was first
    # assigned only after the CAPTCHA check inside the try; if driver setup or
    # navigation raised earlier, the Strategy-2 condition below crashed with
    # NameError — exactly on the path meant to handle Google failures.
    google_blocked = False

    # Strategy 1: Try Google first with proper delays
    logger.info("[Discovery] Strategy 1: Attempting Google Search...")
    driver = None
    try:
        driver = initialize_selenium_driver(use_proxy=False)  # Start without proxy

        # Navigate to Google - try specific locale first (Spain), then fallback to generic
        google_urls = [
            "https://www.google.es",  # Spain (for Spanish products)
            "https://www.google.com", # Fallback
        ]

        for google_url in google_urls:
            try:
                logger.info(f"[Discovery] Loading {google_url}...")
                driver.get(google_url)
                human_delay(2, 3.5)  # Longer initial delay

                # Heuristic "page loaded" check: title mentions Google or the
                # page body is non-trivial in size.
                if "Google" in driver.title or len(driver.page_source) > 1000:
                    logger.info(f"[Discovery] ✓ Loaded {google_url}")
                    # Human-like behavior: scroll down a bit, then back up.
                    try:
                        driver.execute_script("window.scrollTo(0, window.innerHeight / 4);")
                        human_delay(0.5, 1.0)
                        driver.execute_script("window.scrollTo(0, 0);")
                        human_delay(0.5, 1.0)
                    except Exception:
                        pass
                    break
            except Exception as e:
                logger.warning(f"[Discovery] Could not load {google_url}: {str(e)}")
                continue

        # Try to dismiss cookie consent (non-critical; all failures ignored).
        try:
            dismiss_buttons = [
                "//button[contains(., 'Aceptar')]",
                "//button[contains(., 'Accept')]",
                "//button[contains(., 'all')]",
                "//button[@jsaction]",
            ]
            for selector in dismiss_buttons:
                try:
                    buttons = driver.find_elements(By.XPATH, selector)
                    for btn in buttons:
                        try:
                            if btn.is_displayed() and btn.is_enabled():
                                btn.click()
                                logger.debug("[Discovery] Dismissed cookie consent")
                                human_delay(0.5, 1.0)
                                break
                        except Exception:
                            pass
                except Exception:
                    pass
        except Exception as e:
            logger.debug(f"[Discovery] Cookie dismissal skipped: {str(e)}")

        # Search on Google with retries
        logger.info(f"[Discovery] Attempting Google search for: {search_query}")
        google_success = search_google(driver, search_query, max_retries=2)

        # Detect CAPTCHA / block pages via the landing URL and page body.
        current_url = driver.current_url.lower()
        page_source = driver.page_source.lower()

        captcha_indicators = [
            "sorry", "recaptcha", "robot", "automated", "blocked", "unusual traffic",
            "verify", "captcha", "puzzle", "challenge"
        ]

        google_blocked = any(indicator in current_url or indicator in page_source for indicator in captcha_indicators)

        if google_blocked:
            logger.warning("[Discovery] ⚠ CAPTCHA or block detected on Google")
            google_success = False

        # Parse results only if the search went through and was not blocked.
        if google_success and not google_blocked:
            try:
                soup = BeautifulSoup(driver.page_source, "html.parser")

                # Extract outbound search-result links, skipping Google's own
                # URLs, ad/redirect links and cached copies.
                result_links = []
                all_links = soup.find_all("a", href=True)
                logger.info(f"[Discovery] Found {len(all_links)} total links on page")

                for link in all_links:
                    href = link.get("href", "")
                    if href.startswith("http") and "google" not in href.lower() and not any(x in href for x in ["/ads/", "/aclk", "webcache"]):
                        try:
                            domain = urlparse(href).netloc.replace("www.", "")
                            if domain and domain not in registered_domains:
                                result_links.append(href)
                                logger.debug(f"[Discovery] Found link: {domain}")
                        except Exception as e:
                            logger.debug(f"[Discovery] Error parsing link: {str(e)}")

                # Deduplicate and cap at 5 candidate pages.
                # NOTE: set() discards the search-result ranking order.
                result_links = list(set(result_links))[:5]
                logger.info(f"[Discovery] Will scrape {len(result_links)} unique vendor links from Google")

                discovered_vendors.extend(
                    _scrape_discovered_links(driver, result_links, msp, "Discovery", "", "Error")
                )

            except Exception as e:
                logger.error(f"[Discovery] Error parsing Google results: {str(e)}")

    except Exception as e:
        logger.error(f"[Discovery] Google strategy error: {str(e)}", exc_info=True)
    finally:
        _shutdown_discovery_driver(driver)

    # Strategy 2: Fallback to Tavily API if Google fails (no browser needed, pure API)
    if not discovered_vendors or google_blocked:
        logger.info("[Discovery] Strategy 2: Google failed/blocked, attempting Tavily API Search...")
        api_links = search_tavily_api(search_query)

        if api_links:
            driver = None
            try:
                driver = initialize_selenium_driver(use_proxy=False)
                discovered_vendors.extend(
                    _scrape_discovered_links(driver, api_links, msp, "Discovery-Tavily", " via Tavily", "Tavily error")
                )
            except Exception as e:
                logger.warning(f"[Discovery] Tavily scraping error: {str(e)}")
            finally:
                _shutdown_discovery_driver(driver)
        else:
            logger.warning("[Discovery] Tavily API returned no results")

    logger.info(f"[Discovery] ✓ Completed all strategies. Found {len(discovered_vendors)} alternative vendors")
    return discovered_vendors


def discover_alternative_vendors_serp(barcode: str, product_name: str, msp: float, registered_domains: set) -> List[Dict]:
    """
    Discover alternative vendors using SERP API Google Shopping Light Engine.
    Queries SERP API for shopping results and extracts vendor information with prices.
    Sequence: First vendors scraped, then SERP API used for discovery.

    Args:
        barcode: Product barcode
        product_name: Product name
        msp: Minimum Selling Price
        registered_domains: Set of already registered vendor domains/names to exclude

    Returns:
        List of discovered vendor dictionaries with price information
    """
    discovered_vendors: List[Dict] = []

    def _registered_match(name: str) -> Optional[str]:
        """Return the registered domain matching `name`, or None.

        Uses a case-insensitive substring test in either direction
        (e.g. vendor "amazon" matches registered domain "amazon.com").
        Exact equality is a special case of this test, so this single check
        replaces the previously duplicated exact-membership + substring
        filtering passes.
        """
        lowered = name.lower()
        for domain in registered_domains:
            if lowered in domain.lower() or domain.lower() in lowered:
                return domain
        return None

    # Banner logging for this discovery run.
    logger.info(f"\n{'='*80}")
    logger.info(f"[Discovery SERP] 🔍 STARTING SERP DISCOVERY")
    logger.info(f"[Discovery SERP] Product: {barcode} - {product_name}")
    logger.info(f"[Discovery SERP] Registered domains to exclude: {registered_domains}")
    logger.info(f"[Discovery SERP] Number of registered domains: {len(registered_domains)}")
    logger.info(f"{'='*80}\n")

    try:
        # Use SERP API Google Shopping Light to search for shopping results
        # Pass product_name and barcode directly (not a pre-built query)
        results = search_serp_api(product_name=product_name, barcode=barcode, msp=msp)

        logger.info(f"[Discovery SERP] ✓ search_serp_api() returned {len(results)} total results to process")
        if results:
            logger.info(f"[Discovery SERP] Sample vendor names from SERP:")
            for idx, result in enumerate(results[:3], 1):
                vendor = result.get("vendor_name", "Unknown")
                price = result.get("scraped_price", "N/A")
                logger.info(f"  [{idx}] {vendor} - Price: {price}")

        if not results:
            logger.warning("[Discovery SERP] SERP API returned no results")
            return discovered_vendors

        logger.info(f"[Discovery SERP] Processing {len(results)} SERP API results\n")

        # Process each SERP result; per-result failures are logged and skipped.
        for idx, result in enumerate(results, 1):
            try:
                vendor_url = result.get("vendor_url", "")
                vendor_name = result.get("vendor_name", "Unknown")
                product_title = result.get("product_title", "")
                scraped_price = result.get("scraped_price")
                source_type = result.get("source", "serp_api_shopping_light")
                status = result.get("status", "unknown")

                if not vendor_url:
                    logger.debug(f"[Discovery SERP] Skipping result {idx} ({vendor_name}): no URL")
                    continue

                # SERP product links are Google Shopping redirects, so the
                # registered-vendor filter compares vendor NAMES (which is
                # what registered_domains holds here), not URLs.
                matching_domain = _registered_match(vendor_name)
                if matching_domain is not None:
                    logger.info(f"[Discovery SERP] ⊘ Result {idx}: '{vendor_name}' - FILTERED")
                    logger.info(f"           Reason: Substring match with registered domain '{matching_domain}'")
                    continue

                logger.info(f"[Discovery SERP] ✓ Result {idx}: '{vendor_name}' PASSES")
                logger.info(f"           Reason: No match against {len(registered_domains)} registered domains: {registered_domains}")

                # Build discovery result
                discovery_dict = {
                    "vendor_name": vendor_name,
                    "vendor_url": vendor_url,
                    "product_title": product_title,
                    "scraped_price": scraped_price,
                    "msp": msp,
                    "source_type": source_type,
                    "status": status,
                    "compliance_status": status,
                    # NOTE(review): naive local timestamp — confirm downstream
                    # consumers expect non-timezone-aware datetimes.
                    "violation_date": datetime.now(),
                }

                # Add price difference if a price exists (positive when the
                # scraped price undercuts the MSP).
                if scraped_price:
                    price_diff = msp - scraped_price
                    discovery_dict["price_difference"] = price_diff
                    discovery_dict["percentage_difference"] = (price_diff / msp * 100) if msp > 0 else 0

                    logger.info(
                        f"[Discovery SERP] Result {idx}: {vendor_name} "
                        f"- URL: {vendor_url[:60]} "
                        f"- Price: ₹{scraped_price:.2f} "
                        f"- Status: {status.upper()}"
                    )
                else:
                    logger.info(
                        f"[Discovery SERP] Result {idx}: {vendor_name} "
                        f"- URL: {vendor_url[:60]} "
                        f"- Price: N/A "
                        f"- Status: {status.upper()}"
                    )

                discovered_vendors.append(discovery_dict)

            except Exception as e:
                logger.warning(f"[Discovery SERP] Error processing result {idx}: {str(e)}")
                continue

        logger.info(f"[Discovery SERP] ✓ Discovered {len(discovered_vendors)} alternative vendors via SERP API")

        # Summary statistics. NOTE: "filtered" here also counts results skipped
        # for a missing URL or a per-result processing error.
        filtered_count = len(results) - len(discovered_vendors)
        logger.info(f"\n[Discovery SERP] {'='*80}")
        logger.info(f"[Discovery SERP] FILTERING SUMMARY:")
        logger.info(f"[Discovery SERP]   Total SERP results received: {len(results)}")
        logger.info(f"[Discovery SERP]   ✓ Passed filters (new vendors): {len(discovered_vendors)}")
        logger.info(f"[Discovery SERP]   ⊘ Filtered (registered vendors): {filtered_count}")
        logger.info(f"[Discovery SERP] {'='*80}\n")

    except Exception as e:
        logger.error(f"[Discovery SERP] Critical error in SERP discovery: {str(e)}")

    return discovered_vendors


def scrape_vendor_website(
    product,
    vendor,
    driver,
    search_query: str,
    pack_msp: float,
    timeout_seconds: int = 15
) -> Tuple[Optional[float], Optional[str]]:
    """
    Scrape the price for ONE pack variant from ONE vendor.

    Args:
        product      : Product ORM object.
        vendor       : Vendor ORM object.
        driver       : Selenium WebDriver instance.
        search_query : Full search string, e.g. "RESVERASOR PLUS Pack 2".
        pack_msp     : The MSP threshold for this specific pack count.
        timeout_seconds: Page load timeout.

    Returns:
        (scraped_price, source_url) or (None, None) if not found.

    Strategy for VENDOR_SEARCH_URL_TEMPLATES path:
      1. Navigate directly to the vendor search results URL.
      2. Check for no-results page — skip immediately if found.
      3. Score all <a> links with smart_link_score (core words + pack number).
      4. Navigate to the best-scoring product page.
      5. Extract price from the product page using layered strategies:
           a. extract_price_near_product (scoped itemprop → max itemprop → DOM walk)
           b. Scoped CSS selectors targeting main product container
           c. Max of all itemprop price elements (handles bundle pages)
           d. Broader fallback CSS selectors
           e. extract_price_from_html (JSON-LD → itemprop → median)
    """
    core_words, pack_number = decompose_search_query(search_query)
    relaxed_min_score = max(2, len(core_words) * 2)

    try:
        logger.info(f"[Vendor] ===== {vendor.name} | query='{search_query}' | pack_msp={pack_msp} =====")

        driver.get(vendor.website_url)
        try:
            WebDriverWait(driver, timeout_seconds).until(
                lambda d: d.execute_script("return document.readyState") == "complete"
            )
        except:
            logger.warning(f"[Vendor] Page load timeout after {timeout_seconds}s, continuing...")

        # ── Cloudflare Detection ──────────────────────────────────────────
        try:
            page_source = driver.page_source
            page_text = driver.execute_script("return document.body.innerText")
            text_len = len(page_text)
            cloudflare_detected = (
                text_len < 500 or
                "__cf_chl_tk" in page_source or
                "cf-challenge" in page_source or
                "Just a moment" in page_text
            )
        except:
            cloudflare_detected = False

        # ── ScrapingBee fallback ──────────────────────────────────────────
        if cloudflare_detected:
            logger.warning(f"[Vendor] ⚠️ Cloudflare on {vendor.name}. Switching to ScrapingBee...")
            html = scrape_with_scrapingbee_js_scenario(
                vendor.website_url, search_query, vendor_website=vendor.website_url
            )
            if html and len(html) > 500:
                price = extract_price_near_product(html, search_query)
                if not price:
                    price = extract_price_from_html(html)
                if price:
                    logger.info(f"[Vendor] ✓ ScrapingBee price: {price} from {vendor.name}")
                    source_url = vendor.website_url
                    try:
                        soup_sb = BeautifulSoup(html, "html.parser")
                        canonical = soup_sb.find("link", rel="canonical")
                        if canonical and canonical.get("href", "").startswith("http"):
                            source_url = canonical["href"]
                        else:
                            og_url = soup_sb.find("meta", property="og:url")
                            if og_url and og_url.get("content", "").startswith("http"):
                                source_url = og_url["content"]
                        vendor_website_normalized = normalize_vendor_website_for_lookup(vendor.website_url)
                        if vendor_website_normalized in VENDOR_SEARCH_URL_TEMPLATES:
                            parsed_resolved = urlparse(source_url)
                            parsed_homepage = urlparse(vendor.website_url)
                            is_homepage = (
                                parsed_resolved.netloc == parsed_homepage.netloc and
                                parsed_resolved.path.strip("/") == ""
                            )
                            if is_homepage:
                                source_url = VENDOR_SEARCH_URL_TEMPLATES[vendor_website_normalized].format(
                                    query=quote_plus(search_query)
                                )
                    except Exception as url_err:
                        logger.debug(f"[Vendor] Could not resolve ScrapingBee source URL: {url_err}")
                    return price, source_url
            return None, None

        # ── NORMAL SELENIUM FLOW ──────────────────────────────────────────
        vendor_website_normalized = normalize_vendor_website_for_lookup(vendor.website_url)

        if vendor_website_normalized in VENDOR_SEARCH_URL_TEMPLATES:
            # ── VENDOR_SEARCH_URL_TEMPLATES PATH ─────────────────────────
            search_url = VENDOR_SEARCH_URL_TEMPLATES[vendor_website_normalized].format(
                query=quote_plus(search_query)
            )
            logger.info(f"[Vendor] Direct search URL: {search_url}")
            driver.get(search_url)
            try:
                WebDriverWait(driver, timeout_seconds).until(
                    lambda d: d.execute_script("return document.readyState") == "complete"
                )
            except:
                pass
            human_delay(2, 3)

            # ── No-results detection ──────────────────────────────────────
            try:
                page_text = driver.execute_script("return document.body.innerText") or ""
                page_text_lower = page_text.lower()
                no_results_phrases = [
                    "no tenemos productos para",
                    "no results found",
                    "no se encontraron resultados",
                    "no encontramos resultados",
                    "0 resultados",
                    "sin resultados",
                    "your search returned no results",
                ]
                if any(p in page_text_lower for p in no_results_phrases):
                    logger.warning(f"[Vendor] No results for '{search_query}' on {vendor.name} — skipping")
                    return None, None
            except Exception as nr_err:
                logger.debug(f"[Vendor] No-results check failed: {nr_err}")

            # ── Wait for JS-rendered product cards ────────────────────────
            try:
                WebDriverWait(driver, 8).until(
                    lambda d: len(d.find_elements(By.CSS_SELECTOR,
                        'a[href*="product"], a[href*="producto"], [class*="product"] a, '
                        '[class*="result"] a, [data-product-id] a, [class*="km-items"] a, '
                        '[class*="card"] a, [class*="item"] a'
                    )) > 0
                )
                human_delay(1.5, 2.5)
            except:
                logger.debug(f"[Vendor] JS wait timed out, continuing...")

            # ── Scroll to trigger lazy-loaded cards ───────────────────────
            try:
                driver.execute_script("window.scrollBy(0, 400);")
                human_delay(1.0, 1.5)
                driver.execute_script("window.scrollTo(0, 0);")
                human_delay(0.5, 1.0)
            except:
                pass

            # ── Score links and navigate to best product page ─────────────
            page_source = driver.execute_script("return document.documentElement.outerHTML")
            soup_results = BeautifulSoup(page_source, "html.parser")

            best_url = None
            best_score = 0

            for a in soup_results.find_all("a", href=True):
                href = a.get("href", "")
                if not href or href.startswith("#") or href.startswith("javascript"):
                    continue
                img = a.find("img")
                combined_text = " ".join([
                    a.get_text(separator=" ", strip=True) or "",
                    a.get("title") or "",
                    (img.get("alt") or "") if img else "",
                ])
                score = smart_link_score(combined_text, href, core_words, pack_number)
                if score > best_score:
                    best_score = score
                    best_url = href if href.startswith("http") else                         __import__("urllib.parse", fromlist=["urljoin"]).urljoin(vendor.website_url, href)

            # If BS4 missed AJAX-rendered links, try live Selenium elements
            if best_score < relaxed_min_score:
                try:
                    live_links = driver.find_elements(By.TAG_NAME, "a")
                    for el in live_links:
                        href = el.get_attribute("href") or ""
                        text = el.text or ""
                        title = el.get_attribute("title") or ""
                        score = smart_link_score(text + " " + title, href, core_words, pack_number)
                        if score > best_score:
                            best_score = score
                            best_url = href
                except Exception as live_err:
                    logger.debug(f"[Vendor] Live link scoring failed: {live_err}")

            logger.info(f"[Vendor] Best link score={best_score} (need>={relaxed_min_score}): {str(best_url)[:80]}")

            if not best_url or best_score < relaxed_min_score:
                logger.warning(f"[Vendor] No matching product link for '{search_query}' on {vendor.name} — skipping")
                return None, None

            driver.get(best_url)
            human_delay(2, 3)
            logger.debug(f"[Vendor] Navigated to product page: {driver.current_url}")

        else:
            # ── NON-TEMPLATE PATH: homepage search box ────────────────────
            dismiss_overlays(driver)
            human_delay(0.5, 1.0)
            logger.info(f"[Vendor] Search attempt: '{search_query}'")
            search_success = find_and_search(driver, search_query, vendor.name)

            if not search_success:
                logger.warning(f"[Vendor] Search failed. Trying barcode...")
                search_success = find_and_search(driver, product.barcode, vendor.name)
                if not search_success:
                    return None, None

            human_delay(2, 3)

            page_source = driver.execute_script("return document.documentElement.outerHTML")
            soup_results = BeautifulSoup(page_source, "html.parser")
            best_url = None
            best_score = 0

            for a in soup_results.find_all("a", href=True):
                href = a.get("href", "")
                if not href or href.startswith("#") or href.startswith("javascript"):
                    continue
                img = a.find("img")
                combined_text = " ".join([
                    a.get_text(separator=" ", strip=True) or "",
                    a.get("title") or "",
                    (img.get("alt") or "") if img else "",
                ])
                score = smart_link_score(combined_text, href, core_words, pack_number)
                if score > best_score:
                    best_score = score
                    best_url = href if href.startswith("http") else                         __import__("urllib.parse", fromlist=["urljoin"]).urljoin(vendor.website_url, href)

            if best_score < relaxed_min_score:
                try:
                    live_links = driver.find_elements(By.TAG_NAME, "a")
                    for el in live_links:
                        href = el.get_attribute("href") or ""
                        text = el.text or ""
                        score = smart_link_score(text, href, core_words, pack_number)
                        if score > best_score:
                            best_score = score
                            best_url = href
                except Exception:
                    pass

            if best_url and best_score >= relaxed_min_score:
                logger.info(f"[Vendor] Navigating to product page (score={best_score}): {best_url[:80]}")
                driver.get(best_url)
                human_delay(2, 3)
            else:
                logger.warning(f"[Vendor] No matching link for '{search_query}' on {vendor.name} — skipping")
                return None, None

        # ── PRICE EXTRACTION ON PRODUCT PAGE ─────────────────────────────
        # The full layered extraction is done here after landing on the product page.
        # extract_price_near_product now uses scoped itemprop lookup and max-value
        # fallback so it correctly handles bundle pages (e.g. PrestaShop pack pages
        # that embed single-unit component prices at the bottom of the page).
        page_source = driver.execute_script("return document.documentElement.outerHTML")

        # Layer 1: extract_price_near_product
        #   -> scoped itemprop (main product container) → max itemprop → DOM walk
        scraped_price = extract_price_near_product(page_source, search_query)

        # Layer 2: Platform-specific Selenium CSS selectors (main product block only).
        # Targets the PrestaShop #our_price_display span and equivalent containers
        # on other platforms. Always reads the `content` attribute first (machine-
        # readable) before falling back to visible text — critical for PrestaShop
        # where the price is a SPAN with content="12.001", not a META tag.
        # Every candidate element is also checked via JS .closest() to ensure it
        # is NOT inside a blockpack / related-products noise section.
        if not scraped_price:
            MAIN_SCOPE_SELECTORS = [
                "#our_price_display",
                "#buy_block [itemprop='price']",
                ".content_prices [itemprop='price']",
                ".box-info-product [itemprop='price']",
                ".product-prices .current-price [itemprop='price']",
                ".product-prices [itemprop='price']",
                ".summary .price > .amount",
                ".summary .price .woocommerce-Price-amount",
                ".product-summary .price .amount",
                ".product-info-main [itemprop='price']",
                "[itemprop='offers'] [itemprop='price']",
                "#product-price",
            ]
            NOISE_ANCESTOR_CHECK = (
                "'#blockpack, .blockpack, #related-products, .related-products, "
                ".cross-sells, .up-sells, #crossselling, .crossselling'"
            )
            for sel in MAIN_SCOPE_SELECTORS:
                try:
                    els = driver.find_elements(By.CSS_SELECTOR, sel)
                    for el in els:
                        try:
                            in_noise = driver.execute_script(
                                f"return arguments[0].closest({NOISE_ANCESTOR_CHECK}) !== null;", el
                            )
                            if in_noise:
                                continue
                        except Exception:
                            pass
                        # Read content attribute first (machine-readable span/meta value)
                        raw = el.get_attribute("content") or el.text or ""
                        raw = raw.strip()
                        if not raw:
                            continue
                        normalized = normalize_price(raw)
                        if normalized and 1.0 <= normalized <= 500.0:
                            scraped_price = normalized
                            logger.info(f"[Vendor] ✓ Price from scoped selector '{sel}': €{scraped_price}")
                            break
                    if scraped_price:
                        break
                except Exception:
                    continue

        # Layer 3: All itemprop="price" elements — exclude noise sections, take MAX.
        # Reads content attribute OR text from ANY tag (span, meta, div) with
        # itemprop="price". Filters out elements inside blockpack/related sections
        # via JS closest(), then returns the maximum (bundle price > component price).
        if not scraped_price:
            try:
                all_price_els = driver.find_elements(By.CSS_SELECTOR, "[itemprop='price']")
                candidates = []
                for el in all_price_els:
                    try:
                        in_noise = driver.execute_script(
                            f"return arguments[0].closest({NOISE_ANCESTOR_CHECK}) !== null;", el
                        )
                        if in_noise:
                            continue
                    except Exception:
                        pass
                    raw = el.get_attribute("content") or el.text or ""
                    raw = raw.strip()
                    if not raw:
                        continue
                    normalized = normalize_price(raw)
                    if normalized and 1.0 <= normalized <= 500.0:
                        candidates.append(normalized)
                if candidates:
                    scraped_price = max(candidates)
                    logger.info(f"[Vendor] ✓ Price (max of {len(candidates)} noise-filtered itemprop elements): €{scraped_price}")
            except Exception:
                pass

        # Layer 4: Broader fallback CSS selectors
        if not scraped_price:
            FALLBACK_SELECTORS = [
                ".product-price .price",
                ".product__price .price",
                ".product-info .price",
                ".precio",
                ".product-price",
                "meta[property='product:price:amount']",
                "meta[name='twitter:data1']",
            ]
            for sel in FALLBACK_SELECTORS:
                try:
                    els = driver.find_elements(By.CSS_SELECTOR, sel)
                    for el in els:
                        raw = el.get_attribute("content") or el.text or ""
                        raw = raw.strip()
                        if not raw:
                            continue
                        normalized = normalize_price(raw)
                        if normalized and 1.0 <= normalized <= 500.0:
                            scraped_price = normalized
                            logger.info(f"[Vendor] ✓ Price from fallback selector '{sel}': €{scraped_price}")
                            break
                    if scraped_price:
                        break
                except Exception:
                    continue

        # Layer 5: Full HTML parse (JSON-LD → itemprop → median)
        if not scraped_price:
            scraped_price = extract_price_from_html(page_source)

        if scraped_price:
            logger.info(f"[Vendor] ✓✓✓ PRICE FOUND: {scraped_price} from {vendor.name}")
            return scraped_price, driver.current_url
        else:
            logger.warning(f"[Vendor] ✗ No price found from {vendor.name}")
            return None, None

    except Exception as e:
        logger.error(f"[Vendor] ❌ Error scraping {vendor.name}: {str(e)}")
        return None, None


class ScraperService:
    """Service for scraping product prices from vendors."""

    @staticmethod
    async def scrape_product(
        db: AsyncSession,
        product_id: int,
        enable_discovery: bool = False,
        headless_mode: bool = True,
        timeout_seconds: int = 15
    ) -> Dict:
        """
        Scrape a product against all registered vendors.

        The heavy Selenium work runs in a worker thread (`_run_scraping`) so the
        event loop is never blocked; this coroutine then optionally runs Google
        discovery, deduplicates results by (vendor, product, msp, price, packs),
        persists ScrapingResult/Violation rows, and commits once at the end.

        Args:
            db: Database session
            product_id: Product ID to scrape
            enable_discovery: If True, also discover vendors via Google (disabled by default)
            headless_mode: Run browser in headless mode
            timeout_seconds: Timeout for page load

        Returns:
            Dictionary with scraping results

        Raises:
            HTTPException: 404 if the product does not exist, 400 if there are
                no active vendors, 500 (after rollback) on any scraping failure.
        """
        # Fetch product
        result = await db.execute(select(Product).where(Product.id == product_id))
        product = result.scalars().first()
        
        if not product:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"Product with ID {product_id} not found"
            )

        # Fetch all active vendors
        vendor_result = await db.execute(select(Vendor).where(Vendor.is_active == True))
        vendors = vendor_result.scalars().all()
        
        if not vendors:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="No active vendors found"
            )

        results: List[Dict] = []
        violation_records: List[Dict] = []
        created_violations_set = set()  # Track violations actually added to database
        # NOTE(review): created_violations_set is populated below but never read
        # back or returned — it appears to be vestigial; confirm before removing.
        
        # Collect registered vendor domains for exclusion in discovery
        registered_domains = set()
        for vendor in vendors:
            domain = get_domain_from_url(vendor.website_url)
            registered_domains.add(domain)
        
        try:
            # Run scraping in thread pool to avoid blocking event loop
            # (Selenium is synchronous; asyncio.to_thread keeps the loop responsive).
            scraping_task = asyncio.to_thread(
                ScraperService._run_scraping,
                product,
                vendors,
                headless_mode,
                timeout_seconds
            )
            results, violation_records = await scraping_task
            
            # optionally perform google discovery
            if enable_discovery:
                logger.info(f"[Scraper] Google discovery ENABLED for product {product_id}")
                try:
                    # Build normalized URLs set from vendor results to check against
                    vendor_normalized_urls = set()
                    for result in results:
                        url = result.get("vendor_url", "")
                        normalized = normalize_url_for_comparison(url)
                        vendor_normalized_urls.add(normalized)
                    
                    logger.info(f"[Discovery] Starting with {len(vendor_normalized_urls)} vendor URLs to check against")
                    
                    # Keep discovering until we get enough unique URLs or hit max attempts
                    max_discovery_attempts = 3
                    all_discovered = []
                    attempt = 0
                    
                    while attempt < max_discovery_attempts:
                        attempt += 1
                        logger.info(f"[Discovery] Attempt {attempt}/{max_discovery_attempts}")
                        
                        # run in thread since selenium is synchronous
                        discovered = await asyncio.to_thread(
                            discover_alternative_vendors, 
                            product.barcode, 
                            product.product_name, 
                            float(product.msp), 
                            registered_domains
                        )
                        logger.info(f"[Discovery] Attempt {attempt} completed. Found {len(discovered)} alternative vendors")
                        
                        # Store all discovered vendors directly without duplicate checking
                        all_discovered.extend(discovered)
                        logger.info(f"[Discovery] Added {len(discovered)} vendor URLs to storage")
                        
                        # Stop after max attempts
                        if attempt >= max_discovery_attempts:
                            logger.info(f"[Discovery] Completed {max_discovery_attempts} attempts")
                            break
                    
                    # Extend results with all discovered (both new and from previous attempts)
                    logger.info(f"[Discovery] COMPLETE: Found {len(all_discovered)} total unique discovery URLs")
                    # Mark these as Google discovered for source_type assignment in the results processing loop
                    for item in all_discovered:
                        if "source_type_hint" not in item:
                            item["source_type_hint"] = "discovered"
                    results.extend(all_discovered)
                    
                    # Don't create violations here - they will be created in the unified results processing loop below
                    # This prevents double-creation and ensures consistent source_type assignment
                except Exception as exc:
                    # Discovery is best-effort: a failure here must not abort the
                    # registered-vendor results already collected above.
                    logger.error(f"[Discovery] Exception during discovery: {str(exc)}", exc_info=True)
            else:
                logger.info(f"[Scraper] Google discovery DISABLED for product {product_id}")

            # Remove duplicate results to prevent duplicate violations
            # Key insight: Same vendor can be found via different URLs (registered vendor URL vs Google SERP URL)
            # So we deduplicate based on VENDOR + PRICE, not URL
            # This deduplication removes the same vendor selling at the same price, regardless of URL source
            seen_result_keys = set()
            deduplicated_results = []
            removed_duplicates = []
            
            for result in results:
                url = result.get("vendor_url", "")
                vendor_name = result.get("vendor_name", "Unknown")
                scraped_price = result.get("scraped_price")
                msp = result.get("msp")
                
                # Use vendor-based key: (vendor_name, product_name, msp, scraped_price)
                # This deduplicates the same vendor selling at the same price, regardless of URL source
                # Key: NOT including URL, which may differ between registered vendor site and Google discovery
                packs_dedup = result.get("packs", 1)
                # NOTE(review): truthiness check means a scraped_price of 0 would key
                # as None, same as "no price" — presumably prices are always > 0 here.
                dedup_key = (vendor_name, product.product_name, msp, round(scraped_price, 2) if scraped_price else None, packs_dedup)
                
                if dedup_key not in seen_result_keys:
                    seen_result_keys.add(dedup_key)
                    deduplicated_results.append(result)
                    logger.info(f"[Dedup] ✅ KEPT: {vendor_name} - {url[:70]} (Product: {product.product_name}, MSP: {msp}, Price: {scraped_price})")
                else:
                    removed_duplicates.append(result)
                    logger.info(f"[Dedup] ❌ REMOVED DUPLICATE: {vendor_name} - {url[:70]} (Product: {product.product_name}, MSP: {msp}, Price: {scraped_price}) - Same vendor already found from another source")
            
            results = deduplicated_results
            logger.info(f"\n[Deduplication Summary]")
            logger.info(f"  Input URLs: {len(results) + len(removed_duplicates)}")
            logger.info(f"  Unique results kept: {len(results)}")
            logger.info(f"  Duplicates removed: {len(removed_duplicates)}")
            logger.info(f"  Vendor scraped: {sum(1 for r in results if r.get('vendor_id'))}")
            logger.info(f"  Discovery found: {sum(1 for r in results if not r.get('vendor_id'))}\n")


            # Save all results to database
            # Track violations we're about to create to prevent in-session duplicates
            created_violations = set()  # Track (url, price) tuples of violations we create
            for result in results:
                vendor_id = result.get("vendor_id")  # may be None for discovery, or a string UUID
                vendor_name = result["vendor_name"]
                vendor_url = result["vendor_url"]
                scraped_price = result["scraped_price"]
                msp = result["msp"]
                vendor_status = result["status"]
                price_diff = result.get("price_difference")
                perc_diff = result.get("percentage_difference")
                
                # Determine source type based on vendor_id and source_type_hint
                if vendor_id:
                    source = "registered"
                elif result.get("source_type_hint") == "serp_api_shopping":
                    source = "serp_api_shopping"
                else:
                    source = "discovered"
                    
                domain = get_domain_from_url(vendor_url)
                
                # Convert vendor_id from string to UUID if it's a registered vendor
                vendor_id_uuid = None
                if vendor_id:
                    try:
                        vendor_id_uuid = UUID(vendor_id) if isinstance(vendor_id, str) else vendor_id
                    except (ValueError, TypeError):
                        logger.warning(f"Could not convert vendor_id to UUID: {vendor_id}")
                        vendor_id_uuid = None

                # Create scraping result record (one row per kept result,
                # regardless of compliance status).
                scraping_result = ScrapingResult(
                    product_id=product.id,
                    vendor_id=vendor_id_uuid,
                    product_name=product.product_name,
                    barcode=product.barcode,
                    reference_id=str(product.reference_id),
                    vendor_name=vendor_name,
                    vendor_url=vendor_url,
                    domain_name=domain,
                    msp=msp,
                    scraped_price=scraped_price,
                    price_difference=price_diff,
                    percentage_difference=perc_diff,
                    compliance_status=vendor_status,
                    source_type=source,
                )

                db.add(scraping_result)

                # Create violation record if applicable (with duplicate prevention)
                if vendor_status == "violation":
                    # Use vendor-based key: (vendor_name, product_name, msp, scraped_price)
                    # This matches the deduplication logic in the results processing above
                    # Same vendor selling at same price is considered a duplicate, regardless of URL
                    packs_vk = result.get("packs", 1)
                    violation_key = (vendor_name, product.product_name, msp, round(scraped_price, 2), packs_vk)
                    
                    # Check for duplicates in current session first (in-memory)
                    if violation_key in created_violations:
                        logger.info(f"⚠️ IN-SESSION DUPLICATE VIOLATION PREVENTED: {vendor_name} - {product.product_name} at {vendor_url[:80]}")
                    else:
                        # Check for duplicates in database
                        is_duplicate = await check_duplicate_violation(
                            db, vendor_url, scraped_price, product.barcode, product.product_name, msp,
                            vendor_name=vendor_name, packs=result.get("packs", 1)
                        )
                        
                        if not is_duplicate:
                            # Determine marketplace: domain for registered, vendor_name for discovered
                            if vendor_id_uuid:
                                marketplace = get_domain_from_url(vendor_url)
                            else:
                                marketplace = vendor_name
                            
                            packs = result.get("packs", 1)
                            # NOTE(review): violation_date uses naive datetime.utcnow();
                            # confirm downstream consumers expect naive UTC timestamps.
                            violation = Violation(
                                product_id=product.id,
                                vendor_id=vendor_id_uuid,
                                vendor_name=vendor_name,
                                product_name=product.product_name,
                                msp=msp,
                                scraped_price=scraped_price,
                                price_difference=price_diff,
                                percentage_difference=perc_diff,
                                marketplace=marketplace,
                                violation_date=datetime.utcnow(),
                                url=vendor_url,
                                barcode_number=product.barcode,
                                reference_id=str(product.reference_id),
                                source_type=source,
                                packs=packs,
                            )
                            db.add(violation)
                            created_violations.add(violation_key)
                            created_violations_set.add(violation_key)
                            logger.warning(f"🚨 VIOLATION SAVED: {vendor_name} - {product.product_name} Pack {packs} @ ₹{scraped_price} (MSP: ₹{msp})")
                        else:
                            logger.info(f"⚠️ DATABASE DUPLICATE VIOLATION PREVENTED: {vendor_name} - {product.product_name} @ ₹{scraped_price}")

            # Update last_scraped_date for product
            product.last_scraped_date = datetime.utcnow()
            execution_time_str = datetime.utcnow().isoformat()
            product.last_execution_time = execution_time_str

            # Commit all changes (registered + Google discovery violations)
            await db.commit()
            logger.info(f"[Scraper] ✓ Main violation commit completed. Violations in session: {len(violation_records)}")
            
            # Refresh product to ensure changes are persisted
            await db.refresh(product)
            
            # Return all violations found in THIS run (not filtered by database duplicates)
            # Users want to see what was discovered TODAY, even if it existed before
            # NOTE(review): violation_records comes from _run_scraping only, so
            # violations found via discovery are in `results` but not counted here.
            logger.info(f"📊 [Response] Final Results: {len(results)} unique URLs, {len(violation_records)} violations found in this run")
            
            resp = {
                "product_id": product_id,
                "product_name": product.product_name,
                "barcode": product.barcode,
                "msp": float(product.msp),
                "pack_counts_scraped": PACK_COUNTS,
                "results": results,
                "violations": violation_records,
                "total_results": len(results),
                "last_execution_time": execution_time_str,
                "total_violations": len(violation_records),
                "discovery_enabled": enable_discovery,
                "source_type": "browser_with_serp_api_discovery" if enable_discovery else "browser_only"
            }
            return resp

        except Exception as e:
            # Any failure after this point may have left pending rows in the
            # session; roll back so nothing partial is persisted.
            logger.error(f"❌ Fatal scraping error: {str(e)}")
            await db.rollback()
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Scraping failed: {str(e)}"
            )

    @staticmethod
    def _run_scraping(product, vendors: List, headless_mode: bool, timeout_seconds: int) -> Tuple[List[Dict], List[Dict]]:
        """
        Run the actual scraping in a separate thread.

        Hybrid strategy:
          - Sequential by pack: process PACK_COUNTS [1, 2, 3, 4, 5, 6, 12] one at a time.
          - Parallel by vendor: for each pack, scrape ALL vendors concurrently using
            a ThreadPoolExecutor (no cap — fire them all at once per pack).

        Logging strategy:
          Each vendor thread writes to a private list buffer instead of calling
          the global logger directly. Once the thread finishes, its buffer is
          flushed atomically to the real logger in one block. This prevents the
          interleaved log mess that happens when parallel threads all write to
          the same logger simultaneously.

        Args:
            product: Product ORM object (needs .product_name and .msp; pack
                MSPs are resolved via get_pack_price).
            vendors: Active Vendor ORM objects to scrape.
            headless_mode: Whether each Selenium driver runs headless.
            timeout_seconds: Per-page load timeout passed to the vendor scraper.

        Returns:
            (results, violation_records) — results is one dict per vendor/pack
            combination where a price was found; violation_records is the
            subset whose compliance status is "violation", reshaped for the
            API response.
        """
        results: List[Dict] = []
        violation_records: List[Dict] = []

        logger.info(f"\n{'='*80}")
        logger.info(f"[Scraper] Starting multi-pack scrape for: {product.product_name}")
        logger.info(f"[Scraper] Base MSP: {product.msp} | Pack counts: {PACK_COUNTS}")
        logger.info(f"[Scraper] Vendors: {len(vendors)}")
        logger.info(f"{'='*80}\n")

        def scrape_one_vendor(vendor, search_query: str, pack_msp: float, pack_count: int) -> Tuple[Optional[Dict], List[Tuple[int, str]]]:
            """
            Scrape a single vendor for a single pack variant.

            All log messages are collected into a local `log_buffer` list as
            (level, message) tuples and returned alongside the result.  The
            caller flushes them to the real logger once this thread completes,
            keeping log output clean and non-interleaved.

            Returns:
                (result_dict_or_None, log_buffer)
            """
            log_buffer: List[Tuple[int, str]] = []

            def _log(level: int, msg: str):
                log_buffer.append((level, msg))

            # Level constants from the module-level `logging` import; the
            # previous local `import logging as _logging` was redundant.
            INFO    = logging.INFO
            WARNING = logging.WARNING
            ERROR   = logging.ERROR

            tag = f"[Pack{pack_count}][{vendor.name}]"

            if not vendor.website_url:
                _log(WARNING, f"{tag} No website URL — skipping")
                return None, log_buffer

            driver = None
            try:
                driver = initialize_selenium_driver(headless=headless_mode)
                scraped_price, source_url = scrape_vendor_website(
                    product, vendor, driver, search_query, pack_msp, timeout_seconds
                )
            except Exception as exc:
                _log(ERROR, f"{tag} ❌ Exception during scraping: {exc}")
                scraped_price, source_url = None, None
            finally:
                if driver:
                    try:
                        cleanup_chrome_session(driver)
                        driver.quit()
                    except Exception:
                        # Best-effort cleanup; never let a quit() failure mask
                        # the scrape result. Narrowed from a bare `except:` so
                        # KeyboardInterrupt/SystemExit still propagate.
                        pass

            if scraped_price is None:
                _log(WARNING, f"{tag} ❓ No price found")
                return None, log_buffer

            effective_url = source_url if source_url else vendor.website_url
            vendor_status = determine_compliance_status(pack_msp, scraped_price)
            price_diff, perc_diff = calculate_price_difference(pack_msp, scraped_price)

            if vendor_status == "violation":
                _log(WARNING,
                    f"{tag} 🚨 VIOLATION | "
                    f"scraped=€{scraped_price:.4f} < msp=€{pack_msp} | "
                    f"diff=€{price_diff} ({perc_diff}%) | {effective_url}"
                )
            elif vendor_status == "complain":
                _log(INFO,
                    f"{tag} ⚠️  COMPLAIN  | "
                    f"scraped=€{scraped_price:.4f} > msp=€{pack_msp} | "
                    f"diff=+€{abs(price_diff)} ({abs(perc_diff)}%) | {effective_url}"
                )
            else:
                _log(INFO,
                    f"{tag} ✅ COMPLIANT | "
                    f"scraped=€{scraped_price:.4f} = msp=€{pack_msp} | {effective_url}"
                )

            return {
                "vendor_id": str(vendor.id),
                "vendor_name": vendor.name,
                "vendor_url": effective_url,
                "scraped_price": scraped_price,
                "msp": pack_msp,
                "packs": pack_count,
                "status": vendor_status,
                "price_difference": price_diff,
                "percentage_difference": perc_diff,
            }, log_buffer

        # ── Outer loop: sequential by pack ───────────────────────────────
        for pack_count in PACK_COUNTS:
            search_query = generate_search_query(product.product_name, pack_count)
            pack_msp = get_pack_price(product, pack_count)

            if pack_msp is None:
                logger.warning(f"[Pack{pack_count}] No MSP configured — skipping")
                continue

            logger.info(
                f"\n{'─'*70}\n"
                f"  PACK {pack_count}  |  query='{search_query}'  |  msp=€{pack_msp}\n"
                f"  Vendors to scrape: {len(vendors)}\n"
                f"{'─'*70}"
            )

            # ── Inner parallel: all vendors at once ───────────────────────
            pack_results: List[Dict] = []
            with ThreadPoolExecutor() as executor:
                futures = {
                    executor.submit(scrape_one_vendor, vendor, search_query, pack_msp, pack_count): vendor
                    for vendor in vendors
                }
                # Iterate in submission order (not as_completed) so log blocks
                # flush in a stable, per-vendor order.
                for future in futures:
                    vendor = futures[future]
                    try:
                        result, log_buffer = future.result()
                    except Exception as exc:
                        logger.error(f"[Pack{pack_count}][{vendor.name}] Future error: {exc}")
                        result, log_buffer = None, []

                    # ── Flush this vendor's buffered log as one clean block ──
                    for level, msg in log_buffer:
                        logger.log(level, msg)

                    if result is not None:
                        pack_results.append(result)

            # ── Pack summary (logged after all vendors finish) ─────────────
            # Defensive filter: scrape_one_vendor only returns dicts with a
            # non-None price, so `found` normally equals pack_results.
            found     = [r for r in pack_results if r["scraped_price"] is not None]
            violations = [r for r in pack_results if r["status"] == "violation"]
            no_price  = len(vendors) - len(found)

            logger.info(
                f"\n  Pack {pack_count} summary | "
                f"found: {len(found)}/{len(vendors)} | "
                f"violations: {len(violations)} | "
                f"no price: {no_price}"
            )
            for r in pack_results:
                status_icon = {"violation": "🚨", "complain": "⚠️ ", "compliant": "✅"}.get(r["status"], "❓")
                logger.info(
                    f"    {status_icon}  {r['vendor_name']:<30} "
                    f"€{r['scraped_price']:.4f}  (msp €{r['msp']})"
                )

            # Accumulate into master results
            results.extend(pack_results)
            for r in pack_results:
                if r["status"] == "violation":
                    violation_records.append({
                        "vendor_id":      r["vendor_id"],
                        "vendor_name":    r["vendor_name"],
                        "vendor_url":     r["vendor_url"],
                        "product_name":   product.product_name,
                        "scraped_price":  r["scraped_price"],
                        "msp":            r["msp"],
                        "packs":          r["packs"],
                        "price_difference": r["price_difference"],
                    })

            # Small inter-pack delay
            human_delay(1.5, 2.5)

        # ── Final summary ─────────────────────────────────────────────────
        logger.info(f"\n{'='*70}")
        logger.info(f"  SCRAPE COMPLETE: {product.product_name}")
        logger.info(f"  Total results : {len(results)}")
        logger.info(f"  Total violations: {len(violation_records)}")
        if violation_records:
            logger.info("  Violations found:")
            for v in violation_records:
                logger.warning(
                    f"    🚨 Pack {v['packs']} | {v['vendor_name']:<30} "
                    f"€{v['scraped_price']:.4f} < msp €{v['msp']}"
                )
        logger.info(f"{'='*70}\n")

        return results, violation_records

    @staticmethod
    async def scrape_product_serp(
        db: AsyncSession,
        product_id: int,
        enable_discovery: bool = False,
        headless_mode: bool = True,
        timeout_seconds: int = 15
    ) -> Dict:
        """
        Scrape a product against all registered vendors using SERP API for discovery.
        Same functionality as scrape_product, but uses SERP API Google Shopping Light 
        Engine instead of Tavily for discovering alternative vendors.
        
        Args:
            db: Database session
            product_id: Product ID to scrape
            enable_discovery: If True, also discover vendors via SERP API Shopping (disabled by default)
            headless_mode: Run browser in headless mode
            timeout_seconds: Timeout for page load
            
        Returns:
            Dictionary with scraping results
        """
        # Fetch product
        result = await db.execute(select(Product).where(Product.id == product_id))
        product = result.scalars().first()
        
        if not product:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"Product with ID {product_id} not found"
            )

        # Fetch all active vendors
        vendor_result = await db.execute(select(Vendor).where(Vendor.is_active == True))
        vendors = vendor_result.scalars().all()
        
        if not vendors:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="No active vendors found"
            )

        results = []
        violation_records = []
        created_violations_set = set()
        
        # Collect registered vendor domains for exclusion in discovery
        registered_domains = set()
        for vendor in vendors:
            domain = get_domain_from_url(vendor.website_url)
            registered_domains.add(domain)
        
        try:
            # Run scraping in thread pool to avoid blocking event loop
            scraping_task = asyncio.to_thread(
                ScraperService._run_scraping,
                product,
                vendors,
                headless_mode,
                timeout_seconds
            )
            results, violation_records = await scraping_task
            
            # Process all registered and discovered violations to database
            # This ensures all violations are persisted before returning
            for result in results:
                if result.get("status") == "violation":
                    vendor_id = result.get("vendor_id")
                    vendor_name = result["vendor_name"]
                    vendor_url = result["vendor_url"]
                    scraped_price = result["scraped_price"]
                    msp = float(result["msp"])
                    price_diff = result.get("price_difference")
                    perc_diff = result.get("percentage_difference")
                    
                    # Determine source type
                    if vendor_id:
                        source = "registered"
                        vendor_id_uuid = UUID(vendor_id) if isinstance(vendor_id, str) else vendor_id
                    else:
                        source = "discovered"
                        vendor_id_uuid = None
                    
                    # Check for duplicates and create violation
                    is_duplicate = await check_duplicate_violation(
                        db, vendor_url, scraped_price, product.barcode, product.product_name, msp, vendor_name=vendor_name
                    )
                    
                    if not is_duplicate:
                        # Determine marketplace: domain for registered, vendor_name for discovered
                        if vendor_id_uuid:
                            marketplace = get_domain_from_url(vendor_url)
                        else:
                            marketplace = vendor_name
                        
                        packs_val = result.get("packs", 1)
                        violation = Violation(
                            product_id=product.id,
                            vendor_id=vendor_id_uuid,
                            vendor_name=vendor_name,
                            product_name=product.product_name,
                            msp=msp,
                            scraped_price=scraped_price,
                            price_difference=price_diff,
                            percentage_difference=perc_diff,
                            marketplace=marketplace,
                            violation_date=datetime.utcnow(),
                            url=vendor_url,
                            barcode_number=product.barcode,
                            reference_id=str(product.reference_id),
                            source_type=source,
                            packs=packs_val,
                        )
                        db.add(violation)
                        logger.warning(f"🚨 VIOLATION SAVED: {vendor_name} - {product.product_name} Pack {packs_val} @ {vendor_url[:60]} (Source: {source})")
            
            # optionally perform SERP API discovery for alternative vendors
            if enable_discovery:
                logger.info(f"[Scraper SERP] SERP API discovery ENABLED for product {product_id}")
                try:
                    # Perform SERP API discovery with multiple attempts
                    max_discovery_attempts = 3
                    all_discovered = []
                    attempt = 0
                    
                    while attempt < max_discovery_attempts:
                        attempt += 1
                        logger.info(f"[Discovery SERP] Attempt {attempt}/{max_discovery_attempts}")
                        
                        # Run SERP discovery in thread
                        discovered = await asyncio.to_thread(
                            discover_alternative_vendors_serp, 
                            product.barcode, 
                            product.product_name, 
                            float(product.msp), 
                            registered_domains
                        )
                        logger.info(f"[Discovery SERP] Attempt {attempt} completed. Found {len(discovered)} alternative vendors")
                        
                        # Store all discovered vendors directly without duplicate checking
                        all_discovered.extend(discovered)
                        logger.info(f"[Discovery SERP] Added {len(discovered)} vendor URLs to storage")
                        
                        # Stop after attempts
                        if attempt >= max_discovery_attempts:
                            logger.info(f"[Discovery SERP] Completed {max_discovery_attempts} attempts")
                            break
                    
                    # Deduplicate discovered vendors across multiple attempts
                    # Use vendor-based key (no URL): vendor_name + product_name + MSP + price
                    # This prevents same vendor found in multiple discovery attempts from creating duplicates
                    seen_discovery_keys = set()
                    deduplicated_discovered = []
                    removed_discovery_duplicates = []
                    
                    for item in all_discovered:
                        url = item.get("vendor_url", "")
                        vendor_name = item.get("vendor_name", "Unknown")
                        scraped_price = item.get("scraped_price")
                        msp = item.get("msp")
                        
                        # Use vendor-based key (NO URL): (vendor_name, product_name, msp, scraped_price)
                        # Same vendor at same price is duplicate, regardless of URL source
                        dedup_key = (vendor_name, product.product_name, msp, round(scraped_price, 2) if scraped_price else None)
                        
                        if dedup_key not in seen_discovery_keys:
                            seen_discovery_keys.add(dedup_key)
                            deduplicated_discovered.append(item)
                            logger.info(f"[Discovery Dedup] ✅ KEPT: {vendor_name} - {url[:70]} (Product: {product.product_name}, MSP: {msp}, Price: {scraped_price})")
                        else:
                            removed_discovery_duplicates.append(item)
                            logger.info(f"[Discovery Dedup] ❌ REMOVED DUPLICATE: {vendor_name} - {url[:70]} (Product: {product.product_name}, MSP: {msp}, Price: {scraped_price}) - Same vendor already found")
                    
                    all_discovered = deduplicated_discovered
                    logger.info(f"[Discovery Dedup] Before: {len(all_discovered) + len(removed_discovery_duplicates)} results, After: {len(all_discovered)} unique results ({len(removed_discovery_duplicates)} duplicates removed)")
                    
                    # Process SERP discovered violations to database with duplicate check
                    for item in all_discovered:
                        if item.get("status") == "violation":
                            vendor_name_serp = item.get("vendor_name", "Unknown")
                            vendor_url_serp = item.get("vendor_url", "")
                            scraped_price_serp = item.get("scraped_price")
                            msp_serp = float(item.get("msp", product.msp))
                            
                            # Create SERP violation record without duplicate check
                            # User requirement: Save violations on every run with different execution timestamps for historical tracking
                            violation = Violation(
                                product_id=product.id,
                                vendor_id=None,  # SERP discovered vendors don't have vendor_id
                                vendor_name=vendor_name_serp,
                                product_name=product.product_name,
                                msp=msp_serp,
                                scraped_price=item["scraped_price"],
                                price_difference=item.get("price_difference"),
                                percentage_difference=item.get("percentage_difference"),
                                marketplace=vendor_name_serp,  # For SERP, marketplace = vendor_name
                                url=item.get("vendor_url", ""),
                                barcode_number=product.barcode,
                                reference_id=str(product.reference_id),
                                source_type="serp_api_shopping",
                                violation_date=datetime.utcnow()
                            )
                            db.add(violation)
                            # Flush immediately to ensure violation is written
                            await db.flush()
                            created_violations_set.add((vendor_name_serp, product.product_name, msp_serp, round(scraped_price_serp, 2)))
                            logger.warning(f"🔍 SERP VIOLATION SAVED: {vendor_name_serp} - {product.product_name} @ {scraped_price_serp}")
                            
                            # Always add to violation_records for response, whether duplicate or not
                            # Users want to see violations found in THIS run, even if they exist in DB
                            violation_records.append({
                                "vendor_name": vendor_name_serp,
                                "vendor_url": vendor_url_serp,
                                "product_name": product.product_name,
                                "scraped_price": scraped_price_serp,
                                "msp": msp_serp,
                                "price_difference": item.get("price_difference")
                            })
                    
                    # Extend results with deduplicated discovered vendors
                    logger.info(f"[Discovery SERP] COMPLETE: Found {len(all_discovered)} unique discovery URLs after deduplication")
                    results.extend(all_discovered)
                    logger.info(f"[Discovery SERP] Results extended with {len(all_discovered)} discovery vendors")
                    
                    # Flush SERP violations to ensure they're persisted
                    await db.flush()
                    logger.info(f"[Discovery SERP] ✓ Violations flushed. Will commit after registration updates.")
                    
                except Exception as e:
                    logger.error(f"[Discovery SERP] Error during SERP discovery: {str(e)}", exc_info=True)
            
            # Update product execution timestamps
            product.last_scraped_date = datetime.utcnow()
            execution_time_str = datetime.utcnow().isoformat()
            product.last_execution_time = execution_time_str
            
            # Commit all violations (registered + discovered + SERP) to database
            await db.commit()
            logger.info(f"[Scraper SERP] ✓ SERP violation commit completed. Total violations tracked: {len(violation_records)}")
            
            # Refresh product to ensure changes are persisted
            await db.refresh(product)
            
            logger.info(f"\n{'='*80}")
            logger.info(f"[Scraper SERP] ✓ Scraping complete for product {product_id}")
            logger.info(f"[Scraper SERP] Total Results: {len(results)}")
            logger.info(f"[Scraper SERP] Total Violations Created: {len(violation_records)}")
            logger.info(f"[Scraper SERP] Violations in created_violations_set: {len(created_violations_set)}")
            
            # Log all violations that were saved
            if violation_records:
                logger.info(f"[Scraper SERP] Violations saved:")
                for v in violation_records:
                    logger.info(f"  - {v.get('vendor_name')}: {v.get('product_name')} @ {v.get('scraped_price')} (MSP: {v.get('msp')})")
            
            logger.info(f"{'='*80}\n")
            
            return {
                "product_id": product_id,
                "product_name": product.product_name,
                "barcode": product.barcode,
                "msp": float(product.msp),
                "results": results,
                "violations": violation_records,
                "total_results": len(results),
                "last_execution_time": execution_time_str,
                "total_violations": len(violation_records),
                "discovery_enabled": enable_discovery,
                "source_type": "browser_with_serp_api_discovery"
            }
            
        except Exception as e:
            logger.error(f"❌ Fatal scraping error: {str(e)}")
            await db.rollback()
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Scraping failed: {str(e)}"
            )
    
    @staticmethod
    async def search_google_serp(
        product_name: str, 
        barcode: Optional[str] = None, 
        msp: Optional[float] = None,
        product: Optional['Product'] = None,
        db: Optional[AsyncSession] = None
    ) -> Tuple[List[Dict], List[Dict]]:
        """
        Search using SERP API Google Search Engine (regular search, not shopping).
        Optionally stores violations in database following same schema as SERP Shopping Light API.
        
        Args:
            product_name: Product name for search
            barcode: Product barcode (optional)
            msp: Minimum Selling Price (optional, used for violation detection)
            product: Product object (required for database storage)
            db: AsyncSession (required for database storage)
        
        Returns:
            Tuple of (results_list, violations_list)
        """
        results = []
        violations = []
        
        try:
            logger.info("[Google Search SERP] Using SERP API Google Search Engine for search")
            api_key = os.environ.get('SERP_API_KEY')
            
            # Check if API key is configured
            if not api_key:
                logger.error("[Google Search SERP] ❌ SERP_API_KEY not found in environment variables")
                return results, violations
                
            if api_key == 'your_serp_api_key_here':
                logger.error("[Google Search SERP] ❌ SERP_API_KEY is set to placeholder value")
                return results, violations
            
            # Build search query with product name
            search_query = f"{product_name}"
            logger.info(f"[Google Search SERP] Search query: '{search_query}'")
            
            # Direct HTTP request to SERP API
            serp_url = "https://serpapi.com/search.json"
            
            params = {
                "engine": "google",  # Regular Google Search engine
                "q": search_query,
                "api_key": api_key,
                "location": "Spain",
                "location_requested": "Spain",
                "location_used": "Spain",
                "google_domain": "google.com",
                "hl": "en",
                "gl": "us",
                "device": "desktop"
            }
            
            try:
                response = requests.get(serp_url, params=params, timeout=30)
                response.raise_for_status()
                data = response.json()
            except requests.exceptions.RequestException as req_error:
                logger.error(f"[Google Search SERP] HTTP request failed: {str(req_error)}")
                return results, violations
            except ValueError as json_error:
                logger.error(f"[Google Search SERP] Failed to parse JSON response: {str(json_error)}")
                return results, violations
            
            # Check for empty response
            if not data:
                return results, violations
            
            # Check for API errors
            if "error" in data:
                logger.error(f"[Google Search SERP] API Error: {data.get('error')}")
                return results, violations
            
            # Extract different result types from SERP API response
            organic_results = data.get("organic_results", [])
            shopping_results = data.get("shopping_results", [])
            inline_shopping = data.get("inline_shopping", [])  # NEW: Inline shopping embedded in organic results
            
            logger.info(f"[Google Search SERP] 🔍 SERP API Response Analysis:")
            logger.info(f"[Google Search SERP]   - organic_results: {len(organic_results)} results")
            logger.info(f"[Google Search SERP]   - shopping_results: {len(shopping_results)} results")
            logger.info(f"[Google Search SERP]   - inline_shopping: {len(inline_shopping)} results")
            logger.info(f"[Google Search SERP]   - All available keys: {list(data.keys())}")
            
            if inline_shopping:
                logger.info(f"[Google Search SERP] ✅ GOOD! Found inline_shopping. Sample keys: {list(inline_shopping[0].keys()) if inline_shopping else 'N/A'}")
            else:
                logger.info(f"[Google Search SERP] ⚠️ No inline_shopping in response. Will use organic_results.")
            
            # Track violations we create (to prevent duplicates)
            created_violations = set()
            
            # Priority: Use inline_shopping first (has prices), then shopping_results, then organic_results
            if inline_shopping:
                logger.info(f"[Google Search SERP] Found {len(inline_shopping)} INLINE SHOPPING products with prices! Using those.")
                # Process inline shopping results (these have prices)
                for idx, item in enumerate(inline_shopping):
                    try:
                        title = item.get("title", "")
                        price = item.get("price") or item.get("extracted_price")
                        link = item.get("link", "")
                        source = item.get("source", "")
                        rating = item.get("rating")
                        
                        if not link:
                            continue
                        
                        # Extract domain/vendor name from URL
                        parsed_url = urlparse(link)
                        vendor_name = item.get("source", parsed_url.netloc.replace("www.", ""))
                        
                        # Convert price to float
                        price_float = None
                        if price:
                            try:
                                price_str = str(price).replace("€", "").replace("$", "").replace(",", ".").strip()
                                price_float = float(price_str)
                            except (ValueError, AttributeError, TypeError):
                                pass
                        
                        result_dict = {
                            "position": idx + 1,
                            "vendor_name": vendor_name,
                            "vendor_url": link,
                            "title": title,
                            "price_raw": price,
                            "source": "google_inline_shopping",
                            "rating": rating
                        }
                        
                        # Add price if found
                        if price_float:
                            result_dict["scraped_price"] = price_float
                            
                            # Calculate price difference and compliance status (same as shopping light)
                            if msp:
                                price_diff, perc_diff = calculate_price_difference(msp, price_float)
                                result_dict["price_difference"] = price_diff
                                result_dict["percentage_difference"] = perc_diff
                                
                                # Use helper function for compliance status
                                compliance = determine_compliance_status(msp, price_float)
                                result_dict["status"] = compliance
                                result_dict["compliance_status"] = compliance
                            else:
                                result_dict["status"] = "unknown"
                                result_dict["compliance_status"] = "unknown"
                        else:
                            result_dict["status"] = "unknown"
                            result_dict["compliance_status"] = "unknown"
                        
                        results.append(result_dict)
                        
                        # Store violation if applicable and database is available
                        if db and product and price_float and msp and result_dict.get("status") == "violation":
                            try:
                                violation_key = (vendor_name, product.product_name, msp, round(price_float, 2))
                                if violation_key not in created_violations:
                                    marketplace = vendor_name
                                    violation = Violation(
                                        product_id=product.id,
                                        vendor_id=None,  # Google Search results are discovered, not registered
                                        vendor_name=vendor_name,
                                        product_name=product.product_name,
                                        msp=float(msp),
                                        scraped_price=price_float,
                                        price_difference=price_diff,
                                        percentage_difference=perc_diff,
                                        marketplace=marketplace,
                                        violation_date=datetime.utcnow(),
                                        url=link,
                                        barcode_number=product.barcode,
                                        reference_id=str(product.reference_id),
                                        source_type="serp_api_google_search",
                                    )
                                    db.add(violation)
                                    created_violations.add(violation_key)
                                    violations.append({
                                        "vendor_name": vendor_name,
                                        "product_name": product.product_name,
                                        "msp": float(msp),
                                        "scraped_price": price_float,
                                        "price_difference": price_diff,
                                        "percentage_difference": perc_diff,
                                        "marketplace": marketplace,
                                        "url": link,
                                        "source_type": "serp_api_google_search"
                                    })
                                    logger.warning(f"🚨 VIOLATION SAVED: {vendor_name} - {product.product_name} @ €{price_float} (MSP: €{msp})")
                            except Exception as violation_err:
                                logger.warning(f"[Google Search SERP] Error creating violation: {str(violation_err)}")
                        
                        price_str = f" - {price}" if price else " (no price)"
                        rating_str = f" ⭐{rating}" if rating else ""
                        status_str = f" [{result_dict.get('status', 'unknown').upper()}]" if price else ""
                        logger.info(f"[Google Search SERP] ✓ Inline Shopping {idx + 1}: {title[:50]}...{price_str}{rating_str}{status_str} ({vendor_name})")
                        
                    except Exception as e:
                        logger.warning(f"[Google Search SERP] Error parsing inline shopping {idx}: {str(e)}")
                        continue
            
            elif shopping_results:
                logger.info(f"[Google Search SERP] Using {len(shopping_results)} shopping results instead")
                organic_results = shopping_results
            
            elif not organic_results:
                logger.warning(f"[Google Search SERP] ⚠️ No results of any type returned")
                if db and created_violations:
                    await db.commit()
                return results, violations
            
            # Also process organic results (for URLs and additional info)
            if organic_results:
                logger.info(f"[Google Search SERP] Processing {len(organic_results)} organic results")
                
                for idx, result in enumerate(organic_results[:20]):  # Process up to 20 results
                    try:
                        title = result.get("title", "")
                        url = result.get("link", "")
                        snippet = result.get("snippet", "")
                        position = result.get("position", idx + 1)
                        
                        if not url:
                            continue
                        
                        # Extract domain/vendor name from URL
                        parsed_url = urlparse(url)
                        vendor_name = parsed_url.netloc.replace("www.", "")
                        
                        # Try to extract price from multiple sources
                        price = None
                        price_source = None
                        
                        # 1. PRIORITY: Check rich_snippet (structured data from SERP API) - MOST RELIABLE
                        rich_snippet = result.get("rich_snippet", {})
                        if rich_snippet and "bottom" in rich_snippet:
                            detected_extensions = rich_snippet.get("bottom", {}).get("detected_extensions", {})
                            if "price" in detected_extensions:
                                price = float(detected_extensions.get("price"))
                                price_source = "rich_snippet"
                                logger.debug(f"[Google Search SERP]   💰 Found price in rich_snippet: €{price}")
                        
                        # 2. Check if price is directly in result (shopping results)
                        if not price and "extracted_price" in result:
                            price = result.get("extracted_price")
                            price_source = "extracted_price"
                        elif not price and "price" in result:
                            price_str = result.get("price", "")
                            try:
                                price = float(str(price_str).replace("€", "").replace("$", "").replace(",", ".").strip())
                                price_source = "direct_price"
                            except (ValueError, AttributeError, TypeError):
                                price = None
                        
                        # 3. Try to extract price from snippet text (regex search) - FALLBACK
                        if not price and snippet:
                            try:
                                # Try multiple regex patterns to catch different formats
                                patterns = [
                                    r'€\s*([\d,]+\.?\d*)',           # €17.99 or € 17.99
                                    r'([\d,]+\.?\d*)\s*€',           # 17.99€ or 17.99 €
                                    r'([\d,]+,[\d]{2})\s*€',         # 17,99€ (European format)
                                    r'€\s*([\d,]+,[\d]{2})',         # €17,99
                                ]
                                
                                for pattern in patterns:
                                    price_match = re.search(pattern, snippet)
                                    if price_match:
                                        price_str = price_match.group(1).replace(",", ".")
                                        try:
                                            price = float(price_str)
                                            price_source = "snippet_regex"
                                            logger.debug(f"[Google Search SERP]   💰 Extracted price from snippet: €{price}")
                                            break  # Found price, stop trying patterns
                                        except (ValueError, TypeError):
                                            continue
                            except (ValueError, AttributeError, TypeError) as e:
                                logger.debug(f"[Google Search SERP]   Price extraction failed: {str(e)}")
                                pass
                        
                        # 4. LAST RESORT: Scrape website if we still don't have price but have a valid URL
                        if not price and url:
                            logger.info(f"[Google Search SERP] 🔗 No price found in API response. Attempting website scrape: {url[:70]}")
                            try:
                                scraped_website_price = await scrape_price_from_website(url, product_name)
                                if scraped_website_price:
                                    price = scraped_website_price
                                    price_source = "website_scrape"
                                    logger.info(f"[Google Search SERP]   ✅ Website scrape successful: €{price}")
                                else:
                                    logger.debug(f"[Google Search SERP]   ⚠️ Website scrape returned no price")
                            except Exception as scrape_error:
                                logger.debug(f"[Google Search SERP]   Website scrape error: {str(scrape_error)}")
                                pass
                        
                        result_dict = {
                            "position": position,
                            "vendor_name": vendor_name,
                            "vendor_url": url,
                            "title": title,
                            "snippet": snippet,
                            "source": "google_organic"
                        }
                        
                        # Add price if found (from any source)
                        if price:
                            result_dict["scraped_price"] = price
                            result_dict["price_source"] = price_source  # Track where price came from
                            
                            # Calculate price difference and compliance status (same as shopping light)
                            if msp:
                                price_diff, perc_diff = calculate_price_difference(msp, price)
                                result_dict["price_difference"] = price_diff
                                result_dict["percentage_difference"] = perc_diff
                                
                                # Use helper function for compliance status
                                compliance = determine_compliance_status(msp, price)
                                result_dict["status"] = compliance
                                result_dict["compliance_status"] = compliance
                            else:
                                result_dict["status"] = "unknown"
                                result_dict["compliance_status"] = "unknown"
                        else:
                            result_dict["status"] = "unknown"
                            result_dict["compliance_status"] = "unknown"
                        
                        results.append(result_dict)
                        
                        # Store violation if applicable and database is available
                        if db and product and price and msp and result_dict.get("status") == "violation":
                            try:
                                violation_key = (vendor_name, product.product_name, msp, round(price, 2))
                                if violation_key not in created_violations:
                                    marketplace = vendor_name
                                    price_diff, perc_diff = calculate_price_difference(msp, price)
                                    violation = Violation(
                                        product_id=product.id,
                                        vendor_id=None,  # Google Search results are discovered, not registered
                                        vendor_name=vendor_name,
                                        product_name=product.product_name,
                                        msp=float(msp),
                                        scraped_price=price,
                                        price_difference=price_diff,
                                        percentage_difference=perc_diff,
                                        marketplace=marketplace,
                                        violation_date=datetime.utcnow(),
                                        url=url,
                                        barcode_number=product.barcode,
                                        reference_id=str(product.reference_id),
                                        source_type="serp_api_google_search",
                                    )
                                    db.add(violation)
                                    created_violations.add(violation_key)
                                    violations.append({
                                        "vendor_name": vendor_name,
                                        "product_name": product.product_name,
                                        "msp": float(msp),
                                        "scraped_price": price,
                                        "price_difference": price_diff,
                                        "percentage_difference": perc_diff,
                                        "marketplace": marketplace,
                                        "url": url,
                                        "source_type": "serp_api_google_search"
                                    })
                                    logger.warning(f"🚨 VIOLATION SAVED: {vendor_name} - {product.product_name} @ €{price} (MSP: €{msp})")
                            except Exception as violation_err:
                                logger.warning(f"[Google Search SERP] Error creating violation: {str(violation_err)}")
                        
                        price_str = f" - €{price:.2f} ({price_source})" if price else " (no price found)"
                        status_str = f" [{result_dict.get('status', 'unknown').upper()}]" if price else ""
                        logger.info(f"[Google Search SERP] ✓ Organic {idx + 1}: {title[:50]}...{price_str}{status_str} ({vendor_name})")
                        if not price and snippet:
                            logger.debug(f"[Google Search SERP]   Snippet: {snippet[:100]}...")
                        
                    except Exception as e:
                        logger.warning(f"[Google Search SERP] Error parsing organic result {idx}: {str(e)}")
                        continue
            
            # Commit violations to database if any were created
            if db and violations:
                await db.commit()
                logger.info(f"[Google Search SERP] ✓ Committed {len(violations)} violations to database")
            
            logger.info(f"[Google Search SERP] ✓ Google Search completed. Returned {len(results)} results, {len(violations)} violations")
            
        except Exception as e:
            logger.error(f"[Google Search SERP] Search failed: {str(e)}", exc_info=True)
            if db:
                await db.rollback()
        
        return results, violations