import requests
from bs4 import BeautifulSoup
import json
import re
from urllib.parse import urljoin, urlparse
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException


class GoustoScraper:
    def __init__(self, use_selenium=True):
        self.base_url = "https://www.gousto.co.uk"
        self.use_selenium = use_selenium

        if use_selenium:
            # Setup Selenium WebDriver with more robust options
            chrome_options = Options()
            chrome_options.add_argument('--headless=new')  # Use new headless mode
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument('--disable-extensions')
            chrome_options.add_argument('--disable-background-timer-throttling')
            chrome_options.add_argument('--disable-renderer-backgrounding')
            chrome_options.add_argument('--disable-backgrounding-occluded-windows')
            chrome_options.add_argument('--disable-ipc-flooding-protection')
            chrome_options.add_argument('--window-size=1920,1080')
            chrome_options.add_argument('--remote-debugging-port=9222')
            chrome_options.add_argument('--disable-features=TranslateUI')
            chrome_options.add_argument('--disable-default-apps')
            chrome_options.add_argument('--disable-logging')
            chrome_options.add_argument('--disable-plugins')
            chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

            # Set additional options for stability
            chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
            chrome_options.add_experimental_option('useAutomationExtension', False)

            try:
                # Try different methods to initialize Chrome
                print("Attempting to initialize Chrome WebDriver...")

                # Method 1: Try with webdriver-manager (if available)
                try:
                    from webdriver_manager.chrome import ChromeDriverManager
                    from selenium.webdriver.chrome.service import Service
                    service = Service(ChromeDriverManager().install())
                    self.driver = webdriver.Chrome(service=service, options=chrome_options)
                    print("✓ Selenium WebDriver initialized with webdriver-manager")
                except ImportError:
                    print("webdriver-manager not available, trying system Chrome...")
                    # Method 2: Try system Chrome
                    self.driver = webdriver.Chrome(options=chrome_options)
                    print("✓ Selenium WebDriver initialized with system Chrome")
                except Exception as e:
                    print(f"Failed with system Chrome: {e}")
                    # Method 3: Try with explicit Chrome binary path
                    chrome_options.binary_location = "/usr/bin/google-chrome"  # Common Linux path
                    self.driver = webdriver.Chrome(options=chrome_options)
                    print("✓ Selenium WebDriver initialized with explicit binary path")

                self.wait = WebDriverWait(self.driver, 10)

                # Test the driver
                self.driver.get("https://www.google.com")
                print("✓ WebDriver test successful")

            except Exception as e:
                print(f"✗ All Selenium initialization methods failed: {e}")
                print("Common solutions:")
                print("1. Install Chrome browser: sudo apt-get install google-chrome-stable")
                print("2. Install webdriver-manager: pip install webdriver-manager")
                print("3. Download ChromeDriver manually from https://chromedriver.chromium.org/")
                print("4. Make sure Chrome and ChromeDriver versions match")
                print("\nFalling back to requests method...")
                self.use_selenium = False
                self._init_requests_session()
        else:
            self._init_requests_session()

    def _init_requests_session(self):
        """Initialize requests session as fallback"""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    def get_page_content_selenium(self, url):
        """Get page content using Selenium (handles JavaScript)"""
        try:
            print(f"Loading page with Selenium: {url}")
            self.driver.get(url)

            # Wait for recipe cards to load
            try:
                # Wait for recipe elements to be present
                self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "a")))

                # Scroll down to trigger lazy loading
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)

                # Scroll back up
                self.driver.execute_script("window.scrollTo(0, 0);")
                time.sleep(1)
            except TimeoutException:
                print("Timeout waiting for page elements to load")

            return self.driver.page_source

        except Exception as e:
            print(f"Error loading page with Selenium: {e}")
            return None

    def get_page_content_requests(self, url):
        """Get page content using requests (static HTML only)"""
        try:
            response = self.session.get(url)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Error fetching page with requests: {e}")
            return None

    def get_recipe_links(self, page_url):
        """Extract recipe links from the cookbook page"""
        # Get page content
        if self.use_selenium:
            html_content = self.get_page_content_selenium(page_url)
        else:
            html_content = self.get_page_content_requests(page_url)

        if not html_content:
            return []

        soup = BeautifulSoup(html_content, 'html.parser')

        # Debug: Print some of the HTML to see what we're working with
        print("Analyzing page structure...")
        print(f"Page title: {soup.title.string if soup.title else 'No title found'}")

        # More comprehensive selectors for Gousto
        recipe_links = set()  # Use set to avoid duplicates

        # Different selector strategies
        selectors = [
            # Try different link patterns
            'a[href*="/cookbook/recipes/"]:not([href*="?page="])',
            'a[href^="/cookbook/recipes/"]:not([href*="?"])',
            '[data-test-id*="recipe"] a',
            '[data-testid*="recipe"] a',
            '.recipe-card a',
            '.recipe-tile a',
            '.recipe a',
            'article a[href*="/cookbook/recipes/"]',
            'div[class*="recipe"] a',
            # More generic approaches
            'a[href*="/recipes/"]:not([href*="?page="])',
        ]

        for selector in selectors:
            try:
                links = soup.select(selector)
                print(f"Selector '{selector}' found {len(links)} links")

                for link in links:
                    href = link.get('href')
                    if href:
                        # Clean and validate the URL
                        if href.startswith('/'):
                            full_url = urljoin(self.base_url, href)
                        else:
                            full_url = href

                        # Filter out pagination and non-recipe links
                        if ('/cookbook/recipes/' in full_url and
                                '?page=' not in full_url and
                                full_url != page_url):
                            recipe_links.add(full_url)

                if recipe_links:
                    break  # If we found links with this selector, stop trying others

            except Exception as e:
                print(f"Error with selector '{selector}': {e}")
                continue

        recipe_links = list(recipe_links)
        print(f"Total unique recipe links found: {len(recipe_links)}")

        # Debug: Print first few links
        for i, link in enumerate(recipe_links[:3]):
            print(f"Sample link {i+1}: {link}")

        return recipe_links

    def filter_steps(self, raw_steps, recipe_title=''):
        """Filter out unwanted items from steps array (breadcrumbs, navigation, etc.)"""
        if not raw_steps:
            return []

        filtered_steps = []

        # Common unwanted step content to filter out
        unwanted_patterns = [
            'cookbook', 'recipes', 'home', 'menu', 'navigation',
            'skip to', 'back to', 'print recipe', 'save recipe',
            'rate this recipe', 'share', 'pinterest', 'facebook',
            'twitter', 'instagram'
        ]

        # Convert recipe title to lowercase for comparison
        recipe_title_lower = recipe_title.lower() if recipe_title else ''

        for i, step in enumerate(raw_steps):
            step_lower = step.lower().strip()

            # Skip empty steps
            if not step_lower:
                continue

            # Skip if it's exactly "Cookbook"
            if step_lower == 'cookbook':
                continue

            # Skip if it matches the recipe title exactly
            if recipe_title_lower and step_lower == recipe_title_lower:
                continue

            # Skip if it's very short and likely navigation (less than 10 chars)
            if len(step) < 10 and any(pattern in step_lower for pattern in unwanted_patterns):
                continue

            # Skip if it contains unwanted patterns and is short (likely navigation)
            if len(step) < 50 and any(pattern in step_lower for pattern in unwanted_patterns):
                continue

            # Skip steps that are just numbers (step numbers without content)
            if step.strip().isdigit():
                continue

            # Skip if it's just a single word that's likely navigation
            if len(step.split()) == 1 and len(step) < 15:
                # But keep single words that are likely cooking instructions
                cooking_words = ['preheat', 'heat', 'boil', 'simmer', 'bake', 'fry', 'mix', 'stir', 'serve']
                if not any(word in step_lower for word in cooking_words):
                    continue

            # Skip if it looks like a breadcrumb pattern (word > word > word)
            if '>' in step or '»' in step or ('/' in step and len(step.split()) <= 4):
                continue

            # If we've made it this far, it's likely a real cooking step
            filtered_steps.append(step.strip())

        # Additional filtering: remove the first few items if they still look like breadcrumbs
        # This handles cases where the first 1-2 items are navigation that slipped through
        if len(filtered_steps) >= 2:
            # Check if first item is very short and generic
            first_item = filtered_steps[0].lower()
            if (len(first_item) < 20 and
                    any(pattern in first_item for pattern in ['cookbook', 'recipe', 'home', 'menu'])):
                filtered_steps = filtered_steps[1:]

            # Check if second item is the recipe title or similar
            if (len(filtered_steps) >= 2 and recipe_title_lower and
                    filtered_steps[0].lower() == recipe_title_lower):
                filtered_steps = filtered_steps[1:]

        return filtered_steps

    def parse_ingredient(self, ingredient_text):
        """Parse ingredient text into structured format with name, amount, and unit"""
        ingredient_text = ingredient_text.strip()

        # Common units and their variations
        units = {
            # Volume
            'ml', 'millilitre', 'milliliters', 'millilitres',
            'l', 'litre', 'liter', 'litres', 'liters',
            'cup', 'cups', 'c',
            'tbsp', 'tablespoon', 'tablespoons', 'tbs', 'tb',
            'tsp', 'teaspoon', 'teaspoons', 'ts',
            'fl oz', 'fluid ounce', 'fluid ounces',
            'pint', 'pints', 'pt',
            # Weight
            'g', 'gram', 'grams', 'gr',
            'kg', 'kilogram', 'kilograms',
            'oz', 'ounce', 'ounces',
            'lb', 'pound', 'pounds', 'lbs',
            # Count/pieces
            'piece', 'pieces', 'pc', 'pcs',
            'slice', 'slices', 'clove', 'cloves',
            'bunch', 'bunches', 'handful', 'handfuls',
            'pinch', 'pinches', 'dash', 'dashes',
            'sprig', 'sprigs', 'stick', 'sticks',
            'can', 'cans', 'tin', 'tins', 'jar', 'jars',
            'packet', 'packets', 'pack', 'packs',
            'box', 'boxes', 'bag', 'bags',
            # Special measurements
            'to taste', 'as needed', 'optional'
        }

        # Create a pattern to match numbers (including fractions and decimals)
        number_pattern = r'\d+(?:[/.]\d+)?(?:\.\d+)?|\d*\.\d+'

        # Try to extract amount and unit
        ingredient = {
            'name': ingredient_text,
            'amount': '',
            'unit': ''
        }

        # Pattern 1: "2 cups flour" or "250g butter"
        pattern1 = r'^(' + number_pattern + r')\s*([a-zA-Z\s]+?)\s+(.+)$'
        match1 = re.match(pattern1, ingredient_text, re.IGNORECASE)

        if match1:
            amount = match1.group(1)
            potential_unit = match1.group(2).strip().lower()
            name = match1.group(3).strip()

            # Check if the potential unit is in our units list
            if any(unit in potential_unit for unit in units):
                ingredient['amount'] = amount
                ingredient['unit'] = match1.group(2).strip()  # Keep original case
                ingredient['name'] = name
                return ingredient

        # Pattern 2: "flour - 2 cups" or "butter (250g)"
        pattern2 = r'^(.+?)[\s\-\(]+(' + number_pattern + r')\s*([a-zA-Z\s]*?)[\)\s]*$'
        match2 = re.match(pattern2, ingredient_text, re.IGNORECASE)

        if match2:
            name = match2.group(1).strip()
            amount = match2.group(2)
            potential_unit = match2.group(3).strip().lower()

            if any(unit in potential_unit for unit in units) or not potential_unit:
                ingredient['name'] = name
                ingredient['amount'] = amount
                ingredient['unit'] = match2.group(3).strip()  # Keep original case
                return ingredient

        # Pattern 3: Just amount at start "2 onions" or "1 large egg"
        pattern3 = r'^(' + number_pattern + r')\s+(.+)$'
        match3 = re.match(pattern3, ingredient_text, re.IGNORECASE)

        if match3:
            amount = match3.group(1)
            rest = match3.group(2).strip()

            # Check if it starts with a unit
            words = rest.split()
            if words and words[0].lower() in units:
                ingredient['amount'] = amount
                ingredient['unit'] = words[0]
                ingredient['name'] = ' '.join(words[1:]) if len(words) > 1 else rest
                return ingredient
            else:
                # No explicit unit, treat as pieces/items
                ingredient['amount'] = amount
                ingredient['unit'] = ''
                ingredient['name'] = rest
                return ingredient

        # Pattern 4: Fractional amounts "1/2 cup sugar"
        fraction_pattern = r'^(\d+/\d+)\s+([a-zA-Z\s]+?)\s+(.+)$'
        match4 = re.match(fraction_pattern, ingredient_text, re.IGNORECASE)

        if match4:
            amount = match4.group(1)
            potential_unit = match4.group(2).strip().lower()
            name = match4.group(3).strip()

            if any(unit in potential_unit for unit in units):
                ingredient['amount'] = amount
                ingredient['unit'] = match4.group(2).strip()
                ingredient['name'] = name
                return ingredient

        # If no patterns match, return as-is (some ingredients might not have amounts)
        return ingredient

    def scrape_recipe_details(self, recipe_url):
        """Scrape detailed recipe information from individual recipe page"""
        try:
            if self.use_selenium:
                html_content = self.get_page_content_selenium(recipe_url)
            else:
                response = self.session.get(recipe_url)
                response.raise_for_status()
                html_content = response.text

            if not html_content:
                return None

            soup = BeautifulSoup(html_content, 'html.parser')

            recipe_data = {
                'url': recipe_url,
                'title': '',
                'image_url': '',
                'prep_time': '',
                'serving': '',
                'ingredients': [],
                'description': '',
                'steps': []
            }

            # Extract title with more selectors
            title_selectors = [
                'h1',
                '[data-test-id*="title"]',
                '[data-testid*="title"]',
                '.recipe-title',
                '.title',
                'title'
            ]

            for selector in title_selectors:
                title_elem = soup.select_one(selector)
                if title_elem and title_elem.get_text(strip=True):
                    recipe_data['title'] = title_elem.get_text(strip=True)
                    break

            # Extract image URL with more selectors
            img_selectors = [
                'img[src*="recipe"]',
                'img[alt*="recipe"]',
                '.recipe-image img',
                '.hero-image img',
                'img[src*="gousto"]',
                'main img',
                'img[src*="cloudinary"]'  # Common CDN for food images
            ]

            for selector in img_selectors:
                img_elem = soup.select_one(selector)
                if img_elem:
                    src = img_elem.get('src') or img_elem.get('data-src') or img_elem.get('data-lazy')
                    if src and ('recipe' in src.lower() or 'gousto' in src.lower()):
                        if src.startswith('//'):
                            src = 'https:' + src
                        elif src.startswith('/'):
                            src = urljoin(self.base_url, src)
                        recipe_data['image_url'] = src
                        break

            # Extract prep time
            time_patterns = [
                r'(\d+)\s*min',
                r'(\d+)\s*hour',
                r'(\d+)\s*hr'
            ]

            time_text = soup.get_text()
            for pattern in time_patterns:
                match = re.search(pattern, time_text, re.IGNORECASE)
                if match:
                    recipe_data['prep_time'] = match.group(0)
                    break

            # Extract serving information
            serving_pattern = r'(\d+)\s*(?:serve|serving|portion|people)'
            serving_match = re.search(serving_pattern, soup.get_text(), re.IGNORECASE)
            if serving_match:
                recipe_data['serving'] = serving_match.group(0)

            # Extract ingredients and parse them into structured format
            ingredient_selectors = [
                '.ingredient',
                '.ingredients li',
                '[data-test-id*="ingredient"]',
                'ul[class*="ingredient"] li',
                '.recipe-ingredients li'
            ]

            for selector in ingredient_selectors:
                ingredients = soup.select(selector)
                if ingredients:
                    raw_ingredients = [ing.get_text(strip=True) for ing in ingredients if ing.get_text(strip=True)]
                    # Parse each ingredient into structured format
                    recipe_data['ingredients'] = [self.parse_ingredient(ing) for ing in raw_ingredients]
                    break

            # Extract description
            desc_selectors = [
                '.description',
                '.recipe-description',
                'meta[name="description"]',
                '.intro',
                'p'
            ]

            for selector in desc_selectors:
                if selector == 'meta[name="description"]':
                    desc_elem = soup.select_one(selector)
                    if desc_elem:
                        recipe_data['description'] = desc_elem.get('content', '').strip()
                        if recipe_data['description']:
                            break
                else:
                    desc_elem = soup.select_one(selector)
                    if desc_elem and len(desc_elem.get_text(strip=True)) > 20:
                        recipe_data['description'] = desc_elem.get_text(strip=True)
                        break

            # Extract steps
            step_selectors = [
                '.step',
                '.instruction',
                '.method li',
                '.directions li',
                'ol li',
                '[data-test-id*="step"]'
            ]

            for selector in step_selectors:
                steps = soup.select(selector)
                if steps:
                    raw_steps = [step.get_text(strip=True) for step in steps if step.get_text(strip=True)]
                    # Filter out unwanted steps (breadcrumbs, navigation, etc.)
                    recipe_data['steps'] = self.filter_steps(raw_steps, recipe_data.get('title', ''))
                    break

            return recipe_data

        except Exception as e:
            print(f"Error scraping recipe {recipe_url}: {e}")
            return None

    def scrape_recipes(self, start_url, max_recipes=10):
        """Main method to scrape recipes"""
        print(f"Starting to scrape recipes from: {start_url}")
        print(f"Using {'Selenium' if self.use_selenium else 'Requests'} for web scraping")

        # Get recipe links from the main page
        recipe_links = self.get_recipe_links(start_url)

        if not recipe_links:
            print("No recipe links found. Possible reasons:")
            print("1. Website structure has changed")
            print("2. Content is loaded dynamically and requires JavaScript")
            print("3. Website is blocking automated requests")
            return []

        # Limit the number of recipes to scrape
        recipe_links = recipe_links[:max_recipes]

        recipes = []
        for i, recipe_url in enumerate(recipe_links, 1):
            print(f"\nScraping recipe {i}/{len(recipe_links)}: {recipe_url}")

            recipe_data = self.scrape_recipe_details(recipe_url)
            if recipe_data and recipe_data['title']:
                recipes.append(recipe_data)
                print(f"✓ Successfully scraped: {recipe_data['title']}")
            else:
                print(f"✗ Failed to scrape recipe: {recipe_url}")

            # Be respectful - add delay between requests
            time.sleep(2)

        return recipes

    def __del__(self):
        """Clean up Selenium driver"""
        if hasattr(self, 'driver'):
            try:
                self.driver.quit()
            except:
                pass


def main():
    # Option 1: Try with Selenium first (handles JavaScript), fallback to requests
    print("=== Gousto Recipe Scraper ===")
    print("Choose scraping method:")
    print("1. Selenium (recommended - handles JavaScript)")
    print("2. Requests only (faster, but may miss content)")

    try:
        choice = input("Enter choice (1 or 2, default=1): ").strip()
        if choice == '2':
            use_selenium = False
            print("Using requests-only method...")
        else:
            use_selenium = True
            print("Attempting to use Selenium...")
    except (EOFError, KeyboardInterrupt):
        # Handle case where input() isn't available (like in some environments)
        use_selenium = True
        print("Defaulting to Selenium method...")

    scraper = GoustoScraper(use_selenium=use_selenium)

    try:
        # URL to scrape
        start_url = "https://www.gousto.co.uk/cookbook/recipes?page=1"

        # Scrape recipes (limit to 5 for testing)
        recipes = scraper.scrape_recipes(start_url, max_recipes=5)

        # Save to JSON file
        if recipes:
            with open('gousto_recipes.json', 'w', encoding='utf-8') as f:
                json.dump(recipes, f, indent=2, ensure_ascii=False)

            print(f"\n🎉 Successfully scraped {len(recipes)} recipes!")
            print("Data saved to 'gousto_recipes.json'")

            # Print sample data with structured ingredients
            if recipes:
                print("\nSample recipe data:")
                sample = recipes[0]
                print(f"Title: {sample['title']}")
                print(f"Image: {sample['image_url']}")
                print(f"Prep Time: {sample['prep_time']}")
                print(f"Serving: {sample['serving']}")
                print(f"Ingredients: {len(sample['ingredients'])} items")

                # Show first few ingredients in structured format
                if sample['ingredients']:
                    print("Sample ingredients:")
                    for i, ing in enumerate(sample['ingredients'][:3]):
                        print(f" {i+1}. {ing['amount']} {ing['unit']} {ing['name']}".strip())

                print(f"Steps: {len(sample['steps'])} steps")
        else:
            print("\n❌ No recipes were successfully scraped.")
            print("This might be due to:")
            print("- Website changes")
            print("- Anti-bot measures")
            print("- Network issues")

    finally:
        # Clean up
        del scraper


if __name__ == "__main__":
    main()
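

# Illustrative note (not part of the original script): each entry written to
# gousto_recipes.json mirrors the recipe_data dict built in scrape_recipe_details(),
# with ingredients structured by parse_ingredient(). A hypothetical entry might
# look roughly like this, assuming a recipe page parses cleanly; the URL slug and
# values below are made-up examples, not real scraped data:
#
# {
#   "url": "https://www.gousto.co.uk/cookbook/recipes/<slug>",
#   "title": "Example Recipe",
#   "image_url": "https://.../example.jpg",
#   "prep_time": "30 min",
#   "serving": "2 people",
#   "ingredients": [{"name": "butter", "amount": "250", "unit": "g"}],
#   "description": "A short description pulled from the page...",
#   "steps": ["Preheat your oven to 200C...", "..."]
# }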