Init commit with working script

2025-08-17 11:44:25 +01:00
commit e570dfe1dc
4 changed files with 807 additions and 0 deletions

.gitignore vendored Normal file (56 additions)

@@ -0,0 +1,56 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual Environment
venv/
ENV/
env/

# IDE
.idea/
.vscode/
*.swp
*.swo
*~

# Project specific
gousto_recipes.json
.DS_Store

# Logs and databases
*.log
*.sqlite3

# Environment variables
.env
.env.local
.env.*.local

# Jupyter Notebook
.ipynb_checkpoints

# Local development settings
local_settings.py

# Selenium
*.crx
*.pem

README.md Normal file (73 additions)

@@ -0,0 +1,73 @@
# Gousto Recipe Scraper

A Python script to scrape recipe data from Gousto's website and save it to a JSON file.

## Prerequisites

- Python 3.7+
- Chrome or Chromium browser (for Selenium)
- ChromeDriver (will be installed automatically by webdriver-manager)

## Setup

1. Clone this repository:

   ```bash
   git clone <repository-url>
   cd gousto-scraper
   ```

2. Create and activate a virtual environment:

   ```bash
   # On Linux/macOS
   python3 -m venv venv
   source venv/bin/activate

   # On Windows
   python -m venv venv
   .\venv\Scripts\activate
   ```

3. Install the required packages:

   ```bash
   pip install -r requirements.txt
   ```
## Usage

Run the scraper with the following command:

```bash
python scraper.py
```

The script first asks which scraping method to use (Selenium or plain requests; see the prompt below) and then:

1. Scrapes recipe data from Gousto's website
2. Saves the results to `gousto_recipes.json`
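
If standard input is not available (for example when running non-interactively), it defaults to Selenium. The prompt text is taken verbatim from `main()` in `scraper.py`:

```text
=== Gousto Recipe Scraper ===
Choose scraping method:
1. Selenium (recommended - handles JavaScript)
2. Requests only (faster, but may miss content)
Enter choice (1 or 2, default=1):
```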
### Options

The options below document the intended command-line interface, but `scraper.py` does not parse any flags yet: the scraping method is chosen via the interactive prompt above, the run is capped at 5 recipes (`max_recipes=5` in `main()`), and the output always goes to `gousto_recipes.json`. A sketch of how the flags could be wired up follows below.

- `--use-selenium` (default: True): Use Selenium for JavaScript rendering
- `--headless` (default: True): Run browser in headless mode
- `--max-pages`: Maximum number of recipe pages to scrape (default: all)
- `--output`: Output JSON file path (default: gousto_recipes.json)

Intended usage, once the flags are implemented:

```bash
python scraper.py --max-pages 5 --output recipes.json
```
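
A minimal sketch of one way to wire these flags to the existing `GoustoScraper` class with `argparse`. This wrapper is not part of the commit: the file name `run_scraper.py`, the `--no-selenium` inversion of `--use-selenium`, and the mapping of `--max-pages` onto `scrape_recipes(max_recipes=...)` are assumptions; `--headless` is omitted because the constructor does not currently expose it.

```python
# run_scraper.py - hypothetical CLI wrapper, not part of this commit
import argparse
import json

from scraper import GoustoScraper  # assumes scraper.py sits next to this file


def parse_args():
    parser = argparse.ArgumentParser(description="Gousto recipe scraper")
    parser.add_argument("--no-selenium", dest="use_selenium", action="store_false",
                        help="Use plain requests instead of Selenium (Selenium is the default)")
    parser.add_argument("--max-pages", type=int, default=5,
                        help="Maximum number of recipes to scrape (mapped to max_recipes)")
    parser.add_argument("--output", default="gousto_recipes.json",
                        help="Output JSON file path")
    return parser.parse_args()


def run():
    args = parse_args()
    scraper = GoustoScraper(use_selenium=args.use_selenium)
    recipes = scraper.scrape_recipes(
        "https://www.gousto.co.uk/cookbook/recipes?page=1",
        max_recipes=args.max_pages,
    )
    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(recipes, f, indent=2, ensure_ascii=False)
    print(f"Saved {len(recipes)} recipes to {args.output}")


if __name__ == "__main__":
    run()
```

With this wrapper in place, `python run_scraper.py --max-pages 5 --output recipes.json` would mirror the example above.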
## Output

The script saves the scraped data to a JSON file containing an array of recipe objects (see the sample entry below), each with the fields built in `scrape_recipe_details()`:

- `url`: the recipe page URL
- `title`
- `description`
- `image_url`
- `prep_time` (e.g. "30 min")
- `serving` (e.g. "2 people")
- `ingredients`: a list of objects with `name`, `amount`, and `unit`
- `steps`: the filtered cooking instructions
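
A sample entry from `gousto_recipes.json` (the field names come from `recipe_data` in `scraper.py`; the values are made up for illustration):

```json
{
  "url": "https://www.gousto.co.uk/cookbook/recipes/example-recipe",
  "title": "Example Recipe",
  "image_url": "https://...",
  "prep_time": "30 min",
  "serving": "2 people",
  "ingredients": [
    {"name": "chopped tomatoes", "amount": "400", "unit": "g"},
    {"name": "onion", "amount": "1", "unit": ""}
  ],
  "description": "A short description taken from the recipe page.",
  "steps": [
    "Preheat the oven to 200C.",
    "..."
  ]
}
```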
## Notes
- This script is for educational purposes only
- Be respectful of Gousto's website - don't make too many requests in a short period
- The website structure might change over time, which could break the scraper

requirements.txt Normal file (4 additions)

@@ -0,0 +1,4 @@
selenium==4.15.2
webdriver-manager==4.0.1
beautifulsoup4==4.12.2
lxml==4.9.3
requests==2.31.0

scraper.py Normal file (674 additions)

@@ -0,0 +1,674 @@
import requests
from bs4 import BeautifulSoup
import json
import re
from urllib.parse import urljoin, urlparse
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException


class GoustoScraper:
    def __init__(self, use_selenium=True):
        self.base_url = "https://www.gousto.co.uk"
        self.use_selenium = use_selenium

        if use_selenium:
            # Setup Selenium WebDriver with more robust options
            chrome_options = Options()
            chrome_options.add_argument('--headless=new')  # Use new headless mode
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument('--disable-extensions')
            chrome_options.add_argument('--disable-background-timer-throttling')
            chrome_options.add_argument('--disable-renderer-backgrounding')
            chrome_options.add_argument('--disable-backgrounding-occluded-windows')
            chrome_options.add_argument('--disable-ipc-flooding-protection')
            chrome_options.add_argument('--window-size=1920,1080')
            chrome_options.add_argument('--remote-debugging-port=9222')
            chrome_options.add_argument('--disable-features=TranslateUI')
            chrome_options.add_argument('--disable-default-apps')
            chrome_options.add_argument('--disable-logging')
            chrome_options.add_argument('--disable-plugins')
            chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

            # Set additional options for stability
            chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
            chrome_options.add_experimental_option('useAutomationExtension', False)

            try:
                # Try different methods to initialize Chrome
                print("Attempting to initialize Chrome WebDriver...")

                # Method 1: Try with webdriver-manager (if available)
                try:
                    from webdriver_manager.chrome import ChromeDriverManager
                    from selenium.webdriver.chrome.service import Service
                    service = Service(ChromeDriverManager().install())
                    self.driver = webdriver.Chrome(service=service, options=chrome_options)
                    print("✓ Selenium WebDriver initialized with webdriver-manager")
                except ImportError:
                    print("webdriver-manager not available, trying system Chrome...")
                    # Method 2: Try system Chrome
                    self.driver = webdriver.Chrome(options=chrome_options)
                    print("✓ Selenium WebDriver initialized with system Chrome")
                except Exception as e:
                    print(f"Failed with system Chrome: {e}")
                    # Method 3: Try with explicit Chrome binary path
                    chrome_options.binary_location = "/usr/bin/google-chrome"  # Common Linux path
                    self.driver = webdriver.Chrome(options=chrome_options)
                    print("✓ Selenium WebDriver initialized with explicit binary path")

                self.wait = WebDriverWait(self.driver, 10)

                # Test the driver
                self.driver.get("https://www.google.com")
                print("✓ WebDriver test successful")
            except Exception as e:
                print(f"✗ All Selenium initialization methods failed: {e}")
                print("Common solutions:")
                print("1. Install Chrome browser: sudo apt-get install google-chrome-stable")
                print("2. Install webdriver-manager: pip install webdriver-manager")
                print("3. Download ChromeDriver manually from https://chromedriver.chromium.org/")
                print("4. Make sure Chrome and ChromeDriver versions match")
                print("\nFalling back to requests method...")
                self.use_selenium = False
                self._init_requests_session()
        else:
            self._init_requests_session()

    def _init_requests_session(self):
        """Initialize requests session as fallback"""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    def get_page_content_selenium(self, url):
        """Get page content using Selenium (handles JavaScript)"""
        try:
            print(f"Loading page with Selenium: {url}")
            self.driver.get(url)

            # Wait for recipe cards to load
            try:
                # Wait for recipe elements to be present
                self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "a")))

                # Scroll down to trigger lazy loading
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)

                # Scroll back up
                self.driver.execute_script("window.scrollTo(0, 0);")
                time.sleep(1)
            except TimeoutException:
                print("Timeout waiting for page elements to load")

            return self.driver.page_source
        except Exception as e:
            print(f"Error loading page with Selenium: {e}")
            return None

    def get_page_content_requests(self, url):
        """Get page content using requests (static HTML only)"""
        try:
            response = self.session.get(url)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Error fetching page with requests: {e}")
            return None

    def get_recipe_links(self, page_url):
        """Extract recipe links from the cookbook page"""
        # Get page content
        if self.use_selenium:
            html_content = self.get_page_content_selenium(page_url)
        else:
            html_content = self.get_page_content_requests(page_url)

        if not html_content:
            return []

        soup = BeautifulSoup(html_content, 'html.parser')

        # Debug: Print some of the HTML to see what we're working with
        print("Analyzing page structure...")
        print(f"Page title: {soup.title.string if soup.title else 'No title found'}")

        # More comprehensive selectors for Gousto
        recipe_links = set()  # Use set to avoid duplicates

        # Different selector strategies
        selectors = [
            # Try different link patterns
            'a[href*="/cookbook/recipes/"]:not([href*="?page="])',
            'a[href^="/cookbook/recipes/"]:not([href*="?"])',
            '[data-test-id*="recipe"] a',
            '[data-testid*="recipe"] a',
            '.recipe-card a',
            '.recipe-tile a',
            '.recipe a',
            'article a[href*="/cookbook/recipes/"]',
            'div[class*="recipe"] a',
            # More generic approaches
            'a[href*="/recipes/"]:not([href*="?page="])',
        ]

        for selector in selectors:
            try:
                links = soup.select(selector)
                print(f"Selector '{selector}' found {len(links)} links")

                for link in links:
                    href = link.get('href')
                    if href:
                        # Clean and validate the URL
                        if href.startswith('/'):
                            full_url = urljoin(self.base_url, href)
                        else:
                            full_url = href

                        # Filter out pagination and non-recipe links
                        if ('/cookbook/recipes/' in full_url and
                                '?page=' not in full_url and
                                full_url != page_url):
                            recipe_links.add(full_url)

                if recipe_links:
                    break  # If we found links with this selector, stop trying others
            except Exception as e:
                print(f"Error with selector '{selector}': {e}")
                continue

        recipe_links = list(recipe_links)
        print(f"Total unique recipe links found: {len(recipe_links)}")

        # Debug: Print first few links
        for i, link in enumerate(recipe_links[:3]):
            print(f"Sample link {i+1}: {link}")

        return recipe_links

    def filter_steps(self, raw_steps, recipe_title=''):
        """Filter out unwanted items from steps array (breadcrumbs, navigation, etc.)"""
        if not raw_steps:
            return []

        filtered_steps = []

        # Common unwanted step content to filter out
        unwanted_patterns = [
            'cookbook',
            'recipes',
            'home',
            'menu',
            'navigation',
            'skip to',
            'back to',
            'print recipe',
            'save recipe',
            'rate this recipe',
            'share',
            'pinterest',
            'facebook',
            'twitter',
            'instagram'
        ]

        # Convert recipe title to lowercase for comparison
        recipe_title_lower = recipe_title.lower() if recipe_title else ''

        for i, step in enumerate(raw_steps):
            step_lower = step.lower().strip()

            # Skip empty steps
            if not step_lower:
                continue

            # Skip if it's exactly "Cookbook"
            if step_lower == 'cookbook':
                continue

            # Skip if it matches the recipe title exactly
            if recipe_title_lower and step_lower == recipe_title_lower:
                continue

            # Skip if it's very short and likely navigation (less than 10 chars)
            if len(step) < 10 and any(pattern in step_lower for pattern in unwanted_patterns):
                continue

            # Skip if it contains unwanted patterns and is short (likely navigation)
            if len(step) < 50 and any(pattern in step_lower for pattern in unwanted_patterns):
                continue

            # Skip steps that are just numbers (step numbers without content)
            if step.strip().isdigit():
                continue

            # Skip if it's just a single word that's likely navigation
            if len(step.split()) == 1 and len(step) < 15:
                # But keep single words that are likely cooking instructions
                cooking_words = ['preheat', 'heat', 'boil', 'simmer', 'bake', 'fry', 'mix', 'stir', 'serve']
                if not any(word in step_lower for word in cooking_words):
                    continue

            # Skip if it looks like a breadcrumb pattern (word > word > word)
            if '>' in step or '»' in step or ('/' in step and len(step.split()) <= 4):
                continue

            # If we've made it this far, it's likely a real cooking step
            filtered_steps.append(step.strip())

        # Additional filtering: remove the first few items if they still look like breadcrumbs
        # This handles cases where the first 1-2 items are navigation that slipped through
        if len(filtered_steps) >= 2:
            # Check if first item is very short and generic
            first_item = filtered_steps[0].lower()
            if (len(first_item) < 20 and
                    any(pattern in first_item for pattern in ['cookbook', 'recipe', 'home', 'menu'])):
                filtered_steps = filtered_steps[1:]

            # Check if second item is the recipe title or similar
            if (len(filtered_steps) >= 2 and recipe_title_lower and
                    filtered_steps[0].lower() == recipe_title_lower):
                filtered_steps = filtered_steps[1:]

        return filtered_steps

    def parse_ingredient(self, ingredient_text):
        """Parse ingredient text into structured format with name, amount, and unit"""
        ingredient_text = ingredient_text.strip()

        # Common units and their variations
        units = {
            # Volume
            'ml', 'millilitre', 'milliliters', 'millilitres',
            'l', 'litre', 'liter', 'litres', 'liters',
            'cup', 'cups', 'c',
            'tbsp', 'tablespoon', 'tablespoons', 'tbs', 'tb',
            'tsp', 'teaspoon', 'teaspoons', 'ts',
            'fl oz', 'fluid ounce', 'fluid ounces',
            'pint', 'pints', 'pt',
            # Weight
            'g', 'gram', 'grams', 'gr',
            'kg', 'kilogram', 'kilograms',
            'oz', 'ounce', 'ounces',
            'lb', 'pound', 'pounds', 'lbs',
            # Count/pieces
            'piece', 'pieces', 'pc', 'pcs',
            'slice', 'slices',
            'clove', 'cloves',
            'bunch', 'bunches',
            'handful', 'handfuls',
            'pinch', 'pinches',
            'dash', 'dashes',
            'sprig', 'sprigs',
            'stick', 'sticks',
            'can', 'cans', 'tin', 'tins',
            'jar', 'jars',
            'packet', 'packets', 'pack', 'packs',
            'box', 'boxes',
            'bag', 'bags',
            # Special measurements
            'to taste', 'as needed', 'optional'
        }

        # Create a pattern to match numbers (including fractions and decimals)
        number_pattern = r'\d+(?:[/.]\d+)?(?:\.\d+)?|\d*\.\d+'

        # Try to extract amount and unit
        ingredient = {
            'name': ingredient_text,
            'amount': '',
            'unit': ''
        }

        # Pattern 1: "2 cups flour" or "250g butter"
        pattern1 = r'^(' + number_pattern + r')\s*([a-zA-Z\s]+?)\s+(.+)$'
        match1 = re.match(pattern1, ingredient_text, re.IGNORECASE)
        if match1:
            amount = match1.group(1)
            potential_unit = match1.group(2).strip().lower()
            name = match1.group(3).strip()

            # Check if the potential unit is in our units list
            if any(unit in potential_unit for unit in units):
                ingredient['amount'] = amount
                ingredient['unit'] = match1.group(2).strip()  # Keep original case
                ingredient['name'] = name
                return ingredient

        # Pattern 2: "flour - 2 cups" or "butter (250g)"
        pattern2 = r'^(.+?)[\s\-\(]+(' + number_pattern + r')\s*([a-zA-Z\s]*?)[\)\s]*$'
        match2 = re.match(pattern2, ingredient_text, re.IGNORECASE)
        if match2:
            name = match2.group(1).strip()
            amount = match2.group(2)
            potential_unit = match2.group(3).strip().lower()

            if any(unit in potential_unit for unit in units) or not potential_unit:
                ingredient['name'] = name
                ingredient['amount'] = amount
                ingredient['unit'] = match2.group(3).strip()  # Keep original case
                return ingredient

        # Pattern 3: Just amount at start "2 onions" or "1 large egg"
        pattern3 = r'^(' + number_pattern + r')\s+(.+)$'
        match3 = re.match(pattern3, ingredient_text, re.IGNORECASE)
        if match3:
            amount = match3.group(1)
            rest = match3.group(2).strip()

            # Check if it starts with a unit
            words = rest.split()
            if words and words[0].lower() in units:
                ingredient['amount'] = amount
                ingredient['unit'] = words[0]
                ingredient['name'] = ' '.join(words[1:]) if len(words) > 1 else rest
                return ingredient
            else:
                # No explicit unit, treat as pieces/items
                ingredient['amount'] = amount
                ingredient['unit'] = ''
                ingredient['name'] = rest
                return ingredient

        # Pattern 4: Fractional amounts "1/2 cup sugar"
        fraction_pattern = r'^(\d+/\d+)\s+([a-zA-Z\s]+?)\s+(.+)$'
        match4 = re.match(fraction_pattern, ingredient_text, re.IGNORECASE)
        if match4:
            amount = match4.group(1)
            potential_unit = match4.group(2).strip().lower()
            name = match4.group(3).strip()

            if any(unit in potential_unit for unit in units):
                ingredient['amount'] = amount
                ingredient['unit'] = match4.group(2).strip()
                ingredient['name'] = name
                return ingredient

        # If no patterns match, return as-is (some ingredients might not have amounts)
        return ingredient

    def scrape_recipe_details(self, recipe_url):
        """Scrape detailed recipe information from individual recipe page"""
        try:
            if self.use_selenium:
                html_content = self.get_page_content_selenium(recipe_url)
            else:
                response = self.session.get(recipe_url)
                response.raise_for_status()
                html_content = response.text

            if not html_content:
                return None

            soup = BeautifulSoup(html_content, 'html.parser')

            recipe_data = {
                'url': recipe_url,
                'title': '',
                'image_url': '',
                'prep_time': '',
                'serving': '',
                'ingredients': [],
                'description': '',
                'steps': []
            }

            # Extract title with more selectors
            title_selectors = [
                'h1',
                '[data-test-id*="title"]',
                '[data-testid*="title"]',
                '.recipe-title',
                '.title',
                'title'
            ]
            for selector in title_selectors:
                title_elem = soup.select_one(selector)
                if title_elem and title_elem.get_text(strip=True):
                    recipe_data['title'] = title_elem.get_text(strip=True)
                    break

            # Extract image URL with more selectors
            img_selectors = [
                'img[src*="recipe"]',
                'img[alt*="recipe"]',
                '.recipe-image img',
                '.hero-image img',
                'img[src*="gousto"]',
                'main img',
                'img[src*="cloudinary"]'  # Common CDN for food images
            ]
            for selector in img_selectors:
                img_elem = soup.select_one(selector)
                if img_elem:
                    src = img_elem.get('src') or img_elem.get('data-src') or img_elem.get('data-lazy')
                    if src and ('recipe' in src.lower() or 'gousto' in src.lower()):
                        if src.startswith('//'):
                            src = 'https:' + src
                        elif src.startswith('/'):
                            src = urljoin(self.base_url, src)
                        recipe_data['image_url'] = src
                        break

            # Extract prep time
            time_patterns = [
                r'(\d+)\s*min',
                r'(\d+)\s*hour',
                r'(\d+)\s*hr'
            ]
            time_text = soup.get_text()
            for pattern in time_patterns:
                match = re.search(pattern, time_text, re.IGNORECASE)
                if match:
                    recipe_data['prep_time'] = match.group(0)
                    break

            # Extract serving information
            serving_pattern = r'(\d+)\s*(?:serve|serving|portion|people)'
            serving_match = re.search(serving_pattern, soup.get_text(), re.IGNORECASE)
            if serving_match:
                recipe_data['serving'] = serving_match.group(0)

            # Extract ingredients and parse them into structured format
            ingredient_selectors = [
                '.ingredient',
                '.ingredients li',
                '[data-test-id*="ingredient"]',
                'ul[class*="ingredient"] li',
                '.recipe-ingredients li'
            ]
            for selector in ingredient_selectors:
                ingredients = soup.select(selector)
                if ingredients:
                    raw_ingredients = [ing.get_text(strip=True) for ing in ingredients if ing.get_text(strip=True)]
                    # Parse each ingredient into structured format
                    recipe_data['ingredients'] = [self.parse_ingredient(ing) for ing in raw_ingredients]
                    break

            # Extract description
            desc_selectors = [
                '.description',
                '.recipe-description',
                'meta[name="description"]',
                '.intro',
                'p'
            ]
            for selector in desc_selectors:
                if selector == 'meta[name="description"]':
                    desc_elem = soup.select_one(selector)
                    if desc_elem:
                        recipe_data['description'] = desc_elem.get('content', '').strip()
                        if recipe_data['description']:
                            break
                else:
                    desc_elem = soup.select_one(selector)
                    if desc_elem and len(desc_elem.get_text(strip=True)) > 20:
                        recipe_data['description'] = desc_elem.get_text(strip=True)
                        break

            # Extract steps
            step_selectors = [
                '.step',
                '.instruction',
                '.method li',
                '.directions li',
                'ol li',
                '[data-test-id*="step"]'
            ]
            for selector in step_selectors:
                steps = soup.select(selector)
                if steps:
                    raw_steps = [step.get_text(strip=True) for step in steps if step.get_text(strip=True)]
                    # Filter out unwanted steps (breadcrumbs, navigation, etc.)
                    recipe_data['steps'] = self.filter_steps(raw_steps, recipe_data.get('title', ''))
                    break

            return recipe_data
        except Exception as e:
            print(f"Error scraping recipe {recipe_url}: {e}")
            return None

    def scrape_recipes(self, start_url, max_recipes=10):
        """Main method to scrape recipes"""
        print(f"Starting to scrape recipes from: {start_url}")
        print(f"Using {'Selenium' if self.use_selenium else 'Requests'} for web scraping")

        # Get recipe links from the main page
        recipe_links = self.get_recipe_links(start_url)

        if not recipe_links:
            print("No recipe links found. Possible reasons:")
            print("1. Website structure has changed")
            print("2. Content is loaded dynamically and requires JavaScript")
            print("3. Website is blocking automated requests")
            return []

        # Limit the number of recipes to scrape
        recipe_links = recipe_links[:max_recipes]

        recipes = []
        for i, recipe_url in enumerate(recipe_links, 1):
            print(f"\nScraping recipe {i}/{len(recipe_links)}: {recipe_url}")

            recipe_data = self.scrape_recipe_details(recipe_url)
            if recipe_data and recipe_data['title']:
                recipes.append(recipe_data)
                print(f"✓ Successfully scraped: {recipe_data['title']}")
            else:
                print(f"✗ Failed to scrape recipe: {recipe_url}")

            # Be respectful - add delay between requests
            time.sleep(2)

        return recipes

    def __del__(self):
        """Clean up Selenium driver"""
        if hasattr(self, 'driver'):
            try:
                self.driver.quit()
            except:
                pass


def main():
    # Option 1: Try with Selenium first (handles JavaScript), fallback to requests
    print("=== Gousto Recipe Scraper ===")
    print("Choose scraping method:")
    print("1. Selenium (recommended - handles JavaScript)")
    print("2. Requests only (faster, but may miss content)")

    try:
        choice = input("Enter choice (1 or 2, default=1): ").strip()
        if choice == '2':
            use_selenium = False
            print("Using requests-only method...")
        else:
            use_selenium = True
            print("Attempting to use Selenium...")
    except (EOFError, KeyboardInterrupt):
        # Handle case where input() isn't available (like in some environments)
        use_selenium = True
        print("Defaulting to Selenium method...")

    scraper = GoustoScraper(use_selenium=use_selenium)

    try:
        # URL to scrape
        start_url = "https://www.gousto.co.uk/cookbook/recipes?page=1"

        # Scrape recipes (limit to 5 for testing)
        recipes = scraper.scrape_recipes(start_url, max_recipes=5)

        # Save to JSON file
        if recipes:
            with open('gousto_recipes.json', 'w', encoding='utf-8') as f:
                json.dump(recipes, f, indent=2, ensure_ascii=False)

            print(f"\n🎉 Successfully scraped {len(recipes)} recipes!")
            print("Data saved to 'gousto_recipes.json'")

            # Print sample data with structured ingredients
            if recipes:
                print("\nSample recipe data:")
                sample = recipes[0]
                print(f"Title: {sample['title']}")
                print(f"Image: {sample['image_url']}")
                print(f"Prep Time: {sample['prep_time']}")
                print(f"Serving: {sample['serving']}")
                print(f"Ingredients: {len(sample['ingredients'])} items")

                # Show first few ingredients in structured format
                if sample['ingredients']:
                    print("Sample ingredients:")
                    for i, ing in enumerate(sample['ingredients'][:3]):
                        print(f" {i+1}. {ing['amount']} {ing['unit']} {ing['name']}".strip())

                print(f"Steps: {len(sample['steps'])} steps")
        else:
            print("\n❌ No recipes were successfully scraped.")
            print("This might be due to:")
            print("- Website changes")
            print("- Anti-bot measures")
            print("- Network issues")
    finally:
        # Clean up
        del scraper


if __name__ == "__main__":
    main()